def main(self, args):
    from aetros import keras_model_utils
    import aetros.const
    from aetros.backend import JobBackend
    from aetros.Trainer import Trainer

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        prog=aetros.const.__prog__ + ' upload-weights')
    parser.add_argument('id', help='model name or job id')
    parser.add_argument('weights', help="Weights path")
    parser.add_argument(
        '--api-key',
        help="Secure key. Alternatively use the API_KEY environment variable.")
    parser.add_argument(
        '--kpi', help="Overwrites or sets the KPI for this job.")
    parser.add_argument(
        '--latest',
        action="store_true",
        help="Upload the latest weights instead of the best epoch.")

    parsed_args = parser.parse_args(args)

    if not parsed_args.id or not parsed_args.weights:
        parser.print_help()
        return

    job_backend = JobBackend(api_key=parsed_args.api_key)

    # A model name (contains '/', no '@') means: create a new job first.
    if '/' in parsed_args.id and '@' not in parsed_args.id:
        job_backend.create(parsed_args.id)

    job_backend.load(parsed_args.id)

    if job_backend.job is None:
        raise Exception("Job not found")

    weights_path = parsed_args.weights

    if not os.path.exists(weights_path):
        raise Exception('Weights file does not exist: ' + weights_path)

    print("Uploading weights to %s of %s ..." % (job_backend.job_id, job_backend.model_id))
    job_backend.upload_weights(
        'weights.hdf5', weights_path,
        float(parsed_args.kpi) if parsed_args.kpi else None)
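# A minimal usage sketch for the command above (assuming it is wired up as the
# `upload-weights` sub-command of the `aetros` CLI; the wrapper class name is
# hypothetical, the model name reuses the peter/mnist-cnn example from below):
#
#   $ aetros upload-weights peter/mnist-cnn ./weights.hdf5 --kpi 0.92
#
# cmd = UploadWeightsCommand()  # hypothetical command class
# cmd.main(['peter/mnist-cnn', './weights.hdf5', '--kpi', '0.92'])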
def main(self, args):
    import aetros.const

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        prog=aetros.const.__prog__ + ' run')
    parser.add_argument('command', nargs='?', help="The command to run. Default read from the configuration file.")
    parser.add_argument('-i', '--image', help="Which Docker image to use for the command. Default read from the configuration file. If not specified, the command is executed on the host.")
    parser.add_argument('--no-image', action='store_true', help="Forces not to use Docker, even when an image is defined in the configuration file.")
    parser.add_argument('-s', '--server', action='append', help="Limits the server pool to this server. Default: no limitation, or read from the configuration file. Multiple --server allowed.")
    parser.add_argument('-m', '--model', help="Under which model this job should be listed. Default read from the configuration file.")
    parser.add_argument('-l', '--local', action='store_true', help="Start the job immediately on the current machine.")
    parser.add_argument('-c', '--config', help="Path to the configuration file. Default: aetros.yml in the current working directory.")
    parser.add_argument('--priority', help="Increases or decreases priority. Default is 0.")
    parser.add_argument('--cpu', help="How many CPU cores should be assigned to the job. Docker only.")
    parser.add_argument('--memory', help="How much memory should be assigned to the job. Docker only.")
    parser.add_argument('--gpu', help="How many GPU cards should be assigned to the job. Docker only.")
    parser.add_argument('--gpu_memory', help="Memory requirement for the GPU. Docker only.")
    parser.add_argument('--offline', '-o', action='store_true', help="Whether the execution should happen offline.")
    parser.add_argument('--rebuild-image', action='store_true', help="Makes sure the Docker image is re-built without cache.")
    parser.add_argument('--max-time', help="Limits execution time in seconds. Sends SIGINT to the process group when reached.")
    parser.add_argument('--max-epochs', help="Limits execution epochs. Sends SIGINT to the process group when reached.")
    parser.add_argument('--gpu-device', action='append', help="Which device id should be mapped into the NVIDIA Docker container. Only with --local.")
    parser.add_argument('--volume', '-v', action='append', help="Mounts a volume into the Docker container. Only with --local.")
    parser.add_argument('-e', action='append', help="Sets additional environment variables. '-e name=value' to set a value, '-e name' to read it from the current environment.")
    parser.add_argument('-p', '--param', action='append', help="Sets a hyperparameter, e.g. '--param name=value'. Multiple --param allowed.")

    parsed_args = parser.parse_args(args)

    if parsed_args.config and not os.path.exists(parsed_args.config):
        self.logger.error("fatal: file %s does not exist." % (parsed_args.config,))
        sys.exit(2)

    config = find_config(parsed_args.config)
    home_config = read_home_config()

    if config['model'] and not parsed_args.model:
        parsed_args.model = config['model']

    if not parsed_args.model:
        print("fatal: no model defined. Use --model or switch into a directory where you executed 'aetros init model-name'.")
        sys.exit(2)

    if not parsed_args.local and parsed_args.volume:
        print("fatal: can not use volumes with jobs on the cluster. Use datasets instead.")
        sys.exit(1)

    if parsed_args.local and parsed_args.priority:
        print("fatal: the priority can only be set for jobs on the cluster.")
        sys.exit(1)

    if config['image']:
        ensure_docker_installed(self.logger)

    env = {}
    if parsed_args.e:
        for item in parsed_args.e:
            if '=' in item:
                # split only on the first '=' so values may contain '='
                k, v = item.split('=', 1)
            else:
                k = item
                v = os.getenv(k)
            env[k] = v

    if ('command' not in config or not config['command']) and not parsed_args.command:
        self.logger.error('No command given. Define the command in aetros.yml or use the command argument.')
        sys.exit(1)

    job_backend = JobBackend(parsed_args.model, self.logger)

    ignore = []
    if 'ignore' in config:
        ignore = config['ignore']
    job_backend.job = {'config': {'ignore': ignore}}

    adding_files = loading_text("- Adding job files to index ... ")
    files_added, size_added = job_backend.add_files(config['root'], report=False)
    adding_files("done with %d file%s added (%s)."
                 % (files_added, 's' if files_added != 1 else '', human_size(size_added, 2)))

    create_info = {
        'type': 'custom',
        'config': config
    }

    incoming_hyperparameter = {}
    if parsed_args.param:
        for param in parsed_args.param:
            if '=' not in param:
                raise Exception('--param ' + param + ' does not contain a `=`. Please use "--param name=value"')
            name, value = param.split('=', 1)
            incoming_hyperparameter[name] = value

    # first transform the simple format into the full definition with parameter types
    # (string, number, group, choice_group, etc.)
    full_hyperparameters = lose_parameters_to_full(config['parameters'])

    # now extract hyperparameters from the full definition and overwrite them
    # with incoming_hyperparameter where available
    hyperparameter = extract_parameters(full_hyperparameters, incoming_hyperparameter)
    create_info['config']['parameters'] = hyperparameter

    if parsed_args.rebuild_image:
        create_info['config']['rebuild_image'] = True

    if parsed_args.max_epochs:
        create_info['config']['maxEpochs'] = int(parsed_args.max_epochs)

    create_info['config']['priority'] = 0
    if parsed_args.priority:
        create_info['config']['priority'] = float(parsed_args.priority)

    if parsed_args.max_time:
        create_info['config']['maxTime'] = float(parsed_args.max_time)

    if parsed_args.command:
        create_info['config']['command'] = parsed_args.command

    if parsed_args.image:
        # reset install options, since we can't make sure the base image still fits
        if 'image' in config and config['image'] and config['image'] != parsed_args.image:
            create_info['config']['install'] = None

        # reset dockerfile, since we specified an image manually
        create_info['config']['dockerfile'] = None
        create_info['config']['image'] = parsed_args.image

    if parsed_args.no_image:
        create_info['config']['image'] = None

    if parsed_args.server:
        create_info['config']['servers'] = []
        for name in parsed_args.server:
            create_info['config']['servers'].append(name)

    create_info['config']['resources'] = create_info['config'].get('resources', {})
    resources = create_info['config']['resources']

    default_cpu_and_memory = 1 if create_info['config']['image'] else 0
    resources['cpu'] = int(parsed_args.cpu or resources.get('cpu', default_cpu_and_memory))
    resources['memory'] = int(parsed_args.memory or resources.get('memory', default_cpu_and_memory))
    resources['gpu'] = int(parsed_args.gpu or resources.get('gpu', 0))
    resources['gpu_memory'] = int(parsed_args.gpu_memory or resources.get('gpu_memory', 0))

    if parsed_args.local:
        create_info['server'] = 'local'

        # make sure we do not limit the resources to something that is not available on the local machine
        warning = []
        cpu = cpuinfo.get_cpu_info()
        mem = psutil.virtual_memory().total
        gpu = 0
        try:
            gpu = len(get_ordered_devices())
        except CudaNotImplementedException:
            pass

        if not create_info['config']['image'] and not all([x == 0 for x in six.itervalues(resources)]):
            self.logger.warning("! No Docker virtualization since no `image` defined, resource limitations ignored.")

        if create_info['config']['image'] and resources['gpu'] > 0:
            if not (sys.platform == "linux" or sys.platform == "linux2"):
                self.logger.warning("! Your operating system does not support GPU allocation for "
                                    "Docker virtualization. "
                                    "NVIDIA-Docker2 is only supported on Linux.")

        local_max_resources = {'cpu': cpu['count'], 'memory': ceil(mem / 1024 / 1024 / 1024), 'gpu': gpu}

        if create_info['config']['image']:
            # read the maximum hardware available within Docker
            out = docker_call(['run', 'alpine', 'sh', '-c', 'nproc && cat /proc/meminfo | grep MemTotal'])
            cpus, memory = out.decode('utf-8').strip().split('\n')
            local_max_resources['cpu'] = int(cpus)

            memory = memory.replace('MemTotal:', '').replace('kB', '').strip()
            local_max_resources['memory'] = ceil(int(memory) / 1024 / 1024)

        if local_max_resources['cpu'] < resources['cpu']:
            warning.append('CPU cores %d -> %d' % (resources['cpu'], local_max_resources['cpu']))
            resources['cpu'] = local_max_resources['cpu']

        if local_max_resources['memory'] < resources['memory']:
            warning.append('memory %dGB -> %dGB' % (resources['memory'], local_max_resources['memory']))
            resources['memory'] = local_max_resources['memory']

        if local_max_resources['gpu'] < resources['gpu']:
            warning.append('GPU cards %d -> %d' % (resources['gpu'], local_max_resources['gpu']))
            resources['gpu'] = local_max_resources['gpu']

        if warning:
            self.logger.warning("! Resources downgraded due to missing hardware: %s." % (', '.join(warning),))

    if parsed_args.config and not create_info['config']['configPath']:
        create_info['config']['configPath'] = parsed_args.config

    create_info['config']['sourcesAttached'] = True

    creating_git_job = loading_text("- Create job in local Git ... ")
    if aetros.utils.git.get_current_commit_hash():
        create_info['origin_git_source'] = {
            'origin': aetros.utils.git.get_current_remote_url(),
            'author': aetros.utils.git.get_current_commit_author(),
            'message': aetros.utils.git.get_current_commit_message(),
            'branch': aetros.utils.git.get_current_branch(),
            'commit': aetros.utils.git.get_current_commit_hash(),
        }

    job_backend.create(create_info=create_info, server=None)
    creating_git_job("created %s in %s." % (job_backend.job_id[0:9], job_backend.model_name))

    summary = "➤ Summary: Job running "
    if parsed_args.local:
        summary += 'locally'
    else:
        summary += 'on the cluster'

    if create_info['config']['image']:
        summary += ' in Docker using image %s with %d CPU cores, %dGB memory and %d GPUs.' \
                   % (create_info['config']['image'], resources['cpu'], resources['memory'], resources['gpu'])
    else:
        summary += ' on the host using all available resources.'

    print(summary)

    # tasks = []
    #
    # if 'tasks' in config:
    #     for name, task_config in six.iteritems(config['tasks']):
    #         replica = 1
    #         if 'replica' in task_config:
    #             replica = int(task_config['replica'])
    #         for index in range(0, replica):
    #             tasks.append(job_backend.create_task(job_id, task_config, name, index))

    if parsed_args.offline:
        if not parsed_args.local:
            self.logger.warning("Can not create a remote job in offline mode.")
            sys.exit(1)
        self.logger.warning("Execution started offline.")
    else:
        adding_files = loading_text("- Connecting to " + home_config['host'] + " ... ")
        if job_backend.connect():
            adding_files("connected.")
        else:
            parsed_args.offline = True
            adding_files("failed. Continuing in offline mode.")

    if not parsed_args.offline:
        sys.stdout.write("- Uploading job data ... ")
        job_backend.git.push()
        job_backend.client.wait_until_queue_empty(['files'], clear_end=False)
        sys.stdout.write(" done.\n")

        link = "%s/model/%s/job/%s" % (home_config['url'], job_backend.model_name, job_backend.job_id)
        sys.__stdout__.write(u"➤ Monitor job at %s\n" % (link,))

    if parsed_args.local:
        job_backend.start(collect_system=False, offline=parsed_args.offline, push=False)

        if not parsed_args.offline:
            job_backend.git.start_push_sync()

        cpus = create_info['config']['resources']['cpu']
        memory = create_info['config']['resources']['memory']

        if not parsed_args.gpu_device and create_info['config']['resources']['gpu'] > 0:
            # if 2 GPUs are requested and we have 3 GPUs with ids [0, 1, 2], gpu_device should be [0, 1]
            parsed_args.gpu_device = []
            for i in range(0, create_info['config']['resources']['gpu']):
                parsed_args.gpu_device.append(i)

        start_command(self.logger, job_backend, env, parsed_args.volume,
                      cpus=cpus, memory=memory, gpu_devices=parsed_args.gpu_device,
                      offline=parsed_args.offline)
def main(self, args):
    import aetros.const

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        prog=aetros.const.__prog__ + ' run')
    parser.add_argument('command', nargs='?', help="The command to run. Default read from aetros.yml.")
    parser.add_argument('-i', '--image', help="Which Docker image to use for the command. Default read from aetros.yml. If not specified, the command is executed on the host.")
    parser.add_argument('-s', '--server', action='append', help="Limits the server pool to this server. Default: no limitation, or read from aetros.yml. Multiple --server allowed.")
    parser.add_argument('-m', '--model', help="Under which model this job should be listed. Default read from aetros.yml.")
    parser.add_argument('-l', '--local', action='store_true', help="Start the job immediately on the current machine.")
    parser.add_argument('-c', '--config', help="Path to the configuration file. Default: aetros.yml in the current working directory.")
    parser.add_argument('--priority', help="Increases or decreases priority. Default is 0.")
    parser.add_argument('--cpu', help="How many CPU cores should be assigned to the job. Docker only.")
    parser.add_argument('--memory', help="How much memory should be assigned to the job. Docker only.")
    parser.add_argument('--gpu', help="How many GPU cards should be assigned to the job. Docker only.")
    parser.add_argument('--gpu_memory', help="Memory requirement for the GPU. Docker only.")
    parser.add_argument('--max-time', help="Limits execution time in seconds. Sends SIGINT to the process group when reached.")
    parser.add_argument('--max-epochs', help="Limits execution epochs. Sends SIGINT to the process group when reached.")
    parser.add_argument('--gpu-device', action='append', help="Which device id should be mapped into the NVIDIA Docker container.")
    parser.add_argument('--volume', '-v', action='append', help="Mounts a volume into the Docker container.")
    parser.add_argument('-e', action='append', help="Sets additional environment variables. '-e name=value' to set a value, '-e name' to read it from the current environment.")
    parser.add_argument('-p', '--param', action='append', help="Sets a hyperparameter, e.g. '--param name=value'. Multiple --param allowed.")

    parsed_args = parser.parse_args(args)

    config = read_config(parsed_args.config or 'aetros.yml')

    env = {}
    if parsed_args.e:
        for item in parsed_args.e:
            if '=' in item:
                # split only on the first '=' so values may contain '='
                k, v = item.split('=', 1)
            else:
                k = item
                v = os.getenv(k)
            env[k] = v

    if 'command' not in config and not parsed_args.command:
        self.logger.error('No "command" given in aetros.yml or as argument.')
        sys.exit(1)

    job = JobBackend(parsed_args.model, self.logger, parsed_args.config or 'aetros.yml')

    ignore = []
    if 'ignore' in config:
        ignore = config['ignore']
    job.job = {'config': {'ignore': ignore}}

    files_added, size_added = job.add_files()
    print("%d files added (%s)" % (files_added, human_size(size_added, 2)))

    create_info = {
        'type': 'custom',
        'config': config
    }

    incoming_hyperparameter = {}
    if parsed_args.param:
        for param in parsed_args.param:
            if '=' not in param:
                raise Exception('--param ' + param + ' does not contain a `=`. Please use "--param name=value"')
            name, value = param.split('=', 1)
            incoming_hyperparameter[name] = value

    # first transform the simple format into the full definition with parameter types
    # (string, number, group, choice_group, etc.)
    full_hyperparameters = lose_parameters_to_full(config['parameters'])

    # now extract hyperparameters from the full definition and overwrite them
    # with incoming_hyperparameter where available
    hyperparameter = extract_parameters(full_hyperparameters, incoming_hyperparameter)
    create_info['config']['parameters'] = hyperparameter

    if parsed_args.max_epochs:
        create_info['config']['maxEpochs'] = int(parsed_args.max_epochs)

    create_info['config']['priority'] = 0
    if parsed_args.priority:
        create_info['config']['priority'] = float(parsed_args.priority)

    if parsed_args.max_time:
        create_info['config']['maxTime'] = float(parsed_args.max_time)

    if parsed_args.command:
        create_info['config']['command'] = parsed_args.command

    if parsed_args.image:
        # reset install options, since we can't make sure the base image still fits
        if 'image' in config and config['image'] and config['image'] != parsed_args.image:
            create_info['config']['install'] = None

        # reset dockerfile, since we specified an image manually
        create_info['config']['dockerfile'] = None
        create_info['config']['image'] = parsed_args.image

    if parsed_args.server:
        create_info['config']['servers'] = []
        for name in parsed_args.server:
            create_info['config']['servers'].append(name)

    if 'resources' not in create_info['config']:
        create_info['config']['resources'] = {}

    if parsed_args.cpu or parsed_args.memory or parsed_args.gpu is not None or parsed_args.gpu_memory:
        if parsed_args.cpu:
            create_info['config']['resources']['cpu'] = float(parsed_args.cpu)
        if parsed_args.memory:
            create_info['config']['resources']['memory'] = float(parsed_args.memory)
        if parsed_args.gpu is not None:
            create_info['config']['resources']['gpu'] = float(parsed_args.gpu)
        if parsed_args.gpu_memory:
            create_info['config']['resources']['gpu_memory'] = float(parsed_args.gpu_memory)

    if parsed_args.local:
        # usually the aetros server would assign resources at job root level from the assigned server,
        # but since the job is started locally we just use the requested ones. The user should know
        # what they are doing. start.py uses the 'config' values anyway for the Docker limitation,
        # so we only make sure they are being displayed.
        if 'image' in create_info['config'] and create_info['config']['image']:
            resources = create_info['config']['resources']
            create_info['resources_assigned'] = {'cpus': 1, 'memory': 1, 'gpus': []}
            if 'gpu' in resources and resources['gpu'] > 0:
                # one placeholder entry per assigned GPU card
                create_info['resources_assigned']['gpus'] = [1] * int(resources['gpu'])
            if 'cpu' in resources:
                create_info['resources_assigned']['cpus'] = resources['cpu']
            if 'memory' in resources:
                create_info['resources_assigned']['memory'] = resources['memory']
        else:
            # since this runs on the host, extract machine hardware and put it into
            # resources_assigned so we see it at the job.
            pass

    if parsed_args.local:
        create_info['server'] = 'local'

    create_info['config']['sourcesAttached'] = True

    if aetros.utils.git.get_current_commit_hash():
        create_info['origin_git_source'] = {
            'origin': aetros.utils.git.get_current_remote_url(),
            'author': aetros.utils.git.get_current_commit_author(),
            'message': aetros.utils.git.get_current_commit_message(),
            'branch': aetros.utils.git.get_current_branch(),
            'commit': aetros.utils.git.get_current_commit_hash(),
        }

    job.create(create_info=create_info, server=None)
    print("Job %s/%s created." % (job.model_name, job.job_id))

    if parsed_args.local:
        start(self.logger, job.model_name + '/' + job.job_id,
              fetch=False, env=env, volumes=parsed_args.volume,
              gpu_devices=parsed_args.gpu_device)
    else:
        if parsed_args.volume:
            print("Can not use volumes with jobs on the cluster. Use datasets instead.")
            sys.exit(1)

        # todo: make it visible
        job.git.push()
        print("Open http://%s/model/%s/job/%s to monitor it." % (job.host, job.model_name, job.job_id))
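# Sketch of what the variant above assigns for a local Docker run with
# `--cpu 4 --memory 8 --gpu 2` and an image set (values stay floats because
# of the float() casts above; the gpus list carries one placeholder per card):
#
# create_info['config']['resources'] == {'cpu': 4.0, 'memory': 8.0, 'gpu': 2.0}
# create_info['resources_assigned'] == {'cpus': 4.0, 'memory': 8.0, 'gpus': [1, 1]}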
class KerasIntegration():
    def __init__(self, id, model, api_key, insights=False, confusion_matrix=False, insight_sample=None):
        """
        :type id: basestring
            The actual model name available in AETROS Trainer, e.g. peter/mnist-cnn.
        :type insights: bool
        :type confusion_matrix: bool
        :type insight_sample: basestring|None
            A path to a sample which is used for the insights.
            Default is the first sample of data_validation.
        """
        self.confusion_matrix = confusion_matrix
        self.model = model

        if isinstance(model, Sequential) and not model.built:
            raise Exception('Sequential model is not built.')

        self.insight_sample = insight_sample
        self.id = id
        self.insights = insights
        self.model_type = 'custom'
        self.job_backend = JobBackend(api_token=api_key)

        # keep references to the original methods before monkey-patching them
        copy = {
            'fit': self.model.fit,
            'fit_generator': self.model.fit_generator
        }

        def overwritten_fit(x, y, batch_size=32, nb_epoch=10, verbose=1, callbacks=[],
                            validation_split=0., validation_data=None, shuffle=True,
                            class_weight=None, sample_weight=None, **kwargs):
            callback = self.setup(x, nb_epoch, batch_size)
            callbacks.append(callback)

            # forward the caller's shuffle flag instead of forcing True
            copy['fit'](x, y, batch_size, nb_epoch, verbose, callbacks,
                        validation_split, validation_data, shuffle,
                        class_weight, sample_weight, **kwargs)
            self.end()

        def overwritten_fit_generator(generator, samples_per_epoch, nb_epoch, verbose=1,
                                      callbacks=[], validation_data=None, nb_val_samples=None,
                                      class_weight={}, max_q_size=10, nb_worker=1,
                                      pickle_safe=False):
            callback = self.setup(generator, nb_epoch)
            self.trainer.nb_val_samples = nb_val_samples
            self.trainer.data_validation = validation_data
            callbacks.append(callback)

            copy['fit_generator'](generator, samples_per_epoch, nb_epoch, verbose,
                                  callbacks, validation_data, nb_val_samples,
                                  class_weight, max_q_size, nb_worker, pickle_safe)
            self.end()

        self.model.fit = overwritten_fit
        self.model.fit_generator = overwritten_fit_generator

    def setup(self, x=None, nb_epoch=1, batch_size=16):
        graph = self.model_to_graph(self.model)

        from keras.preprocessing.image import Iterator
        if isinstance(x, Iterator):
            batch_size = x.batch_size

        settings = {
            'epochs': nb_epoch,
            'batchSize': batch_size,
            'optimizer': type(self.model.optimizer).__name__ if hasattr(self.model, 'optimizer') else ''
        }

        self.job_backend.ensure_model(self.id, self.model.to_json(), settings=settings,
                                      type=self.model_type, graph=graph)
        job_id = self.job_backend.create(self.id, insights=self.insights)
        self.job_backend.start()

        print("AETROS job '%s' created and started. Open http://%s/trainer/app#/job=%s to monitor the training."
              % (job_id, self.job_backend.host, job_id))

        job = self.job_backend.load_light_job()
        general_logger = GeneralLogger(job, job_backend=self.job_backend)
        self.trainer = Trainer(self.job_backend, general_logger)

        self.monitoringThread = MonitoringThread(self.job_backend, self.trainer)
        self.monitoringThread.daemon = True
        self.monitoringThread.start()

        self.trainer.model = self.model
        self.trainer.data_train = {'x': x}

        self.callback = KerasLogger(self.trainer, self.job_backend, general_logger)
        self.callback.log_epoch = False
        self.callback.model = self.model
        self.callback.confusion_matrix = self.confusion_matrix

        return self.callback

    def publish(self):
        graph = self.model_to_graph(self.model)
        self.job_backend.ensure_model(self.id, self.model.to_json(),
                                      type=self.model_type, graph=graph)

    def start(self, nb_epoch=1, nb_sample=1, title="TRAINING"):
        """
        Starts a manual session with the given title.
        """
        # pass nb_epoch as a keyword argument, so it is not mistaken for setup()'s x parameter
        self.setup(nb_epoch=nb_epoch)

        self.callback.params['nb_epoch'] = nb_epoch
        self.callback.params['nb_sample'] = nb_sample
        self.callback.on_train_begin()

        return self.callback

    def batch_begin(self, batch, size):
        logs = {
            'batch': batch,
            'size': size,
        }
        # report the beginning of the batch, not its end
        self.callback.on_batch_begin(batch, logs)

    def batch_end(self, batch, size, loss=0, acc=0):
        logs = {
            'loss': loss,
            'acc': acc,
            'batch': batch,
            'size': size,
        }
        self.callback.on_batch_end(batch, logs)

    def epoch_end(self, epoch, loss=0, val_loss=0, acc=0, val_acc=0):
        """
        :type epoch: int, starting with 0
        """
        logs = {
            'loss': loss,
            'val_loss': val_loss,
            'acc': acc,
            'val_acc': val_acc,
            'epoch': epoch
        }
        self.callback.on_epoch_end(epoch, logs)

    def end(self):
        self.monitoringThread.stop()
        self.job_backend.sync_weights()
        self.job_backend.set_status('DONE')

    def model_to_graph(self, model):
        graph = {'nodes': [], 'links': [], 'groups': []}
        map = {'idx': {}, 'flatten': [], 'group_pointer': -1}

        def layer_to_dict(layer):
            info = {}

            if isinstance(layer, Dropout):
                info['dropout'] = layer.p
            if isinstance(layer, Dense):
                info['neurons'] = layer.output_dim
                info['activaton'] = layer.activation.__name__
            if isinstance(layer, Convolution2D):
                info['receptiveField'] = [layer.nb_col, layer.nb_row]
                info['features'] = layer.nb_filter
            if isinstance(layer, MaxPooling2D):
                info['poolingArea'] = [layer.pool_size[0], layer.pool_size[1]]
            if isinstance(layer, Embedding):
                info['inputDim'] = layer.input_dim
                info['outputDim'] = layer.output_dim
                info['dropout'] = layer.dropout
            if isinstance(layer, Activation):
                info['activaton'] = layer.activation.__name__
            if isinstance(layer, Merge):
                info['mode'] = layer.mode
            if isinstance(layer, RepeatVector):
                info['n'] = layer.n
            if isinstance(layer, InputLayer):
                info['inputShape'] = layer.input_shape
                info['outputShape'] = layer.output_shape

            return {
                'name': layer.name,
                'class': type(layer).__name__,
                'width': 60,
                'height': 40,
                'info': info
            }

        def add_layer(layer):
            graph['nodes'].append(layer_to_dict(layer))
            map['flatten'].append(layer)
            map['idx'][layer.name] = len(graph['nodes']) - 1
            # if map['group_pointer'] >= 0:
            #     graph['groups'][map['group_pointer']].append(len(graph['nodes']) - 1)

        def get_idx(layer):
            return map['idx'][layer.name]

        def extract_layers(layers):
            for layer in layers:
                if layer not in map['flatten']:
                    add_layer(layer)
                    if hasattr(layer, 'layers') and isinstance(layer.layers, list):
                        # graph['groups'].append([])
                        # map['group_pointer'] += 1
                        extract_layers(layer.layers)
                        # map['group_pointer'] -= 1
                    else:
                        for inbound_node in layer.inbound_nodes:
                            extract_layers(inbound_node.inbound_layers)

        extract_layers(model.layers)

        # build edges
        for layer in map['flatten']:
            for inbound_node in layer.inbound_nodes:
                for inbound_layer in inbound_node.inbound_layers:
                    graph['links'].append({
                        'source': get_idx(inbound_layer),
                        'target': get_idx(layer),
                    })

            if hasattr(layer, 'layers') and isinstance(layer.layers, list):
                graph['links'].append({
                    'source': get_idx(layer.layers[-1]),
                    'target': get_idx(layer),
                })

        return graph

    def model_to_layers(self, model):
        layers = []

        # from keras.models import Sequential
        # if isinstance(model, Sequential):
        #     for layer in model.layers:
        #         layers[]

        # 'fc': 'Dense',
        # 'conv': 'Convolutional2D',
        # 'pool': 'MaxPooling2D',
        # 'pool_average': 'AveragePooling2D',
        # 'zero_padding': 'ZeroPadding2D',
        # 'upsampling': 'UpSampling2D',
        # 'flatten': 'Flatten',
        # 'merge': 'Merge',

        layer_type_map = {
            'InputLayer': 'fc',
            'Dense': 'fc',
            'Convolution2D': 'conv',
            'MaxPooling2D': 'pool',
            'AveragePooling2D': 'pool_average',
            'ZeroPadding2D': 'zero_padding',
            'UpSampling2D': 'upsampling',
            'Flatten': 'flatten',
            'Merge': 'merge',
        }

        def get_input_layer(layer):
            # skip over Activation/Dropout wrappers to the real producing layer
            if isinstance(layer, Activation) or isinstance(layer, Dropout):
                return get_input_layer(layer.inbound_nodes[0].inbound_layers[0])
            return layer

        for keras_layer in model.layers:
            name = type(keras_layer).__name__
            if name in layer_type_map:
                typeStr = layer_type_map[name]
            else:
                typeStr = name

            layer = {
                'id': keras_layer.name,
                'name': keras_layer.name,
                'type': typeStr,
                'connectedTo': [],
                'receptiveField': {'width': 0, 'height': 0},
                'poolingArea': {'width': 0, 'height': 0},
                'padding': [],
                'features': 0,
            }

            if isinstance(keras_layer, Convolution2D):
                layer['receptiveField']['width'] = keras_layer.nb_col
                layer['receptiveField']['height'] = keras_layer.nb_row
                layer['features'] = keras_layer.nb_filter

            if isinstance(keras_layer, MaxPooling2D):
                layer['poolingArea']['width'] = keras_layer.pool_size[0]
                layer['poolingArea']['height'] = keras_layer.pool_size[1]

            if isinstance(keras_layer, InputLayer):
                if len(keras_layer.input_shape) == 4:
                    if keras_layer.input_shape[1] == 1:
                        # grayscale
                        layer['inputType'] = 'image'
                        layer['width'] = keras_layer.input_shape[2]
                        layer['height'] = keras_layer.input_shape[3]
                    elif keras_layer.input_shape[1] == 3:
                        layer['inputType'] = 'image_rgb'
                        layer['width'] = keras_layer.input_shape[2]
                        layer['height'] = keras_layer.input_shape[3]
                elif len(keras_layer.input_shape) == 2:
                    layer['inputType'] = 'list'
                    layer['width'] = keras_layer.input_shape[1]
                    layer['height'] = 1
                else:
                    layer['inputType'] = 'custom'
                    layer['shape'] = keras_layer.input_shape

            if isinstance(keras_layer, Dense):
                layer['weight'] = keras_layer.output_dim

            if isinstance(keras_layer, Dropout):
                # fold dropout into the previous layer instead of emitting a node
                layers[-1][0]['dropout'] = keras_layer.p
                continue

            if isinstance(keras_layer, Activation):
                # fold the activation into the previous layer instead of emitting a node
                activation_function = str(keras_layer.activation)
                layers[-1][0]['activationFunction'] = activation_function.split(' ')[1]
                continue

            for inbound_node in keras_layer.inbound_nodes:
                for inbound_layer in inbound_node.inbound_layers:
                    inbound_layer = get_input_layer(inbound_layer)
                    layer['connectedTo'].append(inbound_layer.name)

            layers.append([layer])

        return layers
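# A minimal usage sketch (assumes a built Keras 1.x model, since the class
# relies on the Keras 1 API such as `nb_epoch` and `inbound_nodes`; the API
# key and the data variables are placeholders):
#
# from keras.models import Sequential
# from keras.layers import Dense
#
# model = Sequential()
# model.add(Dense(10, input_dim=784, activation='softmax'))
# model.compile(optimizer='sgd', loss='categorical_crossentropy')
#
# KerasIntegration('peter/mnist-cnn', model, api_key='...')
# model.fit(x_train, y_train, nb_epoch=10)  # patched fit reports to AETROS and calls end()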
def main(self, args):
    from aetros import keras_model_utils
    import aetros.const
    from aetros.backend import JobBackend
    from aetros.logger import GeneralLogger
    from aetros.Trainer import Trainer

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        prog=aetros.const.__prog__ + ' upload-weights')
    parser.add_argument('id', nargs='?', help='model name or job id')
    parser.add_argument(
        '--secure-key',
        help="Secure key. Alternatively use the API_KEY environment variable.")
    parser.add_argument(
        '--weights',
        help="Weights path. Per default we try to find it in the ./weights/ folder.")
    parser.add_argument(
        '--accuracy',
        help="If you specified a model name, you should also specify the accuracy these weights achieved.")
    parser.add_argument(
        '--latest',
        action="store_true",
        help="Upload the latest weights instead of the best epoch.")

    parsed_args = parser.parse_args(args)

    job_backend = JobBackend(api_token=parsed_args.secure_key)

    # A model name (contains '/', no '@') means: create a new job first.
    if '/' in parsed_args.id and '@' not in parsed_args.id:
        job_backend.create(parsed_args.id)

    job_backend.load(parsed_args.id)

    if job_backend.job is None:
        raise Exception("Job not found")

    job_model = job_backend.get_job_model()

    weights_path = job_model.get_weights_filepath_best()
    if parsed_args.weights:
        weights_path = parsed_args.weights

    print("Validate weights in %s ..." % (weights_path,))

    keras_model_utils.job_prepare(job_model)

    general_logger = GeneralLogger()
    trainer = Trainer(job_backend, general_logger)

    job_model.set_input_shape(trainer)

    print("Loading model ...")
    model_provider = job_model.get_model_provider()
    model = model_provider.get_model(trainer)

    loss = model_provider.get_loss(trainer)
    optimizer = model_provider.get_optimizer(trainer)

    print("Compiling ...")
    model_provider.compile(trainer, model, loss, optimizer)

    print("Validate weights %s ..." % (weights_path,))
    job_model.load_weights(model, weights_path)
    print("Validated.")

    print("Uploading weights to %s of %s ..." % (job_backend.job_id, job_backend.model_id))
    job_backend.upload_weights(
        'best.hdf5', weights_path,
        float(parsed_args.accuracy) if parsed_args.accuracy else None)

    print("Done")