def main(self, args):
    """Run the ``start`` sub-command for a network.

    Parses ``args``, exports the Theano/Keras related environment variables
    derived from the options and then calls ``aetros.starter.start``. When no
    network name is given the help text is printed and the process exits.

    :param args: command line arguments of this sub-command (list of strings).
    """
    import os

    from aetros.starter import start

    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        prog=aetros.const.__prog__ + ' start',
    )

    # (names, keyword options) in registration order, so the help output stays the same.
    option_table = (
        (('network_name',), {'nargs': '?', 'help': 'the network name, e.g. aetros/mnist-network'}),
        (('--insights',), {'action': 'store_true', 'help': "activates insights"}),
        (('--insights-sample',), {'help': "Path or url to the sample being used to generate the insights. Default is first training sample."}),
        (('--dataset',), {'help': "Dataset id when network has placeholders"}),
        (('--gpu',), {'action': 'store_true', 'help': "Activates GPU if available"}),
        (('--device',), {'help': "Which device index should be used. Default 0 (which means with --gpu => 'gpu0')"}),
        (('--tf',), {'action': 'store_true', 'help': "Uses TensorFlow instead of Theano"}),
        (('--mp',), {'help': "Activates multithreading if available with given thread count."}),
    )
    for names, options in option_table:
        cli.add_argument(*names, **options)

    opts = cli.parse_args(args)

    if not opts.network_name:
        cli.print_help()
        sys.exit()

    # Extend whatever THEANO_FLAGS the caller already exported.
    theano_flags = os.environ.get('THEANO_FLAGS', '')

    if opts.gpu:
        # An unset/empty --device results in the plain "gpu" device.
        theano_flags += ",device=gpu" + (opts.device or '')

    if opts.mp:
        theano_flags += ",openmp=True"
        os.environ['OMP_NUM_THREADS'] = opts.mp

    os.environ['THEANO_FLAGS'] = theano_flags

    if opts.tf:
        os.environ['KERAS_BACKEND'] = 'tensorflow'

    start(
        opts.network_name,
        dataset_id=opts.dataset,
        insights=opts.insights,
        insights_sample_path=opts.insights_sample,
    )
def main(self, args):
    """Run the ``start`` sub-command: create a new job for a model or start an existing job.

    A ``name`` with exactly one ``/`` is treated as a model name: a job is created
    through ``api.create_job`` and, with ``--local``, started on this machine.
    Any other ``name`` is passed straight to ``start``.

    :param args: command line arguments of this sub-command (list of strings).
    :raises Exception: when a ``--param`` value does not contain ``=``.
    :raises api.ApiError: re-raised when job creation fails.
    """
    from aetros.starter import start

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        prog=aetros.const.__prog__ + ' start')
    parser.add_argument('name', nargs='?',
                        help='the model name, e.g. aetros/mnist-network to start new job, or job id, e.g. user/modelname/0db75a64acb74c27bd72c22e359de7a4c44a20e5 to start a pre-created job.')
    parser.add_argument('-i', '--image',
                        help="Which Docker image to use for the command. Default read in aetros.yml. If not specified, command is executed on the host.")
    parser.add_argument('-l', '--local', action='store_true',
                        help="Start the job immediately on the current machine.")
    parser.add_argument('-s', '--server', action='append',
                        help="Limits the server pool to this server. Default not limitation or read in aetros.yml. Multiple --server allowed.")
    parser.add_argument('-b', '--branch',
                        help="This overwrites the Git branch used when new job should be started.")
    parser.add_argument('--priority', help="Increases or decreases priority. Default is 0.")
    parser.add_argument('--cpu', help="How many CPU cores should be assigned to job. Docker only.")
    parser.add_argument('--memory', help="How much memory should be assigned to job. Docker only.")
    parser.add_argument('--gpu', help="How many GPU cards should be assigned to job. Docker only.")
    parser.add_argument('--gpu_memory', help="Memory requirement for the GPU. Docker only.")
    parser.add_argument('--gpu-device', action='append',
                        help="Which device id should be mapped into the NVIDIA docker container.")
    parser.add_argument('--max-time',
                        help="Limit execution time in seconds. Sends SIGINT to the process group when reached.")
    parser.add_argument('--max-epochs',
                        help="Limit execution epochs. Sends SIGINT to the process group when reached.")
    parser.add_argument('--insights', action='store_true',
                        help="activates insights. Only for simple models.")
    parser.add_argument('--dataset',
                        help="Dataset id when model has placeholders. Only for simple models with placeholders as input/output.")
    parser.add_argument('-p', '--param', action='append',
                        help="Sets a hyperparameter, example '--param name=value'. Multiple --param allowed.")

    parsed_args = parser.parse_args(args)

    # `name` is optional for argparse (nargs='?'); without this guard the
    # `.count('/')` call below would fail with an AttributeError on None.
    # parser.error() prints the usage plus message and exits with status 2.
    if not parsed_args.name:
        parser.error("no model name or job id given, e.g. 'aetros start user/model-name'")

    home_config = read_home_config()

    hyperparameter = {}
    for param in parsed_args.param or []:
        if '=' not in param:
            raise Exception('--param ' + param + ' does not contain a `=`. Please use "--param name=value"')
        # Split on the first `=` only, so values may contain `=` themselves.
        name, value = param.split('=', 1)
        hyperparameter[name] = value

    job_config = {'insights': parsed_args.insights}

    if parsed_args.image:
        job_config['image'] = parsed_args.image

    if parsed_args.branch:
        job_config['sourceGitTree'] = parsed_args.branch

    if parsed_args.max_epochs:
        job_config['maxEpochs'] = int(parsed_args.max_epochs)

    if parsed_args.max_time:
        job_config['maxTime'] = float(parsed_args.max_time)

    job_config['priority'] = float(parsed_args.priority) if parsed_args.priority else 0

    resources = job_config['resources'] = {}

    if parsed_args.server:
        job_config['servers'] = list(parsed_args.server)

    if parsed_args.cpu:
        resources['cpu'] = float(parsed_args.cpu)

    if parsed_args.memory:
        resources['memory'] = float(parsed_args.memory)

    # `--gpu 0` is a meaningful request, hence the explicit None check.
    if parsed_args.gpu is not None:
        resources['gpu'] = float(parsed_args.gpu)

    if parsed_args.gpu_memory:
        resources['gpu_memory'] = float(parsed_args.gpu_memory)

    model_name = parsed_args.name

    if model_name.count('/') != 1:
        # Not of the form "owner/model": hand the given name directly to start().
        start(self.logger, model_name, gpu_devices=parsed_args.gpu_device)
        return

    try:
        self.logger.debug("Create job ...")
        created = api.create_job(model_name, parsed_args.local, hyperparameter, parsed_args.dataset, config=job_config)
    except api.ApiError as e:
        if 'Connection refused' in e.reason:
            self.logger.error("You are offline")

        raise

    print("Job %s/%s created." % (model_name, created['id']))

    if parsed_args.local:
        start(self.logger, model_name + '/' + created['id'], gpu_devices=parsed_args.gpu_device)
    else:
        print("Open http://%s/model/%s/job/%s to monitor it." % (home_config['host'], model_name, created['id']))
def main(self, args):
    """Run the ``run`` sub-command: create a custom job from ``aetros.yml`` plus CLI overrides.

    Reads the configuration, adds the files through the ``JobBackend``, merges
    command line overrides into the job configuration and creates the job. With
    ``--local`` the job is started on this machine, otherwise it is pushed and a
    monitoring URL is printed.

    :param args: command line arguments of this sub-command (list of strings).
    :raises Exception: when a ``--param`` value does not contain ``=``.
    """
    import aetros.const

    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
                                     prog=aetros.const.__prog__ + ' run')
    parser.add_argument('command', nargs='?', help="The command to run. Default read in aetros.yml")
    parser.add_argument('-i', '--image', help="Which Docker image to use for the command. Default read in aetros.yml. If not specified, command is executed on the host.")
    parser.add_argument('-s', '--server', action='append', help="Limits the server pool to this server. Default not limitation or read in aetros.yml. Multiple --server allowed.")
    parser.add_argument('-m', '--model', help="Under which model this job should be listed. Default read in aetros.yml")
    parser.add_argument('-l', '--local', action='store_true', help="Start the job immediately on the current machine.")
    parser.add_argument('-c', '--config', help="Default aetros.yml in current working directory.")
    parser.add_argument('--priority', help="Increases or decreases priority. Default is 0.")

    parser.add_argument('--cpu', help="How many CPU cores should be assigned to job. Docker only.")
    parser.add_argument('--memory', help="How much memory should be assigned to job. Docker only.")
    parser.add_argument('--gpu', help="How many GPU cards should be assigned to job. Docker only.")
    parser.add_argument('--gpu_memory', help="Memory requirement for the GPU. Docker only.")

    parser.add_argument('--max-time', help="Limit execution time in seconds. Sends SIGINT to the process group when reached.")
    parser.add_argument('--max-epochs', help="Limit execution epochs. Sends SIGINT to the process group when reached.")

    parser.add_argument('--gpu-device', action='append', help="Which device id should be mapped into the NVIDIA docker container.")

    parser.add_argument('--volume', '-v', action='append', help="Volume into docker")
    parser.add_argument('-e', action='append', help="Sets additional environment variables. '-e name=value' to set value, or '-e name' to read from current env")

    parser.add_argument('-p', '--param', action='append', help="Sets a hyperparameter, example '--param name=value'. Multiple --param allowed.")

    parsed_args = parser.parse_args(args)

    config_path = parsed_args.config or 'aetros.yml'
    config = read_config(config_path)

    env = {}
    for item in parsed_args.e or []:
        # Split on the first `=` only: environment values commonly contain `=`.
        key, separator, value = item.partition('=')
        if not separator:
            # '-e name' form: take the value from the current environment.
            value = os.getenv(key)
        env[key] = value

    if 'command' not in config and not parsed_args.command:
        self.logger.error('No "command" given in aetros.yml or as argument.')
        sys.exit(1)

    job = JobBackend(parsed_args.model, self.logger, config_path)
    ignore = config['ignore'] if 'ignore' in config else []
    job.job = {'config': {'ignore': ignore}}

    files_added, size_added = job.add_files()
    print("%d files added (%s)" % (files_added, human_size(size_added, 2)))

    # NOTE: create_info['config'] is the very same object as `config`.
    create_info = {
        'type': 'custom',
        'config': config
    }
    job_config = create_info['config']

    incoming_hyperparameter = {}
    for param in parsed_args.param or []:
        if '=' not in param:
            raise Exception('--param ' + param + ' does not contain a `=`. Please use "--param name=value"')
        # Split on the first `=` only, so values may contain `=` themselves.
        name, value = param.split('=', 1)
        incoming_hyperparameter[name] = value

    # first transform simple format in the full definition with parameter types
    # (string, number, group, choice_group, etc)
    full_hyperparameters = lose_parameters_to_full(config['parameters'])

    # now extract hyperparameters from full definition, and overwrite stuff using
    # incoming_hyperparameter if available
    hyperparameter = extract_parameters(full_hyperparameters, incoming_hyperparameter)

    job_config['parameters'] = hyperparameter

    if parsed_args.max_epochs:
        job_config['maxEpochs'] = int(parsed_args.max_epochs)

    job_config['priority'] = float(parsed_args.priority) if parsed_args.priority else 0

    if parsed_args.max_time:
        job_config['maxTime'] = float(parsed_args.max_time)

    if parsed_args.command:
        job_config['command'] = parsed_args.command

    if parsed_args.image:
        # reset install options, since we can't make sure if the base image still fits
        if 'image' in config and config['image'] and config['image'] != parsed_args.image:
            job_config['install'] = None

        # reset dockerfile, since we specified manually an image
        job_config['dockerfile'] = None
        job_config['image'] = parsed_args.image

    if parsed_args.server:
        job_config['servers'] = list(parsed_args.server)

    if 'resources' not in job_config:
        job_config['resources'] = {}

    if parsed_args.cpu:
        job_config['resources']['cpu'] = float(parsed_args.cpu)

    if parsed_args.memory:
        job_config['resources']['memory'] = float(parsed_args.memory)

    # `--gpu 0` is a meaningful request, hence the explicit None check.
    if parsed_args.gpu is not None:
        job_config['resources']['gpu'] = float(parsed_args.gpu)

    if parsed_args.gpu_memory:
        job_config['resources']['gpu_memory'] = float(parsed_args.gpu_memory)

    if parsed_args.local:
        # usually, the aetros server would assign resources at job root level from the assigned server
        # but since it's started locally, we just use the requested one. User should know what they do.
        # start.py will use 'config' stuff anyone for docker limitation, so we should make sure it is
        # being displayed.

        # Test the image value itself; the previous check tested the (always truthy)
        # config dict, so an empty/None image was treated like a Docker job.
        if 'image' in job_config and job_config['image']:
            resources = job_config['resources']
            create_info['resources_assigned'] = {'cpus': 1, 'memory': 1, 'gpus': []}

            if 'gpu' in resources and resources['gpu'] > 0:
                # --gpu is stored as float above; list repetition needs an int.
                create_info['resources_assigned']['gpus'] = [1] * int(resources['gpu'])
            if 'cpu' in resources:
                create_info['resources_assigned']['cpus'] = resources['cpu']
            if 'memory' in resources:
                create_info['resources_assigned']['memory'] = resources['memory']
        else:
            # since this runs on the host, extract machine hardware and put int resources_assigned
            # so we see it at the job.
            pass

    if parsed_args.local:
        create_info['server'] = 'local'

    job_config['sourcesAttached'] = True

    commit_hash = aetros.utils.git.get_current_commit_hash()
    if commit_hash:
        create_info['origin_git_source'] = {
            'origin': aetros.utils.git.get_current_remote_url(),
            'author': aetros.utils.git.get_current_commit_author(),
            'message': aetros.utils.git.get_current_commit_message(),
            'branch': aetros.utils.git.get_current_branch(),
            'commit': commit_hash,
        }

    job.create(create_info=create_info, server=None)

    print("Job %s/%s created." % (job.model_name, job.job_id))

    if parsed_args.local:
        start(self.logger, job.model_name + '/' + job.job_id, fetch=False, env=env,
              volumes=parsed_args.volume, gpu_devices=parsed_args.gpu_device)
    else:
        if parsed_args.volume:
            print("Can not use volume with jobs on the cluster. Use datasets instead.")
            sys.exit(1)

        # todo, make it visible
        job.git.push()
        print("Open http://%s/model/%s/job/%s to monitor it." % (job.host, job.model_name, job.job_id))
def main(self, args):
    """Entry point of the ``start`` sub-command.

    Reads the options from ``args``, prepares ``THEANO_FLAGS``,
    ``OMP_NUM_THREADS`` and ``KERAS_BACKEND`` accordingly and finally calls
    ``aetros.starter.start`` for the requested network. Without a network name
    the help text is shown and the process exits.

    :param args: command line arguments of this sub-command (list of strings).
    """
    from aetros.starter import start

    program = aetros.const.__prog__ + ' start'
    arg_parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, prog=program)

    arg_parser.add_argument('network_name', nargs='?', help='the network name, e.g. aetros/mnist-network')
    arg_parser.add_argument('--insights', action='store_true', help="activates insights")
    arg_parser.add_argument('--insights-sample', help="Path or url to the sample being used to generate the insights. Default is first training sample.")
    arg_parser.add_argument('--dataset', help="Dataset id when network has placeholders")
    arg_parser.add_argument('--gpu', action='store_true', help="Activates GPU if available")
    arg_parser.add_argument('--device', help="Which device index should be used. Default 0 (which means with --gpu => 'gpu0')")
    arg_parser.add_argument('--tf', action='store_true', help="Uses TensorFlow instead of Theano")
    arg_parser.add_argument('--mp', help="Activates multithreading if available with given thread count.")

    options = arg_parser.parse_args(args)

    if not options.network_name:
        arg_parser.print_help()
        sys.exit()

    # Collect the flag fragments; the first one is whatever was exported already.
    flag_parts = [os.environ.get('THEANO_FLAGS', '')]

    if options.gpu:
        flag_parts.append((",device=gpu" + options.device) if options.device else ",device=gpu")

    if options.mp:
        flag_parts.append(",openmp=True")
        os.environ['OMP_NUM_THREADS'] = options.mp

    os.environ['THEANO_FLAGS'] = ''.join(flag_parts)

    if options.tf:
        os.environ['KERAS_BACKEND'] = 'tensorflow'

    start(
        options.network_name,
        dataset_id=options.dataset,
        insights=options.insights,
        insights_sample_path=options.insights_sample,
    )
def main(self, args):
    """Run the ``start`` sub-command for a model name or job id.

    Parses ``args``, exports ``THEANO_FLAGS``, ``OMP_NUM_THREADS`` and
    ``KERAS_BACKEND``, collects ``--param name:value`` hyperparameters and calls
    ``aetros.starter.start``. Without a name the help text is printed and the
    process exits.

    :param args: command line arguments of this sub-command (list of strings).
    :raises Exception: when a ``--param`` value does not contain ``:``.
    """
    from aetros.starter import start

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        prog=aetros.const.__prog__ + ' start')
    parser.add_argument('name', nargs='?',
                        help='the model name, e.g. aetros/mnist-network, or job id, e.g. 1WPXxQP0j.')
    parser.add_argument('--insights', action='store_true',
                        help="activates insights. Only for Keras models.")
    parser.add_argument('--insights-sample',
                        help="Path or url to the sample being used to generate the insights. Default is first model sample. Only for Keras models.")
    parser.add_argument('--dataset',
                        help="Dataset id when model has placeholders. Only for Keras models with placeholders as input/output.")
    parser.add_argument('--secure-key',
                        help="Secure key. Alternatively use API_KEY environment varibale.")
    parser.add_argument('--gpu', action='store_true',
                        help="Activates GPU if available. Only for Theano models.")
    parser.add_argument('--device',
                        help="Which device index should be used. Default 0 (which means with --gpu => 'gpu0'). Only for Theano models.")
    parser.add_argument('--tf', action='store_true',
                        help="Uses TensorFlow instead of Theano. Only for Keras models.")
    parser.add_argument('--mp',
                        help="Activates multithreading if available with given thread count. Only for Theano models.")
    # Parsed for the command line interface; not read anywhere in this function.
    parser.add_argument('--no-hardware-monitoring', action='store_true',
                        help="Deactivates hardware monitoring")
    parser.add_argument('--param', action='append',
                        help="Sets a hyperparameter, example '--param name:value'. Multiple --param allowed.")

    parsed_args = parser.parse_args(args)

    if not parsed_args.name:
        parser.print_help()
        sys.exit()

    # Extend whatever THEANO_FLAGS the caller already exported.
    flags = os.environ.get('THEANO_FLAGS', '')
    if parsed_args.gpu:
        if parsed_args.device:
            flags += ",device=gpu" + parsed_args.device
        else:
            flags += ",device=gpu"

    if parsed_args.mp:
        flags += ",openmp=True"
        os.environ['OMP_NUM_THREADS'] = parsed_args.mp

    os.environ['THEANO_FLAGS'] = flags

    os.environ['KERAS_BACKEND'] = 'tensorflow' if parsed_args.tf else 'theano'

    hyperparameter = {}
    for param in parsed_args.param or []:
        if ':' not in param:
            raise Exception('--param ' + param + ' does not contain a :. Please use "--param name:value"')
        # Split on the first `:` only, so values such as URLs or times stay intact.
        name, value = param.split(':', 1)
        hyperparameter[name] = value

    start(
        parsed_args.name,
        hyperparameter=hyperparameter,
        dataset_id=parsed_args.dataset,
        insights=parsed_args.insights,
        insights_sample_path=parsed_args.insights_sample,
        api_token=parsed_args.secure_key,
    )
def main(self, args):
    """Run the ``start`` sub-command for a model name or job id.

    Parses ``args``, exports ``THEANO_FLAGS``, ``OMP_NUM_THREADS`` and
    (with ``--tf`` / ``--th``) ``KERAS_BACKEND``, collects ``--param name=value``
    hyperparameters and calls ``aetros.starter.start``. Without a name the help
    text is printed and the process exits.

    :param args: command line arguments of this sub-command (list of strings).
    :raises Exception: when a ``--param`` value does not contain ``=``.
    """
    from aetros.starter import start

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        prog=aetros.const.__prog__ + ' start')
    parser.add_argument('name', nargs='?',
                        help='the model name, e.g. aetros/mnist-network, or job id, e.g. user/modelname/0db75a64acb74c27bd72c22e359de7a4c44a20e5.')
    parser.add_argument('--insights', action='store_true',
                        help="activates insights. Only for simple models.")
    parser.add_argument('--dataset',
                        help="Dataset id when model has placeholders. Only for simple models with placeholders as input/output.")
    parser.add_argument('--gpu', action='store_true',
                        help="Activates GPU if available. Only for Theano models.")
    parser.add_argument('--device',
                        help="Which device index should be used. Default 0 (which means with --gpu => 'gpu0'). Only for Theano models.")
    parser.add_argument('--tf', action='store_true',
                        help="Force TensorFlow as library. Only for simple models.")
    parser.add_argument('--th', action='store_true',
                        help="Force Theano as library. Only for simple models.")
    parser.add_argument('--mp',
                        help="Activates multithreading if available with given thread count. Only when Theano is active.")
    # Parsed for the command line interface; not read anywhere in this function.
    parser.add_argument('--no-hardware-monitoring', action='store_true',
                        help="Deactivates hardware monitoring")
    parser.add_argument('--param', action='append',
                        help="Sets a hyperparameter, example '--param name=value'. Multiple --param allowed.")

    parsed_args = parser.parse_args(args)

    if not parsed_args.name:
        parser.print_help()
        sys.exit()

    # Extend whatever THEANO_FLAGS the caller already exported.
    flags = os.environ.get('THEANO_FLAGS', '')
    if parsed_args.gpu:
        if parsed_args.device:
            flags += ",device=gpu" + parsed_args.device
        else:
            flags += ",device=gpu"

    if parsed_args.mp:
        flags += ",openmp=True"
        os.environ['OMP_NUM_THREADS'] = parsed_args.mp

    os.environ['THEANO_FLAGS'] = flags

    if parsed_args.tf:
        os.environ['KERAS_BACKEND'] = 'tensorflow'

    # Evaluated last, so --th wins when both --tf and --th are given.
    if parsed_args.th:
        os.environ['KERAS_BACKEND'] = 'theano'

    hyperparameter = {}
    for param in parsed_args.param or []:
        if '=' not in param:
            raise Exception('--param ' + param + ' does not contain a =. Please use "--param name=value"')
        # Split on the first `=` only, so values may contain `=` themselves.
        name, value = param.split('=', 1)
        hyperparameter[name] = value

    start(self.logger, parsed_args.name, hyperparameter=hyperparameter,
          dataset_id=parsed_args.dataset, insights=parsed_args.insights)
def main(self, args):
    """Run the ``start`` sub-command: start a concrete job or create a new one for a model.

    A ``name`` containing more than one ``/`` is treated as a concrete job and is
    passed directly to ``start``. Otherwise a new job is created through
    ``api.create_job`` and, with ``--local``, started on this machine.

    :param args: command line arguments of this sub-command (list of strings).
    :raises Exception: when a ``--param`` value does not contain ``=``.
    :raises api.ApiError: re-raised when job creation fails.
    """
    from aetros.starter import start

    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
                                     prog=aetros.const.__prog__ + ' start')
    parser.add_argument('name', help='the model name, e.g. aetros/mnist-network to start new job, or job id, e.g. user/modelname/0db75a64acb74c27bd72c22e359de7a4c44a20e5 to start a pre-created job.')

    parser.add_argument('-i', '--image', help="Which Docker image to use for the command. Default read in aetros.yml. If not specified, command is executed on the host.")
    parser.add_argument('-l', '--local', action='store_true', help="Start the job immediately on the current machine.")
    parser.add_argument('-s', '--server', action='append', help="Limits the server pool to this server. Default not limitation or read in aetros.yml. Multiple --server allowed.")
    parser.add_argument('-b', '--branch', help="This overwrites the Git branch used when new job should be started.")
    parser.add_argument('-c', '--config', help="Default /aetros.yml in Git root.")
    parser.add_argument('--priority', help="Increases or decreases priority. Default is 0.")

    parser.add_argument('--cpu', help="How many CPU cores should be assigned to job. Docker only.")
    parser.add_argument('--memory', help="How much memory should be assigned to job. Docker only.")
    parser.add_argument('--gpu', help="How many GPU cards should be assigned to job. Docker only.")
    parser.add_argument('--gpu_memory', help="Memory requirement for the GPU. Docker only.")

    parser.add_argument('--rebuild-image', action='store_true', help="Makes sure the Docker image is re-built without cache.")

    parser.add_argument('--gpu-device', action='append', help="Which GPU device id should be mapped into the Docker container. Only with --local.")

    parser.add_argument('--max-time', help="Limit execution time in seconds. Sends SIGINT to the process group when reached.")
    parser.add_argument('--max-epochs', help="Limit execution epochs. Sends SIGINT to the process group when reached.")

    parser.add_argument('--insights', action='store_true', help="activates insights. Only for simple models.")
    parser.add_argument('--dataset', help="Dataset id when model has placeholders. Only for simple models with placeholders as input/output.")

    parser.add_argument('-p', '--param', action='append', help="Sets a hyperparameter, example '--param name=value'. Multiple --param allowed.")

    parsed_args = parser.parse_args(args)

    if not parsed_args.name:
        print("fatal: no model defined. 'aetros start user/model-name'.")
        sys.exit(2)

    if parsed_args.name.count('/') > 1:
        # start a concrete job, used by server command
        start_options = {
            'gpu_devices': [int(x) for x in parsed_args.gpu_device or []],
        }
        # Only forward limits that were actually given: int(None) would raise a
        # TypeError. Omitted values fall back to start()'s own defaults, exactly
        # like the start() call for freshly created jobs at the end of this method.
        if parsed_args.cpu:
            start_options['cpus'] = int(parsed_args.cpu)
        if parsed_args.memory:
            start_options['memory'] = int(parsed_args.memory)

        start(self.logger, parsed_args.name, **start_options)
        return

    home_config = read_home_config()
    model_name = parsed_args.name

    # create a new job
    hyperparameter = {}
    for param in parsed_args.param or []:
        if '=' not in param:
            raise Exception('--param ' + param + ' does not contain a `=`. Please use "--param name=value"')
        # Split on the first `=` only, so values may contain `=` themselves.
        name, value = param.split('=', 1)
        hyperparameter[name] = value

    job_config = {'insights': parsed_args.insights}

    if parsed_args.image:
        job_config['image'] = parsed_args.image

    if parsed_args.branch:
        job_config['sourceGitTree'] = parsed_args.branch

    if parsed_args.max_epochs:
        job_config['maxEpochs'] = int(parsed_args.max_epochs)

    if parsed_args.max_time:
        job_config['maxTime'] = float(parsed_args.max_time)

    job_config['priority'] = float(parsed_args.priority) if parsed_args.priority else 0

    if parsed_args.rebuild_image:
        # job_config itself is the job's config (it is passed as `config=` below)
        # and has no nested 'config' key, so the flag is set at this level.
        job_config['rebuild_image'] = True

    if parsed_args.server:
        job_config['servers'] = list(parsed_args.server)

    resources = job_config['resources'] = {}

    if parsed_args.cpu:
        resources['cpu'] = int(parsed_args.cpu)

    if parsed_args.memory:
        resources['memory'] = int(parsed_args.memory)

    if parsed_args.gpu:
        resources['gpu'] = int(parsed_args.gpu)

    if parsed_args.gpu_memory:
        resources['gpu_memory'] = int(parsed_args.gpu_memory)

    config_path = parsed_args.config or 'aetros.yml'

    try:
        self.logger.debug("Create job ...")
        created = api.create_job(model_name, config_path, parsed_args.local, hyperparameter, parsed_args.dataset, config=job_config)
    except api.ApiError as e:
        if 'Connection refused' in e.error:
            self.logger.error("You are offline")

        raise

    self.logger.info("Job %s/%s created." % (model_name, created['id']))

    if parsed_args.local:
        start(self.logger, model_name + '/' + created['id'], gpu_devices=parsed_args.gpu_device)
    else:
        print("Open http://%s/model/%s/job/%s to monitor it." % (home_config['host'], model_name, created['id']))