def main(self, args):
    import aetros.const

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        prog=aetros.const.__prog__ + ' job-pull')
    parser.add_argument('id', help="Short or long job id, like ef8009d83a9892968097cec05b9467c685d45453")
    parser.add_argument('--model', help="Model name like peter/mnist. Per default from configuration.")
    parser.add_argument('-c', '--config', help="Default aetros.yml in current working directory or directories above.")

    parsed_args = parser.parse_args(args)

    if not parsed_args.id:
        parser.print_help()
        sys.exit(1)

    home_config = read_home_config()
    config = find_config(parsed_args.config)
    model = parsed_args.model if parsed_args.model else config['model']

    if not model:
        print("No model defined. Use --model or switch into a directory where you executed 'aetros init model-name'.")
        sys.exit(2)

    full_id = git_has_remote_job(home_config, model, parsed_args.id)

    if not full_id:
        print("Error: Job not found on remote.")
        sys.exit(1)

    ref = 'refs/aetros/job/' + full_id
    git_dir = os.path.normpath(home_config['storage_dir'] + '/' + model + '.git')
    git_remote_url = 'git@%s:%s.git' % (home_config['host'], model)

    if not os.path.isdir(git_dir):
        subprocess.call([home_config['git'], '--bare', 'clone', git_remote_url, git_dir])

    print('Pull job %s of %s' % (parsed_args.id, model))
    setup_git_ssh(home_config)
    subprocess.call([home_config['git'], '--bare', '--git-dir', git_dir, 'fetch', 'origin', ref + ':' + ref])
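# Illustrative invocation of the job-pull command above, assuming the CLI entry
# point is `aetros` (as suggested by the help texts); the job id is a placeholder:
#
#   aetros job-pull ef8009d83a9892968097cec05b9467c685d45453 --model peter/mnist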
def main(self, args):
    import aetros.const

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        prog=aetros.const.__prog__ + ' id')
    parsed_args = parser.parse_args(args)
    config = read_home_config()

    try:
        user = api.user()
    except KeyNotConfiguredException as e:
        self.logger.error(str(e))
        sys.exit(1)

    print("Logged in as %s (%s) on %s" % (user['username'], user['name'], config['host']))

    if len(user['accounts']) > 0:
        for orga in six.itervalues(user['accounts']):
            print(" %s of organisation %s (%s)." % (
                "Owner" if orga['memberType'] == 1 else "Member",
                orga['username'], orga['name']))
    else:
        print(" Without membership to an organisation.")
def main(self, args):
    import aetros.const

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        prog=aetros.const.__prog__ + ' model')
    parsed_args = parser.parse_args(args)

    home_config = read_home_config()
    config_path = find_config_path()

    if not config_path:
        print("No model configuration file (aetros.yml). Switch to a directory first.")
        sys.exit(1)

    config = find_config(error_on_missing=True)
    print("Model %s in %s used in all aetros commands." % (config['model'], os.path.dirname(config_path)))

    git_remote_url = 'git@%s:%s.git' % (home_config['host'], config['model'])
    print("Git url: %s" % (git_remote_url,))
def request(path, query=None, body=None, method='get', config=None):
    query = query or {}

    if isinstance(query, dict):
        query = urlencode(query)

    if '?' in path:
        path += '&' + query
    else:
        path += '?' + query

    config = read_home_config() if config is None else config

    if method == 'get' and body is not None:
        method = 'post'

    ssh_stream = create_ssh_stream(config)
    stdin, stdout, stderr = ssh_stream.exec_command('api ' + method + ' ' + simplejson.dumps(path))

    if body is not None:
        input = six.b(simplejson.dumps(body))
        stdin.write(input)
        stdin.flush()
        stdin.channel.shutdown_write()

    stdout = drain_stream(stdout)
    stderr = drain_stream(stderr)

    if len(stderr) > 0:
        if hasattr(stderr, 'decode'):
            stderr = stderr.decode('utf-8')

        raise ApiError('Could not request api: ' + config['host'] + path, stderr)

    return stdout
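# Minimal usage sketch for the SSH-backed request() helper above. The endpoint
# path and query keys are illustrative placeholders, not confirmed AETROS API routes.
def _example_request_usage():
    config = read_home_config()
    # request() runs `api get <path>` over the SSH stream and returns the drained stdout.
    raw = request('model/settings', query={'model': 'peter/mnist'}, config=config)
    return raw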
def main(self, args):
    import aetros.const

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        prog=aetros.const.__prog__ + ' job-diff')
    parser.add_argument('id_from', help="Short or long job id like ed4d6a204.")
    parser.add_argument('id_to', nargs='?', help="Short or long job id like d55df24a7 or file path")
    parser.add_argument('limit', nargs='?', help="Limit files to diff")
    parser.add_argument('--model', help="Model name like peter/mnist. Per default from configuration.")
    parser.add_argument('-c', '--config', help="Default aetros.yml in current working directory or directories above.")

    parsed_args = parser.parse_args(args)

    home_config = read_home_config()
    config = find_config(parsed_args.config)
    model = parsed_args.model if parsed_args.model else config['model']

    if not model:
        print("No model defined. Use --model or switch into a directory where you executed 'aetros init model-name'.")
        sys.exit(2)

    git_dir = os.path.normpath(home_config['storage_dir'] + '/' + model + '.git')

    id_map = {}

    for job_id in [parsed_args.id_from, parsed_args.id_to]:
        if os.path.exists(job_id):
            continue

        full_id = git_has_local_job(home_config, model, job_id)
        id_map[job_id] = full_id

        if not full_id:
            full_id = git_has_remote_job(home_config, model, job_id)
            id_map[job_id] = full_id

            if full_id:
                print("Pull job %s to local ... " % (job_id,))
                ref = 'refs/aetros/job/' + full_id
                subprocess.call([home_config['git'], '--bare', '--git-dir', git_dir, 'fetch', 'origin', ref + ':' + ref])
            else:
                print("Job %s not found." % (job_id,))
                sys.exit(2)

    print("Diff jobs %s and %s of %s." % (parsed_args.id_from, parsed_args.id_to, model))

    from_ref = 'refs/aetros/job/' + id_map[parsed_args.id_from]
    args = [home_config['git'], '--bare', '--git-dir', git_dir]

    if os.path.exists(parsed_args.id_to):
        args += ['--work-tree', os.path.abspath(parsed_args.id_to), 'diff', from_ref]
    else:
        to_ref = 'refs/aetros/job/' + id_map[parsed_args.id_to]
        args += ['diff', from_ref + '...' + to_ref]

    if parsed_args.limit:
        args += ['--', parsed_args.limit]

    subprocess.call(args)
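# Illustrative invocations of the job-diff command above (ids are placeholders
# from the help texts; the second argument may also be a local directory, and the
# optional third argument limits the diff to a path):
#
#   aetros job-diff ed4d6a204 d55df24a7
#   aetros job-diff ed4d6a204 ./my-checkout aetros.yml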
def main(self, args):
    import aetros.const

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        prog=aetros.const.__prog__ + ' job-files')
    parser.add_argument('job_id', help="Short or long job id like ed4d6a204")
    parser.add_argument('folder', nargs='?', help="Limit files list to folder. Default root ./")
    parser.add_argument('-r', action='store_true', help="Recursive files tree")
    parser.add_argument('--model', help="Model name like peter/mnist. Per default from found configuration.")
    parser.add_argument('-c', '--config', help="Default aetros.yml in current working directory or directories above.")

    parsed_args = parser.parse_args(args)

    if not parsed_args.job_id:
        parser.print_help()
        sys.exit()

    home_config = read_home_config()
    config = find_config(parsed_args.config)
    model = parsed_args.model if parsed_args.model else config['model']

    if not model:
        print("No model defined. Use --model or switch into a directory where you executed 'aetros init model-name'.")
        sys.exit(2)

    git_dir = os.path.normpath(home_config['storage_dir'] + '/' + model + '.git')

    id_map = {}

    for job_id in [parsed_args.job_id]:
        full_id = git_has_local_job(home_config, model, job_id)
        id_map[job_id] = full_id

        if not full_id:
            full_id = git_has_remote_job(home_config, model, job_id)
            id_map[job_id] = full_id

            if full_id:
                print("Pull job %s to local ... " % (job_id,))
                ref = 'refs/aetros/job/' + full_id
                subprocess.call([home_config['git'], '--bare', '--git-dir', git_dir, 'fetch', 'origin', ref + ':' + ref])
            else:
                print("Job %s not found." % (job_id,))
                sys.exit(2)

    ref = 'refs/aetros/job/' + id_map[parsed_args.job_id]

    print("List job files of %s of %s" % (parsed_args.job_id, model))

    args = [home_config['git'], '--bare', '--git-dir', git_dir, 'ls-tree', '--long']

    if parsed_args.r:
        args.append('-r')

    args.append(ref)

    if parsed_args.folder:
        args.append(parsed_args.folder)

    subprocess.call(args)
def main(self, args):
    import aetros.const

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        prog=aetros.const.__prog__ + ' jobs')
    parser.add_argument('--all', '-a', action='store_true', help="Show remote jobs as well")
    parser.add_argument('--model', help="Model name like peter/mnist. Per default from configuration.")
    parser.add_argument('-c', '--config', help="Default aetros.yml in current working directory or directories above.")

    parsed_args = parser.parse_args(args)

    home_config = read_home_config()
    config = find_config(parsed_args.config)
    model = parsed_args.model if parsed_args.model else config['model']

    if not model:
        print("No model defined. Use --model or switch into a directory where you executed 'aetros init model-name'.")
        sys.exit(2)

    print("Show jobs of model " + model + ' (' + home_config['host'] + ')')
    setup_git_ssh(home_config)
    local_job_ids = git_local_job_ids(home_config, model)

    remote_job_ids = []
    try:
        remote_job_ids = git_remote_job_ids(home_config, model)
    except:
        pass

    job_map = OrderedDict()

    for job_id in local_job_ids:
        job_map[job_id] = {
            'local': Color('{autogreen}Yes{/autogreen}'),
            'remote': Color('{autored}No{/autored}'),
        }

    for job_id in remote_job_ids:
        if job_id in job_map:
            job_map[job_id]['remote'] = Color('{autogreen}Yes{/autogreen}')
        elif parsed_args.all:
            job_map[job_id] = {
                'local': Color('{autored}No{/autored}'),
                'remote': Color('{autogreen}Yes{/autogreen}')
            }

    print("%d jobs found. (%d synced to remote)" % (len(job_map), len(remote_job_ids)))

    if not parsed_args.all:
        print("Use --all to show remote-only jobs as well.")

    table_data = [['Short Job ID', 'Local', 'Remote', 'Long Job ID']]

    for job_id, info in six.iteritems(job_map):
        table_data.append([job_id[0:9], info['local'], info['remote'], job_id])

    table = AsciiTable(table_data)
    print(table.table)
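# Illustrative output of the jobs table above (job ids are placeholders; the exact
# border style depends on the AsciiTable implementation in use):
#
#   +--------------+-------+--------+-------------------------------------------+
#   | Short Job ID | Local | Remote | Long Job ID                               |
#   +--------------+-------+--------+-------------------------------------------+
#   | ef8009d83    | Yes   | No     | ef8009d83a9892968097cec05b9467c685d45453  |
#   +--------------+-------+--------+-------------------------------------------+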
def main(self, args):
    import aetros.const

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        prog=aetros.const.__prog__ + ' job-commits')
    parser.add_argument('job_id', help="Short or long job id like ed4d6a204.")
    parser.add_argument('--model', help="Model name like peter/mnist. Per default from configuration.")
    parser.add_argument('-c', '--config', help="Default aetros.yml in current working directory or directories above.")

    parsed_args = parser.parse_args(args)

    if not parsed_args.job_id:
        parser.print_help()
        sys.exit(1)

    home_config = read_home_config()
    config = find_config(parsed_args.config)
    model = parsed_args.model if parsed_args.model else config['model']

    if not model:
        print("No model defined. Use --model or switch into a directory where you executed 'aetros init model-name'.")
        sys.exit(2)

    git_dir = os.path.normpath(home_config['storage_dir'] + '/' + model + '.git')

    id_map = {}

    for job_id in [parsed_args.job_id]:
        full_id = git_has_local_job(home_config, model, job_id)
        id_map[job_id] = full_id

        if not full_id:
            full_id = git_has_remote_job(home_config, model, job_id)
            id_map[job_id] = full_id

            if full_id:
                print("Pull job %s to local ... " % (job_id,))
                ref = 'refs/aetros/job/' + full_id
                subprocess.call([home_config['git'], '--bare', '--git-dir', git_dir, 'fetch', 'origin', ref + ':' + ref])
            else:
                print("Job %s not found." % (job_id,))
                sys.exit(2)

    ref = 'refs/aetros/job/' + id_map[parsed_args.job_id]

    args = [home_config['git'], '--bare', '--git-dir', git_dir]
    args += ['log', '--stat', ref]

    subprocess.call(args)
def http_request(path, query='', json_body=None, method='get', config=None, handle_common_errors=True):
    config = read_home_config() if config is None else config

    try:
        import urllib3
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    except Exception:
        pass

    if query is not None:
        if isinstance(query, dict):
            query = urlencode(query)

        if '?' in path:
            path += '&' + query
        else:
            path += '?' + query

    url = config['url'] + '/api/' + path

    auth = None
    if 'auth_user' in config:
        auth = HTTPBasicAuth(config['auth_user'], config['auth_pw'])

    if json_body is not None and method == 'get':
        method = 'post'

    try:
        response = requests.request(
            method, url, data=json_body, auth=auth,
            verify=config['ssl_verify'],
            headers={'Accept': 'application/json'}
        )
    except requests.exceptions.SSLError:
        if not handle_common_errors:
            raise

        print("Error: Could not connect to " + url + ". Make sure to install a valid SSL cert or disable ssl check by "
              "setting aetros home-config ssl_verify false")
        sys.exit(1)

    if response.status_code >= 400:
        raise_response_exception('Failed request ' + url, response)

    return parse_json(response.content.decode('utf-8'))
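# Minimal usage sketch for http_request() above. The path and payload keys are
# illustrative placeholders; simplejson is assumed to be imported in this module,
# as it is for request() above.
def _example_http_request_usage():
    config = read_home_config()
    # A dict query is urlencoded automatically; a non-None json_body switches the
    # default 'get' method to 'post'. The body is passed through as-is, so it is
    # serialized here before the call.
    return http_request('job/status', query={'id': 'ef8009d8'},
                        json_body=simplejson.dumps({'status': 'done'}), config=config)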
def main(self, args):
    import aetros.const

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        prog=aetros.const.__prog__ + ' run')
    parser.add_argument('name', nargs='?', help="Model name")
    parser.add_argument('--private', action='store_true', help="Make the model private. Example: aetros init my-model --private")

    home_config = read_home_config()
    parsed_args = parser.parse_args(args)

    if not parsed_args.name:
        parser.print_help()
        sys.exit(1)

    if os.path.exists('aetros.yml'):
        config = yaml.safe_load(open('aetros.yml', 'r'))
        if isinstance(config, dict) and 'model' in config:
            print("failed: aetros.yml already exists with a linked model to " + config['model'])
            sys.exit(1)

    name = api.create_model(parsed_args.name or (os.path.basename(os.getcwd())), parsed_args.private)

    with open('aetros.yml', 'w') as f:
        f.write('model: ' + name)

    print("aetros.yml created and linked with model " + name + ' in ' + os.getcwd())
    print("Open AETROS Trainer to see the model at https://" + home_config['host'] + '/model/' + name)
def main(self, args):
    import aetros.const

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        prog=aetros.const.__prog__ + ' job-push')
    parser.add_argument('id', help="Short or long job id, like ef8009d83a9892968097cec05b9467c685d45453")
    parser.add_argument('--model', help="Model name like peter/mnist. Per default from current directory")
    parser.add_argument('-c', '--config', help="Default aetros.yml in current working directory.")

    parsed_args = parser.parse_args(args)

    if not parsed_args.id:
        parser.print_help()
        sys.exit(1)

    home_config = read_home_config()
    config = find_config(parsed_args.config)
    model = parsed_args.model if parsed_args.model else config['model']

    if not model:
        print("No model defined. Use --model or switch into a directory where you executed 'aetros init model-name'.")
        sys.exit(2)

    full_id = git_has_local_job(home_config, model, parsed_args.id)

    if not full_id:
        print("Error: Job not found locally.")
        sys.exit(1)

    ref = 'refs/aetros/job/' + full_id
    git_dir = os.path.normpath(home_config['storage_dir'] + '/' + model + '.git')

    if not os.path.isdir(git_dir):
        self.logger.error("Git repository for model %s in %s not found." % (full_id, git_dir))
        self.logger.error("You seem not to have any job created on this machine for model " + model)
        sys.exit(1)

    print('Push job %s of %s' % (full_id, model))
    setup_git_ssh(home_config)
    subprocess.call([home_config['git'], '--bare', '--git-dir', git_dir, 'push', 'origin', ref])
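# For reference, jobs live as Git refs of the form refs/aetros/job/<full job id>
# inside the local bare repository <storage_dir>/<model>.git, so the push above is
# roughly equivalent to (paths and ids are placeholders; the actual storage_dir
# comes from the home configuration):
#
#   git --bare --git-dir <storage_dir>/peter/mnist.git push origin refs/aetros/job/<full job id>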
def http_request(path, query='', json_body=None, method='get', config=None):
    config = read_home_config() if config is None else config

    try:
        import urllib3
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    except Exception:
        pass

    if query is not None:
        if isinstance(query, dict):
            query = urlencode(query)

        if '?' in path:
            path += '&' + query
        else:
            path += '?' + query

    url = 'https://' + config['host'] + '/api/' + path

    auth = None
    if 'auth_user' in config:
        auth = HTTPBasicAuth(config['auth_user'], config['auth_pw'])

    if json_body is not None and method == 'get':
        method = 'post'

    response = requests.request(
        method, url, data=json_body, auth=auth,
        verify=config['ssl_verify'],
        headers={'Accept': 'application/json'})

    if response.status_code >= 400:
        raise_response_exception('Failed request ' + path, response)

    return parse_json(response.content.decode('utf-8'))
def main(self, args):
    from aetros.starter import start

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        prog=aetros.const.__prog__ + ' start')
    parser.add_argument('name', nargs='?', help='the model name, e.g. aetros/mnist-network to start a new job, or job id, e.g. user/modelname/0db75a64acb74c27bd72c22e359de7a4c44a20e5 to start a pre-created job.')
    parser.add_argument('-i', '--image', help="Which Docker image to use for the command. Default read in aetros.yml. If not specified, command is executed on the host.")
    parser.add_argument('-l', '--local', action='store_true', help="Start the job immediately on the current machine.")
    parser.add_argument('-s', '--server', action='append', help="Limits the server pool to this server. Default: no limitation, or read from aetros.yml. Multiple --server allowed.")
    parser.add_argument('-b', '--branch', help="This overwrites the Git branch used when a new job should be started.")
    parser.add_argument('--priority', help="Increases or decreases priority. Default is 0.")
    parser.add_argument('--cpu', help="How many CPU cores should be assigned to job. Docker only.")
    parser.add_argument('--memory', help="How much memory should be assigned to job. Docker only.")
    parser.add_argument('--gpu', help="How many GPU cards should be assigned to job. Docker only.")
    parser.add_argument('--gpu_memory', help="Memory requirement for the GPU. Docker only.")
    parser.add_argument('--gpu-device', action='append', help="Which device id should be mapped into the NVIDIA docker container.")
    parser.add_argument('--max-time', help="Limit execution time in seconds. Sends SIGINT to the process group when reached.")
    parser.add_argument('--max-epochs', help="Limit execution epochs. Sends SIGINT to the process group when reached.")
    parser.add_argument('--insights', action='store_true', help="Activates insights. Only for simple models.")
    parser.add_argument('--dataset', help="Dataset id when model has placeholders. Only for simple models with placeholders as input/output.")
    parser.add_argument('-p', '--param', action='append', help="Sets a hyperparameter, example '--param name=value'. Multiple --param allowed.")

    parsed_args = parser.parse_args(args)
    home_config = read_home_config()

    hyperparameter = {}
    if parsed_args.param:
        for param in parsed_args.param:
            if '=' not in param:
                raise Exception('--param ' + param + ' does not contain a `=`. Please use "--param name=value"')

            name, value = param.split('=')
            hyperparameter[name] = value

    job_config = {'insights': parsed_args.insights}

    if parsed_args.image:
        job_config['image'] = parsed_args.image

    if parsed_args.branch:
        job_config['sourceGitTree'] = parsed_args.branch

    if parsed_args.max_epochs:
        job_config['maxEpochs'] = int(parsed_args.max_epochs)

    if parsed_args.max_time:
        job_config['maxTime'] = float(parsed_args.max_time)

    job_config['priority'] = 0
    if parsed_args.priority:
        job_config['priority'] = float(parsed_args.priority)

    if 'resources' not in job_config:
        job_config['resources'] = {}

    if parsed_args.server:
        job_config['servers'] = []
        for name in parsed_args.server:
            job_config['servers'].append(name)

    if parsed_args.cpu or parsed_args.memory or parsed_args.gpu is not None or parsed_args.gpu_memory:
        if parsed_args.cpu:
            job_config['resources']['cpu'] = float(parsed_args.cpu)

        if parsed_args.memory:
            job_config['resources']['memory'] = float(parsed_args.memory)

        if parsed_args.gpu is not None:
            job_config['resources']['gpu'] = float(parsed_args.gpu)

        if parsed_args.gpu_memory:
            job_config['resources']['gpu_memory'] = float(parsed_args.gpu_memory)

    model_name = parsed_args.name

    if model_name.count('/') == 1:
        try:
            self.logger.debug("Create job ...")
            created = api.create_job(model_name, parsed_args.local, hyperparameter, parsed_args.dataset, config=job_config)
        except api.ApiError as e:
            if 'Connection refused' in e.reason:
                self.logger.error("You are offline")

            raise

        print("Job %s/%s created." % (model_name, created['id']))

        if parsed_args.local:
            start(self.logger, model_name + '/' + created['id'], gpu_devices=parsed_args.gpu_device)
        else:
            print("Open http://%s/model/%s/job/%s to monitor it." % (home_config['host'], model_name, created['id']))
    else:
        start(self.logger, model_name, gpu_devices=parsed_args.gpu_device)
def main(self, args):
    import aetros.const

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        prog=aetros.const.__prog__ + ' authenticate',
        description='Authenticates the machine with a new pair of SSH keys with a user account.')
    parsed_args = parser.parse_args(args)

    home_config = read_home_config()
    host = home_config['host']

    installed_key = get_ssh_key_for_host(host)

    key_exists_and_valid = False

    if installed_key:
        try:
            create_ssh_stream(home_config, exit_on_failure=False)
            key_exists_and_valid = True
        except Exception:
            pass

    if key_exists_and_valid:
        choice = six.moves.input("You have already configured a valid SSH key (ssh_key: " + installed_key + ") "
                                 "for " + host + ".\nWant to create a new key? (y/N): ").lower()
        if choice != 'y' and choice != 'yes':
            print("Aborted.")
            sys.exit(1)

    ssh_key = paramiko.RSAKey.generate(4096)
    ssh_key_private = ssh_key.key.private_bytes(
        serialization.Encoding.PEM,
        serialization.PrivateFormat.TraditionalOpenSSL,
        serialization.NoEncryption()).decode()
    ssh_key_public = 'rsa ' + ssh_key.get_base64()

    # hashlib.md5 needs bytes; ssh_key.__str__() returns text under Python 3
    string_key = ssh_key.__str__()
    if not isinstance(string_key, six.binary_type):
        string_key = string_key.encode('utf-8')

    fingerprint = hashlib.md5(string_key).hexdigest()
    fingerprint = ':'.join(a + b for a, b in zip(fingerprint[::2], fingerprint[1::2]))

    token = api.http_request('machine-token', None, {
        'host': socket.getfqdn(),
        'key': ssh_key_public
    })

    print("Open following link and login to confirm this machine's SSH key in your account.")
    print("Public Key Fingerprint: MD5:" + fingerprint)
    print("\n https://" + host + "/confirm-machine/" + token)
    print("\nWaiting for confirmation ...")

    while True:
        time.sleep(3)
        response = api.http_request('machine-token/authorized?id=' + token, method='post')

        if response['status'] == 'confirmed':
            print("\n" + response['username'] + ' confirmed the public key. Test with "aetros id" or "ssh git@' + host + '".')
            private_key_path = os.path.expanduser('~/.ssh/aetros_' + response['username'] + '_rsa')
            public_key_path = os.path.expanduser('~/.ssh/aetros_' + response['username'] + '_rsa.pub')

            if not os.path.exists(os.path.dirname(private_key_path)):
                os.makedirs(os.path.dirname(private_key_path))

            with open(private_key_path, 'w') as f:
                f.write(ssh_key_private)

            with open(public_key_path, 'w') as f:
                f.write(ssh_key_public)

            os.chmod(private_key_path, 0o600)
            os.chmod(public_key_path, 0o600)

            ssh_config_path = os.path.expanduser('~/.ssh/config')
            if not os.path.exists(os.path.dirname(ssh_config_path)):
                os.makedirs(os.path.dirname(ssh_config_path))

            host_section = 'host ' + host + '\n'
            identity_section = ' IdentityFile ~/.ssh/aetros_' + response['username'] + '_rsa\n'

            if os.path.exists(ssh_config_path):
                import re
                regex = re.compile(r"^host\s+" + re.escape(host) + r'\s*', re.IGNORECASE | re.MULTILINE)

                with open(ssh_config_path, 'r+') as f:
                    config = f.read()

                    if regex.match(config):
                        config = regex.sub(host_section + identity_section, config, 1)
                    else:
                        config = host_section + identity_section + config

                    f.seek(0)
                    f.write(config)
            else:
                with open(ssh_config_path, 'w') as f:
                    f.write(host_section + identity_section)

            print("Private key " + private_key_path + " installed in ~/.ssh/config for " + host + ".\n")

            user = api.user()
            print("Key installed of account %s (%s)." % (user['username'], user['name']))

            sys.exit(0)

        if response['status'] == 'expired':
            print("Token expired.")
            sys.exit(1)
def main(self, args):
    import aetros.const

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        prog=aetros.const.__prog__ + ' authenticate',
        description='Authenticates the machine with a new pair of SSH keys with a user account.')
    parsed_args = parser.parse_args(args)

    home_config = read_home_config()
    host = home_config['host']

    installed_key = get_ssh_key_for_host(host)

    key_exists_and_valid = False

    if installed_key:
        try:
            create_ssh_stream(home_config, exit_on_failure=False)
            key_exists_and_valid = True
        except Exception:
            pass

    if key_exists_and_valid:
        choice = six.moves.input("You have already configured a valid SSH key (ssh_key: " + installed_key + ") "
                                 "for " + host + ".\nWant to create a new key? The old won't be removed. (y/N): ").lower()
        if choice != 'y' and choice != 'yes':
            print("Aborted.")
            sys.exit(1)

    ssh_key = paramiko.RSAKey.generate(4096)
    ssh_key_private = ssh_key.key.private_bytes(
        serialization.Encoding.PEM,
        serialization.PrivateFormat.TraditionalOpenSSL,
        serialization.NoEncryption()
    ).decode()
    ssh_key_public = 'rsa ' + ssh_key.get_base64()

    string_key = ssh_key.__str__()
    if not isinstance(string_key, six.binary_type):
        string_key = string_key.encode('utf-8')

    md5 = hashlib.md5(string_key)
    fingerprint = md5.hexdigest()
    fingerprint = ':'.join(a + b for a, b in zip(fingerprint[::2], fingerprint[1::2]))

    try:
        token = api.http_request('machine-token', None, {
            'host': socket.getfqdn(),
            'key': ssh_key_public
        })
    except requests.exceptions.SSLError:
        sys.exit(1)

    print("Open following link and login to confirm this machine's SSH key in your account.")
    print("Public Key Fingerprint: MD5:" + fingerprint)
    print("\n " + home_config['url'] + "/confirm-machine/" + token)
    print("\nWaiting for confirmation ...")

    key_prefix = home_config['host'] + '_'

    while True:
        time.sleep(3)
        response = api.http_request('machine-token/authorized?id=' + token, method='post')

        if response['status'] == 'confirmed':
            print("\n" + response['username'] + ' confirmed the public key. Test with "aetros id" or "ssh git@' + host + '".')
            private_key_path = os.path.expanduser('~/.ssh/' + key_prefix + response['username'] + '_rsa')
            public_key_path = os.path.expanduser('~/.ssh/' + key_prefix + response['username'] + '_rsa.pub')

            if not os.path.exists(os.path.dirname(private_key_path)):
                os.makedirs(os.path.dirname(private_key_path))

            with open(private_key_path, 'w') as f:
                f.write(ssh_key_private)

            with open(public_key_path, 'w') as f:
                f.write(ssh_key_public)

            os.chmod(private_key_path, 0o600)
            os.chmod(public_key_path, 0o600)

            ssh_config_path = os.path.expanduser('~/.ssh/config')
            if not os.path.exists(os.path.dirname(ssh_config_path)):
                os.makedirs(os.path.dirname(ssh_config_path))

            host_section = 'host ' + host + '\n'
            identity_section = ' IdentityFile ~/.ssh/' + key_prefix + response['username'] + '_rsa\n'

            if os.path.exists(ssh_config_path):
                import re
                regex = re.compile(r"^host\s+" + re.escape(host) + r'\s*', re.IGNORECASE | re.MULTILINE)

                with open(ssh_config_path, 'r+') as f:
                    config = f.read()

                    if regex.match(config):
                        config = regex.sub(host_section + identity_section, config, 1)
                    else:
                        config = host_section + identity_section + config

                    f.seek(0)
                    f.write(config)
            else:
                with open(ssh_config_path, 'w') as f:
                    f.write(host_section + identity_section)

            print("Private key " + private_key_path + " installed in ~/.ssh/config for " + host + ".\n")

            user = api.user()
            print("Key installed of account %s (%s)." % (user['username'], user['name']))

            sys.exit(0)

        if response['status'] == 'expired':
            print("Token expired.")
            sys.exit(1)
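# Illustrative result of the command above: for host "trainer.aetros.com" and a
# confirmed username "peter" (both placeholders), the key is stored under
# ~/.ssh/trainer.aetros.com_peter_rsa and ~/.ssh/config gains a section like:
#
#   host trainer.aetros.com
#    IdentityFile ~/.ssh/trainer.aetros.com_peter_rsa
#
# The file name follows the key_prefix + username pattern used above.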
def main(self, args):
    import aetros.const

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        prog=aetros.const.__prog__ + ' init')
    parser.add_argument('name', help="Model name")
    parser.add_argument('directory', nargs='?', help="Directory, default in current.")
    parser.add_argument('--organisation', '-o', help="Create the model in the organisation instead of the user account.")
    parser.add_argument('--space', '-s', help="Create the model in given space. If space does not exist, create it.")
    parser.add_argument('--private', action='store_true', help="Make the model private. Example: aetros init my-model --private")
    parser.add_argument('--force', '-f', action='store_true', help="Force overwriting of already existing configuration file.")

    home_config = read_home_config()
    parsed_args = parser.parse_args(args)

    if not parsed_args.name:
        parser.print_help()
        sys.exit(1)

    path = os.getcwd()
    if parsed_args.directory:
        path = os.path.abspath(parsed_args.directory)

    if os.path.exists(path) and not os.path.isdir(path):
        sys.stderr.write('Path already exists and is not a directory: ' + path)

    if not os.path.exists(path):
        os.makedirs(path)

    yaml = ruamel.yaml.YAML()
    config = {}

    if os.path.exists(path + '/aetros.yml'):
        with open(path + '/aetros.yml', 'r') as f:
            config = yaml.load(f)

        if isinstance(config, dict) and 'model' in config and not parsed_args.force:
            print("failed: aetros.yml already exists with a linked model to " + config['model'] + '. Use -f to force.')
            sys.exit(1)

    if not parsed_args.private:
        print("Warning: creating public model. Use --private to create private models.")

    if '/' in parsed_args.name:
        sys.stderr.write('No / allowed in name. Use -o if the model should be created in an organisation.')
        sys.exit(1)

    response = api.create_model(parsed_args.name or (os.path.basename(os.getcwd())), parsed_args.organisation, parsed_args.space, parsed_args.private)

    name = response['name']

    if response['already_exists']:
        print("Notice: Model already exists remotely.")

    config['model'] = name

    with open(path + '/aetros.yml', 'w+') as f:
        yaml.dump(config, f)

    print("aetros.yml created and linked with model " + name + ' in ' + path)
    print("Open AETROS Trainer to see the model at https://" + home_config['host'] + '/model/' + name)

    git_remote_url = 'git@%s:%s.git' % (home_config['host'], name)

    print("Use git to store your source code. Each model has its own Git repository.")
    print(" $ cd " + path)
    print(" $ git init")
    print(" $ git remote add origin " + git_remote_url)
    print(" $ git add .")
    print(" $ git commit -m 'first commit'")
    print(" $ git push origin master")
def start_command(logger, job_backend, env=None, volumes=None, gpu_devices=None):
    work_tree = job_backend.git.work_tree
    home_config = read_home_config()

    if not env:
        env = {}

    if 'PYTHONPATH' not in env:
        env['PYTHONPATH'] = os.getenv('PYTHONPATH', '')

    env['PYTHONPATH'] += ':' + os.getcwd()
    env['AETROS_MODEL_NAME'] = job_backend.model_name
    env['AETROS_JOB_ID'] = str(job_backend.job_id)
    env['DEBUG'] = os.getenv('DEBUG', '')
    env['AETROS_ATTY'] = '1'
    env['AETROS_GIT'] = job_backend.git.get_base_command()

    if os.getenv('AETROS_SSH_KEY_BASE64'):
        env['AETROS_SSH_KEY_BASE64'] = os.getenv('AETROS_SSH_KEY_BASE64')
    elif get_ssh_key_for_host(home_config['host']):
        # we need to read the key into env so the docker container can connect to AETROS
        env['AETROS_SSH_KEY_BASE64'] = open(get_ssh_key_for_host(home_config['host']), 'r').read()

    job_config = job_backend.job['config']

    if 'command' not in job_config:
        job_backend.fail('No "command" given. See Configuration section in the documentation.')

    command = job_config['command']
    image = job_config['image']

    if job_backend.is_simple_model():
        if image:
            command = ['python']
        else:
            command = [sys.executable]

        command += ['-m', 'aetros', 'start-simple', job_backend.model_name + '/' + job_backend.job_id]

    if command is None:
        raise Exception('No command specified.')

    # replace {{batch_size}} parameters
    if isinstance(job_config['parameters'], dict):
        for key, value in six.iteritems(flatten_parameters(job_config['parameters'])):
            if isinstance(command, list):
                for pos, v in enumerate(command):
                    if isinstance(command[pos], six.string_types):
                        command[pos] = command[pos].replace('{{' + key + '}}', json.dumps(value))
            elif isinstance(command, six.string_types):
                command = command.replace('{{' + key + '}}', json.dumps(value))

    logger.info("Switch working directory to " + work_tree)
    os.chdir(job_backend.git.work_tree)

    docker_image_built = False

    if job_config['dockerfile'] or job_config['install']:
        dockerfile = job_config['dockerfile']

        if isinstance(dockerfile, six.string_types) and os.path.exists(dockerfile):
            pass
        else:
            if isinstance(dockerfile, six.string_types):
                dockerfile_content = dockerfile
            elif isinstance(dockerfile, list) and len(dockerfile) > 0:
                dockerfile_content = "\n".join(dockerfile)
            else:
                if image is None:
                    job_backend.fail("Image name missing, needed by `install` in aetros.yml")

                dockerfile_content = 'FROM ' + image + '\nRUN '

                if isinstance(job_config['install'], list):
                    dockerfile_content += '\n RUN '.join(job_config['install'])
                else:
                    dockerfile_content += job_config['install']

            dockerfile_content = '# CREATED BY AETROS because of "install" or "dockerfile" config in aetros.yml.\n' \
                                 + dockerfile_content

            with open('Dockerfile.aetros', 'w') as f:
                f.write(dockerfile_content)

            dockerfile = 'Dockerfile.aetros'
            job_backend.commit_file('Dockerfile.aetros')

        job_backend.set_system_info('image/dockerfile', dockerfile)

        docker_build = [
            home_config['docker'],
            'build',
            '-t', job_backend.model_name,
            '-f', dockerfile,
            '.',
        ]

        logger.info("Prepare docker image: $ " + (' '.join(docker_build)))
        job_backend.set_status('IMAGE BUILD')
        p = execute_command(args=docker_build, bufsize=1, stderr=subprocess.PIPE, stdout=subprocess.PIPE)

        if p.returncode:
            job_backend.fail('Image build error')
            sys.exit(p.returncode)

        docker_image_built = True
        image = job_backend.model_name

    docker_command = None
    if image:
        if not docker_image_built:
            logger.info("Pull docker image: $ " + image)
            job_backend.set_status('IMAGE PULL')
            execute_command(args=[home_config['docker'], 'pull', image], bufsize=1,
                            stderr=subprocess.PIPE, stdout=subprocess.PIPE)

        inspections = execute_command_stdout([home_config['docker'], 'inspect', image])
        inspections = json.loads(inspections.decode('utf-8'))

        if inspections:
            inspection = inspections[0]
            with job_backend.git.batch_commit('Docker image'):
                job_backend.set_system_info('image/id', inspection['Id'])
                job_backend.set_system_info('image/docker_version', inspection['DockerVersion'])
                job_backend.set_system_info('image/created', inspection['Created'])
                job_backend.set_system_info('image/container', inspection['Container'])
                job_backend.set_system_info('image/architecture', inspection['Architecture'])
                job_backend.set_system_info('image/os', inspection['Os'])
                job_backend.set_system_info('image/size', inspection['Size'])
                job_backend.set_system_info('image/rootfs', inspection['RootFS'])

        # make sure old container is removed
        subprocess.Popen([home_config['docker'], 'rm', job_backend.job_id], stderr=subprocess.PIPE).wait()

        docker_command = [home_config['docker'], 'run', '-t', '--name', job_backend.job_id]
        docker_command += home_config['docker_options']

        env['AETROS_GIT_WORK_DIR'] = '/job'
        docker_command += ['--mount', 'type=bind,source=' + job_backend.git.work_tree + ',destination=/job']

        env['AETROS_STORAGE_DIR'] = '/aetros'
        docker_command += ['--mount', 'type=bind,source=' + job_backend.git.git_path + ',destination=' + '/aetros/' + job_backend.model_name + '.git']

        home_config_path = os.path.expanduser('~/aetros.yml')
        if os.path.exists(home_config_path):
            env['AETROS_HOME_CONFIG_FILE'] = '/aetros/aetros.yml'
            docker_command += ['--mount', 'type=bind,source=' + home_config_path + ',destination=' + '/aetros/aetros.yml']

        docker_command += ['-w', '/job']

        # make sure the docker command receives all environment variables
        for k in six.iterkeys(env):
            docker_command += ['-e', k]

        if volumes:
            for volume in volumes:
                docker_command += ['-v', volume]

        if 'resources' in job_backend.job:
            assigned_resources = job_backend.job['resources']

            cpus = 1
            if 'cpu' in assigned_resources and assigned_resources['cpu']:
                cpus = assigned_resources['cpu']
            docker_command += ['--cpus', str(cpus)]

            memory = 1
            if 'memory' in assigned_resources and assigned_resources['memory']:
                memory = assigned_resources['memory']
            docker_command += ['--memory', str(memory * 1024 * 1024 * 1024)]

        if gpu_devices and (sys.platform == "linux" or sys.platform == "linux2"):
            # only supported on linux
            docker_command += ['--runtime', 'nvidia']
            docker_command += ['-e', 'NVIDIA_VISIBLE_DEVICES=' + (','.join(gpu_devices))]
            # support nvidia-docker1 as well
            # docker_command += ['--device', '/dev/nvidia1']

        docker_command.append(image)

        # since linux doesnt handle SIGINT when pid=1 process has no signal listener,
        # we need to make sure, we attached one to the pid=1 process
        trap = 'trapIt () { "$@"& pid="$!"; trap "kill -INT $pid" INT TERM; ' \
               'while kill -0 $pid > /dev/null 2>&1; do wait $pid; ec="$?"; done; exit $ec;};'

        if isinstance(command, list):
            command = ' '.join(command)

        docker_command += ['sh', '-c', trap + 'trapIt ' + command]
        command = docker_command

    job_backend.set_system_info('image/name', str(image))

    if not isinstance(command, list):
        command = ['sh', '-c', command]

    p = None
    exited = False
    wait_stdout = None
    wait_stderr = None

    try:
        job_backend.set_status('STARTED')
        logger.warning("$ %s " % (' '.join([json.dumps(a) for a in command])))

        command_env = os.environ.copy()
        command_env.update(env)

        # make sure maxTime limitation is correctly calculated
        job_backend.monitoring_thread.handle_max_time = True
        job_backend.monitoring_thread.handle_max_time_time = time.time()

        # Since JobBackend sends SIGINT to its current process group, it sends also to its parents when same pg.
        # We need to change the process group of the process, so this won't happen.
        # If we don't this, the master process receives the SIGINT as well.
        kwargs = {}
        if os.name == 'nt':
            kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
        else:
            kwargs['preexec_fn'] = os.setsid

        p = subprocess.Popen(args=command, bufsize=1, stderr=subprocess.PIPE, stdout=subprocess.PIPE,
                             env=command_env, **kwargs)

        wait_stdout = sys.stdout.attach(p.stdout)
        wait_stderr = sys.stderr.attach(p.stderr)

        p.wait()

        wait_stdout()
        wait_stderr()

        exited = True
        sys.exit(p.returncode)
    except SystemExit:
        # We can not send a SIGINT to the child process
        # as we don't know whether it received it already (pressing CTRL+C) or not (sending SIGINT to this process only
        # instead of to the group), so we assume it received it. A second signal would force the exit.
        # sys.__stdout__.write("SystemExit with " + str(p.returncode) + ', exited: ' + str(exited) + ", early: "+str(job_backend.in_early_stop)+"\n")

        # make sure the process dies
        if docker_command:
            # docker run does not proxy INT signals to the docker-engine,
            # so we need to do it on our own directly.
            subprocess.Popen([home_config['docker'], 'kill', '--signal', 'INT', job_backend.job_id],
                             stderr=subprocess.PIPE, stdout=subprocess.PIPE).wait()
            subprocess.Popen([home_config['docker'], 'wait', job_backend.job_id], stdout=subprocess.PIPE).wait()
        elif not exited and p and p.poll() is None:
            p.kill()  # sends SIGINT
            p.wait()

        if exited:
            if p.returncode == 0:
                job_backend.stop(progress=JOB_STATUS.PROGRESS_STATUS_DONE)
            elif p.returncode == 1:
                job_backend.stop(progress=JOB_STATUS.PROGRESS_STATUS_ABORTED)
            else:
                job_backend.stop(progress=JOB_STATUS.PROGRESS_STATUS_FAILED)
        else:
            # master received SIGINT before the actual command exited.
            if not job_backend.in_early_stop:
                # master did not receive early_stop signal (maxTime limitation)
                # if not, the master received a stop signal by server or by hand (ctrl+c), so mark as aborted
                job_backend.abort()
            else:
                # let the on_shutdown listener handle the rest
                pass
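# For reference, the shell wrapper built in start_command() above expands to
# something like the following inside the container (illustrative only;
# 'python train.py' stands in for the actual job command):
#
#   trapIt () { "$@"& pid="$!"; trap "kill -INT $pid" INT TERM;
#               while kill -0 $pid > /dev/null 2>&1; do wait $pid; ec="$?"; done; exit $ec;};
#   trapIt python train.py
#
# It installs a signal handler for the pid-1 shell, forwards INT/TERM to the
# child process and propagates the child's exit code, so SIGINT sent to the
# container actually reaches the job command.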
def start_command(logger, job_backend, env_overwrite=None, volumes=None, cpus=1, memory=1, gpu_devices=None, offline=False):
    home_config = read_home_config()

    env = {}
    if env_overwrite:
        env.update(env_overwrite)

    start_time = time.time()

    env['AETROS_MODEL_NAME'] = job_backend.model_name
    env['AETROS_JOB_ID'] = str(job_backend.job_id)
    env['AETROS_OFFLINE'] = '1' if offline else ''
    env['AETROS_GIT_INDEX_FILE'] = job_backend.git.index_path
    env['DEBUG'] = os.getenv('DEBUG', '')
    env['PYTHONUNBUFFERED'] = os.getenv('PYTHONUNBUFFERED', '1')
    env['PYTHONIOENCODING'] = os.getenv('PYTHONIOENCODING', 'UTF-8')
    env['AETROS_ATTY'] = '1'
    env['AETROS_GIT'] = job_backend.git.get_base_command()
    env['PATH'] = os.getenv('PATH', '')

    if 'PYTHONPATH' not in env:
        env['PYTHONPATH'] = os.getenv('PYTHONPATH', '')

    if os.getenv('AETROS_SSH_KEY_BASE64'):
        env['AETROS_SSH_KEY_BASE64'] = os.getenv('AETROS_SSH_KEY_BASE64')
    elif get_ssh_key_for_host(home_config['host']):
        # we need to read the key into env so the docker container can connect to AETROS
        env['AETROS_SSH_KEY_BASE64'] = open(get_ssh_key_for_host(home_config['host']), 'r').read()

    job_config = job_backend.job['config']
    job = job_backend.get_job_model()

    if 'command' not in job_config:
        job_backend.fail('No "command" given. See Configuration section in the documentation.')

    job_commands = job_config['command']
    docker_image = job_config['image']

    if job_backend.is_simple_model():
        if docker_image:
            simple_command = ['python']
        else:
            simple_command = [sys.executable]

        simple_command += ['-m', 'aetros', 'start-simple', job_backend.model_name + '/' + job_backend.job_id]
        job_commands = {'run': ' '.join(simple_command)}

    if job_commands is None:
        raise Exception('No command specified.')

    if not isinstance(job_commands, list) and not isinstance(job_commands, dict):
        job_commands = [job_commands]

    # replace {{batch_size}} parameters
    if isinstance(job_config['parameters'], dict):
        for key, value in six.iteritems(flatten_parameters(job_config['parameters'])):
            if isinstance(job_commands, list):
                for k, v in enumerate(job_commands):
                    if isinstance(job_commands[k], six.string_types):
                        job_commands[k] = job_commands[k].replace('{{' + key + '}}', simplejson.dumps(value))
            elif isinstance(job_commands, dict):
                for k, v in six.iteritems(job_commands):
                    if isinstance(job_commands[k], six.string_types):
                        job_commands[k] = job_commands[k].replace('{{' + key + '}}', simplejson.dumps(value))

    job_backend.set_system_info('commands', job_commands)

    os.chdir(job_backend.git.work_tree)

    docker_image_built = False

    if docker_image and (job_config['dockerfile'] or job_config['install']):
        rebuild_image = job_config['rebuild_image'] if 'rebuild_image' in job_config else False
        docker_image = docker_build_image(logger, home_config, job_backend, rebuild_image)
        docker_image_built = True

    job_backend.collect_device_information(gpu_devices)

    state = {'last_process': None}
    job_backend.set_system_info('processRunning', False, True)

    def pause():
        if not state['last_process'] or state['last_process'].poll() is not None:
            # no running process
            return

        if docker_image:
            if docker_pause(logger, home_config, job_backend):
                job_backend.set_paused(True)
        else:
            os.killpg(os.getpgid(state['last_process'].pid), signal.SIGSTOP)
            job_backend.set_paused(True)

    def cont():
        if not state['last_process'] or state['last_process'].poll() is not None:
            # no running process
            return

        job_backend.set_paused(False)

        if docker_image:
            docker_continue(logger, home_config, job_backend)
        else:
            os.killpg(os.getpgid(state['last_process'].pid), signal.SIGCONT)

    job_backend.on_pause = pause
    job_backend.on_continue = cont

    if docker_image:
        env['AETROS_GIT_INDEX_FILE'] = '/aetros/' + job_backend.model_name + '.git/' + os.path.basename(env['AETROS_GIT_INDEX_FILE'])

        with job_backend.git.batch_commit('JOB_SYSTEM_INFORMATION'):
            aetros_environment = {'aetros_version': __version__, 'variables': env.copy()}

            if 'AETROS_SSH_KEY' in aetros_environment['variables']:
                del aetros_environment['variables']['AETROS_SSH_KEY']

            if 'AETROS_SSH_KEY_BASE64' in aetros_environment['variables']:
                del aetros_environment['variables']['AETROS_SSH_KEY_BASE64']

            job_backend.set_system_info('environment', aetros_environment)

            job_backend.set_system_info('memory_total', memory * 1024 * 1024 * 1024)

            import cpuinfo
            cpu = cpuinfo.get_cpu_info()
            job_backend.set_system_info('cpu_name', cpu['brand'])
            job_backend.set_system_info('cpu', [cpu['hz_actual_raw'][0], cpus])

        job_backend.start_monitoring(cpu_cores=cpus, gpu_devices=gpu_devices, docker_container=job_backend.job_id)

        if not docker_image_built:
            docker_pull_image(logger, home_config, job_backend)

        docker_image_information(logger, home_config, job_backend)

        # make sure old container is removed
        subprocess.Popen([home_config['docker'], 'rm', job_backend.job_id], stderr=subprocess.PIPE).wait()

        command = docker_command_wrapper(logger, home_config, job_backend, volumes, cpus, memory, gpu_devices, env)

        # since linux doesnt handle SIGINT when pid=1 process has no signal listener,
        # we need to make sure, we attached one to the pid=1 process
        trap = 'trapIt () { "$@"& pid="$!"; trap "kill -INT $pid" INT TERM; ' \
               'while kill -0 $pid > /dev/null 2>&1; do wait $pid; ec="$?"; done; exit $ec;};'

        command.append(docker_image)
        command += ['/bin/sh', '-c', trap + 'trapIt /bin/sh /job/aetros/command.sh']
    else:
        # non-docker
        # env['PYTHONPATH'] += ':' + os.getcwd()
        job_backend.collect_system_information()
        job_backend.collect_environment(env)
        job_backend.start_monitoring(gpu_devices=gpu_devices)

        command = ['/bin/sh', job_backend.git.work_tree + '/aetros/command.sh']

    logger.debug("$ %s " % (' '.join([simplejson.dumps(a) for a in command])))

    job_backend.set_system_info('image/name', str(docker_image))

    p = None
    exited = False
    last_return_code = None
    state['last_process'] = None
    all_done = False
    command_stats = None

    def clean():
        # clear working tree
        shutil.rmtree(job_backend.git.work_tree)

    def on_force_exit():
        # make sure the process dies
        clean()

        with open(os.devnull, 'r+b', 0) as DEVNULL:
            if docker_image:
                # docker run does not proxy INT signals to the docker-engine,
                # so we need to do it on our own directly.
                subprocess.Popen(args=[home_config['docker'], 'kill', job_backend.job_id],
                                 stdout=DEVNULL, stderr=DEVNULL).wait()
            elif not exited and state['last_process'] and state['last_process'].poll() is None:
                # wait for last command
                os.killpg(os.getpgid(state['last_process'].pid), signal.SIGKILL)

    job_backend.on_force_exit = on_force_exit

    try:
        job_backend.set_status('STARTED', add_section=False)
        # logger.warning("$ %s " % (str(command),))

        # make sure maxTime limitation is correctly calculated
        job_backend.monitoring_thread.handle_max_time = True
        job_backend.monitoring_thread.handle_max_time_time = time.time()

        # Since JobBackend sends SIGINT to its current process group, it sends also to its parents when same pg.
        # We need to change the process group of the process, so this won't happen.
        # If we don't this, the master process (server command e.g.) receives the SIGINT as well.
        kwargs = {}
        if os.name == 'nt':
            kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
        else:
            kwargs['preexec_fn'] = os.setsid

        # only use full env when no image used
        command_env = env
        if not docker_image:
            command_env = os.environ.copy()
            command_env.update(env)
            if os.environ.get('LD_LIBRARY_PATH', None):
                command_env['LD_LIBRARY_PATH_ORI'] = command_env['LD_LIBRARY_PATH']

        def write_command_sh(job_command):
            f = open(job_backend.git.work_tree + '/aetros/command.sh', 'w+')

            if not docker_image:
                # new shells unset LD_LIBRARY_PATH automatically, so we make sure it will be there again
                f.write('export LD_LIBRARY_PATH=$LD_LIBRARY_PATH_ORI;\n')

            if job.get_working_dir():
                f.write('cd %s;\n' % (job.get_working_dir(),))

            f.write(job_command)
            f.close()

        def read_line(line):
            handled, filtered_line, failed = extract_api_calls(line, job_backend.handle_stdout_api, print_traceback=True, logger=logger)

            if is_debug():
                for call in handled:
                    logger.debug('STDOUT API CALL: ' + str(call))

            for fail in failed:
                logger.warning("API call failed '%s': %s %s"
                               % (str(fail['line']), str(type(fail['exception']).__name__), str(fail['exception'])))

            return filtered_line

        def exec_command(index, command, job_command):
            write_command_sh(job_command)

            working_dir = '/'
            if job.get_working_dir():
                working_dir = job.get_working_dir() + '/'

            print('%s $ %s' % (working_dir, job_command.strip()))

            args = command
            logger.debug('$ ' + ' '.join([simplejson.dumps(a) for a in args]))

            command_stats[index]['started'] = time.time() - start_time
            job_backend.set_system_info('command_stats', command_stats, True)

            # important to prefix it, otherwise name='master' would reset all stats in controller backend
            command_env['AETROS_JOB_NAME'] = 'command_' + str(index)

            state['last_process'] = subprocess.Popen(args=args, bufsize=0, stderr=subprocess.PIPE, stdout=subprocess.PIPE,
                                                     env=command_env, **kwargs)
            job_backend.set_system_info('processRunning', True, True)

            wait_stdout = sys.stdout.attach(state['last_process'].stdout, read_line=read_line)
            wait_stderr = sys.stderr.attach(state['last_process'].stderr)

            state['last_process'].wait()

            command_stats[index]['rc'] = last_return_code
            command_stats[index]['ended'] = time.time() - start_time
            job_backend.set_system_info('command_stats', command_stats, True)
            job_backend.set_system_info('processRunning', True, False)

            wait_stdout()
            wait_stderr()

            # make sure a new line is printed after a command
            print("")

            return state['last_process']

        done = 0
        total = len(job_commands)
        job_backend.set_system_info('command_stats', command_stats, True)

        if isinstance(job_commands, list):
            command_stats = [{'rc': None, 'started': None, 'ended': None} for x in job_commands]

            for k, job_command in enumerate(job_commands):
                job_backend.set_status('Command ' + str(k + 1))

                p = exec_command(k, command, job_command)
                last_return_code = p.poll()

                if last_return_code == 0:
                    done += 1
                else:
                    # one failed, so exit and don't execute next
                    break

        if isinstance(job_commands, dict):
            command_stats = {}
            for name, job_command in six.iteritems(job_commands):
                command_stats[name] = {'rc': None, 'started': None, 'ended': None}

            for name, job_command in six.iteritems(job_commands):
                job_backend.set_status('Command ' + name)

                p = exec_command(name, command, job_command)
                last_return_code = p.poll()

                if last_return_code == 0:
                    done += 1
                else:
                    # one failed, so exit and don't execute next
                    break

        all_done = done == total
        exited = True

        if state['last_process']:
            sys.exit(state['last_process'].poll())
        else:
            sys.exit(1)

    except SystemExit:
        # since we started the command in a new process group, a SIGINT or CTRL+C on this process won't affect
        # our actual command process. So we need to take care that we stop everything.
        logger.debug("SystemExit, exited=%s, all-done=%s, has-last-process=%s, pid=%s" % (
            str(exited), str(all_done),
            state['last_process'] is not None,
            state['last_process'].poll() if state['last_process'] else None))

        # make sure the process dies
        if docker_image:
            # docker run does not proxy INT signals to the docker-engine,
            # so we need to do it on our own directly.
            p = subprocess.Popen(args=[home_config['docker'], 'inspect', job_backend.job_id],
                                 stderr=subprocess.PIPE, stdout=subprocess.PIPE)
            p.wait()

            if p.poll() == 0:
                subprocess.Popen(args=[home_config['docker'], 'kill', job_backend.job_id]).wait()
        elif not exited and state['last_process'] and state['last_process'].poll() is None:
            # wait for last command
            os.killpg(os.getpgid(state['last_process'].pid), signal.SIGINT)
            state['last_process'].wait()

        if 'output' in job_config and job_config['output']:
            upload_output_files(job_backend, job_config['output'])

        if exited:
            if all_done:
                job_backend.stop(progress=JOB_STATUS.PROGRESS_STATUS_DONE)
            else:
                job_backend.stop(progress=JOB_STATUS.PROGRESS_STATUS_FAILED)
        else:
            # master received SIGINT before all job commands exited.
            if not job_backend.in_early_stop:
                # in_early_stop indicates whether we want to have a planned stop (maxTime limitation for example),
                # which should mark the job as done, not as abort().
                # if this is not set, the master received a SIGINT without early_stop, so mark as aborted.
                job_backend.abort()
            else:
                # let the on_shutdown listener handle the rest
                pass

        clean()
def main(self, args):
    import aetros.const

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        prog=aetros.const.__prog__ + ' run')
    parser.add_argument('command', nargs='?', help="The command to run. Default read in configuration file")
    parser.add_argument('-i', '--image', help="Which Docker image to use for the command. Default read in configuration file. If not specified, command is executed on the host.")
    parser.add_argument('--no-image', action='store_true', help="Forces not to use docker, even when image is defined in the configuration file.")
    parser.add_argument('-s', '--server', action='append', help="Limits the server pool to this server. Default: no limitation, or read from configuration file. Multiple --server allowed.")
    parser.add_argument('-m', '--model', help="Under which model this job should be listed. Default read in configuration file")
    parser.add_argument('-l', '--local', action='store_true', help="Start the job immediately on the current machine.")
    parser.add_argument('-c', '--config', help="Default aetros.yml in current working directory.")
    parser.add_argument('--priority', help="Increases or decreases priority. Default is 0.")
    parser.add_argument('--cpu', help="How many CPU cores should be assigned to job. Docker only.")
    parser.add_argument('--memory', help="How much memory should be assigned to job. Docker only.")
    parser.add_argument('--gpu', help="How many GPU cards should be assigned to job. Docker only.")
    parser.add_argument('--gpu_memory', help="Memory requirement for the GPU. Docker only.")
    parser.add_argument('--offline', '-o', action='store_true', help="Whether the execution should happen offline.")
    parser.add_argument('--rebuild-image', action='store_true', help="Makes sure the Docker image is re-built without cache.")
    parser.add_argument('--max-time', help="Limit execution time in seconds. Sends SIGINT to the process group when reached.")
    parser.add_argument('--max-epochs', help="Limit execution epochs. Sends SIGINT to the process group when reached.")
    parser.add_argument('--gpu-device', action='append', help="Which device id should be mapped into the NVIDIA docker container. Only when --local")
    parser.add_argument('--volume', '-v', action='append', help="Volume into docker. Only when --local")
    parser.add_argument('-e', action='append', help="Sets additional environment variables. '-e name=value' to set value, or '-e name' to read from current env")
    parser.add_argument('-p', '--param', action='append', help="Sets a hyperparameter, example '--param name=value'. Multiple --param allowed.")

    parsed_args = parser.parse_args(args)

    if parsed_args.config and not os.path.exists(parsed_args.config):
        self.logger.error("fatal: file %s does not exist." % (parsed_args.config,))
        sys.exit(2)

    config = find_config(parsed_args.config)
    home_config = read_home_config()

    if config['model'] and not parsed_args.model:
        parsed_args.model = config['model']

    if not parsed_args.model:
        print("fatal: no model defined. Use --model or switch into a directory where you executed 'aetros init model-name'.")
        sys.exit(2)

    if not parsed_args.local and parsed_args.volume:
        print("fatal: can not use volume with jobs on the cluster. Use datasets instead.")
        sys.exit(1)

    if parsed_args.local and parsed_args.priority:
        print("fatal: the priority can only be set for jobs in the cluster.")
        sys.exit(1)

    if config['image']:
        ensure_docker_installed(self.logger)

    env = {}
    if parsed_args.e:
        for item in parsed_args.e:
            if '=' in item:
                k, v = item.split('=')
            else:
                k = item
                v = os.getenv(k)

            env[k] = v

    if ('command' not in config or not config['command']) and not parsed_args.command:
        self.logger.error('No command given. Define the command in aetros.yml or use command argument.')
        sys.exit(1)

    job_backend = JobBackend(parsed_args.model, self.logger)

    ignore = []
    if 'ignore' in config:
        ignore = config['ignore']

    job_backend.job = {'config': {'ignore': ignore}}

    adding_files = loading_text("- Adding job files to index ... ")
    files_added, size_added = job_backend.add_files(config['root'], report=False)
    adding_files("done with %d file%s added (%s)."
                 % (files_added, 's' if files_added != 1 else '', human_size(size_added, 2)))

    create_info = {
        'type': 'custom',
        'config': config
    }

    incoming_hyperparameter = {}
    if parsed_args.param:
        for param in parsed_args.param:
            if '=' not in param:
                raise Exception('--param ' + param + ' does not contain a `=`. Please use "--param name=value"')

            name, value = param.split('=')
            incoming_hyperparameter[name] = value

    # first transform simple format in the full definition with parameter types
    # (string, number, group, choice_group, etc)
    full_hyperparameters = lose_parameters_to_full(config['parameters'])

    # now extract hyperparameters from full definition, and overwrite stuff using
    # incoming_hyperparameter if available
    hyperparameter = extract_parameters(full_hyperparameters, incoming_hyperparameter)

    create_info['config']['parameters'] = hyperparameter

    if parsed_args.rebuild_image:
        create_info['config']['rebuild_image'] = True

    if parsed_args.max_epochs:
        create_info['config']['maxEpochs'] = int(parsed_args.max_epochs)

    create_info['config']['priority'] = 0
    if parsed_args.priority:
        create_info['config']['priority'] = float(parsed_args.priority)

    if parsed_args.max_time:
        create_info['config']['maxTime'] = float(parsed_args.max_time)

    if parsed_args.command:
        create_info['config']['command'] = parsed_args.command

    if parsed_args.image:
        # reset install options, since we can't make sure if the base image still fits
        if 'image' in config and config['image'] and config['image'] != parsed_args.image:
            create_info['config']['install'] = None

        # reset dockerfile, since we specified manually an image
        create_info['config']['dockerfile'] = None
        create_info['config']['image'] = parsed_args.image

    if parsed_args.no_image:
        create_info['config']['image'] = None

    if parsed_args.server:
        create_info['config']['servers'] = []
        for name in parsed_args.server:
            create_info['config']['servers'].append(name)

    create_info['config']['resources'] = create_info['config'].get('resources', {})
    resources = create_info['config']['resources']

    default_cpu_and_memory = 1 if create_info['config']['image'] else 0
    resources['cpu'] = int(parsed_args.cpu or resources.get('cpu', default_cpu_and_memory))
    resources['memory'] = int(parsed_args.memory or resources.get('memory', default_cpu_and_memory))
    resources['gpu'] = int(parsed_args.gpu or resources.get('gpu', 0))
    resources['gpu_memory'] = int(parsed_args.gpu_memory or resources.get('gpu_memory', 0))

    if parsed_args.local:
        create_info['server'] = 'local'

        # make sure we do not limit the resources to something that is not available on the local machine
        warning = []
        cpu = cpuinfo.get_cpu_info()
        mem = psutil.virtual_memory().total

        gpu = 0
        try:
            gpu = len(get_ordered_devices())
        except CudaNotImplementedException:
            pass

        if not create_info['config']['image'] and not all([x == 0 for x in six.itervalues(resources)]):
            self.logger.warning("! No Docker virtualization since no `image` defined, resources limitation ignored.")

        if create_info['config']['image'] and resources['gpu'] > 0:
            if not (sys.platform == "linux" or sys.platform == "linux2"):
                self.logger.warning("! Your operating system does not support GPU allocation for "
                                    "Docker virtualization. "
                                    "NVIDIA-Docker2 is only supported on Linux.")

        local_max_resources = {'cpu': cpu['count'], 'memory': ceil(mem / 1024 / 1024 / 1024), 'gpu': gpu}

        if create_info['config']['image']:
            # read max hardware within Docker
            out = docker_call(['run', 'alpine', 'sh', '-c', 'nproc && cat /proc/meminfo | grep MemTotal'])
            cpus, memory = out.decode('utf-8').strip().split('\n')
            local_max_resources['cpu'] = int(cpus)

            memory = memory.replace('MemTotal:', '').replace('kB', '').strip()
            local_max_resources['memory'] = ceil(int(memory) / 1024 / 1024)

        if local_max_resources['cpu'] < resources['cpu']:
            warning.append('CPU cores %d -> %d' % (resources['cpu'], local_max_resources['cpu']))
            resources['cpu'] = local_max_resources['cpu']

        if local_max_resources['memory'] < resources['memory']:
            warning.append('memory %dGB -> %dGB' % (resources['memory'], local_max_resources['memory']))
            resources['memory'] = local_max_resources['memory']

        if local_max_resources['gpu'] < resources['gpu']:
            warning.append('GPU cards %d -> %d' % (resources['gpu'], local_max_resources['gpu']))
            resources['gpu'] = local_max_resources['gpu']

        if warning:
            self.logger.warning("! Resources downgrade due to missing hardware: %s." % (', '.join(warning),))

    if parsed_args.config and not create_info['config']['configPath']:
        create_info['config']['configPath'] = parsed_args.config

    create_info['config']['sourcesAttached'] = True

    creating_git_job = loading_text("- Create job in local Git ... ")
    if aetros.utils.git.get_current_commit_hash():
        create_info['origin_git_source'] = {
            'origin': aetros.utils.git.get_current_remote_url(),
            'author': aetros.utils.git.get_current_commit_author(),
            'message': aetros.utils.git.get_current_commit_message(),
            'branch': aetros.utils.git.get_current_branch(),
            'commit': aetros.utils.git.get_current_commit_hash(),
        }

    job_backend.create(create_info=create_info, server=None)
    creating_git_job("created %s in %s." % (job_backend.job_id[0:9], job_backend.model_name))

    summary = "➤ Summary: Job running "
    if parsed_args.local:
        summary += 'locally'
    else:
        summary += 'on the cluster'

    if create_info['config']['image']:
        summary += ' in Docker using image %s with %d CPU cores, %dGB memory and %d GPUs.' \
                   % (create_info['config']['image'], resources['cpu'], resources['memory'], resources['gpu'])
    else:
        summary += ' on host using all available resources.'

    print(summary)

    # tasks = []
    #
    # if 'tasks' in config:
    #     for name, task_config in six.iteritems(config['tasks']):
    #         replica = 1
    #         if 'replica' in task_config:
    #             replica = int(task_config['replica'])
    #         for index in range(0, replica):
    #             tasks.append(job_backend.create_task(job_id, task_config, name, index))

    if parsed_args.offline:
        if not parsed_args.local:
            self.logger.warning("Can not create a remote job in offline mode.")
            sys.exit(1)

        self.logger.warning("Execution started offline.")
    else:
        adding_files = loading_text("- Connecting to " + home_config['host'] + " ... ")
        if job_backend.connect():
            adding_files("connected.")
        else:
            parsed_args.offline = True
            adding_files("failed.
Continue in offline mode.") if not parsed_args.offline: sys.stdout.write("- Uploading job data ... ") job_backend.git.push() job_backend.client.wait_until_queue_empty(['files'], clear_end=False) sys.stdout.write(" done.\n") link = "%smodel/%s/job/%s" % (home_config['url'], job_backend.model_name, job_backend.job_id) sys.__stdout__.write(u"➤ Monitor job at %s\n" % (link)) if parsed_args.local: job_backend.start(collect_system=False, offline=parsed_args.offline, push=False) if not parsed_args.offline: job_backend.git.start_push_sync() cpus = create_info['config']['resources']['cpu'] memory = create_info['config']['resources']['memory'] if not parsed_args.gpu_device and create_info['config']['resources']['gpu'] > 0: # if requested 2 GPUs and we have 3 GPUs with id [0,1,2], gpus should be [0,1] parsed_args.gpu_device = [] for i in range(0, create_info['config']['resources']['gpu']): parsed_args.gpu_device.append(i) start_command(self.logger, job_backend, env, parsed_args.volume, cpus=cpus, memory=memory, gpu_devices=parsed_args.gpu_device, offline=parsed_args.offline)
def main(self, args):
    import aetros.const

    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
                                     prog=aetros.const.__prog__ + ' job-checkout')
    parser.add_argument('job_id', help="Short or long job id like ed4d6a204")
    parser.add_argument('file', nargs='*', help="Checkout only one file.")
    parser.add_argument('--target', '-t',
                        help="Target directory where job files (or a single file) should be saved. Default current folder")
    parser.add_argument('--overwrite', '-p', help="Overwrite existing files.")
    parser.add_argument('--model', help="Model name like peter/mnist. Per default from current directory")
    parser.add_argument('-c', '--config', help="Default aetros.yml in current working directory.")

    parsed_args = parser.parse_args(args)

    if not parsed_args.job_id:
        parser.print_help()
        sys.exit()

    home_config = read_home_config()
    config = find_config(parsed_args.config)
    model = parsed_args.model if parsed_args.model else config['model']

    if not model:
        print("No model defined. Use --model or switch into a directory where you executed 'aetros init model-name'.")
        sys.exit(2)

    target = os.path.normpath(os.path.abspath(parsed_args.target if parsed_args.target else os.getcwd()))
    git_dir = os.path.normpath(home_config['storage_dir'] + '/' + model + '.git')

    if not os.path.exists(target):
        os.makedirs(target)

    id_map = {}
    for job_id in [parsed_args.job_id]:
        full_id = git_has_local_job(home_config, model, job_id)
        id_map[job_id] = full_id

        if not full_id:
            full_id = git_has_remote_job(home_config, model, job_id)
            id_map[job_id] = full_id

            if full_id:
                print("Pull job %s to local ... " % (job_id,))
                ref = 'refs/aetros/job/' + full_id
                subprocess.call([home_config['git'], '--bare', '--git-dir', git_dir, 'fetch', 'origin', ref + ':' + ref])
            else:
                print("Job %s not found." % (job_id,))
                sys.exit(2)

    ref = 'refs/aetros/job/' + id_map[parsed_args.job_id]

    if not parsed_args.file:
        print("Checkout all job files %s %s into %s ... " % (model, id_map[parsed_args.job_id], target))
    else:
        print("Checkout job files %s %s into %s ... " % (model, id_map[parsed_args.job_id], target))

    paths = parsed_args.file if parsed_args.file else ['.']
    subprocess.call(
        [home_config['git'], '--bare', '--git-dir', git_dir, '--work-tree', target, 'checkout', ref, '--'] + paths
    )
def main(self, args): import aetros.const parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, prog=aetros.const.__prog__ + ' server') parser.add_argument('name', nargs='?', help="Server name") parser.add_argument('--generate-ssh-key', help="Generates automatically a ssh key, register them in AETROS in " "your account, and delete them when the server exits. " "You should prefer 'aetros register' command as its safer.") parser.add_argument('--allow-host-execution', action='store_true', help="Whether a job can run on this server " "directly, without a virtual (docker) container.\nSecurity risk and makes resource limitation useless.") parser.add_argument('--max-memory', help="How many RAM is available. In gigabyte. Per default all available memory.") parser.add_argument('--max-cpus', help="How many cores are available. Per default all available CPU cores.") parser.add_argument('--max-gpus', help="How many GPUs are available. Comma separate list of device ids (pciBusId)." "Per default all available GPU cards. Use 'aetros gpu' too see the ids.") parser.add_argument('--no-gpus', action='store_true', help="Disable all GPUs") parser.add_argument('--max-jobs', help="How many jobs are allowed to run in total until the process exists automatically.") parser.add_argument('--host', help="Default trainer.aetros.com. Read from the global configuration ~/aetros.yml.") parser.add_argument('--show-stdout', action='store_true', help="Show all stdout of all jobs. Only for debugging necessary.") parsed_args = parser.parse_args(args) if not parsed_args.name: parser.print_help() sys.exit() self.config = read_home_config() if parsed_args.max_jobs: self.max_jobs = int(parsed_args.max_jobs) if parsed_args.max_memory: self.resources_limit['memory'] = int(parsed_args.max_memory) if parsed_args.max_cpus: self.resources_limit['cpus'] = int(parsed_args.max_cpus) self.resources_limit['host_execution'] = parsed_args.allow_host_execution gpus = [] try: gpus = aetros.cuda_gpu.get_ordered_devices() for i in range(len(gpus)): self.enabled_gpus.append(i) except Exception: pass if parsed_args.max_gpus: self.enabled_gpus = [] for i in parsed_args.max_gpus.split(','): i = int(i) if i < 0 or i >= len(gpus): raise Exception('--max-gpus ' + str(i) + ' not available on the system. 
GPUs ' + str([i for i in range(len(gpus))])+ ' detected.') self.enabled_gpus.append(i) elif parsed_args.no_gpus: self.enabled_gpus = [] if parsed_args.show_stdout: self.show_stdout = True event_listener = EventListener() event_listener.on('registration', self.registration_complete) event_listener.on('failed', self.connection_failed) event_listener.on('jobs', self.sync_jobs) event_listener.on('close', self.on_client_close) if hasattr(signal, 'SIGUSR1'): signal.signal(signal.SIGUSR1, self.on_signusr1) ssh_key_registered = False if parsed_args.generate_ssh_key: self.logger.info('Generate SSH key') ssh_key = paramiko.RSAKey.generate(4096) self.ssh_key_private = ssh_key.key.private_bytes( serialization.Encoding.PEM, serialization.PrivateFormat.TraditionalOpenSSL, serialization.NoEncryption() ).decode() self.ssh_key_public = 'rsa ' + ssh_key.get_base64() + ' ' + parsed_args.name self.logger.info('Register SSH key at ' + self.config['host']) data = { 'name': parsed_args.name, 'secure_key': parsed_args.generate_ssh_key, 'key': self.ssh_key_public, } response = aetros.api.http_request('server/ssh-key', json_body=data, method='post') ssh_key_registered = response == True def delete_ssh_key(): self.logger.info('Delete SSH key at ' + self.config['host']) data = { 'secure_key': parsed_args.generate_ssh_key, 'key': self.ssh_key_public, } response = aetros.api.http_request('server/ssh-key/delete', json_body=data) if not response: self.logger.error('Could not delete SSH key in AETROS Trainer.') if parsed_args.generate_ssh_key and ssh_key_registered: import atexit atexit.register(delete_ssh_key) if parsed_args.host: self.config['host'] = parsed_args.host if self.ssh_key_private: self.config['ssh_key_base64'] = self.ssh_key_private self.server = ServerClient(self.config, event_listener, self.logger) self.general_logger_stdout = GeneralLogger(job_backend=self, redirect_to=sys.__stdout__) self.general_logger_stderr = GeneralLogger(job_backend=self, redirect_to=sys.__stderr__) sys.stdout = self.general_logger_stdout sys.stderr = self.general_logger_stderr self.server.configure(parsed_args.name) self.logger.debug('Connecting to ' + self.config['host']) self.server.start() self.write_log("\n") try: while self.active: if self.registered: self.server.send_message({'type': 'utilization', 'values': self.collect_system_utilization()}) self.check_finished_jobs() time.sleep(1) except KeyboardInterrupt: self.logger.warning('Aborted') self.stop()
def main(self, args):
    import aetros.const

    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
                                     prog=aetros.const.__prog__ + ' job-files')
    parser.add_argument('job_id', help="Short or long job id like ed4d6a204")
    parser.add_argument('folder', nargs='?', help="Limit files list to folder. Default root ./")
    parser.add_argument('-r', action='store_true', help="Recursive files tree")
    parser.add_argument('--model', help="Model name like peter/mnist. Per default from found configuration.")
    parser.add_argument('-c', '--config', help="Default aetros.yml in current working directory or directories above.")

    parsed_args = parser.parse_args(args)

    if not parsed_args.job_id:
        parser.print_help()
        sys.exit()

    home_config = read_home_config()
    config = find_config(parsed_args.config)
    model = parsed_args.model if parsed_args.model else config['model']

    if not model:
        print("No model defined. Use --model or switch into a directory where you executed 'aetros init model-name'.")
        sys.exit(2)

    git_dir = os.path.normpath(home_config['storage_dir'] + '/' + model + '.git')

    id_map = {}
    for job_id in [parsed_args.job_id]:
        full_id = git_has_local_job(home_config, model, job_id)
        id_map[job_id] = full_id

        if not full_id:
            full_id = git_has_remote_job(home_config, model, job_id)
            id_map[job_id] = full_id

            if full_id:
                print("Pull job %s to local ... " % (job_id,))
                ref = 'refs/aetros/job/' + full_id
                subprocess.call([home_config['git'], '--bare', '--git-dir', git_dir, 'fetch', 'origin', ref + ':' + ref])
            else:
                print("Job %s not found." % (job_id,))
                sys.exit(2)

    ref = 'refs/aetros/job/' + id_map[parsed_args.job_id]

    print("List job files of %s of %s" % (parsed_args.job_id, model))

    args = [home_config['git'], '--bare', '--git-dir', git_dir, 'ls-tree', '--long']
    if parsed_args.r:
        args.append('-r')

    args.append(ref)

    if parsed_args.folder:
        args.append(parsed_args.folder)

    subprocess.call(args)
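# Note (illustrative, not part of the CLI above): the command prints raw
# `git ls-tree --long` lines. A hedged sketch of how such a line could be
# split into its fields; the example line below is hypothetical and follows
# the format "<mode> <type> <sha> <size>\t<path>" (size is '-' for trees).
def parse_ls_tree_long_line(line):
    meta, path = line.split('\t', 1)
    mode, object_type, sha, size = meta.split()
    return {'mode': mode, 'type': object_type, 'sha': sha, 'size': size, 'path': path}

# Example usage with the well-known empty-blob sha:
# parse_ls_tree_long_line("100644 blob e69de29bb2d1d6434b8b29ae775ad8c2e48c5391       0\taetros.yml")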
def main(self, args): import aetros.const parser = argparse.ArgumentParser( formatter_class=argparse.RawTextHelpFormatter, prog=aetros.const.__prog__ + ' run') parser.add_argument( 'command', nargs='?', help="The command to run. Default read in configuration file") parser.add_argument( '-i', '--image', help= "Which Docker image to use for the command. Default read in configuration file. If not specified, command is executed on the host." ) parser.add_argument( '--no-image', action='store_true', help= "Forces not to use docker, even when image is defined in the configuration file." ) parser.add_argument( '-s', '--server', action='append', help= "Limits the server pool to this server. Default not limitation or read in configuration file. Multiple --server allowed." ) parser.add_argument( '-m', '--model', help= "Under which model this job should be listed. Default read in configuration file" ) parser.add_argument( '-l', '--local', action='store_true', help="Start the job immediately on the current machine.") parser.add_argument( '-c', '--config', help="Default aetros.yml in current working directory.") parser.add_argument( '--priority', help="Increases or decreases priority. Default is 0.") parser.add_argument( '--cpu', help="How many CPU cores should be assigned to job. Docker only.") parser.add_argument( '--memory', help="How much memory should be assigned to job. Docker only.") parser.add_argument( '--gpu', help="How many GPU cards should be assigned to job. Docker only.") parser.add_argument( '--gpu_memory', help="Memory requirement for the GPU. Docker only.") parser.add_argument( '--offline', '-o', action='store_true', help="Whether the execution should happen offline.") parser.add_argument( '--rebuild-image', action='store_true', help="Makes sure the Docker image is re-built without cache.") parser.add_argument( '--max-time', help= "Limit execution time in seconds. Sends SIGINT to the process group when reached." ) parser.add_argument( '--max-epochs', help= "Limit execution epochs. Sends SIGINT to the process group when reached." ) parser.add_argument( '--gpu-device', action='append', help= "Which device id should be mapped into the NVIDIA docker container. Only when --local" ) parser.add_argument('--volume', '-v', action='append', help="Volume into docker. Only when --local") parser.add_argument( '-e', action='append', help= "Sets additional environment variables. '-e name=value' to set value, or '-e name' to read from current env" ) parser.add_argument( '-p', '--param', action='append', help= "Sets a hyperparameter, example '--param name=value'. Multiple --param allowed." ) parsed_args = parser.parse_args(args) if parsed_args.config and not os.path.exists(parsed_args.config): self.logger.error("fatal: file %s does not exist." % (parsed_args.config, )) sys.exit(2) config = find_config(parsed_args.config) home_config = read_home_config() if config['model'] and not parsed_args.model: parsed_args.model = config['model'] if not parsed_args.model: print( "fatal: no model defined. Use --model or switch into a directory where you executed 'aetros init model-name'." ) sys.exit(2) if not parsed_args.local and parsed_args.volume: print( "fatal: can not use volume with jobs on the cluster. Use datasets instead." 
) sys.exit(1) if parsed_args.local and parsed_args.priority: print( "fatal: the priority can only be set for jobs in the cluster.") sys.exit(1) if config['image']: ensure_docker_installed(self.logger) env = {} if parsed_args.e: for item in parsed_args.e: if '=' in item: k, v = item.split('=') else: k = item v = os.getenv(k) env[k] = v if ('command' not in config or not config['command']) and not parsed_args.command: self.logger.error( 'No command given. Define the command in aetros.yml or use command argument.' ) sys.exit(1) job_backend = JobBackend(parsed_args.model, self.logger) ignore = [] if 'ignore' in config: ignore = config['ignore'] job_backend.job = {'config': {'ignore': ignore}} adding_files = loading_text("- Adding job files to index ... ") files_added, size_added = job_backend.add_files(config['root'], report=False) adding_files("done with %d file%s added (%s)." % (files_added, 's' if files_added != 1 else '', human_size(size_added, 2))) create_info = {'type': 'custom', 'config': config} incoming_hyperparameter = {} if parsed_args.param: for param in parsed_args.param: if '=' not in param: raise Exception( '--param ' + param + ' does not contain a `=`. Please use "--param name=value"' ) name, value = param.split('=') incoming_hyperparameter[name] = value # first transform simple format in the full definition with parameter types # (string, number, group, choice_group, etc) full_hyperparameters = lose_parameters_to_full(config['parameters']) # now extract hyperparameters from full definition, and overwrite stuff using # incoming_hyperparameter if available hyperparameter = extract_parameters(full_hyperparameters, incoming_hyperparameter) create_info['config']['parameters'] = hyperparameter if parsed_args.rebuild_image: create_info['config']['rebuild_image'] = True if parsed_args.max_epochs: create_info['config']['maxEpochs'] = int(parsed_args.max_epochs) create_info['config']['priority'] = 0 if parsed_args.priority: create_info['config']['priority'] = float(parsed_args.priority) if parsed_args.max_time: create_info['config']['maxTime'] = float(parsed_args.max_time) if parsed_args.command: create_info['config']['command'] = parsed_args.command if parsed_args.image: # reset install options, since we can't make sure if the base image still fits if 'image' in config and config[ 'image'] and config['image'] != parsed_args.image: create_info['config']['install'] = None # reset dockerfile, since we specified manually an image create_info['config']['dockerfile'] = None create_info['config']['image'] = parsed_args.image if parsed_args.no_image: create_info['config']['image'] = None if parsed_args.server: create_info['config']['servers'] = [] for name in parsed_args.server: create_info['config']['servers'].append(name) create_info['config']['resources'] = create_info['config'].get( 'resources', {}) resources = create_info['config']['resources'] default_cpu_and_memory = 1 if create_info['config']['image'] else 0 resources['cpu'] = int(parsed_args.cpu or resources.get('cpu', default_cpu_and_memory)) resources['memory'] = int( parsed_args.memory or resources.get('memory', default_cpu_and_memory)) resources['gpu'] = int(parsed_args.gpu or resources.get('gpu', 0)) resources['gpu_memory'] = int(parsed_args.gpu_memory or resources.get('gpu_memory', 0)) if parsed_args.local: create_info['server'] = 'local' # make sure we do not limit the resources to something that is not available on the local machine warning = [] cpu = cpuinfo.get_cpu_info() mem = psutil.virtual_memory().total gpu = 0 try: gpu 
= len(get_ordered_devices()) except CudaNotImplementedException: pass if not create_info['config']['image'] and not all( [x == 0 for x in six.itervalues(resources)]): self.logger.warning( "! No Docker virtualization since no `image` defined, resources limitation ignored." ) if create_info['config']['image'] and resources['gpu'] > 0: if not (sys.platform == "linux" or sys.platform == "linux2"): self.logger.warning( "! Your operating system does not support GPU allocation for " "Docker virtualization. " "NVIDIA-Docker2 is only supported on Linux.") local_max_resources = { 'cpu': cpu['count'], 'memory': ceil(mem / 1024 / 1024 / 1024), 'gpu': gpu } if create_info['config']['image']: # read max hardware within Docker out = docker_call([ 'run', 'alpine', 'sh', '-c', 'nproc && cat /proc/meminfo | grep MemTotal' ]) cpus, memory = out.decode('utf-8').strip().split('\n') local_max_resources['cpu'] = int(cpus) memory = memory.replace('MemTotal:', '').replace('kB', '').strip() local_max_resources['memory'] = ceil(int(memory) / 1024 / 1024) if local_max_resources['cpu'] < resources['cpu']: warning.append('CPU cores %d -> %d' % (resources['cpu'], local_max_resources['cpu'])) resources['cpu'] = local_max_resources['cpu'] if local_max_resources['memory'] < resources['memory']: warning.append( 'memory %dGB -> %dGB' % (resources['memory'], local_max_resources['memory'])) resources['memory'] = local_max_resources['memory'] if local_max_resources['gpu'] < resources['gpu']: warning.append('GPU cards %d -> %d' % (resources['gpu'], local_max_resources['gpu'])) resources['gpu'] = local_max_resources['gpu'] if warning: self.logger.warning( "! Resources downgrade due to missing hardware: %s." % (', '.join(warning), )) if parsed_args.config and not create_info['config']['configPath']: create_info['config']['configPath'] = parsed_args.config create_info['config']['sourcesAttached'] = True creating_git_job = loading_text("- Create job in local Git ... ") if aetros.utils.git.get_current_commit_hash(): create_info['origin_git_source'] = { 'origin': aetros.utils.git.get_current_remote_url(), 'author': aetros.utils.git.get_current_commit_author(), 'message': aetros.utils.git.get_current_commit_message(), 'branch': aetros.utils.git.get_current_branch(), 'commit': aetros.utils.git.get_current_commit_hash(), } job_backend.create(create_info=create_info, server=None) creating_git_job("created %s in %s." % (job_backend.job_id[0:9], job_backend.model_name)) summary = "➤ Summary: Job running " if parsed_args.local: summary += 'locally' else: summary += 'on the cluster' if create_info['config']['image']: summary += ' in Docker using image %s with %d CPU cores, %dGB memory and %d GPUs.' \ % (create_info['config']['image'], resources['cpu'], resources['memory'], resources['gpu']) else: summary += ' on host using all available resources.' print(summary) # tasks = [] # # if 'tasks' in config: # for name, task_config in six.iteritems(config['tasks']): # replica = 1 # if 'replica' in task_config: # replica = int(task_config['replica']) # for index in range(0, replica): # tasks.append(job_backend.create_task(job_id, task_config, name, index)) if parsed_args.offline: if not parsed_args.local: self.logger.warning( "Can not create a remote job in offline mode.") sys.exit(1) self.logger.warning("Execution started offline.") else: adding_files = loading_text("- Connecting to " + home_config['host'] + " ... ") if job_backend.connect(): adding_files("connected.") else: parsed_args.offline = True adding_files("failed. 
Continue in offline mode.") if not parsed_args.offline: sys.stdout.write("- Uploading job data ... ") job_backend.git.push() job_backend.client.wait_until_queue_empty(['files'], clear_end=False) sys.stdout.write(" done.\n") link = "%s/model/%s/job/%s" % ( home_config['url'], job_backend.model_name, job_backend.job_id) sys.__stdout__.write(u"➤ Monitor job at %s\n" % (link)) if parsed_args.local: job_backend.start(collect_system=False, offline=parsed_args.offline, push=False) if not parsed_args.offline: job_backend.git.start_push_sync() cpus = create_info['config']['resources']['cpu'] memory = create_info['config']['resources']['memory'] if not parsed_args.gpu_device and create_info['config'][ 'resources']['gpu'] > 0: # if requested 2 GPUs and we have 3 GPUs with id [0,1,2], gpus should be [0,1] parsed_args.gpu_device = [] for i in range(0, create_info['config']['resources']['gpu']): parsed_args.gpu_device.append(i) start_command(self.logger, job_backend, env, parsed_args.volume, cpus=cpus, memory=memory, gpu_devices=parsed_args.gpu_device, offline=parsed_args.offline)
def main(self, args):
    import aetros.const

    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
                                     prog=aetros.const.__prog__ + ' init')
    parser.add_argument('name', help="Model name")
    parser.add_argument('directory', nargs='?', help="Directory, default in current.")
    parser.add_argument('--organisation', '-o', help="Create the model in the organisation instead of the user account.")
    parser.add_argument('--space', '-s', help="Create the model in given space. If space does not exist, create it.")
    parser.add_argument('--private', action='store_true', help="Make the model private. Example: aetros init my-model --private")
    parser.add_argument('--force', '-f', action='store_true', help="Force overwriting of an already existing configuration file.")

    home_config = read_home_config()
    parsed_args = parser.parse_args(args)

    if not parsed_args.name:
        parser.print_help()
        sys.exit(1)

    path = os.getcwd()
    if parsed_args.directory:
        path = os.path.abspath(parsed_args.directory)

    if os.path.exists(path) and not os.path.isdir(path):
        sys.stderr.write('Path already exists and is not a directory: ' + path)
        sys.exit(1)

    if not os.path.exists(path):
        os.makedirs(path)

    yaml = ruamel.yaml.YAML()
    config = {}

    if os.path.exists(path + '/aetros.yml'):
        with open(path + '/aetros.yml', 'r') as f:
            config = yaml.load(f)

    if isinstance(config, dict) and 'model' in config and not parsed_args.force:
        print("failed: aetros.yml already exists with a link to model " + config['model'] + '. Use -f to force.')
        sys.exit(1)

    if not parsed_args.private:
        print("Warning: creating public model. Use --private to create private models.")

    if '/' in parsed_args.name:
        sys.stderr.write('No / allowed in name. Use -o if the model should be created in an organisation.')
        sys.exit(1)

    response = api.create_model(parsed_args.name or os.path.basename(os.getcwd()), parsed_args.organisation,
                                parsed_args.space, parsed_args.private)
    name = response['name']

    if response['already_exists']:
        print("Notice: Model already exists remotely.")

    config['model'] = name

    with open(path + '/aetros.yml', 'w+') as f:
        yaml.dump(config, f)

    print("aetros.yml created and linked with model " + name + ' in ' + path)
    print("Open AETROS Trainer to see the model at https://" + home_config['host'] + '/model/' + name)

    git_remote_url = 'git@%s:%s.git' % (home_config['host'], name)

    print("Use git to store your source code. Each model has its own Git repository.")
    print(" $ cd " + path)
    print(" $ git init")
    print(" $ git remote add origin " + git_remote_url)
    print(" $ git add .")
    print(" $ git commit -m 'first commit'")
    print(" $ git push origin master")
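# Illustration only (assumption: the generated aetros.yml contains at least the
# 'model' key written above; "peter/mnist" is a hypothetical model name). This
# shows the round-trip ruamel.yaml produces for such a minimal file.
import io
import ruamel.yaml

_yaml = ruamel.yaml.YAML()
_buf = io.StringIO()
_yaml.dump({'model': 'peter/mnist'}, _buf)
print(_buf.getvalue())  # -> "model: peter/mnist"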
def start_command(logger, job_backend, env_overwrite=None, volumes=None, cpus=1, memory=1, gpu_devices=None, offline=False): home_config = read_home_config() env = {} if env_overwrite: env.update(env_overwrite) start_time = time.time() env['AETROS_MODEL_NAME'] = job_backend.model_name env['AETROS_JOB_ID'] = str(job_backend.job_id) env['AETROS_OFFLINE'] = '1' if offline else '' env['AETROS_GIT_INDEX_FILE'] = job_backend.git.index_path env['DEBUG'] = os.getenv('DEBUG', '') env['PYTHONUNBUFFERED'] = os.getenv('PYTHONUNBUFFERED', '1') env['PYTHONIOENCODING'] = os.getenv('PYTHONIOENCODING', 'UTF-8') env['AETROS_ATTY'] = '1' env['AETROS_GIT'] = job_backend.git.get_base_command() env['PATH'] = os.getenv('PATH', '') if 'PYTHONPATH' not in env: env['PYTHONPATH'] = os.getenv('PYTHONPATH', '') if os.getenv('AETROS_SSH_KEY_BASE64'): env['AETROS_SSH_KEY_BASE64'] = os.getenv('AETROS_SSH_KEY_BASE64') elif get_ssh_key_for_host(home_config['host']): # we need to read the key into env so the docker container can connect to AETROS env['AETROS_SSH_KEY_BASE64'] = open(get_ssh_key_for_host(home_config['host']), 'r').read() job_config = job_backend.job['config'] job = job_backend.get_job_model() if 'command' not in job_config: job_backend.fail('No "command" given. See Configuration section in the documentation.') job_commands = job_config['command'] docker_image = job_config['image'] if job_backend.is_simple_model(): if docker_image: simple_command = ['python'] else: simple_command = [sys.executable] simple_command += ['-m', 'aetros', 'start-simple', job_backend.model_name + '/' + job_backend.job_id] job_commands = {'run': ' '.join(simple_command)} if job_commands is None: raise Exception('No command specified.') if not isinstance(job_commands, list) and not isinstance(job_commands, dict): job_commands = [job_commands] # replace {{batch_size}} parameters if isinstance(job_config['parameters'], dict): for key, value in six.iteritems(flatten_parameters(job_config['parameters'])): if isinstance(job_commands, list): for k, v in enumerate(job_commands): if isinstance(job_commands[k], six.string_types): job_commands[k] = job_commands[k].replace('{{' + key + '}}', simplejson.dumps(value)) elif isinstance(job_commands, dict): for k, v in six.iteritems(job_commands): if isinstance(job_commands[k], six.string_types): job_commands[k] = job_commands[k].replace('{{' + key + '}}', simplejson.dumps(value)) job_backend.set_system_info('commands', job_commands) os.chdir(job_backend.git.work_tree) docker_image_built = False if docker_image and (job_config['dockerfile'] or job_config['install']): rebuild_image = job_config['rebuild_image'] if 'rebuild_image' in job_config else False docker_image = docker_build_image(logger, home_config, job_backend, rebuild_image) docker_image_built = True job_backend.collect_device_information(gpu_devices) state = {'last_process': None} job_backend.set_system_info('processRunning', False, True) def pause(): if not state['last_process'] or state['last_process'].poll() is not None: # no running process return if docker_image: if docker_pause(logger, home_config, job_backend): job_backend.set_paused(True) else: os.killpg(os.getpgid(state['last_process'].pid), signal.SIGSTOP) job_backend.set_paused(True) def cont(): if not state['last_process'] or state['last_process'].poll() is not None: # no running process return job_backend.set_paused(False) if docker_image: docker_continue(logger, home_config, job_backend) else: os.killpg(os.getpgid(state['last_process'].pid), signal.SIGCONT) 
job_backend.on_pause = pause job_backend.on_continue = cont if docker_image: env['AETROS_GIT_INDEX_FILE'] = '/aetros/' + job_backend.model_name + '.git/' + os.path.basename(env['AETROS_GIT_INDEX_FILE']) with job_backend.git.batch_commit('JOB_SYSTEM_INFORMATION'): aetros_environment = {'aetros_version': __version__, 'variables': env.copy()} if 'AETROS_SSH_KEY' in aetros_environment['variables']: del aetros_environment['variables']['AETROS_SSH_KEY'] if 'AETROS_SSH_KEY_BASE64' in aetros_environment['variables']: del aetros_environment['variables']['AETROS_SSH_KEY_BASE64'] job_backend.set_system_info('environment', aetros_environment) job_backend.set_system_info('memory_total', memory * 1024 * 1024 * 1024) import cpuinfo cpu = cpuinfo.get_cpu_info() job_backend.set_system_info('cpu_name', cpu['brand']) job_backend.set_system_info('cpu', [cpu['hz_actual_raw'][0], cpus]) job_backend.start_monitoring(cpu_cores=cpus, gpu_devices=gpu_devices, docker_container=job_backend.job_id) if not docker_image_built: docker_pull_image(logger, home_config, job_backend) docker_image_information(logger, home_config, job_backend) # make sure old container is removed subprocess.Popen([home_config['docker'], 'rm', job_backend.job_id], stderr=subprocess.PIPE).wait() command = docker_command_wrapper(logger, home_config, job_backend, volumes, cpus, memory, gpu_devices, env) # since linux doesnt handle SIGINT when pid=1 process has no signal listener, # we need to make sure, we attached one to the pid=1 process trap = 'trapIt () { "$@"& pid="$!"; trap "kill -INT $pid" INT TERM; ' \ 'while kill -0 $pid > /dev/null 2>&1; do wait $pid; ec="$?"; done; exit $ec;};' command.append(docker_image) command += ['/bin/sh', '-c', trap + 'trapIt /bin/sh /job/aetros/command.sh'] else: # non-docker # env['PYTHONPATH'] += ':' + os.getcwd() job_backend.collect_system_information() job_backend.collect_environment(env) job_backend.start_monitoring(gpu_devices=gpu_devices) command = ['/bin/sh', job_backend.git.work_tree + '/aetros/command.sh'] logger.debug("$ %s " % (' '.join([simplejson.dumps(a) for a in command]))) job_backend.set_system_info('image/name', str(docker_image)) p = None exited = False last_return_code = None state['last_process'] = None all_done = False command_stats = None files = job_backend.file_list() def clean(): # clear working tree shutil.rmtree(job_backend.git.work_tree) def on_force_exit(): # make sure the process dies clean() with open(os.devnull, 'r+b', 0) as DEVNULL: if docker_image: # docker run does not proxy INT signals to the docker-engine, # so we need to do it on our own directly. subprocess.Popen(args=[home_config['docker'], 'kill', job_backend.job_id], stdout=DEVNULL, stderr=DEVNULL).wait() elif not exited and state['last_process'] and state['last_process'].poll() is None: # wait for last command os.killpg(os.getpgid(state['last_process'].pid), signal.SIGKILL) job_backend.on_force_exit = on_force_exit try: job_backend.set_status('STARTED', add_section=False) # logger.warning("$ %s " % (str(command),)) # make sure maxTime limitation is correctly calculated job_backend.monitoring_thread.handle_max_time = True job_backend.monitoring_thread.handle_max_time_time = time.time() # Since JobBackend sends SIGINT to its current process group, it sends also to its parents when same pg. # We need to change the process group of the process, so this won't happen. # If we don't this, the master process (server command e.g.) receives the SIGINT as well. 
kwargs = {} if os.name == 'nt': kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP else: kwargs['preexec_fn'] = os.setsid # only use full env when no image used command_env = env if not docker_image: command_env = os.environ.copy() command_env.update(env) if os.environ.get('LD_LIBRARY_PATH', None): command_env['LD_LIBRARY_PATH_ORI'] = command_env['LD_LIBRARY_PATH'] def write_command_sh(job_command): f = open(job_backend.git.work_tree + '/aetros/command.sh', 'w+') if not docker_image: # new shells unset LD_LIBRARY_PATH automatically, so we make sure it will be there again f.write('export LD_LIBRARY_PATH=$LD_LIBRARY_PATH_ORI;\n') if job.get_working_dir(): f.write('cd %s;\n' % (job.get_working_dir(),)) f.write(job_command) f.close() def read_line(line): handled, filtered_line, failed = extract_api_calls(line, job_backend.handle_stdout_api, logger=logger) if is_debug(): for call in handled: logger.debug('STDOUT API CALL: ' + str(call)) for fail in failed: logger.warning("API call failed '%s': %s %s" % (str(fail['line']), str(type(fail['exception']).__name__), str(fail['exception']))) return filtered_line def exec_command(index, command, job_command): write_command_sh(job_command) print('%s $ %s' % ('/' + job.get_working_dir(), job_command.strip())) args = command logger.debug('$ ' + ' '.join([simplejson.dumps(a) for a in args])) command_stats[index]['started'] = time.time() - start_time job_backend.set_system_info('command_stats', command_stats, True) # important to prefix it, otherwise name='master' would reset all stats in controller backend command_env['AETROS_JOB_NAME'] = 'command_' + str(index) state['last_process'] = subprocess.Popen( args=args, bufsize=0, stderr=subprocess.PIPE, stdout=subprocess.PIPE, env=command_env, **kwargs ) job_backend.set_system_info('processRunning', True, True) wait_stdout = sys.stdout.attach(state['last_process'].stdout, read_line=read_line) wait_stderr = sys.stderr.attach(state['last_process'].stderr) state['last_process'].wait() command_stats[index]['rc'] = last_return_code command_stats[index]['ended'] = time.time() - start_time job_backend.set_system_info('command_stats', command_stats, True) job_backend.set_system_info('processRunning', True, False) wait_stdout() wait_stderr() # make sure a new line is printed after a command print("") return state['last_process'] done = 0 total = len(job_commands) job_backend.set_system_info('command_stats', command_stats, True) if isinstance(job_commands, list): command_stats = [{'rc': None, 'started': None, 'ended': None} for x in job_commands] for k, job_command in enumerate(job_commands): job_backend.set_status('Command ' + str(k+1)) p = exec_command(k, command, job_command) last_return_code = p.poll() if last_return_code == 0: done += 1 else: # one failed, so exit and don't execute next break if isinstance(job_commands, dict): command_stats = {} for name, job_command in six.iteritems(job_commands): command_stats[name] = {'rc': None, 'started': None, 'ended': None} for name, job_command in six.iteritems(job_commands): job_backend.set_status('Command ' + name) p = exec_command(name, command, job_command) last_return_code = p.poll() if last_return_code == 0: done += 1 else: # one failed, so exit and don't execute next break all_done = done == total exited = True if state['last_process']: sys.exit(state['last_process'].poll()) else: sys.exit(1) except SystemExit: # since we started the command in a new process group, a SIGINT or CTRL+C on this process won't affect # our actual command process. 
So we need to take care that we stop everything. logger.debug("SystemExit, exited=%s, all-done=%s, has-last-process=%s, pid=%s" %( str(exited), str(all_done), state['last_process'] is not None, state['last_process'].poll() if state['last_process'] else None )) # make sure the process dies if docker_image: # docker run does not proxy INT signals to the docker-engine, # so we need to do it on our own directly. p = subprocess.Popen(args=[home_config['docker'], 'inspect', job_backend.job_id], stderr=subprocess.PIPE, stdout=subprocess.PIPE) p.wait() if p.poll() == 0: subprocess.Popen(args=[home_config['docker'], 'kill', job_backend.job_id]).wait() elif not exited and state['last_process'] and state['last_process'].poll() is None: # wait for last command os.killpg(os.getpgid(state['last_process'].pid), signal.SIGINT) state['last_process'].wait() if 'output' in job_config and job_config['output']: upload_output_files(job_backend, job_config['output']) if exited: if all_done: job_backend.stop(progress=JOB_STATUS.PROGRESS_STATUS_DONE) else: job_backend.stop(progress=JOB_STATUS.PROGRESS_STATUS_FAILED) else: # master received SIGINT before the all job commands exited. if not job_backend.in_early_stop: # in_early_stop indicates whether we want to have a planned stop (maxTime limitation for example), # which should mark the job as done, not as abort(). # if this is not set, we the master received a SIGINT without early_stop, so mark as aborted. job_backend.abort() else: # let the on_shutdown listener handle the rest pass clean()
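# Minimal standalone sketch (POSIX only, illustrative) of the process-group
# handling start_command relies on: the child is started in its own session via
# preexec_fn=os.setsid, so a CTRL+C/SIGINT delivered to the master's process
# group does not implicitly reach it, and it is stopped explicitly with
# os.killpg. The 'sleep 30' child is a hypothetical stand-in for a job command.
import os
import signal
import subprocess

child = subprocess.Popen(['sleep', '30'], preexec_fn=os.setsid)
os.killpg(os.getpgid(child.pid), signal.SIGINT)  # signal the whole group
child.wait()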
def diff_objects(self, latest_commit_sha):
    """
    Push all changes to origin, based on objects, not on commits.
    Important: Call this push after every new commit, or we lose commits.
    """
    base = ['git', '--bare', '--git-dir', self.git_path]

    object_shas = []
    summary = {'commits': [], 'trees': [], 'files': []}

    def read_parents_and_tree_from(commit):
        if commit in self.synced_object_shas or commit in object_shas:
            # this commit has already been synced or read
            return None, None

        self.synced_object_shas[commit] = True
        summary['commits'].append(commit)
        object_shas.append(commit)

        object_content = subprocess.check_output(base + ['cat-file', '-p', commit]).decode('utf-8').strip()

        parents = []
        tree = ''
        for line in object_content.splitlines():
            if line.startswith('tree '):
                tree = line[len('tree '):]
            if line.startswith('parent '):
                parents.append(line[len('parent '):])

        return parents, tree

    def collect_files_from_tree(tree):
        if tree in self.synced_object_shas or tree in object_shas:
            # we have exactly this tree already synced or read, meaning all its objects as well
            return

        self.synced_object_shas[tree] = True
        summary['trees'].append(tree)
        object_shas.append(tree)

        object_content = subprocess.check_output(base + ['ls-tree', '-r', '-t', tree]).decode('utf-8').strip()

        for line in object_content.splitlines():
            exploded = line.split(' ')
            if len(exploded) < 3:
                sys.stderr.write("Error: Wrong line format of ls-tree for %s: %s\n" % (tree, line,))
                sys.exit(1)

            object_to_add = str(exploded[2][:40])
            path = str(exploded[2][41:])

            if object_to_add in self.synced_object_shas or object_to_add in object_shas:
                # have it already in the list or already synced
                continue

            object_shas.append(object_to_add)
            self.synced_object_shas[object_to_add] = True
            summary['files'].append([object_to_add, path])

    commits_to_check = [latest_commit_sha]
    while len(commits_to_check):
        sha = commits_to_check.pop(0)
        parents, tree = read_parents_and_tree_from(sha)

        if parents:
            for parent in parents:
                if parent not in commits_to_check:
                    commits_to_check.append(parent)

        if tree:
            collect_files_from_tree(tree)

    is_debug2() and self.logger.debug("shas_to_check %d: %s " % (len(object_shas), str(object_shas),))

    if not object_shas:
        return [], summary

    try:
        is_debug2() and self.logger.debug("Do git-cat-file-check.sh")
        ssh_stream = create_ssh_stream(read_home_config(), exit_on_failure=False)
        channel = ssh_stream.get_transport().open_session()
        channel.exec_command('git-cat-file-check.sh "%s"' % (self.model_name + '.git',))
        channel.sendall('\n'.join(object_shas))
        channel.shutdown_write()

        def readall(c):
            content = b''
            while True:
                try:
                    chunk = c.recv(1024)
                    if chunk == b'':
                        break
                    content += chunk
                except (KeyboardInterrupt, SystemExit):
                    return

            return content

        missing_objects = readall(channel).decode('utf-8').splitlines()
        channel.close()
        ssh_stream.close()

        # make sure we have in summary only SHAs we actually will sync
        for stype in six.iterkeys(summary):
            ids = summary[stype][:]
            for sha in ids:
                if stype == 'files':
                    if sha[0] not in missing_objects:
                        summary[stype].remove(sha)
                else:
                    if sha not in missing_objects:
                        summary[stype].remove(sha)

        return missing_objects, summary
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        self.logger.error("Failed to generate diff_objects: %s" % (str(e),))
        for sha in object_shas:
            if sha in self.synced_object_shas:
                del self.synced_object_shas[sha]
        return None, None
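# Hedged comparison (not used by diff_objects above): plain git plumbing can
# list all objects reachable from a commit in a single call, which is the same
# set diff_objects collects manually via cat-file/ls-tree. '/tmp/example.git'
# is a hypothetical bare repository path.
import subprocess

out = subprocess.check_output(
    ['git', '--git-dir', '/tmp/example.git', 'rev-list', '--objects', 'HEAD']
).decode('utf-8')

# Lines are "<sha>" for commits and "<sha> <path>" for trees/blobs with a name.
reachable_shas = [line.split(' ', 1)[0] for line in out.splitlines() if line]
print(len(reachable_shas), 'objects reachable from HEAD')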
def start_custom(logger, job_backend):
    job_model = job_backend.get_job_model()
    config = job_model.config

    custom_git = False
    if 'gitCustom' in config and config['gitCustom']:
        custom_git = config['gitCustom']

    if custom_git and ('sourceGitUrl' not in config or not config['sourceGitUrl']):
        raise Exception('Server git url is not configured. Aborted')

    if 'sourcePythonScript' not in config or not config['sourcePythonScript']:
        raise Exception('Server python script is not configured. Aborted')

    python_script = config['sourcePythonScript']
    git_tree = 'master'

    if custom_git:
        git_url = config['sourceGitUrl']
    else:
        user_config = read_home_config()
        git_url = 'git@' + user_config['host'] + ':' + job_backend.model_name + '.git'

    if 'sourceGitTree' in config and config['sourceGitTree']:
        git_tree = config['sourceGitTree']

    work_tree = job_backend.git.work_tree

    my_env = os.environ.copy()
    if 'PYTHONPATH' not in my_env:
        my_env['PYTHONPATH'] = ''

    my_env['PYTHONPATH'] += ':' + os.getcwd()
    my_env['AETROS_MODEL_NAME'] = job_backend.model_name
    my_env['AETROS_JOB_ID'] = job_backend.job_id
    my_env['AETROS_ATTY'] = '1'

    logger.info("Setting up git repository %s in %s" % (git_url, work_tree))
    logger.info("Using git tree of '%s'" % (git_tree,))

    try:
        if os.path.exists(work_tree):
            shutil.rmtree(work_tree)

        args = ['git', 'clone', git_url, work_tree]
        code = subprocess.call(args, stderr=sys.stderr, stdout=sys.stdout)
        if code != 0:
            raise Exception('Could not clone repository %s to %s' % (git_url, work_tree))

        # make sure the requested branch is existent in local git. Target FETCH_HEAD to this branch.
        git_execute(logger, work_tree, ['fetch', 'origin', git_tree])
        git_execute(logger, work_tree, ['checkout', git_tree])
    except GitCommandException as e:
        raise Exception('Could not run "%s" for repository %s in %s. Look at previous output.' % (e.cmd, git_url, work_tree))

    args = (sys.executable, python_script)

    logger.info("Model source code checked out.")
    logger.info("-----------")
    logger.info("-----------")
    logger.info("Switch working directory to " + work_tree)
    logger.warning("$ %s %s" % args)

    try:
        subprocess.Popen(args, close_fds=True, env=my_env, cwd=work_tree).wait()
    except KeyboardInterrupt:
        logger.warning("Job aborted.")
        sys.exit(1)
def main(self, args): import aetros.const parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, prog=aetros.const.__prog__ + ' server') parser.add_argument('name', nargs='?', help="Server name") parser.add_argument('--generate-ssh-key', help="Generates automatically a ssh key, register them in AETROS in " "your account, and delete them when the server exits. " "You should prefer 'aetros authenticate' command as its safer.") parser.add_argument('--allow-host-execution', action='store_true', help="Whether a job can run on this server " "directly, without a virtual (docker) container.\nSecurity risk and makes resource limitation useless.") parser.add_argument('--max-memory', help="How many RAM is available. In gigabyte. Per default all available memory.") parser.add_argument('--max-cpus', help="How many cores are available. Per default all available CPU cores.") parser.add_argument('--max-gpus', help="How many GPUs are available. Comma separate list of device ids." "Per default all available GPU cards. Use 'aetros gpu' too see the ids.") parser.add_argument('--no-gpus', action='store_true', help="Disable all GPUs") parser.add_argument('--max-jobs', help="How many jobs are allowed to run in total until the process exists automatically.") parser.add_argument('--host', help="Default trainer.aetros.com. Read from the global configuration ~/aetros.yml.") parser.add_argument('--show-stdout', action='store_true', help="Show all stdout of all jobs. Only for debugging necessary.") parsed_args = parser.parse_args(args) if not parsed_args.name: parser.print_help() sys.exit() self.config = read_home_config() if parsed_args.max_jobs: self.max_jobs = int(parsed_args.max_jobs) if parsed_args.max_memory: self.resources_limit['memory'] = int(parsed_args.max_memory) if parsed_args.max_cpus: self.resources_limit['cpus'] = int(parsed_args.max_cpus) self.resources_limit['host_execution'] = parsed_args.allow_host_execution gpus = [] try: gpus = aetros.cuda_gpu.get_ordered_devices() for i in range(len(gpus)): self.enabled_gpus.append(i) except aetros.cuda_gpu.CudaNotImplementedException: pass if parsed_args.max_gpus: self.enabled_gpus = [] for i in parsed_args.max_gpus.split(','): i = int(i) if i < 0 or i >= len(gpus): raise Exception('--max-gpus ' + str(i) + ' not available on the system. 
GPUs ' + str([i for i in range(len(gpus))])+ ' detected.') self.enabled_gpus.append(i) elif parsed_args.no_gpus: self.enabled_gpus = [] if parsed_args.show_stdout: self.show_stdout = True event_listener = EventListener() event_listener.on('registration', self.registration_complete) event_listener.on('failed', self.connection_failed) event_listener.on('jobs', self.sync_jobs) event_listener.on('close', self.on_client_close) if hasattr(signal, 'SIGUSR1'): signal.signal(signal.SIGUSR1, self.on_signusr1) ssh_key_registered = False if parsed_args.generate_ssh_key: self.logger.info('Generate SSH key') ssh_key = paramiko.RSAKey.generate(4096) self.ssh_key_private = ssh_key.key.private_bytes( serialization.Encoding.PEM, serialization.PrivateFormat.TraditionalOpenSSL, serialization.NoEncryption() ).decode() self.ssh_key_public = 'rsa ' + ssh_key.get_base64() + ' ' + parsed_args.name self.logger.info('Register SSH key at ' + self.config['host']) data = { 'name': parsed_args.name, 'secure_key': parsed_args.generate_ssh_key, 'key': self.ssh_key_public, } try: response = aetros.api.http_request('server/ssh-key', json_body=data, method='post') except aetros.api.ApiError as e: if 'access_denied' in e.error: print("error: Could not connect to " + self.config['url'] + ': Access denied. --generate-ssh-key seems to be wrong. Incorrect host? See "aetros id"') sys.exit(1) raise ssh_key_registered = response == True def delete_ssh_key(): self.logger.info('Delete SSH key at ' + self.config['host']) data = { 'secure_key': parsed_args.generate_ssh_key, 'key': self.ssh_key_public, } response = aetros.api.http_request('server/ssh-key/delete', json_body=data) if not response: self.logger.error('Could not delete SSH key in AETROS Trainer.') if parsed_args.generate_ssh_key and ssh_key_registered: atexit.register(delete_ssh_key) if parsed_args.host: self.config['host'] = parsed_args.host if self.ssh_key_private: self.config['ssh_key_base64'] = self.ssh_key_private self.server = ServerClient(self.config, event_listener, self.logger) self.general_logger_stdout = GeneralLogger(job_backend=self, redirect_to=sys.__stdout__) self.general_logger_stderr = GeneralLogger(job_backend=self, redirect_to=sys.__stderr__) sys.stdout = self.general_logger_stdout sys.stderr = self.general_logger_stderr self.server.configure(parsed_args.name) self.logger.debug('Connecting to ' + self.config['host']) self.server.start() self.write_log("\n") try: while self.active: if self.registered: self.server.send_message({'type': 'utilization', 'values': self.collect_system_utilization()}, '') self.check_finished_jobs() time.sleep(1) except SystemExit: self.logger.warning('Killed') self.stop() except KeyboardInterrupt: self.stop()
def main(self, args): from aetros.starter import start parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, prog=aetros.const.__prog__ + ' start') parser.add_argument('name', help='the model name, e.g. aetros/mnist-network to start new job, or job id, e.g. user/modelname/0db75a64acb74c27bd72c22e359de7a4c44a20e5 to start a pre-created job.') parser.add_argument('-i', '--image', help="Which Docker image to use for the command. Default read in aetros.yml. If not specified, command is executed on the host.") parser.add_argument('-l', '--local', action='store_true', help="Start the job immediately on the current machine.") parser.add_argument('-s', '--server', action='append', help="Limits the server pool to this server. Default not limitation or read in aetros.yml. Multiple --server allowed.") parser.add_argument('-b', '--branch', help="This overwrites the Git branch used when new job should be started.") parser.add_argument('-c', '--config', help="Default /aetros.yml in Git root.") parser.add_argument('--priority', help="Increases or decreases priority. Default is 0.") parser.add_argument('--cpu', help="How many CPU cores should be assigned to job. Docker only.") parser.add_argument('--memory', help="How much memory should be assigned to job. Docker only.") parser.add_argument('--gpu', help="How many GPU cards should be assigned to job. Docker only.") parser.add_argument('--gpu_memory', help="Memory requirement for the GPU. Docker only.") parser.add_argument('--rebuild-image', action='store_true', help="Makes sure the Docker image is re-built without cache.") parser.add_argument('--gpu-device', action='append', help="Which GPU device id should be mapped into the Docker container. Only with --local.") parser.add_argument('--max-time', help="Limit execution time in seconds. Sends SIGINT to the process group when reached.") parser.add_argument('--max-epochs', help="Limit execution epochs. Sends SIGINT to the process group when reached.") parser.add_argument('--insights', action='store_true', help="activates insights. Only for simple models.") parser.add_argument('--dataset', help="Dataset id when model has placeholders. Only for simple models with placeholders as input/output.") parser.add_argument('-p', '--param', action='append', help="Sets a hyperparameter, example '--param name=value'. Multiple --param allowed.") parsed_args = parser.parse_args(args) if not parsed_args.name: print("fatal: no model defined. 'aetros start user/model-name'.") sys.exit(2) if parsed_args.name and parsed_args.name.count('/') > 1: # start a concrete job, used by server command gpu_devices = [] if parsed_args.gpu_device: gpu_devices = [int(x) for x in parsed_args.gpu_device] start(self.logger, parsed_args.name, cpus=int(parsed_args.cpu), memory=int(parsed_args.memory), gpu_devices=gpu_devices) return home_config = read_home_config() model_name = parsed_args.name # create a new job hyperparameter = {} if parsed_args.param: for param in parsed_args.param: if '=' not in param: raise Exception('--param ' + param + ' does not contain a `=`. 
Please use "--param name=value"') name, value = param.split('=') hyperparameter[name] = value job_config = {'insights': parsed_args.insights} if parsed_args.image: job_config['image'] = parsed_args.image if parsed_args.branch: job_config['sourceGitTree'] = parsed_args.branch if parsed_args.max_epochs: job_config['maxEpochs'] = int(parsed_args.max_epochs) if parsed_args.max_time: job_config['maxTime'] = float(parsed_args.max_time) job_config['priority'] = 0 if parsed_args.priority: job_config['priority'] = float(parsed_args.priority) if parsed_args.rebuild_image: job_config['config']['rebuild_image'] = True if parsed_args.server: job_config['servers'] = [] for name in parsed_args.server: job_config['servers'].append(name) job_config['resources'] = {} if parsed_args.cpu: job_config['resources']['cpu'] = int(parsed_args.cpu) if parsed_args.memory: job_config['resources']['memory'] = int(parsed_args.memory) if parsed_args.gpu: job_config['resources']['gpu'] = int(parsed_args.gpu) if parsed_args.gpu_memory: job_config['resources']['gpu_memory'] = int(parsed_args.gpu_memory) config_path = parsed_args.config or 'aetros.yml' try: self.logger.debug("Create job ...") created = api.create_job(model_name, config_path, parsed_args.local, hyperparameter, parsed_args.dataset, config=job_config) except api.ApiError as e: if 'Connection refused' in e.error: self.logger.error("You are offline") raise self.logger.info("Job %s/%s created." % (model_name, created['id'])) if parsed_args.local: start(self.logger, model_name + '/' + created['id'], gpu_devices=parsed_args.gpu_device) else: print("Open http://%s/model/%s/job/%s to monitor it." % (home_config['host'], model_name, created['id']))