def make_wrapper(self):
    """
    Generate the GeneFlow app wrapper script.

    Args:
        self: class instance

    Returns:
        On success: True.
        On failure: False.
    """
    # make assets folder, if it doesn't already exist
    asset_path = Path(self._path / 'assets')
    asset_path.mkdir(exist_ok=True)

    Log.some().info(
        'compiling %s',
        str(asset_path / '{}.sh'.format(self._config['name']))
    )

    # compile jinja2 template
    if not TemplateCompiler.compile_template(
            None,
            'wrapper-script.sh.j2',
            str(asset_path / '{}.sh'.format(self._config['name'])),
            **self._config
    ):
        Log.an().error('cannot compile GeneFlow app wrapper script')
        return False

    return True
def make_test(self):
    """
    Generate the GeneFlow app test script.

    Args:
        self: class instance

    Returns:
        On success: True.
        On failure: False.
    """
    # make test folder, if it doesn't already exist
    test_path = Path(self._path / 'test')
    test_path.mkdir(exist_ok=True)

    Log.some().info('compiling %s', str(test_path / 'test.sh'))

    # compile jinja2 template
    if not TemplateCompiler.compile_template(
            None,
            'test.sh.j2',
            str(test_path / 'test.sh'),
            **self._config
    ):
        Log.an().error('cannot compile GeneFlow app test script')
        return False

    return True
def _send_notifications(self, status):
    """Send a notification message to each configured endpoint when job status changes."""
    # construct message
    msg_data = {
        'to': '',
        'from': '*****@*****.**',
        'subject': 'GeneFlow Job "{}": {}'.format(
            self._job['name'], status
        ),
        'content': (
            'Your GeneFlow job status has changed to {}'
            '\nJob Name: {}'
            '\nJob ID: {}'
        ).format(status, self._job['name'], self._job_id)
    }

    # use agave token as header if available
    if 'agave' in self._workflow_context:
        msg_headers = {
            'Authorization': 'Bearer {}'.format(
                self._workflow_context['agave']
                .get_context_options()['agave_wrapper']
                ._agave.token.token_info.get('access_token')
            )
        }
    else:
        msg_headers = {}

    Log.some().info('message headers: %s', str(msg_headers))

    for notify in self._job['notifications']:
        Log.some().info(
            'sending notification(s) to %s @ %s',
            str(notify['to']), notify['url']
        )

        to_list = notify['to']
        if isinstance(notify['to'], str):
            to_list = [notify['to']]

        for to_item in to_list:
            msg_data['to'] = to_item

            try:
                response = requests.post(
                    notify['url'], data=msg_data, headers=msg_headers
                )
            except requests.exceptions.RequestException as err:
                Log.a().warning(
                    'cannot send notification to %s @ %s: %s',
                    to_item, notify['url'], str(err)
                )
                # skip the status check: "response" is not defined when
                # the request itself fails
                continue

            if response.status_code != 201:
                Log.a().warning(
                    'cannot send notification to %s @ %s: %s',
                    to_item, notify['url'], response.text
                )
def connect(self):
    agave_connection_type = self._config.get(
        'connection_type', 'impersonate'
    )

    if agave_connection_type == 'impersonate':
        token_username = '******'.format(
            self._config['domain'],
            '/' if self._config['domain'] else '',
            self._config['token_username']
        )
        Log.some().debug('user impersonation: %s', token_username)

        self._agave = Agave(
            api_server=self._config['server'],
            username=self._config['username'],
            password=self._config['password'],
            token_username=token_username,
            client_name=self._config['client'],
            api_key=self._config['key'],
            api_secret=self._config['secret'],
            verify=False
        )

    elif agave_connection_type == 'agave-cli':
        # get credentials from ~/.agave/current
        agave_clients = Agave._read_clients()
        agave_clients[0]['verify'] = False  # don't verify ssl
        self._agave = Agave(**agave_clients[0])
        # when using agave-cli, token_username must be the same as the
        # stored creds in user's home directory, this can be different
        # from job username
        self._config['token_username'] = agave_clients[0]['username']

    else:
        Log.an().error(
            'invalid agave connection type: %s', agave_connection_type
        )
        return False

    return True
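# Illustrative sketch only: a hypothetical Agave config dict showing the keys
# that connect() reads. All values below are placeholders, not GeneFlow
# defaults.
#
#   agave_config = {
#       'connection_type': 'impersonate',   # or 'agave-cli'
#       'server': 'https://agave.example.org',
#       'username': 'service-account',
#       'password': '********',
#       'token_username': 'end-user',
#       'client': 'geneflow-client',
#       'key': '<api-key>',
#       'secret': '<api-secret>',
#       'domain': 'EXAMPLE'
#   }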
def files_import_from_agave(self, system_id, file_path, file_name, url_to_ingest):
    """
    Wrap AgavePy import data file command.

    Args:
        self: class instance.
        system_id: Identifier for Agave storage system.
        file_path: Path where file is to be imported.
        file_name: Name of the imported file.
        url_to_ingest: Agave URL to be ingested.

    Returns:
        On success: True with no exceptions.
        On failure: Throws exception.
    """
    response = self._agave.files.importData(
        systemId=system_id,
        filePath=file_path,
        fileName=file_name,
        urlToIngest=urllib.parse.quote(str(url_to_ingest or ''), safe='/:')
    )
    async_response = AgaveAsyncResponse(self._agave, response)
    status = async_response.result()

    Log.some().debug(
        'import %s: %s -> agave://%s/%s/%s',
        str(status), url_to_ingest, system_id, file_path, file_name
    )

    if str(status) == 'FINISHED':
        return True

    # not finished, try again
    raise Exception('agave import failed')
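# Minimal usage sketch (assumes an already-connected wrapper instance named
# "agave_wrapper"; the system id, paths, and URL are placeholders). The call
# blocks on AgaveAsyncResponse.result() and raises if the transfer does not
# reach FINISHED, so callers typically wrap it in retry logic.
#
#   agave_wrapper.files_import_from_agave(
#       system_id='data-example-storage',
#       file_path='/home/user/imports',
#       file_name='reads.fastq',
#       url_to_ingest='agave://data-example-storage/home/user/reads.fastq'
#   )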
def make_agave(self):
    """
    Generate the GeneFlow Agave app definition.

    Args:
        self: class instance

    Returns:
        On success: True.
        On failure: False.
    """
    Log.some().info(
        'compiling %s', str(self._path / 'agave-app-def.json.j2')
    )

    if not TemplateCompiler.compile_template(
            None,
            'agave-app-def.json.j2.j2',
            str(self._path / 'agave-app-def.json.j2'),
            slugify_name=slugify(
                self._app['name'], regex_pattern=r'[^-a-z0-9_]+'
            ),
            **self._app
    ):
        Log.an().error(
            'cannot compile GeneFlow Agave app definition template'
        )
        return False

    return True
def make_wrapper(self):
    """
    Generate the GeneFlow app wrapper script.

    Args:
        self: class instance

    Returns:
        On success: True.
        On failure: False.
    """
    # make assets folder, if it doesn't already exist
    asset_path = Path(self._path / 'assets')
    asset_path.mkdir(exist_ok=True)

    script_path = str(asset_path / '{}.sh'.format(
        slugify(self._app['name'], regex_pattern=r'[^-a-z0-9_]+')
    ))
    Log.some().info('compiling %s', script_path)

    # compile jinja2 template
    if not TemplateCompiler.compile_template(
            None,
            'wrapper-script.sh.j2',
            script_path,
            **self._app
    ):
        Log.an().error('cannot compile GeneFlow app wrapper script')
        return False

    # make script executable by owner
    os.chmod(script_path, stat.S_IRWXU)

    return True
def update_def(self, agave):
    """
    Update GeneFlow app definition by adding the implementation section.

    Args:
        self: class instance
        agave: dict with Agave app registration info (apps_prefix and
            revision), or a falsy value to skip the agave implementation.

    Returns:
        On success: True.
        On failure: False.
    """
    Log.some().info('updating %s', str(self._path / 'app.yaml'))

    try:
        with open(str(self._path / 'app.yaml'), 'a') as app_yaml:
            app_yaml.write('\n\nimplementation:')
            if agave:
                app_yaml.write('\n  agave:')
                app_yaml.write('\n    agave_app_id: {}-{}-{}{}'.format(
                    agave['apps_prefix'],
                    slugify(self._app['name'], regex_pattern=r'[^-a-z0-9_]+'),
                    self._app['agave_version'],
                    agave['revision']
                ))
            app_yaml.write('\n  local:')
            app_yaml.write('\n    script: {}.sh'.format(
                slugify(self._app['name'], regex_pattern=r'[^-a-z0-9_]+')
            ))
    except IOError as err:
        Log.an().error('cannot update GeneFlow app definition: %s', err)
        return False

    return True
def main():
    """
    GeneFlow CLI main entrypoint.

    Args:
        None.

    Returns:
        Nothing.
    """
    args, subparser = parse_args()
    if not args:
        sys.exit(1)

    # configure logging
    Log.config(args[0].log_level, args[0].log_file)

    # display GeneFlow version
    Log.some().info('GeneFlow %s', __version__)

    # call the appropriate command
    if not args[0].func(
            args=args[0], other_args=args[1], subparser=subparser
    ):
        sys.exit(1)

    sys.exit(0)
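# Assumed command-line invocation handled by main(); the workflow and job
# paths are placeholders, and flag spellings follow the parsers defined
# elsewhere in this module.
#
#   gf --log-level debug run ./my-workflow -j ./job.yaml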
def _call(self, systemId, filePath, fileName, urlToIngest):
    """
    Wrap agavePy import data file command.

    Args:
        self: class instance
        systemId: Identifier for Agave storage system
        filePath: Path where file is to be imported
        fileName: Name of the imported file
        urlToIngest: Agave URL to be ingested

    Returns:
        True
    """
    response = self._agave.files.importData(
        systemId=systemId,
        filePath=filePath,
        fileName=fileName,
        urlToIngest=urlToIngest
    )
    async_response = AgaveAsyncResponse(self._agave, response)
    status = async_response.result()

    Log.some().info(
        'import %s: %s -> agave://%s/%s/%s',
        str(status), urlToIngest, systemId, filePath, fileName
    )

    if str(status) == 'FINISHED':
        return True

    # not finished, try again
    raise Exception('agave import failed')
def _init_exec_context_set(self):
    """
    Initialize set of execution contexts, which is specified by the
    execution.context job parameters.

    Args:
        self: class instance

    Returns:
        On success: True.
    """
    # get explicit execution contexts from the job parameters
    self._exec_contexts = set(self._job['execution']['context'].values())

    # check validity of exec contexts
    for context in self._exec_contexts:
        if not Contexts.is_exec_context(context):
            msg = 'invalid exec context: {}'.format(context)
            Log.an().error(msg)
            return self._fatal(msg)

    Log.some().debug('execution contexts: %s', self._exec_contexts)

    return True
def retry_failed(self, map_item):
    """
    Retry a job.

    Args:
        self: class instance.
        map_item: map item object (item of self._map) to retry.

    Returns:
        True if failed/stopped job restarted successfully.
        False if failed/stopped job not restarted due to error.
    """
    # retry job
    Log.some().info(
        '[step.%s]: retrying agave job (%s), attempt number %s',
        self._step['name'],
        map_item['template']['output'],
        map_item['attempt'] + 1
    )

    # add another run to list
    map_item['attempt'] += 1
    map_item['run'].append({})
    if not self._run_map(map_item):
        Log.a().warning(
            '[step.%s]: cannot retry agave job (%s), attempt number %s',
            self._step['name'],
            map_item['template']['output'],
            map_item['attempt']
        )
        return False

    return True
def upload_agave_test_data(self):
    """
    Upload Agave test data from workflow package.

    Args:
        self: class instance.

    Returns:
        On success: True.
        On failure: False.
    """
    if (
            not self._agave
            or not self._agave_params
            or not self._agave_params.get('agave')
    ):
        Log.a().warning('must provide agave parameters to upload test data')
        return False

    # create main test data URI
    parsed_base_test_uri = URIParser.parse('agave://{}/{}'.format(
        self._agave_params['agave']['deploymentSystem'],
        self._agave_params['agave']['testDataDir']
    ))
    Log.some().info(
        'creating base test data uri: %s',
        parsed_base_test_uri['chopped_uri']
    )
    if not DataManager.mkdir(
            parsed_uri=parsed_base_test_uri,
            recursive=True,
            agave={
                'agave': self._agave,
                'agave_config': self._config['agave']
            }
    ):
        Log.a().warning(
            'cannot create base test data uri: %s',
            parsed_base_test_uri['chopped_uri']
        )
        return False

    # upload test data
    parsed_local_test_uri = URIParser.parse(str(Path(self._path) / 'data'))
    parsed_agave_test_uri = URIParser.parse('{}/{}'.format(
        parsed_base_test_uri['chopped_uri'],
        Path(self._path).name
    ))
    Log.some().info(
        'copying test data from %s to %s',
        parsed_local_test_uri['chopped_uri'],
        parsed_agave_test_uri['chopped_uri']
    )
    if not DataManager.copy(
            parsed_src_uri=parsed_local_test_uri,
            parsed_dest_uri=parsed_agave_test_uri,
            local={},
            agave={
                'agave': self._agave,
                'agave_config': self._config['agave']
            }
    ):
        Log.a().warning(
            'cannot copy test data from %s to %s',
            parsed_local_test_uri['chopped_uri'],
            parsed_agave_test_uri['chopped_uri']
        )
        return False

    return True
def retry_failed(self):
    """
    Check if any jobs failed or stopped.

    Args:
        self: class instance.

    Returns:
        If no failure: True.
        On failure: Error message.
    """
    # check if any jobs failed or stopped
    for map_item in self._map:
        if (
                map_item['status'] == 'FAILED'
                or map_item['status'] == 'STOPPED'
        ):
            # retry the job, if not at limit
            if map_item['attempt'] >= self._config['agave']['job_retry']:
                msg = (
                    'agave job failed ({}) for step "{}", '
                    'retries for map item "{}" reached limit of {}'
                ).format(
                    map_item['run'][map_item['attempt']]['agave_job_id'],
                    self._step['name'],
                    map_item['filename'],
                    self._config['agave']['job_retry']
                )
                Log.an().error(msg)
                return self._fatal(msg)

            # retry job
            Log.some().info(
                (
                    'agave job failed (%s) for step "%s", '
                    'retrying map item "%s"'
                ),
                map_item['run'][map_item['attempt']]['agave_job_id'],
                self._step['name'],
                map_item['filename']
            )

            # add another run to list
            map_item['attempt'] += 1
            map_item['run'].append({})
            if not self._run_map(map_item):
                msg = 'cannot re-run agave job for map item "{}"'\
                    .format(map_item['filename'])
                Log.an().error(msg)
                return self._fatal(msg)

    return True
def _copy_asset(self, asset):
    """
    Copy app assets.

    Args:
        self: class instance
        asset: what to copy

    Returns:
        On success: True.
        On failure: False.
    """
    if not self._copy_prefix:
        Log.a().warning(
            'copy prefix must be specified when copying app assets'
        )
        return False

    if not asset.get('dst'):
        Log.a().warning('asset dst required for app %s', self._app['name'])
        return False

    if not asset.get('src'):
        Log.a().warning('asset src required for app %s', self._app['name'])
        return False

    # create asset destination
    asset_path = Path(self._path / asset['dst'])
    asset_path.mkdir(exist_ok=True)

    if 'zip' in asset:
        # create a tar.gz of src
        cmd = 'tar -czf "{}" --directory="{}" .'.format(
            str(Path(asset_path / '{}.tar.gz'.format(asset['zip']))),
            str(Path(self._copy_prefix) / asset['src'])
        )
        Log.some().info('zipping: %s', cmd)
        cmd_result = ShellWrapper.invoke(cmd)
        if cmd_result is False:
            Log.a().warning('cannot zip asset src: %s', cmd)
            return False

        Log.some().info('tar stdout: %s', cmd_result)

    else:
        # copy without creating tar.gz
        cmd = 'cp -R "{}" "{}"'.format(
            str(Path(self._copy_prefix) / asset['src']),
            str(asset_path)
        )
        Log.some().info('copying: %s', cmd)
        cmd_result = ShellWrapper.invoke(cmd)
        if cmd_result is False:
            Log.a().warning('cannot copy asset src: %s', cmd)
            return False

        Log.some().info('copy stdout: %s', cmd_result)

    return True
def add_apps(args):
    """
    Add GeneFlow apps to database.

    Args:
        args.app_yaml: GeneFlow definition with apps.
        args.config_file: GeneFlow config file path.
        args.environment: Config environment.

    Returns:
        On success: True.
        On failure: False.
    """
    app_yaml = args.app_yaml
    config_file = args.config_file
    environment = args.environment

    # load config file
    cfg = Config()
    if not cfg.load(config_file):
        Log.an().error('cannot load config file: %s', config_file)
        return False

    config_dict = cfg.config(environment)
    if not config_dict:
        Log.an().error('invalid config environment: %s', environment)
        return False

    # connect to data source
    try:
        data_source = DataSource(config_dict['database'])
    except DataSourceException as err:
        Log.an().error('data source initialization error [%s]', str(err))
        return False

    # import apps
    defs = data_source.import_apps_from_def(app_yaml)
    if not defs:
        Log.an().error('app definition load failed: %s', app_yaml)
        return False

    data_source.commit()

    # display new IDs
    for app in defs:
        Log.some().info('app loaded: %s -> %s', app, defs[app])

    return True
def add_workflows(args, other_args, subparser=None):
    """
    Add GeneFlow workflows to database.

    Args:
        args.workflow_yaml: GeneFlow definition with workflows.
        args.config: GeneFlow config file path.
        args.environment: Config environment.

    Returns:
        On success: True.
        On failure: False.
    """
    workflow_yaml = args.workflow_yaml
    config = args.config
    environment = args.environment

    # load config file
    cfg = Config()
    if not cfg.load(config):
        Log.an().error('cannot load config file: %s', config)
        return False

    config_dict = cfg.config(environment)
    if not config_dict:
        Log.an().error('invalid config environment: %s', environment)
        return False

    # connect to data source
    try:
        data_source = DataSource(config_dict['database'])
    except DataSourceException as err:
        Log.an().error('data source initialization error [%s]', str(err))
        return False

    # import workflow
    defs = data_source.import_workflows_from_def(workflow_yaml)
    if not defs:
        Log.an().error('workflow definition load failed: %s', workflow_yaml)
        return False

    data_source.commit()

    # display new IDs
    for workflow in defs:
        Log.some().info('workflow loaded: %s -> %s', workflow, defs[workflow])

    return True
def stage(self, **kwargs):
    """
    Copy data to all contexts except 'final' from source URI.

    Source URI can be multiple locations, but only copy to the first
    element of dest URIs. Set _staged indicator to True on success.

    Args:
        self: class instance.
        **kwargs: additional arguments required by DataManager.copy().

    Returns:
        True or False.
    """
    for context in self._parsed_data_uris:
        if context != self._source_context:
            if self._clean:
                # remove target URI first
                pass

            for i, parsed_source_uri in enumerate(
                    self._parsed_data_uris[self._source_context]
            ):
                Log.some().debug(
                    'staging data: %s->%s to %s->%s',
                    self._source_context,
                    parsed_source_uri['chopped_uri'],
                    context,
                    self._parsed_data_uris[context][i]['chopped_uri']
                )

                if context != 'final':
                    if not DataManager.copy(
                            parsed_src_uri=parsed_source_uri,
                            parsed_dest_uri=self._parsed_data_uris[context][i],
                            **kwargs
                    ):
                        msg = 'cannot stage data by copying from {} to {}'.format(
                            parsed_source_uri['chopped_uri'],
                            self._parsed_data_uris[context][i]['chopped_uri']
                        )
                        Log.an().error(msg)
                        return self._fatal(msg)

    self._staged = True

    return True
def _init_data_context_set(self):
    """
    Initialize set of data contexts, which is determined by inputs and
    output.

    Args:
        self: class instance

    Returns:
        On success: True.
        On failure: False.
    """
    # check input URIs for data contexts
    for input_key in self._workflow['inputs']:
        parsed_uri = URIParser.parse(
            self._workflow['inputs'][input_key]['value'][0]
        )
        if not parsed_uri:
            msg = 'invalid input uri: {}'.format(
                self._workflow['inputs'][input_key]['value'][0]
            )
            Log.an().error(msg)
            return self._fatal(msg)

        self._data_contexts.add(parsed_uri['scheme'])

    # add output URI data context
    parsed_output_uri = URIParser.parse(self._job['output_uri'])
    if not parsed_output_uri:
        msg = 'invalid base of job output uri: {}'.format(
            self._job['output_uri']
        )
        Log.an().error(msg)
        return self._fatal(msg)

    self._data_contexts.add(parsed_output_uri['scheme'])

    # check validity of data contexts
    for context in self._data_contexts:
        if not Contexts.is_data_context(context):
            msg = 'invalid data context: {}'.format(context)
            Log.an().error(msg)
            return self._fatal(msg)

    Log.some().debug('data contexts: %s', self._data_contexts)

    return True
def checkpoint(self):
    """
    Check if step meets completion criteria, based on the "checkpoint"
    execution parameter.

    Args:
        self: class instance.

    Returns:
        True if step meets completion criteria.
        False if it does not.
    """
    checkpoint = self._step['execution']['parameters'].get(
        'checkpoint', 'any'
    )

    status = self.get_status()
    finished = [item == 'FINISHED' for item in status.values()]

    Log.some().info(
        '[step.%s]: checkpoint: %s of %s job(s) finished',
        self._step['name'], sum(finished), len(finished)
    )

    # print summary of job result in debug mode
    for item in sorted(status):
        Log.some().debug(
            '[step.%s]: checkpoint: %s -> %s',
            self._step['name'], item, status[item]
        )

    if checkpoint == 'all':
        # all jobs must be finished
        Log.some().info(
            '[step.%s]: checkpoint: all jobs must finish',
            self._step['name']
        )
        return all(finished)

    if checkpoint == 'none':
        # no jobs have to be finished
        Log.some().info(
            '[step.%s]: checkpoint: jobs do not have to finish',
            self._step['name']
        )
        return True

    # at least one job must be finished
    # default to 'any' if anything other than 'all', 'any', or 'none' is used
    Log.some().info(
        '[step.%s]: checkpoint: at least one job must finish',
        self._step['name']
    )
    return any(finished)
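# Standalone sketch of the checkpoint rule above (plain Python, no GeneFlow
# imports; the helper name is hypothetical): 'all' requires every job to be
# FINISHED, 'none' always passes, and any other value, including the 'any'
# default, requires at least one FINISHED job.
#
#   def checkpoint_ok(statuses, checkpoint='any'):
#       finished = [s == 'FINISHED' for s in statuses.values()]
#       if checkpoint == 'all':
#           return all(finished)
#       if checkpoint == 'none':
#           return True
#       return any(finished)
#
#   checkpoint_ok({'a': 'FINISHED', 'b': 'FAILED'}, 'all')  # False
#   checkpoint_ok({'a': 'FINISHED', 'b': 'FAILED'}, 'any')  # True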
def stage(self, **kwargs):
    """
    Copy data to all contexts except 'final' from source URI.

    Set _staged indicator to True on success.

    Args:
        self: class instance.
        **kwargs: additional arguments required by DataManager.copy().

    Returns:
        True or False.
    """
    for context in self._parsed_data_uris:
        if context != self._source_context:
            if self._clean:
                # remove target URI first
                pass

            Log.some().debug('staging data: {}->{} to {}->{}'.format(
                self._source_context,
                self._parsed_data_uris[self._source_context]['chopped_uri'],
                context,
                self._parsed_data_uris[context]['chopped_uri']
            ))

            if context != 'final':
                if not DataManager.copy(
                        parsed_src_uri=self._parsed_data_uris[
                            self._source_context
                        ],
                        parsed_dest_uri=self._parsed_data_uris[context],
                        **kwargs
                ):
                    msg = 'cannot stage data by copying from {} to {}'.format(
                        self._parsed_data_uris[
                            self._source_context
                        ]['chopped_uri'],
                        self._parsed_data_uris[context]['chopped_uri']
                    )
                    Log.an().error(msg)
                    return self._fatal(msg)

    self._staged = True

    return True
def make_def(self):
    """
    Generate the GeneFlow app definition.

    Args:
        self: class instance

    Returns:
        On success: True.
        On failure: False.
    """
    Log.some().info('compiling %s', str(self._path / 'app.yaml.j2'))

    if not TemplateCompiler.compile_template(
            None,
            'app.yaml.j2.j2',
            str(self._path / 'app.yaml.j2'),
            **self._config
    ):
        Log.an().error('cannot compile GeneFlow app definition template')
        return False

    return True
def run_workflow(job, config, log_level):
    """
    Run a GeneFlow workflow.

    Args:
        job: job dict describing run.
        config: GeneFlow configuration dict.
        log_level: logging level for this run.

    Returns:
        On success: Workflow job dict.
        On failure: False.
    """
    if job['log']:
        # reconfig log location for this run
        Log.config(log_level, job['log'])

    Log.some().info('job loaded: %s -> %s', job['name'], job['id'])

    # run job
    workflow = Workflow(job['id'], config)
    if not workflow.initialize():
        Log.an().error('workflow initialization failed: job_id=%s', job['id'])
        return False

    Log.some().info('running workflow:\n%s', str(workflow))

    if not workflow.run():
        Log.an().error('workflow run failed: job_id=%s', job['id'])
        return False

    Log.some().info('workflow complete:\n%s', str(workflow))

    return workflow.get_job()
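# Hypothetical caller sketch: run_workflow() expects a job dict with at least
# 'id', 'name', and 'log' keys plus a GeneFlow config dict; the values below
# are placeholders.
#
#   job = {'id': 'job-0001', 'name': 'example-job', 'log': None}
#   result = run_workflow(job, config_dict, log_level='info')
#   if result is False:
#       sys.exit(1)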
def __init__(self, job, config, parsed_job_work_uri):
    """
    Instantiate LocalWorkflow class.
    """
    self._job = job
    self._config = config
    self._parsed_job_work_uri = parsed_job_work_uri

    # drmaa library for grid engine
    self._drmaa_session = drmaa.Session()

    Log.some().debug('DRMAA contact strings: {}'.format(
        self._drmaa_session.contact
    ))
    Log.some().debug('DRMAA systems: {}'.format(
        self._drmaa_session.drmsInfo
    ))
    Log.some().debug('DRMAA implementations: {}'.format(
        self._drmaa_session.drmaaImplementation
    ))
    Log.some().debug('DRMAA version: {}'.format(
        self._drmaa_session.version
    ))
def check_running_jobs(self):
    """
    Check the status/progress of all map-reduce items and update _map
    status.

    Args:
        self: class instance.

    Returns:
        True.
    """
    # check if jobs are still running
    for map_item in self._map:
        map_item['status'] = self._agave['agave_wrapper'].jobs_get_status(
            map_item['run'][map_item['attempt']]['agave_job_id']
        )
        # for status failures, set to 'UNKNOWN'
        if not map_item['status']:
            msg = 'cannot get job status for step "{}"'\
                .format(self._step['name'])
            Log.a().warning(msg)
            map_item['status'] = 'UNKNOWN'

        # set status of run-attempt
        map_item['run'][map_item['attempt']]['status'] = map_item['status']

        # check hpc job ids
        if map_item['run'][map_item['attempt']]['hpc_job_id']:
            # already have it
            continue

        # job id listed in history
        response = self._agave['agave_wrapper'].jobs_get_history(
            map_item['run'][map_item['attempt']]['agave_job_id']
        )
        if not response:
            msg = 'cannot get hpc job id for job: agave_job_id={}'.format(
                map_item['run'][map_item['attempt']]['agave_job_id']
            )
            Log.a().warning(msg)
            continue

        for item in response:
            if item['status'] == 'QUEUED':
                match = re.match(
                    r'^HPC.*local job (\d*)$', item['description']
                )
                if match:
                    map_item['run'][map_item['attempt']]['hpc_job_id'] \
                        = match.group(1)

                    # log hpc job id
                    Log.some().debug(
                        '[step.%s]: hpc job id: %s -> %s',
                        self._step['name'],
                        map_item['template']['output'],
                        match.group(1)
                    )

                    break

        if map_item['status'] == 'FAILED' and map_item['attempt'] < 5:
            # retry job if not at limit
            if not self.retry_failed(map_item):
                Log.a().warning(
                    '[step.%s]: cannot retry failed agave job (%s)',
                    self._step['name'],
                    map_item['template']['output']
                )

    self._update_status_db(self._status, '')

    return True
def _run_map(self, map_item):
    """
    Run a job for each map item and store the job ID.

    Args:
        self: class instance.
        map_item: map item object (item of self._map).

    Returns:
        On success: True.
        On failure: False.
    """
    # load default app inputs, overwrite with template inputs
    inputs = {}
    for input_key in self._app['inputs']:
        if input_key in map_item['template']:
            if map_item['template'][input_key]:
                # only include an input if the value is a non-empty string
                inputs[input_key] = urllib.parse.quote(
                    str(map_item['template'][input_key]), safe='/:'
                )
        else:
            if self._app['inputs'][input_key]['default']:
                # only include an input if the value is a non-empty string
                inputs[input_key] = urllib.parse.quote(
                    str(self._app['inputs'][input_key]['default']), safe='/:'
                )

    # load default app parameters, overwrite with template parameters
    parameters = {}
    for param_key in self._app['parameters']:
        if param_key in map_item['template']:
            if self._app['parameters'][param_key]['type'] in ['int', 'long']:
                parameters[param_key] = int(map_item['template'][param_key])
            elif self._app['parameters'][param_key]['type'] in ['float', 'double']:
                parameters[param_key] = float(map_item['template'][param_key])
            else:
                parameters[param_key] = str(map_item['template'][param_key])
        else:
            if self._app['parameters'][param_key]['default'] not in [None, '']:
                parameters[param_key] \
                    = self._app['parameters'][param_key]['default']

    # add execution method as parameter
    parameters['exec_method'] = self._step['execution']['method']

    # add execution init commands if 'init' param given
    if 'init' in self._step['execution']['parameters']:
        parameters['exec_init'] = self._step['execution']['parameters']['init']

    # construct agave app template
    name = 'gf-{}-{}-{}'.format(
        str(map_item['attempt']),
        slugify(self._step['name'], regex_pattern=r'[^-a-z0-9_]+'),
        slugify(map_item['template']['output'], regex_pattern=r'[^-a-z0-9_]+')
    )
    name = name[:62]+'..' if len(name) > 64 else name
    archive_path = '{}/{}'.format(
        self._agave['parsed_archive_uri']['chopped_path'], name
    )

    app_template = {
        'name': name,
        'appId': self._app['implementation']['agave']['agave_app_id'],
        'archive': True,
        'inputs': inputs,
        'parameters': parameters,
        'archiveSystem': self._agave['parsed_archive_uri']['authority'],
        'archivePath': archive_path
    }

    # specify processors if 'slots' param given
    if 'slots' in self._step['execution']['parameters']:
        app_template['processorsPerNode'] = int(
            self._step['execution']['parameters']['slots']
        )

    # specify memory if 'mem' param given
    if 'mem' in self._step['execution']['parameters']:
        app_template['memoryPerNode'] = '{}'.format(
            self._step['execution']['parameters']['mem']
        )

    Log.some().debug(
        "[step.%s]: agave app template:\n%s",
        self._step['name'],
        pprint.pformat(app_template)
    )

    # delete archive path if it exists
    if DataManager.exists(
            uri=self._agave['parsed_archive_uri']['chopped_uri']+'/'+name,
            agave=self._agave
    ):
        if not DataManager.delete(
                uri=self._agave['parsed_archive_uri']['chopped_uri']+'/'+name,
                agave=self._agave
        ):
            Log.a().warning(
                'cannot delete archive uri: %s/%s',
                self._agave['parsed_archive_uri']['chopped_uri'], name
            )

    # submit job
    job = self._agave['agave_wrapper'].jobs_submit(app_template)
    if not job:
        msg = 'agave jobs submit failed for "{}"'.format(
            app_template['name']
        )
        Log.an().error(msg)
        return self._fatal(msg)

    # log agave job id
    Log.some().debug(
        '[step.%s]: agave job id: %s -> %s',
        self._step['name'],
        map_item['template']['output'],
        job['id']
    )

    # record job info
    map_item['run'][map_item['attempt']]['agave_job_id'] = job['id']
    map_item['run'][map_item['attempt']]['archive_uri'] = '{}/{}'.format(
        self._agave['parsed_archive_uri']['chopped_uri'], name
    )
    map_item['run'][map_item['attempt']]['hpc_job_id'] = ''

    # set status of process
    map_item['status'] = 'PENDING'
    map_item['run'][map_item['attempt']]['status'] = 'PENDING'

    return True
def run(self):
    """
    Run Workflow.

    Args:
        self: class instance

    Returns:
        On success: True.
        On failure: False.
    """
    self._update_status_db('RUNNING', '')

    for node_name in self._dag.get_topological_sort():
        node = self._dag.graph().nodes[node_name]
        if node['type'] == 'input':

            Log.some().debug('[%s]: staging input', node_name)
            if not node['node'].stage(
                    move_final=False,
                    **{
                        context: self._workflow_context[context]
                        .get_context_options()
                        for context in self._workflow_context
                    }
            ):
                msg = 'staging failed for input {}'.format(node_name)
                Log.an().error(msg)
                return self._fatal(msg)

        else:
            # step node

            # reinit connection to exec context
            if not self._re_init():
                msg = 'cannot reinit exec context'
                Log.an().error(msg)
                return self._fatal(msg)

            Log.some().info(
                '[%s]: app: %s:%s [%s]',
                node_name,
                node['node']._app['name'],
                node['node']._app['version'],
                node['node']._app['git']
            )

            Log.some().debug('[%s]: iterating map uri', node_name)
            if not node['node'].iterate_map_uri():
                msg = 'iterate map uri failed for step {}'.format(node_name)
                Log.an().error(msg)
                return self._fatal(msg)

            # run jobs for step
            Log.some().info('[%s]: running', node_name)
            if not node['node'].run():
                msg = 'run failed for step {}'.format(node_name)
                Log.an().error(msg)
                return self._fatal(msg)

            # poll until job(s) done
            while not node['node'].all_done():
                node['node'].check_running_jobs()
                time.sleep(self._config['run_poll_delay'])

            Log.some().debug('[%s]: all jobs complete', node_name)

            # check if step satisfies checkpoint of all, any, or none job completion
            if not node['node'].checkpoint():
                msg = 'failed checkpoint for step {}'.format(node_name)
                Log.an().error(msg)
                return self._fatal(msg)

            # cleanup jobs
            Log.some().debug('[%s]: cleaning', node_name)
            if not node['node'].clean_up():
                msg = 'clean up failed for step {}'.format(node_name)
                Log.an().error(msg)
                return self._fatal(msg)

            # stage outputs (non-final)
            Log.some().debug('[%s]: staging output', node_name)
            if not node['node'].stage(
                    **{
                        context: self._workflow_context[context]
                        .get_context_options()
                        for context in self._workflow_context
                    }
            ):
                msg = 'staging failed for step {}'.format(node_name)
                Log.an().error(msg)
                return self._fatal(msg)

    # stage final outputs
    for node_name in self._dag.get_topological_sort():
        node = self._dag.graph().nodes[node_name]
        if node['type'] == 'step':

            Log.some().debug('[%s]: staging final output', node_name)
            if not node['node'].stage_final(
                    **{
                        context: self._workflow_context[context]
                        .get_context_options()
                        for context in self._workflow_context
                    }
            ):
                msg = 'staging final output failed for step {}'.format(node_name)
                Log.an().error(msg)
                return self._fatal(msg)

            Log.some().info('[%s]: complete', node_name)

    self._update_status_db('FINISHED', '')

    return True
def _run_map(self, map_item):
    """
    Run a job for each map item and store the job ID.

    Args:
        self: class instance.
        map_item: map item object (item of self._map)

    Returns:
        On success: True.
        On failure: False.
    """
    # load default app inputs, overwrite with template inputs
    inputs = {}
    for input_key in self._app['inputs']:
        if input_key in map_item['template']:
            inputs[input_key] = urllib.parse.quote(
                str(map_item['template'][input_key] or ''), safe='/:'
            )
        else:
            inputs[input_key] = urllib.parse.quote(
                str(self._app['inputs'][input_key]['default'] or ''),
                safe='/:'
            )

    # load default app parameters, overwrite with template parameters
    parameters = {}
    for param_key in self._app['parameters']:
        if param_key in map_item['template']:
            parameters[param_key] = map_item['template'][param_key]
        else:
            parameters[param_key] \
                = self._app['parameters'][param_key]['default']

    # add execution method as parameter
    parameters['exec_method'] = self._step['execution']['method']

    # construct agave app template
    name = 'gf-{}-{}-{}'.format(
        str(map_item['attempt']),
        slugify(self._step['name']),
        slugify(map_item['template']['output'])
    )
    name = name[:62] + '..' if len(name) > 64 else name
    archive_path = '{}/{}'.format(
        self._agave['parsed_archive_uri']['chopped_path'], name
    )

    app_template = {
        'name': name,
        'appId': self._app['definition']['agave']['agave_app_id'],
        'archive': True,
        'inputs': inputs,
        'parameters': parameters,
        'archiveSystem': self._agave['parsed_archive_uri']['authority'],
        'archivePath': archive_path
    }
    Log.some().debug(
        "agave app template:\n%s", pprint.pformat(app_template)
    )

    # delete archive path if it exists
    if DataManager.exists(
            uri=self._agave['parsed_archive_uri']['chopped_uri'] + '/' + name,
            agave=self._agave
    ):
        if not DataManager.delete(
                uri=self._agave['parsed_archive_uri']['chopped_uri'] + '/' + name,
                agave=self._agave
        ):
            Log.a().warning(
                'cannot delete archive uri: %s/%s',
                self._agave['parsed_archive_uri']['chopped_uri'], name
            )

    # submit job
    job = self._agave['agave_wrapper'].jobs_submit(app_template)
    if not job:
        msg = 'agave jobs submit failed for "{}"'.format(
            app_template['name']
        )
        Log.an().error(msg)
        return self._fatal(msg)

    # log agave job id
    Log.some().debug(
        'agave job id: %s -> %s',
        map_item['template']['output'], job['id']
    )

    # record job info
    map_item['run'][map_item['attempt']]['agave_job_id'] = job['id']
    map_item['run'][map_item['attempt']]['archive_uri'] = '{}/{}'.format(
        self._agave['parsed_archive_uri']['chopped_uri'], name
    )
    map_item['run'][map_item['attempt']]['hpc_job_id'] = ''

    # set status of process
    map_item['status'] = 'PENDING'
    map_item['run'][map_item['attempt']]['status'] = 'PENDING'

    return True
def check_running_jobs(self):
    """
    Check the status/progress of all map-reduce items and update _map
    status.

    Args:
        self: class instance.

    Returns:
        True.
    """
    # check if jobs are still running
    for map_item in self._map:

        if map_item['status'] not in ['FINISHED', 'FAILED', 'PENDING']:
            map_item['status'] = self._agave['agave_wrapper'].jobs_get_status(
                map_item['run'][map_item['attempt']]['agave_job_id']
            )

            # for status failures, set to 'UNKNOWN'
            if not map_item['status']:
                msg = 'cannot get job status for step "{}"'\
                    .format(self._step['name'])
                Log.a().warning(msg)
                map_item['status'] = 'UNKNOWN'

            if map_item['status'] in ['FINISHED', 'FAILED']:
                # status changed to finished or failed
                Log.a().debug(
                    '[step.%s]: exit status: %s -> %s',
                    self._step['name'],
                    map_item['template']['output'],
                    map_item['status']
                )

                # decrease num running procs
                if self._num_running > 0:
                    self._num_running -= 1

        # check hpc job ids
        if (
                map_item['status'] != 'PENDING'
                and not map_item['run'][map_item['attempt']].get('hpc_job_id', '')
        ):
            # job id listed in history
            response = self._agave['agave_wrapper'].jobs_get_history(
                map_item['run'][map_item['attempt']]['agave_job_id']
            )
            if not response:
                msg = 'cannot get hpc job id for job: agave_job_id={}'.format(
                    map_item['run'][map_item['attempt']]['agave_job_id']
                )
                Log.a().warning(msg)

            else:
                for item in response:
                    if item['status'] == 'QUEUED':
                        match = re.match(
                            r'^HPC.*local job (\d*)$', item['description']
                        )
                        if match:
                            map_item['run'][map_item['attempt']]['hpc_job_id'] \
                                = match.group(1)

                            # log hpc job id
                            Log.some().debug(
                                '[step.%s]: hpc job id: %s -> %s',
                                self._step['name'],
                                map_item['template']['output'],
                                match.group(1)
                            )

                            break

        map_item['run'][map_item['attempt']]['status'] = map_item['status']

        if map_item['status'] == 'FAILED' and map_item['attempt'] < 5:
            if self._throttle_limit == 0 or self._num_running < self._throttle_limit:
                # retry job if not at retry or throttle limit
                if not self.retry_failed(map_item):
                    Log.a().warning(
                        '[step.%s]: cannot retry failed agave job (%s)',
                        self._step['name'],
                        map_item['template']['output']
                    )
                else:
                    self._num_running += 1

    self._update_status_db(self._status, '')

    return True
def run(args, other_args, subparser):
    """
    Run GeneFlow workflow engine.

    Args:
        args.workflow_path: workflow definition or package directory.
        args.job: path to job definition

    Returns:
        On success: True.
        On failure: False.
    """
    # get absolute path to workflow
    workflow_path = resolve_workflow_path(args.workflow_path)
    if workflow_path:
        Log.some().info('workflow definition found: %s', workflow_path)
    else:
        Log.an().error(
            'cannot find workflow definition: %s', args.workflow_path
        )
        return False

    # setup environment
    env = Environment(workflow_path=workflow_path)
    if not env.initialize():
        Log.an().error('cannot initialize geneflow environment')
        return False

    # create default config file and SQLite db
    cfg = Config()
    cfg.default(env.get_sqlite_db_path())
    cfg.write(env.get_config_path())
    config_dict = cfg.config('local')

    # load workflow into db
    try:
        data_source = DataSource(config_dict['database'])
    except DataSourceException as err:
        Log.an().error('data source initialization error [%s]', str(err))
        return False

    defs = data_source.import_definition(workflow_path)
    if not defs:
        Log.an().error('workflow definition load failed: %s', workflow_path)
        return False

    if not defs['workflows']:
        Log.an().error('workflow definition load failed: %s', workflow_path)
        return False

    data_source.commit()

    for workflow in defs['workflows']:
        Log.some().info(
            'workflow loaded: %s -> %s', workflow, defs['workflows'][workflow]
        )

    # get workflow definition back from database to ensure
    # that it's a valid definition
    workflow_id = next(iter(defs['workflows'].values()))
    workflow_dict = data_source.get_workflow_def_by_id(workflow_id)
    if not workflow_dict:
        Log.an().error(
            'cannot get workflow definition from data source: workflow_id=%s',
            workflow_id
        )
        return False

    ### define arg parsing methods
    def parse_dynamic_args(workflow_dict):
        """
        Parse dynamic args based on workflow dictionary as well as
        some static args.

        Args:
            other_args: List of remaining args from initial parse of
                workflow path.
            workflow_dict: Workflow dictionary

        Returns:
            On success: List of parsed arguments.
            On failure: False.
        """
        # parse dynamic args. these are determined from workflow definition
        dynamic_parser = argparse.ArgumentParser()

        dynamic_parser.add_argument(
            '-j', '--job', type=str, default=None, dest='job_path',
            help='Job Definition(s)'
        )
        for input_key in workflow_dict['inputs']:
            dynamic_parser.add_argument(
                '--in.{}'.format(input_key),
                dest='inputs.{}'.format(input_key),
                required=False,
                default=workflow_dict['inputs'][input_key]['default'],
                help=workflow_dict['inputs'][input_key]['label']
            )
        for param_key in workflow_dict['parameters']:
            dynamic_parser.add_argument(
                '--param.{}'.format(param_key),
                dest='parameters.{}'.format(param_key),
                required=False,
                default=workflow_dict['parameters'][param_key]['default'],
                help=workflow_dict['parameters'][param_key]['label']
            )
        dynamic_parser.add_argument(
            '-o', '--output', type=str, default='~/geneflow-output',
            help='Output Folder'
        )
        dynamic_parser.add_argument(
            '-n', '--name', type=str, default='geneflow-job',
            help='Name of Job'
        )
        dynamic_parser.add_argument(
            '-w', '--work', nargs='+', type=str, default=[],
            help='Work Directory'
        )
        dynamic_parser.add_argument(
            '--exec-context', '--ec', nargs='+', type=str,
            dest='exec_context', default=[], help='Execution Contexts'
        )
        dynamic_parser.add_argument(
            '--exec-method', '--em', nargs='+', type=str,
            dest='exec_method', default=[], help='Execution Methods'
        )
        dynamic_parser.add_argument(
            '--exec-param', '--ep', nargs='+', type=str,
            dest='exec_param', default=[], help='Execution Parameters'
        )

        dynamic_args = dynamic_parser.parse_known_args(other_args)

        return dynamic_args[0]

    if 'gooey' in sys.modules:
        @Gooey(
            program_name='GeneFlow: {}'.format(workflow_dict['name']),
            program_description=workflow_dict['description'],
            target='gf --log-level={} run {}'.format(
                args.log_level, args.workflow_path
            ),
            monospace_display=True
        )
        def parse_dynamic_args_gui(workflow_dict):
            """
            Parse dynamic args based on workflow dictionary as well as
            some static args. Display a GUI interface.

            Args:
                other_args: List of remaining args from initial parse of
                    workflow path.
                workflow_dict: Workflow dictionary

            Returns:
                On success: List of parsed arguments.
                On failure: False.
            """
            # parse dynamic args. these are determined from workflow definition
            dynamic_parser = GooeyParser()

            input_group = dynamic_parser.add_argument_group(
                "Workflow Inputs",
                "Files or folders to be passed to the workflow"
            )
            for input_key in workflow_dict['inputs']:
                widget = 'FileChooser'
                if workflow_dict['inputs'][input_key]['type'] == 'Directory':
                    widget = 'DirChooser'
                input_group.add_argument(
                    '--in.{}'.format(input_key),
                    dest='inputs.{}'.format(input_key),
                    required=False,
                    default=workflow_dict['inputs'][input_key]['default'],
                    help=workflow_dict['inputs'][input_key]['label'],
                    widget=widget
                )
            param_group = dynamic_parser.add_argument_group(
                "Workflow Parameters",
                "Number or string parameters to be passed to the workflow"
            )
            for param_key in workflow_dict['parameters']:
                param_group.add_argument(
                    '--param.{}'.format(param_key),
                    dest='parameters.{}'.format(param_key),
                    required=False,
                    default=workflow_dict['parameters'][param_key]['default'],
                    help=workflow_dict['parameters'][param_key]['label']
                )
            job_group = dynamic_parser.add_argument_group(
                "Job Options",
                "Output/intermediate folders and job name"
            )
            job_group.add_argument(
                '-o', '--output', type=str, default='~/geneflow-output',
                help='Output Folder', widget='DirChooser'
            )
            job_group.add_argument(
                '-n', '--name', type=str, default='geneflow-job',
                help='Name of Job'
            )
            job_group.add_argument(
                '-w', '--work', nargs='+', type=str, default=[],
                help='Work Directory'
            )
            exec_group = dynamic_parser.add_argument_group(
                "Execution Options",
                "Customize workflow execution"
            )
            exec_group.add_argument(
                '--exec-context', '--ec', nargs='+', type=str,
                dest='exec_context', default=[], help='Execution Contexts'
            )
            exec_group.add_argument(
                '--exec-method', '--em', nargs='+', type=str,
                dest='exec_method', default=[], help='Execution Methods'
            )
            exec_group.add_argument(
                '--exec-param', '--ep', nargs='+', type=str,
                dest='exec_param', default=[], help='Execution Parameters'
            )

            dynamic_args = dynamic_parser.parse_args(other_args)

            return dynamic_args

    # get dynamic args
    if args.gui and 'gooey' in sys.modules:
        dynamic_args = parse_dynamic_args_gui(workflow_dict)
    else:
        dynamic_args = parse_dynamic_args(workflow_dict)

    # get absolute path to job file if provided
    job_path = None
    if dynamic_args.job_path:
        job_path = Path(dynamic_args.job_path).absolute()

    # load job definition if provided
    jobs_dict = {}
    gf_def = Definition()
    if job_path:
        if not gf_def.load(job_path):
            Log.an().error('Job definition load failed')
            return False
        jobs_dict = gf_def.jobs()
    else:
        # create default definition
        jobs_dict = {
            'job': {
                'name': 'GeneFlow job',
                'output_uri': 'geneflow_output',
                'work_uri': {
                    'local': '~/.geneflow/work'
                }
            }
        }

    # override with known cli parameters
    apply_job_modifiers(
        jobs_dict,
        [
            'name={}'.format(dynamic_args.name),
            'output_uri={}'.format(dynamic_args.output)
        ]
    )

    # insert workflow name into job, if not provided
    workflow_name = next(iter(defs['workflows']))
    for job in jobs_dict.values():
        if 'workflow_name' not in job:
            job['workflow_name'] = workflow_name

    # add inputs and parameters to job definition
    apply_job_modifiers(
        jobs_dict,
        [
            '{}={}'.format(dynamic_arg, getattr(dynamic_args, dynamic_arg))
            for dynamic_arg in vars(dynamic_args)
            if dynamic_arg.startswith('inputs.')
            or dynamic_arg.startswith('parameters.')
        ]
    )

    # add work URIs to job definition
    work_uris = {}
    for work_arg in dynamic_args.work:
        parsed_work_uri = URIParser.parse(work_arg)
        if not parsed_work_uri:
            # skip if invalid URI
            Log.a().warning('invalid work uri: %s', work_arg)
        else:
            work_uris[parsed_work_uri['scheme']] \
                = parsed_work_uri['chopped_uri']

    apply_job_modifiers(
        jobs_dict,
        [
            'work_uri.{}={}'.format(context, work_uris[context])
            for context in work_uris
        ]
    )

    # add execution options to job definition
    apply_job_modifiers(
        jobs_dict,
        [
            'execution.context.{}={}'.format(*exec_arg.split(':', 1)[0:2])
            for exec_arg in dynamic_args.exec_context
        ] + [
            'execution.method.{}={}'.format(*exec_arg.split(':', 1)[0:2])
            for exec_arg in dynamic_args.exec_method
        ] + [
            'execution.parameters.{}={}'.format(*exec_arg.split(':', 1)[0:2])
            for exec_arg in dynamic_args.exec_param
        ]
    )

    # get default values from workflow definition
    for job in jobs_dict.values():
        if 'inputs' not in job:
            job['inputs'] = {}
        if 'parameters' not in job:
            job['parameters'] = {}
        for input_key in workflow_dict['inputs']:
            if input_key not in job['inputs']:
                job['inputs'][input_key] \
                    = workflow_dict['inputs'][input_key]['default']
        for param_key in workflow_dict['parameters']:
            if param_key not in job['parameters']:
                job['parameters'][param_key] \
                    = workflow_dict['parameters'][param_key]['default']

    # expand URIs
    for job in jobs_dict.values():
        # output URI
        parsed_uri = URIParser.parse(job['output_uri'])
        if not parsed_uri:
            Log.an().error('invalid output uri: %s', job['output_uri'])
            return False
        # expand relative path if local
        if parsed_uri['scheme'] == 'local':
            job['output_uri'] = str(
                Path(parsed_uri['chopped_path']).expanduser().resolve()
            )
        # work URIs
        for context in job['work_uri']:
            parsed_uri = URIParser.parse(job['work_uri'][context])
            if not parsed_uri:
                Log.an().error('invalid work uri: %s', job['work_uri'])
                return False
            # expand relative path if local
            if parsed_uri['scheme'] == 'local':
                job['work_uri'][context] = str(
                    Path(parsed_uri['chopped_path']).expanduser().resolve()
                )
        # input URIs
        for input_key in job['inputs']:
            parsed_uri = URIParser.parse(job['inputs'][input_key])
            if not parsed_uri:
                Log.an().error(
                    'invalid input uri: %s', job['inputs'][input_key]
                )
                return False
            # expand relative path if local
            if parsed_uri['scheme'] == 'local':
                job['inputs'][input_key] = str(
                    Path(parsed_uri['chopped_path']).expanduser().resolve()
                )

    # import jobs into database
    job_ids = data_source.import_jobs_from_dict(jobs_dict)
    if job_ids is False:
        Log.an().error('cannot import jobs')
        return False
    data_source.commit()

    # create process pool to run workflows in parallel
    pool = Pool(min(5, len(job_ids)))
    jobs = [
        {'name': job, 'id': job_ids[job], 'log': None} for job in job_ids
    ]

    result = pool.map(
        partial(
            geneflow.cli.common.run_workflow,
            config=config_dict,
            log_level=args.log_level
        ),
        jobs
    )

    pool.close()
    pool.join()

    if not all(result):
        Log.an().error('some jobs failed')

    return result