def copy(cls, src_uri=None, parsed_src_uri=None, dest_uri=None,
         parsed_dest_uri=None, **kwargs):
    """
    Copy data to/from/within workflow contexts.

    Source and destination URIs are parsed to extract contexts, and the
    appropriate copy method is called accordingly. Either the raw or the
    parsed form of each URI may be given; if both are given, the parsed
    form is used.

    Args:
        src_uri: Source URI.
        parsed_src_uri: Source URI, already parsed.
        dest_uri: Destination URI.
        parsed_dest_uri: Destination URI, already parsed.
        **kwargs: Other arguments specific to context.

    Returns:
        On success: True.
        On failure: False.

    """
    # parse and validate src URI
    if not parsed_src_uri:
        parsed_src_uri = URIParser.parse(src_uri)
        if not parsed_src_uri:
            Log.an().error('invalid src uri: %s', src_uri)
            return False

    # parse and validate dest URI
    if not parsed_dest_uri:
        parsed_dest_uri = URIParser.parse(dest_uri)
        if not parsed_dest_uri:
            Log.an().error('invalid dest uri: %s', dest_uri)
            return False

    # check if copy method exists for contexts
    try:
        copy_func = getattr(
            cls,
            '_copy_{}_{}'.format(
                parsed_src_uri['scheme'], parsed_dest_uri['scheme']
            )
        )
    except AttributeError:
        Log.an().error(
            '_copy_%s_%s method not defined',
            parsed_src_uri['scheme'], parsed_dest_uri['scheme']
        )
        return False

    # forward only the kwargs that match the source or destination scheme
    return copy_func(
        parsed_src_uri,
        parsed_dest_uri,
        **{
            list_item: kwargs[list_item]
            for list_item in set(
                [parsed_src_uri['scheme'], parsed_dest_uri['scheme']]
            )
        }
    )
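# A minimal usage sketch for copy(), assuming a module path of
# geneflow.data_manager and an already-constructed Agave wrapper object;
# both are assumptions, not verified against this package. It illustrates
# the scheme-based dispatch: a local -> agave copy resolves to
# _copy_local_agave, and only kwargs keyed by the two schemes are forwarded.
def _example_copy_usage(agave_wrapper):
    """Hypothetical helper demonstrating DataManager.copy."""
    from geneflow.data_manager import DataManager  # import path assumed

    ok = DataManager.copy(
        src_uri='/home/user/data',                 # 'local' scheme
        dest_uri='agave://data.system/apps/test',  # 'agave' scheme
        local={},                                  # options for 'local'
        agave={'agave_wrapper': agave_wrapper}     # options for 'agave'
    )
    if not ok:
        raise RuntimeError('copy failed')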
def mkdir(cls, uri=None, parsed_uri=None, recursive=False, **kwargs):
    """
    Create directory at URI.

    URIs are parsed to extract contexts, and the appropriate method is
    called. Either uri or parsed_uri may be specified; if both are
    specified, parsed_uri is used.

    Args:
        uri: URI to create.
        parsed_uri: URI to create, already parsed.
        recursive: If True, recursively create parent directories.
        **kwargs: Other arguments specific to context.

    Returns:
        On success: True.
        On failure: False.

    """
    # parse and validate URI
    if not parsed_uri:
        parsed_uri = URIParser.parse(uri)
        if not parsed_uri:
            Log.an().error('invalid uri: %s', uri)
            return False

    # check if the mkdir method exists for context
    if recursive:
        try:
            mkdir_func = getattr(
                cls, '_mkdir_recursive_{}'.format(parsed_uri['scheme'])
            )
        except AttributeError:
            Log.an().error(
                '_mkdir_recursive_%s method not defined',
                parsed_uri['scheme']
            )
            return False
    else:
        try:
            mkdir_func = getattr(
                cls, '_mkdir_{}'.format(parsed_uri['scheme'])
            )
        except AttributeError:
            Log.an().error(
                '_mkdir_%s method not defined', parsed_uri['scheme']
            )
            return False

    # always remove final slash from URI before calling mkdir
    return mkdir_func(URIParser.parse(parsed_uri['chopped_uri']), **kwargs)
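# A minimal usage sketch for mkdir(), assuming the same module path as in
# the copy sketch above. With recursive=True and a 'local' URI, dispatch
# resolves to _mkdir_recursive_local; any trailing slash is removed before
# the handler is called.
def _example_mkdir_usage():
    """Hypothetical helper demonstrating DataManager.mkdir."""
    from geneflow.data_manager import DataManager  # import path assumed

    if not DataManager.mkdir(
            uri='/tmp/geneflow/work/jobs', recursive=True, local={}
    ):
        raise RuntimeError('mkdir failed')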
def upload_agave_test_data(self):
    """
    Upload Agave test data from workflow package.

    Args:
        self: class instance.

    Returns:
        On success: True.
        On failure: False.

    """
    if (
            not self._agave_wrapper
            or not self._agave_params
            or not self._agave_params.get('agave')
    ):
        Log.a().warning('must provide agave parameters to upload test data')
        return False

    # create main test data URI
    parsed_base_test_uri = URIParser.parse(
        'agave://{}/{}'.format(
            self._agave_params['agave']['deploymentSystem'],
            self._agave_params['agave']['testDataDir']
        )
    )
    Log.some().info(
        'creating base test data uri: %s',
        parsed_base_test_uri['chopped_uri']
    )
    if not DataManager.mkdir(
            parsed_uri=parsed_base_test_uri,
            recursive=True,
            agave={'agave_wrapper': self._agave_wrapper}
    ):
        Log.a().warning(
            'cannot create base test data uri: %s',
            parsed_base_test_uri['chopped_uri']
        )
        return False

    # upload test data
    parsed_local_test_uri = URIParser.parse(str(Path(self._path) / 'data'))
    parsed_agave_test_uri = URIParser.parse(
        '{}/{}'.format(
            parsed_base_test_uri['chopped_uri'], Path(self._path).name
        )
    )
    Log.some().info(
        'copying test data from %s to %s',
        parsed_local_test_uri['chopped_uri'],
        parsed_agave_test_uri['chopped_uri']
    )
    if not DataManager.copy(
            parsed_src_uri=parsed_local_test_uri,
            parsed_dest_uri=parsed_agave_test_uri,
            local={},
            agave={'agave_wrapper': self._agave_wrapper}
    ):
        Log.a().warning(
            'cannot copy test data from %s to %s',
            parsed_local_test_uri['chopped_uri'],
            parsed_agave_test_uri['chopped_uri']
        )
        return False

    return True
def list(cls, uri=None, parsed_uri=None, globstr='*', **kwargs):
    """
    List data in various contexts.

    URIs are parsed to extract contexts, and the appropriate method is
    called. Either uri or parsed_uri may be specified; if both are
    specified, parsed_uri is used.

    Args:
        uri: URI to list.
        parsed_uri: URI to list, already parsed.
        globstr: Glob pattern used to filter the listing.
        **kwargs: Other arguments specific to context.

    Returns:
        On success: list of matching names.
        On failure: False.

    """
    # parse and validate URI
    if not parsed_uri:
        parsed_uri = URIParser.parse(uri)
        if not parsed_uri:
            Log.an().error('invalid uri: %s', uri)
            return False

    # check if list method exists for context
    try:
        list_func = getattr(cls, '_list_{}'.format(parsed_uri['scheme']))
    except AttributeError:
        Log.an().error('_list_%s method not defined', parsed_uri['scheme'])
        return False

    return list_func(parsed_uri, globstr, **kwargs)
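# A minimal usage sketch for list(), assuming the same module path as
# above. Failure is signaled by False, so the result should be checked
# with 'is False' rather than simple truthiness (an empty listing is also
# falsy).
def _example_list_usage():
    """Hypothetical helper demonstrating DataManager.list."""
    from geneflow.data_manager import DataManager  # import path assumed

    files = DataManager.list(
        uri='/tmp/geneflow/work', globstr='*.txt', local={}
    )
    if files is False:
        raise RuntimeError('list failed')
    for name in files:
        print(name)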
def _init_archive_uri(self):
    """
    Initialize and validate Agave job archive URI.

    Args:
        None.

    Returns:
        On success: True.
        On failure: False.

    """
    if 'agave' not in self._parsed_job_work_uri:
        Log.an().error('job work uri must include an agave context')
        return False

    # construct archive URI
    self._parsed_archive_uri = URIParser.parse(
        '{}/_agave_jobs'.format(
            self._parsed_job_work_uri['agave']['chopped_uri']
        )
    )
    if not self._parsed_archive_uri:
        Log.an().error(
            'invalid job work uri: %s', self._parsed_job_work_uri['agave']
        )
        return False

    # create URI
    if not DataManager.mkdir(
            parsed_uri=self._parsed_archive_uri,
            recursive=True,
            agave=self.get_context_options()
    ):
        Log.an().error(
            'cannot create agave archive uri: %s',
            self._parsed_archive_uri['chopped_uri']
        )
        return False

    return True
def _mkdir_recursive_agave(uri, agave):
    """
    Recursively create agave directory specified by URI.

    Args:
        uri: parsed URI to create.
        agave: dict that contains:
            agave_wrapper: Agave wrapper object.

    Returns:
        On success: True.
        On failure: False.

    """
    if uri['folder'] != '/':
        # make sure parent folder exists first
        parent_uri = URIParser.parse(
            '{}://{}{}'.format(
                uri['scheme'], uri['authority'], uri['folder']
            )
        )
        if not _exists_agave(parent_uri, agave):
            # parent folder does not exist, create
            if not _mkdir_recursive_agave(parent_uri, agave):
                Log.an().error(
                    'cannot create parent folder at uri: %s',
                    parent_uri['chopped_uri']
                )
                return False

    return _mkdir_agave(uri, agave)
def _init_data_context_set(self):
    """
    Initialize the set of data contexts, which is determined by the
    workflow inputs and output.

    Args:
        self: class instance.

    Returns:
        On success: True.
        On failure: False.

    """
    # check input URIs for data contexts
    for input_key in self._workflow['inputs']:
        parsed_uri = URIParser.parse(
            self._workflow['inputs'][input_key]['value'][0]
        )
        if not parsed_uri:
            msg = 'invalid input uri: {}'.format(
                self._workflow['inputs'][input_key]['value'][0]
            )
            Log.an().error(msg)
            return self._fatal(msg)

        self._data_contexts.add(parsed_uri['scheme'])

    # add output URI data context
    parsed_output_uri = URIParser.parse(self._job['output_uri'])
    if not parsed_output_uri:
        msg = 'invalid base of job output uri: {}'.format(
            self._job['output_uri']
        )
        Log.an().error(msg)
        return self._fatal(msg)

    self._data_contexts.add(parsed_output_uri['scheme'])

    # check validity of data contexts
    for context in self._data_contexts:
        if not Contexts.is_data_context(context):
            msg = 'invalid data context: {}'.format(context)
            Log.an().error(msg)
            return self._fatal(msg)

    Log.some().debug('data contexts: %s', self._data_contexts)

    return True
def _init_app_paths(self):
    """
    Add app paths to environment PATH for local workflows.

    The package path contains the workflow definition YAML file and shell
    scripts for calling individual apps used in a workflow.

    Args:
        None.

    Returns:
        On success: True.
        On failure: False.

    """
    parsed_uri = URIParser.parse(self._workflow_path)
    if not parsed_uri:
        Log.an().error('invalid workflow path: %s', self._workflow_path)
        return False

    apps_uri = ('{}{}' if parsed_uri['folder'] == '/' else '{}/{}')\
        .format(parsed_uri['folder'], 'apps')
    parsed_apps_uri = URIParser.parse(apps_uri)
    if not parsed_apps_uri:
        Log.an().error('cannot construct apps uri: %s', apps_uri)
        return False

    if not DataManager.exists(parsed_uri=parsed_apps_uri):
        # no apps directory
        return True

    for app_dir in DataManager.list(parsed_uri=parsed_apps_uri):
        try:
            os.environ['PATH'] = '{}{}{}'.format(
                os.path.join(
                    parsed_apps_uri['chopped_path'], app_dir, 'assets'
                ),
                os.pathsep,
                os.environ['PATH']
            )
        except OSError as err:
            Log.an().error('workflow app pathmunge error [%s]', str(err))
            return False

    return True
def initialize(self):
    """
    Initialize instance of StageableData.

    Parses URIs with GeneFlow.URIParser and checks source context
    validity.

    Args:
        self: class instance.

    Returns:
        On success: True.
        On failure: False.

    """
    # parse data uris
    for context in self._data_uris:
        parsed_uri = URIParser.parse(self._data_uris[context])
        if not parsed_uri:
            msg = 'invalid data uri for context: {}->{}'.format(
                context, self._data_uris[context]
            )
            Log.an().error(msg)
            return self._fatal(msg)

        # path cannot be root (/)
        if parsed_uri['chopped_path'] == '/':
            msg = 'context data uri cannot be root (/): {}->{}'.format(
                context, self._data_uris[context]
            )
            Log.an().error(msg)
            return self._fatal(msg)

        self._parsed_data_uris[context] = parsed_uri

    # make sure source_context is one of the data URI contexts
    if self._source_context not in self._parsed_data_uris:
        msg = 'source context must be one of the data uri contexts: {}'\
            .format(self._source_context)
        Log.an().error(msg)
        return self._fatal(msg)

    return True
def files_list(self, system_id, file_path, depth=1):
    """
    Wrap AgavePy file listing command.

    Args:
        self: class instance.
        system_id: Identifier for Agave storage system.
        file_path: Path for file listing.
        depth: Maximum recursion depth for subdirectory listings; a depth
            of -1 means unlimited depth.

    Returns:
        List of dicts describing files, each with 'path', 'name', and
        'type' keys.

    """
    files = []

    file_list = self._agave.files.list(
        systemId=system_id, filePath=file_path + '/', limit=1000000
    )
    for f in file_list:
        if f.name[:1] != '.' and f.name != '':
            file_uri = URIParser.parse(
                '{}://{}{}'.format('agave', system_id, f.path)
            )
            files.append({
                'path': file_uri['folder'],
                'name': file_uri['name'],
                'type': f.type
            })

    # list all subdirectories if not at max depth
    # depth of -1 means unlimited depth
    files_subdirs = {}
    if depth > 1 or depth == -1:
        for f in files:
            if f['type'] == 'dir':
                files_subdirs[f['name']] = self.files_list(
                    system_id,
                    f['path'] + '/' + f['name'],
                    depth - 1 if depth > 1 else depth
                )

    # append all items in files_subdirs to files
    return files + list(itertools.chain(*files_subdirs.values()))
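# A minimal usage sketch for files_list(), assuming an initialized instance
# of the wrapper class defining files_list above, passed in as 'wrapper';
# the system id and path are illustrative. depth=1 lists only the top
# level; depth=-1 recurses without limit.
def _example_files_list_usage(wrapper):
    """Hypothetical helper demonstrating files_list recursion depths."""
    top_level = wrapper.files_list('data.system', '/home/user/project')
    everything = wrapper.files_list(
        'data.system', '/home/user/project', depth=-1
    )
    for f in everything:
        print('{}/{} ({})'.format(f['path'], f['name'], f['type']))
    return top_level, everything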
def exists(cls, uri=None, parsed_uri=None, **kwargs):
    """
    Check if URI exists.

    URIs are parsed to extract contexts, and the appropriate method is
    called. Either uri or parsed_uri may be specified; if both are
    specified, parsed_uri is used.

    Args:
        uri: URI to check.
        parsed_uri: URI to check, already parsed.
        **kwargs: Other arguments specific to context.

    Returns:
        True if the URI exists, False if it doesn't exist, None if an
        exception occurs.

    """
    # parse and validate URI
    if not parsed_uri:
        parsed_uri = URIParser.parse(uri)
        if not parsed_uri:
            Log.an().error('invalid uri: %s', uri)
            return None

    # check if the exists method exists for context
    try:
        exists_func = getattr(
            cls, '_exists_{}'.format(parsed_uri['scheme'])
        )
    except AttributeError:
        Log.an().error(
            '_exists_%s method not defined', parsed_uri['scheme']
        )
        return None

    return exists_func(parsed_uri, **kwargs)
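# A minimal usage sketch for exists(), assuming the same module path as
# above. The return value is tri-state (True/False/None), so identity
# checks are needed to distinguish "does not exist" from "could not check".
def _example_exists_usage():
    """Hypothetical helper demonstrating the tri-state exists() result."""
    from geneflow.data_manager import DataManager  # import path assumed

    result = DataManager.exists(uri='/tmp/geneflow/work', local={})
    if result is None:
        print('could not check uri (parse error or exception)')
    elif result is False:
        print('uri does not exist')
    else:
        print('uri exists')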
def delete(cls, uri=None, parsed_uri=None, **kwargs):
    """
    Delete URI.

    URIs are parsed to extract contexts, and the appropriate method is
    called. Either uri or parsed_uri may be specified; if both are
    specified, parsed_uri is used.

    Args:
        uri: URI to delete.
        parsed_uri: URI to delete, already parsed.
        **kwargs: Other arguments specific to context.

    Returns:
        On success: True.
        On failure: False.

    """
    # parse and validate URI
    if not parsed_uri:
        parsed_uri = URIParser.parse(uri)
        if not parsed_uri:
            Log.an().error('invalid uri: %s', uri)
            return False

    # check if the delete method exists for context
    try:
        delete_func = getattr(
            cls, '_delete_{}'.format(parsed_uri['scheme'])
        )
    except AttributeError:
        Log.an().error(
            '_delete_%s method not defined', parsed_uri['scheme']
        )
        return False

    return delete_func(parsed_uri, **kwargs)
def _parse_map_uri(self):
    """
    Parse and validate the map URI, for map-reduce processing.

    The map URI can point to the output URIs of previous workflow steps.
    The map URI template value can take the following forms:

        {workflow->input-name}: 'input-name' must be part of
            workflow-level inputs (i.e., self._inputs).
        {step-name->output}: 'step-name' must be a valid step name, and
            must be listed in the 'depend' list.

    Args:
        self: class instance.

    Returns:
        On success: True.
        On failure: False.

    """
    if not self._step['map']['uri']:
        # map URI is an optional definition field
        self._map_uri = ''
    else:
        match = re.match(
            r'{([^{}]+)->([^{}]+)}', self._step['map']['uri']
        )
        if match:
            if match.group(1) == 'workflow':
                # use workflow-level input uri
                # check if uri name is in input list
                if match.group(2) in self._inputs:
                    # make sure the input URI to be used as the map URI
                    # is valid
                    self._parsed_map_uri = URIParser.parse(
                        self._inputs[match.group(2)]
                    )
                    if not self._parsed_map_uri:
                        msg = 'invalid map uri for inputs.{}: {}'.format(
                            match.group(2), self._inputs[match.group(2)]
                        )
                        Log.an().error(msg)
                        return self._fatal(msg)

                    self._map_uri = self._parsed_map_uri['chopped_uri']

                else:
                    msg = 'invalid template reference to input: {}'.format(
                        self._step['map']['uri']
                    )
                    Log.an().error(msg)
                    return self._fatal(msg)

            else:
                # use uri from previous step
                # check if previous step is a dependency
                if match.group(1) in self._step['depend']:
                    if match.group(2) == 'output':
                        self._map_uri = self._depend_uris[
                            match.group(1)]['chopped_uri']
                        self._parsed_map_uri = self._depend_uris[
                            match.group(1)]
                    else:
                        msg = (
                            'invalid template reference, must be '
                            '"output": {}'
                        ).format(self._step['map']['uri'])
                        Log.an().error(msg)
                        return self._fatal(msg)

                else:
                    # error, not a dependency
                    msg = (
                        'template reference to step must be listed as '
                        'dependent: {}'
                    ).format(self._step['map']['uri'])
                    Log.an().error(msg)
                    return self._fatal(msg)

        else:
            # invalid format
            msg = 'invalid template value for step map uri: {}'.format(
                self._step['map']['uri']
            )
            Log.an().error(msg)
            return self._fatal(msg)

    return True
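# A self-contained demonstration of the template pattern parsed by
# _parse_map_uri above: group 1 is either 'workflow' or a step name, and
# group 2 is an input name or the literal 'output'. The template names
# here are illustrative.
def _example_map_uri_patterns():
    """Show how the map URI template regex splits its two fields."""
    import re

    pattern = re.compile(r'{([^{}]+)->([^{}]+)}')

    m = pattern.match('{workflow->input_files}')
    assert m and m.group(1) == 'workflow' and m.group(2) == 'input_files'

    m = pattern.match('{align->output}')
    assert m and m.group(1) == 'align' and m.group(2) == 'output'

    # malformed templates do not match
    assert pattern.match('workflow->input_files') is None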
def _init_job_uris(self):
    """
    Initialize all work and output URIs.

    Args:
        self: class instance.

    Returns:
        On success: True.
        On failure: False.

    """
    # name of the job directory
    job_dir = slugify(self._job['name'], regex_pattern=r'[^-a-z0-9_]+')
    job_dir_hash = '{}-{}'.format(job_dir, self._job['job_id'][:8])

    # validate work URI for each exec context
    # use the 'data_scheme' for each execution context
    # and place into a set to remove repeats
    for context in {
            Contexts.get_data_scheme_of_exec_context(con)
            for con in self._exec_contexts
    }:
        # work_uri must be set for each exec_context
        if context not in self._job['work_uri']:
            msg = 'missing work_uri for context: {}'.format(context)
            Log.an().error(msg)
            return self._fatal(msg)

        parsed_uri = URIParser.parse(self._job['work_uri'][context])
        if not parsed_uri:
            msg = 'invalid base of job work uri for context: {}->{}'\
                .format(context, self._job['work_uri'][context])
            Log.an().error(msg)
            return self._fatal(msg)

        # append hashed job dir to each context
        full_job_work_uri = (
            '{}{}' if parsed_uri['chopped_path'] == '/' else '{}/{}'
        ).format(parsed_uri['chopped_uri'], job_dir_hash)

        # validate again after appending
        parsed_job_work_uri = URIParser.parse(full_job_work_uri)
        if not parsed_job_work_uri:
            msg = 'invalid job work uri for context: {}->{}'.format(
                context, full_job_work_uri
            )
            Log.an().error(msg)
            return self._fatal(msg)

        self._parsed_job_work_uri[context] = parsed_job_work_uri

    # validate output URI
    parsed_uri = URIParser.parse(self._job['output_uri'])
    if not parsed_uri:
        msg = 'invalid base of job output uri: {}'.format(
            self._job['output_uri']
        )
        Log.an().error(msg)
        return self._fatal(msg)

    # append job dir (hashed or not) to output uri
    full_job_output_uri = (
        '{}{}' if parsed_uri['chopped_path'] == '/' else '{}/{}'
    ).format(
        parsed_uri['chopped_uri'],
        job_dir if self._job['no_output_hash'] else job_dir_hash
    )

    # validate again after appending
    parsed_job_output_uri = URIParser.parse(full_job_output_uri)
    if not parsed_job_output_uri:
        msg = 'invalid job output uri: {}'.format(full_job_output_uri)
        Log.an().error(msg)
        return self._fatal(msg)

    self._parsed_job_output_uri = parsed_job_output_uri

    return True
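# A small sketch of the job directory naming used above; 'slugify' here is
# assumed to be the python-slugify package. regex_pattern defines which
# characters are replaced, so anything outside [-a-z0-9_] collapses to a
# separator, and the first 8 characters of the job id are appended.
def _example_job_dir_name():
    """Hypothetical helper showing hashed job directory names."""
    from slugify import slugify  # python-slugify (import assumed)

    job_name = 'My RNA-Seq Job'
    job_id = '0123456789abcdef'
    job_dir = slugify(job_name, regex_pattern=r'[^-a-z0-9_]+')
    job_dir_hash = '{}-{}'.format(job_dir, job_id[:8])
    print(job_dir_hash)  # expected: my-rna-seq-job-01234567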
def _init_context_uris(self):
    """
    Generate all context URIs for this workflow run.

    Context URIs are generated based on contexts given in
    _parsed_job_work_uri, and the "final" context for steps given in the
    _parsed_job_output_uri.

    Args:
        None.

    Returns:
        On failure: Raises WorkflowDAGException.

    """
    self._context_uris['inputs'] = {}
    self._context_uris['steps'] = {'final': {}}
    self._parsed_context_uris['inputs'] = {}
    self._parsed_context_uris['steps'] = {'final': {}}

    # init contexts in parsed_job_work_uri for inputs and steps
    for context in self._parsed_job_work_uri:
        self._context_uris['inputs'][context] = {}
        self._context_uris['steps'][context] = {}
        self._parsed_context_uris['inputs'][context] = {}
        self._parsed_context_uris['steps'][context] = {}

        for node_name in self._topo_sort:
            node = self._graph.nodes[node_name]
            if node['type'] == 'input':
                if node['source_context'] == context:
                    # use original input URI
                    parsed_uri = URIParser.parse(
                        self._workflow['inputs'][node['name']]['value']
                    )
                    if not parsed_uri:
                        msg = 'invalid input uri: {}'.format(
                            self._workflow['inputs'][node['name']]['value']
                        )
                        raise WorkflowDAGException(msg)

                    self._context_uris['inputs'][context][node['name']] \
                        = parsed_uri['chopped_uri']
                    self._parsed_context_uris['inputs'][context][
                        node['name']] = parsed_uri

                else:
                    # switch context of input URI
                    new_base_uri = '{}/_input-{}'.format(
                        self._parsed_job_work_uri[context]['chopped_uri'],
                        slugify(node['name'])
                    )

                    # create new base URI
                    if not DataManager.mkdir(
                            uri=new_base_uri,
                            recursive=True,
                            **{context: self._context_options[context]}
                    ):
                        msg = 'cannot create new base uri for input: {}'\
                            .format(new_base_uri)
                        Log.an().error(msg)
                        raise WorkflowDAGException(msg)

                    # switch input URI base
                    switched_uri = URIParser.switch_context(
                        self._workflow['inputs'][node['name']]['value'],
                        new_base_uri
                    )
                    if not switched_uri:
                        msg = (
                            'cannot switch input uri context to '
                            'new base URI: {}->{}'
                        ).format(
                            self._workflow['inputs'][node['name']]['value'],
                            new_base_uri
                        )
                        Log.an().error(msg)
                        raise WorkflowDAGException(msg)

                    self._context_uris['inputs'][context][node['name']] \
                        = switched_uri['chopped_uri']
                    self._parsed_context_uris['inputs'][context][
                        node['name']] = switched_uri

            else:  # node['type'] == 'step'
                self._context_uris['steps'][context][node['name']] \
                    = '{}/{}'.format(
                        self._parsed_job_work_uri[context]['chopped_uri'],
                        slugify(node['name'])
                    )
                self._parsed_context_uris['steps'][context][node['name']] \
                    = URIParser.parse(
                        self._context_uris['steps'][context][node['name']]
                    )

    # init final contexts for steps
    for node_name in self._topo_sort:
        node = self._graph.nodes[node_name]
        if node['type'] == 'step':
            self._context_uris['steps']['final'][node['name']] \
                = '{}/{}'.format(
                    self._parsed_job_output_uri['chopped_uri'],
                    slugify(node['name'])
                )
            self._parsed_context_uris['steps']['final'][node['name']] \
                = URIParser.parse(
                    self._context_uris['steps']['final'][node['name']]
                )
def register_agave_app(self, agave, agave_config, agave_params,
                       agave_publish):
    """
    Register app in Agave.

    Args:
        self: class instance.
        agave: Agave connection object.
        agave_config: Agave connection configuration.
        agave_params: dict of Agave parameters, including the deployment
            system and apps directory.
        agave_publish: if True, publish the app after registration.

    Returns:
        On success: dict with the registered app's id, version, and
            revision.
        On failure: False.

    """
    Log.some().info('registering agave app %s', str(self._path))
    Log.some().info('app version: %s', self._config['version'])

    # compile agave app template
    if not TemplateCompiler.compile_template(
            self._path,
            'agave-app-def.json.j2',
            self._path / 'agave-app-def.json',
            version=self._config['version'],
            agave=agave_params['agave']
    ):
        Log.a().warning(
            'cannot compile agave app "%s" definition from template',
            self._app['name']
        )
        return False

    # create main apps URI
    parsed_agave_apps_uri = URIParser.parse(
        'agave://{}/{}'.format(
            agave_params['agave']['deploymentSystem'],
            agave_params['agave']['appsDir']
        )
    )
    Log.some().info(
        'creating main apps uri: %s', parsed_agave_apps_uri['chopped_uri']
    )
    if not DataManager.mkdir(
            parsed_uri=parsed_agave_apps_uri,
            recursive=True,
            agave={
                'agave': agave,
                'agave_config': agave_config
            }
    ):
        Log.a().warning('cannot create main agave apps uri')
        return False

    # delete app uri if it exists
    parsed_app_uri = URIParser.parse(
        'agave://{}/{}/{}'.format(
            agave_params['agave']['deploymentSystem'],
            agave_params['agave']['appsDir'],
            self._app['folder']
        )
    )
    Log.some().info(
        'deleting app uri if it exists: %s', parsed_app_uri['chopped_uri']
    )
    if not DataManager.delete(
            parsed_uri=parsed_app_uri,
            agave={
                'agave': agave,
                'agave_config': agave_config
            }
    ):
        # log warning, but ignore; deleting a non-existent uri returns
        # False
        Log.a().warning(
            'cannot delete app uri: %s', parsed_app_uri['chopped_uri']
        )

    # upload app assets
    parsed_assets_uri = URIParser.parse(str(self._path / 'assets'))
    Log.some().info(
        'copying app assets from %s to %s',
        parsed_assets_uri['chopped_uri'],
        parsed_app_uri['chopped_uri']
    )
    if not DataManager.copy(
            parsed_src_uri=parsed_assets_uri,
            parsed_dest_uri=parsed_app_uri,
            local={},
            agave={
                'agave': agave,
                'agave_config': agave_config
            }
    ):
        Log.a().warning(
            'cannot copy app assets from %s to %s',
            parsed_assets_uri['chopped_uri'],
            parsed_app_uri['chopped_uri']
        )
        return False

    # upload test script
    parsed_test_uri = URIParser.parse(
        '{}/{}'.format(parsed_app_uri['chopped_uri'], 'test')
    )
    Log.some().info(
        'creating test uri: %s', parsed_test_uri['chopped_uri']
    )
    if not DataManager.mkdir(
            parsed_uri=parsed_test_uri,
            recursive=True,
            agave={
                'agave': agave,
                'agave_config': agave_config
            }
    ):
        Log.a().warning(
            'cannot create test uri: %s', parsed_test_uri['chopped_uri']
        )
        return False

    parsed_local_test_script = URIParser.parse(
        str(self._path / 'test' / 'test.sh')
    )
    parsed_agave_test_script = URIParser.parse(
        '{}/{}'.format(parsed_test_uri['chopped_uri'], 'test.sh')
    )
    Log.some().info(
        'copying test script from %s to %s',
        parsed_local_test_script['chopped_uri'],
        parsed_agave_test_script['chopped_uri']
    )
    if not DataManager.copy(
            parsed_src_uri=parsed_local_test_script,
            parsed_dest_uri=parsed_agave_test_script,
            local={},
            agave={
                'agave': agave,
                'agave_config': agave_config
            }
    ):
        Log.a().warning(
            'cannot copy test script from %s to %s',
            parsed_local_test_script['chopped_uri'],
            parsed_agave_test_script['chopped_uri']
        )
        return False

    # update existing app, or register new app
    Log.some().info('registering agave app')

    app_definition = self._yaml_to_dict(
        str(self._path / 'agave-app-def.json')
    )
    if not app_definition:
        Log.a().warning(
            'cannot load agave app definition: %s',
            str(self._path / 'agave-app-def.json')
        )
        return False

    agwrap = AgaveAppsAddUpdate(agave, agave_config)
    app_add_result = agwrap.call(app_definition)
    if not app_add_result:
        Log.a().warning(
            'cannot register agave app:\n%s',
            pprint.pformat(app_definition)
        )
        return False

    register_result = {}

    # publish app
    if agave_publish:
        Log.some().info('publishing agave app')

        agwrap = AgaveAppsPublish(agave, agave_config)
        app_publish_result = agwrap.call(app_add_result['id'])
        if not app_publish_result:
            Log.a().warning(
                'cannot publish agave app: %s', app_add_result['id']
            )
            return False

        # return published id and revision
        register_result = {
            'id': app_publish_result['id'],
            'version': self._config['version'],
            'revision': 'u{}'.format(app_publish_result['revision'])
        }

    else:
        # return un-published id and blank revision
        register_result = {
            'id': app_add_result['id'],
            'version': self._config['version'],
            'revision': ''
        }

    return register_result
def _get_map_uri_list(self):
    """
    Get the contents of the map URI (agave URI).

    Args:
        self: class instance.

    Returns:
        On success: list of dicts, each with the 'chopped_uri' and base
            'filename' of an item in the map URI.
        On failure: False.

    """
    combined_file_list = []
    for uri in self._parsed_map_uris:
        # make sure map URI is a compatible scheme (agave)
        if uri['scheme'] != 'agave':
            msg = 'invalid map uri scheme for this step: {}'.format(
                uri['scheme']
            )
            Log.an().error(msg)
            return self._fatal(msg)

        # get file list from URI
        file_list = DataManager.list(
            parsed_uri=uri,
            globstr=self._step['map']['glob'],
            agave=self._agave
        )
        if file_list is False:
            msg = 'cannot get contents of map uri: {}'.format(
                uri['chopped_uri']
            )
            Log.an().error(msg)
            return self._fatal(msg)

        if self._step['map']['inclusive']:
            # filter with glob
            if glob.globfilter(
                    [uri['name']],
                    self._step['map']['glob'],
                    flags=glob.EXTGLOB | glob.GLOBSTAR
            ):
                combined_file_list.append({
                    'chopped_uri': '{}://{}{}'.format(
                        uri['scheme'], uri['authority'], uri['folder']
                    ),
                    'filename': uri['name']
                })

        for f in file_list:
            if '/' in f:
                # reparse uri to correctly represent recursive elements
                new_uri = URIParser.parse(
                    '{}/{}'.format(uri['chopped_uri'], f)
                )
                combined_file_list.append({
                    'chopped_uri': '{}://{}{}'.format(
                        new_uri['scheme'],
                        new_uri['authority'],
                        new_uri['folder']
                    ),
                    'filename': new_uri['name']
                })
            else:
                combined_file_list.append({
                    'chopped_uri': uri['chopped_uri'],
                    'filename': f
                })

    return combined_file_list
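# A self-contained sketch of the glob filtering used above; 'glob' in
# _get_map_uri_list is assumed to be wcmatch.glob, which provides
# globfilter and the EXTGLOB/GLOBSTAR flags used there. File names are
# illustrative.
def _example_globfilter():
    """Show how wcmatch.glob.globfilter selects matching names."""
    from wcmatch import glob  # wcmatch package (import assumed)

    names = ['sample1.fastq', 'notes.txt', 'sample2.fastq']
    matched = glob.globfilter(
        names, '*.fastq', flags=glob.EXTGLOB | glob.GLOBSTAR
    )
    print(matched)  # expected: ['sample1.fastq', 'sample2.fastq']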
def step_impl(context):
    for uri in context.uris:
        parsed_uri = URIParser.parse(uri)
        assert parsed_uri
        context.uris[uri] = parsed_uri
def _run_map(self, map_item):
    """
    Run a job for each map item and store the proc and PID.

    Args:
        self: class instance.
        map_item: map item object (item of self._map).

    Returns:
        On success: True.
        On failure: False.

    """
    # load default app inputs, overwrite with template inputs
    inputs = {}
    for input_key in self._app['inputs']:
        if input_key in map_item['template']:
            inputs[input_key] = map_item['template'][input_key]
        else:
            if self._app['inputs'][input_key]['default']:
                inputs[input_key] \
                    = self._app['inputs'][input_key]['default']

    # load default app parameters, overwrite with template parameters
    parameters = {}
    for param_key in self._app['parameters']:
        if param_key in map_item['template']:
            parameters[param_key] = map_item['template'][param_key]
        else:
            if self._app['parameters'][param_key]['default'] \
                    not in [None, '']:
                parameters[param_key] \
                    = self._app['parameters'][param_key]['default']

    # construct shell command
    cmd = self._app['implementation']['local']['script']
    for input_key in inputs:
        if inputs[input_key]:
            cmd += ' --{}="{}"'.format(
                input_key,
                URIParser.parse(inputs[input_key])['chopped_path']
            )
    for param_key in parameters:
        if param_key == 'output':
            cmd += ' --output="{}/{}"'.format(
                self._parsed_data_uris[self._source_context]
                ['chopped_path'],
                parameters['output']
            )
        else:
            cmd += ' --{}="{}"'.format(param_key, parameters[param_key])

    # add execution method
    cmd += ' --exec_method="{}"'.format(self._step['execution']['method'])

    # specify execution init commands if 'init' param given
    if 'init' in self._step['execution']['parameters']:
        cmd += ' --exec_init="{}"'.format(
            self._step['execution']['parameters']['init']
        )

    # add stdout and stderr
    log_path = '{}/_log/gf-{}-{}-{}'.format(
        self._parsed_data_uris[self._source_context]['chopped_path'],
        map_item['attempt'],
        slugify(self._step['name'], regex_pattern=r'[^-a-z0-9_]+'),
        slugify(map_item['template']['output'], regex_pattern=r'[^-a-z0-9_]+')
    )
    cmd += ' > "{}.out" 2> "{}.err"'.format(log_path, log_path)

    Log.a().debug('command: %s', cmd)

    # launch process
    proc = ShellWrapper.spawn(cmd)
    if proc is False:
        msg = 'shell process error: {}'.format(cmd)
        Log.an().error(msg)
        return self._fatal(msg)

    # record job info
    map_item['run'][map_item['attempt']]['proc'] = proc
    map_item['run'][map_item['attempt']]['pid'] = proc.pid

    # set status of process
    map_item['status'] = 'RUNNING'
    map_item['run'][map_item['attempt']]['status'] = 'RUNNING'

    return True
def _init_context_uris(self):
    """
    Generate all context URIs for this workflow run.

    Context URIs are generated based on contexts given in
    _parsed_job_work_uri, and the "final" context for steps given in the
    _parsed_job_output_uri.

    Args:
        None.

    Returns:
        On failure: Raises WorkflowDAGException.

    """
    self._context_uris['inputs'] = {}
    self._context_uris['steps'] = {'final': {}}
    self._parsed_context_uris['inputs'] = {}
    self._parsed_context_uris['steps'] = {'final': {}}

    # init all data contexts
    for context in {
            Contexts.get_data_scheme_of_exec_context(con)
            for con in self._exec_contexts
    } | self._data_contexts:
        self._context_uris['inputs'][context] = {}
        self._parsed_context_uris['inputs'][context] = {}

        for node_name in self._topo_sort:
            node = self._graph.nodes[node_name]
            if node['type'] == 'input':
                if node['source_context'] == context:
                    # use original input URI
                    parsed_uri = URIParser.parse(
                        self._workflow['inputs'][node['name']]['value']
                    )
                    if not parsed_uri:
                        msg = 'invalid input uri: {}'.format(
                            self._workflow['inputs'][node['name']]['value']
                        )
                        raise WorkflowDAGException(msg)

                    self._context_uris['inputs'][context][node['name']] \
                        = parsed_uri['chopped_uri']
                    self._parsed_context_uris['inputs'][context][
                        node['name']] = parsed_uri

                else:
                    # skip if _parsed_job_work_uri is not defined for this
                    # context; this implies that there is no execution
                    # defined for that context, so no need to set up the
                    # data staging location at the work_uri
                    if context not in self._parsed_job_work_uri:
                        continue

                    # switch context of input URI
                    new_base_uri = '{}/_input-{}'.format(
                        self._parsed_job_work_uri[context]['chopped_uri'],
                        slugify(node['name'], regex_pattern=r'[^-a-z0-9_]+')
                    )

                    # create new base URI
                    if not DataManager.mkdir(
                            uri=new_base_uri,
                            recursive=True,
                            **{context: self._context_options[context]}
                    ):
                        msg = 'cannot create new base uri for input: {}'\
                            .format(new_base_uri)
                        Log.an().error(msg)
                        raise WorkflowDAGException(msg)

                    # switch input URI base
                    switched_uri = URIParser.switch_context(
                        self._workflow['inputs'][node['name']]['value'],
                        new_base_uri
                    )
                    if not switched_uri:
                        msg = (
                            'cannot switch input uri context to '
                            'new base URI: {}->{}'
                        ).format(
                            self._workflow['inputs'][node['name']]['value'],
                            new_base_uri
                        )
                        Log.an().error(msg)
                        raise WorkflowDAGException(msg)

                    self._context_uris['inputs'][context][node['name']] \
                        = switched_uri['chopped_uri']
                    self._parsed_context_uris['inputs'][context][
                        node['name']] = switched_uri

    for context in {
            Contexts.get_data_scheme_of_exec_context(con)
            for con in self._exec_contexts
    }:
        self._context_uris['steps'][context] = {}
        self._parsed_context_uris['steps'][context] = {}

        for node_name in self._topo_sort:
            node = self._graph.nodes[node_name]
            if node['type'] == 'step':
                self._context_uris['steps'][context][node['name']] \
                    = '{}/{}'.format(
                        self._parsed_job_work_uri[context]['chopped_uri'],
                        slugify(node['name'], regex_pattern=r'[^-a-z0-9_]+')
                    )
                self._parsed_context_uris['steps'][context][node['name']] \
                    = URIParser.parse(
                        self._context_uris['steps'][context][node['name']]
                    )

    # init final contexts for steps
    for node_name in self._topo_sort:
        node = self._graph.nodes[node_name]
        if node['type'] == 'step':
            self._context_uris['steps']['final'][node['name']] \
                = '{}/{}'.format(
                    self._parsed_job_output_uri['chopped_uri'],
                    slugify(node['name'], regex_pattern=r'[^-a-z0-9_]+')
                )
            self._parsed_context_uris['steps']['final'][node['name']] \
                = URIParser.parse(
                    self._context_uris['steps']['final'][node['name']]
                )
def run(args, other_args, subparser):
    """
    Run GeneFlow workflow engine.

    Args:
        args.workflow_path: workflow definition or package directory.
        args.job: path to job definition.

    Returns:
        On success: True.
        On failure: False.

    """
    # get absolute path to workflow
    workflow_path = resolve_workflow_path(args.workflow_path)
    if workflow_path:
        Log.some().info('workflow definition found: %s', workflow_path)
    else:
        Log.an().error(
            'cannot find workflow definition: %s', args.workflow_path
        )
        return False

    # setup environment
    env = Environment(workflow_path=workflow_path)
    if not env.initialize():
        Log.an().error('cannot initialize geneflow environment')
        return False

    # create default config file and SQLite db
    cfg = Config()
    cfg.default(env.get_sqlite_db_path())
    cfg.write(env.get_config_path())
    config_dict = cfg.config('local')

    # load workflow into db
    try:
        data_source = DataSource(config_dict['database'])
    except DataSourceException as err:
        Log.an().error('data source initialization error [%s]', str(err))
        return False

    defs = data_source.import_definition(workflow_path)
    if not defs:
        Log.an().error('workflow definition load failed: %s', workflow_path)
        return False

    if not defs['workflows']:
        Log.an().error('workflow definition load failed: %s', workflow_path)
        return False

    data_source.commit()

    for workflow in defs['workflows']:
        Log.some().info(
            'workflow loaded: %s -> %s',
            workflow, defs['workflows'][workflow]
        )

    # get workflow definition back from database to ensure
    # that it's a valid definition
    workflow_id = next(iter(defs['workflows'].values()))
    workflow_dict = data_source.get_workflow_def_by_id(workflow_id)
    if not workflow_dict:
        Log.an().error(
            'cannot get workflow definition from data source: '
            'workflow_id=%s', workflow_id
        )
        return False

    ### define arg parsing methods

    def parse_dynamic_args(workflow_dict):
        """
        Parse dynamic args based on workflow dictionary as well as
        some static args.

        Args:
            other_args: List of remaining args from initial parse of
                workflow path.
            workflow_dict: Workflow dictionary.

        Returns:
            On success: List of parsed arguments.
            On failure: False.

        """
        # parse dynamic args; these are determined from the workflow
        # definition
        dynamic_parser = argparse.ArgumentParser()

        dynamic_parser.add_argument(
            '-j', '--job', type=str, default=None, dest='job_path',
            help='Job Definition(s)'
        )
        for input_key in workflow_dict['inputs']:
            dynamic_parser.add_argument(
                '--in.{}'.format(input_key),
                dest='inputs.{}'.format(input_key),
                required=False,
                default=workflow_dict['inputs'][input_key]['default'],
                help=workflow_dict['inputs'][input_key]['label']
            )
        for param_key in workflow_dict['parameters']:
            dynamic_parser.add_argument(
                '--param.{}'.format(param_key),
                dest='parameters.{}'.format(param_key),
                required=False,
                default=workflow_dict['parameters'][param_key]['default'],
                help=workflow_dict['parameters'][param_key]['label']
            )
        dynamic_parser.add_argument(
            '-o', '--output', type=str, default='~/geneflow-output',
            help='Output Folder'
        )
        dynamic_parser.add_argument(
            '-n', '--name', type=str, default='geneflow-job',
            help='Name of Job'
        )
        dynamic_parser.add_argument(
            '-w', '--work', nargs='+', type=str, default=[],
            help='Work Directory'
        )
        dynamic_parser.add_argument(
            '--exec-context', '--ec', nargs='+', type=str,
            dest='exec_context', default=[], help='Execution Contexts'
        )
        dynamic_parser.add_argument(
            '--exec-method', '--em', nargs='+', type=str,
            dest='exec_method', default=[], help='Execution Methods'
        )
        dynamic_parser.add_argument(
            '--exec-param', '--ep', nargs='+', type=str,
            dest='exec_param', default=[], help='Execution Parameters'
        )

        dynamic_args = dynamic_parser.parse_known_args(other_args)
        return dynamic_args[0]

    if 'gooey' in sys.modules:
        @Gooey(
            program_name='GeneFlow: {}'.format(workflow_dict['name']),
            program_description=workflow_dict['description'],
            target='gf --log-level={} run {}'.format(
                args.log_level, args.workflow_path
            ),
            monospace_display=True
        )
        def parse_dynamic_args_gui(workflow_dict):
            """
            Parse dynamic args based on workflow dictionary as well as
            some static args. Display a GUI interface.

            Args:
                other_args: List of remaining args from initial parse of
                    workflow path.
                workflow_dict: Workflow dictionary.

            Returns:
                On success: List of parsed arguments.
                On failure: False.

            """
            # parse dynamic args; these are determined from the workflow
            # definition
            dynamic_parser = GooeyParser()

            input_group = dynamic_parser.add_argument_group(
                "Workflow Inputs",
                "Files or folders to be passed to the workflow"
            )
            for input_key in workflow_dict['inputs']:
                widget = 'FileChooser'
                if workflow_dict['inputs'][input_key]['type'] == 'Directory':
                    widget = 'DirChooser'
                input_group.add_argument(
                    '--in.{}'.format(input_key),
                    dest='inputs.{}'.format(input_key),
                    required=False,
                    default=workflow_dict['inputs'][input_key]['default'],
                    help=workflow_dict['inputs'][input_key]['label'],
                    widget=widget
                )
            param_group = dynamic_parser.add_argument_group(
                "Workflow Parameters",
                "Number or string parameters to be passed to the workflow"
            )
            for param_key in workflow_dict['parameters']:
                param_group.add_argument(
                    '--param.{}'.format(param_key),
                    dest='parameters.{}'.format(param_key),
                    required=False,
                    default=workflow_dict['parameters'][param_key]['default'],
                    help=workflow_dict['parameters'][param_key]['label']
                )
            job_group = dynamic_parser.add_argument_group(
                "Job Options", "Output/intermediate folders and job name"
            )
            job_group.add_argument(
                '-o', '--output', type=str, default='~/geneflow-output',
                help='Output Folder', widget='DirChooser'
            )
            job_group.add_argument(
                '-n', '--name', type=str, default='geneflow-job',
                help='Name of Job'
            )
            job_group.add_argument(
                '-w', '--work', nargs='+', type=str, default=[],
                help='Work Directory'
            )
            exec_group = dynamic_parser.add_argument_group(
                "Execution Options", "Customize workflow execution"
            )
            exec_group.add_argument(
                '--exec-context', '--ec', nargs='+', type=str,
                dest='exec_context', default=[], help='Execution Contexts'
            )
            exec_group.add_argument(
                '--exec-method', '--em', nargs='+', type=str,
                dest='exec_method', default=[], help='Execution Methods'
            )
            exec_group.add_argument(
                '--exec-param', '--ep', nargs='+', type=str,
                dest='exec_param', default=[], help='Execution Parameters'
            )

            dynamic_args = dynamic_parser.parse_args(other_args)
            return dynamic_args

    # get dynamic args
    if args.gui and 'gooey' in sys.modules:
        dynamic_args = parse_dynamic_args_gui(workflow_dict)
    else:
        dynamic_args = parse_dynamic_args(workflow_dict)

    # get absolute path to job file if provided
    job_path = None
    if dynamic_args.job_path:
        job_path = Path(dynamic_args.job_path).absolute()

    # load job definition if provided
    jobs_dict = {}
    gf_def = Definition()
    if job_path:
        if not gf_def.load(job_path):
            Log.an().error('job definition load failed')
            return False
        jobs_dict = gf_def.jobs()
    else:
        # create default definition
        jobs_dict = {
            'job': {
                'name': 'GeneFlow job',
                'output_uri': 'geneflow_output',
                'work_uri': {
                    'local': '~/.geneflow/work'
                }
            }
        }

    # override with known cli parameters
    apply_job_modifiers(
        jobs_dict,
        [
            'name={}'.format(dynamic_args.name),
            'output_uri={}'.format(dynamic_args.output)
        ]
    )

    # insert workflow name into job, if not provided
    workflow_name = next(iter(defs['workflows']))
    for job in jobs_dict.values():
        if 'workflow_name' not in job:
            job['workflow_name'] = workflow_name

    # add inputs and parameters to job definition
    apply_job_modifiers(
        jobs_dict,
        [
            '{}={}'.format(dynamic_arg, getattr(dynamic_args, dynamic_arg))
            for dynamic_arg in vars(dynamic_args)
            if dynamic_arg.startswith('inputs.')
            or dynamic_arg.startswith('parameters.')
        ]
    )

    # add work URIs to job definition
    work_uris = {}
    for work_arg in dynamic_args.work:
        parsed_work_uri = URIParser.parse(work_arg)
        if not parsed_work_uri:
            # skip if invalid URI
            Log.a().warning('invalid work uri: %s', work_arg)
        else:
            work_uris[parsed_work_uri['scheme']] \
                = parsed_work_uri['chopped_uri']

    apply_job_modifiers(
        jobs_dict,
        [
            'work_uri.{}={}'.format(context, work_uris[context])
            for context in work_uris
        ]
    )

    # add execution options to job definition
    apply_job_modifiers(
        jobs_dict,
        [
            'execution.context.{}={}'.format(*exec_arg.split(':', 1)[0:2])
            for exec_arg in dynamic_args.exec_context
        ] + [
            'execution.method.{}={}'.format(*exec_arg.split(':', 1)[0:2])
            for exec_arg in dynamic_args.exec_method
        ] + [
            'execution.parameters.{}={}'.format(*exec_arg.split(':', 1)[0:2])
            for exec_arg in dynamic_args.exec_param
        ]
    )

    # get default values from workflow definition
    for job in jobs_dict.values():
        if 'inputs' not in job:
            job['inputs'] = {}
        if 'parameters' not in job:
            job['parameters'] = {}
        for input_key in workflow_dict['inputs']:
            if input_key not in job['inputs']:
                job['inputs'][input_key] \
                    = workflow_dict['inputs'][input_key]['default']
        for param_key in workflow_dict['parameters']:
            if param_key not in job['parameters']:
                job['parameters'][param_key] \
                    = workflow_dict['parameters'][param_key]['default']

    # expand URIs
    for job in jobs_dict.values():
        # output URI
        parsed_uri = URIParser.parse(job['output_uri'])
        if not parsed_uri:
            Log.an().error('invalid output uri: %s', job['output_uri'])
            return False
        # expand relative path if local
        if parsed_uri['scheme'] == 'local':
            job['output_uri'] = str(
                Path(parsed_uri['chopped_path']).expanduser().resolve()
            )
        # work URIs
        for context in job['work_uri']:
            parsed_uri = URIParser.parse(job['work_uri'][context])
            if not parsed_uri:
                Log.an().error('invalid work uri: %s', job['work_uri'])
                return False
            # expand relative path if local
            if parsed_uri['scheme'] == 'local':
                job['work_uri'][context] = str(
                    Path(parsed_uri['chopped_path']).expanduser().resolve()
                )
        # input URIs
        for input_key in job['inputs']:
            parsed_uri = URIParser.parse(job['inputs'][input_key])
            if not parsed_uri:
                Log.an().error(
                    'invalid input uri: %s', job['inputs'][input_key]
                )
                return False
            # expand relative path if local
            if parsed_uri['scheme'] == 'local':
                job['inputs'][input_key] = str(
                    Path(parsed_uri['chopped_path']).expanduser().resolve()
                )

    # import jobs into database
    job_ids = data_source.import_jobs_from_dict(jobs_dict)
    if job_ids is False:
        Log.an().error('cannot import jobs')
        return False
    data_source.commit()

    # create process pool to run workflows in parallel
    pool = Pool(min(5, len(job_ids)))
    jobs = [
        {'name': job, 'id': job_ids[job], 'log': None} for job in job_ids
    ]

    result = pool.map(
        partial(
            geneflow.cli.common.run_workflow,
            config=config_dict,
            log_level=args.log_level
        ),
        jobs
    )

    pool.close()
    pool.join()

    if not all(result):
        Log.an().error('some jobs failed')

    return result
def _init_job_uris(self):
    """
    Initialize all work and output URIs.

    Args:
        self: class instance.

    Returns:
        On success: True.
        On failure: False.

    """
    # name of the job directory
    job_dir = '{}-{}'.format(
        slugify(self._job['name']), self._job['job_id'][:8]
    )

    # validate work URI for each context
    for context in self._job['work_uri']:
        parsed_uri = URIParser.parse(self._job['work_uri'][context])
        if not parsed_uri:
            msg = 'invalid base of job work uri for context: {}->{}'\
                .format(context, self._job['work_uri'][context])
            Log.an().error(msg)
            return self._fatal(msg)

        # append job dir to each context
        full_job_work_uri = (
            '{}{}' if parsed_uri['chopped_path'] == '/' else '{}/{}'
        ).format(parsed_uri['chopped_uri'], job_dir)

        # validate again after appending
        parsed_job_work_uri = URIParser.parse(full_job_work_uri)
        if not parsed_job_work_uri:
            msg = 'invalid job work uri for context: {}->{}'.format(
                context, full_job_work_uri
            )
            Log.an().error(msg)
            return self._fatal(msg)

        self._parsed_job_work_uri[context] = parsed_job_work_uri

    # validate output URI
    parsed_uri = URIParser.parse(self._job['output_uri'])
    if not parsed_uri:
        msg = 'invalid base of job output uri: {}'.format(
            self._job['output_uri']
        )
        Log.an().error(msg)
        return self._fatal(msg)

    # append job dir to each context
    full_job_output_uri = (
        '{}{}' if parsed_uri['chopped_path'] == '/' else '{}/{}'
    ).format(parsed_uri['chopped_uri'], job_dir)

    # validate again after appending
    parsed_job_output_uri = URIParser.parse(full_job_output_uri)
    if not parsed_job_output_uri:
        msg = 'invalid job output uri: {}'.format(full_job_output_uri)
        Log.an().error(msg)
        return self._fatal(msg)

    self._parsed_job_output_uri = parsed_job_output_uri

    return True
def run(args):
    """
    Run GeneFlow workflow engine.

    Args:
        args.workflow: workflow definition or package directory.
        args.job_yaml: job definition.

    Returns:
        On success: True.
        On failure: False.

    """
    # get absolute path to workflow
    workflow_yaml = resolve_workflow_path(args.workflow)
    if workflow_yaml:
        Log.some().info('workflow definition found: %s', workflow_yaml)
    else:
        Log.an().error('cannot find workflow definition: %s', args.workflow)
        return False

    # get absolute path to job file if provided
    job_yaml = None
    if args.job_yaml:
        job_yaml = Path(args.job_yaml).absolute()

    # setup environment
    env = Environment(workflow_path=workflow_yaml)
    if not env.initialize():
        Log.an().error('cannot initialize geneflow environment')
        return False

    # create default config file and SQLite db
    cfg = Config()
    cfg.default(env.get_sqlite_db_path())
    cfg.write(env.get_config_path())
    config_dict = cfg.config('local')

    # load workflow into db
    try:
        data_source = DataSource(config_dict['database'])
    except DataSourceException as err:
        Log.an().error('data source initialization error [%s]', str(err))
        return False

    defs = data_source.import_definition(workflow_yaml)
    if not defs:
        Log.an().error('workflow definition load failed: %s', workflow_yaml)
        return False

    if not defs['workflows']:
        Log.an().error('workflow definition load failed: %s', workflow_yaml)
        return False

    data_source.commit()

    for workflow in defs['workflows']:
        Log.some().info(
            'workflow loaded: %s -> %s',
            workflow, defs['workflows'][workflow]
        )

    # load job definition if provided
    jobs_dict = {}
    gf_def = Definition()
    if job_yaml:
        if not gf_def.load(job_yaml):
            Log.an().error('job definition load failed')
            return False
        jobs_dict = gf_def.jobs()
    else:
        # create default definition
        jobs_dict = {
            'job': {
                'name': 'GeneFlow job',
                'output_uri': 'geneflow_output',
                'work_uri': {
                    'local': '~/.geneflow/work'
                }
            }
        }

    # override with cli parameters
    if args.data:
        apply_job_modifiers(jobs_dict, args.data)

    # insert workflow name, if not provided
    workflow_name = next(iter(defs['workflows']))
    for job in jobs_dict.values():
        if 'workflow_name' not in job:
            job['workflow_name'] = workflow_name

    # extract workflow defaults for inputs and parameters if not provided
    # in job definition
    workflow_id = next(iter(defs['workflows'].values()))
    workflow_dict = data_source.get_workflow_def_by_id(workflow_id)
    if not workflow_dict:
        Log.an().error(
            'cannot get workflow definition from data source: '
            'workflow_id=%s', workflow_id
        )
        return False

    for job in jobs_dict.values():
        if 'inputs' not in job:
            job['inputs'] = {}
        if 'parameters' not in job:
            job['parameters'] = {}
        for input_key in workflow_dict['inputs']:
            if input_key not in job['inputs']:
                job['inputs'][input_key] \
                    = workflow_dict['inputs'][input_key]['default']
        for param_key in workflow_dict['parameters']:
            if param_key not in job['parameters']:
                job['parameters'][param_key] \
                    = workflow_dict['parameters'][param_key]['default']

    # expand URIs
    for job in jobs_dict.values():
        # output URI
        parsed_uri = URIParser.parse(job['output_uri'])
        if not parsed_uri:
            Log.an().error('invalid output uri: %s', job['output_uri'])
            return False
        # expand relative path if local
        if parsed_uri['scheme'] == 'local':
            job['output_uri'] = str(
                Path(parsed_uri['chopped_path']).expanduser().resolve()
            )
        # work URIs
        for context in job['work_uri']:
            parsed_uri = URIParser.parse(job['work_uri'][context])
            if not parsed_uri:
                Log.an().error('invalid work uri: %s', job['work_uri'])
                return False
            # expand relative path if local
            if parsed_uri['scheme'] == 'local':
                job['work_uri'][context] = str(
                    Path(parsed_uri['chopped_path']).expanduser().resolve()
                )
        # input URIs
        for input_key in job['inputs']:
            parsed_uri = URIParser.parse(job['inputs'][input_key])
            if not parsed_uri:
                Log.an().error(
                    'invalid input uri: %s', job['inputs'][input_key]
                )
                return False
            # expand relative path if local
            if parsed_uri['scheme'] == 'local':
                job['inputs'][input_key] = str(
                    Path(parsed_uri['chopped_path']).expanduser().resolve()
                )

    # import jobs into database
    job_ids = data_source.import_jobs_from_dict(jobs_dict)
    if job_ids is False:
        Log.an().error('cannot import jobs')
        return False
    data_source.commit()

    # create process pool to run workflows in parallel
    pool = Pool(min(5, len(job_ids)))
    jobs = [
        {
            'name': job,
            'id': job_ids[job],
            'log': None
        } for job in job_ids
    ]

    result = pool.map(
        partial(
            geneflow.cli.common.run_workflow,
            config=config_dict,
            log_level=args.log_level
        ),
        jobs
    )

    pool.close()
    pool.join()

    if not all(result):
        Log.an().error('some jobs failed')

    return result
def _init_graph_structure(self):
    """
    Create empty nodes for each workflow input and step.

    Nodes contain attributes for type (e.g., input or step), contexts for
    data staging (e.g., local or agave), source context, and node. The
    node attribute is initialized as None, but will later be a reference
    to a WorkflowInput or WorkflowStep object.

    Args:
        None.

    Returns:
        On failure: Raises WorkflowDAGException.

    """
    # add empty input nodes to graph
    for input_name in self._workflow['inputs']:
        # extract the input source context
        parsed_input_uri = URIParser.parse(
            self._workflow['inputs'][input_name]['value']
        )
        if not parsed_input_uri:
            msg = 'invalid input uri: {}'.format(
                self._workflow['inputs'][input_name]['value']
            )
            Log.an().error(msg)
            raise WorkflowDAGException(msg)

        source_context = parsed_input_uri['scheme']

        try:
            self._graph.add_node(
                'input.{}'.format(input_name),
                name='{}'.format(input_name),
                type='input',
                contexts={source_context: ''},
                source_context=source_context,
                exec_context=None,
                node=None
            )
        except nx.NetworkXException as err:
            msg = 'cannot add input node "{}" to graph [{}]'.format(
                input_name, str(err)
            )
            Log.an().error(msg)
            raise WorkflowDAGException(msg)

    # add empty step nodes to graph
    for step_name, step in self._workflow['steps'].items():
        # extract the step source context
        source_data_context = Contexts.get_data_scheme_of_exec_context(
            step['execution']['context']
        )
        if not source_data_context:
            msg = 'invalid execution context ({}) for step {}'.format(
                step['execution']['context'], step_name
            )
            Log.an().error(msg)
            raise WorkflowDAGException(msg)

        contexts = {source_data_context: ''}
        if step_name in self._workflow['final_output']:
            contexts['final'] = ''

        try:
            self._graph.add_node(
                'step.{}'.format(step_name),
                name='{}'.format(step_name),
                type='step',
                step=step,
                contexts=contexts,
                source_context=source_data_context,
                exec_context=step['execution']['context'],
                node=None
            )
        except nx.NetworkXException as err:
            msg = 'cannot add step node "{}" to graph [{}]'.format(
                step_name, str(err)
            )
            Log.an().error(msg)
            raise WorkflowDAGException(msg)

    # create graph edges and determine contexts for each node based on
    # dependencies
    for step_name, step in self._workflow['steps'].items():
        # name of this step node
        step_node = 'step.{}'.format(step_name)

        # get all input or step dependencies for this step
        deps = self._get_step_dependencies(step)
        for dep in deps:
            if deps[dep]['name'] == 'workflow':
                # input or parameter dependency
                input_node = 'input.{}'.format(deps[dep]['var'])
                # only add edge if it's an input (not a parameter)
                if input_node in self._graph.nodes:
                    # add graph edge from input to step
                    try:
                        self._graph.add_edge(input_node, step_node)
                    except nx.NetworkXException as err:
                        msg = (
                            'cannot add edge from node "{}" to '
                            'node "{}" [{}]'
                        ).format(input_node, step_node, str(err))
                        Log.an().error(msg)
                        raise WorkflowDAGException(msg)

                    # add context key to dict for input node
                    self._graph.nodes[input_node]['contexts'][
                        Contexts.get_data_scheme_of_exec_context(
                            step['execution']['context']
                        )
                    ] = ''

                else:
                    # if input not found, make sure var refers to
                    # a parameter
                    if not deps[dep]['var'] in self._parameters:
                        msg = (
                            'invalid dependency for step "{}", '
                            'parameter or input "{}" does not exist'
                        ).format(step_name, deps[dep]['var'])
                        Log.an().error(msg)
                        raise WorkflowDAGException(msg)

            else:
                # step dependency
                depend_node = 'step.{}'.format(deps[dep]['name'])
                if not self._graph.has_node(depend_node):
                    msg = (
                        'invalid dependency for step "{}", '
                        'step "{}" does not exist'
                    ).format(step_name, depend_node)
                    Log.an().error(msg)
                    raise WorkflowDAGException(msg)

                # add graph edge from step to step
                try:
                    self._graph.add_edge(depend_node, step_node)
                except nx.NetworkXException as err:
                    msg = (
                        'cannot add edge from node "{}" to '
                        'node "{}" [{}]'
                    ).format(depend_node, step_node, str(err))
                    Log.an().error(msg)
                    raise WorkflowDAGException(msg)

                # add context key to dict for depend node
                self._graph.nodes[depend_node]['contexts'][
                    Contexts.get_data_scheme_of_exec_context(
                        step['execution']['context']
                    )
                ] = ''
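# A self-contained sketch of the DAG pattern used by _init_graph_structure:
# inputs and steps become networkx nodes, dependencies become edges, and a
# topological sort yields a valid execution order. The node names are
# illustrative.
def _example_workflow_dag():
    """Show the node/edge/topological-sort pattern with networkx."""
    import networkx as nx

    graph = nx.DiGraph()
    graph.add_node('input.reads', type='input', node=None)
    graph.add_node('step.align', type='step', node=None)
    graph.add_node('step.sort', type='step', node=None)
    graph.add_edge('input.reads', 'step.align')
    graph.add_edge('step.align', 'step.sort')

    order = list(nx.topological_sort(graph))
    print(order)  # ['input.reads', 'step.align', 'step.sort']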
def _run_map(self, map_item):
    """
    Run a job for each map item and store the job ID.

    Args:
        self: class instance.
        map_item: map item object (item of self._map).

    Returns:
        On success: True.
        On failure: False.

    """
    # load default app inputs, overwrite with template inputs
    inputs = {}
    for input_key in self._app['inputs']:
        if input_key in map_item['template']:
            inputs[input_key] = map_item['template'][input_key]
        else:
            if self._app['inputs'][input_key]['default']:
                inputs[input_key] \
                    = self._app['inputs'][input_key]['default']

    # load default app parameters, overwrite with template parameters
    parameters = {}
    for param_key in self._app['parameters']:
        if param_key in map_item['template']:
            parameters[param_key] = map_item['template'][param_key]
        else:
            if self._app['parameters'][param_key]['default'] \
                    not in [None, '']:
                parameters[param_key] \
                    = self._app['parameters'][param_key]['default']

    # get full path of wrapper script
    path = shutil.which(self._app['implementation']['local']['script'])
    if not path:
        msg = 'wrapper script not found in path: {}'.format(
            self._app['implementation']['local']['script']
        )
        Log.an().error(msg)
        return self._fatal(msg)

    # construct argument list for wrapper script
    args = [path]
    for input_key in inputs:
        if inputs[input_key]:
            args.append('--{}={}'.format(
                input_key,
                URIParser.parse(inputs[input_key])['chopped_path']
            ))
    for param_key in parameters:
        if param_key == 'output':
            args.append('--output={}/{}'.format(
                self._parsed_data_uris[self._source_context][0]
                ['chopped_path'],
                parameters['output']
            ))
        else:
            args.append('--{}={}'.format(param_key, parameters[param_key]))

    # add execution method
    args.append('--exec_method={}'.format(
        self._step['execution']['method']
    ))

    # specify execution init commands if 'init' param given
    if 'init' in self._step['execution']['parameters']:
        args.append('--exec_init={}'.format(
            self._step['execution']['parameters']['init']
        ))

    Log.a().debug(
        '[step.%s]: command: %s -> %s',
        self._step['name'],
        map_item['template']['output'],
        ' '.join(args)
    )

    # construct job name
    name = 'gf-{}-{}-{}'.format(
        map_item['attempt'],
        slugify(self._step['name'], regex_pattern=r'[^-a-z0-9_]+'),
        slugify(map_item['template']['output'], regex_pattern=r'[^-a-z0-9_]+')
    )

    # construct paths for logging stdout and stderr
    log_path = '{}/_log/{}'.format(
        self._parsed_data_uris[self._source_context][0]['chopped_path'],
        name
    )

    # create and populate job template
    jt = self._slurm['drmaa_session'].createJobTemplate()
    jt.remoteCommand = '/bin/bash'
    jt.args = args
    jt.jobName = name
    jt.errorPath = ':{}.err'.format(log_path)
    jt.outputPath = ':{}.out'.format(log_path)

    # pass execution parameters to job template
    native_spec = ' --nodes=1 --ntasks=1'
    if 'queue' in self._step['execution']['parameters']:
        native_spec += ' -p {}'.format(
            self._step['execution']['parameters']['queue']
        )
    if 'slots' in self._step['execution']['parameters']:
        native_spec += ' --cpus-per-task={}'.format(
            self._step['execution']['parameters']['slots']
        )
    if 'other' in self._step['execution']['parameters']:
        native_spec += ' {}'.format(
            self._step['execution']['parameters']['other']
        )
    jt.nativeSpecification = native_spec

    # submit hpc job using drmaa library
    try:
        job_id = self._slurm['drmaa_session'].runJob(jt)
    except drmaa.DrmCommunicationException as err:
        msg = 'cannot submit slurm job for step "{}" [{}]'.format(
            self._step['name'], str(err)
        )
        Log.a().warning(msg)
        # set to failed, but return True so that it's retried
        map_item['status'] = 'FAILED'
        map_item['run'][map_item['attempt']]['status'] = 'FAILED'
        return True

    self._slurm['drmaa_session'].deleteJobTemplate(jt)

    Log.a().debug(
        '[step.%s]: hpc job id: %s -> %s',
        self._step['name'],
        map_item['template']['output'],
        job_id
    )

    # record job info
    map_item['run'][map_item['attempt']]['hpc_job_id'] = job_id

    # set status of process
    map_item['status'] = 'QUEUED'
    map_item['run'][map_item['attempt']]['status'] = 'QUEUED'

    return True
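# A minimal DRMAA submission sketch mirroring the pattern in _run_map
# above; it requires the drmaa package plus a configured DRMAA-compatible
# scheduler (e.g., Slurm), and the command and job name are illustrative.
def _example_drmaa_submit():
    """Submit a one-off shell command through a DRMAA session."""
    import drmaa

    with drmaa.Session() as session:
        jt = session.createJobTemplate()
        jt.remoteCommand = '/bin/bash'
        jt.args = ['-c', 'echo hello']
        jt.jobName = 'gf-example'
        jt.nativeSpecification = ' --nodes=1 --ntasks=1'
        job_id = session.runJob(jt)
        print('submitted job: {}'.format(job_id))
        session.deleteJobTemplate(jt)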