def _init_exec_context_set(self):
    """
    Collect the execution contexts listed under the job's
    execution.context parameters into a set and check each of them.

    Args:
        self: class instance

    Returns:
        On success: True.
        On failure: the value returned by self._fatal().

    """
    # the values of the execution.context mapping are the context names;
    # storing them in a set removes repeats
    context_map = self._job['execution']['context']
    self._exec_contexts = set(context_map.values())

    # stop at the first context rejected by Contexts.is_exec_context
    for exec_context in self._exec_contexts:
        if Contexts.is_exec_context(exec_context):
            continue

        msg = 'invalid exec context: {}'.format(exec_context)
        Log.an().error(msg)
        return self._fatal(msg)

    Log.some().debug('execution contexts: %s', self._exec_contexts)

    return True
def _init_data_context_set(self):
    """
    Populate the set of data contexts from the URI schemes of the workflow
    inputs and of the job output URI, then check each of them.

    Only the first element of each input's 'value' is parsed.

    Args:
        self: class instance

    Returns:
        On success: True.
        On failure: the value returned by self._fatal().

    """
    workflow_inputs = self._workflow['inputs']

    # scheme of the first value of every workflow input
    for input_key in workflow_inputs:
        first_value = workflow_inputs[input_key]['value'][0]
        parsed_input = URIParser.parse(first_value)
        if not parsed_input:
            msg = 'invalid input uri: {}'.format(first_value)
            Log.an().error(msg)
            return self._fatal(msg)

        self._data_contexts.add(parsed_input['scheme'])

    # scheme of the job output URI
    output_uri = self._job['output_uri']
    parsed_output = URIParser.parse(output_uri)
    if not parsed_output:
        msg = 'invalid base of job output uri: {}'.format(output_uri)
        Log.an().error(msg)
        return self._fatal(msg)

    self._data_contexts.add(parsed_output['scheme'])

    # stop at the first context rejected by Contexts.is_data_context
    for data_context in self._data_contexts:
        if Contexts.is_data_context(data_context):
            continue

        msg = 'invalid data context: {}'.format(data_context)
        Log.an().error(msg)
        return self._fatal(msg)

    Log.some().debug('data contexts: %s', self._data_contexts)

    return True
def _init_job_uris(self):
    """
    Build and validate the job work URI of every context and the job
    output URI.

    The job directory name is the slugified job name. Work URIs always get
    the directory name suffixed with the first 8 characters of the job id;
    the output URI gets the plain name when the job's 'no_output_hash' is
    set, and the suffixed name otherwise. Parsed results are stored in
    _parsed_job_work_uri (per context) and _parsed_job_output_uri.

    Args:
        self: class instance

    Returns:
        On success: True.
        On failure: the value returned by self._fatal().

    """
    # job directory name, with and without the job id suffix
    slug = slugify(self._job['name'], regex_pattern=r'[^-a-z0-9_]+')
    hashed_slug = '{}-{}'.format(slug, self._job['job_id'][:8])

    # data scheme of each execution context; the set removes repeats
    work_contexts = {
        Contexts.get_data_scheme_of_exec_context(exec_context)
        for exec_context in self._exec_contexts
    }

    for work_context in work_contexts:
        # every one of these contexts needs a work_uri entry
        if work_context not in self._job['work_uri']:
            msg = 'missing work_uri for context: {}'.format(work_context)
            Log.an().error(msg)
            return self._fatal(msg)

        base_work_uri = self._job['work_uri'][work_context]
        parsed_base = URIParser.parse(base_work_uri)
        if not parsed_base:
            msg = 'invalid base of job work uri for context: {}->{}'.format(
                work_context, base_work_uri
            )
            Log.an().error(msg)
            return self._fatal(msg)

        # a base path of '/' already ends with the separator
        if parsed_base['chopped_path'] == '/':
            template = '{}{}'
        else:
            template = '{}/{}'
        work_uri = template.format(parsed_base['chopped_uri'], hashed_slug)

        # the extended URI has to parse as well
        parsed_work = URIParser.parse(work_uri)
        if not parsed_work:
            msg = 'invalid job work uri for context: {}->{}'.format(
                work_context, work_uri
            )
            Log.an().error(msg)
            return self._fatal(msg)

        self._parsed_job_work_uri[work_context] = parsed_work

    # same procedure for the single output URI
    base_output_uri = self._job['output_uri']
    parsed_base = URIParser.parse(base_output_uri)
    if not parsed_base:
        msg = 'invalid base of job output uri: {}'.format(base_output_uri)
        Log.an().error(msg)
        return self._fatal(msg)

    if parsed_base['chopped_path'] == '/':
        template = '{}{}'
    else:
        template = '{}/{}'
    output_base = parsed_base['chopped_uri']

    # the output directory carries the job id suffix unless disabled
    if self._job['no_output_hash']:
        output_dir = slug
    else:
        output_dir = hashed_slug
    output_uri = template.format(output_base, output_dir)

    parsed_output = URIParser.parse(output_uri)
    if not parsed_output:
        msg = 'invalid job output uri: {}'.format(output_uri)
        Log.an().error(msg)
        return self._fatal(msg)

    self._parsed_job_output_uri = parsed_output

    return True
def _init_graph_structure(self):
    """
    Create empty nodes for each workflow input and step, then connect them.

    Nodes contain attributes for type (e.g., input or step), contexts for
    data staging (e.g., local or agave), source context, and node. The node
    attribute is initialized as None, but will later be a reference to a
    WorkflowInput or WorkflowStep object.

    After all nodes exist, an edge is added from every input or step that a
    step depends on to that step, and the data scheme of the dependent
    step's execution context is added as a key to the 'contexts' dict of
    the node it depends on.

    Args:
        self: class instance.

    Returns:
        On success: None.
        On failure: Raises WorkflowDAGException.

    """
    # add empty input nodes to graph
    for input_name in self._workflow['inputs']:
        input_value = self._workflow['inputs'][input_name]['value']

        # extract the input source context
        parsed_input_uri = URIParser.parse(input_value)
        if not parsed_input_uri:
            msg = 'invalid input uri: {}'.format(input_value)
            Log.an().error(msg)
            raise WorkflowDAGException(msg)

        source_context = parsed_input_uri['scheme']

        try:
            self._graph.add_node(
                'input.{}'.format(input_name),
                name='{}'.format(input_name),
                type='input',
                contexts={source_context: ''},
                source_context=source_context,
                exec_context=None,
                node=None
            )
        except nx.NetworkXException as err:
            msg = 'cannot add input node "{}" to graph [{}]'.format(
                input_name, str(err))
            Log.an().error(msg)
            raise WorkflowDAGException(msg)

    # add empty step nodes to graph
    for step_name, step in self._workflow['steps'].items():
        # extract the step source context
        source_data_context = Contexts.get_data_scheme_of_exec_context(
            step['execution']['context'])
        if not source_data_context:
            msg = 'invalid execution context ({}) for step {}'.format(
                step['execution']['context'], step_name)
            Log.an().error(msg)
            raise WorkflowDAGException(msg)

        contexts = {source_data_context: ''}
        # steps listed in final_output also get the 'final' context
        if step_name in self._workflow['final_output']:
            contexts['final'] = ''

        try:
            self._graph.add_node(
                'step.{}'.format(step_name),
                name='{}'.format(step_name),
                type='step',
                step=step,
                contexts=contexts,
                source_context=source_data_context,
                exec_context=step['execution']['context'],
                node=None
            )
        except nx.NetworkXException as err:
            msg = 'cannot add step node "{}" to graph [{}]'.format(
                step_name, str(err))
            Log.an().error(msg)
            raise WorkflowDAGException(msg)

    # create graph edges and determine contexts for each node based on
    # dependencies
    for step_name, step in self._workflow['steps'].items():
        # name of this step node
        step_node = 'step.{}'.format(step_name)

        # get all input or step dependencies for this step
        deps = self._get_step_dependencies(step)

        # the context key recorded on each dependency is the same for all
        # dependencies of this step, so look it up once
        step_data_context = Contexts.get_data_scheme_of_exec_context(
            step['execution']['context'])

        for dep in deps:
            dependency = deps[dep]

            if dependency['name'] == 'workflow':
                # input or parameter dependency
                source_node = 'input.{}'.format(dependency['var'])
                if source_node not in self._graph.nodes:
                    # not an input, so var must refer to a parameter;
                    # parameters get no edge
                    if dependency['var'] not in self._parameters:
                        msg = (
                            'invalid dependency for step "{}", '
                            'parameter or input "{}" does not exist'
                        ).format(step_name, dependency['var'])
                        Log.an().error(msg)
                        raise WorkflowDAGException(msg)
                    continue
            else:
                # step dependency
                source_node = 'step.{}'.format(dependency['name'])
                if not self._graph.has_node(source_node):
                    msg = (
                        'invalid dependency for step "{}", '
                        'step "{}" does not exist'
                    ).format(step_name, source_node)
                    Log.an().error(msg)
                    raise WorkflowDAGException(msg)

            # add graph edge from the input or step to this step
            try:
                self._graph.add_edge(source_node, step_node)
            except nx.NetworkXException as err:
                msg = (
                    'cannot add edge from node "{}" to '
                    'node "{}" [{}]'
                ).format(source_node, step_node, str(err))
                Log.an().error(msg)
                raise WorkflowDAGException(msg)

            # add context key to dict for the node depended upon
            self._graph.nodes[source_node]['contexts'][step_data_context] = ''
def _init_context_uris(self):
    """
    Generate all context URIs for this workflow run.

    Context URIs are generated based on contexts given in
    _parsed_job_work_uri, and the "final" context for steps given in the
    _parsed_job_output_uri. Each URI is stored in _context_uris and its
    parsed form in _parsed_context_uris, both indexed by 'inputs' or
    'steps', then by context, then by node name.

    For an input in its own source context the original input URI is used.
    For any other context that has a job work URI, a '_input-<name>'
    location under that work URI is created with DataManager.mkdir and the
    input URI is switched to it.

    Args:
        self: class instance.

    Returns:
        On success: None.
        On failure: Raises WorkflowDAGException.

    """
    self._context_uris['inputs'] = {}
    self._context_uris['steps'] = {'final': {}}
    self._parsed_context_uris['inputs'] = {}
    self._parsed_context_uris['steps'] = {'final': {}}

    # data scheme of each execution context; the set removes repeats.
    # Built once and used for both the input and the step contexts.
    exec_data_contexts = {
        Contexts.get_data_scheme_of_exec_context(con)
        for con in self._exec_contexts
    }

    # init all data contexts
    for context in exec_data_contexts | self._data_contexts:

        self._context_uris['inputs'][context] = {}
        self._parsed_context_uris['inputs'][context] = {}

        for node_name in self._topo_sort:
            node = self._graph.nodes[node_name]
            if node['type'] != 'input':
                continue

            if node['source_context'] == context:
                # use original input URI
                input_value = self._workflow['inputs'][node['name']]['value']
                parsed_uri = URIParser.parse(input_value)
                if not parsed_uri:
                    msg = 'invalid input uri: {}'.format(input_value)
                    # log before raising, like every other failure here
                    Log.an().error(msg)
                    raise WorkflowDAGException(msg)

                self._context_uris['inputs'][context][node['name']] = (
                    parsed_uri['chopped_uri'])
                self._parsed_context_uris['inputs'][context][node['name']] = (
                    parsed_uri)
                continue

            # skip if _parsed_job_work_uri is not defined for this context
            # this implies that there is no execution defined for that
            # context, so no need to setup the data staging location at
            # the work_uri
            if context not in self._parsed_job_work_uri:
                continue

            # switch context of input URI
            new_base_uri = '{}/_input-{}'.format(
                self._parsed_job_work_uri[context]['chopped_uri'],
                slugify(node['name'], regex_pattern=r'[^-a-z0-9_]+')
            )

            # create new base URI
            if not DataManager.mkdir(
                    uri=new_base_uri,
                    recursive=True,
                    **{context: self._context_options[context]}
            ):
                msg = 'cannot create new base uri for input: {}'.format(
                    new_base_uri)
                Log.an().error(msg)
                raise WorkflowDAGException(msg)

            # switch input URI base
            input_value = self._workflow['inputs'][node['name']]['value']
            switched_uri = URIParser.switch_context(input_value, new_base_uri)
            if not switched_uri:
                msg = (
                    'cannot switch input uri context to '
                    'new base URI: {}->{}'
                ).format(input_value, new_base_uri)
                Log.an().error(msg)
                raise WorkflowDAGException(msg)

            self._context_uris['inputs'][context][node['name']] = (
                switched_uri['chopped_uri'])
            self._parsed_context_uris['inputs'][context][node['name']] = (
                switched_uri)

    # init step URIs under the job work URI of each execution data context
    for context in exec_data_contexts:

        self._context_uris['steps'][context] = {}
        self._parsed_context_uris['steps'][context] = {}

        for node_name in self._topo_sort:
            node = self._graph.nodes[node_name]
            if node['type'] == 'step':
                step_uri = '{}/{}'.format(
                    self._parsed_job_work_uri[context]['chopped_uri'],
                    slugify(node['name'], regex_pattern=r'[^-a-z0-9_]+')
                )
                self._context_uris['steps'][context][node['name']] = step_uri
                # NOTE(review): the parse result is stored unchecked, as in
                # the original code
                self._parsed_context_uris['steps'][context][node['name']] = (
                    URIParser.parse(step_uri))

    # init final contexts for steps
    for node_name in self._topo_sort:
        node = self._graph.nodes[node_name]
        if node['type'] == 'step':
            final_uri = '{}/{}'.format(
                self._parsed_job_output_uri['chopped_uri'],
                slugify(node['name'], regex_pattern=r'[^-a-z0-9_]+')
            )
            self._context_uris['steps']['final'][node['name']] = final_uri
            self._parsed_context_uris['steps']['final'][node['name']] = (
                URIParser.parse(final_uri))