def prune_nodes(graph, nodes=None, node_selector=None):
    if not nodes and not node_selector:
        return graph

    components_before_prune = list(
        nx.algorithms.components.weakly_connected_components(graph))

    # Prune either the explicit node list or all nodes matching the selector
    nodes_to_prune = nodes if nodes else \
        [node for node in graph.nodes if node_selector(node)]

    for node in nodes_to_prune:
        _GraphUtil.prune_node(node, graph)

    components_after_prune = list(
        nx.algorithms.components.weakly_connected_components(graph))

    if len(components_before_prune) < len(components_after_prune):
        logger.warning(
            'Pruning of nodes %s from graph %s caused components to '
            'become disconnected! Components before prune: %s; after prune: %s',
            nodes_to_prune, graph, components_before_prune,
            components_after_prune)

    return graph
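# Usage sketch (illustrative only, not part of the module; the graph and
# node names are hypothetical): nodes can be pruned by an explicit list or
# by a selector callback.
#
#   graph = nx.DiGraph([('extract', 'transform'), ('transform', 'load')])
#   prune_nodes(graph, nodes=['transform'])
#   # or, via a selector:
#   prune_nodes(graph, node_selector=lambda node: node.startswith('transform'))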
def process_arg(self, arg, node, raw_args):
    try:
        regex = re.compile(self.pattern)
    except Exception:
        raise Exception(
            'Error compiling regex for `{}`: `{}` is an invalid pattern'.format(
                self.type, self.properties['pattern']))

    try:
        rendered_arg = self.render_template(arg, raw_args)
    except jinja2.exceptions.UndefinedError:
        logger.warning(
            'Could not render template `%s`; cannot verify that the argument '
            'matches the required pattern `%s`!', arg, regex.pattern)
        return arg

    if regex.match(rendered_arg):
        # Return the original arg, not the rendered arg, because we are not
        # actually transforming anything, just validating
        return arg

    # Verbatim blocks (`<<...>>`) are passed through unrendered, so an arg
    # containing one cannot be fully validated against the pattern
    verbatim_regex = re.compile(r'<<.+>>')
    if verbatim_regex.search(rendered_arg):
        logger.warning(
            'Argument rendered from `%s` contains verbatim blocks; cannot '
            'verify that it matches the required pattern `%s`!',
            rendered_arg, regex.pattern)
        return arg

    raise Exception(
        'Invalid argument `{}`: does not match expected pattern `{}`'.format(
            rendered_arg, regex.pattern))
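# Behavior sketch (hypothetical pattern and args): a fully-renderable arg
# that matches the configured pattern is returned unmodified, while an arg
# whose rendered form still contains a verbatim `<<...>>` block can only be
# warned about, since its final value is not known at build time.
#
#   # with self.pattern == r'gs://.+':
#   processor.process_arg('gs://{{ bucket }}/data', node, {'bucket': 'b'})
#   # -> 'gs://{{ bucket }}/data' (rendered value matched the pattern)
#   processor.process_arg('<<output_bucket>>/data', node, raw_args)
#   # -> warning logged; arg returned as-is because it cannot be verified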
def _build_graph(graph_name, nodes):
    graph = nx.DiGraph()
    logger.debug('Creating graph %s from nodes %s', graph_name, nodes.keys())

    graph.add_nodes_from(nodes.keys())
    graph.add_edges_from(
        (upstream_name, downstream_name)
        for (upstream_name, (downstream_node_names, node)) in six.iteritems(nodes)
        for downstream_name in downstream_node_names)

    if not nx.algorithms.components.is_weakly_connected(graph):
        components = list(
            nx.algorithms.components.weakly_connected_components(graph))
        logger.warning(
            'Multiple connected components found for graph `%s`: %s',
            graph_name, components)

    if not nx.algorithms.dag.is_directed_acyclic_graph(graph):
        raise exceptions.CyclicWorkflowException(
            'Invalid graph `{}`: not a DAG!'.format(graph_name))

    logger.debug('Successfully created graph %s with nodes %s',
                 graph_name, graph.nodes)
    return graph
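# Input shape sketch (hypothetical node names): `nodes` maps each node name
# to a tuple of (downstream node names, node object), so every edge points
# from upstream to downstream.
#
#   nodes = {
#       'extract': (['transform'], extract_node),
#       'transform': (['load'], transform_node),
#       'load': ([], load_node),
#   }
#   graph = _build_graph('my-dag', nodes)
#   # -> DiGraph with edges extract -> transform -> load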
def _flow_control_nodes(self, wf):
    result = {
        'start': ([wf['start']['to']], wf['start']),
        wf['end'].get('name', 'end'): ([], wf['end']),
    }

    if wf.get('kill'):
        result[wf['kill'].get('name', 'kill')] = ([], wf['kill'])

    for decision in wf['decision']:
        if not self.debug:
            raise Exception(
                'Decision node found: `{}`. Decision nodes are not supported '
                'right now! Set --debug to build without raising this error, '
                'but only at your own risk!'.format(decision['name']))

        logger.warning(
            'Decision node found: `%s`. Including all downstream branches '
            'because --debug was specified', decision['name'])

        cases = [case['to'] for case in decision['switch']['case']]
        default = decision['switch']['default']['to']
        result[decision['name']] = (cases + [default], decision)

    return result
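# Input shape sketch (abbreviated, hypothetical parsed-Oozie workflow dict):
# the method expects `start`, `end`, and `decision` keys, plus an optional
# `kill` entry.
#
#   wf = {
#       'start': {'to': 'first-action'},
#       'end': {'name': 'end'},
#       'kill': {'name': 'fail'},
#       'decision': [],
#   }
#   self._flow_control_nodes(wf)
#   # -> {'start': (['first-action'], {...}),
#   #     'end': ([], {...}),
#   #     'fail': ([], {...})}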
def resolve_properties(self,
                       execution_context,
                       default_task_args=None,
                       base_operator_loader=None,
                       preprocessor_loader=None):
    """
    Get the properties / arguments for the operator, and split them
    according to their source. Specifically, properties are provided to
    the operator by either the DAG config file, the resources available
    in the operator's context, any task defaults specified in the primary
    DAG, or the schema defaults, in that order of precedence. Once the
    properties are all resolved, this method validates all of the
    resolved arguments against the task's schema.

    :param execution_context: the context in which this node is executed,
        specifically containing the available resources and the node that
        referred to this node, if any
    :type execution_context: boundary_layer.containers.ExecutionContext
    :param default_task_args: the default task args defined in the DAG
    :type default_task_args: dict
    :param base_operator_loader: a method that retrieves typed operators,
        equivalent to a Registry.get method
    :type base_operator_loader: callable
    :param preprocessor_loader: a method that retrieves typed preprocessors,
        equivalent to a Registry.get method
    :type preprocessor_loader: callable
    :returns: a mapping of property source to property key/value pairs
    :rtype: dict<dict<string, any>>
    """
    schema = self.get_schema(base_operator_loader)
    schema_properties = frozenset(schema.get('properties', {}).keys())

    self.set_default_task_args(default_task_args)

    (sources, property_values) = self._get_property_sources_and_values(
        schema_properties, execution_context)

    validated = validator.validate_and_fill_defaults(
        item=property_values, schema=schema)

    # Any key present in the validated output but absent from the resolved
    # property values must have been filled in from a schema default
    for key in validated:
        if key in property_values:
            continue
        sources.schema.add(key)

    logger.debug('%s: validated partitioned properties: %s',
                 self.name, sources)

    preprocessors = self._load_preprocessors(
        base_operator_loader, preprocessor_loader)

    self._preprocessor_imports = {
        pp_name: pp.imports()
        for (pp_name, pp) in six.iteritems(preprocessors)
    }

    preprocessed_values = self._apply_preprocessors(
        args=validated, preprocessors=preprocessors)

    if self._resolved_properties:
        if preprocessed_values != self._resolved_properties.values:
            raise Exception(
                'resolve_properties() was already called for operator {}, '
                'and different values were computed this time! Found: {}, '
                'expected: {}. This was probably caused by repeated '
                'references to a sub-dag or generator using different '
                'resource contexts. This is not presently supported!'.format(
                    self, preprocessed_values,
                    self._resolved_properties.values))

        logger.warning(
            'resolve_properties() was already called for operator %s, '
            'but no differences in the computed properties were found.',
            self)

    self._resolved_properties = ResolvedProperties(
        sources=sources, values=preprocessed_values)

    return self._resolved_properties
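# Precedence sketch (hypothetical arguments): a key set in both the DAG
# config and default_task_args resolves to the config value; keys filled in
# only by the validator are attributed to the `schema` source.
#
#   resolved = node.resolve_properties(
#       execution_context=ExecutionContext(referrer=None, resources={}),
#       default_task_args={'retries': 2, 'owner': 'etl'},
#       base_operator_loader=plugins.manager.operators,
#       preprocessor_loader=plugins.manager.property_preprocessors)
#   # resolved.sources records which source supplied each key;
#   # resolved.values holds the final, preprocessed key/value pairs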
def validate_and_resolve_properties(spec):
    secondary_lookup = {dag['name']: dag for dag in spec.secondary}

    default_task_args = spec.primary.get('default_task_args', {})

    # Construct sets of all of the resources created and requested, so that
    # we can check for unused resources and unused default args
    all_resources_created = set()
    all_resources_requested = set()
    all_defaults_used = set()

    def validate_dag(dag, execution_context):
        nodes = Workflow.get_all_nodes(dag)

        dag_resources = {
            resource.name: resource for resource in dag['resources']
        }

        Workflow.ensure_no_duplicate_names(
            dag['name'],
            [node.name for node in nodes] + list(dag_resources),
            list(execution_context.resources))

        for resource in dag_resources.values():
            (create_properties, destroy_properties) = resource.resolve_properties(
                execution_context=execution_context,
                default_task_args=default_task_args,
                base_operator_loader=plugins.manager.operators,
                preprocessor_loader=plugins.manager.property_preprocessors,
            )

            all_defaults_used.update(
                set(create_properties.sources.default_task_args))
            all_defaults_used.update(
                set(destroy_properties.sources.default_task_args))

        all_resources_created.update(set(dag_resources))
        all_resources_requested.update(
            set(resource
                for node in nodes
                for resource in node.requires_resources))

        resources_available = execution_context.resources.copy()
        resources_available.update(dag_resources)

        missing_resources = {
            name: missing
            for (name, missing) in [
                (node.name,
                 frozenset(node.requires_resources) -
                 frozenset(resources_available))
                for node in nodes]
            if missing
        }

        if missing_resources:
            raise ValueError(
                'Error in dag {}: Operators require resources outside '
                'their local contexts: {}'.format(
                    dag['name'], missing_resources))

        for node in nodes:
            properties = node.resolve_properties(
                execution_context=execution_context._replace(
                    resources=resources_available),
                default_task_args=default_task_args,
                base_operator_loader=plugins.manager.operators,
                preprocessor_loader=plugins.manager.property_preprocessors,
            )

            all_defaults_used.update(
                set(properties.sources.default_task_args))

        # Recurse into every sub-dag and generator target, passing down only
        # the resources that the referrer explicitly requires
        all_referrers = dag['sub_dags'] + dag['generators']
        for referrer in all_referrers:
            subdag = secondary_lookup[referrer.target]

            subdag_resources_available = {
                name: resource
                for (name, resource) in six.iteritems(resources_available)
                if name in frozenset(referrer.requires_resources)
            }

            subdag_ctx = ExecutionContext(
                referrer=referrer,
                resources=subdag_resources_available)

            validate_dag(subdag, subdag_ctx)

    validate_dag(spec.primary, ExecutionContext(referrer=None, resources={}))

    unused_resources = all_resources_created - all_resources_requested
    if unused_resources:
        raise ValueError('Unused resources `{}`'.format(
            '`, `'.join(unused_resources)))

    unused_defaults = frozenset(default_task_args) - frozenset(all_defaults_used)
    if unused_defaults:
        logger.warning('Unused default task args: `%s`',
                       '`, `'.join(unused_defaults))
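# Traversal sketch (hypothetical spec shape): validation starts at the
# primary DAG with an empty resource context, then recurses into each
# sub-dag or generator target. A spec like the one below would validate
# `main` first and then `cleanup`, with `cleanup` seeing only the resources
# its referrer requires.
#
#   spec.primary    # {'name': 'main', 'sub_dags': [<referrer to 'cleanup'>], ...}
#   spec.secondary  # [{'name': 'cleanup', ...}]
#   validate_and_resolve_properties(spec)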