Example #1
0
    def _prune_unused_resources(dag):
        resources_defined = dag.get('resources')
        if not resources_defined:
            return dag

        resources_required = frozenset(
            resource
            for operator_section in ['before', 'operators', 'after', 'subdags', 'generators']
            for operator in dag.get(operator_section, [])
            for resource in operator.get('requires_resources', []))

        keep_resources = []

        for resource in resources_defined:
            if resource['name'] not in resources_required:
                logger.info(
                    'Discarding unused resource %s from dag %s',
                    resource['name'],
                    dag['name'])
                continue

            keep_resources.append(resource)

        result = dag.copy()
        result['resources'] = keep_resources
        return result
    def __init__(self, load_package_plugins=True, plugins=None):
        """ Assemble the list of plugins held by this instance.

            :param load_package_plugins: when truthy, whatever
                ``self._load_package_plugins()`` returns is appended after the
                explicitly supplied plugins
            :type load_package_plugins: bool
            :param plugins: plugins supplied by the caller; None or an empty
                sequence means none.  Each plugin must expose a ``name``.
            :type plugins: iterable or None
        """
        # Copy, so the caller's sequence is never modified by the += below
        supplied_plugins = plugins or []
        self._plugins = list(supplied_plugins)

        if load_package_plugins:
            self._plugins += self._load_package_plugins()

        plugin_names = ', '.join(plugin.name for plugin in self._plugins)
        logger.info('Loaded plugins %s', plugin_names)
    def cluster_config(self):
        """ Return the cluster configuration of the first plugin container
            that provides one.

            Each container in ``self._plugin_containers`` is asked for its
            plugin's cluster configuration exactly once, in list order, and
            falsy answers are ignored.  The first remaining configuration is
            returned, so the result depends on the order of
            ``self._plugin_containers`` (presumably sorted by priority --
            confirm where that list is built).

            :returns: the first truthy value returned by a plugin's
                ``cluster_config()``
            :raises Exception: if no plugin provides a cluster configuration
        """
        # Keep each configuration next to its container so that the value
        # returned is the very one that was checked, without a second call.
        candidates = []
        for container in self._plugin_containers:
            config = container.plugin.cluster_config()
            if config:
                candidates.append((container, config))

        if not candidates:
            raise Exception(
                'No cluster configurations found for oozie parser!')

        chosen_container, chosen_config = candidates[0]

        if len(candidates) > 1:
            logger.info(
                'Multiple cluster configurations found.  Choosing configuration '
                'from plugin `%s`, with priority `%s`',
                chosen_container.name,
                chosen_container.priority)

        return chosen_config
Example #4
0
    def _unprune_referenced_sub_workflows(self, keep_paths, prune_paths):
        """ This method is only called when --only-nodes was specified.  It
            enables a user to specify a sub-workflow referencing node as one of
            the --only-nodes arguments.  If such a node is referenced, then the
            entire sub-workflow that it targets will be added to the keep_paths
            list, and furthermore, any referring nodes nested in that sub-workflow
            will also be added to the keep_paths recursively.

            :param keep_paths: the keep paths computed after partitioning the workflow
            :type keep_paths: list<list<(string, string)>>
            :param prune_paths: the prune paths computed after partitioning the workflow
            :type prune_paths: list<list<(string, string)>>

            :returns: an updated pair of (keep_paths, prune_paths)
            :rtype: (list<list<(string, string)>>, list<list<(string, string)>>)
        """

        keep_nodes = frozenset([path[-1] for path in keep_paths])

        shift_path_indexes = frozenset(idx for (idx,
                                                path) in enumerate(prune_paths)
                                       if any(node in keep_nodes
                                              for node in path))

        if not shift_path_indexes:
            return (keep_paths, prune_paths)

        for idx in shift_path_indexes:
            node = prune_paths[idx][-1]
            logger.info(
                "Keeping node %s.%s because it is downstream of an --only-nodes argument",
                node[0], node[1])

        return self._unprune_referenced_sub_workflows(
            keep_paths + [prune_paths[i] for i in shift_path_indexes], [
                path for (i, path) in enumerate(prune_paths)
                if i not in shift_path_indexes
            ])
Example #5
0
    def _prune_paths(self, prune_nodes, keep_nodes,
                     allow_augmented_keep_nodes):
        """ Prune the requested nodes out of the primary and secondary
            workflows, and return the workflows that remain.

            The work is done in the following order:

            1. Only if allow_augmented_keep_nodes is True: nodes from the prune
               pile that are needed on paths to nodes in the keep pile get a
               chance to be rescued from pruning.
            2. Nodes that refer to sub-workflows lying entirely in the prune
               pile are added to the prune pile.
            3. Workflows that become inaccessible once their referring nodes
               are removed are identified, so that they can be discarded.
            4. The planned nodes are pruned out of the primary workflow and out
               of every secondary workflow that is still accessible.
            5. Secondary workflows left empty by the pruning are discarded.

            Steps 1 and 2 are delegated to helper methods whose return values
            are not used here, so those helpers are expected to update the
            sets handed to them in place.

            :param prune_nodes: paths to all of the nodes we plan to prune
            :type prune_nodes: list<list<(string, string)>>
            :param keep_nodes: paths to all of the nodes we plan to keep
            :type keep_nodes: list<list<(string, string)>>
            :param allow_augmented_keep_nodes: Whether to augment the list of
                keep_nodes by adding in any nodes required for access to nodes
                in the keep_nodes list.
            :type allow_augmented_keep_nodes: bool

            :returns: a pair of (pruned primary workflow, list of the pruned
                secondary workflows that are accessible and non-empty)
            :rtype: tuple
            :raises Exception: if pruning leaves the primary workflow empty
        """

        # Only the last element of each path names the node itself; sets are
        # used because the helpers below adjust these collections
        doomed_nodes = {path[-1] for path in prune_nodes}
        retained_nodes = {path[-1] for path in keep_nodes}

        # Step 1: rescue nodes from the prune set if necessary
        if allow_augmented_keep_nodes:
            self._augment_keep_nodes_list(keep_nodes, doomed_nodes,
                                          retained_nodes)

        # Step 2: account for referrers of fully-deleted sub-workflows
        self._augment_pruned_sub_workflow_referrers(doomed_nodes,
                                                    retained_nodes)

        # Step 3: find sub-workflows that nothing refers to any more
        unreachable_workflows = self._find_inaccessible_workflows(
            doomed_nodes)

        # Step 4: prune the graphs, starting with the primary workflow
        logger.debug('going to prune away nodes: %s', doomed_nodes)

        graphs_by_name = self._build_keyed_graph_map()

        # The primary workflow's graph is stored under the key None
        pruned_primary = self._prune_workflow(
            workflow=self.primary,
            is_primary=True,
            graph=graphs_by_name[None],
            prune_nodes=doomed_nodes)

        if self._workflow_is_empty(pruned_primary):
            message = 'Pruning operation produced an empty primary workflow: {}'
            raise Exception(message.format(pruned_primary))

        pruned_secondary = []
        for sub_workflow in self.secondary:
            sub_workflow_name = sub_workflow['name']

            if sub_workflow_name in unreachable_workflows:
                logger.debug('Skipping inaccessible workflow %s',
                             sub_workflow_name)
                continue

            pruned_sub_workflow = self._prune_workflow(
                workflow=sub_workflow,
                is_primary=False,
                graph=graphs_by_name[sub_workflow_name],
                prune_nodes=doomed_nodes)

            # Step 5: only workflows that still contain something survive
            if not self._workflow_is_empty(pruned_sub_workflow):
                pruned_secondary.append(pruned_sub_workflow)
                continue

            logger.info(
                'Pruning operation produced an empty sub-workflow: %s',
                pruned_sub_workflow['name'])

        logger.debug('pruned primary workflow: %s', pruned_primary)
        logger.debug('pruned secondary workflows: %s', pruned_secondary)

        return (pruned_primary, pruned_secondary)