Esempio n. 1
0
    def assign_metrics(self, pipeline_metrics: dict):
        """Assign pipeline metrics to specific pipeline steps.

        This assignment follows a similar logic to the detection of `out`
        dependencies. Starting from a temporary step - child of all the leaf
        nodes - all the nodes in the pipeline are traversed in the order
        returned by `graphutils.get_ordered_ancestors`. When a step shows one
        of the metrics as part of its code, that metric is assigned to the
        step: the code producing the metrics artifact is appended to the
        step's source and the step's `metrics` flag is set.

        A metric is assigned to at most one step (the first one found during
        the traversal). Steps that do not get any metric are left untouched.
        If the pipeline has no leaf steps this is a no-op.

        Args:
            pipeline_metrics (dict): a dict of pipeline metrics where the key
                is always the KFP sanitized name and the value the name of the
                original variable.
        """
        # create a temporary step at the end of the pipeline to simplify the
        # iteration from the leaf steps
        tmp_step_name = "_tmp"
        leaf_steps = self.pipeline.get_leaf_steps()
        if not leaf_steps:
            return
        for step in leaf_steps:
            self.pipeline.add_edge(step.name, tmp_step_name)

        try:
            # pipeline_metrics is a dict having sanitized variable names as
            # keys and the corresponding variable names as values. Here we
            # need to refer to the sanitized names using the python variables.
            # XXX: We could change parse_metrics_print_statements() to return
            # XXX: the reverse dictionary, but that would require changing
            # XXX: either rpc.nb.get_pipeline_metrics() or change in the
            # XXX: JupyterLab Extension parsing of the RPC result
            rev_pipeline_metrics = {v: k for k, v in pipeline_metrics.items()}
            metrics_left = set(rev_pipeline_metrics)
            for anc in graphutils.get_ordered_ancestors(self.pipeline,
                                                        tmp_step_name):
                if not metrics_left:
                    break

                anc_step = self.pipeline.get_step(anc)
                anc_source = '\n'.join(anc_step.source)
                # get all the marshal candidates from father's source and
                # intersect with the metrics that have not been matched yet
                marshal_candidates = astutils.get_marshal_candidates(
                    anc_source)
                assigned_metrics = metrics_left.intersection(
                    marshal_candidates)
                if not assigned_metrics:
                    # this step does not produce any of the missing metrics:
                    # leave both its source and its `metrics` flag untouched
                    continue
                # Remove the metrics that have already been assigned.
                metrics_left.difference_update(assigned_metrics)
                # Generate code to produce the metrics artifact in the
                # current step. Sorting keeps the generated code stable.
                code = METRICS_TEMPLATE % ("    " + ",\n    ".join([
                    '"%s": %s' % (rev_pipeline_metrics[x], x)
                    for x in sorted(assigned_metrics)
                ]))
                anc_step.source.append(code)
                # need to have a `metrics` flag set to true in order to set
                # the metrics output artifact in the pipeline template. Only
                # steps that actually got the metrics code must be flagged.
                anc_step.metrics = True
        finally:
            # always drop the temporary step, even if the traversal failed,
            # so that the pipeline graph is never left with a spurious node
            self.pipeline.remove_node(tmp_step_name)
Esempio n. 2
0
    def dependencies_detection(self, imports_and_functions: str = ""):
        """Detect the data dependencies between nodes in the graph.

        The data dependencies detection algorithm roughly works as follows:

        1. Iterating over `self.pipeline.steps`, for every node `step` do
        2. Detect the `ins` of current `step` by running PyFlakes on the source
         code. During this action the pipeline parameters are taken into
         consideration
        3. Parse `step`'s global function definitions to get free variables
         (i.e. variables that would need to be marshalled in other steps that
         call these functions) - in this action pipeline parameters are taken
         into consideration.
        4. Get all the function that `step` calls
        5. For every `step`'s ancestor `anc` do
            - Get all the potential names (objects, functions, ...) of `anc`
             that could be marshalled (saved)
            - Intersect this with the `step`'s `ins` (from action 2) and add
             the result to `anc`'s `outs`.
            - for every `step`'s function call (action 4), check if this
             function was defined in `anc` and if it has free variables
             (action 3). If so, add to `step`'s `ins` and to `anc`'s `outs`
             these free variables.

        Args:
            imports_and_functions: Multiline Python source that is prepended to
                every pipeline step

        Returns:
            None. The steps are annotated in place: every step gets its `ins`,
            `parameters` and `fns_free_variables` attributes set, and the
            `outs` of its ancestors are extended.
        """
        # resolve the data dependencies between steps, looping through the
        # graph
        for step in self.pipeline.steps:
            # detect the INS dependencies of the CURRENT node------------------
            step_source = '\n'.join(step.source)
            # get the variables that this step is missing and the pipeline
            # parameters that it actually needs.
            ins, parameters = self._detect_in_dependencies(
                source_code=step_source,
                pipeline_parameters=self.pipeline.pipeline_parameters)
            fns_free_variables = self._detect_fns_free_variables(
                step_source, imports_and_functions,
                self.pipeline.pipeline_parameters)

            # Get all the function calls. This will be used below to check if
            # any of the ancestors declare any of these functions. If that is
            # so, the free variables of those functions will have to be loaded.
            fn_calls = astutils.get_function_calls(step_source)

            # add OUT dependencies annotations in the PARENT nodes-------------
            # Intersect the missing names of this father's child with all
            # the father's names. The intersection is the list of variables
            # that the father need to serialize
            # The ancestors are the nodes that have a path to `step`,
            # ordered by path length.
            ins_left = ins.copy()
            for anc in (graphutils.get_ordered_ancestors(
                    self.pipeline, step.name)):
                if not ins_left:
                    # if there are no more variables that need to be
                    # marshalled, stop the graph traverse
                    break
                anc_step = self.pipeline.get_step(anc)
                anc_source = '\n'.join(anc_step.source)
                # get all the marshal candidates from father's source and
                # intersect with the required names of the current node
                marshal_candidates = astutils.get_marshal_candidates(
                    anc_source)
                outs = ins_left.intersection(marshal_candidates)
                # Remove the ins that have already been assigned to an ancestor
                ins_left.difference_update(outs)
                # Include free variables
                to_remove = set()
                for fn_call in fn_calls:
                    anc_fns_free_vars = anc_step.fns_free_variables
                    if fn_call in anc_fns_free_vars:
                        # the current step needs to load these variables
                        fn_free_vars, used_params = anc_fns_free_vars[fn_call]
                        # search if this function calls other functions (i.e.
                        # if its free variables are found in the free variables
                        # dict) and collect their free variables as well.
                        # NOTE: `fn_free_vars` is the ancestor's own set, so
                        # the transitive closure is stored back in place.
                        _left = list(fn_free_vars)
                        # names already expanded: without this, functions that
                        # reference each other (f -> g -> f) would be enqueued
                        # over and over and the loop would never terminate
                        _seen = set()
                        while _left:
                            _cur = _left.pop()
                            if _cur in _seen:
                                continue
                            _seen.add(_cur)
                            # if the free var is itself a fn with free vars
                            if _cur in anc_fns_free_vars:
                                _nested = anc_fns_free_vars[_cur][0]
                                fn_free_vars.update(_nested)
                                _left.extend(_nested)
                        ins.update(fn_free_vars)
                        # the current ancestor needs to save these variables
                        outs.update(fn_free_vars)
                        # add the parameters used by the function to the list
                        # of pipeline parameters used by the step
                        _pps = self.pipeline.pipeline_parameters
                        for param in used_params:
                            parameters[param] = _pps[param]

                        # Remove this function as it has been served. We don't
                        # want other ancestors to save free variables for this
                        # function. Using the helper to_remove because the set
                        # can not be resized during iteration.
                        to_remove.add(fn_call)
                        # add the function and its free variables to the
                        # current step as well. This is useful in case
                        # *another* function will call this one (`fn_call`) in
                        # a child step. In this way we can track the calls up
                        # to the last free variable. (refer to test
                        # `test_dependencies_detection_recursive`)
                        fns_free_variables[fn_call] = anc_fns_free_vars[
                            fn_call]
                fn_calls.difference_update(to_remove)
                # Add to ancestor the new outs annotations. First merge the
                # current outs present in the anc with the new ones
                anc_step.outs.update(outs)

            step.ins = sorted(ins)
            step.parameters = parameters
            step.fns_free_variables = fns_free_variables
Esempio n. 3
0
def test_get_marshal_candidates_exc():
    """Check that a malformed code snippet raises a SyntaxError."""
    extract_candidates = kale_ast.get_marshal_candidates
    # the snippet is not valid Python, so parsing it is expected to fail
    with pytest.raises(SyntaxError):
        extract_candidates(_wrong_code_snippet)
Esempio n. 4
0
def test_get_marshal_candidates(code, target):
    """Check the marshal candidates detected in `code` against `target`.

    The comparison ignores ordering: both collections are sorted first.
    """
    found = sorted(kale_ast.get_marshal_candidates(code))
    expected = sorted(target)
    assert found == expected