Example no. 1
    def __init__(self,
                 nb_path: str,
                 nb_metadata_overrides: Dict[str, Any] = None,
                 skip_validation: bool = False):
        """Instantiate a new NotebookProcessor.

        Args:
            nb_path: Path to source notebook
            nb_metadata_overrides: Override notebook config settings
            skip_validation: Set to True to skip validating the notebook's
                metadata. This is useful when the NotebookProcessor is used to
                parse only part of the notebook (e.g., to retrieve pipeline
                metrics) and the notebook config (for pipeline generation)
                might still be invalid.
        """
        self.nb_path = os.path.expanduser(nb_path)
        self.notebook = self._read_notebook()

        nb_metadata = self.notebook.metadata.get(KALE_NB_METADATA_KEY, dict())

        # fixme: needed?
        nb_metadata.update({"notebook_path": nb_path})
        if nb_metadata_overrides:
            nb_metadata.update(nb_metadata_overrides)
        # validate and populate defaults
        # FIXME: Maybe improve this by implementing a "skip_validation" flag
        #  in the config class
        self.config = None
        if not skip_validation:
            self.config = NotebookConfig(**nb_metadata)
        self.pipeline = Pipeline(self.config)
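
A minimal usage sketch of this constructor (hedged: the notebook path and the
metadata override below are hypothetical, and the module-level imports of the
file this snippet comes from are assumed):

# Parse only part of the notebook (e.g. its metrics) while the Kale config
# may still be incomplete, so metadata validation is skipped.
processor = NotebookProcessor("~/notebooks/train.ipynb", skip_validation=True)
metrics_source = processor.get_pipeline_metrics_source()

# Full conversion requires a valid config; overrides are merged on top of the
# notebook metadata before validation.
processor = NotebookProcessor(
    "~/notebooks/train.ipynb",
    nb_metadata_overrides={"experiment_name": "demo"})
pipeline = processor.to_pipeline()
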
Example no. 2
def test_get_volumes_parameters_exc(dummy_nb_config, volumes):
    """Tests that volumes are correctly converted from list into dict."""
    with pytest.raises(ValueError,
                       match="VolumeTypeValidator:"
                       " Value unknown is not allowed"):
        pipeline = Pipeline(NotebookConfig(**dummy_nb_config, volumes=volumes))
        pipeline.set_volume_pipeline_parameters()
Example no. 3
    def __init__(self,
                 config: PipelineConfig = None,
                 skip_validation: bool = False,
                 **kwargs):
        self.config = config
        if not config and not skip_validation:
            self.config = self.config_cls(**kwargs)
        self.pipeline = Pipeline(self.config)
Example no. 4
def assign_metrics(pipeline: Pipeline, pipeline_metrics: dict):
    """Assign pipeline metrics to specific pipeline steps.

    This assignment follows a logic similar to the detection of `out`
    dependencies. Starting from a temporary step - child of all the leaf nodes,
    all the nodes in the pipeline are traversed in reverse topological order.
    When a step's code contains one of the metrics, that metric is assigned
    to the step.

    Args:
        pipeline: Pipeline object
        pipeline_metrics (dict): a dict of pipeline metrics where the key is
            the KFP sanitized name and the value is the name of the original
            variable.
    """
    # create a temporary step at the end of the pipeline to simplify the
    # iteration from the leaf steps
    tmp_step_name = "_tmp"
    leaf_steps = pipeline.get_leaf_steps()
    if not leaf_steps:
        return
    for step in leaf_steps:
        pipeline.add_edge(step.name, tmp_step_name)

    # pipeline_metrics is a dict having sanitized variable names as keys and
    # the corresponding variable names as values. Here we need to refer to
    # the sanitized names using the python variables.
    # XXX: We could change parse_metrics_print_statements() to return the
    # XXX: reverse dictionary, but that would require changing either
    # XXX: rpc.nb.get_pipeline_metrics() or change in the JupyterLab Extension
    # XXX: parsing of the RPC result
    rev_pipeline_metrics = {v: k for k, v in pipeline_metrics.items()}
    metrics_left = set(rev_pipeline_metrics.keys())
    for anc in graphutils.get_ordered_ancestors(pipeline, tmp_step_name):
        if not metrics_left:
            break

        anc_step = pipeline.get_step(anc)
        anc_source = '\n'.join(anc_step.source)
        # get all the marshal candidates from father's source and intersect
        # with the metrics that have not been matched yet
        marshal_candidates = kale_ast.get_marshal_candidates(anc_source)
        assigned_metrics = metrics_left.intersection(marshal_candidates)
        # Remove the metrics that have already been assigned.
        metrics_left.difference_update(assigned_metrics)
        # Generate code to produce the metrics artifact in the current step
        if assigned_metrics:
            code = METRICS_TEMPLATE % ("    " + ",\n    ".join([
                '"%s": %s' % (rev_pipeline_metrics[x], x)
                for x in sorted(assigned_metrics)
            ]))
            anc_step.source.append(code)
        # need to have a `metrics` flag set to true in order to set the
        # metrics output artifact in the pipeline template
        anc_step.metrics = True

    pipeline.remove_node(tmp_step_name)
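
A small, self-contained sketch of the matching logic above (the metric names
and the marshal candidates are made up for illustration):

# sanitized KFP name -> original variable name, as produced upstream
pipeline_metrics = {"accuracy-score": "accuracy_score", "loss": "loss"}
rev_pipeline_metrics = {v: k for k, v in pipeline_metrics.items()}
metrics_left = set(rev_pipeline_metrics)

# suppose an ancestor step's source exposes these marshalable names
marshal_candidates = {"model", "accuracy_score"}
assigned_metrics = metrics_left.intersection(marshal_candidates)
metrics_left.difference_update(assigned_metrics)

assert assigned_metrics == {"accuracy_score"}
assert metrics_left == {"loss"}
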
Example no. 5
def test_generate_function(_nb_config_mock, notebook_processor, step_name,
                           source, ins, outs, metadata, target):
    """Test that python code is generated correctly."""
    pipeline = Pipeline(NotebookConfig(**{**DUMMY_NB_CONFIG, **metadata}))
    pipeline.processor = notebook_processor
    step = Step(name=step_name, source=source, ins=ins, outs=outs)
    compiler = Compiler(pipeline)
    res = compiler.generate_lightweight_component(step)
    with open(os.path.join(THIS_DIR, "../assets/functions", target)) as f:
        target = f.read()
    assert res.strip() == target.strip()
Example no. 6
def test_set_volume_pipeline_parameters(notebook_processor, dummy_nb_config,
                                        volumes, target):
    """Tests that volumes are correctly converted from list into dict."""
    notebook_processor.pipeline = Pipeline(
        NotebookConfig(**dummy_nb_config, volumes=volumes))
    notebook_processor._set_volume_pipeline_parameters()
    assert target == notebook_processor.pipeline.pipeline_parameters
Example no. 7
def test_merge_code(dummy_nb_config):
    """Test the merge code functionality."""
    pipeline = Pipeline(NotebookConfig(**dummy_nb_config))
    pipeline.add_step(Step(name="test", source=["test1"]))
    pipeline.get_step("test").merge_code("test2")

    assert pipeline.get_step("test").source == ["test1", "test2"]
Example no. 8
def test_dependencies_detection_free_variable(notebook_processor,
                                              dummy_nb_config):
    """Test dependencies detection with free variables."""
    pipeline = Pipeline(dummy_nb_config)

    _source = ['''
x = 5
''']
    pipeline.add_step(Step(name="step1", source=_source))

    _source = ['''
def foo():
   print(x)
''']
    pipeline.add_step(Step(name="step2", source=_source))

    _source = ['''
foo()
''']
    pipeline.add_step(Step(name="step3", source=_source))

    pipeline.add_edge("step1", "step2")
    pipeline.add_edge("step2", "step3")

    notebook_processor.pipeline = pipeline
    notebook_processor.dependencies_detection()
    assert sorted(pipeline.get_step("step1").ins) == []
    assert sorted(pipeline.get_step("step1").outs) == ["x"]
    assert sorted(pipeline.get_step("step2").ins) == ["x"]
    assert sorted(pipeline.get_step("step2").outs) == ["foo", "x"]
    assert sorted(pipeline.get_step("step3").ins) == ["foo", "x"]
    assert sorted(pipeline.get_step("step3").outs) == []
Example no. 9
def test_deps_detection_recursive_different_steps_branch(notebook_processor,
                                                         dummy_nb_config):
    """Test dependencies when fns are passed from multiple branches."""
    pipeline = Pipeline(dummy_nb_config)

    _source = ['''
x = 5
y = 6
''']
    pipeline.add_step(Step(name="step0", source=_source))
    _source = ['''
def foo():
    print(x)
''']
    pipeline.add_step(Step(name="step_l", source=_source))
    _source = ['''
def bar():
    print(y)
''']
    pipeline.add_step(Step(name="step_r", source=_source))
    _source = ['''
def result():
    foo()
    bar()
''']
    pipeline.add_step(Step(name="step_m", source=_source))
    _source = ["result()"]
    pipeline.add_step(Step(name="step_f", source=_source))

    pipeline.add_edge("step0", "step_l")
    pipeline.add_edge("step0", "step_r")
    pipeline.add_edge("step_l", "step_m")
    pipeline.add_edge("step_r", "step_m")
    pipeline.add_edge("step_m", "step_f")

    notebook_processor.pipeline = pipeline
    notebook_processor.dependencies_detection()
    assert sorted(pipeline.get_step("step0").ins) == []
    assert sorted(pipeline.get_step("step0").outs) == ['x', 'y']
    assert sorted(pipeline.get_step("step_l").ins) == ['x']
    assert sorted(pipeline.get_step("step_l").outs) == ['foo', 'x']
    assert sorted(pipeline.get_step("step_r").ins) == ['y']
    assert sorted(pipeline.get_step("step_r").outs) == ['bar', 'y']
    assert sorted(pipeline.get_step("step_m").ins) == ['bar', 'foo', 'x', 'y']
    assert (sorted(pipeline.get_step("step_m").outs)
            == ['bar', 'foo', 'result', 'x', 'y'])
    assert (sorted(pipeline.get_step("step_f").ins)
            == ['bar', 'foo', 'result', 'x', 'y'])
    assert sorted(pipeline.get_step("step_f").outs) == []
Example no. 10
def test_deps_detection_recursive_different_steps_long(notebook_processor,
                                                       dummy_nb_config):
    """Test dependencies are detected even with a long chain of fns calls."""
    pipeline = Pipeline(dummy_nb_config)

    _source = ['''
x = 5
def init():
    print(x)
''']
    pipeline.add_step(Step(name="step0", source=_source))
    _source = ['''
def foo():
    init()
''']
    pipeline.add_step(Step(name="step1", source=_source))
    _source = ['''
def bar():
    foo()
''']
    pipeline.add_step(Step(name="step2", source=_source))
    _source = ["bar()"]
    pipeline.add_step(Step(name="step3", source=_source))

    pipeline.add_edge("step0", "step1")
    pipeline.add_edge("step1", "step2")
    pipeline.add_edge("step2", "step3")

    notebook_processor.pipeline = pipeline
    notebook_processor.dependencies_detection()
    assert sorted(pipeline.get_step("step0").ins) == []
    assert sorted(pipeline.get_step("step0").outs) == ['init', 'x']
    assert sorted(pipeline.get_step("step1").ins) == ['init', 'x']
    assert sorted(pipeline.get_step("step1").outs) == ['foo', 'init', 'x']
    assert sorted(pipeline.get_step("step2").ins) == ['foo', 'init', 'x']
    assert (sorted(pipeline.get_step("step2").outs)
            == ['bar', 'foo', 'init', 'x'])
    assert (sorted(pipeline.get_step("step3").ins)
            == ['bar', 'foo', 'init', 'x'])
    assert sorted(pipeline.get_step("step3").outs) == []
Example no. 11
def test_dependencies_detection_with_try_except(notebook_processor,
                                                dummy_nb_config):
    """Test dependencies are detected with functions inside try."""
    pipeline = Pipeline(dummy_nb_config)

    _source = ['''
x = 5
y = 6
''']
    pipeline.add_step(Step(name="step1", source=_source))
    _source = ['''
try:
    def foo():
        print(x)
    def bar():
        print(y)
except:
    pass
''']
    pipeline.add_step(Step(name="step2", source=_source))
    _source = ['''
foo()
bar()
''']
    pipeline.add_step(Step(name="step3", source=_source))

    pipeline.add_edge("step1", "step2")
    pipeline.add_edge("step2", "step3")

    notebook_processor.pipeline = pipeline
    notebook_processor.dependencies_detection()
    assert sorted(pipeline.get_step("step1").ins) == []
    assert sorted(pipeline.get_step("step1").outs) == ['x', 'y']
    assert sorted(pipeline.get_step("step2").ins) == ['x', 'y']
    assert sorted(pipeline.get_step("step2").outs) == ['bar', 'foo', 'x', 'y']
    assert sorted(pipeline.get_step("step3").ins) == ['bar', 'foo', 'x', 'y']
    assert sorted(pipeline.get_step("step3").outs) == []
Example no. 12
def test_dependencies_detection_with_pipeline_parameters(notebook_processor,
                                                         dummy_nb_config):
    """Test dependencies are detected with pipeline parameters and globals."""
    imports_and_functions = "import math"

    pipeline = Pipeline(dummy_nb_config)
    pipeline.pipeline_parameters = {"y": (5, 'int')}

    _source = ["x = 5"]
    pipeline.add_step(Step(name="step1",
                           source=_prepend_to_source(_source,
                                                     imports_and_functions)))
    _source = ['''
def foo(x):
    def bar():
        math.sqrt(x + y)
    bar()
''']
    pipeline.add_step(Step(name="step2",
                           source=_prepend_to_source(_source,
                                                     imports_and_functions)))
    _source = ["foo(5)"]
    pipeline.add_step(Step(name="step3",
                           source=_prepend_to_source(_source,
                                                     imports_and_functions)))

    pipeline.add_edge("step1", "step2")
    pipeline.add_edge("step2", "step3")

    notebook_processor.pipeline = pipeline
    notebook_processor.dependencies_detection(imports_and_functions)
    assert sorted(pipeline.get_step("step1").ins) == []
    assert sorted(pipeline.get_step("step1").outs) == []
    assert sorted(pipeline.get_step("step2").ins) == []
    assert sorted(pipeline.get_step("step2").outs) == ['foo']
    assert pipeline.get_step("step2").parameters == {"y": (5, 'int')}
    assert sorted(pipeline.get_step("step3").ins) == ['foo']
    assert sorted(pipeline.get_step("step3").outs) == []
    assert pipeline.get_step("step3").parameters == {"y": (5, 'int')}
Example no. 13
def test_dependencies_detection_with_globals(notebook_processor,
                                             dummy_nb_config):
    """Test dependencies detection with inner function and globals."""
    imports_and_functions = "import math"

    pipeline = Pipeline(dummy_nb_config)

    _source = ["x = 5"]
    pipeline.add_step(Step(name="step1",
                           source=_prepend_to_source(_source,
                                                     imports_and_functions)))
    _source = ['''
def foo(x):
    def bar():
        math.sqrt(x)
    bar()
''']
    pipeline.add_step(Step(name="step2",
                           source=_prepend_to_source(_source,
                                                     imports_and_functions)))
    _source = ["foo(5)"]
    pipeline.add_step(Step(name="step3",
                           source=_prepend_to_source(_source,
                                                     imports_and_functions)))

    pipeline.add_edge("step1", "step2")
    pipeline.add_edge("step2", "step3")

    notebook_processor.pipeline = pipeline
    notebook_processor.dependencies_detection(imports_and_functions)
    assert sorted(pipeline.get_step("step1").ins) == []
    assert sorted(pipeline.get_step("step1").outs) == []
    assert sorted(pipeline.get_step("step2").ins) == []
    assert sorted(pipeline.get_step("step2").outs) == ['foo']
    assert sorted(pipeline.get_step("step3").ins) == ['foo']
    assert sorted(pipeline.get_step("step3").outs) == []
Example no. 14
def test_dependencies_detection_with_parameter(notebook_processor,
                                               dummy_nb_config):
    """Test dependencies detection with function with parameter."""
    pipeline = Pipeline(dummy_nb_config)

    _source = ["x = 5"]
    pipeline.add_step(Step(name="step1", source=_source))
    _source = ['''
def foo(x):
    def bar():
        print(x)
''']
    pipeline.add_step(Step(name="step2", source=_source))
    _source = ["foo(5)"]
    pipeline.add_step(Step(name="step3", source=_source))

    pipeline.add_edge("step1", "step2")
    pipeline.add_edge("step2", "step3")

    notebook_processor.pipeline = pipeline
    notebook_processor.dependencies_detection()
    assert sorted(pipeline.get_step("step1").ins) == []
    assert sorted(pipeline.get_step("step1").outs) == []
    assert sorted(pipeline.get_step("step2").ins) == []
    assert sorted(pipeline.get_step("step2").outs) == ['foo']
    assert sorted(pipeline.get_step("step3").ins) == ['foo']
    assert sorted(pipeline.get_step("step3").outs) == []
Example no. 15
class NotebookProcessor:
    """Convert a Notebook to a Pipeline object."""
    def __init__(self,
                 nb_path: str,
                 nb_metadata_overrides: Dict[str, Any] = None,
                 skip_validation: bool = False):
        """Instantiate a new NotebookProcessor.

        Args:
            nb_path: Path to source notebook
            nb_metadata_overrides: Override notebook config settings
            skip_validation: Set to True to skip validating the notebook's
                metadata. This is useful when the NotebookProcessor is used to
                parse only part of the notebook (e.g., to retrieve pipeline
                metrics) and the notebook config (for pipeline generation)
                might still be invalid.
        """
        self.nb_path = os.path.expanduser(nb_path)
        self.notebook = self._read_notebook()

        nb_metadata = self.notebook.metadata.get(KALE_NB_METADATA_KEY, dict())

        # fixme: needed?
        nb_metadata.update({"notebook_path": nb_path})
        if nb_metadata_overrides:
            nb_metadata.update(nb_metadata_overrides)
        # validate and populate defaults
        # FIXME: Maybe improve this by implementing a "skip_validation" flag
        #  in the config class
        self.config = None
        if not skip_validation:
            self.config = NotebookConfig(**nb_metadata)
        self.pipeline = Pipeline(self.config)

    def _read_notebook(self):
        if not os.path.exists(self.nb_path):
            raise ValueError("NotebookProcessor could not find a notebook at"
                             " path %s" % self.nb_path)
        return nb.read(self.nb_path, as_version=nb.NO_CONVERT)

    def to_pipeline(self):
        """Convert an annotated Notebook to a Pipeline object."""
        (pipeline_parameters_source, pipeline_metrics_source,
         imports_and_functions) = self.parse_notebook()

        self.parse_pipeline_parameters(pipeline_parameters_source)
        self.pipeline.set_volume_pipeline_parameters()

        # get a list of variables that need to be logged as pipeline metrics
        pipeline_metrics = astutils.parse_metrics_print_statements(
            pipeline_metrics_source)

        # run static analysis over the source code
        self.dependencies_detection(imports_and_functions)
        self.assign_metrics(pipeline_metrics)

        # if there are multiple DAG leaves, add an empty step at the end of the
        # pipeline for final snapshot
        leaf_steps = self.pipeline.get_leaf_steps()
        if self.config.autosnapshot and len(leaf_steps) > 1:
            _name = "final_auto_snapshot"
            self.pipeline.add_step(Step(name=_name, source=[]))
            # add a link from all the last steps of the pipeline to
            # the final auto snapshot one.
            for step in leaf_steps:
                self.pipeline.add_edge(step.name, _name)

        # FIXME: Move this to a base class Processor, to be executed by default
        #  after `to_pipeline`, so that it is agnostic to the type of
        #  processor.
        for step in self.pipeline.steps:
            step.config.update(self.pipeline.config.steps_defaults)

        # TODO: Additional action required:
        #  Run a static analysis over every step to check that pipeline
        #  parameters are not assigned with new values.
        return self.pipeline

    def parse_pipeline_parameters(self, source: str):
        """Get pipeline parameters from source code."""
        pipeline_parameters = astutils.parse_assignments_expressions(source)
        for name, (v_type, v_value) in pipeline_parameters.items():
            pipeline_parameters[name] = PipelineParam(v_type, v_value)
        self.pipeline.pipeline_parameters = pipeline_parameters

    def parse_notebook(self):
        """Creates a NetworkX graph based on the input notebook's tags.

        Cells' source code is embedded into the graph as node attributes.
        """
        # will be assigned at the end of each for loop
        prev_step_name = None

        # All the code cells that have to be pre-pended to every pipeline step
        # (i.e., imports and functions) are merged here
        imports_block = list()
        functions_block = list()

        # Variables that will become pipeline parameters
        pipeline_parameters = list()
        # Variables that will become pipeline metrics
        pipeline_metrics = list()

        for c in self.notebook.cells:
            if c.cell_type != "code":
                continue

            tags = self.parse_cell_metadata(c.metadata)

            if len(tags['step_names']) > 1:
                raise NotImplementedError("Kale does not yet support multiple"
                                          " step names in a single notebook"
                                          " cell. One notebook cell was found"
                                          " with %s  step names" %
                                          tags['step_names'])

            step_name = (tags['step_names'][0]
                         if 0 < len(tags['step_names']) else None)

            if step_name == 'skip':
                # when the cell is skipped, don't store `skip` as the previous
                # active cell
                continue
            if step_name == 'pipeline-parameters':
                pipeline_parameters.append(c.source)
                prev_step_name = step_name
                continue
            if step_name == 'imports':
                imports_block.append(c.source)
                prev_step_name = step_name
                continue
            if step_name == 'functions':
                functions_block.append(c.source)
                prev_step_name = step_name
                continue
            if step_name == 'pipeline-metrics':
                pipeline_metrics.append(c.source)
                prev_step_name = step_name
                continue

            # if none of the above apply, then we are parsing a code cell with
            # a step name and (possibly) some dependencies

            # if the cell was not tagged with a step name,
            # add the code to the previous cell
            if not step_name:
                if prev_step_name == 'imports':
                    imports_block.append(c.source)
                elif prev_step_name == 'functions':
                    functions_block.append(c.source)
                elif prev_step_name == 'pipeline-parameters':
                    pipeline_parameters.append(c.source)
                elif prev_step_name == 'pipeline-metrics':
                    pipeline_metrics.append(c.source)
                # prev_step_name might be None in case the first cells of the
                # notebook have not been tagged.
                elif prev_step_name:
                    # this notebook cell will be merged to a previous one that
                    # specified a step name
                    self.pipeline.get_step(prev_step_name).merge_code(c.source)
            else:
                # in this branch we are sure that we are reading a code cell
                # with a step tag, so we must not allow for pipeline-metrics
                if prev_step_name == 'pipeline-metrics':
                    raise ValueError("Tag pipeline-metrics must be placed on a"
                                     " cell at the end of the Notebook."
                                     " Pipeline metrics should be considered"
                                     " as a result of the pipeline execution"
                                     " and not of single steps.")
                # add node to DAG, adding tags and source code of notebook cell
                if step_name not in self.pipeline.nodes:
                    step = Step(name=step_name,
                                source=[c.source],
                                ins=set(),
                                outs=set(),
                                limits=tags.get("limits", {}),
                                labels=tags.get("labels", {}),
                                annotations=tags.get("annotations", {}))
                    self.pipeline.add_step(step)
                    for _prev_step in tags['prev_steps']:
                        if _prev_step not in self.pipeline.nodes:
                            raise ValueError("Step %s does not exist. It was "
                                             "defined as previous step of %s" %
                                             (_prev_step, tags['step_names']))
                        self.pipeline.add_edge(_prev_step, step_name)
                else:
                    self.pipeline.get_step(step_name).merge_code(c.source)

                prev_step_name = step_name

        # Prepend any `imports` and `functions` cells to every Pipeline step
        for step in self.pipeline.steps:
            step.source = imports_block + functions_block + step.source

        # merge together pipeline parameters
        pipeline_parameters = '\n'.join(pipeline_parameters)
        # merge together pipeline metrics
        pipeline_metrics = '\n'.join(pipeline_metrics)

        imports_and_functions = "\n".join(imports_block + functions_block)
        return pipeline_parameters, pipeline_metrics, imports_and_functions

    def parse_cell_metadata(self, metadata):
        """Parse a notebook's cell's metadata field.

        The Kale UI writes specific tags inside the 'tags' field, as a list
        of string tags. Supported tags are defined by _TAGS_LANGUAGE.

        Args:
            metadata (dict): a dict containing a notebook's cell's metadata

        Returns (dict): parsed tags based on Kale tagging language

        """
        parsed_tags = dict()

        # `step_names` is a list because a notebook cell might be assigned to
        # more than one Pipeline step.
        parsed_tags['step_names'] = list()
        parsed_tags['prev_steps'] = list()
        # define intermediate variables so that dicts are not added to a step
        # when they are empty
        cell_annotations = dict()
        cell_labels = dict()
        cell_limits = dict()

        # the notebook cell was not tagged
        if 'tags' not in metadata or len(metadata['tags']) == 0:
            return parsed_tags

        for t in metadata['tags']:
            if not isinstance(t, str):
                raise ValueError(
                    "Tags must be string. Found tag %s of type %s" %
                    (t, type(t)))
            # Check that the tag is defined by the Kale tagging language
            if not any(re.match(_t, t) for _t in _TAGS_LANGUAGE):
                raise ValueError("Unrecognized tag: {}".format(t))

            # Special tags have a specific effect on the cell they belong to.
            # Specifically:
            #  - skip: ignore the notebook cell
            #  - pipeline-parameters: use the cell to populate Pipeline
            #       parameters. The cell must contain only assignment
            #       expressions
            #  - pipeline-metrics: use the cell to populate Pipeline metrics.
            #       The cell must contain only variable names
            #  - imports: the code of the corresponding cell(s) will be
            #       prepended to every Pipeline step
            #  - functions: same as imports, but the corresponding code is
            #       placed **after** `imports`
            special_tags = [
                'skip', 'pipeline-parameters', 'pipeline-metrics', 'imports',
                'functions'
            ]
            if t in special_tags:
                parsed_tags['step_names'] = [t]
                return parsed_tags

            # now parse the remaining tags: `annotation`, `label`, `limit`,
            # `block|step` and `prev`.
            tag_parts = t.split(':')
            tag_name = tag_parts.pop(0)

            if tag_name == "annotation":
                key, value = get_annotation_or_label_from_tag(tag_parts)
                cell_annotations.update({key: value})

            if tag_name == "label":
                key, value = get_annotation_or_label_from_tag(tag_parts)
                cell_labels.update({key: value})

            if tag_name == "limit":
                key, value = get_limit_from_tag(tag_parts)
                cell_limits.update({key: value})

            # name of the future Pipeline step
            # TODO: Deprecate `block` in future release
            if tag_name in ["block", "step"]:
                if tag_name == "block":
                    warnings.warn(
                        "`block` tag will be deprecated in a future"
                        " version, use `step` tag instead", DeprecationWarning)
                step_name = tag_parts.pop(0)
                parsed_tags['step_names'].append(step_name)
            # name(s) of the father Pipeline step(s)
            if tag_name == "prev":
                prev_step_name = tag_parts.pop(0)
                parsed_tags['prev_steps'].append(prev_step_name)

        if not parsed_tags['step_names'] and parsed_tags['prev_steps']:
            raise ValueError(
                "A cell can not provide `prev` annotations without "
                "providing a `block` or `step` annotation as well")

        if cell_annotations:
            if not parsed_tags['step_names']:
                raise ValueError(
                    "A cell can not provide Pod annotations in a cell"
                    " that does not declare a step name.")
            parsed_tags['annotations'] = cell_annotations

        if cell_limits:
            if not parsed_tags['step_names']:
                raise ValueError(
                    "A cell can not provide Pod resource limits in a"
                    " cell that does not declare a step name.")
            parsed_tags['limits'] = cell_limits
        return parsed_tags
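
    # A hedged illustration of the mapping parse_cell_metadata produces (the
    # tag values below are made up):
    #     {"tags": ["step:train", "prev:load", "limit:nvidia.com/gpu:1"]}
    # would be parsed into something like:
    #     {"step_names": ["train"], "prev_steps": ["load"],
    #      "limits": {"nvidia.com/gpu": "1"}}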

    def get_pipeline_parameters_source(self):
        """Get just pipeline parameters cells from the notebook.

        Returns (str): pipeline parameters source code
        """
        return self._get_reserved_tag_source(PIPELINE_PARAMETERS_TAG)

    def get_pipeline_metrics_source(self):
        """Get just pipeline metrics cells from the notebook.

        Returns (str): pipeline metrics source code
        """
        # check that the pipeline metrics tag is only assigned to cells at
        # the end of the notebook
        detected = False
        tags = _TAGS_LANGUAGE[:]
        tags.remove(PIPELINE_METRICS_TAG)

        for c in self.notebook.cells:
            # parse only source code cells
            if c.cell_type != "code":
                continue

            # if we see a pipeline-metrics tag, set the flag
            if (('tags' in c.metadata and len(c.metadata['tags']) > 0 and any(
                    re.match(PIPELINE_METRICS_TAG, t)
                    for t in c.metadata['tags']))):
                detected = True
                continue

            # if we have the flag set and we detect any other tag from the tags
            # language, then raise error
            if (detected and 'tags' in c.metadata
                    and len(c.metadata['tags']) > 0 and any([
                        any(re.match(tag, t) for t in c.metadata['tags'])
                        for tag in tags
                    ])):
                raise ValueError(
                    "Tag pipeline-metrics tag must be placed on a "
                    "cell at the end of the Notebook."
                    " Pipeline metrics should be considered as a"
                    " result of the pipeline execution and not of"
                    " single steps.")
        return self._get_reserved_tag_source(PIPELINE_METRICS_TAG)

    def _get_reserved_tag_source(self, search_tag):
        """Get just the specific tag's source code.

        When searching for tag x, will return all cells that are tagged with x
        plus any untagged cells that follow them. The result is a multiline
        string containing all the python code associated to x.

        Note: This is designed for 'special' tags, as the STEP_TAG is excluded
              from the match.

        Args:
            search_tag (str): the target tag

        Returns: the unified code of all the cells belonging to `search_tag`
        """
        detected = False
        source = ''

        language = _TAGS_LANGUAGE[:]
        language.remove(search_tag)

        for c in self.notebook.cells:
            # parse only source code cells
            if c.cell_type != "code":
                continue
            # in case the previous cell was a `search_tag` cell and this
            # cell is not any other tag of the tag language:
            if (detected and
                (('tags' not in c.metadata or len(c.metadata['tags']) == 0)
                 or all([
                     not any(re.match(tag, t) for t in c.metadata['tags'])
                     for tag in language
                 ]))):
                source += '\n' + c.source
            elif (
                ('tags' in c.metadata and len(c.metadata['tags']) > 0
                 and any(re.match(search_tag, t)
                         for t in c.metadata['tags']))):
                source += '\n' + c.source
                detected = True
            else:
                detected = False
        return source.strip()

    def assign_metrics(self, pipeline_metrics: dict):
        """Assign pipeline metrics to specific pipeline steps.

        This assignment follows a logic similar to the detection of `out`
        dependencies. Starting from a temporary step - child of all the leaf
        nodes, all the nodes in the pipeline are traversed in reverse
        topological order. When a step's code contains one of the metrics,
        that metric is assigned to the step.

        Args:
            pipeline_metrics (dict): a dict of pipeline metrics where the key
                is always the KFP sanitized name and the value is the name of
                the original variable.
        """
        # create a temporary step at the end of the pipeline to simplify the
        # iteration from the leaf steps
        tmp_step_name = "_tmp"
        leaf_steps = self.pipeline.get_leaf_steps()
        if not leaf_steps:
            return
        for step in leaf_steps:
            self.pipeline.add_edge(step.name, tmp_step_name)

        # pipeline_metrics is a dict having sanitized variable names as keys
        # and the corresponding variable names as values. Here we need to refer
        # to the sanitized names using the python variables.
        # XXX: We could change parse_metrics_print_statements() to return the
        # XXX: reverse dictionary, but that would require changing either
        # XXX: rpc.nb.get_pipeline_metrics() or change in the JupyterLab
        # XXX: Extension parsing of the RPC result
        rev_pipeline_metrics = {v: k for k, v in pipeline_metrics.items()}
        metrics_left = set(rev_pipeline_metrics.keys())
        for anc in graphutils.get_ordered_ancestors(self.pipeline,
                                                    tmp_step_name):
            if not metrics_left:
                break

            anc_step = self.pipeline.get_step(anc)
            anc_source = '\n'.join(anc_step.source)
            # get all the marshal candidates from father's source and intersect
            # with the metrics that have not been matched yet
            marshal_candidates = astutils.get_marshal_candidates(anc_source)
            assigned_metrics = metrics_left.intersection(marshal_candidates)
            # Remove the metrics that have already been assigned.
            metrics_left.difference_update(assigned_metrics)
            # Generate code to produce the metrics artifact in the current step
            if assigned_metrics:
                code = METRICS_TEMPLATE % ("    " + ",\n    ".join([
                    '"%s": %s' % (rev_pipeline_metrics[x], x)
                    for x in sorted(assigned_metrics)
                ]))
                anc_step.source.append(code)
            # need to have a `metrics` flag set to true in order to set the
            # metrics output artifact in the pipeline template
            anc_step.metrics = True

        self.pipeline.remove_node(tmp_step_name)

    def dependencies_detection(self, imports_and_functions: str = ""):
        """Detect the data dependencies between nodes in the graph.

        The data dependencies detection algorithm roughly works as follows:

        1. Traversing the graph in topological order, for every node `step` do
        2. Detect the `ins` of current `step` by running PyFlakes on the source
         code. During this action the pipeline parameters are taken into
         consideration
        3. Parse `step`'s global function definitions to get free variables
         (i.e. variables that would need to be marshalled in other steps that
         call these functions) - in this action pipeline parameters are taken
         into consideration.
        4. Get all the functions that `step` calls
        5. For every `step`'s ancestor `anc` do
            - Get all the potential names (objects, functions, ...) of `anc`
             that could be marshalled (saved)
            - Intersect this with the `step`'s `ins` (from action 2) and add
             the result to `anc`'s `outs`.
            - for every `step`'s function call (action 4), check if this
             function was defined in `anc` and if it has free variables
             (action 3). If so, add to `step`'s `ins` and to `anc`'s `outs`
             these free variables.

        Args:
            imports_and_functions: Multiline Python source that is prepended to
                every pipeline step

        Returns: annotated graph
        """
        # resolve the data dependencies between steps, looping through the
        # graph
        for step in self.pipeline.steps:
            # detect the INS dependencies of the CURRENT node------------------
            step_source = '\n'.join(step.source)
            # get the variables that this step is missing and the pipeline
            # parameters that it actually needs.
            ins, parameters = self._detect_in_dependencies(
                source_code=step_source,
                pipeline_parameters=self.pipeline.pipeline_parameters)
            fns_free_variables = self._detect_fns_free_variables(
                step_source, imports_and_functions,
                self.pipeline.pipeline_parameters)

            # Get all the function calls. This will be used below to check if
            # any of the ancestors declare any of these functions. If that is
            # so, the free variables of those functions will have to be loaded.
            fn_calls = astutils.get_function_calls(step_source)

            # add OUT dependencies annotations in the PARENT nodes-------------
            # Intersect the missing names of this step with each of its
            # fathers' names. The intersection is the list of variables
            # that the father needs to serialize.
            # The ancestors are the nodes that have a path to `step`,
            # ordered by path length.
            ins_left = ins.copy()
            for anc in (graphutils.get_ordered_ancestors(
                    self.pipeline, step.name)):
                if not ins_left:
                    # if there are no more variables that need to be
                    # marshalled, stop the graph traverse
                    break
                anc_step = self.pipeline.get_step(anc)
                anc_source = '\n'.join(anc_step.source)
                # get all the marshal candidates from father's source and
                # intersect with the required names of the current node
                marshal_candidates = astutils.get_marshal_candidates(
                    anc_source)
                outs = ins_left.intersection(marshal_candidates)
                # Remove the ins that have already been assigned to an ancestor
                ins_left.difference_update(outs)
                # Include free variables
                to_remove = set()
                for fn_call in fn_calls:
                    anc_fns_free_vars = anc_step.fns_free_variables
                    if fn_call in anc_fns_free_vars.keys():
                        # the current step needs to load these variables
                        fn_free_vars, used_params = anc_fns_free_vars[fn_call]
                        # search if this function calls other functions (i.e.
                        # if its free variables are found in the free variables
                        # dict)
                        _left = list(fn_free_vars)
                        while _left:
                            _cur = _left.pop(0)
                            # if the free var is itself a fn with free vars
                            if _cur in anc_fns_free_vars:
                                fn_free_vars.update(anc_fns_free_vars[_cur][0])
                                _left = _left + list(
                                    anc_fns_free_vars[_cur][0])
                        ins.update(fn_free_vars)
                        # the current ancestor needs to save these variables
                        outs.update(fn_free_vars)
                        # add the parameters used by the function to the list
                        # of pipeline parameters used by the step
                        _pps = self.pipeline.pipeline_parameters
                        for param in used_params:
                            parameters[param] = _pps[param]

                        # Remove this function as it has been served. We don't
                        # want other ancestors to save free variables for this
                        # function. Using the helper to_remove because the set
                        # can not be resized during iteration.
                        to_remove.add(fn_call)
                        # add the function and its free variables to the
                        # current step as well. This is useful in case
                        # *another* function will call this one (`fn_call`) in
                        # a child step. In this way we can track the calls up
                        # to the last free variable. (refer to test
                        # `test_dependencies_detection_recursive`)
                        fns_free_variables[fn_call] = anc_fns_free_vars[
                            fn_call]
                fn_calls.difference_update(to_remove)
                # Add to ancestor the new outs annotations. First merge the
                # current outs present in the anc with the new ones
                anc_step.outs.update(outs)

            step.ins = sorted(ins)
            step.parameters = parameters
            step.fns_free_variables = fns_free_variables

    def _detect_in_dependencies(self,
                                source_code: str,
                                pipeline_parameters: dict = None):
        """Detect missing names from one pipeline step source code.

        Args:
            source_code: Multiline Python source code
            pipeline_parameters: Pipeline parameters dict
        """
        commented_source_code = utils.comment_magic_commands(source_code)
        ins = flakeutils.pyflakes_report(code=commented_source_code)

        # Pipeline parameters will be part of the names that are missing,
        # but of course we don't want to marshal them in as they will be
        # present as parameters
        relevant_parameters = set()
        if pipeline_parameters:
            # Not all pipeline parameters are needed in every pipeline step,
            # these are the parameters that are actually needed by this step.
            relevant_parameters = ins.intersection(pipeline_parameters.keys())
            ins.difference_update(relevant_parameters)
        step_params = {k: pipeline_parameters[k] for k in relevant_parameters}
        return ins, step_params

    def _detect_fns_free_variables(self,
                                   source_code: str,
                                   imports_and_functions: str = "",
                                   step_parameters: dict = None):
        """Return the function's free variables.

        Free variable: _If a variable is used in a code block but not defined
        there, it is a free variable._

        An Example:

        ```
        x = 5
        def foo():
            print(x)
        ```

        In the example above, `x` is a free variable for function `foo`,
        because it is defined outside of the context of `foo`.

        Here we run the PyFlakes report over the function body to get all the
        missing names (i.e. free variables), excluding the function arguments.

        Args:
            source_code: Multiline Python source code
            imports_and_functions: Multiline Python source that is prepended
                to every pipeline step. It should contain the code cells that
                were tagged as `imports` and `functions`. We prepend this code
                to the function body because it will always be present in any
                pipeline step.
            step_parameters: Step parameters names. The step parameters
                are removed from the pyflakes report, as these names will
                always be available in the step's context.

        Returns (dict): A dictionary with the name of the function as key and
            a tuple of (free variables, consumed pipeline parameters) as value.
        """
        fns_free_vars = dict()
        # now check the functions' bodies for free variables. fns is a
        # dict function_name -> function_source
        fns = astutils.parse_functions(source_code)
        for fn_name, fn in fns.items():
            code = imports_and_functions + "\n" + fn
            free_vars = flakeutils.pyflakes_report(code=code)
            # the pipeline parameters that are used in the function
            consumed_params = {}
            if step_parameters:
                consumed_params = free_vars.intersection(
                    step_parameters.keys())
                # remove the used parameters from the free variables, as they
                # need to be handled differently.
                free_vars.difference_update(consumed_params)
            fns_free_vars[fn_name] = (free_vars, consumed_params)
        return fns_free_vars
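
A hedged end-to-end sketch of the tagging language that parse_notebook
consumes. The cells and tag values are illustrative; nbformat is the library
this module reads notebooks with (imported as `nb` above), and the trailing
comment describes the expected outcome rather than asserting it:

import nbformat

notebook = nbformat.v4.new_notebook()
notebook.cells = [
    nbformat.v4.new_code_cell("import math",
                              metadata={"tags": ["imports"]}),
    nbformat.v4.new_code_cell("lr = 0.1",
                              metadata={"tags": ["pipeline-parameters"]}),
    nbformat.v4.new_code_cell("data = [1, 2, 3]",
                              metadata={"tags": ["step:load"]}),
    nbformat.v4.new_code_cell("model = sum(d * lr for d in data)",
                              metadata={"tags": ["step:train", "prev:load"]}),
    nbformat.v4.new_code_cell("print(model)",
                              metadata={"tags": ["pipeline-metrics"]}),
]
# Fed to NotebookProcessor.to_pipeline(), such a notebook would yield two
# steps ("load" -> "train"), both prefixed with the `imports` cell, with `lr`
# exposed as a pipeline parameter and `model` as a pipeline metric.
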
Example no. 16
class BaseProcessor(ABC):
    """Provides basic tools for processors to generate a Pipeline object."""

    id: str
    no_op_step: Step
    config_cls = PipelineConfig

    def __init__(self,
                 config: PipelineConfig = None,
                 skip_validation: bool = False,
                 **kwargs):
        self.config = config
        if not config and not skip_validation:
            self.config = self.config_cls(**kwargs)
        self.pipeline = Pipeline(self.config)

    def run(self) -> Pipeline:
        """Process the source into a Pipeline object."""
        self.to_pipeline()
        self._post_pipeline()
        return self.pipeline

    @abstractmethod
    def to_pipeline(self):
        """A processor class is supposed to extend this method."""
        pass

    def _post_pipeline(self):
        # keep reference to original processor, so the pipeline knows
        # what backend generated it.
        self.pipeline.processor = self
        self._add_final_autosnapshot_step()
        self._configure_poddefaults()
        self._apply_steps_defaults()
        self._set_volume_pipeline_parameters()

    def _add_final_autosnapshot_step(self):
        if not self.no_op_step:
            raise RuntimeError("Processor class needs to define a no-op step.")
        leaf_steps = self.pipeline.get_leaf_steps()
        if self.config.autosnapshot and len(leaf_steps) > 1:
            _step = copy.deepcopy(self.no_op_step)
            _step.config.name = "final_auto_snapshot"
            self.pipeline.add_step(_step)
            # add a link from all the last steps of the pipeline to
            # the final auto snapshot one.
            for step in leaf_steps:
                self.pipeline.add_edge(step.name, _step.config.name)

    def _configure_poddefaults(self):
        # FIXME: We should reconsider the implementation of
        #  https://github.com/kubeflow-kale/kale/pull/175/files to
        #  avoid using an RPC and always detect PodDefaults here.
        _pod_defaults_labels = dict()
        try:
            _pod_defaults_labels = kfutils.find_poddefault_labels()
        except Exception as e:
            log.warning("Could not retrieve PodDefaults. Reason: %s", e)
        self.pipeline.config.steps_defaults["labels"] = {
            **self.pipeline.config.steps_defaults.get("labels", dict()),
            **_pod_defaults_labels
        }

    def _apply_steps_defaults(self):
        for step in self.pipeline.steps:
            step.config.update(self.pipeline.config.steps_defaults)

    def _set_volume_pipeline_parameters(self):
        """Create pipeline parameters for volumes to be mounted on steps."""
        volume_parameters = dict()
        for v in self.pipeline.config.volumes:  # type: VolumeConfig
            if v.type == 'pv':
                # FIXME: How should we handle existing PVs?
                continue
            if v.type == 'pvc':
                mount_point = v.mount_point.replace('/', '_').strip('_')
                par_name = "vol_{}".format(mount_point)
                volume_parameters[par_name] = PipelineParam("str", v.name)
            elif v.type == 'new_pvc':
                rok_url = v.annotations.get("rok/origin")
                if rok_url is not None:
                    par_name = "rok_{}_url".format(v.name.replace('-', '_'))
                    volume_parameters[par_name] = PipelineParam("str", rok_url)
            else:
                raise ValueError("Unknown volume type: {}".format(v.type))
        self.pipeline.pipeline_parameters.update(volume_parameters)
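
A hedged sketch of the parameter-name derivation in
_set_volume_pipeline_parameters, using plain values instead of real
VolumeConfig objects (the mount point, claim name and volume name are made
up):

# 'pvc' volumes: the mount point becomes part of the parameter name and the
# claim name becomes the parameter value.
mount_point = "/home/jovyan/data"
par_name = "vol_{}".format(mount_point.replace('/', '_').strip('_'))
assert par_name == "vol_home_jovyan_data"
# -> pipeline parameter {par_name: PipelineParam("str", "<claim name>")}

# 'new_pvc' volumes: a parameter is added only when a "rok/origin" annotation
# is present, and its name is derived from the volume name.
volume_name = "new-volume"
rok_par_name = "rok_{}_url".format(volume_name.replace('-', '_'))
assert rok_par_name == "rok_new_volume_url"
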
Example no. 17
def test_dependencies_detection_inner_function(dummy_nb_config):
    """Test dependencies detection with inner functions."""
    pipeline = Pipeline(dummy_nb_config)

    _source = ["x = 5"]
    pipeline.add_step(Step(name="step1", source=_source))
    _source = ['''
def foo():
    def bar(x):
        print(x)
    bar(5)
''']
    pipeline.add_step(Step(name="step2", source=_source))
    _source = ['''
foo()
print(x)
''']
    pipeline.add_step(Step(name="step3", source=_source))

    pipeline.add_edge("step1", "step2")
    pipeline.add_edge("step2", "step3")

    dependencies.dependencies_detection(pipeline)
    assert sorted(pipeline.get_step("step1").ins) == []
    assert sorted(pipeline.get_step("step1").outs) == ['x']
    assert sorted(pipeline.get_step("step2").ins) == []
    assert sorted(pipeline.get_step("step2").outs) == ['foo']
    assert sorted(pipeline.get_step("step3").ins) == ['foo', 'x']
    assert sorted(pipeline.get_step("step3").outs) == []
Example no. 18
def test_dependencies_detection_recursive_different_steps(dummy_nb_config):
    """Test dependencies are detected even with a chain of functions calls."""
    pipeline = Pipeline(dummy_nb_config)

    _source = ['''
x = 5
def foo():
    print(x)
''']
    pipeline.add_step(Step(name="step1", source=_source))
    _source = ['''
def bar():
    foo()
''']
    pipeline.add_step(Step(name="step2", source=_source))
    _source = ["bar()"]
    pipeline.add_step(Step(name="step3", source=_source))

    pipeline.add_edge("step1", "step2")
    pipeline.add_edge("step2", "step3")

    dependencies.dependencies_detection(pipeline)
    assert sorted(pipeline.get_step("step1").ins) == []
    assert sorted(pipeline.get_step("step1").outs) == ['foo', 'x']
    assert sorted(pipeline.get_step("step2").ins) == ['foo', 'x']
    assert sorted(pipeline.get_step("step2").outs) == ['bar', 'foo', 'x']
    assert sorted(pipeline.get_step("step3").ins) == ['bar', 'foo', 'x']
    assert sorted(pipeline.get_step("step3").outs) == []
Example no. 19
class NotebookProcessor:
    """Convert a Notebook to a Pipeline object."""
    def __init__(self,
                 nb_path: str,
                 nb_metadata_overrides: Dict[str, Any] = None,
                 skip_validation: bool = False):
        """Instantiate a new NotebookProcessor.

        Args:
            nb_path: Path to source notebook
            nb_metadata_overrides: Override notebook config settings
            skip_validation: Set to True to skip validating the notebook's
                metadata. This is useful when the NotebookProcessor is used to
                parse only part of the notebook (e.g., to retrieve pipeline
                metrics) and the notebook config (for pipeline generation)
                might still be invalid.
        """
        self.nb_path = os.path.expanduser(nb_path)
        self.notebook = self._read_notebook()

        nb_metadata = self.notebook.metadata.get(KALE_NB_METADATA_KEY, dict())

        # fixme: needed?
        nb_metadata.update({"notebook_path": nb_path})
        if nb_metadata_overrides:
            nb_metadata.update(nb_metadata_overrides)
        # validate and populate defaults
        # FIXME: Maybe improve this by implementing a "skip_validation" flag
        #  in the config class
        self.config = None
        if not skip_validation:
            self.config = NotebookConfig(**nb_metadata)
        self.pipeline = Pipeline(self.config)

    def _read_notebook(self):
        if not os.path.exists(self.nb_path):
            raise ValueError("NotebookProcessor could not find a notebook at"
                             " path %s" % self.nb_path)
        return nb.read(self.nb_path, as_version=nb.NO_CONVERT)

    def to_pipeline(self):
        """Convert an annotated Notebook to a Pipeline object."""
        (pipeline_parameters_source, pipeline_metrics_source,
         imports_and_functions) = self.parse_notebook()

        self.parse_pipeline_parameters(pipeline_parameters_source)
        self.pipeline.set_volume_pipeline_parameters()

        # get a list of variables that need to be logged as pipeline metrics
        pipeline_metrics = ast.parse_metrics_print_statements(
            pipeline_metrics_source)

        # run static analysis over the source code
        dependencies.dependencies_detection(
            self.pipeline, imports_and_functions=imports_and_functions)
        dependencies.assign_metrics(self.pipeline, pipeline_metrics)

        # if there are multiple DAG leaves, add an empty step at the end of the
        # pipeline for final snapshot
        leaf_steps = self.pipeline.get_leaf_steps()
        if self.config.autosnapshot and len(leaf_steps) > 1:
            _name = "final_auto_snapshot"
            self.pipeline.add_step(Step(name=_name, source=[]))
            # add a link from all the last steps of the pipeline to
            # the final auto snapshot one.
            for step in leaf_steps:
                self.pipeline.add_edge(step.name, _name)

        # FIXME: Move this to a base class Processor, to be executed by default
        #  after `to_pipeline`, so that it is agnostic to the type of
        #  processor.
        for step in self.pipeline.steps:
            step.config.update(self.pipeline.config.steps_defaults)

        # TODO: Additional action required:
        #  Run a static analysis over every step to check that pipeline
        #  parameters are not assigned with new values.
        return self.pipeline

    def parse_pipeline_parameters(self, source: str):
        """Get pipeline parameters from source code."""
        pipeline_parameters = ast.parse_assignments_expressions(source)
        for name, (v_type, v_value) in pipeline_parameters.items():
            pipeline_parameters[name] = ast.PipelineParam(v_type, v_value)
        self.pipeline.pipeline_parameters = pipeline_parameters

    def parse_notebook(self):
        """Creates a NetworkX graph based on the input notebook's tags.

        Cells' source code is embedded into the graph as node attributes.
        """
        # will be assigned at the end of each for loop
        prev_step_name = None

        # All the code cells that have to be pre-pended to every pipeline step
        # (i.e., imports and functions) are merged here
        imports_block = list()
        functions_block = list()

        # Variables that will become pipeline parameters
        pipeline_parameters = list()
        # Variables that will become pipeline metrics
        pipeline_metrics = list()

        for c in self.notebook.cells:
            if c.cell_type != "code":
                continue

            tags = self.parse_cell_metadata(c.metadata)

            if len(tags['step_names']) > 1:
                raise NotImplementedError("Kale does not yet support multiple"
                                          " step names in a single notebook"
                                          " cell. One notebook cell was found"
                                          " with %s  step names" %
                                          tags['step_names'])

            step_name = (tags['step_names'][0]
                         if tags['step_names'] else None)

            if step_name == 'skip':
                # when the cell is skipped, don't store `skip` as the previous
                # active cell
                continue
            if step_name == 'pipeline-parameters':
                pipeline_parameters.append(c.source)
                prev_step_name = step_name
                continue
            if step_name == 'imports':
                imports_block.append(c.source)
                prev_step_name = step_name
                continue
            if step_name == 'functions':
                functions_block.append(c.source)
                prev_step_name = step_name
                continue
            if step_name == 'pipeline-metrics':
                pipeline_metrics.append(c.source)
                prev_step_name = step_name
                continue

            # if none of the above apply, then we are parsing a code cell with
            # a block name and (possibly) some dependencies

            # if the cell was not tagged with a step name,
            # add the code to the previous cell
            if not step_name:
                if prev_step_name == 'imports':
                    imports_block.append(c.source)
                elif prev_step_name == 'functions':
                    functions_block.append(c.source)
                elif prev_step_name == 'pipeline-parameters':
                    pipeline_parameters.append(c.source)
                elif prev_step_name == 'pipeline-metrics':
                    pipeline_metrics.append(c.source)
                # prev_step_name might be None in case the first cells of the
                # notebook have not been tagged.
                elif prev_step_name:
                    # this notebook cell will be merged to a previous one that
                    # specified a step name
                    self.pipeline.get_step(prev_step_name).merge_code(c.source)
            else:
                # in this branch we are sure that we are reading a code cell
                # with a step tag, so we must not allow for pipeline-metrics
                if prev_step_name == 'pipeline-metrics':
                    raise ValueError("Tag pipeline-metrics must be placed on a"
                                     " cell at the end of the Notebook."
                                     " Pipeline metrics should be considered"
                                     " as a result of the pipeline execution"
                                     " and not of single steps.")
                # add node to DAG, adding tags and source code of notebook cell
                if step_name not in self.pipeline.nodes:
                    step = Step(name=step_name,
                                source=[c.source],
                                ins=set(),
                                outs=set(),
                                limits=tags.get("limits", {}),
                                labels=tags.get("labels", {}),
                                annotations=tags.get("annotations", {}))
                    self.pipeline.add_step(step)
                    for _prev_step in tags['prev_steps']:
                        if _prev_step not in self.pipeline.nodes:
                            raise ValueError("Step %s does not exist. It was "
                                             "defined as previous step of %s" %
                                             (_prev_step, tags['step_names']))
                        self.pipeline.add_edge(_prev_step, step_name)
                else:
                    self.pipeline.get_step(step_name).merge_code(c.source)

                prev_step_name = step_name

        # Prepend any `imports` and `functions` cells to every Pipeline step
        for step in self.pipeline.steps:
            step.source = imports_block + functions_block + step.source

        # merge together pipeline parameters
        pipeline_parameters = '\n'.join(pipeline_parameters)
        # merge together pipeline metrics
        pipeline_metrics = '\n'.join(pipeline_metrics)

        imports_and_functions = "\n".join(imports_block + functions_block)
        return pipeline_parameters, pipeline_metrics, imports_and_functions
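
    # A hedged illustration of the tagging-to-graph mapping (the cell layout
    # below is hypothetical): a notebook with cells tagged
    #
    #     imports                      -> prepended to every step
    #     step:load_data               -> node "load_data"
    #     step:train / prev:load_data  -> node "train", edge load_data -> train
    #     pipeline-metrics             -> merged into the pipeline_metrics source
    #
    # would produce a two-step pipeline where both steps start with the
    # imports (and functions) block and "train" depends on "load_data".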

    def parse_cell_metadata(self, metadata):
        """Parse a notebook's cell's metadata field.

        The Kale UI writes specific tags inside the 'tags' field, as a list
        of string tags. Supported tags are defined by _TAGS_LANGUAGE.

        Args:
            metadata (dict): a dict containing a notebook's cell's metadata

        Returns (dict): parsed tags based on Kale tagging language

        """
        parsed_tags = dict()

        # `step_names` is a list because a notebook cell might be assigned to
        # more than one Pipeline step.
        parsed_tags['step_names'] = list()
        parsed_tags['prev_steps'] = list()
        # define intermediate variables so that empty dicts are not added to
        # a step's parsed tags
        cell_annotations = dict()
        cell_labels = dict()
        cell_limits = dict()

        # the notebook cell was not tagged
        if 'tags' not in metadata or len(metadata['tags']) == 0:
            return parsed_tags

        for t in metadata['tags']:
            if not isinstance(t, str):
                raise ValueError(
                    "Tags must be string. Found tag %s of type %s" %
                    (t, type(t)))
            # Check that the tag is defined by the Kale tagging language
            if not any(re.match(_t, t) for _t in _TAGS_LANGUAGE):
                raise ValueError("Unrecognized tag: {}".format(t))

            # Special tags have a specific effect on the cell they belong to.
            # Specifically:
            #  - skip: ignore the notebook cell
            #  - pipeline-parameters: use the cell to populate Pipeline
            #       parameters. The cell must contain only assignment
            #       expressions
            #  - pipeline-metrics: use the cell to populate Pipeline metrics.
            #       The cell must contain only variable names
            #  - imports: the code of the corresponding cell(s) will be
            #       prepended to every Pipeline step
            #  - functions: same as imports, but the corresponding code is
            #       placed **after** `imports`
            special_tags = [
                'skip', 'pipeline-parameters', 'pipeline-metrics', 'imports',
                'functions'
            ]
            if t in special_tags:
                parsed_tags['step_names'] = [t]
                return parsed_tags

            # now parse the remaining tags: `annotation`, `label`, `limit`,
            # `block|step` and `prev`.
            tag_parts = t.split(':')
            tag_name = tag_parts.pop(0)

            if tag_name == "annotation":
                key, value = get_annotation_or_label_from_tag(tag_parts)
                cell_annotations.update({key: value})

            if tag_name == "label":
                key, value = get_annotation_or_label_from_tag(tag_parts)
                cell_labels.update({key: value})

            if tag_name == "limit":
                key, value = get_limit_from_tag(tag_parts)
                cell_limits.update({key: value})

            # name of the future Pipeline step
            # TODO: Deprecate `block` in future release
            if tag_name in ["block", "step"]:
                if tag_name == "block":
                    warnings.warn(
                        "`block` tag will be deprecated in a future"
                        " version, use `step` tag instead", DeprecationWarning)
                step_name = tag_parts.pop(0)
                parsed_tags['step_names'].append(step_name)
            # name(s) of the parent Pipeline step(s)
            if tag_name == "prev":
                prev_step_name = tag_parts.pop(0)
                parsed_tags['prev_steps'].append(prev_step_name)

        if not parsed_tags['step_names'] and parsed_tags['prev_steps']:
            raise ValueError(
                "A cell can not provide `prev` annotations without "
                "providing a `block` or `step` annotation as well")

        if cell_annotations:
            if not parsed_tags['step_names']:
                raise ValueError(
                    "A cell cannot provide Pod annotations without"
                    " declaring a step name.")
            parsed_tags['annotations'] = cell_annotations

        if cell_labels:
            if not parsed_tags['step_names']:
                raise ValueError(
                    "A cell cannot provide Pod labels without"
                    " declaring a step name.")
            parsed_tags['labels'] = cell_labels

        if cell_limits:
            if not parsed_tags['step_names']:
                raise ValueError(
                    "A cell cannot provide Pod resource limits without"
                    " declaring a step name.")
            parsed_tags['limits'] = cell_limits
        return parsed_tags
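
    # A hedged example of the parsing outcome (the metadata below is
    # hypothetical): a cell whose metadata contains
    #
    #     {"tags": ["step:train", "prev:load_data", "limit:nvidia.com/gpu:1"]}
    #
    # would roughly be parsed into
    #
    #     {"step_names": ["train"],
    #      "prev_steps": ["load_data"],
    #      "limits": {"nvidia.com/gpu": "1"}}
    #
    # with the exact limit key/value split depending on get_limit_from_tag().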

    def get_pipeline_parameters_source(self):
        """Get just pipeline parameters cells from the notebook.

        Returns (str): pipeline parameters source code
        """
        return self._get_reserved_tag_source(PIPELINE_PARAMETERS_TAG)

    def get_pipeline_metrics_source(self):
        """Get just pipeline metrics cells from the notebook.

        Returns (str): pipeline metrics source code
        """
        # check that the pipeline metrics tag is only assigned to cells at
        # the end of the notebook
        detected = False
        tags = _TAGS_LANGUAGE[:]
        tags.remove(PIPELINE_METRICS_TAG)

        for c in self.notebook.cells:
            # parse only source code cells
            if c.cell_type != "code":
                continue

            # if we see a pipeline-metrics tag, set the flag
            if (('tags' in c.metadata and len(c.metadata['tags']) > 0 and any(
                    re.match(PIPELINE_METRICS_TAG, t)
                    for t in c.metadata['tags']))):
                detected = True
                continue

            # if we have the flag set and we detect any other tag from the tags
            # language, then raise error
            if (detected and 'tags' in c.metadata
                    and len(c.metadata['tags']) > 0 and any([
                        any(re.match(tag, t) for t in c.metadata['tags'])
                        for tag in tags
                    ])):
                raise ValueError(
                    "Tag pipeline-metrics tag must be placed on a "
                    "cell at the end of the Notebook."
                    " Pipeline metrics should be considered as a"
                    " result of the pipeline execution and not of"
                    " single steps.")
        return self._get_reserved_tag_source(PIPELINE_METRICS_TAG)
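
    # For instance (hypothetical cell ordering), this layout passes the check:
    #
    #     [step:train] ... [step:evaluate] ... [pipeline-metrics]
    #
    # while placing any other Kale-tagged cell after a pipeline-metrics cell,
    # e.g.
    #
    #     [pipeline-metrics] ... [step:evaluate]
    #
    # raises the ValueError above, since metrics must close the notebook.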

    def _get_reserved_tag_source(self, search_tag):
        """Get just the specific tag's source code.

        When searching for tag x, this returns all cells that are tagged with
        x, plus any untagged cells that immediately follow a cell tagged with
        x. The result is a multiline string containing all the Python code
        associated with x.

        Note: This is designed for 'special' tags, as the STEP_TAG is excluded
              from the match.

        Args:
            search_tag (str): the target tag

        Returns: the unified code of all the cells belonging to `search_tag`
        """
        detected = False
        source = ''

        language = _TAGS_LANGUAGE[:]
        language.remove(search_tag)

        for c in self.notebook.cells:
            # parse only source code cells
            if c.cell_type != "code":
                continue
            # in case the previous cell was a `search_tag` cell and this
            # cell does not carry any other tag of the tag language:
            if (detected and
                (('tags' not in c.metadata or len(c.metadata['tags']) == 0)
                 or all([
                     not any(re.match(tag, t) for t in c.metadata['tags'])
                     for tag in language
                 ]))):
                source += '\n' + c.source
            elif (
                ('tags' in c.metadata and len(c.metadata['tags']) > 0
                 and any(re.match(search_tag, t)
                         for t in c.metadata['tags']))):
                source += '\n' + c.source
                detected = True
            else:
                detected = False
        return source.strip()
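
    # A short sketch of the matching behaviour (hypothetical cells): with
    # search_tag == PIPELINE_PARAMETERS_TAG and cells
    #
    #     cell 1: tags=["pipeline-parameters"], source "LR = 0.1"
    #     cell 2: untagged,                     source "EPOCHS = 10"
    #     cell 3: tags=["step:train"],          source "train(LR, EPOCHS)"
    #
    # the method returns "LR = 0.1\nEPOCHS = 10": cell 2 is included because
    # it follows a matching cell, while cell 3 resets the `detected` flag.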
Example n. 20
0
def dependencies_detection(pipeline: Pipeline,
                           imports_and_functions: str = ""):
    """Detect the data dependencies between nodes in the graph.

    The data dependencies detection algorithm roughly works as follows:

    1. Traversing the graph in topological order, for every node `step` do
    2. Detect the `ins` of the current `step` by running PyFlakes on its
     source code. During this action the pipeline parameters are taken into
     consideration
    3. Parse `step`'s global function definitions to get free variables
     (i.e. variables that would need to be marshalled in other steps that call
     these functions) - in this action pipeline parameters are taken into
     consideration as well
    4. Get all the functions that `step` calls
    5. For every `step`'s ancestor `anc` do
        - Get all the potential names (objects, functions, ...) of `anc` that
         could be marshalled (saved)
        - Intersect this with the `step`'s `ins` (from action 2) and add the
         result to `anc`'s `outs`.
        - for every `step`'s function call (action 4), check if this function
         was defined in `anc` and if it has free variables (action 3). If so,
         add to `step`'s `ins` and to `anc`'s `outs` these free variables.

    Args:
        pipeline: Pipeline object
        imports_and_functions: Multiline Python source that is prepended to
            every pipeline step

    Returns: annotated graph
    """
    # resolve the data dependencies between steps, looping through the graph
    for step in pipeline.steps:
        # detect the INS dependencies of the CURRENT node----------------------
        step_source = '\n'.join(step.source)
        # get the variables that this step is missing and the pipeline
        # parameters that it actually needs.
        ins, parameters = detect_in_dependencies(
            source_code=step_source,
            pipeline_parameters=pipeline.pipeline_parameters)
        fns_free_variables = detect_fns_free_variables(
            step_source, imports_and_functions, pipeline.pipeline_parameters)

        # Get all the function calls. This will be used below to check if any
        # of the ancestors declares any of these functions. If that is so, the
        # free variables of those functions will have to be loaded.
        fn_calls = kale_ast.get_function_calls(step_source)

        # add OUT dependencies annotations in the PARENT nodes-----------------
        # Intersect the missing names of the current step with all the names
        # available in each ancestor. The intersection is the list of
        # variables that the ancestor needs to serialize.
        # The ancestors are the nodes that have a path to `step`, ordered
        # by path length.
        ins_left = ins.copy()
        for anc in (graphutils.get_ordered_ancestors(pipeline, step.name)):
            if not ins_left:
                # if there are no more variables that need to be marshalled,
                # stop the graph traversal
                break
            anc_step = pipeline.get_step(anc)
            anc_source = '\n'.join(anc_step.source)
            # get all the marshal candidates from the ancestor's source and
            # intersect them with the required names of the current node
            marshal_candidates = kale_ast.get_marshal_candidates(anc_source)
            outs = ins_left.intersection(marshal_candidates)
            # Remove the ins that have already been assigned to an ancestor.
            ins_left.difference_update(outs)
            # Include free variables
            to_remove = set()
            for fn_call in fn_calls:
                anc_fns_free_vars = anc_step.fns_free_variables
                if fn_call in anc_fns_free_vars.keys():
                    # the current step needs to load these variables
                    fn_free_vars, used_params = anc_fns_free_vars[fn_call]
                    # search if this function calls other functions (i.e. if
                    # its free variables are found in the free variables dict)
                    _left = list(fn_free_vars)
                    while _left:
                        _cur = _left.pop(0)
                        # if the free var is itself a fn with free vars
                        if _cur in anc_fns_free_vars:
                            fn_free_vars.update(anc_fns_free_vars[_cur][0])
                            _left = _left + list(anc_fns_free_vars[_cur][0])
                    ins.update(fn_free_vars)
                    # the current ancestor needs to save these variables
                    outs.update(fn_free_vars)
                    # add the parameters used by the function to the list
                    # of pipeline parameters used by the step
                    for param in used_params:
                        parameters[param] = pipeline.pipeline_parameters[param]
                    # Remove this function as it has been handled. We don't
                    # want other ancestors to save free variables for this
                    # function. Using the helper to_remove because the set
                    # cannot be modified during iteration.
                    to_remove.add(fn_call)
                    # add the function and its free variables to the current
                    # step as well. This is useful in case *another* function
                    # will call this one (`fn_call`) in a child step. In this
                    # way we can track the calls up to the last free variable.
                    # (refer to test `test_dependencies_detection_recursive`)
                    fns_free_variables[fn_call] = anc_fns_free_vars[fn_call]
            fn_calls.difference_update(to_remove)
            # Add to ancestor the new outs annotations. First merge the current
            # outs present in the anc with the new ones
            anc_step.outs.update(outs)

        step.ins = sorted(ins)
        step.parameters = parameters
        step.fns_free_variables = fns_free_variables
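

# A hedged usage sketch (not part of the listing above): given a Pipeline
# whose steps already carry their source code, dependency detection could be
# driven directly, with `imports_and_functions` being the merged
# imports/functions cells produced by NotebookProcessor.parse_notebook():
#
#     dependencies_detection(pipeline,
#                            imports_and_functions="import numpy as np")
#     for step in pipeline.steps:
#         print(step.name, step.ins, sorted(step.outs))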