Example #1
    def generate_lightweight_component(self, step: Step):
        """Generate Python code using the function template."""
        step_source_raw = step.source

        def _encode_source(s):
            # Encode a multiline string line by line
            return "\n".join([
                line.encode("unicode_escape").decode("utf-8")
                for line in s.splitlines()
            ])

        if self.pipeline.processor.id == "nb":
            # Since the code will be wrapped in triple quotes inside the
            # template, we need to escape triple quotes as they will not be
            # escaped by encode("unicode_escape").
            step.source = [
                re.sub(r"'''", "\\'\\'\\'", _encode_source(s))
                for s in step_source_raw
            ]

        _template_filename = PIPELINE_ORIGIN.get(self.pipeline.processor.id)
        template = self._get_templating_env().get_template(_template_filename)
        fn_code = template.render(step=step, **self.pipeline.config.to_dict())
        # Fix code style according to PEP 8 guidelines
        return autopep8.fix_code(fn_code)
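
The escaping above only matters because the rendered code ends up inside a triple-quoted string in the template. The following is a minimal sketch of that idea, not Kale's actual template: the toy template, the _FakeStep stand-in and the exec-based body are assumptions made purely for illustration.

from jinja2 import Template

# Toy template in the spirit of the real one: every source block is wrapped
# in ''' ... ''', which is why a literal ''' inside a block must be escaped
# by hand and why newlines are safe once unicode_escape has turned them into
# the two characters "\n".
_sketch = Template(
    "def {{ step.name }}():\n"
    "{% for block in step.source %}"
    "    exec('''{{ block }}''')\n"
    "{% endfor %}"
)


class _FakeStep:
    """Stand-in for Kale's Step class, for illustration only."""
    name = "load_data"
    # what _encode_source would produce for the two-line cell "x = 1", "y = 2"
    source = ["x = 1\\ny = 2"]


print(_sketch.render(step=_FakeStep()))
# def load_data():
#     exec('''x = 1\ny = 2''')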
Example #2
    def parse_notebook(self):
        """Creates a NetworkX graph based on the input notebook's tags.

        Each cell's source code is embedded into the graph as a node attribute.
        """
        # re-assigned at the end of each loop iteration
        prev_step_name = None

        # All the code cells that have to be prepended to every pipeline step
        # (i.e., imports and functions) are merged here
        imports_block = list()
        functions_block = list()

        # Variables that will become pipeline parameters
        pipeline_parameters = list()
        # Variables that will become pipeline metrics
        pipeline_metrics = list()

        for c in self.notebook.cells:
            if c.cell_type != "code":
                continue

            tags = self.parse_cell_metadata(c.metadata)

            if len(tags['step_names']) > 1:
                raise NotImplementedError("Kale does not yet support multiple"
                                          " step names in a single notebook"
                                          " cell. One notebook cell was found"
                                          " with %s  step names" %
                                          tags['step_names'])

            step_name = (tags['step_names'][0]
                         if tags['step_names'] else None)

            if step_name == 'skip':
                # when the cell is skipped, don't store `skip` as the previous
                # active cell
                continue
            if step_name == 'pipeline-parameters':
                pipeline_parameters.append(c.source)
                prev_step_name = step_name
                continue
            if step_name == 'imports':
                imports_block.append(c.source)
                prev_step_name = step_name
                continue
            if step_name == 'functions':
                functions_block.append(c.source)
                prev_step_name = step_name
                continue
            if step_name == 'pipeline-metrics':
                pipeline_metrics.append(c.source)
                prev_step_name = step_name
                continue

            # if none of the above apply, then we are parsing a code cell with
            # a step name and (possibly) some dependencies

            # if the cell was not tagged with a step name,
            # add the code to the previous cell
            if not step_name:
                if prev_step_name == 'imports':
                    imports_block.append(c.source)
                elif prev_step_name == 'functions':
                    functions_block.append(c.source)
                elif prev_step_name == 'pipeline-parameters':
                    pipeline_parameters.append(c.source)
                elif prev_step_name == 'pipeline-metrics':
                    pipeline_metrics.append(c.source)
                # prev_step_name might be None in case the first cells of the
                # notebook have not been tagged.
                elif prev_step_name:
                    # this notebook cell will be merged to a previous one that
                    # specified a step name
                    self.pipeline.get_step(prev_step_name).merge_code(c.source)
            else:
                # in this branch we are sure that we are reading a code cell
                # with a step tag, so we must not allow for pipeline-metrics
                if prev_step_name == 'pipeline-metrics':
                    raise ValueError("Tag pipeline-metrics must be placed on a"
                                     " cell at the end of the Notebook."
                                     " Pipeline metrics should be considered"
                                     " as a result of the pipeline execution"
                                     " and not of single steps.")
                # add node to DAG, adding tags and source code of notebook cell
                if step_name not in self.pipeline.nodes:
                    step = Step(name=step_name,
                                source=[c.source],
                                ins=set(),
                                outs=set(),
                                limits=tags.get("limits", {}),
                                labels=tags.get("labels", {}),
                                annotations=tags.get("annotations", {}))
                    self.pipeline.add_step(step)
                    for _prev_step in tags['prev_steps']:
                        if _prev_step not in self.pipeline.nodes:
                            raise ValueError("Step %s does not exist. It was "
                                             "defined as previous step of %s" %
                                             (_prev_step, tags['step_names']))
                        self.pipeline.add_edge(_prev_step, step_name)
                else:
                    self.pipeline.get_step(step_name).merge_code(c.source)

                prev_step_name = step_name

        # Prepend any `imports` and `functions` cells to every Pipeline step
        for step in self.pipeline.steps:
            step.source = imports_block + functions_block + step.source

        # merge together pipeline parameters
        pipeline_parameters = '\n'.join(pipeline_parameters)
        # merge together pipeline metrics
        pipeline_metrics = '\n'.join(pipeline_metrics)

        imports_and_functions = "\n".join(imports_block + functions_block)
        return pipeline_parameters, pipeline_metrics, imports_and_functions
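
For reference, this is the shape of the NetworkX DAG that parse_notebook produces: one node per step with the merged cell source stored as a node attribute, and one edge per prev_steps entry. The snippet below is an illustration only, with made-up step names and source; it mirrors the add_step/add_edge pattern above rather than calling Kale's Pipeline API.

import networkx as nx

# Illustration only: a two-step pipeline graph of the kind the parser builds.
dag = nx.DiGraph()
dag.add_node("load_data",
             source=["import pandas as pd\ndf = pd.read_csv('data.csv')"])
dag.add_node("train",
             source=["model = train_model(df)"])
# "train" was tagged with prev_steps=['load_data'], which becomes an edge
dag.add_edge("load_data", "train")

# A topological sort gives a valid execution order for the pipeline steps
print(list(nx.topological_sort(dag)))  # ['load_data', 'train']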