Example #1
    def test_sequential_run_arg(self, dummy_context, dummy_dataframe, caplog):
        dummy_context.catalog.save("cars", dummy_dataframe)
        dummy_context.run(runner=SequentialRunner())

        log_msgs = [record.getMessage() for record in caplog.records]
        log_names = [record.name for record in caplog.records]
        assert "kedro.runner.sequential_runner" in log_names
        assert "Pipeline execution completed successfully." in log_msgs
Example #2
def _from_missing(pipeline, catalog):
    """Create a new pipeline based on missing outputs."""
    name = "kedro.runner.runner.AbstractRunner.run"
    with mock.patch(name) as run:
        SequentialRunner().run_only_missing(pipeline, catalog)
        _, args, _ = run.mock_calls[0]
    new_pipeline = args[0]
    return new_pipeline
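A brief usage sketch for the helper above (the test name and fixtures are hypothetical); it recovers the sub-pipeline that `run_only_missing` would schedule so its nodes can be inspected:

def test_from_missing_returns_sub_pipeline(pipeline, catalog):  # hypothetical fixtures
    new_pipeline = _from_missing(pipeline, catalog)
    # Only nodes whose outputs are not yet persisted should remain.
    assert {n.name for n in new_pipeline.nodes} <= {n.name for n in pipeline.nodes}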
Example #3
    def test_spark_load_save(self, is_async, data_catalog):
        """SparkDataSet(load) -> node -> Spark (save)."""
        pipeline = Pipeline([node(identity, "spark_in", "spark_out")])
        SequentialRunner(is_async=is_async).run(pipeline, data_catalog)

        save_path = Path(data_catalog._data_sets["spark_out"]._filepath)
        files = list(save_path.glob("*.parquet"))
        assert len(files) > 0
Example #4
 def test_conflict_feed_catalog(self, memory_catalog,
                                unfinished_outputs_pipeline,
                                conflicting_feed_dict):
     """ds1 and ds3 will be replaced with new inputs."""
     memory_catalog.add_feed_dict(conflicting_feed_dict, replace=True)
     outputs = SequentialRunner().run(unfinished_outputs_pipeline,
                                      memory_catalog)
     assert isinstance(outputs["ds8"], dict)
     assert outputs["ds8"]["data"] == 0
     assert isinstance(outputs["ds6"], pd.DataFrame)
Example #5
    def __init__(
        self,
        pipeline: Pipeline,
        catalog: DataCatalog,
        input_name: str,
        runner: Optional[AbstractRunner] = None,
        copy_mode: Optional[Union[Dict[str, str], str]] = None,
    ):
        """[summary]

        Args:
            pipeline (Pipeline): A Kedro Pipeline object to
            store as a Mlflow Model. Also works with kedro_mlflow PipelineML objects.

            catalog (DataCatalog): The DataCatalog associated
            to the PipelineMl

            runner (Optional[AbstractRunner], optional): The kedro
            AbstractRunner to use. Defaults to SequentialRunner if
            None.

            copy_mode (Optional[Union[Dict[str,str], str]]):
            The copy_mode of each DataSet of the catalog
            when reconstructing the DataCatalog in memory.
            You can pass either:
                - None to use Kedro default mode for each dataset
                - a single string ("deepcopy", "copy" and "assign")
                to apply to all datasets
                - a dictionnary with (dataset name, copy_mode) key/values
                pairs. The associated mode must be a valid kedro mode
                ("deepcopy", "copy" and "assign") for each. Defaults to None.
        """

        self.pipeline = (pipeline.inference if isinstance(
            pipeline, PipelineML) else pipeline)
        self.input_name = input_name
        self.initial_catalog = self._extract_pipeline_catalog(catalog)

        nb_outputs = len(self.pipeline.outputs())
        if nb_outputs != 1:
            outputs_list_str = "\n - ".join(self.pipeline.outputs())
            raise ValueError(
                f"Pipeline must have one and only one output, got '{nb_outputs}' outputs: \n - {outputs_list_str}"
            )
        self.output_name = list(self.pipeline.outputs())[0]
        self.runner = runner or SequentialRunner()
        self.copy_mode = copy_mode or {}
        # copy mode has been converted because it is a property
        # TODO: we need to use the runner's default dataset in case of multithreading
        self.loaded_catalog = DataCatalog(
            data_sets={
                name: MemoryDataSet(copy_mode=copy_mode)
                for name, copy_mode in self.copy_mode.items()
            })
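A minimal sketch of the copy_mode handling described above, using only kedro's public DataCatalog/MemoryDataSet API; the dataset names are illustrative, not taken from the source.

from kedro.io import DataCatalog, MemoryDataSet

# A single string applies the same copy mode to every dataset...
in_memory_all = DataCatalog(
    data_sets={name: MemoryDataSet(copy_mode="assign")
               for name in ["features", "model_input"]}  # hypothetical names
)

# ...while a dict selects a mode per dataset, as in the constructor above.
per_dataset_modes = {"features": "copy", "model_input": "deepcopy"}
in_memory_mixed = DataCatalog(
    data_sets={name: MemoryDataSet(copy_mode=mode)
               for name, mode in per_dataset_modes.items()}
)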
Example #6
def test_log_time_with_partial(recwarn):
    pipeline = Pipeline(
        [node(partial(identity, 1), None, "output",
              name="identity1")]).decorate(log_time)
    catalog = DataCatalog({}, dict(number=1))
    result = SequentialRunner().run(pipeline, catalog)
    assert result["output"] == 1
    warning = recwarn.pop(UserWarning)
    assert ("The node producing outputs `['output']` is made from a "
            "`partial` function. Partial functions do not have a "
            "`__name__` attribute" in str(warning.message))
Example #7
 def _run_one_task(self, config_filename):
     # create node from Task
     expand_config_node = mls.sl.workflows.tasks.ExpandConfigTask.get_node()
     config, log = self._init_config_log(config_filename,
                                         self.base_directory,
                                         self.config_directory)
     # Prepare a data catalog
     data_catalog = DataCatalog({
         'config': MemoryDataSet(),
         'log': MemoryDataSet(),
         'expanded_config': MemoryDataSet()
     })
     data_catalog.save('config', config)
     data_catalog.save('log', log)
     # Assemble nodes into a pipeline
     pipeline = Pipeline([expand_config_node])
     # Create a runner to run the pipeline
     runner = SequentialRunner()
     # Run the pipeline
     runner.run(pipeline, data_catalog)
     return log, data_catalog
Example #8
class CustomRunner(AbstractRunner):
    def __init__(self, *args, **kwargs):
        self.runner = SequentialRunner(*args, **kwargs)

    def run(self, *args, **kwargs):
        return self.runner.run_only_missing(*args, **kwargs)

    def _run(self, pipeline, catalog):
        return super()._run(pipeline, catalog)

    def create_default_data_set(self, ds_name):
        return super().create_default_data_set(ds_name)
Example #9
 def run(
         self,
         *args,  # type: Any
         runner=None,  # type: Union[AbstractRunner, str]
         **kwargs,  # type: Any
 ):
     # type: (...) -> Dict[str, Any]
     if isinstance(runner, str):
         assert runner in {"ParallelRunner", "SequentialRunner"}
         runner = (ParallelRunner()
                   if runner == "ParallelRunner" else SequentialRunner())
     return super().run(*args, runner=runner, **kwargs)
Example #10
def test_mlflow_pipeline_hook_with_different_pipeline_types(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    env_from_dict,
    pipeline_to_run,
    dummy_catalog,
    dummy_run_params,
    dummy_mlflow_conf,
):
    # config_with_base_mlflow_conf is a conftest fixture
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)
    pipeline_hook = MlflowPipelineHook(conda_env=env_from_dict,
                                       model_name="model")
    runner = SequentialRunner()
    pipeline_hook.before_pipeline_run(run_params=dummy_run_params,
                                      pipeline=pipeline_to_run,
                                      catalog=dummy_catalog)
    runner.run(pipeline_to_run, dummy_catalog, dummy_run_params["run_id"])
    run_id = mlflow.active_run().info.run_id
    pipeline_hook.after_pipeline_run(run_params=dummy_run_params,
                                     pipeline=pipeline_to_run,
                                     catalog=dummy_catalog)
    # test : parameters should have been logged
    mlflow_conf = get_mlflow_config(tmp_path)
    mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
    run_data = mlflow_client.get_run(run_id).data
    # all run_params are recorded as tags
    for k, v in dummy_run_params.items():
        if v:
            assert run_data.tags[k] == str(v)
    # params are not recorded because we don't have MlflowNodeHook here
    # and the model should not be logged when it is not a PipelineML
    nb_artifacts = len(mlflow_client.list_artifacts(run_id))
    if isinstance(pipeline_to_run, PipelineML):
        assert nb_artifacts == 1
    else:
        assert nb_artifacts == 0
Example #11
    def test_spark_memory_spark(self, is_async, data_catalog):
        """SparkDataSet(load) -> node -> MemoryDataSet (save and then load) ->
        node -> SparkDataSet (save)"""
        pipeline = Pipeline([
            node(identity, "spark_in", "memory_ds"),
            node(identity, "memory_ds", "spark_out"),
        ])
        SequentialRunner(is_async=is_async).run(pipeline, data_catalog)

        save_path = Path(
            data_catalog._data_sets["spark_out"]._filepath.as_posix())
        files = list(save_path.glob("*.parquet"))
        assert len(files) > 0
Example #12
    def test_count_multiple_loads(self, is_async):
        log = []
        pipeline = Pipeline([
            node(source, None, "dataset"),
            node(sink, "dataset", None, name="bob"),
            node(sink, "dataset", None, name="fred"),
        ])
        catalog = DataCatalog({"dataset": LoggingDataSet(log, "dataset")})
        SequentialRunner(is_async=is_async).run(pipeline, catalog)

        # we want to see the release only after both loads
        assert log == [("load", "dataset"), ("load", "dataset"),
                       ("release", "dataset")]
Example #13
    def test_result_saved_not_returned(self, saving_result_pipeline):
        """The pipeline runs ds->dsX but save does not save the output."""
        def _load():
            return 0

        def _save(arg):
            assert arg == 0

        catalog = DataCatalog({
            "ds": LambdaDataSet(load=_load, save=_save),
            "dsX": LambdaDataSet(load=_load, save=_save),
        })
        output = SequentialRunner().run(saving_result_pipeline, catalog)
        assert output == {}
Example #14
def create_simple_kedro():
    """
    creates the simple kedro object that holds the pipeline and io objects as well as 
    the run function
    """
    sk = SimpleNamespace()
    sk.root_dir = Path(__file__).parent
    sk.io = build_catalog(sk.root_dir)
    sk.pipeline = create_pipeline()
    sk.runner = SequentialRunner()
    sk.run = (lambda pipeline=None: sk.runner.run(sk.pipeline, sk.io)
              if pipeline is None else sk.runner.run(pipeline, sk.io))

    return sk
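A short usage sketch of the helper above; everything comes from this module except the tag name, which is hypothetical.

sk = create_simple_kedro()
outputs = sk.run()  # run the full pipeline with the SequentialRunner
# Or run just a filtered sub-pipeline:
outputs_subset = sk.run(sk.pipeline.only_nodes_with_tags("features"))  # hypothetical tag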
Example #15
def main(
    tags: Iterable[str] = None,
    env: str = None,
    runner: str = None,
):
    """Application main entry point.

    Args:
        tags: An optional list of node tags which should be used to
            filter the nodes of the ``Pipeline``. If specified, only the nodes
            containing *any* of these tags will be added to the ``Pipeline``.
        env: An optional parameter specifying the environment in which
            the ``Pipeline`` should be run. If not specified defaults to "local".
        runner: An optional parameter specifying the runner that you want to run
            the pipeline with.

    Raises:
        KedroCliError: If the resulting ``Pipeline`` is empty.

    """
    # Report project name
    logging.info("** Kedro project {}".format(Path.cwd().name))

    # Load Catalog
    conf = get_config(project_path=str(Path.cwd()), env=env)
    catalog = create_catalog(config=conf)

    # Load the pipeline
    pipeline = create_pipeline()
    pipeline = pipeline.only_nodes_with_tags(*tags) if tags else pipeline
    if not pipeline.nodes:
        if tags:
            raise KedroCliError("Pipeline contains no nodes with tags: " +
                                str(tags))
        raise KedroCliError("Pipeline contains no nodes")

    # Load the runner class: the one requested via --runner/--parallel,
    # or SequentialRunner by default
    runner = load_obj(runner, "kedro.runner") if runner else SequentialRunner

    # Initialise SparkSession
    spark = init_spark_session()

    # Run the pipeline with the resolved runner
    runner().run(pipeline, catalog)
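A small sketch of the runner-resolution step used in main() above, with kedro's load_obj helper; the runner name is just an illustrative value.

from kedro.runner import SequentialRunner
from kedro.utils import load_obj

runner_name = "ParallelRunner"  # e.g. the value a --runner CLI option would provide
runner_class = load_obj(runner_name, "kedro.runner") if runner_name else SequentialRunner
runner_instance = runner_class()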
Example #16
    def test_release_transcoded(self, is_async):
        log = []
        pipeline = Pipeline(
            [node(source, None, "ds@save"),
             node(sink, "ds@load", None)])
        catalog = DataCatalog({
            "ds@save": LoggingDataSet(log, "save"),
            "ds@load": LoggingDataSet(log, "load"),
        })

        SequentialRunner(is_async=is_async).run(pipeline, catalog)

        # we want to see both datasets being released
        assert log == [("release", "save"), ("load", "load"),
                       ("release", "load")]
Example #17
    def test_dont_release_inputs_and_outputs(self, is_async):
        log = []
        pipeline = Pipeline(
            [node(identity, "in", "middle"),
             node(identity, "middle", "out")])
        catalog = DataCatalog({
            "in": LoggingDataSet(log, "in", "stuff"),
            "middle": LoggingDataSet(log, "middle"),
            "out": LoggingDataSet(log, "out"),
        })
        SequentialRunner(is_async=is_async).run(pipeline, catalog)

        # the pipeline inputs ("in") and outputs ("out") should not be released; only "middle" is
        assert log == [("load", "in"), ("load", "middle"),
                       ("release", "middle")]
Example #18
    def run(
        self,
        tags=None,  # type: Iterable[str]
        runner=None,  # type: AbstractRunner
        node_names=None,  # type: Iterable[str]
        only_missing=False,  # type: bool
    ):
        # type: (...) -> Dict[str, Any]
        """Runs the pipeline wi th a specified runner.

        Args:
            tags: An optional list of node tags which should be used to
                filter the nodes of the ``Pipeline``. If specified, only the nodes
                containing *any* of these tags will be run.
            runner: An optional parameter specifying the runner that you want to run
                the pipeline with.
            node_names: An optional list of node names which should be used to
                filter the nodes of the ``Pipeline``. If specified, only the nodes
                with these names will be run.
            only_missing: An option to run only missing nodes.
        Raises:
            KedroContextError: If the resulting ``Pipeline`` is empty
                or incorrect tags are provided.
        Returns:
            Any node outputs that cannot be processed by the ``DataCatalog``.
            These are returned in a dictionary, where the keys are defined
            by the node outputs.
        """

        # Load the pipeline
        pipeline = self.pipeline
        if node_names:
            pipeline = pipeline.only_nodes(*node_names)
        if tags:
            pipeline = pipeline.only_nodes_with_tags(*tags)

        if not pipeline.nodes:
            msg = "Pipeline contains no nodes"
            if tags:
                msg += " with tags: {}".format(str(tags))
            raise KedroContextError(msg)

        # Run the runner
        runner = runner or SequentialRunner()
        if only_missing and _skippable(self.catalog):
            return runner.run_only_missing(pipeline, self.catalog)
        return runner.run(pipeline, self.catalog)
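A hedged usage sketch of the method above; `context` is a hypothetical instance of the class that defines it, and the tag is illustrative.

outputs = context.run(tags=["preprocessing"], only_missing=True)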
Example #19
    def __init__(
        self,
        pipeline_ml: PipelineML,
        catalog: DataCatalog,
        runner: Optional[AbstractRunner] = None,
        copy_mode: Optional[Union[Dict[str, str], str]] = None,
    ):
        """[summary]

        Args:
            pipeline_ml (PipelineML): A PipelineML object to
            store as a Mlflow Model

            catalog (DataCatalog): The DataCatalog associated
            to the PipelineMl

            runner (Optional[AbstractRunner], optional): The kedro
            AbstractRunner to use. Defaults to SequentialRunner if
            None.

            copy_mode (Optional[Union[Dict[str,str], str]]):
            The copy_mode of each DataSet of the catalog
            when reconstructing the DataCatalog in memory.
            You can pass either:
                - None to use Kedro default mode for each dataset
                - a single string ("deepcopy", "copy" and "assign")
                to apply to all datasets
                - a dictionnary with (dataset name, copy_mode) key/values
                pairs. The associated mode must be a valid kedro mode
                ("deepcopy", "copy" and "assign") for each. Defaults to None.
        """

        self.pipeline_ml = pipeline_ml
        self.initial_catalog = pipeline_ml._extract_pipeline_catalog(catalog)
        # we have the guarantee that there is only one output in inference
        self.output_name = list(pipeline_ml.inference.outputs())[0]
        self.runner = runner or SequentialRunner()
        self.copy_mode = copy_mode or {}

        # copy mode has been converted because it is a property
        # TODO: we need to use the runner's default dataset in case of multithreading
        self.loaded_catalog = DataCatalog(
            data_sets={
                name: MemoryDataSet(copy_mode=copy_mode)
                for name, copy_mode in self.copy_mode.items()
            }
        )
Example #20
def test_feature_engineering_pipeline(sample_data_catalog_train: DataCatalog,
                                      runner: SequentialRunner):
    train_pipeline = create_pipeline(
        output_X_train_normalized="sample_iris_X_train_normalized",
        output_X_test_normalized="sample_iris_X_test_normalized",
        output_y_train="sample_iris_y_train",
        output_y_test="sample_iris_y_test",
        normalizer="sample_normalizer",
    )

    output = runner.run(pipeline=train_pipeline,
                        catalog=sample_data_catalog_train)

    assert output["sample_iris_X_train_normalized"].shape == (3, 4)
    assert output["sample_iris_X_test_normalized"].shape == (1, 4)
    assert output["sample_iris_y_train"].shape == (3, )
    assert output["sample_iris_y_test"].shape == (1, )
Example #21
    def test_apply(self):
        nodes = sorted(
            [
                node(identity, "number", "output1", name="identity1"),
                node(identity, "output1", "output2", name="biconcat"),
                node(identity, "output2", "output", name="identity3"),
            ],
            key=lambda x: x.name,
        )

        pipeline = Pipeline(nodes).decorate(apply_f, apply_g)
        catalog = DataCatalog({}, dict(number=1))
        result = SequentialRunner().run(pipeline, catalog)
        decorated_nodes = sorted(pipeline.nodes, key=lambda x: x.name)

        assert result["output"] == "g(f(g(f(g(f(1))))))"
        assert len(pipeline.nodes) == 3
        assert all(n1.name == n2.name for n1, n2 in zip(nodes, decorated_nodes))
Example #22
def test_table_embedding() -> None:
    conf_loader: ConfigLoader = ConfigLoader(
        conf_paths=["eos/conf/base", "eos/conf/local"]
    )

    conf_logging: Dict[str, Any] = conf_loader.get("logging*", "logging*/**")
    logging.config.dictConfig(conf_logging)

    conf_catalog: Dict[str, Any] = conf_loader.get("catalog*", "catalog*/**")
    data_catalog: DataCatalog = DataCatalog.from_config(conf_catalog)

    conf_params: Dict[str, Any] = conf_loader.get("parameters*", "parameters*/**")
    data_catalog.add_feed_dict(feed_dict=get_feed_dict(params=conf_params))

    conf_pipeline: Dict[str, Any] = conf_loader.get("pipelines*", "pipelines*/**")
    ae_pipeline: FlexiblePipeline = HatchDict(conf_pipeline).get("autoencoder_pipeline")

    runner: SequentialRunner = SequentialRunner()
    runner.run(pipeline=ae_pipeline, catalog=data_catalog)
Example #23
    def test_release_at_earliest_opportunity(self, is_async):
        log = []
        pipeline = Pipeline([
            node(source, None, "first"),
            node(identity, "first", "second"),
            node(sink, "second", None),
        ])
        catalog = DataCatalog({
            "first": LoggingDataSet(log, "first"),
            "second": LoggingDataSet(log, "second"),
        })
        SequentialRunner(is_async=is_async).run(pipeline, catalog)

        # we want to see "release first" before "load second"
        assert log == [
            ("load", "first"),
            ("release", "first"),
            ("load", "second"),
            ("release", "second"),
        ]
Example #24
    def test_apply(self):
        nodes = sorted(
            [
                node(identity, "number", "output1", name="identity1"),
                node(identity, "output1", "output2", name="biconcat"),
                node(identity, "output2", "output", name="identity3"),
            ],
            key=lambda x: x.name,
        )
        pattern = (
            "The pipeline's `decorate` API will be deprecated in Kedro 0.18.0."
            "Please use a node's Hooks to extend the node's behaviour in a pipeline."
            "For more information, please visit"
            "https://kedro.readthedocs.io/en/stable/07_extend_kedro/04_hooks.html"
        )
        with pytest.warns(DeprecationWarning, match=re.escape(pattern)):
            pipeline = Pipeline(nodes).decorate(apply_f, apply_g)
        catalog = DataCatalog({}, dict(number=1))
        result = SequentialRunner().run(pipeline, catalog)
        decorated_nodes = sorted(pipeline.nodes, key=lambda x: x.name)

        assert result["output"] == "g(f(g(f(g(f(1))))))"
        assert len(pipeline.nodes) == 3
        assert all(n1.name == n2.name for n1, n2 in zip(nodes, decorated_nodes))
Example #25
    df = df.dropna()
    return df


# Plot the number of people who survived and who died.
def plot_survival_breakdown(df):
    fig, ax = plt.subplots(figsize=(6, 4))
    df.Survived.value_counts().plot(kind="barh", color="blue", alpha=0.65)
    ax.set_ylim(-1, len(df.Survived.value_counts()))
    plt.title("Survival Breakdown (1 = Survived, 0 = Died)")
    return fig


# Create nodes
clean_data_node = node(clean_raw_data,
                       inputs="titanic_training_data",
                       outputs="df_clean")
plot_survival_breakdown_node = node(plot_survival_breakdown,
                                    inputs="df_clean",
                                    outputs="survival_breakdown_chart")

# Assemble nodes into a pipeline
pipeline = Pipeline([clean_data_node, plot_survival_breakdown_node])

# Create a runner to run the pipeline
runner = SequentialRunner()

# Run the pipeline
print(runner.run(pipeline, io))
Example #26
    def run(  # pylint: disable=too-many-arguments,too-many-locals
        self,
        pipeline_name: str = None,
        tags: Iterable[str] = None,
        runner: AbstractRunner = None,
        node_names: Iterable[str] = None,
        from_nodes: Iterable[str] = None,
        to_nodes: Iterable[str] = None,
        from_inputs: Iterable[str] = None,
        load_versions: Dict[str, str] = None,
        extra_params: Dict[str, Any] = None,
    ) -> Dict[str, Any]:
        """Runs the pipeline with a specified runner.

        Args:
            pipeline_name: Name of the pipeline that is being run.
            tags: An optional list of node tags which should be used to
                filter the nodes of the ``Pipeline``. If specified, only the nodes
                containing *any* of these tags will be run.
            runner: An optional parameter specifying the runner that you want to run
                the pipeline with.
            node_names: An optional list of node names which should be used to
                filter the nodes of the ``Pipeline``. If specified, only the nodes
                with these names will be run.
            from_nodes: An optional list of node names which should be used as a
                starting point of the new ``Pipeline``.
            to_nodes: An optional list of node names which should be used as an
                end point of the new ``Pipeline``.
            from_inputs: An optional list of input datasets which should be
                used as a starting point of the new ``Pipeline``.
            load_versions: An optional flag to specify a particular dataset
                version timestamp to load.
            extra_params: Additional run parameters.
        Raises:
            Exception: Any uncaught exception during the run will be re-raised
                after being passed to ``on_pipeline_error`` hook.
        Returns:
            Any node outputs that cannot be processed by the ``DataCatalog``.
            These are returned in a dictionary, where the keys are defined
            by the node outputs.
        """
        # pylint: disable=protected-access,no-member
        # Report project name
        logging.info("** Kedro project %s", self._project_path.name)

        save_version = run_id = self.store["session_id"]
        extra_params = deepcopy(extra_params) or dict()
        context = self.context

        pipeline = context._get_pipeline(name=pipeline_name)
        filtered_pipeline = context._filter_pipeline(
            pipeline=pipeline,
            tags=tags,
            from_nodes=from_nodes,
            to_nodes=to_nodes,
            node_names=node_names,
            from_inputs=from_inputs,
        )

        record_data = {
            "run_id": run_id,
            "project_path": self._project_path.as_posix(),
            "env": context.env,
            "kedro_version": self.store["kedro_version"],
            "tags": tags,
            "from_nodes": from_nodes,
            "to_nodes": to_nodes,
            "node_names": node_names,
            "from_inputs": from_inputs,
            "load_versions": load_versions,
            "extra_params": extra_params,
            "pipeline_name": pipeline_name,
        }

        catalog = context._get_catalog(save_version=save_version,
                                       load_versions=load_versions)

        # Run the runner
        runner = runner or SequentialRunner()
        hook = get_hook_manager().hook
        hook.before_pipeline_run(run_params=record_data,
                                 pipeline=filtered_pipeline,
                                 catalog=catalog)

        try:
            run_result = runner.run(filtered_pipeline, catalog, run_id)
        except Exception as error:
            hook.on_pipeline_error(
                error=error,
                run_params=record_data,
                pipeline=filtered_pipeline,
                catalog=catalog,
            )
            raise

        hook.after_pipeline_run(
            run_params=record_data,
            run_result=run_result,
            pipeline=filtered_pipeline,
            catalog=catalog,
        )
        return run_result
Example #27
    def run(  # pylint: disable=too-many-arguments,too-many-locals
        self,
        tags: Iterable[str] = None,
        runner: AbstractRunner = None,
        node_names: Iterable[str] = None,
        from_nodes: Iterable[str] = None,
        to_nodes: Iterable[str] = None,
        from_inputs: Iterable[str] = None,
        load_versions: Dict[str, str] = None,
        pipeline_name: str = None,
    ) -> Dict[str, Any]:
        """Runs the pipeline with a specified runner.

        Args:
            tags: An optional list of node tags which should be used to
                filter the nodes of the ``Pipeline``. If specified, only the nodes
                containing *any* of these tags will be run.
            runner: An optional parameter specifying the runner that you want to run
                the pipeline with.
            node_names: An optional list of node names which should be used to
                filter the nodes of the ``Pipeline``. If specified, only the nodes
                with these names will be run.
            from_nodes: An optional list of node names which should be used as a
                starting point of the new ``Pipeline``.
            to_nodes: An optional list of node names which should be used as an
                end point of the new ``Pipeline``.
            from_inputs: An optional list of input datasets which should be used as a
                starting point of the new ``Pipeline``.
            load_versions: An optional flag to specify a particular dataset version timestamp
                to load.
            pipeline_name: Name of the ``Pipeline`` to execute.
                Defaults to "__default__".
        Raises:
            KedroContextError: If the resulting ``Pipeline`` is empty
                or incorrect tags are provided.
            Exception: Any uncaught exception will be re-raised
                after being passed to ``on_pipeline_error``.
        Returns:
            Any node outputs that cannot be processed by the ``DataCatalog``.
            These are returned in a dictionary, where the keys are defined
            by the node outputs.
        """
        # Report project name
        logging.info("** Kedro project %s", self.project_path.name)

        try:
            pipeline = self._get_pipeline(name=pipeline_name)
        except NotImplementedError:
            common_migration_message = (
                "`ProjectContext._get_pipeline(self, name)` method is expected. "
                "Please refer to the 'Modular Pipelines' section of the documentation."
            )
            if pipeline_name:
                raise KedroContextError(
                    "The project is not fully migrated to use multiple pipelines. "
                    + common_migration_message)

            warn(
                "You are using the deprecated pipeline construction mechanism. "
                + common_migration_message,
                DeprecationWarning,
            )
            pipeline = self.pipeline

        filtered_pipeline = self._filter_pipeline(
            pipeline=pipeline,
            tags=tags,
            from_nodes=from_nodes,
            to_nodes=to_nodes,
            node_names=node_names,
            from_inputs=from_inputs,
        )

        save_version = self._get_save_version()
        run_id = self.run_id or save_version

        record_data = {
            "run_id": run_id,
            "project_path": str(self.project_path),
            "env": self.env,
            "kedro_version": self.project_version,
            "tags": tags,
            "from_nodes": from_nodes,
            "to_nodes": to_nodes,
            "node_names": node_names,
            "from_inputs": from_inputs,
            "load_versions": load_versions,
            "pipeline_name": pipeline_name,
            "extra_params": self._extra_params,
        }
        journal = Journal(record_data)

        catalog = self._get_catalog(save_version=save_version,
                                    journal=journal,
                                    load_versions=load_versions)

        # Run the runner
        runner = runner or SequentialRunner()
        self._hook_manager.hook.before_pipeline_run(  # pylint: disable=no-member
            run_params=record_data,
            pipeline=filtered_pipeline,
            catalog=catalog)

        try:
            run_result = runner.run(filtered_pipeline, catalog, run_id)
        except Exception as error:
            self._hook_manager.hook.on_pipeline_error(  # pylint: disable=no-member
                error=error,
                run_params=record_data,
                pipeline=filtered_pipeline,
                catalog=catalog,
            )
            raise error

        self._hook_manager.hook.after_pipeline_run(  # pylint: disable=no-member
            run_params=record_data,
            run_result=run_result,
            pipeline=filtered_pipeline,
            catalog=catalog,
        )
        return run_result
Example #28
 def __init__(self, *args, **kwargs):
     self.runner = SequentialRunner(*args, **kwargs)
Example #29
 def test_spark_pickle(self, is_async, data_catalog):
     """SparkDataSet(load) -> node -> PickleDataSet (save)"""
     pipeline = Pipeline([node(identity, "spark_in", "pickle_ds")])
     pattern = ".* was not serialized due to.*"
     with pytest.raises(DataSetError, match=pattern):
         SequentialRunner(is_async=is_async).run(pipeline, data_catalog)
Example #30
# Prepare a data catalog
data_catalog = DataCatalog({"my_salutation": MemoryDataSet()})


# Prepare first node
def return_greeting():
    return "Hello"


return_greeting_node = node(return_greeting, inputs=None, outputs="my_salutation")


# Prepare second node
def join_statements(greeting):
    return f"{greeting} Kedro!"


join_statements_node = node(
    join_statements, inputs="my_salutation", outputs="my_message"
)

# Assemble nodes into a pipeline
pipeline = Pipeline([return_greeting_node, join_statements_node])

# Create a runner to run the pipeline
runner = SequentialRunner()

# Run the pipeline
print(runner.run(pipeline, data_catalog))
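# Expected console output: {'my_message': 'Hello Kedro!'}
# ("my_message" is the only output without a catalog entry, so the runner returns it)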