def test_sequential_run_arg(self, dummy_context, dummy_dataframe, caplog):
    dummy_context.catalog.save("cars", dummy_dataframe)
    dummy_context.run(runner=SequentialRunner())

    log_msgs = [record.getMessage() for record in caplog.records]
    log_names = [record.name for record in caplog.records]
    assert "kedro.runner.sequential_runner" in log_names
    assert "Pipeline execution completed successfully." in log_msgs
def _from_missing(pipeline, catalog):
    """Create a new pipeline based on missing outputs."""
    name = "kedro.runner.runner.AbstractRunner.run"
    with mock.patch(name) as run:
        SequentialRunner().run_only_missing(pipeline, catalog)
        _, args, _ = run.mock_calls[0]
    new_pipeline = args[0]
    return new_pipeline
def test_spark_load_save(self, is_async, data_catalog):
    """SparkDataSet(load) -> node -> Spark (save)."""
    pipeline = Pipeline([node(identity, "spark_in", "spark_out")])
    SequentialRunner(is_async=is_async).run(pipeline, data_catalog)

    save_path = Path(data_catalog._data_sets["spark_out"]._filepath)
    files = list(save_path.glob("*.parquet"))
    assert len(files) > 0
def test_conflict_feed_catalog(
    self, memory_catalog, unfinished_outputs_pipeline, conflicting_feed_dict
):
    """ds1 and ds3 will be replaced with new inputs."""
    memory_catalog.add_feed_dict(conflicting_feed_dict, replace=True)
    outputs = SequentialRunner().run(unfinished_outputs_pipeline, memory_catalog)

    assert isinstance(outputs["ds8"], dict)
    assert outputs["ds8"]["data"] == 0
    assert isinstance(outputs["ds6"], pd.DataFrame)
def __init__(
    self,
    pipeline: Pipeline,
    catalog: DataCatalog,
    input_name: str,
    runner: Optional[AbstractRunner] = None,
    copy_mode: Optional[Union[Dict[str, str], str]] = None,
):
    """Store a Kedro ``Pipeline`` as a Mlflow Model.

    Args:
        pipeline (Pipeline): A Kedro Pipeline object to store as a Mlflow
            Model. Also works with kedro_mlflow PipelineML objects.
        catalog (DataCatalog): The DataCatalog associated to the PipelineML.
        input_name (str): The name of the catalog dataset used as the
            model input.
        runner (Optional[AbstractRunner], optional): The kedro
            AbstractRunner to use. Defaults to SequentialRunner if None.
        copy_mode (Optional[Union[Dict[str, str], str]]): The copy_mode of
            each DataSet of the catalog when reconstructing the DataCatalog
            in memory. You can pass either:
                - None to use Kedro default mode for each dataset
                - a single string ("deepcopy", "copy" or "assign") to apply
                  to all datasets
                - a dictionary with (dataset name, copy_mode) key/value
                  pairs. The associated mode must be a valid kedro mode
                  ("deepcopy", "copy" or "assign") for each.
            Defaults to None.
    """
    self.pipeline = (
        pipeline.inference if isinstance(pipeline, PipelineML) else pipeline
    )
    self.input_name = input_name
    self.initial_catalog = self._extract_pipeline_catalog(catalog)

    nb_outputs = len(self.pipeline.outputs())
    if nb_outputs != 1:
        outputs_list_str = "\n - ".join(self.pipeline.outputs())
        raise ValueError(
            f"Pipeline must have one and only one output, got '{nb_outputs}' outputs: \n - {outputs_list_str}"
        )
    self.output_name = list(self.pipeline.outputs())[0]
    self.runner = runner or SequentialRunner()
    self.copy_mode = copy_mode or {}  # copy_mode has been converted because it is a property
    # TODO: we need to use the runner's default dataset in case of multithreading
    self.loaded_catalog = DataCatalog(
        data_sets={
            name: MemoryDataSet(copy_mode=copy_mode)
            for name, copy_mode in self.copy_mode.items()
        }
    )
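# A minimal sketch (with hypothetical dataset names) of how a copy_mode mapping
# is expanded into the in-memory catalog, mirroring the dict comprehension in
# the constructor above.
from kedro.io import DataCatalog, MemoryDataSet

copy_mode = {"features": "assign", "predictions": "deepcopy"}  # hypothetical names
loaded_catalog = DataCatalog(
    data_sets={
        name: MemoryDataSet(copy_mode=mode) for name, mode in copy_mode.items()
    }
)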
def test_log_time_with_partial(recwarn):
    pipeline = Pipeline(
        [node(partial(identity, 1), None, "output", name="identity1")]
    ).decorate(log_time)
    catalog = DataCatalog({}, dict(number=1))
    result = SequentialRunner().run(pipeline, catalog)
    assert result["output"] == 1

    warning = recwarn.pop(UserWarning)
    assert (
        "The node producing outputs `['output']` is made from a "
        "`partial` function. Partial functions do not have a "
        "`__name__` attribute" in str(warning.message)
    )
def _run_one_task(self, config_filename):
    # create node from Task
    expand_config_node = mls.sl.workflows.tasks.ExpandConfigTask.get_node()
    config, log = self._init_config_log(
        config_filename, self.base_directory, self.config_directory
    )
    # Prepare a data catalog
    data_catalog = DataCatalog({
        'config': MemoryDataSet(),
        'log': MemoryDataSet(),
        'expanded_config': MemoryDataSet(),
    })
    data_catalog.save('config', config)
    data_catalog.save('log', log)
    # Assemble nodes into a pipeline
    pipeline = Pipeline([expand_config_node])
    # Create a runner to run the pipeline
    runner = SequentialRunner()
    # Run the pipeline
    runner.run(pipeline, data_catalog)
    return log, data_catalog
class CustomRunner(AbstractRunner):
    def __init__(self, *args, **kwargs):
        self.runner = SequentialRunner(*args, **kwargs)

    def run(self, *args, **kwargs):
        return self.runner.run_only_missing(*args, **kwargs)

    def _run(self, pipeline, catalog):
        return super()._run(pipeline, catalog)

    def create_default_data_set(self, ds_name):
        return super().create_default_data_set(ds_name)
def run(
    self,
    *args,  # type: Any
    runner=None,  # type: Union[AbstractRunner, str]
    **kwargs,  # type: Any
):
    # type: (...) -> Dict[str, Any]
    if isinstance(runner, str):
        assert runner in {"ParallelRunner", "SequentialRunner"}
        runner = ParallelRunner() if runner == "ParallelRunner" else SequentialRunner()
    return super().run(*args, runner=runner, **kwargs)
def test_mlflow_pipeline_hook_with_different_pipeline_types(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    env_from_dict,
    pipeline_to_run,
    dummy_catalog,
    dummy_run_params,
    dummy_mlflow_conf,
):
    # config_with_base_mlflow_conf is a conftest fixture
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)
    pipeline_hook = MlflowPipelineHook(conda_env=env_from_dict, model_name="model")
    runner = SequentialRunner()
    pipeline_hook.before_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )
    runner.run(pipeline_to_run, dummy_catalog, dummy_run_params["run_id"])
    run_id = mlflow.active_run().info.run_id
    pipeline_hook.after_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )

    # test: parameters should have been logged
    mlflow_conf = get_mlflow_config(tmp_path)
    mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
    run_data = mlflow_client.get_run(run_id).data

    # all run_params are recorded as tags
    for k, v in dummy_run_params.items():
        if v:
            assert run_data.tags[k] == str(v)

    # params are not recorded because we don't have MlflowNodeHook here,
    # and the model should not be logged when it is not a PipelineML
    nb_artifacts = len(mlflow_client.list_artifacts(run_id))
    if isinstance(pipeline_to_run, PipelineML):
        assert nb_artifacts == 1
    else:
        assert nb_artifacts == 0
def test_spark_memory_spark(self, is_async, data_catalog):
    """SparkDataSet(load) -> node -> MemoryDataSet (save and then load) ->
    node -> SparkDataSet (save)."""
    pipeline = Pipeline([
        node(identity, "spark_in", "memory_ds"),
        node(identity, "memory_ds", "spark_out"),
    ])
    SequentialRunner(is_async=is_async).run(pipeline, data_catalog)

    save_path = Path(data_catalog._data_sets["spark_out"]._filepath.as_posix())
    files = list(save_path.glob("*.parquet"))
    assert len(files) > 0
def test_count_multiple_loads(self, is_async):
    log = []
    pipeline = Pipeline([
        node(source, None, "dataset"),
        node(sink, "dataset", None, name="bob"),
        node(sink, "dataset", None, name="fred"),
    ])
    catalog = DataCatalog({"dataset": LoggingDataSet(log, "dataset")})
    SequentialRunner(is_async=is_async).run(pipeline, catalog)

    # we want the release to happen after both loads
    assert log == [("load", "dataset"), ("load", "dataset"), ("release", "dataset")]
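# The release tests in this section rely on a LoggingDataSet helper that is
# not shown in these snippets. Below is a minimal sketch consistent with the
# assertions (the real helper lives in kedro's own test suite and may differ):
from kedro.io import AbstractDataSet


class LoggingDataSet(AbstractDataSet):
    """Record ("load", name) and ("release", name) events in a shared list."""

    def __init__(self, log, name, value=None):
        self.log = log
        self.name = name
        self.value = value

    def _load(self):
        self.log.append(("load", self.name))
        return self.value

    def _save(self, data):
        self.value = data

    def _release(self):
        self.log.append(("release", self.name))
        self.value = None

    def _describe(self):
        return {"name": self.name}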
def test_result_saved_not_returned(self, saving_result_pipeline):
    """The pipeline runs ds -> dsX; the output is saved to the catalog,
    so ``run`` returns nothing."""

    def _load():
        return 0

    def _save(arg):
        assert arg == 0

    catalog = DataCatalog({
        "ds": LambdaDataSet(load=_load, save=_save),
        "dsX": LambdaDataSet(load=_load, save=_save),
    })
    output = SequentialRunner().run(saving_result_pipeline, catalog)
    assert output == {}
def create_simple_kedro():
    """Create the simple kedro object that holds the pipeline and io objects,
    as well as the run function."""
    sk = SimpleNamespace()
    sk.root_dir = Path(__file__).parent
    sk.io = build_catalog(sk.root_dir)
    sk.pipeline = create_pipeline()
    sk.runner = SequentialRunner()
    sk.run = lambda pipeline=None: (
        sk.runner.run(sk.pipeline, sk.io)
        if pipeline is None
        else sk.runner.run(pipeline, sk.io)
    )
    return sk
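# Hedged usage sketch for create_simple_kedro(), assuming build_catalog and
# create_pipeline are defined in the same module ("features" is a hypothetical tag):
sk = create_simple_kedro()
outputs = sk.run()  # run the full default pipeline
outputs = sk.run(sk.pipeline.only_nodes_with_tags("features"))  # or any sub-pipeline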
def main(
    tags: Iterable[str] = None,
    env: str = None,
    runner: str = None,
):
    """Application main entry point.

    Args:
        tags: An optional list of node tags which should be used to filter
            the nodes of the ``Pipeline``. If specified, only the nodes
            containing *any* of these tags will be added to the ``Pipeline``.
        env: An optional parameter specifying the environment in which the
            ``Pipeline`` should be run. If not specified defaults to "local".
        runner: An optional parameter specifying the runner that you want to
            run the pipeline with.

    Raises:
        KedroCliError: If the resulting ``Pipeline`` is empty.
    """
    # Report project name
    logging.info("** Kedro project {}".format(Path.cwd().name))

    # Load catalog
    conf = get_config(project_path=str(Path.cwd()), env=env)
    catalog = create_catalog(config=conf)

    # Load the pipeline
    pipeline = create_pipeline()
    pipeline = pipeline.only_nodes_with_tags(*tags) if tags else pipeline
    if not pipeline.nodes:
        if tags:
            raise KedroCliError("Pipeline contains no nodes with tags: " + str(tags))
        raise KedroCliError("Pipeline contains no nodes")

    # Load the runner: when either --parallel or --runner is used, the loaded
    # class is used; otherwise fall back to SequentialRunner
    runner = load_obj(runner, "kedro.runner") if runner else SequentialRunner

    # Initialise SparkSession
    spark = init_spark_session()

    # Run the pipeline with the selected runner
    runner().run(pipeline, catalog)
def test_release_transcoded(self, is_async):
    log = []
    pipeline = Pipeline(
        [node(source, None, "ds@save"), node(sink, "ds@load", None)]
    )
    catalog = DataCatalog({
        "ds@save": LoggingDataSet(log, "save"),
        "ds@load": LoggingDataSet(log, "load"),
    })
    SequentialRunner(is_async=is_async).run(pipeline, catalog)

    # we want to see both datasets being released
    assert log == [("release", "save"), ("load", "load"), ("release", "load")]
def test_dont_release_inputs_and_outputs(self, is_async):
    log = []
    pipeline = Pipeline(
        [node(identity, "in", "middle"), node(identity, "middle", "out")]
    )
    catalog = DataCatalog({
        "in": LoggingDataSet(log, "in", "stuff"),
        "middle": LoggingDataSet(log, "middle"),
        "out": LoggingDataSet(log, "out"),
    })
    SequentialRunner(is_async=is_async).run(pipeline, catalog)

    # we don't want to see "in" or "out" being released here
    assert log == [("load", "in"), ("load", "middle"), ("release", "middle")]
def run(
    self,
    tags=None,  # type: Iterable[str]
    runner=None,  # type: AbstractRunner
    node_names=None,  # type: Iterable[str]
    only_missing=False,  # type: bool
):
    # type: (...) -> Dict[str, Any]
    """Runs the pipeline with a specified runner.

    Args:
        tags: An optional list of node tags which should be used to filter
            the nodes of the ``Pipeline``. If specified, only the nodes
            containing *any* of these tags will be run.
        runner: An optional parameter specifying the runner that you want
            to run the pipeline with.
        node_names: An optional list of node names which should be used to
            filter the nodes of the ``Pipeline``. If specified, only the
            nodes with these names will be run.
        only_missing: An option to run only missing nodes.

    Raises:
        KedroContextError: If the resulting ``Pipeline`` is empty or
            incorrect tags are provided.

    Returns:
        Any node outputs that cannot be processed by the ``DataCatalog``.
        These are returned in a dictionary, where the keys are defined by
        the node outputs.
    """
    # Load the pipeline
    pipeline = self.pipeline
    if node_names:
        pipeline = pipeline.only_nodes(*node_names)
    if tags:
        pipeline = pipeline.only_nodes_with_tags(*tags)

    if not pipeline.nodes:
        msg = "Pipeline contains no nodes"
        if tags:
            msg += " with tags: {}".format(str(tags))
        raise KedroContextError(msg)

    # Run the runner
    runner = runner or SequentialRunner()
    if only_missing and _skippable(self.catalog):
        return runner.run_only_missing(pipeline, self.catalog)
    return runner.run(pipeline, self.catalog)
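# Hedged usage sketch for the run() method above, assuming `context` is an
# instance of the defining class and "training" is a tag used in the project:
outputs = context.run(
    tags=["training"],
    runner=SequentialRunner(),
    only_missing=True,  # skip nodes whose persisted outputs already exist
)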
def __init__(
    self,
    pipeline_ml: PipelineML,
    catalog: DataCatalog,
    runner: Optional[AbstractRunner] = None,
    copy_mode: Optional[Union[Dict[str, str], str]] = None,
):
    """Store a PipelineML object as a Mlflow Model.

    Args:
        pipeline_ml (PipelineML): A PipelineML object to store as a Mlflow
            Model.
        catalog (DataCatalog): The DataCatalog associated to the PipelineML.
        runner (Optional[AbstractRunner], optional): The kedro
            AbstractRunner to use. Defaults to SequentialRunner if None.
        copy_mode (Optional[Union[Dict[str, str], str]]): The copy_mode of
            each DataSet of the catalog when reconstructing the DataCatalog
            in memory. You can pass either:
                - None to use Kedro default mode for each dataset
                - a single string ("deepcopy", "copy" or "assign") to apply
                  to all datasets
                - a dictionary with (dataset name, copy_mode) key/value
                  pairs. The associated mode must be a valid kedro mode
                  ("deepcopy", "copy" or "assign") for each.
            Defaults to None.
    """
    self.pipeline_ml = pipeline_ml
    self.initial_catalog = pipeline_ml._extract_pipeline_catalog(catalog)
    # we have the guarantee that there is only one output in inference
    self.output_name = list(pipeline_ml.inference.outputs())[0]
    self.runner = runner or SequentialRunner()
    self.copy_mode = copy_mode or {}  # copy_mode has been converted because it is a property
    # TODO: we need to use the runner's default dataset in case of multithreading
    self.loaded_catalog = DataCatalog(
        data_sets={
            name: MemoryDataSet(copy_mode=copy_mode)
            for name, copy_mode in self.copy_mode.items()
        }
    )
def test_feature_engineering_pipeline(
    sample_data_catalog_train: DataCatalog, runner: SequentialRunner
):
    train_pipeline = create_pipeline(
        output_X_train_normalized="sample_iris_X_train_normalized",
        output_X_test_normalized="sample_iris_X_test_normalized",
        output_y_train="sample_iris_y_train",
        output_y_test="sample_iris_y_test",
        normalizer="sample_normalizer",
    )
    output = runner.run(pipeline=train_pipeline, catalog=sample_data_catalog_train)

    assert output["sample_iris_X_train_normalized"].shape == (3, 4)
    assert output["sample_iris_X_test_normalized"].shape == (1, 4)
    assert output["sample_iris_y_train"].shape == (3,)
    assert output["sample_iris_y_test"].shape == (1,)
def test_apply(self):
    nodes = sorted(
        [
            node(identity, "number", "output1", name="identity1"),
            node(identity, "output1", "output2", name="biconcat"),
            node(identity, "output2", "output", name="identity3"),
        ],
        key=lambda x: x.name,
    )
    pipeline = Pipeline(nodes).decorate(apply_f, apply_g)
    catalog = DataCatalog({}, dict(number=1))
    result = SequentialRunner().run(pipeline, catalog)
    decorated_nodes = sorted(pipeline.nodes, key=lambda x: x.name)

    assert result["output"] == "g(f(g(f(g(f(1))))))"
    assert len(pipeline.nodes) == 3
    assert all(n1.name == n2.name for n1, n2 in zip(nodes, decorated_nodes))
def test_table_embedding() -> None:
    conf_loader: ConfigLoader = ConfigLoader(
        conf_paths=["eos/conf/base", "eos/conf/local"]
    )

    conf_logging: Dict[str, Any] = conf_loader.get("logging*", "logging*/**")
    logging.config.dictConfig(conf_logging)

    conf_catalog: Dict[str, Any] = conf_loader.get("catalog*", "catalog*/**")
    data_catalog: DataCatalog = DataCatalog.from_config(conf_catalog)

    conf_params: Dict[str, Any] = conf_loader.get("parameters*", "parameters*/**")
    data_catalog.add_feed_dict(feed_dict=get_feed_dict(params=conf_params))

    conf_pipeline: Dict[str, Any] = conf_loader.get("pipelines*", "pipelines*/**")
    ae_pipeline: FlexiblePipeline = HatchDict(conf_pipeline).get("autoencoder_pipeline")

    runner: SequentialRunner = SequentialRunner()
    runner.run(pipeline=ae_pipeline, catalog=data_catalog)
def test_release_at_earliest_opportunity(self, is_async):
    log = []
    pipeline = Pipeline([
        node(source, None, "first"),
        node(identity, "first", "second"),
        node(sink, "second", None),
    ])
    catalog = DataCatalog({
        "first": LoggingDataSet(log, "first"),
        "second": LoggingDataSet(log, "second"),
    })
    SequentialRunner(is_async=is_async).run(pipeline, catalog)

    # we want to see "release first" before "load second"
    assert log == [
        ("load", "first"),
        ("release", "first"),
        ("load", "second"),
        ("release", "second"),
    ]
def test_apply(self):
    nodes = sorted(
        [
            node(identity, "number", "output1", name="identity1"),
            node(identity, "output1", "output2", name="biconcat"),
            node(identity, "output2", "output", name="identity3"),
        ],
        key=lambda x: x.name,
    )
    pattern = (
        "The pipeline's `decorate` API will be deprecated in Kedro 0.18.0."
        "Please use a node's Hooks to extend the node's behaviour in a pipeline."
        "For more information, please visit"
        "https://kedro.readthedocs.io/en/stable/07_extend_kedro/04_hooks.html"
    )
    with pytest.warns(DeprecationWarning, match=re.escape(pattern)):
        pipeline = Pipeline(nodes).decorate(apply_f, apply_g)

    catalog = DataCatalog({}, dict(number=1))
    result = SequentialRunner().run(pipeline, catalog)
    decorated_nodes = sorted(pipeline.nodes, key=lambda x: x.name)

    assert result["output"] == "g(f(g(f(g(f(1))))))"
    assert len(pipeline.nodes) == 3
    assert all(n1.name == n2.name for n1, n2 in zip(nodes, decorated_nodes))
    df = df.dropna()
    return df


# Plot the amount of people who survived and who died.
def plot_survival_breakdown(df):
    plt.figure(figsize=(6, 4))
    fig, ax = plt.subplots()
    df.Survived.value_counts().plot(kind="barh", color="blue", alpha=0.65)
    ax.set_ylim(-1, len(df.Survived.value_counts()))
    plt.title("Survival Breakdown (1 = Survived, 0 = Died)")
    return fig


# Create nodes
clean_data_node = node(
    clean_raw_data, inputs="titanic_training_data", outputs="df_clean"
)
plot_survival_breakdown_node = node(
    plot_survival_breakdown, inputs="df_clean", outputs="survival_breakdown_chart"
)

# Assemble nodes into a pipeline
pipeline = Pipeline([clean_data_node, plot_survival_breakdown_node])

# Create a runner to run the pipeline
runner = SequentialRunner()

# Run the pipeline
print(runner.run(pipeline, io))
def run(  # pylint: disable=too-many-arguments,too-many-locals
    self,
    pipeline_name: str = None,
    tags: Iterable[str] = None,
    runner: AbstractRunner = None,
    node_names: Iterable[str] = None,
    from_nodes: Iterable[str] = None,
    to_nodes: Iterable[str] = None,
    from_inputs: Iterable[str] = None,
    load_versions: Dict[str, str] = None,
    extra_params: Dict[str, Any] = None,
) -> Dict[str, Any]:
    """Runs the pipeline with a specified runner.

    Args:
        pipeline_name: Name of the pipeline that is being run.
        tags: An optional list of node tags which should be used to filter
            the nodes of the ``Pipeline``. If specified, only the nodes
            containing *any* of these tags will be run.
        runner: An optional parameter specifying the runner that you want
            to run the pipeline with.
        node_names: An optional list of node names which should be used to
            filter the nodes of the ``Pipeline``. If specified, only the
            nodes with these names will be run.
        from_nodes: An optional list of node names which should be used as
            a starting point of the new ``Pipeline``.
        to_nodes: An optional list of node names which should be used as
            an end point of the new ``Pipeline``.
        from_inputs: An optional list of input datasets which should be
            used as a starting point of the new ``Pipeline``.
        load_versions: An optional flag to specify a particular dataset
            version timestamp to load.
        extra_params: Additional run parameters.

    Raises:
        Exception: Any uncaught exception during the run will be re-raised
            after being passed to ``on_pipeline_error`` hook.

    Returns:
        Any node outputs that cannot be processed by the ``DataCatalog``.
        These are returned in a dictionary, where the keys are defined by
        the node outputs.
    """
    # pylint: disable=protected-access,no-member
    # Report project name
    logging.info("** Kedro project %s", self._project_path.name)

    save_version = run_id = self.store["session_id"]
    extra_params = deepcopy(extra_params) or dict()
    context = self.context

    pipeline = context._get_pipeline(name=pipeline_name)
    filtered_pipeline = context._filter_pipeline(
        pipeline=pipeline,
        tags=tags,
        from_nodes=from_nodes,
        to_nodes=to_nodes,
        node_names=node_names,
        from_inputs=from_inputs,
    )

    record_data = {
        "run_id": run_id,
        "project_path": self._project_path.as_posix(),
        "env": context.env,
        "kedro_version": self.store["kedro_version"],
        "tags": tags,
        "from_nodes": from_nodes,
        "to_nodes": to_nodes,
        "node_names": node_names,
        "from_inputs": from_inputs,
        "load_versions": load_versions,
        "extra_params": extra_params,
        "pipeline_name": pipeline_name,
    }

    catalog = context._get_catalog(
        save_version=save_version, load_versions=load_versions
    )

    # Run the runner
    runner = runner or SequentialRunner()
    hook = get_hook_manager().hook
    hook.before_pipeline_run(
        run_params=record_data, pipeline=filtered_pipeline, catalog=catalog
    )

    try:
        run_result = runner.run(filtered_pipeline, catalog, run_id)
    except Exception as error:
        hook.on_pipeline_error(
            error=error,
            run_params=record_data,
            pipeline=filtered_pipeline,
            catalog=catalog,
        )
        raise

    hook.after_pipeline_run(
        run_params=record_data,
        run_result=run_result,
        pipeline=filtered_pipeline,
        catalog=catalog,
    )
    return run_result
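# Hedged usage sketch, assuming the run() above belongs to a KedroSession-like
# class (the package name is hypothetical):
from kedro.framework.session import KedroSession
from kedro.runner import SequentialRunner

with KedroSession.create(package_name="my_project") as session:
    outputs = session.run(pipeline_name="__default__", runner=SequentialRunner())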
def run(  # pylint: disable=too-many-arguments,too-many-locals
    self,
    tags: Iterable[str] = None,
    runner: AbstractRunner = None,
    node_names: Iterable[str] = None,
    from_nodes: Iterable[str] = None,
    to_nodes: Iterable[str] = None,
    from_inputs: Iterable[str] = None,
    load_versions: Dict[str, str] = None,
    pipeline_name: str = None,
) -> Dict[str, Any]:
    """Runs the pipeline with a specified runner.

    Args:
        tags: An optional list of node tags which should be used to filter
            the nodes of the ``Pipeline``. If specified, only the nodes
            containing *any* of these tags will be run.
        runner: An optional parameter specifying the runner that you want
            to run the pipeline with.
        node_names: An optional list of node names which should be used to
            filter the nodes of the ``Pipeline``. If specified, only the
            nodes with these names will be run.
        from_nodes: An optional list of node names which should be used as
            a starting point of the new ``Pipeline``.
        to_nodes: An optional list of node names which should be used as
            an end point of the new ``Pipeline``.
        from_inputs: An optional list of input datasets which should be
            used as a starting point of the new ``Pipeline``.
        load_versions: An optional flag to specify a particular dataset
            version timestamp to load.
        pipeline_name: Name of the ``Pipeline`` to execute.
            Defaults to "__default__".

    Raises:
        KedroContextError: If the resulting ``Pipeline`` is empty or
            incorrect tags are provided.
        Exception: Any uncaught exception will be re-raised after being
            passed to ``on_pipeline_error``.

    Returns:
        Any node outputs that cannot be processed by the ``DataCatalog``.
        These are returned in a dictionary, where the keys are defined by
        the node outputs.
    """
    # Report project name
    logging.info("** Kedro project %s", self.project_path.name)

    try:
        pipeline = self._get_pipeline(name=pipeline_name)
    except NotImplementedError:
        common_migration_message = (
            "`ProjectContext._get_pipeline(self, name)` method is expected. "
            "Please refer to the 'Modular Pipelines' section of the documentation."
        )
        if pipeline_name:
            raise KedroContextError(
                "The project is not fully migrated to use multiple pipelines. "
                + common_migration_message
            )
        warn(
            "You are using the deprecated pipeline construction mechanism. "
            + common_migration_message,
            DeprecationWarning,
        )
        pipeline = self.pipeline

    filtered_pipeline = self._filter_pipeline(
        pipeline=pipeline,
        tags=tags,
        from_nodes=from_nodes,
        to_nodes=to_nodes,
        node_names=node_names,
        from_inputs=from_inputs,
    )

    save_version = self._get_save_version()
    run_id = self.run_id or save_version

    record_data = {
        "run_id": run_id,
        "project_path": str(self.project_path),
        "env": self.env,
        "kedro_version": self.project_version,
        "tags": tags,
        "from_nodes": from_nodes,
        "to_nodes": to_nodes,
        "node_names": node_names,
        "from_inputs": from_inputs,
        "load_versions": load_versions,
        "pipeline_name": pipeline_name,
        "extra_params": self._extra_params,
    }
    journal = Journal(record_data)

    catalog = self._get_catalog(
        save_version=save_version, journal=journal, load_versions=load_versions
    )

    # Run the runner
    runner = runner or SequentialRunner()
    self._hook_manager.hook.before_pipeline_run(  # pylint: disable=no-member
        run_params=record_data, pipeline=filtered_pipeline, catalog=catalog
    )

    try:
        run_result = runner.run(filtered_pipeline, catalog, run_id)
    except Exception as error:
        self._hook_manager.hook.on_pipeline_error(  # pylint: disable=no-member
            error=error,
            run_params=record_data,
            pipeline=filtered_pipeline,
            catalog=catalog,
        )
        raise error

    self._hook_manager.hook.after_pipeline_run(  # pylint: disable=no-member
        run_params=record_data,
        run_result=run_result,
        pipeline=filtered_pipeline,
        catalog=catalog,
    )
    return run_result
def __init__(self, *args, **kwargs):
    self.runner = SequentialRunner(*args, **kwargs)
def test_spark_pickle(self, is_async, data_catalog):
    """SparkDataSet(load) -> node -> PickleDataSet (save)."""
    pipeline = Pipeline([node(identity, "spark_in", "pickle_ds")])
    pattern = ".* was not serialized due to.*"
    with pytest.raises(DataSetError, match=pattern):
        SequentialRunner(is_async=is_async).run(pipeline, data_catalog)
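# Note: the DataSetError is expected here because a Spark DataFrame is a handle
# on distributed JVM data and cannot be pickled, so saving the node's output to
# a PickleDataSet fails.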
# Prepare a data catalog
data_catalog = DataCatalog({"my_salutation": MemoryDataSet()})


# Prepare first node
def return_greeting():
    return "Hello"


return_greeting_node = node(return_greeting, inputs=None, outputs="my_salutation")


# Prepare second node
def join_statements(greeting):
    return f"{greeting} Kedro!"


join_statements_node = node(
    join_statements, inputs="my_salutation", outputs="my_message"
)

# Assemble nodes into a pipeline
pipeline = Pipeline([return_greeting_node, join_statements_node])

# Create a runner to run the pipeline
runner = SequentialRunner()

# Run the pipeline
print(runner.run(pipeline, data_catalog))
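# Since "my_message" is not registered in the catalog, run() returns it, and
# this script prints: {'my_message': 'Hello Kedro!'}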