Example #1
def _run_node_sequential(node: Node, catalog: DataCatalog, run_id: str = None) -> Node:
    inputs = {name: catalog.load(name) for name in node.inputs}
    hook_manager = get_hook_manager()
    is_async = False
    hook_manager.hook.before_node_run(  # pylint: disable=no-member
        node=node, catalog=catalog, inputs=inputs, is_async=is_async, run_id=run_id
    )
    try:
        outputs = node.run(inputs)
    except Exception as exc:
        hook_manager.hook.on_node_error(  # pylint: disable=no-member
            error=exc,
            node=node,
            catalog=catalog,
            inputs=inputs,
            is_async=is_async,
            run_id=run_id,
        )
        raise exc
    hook_manager.hook.after_node_run(  # pylint: disable=no-member
        node=node,
        catalog=catalog,
        inputs=inputs,
        outputs=outputs,
        is_async=is_async,
        run_id=run_id,
    )

    for name, data in outputs.items():
        catalog.save(name, data)
    return node
Example #2
def _run_node_sequential(node: Node, catalog: DataCatalog, run_id: str = None) -> Node:
    inputs = {}
    hook_manager = get_hook_manager()

    for name in node.inputs:
        hook_manager.hook.before_dataset_loaded(  # pylint: disable=no-member
            dataset_name=name
        )
        inputs[name] = catalog.load(name)
        hook_manager.hook.after_dataset_loaded(  # pylint: disable=no-member
            dataset_name=name, data=inputs[name]
        )

    is_async = False

    additional_inputs = _collect_inputs_from_hook(
        node, catalog, inputs, is_async, run_id=run_id
    )
    inputs.update(additional_inputs)

    outputs = _call_node_run(node, catalog, inputs, is_async, run_id=run_id)

    for name, data in outputs.items():
        hook_manager.hook.before_dataset_saved(  # pylint: disable=no-member
            dataset_name=name, data=data
        )
        catalog.save(name, data)
        hook_manager.hook.after_dataset_saved(  # pylint: disable=no-member
            dataset_name=name, data=data
        )
    return node
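The before/after node and dataset hooks invoked above dispatch to hook implementations registered with Kedro's plugin manager. As a point of reference, the sketch below shows a minimal implementation, assuming the older hook specs that take run_id (the actual hooks of any given project are not shown in the source):

from kedro.framework.hooks import hook_impl


class LoggingHooks:
    @hook_impl
    def before_node_run(self, node, catalog, inputs, is_async, run_id):
        # Returning a dict here would be merged into the node's inputs;
        # this is what _collect_inputs_from_hook gathers above.
        print(f"About to run node: {node.name}")

    @hook_impl
    def after_node_run(self, node, catalog, inputs, outputs, is_async, run_id):
        print(f"Finished node: {node.name}")

    @hook_impl
    def before_dataset_saved(self, dataset_name, data):
        print(f"Saving dataset: {dataset_name}")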
Example #3
    def test_add_save_and_load(self, data_set, dummy_dataframe):
        """Test adding and then saving and reloading the data set"""
        catalog = DataCatalog(data_sets={})
        catalog.add("test", data_set)
        catalog.save("test", dummy_dataframe)
        reloaded_df = catalog.load("test")

        assert_frame_equal(reloaded_df, dummy_dataframe)
Example #4
    def test_all_before_adding(self, fake_data_set, fake_transformer):
        catalog = DataCatalog()
        catalog.add_transformer(fake_transformer)
        catalog.add("test", fake_data_set)

        catalog.save("test", 42)
        assert catalog.load("test") == 44
        assert fake_data_set.log == [("save", 43), ("load", 43)]
        assert fake_transformer.log == [("save", 42), ("load", 43)]
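The fake_data_set and fake_transformer fixtures are not shown in the source. Below is a hypothetical, plain-Python reconstruction that would produce the asserted logs (real fixtures would subclass Kedro's dataset/transformer base classes): the transformer adds 1 on save and again on load, so 42 is stored as 43 and read back as 44.

class FakeDataSet:
    # Records the raw save/load calls made through the catalog.
    def __init__(self):
        self.log = []
        self.value = None

    def save(self, data):
        self.log.append(("save", data))
        self.value = data

    def load(self):
        self.log.append(("load", self.value))
        return self.value


class FakeTransformer:
    # Wraps the dataset's save/load callables, incrementing the data each way.
    def __init__(self):
        self.log = []

    def save(self, data_set_name, save, data):
        self.log.append(("save", data))
        save(data + 1)  # 42 arrives, 43 is stored

    def load(self, data_set_name, load):
        data = load()  # 43 comes back from the dataset
        self.log.append(("load", data))
        return data + 1  # the caller sees 44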
Example #5
def all_catalog(dataframex, dataframey, dataframey_bad):
    # build the DataFrames as they would sit in Kedro after being loaded into memory
    # https://kedro.readthedocs.io/en/stable/05_data/02_kedro_io.html
    from kedro.io import DataCatalog, MemoryDataSet
    catalog = DataCatalog({
        "dataframex": MemoryDataSet(),
        "dataframey": MemoryDataSet(),
        "dataframey_bad": MemoryDataSet()
    })
    catalog.save("dataframex", dataframex)
    catalog.save("dataframey", dataframey)
    catalog.save("dataframey_bad", dataframey_bad)
    return catalog
Example #6
def run_node(node: Node, catalog: DataCatalog) -> Node:
    """Run a single `Node` with inputs from and outputs to the `catalog`.

    Args:
        node: The ``Node`` to run.
        catalog: A ``DataCatalog`` containing the node's inputs and outputs.

    Returns:
        The node argument.

    """
    inputs = {name: catalog.load(name) for name in node.inputs}
    outputs = node.run(inputs)
    for name, data in outputs.items():
        catalog.save(name, data)
    return node
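For reference, a minimal usage sketch of run_node with an in-memory catalog (dataset and function names are illustrative):

from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import node


def _double(x):
    return x * 2


catalog = DataCatalog({"x": MemoryDataSet(), "doubled": MemoryDataSet()})
catalog.save("x", 10)

# run_node loads "x", calls _double, and saves the result back to the catalog
run_node(node(_double, inputs="x", outputs="doubled"), catalog)
assert catalog.load("doubled") == 20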
Example #7
 def _run_one_task(self, config_filename):
     # create node from Task
     expand_config_node = mls.sl.workflows.tasks.ExpandConfigTask.get_node()
     config, log = self._init_config_log(config_filename,
                                         self.base_directory,
                                         self.config_directory)
     # Prepare a data catalog
     data_catalog = DataCatalog({
         'config': MemoryDataSet(),
         'log': MemoryDataSet(),
         'expanded_config': MemoryDataSet()
     })
     data_catalog.save('config', config)
     data_catalog.save('log', log)
     # Assemble nodes into a pipeline
     pipeline = Pipeline([expand_config_node])
     # Create a runner to run the pipeline
     runner = SequentialRunner()
     # Run the pipeline
     runner.run(pipeline, data_catalog)
     return log, data_catalog
Example #8
    def run(self):
        """
        Run all tasks
        """
        # data
        data_catalog = DataCatalog({
            'config': MemoryDataSet(),
            'log': MemoryDataSet(),
            'base_directory': MemoryDataSet(),
            'dataset': MemoryDataSet(),
            'data': MemoryDataSet()
        })
        data_catalog.save('config', self.config)
        data_catalog.save('log', self.log)
        data_catalog.save('base_directory', self.base_directory)

        load_data_node = mls.workflows.tasks.LoadDataTask.get_node()
        prepare_data_node = mls.sl.workflows.tasks.PrepareDataTask.get_node()
        split_data_node = mls.sl.workflows.tasks.SplitDataTask.get_node()
        learn_node = mls.sl.workflows.tasks.LearnTask.get_node()
        evaluate_node = mls.sl.workflows.tasks.EvaluateTask.get_node()
        # Assemble nodes into a pipeline
        pipeline = Pipeline([
            load_data_node, prepare_data_node, split_data_node, learn_node,
            evaluate_node
        ])
        # Create a runner to run the pipeline
        runner = SequentialRunner()
        # Run the pipeline
        runner.run(pipeline, data_catalog)
        self.terminate()
Example #9
 def _run_one_task(self, config_filename):
     # create node from Task
     load_data_node = mls.workflows.tasks.LoadDataTask.get_node()
     prepare_data_node = mls.sl.workflows.tasks.PrepareDataTask.get_node()
     split_data_node = mls.sl.workflows.tasks.SplitDataTask.get_node()
     learn_data_node = mls.sl.workflows.tasks.LearnTask.get_node()
     config, log = self._init_config_log(config_filename,
                                         self.base_directory,
                                         self.config_directory)
     # Prepare a data catalog
     data_catalog = DataCatalog({
         'config': MemoryDataSet(),
         'log': MemoryDataSet(),
         'base_directory': MemoryDataSet()
     })
     data_catalog.save('config', config)
     data_catalog.save('log', log)
     data_catalog.save('base_directory', self.base_directory)
     # Assemble nodes into a pipeline
     pipeline = Pipeline([
         load_data_node, prepare_data_node, split_data_node, learn_data_node
     ])
     # Create a runner to run the pipeline
     runner = SequentialRunner()
     # Run the pipeline
     runner.run(pipeline, data_catalog)
     return log, config, data_catalog
Example #10
    def run(self):
        """
        Run the workflow : run each config
        """
        # data
        data_catalog = DataCatalog({
            'config': MemoryDataSet(),
            'log': MemoryDataSet(),
            'base_directory': MemoryDataSet()
        })
        data_catalog.save('config', self.config)
        data_catalog.save('log', self.log)
        data_catalog.save('base_directory', self.base_directory)

        expand_config_node = mls.sl.workflows.tasks.ExpandConfigTask.get_node()
        multiple_learning_node = mls.sl.workflows.tasks.MultipleLearningTask.get_node(
        )

        # Assemble nodes into a pipeline
        pipeline = Pipeline([expand_config_node, multiple_learning_node])
        # Create a runner to run the pipeline
        runner = SequentialRunner()
        # Run the pipeline
        result = runner.run(pipeline, data_catalog)
        if len(result) == 0:
            self.terminate()
Example #11
 def test_save_to_unregistered(self, dummy_dataframe):
     """Check the error when attempting to save to unregistered data set"""
     catalog = DataCatalog(data_sets={})
     pattern = r"DataSet 'test' not found in the catalog"
     with pytest.raises(DataSetNotFoundError, match=pattern):
         catalog.save("test", dummy_dataframe)
Example #12
join_statements_node = node(
    join_statements, inputs="my_salutation", outputs="my_message"
)

#

# adder_node = node(
#     func=add, inputs=["a", "b"], outputs="sum"
# )

generation_node = node(
    generation, inputs=["num_samples"], outputs=None
)

# Assemble nodes into a pipeline
#pipeline = Pipeline([return_greeting_node, join_statements_node])
#pipeline = Pipeline([return_greeting_node])
pipeline = Pipeline([generation_node])
# Create a runner to run the pipeline
runner = SequentialRunner()

# Run the pipeline
#print(runner.run(pipeline, data_catalog))
#print(runner.run(pipeline, DataCatalog(dict(a=2,b=3))))


#io = DataCatalog(dict(a=MemoryDataSet(),b=MemoryDataSet()))
io = DataCatalog(dict(num_samples=MemoryDataSet()))
io.save("num_samples",1)
# print(adder_node.run(dict(a=2, b=3)))
print(runner.run(pipeline, io))
Example #13
    def _run_one_task(self, config_filename):
        # create node from Task
        expand_config_node = mls.sl.workflows.tasks.ExpandConfigTask.get_node()
        multiple_learning_node = mls.sl.workflows.tasks.MultipleLearningTask.get_node()
        # Prepare a data catalog
        config, log = self._init_config_log(config_filename,
                                            self.base_directory,
                                            self.config_directory)
        expanded_config = [{"input": {"type": "NClassRandomClassificationWithNoise",
                                      "parameters": {"n_samples": 100, "shuffle": True, "random_state": 0, "noise": 0}
                                      },
                            "split": {"type": "traintest",
                                      "parameters": {"test_size": 20, "random_state": 0, "shuffle": True}
                                      },
                            "algorithm": {"type": "sklearn.neighbors.KNeighborsClassifier",
                                          "hyperparameters": {
                                              "n_neighbors": 15,
                                              "algorithm": "auto",
                                              "weights": "uniform"
                                          }
                                          }
                            },
                           {"input": {"type": "make_circles",
                                      "parameters": {
                                          "n_samples": 100,
                                          "shuffle": True,
                                          "noise": 0,
                                          "random_state": 0,
                                          "factor": 0.3
                                      }},
                            "split": {"type": "traintest",
                                      "parameters": {"test_size": 20, "random_state": 0, "shuffle": True}
                                      },
                            "algorithm": {"type": "sklearn.neighbors.KNeighborsClassifier",
                                          "hyperparameters": {
                                              "n_neighbors": 15,
                                              "algorithm": "auto",
                                              "weights": "uniform"
                                          }
                                          }
                            },
                           {"input": {"type": "load_iris",
                                      "parameters": {}
                                      },
                            "split": {"type": "traintest",
                                      "parameters": {"test_size": 20, "random_state": 0, "shuffle": True}
                                      },
                            "algorithm": {"type": "sklearn.neighbors.KNeighborsClassifier",
                                          "hyperparameters": {
                                              "n_neighbors": 15,
                                              "algorithm": "auto",
                                              "weights": "uniform"
                                          }
                                          }
                            }]

        data_catalog = DataCatalog({'config': MemoryDataSet(),
                                    'log': MemoryDataSet(),
                                    'base_directory': MemoryDataSet(),
                                    'expanded_config': MemoryDataSet()})
        data_catalog.save('config', config)
        data_catalog.save('log', log)
        data_catalog.save('base_directory', self.base_directory)
        data_catalog.save('expanded_config', expanded_config)
        # Assemble nodes into a pipeline
        pipeline = Pipeline([expand_config_node, multiple_learning_node])
        # Create a runner to run the pipeline
        runner = SequentialRunner()
        # Run the pipeline
        runner.run(pipeline, data_catalog)
        return log, data_catalog
Example #14
def basic_catalog(basic_data):
    from kedro.io import DataCatalog, MemoryDataSet

    catalog = DataCatalog({"basic_data": MemoryDataSet()})
    catalog.save("basic_data", basic_data)
    return catalog
Example #15
class KedroPipelineModel(PythonModel):
    def __init__(
        self,
        pipeline: Pipeline,
        catalog: DataCatalog,
        input_name: str,
        runner: Optional[AbstractRunner] = None,
        copy_mode: Optional[Union[Dict[str, str], str]] = None,
    ):
        """[summary]

        Args:
            pipeline (Pipeline): A Kedro Pipeline object to
            store as a Mlflow Model. Also works with kedro_mlflow PipelineML objects.

            catalog (DataCatalog): The DataCatalog associated
            with the PipelineML

            runner (Optional[AbstractRunner], optional): The kedro
            AbstractRunner to use. Defaults to SequentialRunner if
            None.

            copy_mode (Optional[Union[Dict[str,str], str]]):
            The copy_mode of each DataSet of the catalog
            when reconstructing the DataCatalog in memory.
            You can pass either:
                - None to use Kedro default mode for each dataset
                - a single string ("deepcopy", "copy" and "assign")
                to apply to all datasets
                - a dictionary with (dataset name, copy_mode) key/values
                pairs. The associated mode must be a valid kedro mode
                ("deepcopy", "copy" and "assign") for each. Defaults to None.
        """

        self.pipeline = (pipeline.inference if isinstance(
            pipeline, PipelineML) else pipeline)
        self.input_name = input_name
        self.initial_catalog = self._extract_pipeline_catalog(catalog)

        nb_outputs = len(self.pipeline.outputs())
        if nb_outputs != 1:
            outputs_list_str = "\n - ".join(self.pipeline.outputs())
            raise ValueError(
                f"Pipeline must have one and only one output, got '{nb_outputs}' outputs: \n - {outputs_list_str}"
            )
        self.output_name = list(self.pipeline.outputs())[0]
        self.runner = runner or SequentialRunner()
        self.copy_mode = copy_mode or {}
        # copy mode has been converted because it is a property
        # TODO: we need to use the runner's default dataset in case of multithreading
        self.loaded_catalog = DataCatalog(
            data_sets={
                name: MemoryDataSet(copy_mode=copy_mode)
                for name, copy_mode in self.copy_mode.items()
            })

    @property
    def _logger(self) -> logging.Logger:
        return logging.getLogger(__name__)

    @property
    def copy_mode(self):
        return self._copy_mode

    @copy_mode.setter
    def copy_mode(self, copy_mode):

        if isinstance(copy_mode, str) or copy_mode is None:
            # if it is a string, we must create manually the dictionary
            # of all catalog entries with this copy_mode
            self._copy_mode = {
                name: copy_mode
                for name in self.pipeline.data_sets()
                if name != self.output_name
            }
        elif isinstance(copy_mode, dict):
            # if it is a dict we will retrieve the copy mode when necessary
            # it does not matter if this dict does not contain all the catalog entries
            # the others will be returned as None when accessing with dict.get()
            self._copy_mode = {
                name: None
                for name in self.pipeline.data_sets()
                if name != self.output_name
            }
            self._copy_mode.update(copy_mode)
        else:
            raise TypeError(
                f"'copy_mode' must be a 'str' or a 'dict', not '{type(copy_mode)}'"
            )

    def _extract_pipeline_catalog(self, catalog: DataCatalog) -> DataCatalog:

        sub_catalog = DataCatalog()
        for data_set_name in self.pipeline.inputs():
            if data_set_name == self.input_name:
                # there is no obligation that this dataset is persisted
                # and even if it is, we keep only an empty memory dataset to avoid
                # extra unnecessary dependencies: this dataset will be replaced at
                # inference time and we do not need to know the original type, see
                # https://github.com/Galileo-Galilei/kedro-mlflow/issues/273
                sub_catalog.add(data_set_name=data_set_name,
                                data_set=MemoryDataSet())
            else:
                try:
                    data_set = catalog._data_sets[data_set_name]
                    if isinstance(
                            data_set, MemoryDataSet
                    ) and not data_set_name.startswith("params:"):
                        raise KedroPipelineModelError("""
                                The datasets of the training pipeline must be persisted locally
                                to be used by the inference pipeline. You must enforce them as
                                non 'MemoryDataSet' in the 'catalog.yml'.
                                Dataset '{data_set_name}' is not persisted currently.
                                """.format(data_set_name=data_set_name))
                    self._logger.info(
                        f"The data_set '{data_set_name}' is added to the Pipeline catalog."
                    )
                    sub_catalog.add(data_set_name=data_set_name,
                                    data_set=data_set)
                except KeyError:
                    raise KedroPipelineModelError((
                        f"The provided catalog must contains '{data_set_name}' data_set "
                        "since it is the input of the pipeline."))

        return sub_catalog

    def extract_pipeline_artifacts(
            self, parameters_saving_folder: Optional[Path] = None):

        artifacts = {}
        for name, dataset in self.initial_catalog._data_sets.items():
            if name != self.input_name:
                if name.startswith("params:"):
                    # we need to persist it locally for mlflow access
                    absolute_param_path = (parameters_saving_folder /
                                           f"params_{name[7:]}.pkl")
                    persisted_dataset = PickleDataSet(
                        filepath=absolute_param_path.as_posix())
                    persisted_dataset.save(dataset.load())
                    artifact_path = absolute_param_path.as_uri()
                    self._logger.info((
                        f"The parameter '{name[7:]}' is persisted (as pickle) "
                        "at the following location: '{artifact_path}'"))
                else:
                    # In this second case, we know it cannot be a MemoryDataSet
                    # weird bug when directly converting PurePosixPath to windows: it is considered as relative
                    artifact_path = (Path(
                        dataset._filepath.as_posix()).resolve().as_uri())

                artifacts[name] = artifact_path

        return artifacts

    def load_context(self, context):

        # a consistency check is made when loading the model
        # it would be better to check when saving the model
        # but we rely on a mlflow function for saving, and it is unaware of kedro
        # pipeline structure
        mlflow_artifacts_keys = set(context.artifacts.keys())
        kedro_artifacts_keys = set(self.pipeline.inputs() - {self.input_name})
        if mlflow_artifacts_keys != kedro_artifacts_keys:
            in_artifacts_but_not_inference = (mlflow_artifacts_keys -
                                              kedro_artifacts_keys)
            in_inference_but_not_artifacts = (kedro_artifacts_keys -
                                              mlflow_artifacts_keys)
            raise ValueError((
                "Provided artifacts do not match catalog entries:"
                f"\n    - 'artifacts - inference.inputs()' = : {in_artifacts_but_not_inference}"
                f"\n    - 'inference.inputs() - artifacts' = : {in_inference_but_not_artifacts}"
            ))

        updated_catalog = self.initial_catalog.shallow_copy()
        for name, uri in context.artifacts.items():
            updated_catalog._data_sets[name]._filepath = Path(uri)
            self.loaded_catalog.save(name=name,
                                     data=updated_catalog.load(name))

    def predict(self, context, model_input):

        # we create an empty hook manager but do NOT register hooks
        # because we want this model be executable outside of a kedro project
        hook_manager = _create_hook_manager()

        self.loaded_catalog.save(
            name=self.input_name,
            data=model_input,
        )

        run_output = self.runner.run(
            pipeline=self.pipeline,
            catalog=self.loaded_catalog,
            hook_manager=hook_manager,
        )

        # unpack the result to avoid messing the json
        # file with the name of the Kedro dataset
        unpacked_output = run_output[self.output_name]

        return unpacked_output
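As a usage note, a KedroPipelineModel such as the one above is typically logged as an MLflow pyfunc model. A minimal, hypothetical sketch follows; inference_pipeline, training_catalog and the input name are illustrative, and the catalog is assumed to contain no params: entries, so extract_pipeline_artifacts needs no saving folder:

import mlflow

kedro_model = KedroPipelineModel(
    pipeline=inference_pipeline,   # a Pipeline or PipelineML object (assumed)
    catalog=training_catalog,      # a DataCatalog with persisted datasets (assumed)
    input_name="instances",
)
artifacts = kedro_model.extract_pipeline_artifacts()

with mlflow.start_run():
    # mlflow copies the artifacts next to the model and calls load_context()
    # with their local paths when the model is later loaded for prediction.
    mlflow.pyfunc.log_model(
        artifact_path="model",
        python_model=kedro_model,
        artifacts=artifacts,
    )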
Example #16
class KedroPipelineModel(PythonModel):
    def __init__(
        self,
        pipeline_ml: PipelineML,
        catalog: DataCatalog,
        runner: Optional[AbstractRunner] = None,
        copy_mode: Optional[Union[Dict[str, str], str]] = None,
    ):
        """[summary]

        Args:
            pipeline_ml (PipelineML): A PipelineML object to
            store as a Mlflow Model

            catalog (DataCatalog): The DataCatalog associated
            with the PipelineML

            runner (Optional[AbstractRunner], optional): The kedro
            AbstractRunner to use. Defaults to SequentialRunner if
            None.

            copy_mode (Optional[Union[Dict[str,str], str]]):
            The copy_mode of each DataSet of the catalog
            when reconstructing the DataCatalog in memory.
            You can pass either:
                - None to use Kedro default mode for each dataset
                - a single string ("deepcopy", "copy" and "assign")
                to apply to all datasets
                - a dictionary with (dataset name, copy_mode) key/values
                pairs. The associated mode must be a valid kedro mode
                ("deepcopy", "copy" and "assign") for each. Defaults to None.
        """

        self.pipeline_ml = pipeline_ml
        self.initial_catalog = pipeline_ml._extract_pipeline_catalog(catalog)
        # we have the guarantee that there is only one output in inference
        self.output_name = list(pipeline_ml.inference.outputs())[0]
        self.runner = runner or SequentialRunner()
        self.copy_mode = copy_mode or {}

        # copy mode has been converted because it is a property
        # TODO: we need to use the runner's default dataset in case of multithreading
        self.loaded_catalog = DataCatalog(
            data_sets={
                name: MemoryDataSet(copy_mode=copy_mode)
                for name, copy_mode in self.copy_mode.items()
            }
        )

    @property
    def copy_mode(self):
        return self._copy_mode

    @copy_mode.setter
    def copy_mode(self, copy_mode):

        if isinstance(copy_mode, str) or copy_mode is None:
            # if it is a string, we must create manually the dictionary
            # of all catalog entries with this copy_mode
            self._copy_mode = {
                name: copy_mode
                for name in self.pipeline_ml.inference.data_sets()
                if name != self.output_name
            }
        elif isinstance(copy_mode, dict):
            # if it is a dict we will retrieve the copy mode when necessary
            # it does not matter if this dict does not contain all the catalog entries
            # the others will be returned as None when accessing with dict.get()
            self._copy_mode = {
                name: None
                for name in self.pipeline_ml.inference.data_sets()
                if name != self.output_name
            }
            self._copy_mode.update(copy_mode)
        else:
            raise TypeError(
                f"'copy_mode' must be a 'str' or a 'dict', not '{type(copy_mode)}'"
            )

    def load_context(self, context):

        # a consistency check is made when loading the model
        # it would be better to check when saving the model
        # but we rely on a mlflow function for saving, and it is unaware of kedro
        # pipeline structure
        mlflow_artifacts_keys = set(context.artifacts.keys())
        kedro_artifacts_keys = set(
            self.pipeline_ml.inference.inputs() - {self.pipeline_ml.input_name}
        )
        if mlflow_artifacts_keys != kedro_artifacts_keys:
            in_artifacts_but_not_inference = (
                mlflow_artifacts_keys - kedro_artifacts_keys
            )
            in_inference_but_not_artifacts = (
                kedro_artifacts_keys - mlflow_artifacts_keys
            )
            raise ValueError(
                (
                    "Provided artifacts do not match catalog entries:"
                    f"\n    - 'artifacts - inference.inputs()' = : {in_artifacts_but_not_inference}"
                    f"\n    - 'inference.inputs() - artifacts' = : {in_inference_but_not_artifacts}"
                )
            )

        updated_catalog = deepcopy(self.initial_catalog)
        for name, uri in context.artifacts.items():
            updated_catalog._data_sets[name]._filepath = Path(uri)
            self.loaded_catalog.save(name=name, data=updated_catalog.load(name))

    def predict(self, context, model_input):
        # TODO : checkout out how to pass extra args in predict
        # for instance, to enable parallelization

        self.loaded_catalog.save(
            name=self.pipeline_ml.input_name,
            data=model_input,
        )

        run_output = self.runner.run(
            pipeline=self.pipeline_ml.inference, catalog=self.loaded_catalog
        )

        # unpack the result to avoid messing the json
        # file with the name of the Kedro dataset
        unpacked_output = run_output[self.output_name]

        return unpacked_output