def _extract_pipeline_catalog(self, catalog: DataCatalog) -> DataCatalog:

        sub_catalog = DataCatalog()
        for data_set_name in self.pipeline.inputs():
            if data_set_name == self.input_name:
                # this dataset does not have to be persisted, and even if it is,
                # we keep only an empty memory dataset to avoid extra unnecessary
                # dependencies: this dataset will be replaced at inference time and
                # we do not need to know the original type, see
                # https://github.com/Galileo-Galilei/kedro-mlflow/issues/273
                sub_catalog.add(data_set_name=data_set_name,
                                data_set=MemoryDataSet())
            else:
                try:
                    data_set = catalog._data_sets[data_set_name]
                    if isinstance(
                            data_set, MemoryDataSet
                    ) and not data_set_name.startswith("params:"):
                        raise KedroPipelineModelError("""
                                The datasets of the training pipeline must be persisted locally
                                to be used by the inference pipeline. You must enforce them as
                                non 'MemoryDataSet' in the 'catalog.yml'.
                                Dataset '{data_set_name}' is not persisted currently.
                                """.format(data_set_name=data_set_name))
                    self._logger.info(
                        f"The data_set '{data_set_name}' is added to the Pipeline catalog."
                    )
                    sub_catalog.add(data_set_name=data_set_name,
                                    data_set=data_set)
                except KeyError:
                    raise KedroPipelineModelError((
                        f"The provided catalog must contains '{data_set_name}' data_set "
                        "since it is the input of the pipeline."))

        return sub_catalog
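
A hedged illustration of the persistence rule enforced above, assuming a Kedro version that provides kedro.extras.datasets; the dataset names and the filepath are made up:

from kedro.io import DataCatalog, MemoryDataSet
from kedro.extras.datasets.pickle import PickleDataSet

# hypothetical catalog: "model" is persisted and would be copied into the
# sub-catalog, while "features" is an in-memory, non-parameter dataset and
# would trigger KedroPipelineModelError
catalog = DataCatalog(
    data_sets={
        "model": PickleDataSet(filepath="data/06_models/model.pkl"),
        "features": MemoryDataSet(),
    }
)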
Example #2
    def extract_pipeline_catalog(self, catalog: DataCatalog) -> DataCatalog:
        sub_catalog = DataCatalog()
        for data_set_name in self.inference.inputs():
            if data_set_name == self.input_name:
                # there is no obligation that this dataset is persisted
                # thus it is allowed to be an empty memory dataset
                data_set = catalog._data_sets.get(
                    data_set_name) or MemoryDataSet()
                sub_catalog.add(data_set_name=data_set_name, data_set=data_set)
            else:
                try:
                    data_set = catalog._data_sets[data_set_name]
                    if isinstance(data_set, MemoryDataSet):
                        raise KedroMlflowPipelineMLDatasetsError("""
                                The datasets of the training pipeline must be persisted locally
                                to be used by the inference pipeline. You must enforce them as
                                non 'MemoryDataSet' in the 'catalog.yml'.
                                Dataset '{data_set_name}' is not persisted currently.
                                """.format(data_set_name=data_set_name))
                    sub_catalog.add(data_set_name=data_set_name,
                                    data_set=data_set)
                except KeyError:
                    raise KedroMlflowPipelineMLDatasetsError("""
                                The provided catalog must contain the '{data_set_name}' data_set
                                since it is an input of the inference pipeline.
                                """.format(data_set_name=data_set_name))

        return sub_catalog
Example #3
    def run(self, pipeline: Pipeline, catalog: DataCatalog) -> Dict[str, Any]:
        """Run the ``Pipeline`` using the ``DataSet``s provided by ``catalog``
        and save results back to the same objects.

        Args:
            pipeline: The ``Pipeline`` to run.
            catalog: The ``DataCatalog`` from which to fetch data.

        Raises:
            ValueError: Raised when ``Pipeline`` inputs cannot be satisfied.

        Returns:
            Any node outputs that cannot be processed by the ``DataCatalog``.
            These are returned in a dictionary, where the keys are defined
            by the node outputs.

        """

        catalog = catalog.shallow_copy()

        unsatisfied = pipeline.inputs() - set(catalog.list())
        if unsatisfied:
            raise ValueError("Pipeline input(s) {} not found in the "
                             "DataCatalog".format(unsatisfied))

        free_outputs = pipeline.outputs() - set(catalog.list())
        unregistered_ds = pipeline.data_sets() - set(catalog.list())
        for ds_name in unregistered_ds:
            catalog.add(ds_name, self.create_default_data_set(ds_name))

        self._run(pipeline, catalog)

        self._logger.info("Pipeline execution completed successfully.")

        return {ds_name: catalog.load(ds_name) for ds_name in free_outputs}
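
A minimal usage sketch of this run method with Kedro's Pipeline and node API; the function and dataset names are invented. Outputs that are not registered in the catalog come back as free outputs in the returned dict:

from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import Pipeline, node
from kedro.runner import SequentialRunner

def double(x):
    return x * 2

catalog = DataCatalog(data_sets={"x": MemoryDataSet(data=21)})
pipeline = Pipeline([node(double, inputs="x", outputs="y")])

# "y" is not registered in the catalog, so it is returned as a free output
outputs = SequentialRunner().run(pipeline, catalog)
assert outputs == {"y": 42}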
Example #4
    def test_add_save_and_load(self, data_set, dummy_dataframe):
        """Test adding and then saving and reloading the data set"""
        catalog = DataCatalog(data_sets={})
        catalog.add("test", data_set)
        catalog.save("test", dummy_dataframe)
        reloaded_df = catalog.load("test")

        assert_frame_equal(reloaded_df, dummy_dataframe)
Example #5
    def test_all_before_adding(self, fake_data_set, fake_transformer):
        catalog = DataCatalog()
        catalog.add_transformer(fake_transformer)
        catalog.add("test", fake_data_set)

        catalog.save("test", 42)
        assert catalog.load("test") == 44
        assert fake_data_set.log == [("save", 43), ("load", 43)]
        assert fake_transformer.log == [("save", 42), ("load", 43)]
Example #6
class KedroPipelineModel(PythonModel):
    def __init__(self, pipeline_ml: PipelineML, catalog: DataCatalog):

        self.pipeline_ml = pipeline_ml
        self.initial_catalog = pipeline_ml._extract_pipeline_catalog(catalog)
        self.loaded_catalog = DataCatalog()
        # we have the guarantee that there is only one output in inference
        self.output_name = list(pipeline_ml.inference.outputs())[0]

    def load_context(self, context):

        # a consistency check is made when loading the model
        # it would be better to check when saving the model
        # but we rely on an mlflow function for saving, and it is unaware of the
        # kedro pipeline structure
        mlflow_artifacts_keys = set(context.artifacts.keys())
        kedro_artifacts_keys = set(
            self.pipeline_ml.inference.inputs() - {self.pipeline_ml.input_name}
        )
        if mlflow_artifacts_keys != kedro_artifacts_keys:
            in_artifacts_but_not_inference = (
                mlflow_artifacts_keys - kedro_artifacts_keys
            )
            in_inference_but_not_artifacts = (
                kedro_artifacts_keys - mlflow_artifacts_keys
            )
            raise ValueError(
                (
                    "Provided artifacts do not match catalog entries:"
                    f"\n    - 'artifacts - inference.inputs()' = : {in_artifacts_but_not_inference}"
                    f"\n    - 'inference.inputs() - artifacts' = : {in_inference_but_not_artifacts}"
                )
            )

        self.loaded_catalog = deepcopy(self.initial_catalog)
        for name, uri in context.artifacts.items():
            self.loaded_catalog._data_sets[name]._filepath = Path(uri)

    def predict(self, context, model_input):
        # TODO: check out how to pass extra args in predict
        # for instance, to enable parallelization

        self.loaded_catalog.add(
            data_set_name=self.pipeline_ml.input_name,
            data_set=MemoryDataSet(model_input),
            replace=True,
        )
        runner = SequentialRunner()
        run_outputs = runner.run(
            pipeline=self.pipeline_ml.inference, catalog=self.loaded_catalog
        )
        return run_outputs[
            self.output_name
        ]  # unpack the result to avoid messing up the JSON output
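
A hedged sketch of how this PythonModel is typically logged with mlflow.pyfunc; pipeline_training, catalog and the artifact filepath are hypothetical, and the artifact keys must match inference.inputs() minus input_name to satisfy the consistency check in load_context:

import mlflow.pyfunc

kedro_model = KedroPipelineModel(pipeline_ml=pipeline_training, catalog=catalog)

mlflow.pyfunc.log_model(
    artifact_path="kedro_pipeline_model",
    python_model=kedro_model,
    # hypothetical mapping: one artifact per inference input except the model input
    artifacts={"model": "data/06_models/model.pkl"},
)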
Example #7
    def before_pipeline_run(self, run_params: Dict, pipeline: Pipeline,
                            catalog: DataCatalog):
        if not self._enabled:
            return

        logger.info("KedroWings is Enabled")
        all_dataset_names = {
            ds
            for node in pipeline.nodes
            for ds in list(node.inputs) + list(node.outputs)
        }

        catalog_entries = self._create_catalog_entries(all_dataset_names)

        existing_catalog_names = set(catalog.list())
        for catalog_name, catalog_dataset in catalog_entries.items():
            if catalog_name in existing_catalog_names:
                continue
            catalog.add(catalog_name, catalog_dataset)
Example #8
    def _extract_pipeline_catalog(self, catalog: DataCatalog) -> DataCatalog:

        # check that the pipeline is consistent in case its attributes have been
        # modified manually
        self._check_consistency()

        sub_catalog = DataCatalog()
        for data_set_name in self.inference.inputs():
            if data_set_name == self.input_name:
                # there is no obligation that this dataset is persisted
                # thus it is allowed to be an empty memory dataset
                data_set = catalog._data_sets.get(
                    data_set_name) or MemoryDataSet()
                sub_catalog.add(data_set_name=data_set_name, data_set=data_set)
            else:
                try:
                    data_set = catalog._data_sets[data_set_name]
                    if isinstance(
                            data_set, MemoryDataSet
                    ) and not data_set_name.startswith("params:"):
                        raise KedroMlflowPipelineMLDatasetsError("""
                                The datasets of the training pipeline must be persisted locally
                                to be used by the inference pipeline. You must enforce them as
                                non 'MemoryDataSet' in the 'catalog.yml'.
                                Dataset '{data_set_name}' is not persisted currently.
                                """.format(data_set_name=data_set_name))
                    self._logger.info(
                        f"The data_set '{data_set_name}' is added to the PipelineML catalog."
                    )
                    sub_catalog.add(data_set_name=data_set_name,
                                    data_set=data_set)
                except KeyError:
                    raise KedroMlflowPipelineMLDatasetsError("""
                                The provided catalog must contain the '{data_set_name}' data_set
                                since it is an input of the inference pipeline.
                                """.format(data_set_name=data_set_name))

        return sub_catalog
Example #9
def sample_data_catalog_train(sample_data: pd.DataFrame) -> DataCatalog:
    """Generate data catalog for end to end feature engineering pipeline test.

    Args:
        sample_data (pd.DataFrame): Some sample training data.

    Returns:
        DataCatalog: Data catalog with sample training data.
    """
    catalog = DataCatalog()

    catalog.add("iris", MemoryDataSet(data=sample_data))
    catalog.add("params:target", MemoryDataSet(data="species"))
    catalog.add("params:test_fraction", MemoryDataSet(data=0.25))
    catalog.add("params:seed", MemoryDataSet(data=42))

    return catalog
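
A short sketch (not part of the original tests) of how such a fixture is typically consumed, assuming it is registered as a pytest fixture; feature_engineering_pipeline and the asserted output name are hypothetical:

from kedro.runner import SequentialRunner

def test_feature_engineering(sample_data_catalog_train):
    outputs = SequentialRunner().run(
        feature_engineering_pipeline, sample_data_catalog_train
    )
    assert "model_input_table" in outputs  # hypothetical free output name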
def _make_catalog(
    existent=None, non_existent=None, no_exists_method=None, feed_dict=None
):
    """Creates a catalog of existent and non-existent DataSets."""
    existent = [] if existent is None else existent
    non_existent = [] if non_existent is None else non_existent
    no_exists_method = [] if no_exists_method is None else no_exists_method

    catalog = DataCatalog(feed_dict=feed_dict)
    for source in existent:
        catalog.add(source, LambdaDataSet(None, None, lambda: True))
    for source in non_existent:
        catalog.add(source, LambdaDataSet(None, None, lambda: False))
    # Some LambdaDataSets do not have an exists() method
    for source in no_exists_method:
        catalog.add(source, LambdaDataSet(None, None))
    return catalog
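
A brief usage sketch of the helper above: each dataset's exists() resolves to the lambda supplied at construction, so a test can query availability through the catalog (the names below are made up):

catalog = _make_catalog(existent=["present"], non_existent=["missing"])
assert catalog.exists("present") is True
assert catalog.exists("missing") is False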