def catalog_with_stopwords(tmp_path):
    catalog_with_stopwords = DataCatalog({
        "data": MemoryDataSet(),
        "cleaned_data": MemoryDataSet(),
        "stopwords_from_nltk": PickleDataSet(
            (tmp_path / "stopwords.pkl").resolve().as_posix()
        ),
        "model": PickleDataSet((tmp_path / "model.pkl").resolve().as_posix()),
    })
    return catalog_with_stopwords
def catalog_with_encoder(tmp_path):
    catalog_with_encoder = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "encoder": PickleDataSet((tmp_path / "encoder.pkl").resolve().as_posix()),
        "model": PickleDataSet((tmp_path / "model.pkl").resolve().as_posix()),
    })
    return catalog_with_encoder
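A minimal usage sketch (not from the original examples), assuming the factory above is registered as a pytest fixture; the test name below is made up for illustration.

import pytest


@pytest.fixture
def catalog_with_encoder(tmp_path):
    ...  # body as defined above


def test_encoder_is_in_catalog(catalog_with_encoder):
    # DataCatalog.list() returns the names of the registered data sets
    assert "encoder" in catalog_with_encoder.list()
    assert "model" in catalog_with_encoder.list()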
Example 3
def new_catalog_with_layers():
    data_sets = {
        "bob_in": PickleDataSet("raw.csv"),
        "params:key": MemoryDataSet("value"),
        "result": PickleDataSet("final.csv"),
    }
    layers = {"raw": {"bob_in"}, "final": {"result"}}

    catalog = DataCatalog(data_sets=data_sets)
    setattr(catalog, "layers", layers)

    return catalog
Example 4
def old_catalog_with_layers():
    data_sets = {
        "bob_in": PickleDataSet("raw.csv"),
        "params:key": MemoryDataSet("value"),
        "result": PickleDataSet("final.csv"),
    }
    setattr(data_sets["bob_in"], "_layer", "raw")
    setattr(data_sets["result"], "_layer", "final")
    catalog = DataCatalog(data_sets=data_sets)
    try:
        catalog.__dict__.pop("layers")
    except KeyError:
        pass

    return catalog
Example 5
def test_model_packaging(tmp_path, pipeline_ml_obj):

    catalog = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "model": PickleDataSet(
            filepath=(tmp_path / "model.pkl").resolve().as_posix()
        ),
    })

    catalog._data_sets["model"].save(2)  # emulate model fitting

    artifacts = pipeline_ml_obj.extract_pipeline_artifacts(catalog)

    kedro_model = KedroPipelineModel(pipeline_ml=pipeline_ml_obj,
                                     catalog=catalog)

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=artifacts,
            conda_env={"python": "3.7.0"},
        )
        run_id = mlflow.active_run().info.run_id

    loaded_model = mlflow.pyfunc.load_model(
        model_uri=(Path(r"runs:/") / run_id / "model").as_posix())
    assert loaded_model.predict(1) == 2
Example 6
    def test_http_filesystem_no_versioning(self):
        pattern = r"HTTP\(s\) DataSet doesn't support versioning\."

        with pytest.raises(DataSetError, match=pattern):
            PickleDataSet(
                filepath="https://example.com/file.pkl", version=Version(None, None)
            )
def test_model_packaging_missing_artifacts(tmp_path, pipeline_ml_obj):

    catalog = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "model": PickleDataSet(
            filepath=(tmp_path / "model.pkl").resolve().as_posix()
        ),
    })

    kedro_model = KedroPipelineModel(pipeline_ml=pipeline_ml_obj,
                                     catalog=catalog)

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=None,  # no artifacts provided
            conda_env={"python": "3.7.0"},
        )
        run_id = mlflow.active_run().info.run_id

    with pytest.raises(
            ValueError,
            match="Provided artifacts do not match catalog entries"):
        mlflow.pyfunc.load_model(model_uri=(Path(r"runs:/") / run_id /
                                            "model").as_posix())
Example 8
    def __init__(self, layers):
        # `tmp_path` is assumed to be available from the enclosing test scope
        self._data_sets = {
            "cat": PickleDataSet(filepath=str(tmp_path)),
            "parameters": MemoryDataSet({"name": "value"}),
            "params:rabbit": MemoryDataSet("value"),
        }
        self.layers = layers
    def extract_pipeline_artifacts(
            self, parameters_saving_folder: Optional[Path] = None):

        artifacts = {}
        for name, dataset in self.initial_catalog._data_sets.items():
            if name != self.input_name:
                if name.startswith("params:"):
                    # we need to persist it locally for mlflow access
                    absolute_param_path = (parameters_saving_folder /
                                           f"params_{name[7:]}.pkl")
                    persisted_dataset = PickleDataSet(
                        filepath=absolute_param_path.as_posix())
                    persisted_dataset.save(dataset.load())
                    artifact_path = absolute_param_path.as_uri()
                    self._logger.info(
                        f"The parameter '{name[7:]}' is persisted (as pickle) "
                        f"at the following location: '{artifact_path}'"
                    )
                else:
                    # In this second case we know it cannot be a MemoryDataSet.
                    # Converting a PurePosixPath directly on Windows makes the
                    # path look relative, so resolve it to an absolute URI.
                    artifact_path = (
                        Path(dataset._filepath.as_posix()).resolve().as_uri()
                    )

                artifacts[name] = artifact_path

        return artifacts
Example 10
def pickle_data_set(filepath_pickle, load_args, save_args, fs_args):
    return PickleDataSet(
        filepath=filepath_pickle,
        load_args=load_args,
        save_args=save_args,
        fs_args=fs_args,
    )
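A minimal call sketch (not from the original project): the file path and argument values are illustrative; PickleDataSet forwards load_args/save_args to the backend's load/dump calls.

data_set = pickle_data_set(
    filepath_pickle="data/06_models/model.pkl",  # illustrative path
    load_args=None,
    save_args={"protocol": 4},  # forwarded to pickle.dump
    fs_args=None,
)
data_set.save({"weights": [1, 2, 3]})
assert data_set.load() == {"weights": [1, 2, 3]}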
Example 11
    def _load(self):
        self._init_dataset()

        if self._cache and self._cache.exists():
            return self._cache.load()

        if self.file_caching and self._dataset.exists():
            return self._dataset.load()

        import mlflow

        client = mlflow.tracking.MlflowClient(
            tracking_uri=self.loading_tracking_uri)

        self.loading_run_id = (
            self.loading_run_id or mlflow.active_run().info.run_id
        )

        if self.dataset in {"p"}:
            run = client.get_run(self.loading_run_id)
            value = run.data.params.get(self.dataset_name, None)
            if value is None:
                raise KeyError("param '{}' not found in run_id '{}'.".format(
                    self.dataset_name, self.loading_run_id))

            PickleDataSet(filepath=self.filepath).save(value)

        elif self.dataset in {"m"}:
            run = client.get_run(self.loading_run_id)
            value = run.data.metrics.get(self.dataset_name, None)
            if value is None:
                raise KeyError("metric '{}' not found in run_id '{}'.".format(
                    self.dataset_name, self.loading_run_id))
            PickleDataSet(filepath=self.filepath).save(value)

        else:
            p = Path(self.filepath)

            dst_path = tempfile.gettempdir()
            downloaded_path = client.download_artifacts(
                run_id=self.loading_run_id,
                path=p.name,
                dst_path=dst_path,
            )
            if Path(downloaded_path) != p:
                Path(downloaded_path).rename(p)

        return self._dataset.load()
def test_model_packaging_too_many_artifacts(tmp_path,
                                            pipeline_inference_dummy):

    catalog = DataCatalog({
        "raw_data": PickleDataSet(
            filepath=(tmp_path / "raw_data.pkl").resolve().as_posix()
        ),
        "data": MemoryDataSet(),
        "model": PickleDataSet(
            filepath=(tmp_path / "model.pkl").resolve().as_posix()
        ),
    })

    catalog._data_sets["raw_data"].save(1)  # emulate input on disk
    catalog._data_sets["model"].save(2)  # emulate model fitting

    # the input is persisted
    artifacts = {
        # converting a PurePosixPath directly on Windows makes the path look
        # relative, so resolve it to an absolute URI first
        name: Path(dataset._filepath.as_posix()).resolve().as_uri()
        for name, dataset in catalog._data_sets.items()
        if not isinstance(dataset, MemoryDataSet)
    }

    kedro_model = KedroPipelineModel(pipeline=pipeline_inference_dummy,
                                     catalog=catalog,
                                     input_name="raw_data")

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=artifacts,
            conda_env={
                "python": "3.7.0",
                "dependencies": ["kedro==0.16.5"]
            },
        )
        run_id = mlflow.active_run().info.run_id

    with pytest.raises(
            ValueError,
            match="Provided artifacts do not match catalog entries"):
        mlflow.pyfunc.load_model(model_uri=(Path(r"runs:/") / run_id /
                                            "model").as_posix())
Example 13
    def test_protocol_usage(self, filepath, instance_type):
        data_set = PickleDataSet(filepath=filepath)
        assert isinstance(data_set._fs, instance_type)

        path = filepath.split(PROTOCOL_DELIMITER, 1)[-1]

        assert str(data_set._filepath) == path
        assert isinstance(data_set._filepath, PurePosixPath)
Example 14
def data_catalog(tmp_path):
    source_path = Path(__file__).parent / "data/test.parquet"
    spark_in = SparkDataSet(source_path.as_posix())
    spark_out = SparkDataSet((tmp_path / "spark_data").as_posix())
    pickle_ds = PickleDataSet((tmp_path / "pickle/test.pkl").as_posix())

    return DataCatalog(
        {"spark_in": spark_in, "spark_out": spark_out, "pickle_ds": pickle_ds}
    )
def dummy_catalog(tmp_path):
    dummy_catalog = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "model": PickleDataSet(
            filepath=(tmp_path / "model.pkl").resolve().as_posix()
        ),
    })
    return dummy_catalog
Example 16
    def test_version_str_repr(self, load_version, save_version):
        """Test that version is in string representation of the class instance
        when applicable."""
        filepath = "test.pkl"
        ds = PickleDataSet(filepath=filepath)
        ds_versioned = PickleDataSet(filepath=filepath,
                                     version=Version(load_version,
                                                     save_version))
        assert filepath in str(ds)
        assert "version" not in str(ds)

        assert filepath in str(ds_versioned)
        ver_str = "version=Version(load={}, save='{}')".format(
            load_version, save_version)
        assert ver_str in str(ds_versioned)
        assert "PickleDataSet" in str(ds_versioned)
        assert "PickleDataSet" in str(ds)
        assert "protocol" in str(ds_versioned)
        assert "protocol" in str(ds)
Example 17
    def test_invalid_backend(self, mocker):
        pattern = (
            r"Selected backend 'invalid' should satisfy the pickle interface. "
            r"Missing one of `load` and `dump` on the backend."
        )
        mocker.patch(
            "kedro.extras.datasets.pickle.pickle_dataset.importlib.import_module",
            return_value=object,
        )
        with pytest.raises(ValueError, match=pattern):
            PickleDataSet(filepath="test.pkl", backend="invalid")
Example 18
    def test_no_backend(self, mocker):
        pattern = (
            r"Selected backend 'fake.backend.does.not.exist' could not be imported. "
            r"Make sure it is installed and importable."
        )
        mocker.patch(
            "kedro.extras.datasets.pickle.pickle_dataset.importlib.import_module",
            side_effect=ImportError,
        )
        with pytest.raises(ImportError, match=pattern):
            PickleDataSet(filepath="test.pkl",
                          backend="fake.backend.does.not.exist")
Example 19
    def test_protocol_usage(self, filepath, instance_type):
        data_set = PickleDataSet(filepath=filepath)
        assert isinstance(data_set._fs, instance_type)

        # _strip_protocol() doesn't strip http(s) protocol
        if data_set._protocol == "https":
            path = filepath.split("://")[-1]
        else:
            path = data_set._fs._strip_protocol(filepath)

        assert str(data_set._filepath) == path
        assert isinstance(data_set._filepath, PurePosixPath)
def dummy_catalog(tmp_path):
    dummy_catalog = DataCatalog({
        "raw_data": MemoryDataSet(1),
        "params:unused_param": MemoryDataSet("blah"),
        "data": MemoryDataSet(),
        "model": PickleDataSet((tmp_path / "model.csv").as_posix()),
    })
    return dummy_catalog
def dummy_catalog(tmp_path):
    dummy_catalog = DataCatalog({
        "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
        "params:unused_param": MemoryDataSet("blah"),
        "data": MemoryDataSet(),
        "model": PickleDataSet((tmp_path / "model.csv").as_posix()),
    })
    return dummy_catalog
Example 22
def dummy_catalog(tmp_path):
    dummy_catalog = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "model": PickleDataSet(
            filepath=(tmp_path / "data" / "06_models" / "model.pkl")
            .resolve().as_posix()
        ),
    })
    dummy_catalog._data_sets["model"].save(2)  # emulate model fitting

    return dummy_catalog
Example 23
def dummy_catalog(tmp_path):
    dummy_catalog = DataCatalog(
        {
            "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
            "params:unused_param": MemoryDataSet("blah"),
            "data": MemoryDataSet(),
            "model": PickleDataSet((tmp_path / "model.csv").as_posix()),
            "my_metrics": MlflowMetricsDataSet(),
            "another_metrics": MlflowMetricsDataSet(prefix="foo"),
            "my_metric": MlflowMetricDataSet(),
            "another_metric": MlflowMetricDataSet(key="foo"),
            "my_metric_history": MlflowMetricHistoryDataSet(),
            "another_metric_history": MlflowMetricHistoryDataSet(key="bar"),
        }
    )
    return dummy_catalog
def catalog_with_parameters(tmp_path):
    catalog_with_parameters = DataCatalog({
        "data": MemoryDataSet(),
        "cleaned_data": MemoryDataSet(),
        "params:stopwords": MemoryDataSet(["Hello", "Hi"]),
        "params:penalty": MemoryDataSet(0),
        "model": PickleDataSet((tmp_path / "model.pkl").resolve().as_posix()),
        "params:threshold": MemoryDataSet(0.5),
    })
    return catalog_with_parameters
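The "params:" entries are served from memory, while "model" round-trips through the pickle file on disk. A small sketch (illustrative, not from the original tests) of resolving them through the catalog:

catalog = catalog_with_parameters(tmp_path)  # tmp_path: a pathlib.Path, e.g. pytest's tmp_path
assert catalog.load("params:penalty") == 0
assert catalog.load("params:threshold") == 0.5

catalog.save("model", {"coef": [0.1, 0.2]})  # persisted via PickleDataSet to model.pkl
assert catalog.load("model") == {"coef": [0.1, 0.2]}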
Example 25
    def test_parallel_runner_with_pickle_dataset(self, tmp_path, spark_in, spark_out):
        """Test ParallelRunner with SparkDataSet -> PickleDataSet -> SparkDataSet .
        """
        pickle_data = PickleDataSet(filepath=str(tmp_path / "data.pkl"))
        catalog = DataCatalog(
            data_sets={
                "spark_in": spark_in,
                "pickle": pickle_data,
                "spark_out": spark_out,
            }
        )
        pipeline = Pipeline(
            [
                node(identity, "spark_in", "pickle"),
                node(identity, "pickle", "spark_out"),
            ]
        )
        runner = ParallelRunner()

        pattern = r"Failed while saving data to data set PickleDataSet"

        with pytest.raises(DataSetError, match=pattern):
            runner.run(pipeline, catalog)
Example 26
    def extract_pipeline_artifacts(self, catalog: DataCatalog,
                                   temp_folder: Path):
        pipeline_catalog = self._extract_pipeline_catalog(catalog)

        artifacts = {}
        for name, dataset in pipeline_catalog._data_sets.items():
            if name != self.input_name:
                if name.startswith("params:"):
                    # we need to persist it locally for mlflow access
                    absolute_param_path = temp_folder / f"params_{name[7:]}.pkl"
                    persisted_dataset = PickleDataSet(
                        filepath=absolute_param_path.as_posix())
                    persisted_dataset.save(dataset.load())
                    artifact_path = absolute_param_path.as_uri()
                else:
                    # In this second case we know it cannot be a MemoryDataSet.
                    # Converting a PurePosixPath directly on Windows makes the
                    # path look relative, so resolve it to an absolute URI.
                    artifact_path = (
                        Path(dataset._filepath.as_posix()).resolve().as_uri()
                    )

                artifacts[name] = artifact_path

        return artifacts
Example 27
def versioned_pickle_data_set(filepath_pickle, load_version, save_version):
    return PickleDataSet(filepath=filepath_pickle,
                         version=Version(load_version, save_version))
Example 28
    def test_no_joblib(self, mocker):
        mocker.patch.object(PickleDataSet, "BACKENDS", {"joblib": None})
        with pytest.raises(ImportError):
            PickleDataSet(filepath="test.pkl", backend="joblib")
Example 29
    def test_invalid_backend(self):
        pattern = r"'backend' should be one of \['pickle', 'joblib'\], got 'invalid'\."
        with pytest.raises(ValueError, match=pattern):
            PickleDataSet(filepath="test.pkl", backend="invalid")
Example 30
    def test_catalog_release(self, mocker):
        fs_mock = mocker.patch("fsspec.filesystem").return_value
        filepath = "test.pkl"
        data_set = PickleDataSet(filepath=filepath)
        data_set.release()
        fs_mock.invalidate_cache.assert_called_once_with(filepath)