Example #1
0
def test_model_packaging(tmp_path, pipeline_ml_obj):
    """Log a ``KedroPipelineModel`` to mlflow, reload it and check its prediction."""
    model_filepath = (tmp_path / "model.pkl").resolve().as_posix()
    catalog = DataCatalog(
        {
            "raw_data": MemoryDataSet(),
            "data": MemoryDataSet(),
            "model": PickleDataSet(filepath=model_filepath),
        }
    )

    # Emulate model fitting: the persisted "model" is simply the integer 2.
    catalog._data_sets["model"].save(2)

    artifacts = pipeline_ml_obj.extract_pipeline_artifacts(catalog)
    kedro_model = KedroPipelineModel(pipeline_ml=pipeline_ml_obj, catalog=catalog)

    # Track into a throwaway "mlruns" folder inside the test's temp directory.
    mlflow.set_tracking_uri((tmp_path / "mlruns").as_uri())
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=artifacts,
            conda_env={"python": "3.7.0"},
        )
        run_id = mlflow.active_run().info.run_id

    model_uri = Path(r"runs:/", run_id, "model").as_posix()
    loaded_model = mlflow.pyfunc.load_model(model_uri=model_uri)
    assert loaded_model.predict(1) == {"predictions": 2}
Example #2
0
def pickle_data_set(filepath_pickle, load_args, save_args, fs_args):
    """Build a ``PickleDataSet`` from the given path and load/save/fs arguments."""
    dataset_kwargs = {
        "filepath": filepath_pickle,
        "load_args": load_args,
        "save_args": save_args,
        "fs_args": fs_args,
    }
    return PickleDataSet(**dataset_kwargs)
Example #3
0
 def __init__(self, layers):
     """Set up three fixed datasets and remember ``layers``.

     ``tmp_path`` is not a parameter: it is read from the enclosing scope.
     """
     pickle_dataset = PickleDataSet(filepath=str(tmp_path))
     self._data_sets = {
         "cat": pickle_dataset,
         "parameters": MemoryDataSet("value"),
         "params:rabbit": MemoryDataSet("value"),
     }
     self.layers = layers
Example #4
0
    def test_http_filesystem_no_versioning(self):
        """A versioned ``PickleDataSet`` on an HTTPS path must raise ``DataSetError``."""
        expected_message = r"HTTP\(s\) DataSet doesn't support versioning\."
        unpinned_version = Version(None, None)

        with pytest.raises(DataSetError, match=expected_message):
            PickleDataSet(
                filepath="https://example.com/file.pkl",
                version=unpinned_version,
            )
def test_model_packaging_missing_artifacts(tmp_path, pipeline_inference_dummy):
    """Loading a model that was logged without artifacts must raise ``ValueError``."""
    model_filepath = (tmp_path / "model.pkl").resolve().as_posix()
    catalog = DataCatalog(
        {
            "raw_data": MemoryDataSet(),
            "data": MemoryDataSet(),
            "model": PickleDataSet(filepath=model_filepath),
        }
    )

    kedro_model = KedroPipelineModel(
        pipeline=pipeline_inference_dummy,
        catalog=catalog,
        input_name="raw_data",
    )

    mlflow.set_tracking_uri((tmp_path / "mlruns").as_uri())
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=None,  # no artifacts provided
            conda_env={"python": "3.7.0", "dependencies": ["kedro==0.16.5"]},
        )
        run_id = mlflow.active_run().info.run_id

    model_uri = Path(r"runs:/", run_id, "model").as_posix()
    expected_message = "Provided artifacts do not match catalog entries"
    with pytest.raises(ValueError, match=expected_message):
        mlflow.pyfunc.load_model(model_uri=model_uri)
Example #6
0
    def _load(self):
        """Load the dataset, fetching it from an mlflow run when not available locally.

        Resolution order:

        1. ``self._cache``, when it is set and reports that it exists;
        2. ``self._dataset``, when ``self.file_caching`` is enabled and the
           dataset reports that it exists;
        3. the mlflow run ``self.loading_run_id`` (the active run is used, and
           remembered, when no run id was set). Depending on ``self.dataset``
           the value is a run param (``"p"``), a run metric (``"m"``) or, for
           any other value, an artifact named after the file. It is written
           to ``self.filepath`` and then read back through ``self._dataset``.

        Returns:
            The object returned by ``self._cache.load()`` or
            ``self._dataset.load()``.

        Raises:
            KeyError: if the requested param or metric is missing from the run.
        """
        self._init_dataset()

        if self._cache and self._cache.exists():
            return self._cache.load()

        if self.file_caching and self._dataset.exists():
            return self._dataset.load()

        # Function-scope imports, following the lazy ``import mlflow`` this
        # method already relied on.
        import shutil

        import mlflow

        client = mlflow.tracking.MlflowClient(
            tracking_uri=self.loading_tracking_uri)

        self.loading_run_id = self.loading_run_id or mlflow.active_run(
        ).info.run_id

        if self.dataset in {"p", "m"}:
            run = client.get_run(self.loading_run_id)
            if self.dataset == "p":
                kind, logged_values = "param", run.data.params
            else:
                kind, logged_values = "metric", run.data.metrics

            value = logged_values.get(self.dataset_name, None)
            if value is None:
                raise KeyError("{} '{}' not found in run_id '{}'.".format(
                    kind, self.dataset_name, self.loading_run_id))

            # Persist the value locally so that it can be read back through
            # ``self._dataset`` below.
            PickleDataSet(filepath=self.filepath).save(value)

        else:
            p = Path(self.filepath)

            downloaded_path = client.download_artifacts(
                run_id=self.loading_run_id,
                path=p.name,
                dst_path=tempfile.gettempdir(),
            )
            if Path(downloaded_path) != p:
                # ``Path.rename`` raises OSError when the temp dir and the
                # target live on different filesystems; ``shutil.move`` falls
                # back to copy-then-delete in that case.
                shutil.move(str(downloaded_path), str(p))

        return self._dataset.load()
def test_model_packaging_too_many_artifacts(tmp_path,
                                            pipeline_inference_dummy):
    """Loading a model logged with an extra artifact (its input) must raise ``ValueError``."""
    raw_data_filepath = (tmp_path / "raw_data.pkl").resolve().as_posix()
    model_filepath = (tmp_path / "model.pkl").resolve().as_posix()
    catalog = DataCatalog(
        {
            "raw_data": PickleDataSet(filepath=raw_data_filepath),
            "data": MemoryDataSet(),
            "model": PickleDataSet(filepath=model_filepath),
        }
    )

    catalog._data_sets["raw_data"].save(1)  # emulate input on disk
    catalog._data_sets["model"].save(2)  # emulate model fitting

    # The input is persisted, so it ends up among the artifacts as well.
    artifacts = {}
    for name, dataset in catalog._data_sets.items():
        if isinstance(dataset, MemoryDataSet):
            continue
        # weird bug when directly converting PurePosixPath to windows:
        # it is considered as relative
        artifacts[name] = Path(dataset._filepath.as_posix()).resolve().as_uri()

    kedro_model = KedroPipelineModel(
        pipeline=pipeline_inference_dummy,
        catalog=catalog,
        input_name="raw_data",
    )

    mlflow.set_tracking_uri((tmp_path / "mlruns").as_uri())
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=artifacts,
            conda_env={"python": "3.7.0", "dependencies": ["kedro==0.16.5"]},
        )
        run_id = mlflow.active_run().info.run_id

    model_uri = Path(r"runs:/", run_id, "model").as_posix()
    expected_message = "Provided artifacts do not match catalog entries"
    with pytest.raises(ValueError, match=expected_message):
        mlflow.pyfunc.load_model(model_uri=model_uri)
Example #8
0
    def test_protocol_usage(self, filepath, instance_type):
        """Check the filesystem type and the protocol-less path kept by the dataset."""
        data_set = PickleDataSet(filepath=filepath)
        assert isinstance(data_set._fs, instance_type)

        # Keep what follows the first protocol delimiter, or the whole string
        # when the delimiter is absent.
        *_, expected_path = filepath.split(PROTOCOL_DELIMITER, 1)

        stored_path = data_set._filepath
        assert str(stored_path) == expected_path
        assert isinstance(stored_path, PurePosixPath)
def data_catalog(tmp_path):
    """Build a catalog with a Spark input, a Spark output and a pickle dataset."""
    test_parquet = Path(__file__).parent / "data/test.parquet"
    datasets = {
        "spark_in": SparkDataSet(test_parquet.as_posix()),
        "spark_out": SparkDataSet((tmp_path / "spark_data").as_posix()),
        "pickle_ds": PickleDataSet((tmp_path / "pickle/test.pkl").as_posix()),
    }
    return DataCatalog(datasets)
Example #10
0
 def test_invalid_backend(self, mocker):
     """A backend without ``load``/``dump`` must be rejected with ``ValueError``."""
     expected_message = (
         r"Selected backend 'invalid' should satisfy the pickle interface. "
         r"Missing one of `load` and `dump` on the backend."
     )
     # Make the backend import return ``object`` instead of a real module.
     import_target = (
         "kedro.extras.datasets.pickle.pickle_dataset.importlib.import_module"
     )
     mocker.patch(import_target, return_value=object)

     with pytest.raises(ValueError, match=expected_message):
         PickleDataSet(filepath="test.pkl", backend="invalid")
def dummy_catalog(tmp_path):
    """Return a catalog with two in-memory datasets and a pickled ``model``."""
    model_filepath = (tmp_path / "model.pkl").resolve().as_posix()
    return DataCatalog(
        {
            "raw_data": MemoryDataSet(),
            "data": MemoryDataSet(),
            "model": PickleDataSet(filepath=model_filepath),
        }
    )
Example #12
0
    def test_version_str_repr(self, load_version, save_version):
        """Check that ``str()`` shows the version only for a versioned dataset."""
        filepath = "test.pkl"
        plain = PickleDataSet(filepath=filepath)
        versioned = PickleDataSet(
            filepath=filepath, version=Version(load_version, save_version)
        )

        assert filepath in str(plain)
        assert "version" not in str(plain)

        assert filepath in str(versioned)
        expected_version = (
            f"version=Version(load={load_version}, save='{save_version}')"
        )
        assert expected_version in str(versioned)

        # Class name and protocol are expected in both representations.
        for token in ("PickleDataSet", "protocol"):
            assert token in str(versioned)
            assert token in str(plain)
Example #13
0
 def test_no_backend(self, mocker):
     """A backend that cannot be imported must raise ``ImportError``."""
     missing_backend = "fake.backend.does.not.exist"
     expected_message = (
         r"Selected backend 'fake.backend.does.not.exist' could not be imported. "
         r"Make sure it is installed and importable."
     )
     # Make the backend import fail.
     import_target = (
         "kedro.extras.datasets.pickle.pickle_dataset.importlib.import_module"
     )
     mocker.patch(import_target, side_effect=ImportError)

     with pytest.raises(ImportError, match=expected_message):
         PickleDataSet(filepath="test.pkl", backend=missing_backend)
def dummy_catalog(tmp_path):
    """Return a catalog with a one-cell DataFrame input, a param and a pickled ``model``."""
    model_filepath = (tmp_path / "model.csv").as_posix()
    return DataCatalog(
        {
            "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
            "params:unused_param": MemoryDataSet("blah"),
            "data": MemoryDataSet(),
            "model": PickleDataSet(model_filepath),
        }
    )
Example #15
0
    def test_protocol_usage(self, filepath, instance_type):
        """Check the filesystem type and the protocol-less path kept by the dataset."""
        data_set = PickleDataSet(filepath=filepath)
        assert isinstance(data_set._fs, instance_type)

        # _strip_protocol() doesn't strip http(s) protocol, so it is done by hand.
        expected_path = (
            filepath.split("://")[-1]
            if data_set._protocol == "https"
            else data_set._fs._strip_protocol(filepath)
        )

        stored_path = data_set._filepath
        assert str(stored_path) == expected_path
        assert isinstance(stored_path, PurePosixPath)
def dummy_catalog(tmp_path):
    """Return a catalog with an integer input, a param and a pickled ``model``."""
    model_filepath = (tmp_path / "model.csv").as_posix()
    return DataCatalog(
        {
            "raw_data": MemoryDataSet(1),
            "params:unused_param": MemoryDataSet("blah"),
            "data": MemoryDataSet(),
            "model": PickleDataSet(model_filepath),
        }
    )
Example #17
0
def dummy_catalog(tmp_path):
    """Return a catalog whose ``model`` dataset already holds a saved value."""
    model_file = tmp_path.joinpath("data", "06_models", "model.pkl")
    catalog = DataCatalog(
        {
            "raw_data": MemoryDataSet(),
            "data": MemoryDataSet(),
            "model": PickleDataSet(filepath=model_file.resolve().as_posix()),
        }
    )

    # Emulate model fitting: the persisted "model" is simply the integer 2.
    catalog._data_sets["model"].save(2)
    return catalog
Example #18
0
    def extract_pipeline_artifacts(self, catalog: DataCatalog,
                                   temp_folder: Path):
        """Map every pipeline dataset except the input to a local file URI.

        Datasets whose name starts with ``"params:"`` are first pickled into
        ``temp_folder`` (as ``params_<name>.pkl``); all other datasets are
        referenced through their own ``_filepath``.

        Returns:
            dict: dataset name -> file URI.
        """
        param_prefix = "params:"
        datasets = self._extract_pipeline_catalog(catalog)._data_sets

        artifacts = {}
        for name, dataset in datasets.items():
            if name == self.input_name:
                continue

            if not name.startswith(param_prefix):
                # Here we know it cannot be a MemoryDataSet.
                # weird bug when directly converting PurePosixPath to windows:
                # it is considered as relative
                local_path = Path(dataset._filepath.as_posix()).resolve()
                artifacts[name] = local_path.as_uri()
                continue

            # Parameters must be persisted locally for mlflow to access them.
            param_name = name[len(param_prefix):]
            param_file = temp_folder / f"params_{param_name}.pkl"
            PickleDataSet(filepath=param_file.as_posix()).save(dataset.load())
            artifacts[name] = param_file.as_uri()

        return artifacts
def catalog_with_parameters(tmp_path):
    """Return a catalog with in-memory data, three ``params:`` entries and a pickled ``model``."""
    model_filepath = (tmp_path / "model.pkl").resolve().as_posix()
    return DataCatalog(
        {
            "data": MemoryDataSet(),
            "cleaned_data": MemoryDataSet(),
            "params:stopwords": MemoryDataSet(["Hello", "Hi"]),
            "params:penalty": MemoryDataSet(0),
            "model": PickleDataSet(model_filepath),
            "params:threshold": MemoryDataSet(0.5),
        }
    )
Example #20
0
def dummy_catalog(tmp_path):
    """Return a catalog with data, a pickled ``model`` and six mlflow metric datasets."""
    model_filepath = (tmp_path / "model.csv").as_posix()
    datasets = {
        "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
        "params:unused_param": MemoryDataSet("blah"),
        "data": MemoryDataSet(),
        "model": PickleDataSet(model_filepath),
        # Each metric dataset type appears twice: with defaults, then with an
        # explicit prefix/key.
        "my_metrics": MlflowMetricsDataSet(),
        "another_metrics": MlflowMetricsDataSet(prefix="foo"),
        "my_metric": MlflowMetricDataSet(),
        "another_metric": MlflowMetricDataSet(key="foo"),
        "my_metric_history": MlflowMetricHistoryDataSet(),
        "another_metric_history": MlflowMetricHistoryDataSet(key="bar"),
    }
    return DataCatalog(datasets)
Example #21
0
    def test_parallel_runner_with_pickle_dataset(self, tmp_path, spark_in, spark_out):
        """Run SparkDataSet -> PickleDataSet -> SparkDataSet with ParallelRunner.

        The run is expected to fail with a ``DataSetError`` raised while
        saving to the ``PickleDataSet``.
        """
        catalog = DataCatalog(
            data_sets={
                "spark_in": spark_in,
                "pickle": PickleDataSet(filepath=str(tmp_path / "data.pkl")),
                "spark_out": spark_out,
            }
        )
        nodes = [
            node(identity, "spark_in", "pickle"),
            node(identity, "pickle", "spark_out"),
        ]
        pipeline = Pipeline(nodes)
        runner = ParallelRunner()

        expected_message = r"Failed while saving data to data set PickleDataSet"
        with pytest.raises(DataSetError, match=expected_message):
            runner.run(pipeline, catalog)
def test_mlflow_pipeline_hook_save_pipeline_ml_with_parameters(
    mocker,
    monkeypatch,
    config_dir,  # a fixture to be in a kedro project
    dummy_mlflow_conf,  # a fixture to setup mlflow configuration
    tmp_path,
    pipeline_ml_with_parameters,
    dummy_run_params,
):
    """Run a pipeline through ``MlflowPipelineHook`` and check the logged model.

    The logged model must list ``model`` and the two parameters it uses among
    its artifacts, and must be loadable and able to predict.
    """
    # config_with_base_mlflow_conf is a conftest fixture
    monkeypatch.chdir(tmp_path)

    mlflow_conf = get_mlflow_config(load_context(tmp_path))
    mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)

    model_filepath = (tmp_path / "model.csv").as_posix()
    catalog_with_parameters = DataCatalog(
        {
            "data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
            "cleaned_data": MemoryDataSet(),
            "params:stopwords": MemoryDataSet(["Hello", "Hi"]),
            "params:penalty": MemoryDataSet(0.1),
            "model": PickleDataSet(model_filepath),
            "params:threshold": MemoryDataSet(0.5),
        }
    )

    pipeline_hook = MlflowPipelineHook()
    runner = SequentialRunner()

    pipeline_hook.after_catalog_created(
        catalog=catalog_with_parameters,
        # `after_catalog_created` is not using any of the arguments below,
        # so we are setting them to empty values.
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )

    # The same keyword arguments are passed to both pipeline-run hooks.
    run_hook_kwargs = {
        "run_params": dummy_run_params,
        "pipeline": pipeline_ml_with_parameters,
        "catalog": catalog_with_parameters,
    }
    pipeline_hook.before_pipeline_run(**run_hook_kwargs)
    runner.run(pipeline_ml_with_parameters, catalog_with_parameters)

    current_run_id = mlflow.active_run().info.run_id

    # This is what we want to test: the model must be saved and the
    # parameters automatically persisted on disk.
    pipeline_hook.after_pipeline_run(**run_hook_kwargs)

    # The 2 parameters which are inputs of the inference pipeline
    # must have been persisted and logged inside the model's artifacts.
    model = mlflow.pyfunc.load_model(f"runs:/{current_run_id}/model")
    python_flavor = model.metadata.to_dict()["flavors"]["python_function"]
    expected_artifacts = {"model", "params:stopwords", "params:threshold"}
    assert set(python_flavor["artifacts"]) == expected_artifacts

    # The model should be loadable and predict() should work
    # (this tests KedroPipelineModel).
    prediction = model.predict(pd.DataFrame(data=[1], columns=["a"]))
    assert prediction.values[0][0] == 1
Example #23
0
def versioned_pickle_data_set(filepath_pickle, load_version, save_version):
    """Build a ``PickleDataSet`` versioned with the given load/save versions."""
    version = Version(load_version, save_version)
    return PickleDataSet(filepath=filepath_pickle, version=version)
Example #24
0
 def test_no_joblib(self, mocker):
     """With ``joblib`` registered as ``None``, the dataset must raise ``ImportError``."""
     backends_without_joblib = {"joblib": None}
     mocker.patch.object(PickleDataSet, "BACKENDS", backends_without_joblib)

     with pytest.raises(ImportError):
         PickleDataSet(filepath="test.pkl", backend="joblib")
Example #25
0
 def test_invalid_backend(self):
     """An unknown backend name must be rejected with ``ValueError``."""
     expected_message = (
         r"'backend' should be one of \['pickle', 'joblib'\], got 'invalid'\."
     )
     with pytest.raises(ValueError, match=expected_message):
         PickleDataSet(filepath="test.pkl", backend="invalid")
Example #26
0
 def test_catalog_release(self, mocker):
     """``release()`` must invalidate the filesystem cache once, for the dataset path."""
     fs_mock = mocker.patch("fsspec.filesystem").return_value
     filepath = "test.pkl"

     PickleDataSet(filepath=filepath).release()

     fs_mock.invalidate_cache.assert_called_once_with(filepath)
def test_mlflow_pipeline_hook_metrics_with_run_id(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    dummy_pipeline_ml,
    dummy_run_params,
    dummy_mlflow_conf,
):
    """Metrics datasets bound to an existing run id must log into that run."""
    # config_with_base_mlflow_conf is a conftest fixture
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)

    mlflow_conf = get_mlflow_config(load_context(tmp_path))
    mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)

    # Open and immediately close a run whose id the metrics datasets will target.
    with mlflow.start_run():
        existing_run_id = mlflow.active_run().info.run_id

    model_filepath = (tmp_path / "model.csv").as_posix()
    dummy_catalog_with_run_id = DataCatalog(
        {
            "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
            "params:unused_param": MemoryDataSet("blah"),
            "data": MemoryDataSet(),
            "model": PickleDataSet(model_filepath),
            "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
            "another_metrics": MlflowMetricsDataSet(
                run_id=existing_run_id, prefix="foo"
            ),
        }
    )

    pipeline_hook = MlflowPipelineHook()
    runner = SequentialRunner()

    pipeline_hook.after_catalog_created(
        catalog=dummy_catalog_with_run_id,
        # `after_catalog_created` is not using any of the arguments below,
        # so we are setting them to empty values.
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )

    # The same keyword arguments are passed to both pipeline-run hooks.
    run_hook_kwargs = {
        "run_params": dummy_run_params,
        "pipeline": dummy_pipeline_ml,
        "catalog": dummy_catalog_with_run_id,
    }
    pipeline_hook.before_pipeline_run(**run_hook_kwargs)
    runner.run(dummy_pipeline_ml, dummy_catalog_with_run_id)

    current_run_id = mlflow.active_run().info.run_id

    pipeline_hook.after_pipeline_run(**run_hook_kwargs)

    mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
    all_runs_id = {
        run_info.run_id
        for run_info in mlflow_client.list_run_infos(experiment_id="0")
    }

    # the metrics are supposed to have been logged inside existing_run_id
    logged_metrics = mlflow_client.get_run(existing_run_id).data.metrics

    # Check that the metrics are prefixed with the dataset name, or with the
    # explicit prefix when one was given.
    assert all_runs_id == {current_run_id, existing_run_id}
    assert logged_metrics["my_metrics.metric_key"] == 1.1
    assert logged_metrics["foo.metric_key"] == 1.1
Example #28
0
 def test_no_backend(self, mocker, backend):
     """With ``backend`` registered as ``None``, the dataset must raise ``ImportError``."""
     unavailable_backends = {backend: None}
     mocker.patch.object(PickleDataSet, "BACKENDS", unavailable_backends)

     with pytest.raises(ImportError):
         PickleDataSet(filepath="test.pkl", backend=backend)
Example #29
0
def test_mlflow_hook_metrics_dataset_with_run_id(
    kedro_project_with_mlflow_conf, dummy_pipeline, dummy_run_params
):
    """Metric datasets bound to an existing run id must log into that run."""
    project_path = kedro_project_with_mlflow_conf

    bootstrap_project(project_path)
    with KedroSession.create(project_path=project_path) as session:
        context = session.load_context()

        # Open and immediately close a run whose id the metric datasets will target.
        with mlflow.start_run():
            existing_run_id = mlflow.active_run().info.run_id

        model_filepath = (project_path / "data" / "model.csv").as_posix()
        dummy_catalog_with_run_id = DataCatalog(
            {
                "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
                "params:unused_param": MemoryDataSet("blah"),
                "data": MemoryDataSet(),
                "model": PickleDataSet(model_filepath),
                "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
                "another_metrics": MlflowMetricsDataSet(
                    run_id=existing_run_id, prefix="foo"
                ),
                "my_metric": MlflowMetricDataSet(run_id=existing_run_id),
                "another_metric": MlflowMetricDataSet(
                    run_id=existing_run_id, key="foo"
                ),
                "my_metric_history": MlflowMetricHistoryDataSet(
                    run_id=existing_run_id
                ),
                "another_metric_history": MlflowMetricHistoryDataSet(
                    run_id=existing_run_id, key="bar"
                ),
            }
        )

        mlflow_hook = MlflowHook()
        runner = SequentialRunner()

        mlflow_hook.after_context_created(context)
        mlflow_hook.after_catalog_created(
            catalog=dummy_catalog_with_run_id,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
        )

        # The same keyword arguments are passed to both pipeline-run hooks.
        run_hook_kwargs = {
            "run_params": dummy_run_params,
            "pipeline": dummy_pipeline,
            "catalog": dummy_catalog_with_run_id,
        }
        mlflow_hook.before_pipeline_run(**run_hook_kwargs)
        runner.run(dummy_pipeline, dummy_catalog_with_run_id, session._hook_manager)

        current_run_id = mlflow.active_run().info.run_id

        mlflow_hook.after_pipeline_run(**run_hook_kwargs)

        mlflow_client = MlflowClient(context.mlflow.server.mlflow_tracking_uri)
        # the first run is created in Default (id 0),
        # but the one initialised in before_pipeline_run
        # is created in kedro_project experiment (id 1)
        all_runs_id = {
            run_info.run_id
            for experiment_id in ("0", "1")
            for run_info in mlflow_client.list_run_infos(experiment_id=experiment_id)
        }

        # the metrics are supposed to have been logged inside existing_run_id
        logged_metrics = mlflow_client.get_run(existing_run_id).data.metrics

        assert all_runs_id == {current_run_id, existing_run_id}

        # Metrics are stored under the dataset name, or under the explicit
        # prefix/key when one was given.
        assert logged_metrics["my_metrics.metric_key"] == 1.1
        assert logged_metrics["foo.metric_key"] == 1.1
        assert logged_metrics["my_metric"] == 1.1
        assert logged_metrics["foo"] == 1.1
        # the list is stored, but only the last value is retrieved
        assert logged_metrics["my_metric_history"] == 0.2
        # the list is stored, but only the last value is retrieved
        assert logged_metrics["bar"] == 0.2
Example #30
0
 def test_no_compress_pickle(self, mocker):
     """With ``compress_pickle`` registered as ``None``, the dataset must raise ``ImportError``."""
     backends_without_compress_pickle = {"compress_pickle": None}
     mocker.patch.object(
         PickleDataSet, "BACKENDS", backends_without_compress_pickle
     )

     with pytest.raises(ImportError):
         PickleDataSet(filepath="test.pkl", backend="compress_pickle")