def test_model_packaging(tmp_path, pipeline_ml_obj):
    """Log a ``KedroPipelineModel`` to MLflow, reload it and check its prediction."""
    model_filepath = (tmp_path / "model.pkl").resolve().as_posix()
    catalog = DataCatalog(
        {
            "raw_data": MemoryDataSet(),
            "data": MemoryDataSet(),
            "model": PickleDataSet(filepath=model_filepath),
        }
    )
    # Emulate model fitting by persisting a dummy "model" on disk.
    catalog._data_sets["model"].save(2)

    artifacts = pipeline_ml_obj.extract_pipeline_artifacts(catalog)
    kedro_model = KedroPipelineModel(pipeline_ml=pipeline_ml_obj, catalog=catalog)

    mlflow.set_tracking_uri((tmp_path / "mlruns").as_uri())
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=artifacts,
            conda_env={"python": "3.7.0"},
        )
        # Read the id while the run is still active.
        run_id = mlflow.active_run().info.run_id

    model_uri = (Path(r"runs:/") / run_id / "model").as_posix()
    loaded_model = mlflow.pyfunc.load_model(model_uri=model_uri)
    assert loaded_model.predict(1) == {"predictions": 2}
def pickle_data_set(filepath_pickle, load_args, save_args, fs_args):
    """Build a ``PickleDataSet`` from the given path and argument dictionaries."""
    options = {
        "load_args": load_args,
        "save_args": save_args,
        "fs_args": fs_args,
    }
    return PickleDataSet(filepath=filepath_pickle, **options)
def __init__(self, layers):
    """Populate ``_data_sets`` with three fixed entries and store ``layers``.

    Args:
        layers: Stored unchanged on ``self.layers``.
    """
    # NOTE(review): ``tmp_path`` is not a parameter of this method, so it has
    # to come from an enclosing scope -- confirm against the surrounding code.
    pickle_location = str(tmp_path)
    self._data_sets = {
        "cat": PickleDataSet(filepath=pickle_location),
        "parameters": MemoryDataSet("value"),
        "params:rabbit": MemoryDataSet("value"),
    }
    self.layers = layers
def test_http_filesystem_no_versioning(self):
    """A versioned ``PickleDataSet`` on an HTTPS path must raise ``DataSetError``."""
    expected_message = r"HTTP\(s\) DataSet doesn't support versioning\."
    remote_file = "https://example.com/file.pkl"
    with pytest.raises(DataSetError, match=expected_message):
        PickleDataSet(filepath=remote_file, version=Version(None, None))
def test_model_packaging_missing_artifacts(tmp_path, pipeline_inference_dummy):
    """Loading a model logged without artifacts must raise ``ValueError``."""
    model_filepath = (tmp_path / "model.pkl").resolve().as_posix()
    catalog = DataCatalog(
        {
            "raw_data": MemoryDataSet(),
            "data": MemoryDataSet(),
            "model": PickleDataSet(filepath=model_filepath),
        }
    )
    kedro_model = KedroPipelineModel(
        pipeline=pipeline_inference_dummy,
        catalog=catalog,
        input_name="raw_data",
    )

    mlflow.set_tracking_uri((tmp_path / "mlruns").as_uri())
    conda_env = {"python": "3.7.0", "dependencies": ["kedro==0.16.5"]}
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=None,  # deliberately log the model without any artifact
            conda_env=conda_env,
        )
        # Read the id while the run is still active.
        run_id = mlflow.active_run().info.run_id

    model_uri = (Path(r"runs:/") / run_id / "model").as_posix()
    with pytest.raises(
        ValueError, match="Provided artifacts do not match catalog entries"
    ):
        mlflow.pyfunc.load_model(model_uri=model_uri)
def _load(self): self._init_dataset() if self._cache and self._cache.exists(): return self._cache.load() if self.file_caching and self._dataset.exists(): return self._dataset.load() import mlflow client = mlflow.tracking.MlflowClient( tracking_uri=self.loading_tracking_uri) self.loading_run_id = self.loading_run_id or mlflow.active_run( ).info.run_id if self.dataset in {"p"}: run = client.get_run(self.loading_run_id) value = run.data.params.get(self.dataset_name, None) if value is None: raise KeyError("param '{}' not found in run_id '{}'.".format( self.dataset_name, self.loading_run_id)) PickleDataSet(filepath=self.filepath).save(value) elif self.dataset in {"m"}: run = client.get_run(self.loading_run_id) value = run.data.metrics.get(self.dataset_name, None) if value is None: raise KeyError("metric '{}' not found in run_id '{}'.".format( self.dataset_name, self.loading_run_id)) PickleDataSet(filepath=self.filepath).save(value) else: p = Path(self.filepath) dst_path = tempfile.gettempdir() downloaded_path = client.download_artifacts( run_id=self.loading_run_id, path=p.name, dst_path=dst_path, ) if Path(downloaded_path) != p: Path(downloaded_path).rename(p) return self._dataset.load()
def test_model_packaging_too_many_artifacts(tmp_path, pipeline_inference_dummy):
    """Loading a model logged with a persisted input as artifact must raise ``ValueError``."""
    raw_data_filepath = (tmp_path / "raw_data.pkl").resolve().as_posix()
    model_filepath = (tmp_path / "model.pkl").resolve().as_posix()
    catalog = DataCatalog(
        {
            "raw_data": PickleDataSet(filepath=raw_data_filepath),
            "data": MemoryDataSet(),
            "model": PickleDataSet(filepath=model_filepath),
        }
    )
    catalog._data_sets["raw_data"].save(1)  # emulate input on disk
    catalog._data_sets["model"].save(2)  # emulate model fitting

    # The input is persisted, so it ends up among the artifacts below.
    artifacts = {}
    for name, dataset in catalog._data_sets.items():
        if isinstance(dataset, MemoryDataSet):
            continue
        # Weird bug when directly converting PurePosixPath to Windows: it is
        # considered relative, hence the detour through a concrete Path.
        artifacts[name] = Path(dataset._filepath.as_posix()).resolve().as_uri()

    kedro_model = KedroPipelineModel(
        pipeline=pipeline_inference_dummy,
        catalog=catalog,
        input_name="raw_data",
    )

    mlflow.set_tracking_uri((tmp_path / "mlruns").as_uri())
    conda_env = {"python": "3.7.0", "dependencies": ["kedro==0.16.5"]}
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=artifacts,
            conda_env=conda_env,
        )
        # Read the id while the run is still active.
        run_id = mlflow.active_run().info.run_id

    model_uri = (Path(r"runs:/") / run_id / "model").as_posix()
    with pytest.raises(
        ValueError, match="Provided artifacts do not match catalog entries"
    ):
        mlflow.pyfunc.load_model(model_uri=model_uri)
def test_protocol_usage(self, filepath, instance_type):
    """Check the filesystem type and that the stored path has its protocol stripped."""
    data_set = PickleDataSet(filepath=filepath)
    assert isinstance(data_set._fs, instance_type)

    # Everything after the first protocol delimiter (or the whole string if
    # there is none) is the path the data set is expected to keep.
    *_, expected_path = filepath.split(PROTOCOL_DELIMITER, 1)
    stored_path = data_set._filepath
    assert str(stored_path) == expected_path
    assert isinstance(stored_path, PurePosixPath)
def data_catalog(tmp_path):
    """Return a catalog with a Spark input, a Spark output and a pickle data set."""
    parquet_source = Path(__file__).parent / "data/test.parquet"
    spark_target = tmp_path / "spark_data"
    pickle_target = tmp_path / "pickle/test.pkl"
    return DataCatalog(
        {
            "spark_in": SparkDataSet(parquet_source.as_posix()),
            "spark_out": SparkDataSet(spark_target.as_posix()),
            "pickle_ds": PickleDataSet(pickle_target.as_posix()),
        }
    )
def test_invalid_backend(self, mocker):
    """A backend module lacking ``load``/``dump`` must be rejected with ``ValueError``.

    ``importlib.import_module`` is patched to return ``object``, so the
    "imported" backend has neither ``load`` nor ``dump``.
    """
    # Periods are escaped so the regex matches the literal message rather
    # than "any character" at those positions.
    pattern = (
        r"Selected backend 'invalid' should satisfy the pickle interface\. "
        r"Missing one of `load` and `dump` on the backend\."
    )
    mocker.patch(
        "kedro.extras.datasets.pickle.pickle_dataset.importlib.import_module",
        return_value=object,
    )
    with pytest.raises(ValueError, match=pattern):
        PickleDataSet(filepath="test.pkl", backend="invalid")
def dummy_catalog(tmp_path):
    """Return a catalog with two in-memory entries and a pickled ``model`` entry."""
    model_filepath = (tmp_path / "model.pkl").resolve().as_posix()
    return DataCatalog(
        {
            "raw_data": MemoryDataSet(),
            "data": MemoryDataSet(),
            "model": PickleDataSet(filepath=model_filepath),
        }
    )
def test_version_str_repr(self, load_version, save_version):
    """The version must appear in ``str()`` of a data set only when one is set."""
    filepath = "test.pkl"
    plain = PickleDataSet(filepath=filepath)
    versioned = PickleDataSet(
        filepath=filepath, version=Version(load_version, save_version)
    )

    assert filepath in str(plain)
    assert "version" not in str(plain)

    assert filepath in str(versioned)
    version_template = "version=Version(load={}, save='{}')"
    assert version_template.format(load_version, save_version) in str(versioned)

    # Class name and protocol are shown for both flavours.
    for keyword in ("PickleDataSet", "protocol"):
        assert keyword in str(versioned)
        assert keyword in str(plain)
def test_no_backend(self, mocker):
    """A backend that cannot be imported must surface as ``ImportError``.

    ``importlib.import_module`` is patched to raise ``ImportError`` so the
    test does not depend on which packages are installed.
    """
    # Periods are escaped so the regex matches the literal message rather
    # than "any character" at those positions.
    pattern = (
        r"Selected backend 'fake\.backend\.does\.not\.exist' could not be imported\. "
        r"Make sure it is installed and importable\."
    )
    mocker.patch(
        "kedro.extras.datasets.pickle.pickle_dataset.importlib.import_module",
        side_effect=ImportError,
    )
    with pytest.raises(ImportError, match=pattern):
        PickleDataSet(filepath="test.pkl", backend="fake.backend.does.not.exist")
def dummy_catalog(tmp_path):
    """Return a catalog with a one-cell DataFrame input, a parameter and a pickled model."""
    raw_frame = pd.DataFrame(data=[1], columns=["a"])
    model_filepath = (tmp_path / "model.csv").as_posix()
    return DataCatalog(
        {
            "raw_data": MemoryDataSet(raw_frame),
            "params:unused_param": MemoryDataSet("blah"),
            "data": MemoryDataSet(),
            "model": PickleDataSet(model_filepath),
        }
    )
def test_protocol_usage(self, filepath, instance_type):
    """Check the filesystem type and that the stored path has its protocol stripped."""
    data_set = PickleDataSet(filepath=filepath)
    assert isinstance(data_set._fs, instance_type)

    # _strip_protocol() doesn't strip the http(s) protocol, so the expected
    # path is derived by hand for the "https" case.
    expected_path = (
        filepath.split("://")[-1]
        if data_set._protocol == "https"
        else data_set._fs._strip_protocol(filepath)
    )

    stored_path = data_set._filepath
    assert str(stored_path) == expected_path
    assert isinstance(stored_path, PurePosixPath)
def dummy_catalog(tmp_path):
    """Return a catalog with a scalar input, a parameter and a pickled model."""
    model_filepath = (tmp_path / "model.csv").as_posix()
    entries = {
        "raw_data": MemoryDataSet(1),
        "params:unused_param": MemoryDataSet("blah"),
        "data": MemoryDataSet(),
        "model": PickleDataSet(model_filepath),
    }
    return DataCatalog(entries)
def dummy_catalog(tmp_path):
    """Return a catalog whose ``model`` entry already holds a saved dummy value."""
    model_file = tmp_path / "data" / "06_models" / "model.pkl"
    catalog = DataCatalog(
        {
            "raw_data": MemoryDataSet(),
            "data": MemoryDataSet(),
            "model": PickleDataSet(filepath=model_file.resolve().as_posix()),
        }
    )
    # Emulate model fitting by persisting a dummy "model".
    catalog._data_sets["model"].save(2)
    return catalog
def extract_pipeline_artifacts(self, catalog: DataCatalog, temp_folder: Path):
    """Map every non-input dataset of the pipeline catalog to a file URI.

    Datasets whose name starts with ``"params:"`` are first pickled into
    ``temp_folder`` (as ``params_<name>.pkl``) so that a file exists to point
    at; every other dataset is expected to expose a ``_filepath``.

    Args:
        catalog: Catalog passed to ``self._extract_pipeline_catalog``.
        temp_folder: Folder in which parameter values are pickled.

    Returns:
        Dict mapping dataset name to the ``file://`` URI of its backing file.
        The dataset named ``self.input_name`` is left out.
    """
    params_prefix = "params:"
    pipeline_catalog = self._extract_pipeline_catalog(catalog)

    artifacts = {}
    for name, dataset in pipeline_catalog._data_sets.items():
        if name == self.input_name:
            continue

        if name.startswith(params_prefix):
            # Parameters must be persisted locally for mlflow to access them.
            # The folder is resolved first because as_uri() raises ValueError
            # on a relative path.
            param_name = name[len(params_prefix):]
            param_path = temp_folder.resolve() / f"params_{param_name}.pkl"
            PickleDataSet(filepath=param_path.as_posix()).save(dataset.load())
            artifacts[name] = param_path.as_uri()
        else:
            # In this second case, we know it cannot be a MemoryDataSet.
            # Weird bug when directly converting PurePosixPath to Windows: it
            # is considered relative, hence the detour through a concrete Path.
            artifacts[name] = Path(dataset._filepath.as_posix()).resolve().as_uri()

    return artifacts
def catalog_with_parameters(tmp_path):
    """Return a catalog mixing data entries, three parameters and a pickled model."""
    model_filepath = (tmp_path / "model.pkl").resolve().as_posix()
    entries = {
        "data": MemoryDataSet(),
        "cleaned_data": MemoryDataSet(),
        "params:stopwords": MemoryDataSet(["Hello", "Hi"]),
        "params:penalty": MemoryDataSet(0),
        "model": PickleDataSet(model_filepath),
        "params:threshold": MemoryDataSet(0.5),
    }
    return DataCatalog(entries)
def dummy_catalog(tmp_path):
    """Return a catalog with an input frame, a pickled model and six MLflow metric data sets."""
    raw_frame = pd.DataFrame(data=[1], columns=["a"])
    model_filepath = (tmp_path / "model.csv").as_posix()
    entries = {
        "raw_data": MemoryDataSet(raw_frame),
        "params:unused_param": MemoryDataSet("blah"),
        "data": MemoryDataSet(),
        "model": PickleDataSet(model_filepath),
        # Each metric data set flavour appears once with defaults and once
        # with an explicit prefix/key.
        "my_metrics": MlflowMetricsDataSet(),
        "another_metrics": MlflowMetricsDataSet(prefix="foo"),
        "my_metric": MlflowMetricDataSet(),
        "another_metric": MlflowMetricDataSet(key="foo"),
        "my_metric_history": MlflowMetricHistoryDataSet(),
        "another_metric_history": MlflowMetricHistoryDataSet(key="bar"),
    }
    return DataCatalog(entries)
def test_parallel_runner_with_pickle_dataset(self, tmp_path, spark_in, spark_out):
    """Run SparkDataSet -> PickleDataSet -> SparkDataSet with ParallelRunner.

    The run is expected to fail with a ``DataSetError`` raised while saving
    to the pickle data set.
    """
    data_sets = {
        "spark_in": spark_in,
        "pickle": PickleDataSet(filepath=str(tmp_path / "data.pkl")),
        "spark_out": spark_out,
    }
    catalog = DataCatalog(data_sets=data_sets)

    nodes = [
        node(identity, "spark_in", "pickle"),
        node(identity, "pickle", "spark_out"),
    ]
    pipeline = Pipeline(nodes)

    runner = ParallelRunner()
    expected_message = r"Failed while saving data to data set PickleDataSet"
    with pytest.raises(DataSetError, match=expected_message):
        runner.run(pipeline, catalog)
def test_mlflow_pipeline_hook_save_pipeline_ml_with_parameters(
    mocker,
    monkeypatch,
    config_dir,  # a fixture to be in a kedro project
    dummy_mlflow_conf,  # a fixture to setup mlflow configuration
    tmp_path,
    pipeline_ml_with_parameters,
    dummy_run_params,
):
    """After the run, the logged model must carry the inference parameters as artifacts.

    The hook methods are invoked by hand around a ``SequentialRunner`` run.
    """
    # config_with_base_mlflow_conf is a conftest fixture
    monkeypatch.chdir(tmp_path)

    context = load_context(tmp_path)
    mlflow_conf = get_mlflow_config(context)
    mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)

    input_frame = pd.DataFrame(data=[1], columns=["a"])
    model_filepath = (tmp_path / "model.csv").as_posix()
    catalog_with_parameters = DataCatalog(
        {
            "data": MemoryDataSet(input_frame),
            "cleaned_data": MemoryDataSet(),
            "params:stopwords": MemoryDataSet(["Hello", "Hi"]),
            "params:penalty": MemoryDataSet(0.1),
            "model": PickleDataSet(model_filepath),
            "params:threshold": MemoryDataSet(0.5),
        }
    )

    pipeline_hook = MlflowPipelineHook()
    runner = SequentialRunner()

    # `after_catalog_created` is not using any of the arguments besides
    # `catalog`, so the others are set to empty values.
    pipeline_hook.after_catalog_created(
        catalog=catalog_with_parameters,
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )

    pipeline_run_kwargs = {
        "run_params": dummy_run_params,
        "pipeline": pipeline_ml_with_parameters,
        "catalog": catalog_with_parameters,
    }
    pipeline_hook.before_pipeline_run(**pipeline_run_kwargs)
    runner.run(pipeline_ml_with_parameters, catalog_with_parameters)

    current_run_id = mlflow.active_run().info.run_id

    # This is what we want to test: the model must be saved and the
    # parameters automatically persisted on disk.
    pipeline_hook.after_pipeline_run(**pipeline_run_kwargs)

    # The 2 parameters which are inputs of the inference pipeline must have
    # been persisted and logged inside the model's artifacts.
    model = mlflow.pyfunc.load_model(f"runs:/{current_run_id}/model")
    python_function_flavor = model.metadata.to_dict()["flavors"]["python_function"]
    logged_artifacts = python_function_flavor["artifacts"]
    assert set(logged_artifacts.keys()) == {
        "model",
        "params:stopwords",
        "params:threshold",
    }

    # The model should be loadable and predict() should work (this tests
    # KedroPipelineModel).
    prediction = model.predict(pd.DataFrame(data=[1], columns=["a"]))
    assert prediction.values[0][0] == 1
def versioned_pickle_data_set(filepath_pickle, load_version, save_version):
    """Build a ``PickleDataSet`` pinned to the given load/save versions."""
    version = Version(load_version, save_version)
    return PickleDataSet(filepath=filepath_pickle, version=version)
def test_no_joblib(self, mocker):
    """With the ``joblib`` backend patched to ``None``, construction must raise ``ImportError``."""
    unavailable_backends = {"joblib": None}
    mocker.patch.object(PickleDataSet, "BACKENDS", unavailable_backends)
    with pytest.raises(ImportError):
        PickleDataSet(filepath="test.pkl", backend="joblib")
def test_invalid_backend(self):
    """An unknown backend name must be rejected with a ``ValueError``."""
    expected_message = (
        r"'backend' should be one of \['pickle', 'joblib'\], got 'invalid'\."
    )
    with pytest.raises(ValueError, match=expected_message):
        PickleDataSet(filepath="test.pkl", backend="invalid")
def test_catalog_release(self, mocker):
    """``release()`` must invalidate the filesystem cache exactly once, for the file path."""
    filepath = "test.pkl"
    patched_factory = mocker.patch("fsspec.filesystem")
    fs_mock = patched_factory.return_value

    PickleDataSet(filepath=filepath).release()

    fs_mock.invalidate_cache.assert_called_once_with(filepath)
def test_mlflow_pipeline_hook_metrics_with_run_id(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    dummy_pipeline_ml,
    dummy_run_params,
    dummy_mlflow_conf,
):
    """Metrics data sets bound to an existing run id must log into that run.

    The expected metric values asserted at the end come from the fixture
    pipeline, which is not visible here.
    """
    # config_with_base_mlflow_conf is a conftest fixture
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)

    context = load_context(tmp_path)
    mlflow_conf = get_mlflow_config(context)
    mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)

    # Create a first run whose id the metrics data sets are bound to.
    with mlflow.start_run():
        existing_run_id = mlflow.active_run().info.run_id

    raw_frame = pd.DataFrame(data=[1], columns=["a"])
    model_filepath = (tmp_path / "model.csv").as_posix()
    dummy_catalog_with_run_id = DataCatalog(
        {
            "raw_data": MemoryDataSet(raw_frame),
            "params:unused_param": MemoryDataSet("blah"),
            "data": MemoryDataSet(),
            "model": PickleDataSet(model_filepath),
            "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
            "another_metrics": MlflowMetricsDataSet(
                run_id=existing_run_id, prefix="foo"
            ),
        }
    )

    pipeline_hook = MlflowPipelineHook()
    runner = SequentialRunner()

    # `after_catalog_created` is not using any of the arguments besides
    # `catalog`, so the others are set to empty values.
    pipeline_hook.after_catalog_created(
        catalog=dummy_catalog_with_run_id,
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )

    pipeline_run_kwargs = {
        "run_params": dummy_run_params,
        "pipeline": dummy_pipeline_ml,
        "catalog": dummy_catalog_with_run_id,
    }
    pipeline_hook.before_pipeline_run(**pipeline_run_kwargs)
    runner.run(dummy_pipeline_ml, dummy_catalog_with_run_id)

    current_run_id = mlflow.active_run().info.run_id

    pipeline_hook.after_pipeline_run(**pipeline_run_kwargs)

    mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
    all_runs_id = {
        run.run_id for run in mlflow_client.list_run_infos(experiment_id="0")
    }

    # The metrics are supposed to have been logged inside existing_run_id.
    run_data = mlflow_client.get_run(existing_run_id).data

    # Check that the metrics data sets prefix their keys with their names.
    assert all_runs_id == {current_run_id, existing_run_id}
    for metric_key in ("my_metrics.metric_key", "foo.metric_key"):
        assert run_data.metrics[metric_key] == 1.1
def test_no_backend(self, mocker, backend):
    """With the given backend patched to ``None``, construction must raise ``ImportError``."""
    unavailable_backends = {backend: None}
    mocker.patch.object(PickleDataSet, "BACKENDS", unavailable_backends)
    with pytest.raises(ImportError):
        PickleDataSet(filepath="test.pkl", backend=backend)
def test_mlflow_hook_metrics_dataset_with_run_id(
    kedro_project_with_mlflow_conf, dummy_pipeline, dummy_run_params
):
    """Metric data sets bound to an existing run id must log into that run.

    The expected metric values asserted at the end come from the fixture
    pipeline, which is not visible here.
    """
    bootstrap_project(kedro_project_with_mlflow_conf)
    with KedroSession.create(project_path=kedro_project_with_mlflow_conf) as session:
        context = session.load_context()

        # Create a first run whose id the metric data sets are bound to.
        with mlflow.start_run():
            existing_run_id = mlflow.active_run().info.run_id

        raw_frame = pd.DataFrame(data=[1], columns=["a"])
        model_filepath = (
            kedro_project_with_mlflow_conf / "data" / "model.csv"
        ).as_posix()
        dummy_catalog_with_run_id = DataCatalog(
            {
                "raw_data": MemoryDataSet(raw_frame),
                "params:unused_param": MemoryDataSet("blah"),
                "data": MemoryDataSet(),
                "model": PickleDataSet(model_filepath),
                "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
                "another_metrics": MlflowMetricsDataSet(
                    run_id=existing_run_id, prefix="foo"
                ),
                "my_metric": MlflowMetricDataSet(run_id=existing_run_id),
                "another_metric": MlflowMetricDataSet(
                    run_id=existing_run_id, key="foo"
                ),
                "my_metric_history": MlflowMetricHistoryDataSet(
                    run_id=existing_run_id
                ),
                "another_metric_history": MlflowMetricHistoryDataSet(
                    run_id=existing_run_id, key="bar"
                ),
            }
        )

        mlflow_hook = MlflowHook()
        runner = SequentialRunner()
        mlflow_hook.after_context_created(context)

        # `after_catalog_created` is not using any of the arguments besides
        # `catalog`, so the others are set to empty values.
        mlflow_hook.after_catalog_created(
            catalog=dummy_catalog_with_run_id,
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
        )

        pipeline_run_kwargs = {
            "run_params": dummy_run_params,
            "pipeline": dummy_pipeline,
            "catalog": dummy_catalog_with_run_id,
        }
        mlflow_hook.before_pipeline_run(**pipeline_run_kwargs)
        runner.run(dummy_pipeline, dummy_catalog_with_run_id, session._hook_manager)

        current_run_id = mlflow.active_run().info.run_id

        mlflow_hook.after_pipeline_run(**pipeline_run_kwargs)

        mlflow_client = MlflowClient(context.mlflow.server.mlflow_tracking_uri)

        # The first run is created in Default (id 0), but the one initialised
        # in before_pipeline_run is created in the kedro_project experiment
        # (id 1), so both experiments are scanned.
        all_runs_id = {
            run.run_id
            for k in range(2)
            for run in mlflow_client.list_run_infos(experiment_id=f"{k}")
        }

        # The metrics are supposed to have been logged inside existing_run_id.
        run_data = mlflow_client.get_run(existing_run_id).data

        # Check that the metrics data sets prefix their keys with their names.
        # For the history data sets the list is stored, but only the last
        # value is retrieved.
        expected_metrics = {
            "my_metrics.metric_key": 1.1,
            "foo.metric_key": 1.1,
            "my_metric": 1.1,
            "foo": 1.1,
            "my_metric_history": 0.2,
            "bar": 0.2,
        }
        assert all_runs_id == {current_run_id, existing_run_id}
        for metric_key, expected_value in expected_metrics.items():
            assert run_data.metrics[metric_key] == expected_value
def test_no_compress_pickle(self, mocker):
    """With the ``compress_pickle`` backend patched to ``None``, construction must raise ``ImportError``."""
    unavailable_backends = {"compress_pickle": None}
    mocker.patch.object(PickleDataSet, "BACKENDS", unavailable_backends)
    with pytest.raises(ImportError):
        PickleDataSet(filepath="test.pkl", backend="compress_pickle")