# Shared imports for the snippets below. The ``kedro_mlflow`` names assume
# the kedro-mlflow plugin is installed alongside kedro.
import tempfile
from pathlib import Path, PurePosixPath
from typing import Optional

import mlflow
import pandas as pd
import pytest

from kedro.extras.datasets.pickle import PickleDataSet
from kedro.extras.datasets.spark import SparkDataSet
from kedro.io import DataCatalog, DataSetError, MemoryDataSet, Version
from kedro.io.core import PROTOCOL_DELIMITER
from kedro.pipeline import Pipeline, node
from kedro.runner import ParallelRunner
from kedro_mlflow.io.metrics import (
    MlflowMetricDataSet,
    MlflowMetricHistoryDataSet,
    MlflowMetricsDataSet,
)
from kedro_mlflow.mlflow import KedroPipelineModel


def identity(arg):
    """Pass-through helper used by the ParallelRunner test below."""
    return arg


@pytest.fixture
def catalog_with_stopwords(tmp_path):
    catalog_with_stopwords = DataCatalog(
        {
            "data": MemoryDataSet(),
            "cleaned_data": MemoryDataSet(),
            "stopwords_from_nltk": PickleDataSet(
                (tmp_path / "stopwords.pkl").resolve().as_posix()
            ),
            "model": PickleDataSet((tmp_path / "model.pkl").resolve().as_posix()),
        }
    )
    return catalog_with_stopwords

@pytest.fixture
def catalog_with_encoder(tmp_path):
    catalog_with_encoder = DataCatalog(
        {
            "raw_data": MemoryDataSet(),
            "data": MemoryDataSet(),
            "encoder": PickleDataSet((tmp_path / "encoder.pkl").resolve().as_posix()),
            "model": PickleDataSet((tmp_path / "model.pkl").resolve().as_posix()),
        }
    )
    return catalog_with_encoder

@pytest.fixture
def new_catalog_with_layers():
    data_sets = {
        "bob_in": PickleDataSet("raw.csv"),
        "params:key": MemoryDataSet("value"),
        "result": PickleDataSet("final.csv"),
    }
    layers = {"raw": {"bob_in"}, "final": {"result"}}
    catalog = DataCatalog(data_sets=data_sets)
    catalog.layers = layers  # new-style: layers live on the catalog itself
    return catalog

@pytest.fixture
def old_catalog_with_layers():
    data_sets = {
        "bob_in": PickleDataSet("raw.csv"),
        "params:key": MemoryDataSet("value"),
        "result": PickleDataSet("final.csv"),
    }
    # old-style: each dataset carries its own private ``_layer`` attribute
    data_sets["bob_in"]._layer = "raw"
    data_sets["result"]._layer = "final"
    catalog = DataCatalog(data_sets=data_sets)
    # make sure no new-style ``layers`` attribute is left on the catalog
    catalog.__dict__.pop("layers", None)
    return catalog

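# A minimal sketch (not from the source) of how a consumer could normalise
# the two conventions exercised by the fixtures above into the new-style
# ``{layer: {dataset_name, ...}}`` mapping. ``_resolve_layers`` is a
# hypothetical helper name.
def _resolve_layers(catalog):
    layers = getattr(catalog, "layers", None)
    if layers is not None:
        return layers
    resolved = {}
    for name, dataset in catalog._data_sets.items():
        layer = getattr(dataset, "_layer", None)
        if layer is not None:
            resolved.setdefault(layer, set()).add(name)
    return resolved
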
def test_model_packaging(tmp_path, pipeline_ml_obj):
    catalog = DataCatalog(
        {
            "raw_data": MemoryDataSet(),
            "data": MemoryDataSet(),
            "model": PickleDataSet(
                filepath=(tmp_path / "model.pkl").resolve().as_posix()
            ),
        }
    )
    catalog._data_sets["model"].save(2)  # emulate model fitting

    artifacts = pipeline_ml_obj.extract_pipeline_artifacts(catalog)
    kedro_model = KedroPipelineModel(pipeline_ml=pipeline_ml_obj, catalog=catalog)

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=artifacts,
            conda_env={"python": "3.7.0"},
        )
        run_id = mlflow.active_run().info.run_id

    loaded_model = mlflow.pyfunc.load_model(
        model_uri=(Path(r"runs:/") / run_id / "model").as_posix()
    )
    assert loaded_model.predict(1) == 2

def test_http_filesystem_no_versioning(self):
    pattern = r"HTTP\(s\) DataSet doesn't support versioning\."
    with pytest.raises(DataSetError, match=pattern):
        PickleDataSet(
            filepath="https://example.com/file.pkl", version=Version(None, None)
        )

def test_model_packaging_missing_artifacts(tmp_path, pipeline_ml_obj):
    catalog = DataCatalog(
        {
            "raw_data": MemoryDataSet(),
            "data": MemoryDataSet(),
            "model": PickleDataSet(
                filepath=(tmp_path / "model.pkl").resolve().as_posix()
            ),
        }
    )

    kedro_model = KedroPipelineModel(pipeline_ml=pipeline_ml_obj, catalog=catalog)

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=None,  # no artifacts provided
            conda_env={"python": "3.7.0"},
        )
        run_id = mlflow.active_run().info.run_id

    with pytest.raises(
        ValueError, match="Provided artifacts do not match catalog entries"
    ):
        mlflow.pyfunc.load_model(
            model_uri=(Path(r"runs:/") / run_id / "model").as_posix()
        )

def __init__(self, layers):
    # ``tmp_path`` is resolved from the enclosing test scope: this dummy
    # catalog class is defined inside a test that has the fixture in scope.
    self._data_sets = {
        "cat": PickleDataSet(filepath=str(tmp_path)),
        "parameters": MemoryDataSet({"name": "value"}),
        "params:rabbit": MemoryDataSet("value"),
    }
    self.layers = layers

def extract_pipeline_artifacts(self, parameters_saving_folder: Optional[Path] = None):
    artifacts = {}
    for name, dataset in self.initial_catalog._data_sets.items():
        if name != self.input_name:
            if name.startswith("params:"):
                # we need to persist it locally for mlflow access
                absolute_param_path = (
                    parameters_saving_folder / f"params_{name[7:]}.pkl"
                )
                persisted_dataset = PickleDataSet(
                    filepath=absolute_param_path.as_posix()
                )
                persisted_dataset.save(dataset.load())
                artifact_path = absolute_param_path.as_uri()
                self._logger.info(
                    f"The parameter '{name[7:]}' is persisted (as pickle) "
                    f"at the following location: '{artifact_path}'"
                )
            else:
                # In this second case, we know it cannot be a MemoryDataSet.
                # Weird bug when directly converting PurePosixPath to Windows:
                # it is considered as relative.
                artifact_path = Path(dataset._filepath.as_posix()).resolve().as_uri()
            artifacts[name] = artifact_path
    return artifacts

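# A minimal illustration (an assumption, not from the source) of the
# PurePosixPath quirk worked around above: a drive-prefixed PurePosixPath
# has no anchor, so ``as_uri()`` raises because the path is considered
# relative; round-tripping through ``Path(...).resolve()`` yields an
# absolute, OS-native path that converts cleanly.
posix_path = PurePosixPath("C:/data/model.pkl")
# posix_path.as_uri() would raise ValueError here on any platform,
# since "C:/data/model.pkl" is not absolute as a POSIX path.
uri = Path(posix_path.as_posix()).resolve().as_uri()
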
@pytest.fixture
def pickle_data_set(filepath_pickle, load_args, save_args, fs_args):
    return PickleDataSet(
        filepath=filepath_pickle,
        load_args=load_args,
        save_args=save_args,
        fs_args=fs_args,
    )

def _load(self):
    self._init_dataset()
    if self._cache and self._cache.exists():
        return self._cache.load()
    if self.file_caching and self._dataset.exists():
        return self._dataset.load()

    import mlflow  # lazy import, so mlflow is only required when loading remotely

    client = mlflow.tracking.MlflowClient(tracking_uri=self.loading_tracking_uri)
    self.loading_run_id = self.loading_run_id or mlflow.active_run().info.run_id

    if self.dataset == "p":  # the tracked object is a run parameter
        run = client.get_run(self.loading_run_id)
        value = run.data.params.get(self.dataset_name, None)
        if value is None:
            raise KeyError(
                f"param '{self.dataset_name}' not found "
                f"in run_id '{self.loading_run_id}'."
            )
        PickleDataSet(filepath=self.filepath).save(value)
    elif self.dataset == "m":  # the tracked object is a run metric
        run = client.get_run(self.loading_run_id)
        value = run.data.metrics.get(self.dataset_name, None)
        if value is None:
            raise KeyError(
                f"metric '{self.dataset_name}' not found "
                f"in run_id '{self.loading_run_id}'."
            )
        PickleDataSet(filepath=self.filepath).save(value)
    else:  # the tracked object is an artifact: download it from the store
        p = Path(self.filepath)
        dst_path = tempfile.gettempdir()
        downloaded_path = client.download_artifacts(
            run_id=self.loading_run_id,
            path=p.name,
            dst_path=dst_path,
        )
        if Path(downloaded_path) != p:
            Path(downloaded_path).rename(p)
    return self._dataset.load()

def test_model_packaging_too_many_artifacts(tmp_path, pipeline_inference_dummy):
    catalog = DataCatalog(
        {
            "raw_data": PickleDataSet(
                filepath=(tmp_path / "raw_data.pkl").resolve().as_posix()
            ),
            "data": MemoryDataSet(),
            "model": PickleDataSet(
                filepath=(tmp_path / "model.pkl").resolve().as_posix()
            ),
        }
    )
    catalog._data_sets["raw_data"].save(1)  # emulate input on disk
    catalog._data_sets["model"].save(2)  # emulate model fitting

    # the input is persisted; weird bug when directly converting PurePosixPath
    # to Windows: it is considered as relative
    artifacts = {
        name: Path(dataset._filepath.as_posix()).resolve().as_uri()
        for name, dataset in catalog._data_sets.items()
        if not isinstance(dataset, MemoryDataSet)
    }

    kedro_model = KedroPipelineModel(
        pipeline=pipeline_inference_dummy, catalog=catalog, input_name="raw_data"
    )

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=artifacts,
            conda_env={"python": "3.7.0", "dependencies": ["kedro==0.16.5"]},
        )
        run_id = mlflow.active_run().info.run_id

    with pytest.raises(
        ValueError, match="Provided artifacts do not match catalog entries"
    ):
        mlflow.pyfunc.load_model(
            model_uri=(Path(r"runs:/") / run_id / "model").as_posix()
        )

def test_protocol_usage(self, filepath, instance_type):
    data_set = PickleDataSet(filepath=filepath)
    assert isinstance(data_set._fs, instance_type)

    path = filepath.split(PROTOCOL_DELIMITER, 1)[-1]

    assert str(data_set._filepath) == path
    assert isinstance(data_set._filepath, PurePosixPath)

@pytest.fixture
def data_catalog(tmp_path):
    source_path = Path(__file__).parent / "data/test.parquet"
    spark_in = SparkDataSet(source_path.as_posix())
    spark_out = SparkDataSet((tmp_path / "spark_data").as_posix())
    pickle_ds = PickleDataSet((tmp_path / "pickle/test.pkl").as_posix())

    return DataCatalog(
        {"spark_in": spark_in, "spark_out": spark_out, "pickle_ds": pickle_ds}
    )

@pytest.fixture
def dummy_catalog(tmp_path):
    dummy_catalog = DataCatalog(
        {
            "raw_data": MemoryDataSet(),
            "data": MemoryDataSet(),
            "model": PickleDataSet(
                filepath=(tmp_path / "model.pkl").resolve().as_posix()
            ),
        }
    )
    return dummy_catalog

def test_version_str_repr(self, load_version, save_version):
    """Test that version is in string representation of the class instance
    when applicable."""
    filepath = "test.pkl"
    ds = PickleDataSet(filepath=filepath)
    ds_versioned = PickleDataSet(
        filepath=filepath, version=Version(load_version, save_version)
    )

    assert filepath in str(ds)
    assert "version" not in str(ds)

    assert filepath in str(ds_versioned)
    ver_str = f"version=Version(load={load_version}, save='{save_version}')"
    assert ver_str in str(ds_versioned)
    assert "PickleDataSet" in str(ds_versioned)
    assert "PickleDataSet" in str(ds)
    assert "protocol" in str(ds_versioned)
    assert "protocol" in str(ds)

def test_invalid_backend(self, mocker):
    pattern = (
        r"Selected backend 'invalid' should satisfy the pickle interface. "
        r"Missing one of `load` and `dump` on the backend."
    )
    mocker.patch(
        "kedro.extras.datasets.pickle.pickle_dataset.importlib.import_module",
        return_value=object,
    )
    with pytest.raises(ValueError, match=pattern):
        PickleDataSet(filepath="test.pkl", backend="invalid")

def test_no_backend(self, mocker):
    pattern = (
        r"Selected backend 'fake.backend.does.not.exist' could not be imported. "
        r"Make sure it is installed and importable."
    )
    mocker.patch(
        "kedro.extras.datasets.pickle.pickle_dataset.importlib.import_module",
        side_effect=ImportError,
    )
    with pytest.raises(ImportError, match=pattern):
        PickleDataSet(filepath="test.pkl", backend="fake.backend.does.not.exist")

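# A brief sketch of the backend mechanism the two tests above validate: any
# importable module exposing ``load`` and ``dump`` can be plugged in. This
# assumes the optional ``compress_pickle`` package is installed.
ds = PickleDataSet(
    filepath="data.pkl.lz4",
    backend="compress_pickle",
    load_args={"compression": "lz4"},
    save_args={"compression": "lz4"},
)
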
def test_protocol_usage(self, filepath, instance_type):
    data_set = PickleDataSet(filepath=filepath)
    assert isinstance(data_set._fs, instance_type)

    # _strip_protocol() doesn't strip http(s) protocol
    if data_set._protocol == "https":
        path = filepath.split("://")[-1]
    else:
        path = data_set._fs._strip_protocol(filepath)

    assert str(data_set._filepath) == path
    assert isinstance(data_set._filepath, PurePosixPath)

@pytest.fixture
def dummy_catalog(tmp_path):
    dummy_catalog = DataCatalog(
        {
            "raw_data": MemoryDataSet(1),
            "params:unused_param": MemoryDataSet("blah"),
            "data": MemoryDataSet(),
            "model": PickleDataSet((tmp_path / "model.csv").as_posix()),
        }
    )
    return dummy_catalog

@pytest.fixture
def dummy_catalog(tmp_path):
    dummy_catalog = DataCatalog(
        {
            "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
            "params:unused_param": MemoryDataSet("blah"),
            "data": MemoryDataSet(),
            "model": PickleDataSet((tmp_path / "model.csv").as_posix()),
        }
    )
    return dummy_catalog

@pytest.fixture
def dummy_catalog(tmp_path):
    dummy_catalog = DataCatalog(
        {
            "raw_data": MemoryDataSet(),
            "data": MemoryDataSet(),
            "model": PickleDataSet(
                filepath=(tmp_path / "data" / "06_models" / "model.pkl")
                .resolve()
                .as_posix()
            ),
        }
    )
    dummy_catalog._data_sets["model"].save(2)  # emulate model fitting
    return dummy_catalog

@pytest.fixture
def dummy_catalog(tmp_path):
    dummy_catalog = DataCatalog(
        {
            "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
            "params:unused_param": MemoryDataSet("blah"),
            "data": MemoryDataSet(),
            "model": PickleDataSet((tmp_path / "model.csv").as_posix()),
            "my_metrics": MlflowMetricsDataSet(),
            "another_metrics": MlflowMetricsDataSet(prefix="foo"),
            "my_metric": MlflowMetricDataSet(),
            "another_metric": MlflowMetricDataSet(key="foo"),
            "my_metric_history": MlflowMetricHistoryDataSet(),
            "another_metric_history": MlflowMetricHistoryDataSet(key="bar"),
        }
    )
    return dummy_catalog

@pytest.fixture
def catalog_with_parameters(tmp_path):
    catalog_with_parameters = DataCatalog(
        {
            "data": MemoryDataSet(),
            "cleaned_data": MemoryDataSet(),
            "params:stopwords": MemoryDataSet(["Hello", "Hi"]),
            "params:penalty": MemoryDataSet(0),
            "model": PickleDataSet((tmp_path / "model.pkl").resolve().as_posix()),
            "params:threshold": MemoryDataSet(0.5),
        }
    )
    return catalog_with_parameters

def test_parallel_runner_with_pickle_dataset(self, tmp_path, spark_in, spark_out):
    """Test ParallelRunner with SparkDataSet -> PickleDataSet -> SparkDataSet."""
    pickle_data = PickleDataSet(filepath=str(tmp_path / "data.pkl"))
    catalog = DataCatalog(
        data_sets={
            "spark_in": spark_in,
            "pickle": pickle_data,
            "spark_out": spark_out,
        }
    )
    pipeline = Pipeline(
        [
            node(identity, "spark_in", "pickle"),
            node(identity, "pickle", "spark_out"),
        ]
    )
    runner = ParallelRunner()

    # a Spark DataFrame cannot be pickled, so the save step is expected to fail
    pattern = r"Failed while saving data to data set PickleDataSet"
    with pytest.raises(DataSetError, match=pattern):
        runner.run(pipeline, catalog)

def extract_pipeline_artifacts(self, catalog: DataCatalog, temp_folder: Path):
    pipeline_catalog = self._extract_pipeline_catalog(catalog)
    artifacts = {}
    for name, dataset in pipeline_catalog._data_sets.items():
        if name != self.input_name:
            if name.startswith("params:"):
                # we need to persist it locally for mlflow access
                absolute_param_path = temp_folder / f"params_{name[7:]}.pkl"
                persisted_dataset = PickleDataSet(
                    filepath=absolute_param_path.as_posix()
                )
                persisted_dataset.save(dataset.load())
                artifact_path = absolute_param_path.as_uri()
            else:
                # In this second case, we know it cannot be a MemoryDataSet.
                # Weird bug when directly converting PurePosixPath to Windows:
                # it is considered as relative.
                artifact_path = Path(dataset._filepath.as_posix()).resolve().as_uri()
            artifacts[name] = artifact_path
    return artifacts

@pytest.fixture
def versioned_pickle_data_set(filepath_pickle, load_version, save_version):
    return PickleDataSet(
        filepath=filepath_pickle, version=Version(load_version, save_version)
    )

def test_no_joblib(self, mocker):
    mocker.patch.object(PickleDataSet, "BACKENDS", {"joblib": None})
    with pytest.raises(ImportError):
        PickleDataSet(filepath="test.pkl", backend="joblib")

def test_invalid_backend(self):
    pattern = r"'backend' should be one of \['pickle', 'joblib'\], got 'invalid'\."
    with pytest.raises(ValueError, match=pattern):
        PickleDataSet(filepath="test.pkl", backend="invalid")

def test_catalog_release(self, mocker):
    fs_mock = mocker.patch("fsspec.filesystem").return_value
    filepath = "test.pkl"
    data_set = PickleDataSet(filepath=filepath)
    data_set.release()
    fs_mock.invalidate_cache.assert_called_once_with(filepath)

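# A minimal sketch (an assumption about standard Kedro usage, not taken from
# the source) of how the PickleDataSet entries built programmatically above
# are usually declared through ``DataCatalog.from_config`` instead:
catalog = DataCatalog.from_config(
    {
        "model": {
            "type": "pickle.PickleDataSet",
            "filepath": "data/06_models/model.pkl",
            "backend": "pickle",
        }
    }
)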