def test_kedro_pipeline_ml_loading_deepcopiable_catalog(tmp_path, tmp_folder):
    # create pipeline and catalog. The training will not be triggered
    def fit_fun(data):
        pass

    def predict_fun(model, data):
        return model.predict(data)

    training_pipeline = Pipeline(
        [node(func=fit_fun, inputs="data", outputs="model")]
    )

    inference_pipeline = Pipeline(
        [
            node(func=predict_fun, inputs=["model", "data"], outputs="predictions"),
        ]
    )

    ml_pipeline = pipeline_ml_factory(
        training=training_pipeline,
        inference=inference_pipeline,
        input_name="data",
    )

    # emulate training by creating the model manually
    model_dataset = MlflowModelSaverDataSet(
        filepath=(tmp_path / "model.pkl").resolve().as_posix(),
        flavor="mlflow.sklearn",
    )

    data = pd.DataFrame(
        data=[
            [1, 2],
            [3, 4],
        ],
        columns=["a", "b"],
    )
    labels = [4, 6]
    linreg = LinearRegression()
    linreg.fit(data, labels)
    model_dataset.save(linreg)

    # check that mlflow loading is ok
    catalog = DataCatalog({"data": MemoryDataSet(), "model": model_dataset})

    kedro_model = KedroPipelineModel(
        pipeline=ml_pipeline, catalog=catalog, input_name=ml_pipeline.input_name
    )
    artifacts = kedro_model.extract_pipeline_artifacts(tmp_folder)

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)

    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model", python_model=kedro_model, artifacts=artifacts
        )
        run_id = mlflow.active_run().info.run_id

    loaded_model = mlflow.pyfunc.load_model(
        model_uri=(Path(r"runs:/") / run_id / "model").as_posix()
    )
    assert all(loaded_model.predict(data) == [4.0, 6.0])
def test_kedro_pipeline_model_save_and_load(tmp_path, pipeline, catalog, input_name, result):
    kedro_pipeline_model = KedroPipelineModel(
        pipeline=pipeline, catalog=catalog, input_name=input_name
    )

    # emulate artifacts persistence
    for ds in catalog._data_sets.values():
        if hasattr(ds, "_filepath"):
            ds.save(1)

    artifacts = kedro_pipeline_model.extract_pipeline_artifacts(tmp_path)

    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_pipeline_model,
            artifacts=artifacts,
        )
        run_id = mlflow.active_run().info.run_id

    loaded_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")

    data = pd.DataFrame([1, 2, 3])
    assert (loaded_model.predict(data) == result).all(axis=None)
def after_pipeline_run(
    self,
    run_params: Dict[str, Any],
    pipeline: Pipeline,
    catalog: DataCatalog,
) -> None:
    """Hook to be invoked after a pipeline runs.

    Args:
        run_params: The params needed for the given run.
            Should be identical to the data logged by Journal.
            # @fixme: this needs to be modelled explicitly as code, instead of comment
            Schema: {
                "project_path": str,
                "env": str,
                "kedro_version": str,
                "tags": Optional[List[str]],
                "from_nodes": Optional[List[str]],
                "to_nodes": Optional[List[str]],
                "node_names": Optional[List[str]],
                "from_inputs": Optional[List[str]],
                "load_versions": Optional[List[str]],
                "pipeline_name": str,
                "extra_params": Optional[Dict[str, Any]],
            }
        pipeline: The ``Pipeline`` that was run.
        catalog: The ``DataCatalog`` used during the run.
    """
    if self._is_mlflow_enabled:
        if isinstance(pipeline, PipelineML):
            with TemporaryDirectory() as tmp_dir:
                # This will be removed at the end of the context manager,
                # but we need to log in mlflow before moving the folder
                kedro_pipeline_model = KedroPipelineModel(
                    pipeline=pipeline.inference,
                    catalog=catalog,
                    input_name=pipeline.input_name,
                    **pipeline.kpm_kwargs,
                )
                artifacts = kedro_pipeline_model.extract_pipeline_artifacts(
                    parameters_saving_folder=Path(tmp_dir)
                )

                log_model_kwargs = pipeline.log_model_kwargs.copy()
                model_signature = log_model_kwargs.pop("signature", None)
                if isinstance(model_signature, str):
                    if model_signature == "auto":
                        input_data = catalog.load(pipeline.input_name)
                        model_signature = infer_signature(model_input=input_data)

                mlflow.pyfunc.log_model(
                    python_model=kedro_pipeline_model,
                    artifacts=artifacts,
                    signature=model_signature,
                    **log_model_kwargs,
                )
        # Close the mlflow active run at the end of the pipeline
        # to avoid interactions with further runs
        mlflow.end_run()
    else:
        switch_catalog_logging(catalog, True)
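# The hook above references switch_catalog_logging, which is not defined in
# this snippet. A minimal sketch of what such a helper could look like (an
# assumption, not the actual kedro-mlflow implementation): it flips a logging
# flag on every catalog entry that exposes one, so mlflow-backed datasets can
# be muted when mlflow is disabled and re-enabled afterwards.
from kedro.io import DataCatalog


def switch_catalog_logging(catalog: DataCatalog, logging_flag: bool = True) -> None:
    for name, dataset in catalog._data_sets.items():
        if hasattr(dataset, "_logging_activated"):
            dataset._logging_activated = logging_flag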
def kedro_pipeline_model(tmp_path, pipeline_ml_obj, dummy_catalog):
    kedro_pipeline_model = KedroPipelineModel(
        pipeline_ml=pipeline_ml_obj, catalog=dummy_catalog
    )
    return kedro_pipeline_model
def test_kedro_pipeline_ml_with_wrong_copy_mode_type(pipeline_ml_obj, dummy_catalog):
    with pytest.raises(TypeError, match="'copy_mode' must be a 'str' or a 'dict'"):
        KedroPipelineModel(
            pipeline_ml=pipeline_ml_obj, catalog=dummy_catalog, copy_mode=1346
        )
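# A minimal sketch of the validation this test exercises (assumed, not the
# actual KedroPipelineModel source): 'copy_mode' may be None, a single str
# applied to every dataset, or a dict mapping dataset names to modes; any
# other type raises the TypeError the test matches.
from typing import Dict, Optional, Union


def _check_copy_mode(copy_mode: Optional[Union[str, Dict[str, str]]]) -> None:
    if not isinstance(copy_mode, (str, dict, type(None))):
        raise TypeError(
            f"'copy_mode' must be a 'str' or a 'dict', got '{type(copy_mode)}'"
        )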
def test_model_packaging_missing_artifacts(tmp_path, pipeline_ml_obj):
    catalog = DataCatalog(
        {
            "raw_data": MemoryDataSet(),
            "data": MemoryDataSet(),
            "model": PickleDataSet(
                filepath=(tmp_path / "model.pkl").resolve().as_posix()
            ),
        }
    )

    kedro_model = KedroPipelineModel(pipeline_ml=pipeline_ml_obj, catalog=catalog)

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=None,  # no artifacts provided
            conda_env={"python": "3.7.0"},
        )
        run_id = mlflow.active_run().info.run_id

    with pytest.raises(ValueError, match="Provided artifacts do not match catalog entries"):
        mlflow.pyfunc.load_model(
            model_uri=(Path(r"runs:/") / run_id / "model").as_posix()
        )
def test_model_packaging(tmp_path, pipeline_ml_obj):
    catalog = DataCatalog(
        {
            "raw_data": MemoryDataSet(),
            "data": MemoryDataSet(),
            "model": PickleDataSet(
                filepath=(tmp_path / "model.pkl").resolve().as_posix()
            ),
        }
    )

    catalog._data_sets["model"].save(2)  # emulate model fitting

    artifacts = pipeline_ml_obj.extract_pipeline_artifacts(catalog)

    kedro_model = KedroPipelineModel(pipeline_ml=pipeline_ml_obj, catalog=catalog)

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=artifacts,
            conda_env={"python": "3.7.0"},
        )
        run_id = mlflow.active_run().info.run_id

    loaded_model = mlflow.pyfunc.load_model(
        model_uri=(Path(r"runs:/") / run_id / "model").as_posix()
    )
    assert loaded_model.predict(1) == 2
def test_pyfunc_flavor_python_model_save_and_load(
    tmp_folder,
    tracking_uri,
    pipeline,
    dummy_catalog,
):
    kedro_pipeline_model = KedroPipelineModel(
        pipeline=pipeline,
        catalog=dummy_catalog,
        input_name="raw_data",
    )
    artifacts = kedro_pipeline_model.extract_pipeline_artifacts(tmp_folder)

    model_config = {
        "name": "kedro_pipeline_model",
        "config": {
            "type": "kedro_mlflow.io.models.MlflowModelLoggerDataSet",
            "flavor": "mlflow.pyfunc",
            "pyfunc_workflow": "python_model",
            "artifact_path": "test_model",
            "save_args": {
                "artifacts": artifacts,
                "conda_env": {"python": "3.7.0", "dependencies": ["kedro==0.16.5"]},
            },
        },
    }

    mlflow.set_tracking_uri(tracking_uri)
    mlflow_model_ds = MlflowModelLoggerDataSet.from_config(**model_config)
    mlflow_model_ds.save(kedro_pipeline_model)
    current_run_id = mlflow.active_run().info.run_id

    # close the run, create another dataset and reload
    # (emulate a new "kedro run" with the launch of the )
    mlflow.end_run()
    model_config2 = model_config.copy()
    model_config2["config"]["run_id"] = current_run_id
    mlflow_model_ds2 = MlflowModelLoggerDataSet.from_config(**model_config2)
    loaded_model = mlflow_model_ds2.load()
    assert (
        loaded_model.predict(pd.DataFrame(data=[1], columns=["a"]))
        == pd.DataFrame(data=[2], columns=["a"])
    ).all(axis=None)
def kedro_pipeline_model(pipeline_ml_obj, dummy_catalog):
    kedro_pipeline_model = KedroPipelineModel(
        pipeline=pipeline_ml_obj,
        catalog=dummy_catalog,
        input_name=pipeline_ml_obj.input_name,
    )
    return kedro_pipeline_model
def after_pipeline_run(
    self,
    run_params: Dict[str, Any],
    pipeline: Pipeline,
    catalog: DataCatalog,
) -> None:
    """Hook to be invoked after a pipeline runs.

    Args:
        run_params: The params needed for the given run.
            Should be identical to the data logged by Journal.
            # @fixme: this needs to be modelled explicitly as code, instead of comment
            Schema: {
                "run_id": str,
                "project_path": str,
                "env": str,
                "kedro_version": str,
                "tags": Optional[List[str]],
                "from_nodes": Optional[List[str]],
                "to_nodes": Optional[List[str]],
                "node_names": Optional[List[str]],
                "from_inputs": Optional[List[str]],
                "load_versions": Optional[List[str]],
                "pipeline_name": str,
                "extra_params": Optional[Dict[str, Any]],
            }
        pipeline: The ``Pipeline`` that was run.
        catalog: The ``DataCatalog`` used during the run.
    """
    if isinstance(pipeline, PipelineML):
        with TemporaryDirectory() as tmp_dir:
            # This will be removed at the end of the context manager,
            # but we need to log in mlflow before removing the folder
            pipeline_catalog = pipeline._extract_pipeline_catalog(catalog)
            artifacts = pipeline.extract_pipeline_artifacts(
                pipeline_catalog, temp_folder=Path(tmp_dir)
            )

            if pipeline.model_signature == "auto":
                input_data = pipeline_catalog.load(pipeline.input_name)
                model_signature = infer_signature(model_input=input_data)
            else:
                model_signature = pipeline.model_signature

            mlflow.pyfunc.log_model(
                artifact_path=pipeline.model_name,
                python_model=KedroPipelineModel(
                    pipeline_ml=pipeline,
                    catalog=pipeline_catalog,
                    **pipeline.kwargs,
                ),
                artifacts=artifacts,
                conda_env=_format_conda_env(pipeline.conda_env),
                signature=model_signature,
            )
    # Close the mlflow active run at the end of the pipeline
    # to avoid interactions with further runs
    mlflow.end_run()
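# This hook only fires if it is registered with Kedro. A minimal sketch,
# assuming a Kedro >= 0.17 project where hooks are declared in
# src/<package>/settings.py (the import path and class name are assumptions
# about the plugin's layout, not confirmed by the source above):
from kedro_mlflow.framework.hooks import MlflowPipelineHook

HOOKS = (MlflowPipelineHook(),)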
def test_kedro_pipeline_model_with_wrong_copy_mode_type(pipeline_inference_dummy, dummy_catalog):
    with pytest.raises(TypeError, match="'copy_mode' must be a 'str' or a 'dict'"):
        KedroPipelineModel(
            pipeline=pipeline_inference_dummy,
            catalog=dummy_catalog,
            copy_mode=1346,
            input_name="raw_data",
        )
def test_pyfunc_flavor_python_model_save_and_load(tmp_path, tmp_folder, pipeline, dummy_catalog):
    kedro_pipeline_model = KedroPipelineModel(
        pipeline=pipeline,
        catalog=dummy_catalog,
        input_name="raw_data",
    )
    artifacts = kedro_pipeline_model.extract_pipeline_artifacts(tmp_folder)

    model_config = {
        "name": "kedro_pipeline_model",
        "config": {
            "type": "kedro_mlflow.io.models.MlflowModelSaverDataSet",
            "filepath": (tmp_path / "data" / "06_models" / "my_custom_model").as_posix(),
            "flavor": "mlflow.pyfunc",
            "pyfunc_workflow": "python_model",
            "save_args": {
                "artifacts": artifacts,
                "conda_env": {"python": "3.7.0", "dependencies": ["kedro==0.16.5"]},
            },
        },
    }

    mlflow_model_ds = MlflowModelSaverDataSet.from_config(**model_config)
    mlflow_model_ds.save(kedro_pipeline_model)
    assert mlflow.active_run() is None

    # close the run, create another dataset and reload
    # (emulate a new "kedro run" with the launch of the )
    loaded_model = mlflow_model_ds.load()
    assert (
        loaded_model.predict(pd.DataFrame(data=[1], columns=["a"]))
        == pd.DataFrame(data=[2], columns=["a"])
    ).all(axis=None)
def test_model_packaging_with_copy_mode(
    tmp_path, tmp_folder, pipeline_inference_dummy, dummy_catalog, copy_mode, expected
):
    dummy_catalog._data_sets["model"].save(2)  # emulate model fitting

    kedro_model = KedroPipelineModel(
        pipeline=pipeline_inference_dummy,
        catalog=dummy_catalog,
        copy_mode=copy_mode,
        input_name="raw_data",
    )

    artifacts = kedro_model.extract_pipeline_artifacts(tmp_folder)

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=artifacts,
            conda_env={"python": "3.7.0", "dependencies": ["kedro==0.16.5"]},
        )
        run_id = mlflow.active_run().info.run_id

    loaded_model = mlflow.pyfunc.load_model(model_uri=f"runs:/{run_id}/model")

    # first assertion: prediction works
    assert loaded_model.predict(1) == 2

    # second assertion: copy_mode works
    actual_copy_mode = {
        name: ds._copy_mode
        for name, ds in loaded_model._model_impl.python_model.loaded_catalog._data_sets.items()
    }
    assert actual_copy_mode == expected
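# The copy_mode/expected pair above comes from pytest parametrization; a
# plausible (hypothetical) set of cases, consistent with the final assertion
# on the loaded catalog's copy modes, could be:
COPY_MODE_CASES = [
    (None, {"raw_data": None, "data": None, "model": None}),
    ("assign", {"raw_data": "assign", "data": "assign", "model": "assign"}),
    ("deepcopy", {"raw_data": "deepcopy", "data": "deepcopy", "model": "deepcopy"}),
    ({"raw_data": "assign"}, {"raw_data": "assign", "data": None, "model": None}),
]
# applied with: @pytest.mark.parametrize("copy_mode,expected", COPY_MODE_CASES)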
def test_catalog_extraction_missing_inference_input(pipeline_inference_dummy):
    # "model" is missing in the catalog
    catalog = DataCatalog({"raw_data": MemoryDataSet(), "data": MemoryDataSet()})
    with pytest.raises(
        KedroPipelineModelError,
        match="since it is the input of the pipeline",
    ):
        KedroPipelineModel(
            pipeline=pipeline_inference_dummy,
            catalog=catalog,
            input_name="raw_data",
        )
def test_model_packaging_too_many_artifacts(tmp_path, pipeline_inference_dummy):
    catalog = DataCatalog(
        {
            "raw_data": PickleDataSet(
                filepath=(tmp_path / "raw_data.pkl").resolve().as_posix()
            ),
            "data": MemoryDataSet(),
            "model": PickleDataSet(
                filepath=(tmp_path / "model.pkl").resolve().as_posix()
            ),
        }
    )

    catalog._data_sets["raw_data"].save(1)  # emulate input on disk
    catalog._data_sets["model"].save(2)  # emulate model fitting

    # the input is persisted
    artifacts = {
        # weird bug when directly converting PurePosixPath to windows:
        # it is considered as relative
        name: Path(dataset._filepath.as_posix()).resolve().as_uri()
        for name, dataset in catalog._data_sets.items()
        if not isinstance(dataset, MemoryDataSet)
    }

    kedro_model = KedroPipelineModel(
        pipeline=pipeline_inference_dummy, catalog=catalog, input_name="raw_data"
    )

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=artifacts,
            conda_env={"python": "3.7.0", "dependencies": ["kedro==0.16.5"]},
        )
        run_id = mlflow.active_run().info.run_id

    with pytest.raises(ValueError, match="Provided artifacts do not match catalog entries"):
        mlflow.pyfunc.load_model(
            model_uri=(Path(r"runs:/") / run_id / "model").as_posix()
        )
def test_catalog_extraction_unpersisted_inference_input(pipeline_inference_dummy):
    # "model" is a MemoryDataSet in the catalog
    catalog = DataCatalog(
        {"raw_data": MemoryDataSet(), "data": MemoryDataSet(), "model": MemoryDataSet()}
    )
    with pytest.raises(
        KedroPipelineModelError,
        match="The datasets of the training pipeline must be persisted locally",
    ):
        KedroPipelineModel(
            pipeline=pipeline_inference_dummy,
            catalog=catalog,
            input_name="raw_data",
        )
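# A minimal sketch of the persistence check the two extraction tests above hit
# (assumed, not the actual source): every dataset the inference pipeline needs,
# apart from the runtime input, must be persisted (not a MemoryDataSet),
# otherwise it cannot be packaged as an mlflow artifact.
from kedro.io import DataCatalog, MemoryDataSet


class KedroPipelineModelError(Exception):
    """Stand-in for the error class used in the tests above."""


def _check_artifacts_persisted(
    catalog: DataCatalog, required_names: set, input_name: str
) -> None:
    unpersisted = [
        name
        for name in required_names - {input_name}
        if isinstance(catalog._data_sets.get(name), MemoryDataSet)
    ]
    if unpersisted:
        raise KedroPipelineModelError(
            "The datasets of the training pipeline must be persisted locally"
        )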
def after_pipeline_run(
    self,
    run_params: Dict[str, Any],
    pipeline: Pipeline,
    catalog: DataCatalog,
) -> None:
    """Hook to be invoked after a pipeline runs.

    Args:
        run_params: The params needed for the given run.
            Should be identical to the data logged by Journal.
            # @fixme: this needs to be modelled explicitly as code, instead of comment
            Schema: {
                "run_id": str,
                "project_path": str,
                "env": str,
                "kedro_version": str,
                "tags": Optional[List[str]],
                "from_nodes": Optional[List[str]],
                "to_nodes": Optional[List[str]],
                "node_names": Optional[List[str]],
                "from_inputs": Optional[List[str]],
                "load_versions": Optional[List[str]],
                "pipeline_name": str,
                "extra_params": Optional[Dict[str, Any]],
            }
        pipeline: The ``Pipeline`` that was run.
        catalog: The ``DataCatalog`` used during the run.
    """
    if isinstance(pipeline, PipelineML):
        pipeline_catalog = pipeline.extract_pipeline_catalog(catalog)
        artifacts = pipeline.extract_pipeline_artifacts(pipeline_catalog)
        mlflow.pyfunc.log_model(
            artifact_path=self.model_name,
            python_model=KedroPipelineModel(
                pipeline_ml=pipeline, catalog=pipeline_catalog
            ),
            artifacts=artifacts,
            conda_env=self.conda_env,
        )
    # Close the mlflow active run at the end of the pipeline
    # to avoid interactions with further runs
    mlflow.end_run()
def test_kedro_pipeline_model_too_many_outputs():
    catalog = DataCatalog(
        {
            "data": MemoryDataSet(),
            "predictions": MemoryDataSet(),
            "metrics": MemoryDataSet(),
        }
    )

    def predict_and_evaluate(data):
        return 1, 1

    pipeline = Pipeline(
        [
            node(
                func=predict_and_evaluate,
                inputs={"data": "data"},
                outputs=["predictions", "metrics"],
            ),
        ]
    )

    with pytest.raises(ValueError, match="Pipeline must have one and only one output"):
        KedroPipelineModel(pipeline, catalog, input_name="data")
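# A minimal sketch of the output-count check this test exercises (assumed,
# not the actual source): a pyfunc model returns a single object from
# predict(), so the inference pipeline must expose exactly one free output.
from kedro.pipeline import Pipeline


def _get_output_name(pipeline: Pipeline) -> str:
    outputs = pipeline.outputs()
    if len(outputs) != 1:
        raise ValueError("Pipeline must have one and only one output")
    return list(outputs)[0]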
def modelify(
    # ctx,
    pipeline_name: str,
    input_name: str,
    flag_infer_signature: Optional[bool],
    flag_infer_input_example: Optional[bool],
    run_id: Optional[str],
    copy_mode: Optional[Union[str, Dict[str, str]]],
    artifact_path: str,
    code_path: str,
    conda_env: str,
    registered_model_name: str,
    await_registration_for: int,
    pip_requirements: str,
    extra_pip_requirements: str,
):
    """Export a kedro pipeline as a mlflow model for serving."""
    # if the command is available, we are necessarily at the root of a kedro project
    project_path = Path.cwd()
    bootstrap_project(project_path)
    with KedroSession.create(project_path=project_path) as session:
        # "pipeline" is the Pipeline object you want to convert to a mlflow model
        pipeline = pipelines[pipeline_name]
        # triggers config setup with after_context_created hook
        context = session.load_context()
        catalog = context.catalog

        if input_name not in pipeline.inputs():
            valid_inputs = "\n - ".join(pipeline.inputs())
            raise ValueError(
                f"'{input_name}' is not a valid 'input_name', it must be an input of 'pipeline', i.e. one of: \n - {valid_inputs}"
            )

        # artifacts are all the inputs of the inference pipelines
        # that are persisted in the catalog

        # (optional) get the schema of the input dataset
        model_signature = None
        if flag_infer_signature:
            input_data = catalog.load(input_name)
            model_signature = infer_signature(model_input=input_data)

        input_example = None
        if flag_infer_input_example:
            if flag_infer_signature is False:
                # else we have already loaded the data
                input_data = catalog.load(input_name)
            # iloc[0:1, :] forces a DataFrame; iloc[0] returns a Series,
            # which raises a mlflow error
            input_example = input_data.iloc[0:1, :]

        with TemporaryDirectory() as tmp_dir:
            # you can optionally pass other arguments,
            # like the "copy_mode" to be used for each dataset
            kedro_pipeline_model = KedroPipelineModel(
                pipeline=pipeline,
                catalog=catalog,
                input_name=input_name,
                copy_mode=copy_mode,
                # add runner option
            )

            artifacts = kedro_pipeline_model.extract_pipeline_artifacts(Path(tmp_dir))

            if conda_env is None:
                conda_env = {"python": "3.7.0", "dependencies": ["kedro==0.16.5"]}

            log_model_kwargs = dict(
                artifact_path=artifact_path,
                python_model=kedro_pipeline_model,
                artifacts=artifacts,
                code_path=code_path,
                conda_env=conda_env,
                signature=model_signature,
                input_example=input_example,
                registered_model_name=registered_model_name,
                await_registration_for=await_registration_for,
            )
            if version.parse(mlflow.__version__) >= version.parse("1.20.0"):
                log_model_kwargs["pip_requirements"] = pip_requirements
                log_model_kwargs["extra_pip_requirements"] = extra_pip_requirements

            with mlflow.start_run(run_id=run_id):
                mlflow.pyfunc.log_model(**log_model_kwargs)
                run_id = mlflow.active_run().info.run_id
            LOGGER.info(f"Model successfully logged in run '{run_id}'")
def test_catalog_extraction(pipeline, catalog, input_name, result):
    kedro_pipeline_model = KedroPipelineModel(
        pipeline=pipeline, catalog=catalog, input_name=input_name
    )
    filtered_catalog = kedro_pipeline_model.initial_catalog
    assert set(filtered_catalog.list()) == result
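# A minimal sketch of the filtering test_catalog_extraction asserts on
# (assumed, not the actual KedroPipelineModel source): initial_catalog keeps
# only the catalog entries the inference pipeline needs as free inputs, since
# everything else is either recomputed at predict time or irrelevant to
# serving.
from kedro.io import DataCatalog


def _filter_catalog(pipeline, catalog: DataCatalog, input_name: str) -> DataCatalog:
    required = pipeline.inputs()  # free inputs of the inference pipeline
    return DataCatalog(
        {name: ds for name, ds in catalog._data_sets.items() if name in required}
    )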