def test_kedro_pipeline_ml_loading_deepcopiable_catalog(tmp_path, tmp_folder):

    # create pipeline and catalog; the training will not actually be triggered
    def fit_fun(data):
        pass

    def predict_fun(model, data):
        return model.predict(data)

    training_pipeline = Pipeline(
        [node(func=fit_fun, inputs="data", outputs="model")])

    inference_pipeline = Pipeline([
        node(func=predict_fun, inputs=["model", "data"],
             outputs="predictions"),
    ])

    ml_pipeline = pipeline_ml_factory(
        training=training_pipeline,
        inference=inference_pipeline,
        input_name="data",
    )

    # emulate training by creating the model manually
    model_dataset = MlflowModelSaverDataSet(
        filepath=(tmp_path / "model.pkl").resolve().as_posix(),
        flavor="mlflow.sklearn")

    data = pd.DataFrame(
        data=[
            [1, 2],
            [3, 4],
        ],
        columns=["a", "b"],
    )
    labels = [4, 6]
    linreg = LinearRegression()
    linreg.fit(data, labels)
    model_dataset.save(linreg)

    # check that mlflow loading is ok
    catalog = DataCatalog({"data": MemoryDataSet(), "model": model_dataset})

    kedro_model = KedroPipelineModel(pipeline=ml_pipeline,
                                     catalog=catalog,
                                     input_name=ml_pipeline.input_name)
    artifacts = kedro_model.extract_pipeline_artifacts(tmp_folder)

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)

    with mlflow.start_run():
        mlflow.pyfunc.log_model(artifact_path="model",
                                python_model=kedro_model,
                                artifacts=artifacts)
        run_id = mlflow.active_run().info.run_id

    loaded_model = mlflow.pyfunc.load_model(model_uri=f"runs:/{run_id}/model")
    assert all(loaded_model.predict(data) == [4.0, 6.0])
def test_kedro_pipeline_model_save_and_load(tmp_path, pipeline, catalog,
                                            input_name, result):
    kedro_pipeline_model = KedroPipelineModel(pipeline=pipeline,
                                              catalog=catalog,
                                              input_name=input_name)
    # emulate artifacts persistence
    for ds in catalog._data_sets.values():
        if hasattr(ds, "_filepath"):
            ds.save(1)

    artifacts = kedro_pipeline_model.extract_pipeline_artifacts(tmp_path)

    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_pipeline_model,
            artifacts=artifacts,
        )
        run_id = mlflow.active_run().info.run_id

    loaded_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")

    data = pd.DataFrame([1, 2, 3])

    assert (loaded_model.predict(data) == result).all(axis=None)
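# A hypothetical fixture set for the parametrized test above; the identity
# pipeline, in-memory catalog and expected result are illustrative assumptions,
# not the suite's real fixtures.
def identity(data):
    return data

pipeline = Pipeline([node(func=identity, inputs="data", outputs="predictions")])
catalog = DataCatalog({"data": MemoryDataSet(), "predictions": MemoryDataSet()})
input_name = "data"
result = pd.DataFrame([1, 2, 3])  # the identity pipeline returns its input unchanged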
    def after_pipeline_run(
        self,
        run_params: Dict[str, Any],
        pipeline: Pipeline,
        catalog: DataCatalog,
    ) -> None:
        """Hook to be invoked after a pipeline runs.
        Args:
            run_params: The params needed for the given run.
                Should be identical to the data logged by Journal.
                # @fixme: this needs to be modelled explicitly as code, instead of comment
                Schema: {
                    "project_path": str,
                    "env": str,
                    "kedro_version": str,
                    "tags": Optional[List[str]],
                    "from_nodes": Optional[List[str]],
                    "to_nodes": Optional[List[str]],
                    "node_names": Optional[List[str]],
                    "from_inputs": Optional[List[str]],
                    "load_versions": Optional[List[str]],
                    "pipeline_name": str,
                    "extra_params": Optional[Dict[str, Any]],
                }
            pipeline: The ``Pipeline`` that was run.
            catalog: The ``DataCatalog`` used during the run.
        """
        if self._is_mlflow_enabled:
            if isinstance(pipeline, PipelineML):
                with TemporaryDirectory() as tmp_dir:
                    # This will be removed at the end of the context manager,
                    # but we need to log in mlflow before the folder is deleted
                    kedro_pipeline_model = KedroPipelineModel(
                        pipeline=pipeline.inference,
                        catalog=catalog,
                        input_name=pipeline.input_name,
                        **pipeline.kpm_kwargs,
                    )
                    artifacts = kedro_pipeline_model.extract_pipeline_artifacts(
                        parameters_saving_folder=Path(tmp_dir))

                    log_model_kwargs = pipeline.log_model_kwargs.copy()
                    model_signature = log_model_kwargs.pop("signature", None)
                    if model_signature == "auto":
                        input_data = catalog.load(pipeline.input_name)
                        model_signature = infer_signature(model_input=input_data)

                    mlflow.pyfunc.log_model(
                        python_model=kedro_pipeline_model,
                        artifacts=artifacts,
                        signature=model_signature,
                        **log_model_kwargs,
                    )
            # Close the mlflow active run at the end of the pipeline to avoid interactions with further runs
            mlflow.end_run()
        else:
            switch_catalog_logging(catalog, True)
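# A minimal sketch that models the "run_params" schema above as code, as the
# "@fixme" in the docstring suggests; the TypedDict name is an assumption, while
# the fields come from the documented schema (requires python>=3.8).
from typing import Any, Dict, List, Optional, TypedDict

class RunParams(TypedDict):
    project_path: str
    env: str
    kedro_version: str
    tags: Optional[List[str]]
    from_nodes: Optional[List[str]]
    to_nodes: Optional[List[str]]
    node_names: Optional[List[str]]
    from_inputs: Optional[List[str]]
    load_versions: Optional[List[str]]
    pipeline_name: str
    extra_params: Optional[Dict[str, Any]]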
Example #4
def kedro_pipeline_model(tmp_path, pipeline_ml_obj, dummy_catalog):

    kedro_pipeline_model = KedroPipelineModel(
        pipeline_ml=pipeline_ml_obj, catalog=dummy_catalog
    )

    return kedro_pipeline_model
def test_kedro_pipeline_ml_with_wrong_copy_mode_type(pipeline_ml_obj,
                                                     dummy_catalog):
    with pytest.raises(TypeError,
                       match="'copy_mode' must be a 'str' or a 'dict'"):
        KedroPipelineModel(pipeline_ml=pipeline_ml_obj,
                           catalog=dummy_catalog,
                           copy_mode=1346)
def test_model_packaging_missing_artifacts(tmp_path, pipeline_ml_obj):

    catalog = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "model": PickleDataSet(
            filepath=(tmp_path / "model.pkl").resolve().as_posix()),
    })

    kedro_model = KedroPipelineModel(pipeline_ml=pipeline_ml_obj,
                                     catalog=catalog)

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=None,  # no artifacts provided
            conda_env={"python": "3.7.0"},
        )
        run_id = mlflow.active_run().info.run_id

    with pytest.raises(
            ValueError,
            match="Provided artifacts do not match catalog entries"):
        mlflow.pyfunc.load_model(model_uri=f"runs:/{run_id}/model")
Example #7
def test_model_packaging(tmp_path, pipeline_ml_obj):

    catalog = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "model": PickleDataSet(
            filepath=(tmp_path / "model.pkl").resolve().as_posix()),
    })

    catalog._data_sets["model"].save(2)  # emulate model fitting

    artifacts = pipeline_ml_obj.extract_pipeline_artifacts(catalog)

    kedro_model = KedroPipelineModel(pipeline_ml=pipeline_ml_obj,
                                     catalog=catalog)

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=artifacts,
            conda_env={"python": "3.7.0"},
        )
        run_id = mlflow.active_run().info.run_id

    loaded_model = mlflow.pyfunc.load_model(model_uri=f"runs:/{run_id}/model")
    assert loaded_model.predict(1) == 2
def test_pyfunc_flavor_python_model_save_and_load(
    tmp_folder,
    tracking_uri,
    pipeline,
    dummy_catalog,
):

    kedro_pipeline_model = KedroPipelineModel(
        pipeline=pipeline,
        catalog=dummy_catalog,
        input_name="raw_data",
    )
    artifacts = kedro_pipeline_model.extract_pipeline_artifacts(tmp_folder)

    model_config = {
        "name": "kedro_pipeline_model",
        "config": {
            "type": "kedro_mlflow.io.models.MlflowModelLoggerDataSet",
            "flavor": "mlflow.pyfunc",
            "pyfunc_workflow": "python_model",
            "artifact_path": "test_model",
            "save_args": {
                "artifacts": artifacts,
                "conda_env": {
                    "python": "3.7.0",
                    "dependencies": ["kedro==0.16.5"]
                },
            },
        },
    }

    mlflow.set_tracking_uri(tracking_uri)
    mlflow_model_ds = MlflowModelLoggerDataSet.from_config(**model_config)
    mlflow_model_ds.save(kedro_pipeline_model)
    current_run_id = mlflow.active_run().info.run_id

    # close the run, create another dataset and reload it
    # (emulate a new "kedro run")
    mlflow.end_run()
    model_config2 = model_config.copy()
    model_config2["config"]["run_id"] = current_run_id
    mlflow_model_ds2 = MlflowModelLoggerDataSet.from_config(**model_config2)

    loaded_model = mlflow_model_ds2.load()

    predictions = loaded_model.predict(pd.DataFrame(data=[1], columns=["a"]))
    assert (predictions == pd.DataFrame(data=[2], columns=["a"])).all(axis=None)
def kedro_pipeline_model(pipeline_ml_obj, dummy_catalog):

    kedro_pipeline_model = KedroPipelineModel(
        pipeline=pipeline_ml_obj,
        catalog=dummy_catalog,
        input_name=pipeline_ml_obj.input_name,
    )

    return kedro_pipeline_model
Example #10
    def after_pipeline_run(
        self,
        run_params: Dict[str, Any],
        pipeline: Pipeline,
        catalog: DataCatalog,
    ) -> None:
        """Hook to be invoked after a pipeline runs.
        Args:
            run_params: The params needed for the given run.
                Should be identical to the data logged by Journal.
                # @fixme: this needs to be modelled explicitly as code, instead of comment
                Schema: {
                    "run_id": str,
                    "project_path": str,
                    "env": str,
                    "kedro_version": str,
                    "tags": Optional[List[str]],
                    "from_nodes": Optional[List[str]],
                    "to_nodes": Optional[List[str]],
                    "node_names": Optional[List[str]],
                    "from_inputs": Optional[List[str]],
                    "load_versions": Optional[List[str]],
                    "pipeline_name": str,
                    "extra_params": Optional[Dict[str, Any]],
                }
            pipeline: The ``Pipeline`` that was run.
            catalog: The ``DataCatalog`` used during the run.
        """

        if isinstance(pipeline, PipelineML):
            with TemporaryDirectory() as tmp_dir:
                # This will be removed at the end of the context manager,
                # but we need to log in mlflow before removing the folder
                pipeline_catalog = pipeline._extract_pipeline_catalog(catalog)
                artifacts = pipeline.extract_pipeline_artifacts(
                    pipeline_catalog, temp_folder=Path(tmp_dir))

                if pipeline.model_signature == "auto":
                    input_data = pipeline_catalog.load(pipeline.input_name)
                    model_signature = infer_signature(model_input=input_data)
                else:
                    model_signature = pipeline.model_signature

                mlflow.pyfunc.log_model(
                    artifact_path=pipeline.model_name,
                    python_model=KedroPipelineModel(
                        pipeline_ml=pipeline,
                        catalog=pipeline_catalog,
                        **pipeline.kwargs,
                    ),
                    artifacts=artifacts,
                    conda_env=_format_conda_env(pipeline.conda_env),
                    signature=model_signature,
                )
        # Close the mlflow active run at the end of the pipeline to avoid interactions with further runs
        mlflow.end_run()
def test_kedro_pipeline_model_with_wrong_copy_mode_type(
        pipeline_inference_dummy, dummy_catalog):
    with pytest.raises(TypeError,
                       match="'copy_mode' must be a 'str' or a 'dict'"):
        KedroPipelineModel(
            pipeline=pipeline_inference_dummy,
            catalog=dummy_catalog,
            copy_mode=1346,
            input_name="raw_data",
        )
Example #12
def test_pyfunc_flavor_python_model_save_and_load(tmp_path, tmp_folder,
                                                  pipeline, dummy_catalog):

    kedro_pipeline_model = KedroPipelineModel(
        pipeline=pipeline,
        catalog=dummy_catalog,
        input_name="raw_data",
    )
    artifacts = kedro_pipeline_model.extract_pipeline_artifacts(tmp_folder)

    model_config = {
        "name": "kedro_pipeline_model",
        "config": {
            "type":
            "kedro_mlflow.io.models.MlflowModelSaverDataSet",
            "filepath":
            (tmp_path / "data" / "06_models" / "my_custom_model").as_posix(),
            "flavor":
            "mlflow.pyfunc",
            "pyfunc_workflow":
            "python_model",
            "save_args": {
                "artifacts": artifacts,
                "conda_env": {
                    "python": "3.7.0",
                    "dependencies": ["kedro==0.16.5"]
                },
            },
        },
    }

    mlflow_model_ds = MlflowModelSaverDataSet.from_config(**model_config)
    mlflow_model_ds.save(kedro_pipeline_model)

    assert mlflow.active_run() is None

    # reload the model from the same dataset
    # (emulate a new "kedro run")
    loaded_model = mlflow_model_ds.load()

    predictions = loaded_model.predict(pd.DataFrame(data=[1], columns=["a"]))
    assert (predictions == pd.DataFrame(data=[2], columns=["a"])).all(axis=None)
def test_model_packaging_with_copy_mode(tmp_path, tmp_folder,
                                        pipeline_inference_dummy,
                                        dummy_catalog, copy_mode, expected):

    dummy_catalog._data_sets["model"].save(2)  # emulate model fitting

    kedro_model = KedroPipelineModel(
        pipeline=pipeline_inference_dummy,
        catalog=dummy_catalog,
        copy_mode=copy_mode,
        input_name="raw_data",
    )

    artifacts = kedro_model.extract_pipeline_artifacts(tmp_folder)

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=artifacts,
            conda_env={
                "python": "3.7.0",
                "dependencies": ["kedro==0.16.5"]
            },
        )
        run_id = mlflow.active_run().info.run_id

    loaded_model = mlflow.pyfunc.load_model(model_uri=f"runs:/{run_id}/model")

    # first assertion: prediction works
    assert loaded_model.predict(1) == 2

    # second assertion: copy_mode works
    loaded_catalog = loaded_model._model_impl.python_model.loaded_catalog
    actual_copy_mode = {
        name: ds._copy_mode for name, ds in loaded_catalog._data_sets.items()
    }

    assert actual_copy_mode == expected
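# A minimal sketch of the two accepted "copy_mode" forms exercised above; the
# values ("assign", "deepcopy") and the per-dataset mapping are illustrative
# assumptions, not the project's defaults.
kedro_model_single = KedroPipelineModel(
    pipeline=pipeline_inference_dummy,
    catalog=dummy_catalog,
    copy_mode="assign",  # one mode applied to every dataset of the catalog
    input_name="raw_data",
)
kedro_model_per_dataset = KedroPipelineModel(
    pipeline=pipeline_inference_dummy,
    catalog=dummy_catalog,
    copy_mode={"model": "assign"},  # a dict maps dataset name to its mode
    input_name="raw_data",
)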
def test_catalog_extraction_missing_inference_input(pipeline_inference_dummy):

    catalog = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet()
    })
    # "model" is missing in the catalog
    with pytest.raises(
            KedroPipelineModelError,
            match="since it is the input of the pipeline",
    ):
        KedroPipelineModel(
            pipeline=pipeline_inference_dummy,
            catalog=catalog,
            input_name="raw_data",
        )
def test_model_packaging_too_many_artifacts(tmp_path,
                                            pipeline_inference_dummy):

    catalog = DataCatalog({
        "raw_data": PickleDataSet(
            filepath=(tmp_path / "raw_data.pkl").resolve().as_posix()),
        "data": MemoryDataSet(),
        "model": PickleDataSet(
            filepath=(tmp_path / "model.pkl").resolve().as_posix()),
    })

    catalog._data_sets["raw_data"].save(1)  # emulate input on disk
    catalog._data_sets["model"].save(2)  # emulate model fitting

    # the input is persisted, so it is packaged as an artifact too
    artifacts = {
        # resolve to an absolute file URI: converting a PurePosixPath directly
        # on Windows yields a path that is treated as relative
        name: Path(dataset._filepath.as_posix()).resolve().as_uri()
        for name, dataset in catalog._data_sets.items()
        if not isinstance(dataset, MemoryDataSet)
    }

    kedro_model = KedroPipelineModel(pipeline=pipeline_inference_dummy,
                                     catalog=catalog,
                                     input_name="raw_data")

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=artifacts,
            conda_env={
                "python": "3.7.0",
                "dependencies": ["kedro==0.16.5"]
            },
        )
        run_id = mlflow.active_run().info.run_id

    with pytest.raises(
            ValueError,
            match="Provided artifacts do not match catalog entries"):
        mlflow.pyfunc.load_model(model_uri=f"runs:/{run_id}/model")
def test_catalog_extraction_unpersisted_inference_input(
        pipeline_inference_dummy):
    catalog = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "model": MemoryDataSet()
    })

    # "model" is a MemoryDataSet in the catalog
    with pytest.raises(
            KedroPipelineModelError,
            match="The datasets of the training pipeline must be persisted locally",
    ):
        KedroPipelineModel(
            pipeline=pipeline_inference_dummy,
            catalog=catalog,
            input_name="raw_data",
        )
Example #17
    def after_pipeline_run(
        self,
        run_params: Dict[str, Any],
        pipeline: Pipeline,
        catalog: DataCatalog,
    ) -> None:
        """Hook to be invoked after a pipeline runs.
        Args:
            run_params: The params needed for the given run.
                Should be identical to the data logged by Journal.
                # @fixme: this needs to be modelled explicitly as code, instead of comment
                Schema: {
                    "run_id": str,
                    "project_path": str,
                    "env": str,
                    "kedro_version": str,
                    "tags": Optional[List[str]],
                    "from_nodes": Optional[List[str]],
                    "to_nodes": Optional[List[str]],
                    "node_names": Optional[List[str]],
                    "from_inputs": Optional[List[str]],
                    "load_versions": Optional[List[str]],
                    "pipeline_name": str,
                    "extra_params": Optional[Dict[str, Any]],
                }
            pipeline: The ``Pipeline`` that was run.
            catalog: The ``DataCatalog`` used during the run.
        """

        if isinstance(pipeline, PipelineML):
            pipeline_catalog = pipeline.extract_pipeline_catalog(catalog)
            artifacts = pipeline.extract_pipeline_artifacts(pipeline_catalog)
            mlflow.pyfunc.log_model(
                artifact_path=self.model_name,
                python_model=KedroPipelineModel(pipeline_ml=pipeline,
                                                catalog=pipeline_catalog),
                artifacts=artifacts,
                conda_env=self.conda_env,
            )
        # Close the mlflow active run at the end of the pipeline to avoid interactions with further runs
        mlflow.end_run()
def test_kedro_pipeline_model_too_many_outputs():

    catalog = DataCatalog({
        "data": MemoryDataSet(),
        "predictions": MemoryDataSet(),
        "metrics": MemoryDataSet(),
    })

    def predict_and_evaluate(data):
        return 1, 1

    pipeline = Pipeline([
        node(
            func=predict_and_evaluate,
            inputs={"data": "data"},
            outputs=["predictions", "metrics"],
        ),
    ])

    with pytest.raises(ValueError,
                       match="Pipeline must have one and only one output"):
        KedroPipelineModel(pipeline, catalog, input_name="data")
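# A minimal counter-example to the failure above, reusing the catalog from the
# test: dropping "metrics" from the outputs satisfies the one-output constraint
# (names are illustrative).
def predict_only(data):
    return 1

single_output_pipeline = Pipeline([
    node(func=predict_only, inputs={"data": "data"}, outputs="predictions"),
])
# constructing the model no longer raises
KedroPipelineModel(single_output_pipeline, catalog, input_name="data")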
Example #19
def modelify(
    # ctx,
    pipeline_name: str,
    input_name: str,
    flag_infer_signature: Optional[bool],
    flag_infer_input_example: Optional[bool],
    run_id: Optional[str],
    copy_mode: Optional[Union[str, Dict[str, str]]],
    artifact_path: str,
    code_path: str,
    conda_env: str,
    registered_model_name: str,
    await_registration_for: int,
    pip_requirements: str,
    extra_pip_requirements: str,
):
    """Export a kedro pipeline as a mlflow model for serving"""
    # if the command is available, we are necessarily at the root of a kedro project

    project_path = Path.cwd()
    bootstrap_project(project_path)
    with KedroSession.create(project_path=project_path) as session:
        # "pipeline" is the Pipeline object you want to convert to a mlflow model
        pipeline = pipelines[pipeline_name]
        # triggers config setup with the after_context_created hook
        context = session.load_context()
        catalog = context.catalog

        if input_name not in pipeline.inputs():
            valid_inputs = "\n - ".join(pipeline.inputs())
            raise ValueError(
                f"'{input_name}' is not a valid 'input_name', it must be an input of 'pipeline', i.e. one of: \n - {valid_inputs}"
            )
        # artifacts are all the inputs of the inference pipeline that are persisted in the catalog

        # (optional) get the schema of the input dataset
        model_signature = None
        if flag_infer_signature:
            input_data = catalog.load(input_name)
            model_signature = infer_signature(model_input=input_data)

        input_example = None
        if flag_infer_input_example:
            if flag_infer_signature is False:
                # otherwise the data has already been loaded for the signature
                input_data = catalog.load(input_name)
            # iloc[0:1] keeps a DataFrame; iloc[0] returns a Series, which mlflow rejects
            input_example = input_data.iloc[0:1, :]

        with TemporaryDirectory() as tmp_dir:
            # you can optionally pass other arguments, like the "copy_mode" to use for each dataset
            kedro_pipeline_model = KedroPipelineModel(
                pipeline=pipeline,
                catalog=catalog,
                input_name=input_name,
                copy_mode=copy_mode,
                # add runner option
            )

            artifacts = kedro_pipeline_model.extract_pipeline_artifacts(
                Path(tmp_dir))

            if conda_env is None:
                conda_env = {
                    "python": "3.7.0",
                    "dependencies": ["kedro==0.16.5"]
                }

            log_model_kwargs = dict(
                artifact_path=artifact_path,
                python_model=kedro_pipeline_model,
                artifacts=artifacts,
                code_path=code_path,
                conda_env=conda_env,
                signature=model_signature,
                input_example=input_example,
                registered_model_name=registered_model_name,
                await_registration_for=await_registration_for,
            )
            if version.parse(f"{mlflow.__version__}") >= version.parse(
                    "1.20.0"):
                log_model_kwargs["pip_requirements"] = pip_requirements
                log_model_kwargs[
                    "extra_pip_requirements"] = extra_pip_requirements

            with mlflow.start_run(run_id=run_id):
                mlflow.pyfunc.log_model(**log_model_kwargs)
                run_id = mlflow.active_run().info.run_id
                LOGGER.info(f"Model successfully logged in run '{run_id}'")
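# A hypothetical invocation of modelify, assuming the project registers an
# "inference" pipeline whose input dataset is named "raw_data"; every other
# argument is left at a permissive value for the sketch.
modelify(
    pipeline_name="inference",
    input_name="raw_data",
    flag_infer_signature=True,
    flag_infer_input_example=True,
    run_id=None,
    copy_mode="deepcopy",
    artifact_path="model",
    code_path=None,
    conda_env=None,
    registered_model_name=None,
    await_registration_for=0,
    pip_requirements=None,
    extra_pip_requirements=None,
)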
def test_catalog_extraction(pipeline, catalog, input_name, result):
    kedro_pipeline_model = KedroPipelineModel(pipeline=pipeline,
                                              catalog=catalog,
                                              input_name=input_name)
    filtered_catalog = kedro_pipeline_model.initial_catalog
    assert set(filtered_catalog.list()) == result