def test_get_mlflow_config_in_uninitialized_project(kedro_project):
    # config_with_base_mlflow_conf is a pytest.fixture in conftest
    with pytest.raises(
        KedroMlflowConfigError,
        match="No 'mlflow.yml' config file found in environment",
    ):
        project_metadata = _get_project_metadata(kedro_project)
        _add_src_to_path(project_metadata.source_dir, kedro_project)
        configure_project(project_metadata.package_name)
        with KedroSession.create(project_metadata.package_name, kedro_project):
            get_mlflow_config()
def test_get_mlflow_config_in_uninitialized_project(mocker, tmp_path, config_dir):
    # config_with_base_mlflow_conf is a pytest.fixture in conftest
    mocker.patch("logging.config.dictConfig")
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    context = load_context(tmp_path)
    with pytest.raises(
        KedroMlflowConfigError,
        match="No 'mlflow.yml' config file found in environment",
    ):
        get_mlflow_config(context)
def before_pipeline_run(
    self, run_params: Dict[str, Any], pipeline: Pipeline, catalog: DataCatalog
) -> None:
    """Hook to be invoked before a pipeline runs.

    Args:
        run_params: The params needed for the given run.
            Should be identical to the data logged by Journal.
            # @fixme: this needs to be modelled explicitly as code, instead of comment
            Schema: {
                "run_id": str,
                "project_path": str,
                "env": str,
                "kedro_version": str,
                "tags": Optional[List[str]],
                "from_nodes": Optional[List[str]],
                "to_nodes": Optional[List[str]],
                "node_names": Optional[List[str]],
                "from_inputs": Optional[List[str]],
                "load_versions": Optional[List[str]],
                "pipeline_name": str,
                "extra_params": Optional[Dict[str, Any]],
            }
        pipeline: The ``Pipeline`` that will be run.
        catalog: The ``DataCatalog`` to be used during the run.
    """
    self.context = load_context(
        project_path=run_params["project_path"],
        env=run_params["env"],
        extra_params=run_params["extra_params"],
    )
    mlflow_conf = get_mlflow_config(self.context)
    mlflow_conf.setup(self.context)

    run_name = (
        mlflow_conf.run_opts["name"]
        if mlflow_conf.run_opts["name"] is not None
        else run_params["pipeline_name"]
    )
    mlflow.start_run(
        run_id=mlflow_conf.run_opts["id"],
        experiment_id=mlflow_conf.experiment.experiment_id,
        run_name=run_name,
        nested=mlflow_conf.run_opts["nested"],
    )
    # Set tags only for run parameters that have values.
    mlflow.set_tags({k: v for k, v in run_params.items() if v})
    # manually add the git sha for consistency with the journal
    # TODO: this does not take uncommitted files into account, so it
    # does not ensure reproducibility. Define what to do.
    mlflow.set_tag("git_sha", _git_sha(run_params["project_path"]))
    mlflow.set_tag(
        "kedro_command",
        _generate_kedro_command(
            tags=run_params["tags"],
            node_names=run_params["node_names"],
            from_nodes=run_params["from_nodes"],
            to_nodes=run_params["to_nodes"],
            from_inputs=run_params["from_inputs"],
            load_versions=run_params["load_versions"],
            pipeline_name=run_params["pipeline_name"],
        ),
    )
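# A minimal sketch of what the docstring's "@fixme" above asks for: modelling the
# run_params schema explicitly as code instead of a comment. The class name
# `PipelineRunParams` is a hypothetical illustration, not part of kedro's API.
from typing import Any, Dict, List, Optional, TypedDict  # TypedDict: Python 3.8+


class PipelineRunParams(TypedDict):
    run_id: str
    project_path: str
    env: str
    kedro_version: str
    tags: Optional[List[str]]
    from_nodes: Optional[List[str]]
    to_nodes: Optional[List[str]]
    node_names: Optional[List[str]]
    from_inputs: Optional[List[str]]
    load_versions: Optional[List[str]]
    pipeline_name: str
    extra_params: Optional[Dict[str, Any]]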
def test_mlflow_config_with_templated_config_loader(kedro_project_with_tcl):
    _write_yaml(
        kedro_project_with_tcl / "conf" / "local" / "mlflow.yml",
        dict(
            mlflow_tracking_uri="${mlflow_tracking_uri}",
            credentials=None,
            disable_tracking=dict(pipelines=["my_disabled_pipeline"]),
            experiment=dict(name="fake_package", create=True),
            run=dict(id="123456789", name="my_run", nested=True),
            ui=dict(port="5151", host="localhost"),
            hooks=dict(
                node=dict(
                    flatten_dict_params=True,
                    recursive=False,
                    sep="-",
                    long_parameters_strategy="truncate",
                )
            ),
        ),
    )

    _write_yaml(
        kedro_project_with_tcl / "conf" / "local" / "globals.yml",
        dict(mlflow_tracking_uri="dynamic_mlruns"),
    )

    expected = {
        "mlflow_tracking_uri": (kedro_project_with_tcl / "dynamic_mlruns").as_uri(),
        "credentials": None,
        "disable_tracking": {"pipelines": ["my_disabled_pipeline"]},
        "experiments": {"name": "fake_package", "create": True},
        "run": {"id": "123456789", "name": "my_run", "nested": True},
        "ui": {"port": "5151", "host": "localhost"},
        "hooks": {
            "node": {
                "flatten_dict_params": True,
                "recursive": False,
                "sep": "-",
                "long_parameters_strategy": "truncate",
            }
        },
    }

    project_metadata = _get_project_metadata(kedro_project_with_tcl)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_tcl)
    configure_project(project_metadata.package_name)
    with KedroSession.create(project_metadata.package_name, kedro_project_with_tcl):
        assert get_mlflow_config().to_dict() == expected
def ui(env, port, host):
    """Opens the mlflow user interface with the project-specific settings of
    mlflow.yml. This interface enables you to browse and compare runs.
    """
    project_path = Path().cwd()
    project_metadata = _get_project_metadata(project_path)
    _add_src_to_path(project_metadata.source_dir, project_path)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=project_path,
        env=env,
    ):
        mlflow_conf = get_mlflow_config()
        host = host or mlflow_conf.ui_opts.get("host")
        port = port or mlflow_conf.ui_opts.get("port")

        # call mlflow ui with specific options
        # TODO: add more options for ui
        subprocess.call(
            [
                "mlflow",
                "ui",
                "--backend-store-uri",
                mlflow_conf.mlflow_tracking_uri,
                "--host",
                host,
                "--port",
                port,
            ]
        )
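# Usage sketch (option names are assumed from the command signature above and the
# ui section of mlflow.yml, not verified against a specific kedro-mlflow release):
# run from the project root; omitted options fall back to the host/port in mlflow.yml.
#
#     kedro mlflow ui --env local --host 0.0.0.0 --port 5151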
def before_pipeline_run(
    self, run_params: Dict[str, Any], pipeline: Pipeline, catalog: DataCatalog
) -> None:
    """Hook to be invoked before a pipeline runs.

    Args:
        run_params: The params needed for the given run.
            Should be identical to the data logged by Journal.
            # @fixme: this needs to be modelled explicitly as code, instead of comment
            Schema: {
                "run_id": str,
                "project_path": str,
                "env": str,
                "kedro_version": str,
                "tags": Optional[List[str]],
                "from_nodes": Optional[List[str]],
                "to_nodes": Optional[List[str]],
                "node_names": Optional[List[str]],
                "from_inputs": Optional[List[str]],
                "load_versions": Optional[List[str]],
                "pipeline_name": str,
                "extra_params": Optional[Dict[str, Any]],
            }
        pipeline: The ``Pipeline`` that will be run.
        catalog: The ``DataCatalog`` to be used during the run.
    """
    self.context = load_context(
        project_path=run_params["project_path"],
        env=run_params["env"],
        extra_params=run_params["extra_params"],
    )
    config = get_mlflow_config(self.context)
    self.flatten = config.node_hook_opts["flatten_dict_params"]
    self.recursive = config.node_hook_opts["recursive"]
    self.sep = config.node_hook_opts["sep"]
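# A minimal sketch of the behaviour the `flatten_dict_params`, `recursive` and `sep`
# options configure for parameter logging. The helper name `_flatten_dict` is
# illustrative only and does not claim to match kedro-mlflow's implementation.
from typing import Any, Dict


def _flatten_dict(d: Dict[str, Any], recursive: bool, sep: str) -> Dict[str, Any]:
    flat: Dict[str, Any] = {}
    for key, value in d.items():
        if isinstance(value, dict):
            # flatten one level only, or all levels when recursive=True
            children = _flatten_dict(value, recursive, sep) if recursive else value
            for child_key, child_value in children.items():
                flat[f"{key}{sep}{child_key}"] = child_value
        else:
            flat[key] = value
    return flat


# e.g. _flatten_dict({"model": {"lr": 0.1}}, recursive=True, sep="-")
# returns {"model-lr": 0.1}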
def test_deactivated_tracking_for_given_pipeline(
    mock_settings_with_mlflow_hooks,
    patched_configure_project,
    mocker,
    kedro_project_path,
):
    mocker.patch("kedro.framework.session.session.KedroSession._setup_logging")
    with KedroSession.create(MOCK_PACKAGE_NAME, kedro_project_path) as session:
        kedro_mlflow_config = get_mlflow_config()
        kedro_mlflow_config.setup()
        mlflow_client = MlflowClient((kedro_project_path / "mlruns").as_uri())
        # 0 is default, 1 is "fake_exp"
        all_runs_id_beginning = set(
            [
                run.run_id
                for k in range(len(mlflow_client.list_experiments()))
                for run in mlflow_client.list_run_infos(experiment_id=f"{k}")
            ]
        )
        context = session.load_context()
        context.run(pipeline_name="pipeline_off")
        all_runs_id_end = set(
            [
                run.run_id
                for k in range(len(mlflow_client.list_experiments()))
                for run in mlflow_client.list_run_infos(experiment_id=f"{k}")
            ]
        )
        assert all_runs_id_beginning == all_runs_id_end  # no run is created
def mlflow_start(ctx, kubeflow_run_id: str):
    from kedro_mlflow.framework.context import get_mlflow_config
    import mlflow

    mlflow_conf = get_mlflow_config(ctx.obj["kedro_ctx"])
    mlflow_conf.setup(ctx.obj["kedro_ctx"])

    run = mlflow.start_run(
        experiment_id=mlflow_conf.experiment.experiment_id, nested=False
    )
    mlflow.set_tag("kubeflow_run_id", kubeflow_run_id)
    with open("/tmp/mlflow_run_id", "w") as f:
        f.write(run.info.run_id)
    click.echo(f"Started run: {run.info.run_id}")
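# A sketch of how a downstream Kubeflow step could reattach to the run started above,
# assuming it shares the /tmp/mlflow_run_id file and the same tracking configuration.
# The tag name "kubeflow_step" is a hypothetical illustration.
import mlflow

with open("/tmp/mlflow_run_id") as f:
    run_id = f.read().strip()

# resume the existing run instead of creating a new one
with mlflow.start_run(run_id=run_id):
    mlflow.set_tag("kubeflow_step", "training")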
def test_mlflow_config_with_templated_config(mocker, tmp_path, config_dir):
    _write_yaml(
        tmp_path / "conf" / "base" / "mlflow.yml",
        dict(
            mlflow_tracking_uri="${mlflow_tracking_uri}",
            credentials=None,
            experiment=dict(name="fake_package", create=True),
            run=dict(id="123456789", name="my_run", nested=True),
            ui=dict(port="5151", host="localhost"),
            hooks=dict(
                node=dict(
                    flatten_dict_params=True,
                    recursive=False,
                    sep="-",
                    long_parameters_strategy="truncate",
                )
            ),
        ),
    )

    _write_yaml(
        tmp_path / "conf" / "base" / "globals.yml",
        dict(mlflow_tracking_uri="testruns"),
    )

    expected = {
        "mlflow_tracking_uri": (tmp_path / "testruns").as_uri(),
        "credentials": None,
        "experiments": {"name": "fake_package", "create": True},
        "run": {"id": "123456789", "name": "my_run", "nested": True},
        "ui": {"port": "5151", "host": "localhost"},
        "hooks": {
            "node": {
                "flatten_dict_params": True,
                "recursive": False,
                "sep": "-",
                "long_parameters_strategy": "truncate",
            }
        },
    }

    context = load_context(tmp_path)
    assert get_mlflow_config(context).to_dict() == expected
def ui(project_path, env):
    """Opens the mlflow user interface with the project-specific settings of
    mlflow.yml. This interface enables you to browse and compare runs.
    """
    # the context must contain the self.mlflow attributes with the mlflow configuration
    mlflow_conf = get_mlflow_config(project_path=project_path, env=env)

    # call mlflow ui with specific options
    # TODO: add more options for ui
    subprocess.call(
        ["mlflow", "ui", "--backend-store-uri", mlflow_conf.mlflow_tracking_uri]
    )
def test_get_mlflow_config(mocker, tmp_path, config_dir):
    # config_with_base_mlflow_conf is a pytest.fixture in conftest
    mocker.patch("logging.config.dictConfig")
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    _write_yaml(
        tmp_path / "conf" / "base" / "mlflow.yml",
        dict(
            mlflow_tracking_uri="mlruns",
            credentials=None,
            experiment=dict(name="fake_package", create=True),
            run=dict(id="123456789", name="my_run", nested=True),
            ui=dict(port="5151", host="localhost"),
            hooks=dict(
                node=dict(
                    flatten_dict_params=True,
                    recursive=False,
                    sep="-",
                    long_parameters_strategy="truncate",
                )
            ),
        ),
    )
    expected = {
        "mlflow_tracking_uri": (tmp_path / "mlruns").as_uri(),
        "credentials": None,
        "experiments": {"name": "fake_package", "create": True},
        "run": {"id": "123456789", "name": "my_run", "nested": True},
        "ui": {"port": "5151", "host": "localhost"},
        "hooks": {
            "node": {
                "flatten_dict_params": True,
                "recursive": False,
                "sep": "-",
                "long_parameters_strategy": "truncate",
            }
        },
    }
    context = load_context(tmp_path)
    assert get_mlflow_config(context).to_dict() == expected
def mlflow_start(ctx, kubeflow_run_id: str, output: str):
    import mlflow
    from kedro_mlflow.framework.context import get_mlflow_config

    token = AuthHandler().obtain_id_token()
    if token:
        os.environ["MLFLOW_TRACKING_TOKEN"] = token
        LOG.info("Configuring MLFLOW_TRACKING_TOKEN")

    kedro_context = ctx.obj["context_helper"].context
    mlflow_conf = get_mlflow_config(kedro_context)
    mlflow_conf.setup(kedro_context)

    run = mlflow.start_run(
        experiment_id=mlflow_conf.experiment.experiment_id, nested=False
    )
    mlflow.set_tag("kubeflow_run_id", kubeflow_run_id)
    with open(output, "w") as f:
        f.write(run.info.run_id)
    click.echo(f"Started run: {run.info.run_id}")
def test_cli_init_existing_config(monkeypatch, tmp_path, kedro_project):
    # "kedro_project" is a pytest.fixture declared in conftest
    project_path = tmp_path / "fake-project"
    monkeypatch.chdir(project_path)
    cli_runner = CliRunner()
    project_context = load_context(project_path.as_posix())

    # emulate a first call by writing a mlflow.yml file
    yaml_str = yaml.dump(dict(mlflow_tracking_uri="toto"))
    (project_path / project_context.CONF_ROOT / "local/mlflow.yml").write_text(yaml_str)

    result = cli_runner.invoke(cli_init)

    # check an error message is raised
    assert "A 'mlflow.yml' already exists" in result.output

    # check the file remains unmodified
    assert get_mlflow_config(project_context).mlflow_tracking_uri.endswith("toto")
def test_mlflow_pipeline_hook_with_different_pipeline_types(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    env_from_dict,
    pipeline_to_run,
    dummy_catalog,
    dummy_run_params,
    dummy_mlflow_conf,
):
    # config_with_base_mlflow_conf is a conftest fixture
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)
    pipeline_hook = MlflowPipelineHook(conda_env=env_from_dict, model_name="model")
    runner = SequentialRunner()
    pipeline_hook.before_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )
    runner.run(pipeline_to_run, dummy_catalog, dummy_run_params["run_id"])
    run_id = mlflow.active_run().info.run_id
    pipeline_hook.after_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )

    # test: parameters should have been logged
    mlflow_conf = get_mlflow_config(tmp_path)
    mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
    run_data = mlflow_client.get_run(run_id).data

    # all run_params are recorded as tags
    for k, v in dummy_run_params.items():
        if v:
            assert run_data.tags[k] == str(v)

    # params are not recorded because we don't have MlflowNodeHook here
    # and the model should not be logged when it is not a PipelineML
    nb_artifacts = len(mlflow_client.list_artifacts(run_id))
    if isinstance(pipeline_to_run, PipelineML):
        assert nb_artifacts == 1
    else:
        assert nb_artifacts == 0
def test_cli_init_existing_config(monkeypatch, kedro_project_with_mlflow_conf):
    # "kedro_project" is a pytest.fixture declared in conftest
    cli_runner = CliRunner()
    monkeypatch.chdir(kedro_project_with_mlflow_conf)
    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        "fake_project", project_path=kedro_project_with_mlflow_conf
    ) as session:
        context = session.load_context()

        # emulate a first call by writing a mlflow.yml file
        yaml_str = yaml.dump(dict(mlflow_tracking_uri="toto"))
        (
            kedro_project_with_mlflow_conf / context.CONF_ROOT / "local" / "mlflow.yml"
        ).write_text(yaml_str)

        result = cli_runner.invoke(cli_init)

        # check an error message is raised
        assert "A 'mlflow.yml' already exists" in result.output

        # check the file remains unmodified
        assert get_mlflow_config().mlflow_tracking_uri.endswith("toto")
def test_mlflow_pipeline_hook_metrics_with_run_id(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    dummy_pipeline_ml,
    dummy_run_params,
    dummy_mlflow_conf,
):
    # config_with_base_mlflow_conf is a conftest fixture
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)

    context = load_context(tmp_path)
    mlflow_conf = get_mlflow_config(context)
    mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)

    with mlflow.start_run():
        existing_run_id = mlflow.active_run().info.run_id

    dummy_catalog_with_run_id = DataCatalog(
        {
            "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
            "params:unused_param": MemoryDataSet("blah"),
            "data": MemoryDataSet(),
            "model": PickleDataSet((tmp_path / "model.csv").as_posix()),
            "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
            "another_metrics": MlflowMetricsDataSet(
                run_id=existing_run_id, prefix="foo"
            ),
        }
    )

    pipeline_hook = MlflowPipelineHook()

    runner = SequentialRunner()
    pipeline_hook.after_catalog_created(
        catalog=dummy_catalog_with_run_id,
        # `after_catalog_created` is not using any of the arguments below,
        # so we are setting them to empty values.
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )
    pipeline_hook.before_pipeline_run(
        run_params=dummy_run_params,
        pipeline=dummy_pipeline_ml,
        catalog=dummy_catalog_with_run_id,
    )
    runner.run(dummy_pipeline_ml, dummy_catalog_with_run_id)

    current_run_id = mlflow.active_run().info.run_id

    pipeline_hook.after_pipeline_run(
        run_params=dummy_run_params,
        pipeline=dummy_pipeline_ml,
        catalog=dummy_catalog_with_run_id,
    )

    mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
    all_runs_id = set(
        [run.run_id for run in mlflow_client.list_run_infos(experiment_id="0")]
    )

    # the metrics are supposed to have been logged inside existing_run_id
    run_data = mlflow_client.get_run(existing_run_id).data

    # check that the metrics datasets are prefixed with their names
    assert all_runs_id == {current_run_id, existing_run_id}
    assert run_data.metrics["my_metrics.metric_key"] == 1.1
    assert run_data.metrics["foo.metric_key"] == 1.1
def test_mlflow_pipeline_hook_metrics_with_run_id(
    kedro_project_with_mlflow_conf, dummy_pipeline_ml, dummy_run_params
):
    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=kedro_project_with_mlflow_conf,
    ):
        mlflow_conf = get_mlflow_config()
        mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)

        with mlflow.start_run():
            existing_run_id = mlflow.active_run().info.run_id

        dummy_catalog_with_run_id = DataCatalog(
            {
                "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
                "params:unused_param": MemoryDataSet("blah"),
                "data": MemoryDataSet(),
                "model": PickleDataSet(
                    (kedro_project_with_mlflow_conf / "data" / "model.csv").as_posix()
                ),
                "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
                "another_metrics": MlflowMetricsDataSet(
                    run_id=existing_run_id, prefix="foo"
                ),
            }
        )

        pipeline_hook = MlflowPipelineHook()

        runner = SequentialRunner()
        pipeline_hook.after_catalog_created(
            catalog=dummy_catalog_with_run_id,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
            run_id=dummy_run_params["run_id"],
        )
        pipeline_hook.before_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline_ml,
            catalog=dummy_catalog_with_run_id,
        )
        runner.run(dummy_pipeline_ml, dummy_catalog_with_run_id)

        current_run_id = mlflow.active_run().info.run_id

        pipeline_hook.after_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline_ml,
            catalog=dummy_catalog_with_run_id,
        )

        mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
        # the first run is created in Default (id 0),
        # but the one initialised in before_pipeline_run
        # is created in the kedro_project experiment (id 1)
        all_runs_id = set(
            [
                run.run_id
                for k in range(2)
                for run in mlflow_client.list_run_infos(experiment_id=f"{k}")
            ]
        )

        # the metrics are supposed to have been logged inside existing_run_id
        run_data = mlflow_client.get_run(existing_run_id).data

        # check that the metrics datasets are prefixed with their names
        assert all_runs_id == {current_run_id, existing_run_id}
        assert run_data.metrics["my_metrics.metric_key"] == 1.1
        assert run_data.metrics["foo.metric_key"] == 1.1
def test_mlflow_pipeline_hook_with_different_pipeline_types(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    env_from_dict,
    pipeline_to_run,
    dummy_catalog,
    dummy_run_params,
    dummy_mlflow_conf,
):
    # config_with_base_mlflow_conf is a conftest fixture
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)
    pipeline_hook = MlflowPipelineHook()
    runner = SequentialRunner()
    pipeline_hook.after_catalog_created(
        catalog=dummy_catalog,
        # `after_catalog_created` is not using any of the arguments below,
        # so we are setting them to empty values.
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )
    pipeline_hook.before_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )
    runner.run(pipeline_to_run, dummy_catalog)
    run_id = mlflow.active_run().info.run_id
    pipeline_hook.after_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )

    # test: parameters should have been logged
    context = load_context(tmp_path)
    mlflow_conf = get_mlflow_config(context)
    mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
    run_data = mlflow_client.get_run(run_id).data

    # all run_params are recorded as tags
    for k, v in dummy_run_params.items():
        if v:
            assert run_data.tags[k] == str(v)

    # params are not recorded because we don't have MlflowNodeHook here
    # and the model should not be logged when it is not a PipelineML
    nb_artifacts = len(mlflow_client.list_artifacts(run_id))
    if isinstance(pipeline_to_run, PipelineML):
        assert nb_artifacts == 1
    else:
        assert nb_artifacts == 0

    # check that the metrics datasets are prefixed with their names
    assert dummy_catalog._data_sets["my_metrics"]._prefix == "my_metrics"
    assert dummy_catalog._data_sets["another_metrics"]._prefix == "foo"

    if isinstance(pipeline_to_run, PipelineML):
        trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
        assert trained_model.metadata.signature.to_dict() == {
            "inputs": '[{"name": "a", "type": "long"}]',
            "outputs": None,
        }
def test_mlflow_pipeline_hook_save_pipeline_ml_with_parameters(
    mocker,
    monkeypatch,
    config_dir,  # a fixture to be in a kedro project
    dummy_mlflow_conf,  # a fixture to setup the mlflow configuration
    tmp_path,
    pipeline_ml_with_parameters,
    dummy_run_params,
):
    # config_with_base_mlflow_conf is a conftest fixture
    monkeypatch.chdir(tmp_path)

    context = load_context(tmp_path)
    mlflow_conf = get_mlflow_config(context)
    mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)

    catalog_with_parameters = DataCatalog(
        {
            "data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
            "cleaned_data": MemoryDataSet(),
            "params:stopwords": MemoryDataSet(["Hello", "Hi"]),
            "params:penalty": MemoryDataSet(0.1),
            "model": PickleDataSet((tmp_path / "model.csv").as_posix()),
            "params:threshold": MemoryDataSet(0.5),
        }
    )

    pipeline_hook = MlflowPipelineHook()

    runner = SequentialRunner()
    pipeline_hook.after_catalog_created(
        catalog=catalog_with_parameters,
        # `after_catalog_created` is not using any of the arguments below,
        # so we are setting them to empty values.
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )
    pipeline_hook.before_pipeline_run(
        run_params=dummy_run_params,
        pipeline=pipeline_ml_with_parameters,
        catalog=catalog_with_parameters,
    )
    runner.run(pipeline_ml_with_parameters, catalog_with_parameters)

    current_run_id = mlflow.active_run().info.run_id

    # This is what we want to test: the model must be saved and the parameters
    # automatically persisted on disk
    pipeline_hook.after_pipeline_run(
        run_params=dummy_run_params,
        pipeline=pipeline_ml_with_parameters,
        catalog=catalog_with_parameters,
    )

    # the 2 parameters which are inputs of the inference pipeline
    # must have been persisted and logged inside the model's artifacts
    model = mlflow.pyfunc.load_model(f"runs:/{current_run_id}/model")
    assert set(
        model.metadata.to_dict()["flavors"]["python_function"]["artifacts"].keys()
    ) == {"model", "params:stopwords", "params:threshold"}

    # the model should be loadable and predict() should work (this tests KedroPipelineModel)
    assert model.predict(pd.DataFrame(data=[1], columns=["a"])).values[0][0] == 1