Example #1
def test_kedro_mlflow_config_experiment_was_deleted(kedro_project_with_mlflow_conf):

    # create an experiment with the same name and then delete it
    mlflow_tracking_uri = (kedro_project_with_mlflow_conf / "mlruns").as_uri()
    mlflow_client = MlflowClient(mlflow_tracking_uri)
    mlflow_client.create_experiment("exp1")
    mlflow_client.delete_experiment(
        mlflow_client.get_experiment_by_name("exp1").experiment_id
    )

    # the config must properly restore the experiment
    config = KedroMlflowConfig(
        project_path=kedro_project_with_mlflow_conf,
        mlflow_tracking_uri="mlruns",
        experiment_opts=dict(name="exp1"),
    )

    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        "fake_project", project_path=kedro_project_with_mlflow_conf
    ):
        config.setup()

    assert "exp1" in [exp.name for exp in config.mlflow_client.list_experiments()]
Example #2
def test_pipeline_run_hook_getting_configs(
    kedro_project,
    dummy_run_params,
    dummy_pipeline,
    dummy_catalog,
):

    _write_yaml(
        kedro_project / "conf" / "local" / "mlflow.yml",
        dict(
            hooks=dict(node=dict(flatten_dict_params=True, recursive=False, sep="-"))
        ),
    )

    project_metadata = _get_project_metadata(kedro_project)
    _add_src_to_path(project_metadata.source_dir, kedro_project)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
            package_name=project_metadata.package_name,
            project_path=kedro_project,
    ):
        mlflow_node_hook = MlflowNodeHook()
        mlflow_node_hook.before_pipeline_run(run_params=dummy_run_params,
                                             pipeline=dummy_pipeline,
                                             catalog=dummy_catalog)

        assert (
            mlflow_node_hook.flatten,
            mlflow_node_hook.recursive,
            mlflow_node_hook.sep,
        ) == (True, False, "-")
Example #3
def test_kedro_mlflow_config_setup_tracking_priority(kedro_project_with_mlflow_conf):
    """Test if the mlflow_tracking uri set is the one of mlflow.yml
    if it also eist in credentials.

    Args:
        mocker ([type]): [description]
        tmp_path ([type]): [description]
    """
    # create a ".kedro.yml" file to identify "tmp_path" as the root of a kedro project

    (kedro_project_with_mlflow_conf / "conf/base/credentials.yml").write_text(
        yaml.dump(dict(my_mlflow_creds=dict(mlflow_tracking_uri="mlruns2")))
    )

    config = KedroMlflowConfig(
        project_path=kedro_project_with_mlflow_conf,
        mlflow_tracking_uri="mlruns1",
        credentials="my_mlflow_creds",
    )

    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        "fake_project", project_path=kedro_project_with_mlflow_conf
    ):
        config.setup()

    assert (
        mlflow.get_tracking_uri()
        == (kedro_project_with_mlflow_conf / "mlruns1").as_uri()
    )
Example #4
def test_mlflow_pipeline_hook_with_copy_mode(
    kedro_project_with_mlflow_conf,
    dummy_pipeline_ml,
    dummy_catalog,
    dummy_run_params,
    copy_mode,
    expected,
):
    # kedro_project_with_mlflow_conf is a conftest fixture
    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=kedro_project_with_mlflow_conf,
    ):

        pipeline_hook = MlflowPipelineHook()
        runner = SequentialRunner()

        pipeline_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` does not use any of the arguments below,
            # so we set them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
            run_id=dummy_run_params["run_id"],
        )

        pipeline_to_run = pipeline_ml_factory(
            training=dummy_pipeline_ml.training,
            inference=dummy_pipeline_ml.inference,
            input_name=dummy_pipeline_ml.input_name,
            conda_env={},
            model_name=dummy_pipeline_ml.model_name,
            copy_mode=copy_mode,
        )
        pipeline_hook.before_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )
        runner.run(pipeline_to_run, dummy_catalog)
        run_id = mlflow.active_run().info.run_id
        pipeline_hook.after_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )

        mlflow_tracking_uri = (kedro_project_with_mlflow_conf / "mlruns").as_uri()
        mlflow.set_tracking_uri(mlflow_tracking_uri)

        loaded_model = mlflow.pyfunc.load_model(model_uri=f"runs:/{run_id}/model")

        actual_copy_mode = {
            name: ds._copy_mode
            for name, ds in loaded_model._model_impl.python_model.loaded_catalog._data_sets.items()
        }

        assert actual_copy_mode == expected
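The copy_mode argument mirrors the option of kedro's MemoryDataSet, which controls how data is handed between nodes. A small illustration of what each mode means (the parametrized `expected` fixture is defined elsewhere and not shown here):

from kedro.io import MemoryDataSet

data = [1, 2, 3]
# copy_mode is one of "deepcopy" (the default for most data), "copy" or "assign"
dataset = MemoryDataSet(data=data, copy_mode="assign")
assert dataset.load() is data  # "assign" hands back the very same object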
Example #5
def main():  # pragma: no cover
    """Main entry point. Look for a ``cli.py``, and, if found, add its
    commands to `kedro`'s before invoking the CLI.
    """
    _init_plugins()

    global_groups = [cli]
    global_groups.extend(load_entry_points("global"))
    project_groups = []
    cli_context = dict()

    path = Path.cwd()
    if _is_project(path):
        # load project commands from cli.py
        metadata = _get_project_metadata(path)
        cli_context = dict(obj=metadata)
        _add_src_to_path(metadata.source_dir, path)

        project_groups.extend(load_entry_points("project"))
        package_name = metadata.package_name

        try:
            project_cli = importlib.import_module(f"{package_name}.cli")
            project_groups.append(project_cli.cli)
        except Exception as exc:
            raise KedroCliError(
                f"Cannot load commands from {package_name}.cli"
            ) from exc

    cli_collection = CommandCollection(
        ("Global commands", global_groups),
        ("Project specific commands", project_groups),
    )
    cli_collection(**cli_context)
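main() expects the project package to expose a click group named `cli` in its cli.py. A hypothetical minimal src/<package_name>/cli.py that would satisfy the import above:

import click

@click.group(name="my_package")
def cli():
    """Project-specific commands."""

@cli.command()
def hello():
    """Example project command picked up alongside kedro's own."""
    click.echo("Hello from the project CLI!")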
Example #6
def ui(env, port, host):
    """Opens the mlflow user interface with the
    project-specific settings of mlflow.yml. This interface
    enables to browse and compares runs.
    """

    project_path = Path().cwd()
    project_metadata = _get_project_metadata(project_path)
    _add_src_to_path(project_metadata.source_dir, project_path)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=project_path,
        env=env,
    ):

        mlflow_conf = get_mlflow_config()
        host = host or mlflow_conf.ui_opts.get("host")
        port = port or mlflow_conf.ui_opts.get("port")

        # call mlflow ui with specific options
        # TODO : add more options for ui
        subprocess.call(
            [
                "mlflow",
                "ui",
                "--backend-store-uri",
                mlflow_conf.mlflow_tracking_uri,
                "--host",
                host,
                "--port",
                port,
            ]
        )
Example #7
def test_mlflow_config_with_templated_config_loader(kedro_project_with_tcl):

    _write_yaml(
        kedro_project_with_tcl / "conf" / "local" / "mlflow.yml",
        dict(
            mlflow_tracking_uri="${mlflow_tracking_uri}",
            credentials=None,
            disable_tracking=dict(pipelines=["my_disabled_pipeline"]),
            experiment=dict(name="fake_package", create=True),
            run=dict(id="123456789", name="my_run", nested=True),
            ui=dict(port="5151", host="localhost"),
            hooks=dict(node=dict(
                flatten_dict_params=True,
                recursive=False,
                sep="-",
                long_parameters_strategy="truncate",
            )),
        ),
    )

    _write_yaml(
        kedro_project_with_tcl / "conf" / "local" / "globals.yml",
        dict(mlflow_tracking_uri="dynamic_mlruns"),
    )

    expected = {
        "mlflow_tracking_uri": (kedro_project_with_tcl / "dynamic_mlruns").as_uri(),
        "credentials": None,
        "disable_tracking": {"pipelines": ["my_disabled_pipeline"]},
        "experiments": {"name": "fake_package", "create": True},
        "run": {"id": "123456789", "name": "my_run", "nested": True},
        "ui": {"port": "5151", "host": "localhost"},
        "hooks": {
            "node": {
                "flatten_dict_params": True,
                "recursive": False,
                "sep": "-",
                "long_parameters_strategy": "truncate",
            }
        },
    }
    project_metadata = _get_project_metadata(kedro_project_with_tcl)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_tcl)
    configure_project(project_metadata.package_name)
    with KedroSession.create(project_metadata.package_name,
                             kedro_project_with_tcl):
        assert get_mlflow_config().to_dict() == expected
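The kedro_project_with_tcl fixture is defined elsewhere; a plausible way to wire in a TemplatedConfigLoader so that "${mlflow_tracking_uri}" is resolved from globals.yml (a sketch assuming kedro 0.17.x registration hooks, not the fixture's actual code):

from typing import Iterable

from kedro.config import TemplatedConfigLoader
from kedro.framework.hooks import hook_impl

class ProjectHooks:
    @hook_impl
    def register_config_loader(self, conf_paths: Iterable[str]) -> TemplatedConfigLoader:
        # "${...}" placeholders are filled with values from globals.yml
        return TemplatedConfigLoader(conf_paths, globals_pattern="*globals.yml")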
Example #8
def test_node_hook_logging_above_limit_tag_strategy(kedro_project,
                                                    dummy_run_params,
                                                    param_length):

    _write_yaml(
        kedro_project / "conf" / "local" / "mlflow.yml",
        dict(hooks=dict(node=dict(long_parameters_strategy="tag"))),
    )

    mlflow_tracking_uri = (kedro_project / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)

    mlflow_node_hook = MlflowNodeHook()

    param_value = param_length * "a"
    node_inputs = {"params:my_param": param_value}

    project_metadata = _get_project_metadata(kedro_project)
    _add_src_to_path(project_metadata.source_dir, kedro_project)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
            package_name=project_metadata.package_name,
            project_path=kedro_project,
    ):
        with mlflow.start_run():
            mlflow_node_hook.before_pipeline_run(
                run_params=dummy_run_params,
                pipeline=Pipeline([]),
                catalog=DataCatalog(),
            )

            # IMPORTANT: exceeding the parameter length limit
            # should raise an error for every mlflow backend,
            # but it does not on the FileStore backend:
            # https://github.com/mlflow/mlflow/issues/2814#issuecomment-628284425
            # Since we use the FileStore backend in tests for simplicity,
            # logging works, but we have enforced failure (which differs
            # slightly from mlflow's behaviour)
            mlflow_node_hook.before_node_run(
                node=node(func=lambda x: x, inputs=dict(x="a"), outputs=None),
                catalog=DataCatalog(),  # can be empty
                inputs=node_inputs,
                is_async=False,
                run_id="132",
            )
            run_id = mlflow.active_run().info.run_id

        mlflow_client = MlflowClient(mlflow_tracking_uri)
        current_run = mlflow_client.get_run(run_id)
        assert current_run.data.params == {}
        assert {
            k: v
            for k, v in current_run.data.tags.items()
            if not k.startswith("mlflow")
        } == {
            "my_param": param_value
        }
Example #9
def _load_project(project_path):  # pragma: no cover
    # TODO: This one can potentially become project bootstrap and will be
    #  tested there
    if not _is_project(project_path):
        return None
    metadata = _get_project_metadata(project_path)
    _add_src_to_path(metadata.source_dir, project_path)
    configure_project(metadata.package_name)
    return metadata
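Hypothetical usage of the helper above, e.g. from an interactive session:

from pathlib import Path

metadata = _load_project(Path.cwd())
if metadata is None:
    print("Not inside a kedro project; only global commands are available.")
else:
    print(f"Bootstrapped kedro project '{metadata.project_name}'")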
Example #10
def test_get_mlflow_config_in_uninitialized_project(kedro_project):
    # kedro_project is a pytest fixture declared in conftest
    with pytest.raises(
            KedroMlflowConfigError,
            match="No 'mlflow.yml' config file found in environment"):
        project_metadata = _get_project_metadata(kedro_project)
        _add_src_to_path(project_metadata.source_dir, kedro_project)
        configure_project(project_metadata.package_name)
        with KedroSession.create(project_metadata.package_name, kedro_project):
            get_mlflow_config()
Example #11
    def test_add_source_dir_to_sys_path(self, monkeypatch, tmp_path, mocker):
        # test that source_dir is added to PYTHONPATH as well
        monkeypatch.delenv("PYTHONPATH", raising=False)
        mocker.patch("kedro.framework.cli.utils._validate_source_path")

        project_path = tmp_path
        source_dir = project_path / "source_dir"

        _add_src_to_path(source_dir, project_path)

        assert str(source_dir) in sys.path[0]
        assert os.environ["PYTHONPATH"] == str(source_dir)
Example #12
def test_mlflow_pipeline_hook_with_pipeline_ml_signature(
    kedro_project_with_mlflow_conf,
    env_from_dict,
    dummy_pipeline,
    dummy_catalog,
    dummy_run_params,
    model_signature,
    expected_signature,
):
    # kedro_project_with_mlflow_conf is a conftest fixture
    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=kedro_project_with_mlflow_conf,
    ):
        pipeline_hook = MlflowPipelineHook()
        runner = SequentialRunner()

        pipeline_to_run = pipeline_ml_factory(
            training=dummy_pipeline.only_nodes_with_tags("training"),
            inference=dummy_pipeline.only_nodes_with_tags("inference"),
            input_name="raw_data",
            conda_env=env_from_dict,
            model_name="model",
            model_signature=model_signature,
        )

        pipeline_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` does not use any of the arguments below,
            # so we set them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
            run_id=dummy_run_params["run_id"],
        )
        pipeline_hook.before_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )
        runner.run(pipeline_to_run, dummy_catalog)
        run_id = mlflow.active_run().info.run_id
        pipeline_hook.after_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )

        # test : parameters should have been logged
        trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
        assert trained_model.metadata.signature == expected_signature
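The model_signature fixture is parametrized elsewhere; one plausible way to build such a signature with mlflow's public API (an illustration, not the project's fixture code):

import pandas as pd
from mlflow.models.signature import infer_signature

# infer the input schema from a sample of the training data
model_signature = infer_signature(model_input=pd.DataFrame(data=[1], columns=["a"]))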
Example #13
def init(env, force, silent):
    """Updates the template of a kedro project.
    Running this command is mandatory to use kedro-mlflow.
    This adds "conf/base/mlflow.yml": This is a configuration file
    used for run parametrization when calling "kedro run" command.
    See INSERT_DOC_URL for further details.
    """

    # get constants
    mlflow_yml = "mlflow.yml"
    project_path = Path().cwd()
    project_metadata = _get_project_metadata(project_path)
    _add_src_to_path(project_metadata.source_dir, project_path)
    configure_project(project_metadata.package_name)
    session = KedroSession.create(
        project_metadata.package_name, project_path=project_path
    )
    context = session.load_context()
    mlflow_yml_path = project_path / context.CONF_ROOT / env / mlflow_yml

    # mlflow.yml is just a static file,
    # but the name of the experiment is set to be the same as the project
    if mlflow_yml_path.is_file() and not force:
        click.secho(
            click.style(
                f"A 'mlflow.yml' already exists at '{mlflow_yml_path}'. You can use the ``--force`` option to override it.",
                fg="red",
            )
        )
    else:
        try:
            write_jinja_template(
                src=TEMPLATE_FOLDER_PATH / mlflow_yml,
                is_cookiecutter=False,
                dst=mlflow_yml_path,
                python_package=project_metadata.package_name,
            )
        except FileNotFoundError:
            click.secho(
                click.style(
                    f"No env '{env}' found. Please check this folder exists inside '{context.CONF_ROOT}' folder.",
                    fg="red",
                )
            )
        else:
            # only report success when the template was actually written
            if not silent:
                click.secho(
                    click.style(
                        f"'{context.CONF_ROOT}/{env}/{mlflow_yml}' successfully updated.",
                        fg="green",
                    )
                )
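A hypothetical invocation of this command from a test, mirroring the CliRunner pattern of Example #21 and assuming the click options match the parameter names:

from click.testing import CliRunner

runner = CliRunner()
# --force overwrites an existing mlflow.yml; --silent would suppress the summary
result = runner.invoke(init, ["--env", "local", "--force"])
assert result.exit_code == 0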
Example #14
def test_on_pipeline_error(kedro_project_with_mlflow_conf):

    tracking_uri = (kedro_project_with_mlflow_conf / "mlruns").as_uri()

    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=kedro_project_with_mlflow_conf,
    ):

        def failing_node():
            mlflow.start_run(nested=True)
            raise ValueError("Let's make this pipeline fail")

        class DummyContextWithHook(KedroContext):
            project_name = "fake project"
            package_name = "fake_project"
            project_version = "0.16.5"

            hooks = (MlflowPipelineHook(),)

            def _get_pipeline(self, name: str = None) -> Pipeline:
                return Pipeline(
                    [
                        node(
                            func=failing_node,
                            inputs=None,
                            outputs="fake_output",
                        )
                    ]
                )

        with pytest.raises(ValueError):
            failing_context = DummyContextWithHook(
                "fake_package", kedro_project_with_mlflow_conf.as_posix()
            )
            failing_context.run()

        # the run we want is the last one in Default experiment
        failing_run_info = MlflowClient(tracking_uri).list_run_infos("0")[0]
        assert mlflow.active_run() is None  # the run must have been closed
        assert failing_run_info.status == RunStatus.to_string(
            RunStatus.FAILED
        )  # it must be marked as failed
Example #15
def test_kedro_mlflow_config_new_experiment_does_not_exists(
    kedro_project_with_mlflow_conf,
):

    config = KedroMlflowConfig(
        project_path=kedro_project_with_mlflow_conf,
        mlflow_tracking_uri="mlruns",
        experiment_opts=dict(name="exp1"),
    )

    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        "fake_project", project_path=kedro_project_with_mlflow_conf
    ):
        config.setup()

    assert "exp1" in [exp.name for exp in config.mlflow_client.list_experiments()]
Example #16
def test_node_hook_logging_above_limit_truncate_strategy(
        kedro_project, dummy_run_params, param_length):

    _write_yaml(
        kedro_project / "conf" / "local" / "mlflow.yml",
        dict(hooks=dict(node=dict(long_parameters_strategy="truncate")), ),
    )

    mlflow_tracking_uri = (kedro_project / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)

    mlflow_node_hook = MlflowNodeHook()

    param_value = param_length * "a"
    node_inputs = {"params:my_param": param_value}

    project_metadata = _get_project_metadata(kedro_project)
    _add_src_to_path(project_metadata.source_dir, kedro_project)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
            package_name=project_metadata.package_name,
            project_path=kedro_project,
    ):
        with mlflow.start_run():
            mlflow_node_hook.before_pipeline_run(
                run_params=dummy_run_params,
                pipeline=Pipeline([]),
                catalog=DataCatalog(),
            )
            mlflow_node_hook.before_node_run(
                node=node(func=lambda x: x, inputs=dict(x="a"), outputs=None),
                catalog=DataCatalog(),  # can be empty
                inputs=node_inputs,
                is_async=False,
                run_id="132",
            )
            run_id = mlflow.active_run().info.run_id

        mlflow_client = MlflowClient(mlflow_tracking_uri)
        current_run = mlflow_client.get_run(run_id)
        assert current_run.data.params == {
            "my_param": param_value[0:MAX_PARAM_VAL_LENGTH]
        }
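A minimal sketch of the "truncate" strategy this assertion exercises, assuming MAX_PARAM_VAL_LENGTH is the limit exposed by mlflow.utils.validation:

from mlflow.utils.validation import MAX_PARAM_VAL_LENGTH

def truncate_param(value: str) -> str:
    # clip overly long values instead of letting the backend reject them
    return value[:MAX_PARAM_VAL_LENGTH]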
Example #17
def load_kedro_objects(path, line=None):  # pylint: disable=unused-argument
    """Line magic which reloads all Kedro default variables."""

    import kedro.config.default_logger  # noqa: F401 # pylint: disable=unused-import
    from kedro.framework.cli import load_entry_points
    from kedro.framework.cli.utils import _add_src_to_path
    from kedro.framework.project import configure_project
    from kedro.framework.session import KedroSession
    from kedro.framework.session.session import _activate_session
    from kedro.framework.startup import _get_project_metadata

    global context
    global catalog
    global session

    path = path or project_path
    metadata = _get_project_metadata(path)
    _add_src_to_path(metadata.source_dir, path)
    configure_project(metadata.package_name)

    _clear_hook_manager()

    _remove_cached_modules(metadata.package_name)

    session = KedroSession.create(metadata.package_name, path)
    _activate_session(session)
    logging.debug("Loading the context from %s", str(path))
    context = session.load_context()
    catalog = context.catalog

    get_ipython().push(variables={
        "context": context,
        "catalog": catalog,
        "session": session
    })

    logging.info("** Kedro project %s", str(metadata.project_name))
    logging.info("Defined global variable `context`, `session` and `catalog`")

    for line_magic in load_entry_points("line_magic"):
        register_line_magic(needs_local_scope(line_magic))
        logging.info("Registered line magic `%s`", line_magic.__name__)
Example #18
def test_kedro_mlflow_config_setup_export_credentials(kedro_project_with_mlflow_conf):

    (kedro_project_with_mlflow_conf / "conf/base/credentials.yml").write_text(
        yaml.dump(dict(my_mlflow_creds=dict(fake_mlflow_cred="my_fake_cred")))
    )

    # the config must export the credentials as environment variables
    config = KedroMlflowConfig(
        project_path=kedro_project_with_mlflow_conf, credentials="my_mlflow_creds"
    )

    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        "fake_project", project_path=kedro_project_with_mlflow_conf
    ):
        config.setup()

    assert os.environ["fake_mlflow_cred"] == "my_fake_cred"
Example #19
def test_kedro_mlflow_config_experiment_exists(mocker, kedro_project_with_mlflow_conf):

    # create an experiment with the same name
    mlflow_tracking_uri = (
        kedro_project_with_mlflow_conf / "conf" / "local" / "mlruns"
    ).as_uri()
    MlflowClient(mlflow_tracking_uri).create_experiment("exp1")
    config = KedroMlflowConfig(
        project_path=kedro_project_with_mlflow_conf,
        mlflow_tracking_uri="mlruns",
        experiment_opts=dict(name="exp1"),
    )

    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        "fake_project", project_path=kedro_project_with_mlflow_conf
    ):
        config.setup()
    assert "exp1" in [exp.name for exp in config.mlflow_client.list_experiments()]
Example #20
def test_kedro_mlflow_config_setup_set_tracking_uri(kedro_project_with_mlflow_conf):

    # build the tracking uri the config is expected to set
    mlflow_tracking_uri = (kedro_project_with_mlflow_conf / "awesome_tracking").as_uri()

    # the config must set this uri as the mlflow tracking uri
    config = KedroMlflowConfig(
        project_path=kedro_project_with_mlflow_conf,
        mlflow_tracking_uri="awesome_tracking",
        experiment_opts=dict(name="exp1"),
    )

    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        "fake_project", project_path=kedro_project_with_mlflow_conf
    ):
        config.setup()

    assert mlflow.get_tracking_uri() == mlflow_tracking_uri
Example #21
def test_cli_init_existing_config(monkeypatch, kedro_project_with_mlflow_conf):
    # "kedro_project" is a pytest.fixture declared in conftest
    cli_runner = CliRunner()
    monkeypatch.chdir(kedro_project_with_mlflow_conf)
    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir,
                     kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
            "fake_project",
            project_path=kedro_project_with_mlflow_conf) as session:
        context = session.load_context()
        # emulate first call by writing a mlflow.yml file
        yaml_str = yaml.dump(dict(mlflow_tracking_uri="toto"))
        (kedro_project_with_mlflow_conf / context.CONF_ROOT / "local" /
         "mlflow.yml").write_text(yaml_str)

        result = cli_runner.invoke(cli_init)

        # check an error message is raised
        assert "A 'mlflow.yml' already exists" in result.output

        # check the file remains unmodified
        assert get_mlflow_config().mlflow_tracking_uri.endswith("toto")
Example #22
def test_mlflow_pipeline_hook_save_pipeline_ml_with_parameters(
    kedro_project_with_mlflow_conf,  # a fixture to be in a kedro project
    tmp_path,
    pipeline_ml_with_parameters,
    dummy_run_params,
):
    # kedro_project_with_mlflow_conf is a conftest fixture
    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=kedro_project_with_mlflow_conf,
    ):

        mlflow_conf = get_mlflow_config()
        mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)

        catalog_with_parameters = DataCatalog(
            {
                "data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
                "cleaned_data": MemoryDataSet(),
                "params:stopwords": MemoryDataSet(["Hello", "Hi"]),
                "params:penalty": MemoryDataSet(0.1),
                "model": PickleDataSet(
                    (kedro_project_with_mlflow_conf / "data" / "model.csv").as_posix()
                ),
                "params:threshold": MemoryDataSet(0.5),
            }
        )

        pipeline_hook = MlflowPipelineHook()

        runner = SequentialRunner()
        pipeline_hook.after_catalog_created(
            catalog=catalog_with_parameters,
            # `after_catalog_created` does not use any of the arguments below,
            # so we set them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
            run_id=dummy_run_params["run_id"],
        )
        pipeline_hook.before_pipeline_run(
            run_params=dummy_run_params,
            pipeline=pipeline_ml_with_parameters,
            catalog=catalog_with_parameters,
        )
        runner.run(pipeline_ml_with_parameters, catalog_with_parameters)

        current_run_id = mlflow.active_run().info.run_id

        # This is what we want to test: model must be saved and the parameters automatically persisted on disk
        pipeline_hook.after_pipeline_run(
            run_params=dummy_run_params,
            pipeline=pipeline_ml_with_parameters,
            catalog=catalog_with_parameters,
        )

        # the 2 parameters which are inputs of inference pipeline
        # must have been persisted and logged inside the model's artifacts
        model = mlflow.pyfunc.load_model(f"runs:/{current_run_id}/model")
        assert set(
            model.metadata.to_dict()["flavors"]["python_function"]["artifacts"].keys()
        ) == {"model", "params:stopwords", "params:threshold"}

        # the model should be loadable and predict() should work (this tests KedroPipelineModel)
        assert model.predict(pd.DataFrame(data=[1], columns=["a"])).values[0][0] == 1
Example #23
def test_node_hook_logging(
    kedro_project,
    dummy_run_params,
    dummy_catalog,
    dummy_pipeline,
    dummy_node,
    flatten_dict_params,
    expected,
):

    _write_yaml(
        kedro_project / "conf" / "base" / "mlflow.yml",
        dict(
            hooks=dict(
                node=dict(
                    flatten_dict_params=flatten_dict_params, recursive=False, sep="-"
                )
            )
        ),
    )

    mlflow_node_hook = MlflowNodeHook()

    node_inputs = {
        v: dummy_catalog._data_sets.get(v)
        for k, v in dummy_node._inputs.items()
    }

    mlflow_tracking_uri = (kedro_project / "mlruns").as_uri()

    project_metadata = _get_project_metadata(kedro_project)
    _add_src_to_path(project_metadata.source_dir, kedro_project)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
            package_name=project_metadata.package_name,
            project_path=kedro_project,
    ):
        mlflow.set_tracking_uri(mlflow_tracking_uri)
        with mlflow.start_run():
            mlflow_node_hook.before_pipeline_run(
                run_params=dummy_run_params,
                pipeline=dummy_pipeline,
                catalog=dummy_catalog,
            )
            mlflow_node_hook.before_node_run(
                node=dummy_node,
                catalog=dummy_catalog,
                inputs=node_inputs,
                is_async=False,
                run_id="132",
            )
            run_id = mlflow.active_run().info.run_id

        mlflow_client = MlflowClient(mlflow_tracking_uri)
        current_run = mlflow_client.get_run(run_id)
        assert current_run.data.params == expected
Example #24
def test_mlflow_pipeline_hook_with_different_pipeline_types(
    kedro_project_with_mlflow_conf,
    env_from_dict,
    pipeline_to_run,
    dummy_catalog,
    dummy_run_params,
):

    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=kedro_project_with_mlflow_conf,
    ):
        # kedro_project_with_mlflow_conf is a conftest fixture
        pipeline_hook = MlflowPipelineHook()
        runner = SequentialRunner()
        pipeline_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` does not use any of the arguments below,
            # so we set them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
            run_id=dummy_run_params["run_id"],
        )
        pipeline_hook.before_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )
        runner.run(pipeline_to_run, dummy_catalog)
        run_id = mlflow.active_run().info.run_id
        pipeline_hook.after_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )
        # test : parameters should have been logged
        mlflow_conf = get_mlflow_config()
        mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
        run_data = mlflow_client.get_run(run_id).data

        # all run_params are recorded as tags
        for k, v in dummy_run_params.items():
            if v:
                assert run_data.tags[k] == str(v)

        # params are not recorded because we don't have MlflowNodeHook here
        # and the model should not be logged when it is not a PipelineML
        nb_artifacts = len(mlflow_client.list_artifacts(run_id))
        if isinstance(pipeline_to_run, PipelineML):
            assert nb_artifacts == 1
        else:
            assert nb_artifacts == 0

        # check that metrics datasets are prefixed with their names
        assert dummy_catalog._data_sets["my_metrics"]._prefix == "my_metrics"
        assert dummy_catalog._data_sets["another_metrics"]._prefix == "foo"

        if isinstance(pipeline_to_run, PipelineML):
            trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
            assert trained_model.metadata.signature.to_dict() == {
                "inputs": '[{"name": "a", "type": "long"}]',
                "outputs": None,
            }
Example #25
def test_mlflow_pipeline_hook_metrics_with_run_id(
    kedro_project_with_mlflow_conf, dummy_pipeline_ml, dummy_run_params
):

    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=kedro_project_with_mlflow_conf,
    ):

        mlflow_conf = get_mlflow_config()
        mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)

        with mlflow.start_run():
            existing_run_id = mlflow.active_run().info.run_id

        dummy_catalog_with_run_id = DataCatalog(
            {
                "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
                "params:unused_param": MemoryDataSet("blah"),
                "data": MemoryDataSet(),
                "model": PickleDataSet(
                    (kedro_project_with_mlflow_conf / "data" / "model.csv").as_posix()
                ),
                "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
                "another_metrics": MlflowMetricsDataSet(
                    run_id=existing_run_id, prefix="foo"
                ),
            }
        )

        pipeline_hook = MlflowPipelineHook()

        runner = SequentialRunner()
        pipeline_hook.after_catalog_created(
            catalog=dummy_catalog_with_run_id,
            # `after_catalog_created` does not use any of the arguments below,
            # so we set them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
            run_id=dummy_run_params["run_id"],
        )
        pipeline_hook.before_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline_ml,
            catalog=dummy_catalog_with_run_id,
        )
        runner.run(dummy_pipeline_ml, dummy_catalog_with_run_id)

        current_run_id = mlflow.active_run().info.run_id

        pipeline_hook.after_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline_ml,
            catalog=dummy_catalog_with_run_id,
        )

        mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
        # the first run is created in Default (id 0),
        # but the one initialised in before_pipeline_run
        # is created in the kedro_project experiment (id 1)
        all_runs_id = set(
            [
                run.run_id
                for k in range(2)
                for run in mlflow_client.list_run_infos(experiment_id=f"{k}")
            ]
        )

        # the metrics are supposed to have been logged inside existing_run_id
        run_data = mlflow_client.get_run(existing_run_id).data

        # check that both runs exist, and that the metrics were logged
        # into existing_run_id with their prefixes
        assert all_runs_id == {current_run_id, existing_run_id}
        assert run_data.metrics["my_metrics.metric_key"] == 1.1
        assert run_data.metrics["foo.metric_key"] == 1.1