def test_kedro_mlflow_config_experiment_was_deleted(kedro_project_with_mlflow_conf):
    """After "exp1" was created then deleted, ``setup()`` must leave it listed."""
    # Arrange: create "exp1" in the project's "mlruns" store, then delete it.
    store_uri = (kedro_project_with_mlflow_conf / "mlruns").as_uri()
    client = MlflowClient(store_uri)
    client.create_experiment("exp1")
    deleted_experiment = client.get_experiment_by_name("exp1")
    client.delete_experiment(deleted_experiment.experiment_id)

    # The config points at the same store and asks for the same experiment name.
    config = KedroMlflowConfig(
        project_path=kedro_project_with_mlflow_conf,
        mlflow_tracking_uri="mlruns",
        experiment_opts={"name": "exp1"},
    )

    metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(metadata.package_name)
    with KedroSession.create(
        "fake_project", project_path=kedro_project_with_mlflow_conf
    ):
        config.setup()

    experiment_names = [exp.name for exp in config.mlflow_client.list_experiments()]
    assert "exp1" in experiment_names
def test_pipeline_run_hook_getting_configs(
    kedro_project,
    dummy_run_params,
    dummy_pipeline,
    dummy_catalog,
):
    """Check that ``before_pipeline_run`` picks up the node options of ``mlflow.yml``.

    The test writes a local ``mlflow.yml`` with explicit node-hook options and
    asserts that they end up on the hook's ``flatten``, ``recursive`` and
    ``sep`` attributes.
    """
    # Plain call statement: a trailing comma after the call would wrap the
    # result in a throw-away one-element tuple.
    _write_yaml(
        kedro_project / "conf" / "local" / "mlflow.yml",
        dict(
            hooks=dict(node=dict(flatten_dict_params=True, recursive=False, sep="-")),
        ),
    )

    project_metadata = _get_project_metadata(kedro_project)
    _add_src_to_path(project_metadata.source_dir, kedro_project)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=kedro_project,
    ):
        mlflow_node_hook = MlflowNodeHook()
        mlflow_node_hook.before_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline,
            catalog=dummy_catalog,
        )

        assert (
            mlflow_node_hook.flatten,
            mlflow_node_hook.recursive,
            mlflow_node_hook.sep,
        ) == (True, False, "-")
def test_kedro_mlflow_config_setup_tracking_priority(kedro_project_with_mlflow_conf):
    """The tracking uri given to the config wins over the one in the credentials.

    The credentials entry declares ``mlflow_tracking_uri="mlruns2"`` while the
    config is built with ``"mlruns1"``; after ``setup()`` mlflow must point at
    ``<project>/mlruns1``.
    """
    # Credentials entry that also carries a (different) tracking uri.
    credentials_file = kedro_project_with_mlflow_conf / "conf/base/credentials.yml"
    credentials = {"my_mlflow_creds": {"mlflow_tracking_uri": "mlruns2"}}
    credentials_file.write_text(yaml.dump(credentials))

    config = KedroMlflowConfig(
        project_path=kedro_project_with_mlflow_conf,
        mlflow_tracking_uri="mlruns1",
        credentials="my_mlflow_creds",
    )

    metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(metadata.package_name)
    with KedroSession.create(
        "fake_project", project_path=kedro_project_with_mlflow_conf
    ):
        config.setup()

    expected_uri = (kedro_project_with_mlflow_conf / "mlruns1").as_uri()
    assert mlflow.get_tracking_uri() == expected_uri
def test_mlflow_pipeline_hook_with_copy_mode(
    kedro_project_with_mlflow_conf,
    dummy_pipeline_ml,
    dummy_catalog,
    dummy_run_params,
    copy_mode,
    expected,
):
    """Check the copy mode of every dataset in the catalog of the logged model."""
    # kedro_project_with_mlflow_conf is a conftest fixture
    metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(metadata.package_name)

    with KedroSession.create(
        package_name=metadata.package_name,
        project_path=kedro_project_with_mlflow_conf,
    ):
        hook = MlflowPipelineHook()
        hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` is not using any of the arguments below,
            # so they are set to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
            run_id=dummy_run_params["run_id"],
        )

        # Same pipeline as the fixture, rebuilt with the copy mode under test.
        pipeline_to_run = pipeline_ml_factory(
            training=dummy_pipeline_ml.training,
            inference=dummy_pipeline_ml.inference,
            input_name=dummy_pipeline_ml.input_name,
            conda_env={},
            model_name=dummy_pipeline_ml.model_name,
            copy_mode=copy_mode,
        )

        hook_kwargs = dict(
            run_params=dummy_run_params,
            pipeline=pipeline_to_run,
            catalog=dummy_catalog,
        )
        hook.before_pipeline_run(**hook_kwargs)
        SequentialRunner().run(pipeline_to_run, dummy_catalog)
        # Read the run id before `after_pipeline_run` is called.
        run_id = mlflow.active_run().info.run_id
        hook.after_pipeline_run(**hook_kwargs)

        store_uri = (kedro_project_with_mlflow_conf / "mlruns").as_uri()
        mlflow.set_tracking_uri(store_uri)

        loaded_model = mlflow.pyfunc.load_model(model_uri=f"runs:/{run_id}/model")
        loaded_datasets = (
            loaded_model._model_impl.python_model.loaded_catalog._data_sets
        )
        actual_copy_mode = {
            name: dataset._copy_mode for name, dataset in loaded_datasets.items()
        }

        assert actual_copy_mode == expected
def main():  # pragma: no cover
    """Main entry point.

    Collects the global command groups and, when the current working
    directory is a kedro project, the project command groups (entry points
    plus the project's own ``cli.py``), then invokes the resulting CLI.
    """
    _init_plugins()

    global_groups = [cli, *load_entry_points("global")]
    project_groups = []
    cli_context = {}

    cwd = Path.cwd()
    if _is_project(cwd):
        # Inside a project: expose its metadata to the commands and load
        # the project-specific commands.
        metadata = _get_project_metadata(cwd)
        cli_context = {"obj": metadata}
        _add_src_to_path(metadata.source_dir, cwd)

        project_groups.extend(load_entry_points("project"))

        package_name = metadata.package_name
        try:
            project_cli = importlib.import_module(f"{package_name}.cli")
            project_groups.append(project_cli.cli)
        except Exception as exc:
            raise KedroCliError(
                f"Cannot load commands from {package_name}.cli"
            ) from exc

    command_groups = (
        ("Global commands", global_groups),
        ("Project specific commands", project_groups),
    )
    cli_collection = CommandCollection(*command_groups)
    cli_collection(**cli_context)
def ui(env, port, host):
    """Opens the mlflow user interface with the
    project-specific settings of mlflow.yml. This interface
    enables to browse and compares runs.
    """
    project_path = Path().cwd()
    project_metadata = _get_project_metadata(project_path)
    _add_src_to_path(project_metadata.source_dir, project_path)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=project_path,
        env=env,
    ):
        mlflow_conf = get_mlflow_config()
        # Values given on the command line take precedence over mlflow.yml.
        host = host or mlflow_conf.ui_opts.get("host")
        port = port or mlflow_conf.ui_opts.get("port")

        # call mlflow ui with specific options
        # TODO : add more options for ui
        command = [
            "mlflow",
            "ui",
            "--backend-store-uri",
            mlflow_conf.mlflow_tracking_uri,
        ]
        # subprocess only accepts str/bytes/path-like arguments: skip options
        # that are not configured anywhere (mlflow then applies its own
        # defaults) and stringify the others (e.g. a port given as an int).
        if host is not None:
            command += ["--host", str(host)]
        if port is not None:
            command += ["--port", str(port)]

        subprocess.call(command)
def test_mlflow_config_with_templated_config_loader(kedro_project_with_tcl):
    """``${...}`` placeholders of ``mlflow.yml`` are filled from ``globals.yml``."""
    local_conf = kedro_project_with_tcl / "conf" / "local"

    # mlflow.yml whose tracking uri is a placeholder...
    node_options = {
        "flatten_dict_params": True,
        "recursive": False,
        "sep": "-",
        "long_parameters_strategy": "truncate",
    }
    _write_yaml(
        local_conf / "mlflow.yml",
        {
            "mlflow_tracking_uri": "${mlflow_tracking_uri}",
            "credentials": None,
            "disable_tracking": {"pipelines": ["my_disabled_pipeline"]},
            "experiment": {"name": "fake_package", "create": True},
            "run": {"id": "123456789", "name": "my_run", "nested": True},
            "ui": {"port": "5151", "host": "localhost"},
            "hooks": {"node": dict(node_options)},
        },
    )

    # ... and the globals file that provides its value.
    _write_yaml(
        local_conf / "globals.yml",
        {"mlflow_tracking_uri": "dynamic_mlruns"},
    )

    expected = dict(
        mlflow_tracking_uri=(kedro_project_with_tcl / "dynamic_mlruns").as_uri(),
        credentials=None,
        disable_tracking=dict(pipelines=["my_disabled_pipeline"]),
        experiments=dict(name="fake_package", create=True),
        run=dict(id="123456789", name="my_run", nested=True),
        ui=dict(port="5151", host="localhost"),
        hooks=dict(node=dict(node_options)),
    )

    metadata = _get_project_metadata(kedro_project_with_tcl)
    _add_src_to_path(metadata.source_dir, kedro_project_with_tcl)
    configure_project(metadata.package_name)
    with KedroSession.create(metadata.package_name, kedro_project_with_tcl):
        actual = get_mlflow_config().to_dict()
        assert actual == expected
def test_node_hook_logging_above_limit_tag_strategy(
    kedro_project, dummy_run_params, param_length
):
    """With the "tag" strategy the parameter is recorded as a tag, not a param."""
    _write_yaml(
        kedro_project / "conf" / "local" / "mlflow.yml",
        {"hooks": {"node": {"long_parameters_strategy": "tag"}}},
    )

    store_uri = (kedro_project / "mlruns").as_uri()
    mlflow.set_tracking_uri(store_uri)

    hook = MlflowNodeHook()

    long_value = "a" * param_length
    node_inputs = {"params:my_param": long_value}

    metadata = _get_project_metadata(kedro_project)
    _add_src_to_path(metadata.source_dir, kedro_project)
    configure_project(metadata.package_name)
    with KedroSession.create(
        package_name=metadata.package_name,
        project_path=kedro_project,
    ):
        with mlflow.start_run():
            hook.before_pipeline_run(
                run_params=dummy_run_params,
                pipeline=Pipeline([]),
                catalog=DataCatalog(),
            )

            # IMPORTANT: exceeding the parameter length limit should raise an
            # error on every mlflow backend, but it does not on the FileStore
            # backend:
            # https://github.com/mlflow/mlflow/issues/2814#issuecomment-628284425
            # The tests use the FileStore for simplicity, so plain logging
            # would work here; the assertions below check that the value
            # ends up as a tag and that no param is recorded.
            identity_node = node(func=lambda x: x, inputs={"x": "a"}, outputs=None)
            hook.before_node_run(
                node=identity_node,
                catalog=DataCatalog(),  # can be empty
                inputs=node_inputs,
                is_async=False,
                run_id="132",
            )
            run_id = mlflow.active_run().info.run_id

        current_run = MlflowClient(store_uri).get_run(run_id)
        assert current_run.data.params == {}

        # Tags whose key starts with "mlflow" are ignored.
        user_tags = {}
        for key, value in current_run.data.tags.items():
            if key.startswith("mlflow"):
                continue
            user_tags[key] = value
        assert user_tags == {"my_param": long_value}
def _load_project(project_path):  # pragma: no cover
    """Configure the kedro project at ``project_path`` and return its metadata.

    Returns ``None`` when ``project_path`` is not a kedro project.
    """
    # TODO: This one can potentially become project bootstrap and will be
    #  tested there
    if _is_project(project_path):
        metadata = _get_project_metadata(project_path)
        _add_src_to_path(metadata.source_dir, project_path)
        configure_project(metadata.package_name)
        return metadata
    return None
def test_get_mlflow_config_in_uninitialized_project(kedro_project):
    """``get_mlflow_config`` must raise when no ``mlflow.yml`` can be found.

    ``kedro_project`` is a pytest fixture from conftest (presumably a project
    without any ``mlflow.yml`` -- see its definition).
    """
    # Bootstrap the project outside of ``pytest.raises`` so that a failure in
    # the setup can never be mistaken for the error under test.
    project_metadata = _get_project_metadata(kedro_project)
    _add_src_to_path(project_metadata.source_dir, kedro_project)
    configure_project(project_metadata.package_name)

    with pytest.raises(
        KedroMlflowConfigError,
        match="No 'mlflow.yml' config file found in environment",
    ):
        with KedroSession.create(project_metadata.package_name, kedro_project):
            get_mlflow_config()
def test_add_source_dir_to_sys_path(self, monkeypatch, tmp_path, mocker):
    """``_add_src_to_path`` must update both ``sys.path`` and ``PYTHONPATH``."""
    # The source directory is never created, so its validation is patched out.
    mocker.patch("kedro.framework.cli.utils._validate_source_path")
    # Start without PYTHONPATH so the final value only comes from the call.
    monkeypatch.delenv("PYTHONPATH", raising=False)

    source_dir = tmp_path / "source_dir"
    _add_src_to_path(source_dir, tmp_path)

    expected = str(source_dir)
    assert expected in sys.path[0]
    assert os.environ["PYTHONPATH"] == expected
def test_mlflow_pipeline_hook_with_pipeline_ml_signature(
    kedro_project_with_mlflow_conf,
    env_from_dict,
    dummy_pipeline,
    dummy_catalog,
    dummy_run_params,
    model_signature,
    expected_signature,
):
    """The logged model must carry the signature expected for ``model_signature``."""
    # kedro_project_with_mlflow_conf is a conftest fixture
    metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(metadata.package_name)

    with KedroSession.create(
        package_name=metadata.package_name,
        project_path=kedro_project_with_mlflow_conf,
    ):
        hook = MlflowPipelineHook()
        runner = SequentialRunner()

        training = dummy_pipeline.only_nodes_with_tags("training")
        inference = dummy_pipeline.only_nodes_with_tags("inference")
        pipeline_to_run = pipeline_ml_factory(
            training=training,
            inference=inference,
            input_name="raw_data",
            conda_env=env_from_dict,
            model_name="model",
            model_signature=model_signature,
        )

        hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` is not using any of the arguments below,
            # so they are set to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
            run_id=dummy_run_params["run_id"],
        )

        hook_kwargs = dict(
            run_params=dummy_run_params,
            pipeline=pipeline_to_run,
            catalog=dummy_catalog,
        )
        hook.before_pipeline_run(**hook_kwargs)
        runner.run(pipeline_to_run, dummy_catalog)
        # Read the run id before `after_pipeline_run` is called.
        run_id = mlflow.active_run().info.run_id
        hook.after_pipeline_run(**hook_kwargs)

        # test: the model logged in the run must expose the expected signature
        trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
        actual_signature = trained_model.metadata.signature
        assert actual_signature == expected_signature
def init(env, force, silent):
    """Updates the template of a kedro project.
    Running this command is mandatory to use kedro-mlflow.
    This adds "conf/base/mlflow.yml": This is a configuration file
    used for run parametrization when calling "kedro run" command.
    See INSERT_DOC_URL for further details.
    """
    # get constants
    mlflow_yml = "mlflow.yml"
    project_path = Path().cwd()
    project_metadata = _get_project_metadata(project_path)
    _add_src_to_path(project_metadata.source_dir, project_path)
    configure_project(project_metadata.package_name)
    session = KedroSession.create(
        project_metadata.package_name, project_path=project_path
    )
    context = session.load_context()
    mlflow_yml_path = project_path / context.CONF_ROOT / env / mlflow_yml

    # mlflow.yml is just a static file,
    # but the name of the experiment is set to be the same as the project
    if mlflow_yml_path.is_file() and not force:
        # Never overwrite an existing configuration unless explicitly asked to.
        click.secho(
            click.style(
                f"A 'mlflow.yml' already exists at '{mlflow_yml_path}' You can use the ``--force`` option to override it.",
                fg="red",
            )
        )
        return

    try:
        write_jinja_template(
            src=TEMPLATE_FOLDER_PATH / mlflow_yml,
            is_cookiecutter=False,
            dst=mlflow_yml_path,
            python_package=project_metadata.package_name,
        )
    except FileNotFoundError:
        click.secho(
            click.style(
                f"No env '{env}' found. Please check this folder exists inside '{context.CONF_ROOT}' folder.",
                fg="red",
            )
        )
    else:
        # Report success only when the template was actually written: the
        # message must not follow the "No env found" error above.
        if not silent:
            click.secho(
                click.style(
                    f"'{context.CONF_ROOT}/{env}/{mlflow_yml}' successfully updated.",
                    fg="green",
                )
            )
def test_on_pipeline_error(kedro_project_with_mlflow_conf):
    """A failing pipeline must leave no active run and a run marked as failed."""
    store_uri = (kedro_project_with_mlflow_conf / "mlruns").as_uri()

    metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(metadata.package_name)
    with KedroSession.create(
        package_name=metadata.package_name,
        project_path=kedro_project_with_mlflow_conf,
    ):

        def failing_node():
            # Open a nested run before failing.
            mlflow.start_run(nested=True)
            raise ValueError("Let's make this pipeline fail")

        class DummyContextWithHook(KedroContext):
            project_name = "fake project"
            package_name = "fake_project"
            project_version = "0.16.5"

            hooks = (MlflowPipelineHook(),)

            def _get_pipeline(self, name: str = None) -> Pipeline:
                # Single-node pipeline which always raises.
                only_node = node(
                    func=failing_node,
                    inputs=None,
                    outputs="fake_output",
                )
                return Pipeline([only_node])

        with pytest.raises(ValueError):
            failing_context = DummyContextWithHook(
                "fake_package", kedro_project_with_mlflow_conf.as_posix()
            )
            failing_context.run()

        # the run we want is the last one in Default experiment
        run_infos = MlflowClient(store_uri).list_run_infos("0")
        failing_run_info = run_infos[0]

        # the run must have been closed
        assert mlflow.active_run() is None
        # it must be marked as failed
        failed_status = RunStatus.to_string(RunStatus.FAILED)
        assert failing_run_info.status == failed_status
def test_kedro_mlflow_config_new_experiment_does_not_exists(
    kedro_project_with_mlflow_conf,
):
    """``setup()`` must make the experiment named in the config available."""
    config = KedroMlflowConfig(
        project_path=kedro_project_with_mlflow_conf,
        mlflow_tracking_uri="mlruns",
        experiment_opts={"name": "exp1"},
    )

    metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(metadata.package_name)
    with KedroSession.create(
        "fake_project", project_path=kedro_project_with_mlflow_conf
    ):
        config.setup()

    experiment_names = [exp.name for exp in config.mlflow_client.list_experiments()]
    assert "exp1" in experiment_names
def test_node_hook_logging_above_limit_truncate_strategy(
    kedro_project, dummy_run_params, param_length
):
    """With "truncate", the param is logged cut to ``MAX_PARAM_VAL_LENGTH`` chars."""
    _write_yaml(
        kedro_project / "conf" / "local" / "mlflow.yml",
        {"hooks": {"node": {"long_parameters_strategy": "truncate"}}},
    )

    store_uri = (kedro_project / "mlruns").as_uri()
    mlflow.set_tracking_uri(store_uri)

    hook = MlflowNodeHook()

    long_value = "a" * param_length
    node_inputs = {"params:my_param": long_value}

    metadata = _get_project_metadata(kedro_project)
    _add_src_to_path(metadata.source_dir, kedro_project)
    configure_project(metadata.package_name)
    with KedroSession.create(
        package_name=metadata.package_name,
        project_path=kedro_project,
    ):
        with mlflow.start_run():
            hook.before_pipeline_run(
                run_params=dummy_run_params,
                pipeline=Pipeline([]),
                catalog=DataCatalog(),
            )
            identity_node = node(func=lambda x: x, inputs={"x": "a"}, outputs=None)
            hook.before_node_run(
                node=identity_node,
                catalog=DataCatalog(),  # can be empty
                inputs=node_inputs,
                is_async=False,
                run_id="132",
            )
            run_id = mlflow.active_run().info.run_id

        current_run = MlflowClient(store_uri).get_run(run_id)
        expected_params = {"my_param": long_value[:MAX_PARAM_VAL_LENGTH]}
        assert current_run.data.params == expected_params
def load_kedro_objects(path, line=None):  # pylint: disable=unused-argument
    """Line magic which reloads all Kedro default variables.

    Rebinds the module-level ``context``, ``catalog`` and ``session``, pushes
    them into the IPython namespace and registers the line magics found in
    the "line_magic" entry points. ``path`` falls back to the module-level
    ``project_path`` when it is falsy.
    """
    import kedro.config.default_logger  # noqa: F401 # pylint: disable=unused-import
    from kedro.framework.cli import load_entry_points
    from kedro.framework.cli.utils import _add_src_to_path
    from kedro.framework.project import configure_project
    from kedro.framework.session import KedroSession
    from kedro.framework.session.session import _activate_session
    from kedro.framework.startup import _get_project_metadata

    global context, catalog, session

    path = path or project_path
    metadata = _get_project_metadata(path)
    package_name = metadata.package_name

    _add_src_to_path(metadata.source_dir, path)
    configure_project(package_name)

    # Clear the hook manager and the cached project modules before a new
    # session is created.
    _clear_hook_manager()
    _remove_cached_modules(package_name)

    session = KedroSession.create(package_name, path)
    _activate_session(session)

    logging.debug("Loading the context from %s", str(path))
    context = session.load_context()
    catalog = context.catalog

    exposed_variables = {
        "context": context,
        "catalog": catalog,
        "session": session,
    }
    get_ipython().push(variables=exposed_variables)

    logging.info("** Kedro project %s", str(metadata.project_name))
    logging.info("Defined global variable `context`, `session` and `catalog`")

    for line_magic in load_entry_points("line_magic"):
        register_line_magic(needs_local_scope(line_magic))
        logging.info("Registered line magic `%s`", line_magic.__name__)
def test_kedro_mlflow_config_setup_export_credentials(kedro_project_with_mlflow_conf):
    """``setup()`` must export the referenced credentials as environment variables."""
    credentials_file = kedro_project_with_mlflow_conf / "conf/base/credentials.yml"
    credentials = {"my_mlflow_creds": {"fake_mlflow_cred": "my_fake_cred"}}
    credentials_file.write_text(yaml.dump(credentials))

    # The config only references the credentials entry by its name.
    config = KedroMlflowConfig(
        project_path=kedro_project_with_mlflow_conf,
        credentials="my_mlflow_creds",
    )

    metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(metadata.package_name)
    with KedroSession.create(
        "fake_project", project_path=kedro_project_with_mlflow_conf
    ):
        config.setup()

    exported_value = os.environ["fake_mlflow_cred"]
    assert exported_value == "my_fake_cred"
def test_kedro_mlflow_config_experiment_exists(mocker, kedro_project_with_mlflow_conf):
    """``setup()`` must cope with an experiment that already exists in the store.

    ``mocker`` is not used by the body; it is kept so the test signature
    stays unchanged.
    """
    # Create an experiment with the same name in the SAME store the config
    # resolves to: the relative "mlruns" uri given below is interpreted
    # against the project root, not against "conf/local". Creating it anywhere
    # else would leave the "already exists" path untested.
    mlflow_tracking_uri = (kedro_project_with_mlflow_conf / "mlruns").as_uri()
    MlflowClient(mlflow_tracking_uri).create_experiment("exp1")

    config = KedroMlflowConfig(
        project_path=kedro_project_with_mlflow_conf,
        mlflow_tracking_uri="mlruns",
        experiment_opts=dict(name="exp1"),
    )

    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        "fake_project", project_path=kedro_project_with_mlflow_conf
    ):
        config.setup()

    assert "exp1" in [exp.name for exp in config.mlflow_client.list_experiments()]
def test_kedro_mlflow_config_setup_set_tracking_uri(kedro_project_with_mlflow_conf):
    """``setup()`` must point mlflow at the tracking uri given to the config."""
    # The relative uri is expected to be resolved against the project folder.
    expected_uri = (kedro_project_with_mlflow_conf / "awesome_tracking").as_uri()

    config = KedroMlflowConfig(
        project_path=kedro_project_with_mlflow_conf,
        mlflow_tracking_uri="awesome_tracking",
        experiment_opts={"name": "exp1"},
    )

    metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(metadata.package_name)
    with KedroSession.create(
        "fake_project", project_path=kedro_project_with_mlflow_conf
    ):
        config.setup()

    actual_uri = mlflow.get_tracking_uri()
    assert actual_uri == expected_uri
def test_cli_init_existing_config(monkeypatch, kedro_project_with_mlflow_conf):
    """The init command must not overwrite an existing ``mlflow.yml``."""
    # "kedro_project_with_mlflow_conf" is a pytest.fixture declared in conftest
    monkeypatch.chdir(kedro_project_with_mlflow_conf)
    runner = CliRunner()

    metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(metadata.package_name)
    with KedroSession.create(
        "fake_project", project_path=kedro_project_with_mlflow_conf
    ) as session:
        context = session.load_context()

        # emulate a first call by writing a mlflow.yml file
        existing_conf = (
            kedro_project_with_mlflow_conf / context.CONF_ROOT / "local" / "mlflow.yml"
        )
        existing_conf.write_text(yaml.dump({"mlflow_tracking_uri": "toto"}))

        result = runner.invoke(cli_init)

        # check that the "already exists" message is displayed
        assert "A 'mlflow.yml' already exists" in result.output

        # check the file remains unmodified
        tracking_uri = get_mlflow_config().mlflow_tracking_uri
        assert tracking_uri.endswith("toto")
def test_mlflow_pipeline_hook_save_pipeline_ml_with_parameters(
    kedro_project_with_mlflow_conf,  # a fixture to be in a kedro project
    tmp_path,
    pipeline_ml_with_parameters,
    dummy_run_params,
):
    """The logged model must embed the inference parameters and be usable."""
    # kedro_project_with_mlflow_conf is a conftest fixture
    metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(metadata.package_name)
    with KedroSession.create(
        package_name=metadata.package_name,
        project_path=kedro_project_with_mlflow_conf,
    ):
        mlflow_conf = get_mlflow_config()
        mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)

        model_path = kedro_project_with_mlflow_conf / "data" / "model.csv"
        catalog_with_parameters = DataCatalog(
            {
                "data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
                "cleaned_data": MemoryDataSet(),
                "params:stopwords": MemoryDataSet(["Hello", "Hi"]),
                "params:penalty": MemoryDataSet(0.1),
                "model": PickleDataSet(model_path.as_posix()),
                "params:threshold": MemoryDataSet(0.5),
            }
        )

        hook = MlflowPipelineHook()
        runner = SequentialRunner()

        hook.after_catalog_created(
            catalog=catalog_with_parameters,
            # `after_catalog_created` is not using any of the arguments below,
            # so they are set to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
            run_id=dummy_run_params["run_id"],
        )

        hook_kwargs = dict(
            run_params=dummy_run_params,
            pipeline=pipeline_ml_with_parameters,
            catalog=catalog_with_parameters,
        )
        hook.before_pipeline_run(**hook_kwargs)
        runner.run(pipeline_ml_with_parameters, catalog_with_parameters)

        current_run_id = mlflow.active_run().info.run_id

        # This is what we want to test: the model must be saved and the
        # parameters automatically persisted on disk
        hook.after_pipeline_run(**hook_kwargs)

        # the 2 parameters which are inputs of the inference pipeline
        # must have been persisted and logged inside the model's artifacts
        model = mlflow.pyfunc.load_model(f"runs:/{current_run_id}/model")
        pyfunc_flavor = model.metadata.to_dict()["flavors"]["python_function"]
        logged_artifacts = set(pyfunc_flavor["artifacts"].keys())
        assert logged_artifacts == {"model", "params:stopwords", "params:threshold"}

        # the model should be loadable and predict() should work
        # (this tests KedroPipelineModel)
        prediction = model.predict(pd.DataFrame(data=[1], columns=["a"]))
        assert prediction.values[0][0] == 1
def test_node_hook_logging(
    kedro_project,
    dummy_run_params,
    dummy_catalog,
    dummy_pipeline,
    dummy_node,
    flatten_dict_params,
    expected,
):
    """Check the params logged by the node hook for ``flatten_dict_params``.

    The node-hook options are written to ``conf/base/mlflow.yml`` and the
    hook is then run against ``dummy_node``; the params recorded in the
    mlflow run must equal ``expected``.
    """
    # Plain call statement: a trailing comma after the call would wrap the
    # result in a throw-away one-element tuple.
    _write_yaml(
        kedro_project / "conf" / "base" / "mlflow.yml",
        dict(
            hooks=dict(
                node=dict(
                    flatten_dict_params=flatten_dict_params,
                    recursive=False,
                    sep="-",
                )
            ),
        ),
    )

    mlflow_node_hook = MlflowNodeHook()

    # Map every dataset name the node reads to its catalog entry; only the
    # values of `_inputs` (the dataset names) are needed.
    node_inputs = {
        v: dummy_catalog._data_sets.get(v) for v in dummy_node._inputs.values()
    }

    mlflow_tracking_uri = (kedro_project / "mlruns").as_uri()

    project_metadata = _get_project_metadata(kedro_project)
    _add_src_to_path(project_metadata.source_dir, kedro_project)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=kedro_project,
    ):
        mlflow.set_tracking_uri(mlflow_tracking_uri)
        with mlflow.start_run():
            mlflow_node_hook.before_pipeline_run(
                run_params=dummy_run_params,
                pipeline=dummy_pipeline,
                catalog=dummy_catalog,
            )
            mlflow_node_hook.before_node_run(
                node=dummy_node,
                catalog=dummy_catalog,
                inputs=node_inputs,
                is_async=False,
                run_id="132",
            )
            run_id = mlflow.active_run().info.run_id

        mlflow_client = MlflowClient(mlflow_tracking_uri)
        current_run = mlflow_client.get_run(run_id)
        assert current_run.data.params == expected
def test_mlflow_pipeline_hook_with_different_pipeline_types(
    kedro_project_with_mlflow_conf,
    env_from_dict,
    pipeline_to_run,
    dummy_catalog,
    dummy_run_params,
):
    """Run the pipeline hook and check tags, artifacts, metric prefixes and model."""
    # kedro_project_with_mlflow_conf is a conftest fixture
    metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(metadata.package_name)
    with KedroSession.create(
        package_name=metadata.package_name,
        project_path=kedro_project_with_mlflow_conf,
    ):
        hook = MlflowPipelineHook()
        runner = SequentialRunner()
        hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` is not using any of the arguments below,
            # so they are set to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
            run_id=dummy_run_params["run_id"],
        )

        hook_kwargs = dict(
            run_params=dummy_run_params,
            pipeline=pipeline_to_run,
            catalog=dummy_catalog,
        )
        hook.before_pipeline_run(**hook_kwargs)
        runner.run(pipeline_to_run, dummy_catalog)
        # Read the run id before `after_pipeline_run` is called.
        run_id = mlflow.active_run().info.run_id
        hook.after_pipeline_run(**hook_kwargs)

        mlflow_conf = get_mlflow_config()
        mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
        run_data = mlflow_client.get_run(run_id).data
        is_pipeline_ml = isinstance(pipeline_to_run, PipelineML)

        # every truthy run param is recorded as a tag
        for param_name, param_value in dummy_run_params.items():
            if param_value:
                assert run_data.tags[param_name] == str(param_value)

        # params are not recorded because we don't have MlflowNodeHook here
        # and the model should not be logged when it is not a PipelineML
        nb_artifacts = len(mlflow_client.list_artifacts(run_id))
        expected_nb_artifacts = 1 if is_pipeline_ml else 0
        assert nb_artifacts == expected_nb_artifacts

        # Check the prefix of the metrics datasets: "my_metrics" is expected
        # to use its own name, "another_metrics" is expected to use "foo".
        metrics_datasets = dummy_catalog._data_sets
        assert metrics_datasets["my_metrics"]._prefix == "my_metrics"
        assert metrics_datasets["another_metrics"]._prefix == "foo"

        if is_pipeline_ml:
            trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
            signature = trained_model.metadata.signature.to_dict()
            assert signature == {
                "inputs": '[{"name": "a", "type": "long"}]',
                "outputs": None,
            }
def test_mlflow_pipeline_hook_metrics_with_run_id(
    kedro_project_with_mlflow_conf, dummy_pipeline_ml, dummy_run_params
):
    """Metrics datasets bound to a ``run_id`` must log into that existing run."""
    metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(metadata.package_name)
    with KedroSession.create(
        package_name=metadata.package_name,
        project_path=kedro_project_with_mlflow_conf,
    ):
        mlflow_conf = get_mlflow_config()
        tracking_uri = mlflow_conf.mlflow_tracking_uri
        mlflow.set_tracking_uri(tracking_uri)

        # Open (and immediately leave) a first run only to get its id.
        with mlflow.start_run():
            existing_run_id = mlflow.active_run().info.run_id

        model_path = kedro_project_with_mlflow_conf / "data" / "model.csv"
        dummy_catalog_with_run_id = DataCatalog(
            {
                "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
                "params:unused_param": MemoryDataSet("blah"),
                "data": MemoryDataSet(),
                "model": PickleDataSet(model_path.as_posix()),
                "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
                "another_metrics": MlflowMetricsDataSet(
                    run_id=existing_run_id, prefix="foo"
                ),
            }
        )

        hook = MlflowPipelineHook()
        runner = SequentialRunner()

        hook.after_catalog_created(
            catalog=dummy_catalog_with_run_id,
            # `after_catalog_created` is not using any of the arguments below,
            # so they are set to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
            run_id=dummy_run_params["run_id"],
        )

        hook_kwargs = dict(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline_ml,
            catalog=dummy_catalog_with_run_id,
        )
        hook.before_pipeline_run(**hook_kwargs)
        runner.run(dummy_pipeline_ml, dummy_catalog_with_run_id)

        current_run_id = mlflow.active_run().info.run_id

        hook.after_pipeline_run(**hook_kwargs)

        mlflow_client = MlflowClient(tracking_uri)

        # the first run is created in Default (id 0),
        # but the one initialised in before_pipeline_run
        # is created in the kedro_project experiment (id 1)
        all_runs_id = {
            run.run_id
            for k in range(2)
            for run in mlflow_client.list_run_infos(experiment_id=f"{k}")
        }

        # the metrics are supposed to have been logged inside existing_run_id
        run_data = mlflow_client.get_run(existing_run_id).data

        assert all_runs_id == {current_run_id, existing_run_id}

        # Metric keys are prefixed: the first dataset has no explicit prefix
        # and is found under "my_metrics", the second one under "foo".
        assert run_data.metrics["my_metrics.metric_key"] == 1.1
        assert run_data.metrics["foo.metric_key"] == 1.1