def test_get_mlflow_config_in_uninitialized_project(kedro_project):
    # config_with_base_mlflow_conf is a pytest.fixture in conftest
    with pytest.raises(
        KedroMlflowConfigError,
        match="No 'mlflow.yml' config file found in environment",
    ):
        project_metadata = _get_project_metadata(kedro_project)
        _add_src_to_path(project_metadata.source_dir, kedro_project)
        configure_project(project_metadata.package_name)
        with KedroSession.create(project_metadata.package_name, kedro_project):
            get_mlflow_config()
def test_get_mlflow_config_in_uninitialized_project(mocker, tmp_path, config_dir):
    # config_with_base_mlflow_conf is a pytest.fixture in conftest
    mocker.patch("logging.config.dictConfig")
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    context = load_context(tmp_path)
    with pytest.raises(
        KedroMlflowConfigError,
        match="No 'mlflow.yml' config file found in environment",
    ):
        get_mlflow_config(context)
def before_pipeline_run(
    self, run_params: Dict[str, Any], pipeline: Pipeline, catalog: DataCatalog
) -> None:
    """Hook to be invoked before a pipeline runs.

    Args:
        run_params: The params needed for the given run.
            Should be identical to the data logged by Journal.
            # @fixme: this needs to be modelled explicitly as code, instead of comment
            Schema: {
                "run_id": str,
                "project_path": str,
                "env": str,
                "kedro_version": str,
                "tags": Optional[List[str]],
                "from_nodes": Optional[List[str]],
                "to_nodes": Optional[List[str]],
                "node_names": Optional[List[str]],
                "from_inputs": Optional[List[str]],
                "load_versions": Optional[List[str]],
                "pipeline_name": str,
                "extra_params": Optional[Dict[str, Any]],
            }
        pipeline: The ``Pipeline`` that will be run.
        catalog: The ``DataCatalog`` to be used during the run.
    """
    self.context = load_context(
        project_path=run_params["project_path"],
        env=run_params["env"],
        extra_params=run_params["extra_params"],
    )
    mlflow_conf = get_mlflow_config(self.context)
    mlflow_conf.setup(self.context)

    run_name = (
        mlflow_conf.run_opts["name"]
        if mlflow_conf.run_opts["name"] is not None
        else run_params["pipeline_name"]
    )
    mlflow.start_run(
        run_id=mlflow_conf.run_opts["id"],
        experiment_id=mlflow_conf.experiment.experiment_id,
        run_name=run_name,
        nested=mlflow_conf.run_opts["nested"],
    )
    # Set tags only for run parameters that have values.
    mlflow.set_tags({k: v for k, v in run_params.items() if v})
    # manually add the git sha for consistency with the journal
    # TODO: this does not take uncommitted files into account, so it
    # does not ensure reproducibility. Define what to do.
    mlflow.set_tag("git_sha", _git_sha(run_params["project_path"]))
    mlflow.set_tag(
        "kedro_command",
        _generate_kedro_command(
            tags=run_params["tags"],
            node_names=run_params["node_names"],
            from_nodes=run_params["from_nodes"],
            to_nodes=run_params["to_nodes"],
            from_inputs=run_params["from_inputs"],
            load_versions=run_params["load_versions"],
            pipeline_name=run_params["pipeline_name"],
        ),
    )
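# A minimal sketch of what the docstring's "@fixme" above asks for: modelling the
# run_params schema explicitly as code instead of a comment. The class name
# `PipelineRunParams` is a hypothetical illustration, not part of kedro's API.
from typing import Any, Dict, List, Optional, TypedDict  # TypedDict: Python 3.8+


class PipelineRunParams(TypedDict):
    run_id: str
    project_path: str
    env: str
    kedro_version: str
    tags: Optional[List[str]]
    from_nodes: Optional[List[str]]
    to_nodes: Optional[List[str]]
    node_names: Optional[List[str]]
    from_inputs: Optional[List[str]]
    load_versions: Optional[List[str]]
    pipeline_name: str
    extra_params: Optional[Dict[str, Any]]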
def test_mlflow_config_with_templated_config_loader(kedro_project_with_tcl):
    _write_yaml(
        kedro_project_with_tcl / "conf" / "local" / "mlflow.yml",
        dict(
            mlflow_tracking_uri="${mlflow_tracking_uri}",
            credentials=None,
            disable_tracking=dict(pipelines=["my_disabled_pipeline"]),
            experiment=dict(name="fake_package", create=True),
            run=dict(id="123456789", name="my_run", nested=True),
            ui=dict(port="5151", host="localhost"),
            hooks=dict(
                node=dict(
                    flatten_dict_params=True,
                    recursive=False,
                    sep="-",
                    long_parameters_strategy="truncate",
                )
            ),
        ),
    )

    _write_yaml(
        kedro_project_with_tcl / "conf" / "local" / "globals.yml",
        dict(mlflow_tracking_uri="dynamic_mlruns"),
    )

    expected = {
        "mlflow_tracking_uri": (kedro_project_with_tcl / "dynamic_mlruns").as_uri(),
        "credentials": None,
        "disable_tracking": {"pipelines": ["my_disabled_pipeline"]},
        "experiments": {"name": "fake_package", "create": True},
        "run": {"id": "123456789", "name": "my_run", "nested": True},
        "ui": {"port": "5151", "host": "localhost"},
        "hooks": {
            "node": {
                "flatten_dict_params": True,
                "recursive": False,
                "sep": "-",
                "long_parameters_strategy": "truncate",
            }
        },
    }

    project_metadata = _get_project_metadata(kedro_project_with_tcl)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_tcl)
    configure_project(project_metadata.package_name)
    with KedroSession.create(project_metadata.package_name, kedro_project_with_tcl):
        assert get_mlflow_config().to_dict() == expected
def ui(env, port, host):
    """Opens the mlflow user interface with the project-specific settings of
    mlflow.yml. This interface enables you to browse and compare runs.
    """
    project_path = Path().cwd()
    project_metadata = _get_project_metadata(project_path)
    _add_src_to_path(project_metadata.source_dir, project_path)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=project_path,
        env=env,
    ):
        mlflow_conf = get_mlflow_config()
        host = host or mlflow_conf.ui_opts.get("host")
        port = port or mlflow_conf.ui_opts.get("port")

        # call mlflow ui with specific options
        # TODO: add more options for ui
        subprocess.call(
            [
                "mlflow",
                "ui",
                "--backend-store-uri",
                mlflow_conf.mlflow_tracking_uri,
                "--host",
                host,
                "--port",
                port,
            ]
        )
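# Usage sketch (option names are assumed from the command signature above and the
# ui section of mlflow.yml, not verified against a specific kedro-mlflow release):
# run from the project root; omitted options fall back to the host/port in mlflow.yml.
#
#     kedro mlflow ui --env local --host 0.0.0.0 --port 5151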
def before_pipeline_run(
    self, run_params: Dict[str, Any], pipeline: Pipeline, catalog: DataCatalog
) -> None:
    """Hook to be invoked before a pipeline runs.

    Args:
        run_params: The params needed for the given run.
            Should be identical to the data logged by Journal.
            # @fixme: this needs to be modelled explicitly as code, instead of comment
            Schema: {
                "run_id": str,
                "project_path": str,
                "env": str,
                "kedro_version": str,
                "tags": Optional[List[str]],
                "from_nodes": Optional[List[str]],
                "to_nodes": Optional[List[str]],
                "node_names": Optional[List[str]],
                "from_inputs": Optional[List[str]],
                "load_versions": Optional[List[str]],
                "pipeline_name": str,
                "extra_params": Optional[Dict[str, Any]],
            }
        pipeline: The ``Pipeline`` that will be run.
        catalog: The ``DataCatalog`` to be used during the run.
    """
    self.context = load_context(
        project_path=run_params["project_path"],
        env=run_params["env"],
        extra_params=run_params["extra_params"],
    )
    config = get_mlflow_config(self.context)
    self.flatten = config.node_hook_opts["flatten_dict_params"]
    self.recursive = config.node_hook_opts["recursive"]
    self.sep = config.node_hook_opts["sep"]
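# A minimal sketch of the behaviour the `flatten_dict_params`, `recursive` and `sep`
# options configure for parameter logging. The helper name `_flatten_dict` is
# illustrative only and does not claim to match kedro-mlflow's implementation.
from typing import Any, Dict


def _flatten_dict(d: Dict[str, Any], recursive: bool, sep: str) -> Dict[str, Any]:
    flat: Dict[str, Any] = {}
    for key, value in d.items():
        if isinstance(value, dict):
            # flatten one level only, or all levels when recursive=True
            children = _flatten_dict(value, recursive, sep) if recursive else value
            for child_key, child_value in children.items():
                flat[f"{key}{sep}{child_key}"] = child_value
        else:
            flat[key] = value
    return flat


# e.g. _flatten_dict({"model": {"lr": 0.1}}, recursive=True, sep="-")
# returns {"model-lr": 0.1}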
def test_deactivated_tracking_for_given_pipeline(
    mock_settings_with_mlflow_hooks,
    patched_configure_project,
    mocker,
    kedro_project_path,
):
    mocker.patch("kedro.framework.session.session.KedroSession._setup_logging")
    with KedroSession.create(MOCK_PACKAGE_NAME, kedro_project_path) as session:
        kedro_mlflow_config = get_mlflow_config()
        kedro_mlflow_config.setup()
        mlflow_client = MlflowClient((kedro_project_path / "mlruns").as_uri())
        # 0 is default, 1 is "fake_exp"
        all_runs_id_beginning = set(
            [
                run.run_id
                for k in range(len(mlflow_client.list_experiments()))
                for run in mlflow_client.list_run_infos(experiment_id=f"{k}")
            ]
        )
        context = session.load_context()
        context.run(pipeline_name="pipeline_off")
        all_runs_id_end = set(
            [
                run.run_id
                for k in range(len(mlflow_client.list_experiments()))
                for run in mlflow_client.list_run_infos(experiment_id=f"{k}")
            ]
        )
        assert all_runs_id_beginning == all_runs_id_end  # no run is created
def mlflow_start(ctx, kubeflow_run_id: str):
    from kedro_mlflow.framework.context import get_mlflow_config
    import mlflow

    mlflow_conf = get_mlflow_config(ctx.obj["kedro_ctx"])
    mlflow_conf.setup(ctx.obj["kedro_ctx"])

    run = mlflow.start_run(
        experiment_id=mlflow_conf.experiment.experiment_id, nested=False
    )
    mlflow.set_tag("kubeflow_run_id", kubeflow_run_id)
    with open("/tmp/mlflow_run_id", "w") as f:
        f.write(run.info.run_id)
    click.echo(f"Started run: {run.info.run_id}")
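# A sketch of how a downstream Kubeflow step could reattach to the run started above,
# assuming it shares the /tmp/mlflow_run_id file and the same tracking configuration.
# The tag name "kubeflow_step" is a hypothetical illustration.
import mlflow

with open("/tmp/mlflow_run_id") as f:
    run_id = f.read().strip()

# resume the existing run instead of creating a new one
with mlflow.start_run(run_id=run_id):
    mlflow.set_tag("kubeflow_step", "training")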
def test_mlflow_config_with_templated_config(mocker, tmp_path, config_dir):
    _write_yaml(
        tmp_path / "conf" / "base" / "mlflow.yml",
        dict(
            mlflow_tracking_uri="${mlflow_tracking_uri}",
            credentials=None,
            experiment=dict(name="fake_package", create=True),
            run=dict(id="123456789", name="my_run", nested=True),
            ui=dict(port="5151", host="localhost"),
            hooks=dict(
                node=dict(
                    flatten_dict_params=True,
                    recursive=False,
                    sep="-",
                    long_parameters_strategy="truncate",
                )
            ),
        ),
    )

    _write_yaml(
        tmp_path / "conf" / "base" / "globals.yml",
        dict(mlflow_tracking_uri="testruns"),
    )

    expected = {
        "mlflow_tracking_uri": (tmp_path / "testruns").as_uri(),
        "credentials": None,
        "experiments": {"name": "fake_package", "create": True},
        "run": {"id": "123456789", "name": "my_run", "nested": True},
        "ui": {"port": "5151", "host": "localhost"},
        "hooks": {
            "node": {
                "flatten_dict_params": True,
                "recursive": False,
                "sep": "-",
                "long_parameters_strategy": "truncate",
            }
        },
    }

    context = load_context(tmp_path)
    assert get_mlflow_config(context).to_dict() == expected
def ui(project_path, env):
    """Opens the mlflow user interface with the project-specific settings of
    mlflow.yml. This interface enables you to browse and compare runs.
    """
    # the context must contain the self.mlflow attributes with the mlflow configuration
    mlflow_conf = get_mlflow_config(project_path=project_path, env=env)

    # call mlflow ui with specific options
    # TODO: add more options for ui
    subprocess.call(
        ["mlflow", "ui", "--backend-store-uri", mlflow_conf.mlflow_tracking_uri]
    )
def test_get_mlflow_config(mocker, tmp_path, config_dir):
    # config_with_base_mlflow_conf is a pytest.fixture in conftest
    mocker.patch("logging.config.dictConfig")
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    _write_yaml(
        tmp_path / "conf" / "base" / "mlflow.yml",
        dict(
            mlflow_tracking_uri="mlruns",
            credentials=None,
            experiment=dict(name="fake_package", create=True),
            run=dict(id="123456789", name="my_run", nested=True),
            ui=dict(port="5151", host="localhost"),
            hooks=dict(
                node=dict(
                    flatten_dict_params=True,
                    recursive=False,
                    sep="-",
                    long_parameters_strategy="truncate",
                )
            ),
        ),
    )
    expected = {
        "mlflow_tracking_uri": (tmp_path / "mlruns").as_uri(),
        "credentials": None,
        "experiments": {"name": "fake_package", "create": True},
        "run": {"id": "123456789", "name": "my_run", "nested": True},
        "ui": {"port": "5151", "host": "localhost"},
        "hooks": {
            "node": {
                "flatten_dict_params": True,
                "recursive": False,
                "sep": "-",
                "long_parameters_strategy": "truncate",
            }
        },
    }
    context = load_context(tmp_path)
    assert get_mlflow_config(context).to_dict() == expected
def mlflow_start(ctx, kubeflow_run_id: str, output: str):
    import mlflow
    from kedro_mlflow.framework.context import get_mlflow_config

    token = AuthHandler().obtain_id_token()
    if token:
        os.environ["MLFLOW_TRACKING_TOKEN"] = token
        LOG.info("Configuring MLFLOW_TRACKING_TOKEN")

    kedro_context = ctx.obj["context_helper"].context
    mlflow_conf = get_mlflow_config(kedro_context)
    mlflow_conf.setup(kedro_context)

    run = mlflow.start_run(
        experiment_id=mlflow_conf.experiment.experiment_id, nested=False
    )
    mlflow.set_tag("kubeflow_run_id", kubeflow_run_id)
    with open(output, "w") as f:
        f.write(run.info.run_id)
    click.echo(f"Started run: {run.info.run_id}")
def test_cli_init_existing_config(monkeypatch, tmp_path, kedro_project):
    # "kedro_project" is a pytest.fixture declared in conftest
    project_path = tmp_path / "fake-project"
    monkeypatch.chdir(project_path)
    cli_runner = CliRunner()
    project_context = load_context(project_path.as_posix())

    # emulate a first call by writing a mlflow.yml file
    yaml_str = yaml.dump(dict(mlflow_tracking_uri="toto"))
    (project_path / project_context.CONF_ROOT / "local/mlflow.yml").write_text(yaml_str)

    result = cli_runner.invoke(cli_init)

    # check an error message is raised
    assert "A 'mlflow.yml' already exists" in result.output

    # check the file remains unmodified
    assert get_mlflow_config(project_context).mlflow_tracking_uri.endswith("toto")
def test_mlflow_pipeline_hook_with_different_pipeline_types(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    env_from_dict,
    pipeline_to_run,
    dummy_catalog,
    dummy_run_params,
    dummy_mlflow_conf,
):
    # config_with_base_mlflow_conf is a conftest fixture
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)
    pipeline_hook = MlflowPipelineHook(conda_env=env_from_dict, model_name="model")
    runner = SequentialRunner()
    pipeline_hook.before_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )
    runner.run(pipeline_to_run, dummy_catalog, dummy_run_params["run_id"])
    run_id = mlflow.active_run().info.run_id
    pipeline_hook.after_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )

    # test: parameters should have been logged
    mlflow_conf = get_mlflow_config(tmp_path)
    mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
    run_data = mlflow_client.get_run(run_id).data

    # all run_params are recorded as tags
    for k, v in dummy_run_params.items():
        if v:
            assert run_data.tags[k] == str(v)

    # params are not recorded because we don't have MlflowNodeHook here
    # and the model should not be logged when it is not a PipelineML
    nb_artifacts = len(mlflow_client.list_artifacts(run_id))
    if isinstance(pipeline_to_run, PipelineML):
        assert nb_artifacts == 1
    else:
        assert nb_artifacts == 0
def test_cli_init_existing_config(monkeypatch, kedro_project_with_mlflow_conf):
    # "kedro_project" is a pytest.fixture declared in conftest
    cli_runner = CliRunner()
    monkeypatch.chdir(kedro_project_with_mlflow_conf)
    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        "fake_project", project_path=kedro_project_with_mlflow_conf
    ) as session:
        context = session.load_context()

        # emulate a first call by writing a mlflow.yml file
        yaml_str = yaml.dump(dict(mlflow_tracking_uri="toto"))
        (
            kedro_project_with_mlflow_conf / context.CONF_ROOT / "local" / "mlflow.yml"
        ).write_text(yaml_str)

        result = cli_runner.invoke(cli_init)

        # check an error message is raised
        assert "A 'mlflow.yml' already exists" in result.output

        # check the file remains unmodified
        assert get_mlflow_config().mlflow_tracking_uri.endswith("toto")
def test_mlflow_pipeline_hook_metrics_with_run_id(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    dummy_pipeline_ml,
    dummy_run_params,
    dummy_mlflow_conf,
):
    # config_with_base_mlflow_conf is a conftest fixture
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)

    context = load_context(tmp_path)
    mlflow_conf = get_mlflow_config(context)
    mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)

    with mlflow.start_run():
        existing_run_id = mlflow.active_run().info.run_id

    dummy_catalog_with_run_id = DataCatalog(
        {
            "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
            "params:unused_param": MemoryDataSet("blah"),
            "data": MemoryDataSet(),
            "model": PickleDataSet((tmp_path / "model.csv").as_posix()),
            "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
            "another_metrics": MlflowMetricsDataSet(
                run_id=existing_run_id, prefix="foo"
            ),
        }
    )

    pipeline_hook = MlflowPipelineHook()

    runner = SequentialRunner()
    pipeline_hook.after_catalog_created(
        catalog=dummy_catalog_with_run_id,
        # `after_catalog_created` is not using any of the arguments below,
        # so we are setting them to empty values.
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )
    pipeline_hook.before_pipeline_run(
        run_params=dummy_run_params,
        pipeline=dummy_pipeline_ml,
        catalog=dummy_catalog_with_run_id,
    )
    runner.run(dummy_pipeline_ml, dummy_catalog_with_run_id)

    current_run_id = mlflow.active_run().info.run_id

    pipeline_hook.after_pipeline_run(
        run_params=dummy_run_params,
        pipeline=dummy_pipeline_ml,
        catalog=dummy_catalog_with_run_id,
    )

    mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
    all_runs_id = set(
        [run.run_id for run in mlflow_client.list_run_infos(experiment_id="0")]
    )

    # the metrics are supposed to have been logged inside existing_run_id
    run_data = mlflow_client.get_run(existing_run_id).data

    # check that the metrics datasets are prefixed with their names
    assert all_runs_id == {current_run_id, existing_run_id}
    assert run_data.metrics["my_metrics.metric_key"] == 1.1
    assert run_data.metrics["foo.metric_key"] == 1.1
def test_mlflow_pipeline_hook_metrics_with_run_id(
    kedro_project_with_mlflow_conf, dummy_pipeline_ml, dummy_run_params
):
    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=kedro_project_with_mlflow_conf,
    ):
        mlflow_conf = get_mlflow_config()
        mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)

        with mlflow.start_run():
            existing_run_id = mlflow.active_run().info.run_id

        dummy_catalog_with_run_id = DataCatalog(
            {
                "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
                "params:unused_param": MemoryDataSet("blah"),
                "data": MemoryDataSet(),
                "model": PickleDataSet(
                    (kedro_project_with_mlflow_conf / "data" / "model.csv").as_posix()
                ),
                "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
                "another_metrics": MlflowMetricsDataSet(
                    run_id=existing_run_id, prefix="foo"
                ),
            }
        )

        pipeline_hook = MlflowPipelineHook()

        runner = SequentialRunner()
        pipeline_hook.after_catalog_created(
            catalog=dummy_catalog_with_run_id,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
            run_id=dummy_run_params["run_id"],
        )
        pipeline_hook.before_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline_ml,
            catalog=dummy_catalog_with_run_id,
        )
        runner.run(dummy_pipeline_ml, dummy_catalog_with_run_id)

        current_run_id = mlflow.active_run().info.run_id

        pipeline_hook.after_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline_ml,
            catalog=dummy_catalog_with_run_id,
        )

        mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
        # the first run is created in Default (id 0),
        # but the one initialised in before_pipeline_run
        # is created in the kedro_project experiment (id 1)
        all_runs_id = set(
            [
                run.run_id
                for k in range(2)
                for run in mlflow_client.list_run_infos(experiment_id=f"{k}")
            ]
        )

        # the metrics are supposed to have been logged inside existing_run_id
        run_data = mlflow_client.get_run(existing_run_id).data

        # check that the metrics datasets are prefixed with their names
        assert all_runs_id == {current_run_id, existing_run_id}
        assert run_data.metrics["my_metrics.metric_key"] == 1.1
        assert run_data.metrics["foo.metric_key"] == 1.1
def test_mlflow_pipeline_hook_with_different_pipeline_types(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    env_from_dict,
    pipeline_to_run,
    dummy_catalog,
    dummy_run_params,
    dummy_mlflow_conf,
):
    # config_with_base_mlflow_conf is a conftest fixture
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)
    pipeline_hook = MlflowPipelineHook()
    runner = SequentialRunner()
    pipeline_hook.after_catalog_created(
        catalog=dummy_catalog,
        # `after_catalog_created` is not using any of the arguments below,
        # so we are setting them to empty values.
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )
    pipeline_hook.before_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )
    runner.run(pipeline_to_run, dummy_catalog)
    run_id = mlflow.active_run().info.run_id
    pipeline_hook.after_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )

    # test: parameters should have been logged
    context = load_context(tmp_path)
    mlflow_conf = get_mlflow_config(context)
    mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
    run_data = mlflow_client.get_run(run_id).data

    # all run_params are recorded as tags
    for k, v in dummy_run_params.items():
        if v:
            assert run_data.tags[k] == str(v)

    # params are not recorded because we don't have MlflowNodeHook here
    # and the model should not be logged when it is not a PipelineML
    nb_artifacts = len(mlflow_client.list_artifacts(run_id))
    if isinstance(pipeline_to_run, PipelineML):
        assert nb_artifacts == 1
    else:
        assert nb_artifacts == 0

    # check that the metrics datasets are prefixed with their names
    assert dummy_catalog._data_sets["my_metrics"]._prefix == "my_metrics"
    assert dummy_catalog._data_sets["another_metrics"]._prefix == "foo"

    if isinstance(pipeline_to_run, PipelineML):
        trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
        assert trained_model.metadata.signature.to_dict() == {
            "inputs": '[{"name": "a", "type": "long"}]',
            "outputs": None,
        }
def test_mlflow_pipeline_hook_save_pipeline_ml_with_parameters(
    mocker,
    monkeypatch,
    config_dir,  # a fixture to be in a kedro project
    dummy_mlflow_conf,  # a fixture to setup the mlflow configuration
    tmp_path,
    pipeline_ml_with_parameters,
    dummy_run_params,
):
    # config_with_base_mlflow_conf is a conftest fixture
    monkeypatch.chdir(tmp_path)

    context = load_context(tmp_path)
    mlflow_conf = get_mlflow_config(context)
    mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)

    catalog_with_parameters = DataCatalog(
        {
            "data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
            "cleaned_data": MemoryDataSet(),
            "params:stopwords": MemoryDataSet(["Hello", "Hi"]),
            "params:penalty": MemoryDataSet(0.1),
            "model": PickleDataSet((tmp_path / "model.csv").as_posix()),
            "params:threshold": MemoryDataSet(0.5),
        }
    )

    pipeline_hook = MlflowPipelineHook()

    runner = SequentialRunner()
    pipeline_hook.after_catalog_created(
        catalog=catalog_with_parameters,
        # `after_catalog_created` is not using any of the arguments below,
        # so we are setting them to empty values.
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )
    pipeline_hook.before_pipeline_run(
        run_params=dummy_run_params,
        pipeline=pipeline_ml_with_parameters,
        catalog=catalog_with_parameters,
    )
    runner.run(pipeline_ml_with_parameters, catalog_with_parameters)

    current_run_id = mlflow.active_run().info.run_id

    # This is what we want to test: the model must be saved and the parameters
    # automatically persisted on disk
    pipeline_hook.after_pipeline_run(
        run_params=dummy_run_params,
        pipeline=pipeline_ml_with_parameters,
        catalog=catalog_with_parameters,
    )

    # the 2 parameters which are inputs of the inference pipeline
    # must have been persisted and logged inside the model's artifacts
    model = mlflow.pyfunc.load_model(f"runs:/{current_run_id}/model")
    assert set(
        model.metadata.to_dict()["flavors"]["python_function"]["artifacts"].keys()
    ) == {"model", "params:stopwords", "params:threshold"}

    # the model should be loadable and predict() should work (this tests KedroPipelineModel)
    assert model.predict(pd.DataFrame(data=[1], columns=["a"])).values[0][0] == 1