def dummy_catalog(tmp_path):
    """Build the ``DataCatalog`` shared by the metrics hook tests.

    The catalog holds plain in-memory datasets, a pickled ``model`` entry and
    two ``MlflowMetricsDataSet`` entries: ``my_metrics`` is created without a
    prefix while ``another_metrics`` gets the explicit prefix ``"foo"``.

    Args:
        tmp_path: Path object under which the ``model`` entry is stored.

    Returns:
        The populated ``DataCatalog``.
    """
    model_filepath = (tmp_path / "model.csv").as_posix()
    raw_frame = pd.DataFrame(data=[1], columns=["a"])

    return DataCatalog(
        {
            "raw_data": MemoryDataSet(raw_frame),
            "params:unused_param": MemoryDataSet("blah"),
            "data": MemoryDataSet(),
            "model": PickleDataSet(model_filepath),
            "my_metrics": MlflowMetricsDataSet(),
            "another_metrics": MlflowMetricsDataSet(prefix="foo"),
        }
    )
def test_mlflow_metrics_dataset_exists(tmp_path, tracking_uri, metrics3):
    """Check that ``exists()`` is true once the metrics dataset has been saved."""
    mlflow.set_tracking_uri(tracking_uri.as_uri())
    metrics_dataset = MlflowMetricsDataSet(prefix="test_metric")

    # No run id is passed to the dataset: a mlflow run_id is automatically
    # created when saving.
    metrics_dataset.save(metrics3)

    assert metrics_dataset.exists()
def after_catalog_created(
    self,
    catalog: DataCatalog,
    conf_catalog: Dict[str, Any],
    conf_creds: Dict[str, Any],
    feed_dict: Dict[str, Any],
    save_version: str,
    load_versions: str,
):
    """Name unnamed mlflow metric datasets after their catalog entry.

    This hook keeps the metric name consistent with the catalog name:

    * a ``MlflowMetricsDataSet`` whose ``_prefix`` is ``None`` is replaced by
      a new ``MlflowMetricsDataSet`` using the entry name as ``prefix``;
    * a ``MlflowMetricDataSet`` or ``MlflowMetricHistoryDataSet`` whose
      ``key`` is ``None`` is replaced by a new dataset of that class using
      the entry name as ``key`` and reusing the original ``_load_args`` and
      ``_save_args``.

    In both cases the original ``_run_id`` is forwarded only when it is set.

    Args:
        catalog: Catalog whose ``_data_sets`` entries are replaced in place.
        conf_catalog: Not used by this hook.
        conf_creds: Not used by this hook.
        feed_dict: Not used by this hook.
        save_version: Not used by this hook.
        load_versions: Not used by this hook.
    """
    for name, dataset in catalog._data_sets.items():
        # Only values of existing keys are reassigned below, so the mapping
        # keeps its size while it is being iterated.
        if isinstance(dataset, MlflowMetricsDataSet) and dataset._prefix is None:
            init_kwargs = {"prefix": name}
            if dataset._run_id is not None:
                init_kwargs["run_id"] = dataset._run_id
            catalog._data_sets[name] = MlflowMetricsDataSet(**init_kwargs)

        # Single-metric and metric-history datasets are rebuilt the same way;
        # each class is checked independently against the original dataset.
        for metric_cls in (MlflowMetricDataSet, MlflowMetricHistoryDataSet):
            if isinstance(dataset, metric_cls) and dataset.key is None:
                init_kwargs = {
                    "key": name,
                    "load_args": dataset._load_args,
                    "save_args": dataset._save_args,
                }
                if dataset._run_id is not None:
                    init_kwargs["run_id"] = dataset._run_id
                catalog._data_sets[name] = metric_cls(**init_kwargs)
def test_mlflow_metrics_dataset_fails_with_invalid_metric(
    tmp_path, tracking_uri, metrics3
):
    """Check that saving a metric given as a bare value raises ``DataSetError``."""
    mlflow.set_tracking_uri(tracking_uri.as_uri())
    metrics_dataset = MlflowMetricsDataSet(prefix="test_metric")

    # key: value is not valid, you must specify {key: {value, step}}
    invalid_metrics = {"metric1": 1}

    with pytest.raises(
        DataSetError, match="Unexpected metric value. Should be of type"
    ):
        metrics_dataset.save(invalid_metrics)
def test_mlflow_metrics_dataset_does_not_exist(tmp_path, tracking_uri, metrics3):
    """Check that ``exists()`` is false for a metrics dataset never saved."""
    mlflow.set_tracking_uri(tracking_uri.as_uri())

    # Open and close a run so that the dataset can be pointed at an existing
    # run id in which to search for its metrics.
    with mlflow.start_run():
        run_id = mlflow.active_run().info.run_id

    metrics_dataset = MlflowMetricsDataSet(prefix="test_metric", run_id=run_id)

    assert not metrics_dataset.exists()
def test_mlflow_metrics_dataset_saved_without_run_id(tmp_path, tracking_uri, metrics3):
    """Check that metrics saved without an explicit run id are logged in mlflow."""
    prefix = "test_metric"
    uri = tracking_uri.as_uri()

    mlflow.set_tracking_uri(uri)
    mlflow_client = MlflowClient(tracking_uri=uri)
    metrics_dataset = MlflowMetricsDataSet(prefix=prefix)

    # a mlflow run_id is automatically created
    metrics_dataset.save(metrics3)

    # Read back the id of the run that is active after saving.
    active_run_id = mlflow.active_run().info.run_id

    assert_are_metrics_logged(metrics3, mlflow_client, active_run_id, prefix)
def test_mlflow_metrics_dataset_saved_and_logged(tmp_path, tracking_uri, data, prefix):
    """Check that saved metrics are logged in mlflow and can be loaded back."""
    uri = tracking_uri.as_uri()
    mlflow.set_tracking_uri(uri)
    mlflow_client = MlflowClient(tracking_uri=uri)
    metrics_dataset = MlflowMetricsDataSet(prefix=prefix)

    with mlflow.start_run():
        run_id = mlflow.active_run().info.run_id
        metrics_dataset.save(data)

        # Check if metrics were logged correctly in MLflow.
        assert_are_metrics_logged(data, mlflow_client, run_id, prefix)

    # Check if metrics are stored in catalog.
    # Run id needs to be provided as there is no active run.
    reloaded_metrics = MlflowMetricsDataSet(prefix=prefix, run_id=run_id).load()

    assert len(reloaded_metrics) == len(data)
    for logged_key, logged_value in reloaded_metrics.items():
        # With a prefix, the original key is the part after the last dot.
        if prefix is not None:
            data_key = logged_key.split(".")[-1]
        else:
            data_key = logged_key
        assert data[data_key] == logged_value
def after_catalog_created(
    self,
    catalog: DataCatalog,
    conf_catalog: Dict[str, Any],
    conf_creds: Dict[str, Any],
    feed_dict: Dict[str, Any],
    save_version: str,
    load_versions: str,
    run_id: str,
):
    """Prefix unnamed ``MlflowMetricsDataSet`` entries with their catalog name.

    Each ``MlflowMetricsDataSet`` whose ``_prefix`` is ``None`` is replaced in
    ``catalog._data_sets`` by a new ``MlflowMetricsDataSet`` that uses the
    entry name as ``prefix``. The original ``_run_id`` is forwarded only when
    it is set.

    Args:
        catalog: Catalog whose ``_data_sets`` entries are replaced in place.
        conf_catalog: Not used by this hook.
        conf_creds: Not used by this hook.
        feed_dict: Not used by this hook.
        save_version: Not used by this hook.
        load_versions: Not used by this hook.
        run_id: Not used by this hook.
    """
    for name, dataset in catalog._data_sets.items():
        needs_prefix = (
            isinstance(dataset, MlflowMetricsDataSet) and dataset._prefix is None
        )
        if not needs_prefix:
            continue

        # Only the value of an existing key is reassigned, so the mapping
        # keeps its size while it is being iterated.
        init_kwargs = {"prefix": name}
        if dataset._run_id is not None:
            init_kwargs["run_id"] = dataset._run_id
        catalog._data_sets[name] = MlflowMetricsDataSet(**init_kwargs)
def test_mlflow_metrics_logging_deactivation(tracking_uri, metrics):
    """Check that saving metrics creates no mlflow run when logging is off.

    The set of run ids known to the tracking server is collected before and
    after ``save`` is called on a dataset whose ``_logging_activated`` flag is
    ``False``; both sets must be equal.
    """
    mlflow_metrics_dataset = MlflowMetricsDataSet(prefix="hello")

    mlflow.set_tracking_uri(tracking_uri.as_uri())
    mlflow_client = MlflowClient(tracking_uri=tracking_uri.as_uri())

    mlflow_metrics_dataset._logging_activated = False

    def _all_run_ids():
        """Return the ids of all runs of all experiments listed by the client."""
        # Use the id reported by each experiment instead of assuming that ids
        # are the contiguous integers 0..n-1; otherwise runs of experiments
        # with any other id would be silently left out of the comparison.
        return {
            run.run_id
            for experiment in mlflow_client.list_experiments()
            for run in mlflow_client.list_run_infos(
                experiment_id=experiment.experiment_id
            )
        }

    run_ids_before = _all_run_ids()

    mlflow_metrics_dataset.save(metrics)

    run_ids_after = _all_run_ids()

    assert run_ids_before == run_ids_after
def test_mlflow_metrics_logging_deactivation_is_bool():
    """Check that ``_logging_activated`` rejects a non-boolean value."""
    # Build the dataset outside the ``raises`` block so that only the
    # assignment itself can satisfy the expected error.
    metrics_dataset = MlflowMetricsDataSet(prefix="hello")
    expected_message = "_logging_activated must be a boolean"

    with pytest.raises(ValueError, match=expected_message):
        metrics_dataset._logging_activated = "hello"
def test_mlflow_hook_metrics_dataset_with_run_id(
    kedro_project_with_mlflow_conf, dummy_pipeline, dummy_run_params
):
    """Check that metric datasets bound to a run id log into that run.

    A run is created up front and its id is given to every mlflow metric
    dataset of the catalog. After the pipeline has run through the hook, the
    metrics are read from that pre-existing run, and their names must be the
    explicit prefix / key when one was given, or the catalog entry name
    otherwise.
    """
    bootstrap_project(kedro_project_with_mlflow_conf)
    with KedroSession.create(project_path=kedro_project_with_mlflow_conf) as session:
        context = session.load_context()

        # Create the run which the metric datasets are bound to, then close it.
        with mlflow.start_run():
            existing_run_id = mlflow.active_run().info.run_id

        model_filepath = (
            kedro_project_with_mlflow_conf / "data" / "model.csv"
        ).as_posix()
        catalog_with_run_id = DataCatalog(
            {
                "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
                "params:unused_param": MemoryDataSet("blah"),
                "data": MemoryDataSet(),
                "model": PickleDataSet(model_filepath),
                "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
                "another_metrics": MlflowMetricsDataSet(
                    run_id=existing_run_id, prefix="foo"
                ),
                "my_metric": MlflowMetricDataSet(run_id=existing_run_id),
                "another_metric": MlflowMetricDataSet(
                    run_id=existing_run_id, key="foo"
                ),
                "my_metric_history": MlflowMetricHistoryDataSet(
                    run_id=existing_run_id
                ),
                "another_metric_history": MlflowMetricHistoryDataSet(
                    run_id=existing_run_id, key="bar"
                ),
            }
        )

        hook = MlflowHook()
        runner = SequentialRunner()
        pipeline_run_kwargs = {
            "run_params": dummy_run_params,
            "pipeline": dummy_pipeline,
            "catalog": catalog_with_run_id,
        }

        hook.after_context_created(context)
        # `after_catalog_created` is not using any of the arguments below the
        # catalog, so we are setting them to empty values.
        hook.after_catalog_created(
            catalog=catalog_with_run_id,
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
        )
        hook.before_pipeline_run(**pipeline_run_kwargs)
        runner.run(dummy_pipeline, catalog_with_run_id, session._hook_manager)

        # Must be read before `after_pipeline_run` is called.
        current_run_id = mlflow.active_run().info.run_id

        hook.after_pipeline_run(**pipeline_run_kwargs)

        mlflow_client = MlflowClient(context.mlflow.server.mlflow_tracking_uri)
        # the first run is created in Default (id 0),
        # but the one initialised in before_pipeline_run
        # is created in kedro_project experiment (id 1)
        all_runs_id = {
            run.run_id
            for experiment_id in ("0", "1")
            for run in mlflow_client.list_run_infos(experiment_id=experiment_id)
        }

        # the metrics are supposed to have been logged inside existing_run_id
        logged = mlflow_client.get_run(existing_run_id).data

        # Check if metrics datasets have prefix with its names.
        assert all_runs_id == {current_run_id, existing_run_id}
        assert logged.metrics["my_metrics.metric_key"] == 1.1
        assert logged.metrics["foo.metric_key"] == 1.1
        assert logged.metrics["my_metric"] == 1.1
        assert logged.metrics["foo"] == 1.1
        # the list is stored, but only the last value is retrieved
        assert logged.metrics["my_metric_history"] == 0.2
        # the list is stored, but only the last value is retrieved
        assert logged.metrics["bar"] == 0.2
def test_mlflow_pipeline_hook_metrics_with_run_id(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    dummy_pipeline_ml,
    dummy_run_params,
    dummy_mlflow_conf,
):
    """Check that metrics datasets bound to a run id log into that run.

    A run is created up front and its id is given to both
    ``MlflowMetricsDataSet`` entries. After the pipeline has run through the
    pipeline hook, the metrics are read from that pre-existing run and must
    be prefixed with the explicit prefix, or the catalog entry name when no
    prefix was given.
    """
    # config_with_base_mlflow_conf is a conftest fixture
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)

    context = load_context(tmp_path)
    mlflow_conf = get_mlflow_config(context)
    mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)

    # Create the run which the metrics datasets are bound to, then close it.
    with mlflow.start_run():
        existing_run_id = mlflow.active_run().info.run_id

    model_filepath = (tmp_path / "model.csv").as_posix()
    catalog_with_run_id = DataCatalog(
        {
            "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
            "params:unused_param": MemoryDataSet("blah"),
            "data": MemoryDataSet(),
            "model": PickleDataSet(model_filepath),
            "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
            "another_metrics": MlflowMetricsDataSet(
                run_id=existing_run_id, prefix="foo"
            ),
        }
    )

    hook = MlflowPipelineHook()
    runner = SequentialRunner()
    pipeline_run_kwargs = {
        "run_params": dummy_run_params,
        "pipeline": dummy_pipeline_ml,
        "catalog": catalog_with_run_id,
    }

    # `after_catalog_created` is not using any of the arguments below the
    # catalog, so we are setting them to empty values.
    hook.after_catalog_created(
        catalog=catalog_with_run_id,
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )
    hook.before_pipeline_run(**pipeline_run_kwargs)
    runner.run(dummy_pipeline_ml, catalog_with_run_id)

    # Must be read before `after_pipeline_run` is called.
    current_run_id = mlflow.active_run().info.run_id

    hook.after_pipeline_run(**pipeline_run_kwargs)

    mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
    all_runs_id = {
        run.run_id for run in mlflow_client.list_run_infos(experiment_id="0")
    }

    # the metrics are supposed to have been logged inside existing_run_id
    logged = mlflow_client.get_run(existing_run_id).data

    # Check if metrics datasets have prefix with its names.
    assert all_runs_id == {current_run_id, existing_run_id}
    assert logged.metrics["my_metrics.metric_key"] == 1.1
    assert logged.metrics["foo.metric_key"] == 1.1
def test_mlflow_pipeline_hook_metrics_with_run_id(
    kedro_project_with_mlflow_conf, dummy_pipeline_ml, dummy_run_params
):
    """Check that metrics datasets bound to a run id log into that run.

    A run is created up front and its id is given to both
    ``MlflowMetricsDataSet`` entries. After the pipeline has run through the
    pipeline hook inside a Kedro session, the metrics are read from that
    pre-existing run and must be prefixed with the explicit prefix, or the
    catalog entry name when no prefix was given.
    """
    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)

    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=kedro_project_with_mlflow_conf,
    ):
        mlflow_conf = get_mlflow_config()
        mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)

        # Create the run which the metrics datasets are bound to, then close it.
        with mlflow.start_run():
            existing_run_id = mlflow.active_run().info.run_id

        model_filepath = (
            kedro_project_with_mlflow_conf / "data" / "model.csv"
        ).as_posix()
        catalog_with_run_id = DataCatalog(
            {
                "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
                "params:unused_param": MemoryDataSet("blah"),
                "data": MemoryDataSet(),
                "model": PickleDataSet(model_filepath),
                "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
                "another_metrics": MlflowMetricsDataSet(
                    run_id=existing_run_id, prefix="foo"
                ),
            }
        )

        hook = MlflowPipelineHook()
        runner = SequentialRunner()
        pipeline_run_kwargs = {
            "run_params": dummy_run_params,
            "pipeline": dummy_pipeline_ml,
            "catalog": catalog_with_run_id,
        }

        # `after_catalog_created` is not using any of the arguments below the
        # catalog, so we are setting them to empty values.
        hook.after_catalog_created(
            catalog=catalog_with_run_id,
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
            run_id=dummy_run_params["run_id"],
        )
        hook.before_pipeline_run(**pipeline_run_kwargs)
        runner.run(dummy_pipeline_ml, catalog_with_run_id)

        # Must be read before `after_pipeline_run` is called.
        current_run_id = mlflow.active_run().info.run_id

        hook.after_pipeline_run(**pipeline_run_kwargs)

        mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
        # the first run is created in Default (id 0),
        # but the one initialised in before_pipeline_run
        # is created in kedro_project experiment (id 1)
        all_runs_id = {
            run.run_id
            for experiment_id in ("0", "1")
            for run in mlflow_client.list_run_infos(experiment_id=experiment_id)
        }

        # the metrics are supposed to have been logged inside existing_run_id
        logged = mlflow_client.get_run(existing_run_id).data

        # Check if metrics datasets have prefix with its names.
        assert all_runs_id == {current_run_id, existing_run_id}
        assert logged.metrics["my_metrics.metric_key"] == 1.1
        assert logged.metrics["foo.metric_key"] == 1.1