def test_pipeline_run_hook_getting_configs(
    kedro_project,
    dummy_run_params,
    dummy_pipeline,
    dummy_catalog,
):
    """MlflowNodeHook picks up node options from conf/local/mlflow.yml.

    Writes a config enabling dict-param flattening, then checks that
    ``before_pipeline_run`` loads flatten/recursive/sep from that file.
    """
    _write_yaml(
        kedro_project / "conf" / "local" / "mlflow.yml",
        dict(
            hooks=dict(
                node=dict(flatten_dict_params=True, recursive=False, sep="-")
            ),
        ),
    )  # fixed: removed stray trailing comma that made this statement a throwaway 1-tuple

    project_metadata = _get_project_metadata(kedro_project)
    _add_src_to_path(project_metadata.source_dir, kedro_project)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=kedro_project,
    ):
        mlflow_node_hook = MlflowNodeHook()
        mlflow_node_hook.before_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline,
            catalog=dummy_catalog,
        )
        # The hook must reflect exactly what the config file declared.
        assert (
            mlflow_node_hook.flatten,
            mlflow_node_hook.recursive,
            mlflow_node_hook.sep,
        ) == (True, False, "-")
def test_node_hook(tmp_path):
    """``before_node_run`` logs node parameters to the active mlflow run.

    With ``flatten_dict_params=True`` and ``sep="-"``, the nested
    ``parameters`` dict is flattened into ``parameters-param1`` /
    ``parameters-param2`` entries; mlflow stores all params as strings.
    """
    mlflow_node_hook = MlflowNodeHook(
        flatten_dict_params=True, recursive=True, sep="-"
    )

    def fake_fun(arg1, arg2, arg3):
        return None

    node_test = node(
        func=fake_fun,
        inputs={"arg1": "params:param1", "arg2": "foo", "arg3": "parameters"},
        outputs="out",
    )
    catalog = DataCatalog(
        {
            "params:param1": 1,
            "foo": MemoryDataSet(),
            "bar": MemoryDataSet(),
            "parameters": {"param1": 1, "param2": 2},
        }
    )
    # Map dataset names to their catalog entries. Only the values of the
    # node's input mapping matter (the keys were unused in the original loop).
    node_inputs = {
        v: catalog._data_sets.get(v) for v in node_test._inputs.values()
    }

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow_node_hook.before_node_run(
            node=node_test,
            catalog=catalog,
            inputs=node_inputs,
            is_async=False,
            run_id="132",
        )
        run_id = mlflow.active_run().info.run_id

    mlflow_client = MlflowClient(mlflow_tracking_uri)
    current_run = mlflow_client.get_run(run_id)
    assert current_run.data.params == {
        "param1": "1",
        "parameters-param1": "1",
        "parameters-param2": "2",
    }
def test_pipeline_run_hook_getting_configs(
    tmp_path,
    config_dir,
    monkeypatch,
    dummy_run_params,
    dummy_pipeline,
    dummy_catalog,
):
    """MlflowNodeHook reads node options from conf/base/mlflow.yml.

    Chdirs into a temp project, writes the config, and checks that
    ``before_pipeline_run`` loads flatten/recursive/sep from it.
    """
    monkeypatch.chdir(tmp_path)
    _write_yaml(
        tmp_path / "conf" / "base" / "mlflow.yml",
        dict(
            hooks=dict(
                node=dict(flatten_dict_params=True, recursive=False, sep="-")
            ),
        ),
    )  # fixed: removed stray trailing comma that made this statement a throwaway 1-tuple

    mlflow_node_hook = MlflowNodeHook()
    mlflow_node_hook.before_pipeline_run(
        run_params=dummy_run_params,
        pipeline=dummy_pipeline,
        catalog=dummy_catalog,
    )
    assert (
        mlflow_node_hook.flatten,
        mlflow_node_hook.recursive,
        mlflow_node_hook.sep,
    ) == (True, False, "-")
def test_node_hook_logging(
    tmp_path,
    mocker,
    monkeypatch,
    dummy_run_params,
    dummy_catalog,
    dummy_pipeline,
    dummy_node,
    config_dir,
    flatten_dict_params,
    expected,
):
    """Params logged by ``before_node_run`` match ``expected``.

    Parametrized over ``flatten_dict_params`` (written to mlflow.yml) and the
    corresponding ``expected`` param dict that mlflow should have recorded.
    """
    mocker.patch("logging.config.dictConfig")
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)
    # (removed a stale commented-out KedroMlflowConfig/mocker.patch block)
    _write_yaml(
        tmp_path / "conf" / "base" / "mlflow.yml",
        dict(
            hooks=dict(
                node=dict(
                    flatten_dict_params=flatten_dict_params,
                    recursive=False,
                    sep="-",
                )
            ),
        ),
    )  # fixed: removed stray trailing comma that made this statement a throwaway 1-tuple

    mlflow_node_hook = MlflowNodeHook()
    # Only the values of the node's input mapping are needed (keys unused).
    node_inputs = {
        v: dummy_catalog._data_sets.get(v) for v in dummy_node._inputs.values()
    }

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow_node_hook.before_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline,
            catalog=dummy_catalog,
        )
        mlflow_node_hook.before_node_run(
            node=dummy_node,
            catalog=dummy_catalog,
            inputs=node_inputs,
            is_async=False,
            run_id="132",
        )
        run_id = mlflow.active_run().info.run_id

    mlflow_client = MlflowClient(mlflow_tracking_uri)
    current_run = mlflow_client.get_run(run_id)
    assert current_run.data.params == expected
def test_node_hook_logging_above_limit_tag_strategy(
    kedro_project, dummy_run_params, param_length
):
    """With ``long_parameters_strategy="tag"``, over-long params become tags.

    No params are recorded; the value is stored as a (non-mlflow-prefixed) tag.
    """
    _write_yaml(
        kedro_project / "conf" / "local" / "mlflow.yml",
        dict(hooks=dict(node=dict(long_parameters_strategy="tag"))),
    )

    tracking_uri = (kedro_project / "mlruns").as_uri()
    mlflow.set_tracking_uri(tracking_uri)

    hook = MlflowNodeHook()
    oversized_value = "a" * param_length
    hook_inputs = {"params:my_param": oversized_value}

    metadata = _get_project_metadata(kedro_project)
    _add_src_to_path(metadata.source_dir, kedro_project)
    configure_project(metadata.package_name)
    with KedroSession.create(
        package_name=metadata.package_name,
        project_path=kedro_project,
    ):
        with mlflow.start_run():
            hook.before_pipeline_run(
                run_params=dummy_run_params,
                pipeline=Pipeline([]),
                catalog=DataCatalog(),
            )
            # IMPORTANT: Overpassing the parameters limit should raise an
            # error for all mlflow backends, but it does not on the FileStore
            # backend:
            # https://github.com/mlflow/mlflow/issues/2814#issuecomment-628284425
            # Since tests use FileStore for simplicity, logging works here,
            # but the plugin enforces its own handling (slightly different
            # from mlflow behaviour).
            hook.before_node_run(
                node=node(func=lambda x: x, inputs=dict(x="a"), outputs=None),
                catalog=DataCatalog(),  # can be empty
                inputs=hook_inputs,
                is_async=False,
                run_id="132",
            )
            run_id = mlflow.active_run().info.run_id

    client = MlflowClient(tracking_uri)
    run = client.get_run(run_id)
    assert run.data.params == {}
    user_tags = {
        key: value
        for key, value in run.data.tags.items()
        if not key.startswith("mlflow")
    }
    assert user_tags == {"my_param": oversized_value}
class ProjectContext(KedroContext): project_name = "nyc-taxi" # `project_version` is the version of kedro used to generate the project project_version = "0.16.4" package_name = "nyc-taxi" hooks = ( MlflowNodeHook(flatten_dict_params=False), MlflowPipelineHook( model_name="nyc-taxi", conda_env= "/home/dwarszawski/Workspace/personal/dssconf2020/dssconf2020/ml-pipeline/src/requirements.txt", ), ) def _get_pipelines(self) -> Dict[str, Pipeline]: return create_pipelines()
class ProjectContext(KedroContext):
    """Users can override the remaining methods from the parent class here,
    or create new ones (e.g. as required by plugins)
    """

    # Cookiecutter placeholders below are rendered when the project template
    # is generated — do not edit them by hand in the template.
    project_name = "{{ cookiecutter.project_name }}"
    # `project_version` is the version of kedro used to generate the project
    project_version = "{{ cookiecutter.kedro_version }}"
    package_name = "{{ cookiecutter.python_package }}"
    # Register kedro-mlflow hooks: per-node parameter logging and
    # pipeline-level mlflow run/model handling.
    hooks = (
        MlflowNodeHook(flatten_dict_params=False),
        MlflowPipelineHook(
            model_name="{{ cookiecutter.python_package }}",
            conda_env="src/requirements.txt",
        ),
    )

    def _get_pipelines(self) -> Dict[str, Pipeline]:
        # Delegate pipeline construction to the project's pipeline registry.
        return create_pipelines()
def test_node_hook_logging_above_limit_truncate_strategy(
    kedro_project, dummy_run_params, param_length
):
    """With ``long_parameters_strategy="truncate"``, over-long params are cut
    to ``MAX_PARAM_VAL_LENGTH`` characters before being logged."""
    _write_yaml(
        kedro_project / "conf" / "local" / "mlflow.yml",
        dict(hooks=dict(node=dict(long_parameters_strategy="truncate"))),
    )

    tracking_uri = (kedro_project / "mlruns").as_uri()
    mlflow.set_tracking_uri(tracking_uri)

    hook = MlflowNodeHook()
    oversized_value = "a" * param_length
    hook_inputs = {"params:my_param": oversized_value}

    metadata = _get_project_metadata(kedro_project)
    _add_src_to_path(metadata.source_dir, kedro_project)
    configure_project(metadata.package_name)
    with KedroSession.create(
        package_name=metadata.package_name,
        project_path=kedro_project,
    ):
        with mlflow.start_run():
            hook.before_pipeline_run(
                run_params=dummy_run_params,
                pipeline=Pipeline([]),
                catalog=DataCatalog(),
            )
            hook.before_node_run(
                node=node(func=lambda x: x, inputs=dict(x="a"), outputs=None),
                catalog=DataCatalog(),  # can be empty
                inputs=hook_inputs,
                is_async=False,
                run_id="132",
            )
            run_id = mlflow.active_run().info.run_id

    client = MlflowClient(tracking_uri)
    run = client.get_run(run_id)
    assert run.data.params == {
        "my_param": oversized_value[0:MAX_PARAM_VAL_LENGTH]
    }
def test_node_hook_logging_above_limit_fail_strategy(
    tmp_path, config_dir, dummy_run_params, dummy_node, param_length
):
    """With ``long_parameters_strategy="fail"``, an over-long parameter raises
    ``ValueError`` mentioning the parameter name and its length."""
    # (removed a stale commented-out mocker.patch line)
    _write_yaml(
        tmp_path / "conf" / "base" / "mlflow.yml",
        dict(hooks=dict(node=dict(long_parameters_strategy="fail")),),
    )

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)

    mlflow_node_hook = MlflowNodeHook()

    param_value = param_length * "a"
    node_inputs = {"params:my_param": param_value}

    with mlflow.start_run():
        mlflow_node_hook.before_pipeline_run(
            run_params=dummy_run_params,
            pipeline=Pipeline([]),
            catalog=DataCatalog(),
        )
        # IMPORTANT: Overpassing the parameters limit should raise an error
        # for all mlflow backends, but it does not on the FileStore backend:
        # https://github.com/mlflow/mlflow/issues/2814#issuecomment-628284425
        # Since we use the FileStore system for simplicity in tests, logging
        # works, but we have enforced failure (which is slightly different
        # from mlflow behaviour).
        with pytest.raises(
            ValueError, match=f"Parameter 'my_param' length is {param_length}"
        ):
            mlflow_node_hook.before_node_run(
                node=node(func=lambda x: x, inputs=dict(x="a"), outputs=None),
                catalog=DataCatalog(),  # can be empty
                inputs=node_inputs,
                is_async=False,
                run_id="132",
            )
def test_node_hook_logging_above_limit_truncate_strategy(
    tmp_path, config_dir, dummy_run_params, dummy_node, param_length
):
    """With ``long_parameters_strategy="truncate"``, the logged parameter is
    cut to ``MAX_PARAM_VAL_LENGTH`` characters."""
    # (removed a stale commented-out mocker.patch line)
    _write_yaml(
        tmp_path / "conf" / "base" / "mlflow.yml",
        dict(hooks=dict(node=dict(long_parameters_strategy="truncate")),),
    )

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)

    mlflow_node_hook = MlflowNodeHook()

    param_value = param_length * "a"
    node_inputs = {"params:my_param": param_value}

    with mlflow.start_run():
        mlflow_node_hook.before_pipeline_run(
            run_params=dummy_run_params,
            pipeline=Pipeline([]),
            catalog=DataCatalog(),
        )
        mlflow_node_hook.before_node_run(
            node=node(func=lambda x: x, inputs=dict(x="a"), outputs=None),
            catalog=DataCatalog(),  # can be empty
            inputs=node_inputs,
            is_async=False,
            run_id="132",
        )
        run_id = mlflow.active_run().info.run_id

    mlflow_client = MlflowClient(mlflow_tracking_uri)
    current_run = mlflow_client.get_run(run_id)
    assert current_run.data.params == {
        "my_param": param_value[0:MAX_PARAM_VAL_LENGTH]
    }
def mock_settings_with_mlflow_hooks(mocker):
    """Settings mock whose hook tuple includes the kedro-mlflow hooks."""
    hook_instances = (
        DummyProjectHooks(),
        MlflowPipelineHook(),
        MlflowNodeHook(),
    )
    return _mock_settings_with_hooks(mocker, hooks=hook_instances)