def test_mlflow_pipeline_hook_with_copy_mode(
    kedro_project_with_mlflow_conf,
    dummy_pipeline_ml,
    dummy_catalog,
    dummy_run_params,
    copy_mode,
    expected,
):
    # config_with_base_mlflow_conf is a conftest fixture
    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=kedro_project_with_mlflow_conf,
    ):
        pipeline_hook = MlflowPipelineHook()
        runner = SequentialRunner()
        pipeline_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
            run_id=dummy_run_params["run_id"],
        )
        pipeline_to_run = pipeline_ml_factory(
            training=dummy_pipeline_ml.training,
            inference=dummy_pipeline_ml.inference,
            input_name=dummy_pipeline_ml.input_name,
            conda_env={},
            model_name=dummy_pipeline_ml.model_name,
            copy_mode=copy_mode,
        )
        pipeline_hook.before_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )
        runner.run(pipeline_to_run, dummy_catalog)
        run_id = mlflow.active_run().info.run_id
        pipeline_hook.after_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )
        mlflow_tracking_uri = (kedro_project_with_mlflow_conf / "mlruns").as_uri()
        mlflow.set_tracking_uri(mlflow_tracking_uri)
        loaded_model = mlflow.pyfunc.load_model(model_uri=f"runs:/{run_id}/model")
        actual_copy_mode = {
            name: ds._copy_mode
            for name, ds in loaded_model._model_impl.python_model.loaded_catalog._data_sets.items()
        }
        assert actual_copy_mode == expected

def _run_one_task(self, config_filename):
    # create nodes from Tasks
    load_data_node = mls.workflows.tasks.LoadDataTask.get_node()
    prepare_data_node = mls.sl.workflows.tasks.PrepareDataTask.get_node()
    split_data_node = mls.sl.workflows.tasks.SplitDataTask.get_node()
    learn_data_node = mls.sl.workflows.tasks.LearnTask.get_node()
    config, log = self._init_config_log(config_filename,
                                        self.base_directory,
                                        self.config_directory)
    # Prepare a data catalog
    data_catalog = DataCatalog({'config': MemoryDataSet(),
                                'log': MemoryDataSet(),
                                'base_directory': MemoryDataSet()})
    data_catalog.save('config', config)
    data_catalog.save('log', log)
    data_catalog.save('base_directory', self.base_directory)
    # Assemble nodes into a pipeline
    pipeline = Pipeline([load_data_node,
                         prepare_data_node,
                         split_data_node,
                         learn_data_node])
    # Create a runner to run the pipeline
    runner = SequentialRunner()
    # Run the pipeline
    runner.run(pipeline, data_catalog)
    return log, config, data_catalog

def run(self):
    """
    Run all tasks
    """
    # data
    data_catalog = DataCatalog({'config': MemoryDataSet(),
                                'log': MemoryDataSet(),
                                'base_directory': MemoryDataSet(),
                                'dataset': MemoryDataSet(),
                                'data': MemoryDataSet()})
    data_catalog.save('config', self.config)
    data_catalog.save('log', self.log)
    data_catalog.save('base_directory', self.base_directory)
    load_data_node = mls.workflows.tasks.LoadDataTask.get_node()
    prepare_data_node = mls.sl.workflows.tasks.PrepareDataTask.get_node()
    split_data_node = mls.sl.workflows.tasks.SplitDataTask.get_node()
    learn_node = mls.sl.workflows.tasks.LearnTask.get_node()
    evaluate_node = mls.sl.workflows.tasks.EvaluateTask.get_node()
    # Assemble nodes into a pipeline
    pipeline = Pipeline([load_data_node,
                         prepare_data_node,
                         split_data_node,
                         learn_node,
                         evaluate_node])
    # Create a runner to run the pipeline
    runner = SequentialRunner()
    # Run the pipeline
    runner.run(pipeline, data_catalog)
    self.terminate()

def test_mlflow_pipeline_hook_with_copy_mode(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    dummy_pipeline_ml,
    dummy_catalog,
    dummy_run_params,
    dummy_mlflow_conf,
    copy_mode,
    expected,
):
    # config_with_base_mlflow_conf is a conftest fixture
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)
    pipeline_hook = MlflowPipelineHook()
    runner = SequentialRunner()
    pipeline_hook.after_catalog_created(
        catalog=dummy_catalog,
        # `after_catalog_created` is not using any of the arguments below,
        # so we are setting them to empty values.
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )
    pipeline_to_run = pipeline_ml_factory(
        training=dummy_pipeline_ml.training,
        inference=dummy_pipeline_ml.inference,
        input_name=dummy_pipeline_ml.input_name,
        conda_env={},
        model_name=dummy_pipeline_ml.model_name,
        copy_mode=copy_mode,
    )
    pipeline_hook.before_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )
    runner.run(pipeline_to_run, dummy_catalog)
    run_id = mlflow.active_run().info.run_id
    pipeline_hook.after_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )
    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    loaded_model = mlflow.pyfunc.load_model(model_uri=f"runs:/{run_id}/model")
    actual_copy_mode = {
        name: ds._copy_mode
        for name, ds in loaded_model._model_impl.python_model.loaded_catalog._data_sets.items()
    }
    assert actual_copy_mode == expected

def test_mlflow_hook_save_pipeline_ml(
    kedro_project_with_mlflow_conf,
    pipeline_to_run,
    dummy_catalog,
    dummy_run_params,
):
    bootstrap_project(kedro_project_with_mlflow_conf)
    with KedroSession.create(project_path=kedro_project_with_mlflow_conf) as session:
        context = session.load_context()  # triggers conf setup
        # config_with_base_mlflow_conf is a conftest fixture
        mlflow_hook = MlflowHook()
        mlflow_hook.after_context_created(context)  # setup mlflow config
        runner = SequentialRunner()
        mlflow_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
        )
        mlflow_hook.before_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )
        runner.run(pipeline_to_run, dummy_catalog, session._hook_manager)
        run_id = mlflow.active_run().info.run_id
        mlflow_hook.after_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )
        # test: parameters should have been logged
        mlflow_client = MlflowClient(context.mlflow.server.mlflow_tracking_uri)
        run_data = mlflow_client.get_run(run_id).data
        # all run_params are recorded as tags
        for k, v in dummy_run_params.items():
            if v:
                assert run_data.tags[k] == str(v)
        # params are not recorded because we don't have MlflowHook here
        # and the model should not be logged when it is not a PipelineML
        nb_artifacts = len(mlflow_client.list_artifacts(run_id))
        if isinstance(pipeline_to_run, PipelineML):
            assert nb_artifacts == 1
        else:
            assert nb_artifacts == 0
        if isinstance(pipeline_to_run, PipelineML):
            trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
            assert trained_model.metadata.signature.to_dict() == {
                "inputs": '[{"name": "a", "type": "long"}]',
                "outputs": None,
            }

def test_mlflow_hook_save_pipeline_ml_with_artifact_path(
    kedro_project_with_mlflow_conf,
    env_from_dict,
    dummy_pipeline,
    dummy_catalog,
    dummy_run_params,
    artifact_path,
    expected_artifact_path,
):
    # config_with_base_mlflow_conf is a conftest fixture
    bootstrap_project(kedro_project_with_mlflow_conf)
    with KedroSession.create(project_path=kedro_project_with_mlflow_conf) as session:
        mlflow_hook = MlflowHook()
        runner = SequentialRunner()

        log_model_kwargs = {
            "conda_env": env_from_dict,
        }
        if artifact_path is not None:
            # we need to test what happens if the key is NOT present
            log_model_kwargs["artifact_path"] = artifact_path

        pipeline_to_run = pipeline_ml_factory(
            training=dummy_pipeline.only_nodes_with_tags("training"),
            inference=dummy_pipeline.only_nodes_with_tags("inference"),
            input_name="raw_data",
            log_model_kwargs=log_model_kwargs,
        )

        context = session.load_context()
        mlflow_hook.after_context_created(context)
        mlflow_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
        )
        mlflow_hook.before_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )
        runner.run(pipeline_to_run, dummy_catalog, session._hook_manager)
        run_id = mlflow.active_run().info.run_id
        mlflow_hook.after_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )

        # test: parameters should have been logged
        trained_model = mlflow.pyfunc.load_model(
            f"runs:/{run_id}/{expected_artifact_path}"
        )
        # the real test is that the model is loaded without error
        assert trained_model is not None

def test_mlflow_pipeline_hook_with_pipeline_ml_signature(
    kedro_project_with_mlflow_conf,
    env_from_dict,
    dummy_pipeline,
    dummy_catalog,
    dummy_run_params,
    model_signature,
    expected_signature,
):
    # config_with_base_mlflow_conf is a conftest fixture
    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=kedro_project_with_mlflow_conf,
    ):
        pipeline_hook = MlflowPipelineHook()
        runner = SequentialRunner()
        pipeline_to_run = pipeline_ml_factory(
            training=dummy_pipeline.only_nodes_with_tags("training"),
            inference=dummy_pipeline.only_nodes_with_tags("inference"),
            input_name="raw_data",
            conda_env=env_from_dict,
            model_name="model",
            model_signature=model_signature,
        )
        pipeline_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
            run_id=dummy_run_params["run_id"],
        )
        pipeline_hook.before_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )
        runner.run(pipeline_to_run, dummy_catalog)
        run_id = mlflow.active_run().info.run_id
        pipeline_hook.after_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )
        # test: parameters should have been logged
        trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
        assert trained_model.metadata.signature == expected_signature

def test_mlflow_pipeline_hook_with_pipeline_ml_signature(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    env_from_dict,
    dummy_pipeline,
    dummy_catalog,
    dummy_run_params,
    dummy_mlflow_conf,
    model_signature,
    expected_signature,
):
    # config_with_base_mlflow_conf is a conftest fixture
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)
    pipeline_hook = MlflowPipelineHook()
    runner = SequentialRunner()
    pipeline_to_run = pipeline_ml_factory(
        training=dummy_pipeline.only_nodes_with_tags("training"),
        inference=dummy_pipeline.only_nodes_with_tags("inference"),
        input_name="raw_data",
        conda_env=env_from_dict,
        model_name="model",
        model_signature=model_signature,
    )
    pipeline_hook.after_catalog_created(
        catalog=dummy_catalog,
        # `after_catalog_created` is not using any of the arguments below,
        # so we are setting them to empty values.
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )
    pipeline_hook.before_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )
    runner.run(pipeline_to_run, dummy_catalog)
    run_id = mlflow.active_run().info.run_id
    pipeline_hook.after_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )
    # test: parameters should have been logged
    trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
    assert trained_model.metadata.signature == expected_signature

def test_ds_pipeline(all_catalog):
    runner = SequentialRunner()
    output_name = 'outputs'
    pipeline = create_pipeline(inputs=["dataframex", "dataframey"], outputs=output_name)
    pipeline_output = runner.run(pipeline, all_catalog)
    assert pipeline_output is not None
    with pytest.raises(ValueError):
        # must not train on bad values
        pipeline = create_pipeline(inputs=["dataframex", "dataframey_bad"], outputs=output_name)
        pipeline_output = runner.run(pipeline, all_catalog)

def test_mlflow_hook_save_pipeline_ml_with_signature(
    kedro_project_with_mlflow_conf,
    env_from_dict,
    dummy_pipeline,
    dummy_catalog,
    dummy_run_params,
    model_signature,
    expected_signature,
):
    # config_with_base_mlflow_conf is a conftest fixture
    bootstrap_project(kedro_project_with_mlflow_conf)
    with KedroSession.create(project_path=kedro_project_with_mlflow_conf) as session:
        mlflow_hook = MlflowHook()
        runner = SequentialRunner()
        pipeline_to_run = pipeline_ml_factory(
            training=dummy_pipeline.only_nodes_with_tags("training"),
            inference=dummy_pipeline.only_nodes_with_tags("inference"),
            input_name="raw_data",
            log_model_kwargs={
                "conda_env": env_from_dict,
                "signature": model_signature,
            },
        )
        context = session.load_context()
        mlflow_hook.after_context_created(context)
        mlflow_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
        )
        mlflow_hook.before_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )
        runner.run(pipeline_to_run, dummy_catalog, session._hook_manager)
        run_id = mlflow.active_run().info.run_id
        mlflow_hook.after_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )
        # test: parameters should have been logged
        trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
        assert trained_model.metadata.signature == expected_signature

class Kedro171:
    def __init__(self):
        self.session = get_session()
        self.context = self.session.load_context()
        self.catalog = self.context.catalog
        self.pipeline = self.context.pipeline
        self.pipelines = self.context.pipelines
        self.runner = SequentialRunner()

    def run(self, pipeline=None, catalog=None):
        if pipeline is None:
            pipeline = self.pipeline
        if catalog is None:
            catalog = self.catalog
        self.runner.run(pipeline, catalog)

def run(self):
    """
    Run the workflow: run each config
    """
    # data
    data_catalog = DataCatalog({'config': MemoryDataSet(),
                                'log': MemoryDataSet(),
                                'base_directory': MemoryDataSet()})
    data_catalog.save('config', self.config)
    data_catalog.save('log', self.log)
    data_catalog.save('base_directory', self.base_directory)
    expand_config_node = mls.sl.workflows.tasks.ExpandConfigTask.get_node()
    multiple_learning_node = mls.sl.workflows.tasks.MultipleLearningTask.get_node()
    # Assemble nodes into a pipeline
    pipeline = Pipeline([expand_config_node, multiple_learning_node])
    # Create a runner to run the pipeline
    runner = SequentialRunner()
    # Run the pipeline
    result = runner.run(pipeline, data_catalog)
    if len(result) == 0:
        self.terminate()

def _run_one_task(self, config_filename):
    # create node from Task
    expand_config_node = mls.sl.workflows.tasks.ExpandConfigTask.get_node()
    config, log = self._init_config_log(config_filename,
                                        self.base_directory,
                                        self.config_directory)
    # Prepare a data catalog
    data_catalog = DataCatalog({'config': MemoryDataSet(),
                                'log': MemoryDataSet(),
                                'expanded_config': MemoryDataSet()})
    data_catalog.save('config', config)
    data_catalog.save('log', log)
    # Assemble nodes into a pipeline
    pipeline = Pipeline([expand_config_node])
    # Create a runner to run the pipeline
    runner = SequentialRunner()
    # Run the pipeline
    runner.run(pipeline, data_catalog)
    return log, data_catalog

def test_mlflow_pipeline_hook_with_different_pipeline_types(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    env_from_dict,
    pipeline_to_run,
    dummy_catalog,
    dummy_run_params,
    dummy_mlflow_conf,
):
    # config_with_base_mlflow_conf is a conftest fixture
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)
    pipeline_hook = MlflowPipelineHook(conda_env=env_from_dict, model_name="model")
    runner = SequentialRunner()
    pipeline_hook.before_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )
    runner.run(pipeline_to_run, dummy_catalog, dummy_run_params["run_id"])
    run_id = mlflow.active_run().info.run_id
    pipeline_hook.after_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )
    # test: parameters should have been logged
    mlflow_conf = get_mlflow_config(tmp_path)
    mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
    run_data = mlflow_client.get_run(run_id).data
    # all run_params are recorded as tags
    for k, v in dummy_run_params.items():
        if v:
            assert run_data.tags[k] == str(v)
    # params are not recorded because we don't have MlflowNodeHook here
    # and the model should not be logged when it is not a PipelineML
    nb_artifacts = len(mlflow_client.list_artifacts(run_id))
    if isinstance(pipeline_to_run, PipelineML):
        assert nb_artifacts == 1
    else:
        assert nb_artifacts == 0

def predict(self, context, model_input):
    # TODO: check out how to pass extra args in predict,
    # for instance to enable parallelization
    self.loaded_catalog.add(
        data_set_name=self.pipeline_ml.input_name,
        data_set=MemoryDataSet(model_input),
        replace=True,
    )
    runner = SequentialRunner()
    run_outputs = runner.run(pipeline=self.pipeline_ml.inference,
                             catalog=self.loaded_catalog)
    return run_outputs

def test_feature_engineering_pipeline(sample_data_catalog_train: DataCatalog,
                                      runner: SequentialRunner):
    train_pipeline = create_pipeline(
        output_X_train_normalized="sample_iris_X_train_normalized",
        output_X_test_normalized="sample_iris_X_test_normalized",
        output_y_train="sample_iris_y_train",
        output_y_test="sample_iris_y_test",
        normalizer="sample_normalizer",
    )
    output = runner.run(pipeline=train_pipeline, catalog=sample_data_catalog_train)
    assert output["sample_iris_X_train_normalized"].shape == (3, 4)
    assert output["sample_iris_X_test_normalized"].shape == (1, 4)
    assert output["sample_iris_y_train"].shape == (3,)
    assert output["sample_iris_y_test"].shape == (1,)

# Prepare a data catalog
data_catalog = DataCatalog({"my_salutation": MemoryDataSet()})


# Prepare first node
def return_greeting():
    return "Hello"


return_greeting_node = node(return_greeting, inputs=None, outputs="my_salutation")


# Prepare second node
def join_statements(greeting):
    return f"{greeting} Kedro!"


join_statements_node = node(
    join_statements, inputs="my_salutation", outputs="my_message"
)

# Assemble nodes into a pipeline
pipeline = Pipeline([return_greeting_node, join_statements_node])

# Create a runner to run the pipeline
runner = SequentialRunner()

# Run the pipeline
print(runner.run(pipeline, data_catalog))

def test_mlflow_hook_metrics_dataset_with_run_id(
    kedro_project_with_mlflow_conf, dummy_pipeline, dummy_run_params
):
    bootstrap_project(kedro_project_with_mlflow_conf)
    with KedroSession.create(project_path=kedro_project_with_mlflow_conf) as session:
        context = session.load_context()

        with mlflow.start_run():
            existing_run_id = mlflow.active_run().info.run_id

        dummy_catalog_with_run_id = DataCatalog(
            {
                "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
                "params:unused_param": MemoryDataSet("blah"),
                "data": MemoryDataSet(),
                "model": PickleDataSet(
                    (kedro_project_with_mlflow_conf / "data" / "model.csv").as_posix()
                ),
                "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
                "another_metrics": MlflowMetricsDataSet(
                    run_id=existing_run_id, prefix="foo"
                ),
                "my_metric": MlflowMetricDataSet(run_id=existing_run_id),
                "another_metric": MlflowMetricDataSet(
                    run_id=existing_run_id, key="foo"
                ),
                "my_metric_history": MlflowMetricHistoryDataSet(run_id=existing_run_id),
                "another_metric_history": MlflowMetricHistoryDataSet(
                    run_id=existing_run_id, key="bar"
                ),
            }
        )

        mlflow_hook = MlflowHook()
        runner = SequentialRunner()

        mlflow_hook.after_context_created(context)
        mlflow_hook.after_catalog_created(
            catalog=dummy_catalog_with_run_id,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
        )
        mlflow_hook.before_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline,
            catalog=dummy_catalog_with_run_id,
        )
        runner.run(dummy_pipeline, dummy_catalog_with_run_id, session._hook_manager)

        current_run_id = mlflow.active_run().info.run_id

        mlflow_hook.after_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline,
            catalog=dummy_catalog_with_run_id,
        )

        mlflow_client = MlflowClient(context.mlflow.server.mlflow_tracking_uri)
        # the first run is created in Default (id 0),
        # but the one initialised in before_pipeline_run
        # is created in the kedro_project experiment (id 1)
        all_runs_id = set(
            [
                run.run_id
                for k in range(2)
                for run in mlflow_client.list_run_infos(experiment_id=f"{k}")
            ]
        )

        # the metrics are supposed to have been logged inside existing_run_id
        run_data = mlflow_client.get_run(existing_run_id).data

        # check that the metrics datasets are prefixed with their names
        assert all_runs_id == {current_run_id, existing_run_id}
        assert run_data.metrics["my_metrics.metric_key"] == 1.1
        assert run_data.metrics["foo.metric_key"] == 1.1
        assert run_data.metrics["my_metric"] == 1.1
        assert run_data.metrics["foo"] == 1.1
        assert (
            run_data.metrics["my_metric_history"] == 0.2
        )  # the list is stored, but only the last value is retrieved
        assert (
            run_data.metrics["bar"] == 0.2
        )  # the list is stored, but only the last value is retrieved

data_catalog = DataCatalog({"example_data": MemoryDataSet()})


def return_greeting():
    # Prepare first node
    return "Hello"


return_greeting_node = node(return_greeting, inputs=None, outputs="my_salutation")


def join_statements(greeting):
    # Prepare second node
    return f"{greeting} Kedro!"


join_statements_node = node(join_statements, inputs="my_salutation", outputs="my_message")

# Assemble nodes into a pipeline
pipeline = Pipeline([return_greeting_node, join_statements_node])

# Create a runner
runner = SequentialRunner()

# Run the pipeline
runner.run(pipeline, data_catalog)

def test_mlflow_pipeline_hook_save_pipeline_ml_with_parameters(
    mocker,
    monkeypatch,
    config_dir,  # a fixture to be in a kedro project
    dummy_mlflow_conf,  # a fixture to setup mlflow configuration
    tmp_path,
    pipeline_ml_with_parameters,
    dummy_run_params,
):
    # config_with_base_mlflow_conf is a conftest fixture
    monkeypatch.chdir(tmp_path)

    context = load_context(tmp_path)
    mlflow_conf = get_mlflow_config(context)
    mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)

    catalog_with_parameters = DataCatalog(
        {
            "data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
            "cleaned_data": MemoryDataSet(),
            "params:stopwords": MemoryDataSet(["Hello", "Hi"]),
            "params:penalty": MemoryDataSet(0.1),
            "model": PickleDataSet((tmp_path / "model.csv").as_posix()),
            "params:threshold": MemoryDataSet(0.5),
        }
    )

    pipeline_hook = MlflowPipelineHook()

    runner = SequentialRunner()
    pipeline_hook.after_catalog_created(
        catalog=catalog_with_parameters,
        # `after_catalog_created` is not using any of the arguments below,
        # so we are setting them to empty values.
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )
    pipeline_hook.before_pipeline_run(
        run_params=dummy_run_params,
        pipeline=pipeline_ml_with_parameters,
        catalog=catalog_with_parameters,
    )
    runner.run(pipeline_ml_with_parameters, catalog_with_parameters)

    current_run_id = mlflow.active_run().info.run_id

    # This is what we want to test: the model must be saved and the parameters
    # automatically persisted on disk
    pipeline_hook.after_pipeline_run(
        run_params=dummy_run_params,
        pipeline=pipeline_ml_with_parameters,
        catalog=catalog_with_parameters,
    )

    # the 2 parameters which are inputs of the inference pipeline
    # must have been persisted and logged inside the model's artifacts
    model = mlflow.pyfunc.load_model(f"runs:/{current_run_id}/model")
    assert set(
        model.metadata.to_dict()["flavors"]["python_function"]["artifacts"].keys()
    ) == {"model", "params:stopwords", "params:threshold"}

    # the model should be loadable and predict() should work
    # (this tests KedroPipelineModel)
    assert model.predict(pd.DataFrame(data=[1], columns=["a"])).values[0][0] == 1

def test_mlflow_pipeline_hook_metrics_with_run_id(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    dummy_pipeline_ml,
    dummy_run_params,
    dummy_mlflow_conf,
):
    # config_with_base_mlflow_conf is a conftest fixture
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)

    context = load_context(tmp_path)
    mlflow_conf = get_mlflow_config(context)
    mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)

    with mlflow.start_run():
        existing_run_id = mlflow.active_run().info.run_id

    dummy_catalog_with_run_id = DataCatalog(
        {
            "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
            "params:unused_param": MemoryDataSet("blah"),
            "data": MemoryDataSet(),
            "model": PickleDataSet((tmp_path / "model.csv").as_posix()),
            "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
            "another_metrics": MlflowMetricsDataSet(run_id=existing_run_id, prefix="foo"),
        }
    )

    pipeline_hook = MlflowPipelineHook()

    runner = SequentialRunner()
    pipeline_hook.after_catalog_created(
        catalog=dummy_catalog_with_run_id,
        # `after_catalog_created` is not using any of the arguments below,
        # so we are setting them to empty values.
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )
    pipeline_hook.before_pipeline_run(
        run_params=dummy_run_params,
        pipeline=dummy_pipeline_ml,
        catalog=dummy_catalog_with_run_id,
    )
    runner.run(dummy_pipeline_ml, dummy_catalog_with_run_id)

    current_run_id = mlflow.active_run().info.run_id

    pipeline_hook.after_pipeline_run(
        run_params=dummy_run_params,
        pipeline=dummy_pipeline_ml,
        catalog=dummy_catalog_with_run_id,
    )

    mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
    all_runs_id = set(
        [run.run_id for run in mlflow_client.list_run_infos(experiment_id="0")]
    )

    # the metrics are supposed to have been logged inside existing_run_id
    run_data = mlflow_client.get_run(existing_run_id).data

    # check that the metrics datasets are prefixed with their names
    assert all_runs_id == {current_run_id, existing_run_id}
    assert run_data.metrics["my_metrics.metric_key"] == 1.1
    assert run_data.metrics["foo.metric_key"] == 1.1

def test_mlflow_hook_save_pipeline_ml_with_copy_mode(
    kedro_project_with_mlflow_conf,
    dummy_pipeline_ml,
    dummy_catalog,
    dummy_run_params,
    copy_mode,
    expected,
):
    # config_with_base_mlflow_conf is a conftest fixture
    bootstrap_project(kedro_project_with_mlflow_conf)
    with KedroSession.create(project_path=kedro_project_with_mlflow_conf) as session:
        context = session.load_context()
        mlflow_hook = MlflowHook()
        runner = SequentialRunner()

        mlflow_hook.after_context_created(context)
        mlflow_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
        )

        pipeline_to_run = pipeline_ml_factory(
            training=dummy_pipeline_ml.training,
            inference=dummy_pipeline_ml.inference,
            input_name=dummy_pipeline_ml.input_name,
            log_model_kwargs={
                "artifact_path": dummy_pipeline_ml.log_model_kwargs["artifact_path"],
                "conda_env": {"python": "3.7.0", "dependencies": ["kedro==0.16.5"]},
            },
            kpm_kwargs={
                "copy_mode": copy_mode,
            },
        )
        mlflow_hook.before_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )
        runner.run(pipeline_to_run, dummy_catalog, session._hook_manager)
        run_id = mlflow.active_run().info.run_id
        mlflow_hook.after_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )

        mlflow_tracking_uri = (kedro_project_with_mlflow_conf / "mlruns").as_uri()
        mlflow.set_tracking_uri(mlflow_tracking_uri)

        loaded_model = mlflow.pyfunc.load_model(model_uri=f"runs:/{run_id}/model")

        actual_copy_mode = {
            name: ds._copy_mode
            for name, ds in loaded_model._model_impl.python_model.loaded_catalog._data_sets.items()
        }

        assert actual_copy_mode == expected

def test_mlflow_pipeline_hook_with_different_pipeline_types(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    env_from_dict,
    pipeline_to_run,
    dummy_catalog,
    dummy_run_params,
    dummy_mlflow_conf,
):
    # config_with_base_mlflow_conf is a conftest fixture
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)
    pipeline_hook = MlflowPipelineHook()
    runner = SequentialRunner()
    pipeline_hook.after_catalog_created(
        catalog=dummy_catalog,
        # `after_catalog_created` is not using any of the arguments below,
        # so we are setting them to empty values.
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )
    pipeline_hook.before_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )
    runner.run(pipeline_to_run, dummy_catalog)
    run_id = mlflow.active_run().info.run_id
    pipeline_hook.after_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )
    # test: parameters should have been logged
    context = load_context(tmp_path)
    mlflow_conf = get_mlflow_config(context)
    mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
    run_data = mlflow_client.get_run(run_id).data
    # all run_params are recorded as tags
    for k, v in dummy_run_params.items():
        if v:
            assert run_data.tags[k] == str(v)
    # params are not recorded because we don't have MlflowNodeHook here
    # and the model should not be logged when it is not a PipelineML
    nb_artifacts = len(mlflow_client.list_artifacts(run_id))
    if isinstance(pipeline_to_run, PipelineML):
        assert nb_artifacts == 1
    else:
        assert nb_artifacts == 0
    # check that the metrics datasets are prefixed with their names
    assert dummy_catalog._data_sets["my_metrics"]._prefix == "my_metrics"
    assert dummy_catalog._data_sets["another_metrics"]._prefix == "foo"
    if isinstance(pipeline_to_run, PipelineML):
        trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
        assert trained_model.metadata.signature.to_dict() == {
            "inputs": '[{"name": "a", "type": "long"}]',
            "outputs": None,
        }

def _run_one_task(self, config_filename):
    # create node from Task
    expand_config_node = mls.sl.workflows.tasks.ExpandConfigTask.get_node()
    multiple_learning_node = mls.sl.workflows.tasks.MultipleLearningTask.get_node()
    # Prepare a data catalog
    config, log = self._init_config_log(config_filename,
                                        self.base_directory,
                                        self.config_directory)
    expanded_config = [
        {"input": {"type": "NClassRandomClassificationWithNoise",
                   "parameters": {"n_samples": 100,
                                  "shuffle": True,
                                  "random_state": 0,
                                  "noise": 0}
                   },
         "split": {"type": "traintest",
                   "parameters": {"test_size": 20,
                                  "random_state": 0,
                                  "shuffle": True}
                   },
         "algorithm": {"type": "sklearn.neighbors.KNeighborsClassifier",
                       "hyperparameters": {"n_neighbors": 15,
                                           "algorithm": "auto",
                                           "weights": "uniform"}
                       }
         },
        {"input": {"type": "make_circles",
                   "parameters": {"n_samples": 100,
                                  "shuffle": True,
                                  "noise": 0,
                                  "random_state": 0,
                                  "factor": 0.3}
                   },
         "split": {"type": "traintest",
                   "parameters": {"test_size": 20,
                                  "random_state": 0,
                                  "shuffle": True}
                   },
         "algorithm": {"type": "sklearn.neighbors.KNeighborsClassifier",
                       "hyperparameters": {"n_neighbors": 15,
                                           "algorithm": "auto",
                                           "weights": "uniform"}
                       }
         },
        {"input": {"type": "load_iris",
                   "parameters": {}
                   },
         "split": {"type": "traintest",
                   "parameters": {"test_size": 20,
                                  "random_state": 0,
                                  "shuffle": True}
                   },
         "algorithm": {"type": "sklearn.neighbors.KNeighborsClassifier",
                       "hyperparameters": {"n_neighbors": 15,
                                           "algorithm": "auto",
                                           "weights": "uniform"}
                       }
         }]
    data_catalog = DataCatalog({'config': MemoryDataSet(),
                                'log': MemoryDataSet(),
                                'base_directory': MemoryDataSet(),
                                'expanded_config': MemoryDataSet()})
    data_catalog.save('config', config)
    data_catalog.save('log', log)
    data_catalog.save('base_directory', self.base_directory)
    data_catalog.save('expanded_config', expanded_config)
    # Assemble nodes into a pipeline
    pipeline = Pipeline([expand_config_node, multiple_learning_node])
    # Create a runner to run the pipeline
    runner = SequentialRunner()
    # Run the pipeline
    runner.run(pipeline, data_catalog)
    return log, data_catalog

def run_pipeline_dict(pipelines, logger):
    runner = SequentialRunner()
    for kk, pipeline in pipelines.items():
        logger.info(f'Running pipeline {kk}')
        runner.run(pipeline, kedro_catalog)

def test_mlflow_pipeline_hook_metrics_with_run_id(
    kedro_project_with_mlflow_conf, dummy_pipeline_ml, dummy_run_params
):
    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=kedro_project_with_mlflow_conf,
    ):
        mlflow_conf = get_mlflow_config()
        mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)

        with mlflow.start_run():
            existing_run_id = mlflow.active_run().info.run_id

        dummy_catalog_with_run_id = DataCatalog(
            {
                "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
                "params:unused_param": MemoryDataSet("blah"),
                "data": MemoryDataSet(),
                "model": PickleDataSet(
                    (kedro_project_with_mlflow_conf / "data" / "model.csv").as_posix()
                ),
                "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
                "another_metrics": MlflowMetricsDataSet(
                    run_id=existing_run_id, prefix="foo"
                ),
            }
        )

        pipeline_hook = MlflowPipelineHook()

        runner = SequentialRunner()
        pipeline_hook.after_catalog_created(
            catalog=dummy_catalog_with_run_id,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
            run_id=dummy_run_params["run_id"],
        )
        pipeline_hook.before_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline_ml,
            catalog=dummy_catalog_with_run_id,
        )
        runner.run(dummy_pipeline_ml, dummy_catalog_with_run_id)

        current_run_id = mlflow.active_run().info.run_id

        pipeline_hook.after_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline_ml,
            catalog=dummy_catalog_with_run_id,
        )

        mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
        # the first run is created in Default (id 0),
        # but the one initialised in before_pipeline_run
        # is created in the kedro_project experiment (id 1)
        all_runs_id = set(
            [
                run.run_id
                for k in range(2)
                for run in mlflow_client.list_run_infos(experiment_id=f"{k}")
            ]
        )

        # the metrics are supposed to have been logged inside existing_run_id
        run_data = mlflow_client.get_run(existing_run_id).data

        # check that the metrics datasets are prefixed with their names
        assert all_runs_id == {current_run_id, existing_run_id}
        assert run_data.metrics["my_metrics.metric_key"] == 1.1
        assert run_data.metrics["foo.metric_key"] == 1.1

#%%
# Prepare second node
def join_statements(greeting):
    return f"{greeting} Kedro!"


node_join_statements = node(
    join_statements, inputs="my_salutation", outputs="my_message"
)

#%%
# Assemble nodes into a pipeline
pipeline = Pipeline([node_return_greeting, node_join_statements])

#%%
# Create a runner to run the pipeline
runner = SequentialRunner()

#%%
# Run the pipeline
output = runner.run(pipeline, data_catalog)
print(output)
print(type(output))

#%%

    df = df.dropna()
    return df


# Plot the amount of people who survived and who died.
def plot_survival_breakdown(df):
    plt.figure(figsize=(6, 4))
    fig, ax = plt.subplots()
    df.Survived.value_counts().plot(kind="barh", color="blue", alpha=0.65)
    ax.set_ylim(-1, len(df.Survived.value_counts()))
    plt.title("Survival Breakdown (1 = Survived, 0 = Died)")
    return fig


# Create nodes
clean_data_node = node(clean_raw_data, inputs="titanic_training_data", outputs="df_clean")
plot_survival_breakdown_node = node(
    plot_survival_breakdown, inputs="df_clean", outputs="survival_breakdown_chart"
)

# Assemble nodes into a pipeline
pipeline = Pipeline([clean_data_node, plot_survival_breakdown_node])

# Create a runner to run the pipeline
runner = SequentialRunner()

# Run the pipeline
print(runner.run(pipeline, io))

# Prepare first node
def return_greeting():
    return "Hello"


return_greeting_node = node(func=return_greeting, inputs=None, outputs="my_salutation")


# Prepare second node
def join_statements(greeting):
    return f"{greeting} Kedro!"


join_statements_node = node(join_statements, inputs="my_salutation", outputs="my_message")

# Assemble nodes into a pipeline
greeting_pipeline = pipeline([return_greeting_node, join_statements_node])

# Create a runner to run the pipeline
runner = SequentialRunner()

# Run the pipeline
print(runner.run(greeting_pipeline, data_catalog))