def test_mlflow_pipeline_hook_with_copy_mode(
    kedro_project_with_mlflow_conf,
    dummy_pipeline_ml,
    dummy_catalog,
    dummy_run_params,
    copy_mode,
    expected,
):
    # kedro_project_with_mlflow_conf is a conftest fixture
    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=kedro_project_with_mlflow_conf,
    ):

        pipeline_hook = MlflowPipelineHook()
        runner = SequentialRunner()

        pipeline_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
            run_id=dummy_run_params["run_id"],
        )

        pipeline_to_run = pipeline_ml_factory(
            training=dummy_pipeline_ml.training,
            inference=dummy_pipeline_ml.inference,
            input_name=dummy_pipeline_ml.input_name,
            conda_env={},
            model_name=dummy_pipeline_ml.model_name,
            copy_mode=copy_mode,
        )
        pipeline_hook.before_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )
        runner.run(pipeline_to_run, dummy_catalog)
        run_id = mlflow.active_run().info.run_id
        pipeline_hook.after_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )

        mlflow_tracking_uri = (kedro_project_with_mlflow_conf / "mlruns").as_uri()
        mlflow.set_tracking_uri(mlflow_tracking_uri)

        loaded_model = mlflow.pyfunc.load_model(model_uri=f"runs:/{run_id}/model")

        actual_copy_mode = {
            name: ds._copy_mode
            for name, ds in loaded_model._model_impl.python_model.loaded_catalog._data_sets.items()
        }

        assert actual_copy_mode == expected
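The `copy_mode` and `expected` arguments are supplied by a `pytest.mark.parametrize` decorator that this snippet omits. A plausible parametrization (an assumption for illustration, not the project's exact values) could look like:

import pytest

# hypothetical cases: copy_mode may be None, a single mode applied to every
# dataset, or a dict mapping dataset names to modes
@pytest.mark.parametrize(
    "copy_mode,expected",
    [
        (None, {"raw_data": None, "data": None, "model": None}),
        ("assign", {"raw_data": "assign", "data": "assign", "model": "assign"}),
        ({"raw_data": "deepcopy"}, {"raw_data": "deepcopy", "data": None, "model": None}),
    ],
)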
Example #2
    def _run_one_task(self, config_filename):
        # create node from Task
        load_data_node = mls.workflows.tasks.LoadDataTask.get_node()
        prepare_data_node = mls.sl.workflows.tasks.PrepareDataTask.get_node()
        split_data_node = mls.sl.workflows.tasks.SplitDataTask.get_node()
        learn_data_node = mls.sl.workflows.tasks.LearnTask.get_node()
        config, log = self._init_config_log(config_filename,
                                            self.base_directory,
                                            self.config_directory)
        # Prepare a data catalog
        data_catalog = DataCatalog({
            'config': MemoryDataSet(),
            'log': MemoryDataSet(),
            'base_directory': MemoryDataSet()
        })
        data_catalog.save('config', config)
        data_catalog.save('log', log)
        data_catalog.save('base_directory', self.base_directory)
        # Assemble nodes into a pipeline
        pipeline = Pipeline([
            load_data_node, prepare_data_node, split_data_node, learn_data_node
        ])
        # Create a runner to run the pipeline
        runner = SequentialRunner()
        # Run the pipeline
        runner.run(pipeline, data_catalog)
        return log, config, data_catalog
    def run(self):
        """
        Run all tasks
        """
        # data
        data_catalog = DataCatalog({
            'config': MemoryDataSet(),
            'log': MemoryDataSet(),
            'base_directory': MemoryDataSet(),
            'dataset': MemoryDataSet(),
            'data': MemoryDataSet()
        })
        data_catalog.save('config', self.config)
        data_catalog.save('log', self.log)
        data_catalog.save('base_directory', self.base_directory)

        load_data_node = mls.workflows.tasks.LoadDataTask.get_node()
        prepare_data_node = mls.sl.workflows.tasks.PrepareDataTask.get_node()
        split_data_node = mls.sl.workflows.tasks.SplitDataTask.get_node()
        learn_node = mls.sl.workflows.tasks.LearnTask.get_node()
        evaluate_node = mls.sl.workflows.tasks.EvaluateTask.get_node()
        # Assemble nodes into a pipeline
        pipeline = Pipeline([
            load_data_node, prepare_data_node, split_data_node, learn_node,
            evaluate_node
        ])
        # Create a runner to run the pipeline
        runner = SequentialRunner()
        # Run the pipeline
        runner.run(pipeline, data_catalog)
        self.terminate()
def test_mlflow_pipeline_hook_with_copy_mode(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    dummy_pipeline_ml,
    dummy_catalog,
    dummy_run_params,
    dummy_mlflow_conf,
    copy_mode,
    expected,
):
    # config_dir and dummy_mlflow_conf are conftest fixtures
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)
    pipeline_hook = MlflowPipelineHook()
    runner = SequentialRunner()

    pipeline_hook.after_catalog_created(
        catalog=dummy_catalog,
        # `after_catalog_created` is not using any of the arguments below,
        # so we are setting them to empty values.
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )

    pipeline_to_run = pipeline_ml_factory(
        training=dummy_pipeline_ml.training,
        inference=dummy_pipeline_ml.inference,
        input_name=dummy_pipeline_ml.input_name,
        conda_env={},
        model_name=dummy_pipeline_ml.model_name,
        copy_mode=copy_mode,
    )
    pipeline_hook.before_pipeline_run(run_params=dummy_run_params,
                                      pipeline=pipeline_to_run,
                                      catalog=dummy_catalog)
    runner.run(pipeline_to_run, dummy_catalog)
    run_id = mlflow.active_run().info.run_id
    pipeline_hook.after_pipeline_run(run_params=dummy_run_params,
                                     pipeline=pipeline_to_run,
                                     catalog=dummy_catalog)

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)

    loaded_model = mlflow.pyfunc.load_model(model_uri=f"runs:/{run_id}/model")

    actual_copy_mode = {
        name: ds._copy_mode
        for name, ds in loaded_model._model_impl.python_model.loaded_catalog._data_sets.items()
    }

    assert actual_copy_mode == expected
def test_mlflow_hook_save_pipeline_ml(
    kedro_project_with_mlflow_conf,
    pipeline_to_run,
    dummy_catalog,
    dummy_run_params,
):

    bootstrap_project(kedro_project_with_mlflow_conf)
    with KedroSession.create(
            project_path=kedro_project_with_mlflow_conf) as session:
        context = session.load_context()  # triggers conf setup

        # kedro_project_with_mlflow_conf is a conftest fixture
        mlflow_hook = MlflowHook()
        mlflow_hook.after_context_created(context)  # setup mlflow config
        runner = SequentialRunner()
        mlflow_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
        )
        mlflow_hook.before_pipeline_run(run_params=dummy_run_params,
                                        pipeline=pipeline_to_run,
                                        catalog=dummy_catalog)
        runner.run(pipeline_to_run, dummy_catalog, session._hook_manager)
        run_id = mlflow.active_run().info.run_id
        mlflow_hook.after_pipeline_run(run_params=dummy_run_params,
                                       pipeline=pipeline_to_run,
                                       catalog=dummy_catalog)
        # test : parameters should have been logged
        mlflow_client = MlflowClient(context.mlflow.server.mlflow_tracking_uri)
        run_data = mlflow_client.get_run(run_id).data

        # all run_params are recorded as tags
        for k, v in dummy_run_params.items():
            if v:
                assert run_data.tags[k] == str(v)

        # params are not recorded because the node-level hook methods are never called here
        # and the model should not be logged when it is not a PipelineML
        nb_artifacts = len(mlflow_client.list_artifacts(run_id))
        if isinstance(pipeline_to_run, PipelineML):
            assert nb_artifacts == 1
        else:
            assert nb_artifacts == 0

        if isinstance(pipeline_to_run, PipelineML):
            trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
            assert trained_model.metadata.signature.to_dict() == {
                "inputs": '[{"name": "a", "type": "long"}]',
                "outputs": None,
            }
def test_mlflow_hook_save_pipeline_ml_with_artifact_path(
    kedro_project_with_mlflow_conf,
    env_from_dict,
    dummy_pipeline,
    dummy_catalog,
    dummy_run_params,
    artifact_path,
    expected_artifact_path,
):
    # kedro_project_with_mlflow_conf is a conftest fixture
    bootstrap_project(kedro_project_with_mlflow_conf)
    with KedroSession.create(
            project_path=kedro_project_with_mlflow_conf) as session:
        mlflow_hook = MlflowHook()
        runner = SequentialRunner()

        log_model_kwargs = {
            "conda_env": env_from_dict,
        }
        if artifact_path is not None:
            # we need to test what happens if the key is NOT present
            log_model_kwargs["artifact_path"] = artifact_path

        pipeline_to_run = pipeline_ml_factory(
            training=dummy_pipeline.only_nodes_with_tags("training"),
            inference=dummy_pipeline.only_nodes_with_tags("inference"),
            input_name="raw_data",
            log_model_kwargs=log_model_kwargs,
        )

        context = session.load_context()
        mlflow_hook.after_context_created(context)
        mlflow_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
        )
        mlflow_hook.before_pipeline_run(run_params=dummy_run_params,
                                        pipeline=pipeline_to_run,
                                        catalog=dummy_catalog)
        runner.run(pipeline_to_run, dummy_catalog, session._hook_manager)
        run_id = mlflow.active_run().info.run_id
        mlflow_hook.after_pipeline_run(run_params=dummy_run_params,
                                       pipeline=pipeline_to_run,
                                       catalog=dummy_catalog)

        # test : parameters should have been logged
        trained_model = mlflow.pyfunc.load_model(
            f"runs:/{run_id}/{expected_artifact_path}")
        # the real test is that the model is loaded without error
        assert trained_model is not None
def test_mlflow_pipeline_hook_with_pipeline_ml_signature(
    kedro_project_with_mlflow_conf,
    env_from_dict,
    dummy_pipeline,
    dummy_catalog,
    dummy_run_params,
    model_signature,
    expected_signature,
):
    # kedro_project_with_mlflow_conf is a conftest fixture
    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=kedro_project_with_mlflow_conf,
    ):
        pipeline_hook = MlflowPipelineHook()
        runner = SequentialRunner()

        pipeline_to_run = pipeline_ml_factory(
            training=dummy_pipeline.only_nodes_with_tags("training"),
            inference=dummy_pipeline.only_nodes_with_tags("inference"),
            input_name="raw_data",
            conda_env=env_from_dict,
            model_name="model",
            model_signature=model_signature,
        )

        pipeline_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
            run_id=dummy_run_params["run_id"],
        )
        pipeline_hook.before_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )
        runner.run(pipeline_to_run, dummy_catalog)
        run_id = mlflow.active_run().info.run_id
        pipeline_hook.after_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )

        # test : parameters should have been logged
        trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
        assert trained_model.metadata.signature == expected_signature
def test_mlflow_pipeline_hook_with_pipeline_ml_signature(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    env_from_dict,
    dummy_pipeline,
    dummy_catalog,
    dummy_run_params,
    dummy_mlflow_conf,
    model_signature,
    expected_signature,
):
    # config_dir and dummy_mlflow_conf are conftest fixtures
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)
    pipeline_hook = MlflowPipelineHook()
    runner = SequentialRunner()

    pipeline_to_run = pipeline_ml_factory(
        training=dummy_pipeline.only_nodes_with_tags("training"),
        inference=dummy_pipeline.only_nodes_with_tags("inference"),
        input_name="raw_data",
        conda_env=env_from_dict,
        model_name="model",
        model_signature=model_signature,
    )

    pipeline_hook.after_catalog_created(
        catalog=dummy_catalog,
        # `after_catalog_created` is not using any of the arguments below,
        # so we are setting them to empty values.
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )
    pipeline_hook.before_pipeline_run(run_params=dummy_run_params,
                                      pipeline=pipeline_to_run,
                                      catalog=dummy_catalog)
    runner.run(pipeline_to_run, dummy_catalog)
    run_id = mlflow.active_run().info.run_id
    pipeline_hook.after_pipeline_run(run_params=dummy_run_params,
                                     pipeline=pipeline_to_run,
                                     catalog=dummy_catalog)

    # test : parameters should have been logged
    trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
    assert trained_model.metadata.signature == expected_signature
def test_ds_pipeline(all_catalog):
    runner = SequentialRunner()
    output_name = 'outputs'

    pipeline = create_pipeline(inputs=["dataframex", "dataframey"],
                               outputs=output_name)
    pipeline_output = runner.run(pipeline, all_catalog)
    assert pipeline_output is not None

    with pytest.raises(ValueError):
        # do not train on bad values
        pipeline = create_pipeline(inputs=["dataframex", "dataframey_bad"],
                                   outputs=output_name)
        pipeline_output = runner.run(pipeline, all_catalog)
def test_mlflow_hook_save_pipeline_ml_with_signature(
    kedro_project_with_mlflow_conf,
    env_from_dict,
    dummy_pipeline,
    dummy_catalog,
    dummy_run_params,
    model_signature,
    expected_signature,
):
    # kedro_project_with_mlflow_conf is a conftest fixture
    bootstrap_project(kedro_project_with_mlflow_conf)
    with KedroSession.create(
            project_path=kedro_project_with_mlflow_conf) as session:
        mlflow_hook = MlflowHook()
        runner = SequentialRunner()

        pipeline_to_run = pipeline_ml_factory(
            training=dummy_pipeline.only_nodes_with_tags("training"),
            inference=dummy_pipeline.only_nodes_with_tags("inference"),
            input_name="raw_data",
            log_model_kwargs={
                "conda_env": env_from_dict,
                "signature": model_signature,
            },
        )

        context = session.load_context()
        mlflow_hook.after_context_created(context)
        mlflow_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
        )
        mlflow_hook.before_pipeline_run(run_params=dummy_run_params,
                                        pipeline=pipeline_to_run,
                                        catalog=dummy_catalog)
        runner.run(pipeline_to_run, dummy_catalog, session._hook_manager)
        run_id = mlflow.active_run().info.run_id
        mlflow_hook.after_pipeline_run(run_params=dummy_run_params,
                                       pipeline=pipeline_to_run,
                                       catalog=dummy_catalog)

        # test : parameters should have been logged
        trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
        assert trained_model.metadata.signature == expected_signature
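`model_signature` and `expected_signature` are likewise injected by a parametrize decorator the snippet omits. A minimal sketch of one plausible value, assuming mlflow's signature API and the dummy catalog's single "a" column:

from mlflow.models import ModelSignature
from mlflow.types import ColSpec, Schema

# hypothetical signature for a single long-typed column named "a"
model_signature = ModelSignature(inputs=Schema([ColSpec(type="long", name="a")]))
expected_signature = model_signature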
Example #11
class Kedro171:
    def __init__(self):
        self.session = get_session()
        self.context = self.session.load_context()
        self.catalog = self.context.catalog
        self.pipeline = self.context.pipeline
        self.pipelines = self.context.pipelines
        self.runner = SequentialRunner()

    def run(self, pipeline=None, catalog=None):
        if pipeline is None:
            pipeline = self.pipeline
        if catalog is None:
            catalog = self.catalog
        self.runner.run(pipeline, catalog)
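A minimal usage sketch, assuming `get_session` resolves a Kedro session for the current project and that a default pipeline is registered ("data_science" is a hypothetical pipeline name):

kedro = Kedro171()
kedro.run()  # run the default pipeline against the project catalog
kedro.run(pipeline=kedro.pipelines["data_science"])  # or a specific pipeline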
    def run(self):
        """
        Run the workflow : run each config
        """
        # data
        data_catalog = DataCatalog({
            'config': MemoryDataSet(),
            'log': MemoryDataSet(),
            'base_directory': MemoryDataSet()
        })
        data_catalog.save('config', self.config)
        data_catalog.save('log', self.log)
        data_catalog.save('base_directory', self.base_directory)

        expand_config_node = mls.sl.workflows.tasks.ExpandConfigTask.get_node()
        multiple_learning_node = mls.sl.workflows.tasks.MultipleLearningTask.get_node()

        # Assemble nodes into a pipeline
        pipeline = Pipeline([expand_config_node, multiple_learning_node])
        # Create a runner to run the pipeline
        runner = SequentialRunner()
        # Run the pipeline
        result = runner.run(pipeline, data_catalog)
        if len(result) == 0:
            self.terminate()
    def _run_one_task(self, config_filename):
        # create node from Task
        expand_config_node = mls.sl.workflows.tasks.ExpandConfigTask.get_node()
        config, log = self._init_config_log(config_filename,
                                            self.base_directory,
                                            self.config_directory)
        # Prepare a data catalog
        data_catalog = DataCatalog({
            'config': MemoryDataSet(),
            'log': MemoryDataSet(),
            'expanded_config': MemoryDataSet()
        })
        data_catalog.save('config', config)
        data_catalog.save('log', log)
        # Assemble nodes into a pipeline
        pipeline = Pipeline([expand_config_node])
        # Create a runner to run the pipeline
        runner = SequentialRunner()
        # Run the pipeline
        runner.run(pipeline, data_catalog)
        return log, data_catalog
def test_mlflow_pipeline_hook_with_different_pipeline_types(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    env_from_dict,
    pipeline_to_run,
    dummy_catalog,
    dummy_run_params,
    dummy_mlflow_conf,
):
    # config_dir and dummy_mlflow_conf are conftest fixtures
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)
    pipeline_hook = MlflowPipelineHook(conda_env=env_from_dict,
                                       model_name="model")
    runner = SequentialRunner()
    pipeline_hook.before_pipeline_run(run_params=dummy_run_params,
                                      pipeline=pipeline_to_run,
                                      catalog=dummy_catalog)
    runner.run(pipeline_to_run, dummy_catalog, dummy_run_params["run_id"])
    run_id = mlflow.active_run().info.run_id
    pipeline_hook.after_pipeline_run(run_params=dummy_run_params,
                                     pipeline=pipeline_to_run,
                                     catalog=dummy_catalog)
    # test : parameters should have been logged
    mlflow_conf = get_mlflow_config(tmp_path)
    mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
    run_data = mlflow_client.get_run(run_id).data
    # all run_params are recorded as tags
    for k, v in dummy_run_params.items():
        if v:
            assert run_data.tags[k] == str(v)
    # params are not recorded because we don't have MlflowNodeHook here
    # and the model should not be logged when it is not a PipelineML
    nb_artifacts = len(mlflow_client.list_artifacts(run_id))
    if isinstance(pipeline_to_run, PipelineML):
        assert nb_artifacts == 1
    else:
        assert nb_artifacts == 0
    def predict(self, context, model_input):
        # TODO: check out how to pass extra args in predict
        # for instance, to enable parallelization

        self.loaded_catalog.add(
            data_set_name=self.pipeline_ml.input_name,
            data_set=MemoryDataSet(model_input),
            replace=True,
        )
        runner = SequentialRunner()
        run_outputs = runner.run(pipeline=self.pipeline_ml.inference,
                                 catalog=self.loaded_catalog)
        return run_outputs
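This `predict` method belongs to the pyfunc wrapper that kedro-mlflow logs (KedroPipelineModel); mlflow invokes it when the model is loaded back. A hedged usage sketch, assuming a `run_id` from a previous run:

import mlflow
import pandas as pd

# loading the pyfunc model rebuilds the wrapper, so predict() re-runs the
# inference pipeline on the in-memory input
model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
predictions = model.predict(pd.DataFrame(data=[1], columns=["a"]))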
Example #16
def test_feature_engineering_pipeline(sample_data_catalog_train: DataCatalog,
                                      runner: SequentialRunner):
    train_pipeline = create_pipeline(
        output_X_train_normalized="sample_iris_X_train_normalized",
        output_X_test_normalized="sample_iris_X_test_normalized",
        output_y_train="sample_iris_y_train",
        output_y_test="sample_iris_y_test",
        normalizer="sample_normalizer",
    )

    output = runner.run(pipeline=train_pipeline,
                        catalog=sample_data_catalog_train)

    assert output["sample_iris_X_train_normalized"].shape == (3, 4)
    assert output["sample_iris_X_test_normalized"].shape == (1, 4)
    assert output["sample_iris_y_train"].shape == (3, )
    assert output["sample_iris_y_test"].shape == (1, )
Example #17
# Prepare a data catalog
data_catalog = DataCatalog({"my_salutation": MemoryDataSet()})


# Prepare first node
def return_greeting():
    return "Hello"


return_greeting_node = node(return_greeting, inputs=None, outputs="my_salutation")


# Prepare second node
def join_statements(greeting):
    return f"{greeting} Kedro!"


join_statements_node = node(
    join_statements, inputs="my_salutation", outputs="my_message"
)

# Assemble nodes into a pipeline
pipeline = Pipeline([return_greeting_node, join_statements_node])

# Create a runner to run the pipeline
runner = SequentialRunner()

# Run the pipeline
print(runner.run(pipeline, data_catalog))
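`runner.run` returns only the datasets that are not registered in the catalog (the free outputs). Since only `my_salutation` is registered above, the final print should show something like {'my_message': 'Hello Kedro!'}.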
Example #18
def test_mlflow_hook_metrics_dataset_with_run_id(
    kedro_project_with_mlflow_conf, dummy_pipeline, dummy_run_params
):

    bootstrap_project(kedro_project_with_mlflow_conf)
    with KedroSession.create(project_path=kedro_project_with_mlflow_conf) as session:
        context = session.load_context()

        with mlflow.start_run():
            existing_run_id = mlflow.active_run().info.run_id

        dummy_catalog_with_run_id = DataCatalog(
            {
                "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
                "params:unused_param": MemoryDataSet("blah"),
                "data": MemoryDataSet(),
                "model": PickleDataSet(
                    (kedro_project_with_mlflow_conf / "data" / "model.csv").as_posix()
                ),
                "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
                "another_metrics": MlflowMetricsDataSet(
                    run_id=existing_run_id, prefix="foo"
                ),
                "my_metric": MlflowMetricDataSet(run_id=existing_run_id),
                "another_metric": MlflowMetricDataSet(
                    run_id=existing_run_id, key="foo"
                ),
                "my_metric_history": MlflowMetricHistoryDataSet(run_id=existing_run_id),
                "another_metric_history": MlflowMetricHistoryDataSet(
                    run_id=existing_run_id, key="bar"
                ),
            }
        )

        mlflow_hook = MlflowHook()
        runner = SequentialRunner()

        mlflow_hook.after_context_created(context)
        mlflow_hook.after_catalog_created(
            catalog=dummy_catalog_with_run_id,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
        )
        mlflow_hook.before_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline,
            catalog=dummy_catalog_with_run_id,
        )
        runner.run(dummy_pipeline, dummy_catalog_with_run_id, session._hook_manager)

        current_run_id = mlflow.active_run().info.run_id

        mlflow_hook.after_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline,
            catalog=dummy_catalog_with_run_id,
        )

        mlflow_client = MlflowClient(context.mlflow.server.mlflow_tracking_uri)
        # the first run is created in Default (id 0),
        # but the one initialised in before_pipeline_run
        # is created in the kedro_project experiment (id 1)
        all_runs_id = set(
            [
                run.run_id
                for k in range(2)
                for run in mlflow_client.list_run_infos(experiment_id=f"{k}")
            ]
        )

        # the metrics are supposed to have been logged inside existing_run_id
        run_data = mlflow_client.get_run(existing_run_id).data

        # check that the logged metric keys carry the expected dataset prefixes
        assert all_runs_id == {current_run_id, existing_run_id}

        assert run_data.metrics["my_metrics.metric_key"] == 1.1
        assert run_data.metrics["foo.metric_key"] == 1.1
        assert run_data.metrics["my_metric"] == 1.1
        assert run_data.metrics["foo"] == 1.1
        assert (
            run_data.metrics["my_metric_history"] == 0.2
        )  # the list is stored, but only the last value is retrieved
        assert (
            run_data.metrics["bar"] == 0.2
        )  # the list is stored, but only the last value is retrieved
Example #19
data_catalog = DataCatalog({"example_data": MemoryDataSet()})


def return_greeting():
    # Prepare first node
    return "Hello"


return_greeting_node = node(return_greeting,
                            inputs=None,
                            outputs="my_salutation")


def join_statements(greeting):
    # Prepare second node
    return f"{greeting} Kedro!"


join_statements_node = node(join_statements,
                            inputs="my_salutation",
                            outputs="my_message")

# Assemble nodes into a pipeline
pipeline = Pipeline([return_greeting_node, join_statements_node])

# Create a runner
runner = SequentialRunner()

# Run the pipeline
runner.run(pipeline, data_catalog)
def test_mlflow_pipeline_hook_save_pipeline_ml_with_parameters(
    mocker,
    monkeypatch,
    config_dir,  # a fixture to be in a kedro project
    dummy_mlflow_conf,  # a fixture to setup mlflow configuration
    tmp_path,
    pipeline_ml_with_parameters,
    dummy_run_params,
):
    monkeypatch.chdir(tmp_path)

    context = load_context(tmp_path)
    mlflow_conf = get_mlflow_config(context)
    mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)

    catalog_with_parameters = DataCatalog({
        "data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
        "cleaned_data": MemoryDataSet(),
        "params:stopwords": MemoryDataSet(["Hello", "Hi"]),
        "params:penalty": MemoryDataSet(0.1),
        "model": PickleDataSet((tmp_path / "model.csv").as_posix()),
        "params:threshold": MemoryDataSet(0.5),
    })

    pipeline_hook = MlflowPipelineHook()

    runner = SequentialRunner()
    pipeline_hook.after_catalog_created(
        catalog=catalog_with_parameters,
        # `after_catalog_created` is not using any of the arguments below,
        # so we are setting them to empty values.
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )
    pipeline_hook.before_pipeline_run(
        run_params=dummy_run_params,
        pipeline=pipeline_ml_with_parameters,
        catalog=catalog_with_parameters,
    )
    runner.run(pipeline_ml_with_parameters, catalog_with_parameters)

    current_run_id = mlflow.active_run().info.run_id

    # This is what we want to test: model must be saved and the parameters automatically persisted on disk
    pipeline_hook.after_pipeline_run(
        run_params=dummy_run_params,
        pipeline=pipeline_ml_with_parameters,
        catalog=catalog_with_parameters,
    )

    # the 2 parameters which are inputs of inference pipeline
    # must have been persisted and logged inside the model's artifacts
    model = mlflow.pyfunc.load_model(f"runs:/{current_run_id}/model")
    assert set(
        model.metadata.to_dict()["flavors"]["python_function"]["artifacts"].keys()
    ) == {"model", "params:stopwords", "params:threshold"}

    # the model should be loadable and predict() should work (this tests KedroPipelineModel)
    assert model.predict(pd.DataFrame(data=[1], columns=["a"])).values[0][0] == 1
def test_mlflow_pipeline_hook_metrics_with_run_id(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    dummy_pipeline_ml,
    dummy_run_params,
    dummy_mlflow_conf,
):
    # config_dir and dummy_mlflow_conf are conftest fixtures
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)

    context = load_context(tmp_path)
    mlflow_conf = get_mlflow_config(context)
    mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)

    with mlflow.start_run():
        existing_run_id = mlflow.active_run().info.run_id

    dummy_catalog_with_run_id = DataCatalog({
        "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
        "params:unused_param": MemoryDataSet("blah"),
        "data": MemoryDataSet(),
        "model": PickleDataSet((tmp_path / "model.csv").as_posix()),
        "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
        "another_metrics": MlflowMetricsDataSet(run_id=existing_run_id, prefix="foo"),
    })

    pipeline_hook = MlflowPipelineHook()

    runner = SequentialRunner()
    pipeline_hook.after_catalog_created(
        catalog=dummy_catalog_with_run_id,
        # `after_catalog_created` is not using any of the arguments below,
        # so we are setting them to empty values.
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )
    pipeline_hook.before_pipeline_run(
        run_params=dummy_run_params,
        pipeline=dummy_pipeline_ml,
        catalog=dummy_catalog_with_run_id,
    )
    runner.run(dummy_pipeline_ml, dummy_catalog_with_run_id)

    current_run_id = mlflow.active_run().info.run_id

    pipeline_hook.after_pipeline_run(
        run_params=dummy_run_params,
        pipeline=dummy_pipeline_ml,
        catalog=dummy_catalog_with_run_id,
    )

    mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
    all_runs_id = set([
        run.run_id for run in mlflow_client.list_run_infos(experiment_id="0")
    ])

    # the metrics are supposed to have been logged inside existing_run_id
    run_data = mlflow_client.get_run(existing_run_id).data

    # check that the logged metric keys carry the expected dataset prefixes
    assert all_runs_id == {current_run_id, existing_run_id}
    assert run_data.metrics["my_metrics.metric_key"] == 1.1
    assert run_data.metrics["foo.metric_key"] == 1.1
def test_mlflow_hook_save_pipeline_ml_with_copy_mode(
    kedro_project_with_mlflow_conf,
    dummy_pipeline_ml,
    dummy_catalog,
    dummy_run_params,
    copy_mode,
    expected,
):
    # kedro_project_with_mlflow_conf is a conftest fixture
    bootstrap_project(kedro_project_with_mlflow_conf)
    with KedroSession.create(
            project_path=kedro_project_with_mlflow_conf) as session:
        context = session.load_context()
        mlflow_hook = MlflowHook()
        runner = SequentialRunner()
        mlflow_hook.after_context_created(context)
        mlflow_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
        )

        pipeline_to_run = pipeline_ml_factory(
            training=dummy_pipeline_ml.training,
            inference=dummy_pipeline_ml.inference,
            input_name=dummy_pipeline_ml.input_name,
            log_model_kwargs={
                "artifact_path": dummy_pipeline_ml.log_model_kwargs["artifact_path"],
                "conda_env": {"python": "3.7.0", "dependencies": ["kedro==0.16.5"]},
            },
            kpm_kwargs={
                "copy_mode": copy_mode,
            },
        )
        mlflow_hook.before_pipeline_run(run_params=dummy_run_params,
                                        pipeline=pipeline_to_run,
                                        catalog=dummy_catalog)
        runner.run(pipeline_to_run, dummy_catalog, session._hook_manager)
        run_id = mlflow.active_run().info.run_id
        mlflow_hook.after_pipeline_run(run_params=dummy_run_params,
                                       pipeline=pipeline_to_run,
                                       catalog=dummy_catalog)

        mlflow_tracking_uri = (kedro_project_with_mlflow_conf /
                               "mlruns").as_uri()
        mlflow.set_tracking_uri(mlflow_tracking_uri)

        loaded_model = mlflow.pyfunc.load_model(
            model_uri=f"runs:/{run_id}/model")

        actual_copy_mode = {
            name: ds._copy_mode
            for name, ds in loaded_model._model_impl.python_model.loaded_catalog._data_sets.items()
        }

        assert actual_copy_mode == expected
def test_mlflow_pipeline_hook_with_different_pipeline_types(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    env_from_dict,
    pipeline_to_run,
    dummy_catalog,
    dummy_run_params,
    dummy_mlflow_conf,
):
    # config_dir and dummy_mlflow_conf are conftest fixtures
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)
    pipeline_hook = MlflowPipelineHook()
    runner = SequentialRunner()
    pipeline_hook.after_catalog_created(
        catalog=dummy_catalog,
        # `after_catalog_created` is not using any of the arguments below,
        # so we are setting them to empty values.
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )
    pipeline_hook.before_pipeline_run(run_params=dummy_run_params,
                                      pipeline=pipeline_to_run,
                                      catalog=dummy_catalog)
    runner.run(pipeline_to_run, dummy_catalog)
    run_id = mlflow.active_run().info.run_id
    pipeline_hook.after_pipeline_run(run_params=dummy_run_params,
                                     pipeline=pipeline_to_run,
                                     catalog=dummy_catalog)
    # test : parameters should have been logged
    context = load_context(tmp_path)
    mlflow_conf = get_mlflow_config(context)
    mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
    run_data = mlflow_client.get_run(run_id).data

    # all run_params are recorded as tags
    for k, v in dummy_run_params.items():
        if v:
            assert run_data.tags[k] == str(v)

    # params are not recorded because we don't have MlflowNodeHook here
    # and the model should not be logged when it is not a PipelineML
    nb_artifacts = len(mlflow_client.list_artifacts(run_id))
    if isinstance(pipeline_to_run, PipelineML):
        assert nb_artifacts == 1
    else:
        assert nb_artifacts == 0

    # check that each metrics dataset received the expected prefix
    assert dummy_catalog._data_sets["my_metrics"]._prefix == "my_metrics"
    assert dummy_catalog._data_sets["another_metrics"]._prefix == "foo"

    if isinstance(pipeline_to_run, PipelineML):
        trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
        assert trained_model.metadata.signature.to_dict() == {
            "inputs": '[{"name": "a", "type": "long"}]',
            "outputs": None,
        }
Example #24
    def _run_one_task(self, config_filename):
        # create node from Task
        expand_config_node = mls.sl.workflows.tasks.ExpandConfigTask.get_node()
        multiple_learning_node = mls.sl.workflows.tasks.MultipleLearningTask.get_node()
        # Prepare a data catalog
        config, log = self._init_config_log(config_filename,
                                            self.base_directory,
                                            self.config_directory)
        expanded_config = [{"input": {"type": "NClassRandomClassificationWithNoise",
                                      "parameters": {"n_samples": 100, "shuffle": True, "random_state": 0, "noise": 0}
                                      },
                            "split": {"type": "traintest",
                                      "parameters": {"test_size": 20, "random_state": 0, "shuffle": True}
                                      },
                            "algorithm": {"type": "sklearn.neighbors.KNeighborsClassifier",
                                          "hyperparameters": {
                                              "n_neighbors": 15,
                                              "algorithm": "auto",
                                              "weights": "uniform"
                                          }
                                          }
                            },
                           {"input": {"type": "make_circles",
                                      "parameters": {
                                          "n_samples": 100,
                                          "shuffle": True,
                                          "noise": 0,
                                          "random_state": 0,
                                          "factor": 0.3
                                      }},
                            "split": {"type": "traintest",
                                      "parameters": {"test_size": 20, "random_state": 0, "shuffle": True}
                                      },
                            "algorithm": {"type": "sklearn.neighbors.KNeighborsClassifier",
                                          "hyperparameters": {
                                              "n_neighbors": 15,
                                              "algorithm": "auto",
                                              "weights": "uniform"
                                          }
                                          }
                            },
                           {"input": {"type": "load_iris",
                                      "parameters": {}
                                      },
                            "split": {"type": "traintest",
                                      "parameters": {"test_size": 20, "random_state": 0, "shuffle": True}
                                      },
                            "algorithm": {"type": "sklearn.neighbors.KNeighborsClassifier",
                                          "hyperparameters": {
                                              "n_neighbors": 15,
                                              "algorithm": "auto",
                                              "weights": "uniform"
                                          }
                                          }
                            }]

        data_catalog = DataCatalog({'config': MemoryDataSet(),
                                    'log': MemoryDataSet(),
                                    'base_directory': MemoryDataSet(),
                                    'expanded_config': MemoryDataSet()})
        data_catalog.save('config', config)
        data_catalog.save('log', log)
        data_catalog.save('base_directory', self.base_directory)
        data_catalog.save('expanded_config', expanded_config)
        # Assemble nodes into a pipeline
        pipeline = Pipeline([expand_config_node, multiple_learning_node])
        # Create a runner to run the pipeline
        runner = SequentialRunner()
        # Run the pipeline
        runner.run(pipeline, data_catalog)
        return log, data_catalog
def run_pipeline_dict(pipelines, logger):
    runner = SequentialRunner()

    for kk, pipeline in pipelines.items():
        logger.info(f'Running pipeline {kk}')
        runner.run(pipeline, kedro_catalog)
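`run_pipeline_dict` reads `kedro_catalog` from an enclosing scope. A sketch that makes the catalog an explicit parameter instead (an assumption about intent, behaviour otherwise unchanged):

def run_pipeline_dict(pipelines, catalog, logger):
    # same loop, but the catalog is passed in rather than taken from a global
    runner = SequentialRunner()
    for name, pipeline in pipelines.items():
        logger.info(f'Running pipeline {name}')
        runner.run(pipeline, catalog)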
def test_mlflow_pipeline_hook_metrics_with_run_id(
    kedro_project_with_mlflow_conf, dummy_pipeline_ml, dummy_run_params
):

    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=kedro_project_with_mlflow_conf,
    ):

        mlflow_conf = get_mlflow_config()
        mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)

        with mlflow.start_run():
            existing_run_id = mlflow.active_run().info.run_id

        dummy_catalog_with_run_id = DataCatalog(
            {
                "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
                "params:unused_param": MemoryDataSet("blah"),
                "data": MemoryDataSet(),
                "model": PickleDataSet(
                    (kedro_project_with_mlflow_conf / "data" / "model.csv").as_posix()
                ),
                "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
                "another_metrics": MlflowMetricsDataSet(
                    run_id=existing_run_id, prefix="foo"
                ),
            }
        )

        pipeline_hook = MlflowPipelineHook()

        runner = SequentialRunner()
        pipeline_hook.after_catalog_created(
            catalog=dummy_catalog_with_run_id,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
            run_id=dummy_run_params["run_id"],
        )
        pipeline_hook.before_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline_ml,
            catalog=dummy_catalog_with_run_id,
        )
        runner.run(dummy_pipeline_ml, dummy_catalog_with_run_id)

        current_run_id = mlflow.active_run().info.run_id

        pipeline_hook.after_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline_ml,
            catalog=dummy_catalog_with_run_id,
        )

        mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
        # the first run is created in Default (id 0),
        # but the one initialised in before_pipeline_run
        # is created in the kedro_project experiment (id 1)
        all_runs_id = set(
            [
                run.run_id
                for k in range(2)
                for run in mlflow_client.list_run_infos(experiment_id=f"{k}")
            ]
        )

        # the metrics are supposed to have been logged inside existing_run_id
        run_data = mlflow_client.get_run(existing_run_id).data

        # check that the logged metric keys carry the expected dataset prefixes
        assert all_runs_id == {current_run_id, existing_run_id}
        assert run_data.metrics["my_metrics.metric_key"] == 1.1
        assert run_data.metrics["foo.metric_key"] == 1.1
Example #27
#%%
# Prepare second node
def join_statements(greeting):
    return f"{greeting} Kedro!"

node_join_statements = node(
    join_statements, 
    inputs="my_salutation", 
    outputs="my_message"
)
#%%
# Assemble nodes into a pipeline
pipeline = Pipeline([node_return_greeting, node_join_statements])

#%%

# Create a runner to run the pipeline
runner = SequentialRunner()

#%%
# Run the pipeline
output = runner.run(pipeline, data_catalog)

print(output)
print(type(output))

Example #28
    df = df.dropna()
    return df


# Plot the amount of people who survived and who died.
def plot_survival_breakdown(df):
    plt.figure(figsize=(6, 4))
    fig, ax = plt.subplots()
    df.Survived.value_counts().plot(kind="barh", color="blue", alpha=0.65)
    ax.set_ylim(-1, len(df.Survived.value_counts()))
    plt.title("Survival Breakdown (1 = Survived, 0 = Died)")
    return fig


# Create nodes
clean_data_node = node(clean_raw_data,
                       inputs="titanic_training_data",
                       outputs="df_clean")
plot_survival_breakdown_node = node(plot_survival_breakdown,
                                    inputs="df_clean",
                                    outputs="survival_breakdown_chart")

# Assemble nodes into a pipeline
pipeline = Pipeline([clean_data_node, plot_survival_breakdown_node])

# Create a runner to run the pipeline
runner = SequentialRunner()

# Run the pipeline
print(runner.run(pipeline, io))
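This snippet assumes `io` is a DataCatalog built earlier. A hedged sketch of that setup (the CSV path is hypothetical):

import pandas as pd
from kedro.io import DataCatalog, MemoryDataSet

# hypothetical catalog: the training data and the output chart live in memory
io = DataCatalog({
    "titanic_training_data": MemoryDataSet(pd.read_csv("train.csv")),
    "survival_breakdown_chart": MemoryDataSet(),
})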
# Prepare first node


def return_greeting():
    return "Hello"


return_greeting_node = node(func=return_greeting,
                            inputs=None,
                            outputs="my_salutation")

# Prepare second node


def join_statements(greeting):
    return f"{greeting} Kedro!"


join_statements_node = node(join_statements,
                            inputs="my_salutation",
                            outputs="my_message")

# Assemble nodes into a pipeline
greeting_pipeline = pipeline([return_greeting_node, join_statements_node])

# Create a runner to run the pipeline
runner = SequentialRunner()

# Run the pipeline
print(runner.run(greeting_pipeline, data_catalog))
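Unlike the earlier snippets, this variant builds the pipeline with the lowercase `pipeline` factory. A hedged sketch of the imports and catalog it assumes (kedro >= 0.17):

from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import node, pipeline
from kedro.runner import SequentialRunner

data_catalog = DataCatalog({"my_salutation": MemoryDataSet()})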