Example 1
def _parse_model_ref(parsed: ParseResult, client: MlflowClient):
    model = parsed.hostname
    path = parsed.path.lstrip("/")
    if path.isdigit():
        mv = client.get_model_version(model, int(path))
        run = client.get_run(mv.run_id)
        return (
            "models:/{}/{}".format(model, path),
            run.data.tags,
            run.data.params,
        )
    if not path:
        stage = "none"  # TODO allow setting default stage from config
    else:
        stage = path.lower()
    results = client.get_latest_versions(model, stages=[stage])
    if not results:
        raise SpecError(
            "No versions found for model {} in stage {}".format(model, stage)
        )
    run = client.get_run(results[0].run_id)
    return (
        "models:/{}/{}".format(model, results[0].version),
        run.data.tags,
        run.data.params,
    )
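For context, a minimal usage sketch of `_parse_model_ref`; the `mlflow://` URI scheme, the tracking server address, and the model name are assumptions for illustration only:

from urllib.parse import urlparse
from mlflow.tracking import MlflowClient

# Hypothetical call: "mlflow://my-model/production" asks for the latest
# "production" version of the registered model "my-model".
parsed = urlparse("mlflow://my-model/production")
client = MlflowClient(tracking_uri="http://localhost:5000")  # assumed tracking server
uri, tags, params = _parse_model_ref(parsed, client)
# uri would look like "models:/my-model/<version>"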
Example 2
def test_chunk_info(tmpdir: py.path.local) -> None:

    num_objective = mlflow.utils.validation.MAX_METRICS_PER_BATCH + 1
    num_params = mlflow.utils.validation.MAX_PARAMS_TAGS_PER_BATCH + 1

    def objective(trial: optuna.trial.Trial) -> Tuple[float, ...]:
        for i in range(num_params):
            trial.suggest_float(f"x_{i}", 0, 1)

        return tuple([1.0] * num_objective)

    tracking_uri = f"file:{tmpdir}"
    study_name = "my_study"
    n_trials = 1

    mlflc = MLflowCallback(tracking_uri=tracking_uri)
    study = optuna.create_study(study_name=study_name,
                                directions=["maximize"] * num_objective)
    study.optimize(objective, n_trials=n_trials, callbacks=[mlflc])

    mlfl_client = MlflowClient(tracking_uri)
    experiment = mlfl_client.list_experiments()[0]
    run_infos = mlfl_client.list_run_infos(experiment.experiment_id)
    assert len(run_infos) == n_trials

    run = mlfl_client.get_run(run_infos[0].run_id)
    run_dict = run.to_dictionary()

    # The `tags` entry also contains the parameters' distributions and other information, such as the trial number.
    assert len(run_dict["data"]["tags"]) > num_params
    assert len(run_dict["data"]["params"]) == num_params
    assert len(run_dict["data"]["metrics"]) == num_objective
Example 3
def mlflow_tracking_in_task_example(check_time=datetime.datetime.now()):
    # type: (datetime.datetime) -> str
    logger.info("Running MLFlow tracking integration check!")
    logger.info("MLFlow tracking URI: {}".format(get_tracking_uri()))

    start_run()

    # params
    log_param("param1", randint(0, 100))
    log_param("param2", randint(0, 100))

    # metrics
    log_metric("foo1", random())
    log_metric("foo1", random() + 1)
    log_metric("foo2", random())
    log_metric("foo2", random() + 1)

    # artifacts
    if not os.path.exists("outputs"):
        os.makedirs("outputs")
    with open("outputs/test1.txt", "w") as f1, open("outputs/test2.txt",
                                                    "w") as f2:
        f1.write("hello")
        f2.write("world!")
    log_artifacts("outputs")

    # Get run metadata & data from the tracking server
    service = MlflowClient()
    run_id = active_run().info.run_id
    run = service.get_run(run_id)
    logger.info("Metadata & data for run with UUID %s: %s" % (run_id, run))

    end_run()

    logger.info("MLFlow tracking integration check completed!")
Example 4
def test_create_model_version_run_link_with_configured_profile(
        mock_registry_store):
    experiment_id = "test-exp-id"
    hostname = "https://workspace.databricks.com/"
    workspace_id = "10002"
    run_id = "runid"
    workspace_url = construct_run_url(hostname, experiment_id, run_id,
                                      workspace_id)
    get_run_mock = mock.MagicMock()
    get_run_mock.return_value = Run(
        RunInfo(run_id, experiment_id, "userid", "status", 0, 1, None), None)
    with mock.patch(
            "mlflow.tracking.client.is_in_databricks_notebook",
            return_value=False
    ), mock.patch(
            "mlflow.tracking.client.get_workspace_info_from_databricks_secrets",
            return_value=(hostname, workspace_id),
    ):
        client = MlflowClient(tracking_uri="databricks",
                              registry_uri="otherplace")
        client.get_run = get_run_mock
        mock_registry_store.create_model_version.return_value = ModelVersion(
            "name",
            1,
            0,
            1,
            source="source",
            run_id=run_id,
            run_link=workspace_url)
        model_version = client.create_model_version("name", "source", "runid")
        assert model_version.run_link == workspace_url
        # verify that the client generated the right URL
        mock_registry_store.create_model_version.assert_called_once_with(
            "name", "source", "runid", [], workspace_url, None)
Example 5
def test_study_name(tmpdir):
    tracking_file_name = "file:{}".format(tmpdir)
    study_name = "my_study"
    n_trials = 2
    num_folds = 3

    study = optuna.create_study(study_name=study_name)
    study.optimize(_objective_func_factory(tracking_file_name, num_folds),
                   n_trials=n_trials)

    mlfl_client = MlflowClient(tracking_file_name)
    experiments = mlfl_client.list_experiments()
    assert len(experiments) == 1

    experiment = experiments[0]
    assert experiment.name == study_name
    experiment_id = experiment.experiment_id

    run_infos = mlfl_client.list_run_infos(experiment_id)
    assert len(run_infos) == n_trials + n_trials * num_folds

    first_run_id = run_infos[-1].run_id
    first_run = mlfl_client.get_run(first_run_id)
    first_run_dict = first_run.to_dictionary()
    assert "x" in first_run_dict["data"]["params"]
    assert first_run_dict["data"]["tags"]["direction"] == "MINIMIZE"
Example 6
def test_log_metric_none(tmpdir: py.path.local) -> None:
    tracking_file_name = "file:{}".format(tmpdir)
    metric_name = "my_metric_name"
    study_name = "my_study"
    metric_value = None

    mlflc = MLflowCallback(tracking_uri=tracking_file_name,
                           metric_name=metric_name)
    study = optuna.create_study(study_name=study_name)
    mlflc._initialize_experiment(study)

    with mlflow.start_run():
        mlflc._log_metrics(metric_value)

    mlfl_client = MlflowClient(tracking_file_name)
    experiments = mlfl_client.list_experiments()
    experiment = experiments[0]
    experiment_id = experiment.experiment_id

    run_infos = mlfl_client.list_run_infos(experiment_id)
    assert len(run_infos) == 1

    first_run_id = run_infos[0].run_id
    first_run = mlfl_client.get_run(first_run_id)
    first_run_dict = first_run.to_dictionary()

    # when `values` is `None`, no value should be recorded under the metric name
    assert metric_name not in first_run_dict["data"]["metrics"]
Example 7
def test_create_model_version_run_link_in_notebook_with_default_profile(
        mock_registry_store):
    experiment_id = 'test-exp-id'
    hostname = 'https://workspace.databricks.com/'
    workspace_id = '10002'
    run_id = 'runid'
    workspace_url = construct_run_url(hostname, experiment_id, run_id,
                                      workspace_id)
    get_run_mock = mock.MagicMock()
    get_run_mock.return_value = Run(
        RunInfo(run_id, experiment_id, 'userid', 'status', 0, 1, None), None)
    with mock.patch('mlflow.tracking.client.is_in_databricks_notebook',
                    return_value=True), \
            mock.patch('mlflow.tracking.client.get_workspace_info_from_dbutils',
                       return_value=(hostname, workspace_id)):
        client = MlflowClient(tracking_uri='databricks',
                              registry_uri='otherplace')
        client.get_run = get_run_mock
        mock_registry_store.create_model_version.return_value = \
            ModelVersion('name', 1, 0, 1, source='source', run_id=run_id, run_link=workspace_url)
        model_version = client.create_model_version('name', 'source', 'runid')
        assert model_version.run_link == workspace_url
        # verify that the client generated the right URL
        mock_registry_store.create_model_version.assert_called_once_with(
            "name", 'source', 'runid', [], workspace_url)
Example 8
def test_tag_truncation(tmpdir: py.path.local) -> None:

    tracking_uri = f"file:{tmpdir}"
    study_name = "my_study"
    n_trials = 3

    mlflc = MLflowCallback(tracking_uri=tracking_uri)
    study = optuna.create_study(study_name=study_name)
    study.optimize(_objective_func_long_user_attr,
                   n_trials=n_trials,
                   callbacks=[mlflc])

    mlfl_client = MlflowClient(tracking_uri)
    experiments = mlfl_client.list_experiments()
    assert len(experiments) == 1

    experiment = experiments[0]
    assert experiment.name == study_name
    experiment_id = experiment.experiment_id

    run_infos = mlfl_client.list_run_infos(experiment_id)
    assert len(run_infos) == n_trials

    first_run_id = run_infos[0].run_id
    first_run = mlfl_client.get_run(first_run_id)
    first_run_dict = first_run.to_dictionary()

    my_user_attr = first_run_dict["data"]["tags"]["my_user_attr"]
    assert len(my_user_attr) <= 5000
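The helper `_objective_func_long_user_attr` is referenced but not shown; a minimal sketch of what it might look like, assuming it only needs to set a user attribute longer than MLflow's 5000-character tag limit so the callback has to truncate it:

def _objective_func_long_user_attr(trial: optuna.trial.Trial) -> float:
    x = trial.suggest_float("x", -1.0, 1.0)
    # Deliberately exceed the 5000-character limit on MLflow tag values.
    trial.set_user_attr("my_user_attr", "a" * 5100)
    return (x - 2) ** 2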
Example 9
def test_autolog_logs_expected_data():
    mlflow.paddle.autolog()

    with mlflow.start_run() as run:
        train_model()

    client = MlflowClient()
    data = client.get_run(run.info.run_id).data

    # Testing params are logged
    for param_key, expected_param_value in [("optimizer_name", "Adam"),
                                            ("learning_rate", "0.01")]:
        assert param_key in data.params
        assert data.params[param_key] == expected_param_value

    # Testing metrics are logged
    for metric_key in [
            "batch_size", "loss", "step", "eval_batch_size", "eval_loss",
            "eval_step"
    ]:
        assert metric_key in data.metrics
        metric_history = client.get_metric_history(run.info.run_id, metric_key)
        assert len(metric_history) == NUM_EPOCHS

    # Testing model_summary.txt is saved
    artifacts = client.list_artifacts(run.info.run_id)
    assert any(x.path == "model_summary.txt" for x in artifacts)
Example 10
def test_autolog_early_stopping_callback():
    mlflow.paddle.autolog()

    early_stopping = paddle.callbacks.EarlyStopping("loss",
                                                    mode="min",
                                                    patience=1,
                                                    min_delta=0)
    with mlflow.start_run() as run:
        train_model(callbacks=[early_stopping])

    client = MlflowClient()
    data = client.get_run(run.info.run_id).data

    for param_key in ["monitor", "patience", "min_delta", "baseline"]:
        assert param_key in data.params
        assert data.params[param_key] == str(getattr(early_stopping,
                                                     param_key))

    for metric_key in ["stopped_epoch", "best_value"]:
        assert metric_key in data.metrics
        assert float(data.metrics[metric_key]) == getattr(
            early_stopping, metric_key)

    for metric_key in ["loss", "step"]:
        assert metric_key in data.metrics
        metric_history = client.get_metric_history(run.info.run_id, metric_key)
        assert len(metric_history) == NUM_EPOCHS
Example 11
def _parse_runid_ref(parsed: ParseResult, client: MlflowClient):
    runid = parsed.hostname
    run = client.get_run(runid)
    path = parsed.path.lstrip("/")
    if path:
        return (
            "runs:/{}/{}".format(runid, path),
            run.data.tags,
            run.data.params,
        )
    else:
        artifacts = client.list_artifacts(runid)
        if not artifacts:
            raise SpecError("Run {} has no artifacts".format(runid))
        elif len(artifacts) == 1:
            return (
                "runs:/{}/{}".format(runid, artifacts[0].path),
                run.data.tags,
                run.data.params,
            )
        else:
            # TODO allow setting default path from config
            raise SpecError(
                (
                    "Run {} has more than 1 artifact ({})."
                    "Please specify path like "
                    "mlflows://<runid>/path/to/artifact in "
                    "CREATE MODEL or ML_PREDICT"
                ).format(runid, [x.path for x in artifacts])
            )
Example 12
def load_model_resources(run_id: str) -> Tuple[PyFuncModel, Any, LabelEncoder]:

    # Retrieve MLflow run data
    mlflow_client = MlflowClient(tracking_uri=settings.TRACKING_URI)
    model_run = mlflow_client.get_run(run_id)
    artifact_uri = model_run.info.artifact_uri

    # Load the set of functions and parameters to preprocess data
    preprocessing_model_path = os.path.join(
        model_run.info.artifact_uri.replace(
            model_run.info.run_id,
            model_run.data.tags.get('mlflow.parentRunId')), 'log',
        'preprocessing_model')
    preprocessing_model = mlflow.pyfunc.load_model(preprocessing_model_path)

    # Load the model
    model = mlflow.sklearn.load_model(f'{artifact_uri}/model')

    # Load the label encoder if it exists
    parent_path = artifact_uri.replace(
        model_run.info.run_id, model_run.data.tags.get('mlflow.parentRunId'))
    label_encoder_path = os.path.join(parent_path, 'label_encoder')
    label_encoder_model = mlflow.sklearn.load_model(label_encoder_path)

    return preprocessing_model, model, label_encoder_model
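A minimal sketch of how the three loaded resources might be used together for inference; the run id, the input schema, and the call order are assumptions for illustration:

import pandas as pd

preprocessing_model, model, label_encoder = load_model_resources(run_id="abc123")  # assumed run id
raw_df = pd.DataFrame({"feature": [1.0, 2.0]})   # assumed input schema
features = preprocessing_model.predict(raw_df)   # pyfunc models expose predict()
predictions = model.predict(features)
decoded_labels = label_encoder.inverse_transform(predictions)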
Example 13
def test_log_metric(tmpdir: py.path.local, func: Callable, names: List[str],
                    values: List[float]) -> None:

    tracking_uri = f"file:{tmpdir}"
    study_name = "my_study"

    mlflc = MLflowCallback(tracking_uri=tracking_uri, metric_name=names)
    study = optuna.create_study(
        study_name=study_name,
        directions=["minimize" for _ in range(len(values))])
    study.enqueue_trial({"x": 1.0, "y": 1.0, "z": 1.0})
    study.optimize(func, n_trials=1, callbacks=[mlflc])

    mlfl_client = MlflowClient(tracking_uri)
    experiments = mlfl_client.list_experiments()
    experiment = experiments[0]
    experiment_id = experiment.experiment_id

    run_infos = mlfl_client.list_run_infos(experiment_id)
    assert len(run_infos) == 1

    first_run_id = run_infos[0].run_id
    first_run = mlfl_client.get_run(first_run_id)
    first_run_dict = first_run.to_dictionary()

    assert all(name in first_run_dict["data"]["metrics"] for name in names)
    assert all([
        first_run_dict["data"]["metrics"][name] == val
        for name, val in zip(names, values)
    ])
Example 14
def test_log_params(tmpdir: py.path.local) -> None:

    tracking_uri = f"file:{tmpdir}"
    metric_name = "metric"
    study_name = "my_study"

    mlflc = MLflowCallback(tracking_uri=tracking_uri, metric_name=metric_name)
    study = optuna.create_study(study_name=study_name)
    study.enqueue_trial({"x": 1.0, "y": 1.0, "z": 1.0})
    study.optimize(_objective_func, n_trials=1, callbacks=[mlflc])

    mlfl_client = MlflowClient(tracking_uri)
    experiments = mlfl_client.list_experiments()
    experiment = experiments[0]
    experiment_id = experiment.experiment_id

    run_infos = mlfl_client.list_run_infos(experiment_id)
    assert len(run_infos) == 1

    first_run_id = run_infos[0].run_id
    first_run = mlfl_client.get_run(first_run_id)
    first_run_dict = first_run.to_dictionary()

    for param_name, param_value in study.best_params.items():
        assert param_name in first_run_dict["data"]["params"]
        assert first_run_dict["data"]["params"][param_name] == str(param_value)
        assert first_run_dict["data"]["tags"][
            f"{param_name}_distribution"] == str(
                study.best_trial.distributions[param_name])
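The shared helper `_objective_func` is not shown in these tests; a minimal sketch, assuming it mirrors the inline objective defined in Example 27 below (parameters x, y, z plus a user attribute):

def _objective_func(trial: optuna.trial.Trial) -> float:
    x = trial.suggest_float("x", -1.0, 1.0)
    y = trial.suggest_float("y", 20, 30, log=True)
    z = trial.suggest_categorical("z", (-1.0, 1.0))
    trial.set_user_attr("my_user_attr", "my_user_attr_value")
    return (x - 2) ** 2 + (y - 25) ** 2 + z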
Example 15
def test_log_metric(tmpdir: py.path.local, names: List[str],
                    values: List[float]) -> None:

    tracking_file_name = "file:{}".format(tmpdir)
    study_name = "my_study"

    mlflc = MLflowCallback(tracking_uri=tracking_file_name, metric_name=names)
    study = optuna.create_study(study_name=study_name)
    mlflc._initialize_experiment(study)

    with mlflow.start_run():
        mlflc._log_metrics(values)

    mlfl_client = MlflowClient(tracking_file_name)
    experiments = mlfl_client.list_experiments()
    experiment = experiments[0]
    experiment_id = experiment.experiment_id

    run_infos = mlfl_client.list_run_infos(experiment_id)
    assert len(run_infos) == 1

    first_run_id = run_infos[0].run_id
    first_run = mlfl_client.get_run(first_run_id)
    first_run_dict = first_run.to_dictionary()

    assert all(name in first_run_dict["data"]["metrics"] for name in names)
    assert all([
        first_run_dict["data"]["metrics"][name] == val
        for name, val in zip(names, values)
    ])
Example 16
def main():
    client = MlflowClient()
    experiments = client.list_experiments()
    predict_experiment = next(ex for ex in experiments if ex.name == "Predict")
    predict_run_infos = client.list_run_infos(predict_experiment.experiment_id)

    predictions_dfs = []
    for run_info in predict_run_infos:
        run = client.get_run(run_info.run_id)

        # Build model name
        model = run.data.params["model"]

        if model == "bagging-rgcn-with-embeddings":
            edge_lists_merged_layers = run.data.params[
                "embeddings_merged_layers"]
            model_name = (
                f"{model} | embeddings_merged_layers={edge_lists_merged_layers}"
            )
        else:
            merged_layers = run.data.params["merged_layers"]
            model_name = f"{model} | merged_layers={merged_layers}"

        node_features = run.data.params.get("node_features", "")
        if node_features:
            model_name += f", node_features={node_features}"

        # Get predictions artifact
        abs_artifact_uri = run_info.artifact_uri.replace("file://", "")
        results_artifact_uri = path.join(abs_artifact_uri, "results")
        predictions_file, *_ = glob(
            path.join(results_artifact_uri, "predictions*.tsv"))

        pred_df = pd.read_csv(predictions_file,
                              sep="\t",
                              index_col="node",
                              usecols=["node", "rank"])
        pred_df.rename(columns={"rank": model_name}, inplace=True)
        predictions_dfs.append(pred_df)

    prediction_df = pd.concat(predictions_dfs, axis=1)
    prediction_df.sort_values(by="node", inplace=True)

    # Save tsv file
    out_tsv_file = path.join(config.REPORTS_DIR, "predictions.tsv")
    print(f"Saving {out_tsv_file}...")
    prediction_df.to_csv(out_tsv_file, sep="\t")

    # Create heatmap
    sns.set(font_scale=1.5)
    fig, ax = plt.subplots(figsize=(0.7 * prediction_df.shape[1],
                                    0.5 * prediction_df.shape[0]))
    fig.suptitle("Predicted rank for each node by model.")
    sns.heatmap(prediction_df.T, annot=True, cbar=False, ax=ax)

    # Save heatmap
    out_png_file = path.join(config.REPORTS_DIR, "predictions_heatmap.png")
    print(f"Saving {out_png_file}...")
    fig.savefig(out_png_file, bbox_inches="tight", dpi=150)
Example 17
def test_node_hook_logging(
    tmp_path,
    mocker,
    monkeypatch,
    dummy_run_params,
    dummy_catalog,
    dummy_pipeline,
    dummy_node,
    config_dir,
    flatten_dict_params,
    expected,
):

    mocker.patch("logging.config.dictConfig")
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)
    # config = KedroMlflowConfig(
    #     project_path=tmp_path,
    #     node_hook_opts={"flatten_dict_params": flatten_dict_params, "sep": "-"},
    # )
    # # the function is imported inside the other file and this is the file to patch
    # # see https://stackoverflow.com/questions/30987973/python-mock-patch-doesnt-work-as-expected-for-public-method
    # mocker.patch(
    #     "kedro_mlflow.framework.hooks.node_hook.get_mlflow_config", return_value=config
    # )

    _write_yaml(
        tmp_path / "conf" / "base" / "mlflow.yml",
        dict(hooks=dict(node=dict(flatten_dict_params=flatten_dict_params,
                                  recursive=False,
                                  sep="-")), ),
    )

    mlflow_node_hook = MlflowNodeHook()

    node_inputs = {
        v: dummy_catalog._data_sets.get(v)
        for k, v in dummy_node._inputs.items()
    }

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow_node_hook.before_pipeline_run(run_params=dummy_run_params,
                                             pipeline=dummy_pipeline,
                                             catalog=dummy_catalog)
        mlflow_node_hook.before_node_run(
            node=dummy_node,
            catalog=dummy_catalog,
            inputs=node_inputs,
            is_async=False,
            run_id="132",
        )
        run_id = mlflow.active_run().info.run_id

    mlflow_client = MlflowClient(mlflow_tracking_uri)
    current_run = mlflow_client.get_run(run_id)
    assert current_run.data.params == expected
Example 18
def test_node_hook_logging_above_limit_tag_strategy(kedro_project,
                                                    dummy_run_params,
                                                    param_length):

    _write_yaml(
        kedro_project / "conf" / "local" / "mlflow.yml",
        dict(hooks=dict(node=dict(long_parameters_strategy="tag")), ),
    )

    mlflow_tracking_uri = (kedro_project / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)

    mlflow_node_hook = MlflowNodeHook()

    param_value = param_length * "a"
    node_inputs = {"params:my_param": param_value}

    project_metadata = _get_project_metadata(kedro_project)
    _add_src_to_path(project_metadata.source_dir, kedro_project)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
            package_name=project_metadata.package_name,
            project_path=kedro_project,
    ):
        with mlflow.start_run():
            mlflow_node_hook.before_pipeline_run(
                run_params=dummy_run_params,
                pipeline=Pipeline([]),
                catalog=DataCatalog(),
            )

            # IMPORTANT: exceeding the parameters limit should raise an error
            # on every mlflow backend, but the FileStore backend does not
            # enforce it:
            # https://github.com/mlflow/mlflow/issues/2814#issuecomment-628284425
            # Since the tests use the FileStore backend for simplicity, logging
            # succeeds here, so we enforce the failure ourselves (which differs
            # slightly from mlflow's behaviour).
            mlflow_node_hook.before_node_run(
                node=node(func=lambda x: x, inputs=dict(x="a"), outputs=None),
                catalog=DataCatalog(),  # can be empty
                inputs=node_inputs,
                is_async=False,
                run_id="132",
            )
            run_id = mlflow.active_run().info.run_id

        mlflow_client = MlflowClient(mlflow_tracking_uri)
        current_run = mlflow_client.get_run(run_id)
        assert current_run.data.params == {}
        assert {
            k: v
            for k, v in current_run.data.tags.items()
            if not k.startswith("mlflow")
        } == {
            "my_param": param_value
        }
Example 19
def test_mlflow_hook_save_pipeline_ml(
    kedro_project_with_mlflow_conf,
    pipeline_to_run,
    dummy_catalog,
    dummy_run_params,
):

    bootstrap_project(kedro_project_with_mlflow_conf)
    with KedroSession.create(
            project_path=kedro_project_with_mlflow_conf) as session:
        context = session.load_context()  # triggers conf setup

        # config_with_base_mlflow_conf is a conftest fixture
        mlflow_hook = MlflowHook()
        mlflow_hook.after_context_created(context)  # setup mlflow config
        runner = SequentialRunner()
        mlflow_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` is not using any of below arguments,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
        )
        mlflow_hook.before_pipeline_run(run_params=dummy_run_params,
                                        pipeline=pipeline_to_run,
                                        catalog=dummy_catalog)
        runner.run(pipeline_to_run, dummy_catalog, session._hook_manager)
        run_id = mlflow.active_run().info.run_id
        mlflow_hook.after_pipeline_run(run_params=dummy_run_params,
                                       pipeline=pipeline_to_run,
                                       catalog=dummy_catalog)
        # test : parameters should have been logged
        mlflow_client = MlflowClient(context.mlflow.server.mlflow_tracking_uri)
        run_data = mlflow_client.get_run(run_id).data

        # all run_params are recorded as tags
        for k, v in dummy_run_params.items():
            if v:
                assert run_data.tags[k] == str(v)

        # params are not recorded because we don't have MlflowHook here
        # and the model should not be logged when it is not a PipelineML
        nb_artifacts = len(mlflow_client.list_artifacts(run_id))
        if isinstance(pipeline_to_run, PipelineML):
            assert nb_artifacts == 1
        else:
            assert nb_artifacts == 0

        if isinstance(pipeline_to_run, PipelineML):
            trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
            assert trained_model.metadata.signature.to_dict() == {
                "inputs": '[{"name": "a", "type": "long"}]',
                "outputs": None,
            }
Example 20
def modelmetrics(metric: str) -> str:
    '''
    Return the score for the requested metric name.

    Available metrics:

    all: all metrics used to evaluate the model
    accuracy
    auc_score
    f1_score

    Parameters
    ----------
    metric: str

    Returns
    -------
    str
    '''
    client = MlflowClient()

    try:
        if metric == 'all':
            return client.get_run(
                '3e8f282376364196a439678a824bccf8').data.metrics

        elif metric == 'accuracy':
            return f'Accuracy: {client.get_run("3e8f282376364196a439678a824bccf8").data.metrics["Accuracy"]}'

        elif metric == 'auc_score':
            return f'AUC Score: {client.get_run("3e8f282376364196a439678a824bccf8").data.metrics["auc_score"]}'

        elif metric == 'f1_score':
            return f'F1 Score: {client.get_run("3e8f282376364196a439678a824bccf8").data.metrics["f1_score"]}'

        else:
            return client.get_run(
                '3e8f282376364196a439678a824bccf8').data.metrics

    except ValueError as e:
        return f'ValueError is {e}'
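A short usage sketch, assuming the hard-coded run id exists in the configured tracking store (the output values are illustrative):

print(modelmetrics("all"))       # full metrics dict for the run
print(modelmetrics("accuracy"))  # e.g. "Accuracy: 0.93"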
Example 21
    def _exists(self) -> bool:
        """Check if the metric exists in remote mlflow storage exists.

        Returns:
            bool: Does the metric name exist in the given run_id?
        """
        mlflow_client = MlflowClient()
        run_id = self.run_id  # will get the active run if nothing is specified
        run = mlflow_client.get_run(run_id) if run_id else mlflow.active_run()

        flag_exist = self.key in run.data.metrics.keys() if run else False
        return flag_exist
Example 22
def store_run_df(experiment_name, experiment_id):
    client = MlflowClient()
    if client.list_experiments()[0].name == experiment_name:
        run_df = pd.DataFrame([(run.run_uuid, run.start_time, run.artifact_uri)
                               for run in client.list_run_infos(experiment_id)
                               ])
        run_df.columns = ['run_uuid', 'start_time', 'artifact_uri']
        run_df['start_time'] = pd.to_datetime(run_df['start_time'], unit='ms')
        run_df = run_df.sort_values("start_time", ascending=False)
        run_df['train_accuracy'] = [
            client.get_run(
                run_df.loc[i]['run_uuid']).data.metrics['train_accuracy']
            if len(client.get_run(run_df.loc[i]['run_uuid']).data.metrics) > 0
            else 0 for i in range(len(run_df))
        ]
        run_df['test_accuracy'] = [
            client.get_run(
                run_df.loc[i]['run_uuid']).data.metrics['test_accuracy']
            if len(client.get_run(run_df.loc[i]['run_uuid']).data.metrics) > 0
            else 0 for i in range(len(run_df))
        ]
        return run_df
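A brief usage sketch; it assumes the first listed experiment is the one of interest, which is what the function itself checks:

client = MlflowClient()
experiment = client.list_experiments()[0]
run_df = store_run_df(experiment.name, experiment.experiment_id)
print(run_df[["run_uuid", "train_accuracy", "test_accuracy"]].head())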
Example 23
def latent_pca_data(run_id: str,
                    pca_components: int = 3,
                    test_lookback: int = 20201020):
    logger = logging.getLogger(__name__)

    # get mlflow client
    mlflow_client = MlflowClient()
    # get run to be explained
    data = mlflow_client.get_run(run_id).data
    # get latent size
    latent_size = int(data.params["latent_size"])

    # read from directory
    latent_dir = os.path.join("data/output/explain/latent", run_id)
    latents = []
    test_paths = [
        p for p in Path(latent_dir).iterdir()
        if int(os.path.basename(p)[:8]) > test_lookback
    ]

    for p in test_paths:
        df = pq.read_table(p).to_pandas()
        latents.append(df)

    test = pd.concat(latents, axis=0)
    test = test.loc[test["epoch_class"] >= 0, :]
    scaler = RobustScaler().fit(test.iloc[:, :latent_size])
    test_scaled = scaler.transform(test.iloc[:, :latent_size])

    print(test.shape)

    labels, probs = hdbscan_cluster(test_scaled, min_cluster_size=5)

    # PCA
    logger.info(f"Creating PCA with {pca_components} components.")

    # fit pca
    pca = PCA(n_components=pca_components)
    pca.fit(test_scaled)

    components = pca.transform(test_scaled)

    # create df for visualization
    pca_columns = [f"PC{i+1}" for i in range(pca_components)]
    components = pd.DataFrame(components, columns=pca_columns).reset_index()

    explained = pca.explained_variance_ratio_.sum() * 100

    return test, components, explained, labels, probs
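The helper `hdbscan_cluster` is referenced but not defined above; a minimal sketch of one possible implementation, assuming it wraps `hdbscan.HDBSCAN` and returns cluster labels together with membership probabilities:

from typing import Tuple

import hdbscan
import numpy as np

def hdbscan_cluster(data: np.ndarray,
                    min_cluster_size: int = 5) -> Tuple[np.ndarray, np.ndarray]:
    # Density-based clustering; noise points are labelled -1.
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
    labels = clusterer.fit_predict(data)
    return labels, clusterer.probabilities_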
Example 24
def test_node_hook(tmp_path):
    mlflow_node_hook = MlflowNodeHook(flatten_dict_params=True,
                                      recursive=True,
                                      sep="-")

    def fake_fun(arg1, arg2, arg3):
        return None

    node_test = node(
        func=fake_fun,
        inputs={
            "arg1": "params:param1",
            "arg2": "foo",
            "arg3": "parameters"
        },
        outputs="out",
    )
    catalog = DataCatalog({
        "params:param1": 1,
        "foo": MemoryDataSet(),
        "bar": MemoryDataSet(),
        "parameters": {
            "param1": 1,
            "param2": 2
        },
    })
    node_inputs = {
        v: catalog._data_sets.get(v)
        for k, v in node_test._inputs.items()
    }

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow_node_hook.before_node_run(
            node=node_test,
            catalog=catalog,
            inputs=node_inputs,
            is_async=False,
            run_id="132",
        )
        run_id = mlflow.active_run().info.run_id

    mlflow_client = MlflowClient(mlflow_tracking_uri)
    current_run = mlflow_client.get_run(run_id)
    assert current_run.data.params == {
        "param1": "1",
        "parameters-param1": "1",
        "parameters-param2": "2",
    }
Example 25
def test_mlflow(ray_start_4_cpus, tmp_path):
    config = TestConfig()

    params = {"p1": "p1"}

    temp_dir = tmp_path
    num_workers = 4

    def train_func(config):
        train.report(episode_reward_mean=4)
        train.report(episode_reward_mean=5)
        train.report(episode_reward_mean=6)
        return 1

    callback = MLflowLoggerCallback(experiment_name="test_exp",
                                    logdir=temp_dir)
    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    trainer.run(train_func, config=params, callbacks=[callback])

    from mlflow.tracking import MlflowClient

    client = MlflowClient(
        tracking_uri=callback.mlflow_util._mlflow.get_tracking_uri())

    experiment_id = client.get_experiment_by_name("test_exp").experiment_id
    all_runs = callback.mlflow_util._mlflow.search_runs(
        experiment_ids=[experiment_id])
    assert len(all_runs) == 1
    # all_runs is a pandas dataframe.
    all_runs = all_runs.to_dict(orient="records")
    run_id = all_runs[0]["run_id"]
    run = client.get_run(run_id)

    assert run.data.params == params
    assert ("episode_reward_mean" in run.data.metrics
            and run.data.metrics["episode_reward_mean"] == 6.0)
    assert (TRAINING_ITERATION in run.data.metrics
            and run.data.metrics[TRAINING_ITERATION] == 3.0)

    metric_history = client.get_metric_history(run_id=run_id,
                                               key="episode_reward_mean")

    assert len(metric_history) == 3
    iterations = [metric.step for metric in metric_history]
    assert iterations == [1, 2, 3]
    rewards = [metric.value for metric in metric_history]
    assert rewards == [4, 5, 6]
Example 26
def test_log_params(tmpdir: py.path.local) -> None:
    tracking_file_name = "file:{}".format(tmpdir)
    metric_name = "my_metric_name"
    study_name = "my_study"

    param1_name = "my_param1"
    param1_value = "a"
    param2_name = "my_param2"
    param2_value = 5

    params = {param1_name: param1_value, param2_name: param2_value}

    mlflc = MLflowCallback(tracking_uri=tracking_file_name,
                           metric_name=metric_name)
    study = optuna.create_study(study_name=study_name)
    mlflc._initialize_experiment(study)

    with mlflow.start_run():

        trial = optuna.trial.create_trial(
            params=params,
            distributions={
                param1_name:
                optuna.distributions.CategoricalDistribution(["a", "b"]),
                param2_name:
                optuna.distributions.UniformDistribution(0, 10),
            },
            value=5.0,
        )
        mlflc._log_params(trial.params)

    mlfl_client = MlflowClient(tracking_file_name)
    experiments = mlfl_client.list_experiments()
    experiment = experiments[0]
    experiment_id = experiment.experiment_id

    run_infos = mlfl_client.list_run_infos(experiment_id)
    assert len(run_infos) == 1

    first_run_id = run_infos[0].run_id
    first_run = mlfl_client.get_run(first_run_id)
    first_run_dict = first_run.to_dictionary()

    assert param1_name in first_run_dict["data"]["params"]
    assert first_run_dict["data"]["params"][param1_name] == param1_value

    assert param2_name in first_run_dict["data"]["params"]
    assert first_run_dict["data"]["params"][param2_name] == str(param2_value)
Example 27
def test_track_in_mlflow_decorator(tmpdir: py.path.local) -> None:

    tracking_uri = f"file:{tmpdir}"
    study_name = "my_study"
    n_trials = 3

    metric_name = "additional_metric"
    metric = 3.14

    mlflc = MLflowCallback(tracking_uri=tracking_uri)

    def _objective_func(trial: optuna.trial.Trial) -> float:
        """Objective function"""

        x = trial.suggest_float("x", -1.0, 1.0)
        y = trial.suggest_float("y", 20, 30, log=True)
        z = trial.suggest_categorical("z", (-1.0, 1.0))
        assert isinstance(z, float)
        trial.set_user_attr("my_user_attr", "my_user_attr_value")
        mlflow.log_metric(metric_name, metric)
        return (x - 2)**2 + (y - 25)**2 + z

    tracked_objective = mlflc.track_in_mlflow()(_objective_func)

    study = optuna.create_study(study_name=study_name)
    study.optimize(tracked_objective, n_trials=n_trials, callbacks=[mlflc])

    mlfl_client = MlflowClient(tracking_uri)
    experiments = mlfl_client.list_experiments()
    assert len(experiments) == 1

    experiment = experiments[0]
    assert experiment.name == study_name
    experiment_id = experiment.experiment_id

    run_infos = mlfl_client.list_run_infos(experiment_id)
    assert len(run_infos) == n_trials

    first_run_id = run_infos[0].run_id
    first_run = mlfl_client.get_run(first_run_id)
    first_run_dict = first_run.to_dictionary()

    assert metric_name in first_run_dict["data"]["metrics"]
    assert first_run_dict["data"]["metrics"][metric_name] == metric

    assert tracked_objective.__name__ == _objective_func.__name__
    assert tracked_objective.__doc__ == _objective_func.__doc__
Example 28
def test_node_hook_logging(
    kedro_project,
    dummy_run_params,
    dummy_catalog,
    dummy_pipeline,
    dummy_node,
    flatten,
    expected,
):

    _write_yaml(
        kedro_project / "conf" / "base" / "mlflow.yml",
        dict(tracking=dict(params=dict(
            dict_params=dict(flatten=flatten, recursive=False, sep="-")))),
    )

    node_inputs = {
        v: dummy_catalog._data_sets.get(v)
        for k, v in dummy_node._inputs.items()
    }

    mlflow_tracking_uri = (kedro_project / "mlruns").as_uri()

    bootstrap_project(kedro_project)
    with KedroSession.create(project_path=kedro_project, ) as session:
        context = session.load_context()
        mlflow_node_hook = MlflowHook()
        mlflow_node_hook.after_context_created(context)  # setup mlflow_config
        mlflow.set_tracking_uri(mlflow_tracking_uri)
        with mlflow.start_run():
            mlflow_node_hook.before_pipeline_run(
                run_params=dummy_run_params,
                pipeline=dummy_pipeline,
                catalog=dummy_catalog,
            )
            mlflow_node_hook.before_node_run(
                node=dummy_node,
                catalog=dummy_catalog,
                inputs=node_inputs,
                is_async=False,
            )
            run_id = mlflow.active_run().info.run_id

        mlflow_client = MlflowClient(mlflow_tracking_uri)
        current_run = mlflow_client.get_run(run_id)
        assert current_run.data.params == expected
Example 29
def test_run_name(tmpdir: py.path.local, run_name: Optional[str],
                  expected: str) -> None:

    tracking_uri = f"file:{tmpdir}"

    mlflow_kwargs = {"run_name": run_name}
    mlflc = MLflowCallback(tracking_uri=tracking_uri,
                           mlflow_kwargs=mlflow_kwargs)
    study = optuna.create_study()
    study.optimize(_objective_func, n_trials=1, callbacks=[mlflc])

    mlfl_client = MlflowClient(tracking_uri)
    experiment = mlfl_client.list_experiments()[0]
    run_info = mlfl_client.list_run_infos(experiment.experiment_id)[0]
    run = mlfl_client.get_run(run_info.run_id)
    tags = run.data.tags
    assert tags["mlflow.runName"] == expected
Example 30
def test_modelspec(mlflow_client: MlflowClient):
    mv = mlflow_client.search_model_versions("name='rikai-test'")[0]
    run = mlflow_client.get_run(run_id=mv.run_id)
    spec = MlflowModelSpec(
        "models:/rikai-test/{}".format(mv.version),
        run.data.tags,
        tracking_uri="fake",
    )
    assert spec.flavor == "pytorch"
    assert spec.schema == parse_schema(
        "STRUCT<boxes:ARRAY<ARRAY<float>>,"
        "scores:ARRAY<float>, labels:ARRAY<int>>")
    assert spec._spec["transforms"]["pre"] == (
        "rikai.contrib.torch.transforms.fasterrcnn_resnet50_fpn"
        ".pre_processing")
    assert spec._spec["transforms"]["post"] == (
        "rikai.contrib.torch.transforms.fasterrcnn_resnet50_fpn."
        "post_processing")
    assert spec.model_uri == "models:/rikai-test/{}".format(mv.version)