Esempio n. 1
0
            def log_model_without_starting_new_run():
                """
                Save the model and log it against ``_AUTOLOG_RUN_ID`` directly, doing the
                same work as `log_model` but without starting a new run.
                """
                artifact_path = "model"
                with TempDir() as tmp:
                    local_path = tmp.path("model")
                    mlflow_model = Model(artifact_path=artifact_path, run_id=_AUTOLOG_RUN_ID)
                    # Write the model into the temporary directory first ...
                    save_model(
                        path=local_path,
                        mlflow_model=mlflow_model,
                        tf_saved_model_dir=serialized.decode("utf-8"),
                        tf_meta_graph_tags=[tag_constants.SERVING],
                        tf_signature_def_key="predict",
                    )
                    # ... then upload that directory as artifacts of the autolog run.
                    client = MlflowClient()
                    client.log_artifacts(_AUTOLOG_RUN_ID, local_path, artifact_path)

                    try:
                        client._record_logged_model(_AUTOLOG_RUN_ID, mlflow_model)
                    except MlflowException:
                        # Older tracking servers may reject this call; to stay backwards
                        # compatible every MLflow exception is swallowed here and only a
                        # warning is emitted.
                        _logger.warning(
                            _LOG_MODEL_METADATA_WARNING_TEMPLATE,
                            get_artifact_uri(_AUTOLOG_RUN_ID),
                        )
 def get_underlying_uri(runs_uri):
     """
     Translate ``runs_uri`` into the URI that ``get_artifact_uri`` reports for the
     run ID and artifact path parsed out of it.

     :param runs_uri: URI accepted by ``RunsArtifactRepository.parse_runs_uri``.
     :return: The resolved artifact URI.
     """
     from mlflow.tracking.artifact_utils import get_artifact_uri
     run_id, artifact_path = RunsArtifactRepository.parse_runs_uri(runs_uri)
     resolved_uri = get_artifact_uri(run_id, artifact_path)
     # The resolved URI must not itself be a runs URI: avoid an infinite loop.
     assert not RunsArtifactRepository.is_runs_uri(resolved_uri)
     return resolved_uri
 def __init__(self, artifact_uri):
     """
     Build the repository for ``artifact_uri`` and store, in ``self.repo``, the
     repository returned by ``get_artifact_repository`` for the resolved URI.

     :param artifact_uri: URI accepted by ``RunsArtifactRepository.parse_runs_uri``.
     """
     from mlflow.tracking.artifact_utils import get_artifact_uri
     from mlflow.store.artifact_repository_registry import get_artifact_repository
     parsed_run_id, parsed_path = RunsArtifactRepository.parse_runs_uri(artifact_uri)
     underlying_uri = get_artifact_uri(parsed_run_id, parsed_path)
     # Delegating to another "runs" URI would recurse: avoid an infinite loop.
     resolved_scheme = urllib.parse.urlparse(underlying_uri).scheme
     assert resolved_scheme != "runs"
     super(RunsArtifactRepository, self).__init__(artifact_uri)
     self.repo = get_artifact_repository(underlying_uri)
Esempio n. 4
0
 def get_underlying_uri(runs_uri):
     """
     Translate ``runs_uri`` into the artifact URI of the run it names, using the
     tracking URI derived from ``runs_uri`` for the lookup and attaching it to the
     result via ``add_databricks_profile_info_to_artifact_uri``.

     :param runs_uri: URI accepted by ``RunsArtifactRepository.parse_runs_uri``.
     :return: The resolved artifact URI with the profile information added.
     """
     from mlflow.tracking.artifact_utils import get_artifact_uri
     run_id, artifact_path = RunsArtifactRepository.parse_runs_uri(runs_uri)
     profile_tracking_uri = get_databricks_profile_uri_from_artifact_uri(runs_uri)
     resolved_uri = get_artifact_uri(run_id, artifact_path, profile_tracking_uri)
     # The resolved URI must not itself be a runs URI: avoid an infinite loop.
     assert not RunsArtifactRepository.is_runs_uri(resolved_uri)
     return add_databricks_profile_info_to_artifact_uri(resolved_uri, profile_tracking_uri)
Esempio n. 5
0
def svm_model_uri():
    """
    Fit a ``LinearSVC`` on the breast-cancer dataset, log it as "svm_model" in a
    fresh run, and return the artifact URI of the logged model.
    """
    features, labels = get_breast_cancer_dataset()
    classifier = sklearn.svm.LinearSVC()
    classifier.fit(features, labels)

    with mlflow.start_run() as run:
        mlflow.sklearn.log_model(classifier, "svm_model")
        model_uri = get_artifact_uri(run.info.run_id, "svm_model")

    return model_uri
Esempio n. 6
0
def binary_logistic_regressor_model_uri():
    """
    Fit a ``LogisticRegression`` on the breast-cancer dataset, log it as
    "bin_clf_model" in a fresh run, and return the artifact URI of the logged model.
    """
    features, labels = get_breast_cancer_dataset()
    classifier = sklearn.linear_model.LogisticRegression()
    classifier.fit(features, labels)

    with mlflow.start_run() as run:
        mlflow.sklearn.log_model(classifier, "bin_clf_model")
        model_uri = get_artifact_uri(run.info.run_id, "bin_clf_model")

    return model_uri
Esempio n. 7
0
def multiclass_logistic_regressor_model_uri():
    """
    Fit a ``LogisticRegression`` (``max_iter=2``) on the iris data, log it as
    "clf_model" in a fresh run, and return the artifact URI of the logged model.
    """
    features, labels = get_iris()
    classifier = sklearn.linear_model.LogisticRegression(max_iter=2)
    classifier.fit(features, labels)

    with mlflow.start_run() as run:
        mlflow.sklearn.log_model(classifier, "clf_model")
        model_uri = get_artifact_uri(run.info.run_id, "clf_model")

    return model_uri
Esempio n. 8
0
def spark_linear_regressor_model_uri():
    """
    Fit a ``SparkLinearRegression`` on the diabetes Spark dataset, log the fitted
    model as "spark_reg_model" in a fresh run, and return its artifact URI.
    """
    training_df = get_diabetes_spark_dataset()
    fitted_model = SparkLinearRegression().fit(training_df)

    with mlflow.start_run() as run:
        mlflow.spark.log_model(fitted_model, "spark_reg_model")
        model_uri = get_artifact_uri(run.info.run_id, "spark_reg_model")

    return model_uri
Esempio n. 9
0
def linear_regressor_model_uri():
    """
    Fit a ``LinearRegression`` on the diabetes dataset, log it as "reg_model" in a
    fresh run, and return the artifact URI of the logged model.
    """
    features, targets = get_diabetes_dataset()
    regressor = sklearn.linear_model.LinearRegression()
    regressor.fit(features, targets)

    with mlflow.start_run() as run:
        mlflow.sklearn.log_model(regressor, "reg_model")
        model_uri = get_artifact_uri(run.info.run_id, "reg_model")

    return model_uri
Esempio n. 10
0
    def download_artifacts(self, run_id, relative_path, dst_path=None):
        """
        Return where the artifact can be found, preferring an existing local copy.

        :param run_id: ID of the run that owns the artifact.
        :param relative_path: Run-relative path of the artifact.
        :param dst_path: Optional local directory. When given and
                         ``relative_path`` already exists beneath it, that local
                         path is returned.
        :return: The existing local path, otherwise the value of
                 ``artifact_utils.get_artifact_uri`` for the run and path.
        """
        # ``dst_path`` defaults to None and ``os.path.join(None, ...)`` raises
        # TypeError, so only probe the filesystem when a destination was supplied.
        if dst_path is not None:
            local_location = os.path.join(dst_path, relative_path)
            # TODO check file digest or something similar??
            if os.path.exists(local_location):
                logger.debug(
                    f"Skipped downloading file because a file {local_location} with the same name already exists."
                )
                return local_location

        return artifact_utils.get_artifact_uri(run_id=run_id,
                                               artifact_path=relative_path)
Esempio n. 11
0
def get_artifact_uri(artifact_path=None):
    """
    Get the absolute URI of the specified artifact in the currently active run.
    If ``artifact_path`` is not specified, the artifact root URI of the currently
    active run is returned; calls to ``log_artifact`` and ``log_artifacts`` write
    artifact(s) to subdirectories of the artifact root URI.

    If no run is active, this method will create a new active run.

    :param artifact_path: The run-relative artifact path for which to obtain an absolute URI.
                          For example, "path/to/artifact". If unspecified, the artifact root URI
                          for the currently active run will be returned.
    :return: An *absolute* URI referring to the specified artifact or the currently active run's
             artifact root. For example, if an artifact path is provided and the currently active
             run uses an S3-backed store, this may be a URI of the form
             ``s3://<bucket_name>/path/to/artifact/root/path/to/artifact``. If an artifact path
             is not provided and the currently active run uses an S3-backed store, this may be a
             URI of the form ``s3://<bucket_name>/path/to/artifact/root``.

    .. code-block:: python
        :caption: Example

        import mlflow

        features = "rooms, zipcode, median_price, school_rating, transport"
        with open("features.txt", 'w') as f:
            f.write(features)

        # Log the artifact in a directory "features" under the root artifact_uri/features
        with mlflow.start_run():
            mlflow.log_artifact("features.txt", artifact_path="features")

            # Fetch the artifact uri root directory
            artifact_uri = mlflow.get_artifact_uri()
            print("Artifact uri: {}".format(artifact_uri))

            # Fetch a specific artifact uri
            artifact_uri = mlflow.get_artifact_uri(artifact_path="features/features.txt")
            print("Artifact uri: {}".format(artifact_uri))

    .. code-block:: text
        :caption: Output

        Artifact uri: file:///.../0/a46a80f1c9644bd8f4e5dd5553fffce/artifacts
        Artifact uri: file:///.../0/a46a80f1c9644bd8f4e5dd5553fffce/artifacts/features/features.txt
    """
    active_run_id = _get_or_start_run().info.run_id
    return artifact_utils.get_artifact_uri(run_id=active_run_id, artifact_path=artifact_path)
Esempio n. 12
0
def get_artifact_uri(artifact_path=None):
    """
    Get the absolute URI of the specified artifact in the currently active run.
    If ``artifact_path`` is not specified, the artifact root URI of the currently
    active run is returned; calls to ``log_artifact`` and ``log_artifacts`` write
    artifact(s) to subdirectories of the artifact root URI.

    The run is obtained through ``_get_or_start_run()``.

    :param artifact_path: The run-relative artifact path for which to obtain an absolute URI.
                          For example, "path/to/artifact". If unspecified, the artifact root URI
                          for the currently active run will be returned.
    :return: An *absolute* URI referring to the specified artifact or the currently active run's
             artifact root. For example, if an artifact path is provided and the currently active
             run uses an S3-backed store, this may be a URI of the form
             ``s3://<bucket_name>/path/to/artifact/root/path/to/artifact``. If an artifact path
             is not provided and the currently active run uses an S3-backed store, this may be a
             URI of the form ``s3://<bucket_name>/path/to/artifact/root``.
    """
    active_run_id = _get_or_start_run().info.run_id
    return artifact_utils.get_artifact_uri(run_id=active_run_id, artifact_path=artifact_path)
def main():
    """
    Download the artifacts of the latest "staging" version of a registered model.

    Command-line arguments (both required):

    * ``-o`` / ``--output_local_path``: directory the artifacts are written to.
    * ``-m`` / ``--model_name``: name of the model in the Model Registry.

    The registry is reached through the Databricks CLI profile named "registry".

    :raises RuntimeError: If the registry returns no version of the model for the
                          "staging" stage.
    """
    parser = argparse.ArgumentParser(
        description="Execute python scripts in Databricks")
    parser.add_argument("-o",
                        "--output_local_path",
                        help="Output path where the artifacts will be written",
                        required=True)
    parser.add_argument("-m",
                        "--model_name",
                        help="Model Registry Name",
                        required=True)
    args = parser.parse_args()

    model_name = args.model_name
    output_local_path = args.output_local_path

    cli_profile_name = "registry"
    # TODO: Document that we assume that the registry profile will be created in the local machine:
    # dbutils.fs.put(f"file:///root/.databrickscfg", f"[{cli_profile_name}]\nhost={shard}\ntoken={token}",
    #                overwrite=True)

    TRACKING_URI = f"databricks://{cli_profile_name}"
    print(f"TRACKING_URI: {TRACKING_URI}")
    artifact_path = 'model'
    from mlflow.tracking import MlflowClient
    remote_client = MlflowClient(tracking_uri=TRACKING_URI)
    mlflow.set_tracking_uri(TRACKING_URI)
    latest_model = remote_client.get_latest_versions(name=model_name,
                                                     stages=["staging"])
    print(f"Latest Model: {latest_model}")
    # Indexing an empty result would surface as a bare IndexError; fail with a
    # message that names the model and stage instead.
    if not latest_model:
        raise RuntimeError(
            f"No version of model '{model_name}' found in stage 'staging'")

    run_id = latest_model[0].run_id
    artifact_uri = artifact_utils.get_artifact_uri(run_id)
    print(f"artifact_uri: {artifact_uri}")
    model_uri = f"runs:/{run_id}/{artifact_path}"
    print(f"model_uri: {model_uri}")

    print(f"Downloading model artifacts to : {output_local_path}")
    remote_client.download_artifacts(run_id=run_id,
                                     path=artifact_path,
                                     dst_path=output_local_path)
Esempio n. 14
0
    def evaluate(self,
                 model,
                 model_type,
                 dataset,
                 run_id,
                 evaluator_config=None,
                 **kwargs) -> EvaluationResult:
        """
        Score ``model`` on ``dataset`` and log the resulting metrics to ``run_id``.

        For "classifier" the accuracy score is logged and the confusion matrix is
        logged as a CSV text artifact; for "regressor" the mean absolute and mean
        squared errors are logged and no artifacts are produced.

        :param model: Object exposing ``predict``.
        :param model_type: Either "classifier" or "regressor".
        :param dataset: Dataset providing ``_extract_features_and_labels()`` and ``name``.
        :param run_id: ID of the run that receives the metrics and artifacts.
        :param evaluator_config: Accepted but not used here.
        :param kwargs: Accepted but not used here.
        :return: An ``EvaluationResult`` holding the metrics and artifacts.
        :raises ValueError: If ``model_type`` is neither "classifier" nor "regressor".
        """
        client = mlflow.tracking.MlflowClient()
        X, y = dataset._extract_features_and_labels()
        y_pred = model.predict(X)

        if model_type == "regressor":
            mae = sk_metrics.mean_absolute_error(y, y_pred)
            mse = sk_metrics.mean_squared_error(y, y_pred)
            metrics = EvaluationMetrics(mean_absolute_error=mae, mean_squared_error=mse)
            self._log_metrics(run_id, metrics, dataset.name)
            return EvaluationResult(metrics=metrics, artifacts={})

        if model_type != "classifier":
            raise ValueError(f"Unsupported model type {model_type}")

        metrics = EvaluationMetrics(accuracy_score=sk_metrics.accuracy_score(y, y_pred))
        self._log_metrics(run_id, metrics, dataset.name)

        matrix = sk_metrics.confusion_matrix(y, y_pred)
        matrix_file = f"confusion_matrix_on_{dataset.name}.csv"
        matrix_artifact = Array2DEvaluationArtifact(
            uri=get_artifact_uri(run_id, matrix_file),
            content=matrix,
        )
        # Serialize into memory so the CSV text can be logged without a temp file.
        csv_buffer = io.StringIO()
        matrix_artifact.save(csv_buffer)
        client.log_text(run_id, csv_buffer.getvalue(), matrix_file)

        return EvaluationResult(metrics=metrics, artifacts={matrix_file: matrix_artifact})
Esempio n. 15
0
def test_classifier_evaluate(multiclass_logistic_regressor_model_uri,
                             iris_dataset):
    """
    Run ``evaluate`` with the "dummy_evaluator" on the iris classifier and check the
    returned, logged, saved and reloaded metrics and confusion-matrix artifacts
    (CSV and image).
    """
    y_true = iris_dataset.labels_data
    classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri)
    y_pred = classifier_model.predict(iris_dataset.features_data)

    expected_accuracy = accuracy_score(y_true, y_pred)
    expected_metrics = {"accuracy_score": expected_accuracy}
    expected_saved_metrics = {"accuracy_score_on_iris_dataset": expected_accuracy}

    expected_matrix = confusion_matrix(y_true, y_pred)
    # Round-trip the reference figure through an in-memory buffer so it can be
    # compared against the image artifact.
    reference_figure = sklearn.metrics.ConfusionMatrixDisplay.from_predictions(
        y_true, y_pred).figure_
    figure_buffer = io.BytesIO()
    reference_figure.savefig(figure_buffer)
    figure_buffer.seek(0)
    expected_image = Image.open(figure_buffer)

    with mlflow.start_run() as run:
        eval_result = evaluate(
            classifier_model,
            iris_dataset._constructor_args["data"],
            model_type="classifier",
            targets=iris_dataset._constructor_args["targets"],
            dataset_name=iris_dataset.name,
            evaluators="dummy_evaluator",
        )
    run_id = run.info.run_id

    csv_key = "confusion_matrix_on_iris_dataset"
    csv_file = csv_key + ".csv"
    saved_csv_path = get_local_artifact_path(run_id, csv_file)

    png_key = "confusion_matrix_image_on_iris_dataset"
    png_file = png_key + ".png"
    saved_png_path = get_local_artifact_path(run_id, png_key) + ".png"

    # What was logged to the run.
    _, saved_metrics, _, saved_artifacts = get_run_data(run_id)
    assert saved_metrics == expected_saved_metrics
    assert set(saved_artifacts) == {csv_file, png_file}

    # What evaluate() returned.
    assert eval_result.metrics == expected_metrics

    csv_artifact = eval_result.artifacts[csv_key]
    assert np.array_equal(csv_artifact.content, expected_matrix)
    assert csv_artifact.uri == get_artifact_uri(run_id, csv_file)
    assert np.array_equal(csv_artifact._load(saved_csv_path), expected_matrix)

    image_artifact = eval_result.artifacts[png_key]
    returned_image_diff = ImageChops.difference(image_artifact.content, expected_image)
    assert returned_image_diff.getbbox() is None
    assert image_artifact.uri == get_artifact_uri(run_id, png_file)
    saved_image_diff = ImageChops.difference(
        image_artifact._load(saved_png_path), expected_image)
    assert saved_image_diff.getbbox() is None

    # Saving the result to disk and loading it back.
    with TempDir() as temp_dir:
        temp_dir_path = temp_dir.path()
        eval_result.save(temp_dir_path)

        with open(temp_dir.path("metrics.json"), "r") as fp:
            assert json.load(fp) == eval_result.metrics

        with open(temp_dir.path("artifacts_metadata.json"), "r") as fp:
            metadata = json.load(fp)
            assert "confusion_matrix_on_iris_dataset" in metadata
            assert metadata["confusion_matrix_on_iris_dataset"] == {
                "uri": csv_artifact.uri,
                "class_name": "mlflow_test_plugin.dummy_evaluator.Array2DEvaluationArtifact",
            }

            assert "confusion_matrix_image_on_iris_dataset" in metadata
            assert metadata["confusion_matrix_image_on_iris_dataset"] == {
                "uri": image_artifact.uri,
                "class_name": "mlflow.models.evaluation.artifacts.ImageEvaluationArtifact",
            }

        assert set(os.listdir(temp_dir.path("artifacts"))) == {
            "confusion_matrix_on_iris_dataset.csv",
            "confusion_matrix_image_on_iris_dataset.png",
        }

        loaded_result = EvaluationResult.load(temp_dir_path)
        assert loaded_result.metrics == eval_result.metrics

        loaded_csv_artifact = loaded_result.artifacts[csv_key]
        assert csv_artifact.uri == loaded_csv_artifact.uri
        assert np.array_equal(csv_artifact.content, loaded_csv_artifact.content)

        loaded_image_artifact = loaded_result.artifacts[png_key]
        assert image_artifact.uri == loaded_image_artifact.uri
        loaded_image_diff = ImageChops.difference(
            image_artifact.content, loaded_image_artifact.content)
        assert loaded_image_diff.getbbox() is None

        # Artifacts rebuilt from the URI alone must load the same content.
        rebuilt_csv_artifact = Array2DEvaluationArtifact(uri=csv_artifact.uri)
        rebuilt_csv_artifact._load()
        assert np.array_equal(csv_artifact.content, rebuilt_csv_artifact.content)

        rebuilt_image_artifact = ImageEvaluationArtifact(uri=image_artifact.uri)
        rebuilt_image_artifact._load()
        assert np.array_equal(image_artifact.content, rebuilt_image_artifact.content)
Esempio n. 16
0
 def download_tmp_artifacts(self, run_id, relative_path):
     """
     Return the value of ``artifact_utils.get_artifact_uri`` for ``relative_path``
     within run ``run_id``.
     """
     resolved_uri = artifact_utils.get_artifact_uri(run_id=run_id,
                                                    artifact_path=relative_path)
     return resolved_uri
    artifact_path = artifact_path or ''
    for (dirpath, _, filenames) in os.walk(local_dir):
        artifact_subdir = artifact_path
        if dirpath != local_dir:
            rel_path = os.path.relpath(dirpath, local_dir)
            rel_path = relative_path_to_artifact_path(rel_path)
            artifact_subdir = posixpath.join(artifact_path, rel_path)
        for name in filenames:
            file_path = os.path.join(dirpath, name)
            _copy_artifact(file_path, artifact_uri, artifact_subdir)


# COMMAND ----------

# Look up the artifact URI of `run_id` and hand it to `copy_artifacts` together
# with `artifact_path`.
# NOTE(review): `run_id`, `artifact_path` and `copy_artifacts` are defined outside
# this cell — confirm they are set before it runs.
from mlflow.tracking import artifact_utils
artifact_uri = artifact_utils.get_artifact_uri(run_id)

copy_artifacts(artifact_uri, artifact_path)

# COMMAND ----------

# MAGIC %md ##### Create an MlflowClient with Tracking URI set to the registry workspace

# COMMAND ----------

# NOTE(review): `TRACKING_URI` is defined outside this cell — confirm it is set
# before this cell runs.
from mlflow.tracking import MlflowClient
remote_client = MlflowClient(tracking_uri=TRACKING_URI)

# COMMAND ----------

# MAGIC %md ##### Call register_model() using the remote client, using the new DBFS location as “source”.
Esempio n. 18
0
def test_classifier_evaluate(multiclass_logistic_regressor_model_uri,
                             iris_dataset):
    """
    Run ``evaluate`` with the "dummy_evaluator" on the iris classifier and check the
    returned, logged, saved and reloaded metrics and the confusion-matrix CSV
    artifact.
    """
    y_true = iris_dataset.labels_data
    classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri)
    y_pred = classifier_model.predict(iris_dataset.features_data)

    expected_accuracy = accuracy_score(y_true, y_pred)
    expected_metrics = {"accuracy_score": expected_accuracy}
    expected_saved_metrics = {"accuracy_score_on_iris_dataset": expected_accuracy}
    expected_matrix = confusion_matrix(y_true, y_pred)

    with mlflow.start_run() as run:
        eval_result = evaluate(
            classifier_model,
            iris_dataset._constructor_args["data"],
            model_type="classifier",
            targets=iris_dataset._constructor_args["targets"],
            dataset_name=iris_dataset.name,
            evaluators="dummy_evaluator",
        )
    run_id = run.info.run_id

    artifact_name = "confusion_matrix_on_iris_dataset.csv"
    saved_artifact_path = get_local_artifact_path(run_id, artifact_name)

    # What was logged to the run.
    _, saved_metrics, _, saved_artifacts = get_run_data(run_id)
    assert saved_metrics == expected_saved_metrics
    assert saved_artifacts == [artifact_name]

    # What evaluate() returned.
    assert eval_result.metrics == expected_metrics
    matrix_artifact = eval_result.artifacts[artifact_name]
    assert np.array_equal(matrix_artifact.content, expected_matrix)
    assert matrix_artifact.uri == get_artifact_uri(run_id, artifact_name)
    assert np.array_equal(matrix_artifact._load(saved_artifact_path), expected_matrix)

    # Saving the result to disk and loading it back.
    with TempDir() as temp_dir:
        temp_dir_path = temp_dir.path()
        eval_result.save(temp_dir_path)

        with open(temp_dir.path("metrics.json"), "r") as fp:
            assert json.load(fp) == eval_result.metrics

        expected_metadata = {
            "confusion_matrix_on_iris_dataset.csv": {
                "uri": matrix_artifact.uri,
                "class_name": "mlflow_test_plugin.dummy_evaluator.Array2DEvaluationArtifact",
            }
        }
        with open(temp_dir.path("artifacts_metadata.json"), "r") as fp:
            assert json.load(fp) == expected_metadata

        assert os.listdir(temp_dir.path("artifacts")) == [
            "confusion_matrix_on_iris_dataset.csv"
        ]

        loaded_result = EvaluationResult.load(temp_dir_path)
        assert loaded_result.metrics == eval_result.metrics
        loaded_matrix_artifact = loaded_result.artifacts[artifact_name]
        assert matrix_artifact.uri == loaded_matrix_artifact.uri
        assert np.array_equal(matrix_artifact.content, loaded_matrix_artifact.content)

        # An artifact rebuilt from the URI alone must load the same content.
        rebuilt_artifact = Array2DEvaluationArtifact(uri=matrix_artifact.uri)
        rebuilt_artifact._load()
        assert np.array_equal(matrix_artifact.content, rebuilt_artifact.content)
Esempio n. 19
0
    def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config,
                 **kwargs) -> EvaluationResult:
        """
        Score ``model`` on ``dataset`` and log the resulting metrics to ``run_id``.

        For "classifier" the accuracy score is logged and the confusion matrix is
        logged twice: as CSV text and as an image. For "regressor" the mean absolute
        and mean squared errors are logged and no artifacts are produced.

        :param model: Object exposing ``predict``.
        :param model_type: Either "classifier" or "regressor".
        :param dataset: Dataset providing ``features_data``, ``labels_data`` and ``name``.
        :param run_id: ID of the run that receives the metrics and artifacts.
        :param evaluator_config: Accepted but not used here.
        :param kwargs: Accepted but not used here.
        :return: An ``EvaluationResult`` holding the metrics dict and artifacts dict.
        :raises ValueError: If ``model_type`` is neither "classifier" nor "regressor".
        """
        client = mlflow.tracking.MlflowClient()
        X = dataset.features_data
        y = dataset.labels_data
        y_pred = model.predict(X)

        if model_type == "regressor":
            mae = sk_metrics.mean_absolute_error(y, y_pred)
            mse = sk_metrics.mean_squared_error(y, y_pred)
            metrics = {"mean_absolute_error": mae, "mean_squared_error": mse}
            self._log_metrics(run_id, metrics, dataset.name)
            return EvaluationResult(metrics=metrics, artifacts={})

        if model_type != "classifier":
            raise ValueError(f"Unsupported model type {model_type}")

        metrics = {"accuracy_score": sk_metrics.accuracy_score(y, y_pred)}
        self._log_metrics(run_id, metrics, dataset.name)

        # Confusion matrix as CSV text.
        matrix = sk_metrics.confusion_matrix(y, y_pred)
        csv_key = f"confusion_matrix_on_{dataset.name}"
        csv_artifact = Array2DEvaluationArtifact(
            uri=get_artifact_uri(run_id, csv_key + ".csv"),
            content=matrix,
        )
        csv_buffer = io.StringIO()
        csv_artifact._save(csv_buffer)
        client.log_text(run_id, csv_buffer.getvalue(), csv_key + ".csv")

        # Confusion matrix as an image, rendered through an in-memory buffer.
        figure = sk_metrics.ConfusionMatrixDisplay.from_predictions(y, y_pred).figure_
        image_buffer = io.BytesIO()
        figure.savefig(image_buffer)
        image_buffer.seek(0)
        matrix_image = Image.open(image_buffer)

        png_key = f"confusion_matrix_image_on_{dataset.name}"
        image_artifact = ImageEvaluationArtifact(
            uri=get_artifact_uri(run_id, png_key + ".png"),
            content=matrix_image,
        )
        image_artifact._save(png_key + ".png")
        client.log_image(run_id, matrix_image, png_key + ".png")

        artifacts = {csv_key: csv_artifact, png_key: image_artifact}
        return EvaluationResult(metrics=metrics, artifacts=artifacts)
Esempio n. 20
0
def get_local_artifact_path(run_id, artifact_path):
    """
    Return the artifact URI of ``artifact_path`` in run ``run_id`` with every
    occurrence of "file://" removed.
    """
    artifact_uri = get_artifact_uri(run_id, artifact_path)
    return artifact_uri.replace("file://", "")