def log_model_without_starting_new_run():
    """
    Perform the same operations as `log_model`, but against the run identified
    by ``_AUTOLOG_RUN_ID`` instead of starting a new run.

    The model is saved into a temporary directory, uploaded to the run under the
    ``"model"`` artifact path, and then recorded as a logged model on the run.
    """
    with TempDir() as scratch_dir:
        model_subdir = "model"
        staged_model_path = scratch_dir.path("model")
        model_metadata = Model(artifact_path=model_subdir, run_id=_AUTOLOG_RUN_ID)

        save_model(
            path=staged_model_path,
            mlflow_model=model_metadata,
            tf_saved_model_dir=serialized.decode("utf-8"),
            tf_meta_graph_tags=[tag_constants.SERVING],
            tf_signature_def_key="predict",
        )

        tracking_client = MlflowClient()
        tracking_client.log_artifacts(_AUTOLOG_RUN_ID, staged_model_path, model_subdir)
        try:
            tracking_client._record_logged_model(_AUTOLOG_RUN_ID, model_metadata)
        except MlflowException:
            # MLflow exceptions are deliberately swallowed here to stay backwards
            # compatible with older tracking servers; only a warning is emitted
            # for now.
            _logger.warning(
                _LOG_MODEL_METADATA_WARNING_TEMPLATE,
                get_artifact_uri(_AUTOLOG_RUN_ID),
            )
def get_underlying_uri(runs_uri):
    """
    Resolve a ``runs:`` URI to the artifact URI it refers to.

    :param runs_uri: URI accepted by ``RunsArtifactRepository.parse_runs_uri``.
    :return: The URI returned by ``get_artifact_uri`` for the parsed run ID and
             artifact path.
    """
    from mlflow.tracking.artifact_utils import get_artifact_uri

    (parsed_run_id, parsed_artifact_path) = RunsArtifactRepository.parse_runs_uri(runs_uri)
    resolved_uri = get_artifact_uri(parsed_run_id, parsed_artifact_path)

    # The resolved URI must not itself be a runs URI, otherwise resolution
    # would never terminate (avoid an infinite loop).
    still_runs_uri = RunsArtifactRepository.is_runs_uri(resolved_uri)
    assert not still_runs_uri
    return resolved_uri
def __init__(self, artifact_uri):
    """
    Set up the repository by resolving ``artifact_uri`` to its underlying
    artifact location and delegating to the repository for that location.

    :param artifact_uri: URI accepted by ``RunsArtifactRepository.parse_runs_uri``.
    """
    from mlflow.tracking.artifact_utils import get_artifact_uri
    from mlflow.store.artifact_repository_registry import get_artifact_repository

    (parsed_run_id, parsed_artifact_path) = RunsArtifactRepository.parse_runs_uri(artifact_uri)
    underlying_uri = get_artifact_uri(parsed_run_id, parsed_artifact_path)

    # A resolved URI that still has the "runs" scheme would lead back here
    # (avoid an infinite loop).
    resolved_scheme = urllib.parse.urlparse(underlying_uri).scheme
    assert resolved_scheme != "runs"

    super(RunsArtifactRepository, self).__init__(artifact_uri)
    # All artifact operations are served by the repository of the resolved URI.
    self.repo = get_artifact_repository(underlying_uri)
def get_underlying_uri(runs_uri):
    """
    Resolve a ``runs:`` URI to the artifact URI it refers to, carrying over the
    Databricks profile information derived from ``runs_uri``.

    :param runs_uri: URI accepted by ``RunsArtifactRepository.parse_runs_uri``.
    :return: The resolved artifact URI after
             ``add_databricks_profile_info_to_artifact_uri`` has been applied.
    """
    from mlflow.tracking.artifact_utils import get_artifact_uri

    (parsed_run_id, parsed_artifact_path) = RunsArtifactRepository.parse_runs_uri(runs_uri)
    profile_tracking_uri = get_databricks_profile_uri_from_artifact_uri(runs_uri)
    resolved_uri = get_artifact_uri(parsed_run_id, parsed_artifact_path, profile_tracking_uri)

    # The resolved URI must not itself be a runs URI, otherwise resolution
    # would never terminate (avoid an infinite loop).
    still_runs_uri = RunsArtifactRepository.is_runs_uri(resolved_uri)
    assert not still_runs_uri

    return add_databricks_profile_info_to_artifact_uri(resolved_uri, profile_tracking_uri)
def svm_model_uri():
    """
    Fit a ``LinearSVC`` on the data from ``get_breast_cancer_dataset()``, log it
    to a new MLflow run as ``"svm_model"`` and return its artifact URI.
    """
    artifact_name = "svm_model"
    features, labels = get_breast_cancer_dataset()

    classifier = sklearn.svm.LinearSVC()
    classifier.fit(features, labels)

    with mlflow.start_run() as active_run:
        mlflow.sklearn.log_model(classifier, artifact_name)
        model_uri = get_artifact_uri(active_run.info.run_id, artifact_name)

    return model_uri
def binary_logistic_regressor_model_uri():
    """
    Fit a ``LogisticRegression`` on the data from ``get_breast_cancer_dataset()``,
    log it to a new MLflow run as ``"bin_clf_model"`` and return its artifact URI.
    """
    artifact_name = "bin_clf_model"
    features, labels = get_breast_cancer_dataset()

    classifier = sklearn.linear_model.LogisticRegression()
    classifier.fit(features, labels)

    with mlflow.start_run() as active_run:
        mlflow.sklearn.log_model(classifier, artifact_name)
        model_uri = get_artifact_uri(active_run.info.run_id, artifact_name)

    return model_uri
def multiclass_logistic_regressor_model_uri():
    """
    Fit a ``LogisticRegression(max_iter=2)`` on the data from ``get_iris()``,
    log it to a new MLflow run as ``"clf_model"`` and return its artifact URI.
    """
    artifact_name = "clf_model"
    features, labels = get_iris()

    classifier = sklearn.linear_model.LogisticRegression(max_iter=2)
    classifier.fit(features, labels)

    with mlflow.start_run() as active_run:
        mlflow.sklearn.log_model(classifier, artifact_name)
        model_uri = get_artifact_uri(active_run.info.run_id, artifact_name)

    return model_uri
def spark_linear_regressor_model_uri():
    """
    Fit a ``SparkLinearRegression`` on the data from
    ``get_diabetes_spark_dataset()``, log the fitted model to a new MLflow run
    as ``"spark_reg_model"`` and return its artifact URI.
    """
    artifact_name = "spark_reg_model"
    training_df = get_diabetes_spark_dataset()

    estimator = SparkLinearRegression()
    # Unlike the sklearn fixtures, the object returned by ``fit`` is what gets logged.
    fitted_model = estimator.fit(training_df)

    with mlflow.start_run() as active_run:
        mlflow.spark.log_model(fitted_model, artifact_name)
        model_uri = get_artifact_uri(active_run.info.run_id, artifact_name)

    return model_uri
def linear_regressor_model_uri():
    """
    Fit a ``LinearRegression`` on the data from ``get_diabetes_dataset()``, log
    it to a new MLflow run as ``"reg_model"`` and return its artifact URI.
    """
    artifact_name = "reg_model"
    features, targets = get_diabetes_dataset()

    regressor = sklearn.linear_model.LinearRegression()
    regressor.fit(features, targets)

    with mlflow.start_run() as active_run:
        mlflow.sklearn.log_model(regressor, artifact_name)
        model_uri = get_artifact_uri(active_run.info.run_id, artifact_name)

    return model_uri
def download_artifacts(self, run_id, relative_path, dst_path=None):
    """
    Return a location for the requested artifact without transferring any data.

    If ``dst_path`` is given and ``dst_path/relative_path`` already exists on
    the local filesystem, that local path is returned. Otherwise the artifact
    URI computed by ``artifact_utils.get_artifact_uri`` is returned.

    :param run_id: ID of the run that owns the artifact.
    :param relative_path: Run-relative path of the artifact.
    :param dst_path: Optional local directory to look in for an existing copy.
                     When ``None`` (the default) the local check is skipped.
    :return: The existing local path, or the artifact URI.
    """
    # ``os.path.join(None, ...)`` raises TypeError, so the advertised default
    # of ``dst_path=None`` must bypass the local lookup entirely.
    if dst_path is not None:
        local_location = os.path.join(dst_path, relative_path)
        if os.path.exists(local_location):  # TODO check file digest or something similar??
            logger.debug(
                f"Skipped downloading file because a file {local_location} with the same name already exists."
            )
            return local_location
    return artifact_utils.get_artifact_uri(run_id=run_id, artifact_path=relative_path)
def get_artifact_uri(artifact_path=None):
    """
    Get the absolute URI of the specified artifact in the currently active run.

    When ``artifact_path`` is not specified, the artifact root URI of the currently active
    run is returned; calls to ``log_artifact`` and ``log_artifacts`` write artifact(s) to
    subdirectories of the artifact root URI.

    If no run is active, this method will create a new active run.

    :param artifact_path: The run-relative artifact path for which to obtain an absolute URI.
                          For example, "path/to/artifact". If unspecified, the artifact root
                          URI for the currently active run will be returned.
    :return: An *absolute* URI referring to the specified artifact or the currently active
             run's artifact root. For example, if an artifact path is provided and the
             currently active run uses an S3-backed store, this may be a URI of the form
             ``s3://<bucket_name>/path/to/artifact/root/path/to/artifact``. If an artifact
             path is not provided and the currently active run uses an S3-backed store, this
             may be a URI of the form ``s3://<bucket_name>/path/to/artifact/root``.

    .. code-block:: python
        :caption: Example

        import mlflow

        features = "rooms, zipcode, median_price, school_rating, transport"
        with open("features.txt", 'w') as f:
            f.write(features)

        # Log the artifact in a directory "features" under the root artifact_uri/features
        with mlflow.start_run():
            mlflow.log_artifact("features.txt", artifact_path="features")

            # Fetch the artifact uri root directory
            artifact_uri = mlflow.get_artifact_uri()
            print("Artifact uri: {}".format(artifact_uri))

            # Fetch a specific artifact uri
            artifact_uri = mlflow.get_artifact_uri(artifact_path="features/features.txt")
            print("Artifact uri: {}".format(artifact_uri))

    .. code-block:: text
        :caption: Output

        Artifact uri: file:///.../0/a46a80f1c9644bd8f4e5dd5553fffce/artifacts
        Artifact uri: file:///.../0/a46a80f1c9644bd8f4e5dd5553fffce/artifacts/features/features.txt
    """
    active_run_id = _get_or_start_run().info.run_id
    return artifact_utils.get_artifact_uri(run_id=active_run_id, artifact_path=artifact_path)
def get_artifact_uri(artifact_path=None):
    """
    Get the absolute URI of the specified artifact in the currently active run.

    When ``artifact_path`` is not specified, the artifact root URI of the currently active
    run is returned; calls to ``log_artifact`` and ``log_artifacts`` write artifact(s) to
    subdirectories of the artifact root URI.

    :param artifact_path: The run-relative artifact path for which to obtain an absolute URI.
                          For example, "path/to/artifact". If unspecified, the artifact root
                          URI for the currently active run will be returned.
    :return: An *absolute* URI referring to the specified artifact or the currently active
             run's artifact root. For example, if an artifact path is provided and the
             currently active run uses an S3-backed store, this may be a URI of the form
             ``s3://<bucket_name>/path/to/artifact/root/path/to/artifact``. If an artifact
             path is not provided and the currently active run uses an S3-backed store, this
             may be a URI of the form ``s3://<bucket_name>/path/to/artifact/root``.
    """
    # The run is obtained through ``_get_or_start_run()`` rather than passed in.
    active_run_id = _get_or_start_run().info.run_id
    return artifact_utils.get_artifact_uri(run_id=active_run_id, artifact_path=artifact_path)
def main():
    """
    Command-line entry point: look up the latest "staging" version of a
    registered model through the ``registry`` Databricks profile and download
    its ``model`` artifacts to a local directory.

    Required arguments: ``-o/--output_local_path`` and ``-m/--model_name``.
    """
    arg_parser = argparse.ArgumentParser(description="Execute python scripts in Databricks")
    arg_parser.add_argument(
        "-o",
        "--output_local_path",
        help="Output path where the artifacts will be written",
        required=True,
    )
    arg_parser.add_argument(
        "-m",
        "--model_name",
        help="Model Registry Name",
        required=True,
    )
    cli_args = arg_parser.parse_args()
    registered_model_name = cli_args.model_name
    download_dir = cli_args.output_local_path

    # TODO: Document that we assume that the registry profile will be created in the
    # local machine, e.g.:
    #   dbutils.fs.put(f"file:///root/.databrickscfg",
    #                  f"[{cli_profile_name}]\nhost={shard}\ntoken={token}", overwrite=True)
    profile_name = "registry"
    tracking_uri = f"databricks://{profile_name}"
    print(f"TRACKING_URI: {tracking_uri}")

    model_artifact_path = "model"

    from mlflow.tracking import MlflowClient

    registry_client = MlflowClient(tracking_uri=tracking_uri)
    mlflow.set_tracking_uri(tracking_uri)

    staging_versions = registry_client.get_latest_versions(
        name=registered_model_name, stages=["staging"]
    )
    print(f"Latest Model: {staging_versions}")

    # Only the first returned version is used.
    source_run_id = staging_versions[0].run_id
    run_artifact_root = artifact_utils.get_artifact_uri(source_run_id)
    print(f"artifact_uri: {run_artifact_root}")

    runs_model_uri = f"runs:/{source_run_id}/{model_artifact_path}"
    print(f"model_uri: {runs_model_uri}")

    print(f"Downloading model artifacts to : {download_dir}")
    registry_client.download_artifacts(
        run_id=source_run_id,
        path=model_artifact_path,
        dst_path=download_dir,
    )
def evaluate(self, model, model_type, dataset, run_id, evaluator_config=None, **kwargs) -> EvaluationResult:
    """
    Evaluate ``model`` on ``dataset`` and log the results to the run ``run_id``.

    For ``"classifier"`` the accuracy score is logged as a metric and the
    confusion matrix is logged as a CSV text artifact. For ``"regressor"`` the
    mean absolute error and mean squared error are logged as metrics and no
    artifacts are produced.

    :param model: Object exposing ``predict``.
    :param model_type: Either ``"classifier"`` or ``"regressor"``.
    :param dataset: Object exposing ``_extract_features_and_labels()`` and ``name``.
    :param run_id: ID of the run that receives metrics and artifacts.
    :param evaluator_config: Not used by this implementation.
    :param kwargs: Not used by this implementation.
    :return: ``EvaluationResult`` holding the computed metrics and artifacts.
    :raises ValueError: If ``model_type`` is neither supported value.
    """
    tracking_client = mlflow.tracking.MlflowClient()
    features, labels = dataset._extract_features_and_labels()
    predictions = model.predict(features)

    if model_type == "regressor":
        mae = sk_metrics.mean_absolute_error(labels, predictions)
        mse = sk_metrics.mean_squared_error(labels, predictions)
        metrics = EvaluationMetrics(mean_absolute_error=mae, mean_squared_error=mse)
        self._log_metrics(run_id, metrics, dataset.name)
        return EvaluationResult(metrics=metrics, artifacts={})

    if model_type == "classifier":
        accuracy = sk_metrics.accuracy_score(labels, predictions)
        metrics = EvaluationMetrics(accuracy_score=accuracy)
        self._log_metrics(run_id, metrics, dataset.name)

        matrix = sk_metrics.confusion_matrix(labels, predictions)
        csv_artifact_name = f"confusion_matrix_on_{dataset.name}.csv"
        matrix_artifact = Array2DEvaluationArtifact(
            uri=get_artifact_uri(run_id, csv_artifact_name),
            content=matrix,
        )

        # Serialize the matrix into memory, then upload the text to the run.
        csv_buffer = io.StringIO()
        matrix_artifact.save(csv_buffer)
        tracking_client.log_text(run_id, csv_buffer.getvalue(), csv_artifact_name)

        return EvaluationResult(
            metrics=metrics,
            artifacts={csv_artifact_name: matrix_artifact},
        )

    # The prediction above has already run by the time an unknown type is rejected.
    raise ValueError(f"Unsupported model type {model_type}")
def test_classifier_evaluate(multiclass_logistic_regressor_model_uri, iris_dataset):
    """
    Check ``evaluate`` with the ``dummy_evaluator`` on a classifier: logged
    metrics, the CSV and PNG confusion-matrix artifacts, and the save / load
    round trip of the returned evaluation result.
    """

    def images_match(left, right):
        # Two images are treated as identical when their difference has no bounding box.
        return ImageChops.difference(left, right).getbbox() is None

    labels = iris_dataset.labels_data
    pyfunc_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri)
    predictions = pyfunc_model.predict(iris_dataset.features_data)

    accuracy = accuracy_score(labels, predictions)
    expected_metrics = {"accuracy_score": accuracy}
    expected_saved_metrics = {"accuracy_score_on_iris_dataset": accuracy}

    # Build the expected artifacts independently of the evaluator under test.
    expected_matrix = confusion_matrix(labels, predictions)
    figure = sklearn.metrics.ConfusionMatrixDisplay.from_predictions(labels, predictions).figure_
    png_buffer = io.BytesIO()
    figure.savefig(png_buffer)
    png_buffer.seek(0)
    expected_image = Image.open(png_buffer)

    with mlflow.start_run() as run:
        eval_result = evaluate(
            pyfunc_model,
            iris_dataset._constructor_args["data"],
            model_type="classifier",
            targets=iris_dataset._constructor_args["targets"],
            dataset_name=iris_dataset.name,
            evaluators="dummy_evaluator",
        )

    run_id = run.info.run_id

    csv_artifact_name = "confusion_matrix_on_iris_dataset"
    csv_file_name = csv_artifact_name + ".csv"
    saved_csv_path = get_local_artifact_path(run_id, csv_file_name)

    png_artifact_name = "confusion_matrix_image_on_iris_dataset"
    png_file_name = png_artifact_name + ".png"
    # The extension is appended after resolving the local path, not before.
    saved_png_path = get_local_artifact_path(run_id, png_artifact_name) + ".png"

    # What was logged to the tracking store.
    _, saved_metrics, _, saved_artifacts = get_run_data(run_id)
    assert saved_metrics == expected_saved_metrics
    assert set(saved_artifacts) == {csv_file_name, png_file_name}

    # What was returned to the caller.
    assert eval_result.metrics == expected_metrics

    matrix_artifact = eval_result.artifacts[csv_artifact_name]
    assert np.array_equal(matrix_artifact.content, expected_matrix)
    assert matrix_artifact.uri == get_artifact_uri(run_id, csv_file_name)
    assert np.array_equal(matrix_artifact._load(saved_csv_path), expected_matrix)

    image_artifact = eval_result.artifacts[png_artifact_name]
    assert images_match(image_artifact.content, expected_image)
    assert image_artifact.uri == get_artifact_uri(run_id, png_file_name)
    assert images_match(image_artifact._load(saved_png_path), expected_image)

    with TempDir() as temp_dir:
        save_root = temp_dir.path()
        eval_result.save(save_root)

        with open(temp_dir.path("metrics.json"), "r") as fp:
            assert json.load(fp) == eval_result.metrics

        with open(temp_dir.path("artifacts_metadata.json"), "r") as fp:
            artifacts_metadata = json.load(fp)

        assert "confusion_matrix_on_iris_dataset" in artifacts_metadata
        assert artifacts_metadata["confusion_matrix_on_iris_dataset"] == {
            "uri": matrix_artifact.uri,
            "class_name": "mlflow_test_plugin.dummy_evaluator.Array2DEvaluationArtifact",
        }

        assert "confusion_matrix_image_on_iris_dataset" in artifacts_metadata
        assert artifacts_metadata["confusion_matrix_image_on_iris_dataset"] == {
            "uri": image_artifact.uri,
            "class_name": "mlflow.models.evaluation.artifacts.ImageEvaluationArtifact",
        }

        assert set(os.listdir(temp_dir.path("artifacts"))) == {
            "confusion_matrix_on_iris_dataset.csv",
            "confusion_matrix_image_on_iris_dataset.png",
        }

        # Round trip through EvaluationResult.load.
        reloaded_result = EvaluationResult.load(save_root)
        assert reloaded_result.metrics == eval_result.metrics

        reloaded_matrix_artifact = reloaded_result.artifacts[csv_artifact_name]
        assert matrix_artifact.uri == reloaded_matrix_artifact.uri
        assert np.array_equal(matrix_artifact.content, reloaded_matrix_artifact.content)

        reloaded_image_artifact = reloaded_result.artifacts[png_artifact_name]
        assert image_artifact.uri == reloaded_image_artifact.uri
        assert images_match(image_artifact.content, reloaded_image_artifact.content)

        # Artifacts constructed from the URI alone must load the same content.
        fresh_matrix_artifact = Array2DEvaluationArtifact(uri=matrix_artifact.uri)
        fresh_matrix_artifact._load()
        assert np.array_equal(matrix_artifact.content, fresh_matrix_artifact.content)

        fresh_image_artifact = ImageEvaluationArtifact(uri=image_artifact.uri)
        fresh_image_artifact._load()
        assert np.array_equal(image_artifact.content, fresh_image_artifact.content)
def download_tmp_artifacts(self, run_id, relative_path):
    """
    Return the artifact URI for ``relative_path`` within run ``run_id``.

    No data is transferred here; the URI from
    ``artifact_utils.get_artifact_uri`` is returned as-is.

    :param run_id: ID of the run that owns the artifact.
    :param relative_path: Run-relative path of the artifact.
    :return: The artifact URI.
    """
    artifact_location = artifact_utils.get_artifact_uri(
        run_id=run_id,
        artifact_path=relative_path,
    )
    return artifact_location
artifact_path = artifact_path or '' for (dirpath, _, filenames) in os.walk(local_dir): artifact_subdir = artifact_path if dirpath != local_dir: rel_path = os.path.relpath(dirpath, local_dir) rel_path = relative_path_to_artifact_path(rel_path) artifact_subdir = posixpath.join(artifact_path, rel_path) for name in filenames: file_path = os.path.join(dirpath, name) _copy_artifact(file_path, artifact_uri, artifact_subdir) # COMMAND ---------- from mlflow.tracking import artifact_utils artifact_uri = artifact_utils.get_artifact_uri(run_id) copy_artifacts(artifact_uri, artifact_path) # COMMAND ---------- # MAGIC %md ##### Create an MlflowClient with Tracking URI set to the registry workspace # COMMAND ---------- from mlflow.tracking import MlflowClient remote_client = MlflowClient(tracking_uri=TRACKING_URI) # COMMAND ---------- # MAGIC %md ##### Call register_model() using the remote client, using the new DBFS location as “source”.
def test_classifier_evaluate(multiclass_logistic_regressor_model_uri, iris_dataset):
    """
    Check ``evaluate`` with the ``dummy_evaluator`` on a classifier: logged
    metrics, the CSV confusion-matrix artifact, and the save / load round trip
    of the returned evaluation result.
    """
    labels = iris_dataset.labels_data
    pyfunc_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri)
    predictions = pyfunc_model.predict(iris_dataset.features_data)

    accuracy = accuracy_score(labels, predictions)
    expected_metrics = {"accuracy_score": accuracy}
    expected_saved_metrics = {"accuracy_score_on_iris_dataset": accuracy}
    expected_matrix = confusion_matrix(labels, predictions)

    with mlflow.start_run() as run:
        eval_result = evaluate(
            pyfunc_model,
            iris_dataset._constructor_args["data"],
            model_type="classifier",
            targets=iris_dataset._constructor_args["targets"],
            dataset_name=iris_dataset.name,
            evaluators="dummy_evaluator",
        )

    run_id = run.info.run_id
    artifact_name = "confusion_matrix_on_iris_dataset.csv"
    saved_artifact_path = get_local_artifact_path(run_id, artifact_name)

    # What was logged to the tracking store.
    _, saved_metrics, _, saved_artifacts = get_run_data(run_id)
    assert saved_metrics == expected_saved_metrics
    assert saved_artifacts == [artifact_name]

    # What was returned to the caller.
    assert eval_result.metrics == expected_metrics
    matrix_artifact = eval_result.artifacts[artifact_name]
    assert np.array_equal(matrix_artifact.content, expected_matrix)
    assert matrix_artifact.uri == get_artifact_uri(run_id, artifact_name)
    assert np.array_equal(matrix_artifact._load(saved_artifact_path), expected_matrix)

    with TempDir() as temp_dir:
        save_root = temp_dir.path()
        eval_result.save(save_root)

        with open(temp_dir.path("metrics.json"), "r") as fp:
            assert json.load(fp) == eval_result.metrics

        with open(temp_dir.path("artifacts_metadata.json"), "r") as fp:
            assert json.load(fp) == {
                "confusion_matrix_on_iris_dataset.csv": {
                    "uri": matrix_artifact.uri,
                    "class_name": "mlflow_test_plugin.dummy_evaluator.Array2DEvaluationArtifact",
                }
            }

        assert os.listdir(temp_dir.path("artifacts")) == ["confusion_matrix_on_iris_dataset.csv"]

        # Round trip through EvaluationResult.load.
        reloaded_result = EvaluationResult.load(save_root)
        assert reloaded_result.metrics == eval_result.metrics

        reloaded_matrix_artifact = reloaded_result.artifacts[artifact_name]
        assert matrix_artifact.uri == reloaded_matrix_artifact.uri
        assert np.array_equal(matrix_artifact.content, reloaded_matrix_artifact.content)

        # An artifact constructed from the URI alone must load the same content.
        fresh_matrix_artifact = Array2DEvaluationArtifact(uri=matrix_artifact.uri)
        fresh_matrix_artifact._load()
        assert np.array_equal(matrix_artifact.content, fresh_matrix_artifact.content)
def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs) -> EvaluationResult:
    """
    Evaluate ``model`` on ``dataset`` and log the results to the run ``run_id``.

    For ``"classifier"`` the accuracy score is logged as a metric, and the
    confusion matrix is logged twice: as a CSV text artifact and as a PNG image
    artifact. For ``"regressor"`` the mean absolute error and mean squared
    error are logged as metrics and no artifacts are produced.

    :param model: Object exposing ``predict``.
    :param model_type: Either ``"classifier"`` or ``"regressor"``.
    :param dataset: Object exposing ``features_data``, ``labels_data`` and ``name``.
    :param run_id: ID of the run that receives metrics and artifacts.
    :param evaluator_config: Not used by this implementation.
    :param kwargs: Not used by this implementation.
    :return: ``EvaluationResult`` holding the computed metrics and artifacts.
    :raises ValueError: If ``model_type`` is neither supported value.
    """
    tracking_client = mlflow.tracking.MlflowClient()
    features = dataset.features_data
    labels = dataset.labels_data
    predictions = model.predict(features)

    if model_type == "regressor":
        mae = sk_metrics.mean_absolute_error(labels, predictions)
        mse = sk_metrics.mean_squared_error(labels, predictions)
        metrics = {
            "mean_absolute_error": mae,
            "mean_squared_error": mse,
        }
        self._log_metrics(run_id, metrics, dataset.name)
        return EvaluationResult(metrics=metrics, artifacts={})

    if model_type == "classifier":
        metrics = {"accuracy_score": sk_metrics.accuracy_score(labels, predictions)}
        self._log_metrics(run_id, metrics, dataset.name)

        # --- confusion matrix as CSV text -------------------------------------
        matrix = sk_metrics.confusion_matrix(labels, predictions)
        csv_artifact_name = f"confusion_matrix_on_{dataset.name}"
        csv_file_name = csv_artifact_name + ".csv"
        matrix_artifact = Array2DEvaluationArtifact(
            uri=get_artifact_uri(run_id, csv_file_name),
            content=matrix,
        )
        csv_buffer = io.StringIO()
        matrix_artifact._save(csv_buffer)
        tracking_client.log_text(run_id, csv_buffer.getvalue(), csv_file_name)

        # --- confusion matrix as PNG image ------------------------------------
        figure = sk_metrics.ConfusionMatrixDisplay.from_predictions(labels, predictions).figure_
        png_buffer = io.BytesIO()
        figure.savefig(png_buffer)
        png_buffer.seek(0)
        matrix_image = Image.open(png_buffer)

        png_artifact_name = f"confusion_matrix_image_on_{dataset.name}"
        png_file_name = png_artifact_name + ".png"
        image_artifact = ImageEvaluationArtifact(
            uri=get_artifact_uri(run_id, png_file_name),
            content=matrix_image,
        )
        # NOTE(review): ``_save`` receives a bare file name here, so it presumably
        # writes relative to the current working directory - confirm.
        image_artifact._save(png_file_name)
        tracking_client.log_image(run_id, matrix_image, png_file_name)

        # Artifacts are keyed by name without the file extension.
        return EvaluationResult(
            metrics=metrics,
            artifacts={
                csv_artifact_name: matrix_artifact,
                png_artifact_name: image_artifact,
            },
        )

    # The prediction above has already run by the time an unknown type is rejected.
    raise ValueError(f"Unsupported model type {model_type}")
def get_local_artifact_path(run_id, artifact_path):
    """
    Return the artifact URI for ``artifact_path`` in run ``run_id`` with every
    occurrence of ``"file://"`` removed.

    :param run_id: ID of the run that owns the artifact.
    :param artifact_path: Run-relative path of the artifact.
    :return: The URI string after the ``"file://"`` substring has been stripped.
    """
    artifact_uri = get_artifact_uri(run_id, artifact_path)
    return artifact_uri.replace("file://", "")