def main(config: Config) -> None:
    MLFLOW_ARTIFACT_LOCATION = os.environ.get(
        "MLFLOW_ARTIFACT_LOCATION", "./data/processed/mlruns/artifacts"
    )
    client = mlf_tracking.MlflowClient()
    try:
        _ = client.create_experiment(
            config.experiment_name,
            artifact_location=MLFLOW_ARTIFACT_LOCATION,
        )
    except mlf_exceptions.MlflowException:
        # The experiment already exists; reuse it.
        pass

    # Pin the project to a specific git revision only when one is configured.
    params = {}
    if config.git_version is not None:
        params["version"] = config.git_version

    mlf_projects.run(
        config.uri,
        entry_point="main",
        experiment_name=config.experiment_name,
        use_conda=False,
        **params,
    )
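A minimal sketch of how this entry point might be wired up, assuming `Config` is a simple dataclass (defined before main()) and that `mlf_tracking`, `mlf_exceptions`, and `mlf_projects` alias the corresponding MLflow modules. The aliases and the `Config` fields are assumptions inferred from how they are used above, not part of the original.

# Hypothetical wiring for main(); Config and the module aliases are
# assumptions inferred from usage.
from dataclasses import dataclass
from typing import Optional

import mlflow.tracking as mlf_tracking
import mlflow.exceptions as mlf_exceptions
import mlflow.projects as mlf_projects


@dataclass
class Config:
    uri: str                            # local path or git URI of the MLflow project
    experiment_name: str                # experiment to create or reuse
    git_version: Optional[str] = None   # optional git commit/branch to pin


if __name__ == "__main__":
    main(Config(uri=".", experiment_name="my-experiment"))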
def run(uri, entry_point, version, param_list, experiment_id, mode, cluster_spec,
        git_username, git_password, no_conda, new_dir, storage_dir):
    """
    Run an MLflow project from the given URI.

    If running locally (the default), the URI can be either a Git repository URI
    or a local path. If running on Databricks, the URI must be a Git repository.

    By default, Git projects will run in a new working directory with the given
    parameters, while local projects will run from the project's root directory.
    """
    param_dict = {}
    for s in param_list:
        index = s.find("=")
        if index == -1:
            print("Invalid format for -P parameter: '%s'. Use -P name=value." % s,
                  file=sys.stderr)
            sys.exit(1)
        name = s[:index]
        value = s[index + 1:]
        if name in param_dict:
            print("Repeated parameter: '%s'" % name, file=sys.stderr)
            sys.exit(1)
        param_dict[_encode(name)] = _encode(value)
    try:
        projects.run(_encode(uri), _encode(entry_point), _encode(version),
                     experiment_id=experiment_id, parameters=param_dict,
                     mode=_encode(mode), cluster_spec=_encode(cluster_spec),
                     git_username=_encode(git_username),
                     git_password=_encode(git_password),
                     use_conda=(not no_conda), use_temp_cwd=new_dir,
                     storage_dir=_encode(storage_dir))
    except projects.ExecutionException as e:
        print(e.message, file=sys.stderr)
        sys.exit(1)
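The `_encode` helper is referenced but not shown here. A plausible minimal stand-in (an assumption, not the original implementation) just has to tolerate None values, since optional arguments like version and cluster_spec are passed through it unchanged:

# Hypothetical stand-in for the _encode helper used above: pass None
# through unchanged and coerce everything else to str so unicode/bytes
# inputs are handled uniformly.
def _encode(value):
    if value is None:
        return None
    if isinstance(value, bytes):
        return value.decode("utf-8")
    return str(value)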
def test_dnn():
    old_uri = tracking.get_tracking_uri()
    try:
        with TempDir(chdr=False, remove_on_exit=True) as tmp:
            diamonds = tmp.path("diamonds")
            estimator = tmp.path("estimator")
            artifacts = tmp.path("artifacts")
            os.mkdir(diamonds)
            os.mkdir(estimator)
            os.mkdir(artifacts)
            tracking.set_tracking_uri(artifacts)
            # Download the diamonds dataset via mlflow run
            run(".", entry_point="main", version=None,
                parameters={"dest-dir": diamonds},
                experiment_id=tracking._get_experiment_id(), mode="local",
                cluster_spec=None, git_username=None, git_password=None,
                use_conda=True, storage_dir=None)
            # Run the main dnn app via mlflow
            run("apps/dnn-regression", entry_point="main", version=None,
                parameters={
                    "model-dir": estimator,
                    "train": os.path.join(diamonds, "train_diamonds.parquet"),
                    "test": os.path.join(diamonds, "test_diamonds.parquet"),
                    "hidden-units": "30,30",
                    "label-col": "price",
                    "steps": 5000,
                    "batch-size": 128,
                },
                experiment_id=tracking._get_experiment_id(), mode="local",
                cluster_spec=None, git_username=None, git_password=None,
                use_conda=True, storage_dir=None)
            # Load the saved model as a pyfunc.
            pyfunc = tensorflow.load_pyfunc(
                os.path.join(estimator, os.listdir(estimator)[0]))
            df = pandas.read_parquet(
                os.path.join(diamonds, "test_diamonds.parquet"))
            predict_df = pyfunc.predict(df)
            assert 'predictions' in predict_df
            assert isinstance(predict_df['predictions'][0][0], numpy.float32)
    finally:
        tracking.set_tracking_uri(old_uri)
def run(uri, entry_point, version, param_list, experiment_id, mode, cluster_spec,
        git_username, git_password, no_conda, storage_dir, run_id):
    """
    Run an MLflow project from the given URI.

    For local runs, the run will block until it completes. Otherwise, the
    project will run asynchronously.

    If running locally (the default), the URI can be either a Git repository URI
    or a local path. If running on Databricks, the URI must be a Git repository.

    By default, Git projects run in a new working directory with the given
    parameters, while local projects run from the project's root directory.
    """
    param_dict = {}
    for s in param_list:
        index = s.find("=")
        if index == -1:
            print("Invalid format for -P parameter: '%s'. Use -P name=value." % s,
                  file=sys.stderr)
            sys.exit(1)
        name = s[:index]
        value = s[index + 1:]
        if name in param_dict:
            print("Repeated parameter: '%s'" % name, file=sys.stderr)
            sys.exit(1)
        param_dict[name] = value
    # Treat cluster_spec as inline JSON unless it points at a .json file.
    cluster_spec_arg = cluster_spec
    if cluster_spec is not None and os.path.splitext(cluster_spec)[-1] != ".json":
        try:
            cluster_spec_arg = json.loads(cluster_spec)
        except ValueError as e:
            print("Invalid cluster spec JSON. Parse error: %s" % e, file=sys.stderr)
            raise
    try:
        projects.run(
            uri,
            entry_point,
            version,
            experiment_id=experiment_id,
            parameters=param_dict,
            mode=mode,
            cluster_spec=cluster_spec_arg,
            git_username=git_username,
            git_password=git_password,
            use_conda=(not no_conda),
            storage_dir=storage_dir,
            block=(mode == "local" or mode is None),
            run_id=run_id,
        )
    except projects.ExecutionException as e:
        _logger.error("=== %s ===", e)
        sys.exit(1)
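The cluster spec can be supplied either as a path to a .json file (passed through unchanged) or as an inline JSON string (parsed by json.loads above). A quick sketch of both call shapes; the URI, parameter values, and spec contents are illustrative only, not taken from the original:

# Inline JSON string: parsed before being forwarded to projects.run().
run(uri="https://github.com/mlflow/mlflow-example", entry_point="main",
    version=None, param_list=["alpha=0.5"], experiment_id=0,
    mode="databricks", cluster_spec='{"num_workers": 2}',
    git_username=None, git_password=None, no_conda=False,
    storage_dir=None, run_id=None)

# Path ending in .json: forwarded to projects.run() as-is.
run(uri="https://github.com/mlflow/mlflow-example", entry_point="main",
    version=None, param_list=["alpha=0.5"], experiment_id=0,
    mode="databricks", cluster_spec="cluster_spec.json",
    git_username=None, git_password=None, no_conda=False,
    storage_dir=None, run_id=None)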
def test_gbt():
    old_uri = tracking.get_tracking_uri()
    with TempDir(chdr=False, remove_on_exit=True) as tmp:
        try:
            diamonds = tmp.path("diamonds")
            artifacts = tmp.path("artifacts")
            os.mkdir(diamonds)
            os.mkdir(artifacts)
            tracking.set_tracking_uri(artifacts)
            # Download the diamonds dataset via mlflow run
            run(".", entry_point="main", version=None,
                parameters={"dest-dir": diamonds}, experiment_id=0,
                mode="local", cluster_spec=None, git_username=None,
                git_password=None, use_conda=True, storage_dir=None)
            initial = os.path.join(artifacts, "0")
            dir_list = os.listdir(initial)
            # Run the main gbt app via mlflow
            run("apps/gbt-regression", entry_point="main", version=None,
                parameters={"train": os.path.join(diamonds, "train_diamonds.parquet"),
                            "test": os.path.join(diamonds, "test_diamonds.parquet"),
                            "n-trees": 10,
                            "m-depth": 3,
                            "learning-rate": .1,
                            "loss": "rmse",
                            "label-col": "price"},
                experiment_id=0, mode="local", cluster_spec=None,
                git_username=None, git_password=None, use_conda=True,
                storage_dir=None)
            # Identify the new run's folder by diffing the directory listing.
            main = None
            for item in os.listdir(initial):
                if item not in dir_list:
                    main = item
            assert main is not None, "Expected the gbt run to create a new run folder"
            pyfunc = load_pyfunc(os.path.join(initial, main, "artifacts/model/model.pkl"))
            df = pandas.read_parquet(os.path.join(diamonds, "test_diamonds.parquet"))
            # Remove the price column so we can use the features to predict.
            df = df.drop(columns="price")
            # Predict from the saved pyfunc.
            predict = pyfunc.predict(df)
            # Make sure the predictions are of the right type.
            assert isinstance(predict[0], numpy.float32)
        finally:
            tracking.set_tracking_uri(old_uri)
def run(uri, entry_point, version, param_list, experiment_id, mode, cluster_spec,
        git_username, git_password, no_conda, new_dir, storage_dir, run_id):
    """
    Run an MLflow project from the given URI. Blocks until the run completes.

    If running locally (the default), the URI can be either a Git repository URI
    or a local path. If running on Databricks, the URI must be a Git repository.

    By default, Git projects run in a new working directory with the given
    parameters, while local projects run from the project's root directory.
    """
    param_dict = {}
    for s in param_list:
        index = s.find("=")
        if index == -1:
            print("Invalid format for -P parameter: '%s'. Use -P name=value." % s,
                  file=sys.stderr)
            sys.exit(1)
        name = s[:index]
        value = s[index + 1:]
        if name in param_dict:
            print("Repeated parameter: '%s'" % name, file=sys.stderr)
            sys.exit(1)
        param_dict[name] = value
    try:
        projects.run(
            uri,
            entry_point,
            version,
            experiment_id=experiment_id,
            parameters=param_dict,
            mode=mode,
            cluster_spec=cluster_spec,
            git_username=git_username,
            git_password=git_password,
            use_conda=(not no_conda),
            use_temp_cwd=new_dir,
            storage_dir=storage_dir,
            block=True,
            run_id=run_id,
        )
    except projects.ExecutionException:
        import traceback
        traceback.print_exc(file=sys.stderr)
        sys.exit(1)
def reproduce_run(run_id, experiment_name, rel_tol=1e-09, verbose=False):
    # Get the target run
    run1 = client.get_run(run_id)
    dump_run(run1, "Target Run", verbose)
    uri = get_tag(run1, mlflow_tags.MLFLOW_SOURCE_NAME)
    print(f"git_uri: {mlflow_tags.MLFLOW_SOURCE_NAME}: {uri}")
    version = get_tag(run1, mlflow_tags.MLFLOW_GIT_COMMIT)
    print("version:", version)

    # Execute the run - reproduced run
    res = projects.run(uri, parameters=run1.data.params, version=version,
                       experiment_name=experiment_name)
    print("Reproduced Run Result:")
    print("  run_id:", res.run_id)
    print("  get_status:", res.get_status())

    # Print results of the reproduced run
    run2 = client.get_run(res.run_id)
    dump_run(run2, "Reproduced Run", verbose)

    # Print a metrics comparison between the target and reproduced runs
    data = [[k, v, run2.data.metrics[k]] for k, v in run1.data.metrics.items()]
    df = pd.DataFrame(data, columns=["Metric", "Run1", "Run2"])
    print()
    print(tabulate(df, headers="keys", tablefmt="psql", showindex=False))

    eq = runs_equal(run1, run2, rel_tol)
    print("Runs equal:", eq)
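The helpers get_tag, dump_run, and runs_equal are not shown. A minimal sketch of plausible implementations for the two that drive the comparison; these are assumptions based on how they are called, not the original helpers:

import math

def get_tag(run, key):
    # Hypothetical helper: return the tag value, or None if the tag is unset.
    return run.data.tags.get(key)

def runs_equal(run1, run2, rel_tol=1e-09):
    # Hypothetical helper: treat the runs as equal if every metric of the
    # target run is reproduced within the given relative tolerance.
    m1, m2 = run1.data.metrics, run2.data.metrics
    return all(k in m2 and math.isclose(v, m2[k], rel_tol=rel_tol)
               for k, v in m1.items())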
def test_simple_run_pip_synchronous():
    submitted_run = projects.run(
        os.path.join(os.path.dirname(__file__), "resources", "pip_project"),
        backend="yarn",
        entry_point="compute_intersection",
        parameters={"size": 10003},
        synchronous=True)
    # On failure, projects.run raises ExecutionException without the info of
    # the active run, so reaching this point means the run succeeded.
    _check_merged_logs(submitted_run.skein_app_id, "Time taken in secs:", True)
def test_simple_run_pip():
    submitted_run = projects.run(
        os.path.join(os.path.dirname(__file__), "resources", "pip_project"),
        backend="yarn",
        entry_point="compute_intersection",
        parameters={"size": 10002},
        synchronous=False)
    # Asynchronous launch: wait() blocks until the run finishes and returns
    # whether it succeeded.
    result_status = submitted_run.wait()
    _check_merged_logs(submitted_run.skein_app_id, "Time taken in secs:",
                       result_status)
def test_linear():
    old_uri = tracking.get_tracking_uri()
    with TempDir(chdr=False, remove_on_exit=True) as tmp:
        try:
            diamonds = tmp.path("diamonds")
            root_tracking_dir = tmp.path("root_tracking_dir")
            os.mkdir(diamonds)
            os.mkdir(root_tracking_dir)
            tracking.set_tracking_uri(root_tracking_dir)
            # Download the diamonds dataset via mlflow run
            mlflow.set_experiment("test-experiment")
            run(".", entry_point="main", version=None,
                parameters={"dest-dir": diamonds}, mode="local",
                cluster_spec=None, git_username=None, git_password=None,
                use_conda=True, storage_dir=None)
            # Run the main linear app via mlflow
            submitted_run = run("apps/linear-regression", entry_point="main",
                                version=None,
                                parameters={
                                    "train": os.path.join(diamonds, "train_diamonds.parquet"),
                                    "test": os.path.join(diamonds, "test_diamonds.parquet"),
                                    "alpha": .001,
                                    "l1-ratio": .5,
                                    "label-col": "price",
                                },
                                mode="local", cluster_spec=None,
                                git_username=None, git_password=None,
                                use_conda=True, storage_dir=None)
            pyfunc = load_pyfunc(path="model", run_id=submitted_run.run_id)
            df = pandas.read_parquet(
                os.path.join(diamonds, "test_diamonds.parquet"))
            # Remove the price column so we can use the features to predict.
            df = df.drop(columns="price")
            # Predict from the saved pyfunc.
            predict = pyfunc.predict(df)
            # Make sure the predictions are of the right type.
            assert isinstance(predict[0], numpy.float64)
        finally:
            tracking.set_tracking_uri(old_uri)
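load_pyfunc(path=..., run_id=...) is the legacy pyfunc loading API. On recent MLflow versions the same logged model would be addressed through a runs:/ URI instead; a brief sketch, assuming MLflow 1.x or later:

import mlflow.pyfunc

# Equivalent load on newer MLflow: reference the logged model through a
# runs:/<run_id>/<artifact_path> URI rather than separate path/run_id args.
model = mlflow.pyfunc.load_model(f"runs:/{submitted_run.run_id}/model")
predictions = model.predict(df)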
# COMMAND ----------

# MAGIC %md Use MLflow Fluent API

# COMMAND ----------

res_sub = mlflow.run("https://github.com/mlflow/mlflow-example",
                     parameters={"alpha": 0.6, "l1_ratio": 0.1})
print(f"status={res_sub.get_status()}")
print(f"run_id={res_sub.run_id}")

# COMMAND ----------

# MAGIC %md Use MLflow Projects API

# COMMAND ----------

import mlflow
from mlflow import projects

res_sub = projects.run(
    "https://github.com/dmatrix/mlflow-workshop-project-expamle-1",
    parameters={"batch_size": 5, "epochs": 1000})
print(f"status={res_sub.get_status()}")
print(f"run_id={res_sub.run_id}")
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# datetime:2019/1/11 13:12

import os

import mlflow
from mlflow import projects

if __name__ == "__main__":
    # Absolute path to the example project; the relative path below would
    # work as well.
    path = "e:\\github\\mlflow\\examples\\sklearn_elasticnet_wine"
    relative_path = "../../../github/mlflow/examples/sklearn_elasticnet_wine"
    projects.run(uri=path, parameters={"alpha": 0.5}, use_conda=False,
                 experiment_id="2")