def workflow(training_data):
    with mlflow.start_run() as active_run:  # noqa: F841
        preprocess_run = mlflow.run(".", "preprocess", parameters={  # noqa: F841
            "training_data": training_data})
        train_model_run = mlflow.run(".", "train", parameters={  # noqa: F841
            "training_data": training_data})
def go(config: DictConfig):
    # Set up the wandb experiment. All runs will be grouped under this name.
    os.environ["WANDB_PROJECT"] = config["main"]["project_name"]
    os.environ["WANDB_RUN_GROUP"] = config["main"]["experiment_name"]

    # You can get the path at the root of the MLflow project with this:
    root_path = hydra.utils.get_original_cwd()

    _ = mlflow.run(
        os.path.join(root_path, "download_data"),
        "main",
        parameters={
            "file_url": config["data"]["file_url"],
            "artifact_name": "iris.csv",
            "artifact_type": "raw_data",
            "artifact_description": "Input data"
        },
    )

    _ = mlflow.run(
        os.path.join(root_path, "process_data"),
        "main",
        parameters={
            "input_artifact": "iris.csv:latest",
            "artifact_name": "clean_data.csv",
            "artifact_type": "clean_data",
            "artifact_description": "Data after the preprocessing step"
        },
    )
def run(self, path: str = '.', remote: str = None, **kwargs):
    logger.info('Starting experiment ...')
    docker_args_default = {'network': 'host', 'ipc': 'host', 'rm': ''}
    if not self.use_localhost:
        gpu_params = {'gpus': 'all', 'runtime': 'nvidia'}
        logger.info('Adding docker args: {0}'.format(gpu_params))
        docker_args_default.update(gpu_params)

    # Update docker_args_default with values passed by the project
    if 'docker_args' in kwargs:
        docker_args_default.update(kwargs['docker_args'])
    kwargs['docker_args'] = docker_args_default

    # Check whether the image exists, and build it if not
    logger.info('Checking for existing image')
    client = docker.from_env()
    images = [str(img['RepoTags']) for img in client.api.images()]
    if all((self.experiment_name + ':latest') not in item for item in images):
        logger.info('No existing image found')
        self.build_experiment_image(path=self.project_path)
    else:
        logger.info('Found existing project image')

    artifact_uri = mlflow.get_artifact_uri()
    print("Artifact uri: {}".format(artifact_uri))
    mlflow.run(self.project_path, experiment_id=self.experiment_id,
               use_conda=False, **kwargs)
def main():
    uri_mlproject = os.getcwd()  # Assumes MLproject is present in the current directory.
    mlflow.set_tracking_uri('sqlite:///mlflow_synth.db')

    basic_setting = {
        'labels': 'synth,simple',
        'epochs': 200,
        'label_noise': 0.1,
        'model': 'mlp_model',
        'dataset': 'sinusoid2d',
        'dimension': 2,
    }
    params_choices = {
        'setting1': {
            'optimizer': 'sgd',
            'learning_rate': 0.01,
            'momentum': 0.9,
            'fl_arr': np.r_[0.0, 0.26, 0.27, 0.28],
        },
        'setting2': {
            'optimizer': 'adam',
            'learning_rate': 0.001,
            'fl_arr': np.r_[0.0, 0.24, 0.25, 0.26],
        }
    }
    params = dict(params_choices['setting1'], **basic_setting)  # Concatenate the configurations.
    fl_arr = params.pop('fl_arr')  # Drop it; mlflow only accepts parameters defined in MLproject.

    with mlflow.start_run() as run:
        for fl in fl_arr:
            tmp_params = params.copy()
            tmp_params['flood_level'] = fl
            mlflow.run(uri=uri_mlproject, entry_point='synthetic',
                       parameters=tmp_params, use_conda=False)

    query = 'tags."synth" = "True" and tags."simple" = "True" and attribute.status = "FINISHED"'
    df = get_data(query)
    acc_wo_fl = df[df['fl'].values == '0.0']
    chosen_fl_idx = df['vaAcc'].values.argmax()
    chosen_fl = df.iloc[chosen_fl_idx]['fl']
    acc_w_fl = df.iloc[[chosen_fl_idx]]

    print('Test Acc. {:.3f}, Train Acc. {:.3f}: without flooding'.format(
        acc_wo_fl['teAcc'].values[0], acc_wo_fl['trAcc'].values[0]))
    print('Test Acc. {:.3f}, Train Acc. {:.3f}: with flooding {}'.format(
        acc_w_fl['teAcc'].values[0], acc_w_fl['trAcc'].values[0], chosen_fl))
    make_graph(query, chosen_fl)
def run(entrypoint, parameters, config_sha):
    """
    Launch run.

    Args:
        entrypoint: Entry point of the run
        parameters: Parameters of the run
        config_sha: SHA-256 of the config file

    Returns:
        Launched run.
    """
    # Get an MLflow tracking client
    client = mlflow.tracking.MlflowClient()

    logger.info("Launching new run for entrypoint={} and parameters={}".format(entrypoint, parameters))

    # Submit (start) the run
    submitted_run = mlflow.run(".", entrypoint, parameters=parameters)

    # Log the config file SHA-256 as a parameter on the submitted run
    client.log_param(submitted_run.run_id, 'config_sha', config_sha)

    # Return the run
    return client.get_run(submitted_run.run_id)
def train(train_path, val_path, test_path, output_path, param_path):
    with open(param_path, 'r') as f:
        params = json.load(f)

    shutil.rmtree(output_path, ignore_errors=True)
    os.makedirs(output_path, exist_ok=True)

    # `merge` is assumed to be a dict-merge helper (e.g. toolz.merge)
    run_params = merge(params, {
        'train_path': train_path,
        'val_path': val_path,
        'test_path': test_path,
        'output_path': output_path,
        'artifact_path': output_path,
    })
    mlflow.run('models/gradient_boost/project', parameters=run_params)
def _get_or_run(entrypoint, parameters, source_version, use_cache=True):
    existing_run = _already_ran(entrypoint, parameters, source_version)
    if use_cache and existing_run:
        print("Found existing run for entrypoint=%s and parameters=%s"
              % (entrypoint, parameters))
        return existing_run
    print("Launching new run for entrypoint=%s and parameters=%s"
          % (entrypoint, parameters))
    submitted_run = mlflow.run(".", entrypoint, parameters=parameters)
    return mlflow.tracking.MlflowClient().get_run(submitted_run.run_id)
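# For context, _already_ran is defined by the surrounding project (a complete
# variant appears at the end of this section). A minimal sketch of the idea,
# assuming the MlflowClient.search_runs API and matching only on entry point,
# git commit, and parameters (simplified, illustrative code):
import mlflow

def _already_ran_minimal(entrypoint, parameters, source_version, experiment_id="0"):
    client = mlflow.tracking.MlflowClient()
    # Scan finished runs in the experiment and return the first one whose entry
    # point, source commit, and parameters all match the requested ones.
    for run in client.search_runs([experiment_id], "attributes.status = 'FINISHED'"):
        tags = run.data.tags
        if tags.get("mlflow.project.entryPoint") != entrypoint:
            continue
        if tags.get("mlflow.source.git.commit") != source_version:
            continue
        if all(str(run.data.params.get(k)) == str(v) for k, v in parameters.items()):
            return run
    return None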
def main(alpha, l1_ratio):
    warnings.filterwarnings("ignore")
    np.random.seed(40)

    with mlflow.start_run() as active_run:
        load_data_run = mlflow.tracking.MlflowClient().get_run(
            mlflow.run(".", "load_data", parameters={}).run_id)
        wine_quality_csv_uri = os.path.join(
            load_data_run.info.artifact_uri,
            "wine_quality-dir/wine_quality-red.csv")
        mlflow.run(".", "train", parameters={
            "wine_quality_csv": wine_quality_csv_uri,
            "alpha": alpha,
            "l1_ratio": l1_ratio
        })
def workflow(split_prop):
    # Note: The entry point names are defined in MLproject. The artifact directories
    # are documented by each step's .py file.
    with mlflow.start_run() as active_run:
        load_raw_data_submitted = mlflow.run(".", "load_raw_data")
        load_raw_data_run = mlflow.tracking.MlflowClient() \
            .get_run(load_raw_data_submitted.run_id)
        loans_csv_uri = os.path.join(load_raw_data_run.info.artifact_uri,
                                     "loans-raw-csv-dir")

        etl_data_submitted = mlflow.run(".", "etl_data",
                                        parameters={"loans_csv_uri": loans_csv_uri})
        etl_data_run = mlflow.tracking.MlflowClient() \
            .get_run(etl_data_submitted.run_id)
        loans_parquet_uri = os.path.join(etl_data_run.info.artifact_uri,
                                         "loans-processed-parquet-dir")
def _get_or_run(entrypoint, parameters, git_commit, use_cache=True):
    existing_run = _already_ran(entrypoint, parameters, git_commit)
    if use_cache and existing_run:
        print("Found existing run for entrypoint=%s and parameters=%s"
              % (entrypoint, parameters))
        return existing_run
    print("Launching new run for entrypoint=%s and parameters=%s"
          % (entrypoint, parameters))
    submitted_run = mlflow.run(uri="./steps", entry_point=entrypoint,
                               parameters=parameters, backend="local")
    return mlflow.tracking.MlflowClient().get_run(submitted_run.run_id)
def main(project_entry_point, project_path, project_experiment_folder):
    load_dotenv()
    experiment_id = create_experiment(project_experiment_folder)
    cluster_config = read_with_env(f"{project_path}/cluster.json.j2")
    parameter_file = (
        f"{project_path}/parameters.json.j2"
        if project_entry_point == "main"
        else f"{project_path}/{project_entry_point}/parameters.json.j2")
    parameters = read_with_env(parameter_file) if path.exists(parameter_file) else {}

    mlflow.run(
        project_path,
        experiment_id=experiment_id,
        entry_point=project_entry_point,
        backend="databricks",
        backend_config=cluster_config,
        synchronous=False,
        parameters=parameters,
    )
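# Because synchronous=False returns immediately, the caller can keep the
# SubmittedRun handle that mlflow.run returns and block on it later. A minimal
# sketch under that assumption (names are illustrative, not from the original):
import mlflow

def launch_and_wait(project_path, entry_point, experiment_id, cluster_config):
    submitted = mlflow.run(project_path, entry_point=entry_point,
                           experiment_id=experiment_id, backend="databricks",
                           backend_config=cluster_config, synchronous=False)
    if not submitted.wait():  # wait() returns False if the remote run failed
        raise RuntimeError(f"Databricks run {submitted.run_id} failed")
    return submitted.run_id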
def run_entrypoint(entrypoint, parameters):
    print("Launching new run for entrypoint=%s and parameters=%s"
          % (entrypoint, parameters))
    submitted_run = mlflow.run(".", entrypoint, parameters=parameters,
                               use_conda=False)
    return mlflow.tracking.MlflowClient().get_run(submitted_run.run_id)
def _run(entrypoint, parameters=None, source_version=None, use_cache=True):
    # Caching via _already_ran is currently disabled:
    # existing_run = _already_ran(entrypoint, parameters, source_version)
    # if use_cache and existing_run:
    #     print("Found existing run for entrypoint=%s and parameters=%s" % (entrypoint, parameters))
    #     return existing_run
    parameters = parameters or {}
    print("Launching new run for entrypoint=%s and parameters=%s"
          % (entrypoint, parameters))
    submitted_run = mlflow.run(".", entrypoint, parameters=parameters)
    return submitted_run
def _get_or_run(entrypoint, parameters, git_commit, use_cache=True):
    existing_run = _already_ran(entrypoint, parameters, git_commit)
    if use_cache and existing_run:
        print(f"Found existing run for entrypoint={entrypoint} and parameters={parameters}")
        return existing_run
    print(f"Launching new run for entrypoint={entrypoint} and parameters={parameters}")
    submitted_run = mlflow.run(".", entrypoint, parameters=parameters)
    return mlflow.tracking.MlflowClient().get_run(submitted_run.run_id)
def main(config, project_path):
    pipeline = config["pipeline"]
    with mlflow.start_run():
        for step in pipeline:
            if step["run"]:
                print(f"Running {step['step']} in pipeline:")
                submitted_run = mlflow.run(
                    project_path,
                    entry_point=step["step"],
                    parameters=step["parameters"],
                    use_conda=False,
                )
                submitted_run.wait()
            else:
                print(f"Skipped {step['step']}")
    print("Run finished")
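# The config["pipeline"] consumed above is a list of step descriptors; an
# illustrative example of the expected shape (not the actual project config):
example_config = {
    "pipeline": [
        {"step": "preprocess", "run": True, "parameters": {"input": "data.csv"}},
        {"step": "train", "run": False, "parameters": {"epochs": "10"}},
    ]
}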
def run(experiment_id, kf_run_id):
    mlflow.set_tracking_uri("databricks")
    submitted_run = mlflow.run(
        ".",
        entry_point="main",
        experiment_name=None,
        experiment_id=experiment_id,
        parameters=None,
        backend='databricks',
        backend_config='clusterconfig.json',
    )
    finished_run = mlflow.tracking.MlflowClient().get_run(submitted_run.run_id)
    if finished_run.info.status != "FINISHED":
        raise Exception("MLflow Experiment failed")
    print("Experiment Completed")
    print("Status: " + finished_run.info.status)
    print("MLflow Run ID: " + finished_run.info.run_id)
    print("MLflow Artifact URI: " + finished_run.info.artifact_uri)
    print("KubeFlow Run ID: " + kf_run_id)
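# For reference, the 'clusterconfig.json' passed as backend_config above is a
# Databricks cluster spec; a minimal sketch (values are illustrative):
#
#   {
#     "spark_version": "7.3.x-scala2.12",
#     "num_workers": 1,
#     "node_type_id": "i3.xlarge"
#   }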
def go(config: DictConfig):
    # Set up the wandb experiment. All runs will be grouped under this name.
    os.environ["WANDB_PROJECT"] = config["main"]["project_name"]
    os.environ["WANDB_RUN_GROUP"] = config["main"]["experiment_name"]

    # You can get the path at the root of the MLflow project with this:
    root_path = hydra.utils.get_original_cwd()

    # Serialize the random forest configuration
    model_config = os.path.abspath("random_forest_config.json")
    with open(model_config, "w+") as fp:
        json.dump(dict(config["random_forest"]), fp)

    _ = mlflow.run(
        os.path.join(root_path, "random_forest"),
        "main",
        parameters={
            "train_data": config["data"]["train_data"],
            "model_config": model_config,
        },
    )
    try:
        # ... earlier setup elided (paths and the open config file handle `tc`) ...
        training_params = json.load(tc)

        training_file_path = os.path.join(training_path, 'train.h5')
        validation_file_path = os.path.join(validation_path, 'val.h5')
        testing_file_path = os.path.join(testing_path, 'test.h5')

        # `merge` is assumed to be a dict-merge helper (e.g. toolz.merge)
        mlflow_params = merge(training_params, {
            'train_path': training_file_path,
            'val_path': validation_file_path,
            'test_path': testing_file_path,
            'output_path': mlflow_out_path,
            'artifact_path': model_path
        })

        os.makedirs(mlflow_out_path, exist_ok=True)
        mlflow.run(mlflow_project_uri, parameters=mlflow_params, use_conda=False)

        print('Training complete.')
        sys.exit(0)
    except Exception as e:
        # Write out an error file. This will be returned as the failureReason in the
        # DescribeTrainingJob result.
        trc = traceback.format_exc()
        with open(os.path.join(output_path, 'failure'), 'w') as s:
            s.write('Exception during training: ' + str(e) + '\n' + trc)
        # Printing this causes the exception to appear in the training job logs as well.
        print('Exception during training: ' + str(e) + '\n' + trc, file=sys.stderr)
        # A non-zero exit code causes the training job to be marked as Failed.
        sys.exit(255)
import mlflow

"""
Read the documentation at
https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.run
"""

if __name__ == '__main__':
    params_1 = {'alpha': 0.5, 'l1_ratio': 0.01}
    params_2 = {'epochs': 5}

    # Two runs from GitHub projects
    # NOTE: GitHub no longer serves unauthenticated git:// URLs, so an
    # https://github.com/... URI may be needed on current GitHub.
    mlflow.run("git://github.com/mlflow/mlflow-example.git",
               use_conda=False, parameters=params_1)
    mlflow.run("git://github.com/dmatrix/mlflow-example.git",
               use_conda=False, parameters=params_2)
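# mlflow.run() returns a SubmittedRun handle; with the default synchronous=True
# it blocks until the run completes, and the handle exposes the run id and
# status. A short sketch reusing params_1 from above:
run_handle = mlflow.run("https://github.com/mlflow/mlflow-example",
                        use_conda=False, parameters=params_1)
print(run_handle.run_id)        # tracking run id of the launched project run
print(run_handle.get_status())  # e.g. "FINISHED"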
import mlflow

mlflow.run("git@github.com:kstrempel/LinearRegressionFishLength.git")
def execute(config: DictConfig):
    """
    Main procedure for the MLOps pipeline
    """
    os.environ["WANDB_PROJECT"] = config["main"]["project_name"]
    os.environ["WANDB_RUN_GROUP"] = config["main"]["experiment_name"]

    steps_par = config['main']['steps']
    active_steps = steps_par.split(",") if steps_par != "all" else _steps

    root_path = hydra.utils.get_original_cwd()

    with tempfile.TemporaryDirectory() as tmp_dir:

        if "download_data" in active_steps:
            _ = mlflow.run(
                os.path.join(root_path, "src", "download_data"),
                "main",
                parameters={
                    "sample": config["etl"]["sample"],
                    "artifact_name": "sample.csv",
                    "artifact_type": "raw_data",
                    "artifact_description": "Raw file as downloaded"
                },
            )

        if "basic_cleaning" in active_steps:
            _ = mlflow.run(
                os.path.join(root_path, "src", "basic_cleaning"),
                "main",
                parameters={
                    "tmp_directory": tmp_dir,
                    "input_artifact": "sample.csv:latest",
                    "output_artifact": "clean_sample.csv",
                    "output_type": "clean_sample",
                    "output_description": "Data with outliers and null values removed",
                    "min_price": config['etl']['min_price'],
                    "max_price": config['etl']['max_price']
                },
            )

        if "data_check" in active_steps:
            _ = mlflow.run(
                os.path.join(root_path, "src", "data_check"),
                "main",
                parameters={
                    "csv": "clean_sample.csv:latest",
                    "ref": "clean_sample.csv:reference",
                    "kl_threshold": config['data_check']['kl_threshold'],
                    "min_price": config['etl']['min_price'],
                    "max_price": config['etl']['max_price']
                },
            )

        if "data_split" in active_steps:
            _ = mlflow.run(
                os.path.join(root_path, "src", "train_val_test_split"),
                "main",
                parameters={
                    "input": "clean_sample.csv:latest",
                    "test_size": config["modeling"]["test_size"],
                    "random_seed": config["modeling"]["random_seed"],
                    "stratify_by": config["modeling"]["stratify_by"]
                },
            )

        if "train_random_forest" in active_steps:
            # NOTE: we need to serialize the random forest configuration into JSON
            rf_config = os.path.abspath("rf_config.json")
            with open(rf_config, "w+") as file_p:
                json.dump(dict(config["modeling"]["random_forest"].items()), file_p)  # DO NOT TOUCH

            # NOTE: use the rf_config we just created as the rf_config parameter
            # for the train_random_forest step
            _ = mlflow.run(
                os.path.join(root_path, "src", "train_random_forest"),
                "main",
                parameters={
                    "trainval_artifact": "trainval_data.csv:latest",
                    "val_size": config["modeling"]["val_size"],
                    "random_seed": config["modeling"]["random_seed"],
                    "stratify_by": config["modeling"]["stratify_by"],
                    "rf_config": rf_config,
                    "max_tfidf_features": config["modeling"]["max_tfidf_features"],
                    "output_artifact": config["modeling"]["output_artifact"]
                },
            )

        if "test_regression_model" in active_steps:
            _ = mlflow.run(
                os.path.join(root_path, "src", "test_regression_model"),
                "main",
                parameters={
                    "mlflow_model": config["modeling"]["output_artifact"] + ":prod",
                    "test_dataset": "test_data.csv:latest"
                }
            )
def go(config: DictConfig):
    # Set up the wandb experiment. All runs will be grouped under this name.
    os.environ["WANDB_PROJECT"] = config["main"]["project_name"]
    os.environ["WANDB_RUN_GROUP"] = config["main"]["experiment_name"]

    # You can get the path at the root of the MLflow project with this:
    root_path = hydra.utils.get_original_cwd()

    # Check which steps we need to execute
    if isinstance(config["main"]["execute_steps"], str):
        # This was passed on the command line as a comma-separated list of steps
        steps_to_execute = config["main"]["execute_steps"].split(",")
    else:
        assert isinstance(config["main"]["execute_steps"], list)
        steps_to_execute = config["main"]["execute_steps"]

    # Download step
    if "download" in steps_to_execute:
        _ = mlflow.run(
            os.path.join(root_path, "download"),
            "main",
            parameters={
                "file_url": config["data"]["file_url"],
                "artifact_name": "raw_data.parquet",
                "artifact_type": "raw_data",
                "artifact_description": "Data as downloaded"
            },
        )

    if "preprocess" in steps_to_execute:
        ## YOUR CODE HERE: call the preprocess step (see the sketch after this snippet)
        pass

    if "check_data" in steps_to_execute:
        ## YOUR CODE HERE: call the check_data step
        pass

    if "segregate" in steps_to_execute:
        ## YOUR CODE HERE: call the segregate step
        pass

    if "random_forest" in steps_to_execute:
        # Serialize the random forest pipeline configuration
        model_config = os.path.abspath("random_forest_config.yml")
        with open(model_config, "w+") as fp:
            fp.write(OmegaConf.to_yaml(config["random_forest_pipeline"]))

        ## YOUR CODE HERE: call the random_forest step
        pass

    if "evaluate" in steps_to_execute:
        ## YOUR CODE HERE: call the evaluate step
        pass
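# For orientation, each stubbed step above follows the same shape as the
# download step. The preprocess call might look like the following (parameter
# names are assumptions based on the scaffold, not the exercise solution):
#
#   _ = mlflow.run(
#       os.path.join(root_path, "preprocess"),
#       "main",
#       parameters={
#           "input_artifact": "raw_data.parquet:latest",
#           "artifact_name": "preprocessed_data.csv",
#           "artifact_type": "preprocessed_data",
#           "artifact_description": "Data with preprocessing applied"
#       },
#   )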
def test_rllib_agent():
    import mlflow

    mlflow.run("example_agents/rllib_agent")
token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()
dbutils.fs.put(
    "file:///root/.databrickscfg",
    "[DEFAULT]\nhost=https://community.cloud.databricks.com\ntoken = " + token,
    overwrite=True)

# COMMAND ----------

# MAGIC %md Use MLflow Fluent API

# COMMAND ----------

res_sub = mlflow.run("https://github.com/mlflow/mlflow-example",
                     parameters={
                         "alpha": 0.6,
                         "l1_ratio": 0.1
                     })
print(f"status={res_sub.get_status()}")
print(f"run_id={res_sub.run_id}")

# COMMAND ----------

# MAGIC %md Use MLflow Projects API

# COMMAND ----------

import mlflow
from mlflow import projects  # needed for the projects.run call below

res_sub = projects.run(
    "https://github.com/dmatrix/mlflow-workshop-project-expamle-1",
    parameters={
import mlflow
import warnings
import mlflow.pyfunc
import pandas as pd
import numpy as np

#
# Short example of how to run an MLflow GitHub Project programmatically using the
# MLflow Fluent APIs: https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.run
#

if __name__ == '__main__':
    # Suppress any deprecation warnings
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    parameters = {'convSize': 2, 'epochs': 5}
    # ml_project_uri = "git://github.com/Isaac4real/MLflow_Project.git"
    ml_project_uri = "./MLflow_project"

    print("Running with param = ", parameters)
    # Why might mlflow.run() fail here? The conda.yaml file in the MLflow_project
    # folder needs to be accurate: it must list the exact package versions and the
    # Python version. The batch_size parameter also needs to be given (refer to the
    # MLproject file in the MLflow_project folder). Running mlflow generates a
    # folder named mlruns in the MLflow_project folder or at the root.
    res_sub = mlflow.run(ml_project_uri, parameters=parameters)
    print("status= ", res_sub.get_status())
    print("run_id= ", res_sub.run_id)
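# For context, the MLproject file mentioned above declares the entry point, its
# parameters, and the conda environment. A minimal sketch of such a file (names
# and values are illustrative, not the actual project file):
#
#   name: MLflow_project
#   conda_env: conda.yaml
#   entry_points:
#     main:
#       parameters:
#         convSize: {type: int, default: 2}
#         epochs: {type: int, default: 5}
#         batch_size: {type: int, default: 32}
#       command: "python train.py --convSize {convSize} --epochs {epochs} --batch_size {batch_size}"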
def go(config: DictConfig):
    # Set up the wandb experiment. All runs will be grouped under this name.
    os.environ["WANDB_PROJECT"] = config["main"]["project_name"]
    os.environ["WANDB_RUN_GROUP"] = config["main"]["experiment_name"]

    # Steps to execute
    steps_par = config['main']['steps']
    active_steps = steps_par.split(",") if steps_par != "all" else _steps

    # Move to a temporary directory
    with tempfile.TemporaryDirectory() as tmp_dir:

        if "download" in active_steps:
            # Download file and load in W&B
            _ = mlflow.run(
                f"{config['main']['components_repository']}/get_data",
                "main",
                version="master",
                parameters={
                    "sample": config["etl"]["sample"],
                    "artifact_name": "sample.csv",
                    "artifact_type": "raw_data",
                    "artifact_description": "Raw file as downloaded"
                },
            )

        if "basic_cleaning" in active_steps:
            _ = mlflow.run(
                os.path.join(hydra.utils.get_original_cwd(), "src", "basic_cleaning"),
                "main",
                parameters={
                    "input_artifact": "sample.csv:latest",
                    "output_artifact": "clean_sample.csv",
                    "output_type": "clean_sample",
                    "output_description": "Data with outliers and null values removed",
                    "min_price": config['etl']['min_price'],
                    "max_price": config['etl']['max_price']
                },
            )

        if "data_check" in active_steps:
            _ = mlflow.run(
                os.path.join(hydra.utils.get_original_cwd(), "src", "data_check"),
                "main",
                parameters={
                    "csv": "clean_sample.csv:latest",
                    "ref": "clean_sample.csv:reference",
                    "kl_threshold": config["data_check"]['kl_threshold'],
                    "min_price": config["etl"]["min_price"],
                    "max_price": config["etl"]["max_price"]
                },
            )

        if "data_split" in active_steps:
            _ = mlflow.run(
                f"{config['main']['components_repository']}/train_val_test_split",
                "main",
                parameters={
                    "input": "clean_sample.csv:latest",
                    "test_size": config["modeling"]["test_size"],
                    "random_seed": config["modeling"]["random_seed"],
                    "stratify_by": config["modeling"]["stratify_by"]
                },
            )

        if "train_random_forest" in active_steps:
            # NOTE: we need to serialize the random forest configuration into JSON
            rf_config = os.path.abspath("rf_config.json")
            with open(rf_config, "w+") as fp:
                json.dump(dict(config["modeling"]["random_forest"].items()), fp)  # DO NOT TOUCH

            # NOTE: use the rf_config we just created as the rf_config parameter
            # for the train_random_forest step
            _ = mlflow.run(
                os.path.join(hydra.utils.get_original_cwd(), "src", "train_random_forest"),
                'main',
                parameters={
                    "trainval_artifact": "trainval_data.csv:latest",
                    "val_size": config["modeling"]["val_size"],
                    "random_seed": config["modeling"]["random_seed"],
                    "stratify_by": config["modeling"]["stratify_by"],
                    "rf_config": rf_config,
                    "max_tfidf_features": 10,
                    "output_artifact": "random_forest_export"
                })

        if "test_regression_model" in active_steps:
            _ = mlflow.run(
                f"{config['main']['components_repository']}/test_regression_model",
                "main",
                parameters={
                    "mlflow_model": "random_forest_export:prod",
                    "test_dataset": "test_data.csv:latest"
                })
import mlflow

project_uri = "https://github.com/pengfei99/mlflow-pokemon-example.git"

params = {
    "remote_server_uri": "http://pengfei.org:8000",
    "experiment_name": "test-2",
    "data_url": "https://minio.lab.sspcloud.fr/pengfei/sspcloud-demo/pokemon-partial.csv",
    "n_estimator": 40,
    "max_depth": 30,
    "min_samples_split": 2
}

# Run the MLflow project and create a reproducible conda environment
mlflow.run(project_uri, parameters=params)
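# The project receives remote_server_uri and experiment_name as project parameters;
# the launching process can also point its own client at the same tracking server
# with the standard tracking APIs (shown here as an illustration, not part of the
# original script):
mlflow.set_tracking_uri(params["remote_server_uri"])
mlflow.set_experiment(params["experiment_name"])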
def _already_ran(entry_point_name, parameters, git_commit, config_sha,
                 ignore_git=False, experiment_id=None, resume=False):
    """
    Best-effort detection of whether a run with the given entry point name, parameters,
    and experiment id has already run. The run must have completed successfully and
    have at least the parameters provided.

    Args:
        entry_point_name: Entry point name of the run
        parameters: Parameters of the run
        git_commit: Git version of the code that was run
        config_sha: SHA-256 of the config file
        ignore_git: Whether to ignore the git version (default: False)
        experiment_id: Experiment id (default: None)
        resume: Whether to resume a failed/killed previous run (only for training) (default: False)

    Returns:
        Previously executed run if found, None otherwise.
    """
    # If the experiment ID is not provided, retrieve the current experiment ID
    experiment_id = experiment_id if experiment_id is not None else _get_experiment_id()

    # Instantiate an MlflowClient (creates and manages experiments and runs)
    client = mlflow.tracking.MlflowClient()

    # Get a reversed list of run infos (from last to first)
    all_run_infos = reversed(client.list_run_infos(experiment_id))

    run_to_resume_id = None

    for run_info in all_run_infos:
        # Fetch the run from the backend store
        full_run = client.get_run(run_info.run_id)
        # Get the run's dictionary of tags
        tags = full_run.data.tags

        # If there is no entry point, or the run's entry point differs from
        # 'entry_point_name', skip it
        if tags.get(mlflow_tags.MLFLOW_PROJECT_ENTRY_POINT, None) != entry_point_name:
            continue

        # Check each provided parameter against the run's recorded parameters
        match_failed = False
        for param_key, param_value in parameters.items():
            run_value = full_run.data.params.get(param_key)
            # If the current parameter value differs from the run's, stop matching
            if str(run_value) != str(param_value):
                match_failed = True
                break
        # If this run is not the one we are searching for, go to the next one
        if match_failed:
            continue

        # If the previous run's git commit differs from the current one, skip it
        previous_version = tags.get(mlflow_tags.MLFLOW_GIT_COMMIT, None)
        if not ignore_git and git_commit != previous_version:
            logger.warning("Run matched, but has a different source version, so skipping "
                           "(found={}, expected={})".format(previous_version, git_commit))
            continue

        # If the run's config file SHA-256 differs from the current one, skip it
        run_config_sha = full_run.data.params.get('config_sha')
        if str(run_config_sha) != str(config_sha):
            logger.warning("Run matched, but config is different.")
            continue

        # Handle matching runs that did not finish
        if run_info.to_proto().status != RunStatus.FINISHED:
            if resume:
                # If resume is enabled, remember this run id: if no newer completed
                # run is found, this stopped run will be resumed
                run_to_resume_id = run_info.run_id
                continue
            else:
                # Otherwise skip it and try the next one
                logger.warning("Run matched, but is not FINISHED, so skipping "
                               "(run_id={}, status={})".format(run_info.run_id, run_info.status))
                continue

        # Otherwise (the run was found and matches exactly), return the found run
        return client.get_run(run_info.run_id)

    # If no previously finished run was found but a stopped run was, resume it
    if run_to_resume_id is not None:
        logger.info("Resuming run with entrypoint=%s and parameters=%s"
                    % (entry_point_name, parameters))
        # Pass the stopped run's id so the new run resumes it
        parameters.update({'run_id': run_to_resume_id})
        # Submit a new run that will resume the previously interrupted one
        submitted_run = mlflow.run(".", entry_point_name, parameters=parameters)
        # Log the config file SHA-256 as a parameter on the submitted run
        client.log_param(submitted_run.run_id, 'config_sha', config_sha)
        # Return the submitted (new) run
        return client.get_run(submitted_run.run_id)

    # The searched run was not found
    logger.warning("No matching run has been found.")
    return None
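# For the resume path above to work, the resumed entry point presumably accepts
# a run_id parameter and reattaches to the interrupted tracking run; a sketch of
# that assumption (names are illustrative, not from the original project):
import mlflow

def entry_point(run_id=None):
    # Passing an existing run_id to start_run resumes logging into that run.
    with mlflow.start_run(run_id=run_id):
        ...  # continue training / logging where the interrupted run left off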