# GPyOpt-based Bayesian optimization over the "train" entry point.
# Imports assume the `kiwi` package mirrors MLflow's module layout.
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import GPyOpt

import kiwi
import kiwi.projects
import kiwi.tracking
from kiwi.tracking import MlflowClient
from kiwi.utils.file_utils import TempDir  # assumed to mirror mlflow.utils.file_utils

# Finite stand-in for "infinity"; GPyOpt cannot handle actual Infs.
_inf = np.finfo(np.float64).max


def run(training_data, max_runs, batch_size, max_p, epochs, metric, gpy_model,
        gpy_acquisition, initial_design, seed):
    bounds = [
        {"name": "lr", "type": "continuous", "domain": (1e-5, 1e-1)},
        {"name": "momentum", "type": "continuous", "domain": (0.0, 1.0)},
    ]
    # client used to read back metrics of the child training runs
    tracking_client = kiwi.tracking.MlflowClient()

    def new_eval(nepochs, experiment_id, null_train_loss, null_valid_loss, null_test_loss,
                 return_all=False):
        """
        Create a new eval function.

        :param nepochs: Number of epochs to train the model.
        :param experiment_id: Experiment id for the training run.
        :param null_train_loss: Loss of a null model on the training dataset.
        :param null_valid_loss: Loss of a null model on the validation dataset.
        :param null_test_loss: Loss of a null model on the test dataset.
        :param return_all: If set, return train, validation, and test loss;
                           otherwise return only the validation loss.
        :return: new eval function.
        """

        def eval(params):
            """
            Train a Keras model with the given parameters by launching a child MLflow run.

            The child run is linked to this parent run via the mlflow.parentRunId
            tag; we later query the child runs to pick the best one and log its
            metrics as the final metrics of the parent run.

            :param params: Parameters of the train entry point we optimize over:
                           learning_rate, momentum.
            :return: The metric value evaluated on the validation data.
            """
            # GPyOpt hands candidate points to the objective as a 2-D array
            lr, momentum = params[0]
            with kiwi.start_run(nested=True) as child_run:
                p = kiwi.projects.run(
                    run_id=child_run.info.run_id,
                    uri=".",
                    entry_point="train",
                    parameters={
                        "training_data": training_data,
                        "epochs": str(nepochs),
                        "learning_rate": str(lr),
                        "momentum": str(momentum),
                        "seed": str(seed),
                    },
                    experiment_id=experiment_id,
                    synchronous=False,
                )
                succeeded = p.wait()
            if succeeded:
                training_run = tracking_client.get_run(p.run_id)
                metrics = training_run.data.metrics
                # cap each loss at the loss of the null model
                train_loss = min(null_train_loss, metrics["train_{}".format(metric)])
                valid_loss = min(null_valid_loss, metrics["val_{}".format(metric)])
                test_loss = min(null_test_loss, metrics["test_{}".format(metric)])
            else:
                # run failed => return the null loss
                tracking_client.set_terminated(p.run_id, "FAILED")
                train_loss = null_train_loss
                valid_loss = null_valid_loss
                test_loss = null_test_loss
            kiwi.log_metrics({
                "train_{}".format(metric): train_loss,
                "val_{}".format(metric): valid_loss,
                "test_{}".format(metric): test_loss,
            })
            if return_all:
                return train_loss, valid_loss, test_loss
            return valid_loss

        return eval

    with kiwi.start_run() as run:
        experiment_id = run.info.experiment_id
        # Evaluate the null model first. A null model (predict everything to the
        # mean) is a reasonable upper bound on the loss. We need a finite upper
        # bound to handle failed runs (e.g. runs that return NaNs) because GPyOpt
        # cannot handle Infs. Always including a null model in the results is
        # also good ML practice.
        train_null_loss, valid_null_loss, test_null_loss = new_eval(
            0, experiment_id, _inf, _inf, _inf, True)(params=[[0, 0]])
        myProblem = GPyOpt.methods.BayesianOptimization(
            new_eval(epochs, experiment_id, train_null_loss, valid_null_loss, test_null_loss),
            bounds,
            # local penalization supports batch evaluation; fall back to
            # sequential evaluation when no parallelism is available
            evaluator_type="local_penalization" if min(batch_size, max_p) > 1 else "sequential",
            batch_size=batch_size,
            num_cores=max_p,
            model_type=gpy_model,
            acquisition_type=gpy_acquisition,
            initial_design_type=initial_design,
            initial_design_numdata=max_runs >> 2,  # a quarter of the budget seeds the design
            exact_feval=False,
        )
        myProblem.run_optimization(max_runs)
        matplotlib.use("agg")
        plt.switch_backend("agg")
        with TempDir() as tmp:
            acquisition_plot = tmp.path("acquisition_plot.png")
            convergence_plot = tmp.path("convergence_plot.png")
            myProblem.plot_acquisition(filename=acquisition_plot)
            myProblem.plot_convergence(filename=convergence_plot)
            if os.path.exists(convergence_plot):
                kiwi.log_artifact(convergence_plot, "convergence_plot")
            if os.path.exists(acquisition_plot):
                kiwi.log_artifact(acquisition_plot, "acquisition_plot")
        # find the best run, log its metrics as the final metrics of this run.
        client = MlflowClient()
        runs = client.search_runs(
            [experiment_id],
            "tags.mlflow.parentRunId = '{run_id}' ".format(run_id=run.info.run_id))
        best_val_train = _inf
        best_val_valid = _inf
        best_val_test = _inf
        best_run = None
        for r in runs:
            if r.data.metrics["val_{}".format(metric)] < best_val_valid:
                best_run = r
                best_val_train = r.data.metrics["train_{}".format(metric)]
                best_val_valid = r.data.metrics["val_{}".format(metric)]
                best_val_test = r.data.metrics["test_{}".format(metric)]
        kiwi.set_tag("best_run", best_run.info.run_id)
        kiwi.log_metrics({
            "train_{}".format(metric): best_val_train,
            "val_{}".format(metric): best_val_valid,
            "test_{}".format(metric): best_val_test,
        })
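
# A minimal sketch of invoking the GPyOpt search directly. Everything below is
# illustrative: the dataset path and all argument values are assumptions, not
# defaults shipped with this project (the real entry point may be wired up
# through a CLI instead).
if __name__ == "__main__":
    run(
        training_data="wine-quality.csv",  # hypothetical dataset path
        max_runs=32,           # total optimization budget
        batch_size=2,          # candidates evaluated per GPyOpt batch
        max_p=2,               # parallel training runs
        epochs=32,
        metric="rmse",
        gpy_model="GP",        # a standard GPyOpt surrogate model type
        gpy_acquisition="EI",  # expected-improvement acquisition
        initial_design="random",
        seed=97531,
    )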
# Random search over the same "train" entry point, with child runs launched in
# parallel from a thread pool. Imports assume the same kiwi/MLflow layout.
from concurrent.futures import ThreadPoolExecutor

import numpy as np

import kiwi
import kiwi.projects
import kiwi.tracking
from kiwi.tracking import MlflowClient

# Finite stand-in for "infinity", used as the default (worst-case) loss.
_inf = np.finfo(np.float64).max


def run(training_data, max_runs, max_p, epochs, metric, seed):
    train_metric = "train_{}".format(metric)
    val_metric = "val_{}".format(metric)
    test_metric = "test_{}".format(metric)
    np.random.seed(seed)
    tracking_client = kiwi.tracking.MlflowClient()

    def new_eval(nepochs, experiment_id, null_train_loss=_inf, null_val_loss=_inf,
                 null_test_loss=_inf):
        def eval(params):
            lr, momentum = params
            with kiwi.start_run(nested=True) as child_run:
                p = kiwi.projects.run(
                    run_id=child_run.info.run_id,
                    uri=".",
                    entry_point="train",
                    parameters={
                        "training_data": training_data,
                        "epochs": str(nepochs),
                        "learning_rate": str(lr),
                        "momentum": str(momentum),
                        "seed": str(seed),
                    },
                    experiment_id=experiment_id,
                    synchronous=False,
                )
                succeeded = p.wait()
            if succeeded:
                training_run = tracking_client.get_run(p.run_id)
                metrics = training_run.data.metrics
                # cap each loss at the loss of the null model
                train_loss = min(null_train_loss, metrics[train_metric])
                val_loss = min(null_val_loss, metrics[val_metric])
                test_loss = min(null_test_loss, metrics[test_metric])
            else:
                # run failed => return the null loss
                tracking_client.set_terminated(p.run_id, "FAILED")
                train_loss = null_train_loss
                val_loss = null_val_loss
                test_loss = null_test_loss
            kiwi.log_metrics({
                train_metric: train_loss,
                val_metric: val_loss,
                test_metric: test_loss,
            })
            return p.run_id, train_loss, val_loss, test_loss

        return eval

    with kiwi.start_run() as run:
        experiment_id = run.info.experiment_id
        # evaluate the null model first to get a finite upper bound on the loss
        _, null_train_loss, null_val_loss, null_test_loss = new_eval(
            0, experiment_id)((0, 0))
        # sample (learning_rate, momentum) candidates uniformly at random
        runs = [(np.random.uniform(1e-5, 1e-1), np.random.uniform(0, 1.0))
                for _ in range(max_runs)]
        # the executor's shutdown on exiting the with-block waits for all runs
        with ThreadPoolExecutor(max_workers=max_p) as executor:
            _ = executor.map(
                new_eval(epochs, experiment_id, null_train_loss, null_val_loss,
                         null_test_loss),
                runs)
        # find the best run, log its metrics as the final metrics of this run.
        client = MlflowClient()
        runs = client.search_runs(
            [experiment_id],
            "tags.mlflow.parentRunId = '{run_id}' ".format(run_id=run.info.run_id))
        best_val_train = _inf
        best_val_valid = _inf
        best_val_test = _inf
        best_run = None
        for r in runs:
            if r.data.metrics[val_metric] < best_val_valid:
                best_run = r
                best_val_train = r.data.metrics[train_metric]
                best_val_valid = r.data.metrics[val_metric]
                best_val_test = r.data.metrics[test_metric]
        kiwi.set_tag("best_run", best_run.info.run_id)
        kiwi.log_metrics({
            train_metric: best_val_train,
            val_metric: best_val_valid,
            test_metric: best_val_test,
        })
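
# A minimal sketch of invoking the random search directly; argument values are
# illustrative assumptions, not project defaults.
if __name__ == "__main__":
    run(
        training_data="wine-quality.csv",  # hypothetical dataset path
        max_runs=32,   # number of random (lr, momentum) samples
        max_p=8,       # thread-pool width for parallel child runs
        epochs=32,
        metric="rmse",
        seed=97531,
    )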
# Hyperopt-based search (TPE or random suggestions) over the same "train"
# entry point. Imports assume the same kiwi/MLflow layout.
import numpy as np
from hyperopt import fmin, hp, rand, tpe

import kiwi
import kiwi.projects
import kiwi.tracking
from kiwi.tracking import MlflowClient

# Finite stand-in for "infinity", used as the worst-case loss.
_inf = np.finfo(np.float64).max


def train(training_data, max_runs, epochs, metric, algo, seed):
    """
    Run hyperparameter optimization.
    """
    # client used to read back metrics of the child training runs
    tracking_client = kiwi.tracking.MlflowClient()

    def new_eval(nepochs, experiment_id, null_train_loss, null_valid_loss, null_test_loss,
                 return_all=False):
        """
        Create a new eval function.

        :param nepochs: Number of epochs to train the model.
        :param experiment_id: Experiment id for the training run.
        :param null_train_loss: Loss of a null model on the training dataset.
        :param null_valid_loss: Loss of a null model on the validation dataset.
        :param null_test_loss: Loss of a null model on the test dataset.
        :param return_all: If set, return train, validation, and test loss;
                           otherwise return only the validation loss.
        :return: new eval function.
        """

        def eval(params):
            """
            Train a Keras model with the given parameters by launching a child MLflow run.

            The child run is linked to this parent run via the mlflow.parentRunId
            tag; we later query the child runs to pick the best one and log its
            metrics as the final metrics of the parent run.

            :param params: Parameters of the train entry point we optimize over:
                           learning_rate, momentum.
            :return: The metric value evaluated on the validation data.
            """
            lr, momentum = params
            with kiwi.start_run(nested=True) as child_run:
                p = kiwi.projects.run(
                    uri=".",
                    entry_point="train",
                    run_id=child_run.info.run_id,
                    parameters={
                        "training_data": training_data,
                        "epochs": str(nepochs),
                        "learning_rate": str(lr),
                        "momentum": str(momentum),
                        "seed": str(seed),
                    },
                    experiment_id=experiment_id,
                    use_conda=False,  # we are already in the project environment
                    synchronous=False,  # allow the run to fail if a model is not properly created
                )
                succeeded = p.wait()
            if succeeded:
                training_run = tracking_client.get_run(p.run_id)
                metrics = training_run.data.metrics
                # cap each loss at the loss of the null model
                train_loss = min(null_train_loss, metrics["train_{}".format(metric)])
                valid_loss = min(null_valid_loss, metrics["val_{}".format(metric)])
                test_loss = min(null_test_loss, metrics["test_{}".format(metric)])
            else:
                # run failed => return the null loss
                tracking_client.set_terminated(p.run_id, "FAILED")
                train_loss = null_train_loss
                valid_loss = null_valid_loss
                test_loss = null_test_loss
            kiwi.log_metrics({
                "train_{}".format(metric): train_loss,
                "val_{}".format(metric): valid_loss,
                "test_{}".format(metric): test_loss,
            })
            if return_all:
                return train_loss, valid_loss, test_loss
            return valid_loss

        return eval

    space = [
        hp.uniform("lr", 1e-5, 1e-1),
        hp.uniform("momentum", 0.0, 1.0),
    ]
    with kiwi.start_run() as run:
        experiment_id = run.info.experiment_id
        # evaluate the null model first to get a finite upper bound on the loss
        train_null_loss, valid_null_loss, test_null_loss = new_eval(
            0, experiment_id, _inf, _inf, _inf, True)(params=[0, 0])
        best = fmin(
            fn=new_eval(epochs, experiment_id, train_null_loss, valid_null_loss,
                        test_null_loss),
            space=space,
            algo=tpe.suggest if algo == "tpe.suggest" else rand.suggest,
            max_evals=max_runs)
        kiwi.set_tag("best params", str(best))
        # find the best run, log its metrics as the final metrics of this run.
        client = MlflowClient()
        runs = client.search_runs(
            [experiment_id],
            "tags.mlflow.parentRunId = '{run_id}' ".format(run_id=run.info.run_id))
        best_val_train = _inf
        best_val_valid = _inf
        best_val_test = _inf
        best_run = None
        for r in runs:
            if r.data.metrics["val_{}".format(metric)] < best_val_valid:
                best_run = r
                best_val_train = r.data.metrics["train_{}".format(metric)]
                best_val_valid = r.data.metrics["val_{}".format(metric)]
                best_val_test = r.data.metrics["test_{}".format(metric)]
        kiwi.set_tag("best_run", best_run.info.run_id)
        kiwi.log_metrics({
            "train_{}".format(metric): best_val_train,
            "val_{}".format(metric): best_val_valid,
            "test_{}".format(metric): best_val_test,
        })
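
# A minimal sketch of invoking the Hyperopt search directly; argument values
# are illustrative assumptions, not project defaults.
if __name__ == "__main__":
    train(
        training_data="wine-quality.csv",  # hypothetical dataset path
        max_runs=32,
        epochs=32,
        metric="rmse",
        algo="tpe.suggest",  # any other value falls back to rand.suggest
        seed=97531,
    )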