def __init__(self, evaluate_every_n_games, job_dir):
    """Store the configuration and create empty metric histories.

    Args:
        evaluate_every_n_games: Stored unchanged on the instance
            (presumably the evaluation interval -- confirm with callers).
        job_dir: Stored unchanged on the instance.
    """
    # Caller-supplied configuration.
    self.evaluate_every_n_games = evaluate_every_n_games
    self.job_dir = job_dir

    # Histories start out empty.
    self.score_for_winning_position_history = []
    self.pct_loss_vs_minimax_history = []

    # Collaborators: a minimax agent built for TicTacToe and the
    # hypertune metric reporter.
    self.minimax_agent = MinimaxAgent(TicTacToe)
    self.hpt = hypertune.HyperTune()
def _train_and_evaluate(estimator, dataset_path, output_dir):
    """Fit the estimator, export it, and report its score to hypertune.

    Args:
      estimator: object exposing ``fit`` and ``score``; both are called with
        ``dataset_path`` as their only argument.
      dataset_path: (string), path containing the training data.
      output_dir: (string), directory the fitted estimator is written under
        (``<output_dir>/model/<MODEL_FILE_NAME>``).

    Returns:
      None
    """
    estimator.fit(dataset_path)
    # The score is computed on the same path that was used for fitting.
    loss = estimator.score(dataset_path)
    logging.info(loss)

    # Persist the fitted estimator.
    export_path = os.path.join(output_dir, "model", MODEL_FILE_NAME)
    dump_object(estimator, export_path)

    # A custom metric tag is used here, so the job request's
    # HyperparameterSpec.hyperparameterMetricTag must match it.
    reporter = hypertune.HyperTune()
    reporter.report_hyperparameter_tuning_metric(
        hyperparameter_metric_tag="loss",
        metric_value=loss,
        global_step=1000,
    )
def test(args, model, device, test_loader, epoch):
    """Evaluate ``model`` on ``test_loader`` and report the loss to hypertune.

    Args:
        args: Not used by this function; kept for interface compatibility.
        model: Module to evaluate; it is switched to eval mode.
        device: Device every batch is moved to before the forward pass.
        test_loader: Iterable of ``(data, target)`` batches that also exposes
            ``.dataset`` (its length is used for averaging).
        epoch: Forwarded to hypertune as ``global_step``.
    """
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # Sum the per-sample losses of the batch; the division by the
            # dataset size happens once after the loop.  `reduction='sum'`
            # replaces the deprecated `size_average=False`.
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # Index of the max log-probability.
            pred = output.max(1, keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).sum().item()

    num_samples = len(test_loader.dataset)
    test_loss /= num_samples
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, num_samples, 100. * correct / num_samples))

    # Uses hypertune to report metrics for hyperparameter tuning.
    hpt = hypertune.HyperTune()
    hpt.report_hyperparameter_tuning_metric(
        hyperparameter_metric_tag='my_loss',
        metric_value=test_loss,
        global_step=epoch)
def main(args):
    """Build the run configuration, train a model and optionally report it.

    Args:
        args: Parsed command-line namespace.  Reads ``cloud_type``,
            ``model_type``, ``compute_type``, ``do_hpo`` and the ``hpo_*`` /
            ``ht_*`` / dataset attributes referenced below.

    Raises:
        ValueError: If ``args.model_type`` contains neither 'RandomForest'
            nor 'XGBoost' (previously this surfaced as an unbound-variable
            error when ``train`` was called).
    """
    cloud_type = args.cloud_type.lower()

    # Exact comparisons: the former `in ("aws")` / `in ("azure")` tests were
    # substring checks against a plain string, so e.g. "a" selected AWS.
    if cloud_type == "gcp":
        paths = gcp_path_setup(args)
    elif cloud_type == "aws":
        paths = aws_path_setup(args)
    elif cloud_type == "azure":
        paths = azure_path_setup(args)
    else:
        paths = {}

    config_params = {
        'CV_folds': args.cv_folds,
        'compute': args.compute_type,
        'dataset': 'airline',
        'dataset_filename': args.data_name,
        'cloud_type': args.cloud_type,
        'model_type': args.model_type,
        'num_samples': args.num_samples,
        'paths': paths,
        'do_ax_hpo': args.do_ax_hpo,
        'ht_est_range': args.ht_est_range,
        'ht_depth_range': args.ht_depth_range,
        'ht_features_range': args.ht_features_range,
        'ht_experiments': args.ht_experiments,
    }

    if 'RandomForest' in args.model_type:
        model_params = {
            'max_depth': args.hpo_max_depth,
            'max_features': args.hpo_max_features,
            'n_bins': args.hpo_num_bins,
            'n_estimators': args.hpo_num_est,
            # NOTE(review): a float in [0, 1) is used as the seed -- confirm
            # that `train` accepts a non-integer seed.
            'seed': random.random(),
        }
    elif 'XGBoost' in args.model_type:
        use_gpu = 'GPU' in config_params['compute']
        model_params = {
            'alpha': args.hpo_alpha,
            'gamma': args.hpo_gamma,
            'lambda': args.hpo_lambda,
            'learning_rate': args.hpo_lr,
            'max_depth': args.hpo_max_depth,
            'num_boost_round': args.hpo_num_boost_round,
            'random_state': 0,
            'tree_method': 'gpu_hist' if use_gpu else 'hist',
        }
    else:
        raise ValueError(
            "Unsupported model_type {!r}: expected a name containing "
            "'RandomForest' or 'XGBoost'".format(args.model_type))

    model, accuracy = train(model_params=model_params,
                            config_params=config_params)

    # The metric is only reported for GCP runs with HPO enabled.
    if cloud_type == "gcp" and args.do_hpo:
        hpt = hypertune.HyperTune()
        hpt.report_hyperparameter_tuning_metric(
            hyperparameter_metric_tag='hpo_accuracy',
            metric_value=accuracy)
def _train_and_evaluate(estimator, dataset, model_dir):
    """Fit the estimator, upload its booster and report macro ROC AUC.

    Args:
        estimator: Object exposing ``fit``, ``predict`` and ``get_booster``.
        dataset: 4-tuple ``(x_train, y_train, x_eval, y_eval)``.
        model_dir: Destination directory; path components are split on "/"
            and index 2 is used as the bucket (assumes a ``gs://bucket/...``
            style path -- TODO confirm).
    """
    x_train, y_train, x_eval, y_eval = dataset
    estimator.fit(x_train, y_train)
    logging.info("Completed training XGBOOST model")

    # Save the booster locally, then upload it under model_dir.
    bst_filename = 'model.bst'
    estimator.get_booster().save_model(bst_filename)
    path_parts = os.path.join(model_dir, bst_filename).split("/")
    bucket_name = path_parts[2]
    blob_name = "/".join(path_parts[3:])
    utils.upload_blob(bucket_name, bst_filename, blob_name)
    logging.info("Successfully uploaded file to GCS at location %s", model_dir)

    predictions = estimator.predict(x_eval)

    # Binarize multiclass labels; the binarizer is fitted on the eval labels.
    binarizer = preprocessing.LabelBinarizer()
    binarizer.fit(y_eval)
    y_test = binarizer.transform(y_eval)
    y_pred = binarizer.transform(predictions)

    score = metrics.roc_auc_score(y_test, y_pred, average='macro')
    logging.info("AUC Score: %s", str(score))

    reporter = hypertune.HyperTune()
    reporter.report_hyperparameter_tuning_metric(
        hyperparameter_metric_tag='roc_auc',
        metric_value=score,
        global_step=1000)
def _train_model_report_metrics(tree_params,
                                make_validation_labels_purchase_only):
    """Train one model per alpha value and report per-round scores.

    Args:
        tree_params: Passed to ``lgb.train`` as ``params``.
        make_validation_labels_purchase_only: Not used by this function.
    """
    logging.info("setting group for dataset...")
    train_data = set_group_for_dataset(_LOCAL_TRAIN_FILE, query_id_column)
    valid_data = set_group_for_dataset(_LOCAL_VALID_FILE, query_id_column)

    alpha_values = np.arange(0.0, 1.1, 0.1)
    best_per_alpha = []
    for alpha in alpha_values:
        history = {}
        # Both datasets carry the current alpha as an attribute.
        train_data.alpha = valid_data.alpha = alpha
        logging.info("Training model...")
        lgb.train(
            params=tree_params,
            train_set=train_data,
            valid_sets=[valid_data],
            fobj=combined_objective,
            feval=combined_eval,
            evals_result=history)
        best_per_alpha.append(_get_best_eval_result(history))

    summary = pandas.DataFrame(zip(alpha_values, best_per_alpha))
    print(summary)

    # NOTE(review): `history` here belongs to the LAST alpha only, so just
    # that run's scores are reported -- confirm this is intended.
    eval_scores = history['valid_0']['ndcg_1']
    hpt = hypertune.HyperTune()
    for epoch, score in enumerate(eval_scores, start=1):
        _report_metric(hpt, epoch, score)
def tune():
    """Declare the search space for ``Net``, run the tuner, print results."""
    X, y = get_data()

    # Scalar hyperparameters.
    optimizer_choices = (torch.optim.Adam, torch.optim.Adadelta,
                         torch.optim.Adagrad, torch.optim.ASGD)
    optimizer_param = ht.CategoricalParameter('torch_optimizer',
                                              options=optimizer_choices)
    eta_param = ht.ContinuousParameter('eta', lower_bound=1e-10,
                                       upper_bound=1e-1)
    max_iter_param = ht.DiscreteParameter('max_iter', lower_bound=1e2,
                                          upper_bound=1e4)

    # Two hidden-layer widths, grouped as one tuple parameter.
    width_1 = ht.DiscreteParameter('', lower_bound=10, upper_bound=100)
    width_2 = ht.DiscreteParameter('', lower_bound=10, upper_bound=100)
    hidden_sizes = ht.TupleParameter('hidden_layer_sizes',
                                     values=(width_1, width_2))

    # Three layer types; each currently has nn.Linear as its only option.
    layer_1 = ht.CategoricalParameter('', options=(nn.Linear, ))
    layer_2 = ht.CategoricalParameter('', options=(nn.Linear, ))
    layer_3 = ht.CategoricalParameter('', options=(nn.Linear, ))
    topology = ht.TupleParameter('topology',
                                 values=(layer_1, layer_2, layer_3))

    tuner = ht.HyperTune(
        algorithm=Net,
        parameters=[optimizer_param, eta_param, max_iter_param,
                    hidden_sizes, topology],
        train_func=Net.fit,
        objective_func=Net.mse,
        train_func_args=(X, y),
        objective_func_args=(X, y),
        max_evals=100,
        maximize=False,
        num_replications=1)
    tuner.tune()
    print(tuner.get_results())
def train_and_report_metrics(xs, ys, num_repeat, extractor_class,
                             useless_var_for_hparam_search=None):
    """Train ``num_repeat`` times and report the averaged validation metrics.

    A fresh ``extractor_class()`` is created for every run and its
    ``train_single_run(xs, ys, run_index)`` result supplies ``val_auc`` and
    ``val_accuracy``.  The mean AUC is reported to hypertune.

    Args:
        xs: Forwarded to ``train_single_run``.
        ys: Forwarded to ``train_single_run``.
        num_repeat: Number of training runs.
        extractor_class: Zero-argument callable returning an object with
            ``train_single_run``.
        useless_var_for_hparam_search: Not used by this function.

    Returns:
        dict with ``mean_val_auc``, ``mean_val_accuracy``, ``val_auc_std``
        and ``val_accuracy_std``.
    """
    auc_values, accuracy_values = [], []
    for run_index in range(num_repeat):
        run_metrics = extractor_class().train_single_run(xs, ys, run_index)
        auc_values.append(run_metrics['val_auc'])
        accuracy_values.append(run_metrics['val_accuracy'])

    metrics = {
        "mean_val_auc": np.mean(auc_values),
        "mean_val_accuracy": np.mean(accuracy_values),
        "val_auc_std": np.std(auc_values),
        "val_accuracy_std": np.std(accuracy_values),
    }
    print(metrics, flush=True)

    hypertune.HyperTune().report_hyperparameter_tuning_metric(
        hyperparameter_metric_tag='mean_val_auc',
        metric_value=metrics['mean_val_auc'])
    return metrics
def train_evaluate(job_dir, training_dataset_path, validation_dataset_path,
                   alpha, max_iter, hptune):
    """Trains the Covertype Classifier model.

    With ``hptune`` truthy the model is fitted on the training CSV, scored on
    the validation CSV and the accuracy is reported to hypertune.  Otherwise
    the model is fitted on training + validation data, pickled and copied to
    ``job_dir`` with ``gsutil``.

    Args:
        job_dir: Destination prefix for the pickled model.
        training_dataset_path: CSV path readable by ``pd.read_csv``.
        validation_dataset_path: CSV path readable by ``pd.read_csv``.
        alpha: Set as ``classifier__alpha`` on the pipeline.
        max_iter: Set as ``classifier__max_iter`` on the pipeline.
        hptune: Selects tuning mode (report metric) vs. final training (save).

    Raises:
        subprocess.CalledProcessError: If the ``gsutil cp`` command fails.
    """
    df_train = pd.read_csv(training_dataset_path)
    df_validation = pd.read_csv(validation_dataset_path)

    # Final training uses every labelled row available.
    if not hptune:
        df_train = pd.concat([df_train, df_validation])

    numeric_features = [
        'Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
        'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
        'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
        'Horizontal_Distance_To_Fire_Points'
    ]
    categorical_features = ['Wilderness_Area', 'Soil_Type']

    preprocessor = ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features),
    ])

    # NOTE(review): newer scikit-learn releases renamed this loss to
    # 'log_loss' -- confirm against the pinned scikit-learn version.
    pipeline = Pipeline([('preprocessor', preprocessor),
                         ('classifier', SGDClassifier(loss='log'))])

    # Cast numeric columns to float64 before they reach the scaler.
    num_features_type_map = {
        feature: 'float64' for feature in numeric_features
    }
    df_train = df_train.astype(num_features_type_map)
    df_validation = df_validation.astype(num_features_type_map)

    print('Starting training: alpha={}, max_iter={}'.format(alpha, max_iter))
    X_train = df_train.drop('Cover_Type', axis=1)
    y_train = df_train['Cover_Type']

    pipeline.set_params(classifier__alpha=alpha, classifier__max_iter=max_iter)
    pipeline.fit(X_train, y_train)

    if hptune:
        X_validation = df_validation.drop('Cover_Type', axis=1)
        y_validation = df_validation['Cover_Type']
        accuracy = pipeline.score(X_validation, y_validation)
        print('Model accuracy: {}'.format(accuracy))
        # Log it with hypertune
        hpt = hypertune.HyperTune()
        hpt.report_hyperparameter_tuning_metric(
            hyperparameter_metric_tag='accuracy', metric_value=accuracy)

    # Save the model
    if not hptune:
        model_filename = 'model.pkl'
        with open(model_filename, 'wb') as model_file:
            pickle.dump(pipeline, model_file)
        gcs_model_path = '{}/{}'.format(job_dir, model_filename)
        subprocess.check_call(['gsutil', 'cp', model_filename, gcs_model_path],
                              stderr=sys.stdout)
        # The line break inside the message is written as an escape; a raw
        # newline inside a single-quoted literal is a syntax error.
        print('Saved model in: \n{}'.format(gcs_model_path))
def _run(game, network_params, memory_params, explore_decay, ops): """Sets up and runs the gaming simulation. Initializes TensorFlow, the training agent, and the game environment. The agent plays the game from the starting state for a number of episodes set by the user. Args: args: The arguments from the command line parsed by_parse_arguments. """ # Setup TensorBoard Writer. trial_id = json.loads(os.environ.get('TF_CONFIG', '{}')).get('task', {}).get('trial', '') output_path = ops.job_dir if not trial_id else ops.job_dir + '/' hpt = hypertune.HyperTune() graph = tf.Graph() with graph.as_default(): env = gym.make(game) agent = _create_agent(env, network_params, memory_params, explore_decay) def _train_or_evaluate(print_score, training=False): """Runs a gaming simulation and writes results for tensorboard. Args: print_score (bool): True to print a score to the console. training (bool): True if the agent is training, False to eval. """ reward = _play(agent, env, training) if print_score: print( 'Training - ' if training else 'Evaluating - ', 'Episode: {}'.format(episode), 'Total reward: {}'.format(reward), ) if training: agent.learn() return hpt.report_hyperparameter_tuning_metric( hyperparameter_metric_tag='episode_reward', metric_value=reward, global_step=episode) return for episode in range(1, ops.episodes + 1): print_score = ops.print_rate and episode % ops.print_rate == 0 get_summary = ops.eval_rate and episode % ops.eval_rate == 0 _train_or_evaluate(print_score, training=True) if get_summary: _train_or_evaluate(print_score) _record_video(env, agent, output_path) agent.network.save(output_path, save_format='tf')
def test_parameter_scope(self):
    """Same-named parameters in nested and outer scope must stay separate.

    The tuner is run twice with the same parameter objects and once with the
    nested parameters in swapped order; all three must yield the same params.
    """
    inner_a = ht.ConstantParameter('a', value='cc.a')
    inner_b = ht.ConstantParameter('b', value='cc.b')
    outer_a = ht.ConstantParameter('a', value='ai.a')
    expected = {'O': {'a': 'cc.a', 'b': 'cc.b'}, 'a': 'ai.a'}

    def tuned_params(object_param):
        tuner = ht.HyperTune(AI, [outer_a, object_param], fit, acc,
                             max_evals=0)
        return tuner.tune()['params']

    nested = ht.ObjectParameter('O', obj=CC, parameters=(inner_a, inner_b))
    self.assertEqual(tuned_params(nested), expected)
    # A second tuner over the very same parameter objects.
    self.assertEqual(tuned_params(nested), expected)

    # Order of the nested parameters must not matter.
    nested = ht.ObjectParameter('O', obj=CC, parameters=(inner_b, inner_a))
    self.assertEqual(tuned_params(nested), expected)
def train_evaluate(job_dir, training_dataset_path, validation_dataset_path,
                   alpha, max_iter, hptune):
    """Train the classifier; report accuracy when tuning, else save the model.

    Args:
        job_dir: Destination prefix for the pickled model.
        training_dataset_path: CSV path readable by ``pd.read_csv``.
        validation_dataset_path: CSV path readable by ``pd.read_csv``.
        alpha: Set as ``classifier__alpha`` on the pipeline.
        max_iter: Set as ``classifier__max_iter`` on the pipeline.
        hptune: Truthy to score on the validation set and report to
            hypertune; falsy to train on all data and export the model.
    """
    df_train = pd.read_csv(training_dataset_path)
    df_validation = pd.read_csv(validation_dataset_path)
    if not hptune:
        # Final training uses training and validation rows together.
        df_train = pd.concat([df_train, df_validation])

    # Columns are addressed by position: 0-9 numeric, 10-11 categorical.
    numeric_feature_indexes = slice(0, 10)
    categorical_feature_indexes = slice(10, 12)

    transformers = [
        ('num', StandardScaler(), numeric_feature_indexes),
        ('cat', OneHotEncoder(), categorical_feature_indexes),
    ]
    pipeline = Pipeline([
        ('preprocessor', ColumnTransformer(transformers=transformers)),
        ('classifier', SGDClassifier(loss='log', tol=1e-3)),
    ])

    # Cast the numeric columns to float64.
    numeric_columns = df_train.columns[numeric_feature_indexes]
    num_features_type_map = {column: 'float64' for column in numeric_columns}
    df_train = df_train.astype(num_features_type_map)
    df_validation = df_validation.astype(num_features_type_map)

    print('Starting training: alpha={}, max_iter={}'.format(alpha, max_iter))
    features = df_train.drop('Cover_Type', axis=1)
    labels = df_train['Cover_Type']

    pipeline.set_params(classifier__alpha=alpha, classifier__max_iter=max_iter)
    pipeline.fit(features, labels)

    if hptune:
        accuracy = pipeline.score(df_validation.drop('Cover_Type', axis=1),
                                  df_validation['Cover_Type'])
        print('Model accuracy: {}'.format(accuracy))
        # Log it with hypertune
        reporter = hypertune.HyperTune()
        reporter.report_hyperparameter_tuning_metric(
            hyperparameter_metric_tag='accuracy', metric_value=accuracy)
    else:
        # Save the model locally, then copy it to job_dir.
        model_filename = 'model.pkl'
        with open(model_filename, 'wb') as model_file:
            pickle.dump(pipeline, model_file)
        gcs_model_path = "{}/{}".format(job_dir, model_filename)
        subprocess.check_call(['gsutil', 'cp', model_filename, gcs_model_path],
                              stderr=sys.stdout)
        print("Saved model in: {}".format(gcs_model_path))
def train_and_evaluate(hparams):
    """Train the DNN model, export it and report validation RMSE.

    Args:
        hparams: dict providing ``batch_size``, ``eval_data_path``,
            ``nnsize``, ``nbuckets``, ``lr``, ``num_evals``,
            ``num_examples_to_train_on``, ``output_dir`` and
            ``train_data_path``.

    Returns:
        The history object returned by ``dnn_model.fit``.
    """
    batch_size = hparams['batch_size']
    eval_data_path = hparams['eval_data_path']
    nnsize = hparams['nnsize']
    nbuckets = hparams['nbuckets']
    lr = hparams['lr']
    num_evals = hparams['num_evals']
    num_examples_to_train_on = hparams['num_examples_to_train_on']
    output_dir = hparams['output_dir']
    train_data_path = hparams['train_data_path']

    # Start from a clean output directory.
    if tf.io.gfile.exists(output_dir):
        tf.io.gfile.rmtree(output_dir)

    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    savedmodel_dir = os.path.join(output_dir, 'savedmodel')
    model_export_path = os.path.join(savedmodel_dir, timestamp)
    checkpoint_path = os.path.join(output_dir, 'checkpoints')
    tensorboard_path = os.path.join(output_dir, 'tensorboard')

    dnn_model = build_dnn_model(nbuckets, nnsize, lr)
    # summary() returns None, so route its lines through the logger instead
    # of logging its return value.
    dnn_model.summary(print_fn=logging.info)

    trainds = create_train_dataset(train_data_path, batch_size)
    evalds = create_eval_dataset(eval_data_path, batch_size)

    # One "epoch" per evaluation; the training budget is split across them.
    steps_per_epoch = num_examples_to_train_on // (batch_size * num_evals)

    checkpoint_cb = callbacks.ModelCheckpoint(checkpoint_path,
                                              save_weights_only=True,
                                              verbose=1)
    tensorboard_cb = callbacks.TensorBoard(tensorboard_path, histogram_freq=1)

    history = dnn_model.fit(
        trainds,
        validation_data=evalds,
        epochs=num_evals,
        steps_per_epoch=max(1, steps_per_epoch),
        verbose=2,  # 0=silent, 1=progress bar, 2=one line per epoch
        callbacks=[checkpoint_cb, tensorboard_cb])

    # Exporting the model with default serving function.
    tf.saved_model.save(dnn_model, model_export_path)

    # Report the last recorded validation RMSE; [-1] stays valid even if
    # fewer than num_evals epochs were recorded.
    hp_metric = history.history['val_rmse'][-1]
    hpt = hypertune.HyperTune()
    hpt.report_hyperparameter_tuning_metric(hyperparameter_metric_tag='rmse',
                                            metric_value=hp_metric,
                                            global_step=num_evals)
    return history
def test_default_parameters(self):
    """Positional construction with two ``None`` fillers and max_evals 0."""
    const_a = ht.ConstantParameter('a', value='ce.a')
    const_c = ht.ConstantParameter('c', value='ce.c')

    # The two None entries fill the positional slots between the objective
    # function and the trailing 0.
    args = (CE, [const_c, const_a], fit, acc, None, None, 0)
    print(args)

    tuned = ht.HyperTune(*args).tune()['params']
    self.assertEqual(tuned, {'a': 'ce.a', 'c': 'ce.c'})
def _train_and_evaluate(estimator, output_dir):
    """Runs model training and evaluation.

    Training and validation frames are loaded with ``utils.over_sample``
    from hard-coded table names, the estimator is fitted on the training
    split, and -- when ``metadata.HYPERPARAMTER_TUNING`` is set -- the mean
    3-fold cross-validated F1 score on the validation split is reported to
    hypertune.  The fitted estimator is then written under ``output_dir``.

    Args:
      estimator: (pipeline.Pipeline), Pipeline instance, in this case, model
        training
      output_dir: (string), directory that the trained model will be exported

    Returns:
      None
    """
    df_train = utils.over_sample("amiable-octane-267022.kkbox.output_train_1",
                                 "amiable-octane-267022")
    X_train, y_train = utils._feature_label_split(df_train, "is_churn", "msno")
    df_val = utils.over_sample("amiable-octane-267022.kkbox.output_val_1",
                               "amiable-octane-267022")
    X_val, y_val = utils._feature_label_split(df_val, "is_churn", "msno")

    estimator.fit(X_train, y_train)

    if metadata.HYPERPARAMTER_TUNING:
        # The scorer is only needed in tuning mode.
        f1_scorer = make_scorer(f1_score)
        scores = model_selection.cross_val_score(estimator,
                                                 X_val,
                                                 y_val,
                                                 cv=3,
                                                 scoring=f1_scorer)
        logging.info('Score: %s', scores)

        hpt = hypertune.HyperTune()
        hpt.report_hyperparameter_tuning_metric(
            hyperparameter_metric_tag='F1_SCORE',
            metric_value=np.mean(scores),
            global_step=10000)

    # Write model and eval metrics to `output_dir`
    model_output_path = os.path.join(output_dir, 'model',
                                     metadata.MODEL_FILE_NAME)
    utils.dump_object(estimator, model_output_path)
def train_evaluate(gcs_csv_path, gcs_output_path, hptune=False,
                   n_estimators=300, learning_rate=0.1,
                   scale_pos_weight='TRUE'):
    """Train and evaluate a classifier; report F1 or export the final model.

    Args:
        gcs_csv_path: CSV location readable by ``pd.read_csv``.
        gcs_output_path: Prefix the model and preprocessor are uploaded to
            (only used when ``hptune`` is falsy).
        hptune: Truthy to report the F1 score to hypertune and stop; falsy to
            retrain on train + test data and upload the artefacts.
        n_estimators: Forwarded to ``train_model``.
        learning_rate: Forwarded to ``train_model``.
        scale_pos_weight: When one of 'TRUE', 'True', 'true', the
            negative/positive ratio of the training labels is passed to
            ``train_model`` as ``scale_pos_weight``.
    """
    # Load dataframe from GCS and split it.
    cover_df = pd.read_csv(gcs_csv_path)
    X_train, X_test, y_train, y_test = train_test_split(cover_df)

    n_pos = y_train.sum()
    n_neg = y_train.shape[0] - n_pos

    # The preprocessor is fitted on train and test features together.
    preprocessor = fit_preprocessor(pd.concat([X_train, X_test]))
    X_train = preprocessor.transform(X_train)

    # Prepare hyperparams and train model.
    hparams = {'n_estimators': n_estimators, 'learning_rate': learning_rate}
    if scale_pos_weight in ('TRUE', 'True', 'true'):
        hparams['scale_pos_weight'] = n_neg / n_pos
    clf = train_model(X_train, y_train, **hparams)

    # Evaluate model on test set.
    X_test = preprocessor.transform(X_test)
    acc, f1 = evaluate_model(clf, X_test, y_test)
    print(
        f'n_pos: {n_pos} - n_neg {n_neg}'
        f'\tAccuracy: {acc} \t F1-score: {f1}'
    )

    if hptune:
        # Tuning run: report the metric and stop here.
        reporter = hypertune.HyperTune()
        reporter.report_hyperparameter_tuning_metric(
            hyperparameter_metric_tag='f1-score',
            metric_value=f1
        )
        return

    # Final run: retrain on all data, then save and upload the artefacts.
    all_features = np.append(X_train, X_test, axis=0)
    all_labels = np.append(y_train, y_test)
    clf = train_model(all_features, all_labels, **hparams)

    with tempfile.TemporaryDirectory() as tmpdir:
        clf.save_model(f'{tmpdir}/xgboost.bin')
        joblib.dump(preprocessor, f'{tmpdir}/preprocessor.joblib')
        upload_file(f'{tmpdir}/xgboost.bin',
                    f'{gcs_output_path}/xgboost.bin')
        upload_file(f'{tmpdir}/preprocessor.joblib',
                    f'{gcs_output_path}/preprocessor.joblib')
def _train_and_evaluate(estimator, dataset, model_dir, params):
    """Fit, persist, cross-validate and report the mean score.

    Args:
        estimator: Object exposing ``fit``; also passed to
            ``cross_val_score``.
        dataset: 4-tuple ``(x_train, y_train, x_eval, y_eval)``.
        model_dir: Directory receiving ``model.joblib`` and
            ``eval_metrics.joblib``.
        params: Object whose ``cross_validations`` attribute is used as
            ``cv``.
    """
    x_train, y_train, x_eval, y_eval = dataset

    estimator.fit(x_train, y_train)
    utils.dump_object(estimator, os.path.join(model_dir, "model.joblib"))

    # Cross-validation runs on the evaluation split.
    cv_scores = model_selection.cross_val_score(
        estimator, x_eval, y_eval, cv=params.cross_validations)
    utils.dump_object(cv_scores,
                      os.path.join(model_dir, "eval_metrics.joblib"))

    reporter = hypertune.HyperTune()
    reporter.report_hyperparameter_tuning_metric(
        hyperparameter_metric_tag="score",
        metric_value=np.mean(cv_scores))
def main():
    """Load data, fit the pipeline, persist artefacts and report accuracy.

    The data source is BigQuery when ``args.storage`` is one of 'BQ', 'bq',
    'bigquery' or 'BigQuery'; otherwise the data is read from
    ``args.pathdata``.
    """
    args = get_args()
    path_data = args.pathdata
    output_bucket = args.pathoutput
    storage = args.storage
    numberestimators = args.numberestimators
    full_table_path = args.bqtable

    # The original list lacked a comma ('bq' 'bigquery'), which concatenated
    # the two literals so neither spelling was ever matched.
    if storage in ('BQ', 'bq', 'bigquery', 'BigQuery'):
        dataset = utils.read_df_from_bigquery(full_table_path)
    else:
        dataset = utils.get_data_from_gcs(path_data)

    x_train, y_train, x_val, y_val = utils.data_train_test_split(dataset)

    pipeline = model.get_pipeline(numberestimators, args.minsamplesleaf)
    pipeline.fit(x_train, y_train)

    scores = model_selection.cross_val_score(pipeline, x_val, y_val, cv=3)

    model_output_path = os.path.join(output_bucket, 'model',
                                     metadata.MODEL_FILE_NAME)
    metric_output_path = os.path.join(output_bucket, 'experiment',
                                      metadata.METRIC_FILE_NAME)
    utils.dump_object(pipeline, model_output_path)
    utils.dump_object(scores, metric_output_path)

    # Score once and reuse the value for both the report and the printout.
    accuracy = pipeline.score(x_val, y_val)

    hpt = hypertune.HyperTune()
    hpt.report_hyperparameter_tuning_metric(
        hyperparameter_metric_tag='accuracy',
        metric_value=accuracy,
        global_step=1000)

    print("model score: %.3f" % accuracy)
    print('pipeline run done :)')
def test(sequential_model, test_loader, criterion, epoch, report_metric=False): """Test / Evaluate the DNNs performance with a test / eval dataset. Read the data from the dataloader and calculate the loss. Lastly, display some statistics about the performance of the DNN during testing. Args: sequential_model: The neural network that you are testing, based on nn.Module test_loader: The test / evaluation dataset criterion: The loss function epoch: The current epoch that the training loop is on report_metric: Whether to report metrics for hyperparameter tuning """ sequential_model.eval() test_loss = 0.0 correct = 0 with torch.no_grad(): for _, data in enumerate(test_loader, 0): features = data['features'] target = data['target'] output = sequential_model(features) # sum up batch loss test_loss += criterion(output, target) # compute accuracy for a binary classifier # Values > 0.5 = 1 # Values <= 0.5 = 0 correct += ((output > 0.5) == (target > 0.5)).sum().item() # get the average loss for the test set. test_loss /= (len(test_loader.sampler) / test_loader.batch_size) if report_metric: # Uses hypertune to report metrics for hyperparameter tuning. hpt = hypertune.HyperTune() hpt.report_hyperparameter_tuning_metric( hyperparameter_metric_tag='test_loss', metric_value=test_loss, global_step=epoch) # print statistics print('\nTest set:\n\tAverage loss: {:.4f}'.format(test_loss)) print('\tAccuracy: {}/{} ({:.0f}%)\n'.format( correct, len(test_loader.sampler), 100. * correct / len(test_loader.sampler)))
def test_import(self):
    """Smoke test: run a grid-search tuner over one continuous parameter."""
    print('go')
    search_space = [ht.ContinuousParameter('a', lower_bound=0, upper_bound=1)]
    grid_search = ht.optimizers.GridSearch(depth=1, resolution=0.1)
    tuner = ht.HyperTune(
        algorithm=A,
        parameters=search_space,
        optimizer=grid_search,
        train_func=A.fit,
        objective_func=A.acc,
        max_evals=100,
        maximize=False,
        num_replications=1)
    # Nothing is asserted; the result is only printed.
    print(tuner.tune())
def __init__(self, path, train_config, update_freq='epoch',
             metric='epoch_acc/val', hparams=None):
    """Store logging options and initialise the parent with the log path.

    Args:
        path: Base output path; logs go to ``<path>/logs``.  When it starts
            with ``gs://`` ``gsutil.gcloud_auth()`` is called first.
        train_config: Stored unchanged on the instance.
        update_freq: Parsed by ``_parse_stage`` and ``_parse_freq``.
        metric: Stored unchanged on the instance.
        hparams: Stored unchanged on the instance.
    """
    # Plain attributes.
    self.path = path
    self.train_config = train_config
    self.metric = metric
    self.hparams = hparams

    # Logging stage and frequency are both derived from update_freq.
    self.log_stage = self._parse_stage(update_freq)
    self.log_freq = self._parse_freq(update_freq)

    self.hpt = hypertune.HyperTune()

    # Initialise summary writer
    if path.startswith('gs://'):
        gsutil.gcloud_auth()
    super().__init__(os.path.join(path, 'logs'))
def train_and_evaluate(args):
    """Train the wide-and-deep model, export it and report validation RMSE.

    Args:
        args: dict providing ``nnsize``, ``nembeds``, ``train_data_path``,
            ``eval_data_path``, ``batch_size``, ``eval_steps``,
            ``num_epochs``, ``train_examples`` and ``output_dir``.
    """
    model = build_wide_deep_model(args["nnsize"], args["nembeds"])
    print("Here is our Wide-and-Deep architecture so far:\n")
    # summary() prints by itself and returns None; wrapping it in print()
    # emitted a stray "None" line.
    model.summary()

    trainds = load_dataset(args["train_data_path"], args["batch_size"],
                           'train')
    evalds = load_dataset(args["eval_data_path"], 1000, 'eval')
    if args["eval_steps"]:
        evalds = evalds.take(count=args["eval_steps"])

    num_batches = args["batch_size"] * args["num_epochs"]
    # Guard against the floor division yielding 0 steps for small datasets.
    steps_per_epoch = max(1, args["train_examples"] // num_batches)

    checkpoint_path = os.path.join(args["output_dir"],
                                   "checkpoints/babyweight")
    cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                     verbose=1,
                                                     save_weights_only=True)

    history = model.fit(
        trainds,
        validation_data=evalds,
        epochs=args["num_epochs"],
        steps_per_epoch=steps_per_epoch,
        verbose=2,  # 0=silent, 1=progress bar, 2=one line per epoch
        callbacks=[cp_callback])

    export_path = os.path.join(
        args["output_dir"],
        datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
    tf.saved_model.save(obj=model,
                        export_dir=export_path)  # with default serving function

    # Report the last recorded validation RMSE.
    hp_metric = history.history['val_rmse'][-1]
    hpt = hypertune.HyperTune()
    hpt.report_hyperparameter_tuning_metric(hyperparameter_metric_tag='rmse',
                                            metric_value=hp_metric,
                                            global_step=args['num_epochs'])

    print("Exported trained model to {}".format(export_path))
def train_model(args):
    """Load the data, train and test the model, report and save it.

    Args:
        args: Namespace providing ``seed``, ``test_split``, ``batch_size``,
            ``lr``, ``momentum``, ``epochs``, ``model_name`` and ``job_dir``.
    """
    torch.manual_seed(args.seed)

    # Open our dataset
    train_loader, test_loader = data_utils.load_data(
        args.test_split, args.seed, args.batch_size)

    # Create the model and its optimizer
    net = model.SonarDNN().double()
    optimizer = optim.SGD(
        net.parameters(), lr=args.lr, momentum=args.momentum, nesterov=False)

    # Train / Test the model; only the last epoch's accuracy is kept.
    latest_accuracy = 0.0
    for _epoch in range(1, args.epochs + 1):
        train(net, train_loader, optimizer)
        latest_accuracy = test(net, test_loader)

    # A custom metric tag is used, so hyperparameterMetricTag in the job
    # request's HyperparameterSpec must be set to the same name.
    # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#HyperparameterSpec
    reporter = hypertune.HyperTune()
    reporter.report_hyperparameter_tuning_metric(
        hyperparameter_metric_tag='my_accuracy_tag',
        metric_value=latest_accuracy,
        global_step=args.epochs)

    # Export the trained model
    torch.save(net.state_dict(), args.model_name)

    if not args.job_dir:
        print('Accuracy: {:.0f}%'.format(latest_accuracy))
    else:
        # Save the model to GCS
        data_utils.save_model(args.job_dir, args.model_name)
def _train_and_evaluate(estimator, dataset, output_dir):
    """Fit the estimator, export it and optionally report a CV score.

    Args:
      estimator: (pipeline.Pipeline), Pipeline instance combining the
        pre-processing steps and the model.
      dataset: (pandas.DataFrame), data that is split into train and
        validation parts by ``utils.data_train_test_split``.
      output_dir: (string), directory the model (and metrics) are written to.

    Returns:
      None
    """
    x_train, y_train, x_val, y_val = utils.data_train_test_split(dataset)
    estimator.fit(x_train, y_train)

    # Write the fitted model to `output_dir`.
    utils.dump_object(
        estimator,
        os.path.join(output_dir, 'model', metadata.MODEL_FILE_NAME))

    # Without a metric file name there is nothing to evaluate or report.
    if metadata.METRIC_FILE_NAME is None:
        return

    # 3-fold cross-validation on the validation split.
    scores = model_selection.cross_val_score(estimator, x_val, y_val, cv=3)
    logging.info('Scores: %s', scores)
    utils.dump_object(
        scores,
        os.path.join(output_dir, 'experiment', metadata.METRIC_FILE_NAME))

    # A custom metric tag is used, so hyperparameterMetricTag in the job
    # request's HyperparameterSpec must be set to the same name.
    reporter = hypertune.HyperTune()
    reporter.report_hyperparameter_tuning_metric(
        hyperparameter_metric_tag='my_metric_tag',
        metric_value=np.mean(scores),
        global_step=1000)
def __call__(self, trainer):
    """Report every not-yet-reported log entry to hypertune.

    ``self._log_report`` is either the name of a trainer extension (looked
    up with ``trainer.get_extension``) or a ``LogReport`` instance (invoked
    with the trainer to refresh it).  Entries from index ``self._log_len``
    onward are reported, then the cursor is advanced.

    Args:
        trainer: Object passed to the log report / used for the lookup.

    Raises:
        TypeError: If ``self._log_report`` is neither a ``str`` nor a
            ``LogReport``.
    """
    log_report = self._log_report
    if isinstance(log_report, str):
        log_report = trainer.get_extension(log_report)
    elif isinstance(log_report, log_report_module.LogReport):
        log_report(trainer)  # update the log report
    else:
        raise TypeError('log report has a wrong type %s' % type(log_report))

    log = log_report.log
    log_len = self._log_len

    hpt = hypertune.HyperTune()
    while len(log) > log_len:
        target_log = log[log_len]
        hpt.report_hyperparameter_tuning_metric(
            hyperparameter_metric_tag=self._hp_metric_tag,
            metric_value=target_log[self._hp_metric_val],
            global_step=target_log[self._hp_global_step])
        log_len += 1

    # Persist the cursor on the attribute that is read above.  The original
    # wrote `self.log_len`, so the position never advanced and every call
    # re-reported the whole log.
    self._log_len = log_len
def run_job(opts):
    """Train the autoencoder, save it and report the final training loss.

    Args:
        opts: dict providing ``input``, ``batch_size``, ``job_dir``,
            ``num_layers``, ``pool_size``, ``num_steps`` and
            ``num_checkpoints``.
    """
    def input_and_label(rec):
        # The 'ref' field serves as both the input and the target.
        return rec['ref'], rec['ref']

    dataset = read_dataset(opts['input'])
    dataset = dataset.map(input_and_label)
    dataset = dataset.batch(opts['batch_size']).repeat()

    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(opts['job_dir'], 'checkpoints'))

    # The model is created under the mirrored distribution strategy.
    strategy = tf.distribute.MirroredStrategy()
    with strategy.scope():
        autoencoder = create_model(opts['num_layers'], opts['pool_size'])
        print(autoencoder)

    # One epoch per checkpoint; the step budget is split across them.
    num_epochs = opts['num_checkpoints']
    history = autoencoder.fit(
        dataset,
        steps_per_epoch=opts['num_steps'] // num_epochs,
        epochs=num_epochs,
        shuffle=True,
        callbacks=[checkpoint])

    autoencoder.save(os.path.join(opts['job_dir'], 'savedmodel'))

    # report final metric to hyperparameter tuner
    final_loss = history.history['loss'][-1]
    reporter = hypertune.HyperTune()
    reporter.report_hyperparameter_tuning_metric(
        hyperparameter_metric_tag='final_loss',
        metric_value=final_loss,
        global_step=1)
def train(job_dir, data_path, n_components, alpha):
    """Cross-validate, report, fit on all data and upload the pipeline.

    Args:
        job_dir: Destination prefix for ``model.joblib``.
        data_path: CSV path readable by ``pd.read_csv``; must contain an
            ``octane`` column, which is used as the target.
        n_components: Number of PCA components.
        alpha: Ridge regularisation strength.
    """
    # Load data
    frame = pd.read_csv(data_path)
    target = frame.octane
    features = frame.drop('octane', axis=1)

    # Configure a training pipeline: scale -> PCA -> ridge regression.
    steps = [
        ('scale', StandardScaler()),
        ('reduce_dim', PCA(n_components=n_components)),
        ('regress', Ridge(alpha=alpha)),
    ]
    pipeline = Pipeline(steps)

    # Calculate the performance metric with 10-fold cross-validation.
    scores = cross_val_score(pipeline, features, target, cv=10,
                             scoring='neg_mean_squared_error')

    # Log it with hypertune
    reporter = hypertune.HyperTune()
    reporter.report_hyperparameter_tuning_metric(
        hyperparameter_metric_tag='neg_mean_squared_error',
        metric_value=scores.mean())

    # Fit the model on a full dataset
    pipeline.fit(features, target)

    # Save the model locally, then copy it to job_dir.
    model_filename = 'model.joblib'
    joblib.dump(value=pipeline, filename=model_filename)
    gcs_model_path = "{}/{}".format(job_dir, model_filename)
    subprocess.check_call(['gsutil', 'cp', model_filename, gcs_model_path],
                          stderr=sys.stdout)
    logging.info("Saved model in: {}".format(gcs_model_path))
def test_parameter_scope2(self):
    """Two levels of nested object parameters keep their own scopes."""
    # Innermost object (CC); its parameters are given in b, a order.
    innermost = ht.ObjectParameter(
        'O', obj=CC,
        parameters=(ht.ConstantParameter('b', value='cc.b'),
                    ht.ConstantParameter('a', value='cc.a')))

    # Middle object (AI): the CC object plus its own 'a'.
    middle = ht.ObjectParameter(
        'O', obj=AI,
        parameters=(innermost, ht.ConstantParameter('a', value='ai.a')))

    # Top level (CD): its own 'a' and 'b' plus the AI object.
    top_level = [
        ht.ConstantParameter('a', value='cd.a'),
        ht.ConstantParameter('b', value='cd.b'),
        middle,
    ]

    tuned = ht.HyperTune(CD, top_level, fit, acc,
                         max_evals=0).tune()['params']

    expected = {
        'a': 'cd.a',
        'b': 'cd.b',
        'O': {
            'O': {'a': 'cc.a', 'b': 'cc.b'},
            'a': 'ai.a',
        },
    }
    self.assertEqual(tuned, expected)
def train_and_evaluate(
    model,
    num_epochs,
    steps_per_epoch,
    train_data,
    validation_steps,
    eval_data,
    output_dir,
    n_steps_history,
    FLAGS,
    decay_type,
    learning_rate=3e-5,
    s=1,
    n_batch_decay=1,
    metric_accuracy='metric',
):
    """Fit `model`, report the tuning metric and export model and history.

    The function assembles the list of training callbacks, calls
    `model.fit`, reports the last per-step accuracy to hypertune when the
    job is a hyperparameter-tuning trial, pickles the training history and,
    when `output_dir` is set, saves the model (outside of tuning trials) and
    copies the history to GCS for `gs://` output directories.

    Several behaviours are controlled by hard-coded local switches
    (`activate_*` / `save_*`) defined at the top of the body.

    Args:
        model: model to train; `fit`, `summary`, `inputs`, `outputs`, `name`
            and `save` are used (presumably a compiled tf.keras model).
        num_epochs: forwarded to `model.fit` as `epochs`.
        steps_per_epoch: forwarded to `model.fit`.
        train_data: training data passed to `model.fit`.
        validation_steps: forwarded to `model.fit`.
        eval_data: validation data passed to `model.fit` and to the
            per-step history callback.
        output_dir: base output location, local or `gs://bucket/path`.
            When falsy, no TensorBoard logs or saved model are written and
            the history is pickled under the current working directory.
        n_steps_history: passed to `mu.History_per_step`.
        FLAGS: flags object. `FLAGS.is_hyperparameter_tuning` is read and is
            reset to False when no trial ID can be obtained.
        decay_type: 'exponential', 'stepwise' or 'timebased'; any other
            value selects `mu.no_decay`. Only used when the learning-rate
            switch is active.
        learning_rate: initial learning rate given to the decay helper.
        s: second argument of the decay helpers.
        n_batch_decay: passed to `mu.LearningRateSchedulerPerBatch`.
        metric_accuracy: tag under which the metric is reported.

    Returns:
        None
    """
    logging.info('training the model ...')
    model_callbacks = []

    # create meta data dictionary
    dict_model = {}
    dict_data = {}
    dict_parameter = {}
    dict_hardware = {}
    dict_results = {}
    dict_type_job = {}
    dict_software = {}

    # for debugging only
    activate_tensorboard = True
    activate_hp_tensorboard = False  # True
    activate_lr = False
    save_checkpoints = False  # True
    save_history_per_step = False  # True
    save_metadata = False  # True
    activate_timing = False  # True
    # drop official method that is not working
    activate_tf_summary_hp = True  # False
    # hardcoded way of doing hp
    activate_hardcoded_hp = True  # True

    # dependencies: the reported metric is read from the per-step history
    if activate_tf_summary_hp:
        save_history_per_step = True

    if FLAGS.is_hyperparameter_tuning:
        # get trial ID
        suffix = mu.get_trial_id()

        if suffix == '':
            # without a trial ID the job is handled as a regular training run
            logging.error('No trial ID for hyper parameter job!')
            FLAGS.is_hyperparameter_tuning = False
        else:
            # callback for hp
            logging.info('Creating a callback to store the metric!')
            if activate_tf_summary_hp:
                hp_metric = mu.HP_metric(metric_accuracy)
                model_callbacks.append(hp_metric)

    if output_dir:
        if activate_tensorboard:
            # tensorflow callback; each tuning trial gets its own sub-folder
            log_dir = os.path.join(output_dir, 'tensorboard')
            if FLAGS.is_hyperparameter_tuning:
                log_dir = os.path.join(log_dir, suffix)
            tensorboard_callback = tf.keras.callbacks.TensorBoard(
                log_dir=log_dir,
                histogram_freq=1,
                embeddings_freq=0,
                write_graph=True,
                update_freq='batch',
                profile_batch='10, 20')
            model_callbacks.append(tensorboard_callback)

        if save_checkpoints:
            # checkpoints callback
            checkpoint_dir = os.path.join(output_dir, 'checkpoint_model')
            if not FLAGS.is_hyperparameter_tuning:
                # not saving model during hyper parameter tuning
                checkpoint_prefix = os.path.join(checkpoint_dir,
                                                 'ckpt_{epoch:02d}')
                checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
                    filepath=checkpoint_prefix,
                    verbose=1,
                    save_weights_only=True)
                model_callbacks.append(checkpoint_callback)

    if activate_lr:
        # decay learning rate callback
        # switch between the different learning rate decays; unknown names
        # fall back to no decay
        if decay_type == 'exponential':
            decay_fn = mu.exponential_decay(lr0=learning_rate, s=s)
        elif decay_type == 'stepwise':
            decay_fn = mu.step_decay(lr0=learning_rate, s=s)
        elif decay_type == 'timebased':
            decay_fn = mu.time_decay(lr0=learning_rate, s=s)
        else:
            decay_fn = mu.no_decay(lr0=learning_rate)

        # scheduler used for batch updates
        lr_decay_batch = mu.LearningRateSchedulerPerBatch(decay_fn,
                                                          n_batch_decay,
                                                          verbose=1)
        model_callbacks.append(lr_decay_batch)

    if save_history_per_step:
        # callback to create history per step (not per epoch)
        histories_per_step = mu.History_per_step(eval_data, n_steps_history)
        model_callbacks.append(histories_per_step)

    if activate_timing:
        # callback to time each epoch
        timing = mu.TimingCallback()
        model_callbacks.append(timing)

    # checking model callbacks
    logging.info('model\'s callback:\n {}'.format(str(model_callbacks)))

    # train the model and time the call
    start_time = time.time()
    logging.info('starting model.fit')
    # verbose = 0 (silent)
    # verbose = 1 (progress bar)
    # verbose = 2 (one line per epoch)
    verbose = 1
    history = model.fit(train_data,
                        epochs=num_epochs,
                        steps_per_epoch=steps_per_epoch,
                        validation_data=eval_data,
                        validation_steps=validation_steps,
                        verbose=verbose,
                        callbacks=model_callbacks)

    # print execution time
    elapsed_time_secs = time.time() - start_time
    logging.info('\nexecution time: {}'.format(
        timedelta(seconds=round(elapsed_time_secs))))

    # check model
    # NOTE(review): Keras `Model.summary()` prints the table itself and
    # presumably returns None, so this line would log 'None' - confirm.
    logging.info('model summary ={}'.format(model.summary()))
    logging.info('model input ={}'.format(model.inputs))
    logging.info('model outputs ={}'.format(model.outputs))

    # to be remove
    logging.info('\ndebugging .... : ')
    pp.print_info_data(train_data)

    if activate_timing:
        logging.info('timing per epoch:\n{}'.format(
            [str(timedelta(seconds=round(x))) for x in timing.timing_epoch]))
        logging.info('timing per validation:\n{}'.format(
            [str(timedelta(seconds=round(x))) for x in timing.timing_valid]))
        logging.info('sum timing over all epochs:\n{}'.format(
            timedelta(seconds=round(sum(timing.timing_epoch)))))

    # for hp parameter tuning in TensorBoard
    if FLAGS.is_hyperparameter_tuning:
        logging.info('setup hyperparameter tuning!')

        # The value must be passed through a %s placeholder: an extra
        # positional argument without one makes the logging module report a
        # formatting error instead of emitting the record.
        # NOTE(review): this dumps the whole process environment to the log,
        # which may include credentials - confirm this is still wanted.
        logging.info('debug: os.environ.items(): %s', os.environ.items())

        if activate_hardcoded_hp:
            # trick to bypass ai platform bug: report the last per-step
            # accuracy directly through hypertune
            logging.info('hardcoded hyperparameter tuning!')
            value_accuracy = histories_per_step.accuracies[-1]
            hpt = hypertune.HyperTune()
            hpt.report_hyperparameter_tuning_metric(
                hyperparameter_metric_tag=metric_accuracy,
                metric_value=value_accuracy,
                global_step=0)
        else:
            # should be extracted from /var/hypertune/output.metric
            logging.info('standard hyperparameter tuning!')

            # look at the content of the file
            path_metric = '/var/hypertune/output.metric'
            logging.info('checking if /var/hypertune/output.metric exist!')
            if os.path.isfile(path_metric):
                logging.info('file {} exist !'.format(path_metric))
                with open(path_metric, 'r') as f:
                    logging.info('content of output.metric: {}'.format(
                        f.read()))

        if activate_hp_tensorboard:
            # NOTE(review): `value_accuracy` is only assigned in the
            # hardcoded branch above; this section relies on it.
            logging.info('setup TensorBoard for hyperparameter tuning!')
            # CAIP reads TF_CONFIG -> job -> hyperparameters -> params;
            # uCAIP reads CLUSTER_SPEC
            params = json.loads(os.environ.get("CLUSTER_SPEC", "{}"))
            print('debug: CLUSTER_SPEC:', params)
            list_hp = []
            hparams = {}
            for el in params:
                hp_dict = dict(el)
                if hp_dict.get('type') == 'DOUBLE':
                    key_hp = hp.HParam(
                        hp_dict.get('parameter_name'),
                        hp.RealInterval(hp_dict.get('min_value'),
                                        hp_dict.get('max_value')))
                    list_hp.append(key_hp)
                    try:
                        hparams[key_hp] = FLAGS[hp_dict.get(
                            'parameter_name')].value
                    except KeyError:
                        logging.error(
                            'hyperparameter key {} doesn\'t exist'.format(
                                hp_dict.get('parameter_name')))

            hparams_dir = os.path.join(output_dir, 'hparams_tuning')
            with tf.summary.create_file_writer(hparams_dir).as_default():
                hp.hparams_config(
                    hparams=list_hp,
                    metrics=[
                        hp.Metric(metric_accuracy,
                                  display_name=metric_accuracy)
                    ],
                )

            hparams_dir = os.path.join(hparams_dir, suffix)
            with tf.summary.create_file_writer(hparams_dir).as_default():
                # record the values used in this trial
                hp.hparams(hparams)
                tf.summary.scalar(metric_accuracy, value_accuracy, step=1)

    if save_history_per_step:
        # save the history in a file
        # `output_dir or ''` keeps a missing output directory (None) from
        # raising TypeError; the history then goes to the working directory
        search = re.search('gs://(.*?)/(.*)', output_dir or '')
        if search is not None:
            # temp folder locally, copied to GCS further below
            history_dir = os.path.join('./', model.name)
        else:
            # locally
            history_dir = os.path.join(output_dir or '', model.name)
        os.makedirs(history_dir, exist_ok=True)
        logging.debug('history_dir: \n {}'.format(history_dir))

        with open(history_dir + '/history', 'wb') as history_file:
            model_history = mu.History_trained_model(history.history,
                                                     history.epoch,
                                                     history.params)
            pickle.dump(model_history, history_file, pickle.HIGHEST_PROTOCOL)

        with open(history_dir + '/history_per_step', 'wb') as history_file:
            # the three trailing zeros stand in for learning-rate histories
            # that are not collected any more
            model_history_per_step = mu.History_per_steps_trained_model(
                histories_per_step.steps,
                histories_per_step.losses,
                histories_per_step.accuracies,
                histories_per_step.val_steps,
                histories_per_step.val_losses,
                histories_per_step.val_accuracies,
                0,  # all_learning_rates.all_lr,
                0,  # all_learning_rates.all_lr_alternative,
                0)  # all_learning_rates.all_lr_logs)
            pickle.dump(model_history_per_step, history_file,
                        pickle.HIGHEST_PROTOCOL)

    if output_dir:
        # save the model
        savemodel_path = os.path.join(output_dir, 'saved_model')

        if not FLAGS.is_hyperparameter_tuning:
            # not saving model during hyper parameter tuning
            model.save(os.path.join(savemodel_path, model.name))

            # reload the export to check that it can be read back
            model2 = tf.keras.models.load_model(
                os.path.join(savemodel_path, model.name))
            # check model
            logging.info('model2 summary ={}'.format(model2.summary()))
            logging.info('model2 input ={}'.format(model2.inputs))
            logging.info('model2 outputs ={}'.format(model2.outputs))
            logging.info('model2 signature outputs ={}'.format(
                model2.signatures['serving_default'].structured_outputs))
            logging.info('model2 inputs ={}'.format(
                model2.signatures['serving_default'].inputs[0]))

        if save_history_per_step:
            # save history: only gs:// outputs need the extra copy, local
            # outputs were already written in place
            search = re.search('gs://(.*?)/(.*)', output_dir)
            if search is not None:
                bucket_name = search.group(1)
                blob_name = search.group(2)
                output_folder = blob_name + '/history'
                if FLAGS.is_hyperparameter_tuning:
                    output_folder = os.path.join(output_folder, suffix)
                mu.copy_local_directory_to_gcs(history_dir, bucket_name,
                                               output_folder)

    if save_metadata:
        # add meta data
        dict_model['pretrained_transformer_model'] = FLAGS.pretrained_model_dir
        dict_model['num_classes'] = FLAGS.num_classes
        dict_data['train'] = FLAGS.input_train_tfrecords
        dict_data['eval'] = FLAGS.input_eval_tfrecords
        dict_parameter[
            'use_decay_learning_rate'] = FLAGS.use_decay_learning_rate
        dict_parameter['epochs'] = FLAGS.epochs
        dict_parameter['steps_per_epoch_train'] = FLAGS.steps_per_epoch_train
        dict_parameter['steps_per_epoch_eval'] = FLAGS.steps_per_epoch_eval
        dict_parameter['n_steps_history'] = FLAGS.n_steps_history
        dict_parameter['batch_size_train'] = FLAGS.batch_size_train
        dict_parameter['batch_size_eval'] = FLAGS.batch_size_eval
        dict_parameter['learning_rate'] = FLAGS.learning_rate
        dict_parameter['epsilon'] = FLAGS.epsilon
        dict_hardware['is_tpu'] = FLAGS.use_tpu
        dict_type_job[
            'is_hyperparameter_tuning'] = FLAGS.is_hyperparameter_tuning
        dict_type_job['is_tpu'] = FLAGS.use_tpu
        dict_software['tensorflow'] = tf.__version__
        dict_software['transformer'] = __version__
        dict_software['python'] = sys.version

        # aggregate dictionaries
        dict_all = {
            'model': dict_model,
            'data': dict_data,
            'parameter': dict_parameter,
            'hardware': dict_hardware,
            'results': dict_results,
            'type_job': dict_type_job,
            'software': dict_software
        }

        # save metadata (only uploaded for gs:// output directories)
        search = re.search('gs://(.*?)/(.*)', output_dir or '')
        if search is not None:
            bucket_name = search.group(1)
            blob_name = search.group(2)
            output_folder = blob_name + '/metadata'

            storage_client = storage.Client()
            bucket = storage_client.bucket(bucket_name)
            blob = bucket.blob(output_folder + '/model_job_metadata.json')
            blob.upload_from_string(data=json.dumps(dict_all),
                                    content_type='application/json')
def report_metric_to_hypertune(metric_value, step, tag='Loss'):
    """Report one metric value to the hyperparameter tuning service.

    Args:
        metric_value: value of the metric to report.
        step: global step the value is reported for.
        tag: name of the metric tag; defaults to 'Loss'.

    Returns:
        None
    """
    report = {
        'hyperparameter_metric_tag': tag,
        'metric_value': metric_value,
        'global_step': step,
    }
    reporter = hypertune.HyperTune()
    reporter.report_hyperparameter_tuning_metric(**report)