def save_evaluation_plots(training_configs):
    """Create and save learning curves for all models in the batch"""
    for i, training_config in enumerate(training_configs):
        print('Saving plot for Model {}: {}'.format(i + 1, training_config.name))
        model = training_config.get_by_model_key(False)
        checkpoint = models.ModelCheckpoint(model)
        checkpoint.load(training_config.get_model_path('checkpoint'))
        if not checkpoint.loaded:
            print('Not evaluated')
            continue
        path = os.path.join(training_config.models_dir,
                            "{}_lc.png".format(training_config.name))
        commons.save_learning_curve(checkpoint.training_losses, checkpoint.cv_losses, path)
def print_evaluation_report(training_config):
    """Print the training and evaluation results for a model"""
    # Training Config
    print('Training Config')
    for key, val in training_config.__dict__.items():
        print('{}\t{}'.format(key, val))
    print()

    # Checkpoint
    model = training_config.get_by_model_key(False)
    checkpoint = models.ModelCheckpoint(model)
    checkpoint.load(training_config.get_model_path('checkpoint'))
    if not checkpoint.loaded:
        print('Not evaluated')
        return
    print('Last checkpoint stats')
    for key, val in checkpoint.__dict__.items():
        print('{}\t{}'.format(key, val))
def save_evaluation_report(training_configs, config_path):
    """Compile and save hyper-tuning report for all models in the batch"""
    hps = []
    for i, training_config in enumerate(training_configs):
        print('Saving report for Model {}: {}'.format(i + 1, training_config.name))
        model = training_config.get_by_model_key(False)
        checkpoint = models.ModelCheckpoint(model)
        checkpoint.load(training_config.get_model_path('checkpoint'))
        if not checkpoint.loaded:
            print('Not evaluated')
            continue
        if training_config.model == 'conv_autoencoder':
            hps.append(_get_hps_for_autoencoder(training_config, checkpoint))
        elif training_config.model == 'cnn_classifier':
            hps.append(_get_hps_for_classifier(training_config, checkpoint))
        else:
            raise Exception('Invalid model code: {}'.format(training_config.model))
    with open(os.path.join(os.path.dirname(config_path), 'hps.txt'), 'w') as rep_file:
        rep_file.write('\n'.join(['\t'.join(hp) for hp in hps]))
def show_plot(train_config_file, opt=1):
    """Plot learning curve for a training process"""
    training_config = train.TrainingConfig.load_from_file(train_config_file)

    # Model initialization
    model = training_config.get_by_model_key(False)
    checkpoint = models.ModelCheckpoint(model)
    checkpoint.load(training_config.get_model_path('checkpoint'))
    if not checkpoint.loaded:
        print('Not evaluated')
        return

    if opt == 1:
        commons.plot_learning_curve(checkpoint.training_losses, checkpoint.cv_losses,
                                    close=True)
    elif opt == 2:
        commons.plot_learning_curve(checkpoint.cv_accuracies,
                                    checkpoint.model_specific['polled_accuracies'],
                                    close=True)
    else:
        return

    # Keep the plot window open for a minute before returning
    time.sleep(60)
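# ---------------------------------------------------------------------------
# Example usage (a minimal sketch, kept in comments so nothing runs on import):
# the config path below is hypothetical; `train`, `models` and `commons` are
# the modules already used by the functions above.
#
#   config_path = 'saved_models/model1/train_config.json'   # hypothetical path
#   config = train.TrainingConfig.load_from_file(config_path)
#   print_evaluation_report(config)   # dump config fields and checkpoint stats
#   show_plot(config_path, opt=1)     # training vs. validation loss curves
# ---------------------------------------------------------------------------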
def train(training_config, plot_learning_curves=False, cuda=False, email=False):
    """Train a model using the specified training configuration

    Arguments:
        training_config: Instance of TrainingConfig
        plot_learning_curves: Whether to plot learning curves at the end of each epoch
            (useful for monitoring training)
        cuda: Use True to train on GPU
        email: Use True to send email notifications on training completion or failure
    """
    print('Training model {} [CUDA = {}, Plot = {}]'.format(
        training_config.name, cuda, plot_learning_curves))

    # Exception block to catch training failures and send email notifications
    try:
        if training_config.ignore:
            print('Ignoring model')
            return

        # Model initialization
        model = training_config.get_by_model_key(cuda)

        # Load checkpoint
        checkpoint = models.ModelCheckpoint(model)
        print('Model Size: {} params'.format(checkpoint.trainable_params))
        if training_config.resume:
            model.load_state(training_config.get_model_path('state'))
            checkpoint.load(training_config.get_model_path('checkpoint'))

        # Data generators for Training and Validation sets
        train_parts, cv_part, test_part = dp.load_created_partitions(
            training_config.dataset_path)
        if len(train_parts) == 0:
            raise Exception('No training partitions found')
        training_set = dp.PartitionBatchGenerator(train_parts, training_config.batch_size,
                                                  mode='train')
        training_set_len = len(training_set)
        cv_set = dp.PartitionBatchGenerator(cv_part, training_config.batch_size, mode='cv')
        cv_set_len = len(cv_set)

        if checkpoint.epoch >= training_config.num_epochs:
            print('Already completed {} epochs'.format(checkpoint.epoch))
            return

        # Training loop
        for curr_epoch in range(checkpoint.epoch, training_config.num_epochs):
            # Plot learning curves after first epoch
            if plot_learning_curves and curr_epoch > 0:
                commons.plot_learning_curve(checkpoint.training_losses,
                                            checkpoint.cv_losses, close=True)

            # Train on training set
            model.begin_training()
            loss = 0
            train_start_time = time.time()
            progress = commons.ProgressBar(training_set_len,
                                           status='Training epoch %s' % str(curr_epoch + 1))
            for i, (x, y) in enumerate(training_set):
                loss += model.train_batch(x, y)
                progress.update(i)
            train_stop_time = time.time()
            training_time = train_stop_time - train_start_time
            checkpoint.training_times.append(training_time)
            progress.complete(status='Done training epoch {} in {} seconds'.format(
                str(curr_epoch + 1), training_time))
            avg_loss = loss / training_set_len
            checkpoint.training_losses.append(avg_loss)
            print('Average training loss per batch:', avg_loss)

            # Evaluate on validation set
            model.begin_evaluation()
            loss_cv = 0
            for i, (x_cv, y_cv) in enumerate(cv_set):
                loss_batch_cv = model.evaluate(x_cv, y_cv)
                loss_cv += loss_batch_cv
            avg_loss_cv = loss_cv / cv_set_len
            checkpoint.cv_losses.append(avg_loss_cv)
            checkpoint.best_loss = (avg_loss_cv if checkpoint.best_loss is None
                                    else min(checkpoint.best_loss, avg_loss_cv))
            print('Average validation loss per batch:', avg_loss_cv)
            print('Best Loss:', checkpoint.best_loss)

            # Post evaluation model specific actions
            model.post_evaluation(checkpoint)
            print()

            # Checkpoint
            checkpoint.epoch += 1
            model.save_state(training_config.get_model_path('state'))
            checkpoint.save(training_config.get_model_path('checkpoint'))
            if checkpoint.best_loss == avg_loss_cv:
                model.save_state(training_config.get_model_path('state_best'))
                checkpoint.save(training_config.get_model_path('checkpoint_best'))

        print('Training complete')

        # Success email
        if email:
            emailer.sendmail(
                'Model Training Complete: {}'.format(training_config.name),
                'Model Config: {}\n Model Checkpoint: {}'.format(
                    str(training_config.get_dict()), str(checkpoint.get_dict())))
    except Exception as ex:
        print('Model Training Failed: {}'.format(str(ex)))

        # Failure email
        if email:
            emailer.sendmail(
                'Model Training Failed: {}'.format(training_config.name),
                'Error: {}'.format(traceback.format_exc()))
        raise
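# A minimal driver sketch (not part of the original script): loads one
# TrainingConfig from a hypothetical JSON path and runs the training loop above
# on GPU with per-epoch learning-curve plots. TrainingConfig is assumed to be
# defined in this module (it is referenced elsewhere as train.TrainingConfig).
if __name__ == '__main__':
    example_config = TrainingConfig.load_from_file('train_config.json')  # hypothetical path
    train(example_config, plot_learning_curves=True, cuda=True, email=False)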
n_classes = 100
x_train_pct, y_train_pct = m.sample_train(x_train, y_train, train_pct)
m.print_params(feature_extractor, embedding_dim, n_centers_per_class, n_classes,
               lr, sigma, batch_size, epochs, dataset, input_shape, patience)

for i in range(n_trials):
    rbf_model, softmax_model, embeddings = m.construct_models(
        feature_extractor, embedding_dim, n_centers_per_class, n_classes, lr, sigma)

    # Callbacks Setup
    callbacks = [m.EarlyStopping(monitor='val_loss', patience=patience)]
    callbacks2 = [m.EarlyStopping(monitor='val_loss', patience=patience),
                  m.ModelCheckpoint(filepath, monitor='val_loss', verbose=0,
                                    save_best_only=True, mode='min')]

    # Training Models

    ''' Softmax Model / Plain Model '''
    history_plain = softmax_model.fit(x_train_pct, y_train_pct, batch_size=batch_size,
                                      epochs=epochs, verbose=1,
                                      validation_data=(x_test, y_test),
                                      callbacks=callbacks2)

    ''' Pre trained Softmax Model.
def main(args):
    history = []

    # Command Line Arguments
    feature_extractor = args.feature_extractor
    filepath = args.file_path
    dataset = args.dataset
    n_trials = args.n_trials

    # Dataset Setup
    if dataset == "CIFAR10":
        x_train, x_test, x_val, y_train, y_test, y_val = adl.load_cifar10()
        n_classes = 10
    elif dataset == "CIFAR100":
        x_train, x_test, x_val, y_train, y_test, y_val = adl.load_cifar100()
        n_classes = 100
    elif dataset == "TinyImagenet":
        x_train, x_test, x_val, y_train, y_test, y_val = adl.load_tiny_imagenet()
        n_classes = 200

    for pct in ["10", "20", "30"]:
        x_train_pct, y_train_pct = x_train[pct], y_train[pct]
        m.print_params(feature_extractor, embedding_dim, n_centers_per_class, n_classes,
                       lr, sigma, batch_size, epochs, dataset, input_shape, patience)

        for i in range(n_trials):
            rbf_model, softmax_model, embeddings = m.construct_models(
                feature_extractor, embedding_dim, n_centers_per_class, n_classes, lr, sigma)

            # Callbacks Setup
            callbacks = [m.EarlyStopping(monitor='val_loss', patience=patience)]
            callbacks2 = [m.EarlyStopping(monitor='val_loss', patience=patience),
                          m.ModelCheckpoint(filepath, monitor='val_loss', verbose=0,
                                            save_best_only=True, mode='min')]

            # Training Models

            ''' Softmax Model / Plain Model '''
            print("Model with softmax layer")
            history_plain = softmax_model.fit(x_train_pct, y_train_pct,
                                              batch_size=batch_size, epochs=epochs,
                                              verbose=1, validation_data=(x_val, y_val),
                                              callbacks=callbacks2)
            # Restore the best softmax weights and record its test error
            softmax_model.load_weights(filepath)
            error_softmax = softmax_model.evaluate(x_test, y_test, verbose=0)

            ''' Pre trained Softmax Model. With K-Means Initialization. With Gauss Kernel. '''
            print("Model with gauss kernel and initialization")
            rbf_model, softmax_model, embeddings = m.construct_models(
                feature_extractor, embedding_dim, n_centers_per_class, n_classes, lr, sigma,
                kernel_type="gauss")
            softmax_model.load_weights(filepath)
            init_keys = m.get_initial_weights(embeddings, x_train_pct, y_train_pct,
                                              n_centers_per_class, n_classes, embedding_dim,
                                              init_method="KMEANS")
            rbf_model.layers[-1].set_keys(init_keys)
            history_gauss_kmeans = rbf_model.fit(x_train_pct, y_train_pct,
                                                 batch_size=batch_size, epochs=epochs,
                                                 verbose=1, validation_data=(x_val, y_val),
                                                 callbacks=callbacks)
            error_rbf_kmeans = rbf_model.evaluate(x_test, y_test, verbose=0)

            ''' Non pre trained Model. Without Initialization. With Gauss Kernel. '''
            print("Model with gauss kernel and without initialization")
            rbf_model, _, _ = m.construct_models(feature_extractor, embedding_dim,
                                                 n_centers_per_class, n_classes, lr, sigma,
                                                 kernel_type="gauss")
            history_gauss = rbf_model.fit(x_train_pct, y_train_pct,
                                          batch_size=batch_size, epochs=epochs, verbose=1,
                                          validation_data=(x_val, y_val),
                                          callbacks=callbacks)
            error_rbf = rbf_model.evaluate(x_test, y_test, verbose=0)

            # Record of Highest Validation Accuracies
            highest_plain = np.max(history_plain.history["val_acc"])
            highest_gauss_kmeans = np.max(history_gauss_kmeans.history["val_acc"])
            highest_gauss = np.max(history_gauss.history["val_acc"])
            history.append({"plain": highest_plain,
                            "gauss_means": highest_gauss_kmeans,
                            "gauss": highest_gauss,
                            "plain_error": error_softmax,
                            "error_rbf": error_rbf,
                            "error_rbf_kmeans": error_rbf_kmeans})

            with open("Train_Results_" + feature_extractor + pct + "_trial_" + str(i),
                      "wb") as f:
                pickle.dump(history, f)
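# A minimal CLI sketch (an assumption, not part of the original script). The flag
# names and defaults below are hypothetical; only the attribute names read in
# main() above (feature_extractor, file_path, dataset, n_trials) come from the
# original code.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Train plain softmax and RBF models on 10/20/30% of the training data')
    parser.add_argument('--feature_extractor', default='RESNET')       # hypothetical default
    parser.add_argument('--file_path', default='best_softmax.h5')      # hypothetical checkpoint path
    parser.add_argument('--dataset', default='CIFAR10',
                        choices=['CIFAR10', 'CIFAR100', 'TinyImagenet'])
    parser.add_argument('--n_trials', type=int, default=1)
    main(parser.parse_args())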
feature_extractor = str(sys.argv[1]).upper()
dataset = str(sys.argv[2]).upper()

# Models Weights Record
model_name_softmax = "model-" + str(int(train_pct * 100)) + "-" + str(n_centers_per_class) + "-softmax.h5"
model_name_gauss_kmeans = "model-" + str(int(train_pct * 100)) + "-" + str(n_centers_per_class) + "-gauss-kmeans.h5"
model_name_gauss_kmedoids = "model-" + str(int(train_pct * 100)) + "-" + str(n_centers_per_class) + "-gauss-kmedoids.h5"
model_name_gauss_no_init = "model-" + str(int(train_pct * 100)) + "-" + str(n_centers_per_class) + "-gauss-no-init.h5"

# Callbacks Setup
cbs_softmax = [m.EarlyStopping(monitor='val_loss', patience=patience),
               m.ModelCheckpoint(model_name_softmax, monitor='val_loss', verbose=0,
                                 save_best_only=True, mode='min')]
cbs_gauss_kmeans = [m.EarlyStopping(monitor='val_loss', patience=patience),
                    m.ModelCheckpoint(model_name_gauss_kmeans, monitor='val_loss', verbose=0,
                                      save_best_only=True, mode='min')]
cbs_gauss_kmedoids = [m.EarlyStopping(monitor='val_loss', patience=patience),
                      m.ModelCheckpoint(model_name_gauss_kmedoids, monitor='val_loss', verbose=0,
                                        save_best_only=True, mode='min')]
cbs_gauss_no_init = [m.EarlyStopping(monitor='val_loss', patience=patience),
                     m.ModelCheckpoint(model_name_gauss_no_init, monitor='val_loss', verbose=0,
                                       save_best_only=True, mode='min')]

# Dataset Setup