def run_hyper_search(config):
    """
    Important hyperparameters for tSTE:
        learning rate: [1, 0.1, 0.01]
    """
    dataset_name = config['dataset_selected']
    batch_size = config['batch_size']
    epochs = config['nb_epochs']
    input_dim = config['input_dimension']
    n_samples = config['number_of_points']
    number_of_test_triplets = config['n_test_triplets']
    log_dir = config['log']['path']
    triplet_multiplier_range = config['hyper_search']['triplets_multiplier']
    learning_rate_range = config['hyper_search']['learning_rate']
    optimizer = config['optimizer']
    dimensions_range = config['hyper_search']['output_dimension']

    separator = '_'
    experiment_name = 'tste_hyper_search_' + \
                      'data_' + dataset_name + \
                      '_input_dim_' + str(input_dim) + \
                      '_n_pts_' + str(n_samples) + \
                      '_num_test_trips_' + str(number_of_test_triplets) + \
                      '_output_dim_' + separator.join([str(i) for i in dimensions_range]) + \
                      '_lr_' + separator.join([str(i) for i in learning_rate_range]) + \
                      '_optimizer_' + str(optimizer) + \
                      '_bs_' + str(batch_size) + \
                      '_triplet_number_' + separator.join([str(i) for i in triplet_multiplier_range])

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    logging_path = os.path.join(log_dir, experiment_name + '.log')
    logger = logging_util.my_custom_logger(logger_name=logging_path, level=logging.INFO)

    logger.info('Name of Experiment: ' + experiment_name)
    logger.info('Logging Path: ' + logging_path)
    logger.info('Dataset Name: ' + dataset_name)
    logger.info('Epochs: ' + str(epochs))

    best_params_train = {}
    best_params_test = {}
    all_results = {}
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # input_dim is only used by the uniform dataset; it is ignored otherwise.
    vec_data, labels = select_dataset(dataset_name, n_samples=n_samples, input_dim=input_dim)
    n = vec_data.shape[0]
    logn = int(np.log2(n))

    for (emb_dim, triplet_multiplier) in product(dimensions_range, triplet_multiplier_range):
        all_results[(emb_dim, triplet_multiplier)] = {}
        best_train_error = 1
        best_test_error = 1

        # triplet budget scales as n * log2(n) * emb_dim, scaled by the multiplier
        triplet_num = triplet_multiplier * logn * n * emb_dim
        bs = min(batch_size, triplet_num)
        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, bs, device)

        logger.info('Testing on: ' + dataset_name + '. Embedding dimension is ' + str(emb_dim))
        logger.info(' ')

        for learning_rate in learning_rate_range:
            logger.info(10 * '-' + ' New parameters ' + 10 * '-')
            logger.info('Learning Rate: ' + str(learning_rate))
            logger.info('Number of Points: ' + str(n))
            logger.info('Input Dimension: ' + str(input_dim))
            logger.info('Output Dimension: ' + str(emb_dim))
            logger.info('Number of Test Triplets: ' + str(number_of_test_triplets))
            logger.info('Triplet Multiplier: ' + str(triplet_multiplier))
            logger.info('Batch Size: ' + str(batch_size))
            logger.info('Computing tSTE...')

            x, loss_history, triplet_error_history, time_taken, time_history = tste.t_ste_adam(
                triplets=train_triplets_dataset.trips_data_indices,
                n=n,
                emb_dim=emb_dim,
                epochs=epochs,
                batch_size=bs,
                learning_rate=learning_rate,
                device=device,
                logger=logger)

            # compute the triplet error on the training and held-out test triplets
            train_error = train_triplets_dataset.triplet_error(x)
            logger.info('Triplet Error on Training Triplets: ' + str(train_error))
            test_triplets_dataset = TripletBatchesDataset(vec_data, labels, number_of_test_triplets, 1000, device)
            test_error = test_triplets_dataset.triplet_error(x)
            # procrustes_error = procrustes_disparity(vec_data, x)
            # knn_error_ord_emb, knn_error_true_emb = knn_classification_error(x, vec_data, labels)

            logger.info('Epochs: ' + str(epochs))
            logger.info('Time Taken: ' + str(time_taken) + ' seconds.')
            logger.info('Train Error: ' + str(train_error))
            logger.info('Test Error: ' + str(test_error))
            # logger.info('Procrustes Disparity: ' + str(procrustes_error))
            # logger.info('kNN Classification Error on ground-truth: ' + str(knn_error_true_emb))
            # logger.info('kNN Classification Error on embedding: ' + str(knn_error_ord_emb))

            results = {'train_error': train_error,
                       'test_error': test_error,
                       'loss_history': loss_history,
                       'error_history': triplet_error_history,
                       'last_embedding': x}
            all_results[(emb_dim, triplet_multiplier)].update({learning_rate: results})

            if test_error < best_test_error:
                best_params_test[(emb_dim, triplet_multiplier)] = {'learning_rate': learning_rate,
                                                                   'optimizer': optimizer,
                                                                   'error': test_error}
                best_test_error = test_error
            if train_error < best_train_error:
                best_params_train[(emb_dim, triplet_multiplier)] = {'learning_rate': learning_rate,
                                                                    'optimizer': optimizer,
                                                                    'error': train_error}
                best_train_error = train_error

        result_name = 'tste_convergence_' + \
                      'data_' + dataset_name + \
                      '_input_dim_' + str(input_dim) + \
                      '_n_pts_' + str(n_samples) + \
                      '_output_dim_' + str(emb_dim) + \
                      '_bs_' + str(batch_size) + \
                      '_triplet_number_' + str(triplet_multiplier)
        all_results['labels'] = labels
        joblib.dump(all_results[(emb_dim, triplet_multiplier)], os.path.join(log_dir, result_name + '.pkl'))

    # log all results once more in one place
    logger.info(10 * '-' + ' ALL RESULTS ' + 10 * '-')
    for (emb_dim, triplet_multiplier) in product(dimensions_range, triplet_multiplier_range):
        results = all_results[(emb_dim, triplet_multiplier)]
        logger.info('Results for emb dimension ' + str(emb_dim) +
                    ' and triplet multiplier ' + str(triplet_multiplier))
        for learning_rate in learning_rate_range:
            logger.info('learning rate ' + str(learning_rate) +
                        ' -- train error: ' + str(results[learning_rate]['train_error']) +
                        ' test error: ' + str(results[learning_rate]['test_error']))

    # log the best parameter settings
    for (emb_dim, triplet_multiplier) in product(dimensions_range, triplet_multiplier_range):
        logger.info('Best Parameters for emb dimension ' + str(emb_dim) +
                    ' and triplet multiplier ' + str(triplet_multiplier))
        best_on_train = best_params_train[(emb_dim, triplet_multiplier)]
        best_on_test = best_params_test[(emb_dim, triplet_multiplier)]
        logger.info('achieved ' + str(best_on_train['error']) +
                    ' train error with learning rate: ' + str(best_on_train['learning_rate']))
        logger.info('achieved ' + str(best_on_test['error']) +
                    ' test error with learning rate: ' + str(best_on_test['learning_rate']))
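
# For reference, a minimal config for `run_hyper_search` might look as follows.
# Only the key layout is implied by the lookups above; the concrete values are
# illustrative guesses, not defaults shipped with this code:
#
# example_hyper_config = {
#     'dataset_selected': 'uniform',
#     'batch_size': 50000,
#     'nb_epochs': 200,
#     'input_dimension': 10,
#     'number_of_points': 1000,
#     'n_test_triplets': 100000,
#     'optimizer': 'adam',
#     'log': {'path': 'logs/'},
#     'hyper_search': {
#         'activation': True,
#         'triplets_multiplier': [1, 2],
#         'learning_rate': [1, 0.1, 0.01],  # the range suggested in the docstring
#         'output_dimension': [2, 10],
#     },
# }
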
def main(args):
    config = load_config(args.config_path)
    dataset_name = config['dataset_selected']
    error_change_threshold = config['error_change_threshold']
    batch_size = config['batch_size']
    learning_rate = config['optimizer_params']['learning_rate']
    epochs = config['nb_epochs']
    input_dim = config['input_dimension']
    embedding_dimension = config['output_dimension']
    n_samples = config['number_of_points']
    number_of_test_triplets = config['n_test_triplets']
    triplet_multiplier = config['triplets_multiplier']
    log_dir = config['log']['path']
    hyper_search = config['hyper_search']['activation']
    optimizer = config['optimizer']

    if hyper_search:
        run_hyper_search(config=config)
    else:
        vec_data, labels = select_dataset(dataset_name, n_samples=n_samples, input_dim=input_dim)
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        n = vec_data.shape[0]
        logn = int(np.log2(n))
        triplet_num = triplet_multiplier * logn * n * embedding_dimension
        bs = min(batch_size, triplet_num)

        if not os.path.exists(log_dir):
            os.makedirs(log_dir)

        experiment_name = 'tste_' + \
                          'data_' + dataset_name + \
                          '_error_change_threshold_' + str(error_change_threshold) + \
                          '_input_dim_' + str(input_dim) + \
                          '_output_dim_' + str(embedding_dimension) + \
                          '_originaldimension_' + str(vec_data.shape[1]) + \
                          '_triplet_num_' + str(triplet_multiplier) + \
                          '_n_pts_' + str(n) + \
                          '_lr_' + str(learning_rate) + \
                          '_optimizer_' + str(optimizer) + \
                          '_bs_' + str(batch_size)

        # create a log file for extensive logging
        logging_path = os.path.join(log_dir, experiment_name + '.log')
        logger = logging_util.my_custom_logger(logger_name=logging_path, level=logging.INFO)

        logger.info('Name of Experiment: ' + experiment_name)
        logger.info('Logging Path: ' + logging_path)
        logger.info('Dataset Name: ' + dataset_name)
        logger.info('Error Change Threshold: ' + str(error_change_threshold))
        logger.info('Epochs: ' + str(epochs))
        logger.info('Learning Rate: ' + str(learning_rate))
        logger.info('Number of Points: ' + str(n))
        logger.info('Input Dimension: ' + str(input_dim))
        logger.info('Output Dimension: ' + str(embedding_dimension))
        logger.info('Number of Test Triplets: ' + str(number_of_test_triplets))
        logger.info('Triplet Multiplier: ' + str(triplet_multiplier))
        logger.info('Batch Size: ' + str(batch_size))

        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, bs, device)

        logger.info('Computing tSTE...')
        x, loss_history, triplet_error_history, time_taken, time_history = tste.t_ste_adam(
            triplets=train_triplets_dataset.trips_data_indices,
            n=n,
            emb_dim=embedding_dimension,
            epochs=epochs,
            batch_size=bs,
            learning_rate=learning_rate,
            device=device,
            logger=logger,
            error_change_threshold=error_change_threshold)

        logger.info('Evaluating the computed embeddings...')
        # compute the triplet error on the training and held-out test triplets
        train_error = train_triplets_dataset.triplet_error(x)
        test_triplets_dataset = TripletBatchesDataset(vec_data, labels, number_of_test_triplets, 1000, device)
        test_error = test_triplets_dataset.triplet_error(x)
        procrustes_error = procrustes_disparity(vec_data, x)
        knn_error_ord_emb, knn_error_true_emb = knn_classification_error(x, vec_data, labels)

        # subsample at most 500 points for the t-SNE visualization; keep the
        # full embedding `x` intact so the complete result is saved below
        subsample = np.random.permutation(n)[0:500]
        x_sub = x[subsample, :]
        sub_labels = labels[subsample]
        x_embedded = TSNE(n_components=2, perplexity=15, learning_rate=10).fit_transform(x_sub)
        fig, ax = plt.subplots(1, 1)
        ax.scatter(x_embedded[:, 0], x_embedded[:, 1], s=3, c=sub_labels)
        fig.savefig(os.path.join(log_dir, experiment_name + '.png'))

        logger.info('Name of Experiment: ' + experiment_name)
        logger.info('Epochs: ' + str(epochs))
        logger.info('Time Taken: ' + str(time_taken) + ' seconds.')
        logger.info('Train Error: ' + str(train_error))
        logger.info('Test Error: ' + str(test_error))
        logger.info('Procrustes Disparity: ' + str(procrustes_error))
        logger.info('kNN Classification Error on ground-truth: ' + str(knn_error_true_emb))
        logger.info('kNN Classification Error on embedding: ' + str(knn_error_ord_emb))

        results = {'train_error': train_error,
                   'test_error': test_error,
                   'procrustes': procrustes_error,
                   'knn_true': knn_error_true_emb,
                   'knn_ord_emb': knn_error_ord_emb,
                   'labels': labels,
                   'loss_history': loss_history,
                   'error_history': triplet_error_history,
                   'ordinal_embedding': x,
                   'time_taken': time_taken}
        joblib.dump(results, os.path.join(log_dir, experiment_name + '.pkl'))
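
# Note on the triplet budget used throughout this file: the number of training
# triplets scales as triplet_multiplier * n * log2(n) * d, the same scaling in
# run_hyper_search, main and run_method. As a concrete example, with n = 1000
# points, output dimension d = 2 and triplet_multiplier = 1, this yields
# int(log2(1000)) * 1000 * 2 = 9 * 1000 * 2 = 18000 triplets.
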
def run_method(config, dataset_name, algorithm, n_points, input_dim, embedding_dimension,
               learning_rate, batch_size, triplet_multiplier, optimizer, epochs,
               n_test_triplets, logger, error_change_threshold):
    vec_data, labels = select_dataset(dataset_name, n_samples=n_points, input_dim=input_dim)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    n_points = vec_data.shape[0]
    # np.int was removed in NumPy 1.24; the builtin int behaves identically here
    triplet_num = int(np.ceil(triplet_multiplier * n_points * math.log2(n_points) * embedding_dimension))

    train_triplets = []
    loss_history, triplet_error_history, time_history = [], [], []
    batch_size = min(batch_size, triplet_num)

    logger.info('Computing Embedding...')
    logger.info('Number of Points: ' + str(n_points))
    logger.info('Number of Triplets: ' + str(triplet_num))
    logger.info('Input Dimension: ' + str(input_dim))
    logger.info('Output Dimension: ' + str(embedding_dimension))

    time_taken = 0
    train_error = -1  # active methods do not have a train error

    if optimizer == 'adam' and algorithm == 'soe':
        logger.info('Generating triplets...')
        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, batch_size, device)
        train_triplets = train_triplets_dataset.trips_data_indices
        x, loss_history, triplet_error_history, time_taken, time_history = soe.soe_adam(
            triplets=train_triplets, n=n_points, dim=embedding_dimension, epochs=epochs,
            batch_size=batch_size, learning_rate=learning_rate, device=device, logger=logger,
            error_change_threshold=error_change_threshold)
        # compute the triplet error on the training triplets
        train_error = triplet_error_batches(x, train_triplets)
    elif optimizer == 'sgd' and algorithm == 'soe':
        logger.info('Generating triplets...')
        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, batch_size, device)
        train_triplets = train_triplets_dataset.trips_data_indices
        x, loss_history, triplet_error_history, time_taken, time_history = soe.soe_sgd(
            triplets=train_triplets, n=n_points, dim=embedding_dimension, iterations=epochs,
            bs=batch_size, lr=learning_rate, device=device, logger=logger)
        train_error = triplet_error_batches(x, train_triplets)
    elif algorithm == 'ste':
        logger.info('Generating triplets...')
        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, batch_size, device)
        train_triplets = train_triplets_dataset.trips_data_indices
        x, loss_history, triplet_error_history, time_taken, time_history = ste.ste_adam(
            triplets=train_triplets, n=n_points, dim=embedding_dimension, epochs=epochs,
            batch_size=batch_size, learning_rate=learning_rate, device=device, logger=logger,
            error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, train_triplets)
    elif algorithm == 'tste':
        logger.info('Generating triplets...')
        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, batch_size, device)
        train_triplets = train_triplets_dataset.trips_data_indices
        x, loss_history, triplet_error_history, time_taken, time_history = tste.t_ste_adam(
            triplets=train_triplets, n=n_points, emb_dim=embedding_dimension, epochs=epochs,
            batch_size=batch_size, learning_rate=learning_rate, device=device, logger=logger,
            error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, train_triplets)
    elif algorithm == 'triplet_loss':
        logger.info('Generating triplets...')
        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, batch_size, device)
        train_triplets = train_triplets_dataset.trips_data_indices
        x, loss_history, triplet_error_history, time_taken, time_history = soe.triplet_loss_adam(
            triplets=train_triplets, n=n_points, dim=embedding_dimension, iterations=epochs,
            batch_size=batch_size, lr=learning_rate, device=device, logger=logger,
            error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, train_triplets)
    elif algorithm == 'gnmds':
        regularizer = config['regularizer']
        logger.info('Generating triplets...')
        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, batch_size, device)
        train_triplets = train_triplets_dataset.trips_data_indices
        x, loss_history, triplet_error_history, time_taken, time_history = gnmds.gnmds(
            triplets=train_triplets, reg_lbda=regularizer, n=n_points, dim=embedding_dimension,
            epochs=epochs, batch_size=batch_size, learning_rate=learning_rate, device=device,
            logger=logger, error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, train_triplets)
    elif algorithm == 'forte':
        logger.info('Generating triplets...')
        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, batch_size, device)
        train_triplets = train_triplets_dataset.trips_data_indices
        x, loss_history, triplet_error_history, time_taken, time_history = forte.rank_d_pgd(
            triplets=train_triplets, n=n_points, dim=embedding_dimension, epochs=epochs,
            batch_size=batch_size, learning_rate=learning_rate, device=device, logger=logger,
            error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, train_triplets)
    elif algorithm == 'ckl':
        regularizer = config['regularizer']
        mu = config['mu']
        logger.info('Generating triplets...')
        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, batch_size, device)
        train_triplets = train_triplets_dataset.trips_data_indices
        x, loss_history, triplet_error_history, time_taken, time_history = ckl.ckl_k(
            triplets=train_triplets, reg_lbda=regularizer, mu=mu, n=n_points,
            dim=embedding_dimension, epochs=epochs, batch_size=batch_size,
            learning_rate=learning_rate, device=device, logger=logger,
            error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, train_triplets)
    elif algorithm == 'ckl_x':
        mu = config['mu']
        logger.info('Generating triplets...')
        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, batch_size, device)
        train_triplets = train_triplets_dataset.trips_data_indices
        x, loss_history, triplet_error_history, time_taken, time_history = ckl.ckl_x(
            triplets=train_triplets, mu=mu, n=n_points, dim=embedding_dimension, epochs=epochs,
            batch_size=batch_size, learning_rate=learning_rate, device=device, logger=logger,
            error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, train_triplets)
    elif algorithm == 'loe':
        x, time_taken, train_error = landmark_oe.landmark_oe_with_data(
            data=vec_data, dim=embedding_dimension, trip_num=triplet_num,
            learning_rate=learning_rate, epochs=epochs, batch_size=batch_size,
            device=device, logger=logger)
    elif algorithm == 'oenn':
        number_of_neighbours = 50  # config['number_of_neighbours']
        metric = 'eu'  # config['metric']
        all_triplets, triplet_loaders = data_utils_oenn.prep_data_for_nn(
            vec_data=vec_data, labels=labels, triplet_num=triplet_num,
            batch_size=batch_size, metric=metric,
            number_of_neighbours=number_of_neighbours)
        # hidden-layer size heuristic for the triplet network
        hl_size = int(120 + (2 * embedding_dimension * math.log2(n_points)))
        x, loss_history, triplet_error_history, time_taken, time_history = \
            training_routine_v3.create_and_train_triplet_network(
                dataset_name=dataset_name, ind_loaders=triplet_loaders, n=n_points,
                dim=embedding_dimension, layers=3, learning_rate=learning_rate, epochs=epochs,
                hl_size=hl_size, batch_size=batch_size, number_of_triplets=triplet_num,
                logger=logger, error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, all_triplets)
    elif algorithm == 'lloe':
        num_landmarks = config['optimizer_params']['num_landmarks']
        subset_size = config['optimizer_params']['subset_size']
        phase1_learning_rate = config['optimizer_params']['phase1_learning_rate']
        phase2_learning_rate = config['optimizer_params']['phase2_learning_rate']
        target_loss = config['optimizer_params']['target_loss']

        number_of_landmarks = min(int(num_landmarks * n_points), 100)
        subset_size = subset_size * number_of_landmarks

        # first phase: embed a subset of landmark points with SOE
        landmarks, first_phase_indices, \
            first_phase_subset_size, first_phase_reconstruction, \
            first_phase_loss, first_phase_triplet_error, time_first_phase = first_phase_soe(
                num_landmarks=number_of_landmarks, subset_size=subset_size, data=vec_data,
                dataset_size=n_points, embedding_dim=embedding_dimension, epochs=epochs,
                first_phase_lr=phase1_learning_rate, device=device, target_loss=target_loss,
                batch_size=batch_size, logger=logger)
        embedded_indices = first_phase_indices
        embedded_points = first_phase_reconstruction
        non_embedded_indices = list(set(range(vec_data.shape[0])).difference(set(embedded_indices)))

        my_oracle = Oracle(data=vec_data)
        logger.info('Second Phase: ')
        logger.info('Oracle Created...')
        logger.info('Computing LLOE - Phase 2...')
        logger.info('Time taken for first phase: ' + str(time_first_phase) + ' seconds.')

        # second phase: embed the remaining points with point-by-point updates
        second_phase_embeddings_index, \
            second_phase_embeddings, time_second_phase = second_phase(
                my_oracle=my_oracle, non_embedded_indices=non_embedded_indices,
                embedded_indices=embedded_indices, first_phase_embedded_points=embedded_points,
                dim=embedding_dimension, lr=phase2_learning_rate, logger=logger)

        # combine the points and indices of the two phases
        x = np.zeros((vec_data.shape[0], embedding_dimension))
        x[embedded_indices] = embedded_points  # first-phase points
        x[second_phase_embeddings_index] = second_phase_embeddings  # second-phase points
        time_taken = time_first_phase + time_second_phase

    logger.info('Time Taken for experiment ' + str(time_taken) + ' seconds.')
    logger.info('Evaluating the computed embeddings...')
    test_triplets_dataset = TripletBatchesDataset(vec_data, labels, n_test_triplets, 1000, device)
    test_error = test_triplets_dataset.triplet_error(x)
    procrustes_error = procrustes_disparity(vec_data, x)
    knn_error_ord_emb, knn_error_true_emb = knn_classification_error(x, vec_data, labels)

    # log the errors
    logger.info('Train Error: ' + str(train_error))
    logger.info('Test Error: ' + str(test_error))
    logger.info('Procrustes Disparity: ' + str(procrustes_error))
    logger.info('kNN Classification Error on ground-truth: ' + str(knn_error_true_emb))
    logger.info('kNN Classification Error on embedding: ' + str(knn_error_ord_emb))

    return x, train_triplets, labels, train_error, test_error, procrustes_error, \
        knn_error_true_emb, knn_error_ord_emb, time_taken, loss_history, \
        triplet_error_history, time_history
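
# A minimal command-line entry point, assuming this module is run as a script.
# `main` only reads `args.config_path`, so a single flag suffices; the flag
# name below is an illustrative choice, not taken from the original code.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Run tSTE ordinal embedding experiments.')
    parser.add_argument('--config_path', type=str, required=True,
                        help='Path to the experiment configuration file read by load_config.')
    main(parser.parse_args())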