def main(args):
    config = load_config(args.config_path)
    dataset_name = config['dataset_selected']
    error_change_threshold = config['error_change_threshold']
    batch_size = config['batch_size']
    learning_rate = config['optimizer_params']['learning_rate']
    epochs = config['nb_epochs']
    input_dim = config['input_dimension']
    embedding_dimension = config['output_dimension']
    n_samples = config['number_of_points']
    number_of_test_triplets = config['n_test_triplets']
    triplet_multiplier = config['triplets_multiplier']
    log_dir = config['log']['path']
    hyper_search = config['hyper_search']['activation']
    optimizer = config['optimizer']

    if hyper_search:
        run_hyper_search(config=config)
    else:
        vec_data, labels = select_dataset(dataset_name, n_samples=n_samples, input_dim=input_dim)
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        n = vec_data.shape[0]
        logn = int(np.log2(n))
        triplet_num = triplet_multiplier * logn * n * embedding_dimension
        bs = min(batch_size, triplet_num)

        if not os.path.exists(log_dir):
            os.makedirs(log_dir)

        experiment_name = 'tste_' + \
                          'data_' + dataset_name + \
                          '_error_change_threshold_' + str(error_change_threshold) + \
                          '_input_dim_' + str(input_dim) + \
                          '_output_dim_' + str(embedding_dimension) + \
                          '_originaldimension_' + str(vec_data.shape[1]) + \
                          '_triplet_num_' + str(triplet_multiplier) + \
                          '_n_pts_' + str(n) + \
                          '_lr_' + str(learning_rate) + \
                          '_optimizer_' + str(optimizer) + \
                          '_bs_' + str(batch_size)

        # create a logging file for extensive logging
        logging_path = os.path.join(log_dir, experiment_name + '.log')
        logger = logging_util.my_custom_logger(logger_name=logging_path, level=logging.INFO)

        logger.info('Name of Experiment: ' + experiment_name)
        logger.info('Logging Path: ' + logging_path)
        logger.info('Dataset Name: ' + dataset_name)
        logger.info('Error Change Threshold: ' + str(error_change_threshold))
        logger.info('Epochs: ' + str(epochs))
        logger.info('Learning Rate: ' + str(learning_rate))
        logger.info('Number of Points: ' + str(n))
        logger.info('Input Dimension: ' + str(input_dim))
        logger.info('Output Dimension: ' + str(embedding_dimension))
        logger.info('Number of Test Triplets: ' + str(number_of_test_triplets))
        logger.info('Triplet Multiplier: ' + str(triplet_multiplier))
        logger.info('Batch Size: ' + str(batch_size))

        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, bs, device)
        logger.info('Computing t-STE...')
        x, loss_history, triplet_error_history, time_taken, time_history = tste.t_ste_adam(
            triplets=train_triplets_dataset.trips_data_indices,
            n=n, emb_dim=embedding_dimension,
            epochs=epochs, batch_size=bs,
            learning_rate=learning_rate, device=device, logger=logger,
            error_change_threshold=error_change_threshold)

        logger.info('Evaluating the computed embeddings...')
        # compute triplet error for train and test data
        train_error = train_triplets_dataset.triplet_error(x)
        test_triplets_dataset = TripletBatchesDataset(vec_data, labels, number_of_test_triplets, 1000, device)
        test_error = test_triplets_dataset.triplet_error(x)
        procrustes_error = procrustes_disparity(vec_data, x)
        knn_error_ord_emb, knn_error_true_emb = knn_classification_error(x, vec_data, labels)

        # sample points for t-SNE visualization; keep the full embedding in x
        # so the complete result is dumped below, and visualize a subsample
        subsample = np.random.permutation(n)[0:500]
        x_sub = x[subsample, :]
        sub_labels = labels[subsample]
        x_embedded = TSNE(n_components=2, perplexity=15, learning_rate=10).fit_transform(x_sub)

        fig, ax = plt.subplots(1, 1)
        ax.scatter(x_embedded[:, 0], x_embedded[:, 1], s=3, c=sub_labels)
        fig.savefig(os.path.join(log_dir, experiment_name + '.png'))

        logger.info('Name of Experiment: ' + experiment_name)
        logger.info('Epochs: ' + str(epochs))
        logger.info('Time Taken: ' + str(time_taken) + ' seconds.')
        logger.info('Train Error: ' + str(train_error))
        logger.info('Test Error: ' + str(test_error))
        logger.info('Procrustes Disparity: ' + str(procrustes_error))
        logger.info('kNN Classification Error on ground-truth: ' + str(knn_error_true_emb))
        logger.info('kNN Classification Error on embedding: ' + str(knn_error_ord_emb))

        results = {'train_error': train_error,
                   'test_error': test_error,
                   'procrustes': procrustes_error,
                   'knn_true': knn_error_true_emb,
                   'knn_ord_emb': knn_error_ord_emb,
                   'labels': labels,
                   'loss_history': loss_history,
                   'error_history': triplet_error_history,
                   'ordinal_embedding': x,
                   'time_taken': time_taken}
        joblib.dump(results, os.path.join(log_dir, experiment_name + '.pkl'))
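

# For reference, a hypothetical config illustrating the keys that main() reads
# above. The key names are taken from the code; the values and the on-disk
# format consumed by load_config (e.g. JSON or YAML) are placeholders, not the
# repository's actual defaults.
EXAMPLE_TSTE_CONFIG = {
    'dataset_selected': 'mnist',            # passed to select_dataset
    'error_change_threshold': 0.005,        # early-stopping criterion for t-STE
    'batch_size': 50000,
    'optimizer_params': {'learning_rate': 0.1},
    'nb_epochs': 100,
    'input_dimension': 10,                  # only used by synthetic datasets
    'output_dimension': 2,
    'number_of_points': 1000,
    'n_test_triplets': 100000,
    'triplets_multiplier': 1,               # triplet_num = multiplier * log2(n) * n * emb_dim
    'log': {'path': 'logs/'},
    'hyper_search': {'activation': False},
    'optimizer': 'adam',
}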
def run_hyper_search(config):
    """
    Important hyperparameters for t-STE:
    Learning Rate: [1, 0.1, 0.01]
    """
    dataset_name = config['dataset_selected']
    batch_size = config['batch_size']
    epochs = config['nb_epochs']
    input_dim = config['input_dimension']
    n_samples = config['number_of_points']
    number_of_test_triplets = config['n_test_triplets']
    log_dir = config['log']['path']
    triplet_multiplier_range = config['hyper_search']['triplets_multiplier']
    learning_rate_range = config['hyper_search']['learning_rate']
    optimizer = config['optimizer']
    dimensions_range = config['hyper_search']['output_dimension']

    separator = '_'
    experiment_name = 'tste_hyper_search_' + \
                      'data_' + dataset_name + \
                      '_input_dim_' + str(input_dim) + \
                      '_n_pts_' + str(n_samples) + \
                      '_num_test_trips_' + str(number_of_test_triplets) + \
                      '_output_dim_' + separator.join([str(i) for i in dimensions_range]) + \
                      '_lr_' + separator.join([str(i) for i in learning_rate_range]) + \
                      '_optimizer_' + str(optimizer) + \
                      '_bs_' + str(batch_size) + \
                      '_triplet_number_' + separator.join([str(i) for i in triplet_multiplier_range])

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    logging_path = os.path.join(log_dir, experiment_name + '.log')
    logger = logging_util.my_custom_logger(logger_name=logging_path, level=logging.INFO)

    logger.info('Name of Experiment: ' + experiment_name)
    logger.info('Logging Path: ' + logging_path)
    logger.info('Dataset Name: ' + dataset_name)
    logger.info('Epochs: ' + str(epochs))

    best_params_train = {}
    best_params_test = {}
    all_results = {}
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # input_dim is only used for the uniform dataset and ignored otherwise
    vec_data, labels = select_dataset(dataset_name, n_samples=n_samples, input_dim=input_dim)
    n = vec_data.shape[0]
    logn = int(np.log2(n))

    for (emb_dim, triplet_multiplier) in product(dimensions_range, triplet_multiplier_range):
        all_results[(emb_dim, triplet_multiplier)] = {}
        best_train_error = 1
        best_test_error = 1
        triplet_num = triplet_multiplier * logn * n * emb_dim
        bs = min(batch_size, triplet_num)
        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, bs, device)
        logger.info('Testing on: ' + dataset_name + '. Embedding dimension is ' + str(emb_dim))
        logger.info(' ')

        for learning_rate in learning_rate_range:
            logger.info(10 * '-' + ' New parameters ' + 10 * '-')
            logger.info('Learning Rate: ' + str(learning_rate))
            logger.info('Number of Points: ' + str(n))
            logger.info('Input Dimension: ' + str(input_dim))
            logger.info('Output Dimension: ' + str(emb_dim))
            logger.info('Number of Test Triplets: ' + str(number_of_test_triplets))
            logger.info('Triplet Multiplier: ' + str(triplet_multiplier))
            logger.info('Batch Size: ' + str(batch_size))
            logger.info('Computing t-STE...')
            x, loss_history, triplet_error_history, time_taken, time_history = tste.t_ste_adam(
                triplets=train_triplets_dataset.trips_data_indices,
                n=n, emb_dim=emb_dim,
                epochs=epochs, batch_size=bs,
                learning_rate=learning_rate, device=device, logger=logger)

            # compute triplet error for train and test data
            train_error = train_triplets_dataset.triplet_error(x)
            logger.info('Triplet Error on Training Triplets: ' + str(train_error))
            test_triplets_dataset = TripletBatchesDataset(vec_data, labels, number_of_test_triplets, 1000, device)
            test_error = test_triplets_dataset.triplet_error(x)
            # procrustes_error = procrustes_disparity(vec_data, x)
            # knn_error_ord_emb, knn_error_true_emb = knn_classification_error(x, vec_data, labels)

            logger.info('Epochs: ' + str(epochs))
            logger.info('Time Taken: ' + str(time_taken) + ' seconds.')
            logger.info('Train Error: ' + str(train_error))
            logger.info('Test Error: ' + str(test_error))
            # logger.info('Procrustes Disparity: ' + str(procrustes_error))
            # logger.info('kNN Classification Error on ground-truth: ' + str(knn_error_true_emb))
            # logger.info('kNN Classification Error on embedding: ' + str(knn_error_ord_emb))

            results = {'train_error': train_error,
                       'test_error': test_error,
                       'loss_history': loss_history,
                       'error_history': triplet_error_history,
                       'last_embedding': x}
            all_results[(emb_dim, triplet_multiplier)].update({learning_rate: results})

            if test_error < best_test_error:
                best_params_test[(emb_dim, triplet_multiplier)] = {'learning_rate': learning_rate,
                                                                   'optimizer': optimizer,
                                                                   'error': test_error}
                best_test_error = test_error
            if train_error < best_train_error:
                best_params_train[(emb_dim, triplet_multiplier)] = {'learning_rate': learning_rate,
                                                                    'optimizer': optimizer,
                                                                    'error': train_error}
                best_train_error = train_error

        result_name = 'tste_convergence_' + \
                      'data_' + dataset_name + \
                      '_input_dim_' + str(input_dim) + \
                      '_n_pts_' + str(n_samples) + \
                      '_output_dim_' + str(emb_dim) + \
                      '_bs_' + str(batch_size) + \
                      '_triplet_number_' + str(triplet_multiplier)
        all_results['labels'] = labels
        joblib.dump(all_results[(emb_dim, triplet_multiplier)], os.path.join(log_dir, result_name + '.pkl'))

    # print all results again
    logger.info(10 * '-' + ' ALL RESULTS ' + 10 * '-')
    for (emb_dim, triplet_multiplier) in product(dimensions_range, triplet_multiplier_range):
        results = all_results[(emb_dim, triplet_multiplier)]
        logger.info('Results for emb dimension ' + str(emb_dim) +
                    ' and triplet multiplier ' + str(triplet_multiplier))
        for learning_rate in learning_rate_range:
            logger.info('learning rate ' + str(learning_rate) +
                        ' -- train error: ' + str(results[learning_rate]['train_error']) +
                        ' test error: ' + str(results[learning_rate]['test_error']))

    # print best parameter settings
    for (emb_dim, triplet_multiplier) in product(dimensions_range, triplet_multiplier_range):
        logger.info('Best Parameters for emb dimension ' + str(emb_dim) +
                    ' and triplet multiplier ' + str(triplet_multiplier))
        best_on_train = best_params_train[(emb_dim, triplet_multiplier)]
        best_on_test = best_params_test[(emb_dim, triplet_multiplier)]
        logger.info('achieved ' + str(best_on_train['error']) +
                    ' train error with learning rate: ' + str(best_on_train['learning_rate']))
        logger.info('achieved ' + str(best_on_test['error']) +
                    ' test error with learning rate: ' + str(best_on_test['learning_rate']))
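

# A minimal entry-point sketch for this script. This is an assumption (the
# repository may already ship its own CLI wrapper); main(args) above only
# requires an args object with a config_path attribute.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Run t-STE on a selected dataset.')
    parser.add_argument('--config_path', type=str, required=True,
                        help='Path to the experiment config file read by load_config.')
    main(parser.parse_args())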
def main(args):
    config = load_config(args.config_path)
    dataset_name = config['dataset_selected']
    batch_size = config['batch_size']
    phase1_learning_rate = config['optimizer_params']['phase1_learning_rate']
    phase2_learning_rate = config['optimizer_params']['phase2_learning_rate']
    num_landmarks = config['optimizer_params']['num_landmarks']
    subset_size = config['optimizer_params']['subset_size']
    target_loss = config['optimizer_params']['target_loss']
    epochs = config['nb_epochs']
    input_dim = config['input_dimension']
    embedding_dimension = config['output_dimension']
    n_samples = config['number_of_points']
    number_of_test_triplets = config['n_test_triplets']
    log_dir = config['log']['path']
    hyper_search = config['hyper_search']['activation']

    if hyper_search:
        run_hyper_search(config=config)
    else:
        vec_data, labels = select_dataset(dataset_name=dataset_name, input_dim=input_dim, n_samples=n_samples)
        n_points = vec_data.shape[0]  # do not remove
        number_of_landmarks = min(int(num_landmarks * n_points), 100)
        subset_size = subset_size * number_of_landmarks

        experiment_name = 'lsoe_' + 'data_' + dataset_name \
                          + '_input_dim_' + str(input_dim) \
                          + '_emb_dimension_' + str(embedding_dimension) \
                          + '_originaldimension_' + str(vec_data.shape[1]) \
                          + '_n_' + str(n_samples) \
                          + '_landmarks_' + str(number_of_landmarks) \
                          + '_bs_' + str(batch_size) \
                          + '_pplr_' + str(phase2_learning_rate) \
                          + '_soe_lr_' + str(phase1_learning_rate) \
                          + '_epochs_' + str(epochs)

        if not os.path.exists(log_dir):
            os.makedirs(log_dir)

        logging_path = os.path.join(log_dir, experiment_name + '.log')
        logger = logging_util.my_custom_logger(logger_name=logging_path, level=logging.INFO)

        logger.info('Name of Experiment: ' + experiment_name)
        logger.info('Dataset Name: ' + dataset_name)
        logger.info('Number of Points: ' + str(n_samples))
        logger.info('Dataset Dimension: ' + str(input_dim))
        logger.info('Number of Landmarks: ' + str(number_of_landmarks))
        logger.info('Subset Size: ' + str(subset_size))
        logger.info('First Phase Epochs: ' + str(epochs))

        # set the gpu id
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        logger.info('Computing SOE - Phase 1...')
        # first phase of the algorithm: embed landmarks and a subset via SOE
        landmarks, first_phase_indices, \
            first_phase_subset_size, first_phase_reconstruction, \
            first_phase_loss, first_phase_triplet_error, time_first_phase = first_phase_soe(
                num_landmarks=number_of_landmarks, subset_size=subset_size,
                data=vec_data, dataset_size=n_points,
                embedding_dim=embedding_dimension, epochs=epochs,
                first_phase_lr=phase1_learning_rate, device=device,
                target_loss=target_loss, batch_size=batch_size, logger=logger)

        logger.info('First Phase Loss: ' + str(first_phase_loss))
        logger.info('First Phase Triplet Error: ' + str(first_phase_triplet_error))
        logger.info('First Phase Number of Landmarks: ' + str(landmarks.shape))
        logger.info('First Phase Indices Number: ' + str(len(first_phase_indices)))
        logger.info('First Phase Reconstruction Size: ' + str(first_phase_reconstruction.shape))

        embedded_indices = first_phase_indices
        embedded_points = first_phase_reconstruction
        non_embedded_indices = list(set(range(vec_data.shape[0])).difference(set(embedded_indices)))

        my_oracle = Oracle(data=vec_data)
        logger.info('Second Phase: ')
        logger.info('Oracle Created...')
        logger.info('Computing LLOE - Phase 2...')
        # second phase: embed the remaining points one by one
        second_phase_embeddings_index, \
            second_phase_embeddings, time_second_phase = second_phase(
                my_oracle=my_oracle,
                non_embedded_indices=non_embedded_indices,
                embedded_indices=embedded_indices,
                first_phase_embedded_points=embedded_points,
                dim=embedding_dimension,
                lr=phase2_learning_rate, logger=logger)

        # combine the first and second phase points and indices
        final_embedding = np.zeros((vec_data.shape[0], embedding_dimension))
        final_embedding[embedded_indices] = embedded_points                      # phase 1 points
        final_embedding[second_phase_embeddings_index] = second_phase_embeddings # phase 2 points

        time_taken = time_first_phase + time_second_phase
        logger.info('Size of Dataset: ' + str(vec_data.shape[0]))
        logger.info('Size of First Phase Indices: ' + str(len(embedded_indices)))
        logger.info('Size of Second Phase Indices: ' + str(len(second_phase_embeddings_index)))

        # Evaluation
        logger.info('Evaluation of the Complete Embedding Dataset: ')
        random_trip_indices = gen_triplet_indices(n=vec_data.shape[0], num_trips=number_of_test_triplets)
        test_triplet_data = gen_triplet_data(data=vec_data, random_triplet_indices=random_trip_indices,
                                             batch_size=1000)
        test_error, embedding_error_list = triplet_error(final_embedding, test_triplet_data)
        procrustes_error = procrustes_disparity(vec_data, final_embedding)
        knn_error_ord_emb, knn_error_true_emb = knn_classification_error(final_embedding, vec_data, labels)

        # sample points for t-SNE visualization
        subsample = np.random.permutation(n_points)[0:500]
        x = final_embedding[subsample, :]
        sub_labels = labels[subsample]
        x_embedded = TSNE(n_components=2, perplexity=15, learning_rate=10).fit_transform(x)

        fig, ax = plt.subplots(1, 1)
        ax.scatter(x_embedded[:, 0], x_embedded[:, 1], s=3, c=sub_labels)
        fig.savefig(os.path.join(log_dir, experiment_name + '.png'))

        logger.info('Name of Experiment: ' + experiment_name)
        logger.info('Epochs: ' + str(epochs))
        logger.info('Time Taken: ' + str(time_taken) + ' seconds.')
        logger.info('Test Error: ' + str(test_error))
        logger.info('Procrustes Disparity: ' + str(procrustes_error))
        logger.info('kNN Classification Error on ground-truth: ' + str(knn_error_true_emb))
        logger.info('kNN Classification Error on embedding: ' + str(knn_error_ord_emb))

        results = {'test_error': test_error,
                   'procrustes': procrustes_error,
                   'knn_true': knn_error_true_emb,
                   'knn_ord_emb': knn_error_ord_emb,
                   'labels': labels,
                   'ordinal_embedding': final_embedding,
                   'time_taken': time_taken}
        joblib.dump(results, os.path.join(log_dir, experiment_name + '.pkl'))
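

# For reference, a minimal NumPy sketch of the metric that triplet_error is
# assumed to compute above: the fraction of test triplets (i, j, k), generated
# so that j is closer to i than k in the original data, that the embedding
# violates. This illustrates the definition; it is not the repository's
# implementation, and the function name is hypothetical.
def triplet_error_sketch(embedding, triplets):
    """embedding: (n, d) array; triplets: (m, 3) integer array of (i, j, k)."""
    anchors, positives, negatives = triplets[:, 0], triplets[:, 1], triplets[:, 2]
    d_pos = np.linalg.norm(embedding[anchors] - embedding[positives], axis=1)
    d_neg = np.linalg.norm(embedding[anchors] - embedding[negatives], axis=1)
    violations = d_pos >= d_neg  # triplet (i, j, k) demands d(i, j) < d(i, k)
    return violations.mean(), violations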
def main(args):
    config = load_config(args.config_path)
    algorithm = config['algorithm']
    error_change_threshold = config['error_change_threshold']
    input_dim_range = config['tradeoff_set']['input_dimension']
    output_dimensions_range = config['tradeoff_set']['output_dimension']
    nmb_points_range = config['tradeoff_set']['number_of_points']
    batch_size_range = config['tradeoff_set']['batch_size']
    learning_rate_range = config['tradeoff_set']['learning_rate']
    triplet_multiplier_range = config['tradeoff_set']['triplets_multiplier']
    if args.data_set == 'not_selected':
        dataset_name = config['dataset_selected']
    else:
        dataset_name = args.data_set
    try:
        input_equals_output = config['tradeoff_set']['input_equals_output']
    except KeyError:
        input_equals_output = False
    epochs = config['nb_epochs']
    optimizer = config['optimizer']
    n_test_triplets = config['n_test_triplets']
    log_dir = config['log']['path']

    separator = '_'
    experiment_name = algorithm + \
                      '_data_' + dataset_name + \
                      '_input_dim_' + separator.join([str(i) for i in input_dim_range]) + \
                      '_output_dim_' + separator.join([str(i) for i in output_dimensions_range]) + \
                      '_n_pts_' + separator.join([str(i) for i in nmb_points_range]) + \
                      '_bs_' + separator.join([str(i) for i in batch_size_range]) + \
                      '_change_criterion_' + str(error_change_threshold) + \
                      '_lr_' + separator.join([str(i) for i in learning_rate_range]) + \
                      '_triplet_num_' + separator.join([str(i) for i in triplet_multiplier_range]) + \
                      '_ep_' + str(epochs)

    # create the log directory if it does not exist
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    logging_path = os.path.join(log_dir, experiment_name + '.log')
    logger = logging_util.my_custom_logger(logger_name=logging_path, level=logging.INFO)

    logger.info('Name of Experiment: ' + experiment_name)
    logger.info('Dataset Name: ' + dataset_name)
    logger.info('Error Change Threshold: ' + str(error_change_threshold))
    logger.info('Epochs: ' + str(epochs))

    tradeoff_results = defaultdict(dict)
    # order of the index tuple, plus the result slots stored per setting
    # (the slot list matches the assignments below)
    order = ['Input Dimension', 'Output Dimension', 'Number of Points',
             'Batch Size', 'Learning Rate', 'Triplet Multiplier',
             ['Train Error', 'Test Error', 'Procrustes Error', 'Knn Orig Error',
              'Knn Ordinal Error', 'Time', 'Embedding', 'Labels',
              'Loss History', 'Triplet Error History', 'Time History']]
    experiment_range = OrderedDict({'input_dim': input_dim_range,
                                    'output_dim': output_dimensions_range,
                                    'number_of_points': nmb_points_range,
                                    'batch_size': batch_size_range,
                                    'learning_rate': learning_rate_range,
                                    'triplet_multiplier': triplet_multiplier_range})

    for input_dim_index, input_dim in enumerate(input_dim_range):
        for dimensions_index, embedding_dimension in enumerate(output_dimensions_range):
            for subset_index, nmb_points in enumerate(nmb_points_range):
                for batch_size_index, batch_size in enumerate(batch_size_range):
                    for lr_index, learning_rate in enumerate(learning_rate_range):
                        for trip_mindex, triplet_multiplier in enumerate(triplet_multiplier_range):
                            if not input_equals_output or input_dim == embedding_dimension:
                                logger.info('Learning Rate: ' + str(learning_rate))
                                logger.info('Number of Points: ' + str(nmb_points))
                                logger.info('Input Dimension: ' + str(input_dim))
                                logger.info('Output Dimension: ' + str(embedding_dimension))
                                logger.info('Number of Test Triplets: ' + str(n_test_triplets))
                                logger.info('Triplet Multiplier: ' + str(triplet_multiplier))
                                logger.info('Batch Size: ' + str(batch_size))

                                embedding, train_triplets, labels, train_error, test_error, \
                                    procrustes_error, knn_orig, knn_embed, time_taken, \
                                    loss_history, triplet_error_history, time_history = run_method(
                                        config, dataset_name, algorithm, nmb_points, input_dim,
                                        embedding_dimension, learning_rate, batch_size,
                                        triplet_multiplier, optimizer, epochs,
                                        n_test_triplets, logger, error_change_threshold)

                                index = (input_dim_index, dimensions_index, subset_index,
                                         batch_size_index, lr_index, trip_mindex)
                                tradeoff_results[index + (0,)] = train_error
                                tradeoff_results[index + (1,)] = test_error
                                tradeoff_results[index + (2,)] = procrustes_error
                                tradeoff_results[index + (3,)] = knn_orig
                                tradeoff_results[index + (4,)] = knn_embed
                                tradeoff_results[index + (5,)] = time_taken
                                tradeoff_results[index + (6,)] = embedding
                                tradeoff_results[index + (7,)] = labels
                                tradeoff_results[index + (8,)] = loss_history
                                tradeoff_results[index + (9,)] = triplet_error_history
                                tradeoff_results[index + (10,)] = time_history
                                # tradeoff_results[index + (11,)] = train_triplets

    # print all results again
    for input_dim_index, input_dim in enumerate(input_dim_range):
        for dimensions_index, embedding_dimension in enumerate(output_dimensions_range):
            for subset_index, nmb_points in enumerate(nmb_points_range):
                for batch_size_index, batch_size in enumerate(batch_size_range):
                    for lr_index, learning_rate in enumerate(learning_rate_range):
                        for trip_mindex, triplet_multiplier in enumerate(triplet_multiplier_range):
                            if not input_equals_output or input_dim == embedding_dimension:
                                index = (input_dim_index, dimensions_index, subset_index,
                                         batch_size_index, lr_index, trip_mindex)
                                logger.info('Input Dimension ' + str(input_dim) +
                                            ' Output Dimension ' + str(embedding_dimension) +
                                            ' Number of Points ' + str(nmb_points) +
                                            ' Batch Size ' + str(batch_size) +
                                            ' Learning Rate ' + str(learning_rate) +
                                            ' Triplet Multiplier ' + str(triplet_multiplier))
                                logger.info(' Train Error ' + str(tradeoff_results[index + (0,)]) +
                                            ' Test Error ' + str(tradeoff_results[index + (1,)]))
                                logger.info(' Procrustes Error ' + str(tradeoff_results[index + (2,)]))
                                logger.info(' kNN Original Emb Loss ' + str(tradeoff_results[index + (3,)]) +
                                            ' kNN on Ordinal Emb Loss ' + str(tradeoff_results[index + (4,)]))
                                logger.info(' Time ' + str(tradeoff_results[index + (5,)]))
                                logger.info('-' * 20)

    data_dump = [order, experiment_range, tradeoff_results]
    joblib.dump(data_dump, os.path.join(log_dir, experiment_name + '.pkl'))
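

# A short usage sketch for the dump written above. The pickle holds the slot
# descriptions (order), the swept parameter ranges, and the results dict keyed
# by (input_dim_index, ..., trip_mindex, slot), where slot 1 is the test
# error. The helper name and its path argument are illustrative, not part of
# the repository.
def load_tradeoff_test_errors(pkl_path):
    order, experiment_range, tradeoff_results = joblib.load(pkl_path)
    return {key[:-1]: value for key, value in tradeoff_results.items() if key[-1] == 1}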
def run_hyper_search(config):
    dataset_name = config['dataset_selected']
    batch_size = config['batch_size']
    epochs = config['nb_epochs']
    input_dim = config['input_dimension']
    n_samples = config['number_of_points']
    number_of_test_triplets = config['n_test_triplets']
    log_dir = config['log']['path']
    phase1_learning_rate_range = config['hyper_search']['phase1_learning_rate']
    phase2_learning_rate_range = config['hyper_search']['phase2_learning_rate']
    dimensions_range = config['hyper_search']['output_dimension']

    separator = '_'
    experiment_name = 'lloe_full_hyper_search_' + \
                      'data_' + dataset_name + \
                      '_input_dim_' + str(input_dim) + \
                      '_n_pts_' + str(n_samples) + \
                      '_num_test_trips_' + str(number_of_test_triplets) + \
                      '_output_dim_' + separator.join([str(i) for i in dimensions_range]) + \
                      '_phase1lr_' + separator.join([str(i) for i in phase1_learning_rate_range]) + \
                      '_phase2lr_' + separator.join([str(i) for i in phase2_learning_rate_range]) + \
                      '_bs_' + str(batch_size)

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    logging_path = os.path.join(log_dir, experiment_name + '.log')
    logger = logging_util.my_custom_logger(logger_name=logging_path, level=logging.INFO)

    logger.info('Name of Experiment: ' + experiment_name)
    logger.info('Logging Path: ' + logging_path)
    logger.info('Dataset Name: ' + dataset_name)
    logger.info('Epochs: ' + str(epochs))

    best_params_test = {}
    all_results = {}
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # input_dim is only used for the uniform dataset and ignored otherwise
    vec_data, labels = select_dataset(dataset_name, n_samples=n_samples, input_dim=input_dim)
    n_samples = vec_data.shape[0]  # do not remove
    number_of_landmarks = int(0.1 * n_samples)
    subset_size = 10 * number_of_landmarks

    for emb_dim in dimensions_range:
        all_results[emb_dim] = {}
        best_test_error = 1
        logger.info('Testing on: ' + dataset_name + '. Embedding dimension is ' + str(emb_dim))
        logger.info(' ')

        for (phase1_learning_rate, phase2_learning_rate) \
                in product(phase1_learning_rate_range, phase2_learning_rate_range):
            logger.info(10 * '-' + ' New parameters ' + 10 * '-')
            logger.info('Phase 1 Learning Rate: ' + str(phase1_learning_rate))
            logger.info('Phase 2 Learning Rate: ' + str(phase2_learning_rate))
            logger.info('Number of Points: ' + str(n_samples))
            logger.info('Input Dimension: ' + str(input_dim))
            logger.info('Output Dimension: ' + str(emb_dim))
            logger.info('Number of Test Triplets: ' + str(number_of_test_triplets))
            logger.info('Batch Size: ' + str(batch_size))
            logger.info('Computing LLOE_FULL...')
            # first phase of the algorithm: embed landmarks and a subset via SOE
            landmarks, first_phase_indices, \
                first_phase_subset_size, first_phase_reconstruction, \
                first_phase_loss, first_phase_triplet_error, time_first_phase = first_phase_soe(
                    num_landmarks=number_of_landmarks, subset_size=subset_size,
                    data=vec_data, dataset_size=n_samples,
                    embedding_dim=emb_dim, epochs=epochs, target_loss=0.1,
                    first_phase_lr=phase1_learning_rate, device=device,
                    batch_size=batch_size, logger=logger)

            logger.info('First Phase Loss: ' + str(first_phase_loss))
            logger.info('First Phase Triplet Error: ' + str(first_phase_triplet_error))
            logger.info('First Phase Number of Landmarks: ' + str(landmarks.shape))
            logger.info('First Phase Indices Number: ' + str(len(first_phase_indices)))
            logger.info('First Phase Reconstruction Size: ' + str(first_phase_reconstruction.shape))

            embedded_indices = first_phase_indices
            embedded_points = first_phase_reconstruction
            non_embedded_indices = list(set(range(vec_data.shape[0])).difference(set(embedded_indices)))

            my_oracle = Oracle(data=vec_data)
            logger.info('Second Phase: ')
            logger.info('Oracle Created...')
            logger.info('Computing LLOE - Phase 2...')
            # second phase: embed the remaining points one by one
            second_phase_embeddings_index, \
                second_phase_embeddings, time_second_phase = second_phase(
                    my_oracle=my_oracle,
                    non_embedded_indices=non_embedded_indices,
                    embedded_indices=embedded_indices,
                    first_phase_embedded_points=embedded_points,
                    dim=emb_dim,
                    lr=phase2_learning_rate, logger=logger)

            # combine the first and second phase points and indices
            final_embedding = np.zeros((vec_data.shape[0], emb_dim))
            final_embedding[first_phase_indices] = first_phase_reconstruction      # phase 1 points
            final_embedding[second_phase_embeddings_index] = second_phase_embeddings  # phase 2 points

            logger.info('Size of First Phase Indices: ' + str(len(first_phase_indices)))
            logger.info('Size of Second Phase Indices: ' + str(len(second_phase_embeddings_index)))
            logger.info('First Phase Triplet Error: ' + str(first_phase_triplet_error))

            # Evaluation
            logger.info('Evaluation of the Complete Embedding Dataset: ')
            random_trip_indices = gen_triplet_indices(n=vec_data.shape[0], num_trips=number_of_test_triplets)
            test_triplet_data = gen_triplet_data(data=vec_data, random_triplet_indices=random_trip_indices,
                                                 batch_size=1000)
            test_error, embedding_error_list = triplet_error(final_embedding, test_triplet_data)
            time_taken = time_first_phase + time_second_phase

            logger.info('Time Taken: ' + str(time_taken) + ' seconds.')
            logger.info('Test Error: ' + str(test_error))
            # logger.info('Procrustes Disparity: ' + str(procrustes_error))
            # logger.info('kNN Classification Error on ground-truth: ' + str(knn_error_true_emb))
            # logger.info('kNN Classification Error on embedding: ' + str(knn_error_ord_emb))

            results = {'test_error': test_error, 'last_embedding': final_embedding}
            all_results[emb_dim].update({(phase1_learning_rate, phase2_learning_rate): results})

            if test_error < best_test_error:
                best_params_test[emb_dim] = {'phase1_learning_rate': phase1_learning_rate,
                                             'phase2_learning_rate': phase2_learning_rate,
                                             'error': test_error}
                best_test_error = test_error

        result_name = 'lloe_full_hypersearch_' + \
                      'data_' + dataset_name + \
                      '_input_dim_' + str(input_dim) + \
                      '_n_pts_' + str(n_samples) + \
                      '_output_dim_' + str(emb_dim) + \
                      '_bs_' + str(batch_size)
        all_results['labels'] = labels
        joblib.dump(all_results[emb_dim], os.path.join(log_dir, result_name + '.pkl'))

    # print all results again
    logger.info(10 * '-' + ' ALL RESULTS ' + 10 * '-')
    for emb_dim in dimensions_range:
        results = all_results[emb_dim]
        logger.info('Results for emb dimension ' + str(emb_dim))
        for (phase1_learning_rate, phase2_learning_rate) \
                in product(phase1_learning_rate_range, phase2_learning_rate_range):
            logger.info('phase1_learning_rate ' + str(phase1_learning_rate) +
                        ' phase2_learning_rate ' + str(phase2_learning_rate) +
                        ' -- test error: ' +
                        str(results[(phase1_learning_rate, phase2_learning_rate)]['test_error']))

    # print best parameter settings
    for emb_dim in dimensions_range:
        logger.info('Best Parameters for emb dimension ' + str(emb_dim))
        best_on_test = best_params_test[emb_dim]
        logger.info('achieved ' + str(best_on_test['error']) +
                    ' test error with phase1_learning_rate: ' + str(best_on_test['phase1_learning_rate']) +
                    ' phase2_learning_rate: ' + str(best_on_test['phase2_learning_rate']))
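

# For reference, minimal sketches of the two evaluation helpers referenced in
# these scripts, assuming they follow the standard definitions: the Procrustes
# disparity between the original data and the embedding, and the error of a
# k-NN classifier fit on each representation. The function names, the
# zero-padding of mismatched dimensions, and the choice of k are assumptions;
# these are illustrations, not the repository's implementations.
def procrustes_disparity_sketch(data, embedding):
    from scipy.spatial import procrustes
    # pad the lower-dimensional matrix with zero columns so shapes match,
    # then compare after optimal translation, scaling, and rotation
    d = max(data.shape[1], embedding.shape[1])
    a = np.pad(data, ((0, 0), (0, d - data.shape[1])))
    b = np.pad(embedding, ((0, 0), (0, d - embedding.shape[1])))
    _, _, disparity = procrustes(a, b)
    return disparity


def knn_classification_error_sketch(embedding, data, labels, n_neighbors=5):
    from sklearn.neighbors import KNeighborsClassifier
    errors = []
    for representation in (embedding, data):
        knn = KNeighborsClassifier(n_neighbors=n_neighbors).fit(representation, labels)
        errors.append(1.0 - knn.score(representation, labels))
    # returned in the order the callers unpack: (error on ordinal embedding,
    # error on the original data)
    return tuple(errors)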