def first_phase(num_landmarks, subset_size, data, dataset_size, embedding_dim, epochs, target_loss,
                first_phase_lr, bs, device, logger):
    # Performs the first phase of the algorithm using the method described in the paper.
    # get the schedule of subset sizes
    embedding_size_intervals = get_subset_sizes(dataset_size, subset_size)
    print('Embedding size intervals: ', embedding_size_intervals)
    # get landmarks
    landmarks, landmark_indices = get_landmark_subset(data, number_of_landmarks=num_landmarks)

    # first phase data buffers
    interval_final_epoch_losses = []
    interval_data_subsets = []
    interval_subset_reconstructions = []
    interval_subset_indices = []
    interval_triplet_error = []
    i = 0
    total_time = 0
    subset_data = []

    # first phase flow
    logger.info('Running SOE on different subset sizes until the defined criterion is satisfied.')
    for index, each_subset_size in enumerate(embedding_size_intervals):
        # Get the subset data. Subsets must contain the landmarks; the function takes care of this.
        # Subset indices refer to the original set of indices.
        subset_data, subset_indices = get_subsets(data, landmark_indices=landmark_indices,
                                                  subset_size=each_subset_size)
        logger.info('First Phase Iteration Index: ' + str(index))
        logger.info('Landmarks:' + str(landmarks.shape))
        logger.info('Obtained subset size:' + str(subset_data.shape))
        logger.info('Landmark indices: ' + str(len(landmark_indices)))
        logger.info('Obtained subset indices: ' + str(len(subset_indices)))

        begin_time = time.time()
        # compute distances between landmarks and the subset. Recall that the landmarks are also part of the subset.
        landmark_to_data_distance = compute_landmark_to_data_distance(landmarks=landmarks, subset=subset_data)
        # sort the distance matrix; sorted indices are relative to the subset.
        sorted_indices_pt_to_l, sorted_distances_pt_to_l = sort_distances_pt_to_landmark(
            distance_mat=landmark_to_data_distance)
        sorted_indices_l_to_pt, sorted_distances_l_to_pt = sort_landmark_to_pt_distances(
            distance_mat=landmark_to_data_distance)
        # now generate LNM-MDS triplets from the distance matrix
        generated_trips = generate_triplets_from_indices(sorted_indices_l_to_pt, sorted_indices_pt_to_l,
                                                         landmark_indices)
        trips = np.asarray(generated_trips)  # w.r.t. relative indices
        end_time = time.time()
        total_time += (end_time - begin_time)

        # run SOE on the subset data
        embedding_of_subset, loss_history, _, time_soe = soe_adam(
            triplets=trips, n=each_subset_size, dim=embedding_dim, epochs=epochs,
            batch_size=min(bs * (2 ** i), trips.shape[0]),
            learning_rate=first_phase_lr * (2 ** i), device=device, logger=logger)
        total_time += time_soe

        final_epoch_loss = loss_history[-1]
        trip_error, error_list = triplet_error(embedding_of_subset, trips)
        logger.info('Number of Triplets: ' + str(trips.shape[0]))
        logger.info('Training triplet Error: ' + str(trip_error))
        logger.info('Subset Size: ' + str(each_subset_size) + ' Loss: ' + str(final_epoch_loss))

        begin_time = time.time()
        if final_epoch_loss < target_loss:
            # final epoch loss met the target: store this subset and continue doubling the subset size
            i += 1
            interval_final_epoch_losses.append(final_epoch_loss)
            interval_data_subsets.append(subset_data)
            interval_subset_reconstructions.append(embedding_of_subset)
            interval_subset_indices.append(subset_indices)
            interval_triplet_error.append(trip_error)
            continue
        else:
            if len(interval_final_epoch_losses) < 1:
                # loss exceeded the target, but nothing has been stored yet: keep this result anyway
                interval_final_epoch_losses.append(final_epoch_loss)
                interval_data_subsets.append(subset_data)
                interval_subset_reconstructions.append(embedding_of_subset)
                interval_subset_indices.append(subset_indices)
                interval_triplet_error.append(trip_error)
                end_time = time.time()
                total_time += (end_time - begin_time)
                continue
            # loss exceeded the target and earlier subsets were accepted: do not double the subset size further
            end_time = time.time()
            total_time += (end_time - begin_time)
            break
    logger.info('First Phase is Finished.')

    # Evaluation
    logger.info('Evaluation of the first phase Embedding: ')
    random_trip_indices = gen_triplet_indices(n=interval_data_subsets[-1].shape[0], num_trips=10000)
    test_triplet_data = gen_triplet_data(data=interval_data_subsets[-1],
                                         random_triplet_indices=random_trip_indices, batch_size=10000)
    test_triplet_error, _ = triplet_error(interval_subset_reconstructions[-1], test_triplet_data)
    logger.info('Test triplet error in the first phase is ' + str(test_triplet_error))

    return landmarks, interval_subset_indices[-1], interval_data_subsets[-1], interval_subset_reconstructions[-1], \
        interval_final_epoch_losses[-1], interval_triplet_error[-1], test_triplet_error, total_time
def first_phase_soe(num_landmarks, subset_size, data, dataset_size, embedding_dim, epochs, target_loss,
                    first_phase_lr, batch_size, device, logger):
    # Performs the first phase of the algorithm using SOE. This finds the embedding for the "m" subset points.
    # get the schedule of subset sizes
    embedding_size_intervals = get_subset_sizes(dataset_size, subset_size)
    print('Embedding size intervals: ', embedding_size_intervals)
    # get landmarks
    landmarks, landmark_indices = get_landmark_subset(data, number_of_landmarks=num_landmarks)

    # first phase data buffers
    interval_final_epoch_losses = []
    interval_data_subsets = []
    interval_subset_reconstructions = []
    interval_subset_indices = []
    interval_triplet_error = []
    i = 0
    total_time = 0

    # first phase flow
    logger.info('Running SOE on different subset sizes until the defined criterion is satisfied.')
    for index, each_subset_size in enumerate(embedding_size_intervals):
        # Get the subset data. Subsets must contain the landmarks; the function takes care of this.
        # Subset indices refer to the original set of indices.
        subset_data, subset_indices = get_subsets(data, landmark_indices=landmark_indices,
                                                  subset_size=each_subset_size)
        logger.info('First Phase Iteration Index: ' + str(index))
        logger.info('Landmarks:' + str(landmarks.shape))
        logger.info('Obtained subset size:' + str(subset_data.shape))
        logger.info('Landmark indices: ' + str(len(landmark_indices)))
        logger.info('Obtained subset indices: ' + str(len(subset_indices)))

        # triplet budget: 2 * n * log2(n) * d random triplets on the subset
        num_trips = int(2 * each_subset_size * math.log2(each_subset_size) * embedding_dim)
        trip_indices = np.asarray(gen_triplet_indices(each_subset_size, num_trips))  # random relative triplets
        trips = gen_triplet_data(subset_data, trip_indices, batch_size=trip_indices.shape[0])

        embedding_of_subset, loss_history, triplet_error_history, time_taken, _ = soe_adam(
            triplets=trips, n=each_subset_size, dim=embedding_dim, epochs=epochs,
            batch_size=min(trips.shape[0], batch_size * (2 ** i)),
            learning_rate=first_phase_lr * (2 ** i), device=device, logger=logger,
            error_change_threshold=-1)
        total_time += time_taken

        final_epoch_loss = loss_history[-1]
        trip_error, error_list = triplet_error(embedding_of_subset, trips)
        logger.info('Number of Triplets: ' + str(trips.shape[0]))
        logger.info('Training triplet Error: ' + str(trip_error))
        logger.info('Subset Size: ' + str(each_subset_size) + ' Loss: ' + str(final_epoch_loss))

        begin_time = time.time()
        if final_epoch_loss < target_loss:
            # final epoch loss met the target: store this subset and continue doubling the subset size
            i += 1
            interval_final_epoch_losses.append(final_epoch_loss)
            interval_data_subsets.append(subset_data)
            interval_subset_reconstructions.append(embedding_of_subset)
            interval_subset_indices.append(subset_indices)
            interval_triplet_error.append(trip_error)
            end_time = time.time()
            total_time += (end_time - begin_time)
            continue
        else:
            if len(interval_final_epoch_losses) < 1:
                # loss exceeded the target, but nothing has been stored yet: keep this result anyway
                interval_final_epoch_losses.append(final_epoch_loss)
                interval_data_subsets.append(subset_data)
                interval_subset_reconstructions.append(embedding_of_subset)
                interval_subset_indices.append(subset_indices)
                interval_triplet_error.append(trip_error)
                continue
            # loss exceeded the target and earlier subsets were accepted: do not double the subset size further
            end_time = time.time()
            total_time += (end_time - begin_time)
            break
    logger.info('First Phase is Finished.')

    return landmarks, interval_subset_indices[-1], interval_data_subsets[-1], interval_subset_reconstructions[-1], \
        interval_final_epoch_losses[-1], interval_triplet_error[-1], total_time
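# The two first-phase variants above share the same growth schedule: each accepted subset is followed by a
# larger one, and with it the SOE batch size (batch_size * 2**i) and learning rate (first_phase_lr * 2**i)
# double, while first_phase_soe draws 2 * n * log2(n) * d random triplets per subset. The standalone sketch
# below only illustrates that budget; the subset sizes, base batch size, and base learning rate are made-up
# example values, not values used elsewhere in this module.
def _sketch_first_phase_budget(subset_sizes=(128, 256, 512), embedding_dim=2,
                               base_batch_size=5000, base_lr=0.1):
    import math
    schedule = []
    for i, n in enumerate(subset_sizes):
        num_trips = int(2 * n * math.log2(n) * embedding_dim)    # same budget formula as first_phase_soe
        batch_size = min(num_trips, base_batch_size * (2 ** i))  # batch size doubles per accepted subset
        learning_rate = base_lr * (2 ** i)                       # learning rate doubles as well
        schedule.append((n, num_trips, batch_size, learning_rate))
    return schedule
# e.g. _sketch_first_phase_budget()[0] == (128, 3584, 3584, 0.1)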
def run_hyper_search(config):
    dataset_name = config['dataset_selected']
    batch_size = config['batch_size']
    epochs = config['nb_epochs']
    input_dim = config['input_dimension']
    n_points = config['number_of_points']
    number_of_test_triplets = config['n_test_triplets']
    log_dir = config['log']['path']
    triplet_multiplier_range = config['hyper_search']['triplets_multiplier']
    learning_rate_range = config['hyper_search']['learning_rate']
    optimizer = config['optimizer']
    dimensions_range = config['hyper_search']['output_dimension']

    separator = '_'
    experiment_name = 'soe_hyper_search_' + \
                      'data_' + dataset_name + \
                      '_input_dim_' + str(input_dim) + \
                      '_n_pts_' + str(n_points) + \
                      '_num_test_trips_' + str(number_of_test_triplets) + \
                      '_output_dim_' + separator.join([str(i) for i in dimensions_range]) + \
                      '_lr_' + separator.join([str(i) for i in learning_rate_range]) + \
                      '_bs_' + str(batch_size) + \
                      '_triplet_number_' + separator.join([str(i) for i in triplet_multiplier_range])

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    logging_path = os.path.join(log_dir, experiment_name + '.log')
    logger = logging_util.my_custom_logger(logger_name=logging_path, level=logging.INFO)

    logger.info('Name of Experiment: ' + experiment_name)
    logger.info('Logging Path:' + logging_path)
    logger.info('Dataset Name: ' + dataset_name)
    logger.info('Epochs: ' + str(epochs))

    best_params_train = {}
    best_params_test = {}
    all_results = {}
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # input_dim is only an argument for the uniform dataset. Ignored otherwise.
    vec_data, labels = select_dataset(dataset_name, n_samples=n_points, input_dim=input_dim)
    n_points = vec_data.shape[0]
    logn = int(np.log2(n_points))

    for (emb_dim, triplet_multiplier) in product(dimensions_range, triplet_multiplier_range):
        all_results[(emb_dim, triplet_multiplier)] = {}
        best_train_error = 1
        best_test_error = 1

        triplet_num = triplet_multiplier * logn * n_points * emb_dim
        bs = min(batch_size, triplet_num)
        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, bs, device)

        logger.info('Testing on: ' + dataset_name + '. Embedding dimension is ' + str(emb_dim))
        logger.info(' ')

        for learning_rate in learning_rate_range:
            logger.info(10 * '-' + ' New parameters' + 10 * '-')
            logger.info('Learning Rate: ' + str(learning_rate))
            logger.info('Number of Points: ' + str(n_points))
            logger.info('Input Dimension: ' + str(input_dim))
            logger.info('Output Dimension: ' + str(emb_dim))
            logger.info('Number of Test Triplets: ' + str(number_of_test_triplets))
            logger.info('Triplet Multiplier: ' + str(triplet_multiplier))
            logger.info('Batch Size: ' + str(batch_size))
            logger.info('Computing SOE...')

            x, loss_history, triplet_error_history, time_taken, time_history = soe.soe_adam(
                triplets=train_triplets_dataset.trips_data_indices, n=n_points, dim=emb_dim,
                epochs=epochs, batch_size=bs, learning_rate=learning_rate, device=device, logger=logger)

            logger.info('Evaluating the computed embeddings...')
            # compute triplet error for train and test data
            train_error = train_triplets_dataset.triplet_error(x)
            test_triplets_dataset = TripletBatchesDataset(vec_data, labels, number_of_test_triplets, 1000, device)
            test_error = test_triplets_dataset.triplet_error(x)
            # procrustes_error = procrustes_disparity(vec_data, x)
            # knn_error_ord_emb, knn_error_true_emb = knn_classification_error(x, vec_data, labels)

            logger.info('Epochs: ' + str(epochs))
            logger.info('Time Taken: ' + str(time_taken) + ' seconds.')
            logger.info('Train Error: ' + str(train_error))
            logger.info('Test Error: ' + str(test_error))
            # logger.info('Procrustes Disparity: ' + str(procrustes_error))
            # logger.info('kNN Classification Error on ground-truth: ' + str(knn_error_true_emb))
            # logger.info('kNN Classification Error on embedding: ' + str(knn_error_ord_emb))

            results = {'train_error': train_error,
                       'test_error': test_error,
                       'loss_history': loss_history,
                       'error_history': triplet_error_history,
                       'last_embedding': x}
            all_results[(emb_dim, triplet_multiplier)].update({learning_rate: results})

            if test_error < best_test_error:
                best_params_test[(emb_dim, triplet_multiplier)] = {'learning_rate': learning_rate,
                                                                   'optimizer': optimizer,
                                                                   'error': test_error}
                best_test_error = test_error
            if train_error < best_train_error:
                best_params_train[(emb_dim, triplet_multiplier)] = {'learning_rate': learning_rate,
                                                                    'optimizer': optimizer,
                                                                    'error': train_error}
                best_train_error = train_error

        result_name = 'soe_convergence_' + \
                      'data_' + dataset_name + \
                      '_input_dim_' + str(input_dim) + \
                      '_n_pts_' + str(n_points) + \
                      '_output_dim_' + str(emb_dim) + \
                      '_bs_' + str(batch_size) + \
                      '_triplet_number_' + str(triplet_multiplier)
        all_results['labels'] = labels
        joblib.dump(all_results[(emb_dim, triplet_multiplier)], os.path.join(log_dir, result_name + '.pkl'))

    # print all results again
    logger.info(10 * '-' + 'ALL RESULTS ' + 10 * '-')
    for (emb_dim, triplet_multiplier) in product(dimensions_range, triplet_multiplier_range):
        results = all_results[(emb_dim, triplet_multiplier)]
        logger.info('Results for emb dimension ' + str(emb_dim) +
                    ' and triplet multiplier ' + str(triplet_multiplier))
        for learning_rate in learning_rate_range:
            logger.info('learning rate ' + str(learning_rate) +
                        ' -- train error: ' + str(results[learning_rate]['train_error']) +
                        ' test error: ' + str(results[learning_rate]['test_error']))

    # print the best parameter settings
    for (emb_dim, triplet_multiplier) in product(dimensions_range, triplet_multiplier_range):
        logger.info('Best Parameters for emb dimension ' + str(emb_dim) +
                    ' and triplet multiplier ' + str(triplet_multiplier))
        best_on_train = best_params_train[(emb_dim, triplet_multiplier)]
        best_on_test = best_params_test[(emb_dim, triplet_multiplier)]
        logger.info('achieved ' + str(best_on_train['error']) + ' train error with learning rate: ' +
                    str(best_on_train['learning_rate']))
        logger.info('achieved ' + str(best_on_test['error']) + ' test error with learning rate: ' +
                    str(best_on_test['learning_rate']))
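# The per-setting results written by run_hyper_search can be re-loaded for offline analysis. This small
# helper is a sketch only: it assumes a path to one of the 'soe_convergence_*.pkl' files dumped above,
# each of which holds a dict keyed by learning rate with 'train_error', 'test_error', 'loss_history',
# 'error_history', and 'last_embedding' entries.
def _sketch_load_hyper_search_results(pkl_path):
    import joblib
    results_per_lr = joblib.load(pkl_path)
    # summarize train/test error per learning rate
    return {lr: (res['train_error'], res['test_error']) for lr, res in results_per_lr.items()}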
def main(args):
    config = load_config(args.config_path)
    dataset_name = config['dataset_selected']
    error_change_threshold = config['error_change_threshold']
    batch_size = config['batch_size']
    learning_rate = config['optimizer_params']['learning_rate']
    epochs = config['nb_epochs']
    input_dim = config['input_dimension']
    embedding_dimension = config['output_dimension']
    n_points = config['number_of_points']
    number_of_test_triplets = config['n_test_triplets']
    triplet_multiplier = config['triplets_multiplier']
    log_dir = config['log']['path']
    hyper_search = config['hyper_search']['activation']
    optimizer = config['optimizer']

    if hyper_search:
        run_hyper_search(config=config)
    else:
        vec_data, labels = select_dataset(dataset_name, n_samples=n_points, input_dim=input_dim)
        n_points = vec_data.shape[0]
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        logn = int(np.log2(n_points))
        triplet_num = triplet_multiplier * logn * n_points * embedding_dimension
        batch_size = min(batch_size, triplet_num)

        if not os.path.exists(log_dir):
            os.makedirs(log_dir)

        experiment_name = 'soe_' + \
                          'data_' + dataset_name + \
                          '_error_change_threshold_' + str(error_change_threshold) + \
                          '_input_dim_' + str(input_dim) + \
                          '_output_dim_' + str(embedding_dimension) + \
                          '_originaldimension_' + str(vec_data.shape[1]) + \
                          '_triplet_num_' + str(triplet_multiplier) + \
                          '_n_pts_' + str(n_points) + \
                          '_lr_' + str(learning_rate) + \
                          '_optimizer_' + str(optimizer) + \
                          '_bs_' + str(batch_size)

        # create a logging file for extensive logging
        logging_path = os.path.join(log_dir, experiment_name + '.log')
        logger = logging_util.my_custom_logger(logger_name=logging_path, level=logging.INFO)

        logger.info('Name of Experiment: ' + experiment_name)
        logger.info('Logging Path:' + logging_path)
        logger.info('Dataset Name: ' + dataset_name)
        logger.info('Error Change Threshold: ' + str(error_change_threshold))
        logger.info('Epochs: ' + str(epochs))
        logger.info('Learning Rate: ' + str(learning_rate))
        logger.info('Number of Points: ' + str(n_points))
        logger.info('Input Dimension: ' + str(input_dim))
        logger.info('Output Dimension: ' + str(embedding_dimension))
        logger.info('Number of Test Triplets: ' + str(number_of_test_triplets))
        logger.info('Triplet Multiplier: ' + str(triplet_multiplier))
        logger.info('Batch Size: ' + str(batch_size))

        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, batch_size, device)
        logger.info('Computing SOE...')
        x, loss_history, triplet_error_history, time_taken, time_history = soe.soe_adam(
            triplets=train_triplets_dataset.trips_data_indices, n=n_points, dim=embedding_dimension,
            epochs=epochs, batch_size=batch_size, learning_rate=learning_rate, device=device,
            logger=logger, error_change_threshold=error_change_threshold)

        logger.info('Evaluating the computed embeddings...')
        # compute triplet error for train and test data
        train_error = train_triplets_dataset.triplet_error(x)
        test_triplets_dataset = TripletBatchesDataset(vec_data, labels, number_of_test_triplets, 1000, device)
        test_error = test_triplets_dataset.triplet_error(x)
        procrustes_error = procrustes_disparity(vec_data, x)
        knn_error_ord_emb, knn_error_true_emb = knn_classification_error(x, vec_data, labels)

        # sample points for a t-SNE visualization
        subsample = np.random.permutation(n_points)[0:500]
        x = x[subsample, :]
        sub_labels = labels[subsample]
        x_embedded = TSNE(n_components=2, perplexity=15, learning_rate=10).fit_transform(x)

        fig, ax = plt.subplots(1, 1)
        ax.scatter(x_embedded[:, 0], x_embedded[:, 1], s=3, c=sub_labels)
        fig.savefig(os.path.join(log_dir, experiment_name + '.png'))

        logger.info('Name of Experiment: ' + experiment_name)
        logger.info('Epochs: ' + str(epochs))
        logger.info('Time Taken: ' + str(time_taken) + ' seconds.')
        logger.info('Train Error: ' + str(train_error))
        logger.info('Test Error: ' + str(test_error))
        logger.info('Procrustes Disparity: ' + str(procrustes_error))
        logger.info('kNN Classification Error on ground-truth: ' + str(knn_error_true_emb))
        logger.info('kNN Classification Error on embedding: ' + str(knn_error_ord_emb))

        # note: x is the subsampled embedding at this point (it was reduced above for the t-SNE plot)
        results = {'train_error': train_error,
                   'test_error': test_error,
                   'procrustes': procrustes_error,
                   'knn_true': knn_error_true_emb,
                   'knn_ord_emb': knn_error_ord_emb,
                   'labels': labels,
                   'loss_history': loss_history,
                   'error_history': triplet_error_history,
                   'ordinal_embedding': x,
                   'time_taken': time_taken}
        joblib.dump(results, os.path.join(log_dir, experiment_name + '.pkl'))
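# For reference, a minimal illustrative config covering the keys that main() and run_hyper_search() read.
# The key names mirror the lookups in the two functions above; every value here is a placeholder chosen for
# illustration only, and the real config is expected to come from load_config(args.config_path). Algorithm-
# specific keys consumed by run_method (e.g. 'regularizer', 'mu', or the 'lloe' entries under
# 'optimizer_params') are not shown.
_EXAMPLE_CONFIG = {
    'dataset_selected': 'uniform',        # placeholder dataset name accepted by select_dataset
    'error_change_threshold': -1,
    'batch_size': 10000,
    'nb_epochs': 100,
    'input_dimension': 10,
    'output_dimension': 2,
    'number_of_points': 1000,
    'n_test_triplets': 10000,
    'triplets_multiplier': 1,
    'optimizer': 'adam',
    'optimizer_params': {'learning_rate': 0.01},
    'log': {'path': 'logs/'},
    'hyper_search': {
        'activation': False,              # True dispatches main() to run_hyper_search()
        'triplets_multiplier': [1, 2],
        'learning_rate': [0.01, 0.1],
        'output_dimension': [2, 10],
    },
}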
def run_method(config, dataset_name, algorithm, n_points, input_dim, embedding_dimension, learning_rate,
               batch_size, triplet_multiplier, optimizer, epochs, n_test_triplets, logger,
               error_change_threshold):
    vec_data, labels = select_dataset(dataset_name, n_samples=n_points, input_dim=input_dim)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    n_points = vec_data.shape[0]
    triplet_num = int(np.ceil(triplet_multiplier * n_points * math.log2(n_points) * embedding_dimension))

    train_triplets = []
    loss_history, triplet_error_history, time_history = [], [], []
    batch_size = min(batch_size, triplet_num)

    logger.info('Computing Embedding...')
    logger.info('Number of Points: ' + str(n_points))
    logger.info('Number of Triplets: ' + str(triplet_num))
    logger.info('Input Dimension: ' + str(input_dim))
    logger.info('Output Dimension: ' + str(embedding_dimension))

    time_taken = 0
    train_error = -1  # active methods won't have a train error

    if optimizer == 'adam' and algorithm == 'soe':
        logger.info('Generating triplets...')
        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, batch_size, device)
        train_triplets = train_triplets_dataset.trips_data_indices
        x, loss_history, triplet_error_history, time_taken, time_history = soe.soe_adam(
            triplets=train_triplets, n=n_points, dim=embedding_dimension, epochs=epochs,
            batch_size=batch_size, learning_rate=learning_rate, device=device, logger=logger,
            error_change_threshold=error_change_threshold)
        # compute triplet error on the training triplets
        train_error = triplet_error_batches(x, train_triplets)
    elif optimizer == 'sgd' and algorithm == 'soe':
        logger.info('Generating triplets...')
        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, batch_size, device)
        train_triplets = train_triplets_dataset.trips_data_indices
        x, loss_history, triplet_error_history, time_taken, time_history = soe.soe_sgd(
            triplets=train_triplets, n=n_points, dim=embedding_dimension, iterations=epochs,
            bs=batch_size, lr=learning_rate, device=device, logger=logger)
        train_error = triplet_error_batches(x, train_triplets)
    elif algorithm == 'ste':
        logger.info('Generating triplets...')
        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, batch_size, device)
        train_triplets = train_triplets_dataset.trips_data_indices
        x, loss_history, triplet_error_history, time_taken, time_history = ste.ste_adam(
            triplets=train_triplets, n=n_points, dim=embedding_dimension, epochs=epochs,
            batch_size=batch_size, learning_rate=learning_rate, device=device, logger=logger,
            error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, train_triplets)
    elif algorithm == 'tste':
        logger.info('Generating triplets...')
        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, batch_size, device)
        train_triplets = train_triplets_dataset.trips_data_indices
        x, loss_history, triplet_error_history, time_taken, time_history = tste.t_ste_adam(
            triplets=train_triplets, n=n_points, emb_dim=embedding_dimension, epochs=epochs,
            batch_size=batch_size, learning_rate=learning_rate, device=device, logger=logger,
            error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, train_triplets)
    elif algorithm == 'triplet_loss':
        logger.info('Generating triplets...')
        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, batch_size, device)
        train_triplets = train_triplets_dataset.trips_data_indices
        x, loss_history, triplet_error_history, time_taken, time_history = soe.triplet_loss_adam(
            triplets=train_triplets, n=n_points, dim=embedding_dimension, iterations=epochs,
            batch_size=batch_size, lr=learning_rate, device=device, logger=logger,
            error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, train_triplets)
    elif algorithm == 'gnmds':
        regularizer = config['regularizer']
        logger.info('Generating triplets...')
        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, batch_size, device)
        train_triplets = train_triplets_dataset.trips_data_indices
        x, loss_history, triplet_error_history, time_taken, time_history = gnmds.gnmds(
            triplets=train_triplets, reg_lbda=regularizer, n=n_points, dim=embedding_dimension,
            epochs=epochs, batch_size=batch_size, learning_rate=learning_rate, device=device,
            logger=logger, error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, train_triplets)
    elif algorithm == 'forte':
        logger.info('Generating triplets...')
        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, batch_size, device)
        train_triplets = train_triplets_dataset.trips_data_indices
        x, loss_history, triplet_error_history, time_taken, time_history = forte.rank_d_pgd(
            triplets=train_triplets, n=n_points, dim=embedding_dimension, epochs=epochs,
            batch_size=batch_size, learning_rate=learning_rate, device=device, logger=logger,
            error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, train_triplets)
    elif algorithm == 'ckl':
        regularizer = config['regularizer']
        mu = config['mu']
        logger.info('Generating triplets...')
        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, batch_size, device)
        train_triplets = train_triplets_dataset.trips_data_indices
        x, loss_history, triplet_error_history, time_taken, time_history = ckl.ckl_k(
            triplets=train_triplets, reg_lbda=regularizer, mu=mu, n=n_points, dim=embedding_dimension,
            epochs=epochs, batch_size=batch_size, learning_rate=learning_rate, device=device,
            logger=logger, error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, train_triplets)
    elif algorithm == 'ckl_x':
        mu = config['mu']
        logger.info('Generating triplets...')
        train_triplets_dataset = TripletBatchesDataset(vec_data, labels, triplet_num, batch_size, device)
        train_triplets = train_triplets_dataset.trips_data_indices
        x, loss_history, triplet_error_history, time_taken, time_history = ckl.ckl_x(
            triplets=train_triplets, mu=mu, n=n_points, dim=embedding_dimension, epochs=epochs,
            batch_size=batch_size, learning_rate=learning_rate, device=device, logger=logger,
            error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, train_triplets)
    elif algorithm == 'loe':
        x, time_taken, train_error = landmark_oe.landmark_oe_with_data(
            data=vec_data, dim=embedding_dimension, trip_num=triplet_num, learning_rate=learning_rate,
            epochs=epochs, batch_size=batch_size, device=device, logger=logger)
    elif algorithm == 'oenn':
        number_of_neighbours = 50  # config['number_of_neighbours']
        metric = 'eu'  # config['metric']
        all_triplets, triplet_loaders = data_utils_oenn.prep_data_for_nn(
            vec_data=vec_data, labels=labels, triplet_num=triplet_num, batch_size=batch_size,
            metric=metric, number_of_neighbours=number_of_neighbours)
        hl_size = int(120 + (2 * embedding_dimension * math.log2(n_points)))
        x, loss_history, triplet_error_history, time_taken, time_history = \
            training_routine_v3.create_and_train_triplet_network(
                dataset_name=dataset_name, ind_loaders=triplet_loaders, n=n_points,
                dim=embedding_dimension, layers=3, learning_rate=learning_rate, epochs=epochs,
                hl_size=hl_size, batch_size=batch_size, number_of_triplets=triplet_num, logger=logger,
                error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, all_triplets)
    elif algorithm == 'lloe':
        num_landmarks = config['optimizer_params']['num_landmarks']
        subset_size = config['optimizer_params']['subset_size']
        phase1_learning_rate = config['optimizer_params']['phase1_learning_rate']
        phase2_learning_rate = config['optimizer_params']['phase2_learning_rate']
        target_loss = config['optimizer_params']['target_loss']

        number_of_landmarks = min(int(num_landmarks * n_points), 100)
        subset_size = subset_size * number_of_landmarks

        # first phase: embed a small subset containing the landmarks
        landmarks, first_phase_indices, \
            first_phase_subset_size, first_phase_reconstruction, \
            first_phase_loss, first_phase_triplet_error, time_first_phase = first_phase_soe(
                num_landmarks=number_of_landmarks, subset_size=subset_size, data=vec_data,
                dataset_size=n_points, embedding_dim=embedding_dimension, epochs=epochs,
                first_phase_lr=phase1_learning_rate, device=device, target_loss=target_loss,
                batch_size=batch_size, logger=logger)
        embedded_indices = first_phase_indices
        embedded_points = first_phase_reconstruction
        non_embedded_indices = list(set(range(vec_data.shape[0])).difference(set(embedded_indices)))

        my_oracle = Oracle(data=vec_data)
        logger.info('Second Phase: ')
        logger.info('Oracle Created...')
        logger.info('Computing LLOE - Phase 2...')
        print(time_first_phase)

        # second phase: embed the remaining points with point-by-point updates
        second_phase_embeddings_index, \
            second_phase_embeddings, time_second_phase = second_phase(
                my_oracle=my_oracle, non_embedded_indices=non_embedded_indices,
                embedded_indices=embedded_indices, first_phase_embedded_points=embedded_points,
                dim=embedding_dimension, lr=phase2_learning_rate, logger=logger)

        # combine the first and second phase points by index
        x = np.zeros((vec_data.shape[0], embedding_dimension))
        x[embedded_indices] = embedded_points                          # first phase points
        x[second_phase_embeddings_index] = second_phase_embeddings     # second phase points
        time_taken = time_first_phase + time_second_phase

    logger.info('Time Taken for experiment ' + str(time_taken) + ' seconds.')
    logger.info('Evaluating the computed embeddings...')
    test_triplets_dataset = TripletBatchesDataset(vec_data, labels, n_test_triplets, 1000, device)
    test_error = test_triplets_dataset.triplet_error(x)
    procrustes_error = procrustes_disparity(vec_data, x)
    knn_error_ord_emb, knn_error_true_emb = knn_classification_error(x, vec_data, labels)

    # log the errors
    logger.info('Train Error: ' + str(train_error))
    logger.info('Test Error: ' + str(test_error))
    logger.info('Procrustes Disparity: ' + str(procrustes_error))
    logger.info('kNN Classification Error on ground-truth: ' + str(knn_error_true_emb))
    logger.info('kNN Classification Error on embedding: ' + str(knn_error_ord_emb))

    return x, train_triplets, labels, train_error, test_error, procrustes_error, knn_error_true_emb, \
        knn_error_ord_emb, time_taken, loss_history, triplet_error_history, time_history
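# A minimal invocation sketch for run_method. It assumes this module's dependencies (the dataset loaders and
# embedding back-ends) are importable; the dataset name, hyperparameters, and logger below are illustrative
# placeholders, not values prescribed by this repository.
def _sketch_run_soe_once():
    import logging
    logger = logging.getLogger('run_method_sketch')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)
    config = {}  # the 'soe' + adam branch reads no extra keys from config; other algorithms do (see above)
    return run_method(config=config, dataset_name='uniform', algorithm='soe', n_points=500,
                      input_dim=10, embedding_dimension=2, learning_rate=0.01, batch_size=5000,
                      triplet_multiplier=1, optimizer='adam', epochs=50, n_test_triplets=10000,
                      logger=logger, error_change_threshold=-1)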