Example 1
def first_phase(num_landmarks, subset_size, data, dataset_size, embedding_dim,
                epochs, target_loss, first_phase_lr, bs, device, logger):
    # Performs the first phase of the algorithm using the method described in the paper
    # get embedding size interval
    embedding_size_intervals = get_subset_sizes(dataset_size, subset_size)
    print('Embedding size intervals: ', embedding_size_intervals)
    # get landmarks
    landmarks, landmark_indices = get_landmark_subset(
        data, number_of_landmarks=num_landmarks)

    # first phase data buffer
    interval_final_epoch_losses = []
    interval_data_subsets = []
    interval_subset_reconstructions = []
    interval_subset_indices = []
    interval_triplet_error = []
    i = 0
    total_time = 0
    subset_data = []
    # first phase flow
    logger.info(
        'Running SOE on different subset sizes until the defined criterion is satisfied.'
    )
    for index, each_subset_size in enumerate(embedding_size_intervals):
        # Get the subset data. Subsets must contain the landmarks; get_subsets takes care of this.
        # Subset indices refer to the original set of indices.
        subset_data, subset_indices = get_subsets(
            data,
            landmark_indices=landmark_indices,
            subset_size=each_subset_size)
        logger.info('First Phase Iteration Index: ' + str(index))
        logger.info('Landmarks: ' + str(landmarks.shape))
        logger.info('Obtained subset size: ' + str(subset_data.shape))
        logger.info('Landmark indices: ' + str(len(landmark_indices)))
        logger.info('Obtained subset indices: ' + str(len(subset_indices)))

        begin_time = time.time()
        # compute distances between landmarks and subset. Recall landmarks are also part of subset
        landmark_to_data_distance = compute_landmark_to_data_distance(
            landmarks=landmarks, subset=subset_data)

        # sort the distance matrix; sorted_indices are indices relative to the subset size.
        sorted_indices_pt_to_l, sorted_distances_pt_to_l = sort_distances_pt_to_landmark(
            distance_mat=landmark_to_data_distance)

        sorted_indices_l_to_pt, sorted_distances_l_to_pt = sort_landmark_to_pt_distances(
            distance_mat=landmark_to_data_distance)

        # now generate LNM-MDS triplets from the distance matrix
        generated_trips = generate_triplets_from_indices(
            sorted_indices_l_to_pt, sorted_indices_pt_to_l, landmark_indices)
        trips = np.asarray(generated_trips)  # W.r.t relative indices

        end_time = time.time()
        total_time += (end_time - begin_time)

        # run soe on the subset data
        embedding_of_subset, loss_history, _, time_soe = soe_adam(
            triplets=trips,
            n=each_subset_size,
            dim=embedding_dim,
            epochs=epochs,
            batch_size=min(bs * (2**i), trips.shape[0]),
            learning_rate=first_phase_lr * (2**i),
            device=device,
            logger=logger)
        total_time += time_soe
        final_epoch_loss = loss_history[-1]

        trip_error, error_list = triplet_error(embedding_of_subset, trips)
        logger.info('Number of Triplets: ' + str(trips.shape[0]))
        logger.info('Training triplet Error: ' + str(trip_error))
        logger.info('Subset Size: ' + str(each_subset_size) + ' Loss: ' +
                    str(final_epoch_loss))
        begin_time = time.time()
        if final_epoch_loss < target_loss:
            i += 1
            interval_final_epoch_losses.append(final_epoch_loss)
            interval_data_subsets.append(subset_data)
            interval_subset_reconstructions.append(embedding_of_subset)
            interval_subset_indices.append(subset_indices)
            interval_triplet_error.append(trip_error)
            # print('Final epoch loss was less than target. Continue doubling the subset')
            continue
        else:
            if len(interval_final_epoch_losses) < 1:
                interval_final_epoch_losses.append(final_epoch_loss)
                interval_data_subsets.append(subset_data)
                interval_subset_reconstructions.append(embedding_of_subset)
                interval_subset_indices.append(subset_indices)
                interval_triplet_error.append(trip_error)
                # print('Final epoch loss was larger than target loss. We do not double the subset size')
                end_time = time.time()
                total_time += (end_time - begin_time)
                continue
            end_time = time.time()
            total_time += (end_time - begin_time)
            break
    logger.info('First Phase is Finished.')

    # Evaluation
    logger.info('Evaluation of the first phase Embedding: ')
    random_trip_indices = gen_triplet_indices(
        n=interval_data_subsets[-1].shape[0], num_trips=10000)
    test_triplet_data = gen_triplet_data(
        data=interval_data_subsets[-1],
        random_triplet_indices=random_trip_indices,
        batch_size=10000)

    test_triplet_error, _ = triplet_error(interval_subset_reconstructions[-1],
                                          test_triplet_data)

    logger.info('Test triplet error in the first phase is ' +
                str(test_triplet_error))

    return landmarks, interval_subset_indices[-1], interval_data_subsets[-1], interval_subset_reconstructions[-1], \
           interval_final_epoch_losses[-1], interval_triplet_error[-1], test_triplet_error, total_time
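# A minimal sketch (not part of the original code) of what get_subset_sizes is assumed to
# return, based on the doubling behaviour referenced above ("Continue doubling the subset"):
# subset sizes start at subset_size and double until they cover the whole dataset.
def get_subset_sizes_sketch(dataset_size, subset_size):
    # e.g. dataset_size=10000, subset_size=500 -> [500, 1000, 2000, 4000, 8000, 10000]
    sizes = []
    current = subset_size
    while current < dataset_size:
        sizes.append(current)
        current *= 2
    sizes.append(dataset_size)
    return sizes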
Example 2
def first_phase_soe(num_landmarks, subset_size, data, dataset_size,
                    embedding_dim, epochs, target_loss, first_phase_lr,
                    batch_size, device, logger):
    # Perform the first phase of the algorithm using SOE. This finds the embedding for "m" points
    # get embedding size interval
    embedding_size_intervals = get_subset_sizes(dataset_size, subset_size)
    print('Embedding size intervals: ', embedding_size_intervals)
    # get landmarks
    landmarks, landmark_indices = get_landmark_subset(
        data, number_of_landmarks=num_landmarks)

    # first phase data buffer
    interval_final_epoch_losses = []
    interval_data_subsets = []
    interval_subset_reconstructions = []
    interval_subset_indices = []
    interval_triplet_error = []
    i = 0
    total_time = 0

    # first phase flow
    logger.info(
        'Running SOE on different subset sizes until the defined criterion is satisfied.'
    )
    for index, each_subset_size in enumerate(embedding_size_intervals):
        # Get the subset data. Subsets must contain the landmarks; get_subsets takes care of this.
        # Subset indices refer to the original set of indices.
        subset_data, subset_indices = get_subsets(
            data,
            landmark_indices=landmark_indices,
            subset_size=each_subset_size)
        logger.info('First Phase Iteration Index: ' + str(index))
        logger.info('Landmarks: ' + str(landmarks.shape))
        logger.info('Obtained subset size: ' + str(subset_data.shape))
        logger.info('Landmark indices: ' + str(len(landmark_indices)))
        logger.info('Obtained subset indices: ' + str(len(subset_indices)))

        num_trips = int(2 * each_subset_size * math.log2(each_subset_size) *
                        embedding_dim)
        trip_indices = np.asarray(
            gen_triplet_indices(each_subset_size,
                                num_trips))  # Random relative triplets

        trips = gen_triplet_data(subset_data,
                                 trip_indices,
                                 batch_size=trip_indices.shape[0])

        embedding_of_subset, loss_history, triplet_error_history, time_taken, _ = soe_adam(
            triplets=trips,
            n=each_subset_size,
            dim=embedding_dim,
            epochs=epochs,
            batch_size=min(trips.shape[0], batch_size * (2**i)),
            learning_rate=first_phase_lr * (2**i),
            device=device,
            logger=logger,
            error_change_threshold=-1)
        total_time += time_taken

        final_epoch_loss = loss_history[-1]
        trip_error, error_list = triplet_error(embedding_of_subset, trips)
        logger.info('Number of Triplets: ' + str(trips.shape[0]))
        logger.info('Training triplet Error: ' + str(trip_error))
        logger.info('Subset Size: ' + str(each_subset_size) + ' Loss: ' +
                    str(final_epoch_loss))
        begin_time = time.time()
        if final_epoch_loss < target_loss:
            i += 1
            interval_final_epoch_losses.append(final_epoch_loss)
            interval_data_subsets.append(subset_data)
            interval_subset_reconstructions.append(embedding_of_subset)
            interval_subset_indices.append(subset_indices)
            interval_triplet_error.append(trip_error)
            # print('Final epoch loss was less than target. Continue doubling the subset')
            end_time = time.time()
            total_time += (end_time - begin_time)
            continue
        else:
            if len(interval_final_epoch_losses) < 1:
                interval_final_epoch_losses.append(final_epoch_loss)
                interval_data_subsets.append(subset_data)
                interval_subset_reconstructions.append(embedding_of_subset)
                interval_subset_indices.append(subset_indices)
                interval_triplet_error.append(trip_error)
                # print('Final epoch loss was larger than target loss. We do not double the subset size')
                continue
            end_time = time.time()
            total_time += (end_time - begin_time)
            break

    logger.info('First Phase is Finished.')
    return landmarks, interval_subset_indices[-1], interval_data_subsets[-1], interval_subset_reconstructions[-1], \
           interval_final_epoch_losses[-1], interval_triplet_error[-1], total_time
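# Illustrative check (not from the original code) of the triplet budget used in
# first_phase_soe: num_trips = 2 * m * log2(m) * d for a subset of size m and embedding dim d.
import math
m, d = 1000, 2                                 # hypothetical subset size and embedding dimension
num_trips = int(2 * m * math.log2(m) * d)      # = 39863, i.e. roughly 40k random triplets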
def run_hyper_search(config):
    dataset_name = config['dataset_selected']
    batch_size = config['batch_size']
    epochs = config['nb_epochs']
    input_dim = config['input_dimension']
    n_points = config['number_of_points']
    number_of_test_triplets = config['n_test_triplets']
    log_dir = config['log']['path']
    triplet_multiplier_range = config['hyper_search']['triplets_multiplier']
    learning_rate_range = config['hyper_search']['learning_rate']
    optimizer = config['optimizer']
    dimensions_range = config['hyper_search']['output_dimension']

    separator = '_'
    experiment_name = 'soe_hyper_search_' + \
                      'data_' + dataset_name + \
                      '_input_dim_' + str(input_dim) + \
                      '_n_pts_' + str(n_points) + \
                      '_num_test_trips_' + str(number_of_test_triplets) + \
                      '_output_dim_' + separator.join([str(i) for i in dimensions_range]) + \
                      '_lr_' + separator.join([str(i) for i in learning_rate_range]) + \
                      '_bs_' + str(batch_size) + \
                      '_triplet_number_' + separator.join([str(i) for i in triplet_multiplier_range])

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    logging_path = os.path.join(log_dir, experiment_name + '.log')
    logger = logging_util.my_custom_logger(logger_name=logging_path,
                                           level=logging.INFO)
    logger.info('Name of Experiment: ' + experiment_name)
    logger.info('Logging Path:' + logging_path)
    logger.info('Dataset Name: ' + dataset_name)
    logger.info('Epochs: ' + str(epochs))

    best_params_train = {}
    best_params_test = {}
    all_results = {}

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    vec_data, labels = select_dataset(
        dataset_name, n_samples=n_points, input_dim=input_dim
    )  # input_dim is only used for the 'uniform' dataset; ignored otherwise
    n_points = vec_data.shape[0]
    logn = int(np.log2(n_points))
    for (emb_dim, triplet_multiplier) in product(dimensions_range,
                                                 triplet_multiplier_range):
        all_results[(emb_dim, triplet_multiplier)] = {}
        best_train_error = 1
        best_test_error = 1

        triplet_num = triplet_multiplier * logn * n_points * emb_dim

        bs = min(batch_size, triplet_num)
        train_triplets_dataset = TripletBatchesDataset(vec_data, labels,
                                                       triplet_num, bs, device)
        logger.info('Testing on: ' + dataset_name +
                    '. Embedding dimension is ' + str(emb_dim))
        logger.info(' ')
        for learning_rate in learning_rate_range:

            logger.info(10 * '-' + ' New parameters' + 10 * '-')
            logger.info('Learning Rate: ' + str(learning_rate))
            logger.info('Number of Points: ' + str(n_points))
            logger.info('Input Dimension: ' + str(input_dim))
            logger.info('Output Dimension: ' + str(emb_dim))
            logger.info('Number of Test Triplets: ' +
                        str(number_of_test_triplets))
            logger.info('Triplet Multiplier: ' + str(triplet_multiplier))
            logger.info('Batch Size: ' + str(batch_size))

            logger.info('Computing SOE...')

            x, loss_history, triplet_error_history, time_taken, time_history = soe.soe_adam(
                triplets=train_triplets_dataset.trips_data_indices,
                n=n_points,
                dim=emb_dim,
                epochs=epochs,
                batch_size=bs,
                learning_rate=learning_rate,
                device=device,
                logger=logger)

            logger.info('Evaluating the computed embeddings...')
            # compute triplet error for train and test data
            train_error = train_triplets_dataset.triplet_error(x)
            logger.info('Triplet Error on Training Triplets: ' +
                        str(train_error))
            test_triplets_dataset = TripletBatchesDataset(
                vec_data, labels, number_of_test_triplets, 1000, device)
            test_error = test_triplets_dataset.triplet_error(x)
            #procrustes_error = procrustes_disparity(vec_data, x)
            #knn_error_ord_emb, knn_error_true_emb = knn_classification_error(x, vec_data, labels)

            logger.info('Epochs: ' + str(epochs))
            logger.info('Time Taken: ' + str(time_taken) + ' seconds.')
            logger.info('Train Error: ' + str(train_error))
            logger.info('Test Error: ' + str(test_error))
            #logger.info('Procrustes Disparity: ' + str(procrustes_error))
            #logger.info('kNN Classification Error on ground-truth: ' + str(knn_error_true_emb))
            #logger.info('kNN Classification Error on embedding: ' + str(knn_error_ord_emb))

            results = {
                'train_error': train_error,
                'test_error': test_error,
                'loss_history': loss_history,
                'error_history': triplet_error_history,
                'last_embedding': x
            }
            all_results[(emb_dim,
                         triplet_multiplier)].update({learning_rate: results})

            if test_error < best_test_error:
                best_params_test[(emb_dim, triplet_multiplier)] = {
                    'learning_rate': learning_rate,
                    'optimizer': optimizer,
                    'error': test_error
                }
                best_test_error = test_error
            if train_error < best_train_error:
                best_params_train[(emb_dim, triplet_multiplier)] = {
                    'learning_rate': learning_rate,
                    'optimizer': optimizer,
                    'error': train_error
                }
                best_train_error = train_error

        result_name = 'soe_convergence_' + \
                      'data_' + dataset_name + \
                      '_input_dim_' + str(input_dim) + \
                      '_n_pts_' + str(n_points) + \
                      '_output_dim_' + str(emb_dim) + \
                      '_bs_' + str(batch_size) + \
                      '_triplet_number_' + str(triplet_multiplier)
        all_results['labels'] = labels
        joblib.dump(all_results[(emb_dim, triplet_multiplier)],
                    os.path.join(log_dir, result_name + '.pkl'))

    # print all results as well again
    logger.info(10 * '-' + 'ALL RESULTS ' + 10 * '-')
    for (emb_dim, triplet_multiplier) in product(dimensions_range,
                                                 triplet_multiplier_range):
        results = all_results[(emb_dim, triplet_multiplier)]
        logger.info('Results for emb dimension ' + str(emb_dim) +
                    ' and triplet multiplier ' + str(triplet_multiplier))
        for learning_rate in learning_rate_range:
            logger.info('learning rate ' + str(learning_rate) +
                        ' -- train error: ' +
                        str(results[learning_rate]['train_error']) +
                        ' test error: ' +
                        str(results[learning_rate]['test_error']))

    # print best parameter settings
    for (emb_dim, triplet_multiplier) in product(dimensions_range,
                                                 triplet_multiplier_range):
        logger.info('Best Parameters for emb dimension ' + str(emb_dim) +
                    ' and triplet multiplier ' + str(triplet_multiplier))
        best_on_train = best_params_train[(emb_dim, triplet_multiplier)]
        best_on_test = best_params_test[(emb_dim, triplet_multiplier)]
        logger.info('achieved ' + str(best_on_train['error']) +
                    ' train error with learning rate: ' +
                    str(best_on_train['learning_rate']))
        logger.info('achieved ' + str(best_on_test['error']) +
                    ' test error with learning rate: ' +
                    str(best_on_test['learning_rate']))
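# A minimal config sketch for run_hyper_search, inferred from the keys it reads above;
# the values are placeholders, not settings from the original experiments.
example_hyper_search_config = {
    'dataset_selected': 'uniform',
    'batch_size': 10000,
    'nb_epochs': 100,
    'input_dimension': 10,
    'number_of_points': 1000,
    'n_test_triplets': 10000,
    'optimizer': 'adam',
    'log': {'path': 'logs/'},
    'hyper_search': {
        'triplets_multiplier': [1, 2],
        'learning_rate': [0.01, 0.1],
        'output_dimension': [2],
    },
}
# run_hyper_search(config=example_hyper_search_config)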
def main(args):
    config = load_config(args.config_path)
    dataset_name = config['dataset_selected']
    error_change_threshold = config['error_change_threshold']
    batch_size = config['batch_size']
    learning_rate = config['optimizer_params']['learning_rate']
    epochs = config['nb_epochs']
    input_dim = config['input_dimension']
    embedding_dimension = config['output_dimension']
    n_points = config['number_of_points']
    number_of_test_triplets = config['n_test_triplets']
    triplet_multiplier = config['triplets_multiplier']
    log_dir = config['log']['path']
    hyper_search = config['hyper_search']['activation']
    optimizer = config['optimizer']

    if hyper_search:
        run_hyper_search(config=config)
    else:
        vec_data, labels = select_dataset(dataset_name,
                                          n_samples=n_points,
                                          input_dim=input_dim)
        n_points = vec_data.shape[0]
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        logn = int(np.log2(n_points))
        triplet_num = triplet_multiplier * logn * n_points * embedding_dimension

        batch_size = min(batch_size, triplet_num)

        if not os.path.exists(log_dir):
            os.makedirs(log_dir)

        experiment_name = 'soe_' + \
                          'data_' + dataset_name + \
                          '_error_change_threshold_' + str(error_change_threshold) + \
                          '_input_dim_' + str(input_dim) + \
                          '_output_dim_' + str(embedding_dimension) + \
                          '_originaldimension_' + str(vec_data.shape[1]) + \
                          '_triplet_num_' + str(triplet_multiplier) + \
                          '_n_pts_' + str(n_points) + \
                          '_lr_' + str(learning_rate) + \
                          '_optimizer_' + str(optimizer) + \
                          '_bs_' + str(batch_size)

        # create a logging file for extensive logging
        logging_path = os.path.join(log_dir, experiment_name + '.log')
        logger = logging_util.my_custom_logger(logger_name=logging_path,
                                               level=logging.INFO)

        logger.info('Name of Experiment: ' + experiment_name)
        logger.info('Logging Path:' + logging_path)
        logger.info('Dataset Name: ' + dataset_name)
        logger.info('Error Change Threshold: ' + str(error_change_threshold))
        logger.info('Epochs: ' + str(epochs))
        logger.info('Learning Rate: ' + str(learning_rate))
        logger.info('Number of Points: ' + str(n_points))
        logger.info('Input Dimension: ' + str(input_dim))
        logger.info('Output Dimension: ' + str(embedding_dimension))
        logger.info('Number of Test Triplets: ' + str(number_of_test_triplets))
        logger.info('Triplet Multiplier: ' + str(triplet_multiplier))
        logger.info('Batch Size: ' + str(batch_size))

        train_triplets_dataset = TripletBatchesDataset(vec_data, labels,
                                                       triplet_num, batch_size,
                                                       device)

        logger.info('Computing SOE...')
        x, loss_history, triplet_error_history, time_taken, time_history = soe.soe_adam(
            triplets=train_triplets_dataset.trips_data_indices,
            n=n_points,
            dim=embedding_dimension,
            epochs=epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            device=device,
            logger=logger,
            error_change_threshold=error_change_threshold)

        logger.info('Evaluating the computed embeddings...')
        # compute triplet error for train and test data
        train_error = train_triplets_dataset.triplet_error(x)
        test_triplets_dataset = TripletBatchesDataset(vec_data, labels,
                                                      number_of_test_triplets,
                                                      1000, device)
        test_error = test_triplets_dataset.triplet_error(x)
        procrustes_error = procrustes_disparity(vec_data, x)
        knn_error_ord_emb, knn_error_true_emb = knn_classification_error(
            x, vec_data, labels)

        # sample points for tsne visualization
        subsample = np.random.permutation(n_points)[0:500]
        x = x[subsample, :]
        sub_labels = labels[subsample]

        x_embedded = TSNE(n_components=2, perplexity=15,
                          learning_rate=10).fit_transform(x)
        fig, ax = plt.subplots(1, 1)

        ax.scatter(x_embedded[:, 0], x_embedded[:, 1], s=3, c=sub_labels)
        fig.savefig(os.path.join(log_dir, experiment_name + '.png'))

        logger.info('Name of Experiment: ' + experiment_name)
        logger.info('Epochs: ' + str(epochs))
        logger.info('Time Taken: ' + str(time_taken) + ' seconds.')
        logger.info('Train Error: ' + str(train_error))
        logger.info('Test Error: ' + str(test_error))
        logger.info('Procrustes Disparity: ' + str(procrustes_error))
        logger.info('kNN Classification Error on ground-truth: ' +
                    str(knn_error_true_emb))
        logger.info('kNN Classification Error on embedding: ' +
                    str(knn_error_ord_emb))

        results = {
            'train_error': train_error,
            'test_error': test_error,
            'procrustes': procrustes_error,
            'knn_true': knn_error_true_emb,
            'knn_ord_emb': knn_error_ord_emb,
            'labels': labels,
            'loss_history': loss_history,
            'error_history': triplet_error_history,
            'ordinal_embedding': x,
            'time_taken': time_taken
        }
        joblib.dump(results, os.path.join(log_dir, experiment_name + '.pkl'))
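# A typical entry point for main (a sketch; the project's actual CLI wrapper is not shown here).
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Run SOE ordinal embedding experiments.')
    parser.add_argument('--config_path', type=str, required=True,
                        help='Path to the config file consumed by load_config')
    main(parser.parse_args())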
def run_method(config, dataset_name, algorithm, n_points, input_dim,
               embedding_dimension, learning_rate, batch_size,
               triplet_multiplier, optimizer, epochs, n_test_triplets, logger,
               error_change_threshold):
    vec_data, labels = select_dataset(dataset_name,
                                      n_samples=n_points,
                                      input_dim=input_dim)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    n_points = vec_data.shape[0]

    triplet_num = int(
        np.ceil(triplet_multiplier * n_points * math.log2(n_points) *
                embedding_dimension))
    train_triplets = []
    loss_history, triplet_error_history, time_history = [], [], []

    batch_size = min(batch_size, triplet_num)

    logger.info('Computing Embedding...')
    logger.info('Number of Points: ' + str(n_points))
    logger.info('Number of Triplets: ' + str(triplet_num))
    logger.info('Input Dimension: ' + str(input_dim))
    logger.info('Output Dimension: ' + str(embedding_dimension))
    time_taken = 0
    train_error = -1  # active methods won't have a train error
    if optimizer == 'adam' and algorithm == 'soe':
        logger.info('Generating triplets...')

        train_triplets_dataset = TripletBatchesDataset(vec_data, labels,
                                                       triplet_num, batch_size,
                                                       device)
        train_triplets = train_triplets_dataset.trips_data_indices

        x, loss_history, triplet_error_history, time_taken, time_history = soe.soe_adam(
            triplets=train_triplets,
            n=n_points,
            dim=embedding_dimension,
            epochs=epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            device=device,
            logger=logger,
            error_change_threshold=error_change_threshold)
        # compute triplet error for train and test data
        train_error = triplet_error_batches(x, train_triplets)

    elif optimizer == 'sgd' and algorithm == 'soe':
        logger.info('Generating triplets...')

        train_triplets_dataset = TripletBatchesDataset(vec_data, labels,
                                                       triplet_num, batch_size,
                                                       device)
        train_triplets = train_triplets_dataset.trips_data_indices

        x, loss_history, triplet_error_history, time_taken, time_history = soe.soe_sgd(
            triplets=train_triplets,
            n=n_points,
            dim=embedding_dimension,
            iterations=epochs,
            bs=batch_size,
            lr=learning_rate,
            device=device,
            logger=logger)
        # compute triplet error for train and test data
        train_error = triplet_error_batches(x, train_triplets)
    elif algorithm == 'ste':
        logger.info('Generating triplets...')

        train_triplets_dataset = TripletBatchesDataset(vec_data, labels,
                                                       triplet_num, batch_size,
                                                       device)
        train_triplets = train_triplets_dataset.trips_data_indices

        x, loss_history, triplet_error_history, time_taken, time_history = ste.ste_adam(
            triplets=train_triplets,
            n=n_points,
            dim=embedding_dimension,
            epochs=epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            device=device,
            logger=logger,
            error_change_threshold=error_change_threshold)
        # compute triplet error for train and test data
        train_error = triplet_error_batches(x, train_triplets)
    elif algorithm == 'tste':
        logger.info('Generating triplets...')

        train_triplets_dataset = TripletBatchesDataset(vec_data, labels,
                                                       triplet_num, batch_size,
                                                       device)
        train_triplets = train_triplets_dataset.trips_data_indices

        x, loss_history, triplet_error_history, time_taken, time_history = tste.t_ste_adam(
            triplets=train_triplets,
            n=n_points,
            emb_dim=embedding_dimension,
            epochs=epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            device=device,
            logger=logger,
            error_change_threshold=error_change_threshold)
        # compute triplet error for train and test data
        train_error = triplet_error_batches(x, train_triplets)
    elif algorithm == 'triplet_loss':
        logger.info('Generating triplets...')

        train_triplets_dataset = TripletBatchesDataset(vec_data, labels,
                                                       triplet_num, batch_size,
                                                       device)
        train_triplets = train_triplets_dataset.trips_data_indices

        x, loss_history, triplet_error_history, time_taken, time_history = soe.triplet_loss_adam(
            triplets=train_triplets,
            n=n_points,
            dim=embedding_dimension,
            iterations=epochs,
            batch_size=batch_size,
            lr=learning_rate,
            device=device,
            logger=logger,
            error_change_threshold=error_change_threshold)
        # compute triplet error for train and test data
        train_error = triplet_error_batches(x, train_triplets)
    elif algorithm == 'gnmds':
        regularizer = config['regularizer']
        logger.info('Generating triplets...')

        train_triplets_dataset = TripletBatchesDataset(vec_data, labels,
                                                       triplet_num, batch_size,
                                                       device)
        train_triplets = train_triplets_dataset.trips_data_indices

        x, loss_history, triplet_error_history, time_taken, time_history = gnmds.gnmds(
            triplets=train_triplets,
            reg_lbda=regularizer,
            n=n_points,
            dim=embedding_dimension,
            epochs=epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            device=device,
            logger=logger,
            error_change_threshold=error_change_threshold)
        # compute triplet error for train and test data
        train_error = triplet_error_batches(x, train_triplets)
    elif algorithm == 'forte':
        logger.info('Generating triplets...')

        train_triplets_dataset = TripletBatchesDataset(vec_data, labels,
                                                       triplet_num, batch_size,
                                                       device)
        train_triplets = train_triplets_dataset.trips_data_indices

        x, loss_history, triplet_error_history, time_taken, time_history = forte.rank_d_pgd(
            triplets=train_triplets,
            n=n_points,
            dim=embedding_dimension,
            epochs=epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            device=device,
            logger=logger,
            error_change_threshold=error_change_threshold)
        # compute triplet error for train and test data
        train_error = triplet_error_batches(x, train_triplets)
    elif algorithm == 'ckl':
        regularizer = config['regularizer']
        mu = config['mu']
        logger.info('Generating triplets...')

        train_triplets_dataset = TripletBatchesDataset(vec_data, labels,
                                                       triplet_num, batch_size,
                                                       device)
        train_triplets = train_triplets_dataset.trips_data_indices

        x, loss_history, triplet_error_history, time_taken, time_history = ckl.ckl_k(
            triplets=train_triplets,
            reg_lbda=regularizer,
            mu=mu,
            n=n_points,
            dim=embedding_dimension,
            epochs=epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            device=device,
            logger=logger,
            error_change_threshold=error_change_threshold)
        # compute triplet error for train and test data
        train_error = triplet_error_batches(x, train_triplets)

    elif algorithm == 'ckl_x':
        mu = config['mu']
        logger.info('Generating triplets...')

        train_triplets_dataset = TripletBatchesDataset(vec_data, labels,
                                                       triplet_num, batch_size,
                                                       device)
        train_triplets = train_triplets_dataset.trips_data_indices

        x, loss_history, triplet_error_history, time_taken, time_history = ckl.ckl_x(
            triplets=train_triplets,
            mu=mu,
            n=n_points,
            dim=embedding_dimension,
            epochs=epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            device=device,
            logger=logger,
            error_change_threshold=error_change_threshold)
        # compute triplet error for train and test data
        train_error = triplet_error_batches(x, train_triplets)
    elif algorithm == 'loe':
        x, time_taken, train_error = landmark_oe.landmark_oe_with_data(
            data=vec_data,
            dim=embedding_dimension,
            trip_num=triplet_num,
            learning_rate=learning_rate,
            epochs=epochs,
            batch_size=batch_size,
            device=device,
            logger=logger)
    elif algorithm == 'oenn':

        number_of_neighbours = 50  # config['number_of_neighbours']
        metric = 'eu'  # config['metric']
        all_triplets, triplet_loaders = data_utils_oenn.prep_data_for_nn(
            vec_data=vec_data,
            labels=labels,
            triplet_num=triplet_num,
            batch_size=batch_size,
            metric=metric,
            number_of_neighbours=number_of_neighbours)

        hl_size = int(120 + (2 * embedding_dimension * math.log2(n_points)))
        x, loss_history, triplet_error_history, time_taken, time_history = training_routine_v3.create_and_train_triplet_network(
            dataset_name=dataset_name,
            ind_loaders=triplet_loaders,
            n=n_points,
            dim=embedding_dimension,
            layers=3,
            learning_rate=learning_rate,
            epochs=epochs,
            hl_size=hl_size,
            batch_size=batch_size,
            number_of_triplets=triplet_num,
            logger=logger,
            error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, all_triplets)

    elif algorithm == 'lloe':
        num_landmarks = config['optimizer_params']['num_landmarks']
        subset_size = config['optimizer_params']['subset_size']
        phase1_learning_rate = config['optimizer_params'][
            'phase1_learning_rate']
        phase2_learning_rate = config['optimizer_params'][
            'phase2_learning_rate']
        target_loss = config['optimizer_params']['target_loss']

        number_of_landmarks = min(int(num_landmarks * n_points), 100)
        subset_size = subset_size * number_of_landmarks
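        # For illustration (hypothetical config values, not from the original experiments):
        # with num_landmarks=0.05, n_points=1000 and subset_size=4, the two lines above give
        # number_of_landmarks = min(int(0.05 * 1000), 100) = 50 and subset_size = 4 * 50 = 200.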
        landmarks, first_phase_indices, \
        first_phase_subset_size, first_phase_reconstruction, \
        first_phase_loss, first_phase_triplet_error, time_first_phase = first_phase_soe(
            num_landmarks=number_of_landmarks,
            subset_size=subset_size,
            data=vec_data, dataset_size=n_points,
            embedding_dim=embedding_dimension, epochs=epochs,
            first_phase_lr=phase1_learning_rate,
            device=device,
            target_loss=target_loss,
            batch_size=batch_size,
            logger=logger)
        embedded_indices = first_phase_indices
        embedded_points = first_phase_reconstruction
        non_embedded_indices = list(
            set(range(vec_data.shape[0])).difference(set(embedded_indices)))
        my_oracle = Oracle(data=vec_data)
        logger.info('Second Phase: ')
        logger.info('Oracle Created...')
        logger.info('Computing LLOE - Phase 2...')
        print(time_first_phase)
        # second phase for embedding point by point update
        second_phase_embeddings_index, \
        second_phase_embeddings, time_second_phase = second_phase(my_oracle=my_oracle,
                                                                  non_embedded_indices=non_embedded_indices,
                                                                  embedded_indices=embedded_indices,
                                                                  first_phase_embedded_points=embedded_points,
                                                                  dim=embedding_dimension,
                                                                  lr=phase2_learning_rate, logger=logger)
        # combine the first phase and second phase points and index
        x = np.zeros((vec_data.shape[0], embedding_dimension))
        # phase 1 points
        x[embedded_indices] = embedded_points
        # second phase points
        x[second_phase_embeddings_index] = second_phase_embeddings
        time_taken = time_first_phase + time_second_phase

    logger.info('Time Taken for experiment ' + str(time_taken) + ' seconds.')
    logger.info('Evaluating the computed embeddings...')

    test_triplets_dataset = TripletBatchesDataset(vec_data, labels,
                                                  n_test_triplets, 1000,
                                                  device)
    test_error = test_triplets_dataset.triplet_error(x)
    procrustes_error = procrustes_disparity(vec_data, x)
    knn_error_ord_emb, knn_error_true_emb = knn_classification_error(
        x, vec_data, labels)

    # log the errors
    logger.info('Train Error: ' + str(train_error))
    logger.info('Test Error: ' + str(test_error))
    logger.info('Procrustes Disparity: ' + str(procrustes_error))
    logger.info('kNN Classification Error on ground-truth: ' +
                str(knn_error_true_emb))
    logger.info('kNN Classification Error on embedding: ' +
                str(knn_error_ord_emb))
    return x, train_triplets, labels, train_error, test_error, procrustes_error, knn_error_true_emb, knn_error_ord_emb, time_taken, loss_history, triplet_error_history, time_history
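# Example call of run_method (illustrative values only; config must additionally provide
# 'regularizer'/'mu' for gnmds and ckl, or 'optimizer_params' for lloe):
# x, train_triplets, labels, train_error, test_error, procrustes_error, \
#     knn_true, knn_ord, time_taken, loss_history, triplet_error_history, time_history = run_method(
#         config=config, dataset_name='mnist', algorithm='soe', n_points=1000, input_dim=784,
#         embedding_dimension=2, learning_rate=0.01, batch_size=10000, triplet_multiplier=1,
#         optimizer='adam', epochs=100, n_test_triplets=10000, logger=logger,
#         error_change_threshold=-1)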