def _sample_train_triplets(vec_data, labels, triplet_num, batch_size, device,
                           logger):
    """Build a TripletBatchesDataset and return its ``trips_data_indices``."""
    logger.info('Generating triplets...')
    triplet_dataset = TripletBatchesDataset(vec_data, labels, triplet_num,
                                            batch_size, device)
    return triplet_dataset.trips_data_indices


def _resolve_triplet_solver(config, algorithm, optimizer, embedding_dimension,
                            epochs, batch_size, learning_rate,
                            error_change_threshold):
    """Select the solver for algorithms trained on pre-generated triplets.

    Returns a ``(solver, kwargs)`` pair, or ``None`` when the
    algorithm/optimizer combination is not one of the triplet-based solvers.
    ``kwargs`` holds only the algorithm-specific keyword arguments; the caller
    adds ``triplets``, ``n``, ``device`` and ``logger``.
    """
    # Keyword names used by the majority of the solvers.
    common = dict(dim=embedding_dimension,
                  epochs=epochs,
                  batch_size=batch_size,
                  learning_rate=learning_rate,
                  error_change_threshold=error_change_threshold)

    if algorithm == 'soe':
        if optimizer == 'adam':
            return soe.soe_adam, common
        if optimizer == 'sgd':
            # soe_sgd uses its own parameter names and is not given an
            # error_change_threshold.
            return soe.soe_sgd, dict(dim=embedding_dimension,
                                     iterations=epochs,
                                     bs=batch_size,
                                     lr=learning_rate)
        return None
    if algorithm == 'ste':
        return ste.ste_adam, common
    if algorithm == 'tste':
        # t_ste_adam names the embedding dimension ``emb_dim``.
        tste_kwargs = dict(common)
        tste_kwargs['emb_dim'] = tste_kwargs.pop('dim')
        return tste.t_ste_adam, tste_kwargs
    if algorithm == 'triplet_loss':
        return soe.triplet_loss_adam, dict(
            dim=embedding_dimension,
            iterations=epochs,
            batch_size=batch_size,
            lr=learning_rate,
            error_change_threshold=error_change_threshold)
    if algorithm == 'gnmds':
        return gnmds.gnmds, dict(common, reg_lbda=config['regularizer'])
    if algorithm == 'forte':
        return forte.rank_d_pgd, common
    if algorithm == 'ckl':
        return ckl.ckl_k, dict(common,
                               reg_lbda=config['regularizer'],
                               mu=config['mu'])
    if algorithm == 'ckl_x':
        return ckl.ckl_x, dict(common, mu=config['mu'])
    return None


def run_method(config, dataset_name, algorithm, n_points, input_dim,
               embedding_dimension, learning_rate, batch_size,
               triplet_multiplier, optimizer, epochs, n_test_triplets, logger,
               error_change_threshold):
    """Compute an embedding with the selected algorithm and evaluate it.

    Args:
        config: Mapping with algorithm-specific settings. ``'regularizer'``
            is read for ``gnmds``/``ckl``, ``'mu'`` for ``ckl``/``ckl_x`` and
            ``'optimizer_params'`` for ``lloe``.
        dataset_name: Passed to ``select_dataset`` (and to the ``oenn``
            training routine).
        algorithm: One of ``'soe'``, ``'ste'``, ``'tste'``,
            ``'triplet_loss'``, ``'gnmds'``, ``'forte'``, ``'ckl'``,
            ``'ckl_x'``, ``'loe'``, ``'oenn'`` or ``'lloe'``.
        n_points: Requested number of samples; replaced internally by the
            number of rows actually returned by ``select_dataset``.
        input_dim: Passed to ``select_dataset``.
        embedding_dimension: Dimension of the computed embedding.
        learning_rate: Learning rate forwarded to the solver (``lloe`` reads
            its own rates from ``config``).
        batch_size: Upper bound on the batch size; capped at the number of
            training triplets.
        triplet_multiplier: Scales the number of training triplets, which is
            ``ceil(multiplier * n * log2(n) * embedding_dimension)``.
        optimizer: ``'adam'`` or ``'sgd'``; only consulted for ``'soe'``.
        epochs: Number of epochs/iterations forwarded to the solver.
        n_test_triplets: Number of triplets drawn for the test error.
        logger: Logger used for progress and result messages.
        error_change_threshold: Forwarded to the solvers that accept it.

    Returns:
        A 12-tuple ``(x, train_triplets, labels, train_error, test_error,
        procrustes_error, knn_error_true_emb, knn_error_ord_emb, time_taken,
        loss_history, triplet_error_history, time_history)``.
        ``train_triplets`` and the three history lists stay empty, and
        ``train_error`` stays ``-1``, for branches that do not produce them.

    Raises:
        ValueError: If the algorithm/optimizer combination is not supported.
    """
    vec_data, labels = select_dataset(dataset_name,
                                      n_samples=n_points,
                                      input_dim=input_dim)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Use the number of points actually loaded, not the requested amount.
    n_points = vec_data.shape[0]

    # Builtin int: the np.int alias no longer exists in current NumPy.
    triplet_num = int(
        np.ceil(triplet_multiplier * n_points * math.log2(n_points) *
                embedding_dimension))
    train_triplets = []
    loss_history, triplet_error_history, time_history = [], [], []

    batch_size = min(batch_size, triplet_num)

    logger.info('Computing Embedding...')
    logger.info('Number of Points: ' + str(n_points))
    logger.info('Number of Triplets: ' + str(triplet_num))
    logger.info('Input Dimension: ' + str(input_dim))
    logger.info('Output Dimension: ' + str(embedding_dimension))
    time_taken = 0
    train_error = -1  # active methods wont have a train error

    solver_spec = _resolve_triplet_solver(config, algorithm, optimizer,
                                          embedding_dimension, epochs,
                                          batch_size, learning_rate,
                                          error_change_threshold)

    if solver_spec is not None:
        solver, solver_kwargs = solver_spec
        train_triplets = _sample_train_triplets(vec_data, labels, triplet_num,
                                                batch_size, device, logger)
        (x, loss_history, triplet_error_history, time_taken,
         time_history) = solver(triplets=train_triplets,
                                n=n_points,
                                device=device,
                                logger=logger,
                                **solver_kwargs)
        # compute triplet error on the training triplets
        train_error = triplet_error_batches(x, train_triplets)

    elif algorithm == 'loe':
        x, time_taken, train_error = landmark_oe.landmark_oe_with_data(
            data=vec_data,
            dim=embedding_dimension,
            trip_num=triplet_num,
            learning_rate=learning_rate,
            epochs=epochs,
            batch_size=batch_size,
            device=device,
            logger=logger)

    elif algorithm == 'oenn':
        number_of_neighbours = 50  # config['number_of_neighbours']
        metric = 'eu'  # config['metric']
        all_triplets, triplet_loaders = data_utils_oenn.prep_data_for_nn(
            vec_data=vec_data,
            labels=labels,
            triplet_num=triplet_num,
            batch_size=batch_size,
            metric=metric,
            number_of_neighbours=number_of_neighbours)

        # Hidden layer width grows with the embedding dimension and log2(n).
        hl_size = int(120 + (2 * embedding_dimension * math.log2(n_points)))
        (x, loss_history, triplet_error_history, time_taken,
         time_history) = training_routine_v3.create_and_train_triplet_network(
             dataset_name=dataset_name,
             ind_loaders=triplet_loaders,
             n=n_points,
             dim=embedding_dimension,
             layers=3,
             learning_rate=learning_rate,
             epochs=epochs,
             hl_size=hl_size,
             batch_size=batch_size,
             number_of_triplets=triplet_num,
             logger=logger,
             error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, all_triplets)

    elif algorithm == 'lloe':
        lloe_params = config['optimizer_params']
        num_landmarks = lloe_params['num_landmarks']
        subset_size = lloe_params['subset_size']
        phase1_learning_rate = lloe_params['phase1_learning_rate']
        phase2_learning_rate = lloe_params['phase2_learning_rate']
        target_loss = lloe_params['target_loss']

        # num_landmarks is multiplied by n_points, and the result is capped
        # at 100 landmarks.
        number_of_landmarks = min(int(num_landmarks * n_points), 100)
        subset_size = subset_size * number_of_landmarks
        (landmarks, first_phase_indices, first_phase_subset_size,
         first_phase_reconstruction, first_phase_loss,
         first_phase_triplet_error, time_first_phase) = first_phase_soe(
             num_landmarks=number_of_landmarks,
             subset_size=subset_size,
             data=vec_data,
             dataset_size=n_points,
             embedding_dim=embedding_dimension,
             epochs=epochs,
             first_phase_lr=phase1_learning_rate,
             device=device,
             target_loss=target_loss,
             batch_size=batch_size,
             logger=logger)
        embedded_indices = first_phase_indices
        embedded_points = first_phase_reconstruction
        non_embedded_indices = list(
            set(range(vec_data.shape[0])).difference(set(embedded_indices)))
        my_oracle = Oracle(data=vec_data)
        logger.info('Second Phase: ')
        logger.info('Oracle Created...')
        logger.info('Computing LLOE - Phase 2...')
        # Report the first-phase duration through the logger instead of a
        # bare print so it ends up in the experiment log.
        logger.info('Time Taken for first phase ' + str(time_first_phase) +
                    ' seconds.')
        # second phase for embedding point by point update
        (second_phase_embeddings_index, second_phase_embeddings,
         time_second_phase) = second_phase(
             my_oracle=my_oracle,
             non_embedded_indices=non_embedded_indices,
             embedded_indices=embedded_indices,
             first_phase_embedded_points=embedded_points,
             dim=embedding_dimension,
             lr=phase2_learning_rate,
             logger=logger)
        # combine the first phase and second phase points and index
        x = np.zeros((vec_data.shape[0], embedding_dimension))
        # phase 1 points
        x[embedded_indices] = embedded_points
        # second phase points
        x[second_phase_embeddings_index] = second_phase_embeddings
        time_taken = time_first_phase + time_second_phase

    else:
        # Without this guard ``x`` would be unbound and the evaluation below
        # would fail with an unhelpful UnboundLocalError.
        raise ValueError(
            'Unsupported algorithm/optimizer combination: '
            'algorithm={!r}, optimizer={!r}'.format(algorithm, optimizer))

    logger.info('Time Taken for experiment ' + str(time_taken) + ' seconds.')
    logger.info('Evaluating the computed embeddings...')

    test_triplets_dataset = TripletBatchesDataset(vec_data, labels,
                                                  n_test_triplets, 1000,
                                                  device)
    test_error = test_triplets_dataset.triplet_error(x)
    procrustes_error = procrustes_disparity(vec_data, x)
    knn_error_ord_emb, knn_error_true_emb = knn_classification_error(
        x, vec_data, labels)

    # log the errors
    logger.info('Train Error: ' + str(train_error))
    logger.info('Test Error: ' + str(test_error))
    logger.info('Procrustes Disparity: ' + str(procrustes_error))
    logger.info('kNN Classification Error on ground-truth: ' +
                str(knn_error_true_emb))
    logger.info('kNN Classification Error on embedding: ' +
                str(knn_error_ord_emb))
    return (x, train_triplets, labels, train_error, test_error,
            procrustes_error, knn_error_true_emb, knn_error_ord_emb,
            time_taken, loss_history, triplet_error_history, time_history)
Exemple #2
0
def main(args):
    """Run one t-STE experiment described by a config file.

    Loads the config at ``args.config_path``. If
    ``config['hyper_search']['activation']`` is truthy the work is delegated
    to ``run_hyper_search``. Otherwise the dataset is loaded, training
    triplets are generated, an embedding is computed with
    ``tste.t_ste_adam`` and evaluated, a t-SNE scatter plot of up to 500
    points is saved as ``<experiment>.png`` and the results are dumped to
    ``<experiment>.pkl`` inside the configured log directory.

    Args:
        args: Object exposing a ``config_path`` attribute.
    """
    config = load_config(args.config_path)
    dataset_name = config['dataset_selected']
    error_change_threshold = config['error_change_threshold']
    batch_size = config['batch_size']
    learning_rate = config['optimizer_params']['learning_rate']
    epochs = config['nb_epochs']
    input_dim = config['input_dimension']
    embedding_dimension = config['output_dimension']
    n_samples = config['number_of_points']
    number_of_test_triplets = config['n_test_triplets']
    triplet_multiplier = config['triplets_multiplier']
    log_dir = config['log']['path']
    hyper_search = config['hyper_search']['activation']
    optimizer = config['optimizer']

    if hyper_search:
        run_hyper_search(config=config)
        return

    vec_data, labels = select_dataset(dataset_name,
                                      n_samples=n_samples,
                                      input_dim=input_dim)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    n = vec_data.shape[0]
    logn = int(np.log2(n))
    # NOTE(review): log2(n) is floored and the product is not coerced to int;
    # if triplets_multiplier is a float, triplet_num (and possibly bs) will be
    # a float - confirm TripletBatchesDataset accepts that.
    triplet_num = triplet_multiplier * logn * n * embedding_dimension

    bs = min(batch_size, triplet_num)

    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(log_dir, exist_ok=True)

    experiment_name = 'tste_' + \
                      'data_' + dataset_name + \
                      '_error_change_threshold_' + str(error_change_threshold) + \
                      '_input_dim_' + str(input_dim) + \
                      '_output_dim_' + str(embedding_dimension) + \
                      '_originaldimension_' + str(vec_data.shape[1]) + \
                      '_triplet_num_' + str(triplet_multiplier) + \
                      '_n_pts_' + str(n) + \
                      '_lr_' + str(learning_rate) + \
                      '_optimizer_' + str(optimizer) + \
                      '_bs_' + str(batch_size)

    # create a logging file for extensive logging
    logging_path = os.path.join(log_dir, experiment_name + '.log')
    logger = logging_util.my_custom_logger(logger_name=logging_path,
                                           level=logging.INFO)

    logger.info('Name of Experiments: ' + experiment_name)
    logger.info('Logging Path:' + logging_path)
    logger.info('Dataset Name: ' + dataset_name)
    logger.info('Error Change Threshold: ' + str(error_change_threshold))
    logger.info('Epochs: ' + str(epochs))
    logger.info('Learning Rate: ' + str(learning_rate))
    logger.info('Number of Points: ' + str(n))
    logger.info('Input Dimension: ' + str(input_dim))
    logger.info('Output Dimension: ' + str(embedding_dimension))
    logger.info('Number of Test Triplets: ' + str(number_of_test_triplets))
    logger.info('Triplet Multiplier: ' + str(triplet_multiplier))
    logger.info('Batch Size: ' + str(batch_size))

    train_triplets_dataset = TripletBatchesDataset(vec_data, labels,
                                                   triplet_num, bs, device)

    logger.info('Computing TSTE...')

    (x, loss_history, triplet_error_history, time_taken,
     time_history) = tste.t_ste_adam(
         triplets=train_triplets_dataset.trips_data_indices,
         n=n,
         emb_dim=embedding_dimension,
         epochs=epochs,
         batch_size=bs,
         learning_rate=learning_rate,
         device=device,
         logger=logger,
         error_change_threshold=error_change_threshold)

    logger.info('Evaluating the computed embeddings...')
    # compute triplet error for train and test data
    train_error = train_triplets_dataset.triplet_error(x)
    test_triplets_dataset = TripletBatchesDataset(vec_data, labels,
                                                  number_of_test_triplets,
                                                  1000, device)
    test_error = test_triplets_dataset.triplet_error(x)
    procrustes_error = procrustes_disparity(vec_data, x)
    knn_error_ord_emb, knn_error_true_emb = knn_classification_error(
        x, vec_data, labels)

    # sample points for tsne visualization; keep ``x`` itself untouched so
    # the full embedding (matching the full ``labels``) is what gets saved.
    subsample = np.random.permutation(n)[0:500]
    x_subsample = x[subsample, :]
    sub_labels = labels[subsample]

    x_embedded = TSNE(n_components=2, perplexity=15,
                      learning_rate=10).fit_transform(x_subsample)
    fig, ax = plt.subplots(1, 1)

    ax.scatter(x_embedded[:, 0], x_embedded[:, 1], s=3, c=sub_labels)
    fig.savefig(os.path.join(log_dir, experiment_name + '.png'))
    # Release the figure; pyplot keeps figures alive until they are closed.
    plt.close(fig)

    logger.info('Name of Experiments: ' + experiment_name)
    logger.info('Epochs: ' + str(epochs))
    logger.info('Time Taken: ' + str(time_taken) + ' seconds.')
    logger.info('Train Error: ' + str(train_error))
    logger.info('Test Error: ' + str(test_error))
    logger.info('Procrustes Disparity: ' + str(procrustes_error))
    logger.info('kNN Classification Error on ground-truth: ' +
                str(knn_error_true_emb))
    logger.info('kNN Classification Error on embedding: ' +
                str(knn_error_ord_emb))

    results = {
        'train_error': train_error,
        'test_error': test_error,
        'procrustes': procrustes_error,
        'knn_true': knn_error_true_emb,
        'knn_ord_emb': knn_error_ord_emb,
        'labels': labels,
        'loss_history': loss_history,
        'error_history': triplet_error_history,
        'ordinal_embedding': x,
        'time_taken': time_taken
    }
    joblib.dump(results, os.path.join(log_dir, experiment_name + '.pkl'))
def main(args):
    """Run one two-phase landmark embedding experiment from a config file.

    Loads the config at ``args.config_path``. If
    ``config['hyper_search']['activation']`` is truthy the work is delegated
    to ``run_hyper_search``. Otherwise phase 1 (``first_phase_soe``) is run,
    the remaining points are embedded in phase 2 (``second_phase``), the
    combined embedding is evaluated, a t-SNE scatter plot of up to 500 points
    is saved as ``<experiment>.png`` and the results are dumped to
    ``<experiment>.pkl`` inside the configured log directory.

    Args:
        args: Object exposing a ``config_path`` attribute.
    """
    config = load_config(args.config_path)
    dataset_name = config['dataset_selected']
    batch_size = config['batch_size']
    phase1_learning_rate = config['optimizer_params']['phase1_learning_rate']
    phase2_learning_rate = config['optimizer_params']['phase2_learning_rate']
    num_landmarks = config['optimizer_params']['num_landmarks']
    subset_size = config['optimizer_params']['subset_size']
    target_loss = config['optimizer_params']['target_loss']
    epochs = config['nb_epochs']
    input_dim = config['input_dimension']
    embedding_dimension = config['output_dimension']
    n_samples = config['number_of_points']
    number_of_test_triplets = config['n_test_triplets']
    log_dir = config['log']['path']
    hyper_search = config['hyper_search']['activation']

    if hyper_search:
        run_hyper_search(config=config)
        return

    vec_data, labels = select_dataset(dataset_name=dataset_name,
                                      input_dim=input_dim,
                                      n_samples=n_samples)

    n_points = vec_data.shape[0]  # do not remove
    # num_landmarks is multiplied by n_points, and the result is capped at
    # 100 landmarks.
    number_of_landmarks = min(int(num_landmarks * n_points), 100)
    subset_size = subset_size * number_of_landmarks

    # NOTE(review): '_bs_ ' contains a trailing space, so generated file
    # names contain a space - looks unintentional; kept as-is so existing
    # result files keep their names.
    experiment_name = 'lsoe_' + 'data_' + dataset_name \
                      + '_input_dim_' + str(input_dim) \
                      + '_emb_dimension_' + str(embedding_dimension) \
                      + '_originaldimension_' + str(vec_data.shape[1]) \
                      + '_n_' + str(n_samples) \
                      + '_landmarks_' + str(number_of_landmarks) \
                      + '_bs_ ' + str(batch_size) \
                      + '_pplr_' + str(phase2_learning_rate) \
                      + '_soe_lr_' + str(phase1_learning_rate) \
                      + '_epochs_' + str(epochs)

    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(log_dir, exist_ok=True)

    logging_path = os.path.join(log_dir, experiment_name + '.log')
    logger = logging_util.my_custom_logger(logger_name=logging_path,
                                           level=logging.INFO)

    logger.info('Name of Experiments: ' + experiment_name)
    logger.info('Dataset Name:' + dataset_name)
    logger.info('Number of Points: ' + str(n_samples))
    logger.info('Dataset Dimension:' + str(input_dim))
    logger.info('Number of Landmarks:' + str(number_of_landmarks))
    logger.info('Number of Subset Size:' + str(subset_size))
    logger.info('First Phase Epochs: ' + str(epochs))

    # set the gpu id
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    logger.info('Computing SOE - Phase 1...')

    # first phase of the algorithm
    (landmarks, first_phase_indices, first_phase_subset_size,
     first_phase_reconstruction, first_phase_loss,
     first_phase_triplet_error, time_first_phase) = first_phase_soe(
         num_landmarks=number_of_landmarks,
         subset_size=subset_size,
         data=vec_data,
         dataset_size=n_points,
         embedding_dim=embedding_dimension,
         epochs=epochs,
         first_phase_lr=phase1_learning_rate,
         device=device,
         target_loss=target_loss,
         batch_size=batch_size,
         logger=logger)

    logger.info('First Phase Loss: ' + str(first_phase_loss))
    logger.info('First Phase Triplet Error: ' +
                str(first_phase_triplet_error))
    logger.info('First Phase Number of Landmarks: ' + str(landmarks.shape))
    logger.info('First Phase Indices Number: ' +
                str(len(first_phase_indices)))
    logger.info('First Phase Reconstruction Size: ' +
                str(first_phase_reconstruction.shape))

    embedded_indices = first_phase_indices
    embedded_points = first_phase_reconstruction
    # Every index not embedded in phase 1 is handled in phase 2.
    non_embedded_indices = list(
        set(range(vec_data.shape[0])).difference(set(embedded_indices)))
    my_oracle = Oracle(data=vec_data)
    logger.info('Second Phase: ')
    logger.info('Oracle Created...')

    logger.info('Computing LLOE - Phase 2...')
    # second phase for embedding point by point update
    (second_phase_embeddings_index, second_phase_embeddings,
     time_second_phase) = second_phase(
         my_oracle=my_oracle,
         non_embedded_indices=non_embedded_indices,
         embedded_indices=embedded_indices,
         first_phase_embedded_points=embedded_points,
         dim=embedding_dimension,
         lr=phase2_learning_rate,
         logger=logger)

    # combine the first phase and second phase points and index
    final_embedding = np.zeros((vec_data.shape[0], embedding_dimension))
    # phase 1 points
    final_embedding[embedded_indices] = embedded_points
    # second phase points
    final_embedding[second_phase_embeddings_index] = second_phase_embeddings
    time_taken = time_first_phase + time_second_phase

    logger.info('Size of Dataset: ' + str(vec_data.shape[0]))
    logger.info('Size of First Phase Indices: ' + str(len(embedded_indices)))
    logger.info('Size of Second Phase Indices: ' +
                str(len(second_phase_embeddings_index)))

    # Evaluation
    logger.info('Evaluation of the Complete Embedding Dataset: ')
    random_trip_indices = gen_triplet_indices(
        n=vec_data.shape[0], num_trips=number_of_test_triplets)
    test_triplet_data = gen_triplet_data(
        data=vec_data,
        random_triplet_indices=random_trip_indices,
        batch_size=1000)

    # The second return value of triplet_error is not used here.
    test_error, _ = triplet_error(final_embedding, test_triplet_data)
    procrustes_error = procrustes_disparity(vec_data, final_embedding)
    knn_error_ord_emb, knn_error_true_emb = knn_classification_error(
        final_embedding, vec_data, labels)

    # sample points for tsne visualization
    subsample = np.random.permutation(n_points)[0:500]
    x = final_embedding[subsample, :]
    sub_labels = labels[subsample]

    x_embedded = TSNE(n_components=2, perplexity=15,
                      learning_rate=10).fit_transform(x)
    fig, ax = plt.subplots(1, 1)

    ax.scatter(x_embedded[:, 0], x_embedded[:, 1], s=3, c=sub_labels)
    fig.savefig(os.path.join(log_dir, experiment_name + '.png'))
    # Release the figure; pyplot keeps figures alive until they are closed.
    plt.close(fig)

    logger.info('Name of Experiments: ' + experiment_name)
    logger.info('Epochs: ' + str(epochs))
    logger.info('Time Taken: ' + str(time_taken) + ' seconds.')
    logger.info('Test Error: ' + str(test_error))
    logger.info('Procrustes Disparity: ' + str(procrustes_error))
    logger.info('kNN Classification Error on ground-truth: ' +
                str(knn_error_true_emb))
    logger.info('kNN Classification Error on embedding: ' +
                str(knn_error_ord_emb))

    results = {
        'test_error': test_error,
        'procrustes': procrustes_error,
        'knn_true': knn_error_true_emb,
        'knn_ord_emb': knn_error_ord_emb,
        'labels': labels,
        'ordinal_embedding': final_embedding,
        'time_taken': time_taken
    }
    joblib.dump(results, os.path.join(log_dir, experiment_name + '.pkl'))