Example #1
0
def run_hyper_search(config):
    """
    Grid-search tSTE over output dimension, triplet multiplier and learning rate.

    Important Hyperparameters for tSTE:
    Learning Rate: [1, 0.1, 0.01]

    For every (output dimension, triplet multiplier) pair one set of training
    triplets is generated; tSTE is then run once per learning rate on that
    set. Train and test triplet errors are logged, the per-learning-rate
    results of each pair are dumped to ``<log path>/tste_convergence_*.pkl``
    and a summary of all results and of the best learning rates is logged.

    Keys read from ``config``: ``dataset_selected``, ``batch_size``,
    ``nb_epochs``, ``input_dimension``, ``number_of_points``,
    ``n_test_triplets``, ``optimizer``, ``log.path`` and, under
    ``hyper_search``, ``triplets_multiplier``, ``learning_rate`` and
    ``output_dimension`` (each an iterable of values to try).

    :param config: nested dict holding the keys listed above.
    :return: None. Results are written to the log file and to pickle files.
    """
    dataset_name = config['dataset_selected']
    batch_size = config['batch_size']
    epochs = config['nb_epochs']
    input_dim = config['input_dimension']
    n_samples = config['number_of_points']
    number_of_test_triplets = config['n_test_triplets']
    log_dir = config['log']['path']
    triplet_multiplier_range = config['hyper_search']['triplets_multiplier']
    learning_rate_range = config['hyper_search']['learning_rate']
    optimizer = config['optimizer']
    dimensions_range = config['hyper_search']['output_dimension']

    separator = '_'
    experiment_name = 'tste_hyper_search_' + \
                      'data_' + dataset_name + \
                      '_input_dim_' + str(input_dim) + \
                      '_n_pts_' + str(n_samples) + \
                      '_num_test_trips_' + str(number_of_test_triplets) + \
                      '_output_dim_' + separator.join([str(i) for i in dimensions_range]) + \
                      '_lr_' + separator.join([str(i) for i in learning_rate_range]) + \
                      '_optimizer_' + str(optimizer) + \
                      '_bs_' + str(batch_size) + \
                      '_triplet_number_' + separator.join([str(i) for i in triplet_multiplier_range])

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(log_dir, exist_ok=True)

    logging_path = os.path.join(log_dir, experiment_name + '.log')
    logger = logging_util.my_custom_logger(logger_name=logging_path,
                                           level=logging.INFO)
    logger.info('Name of Experiment: ' + experiment_name)
    logger.info('Logging Path:' + logging_path)
    logger.info('Dataset Name: ' + dataset_name)
    logger.info('Epochs: ' + str(epochs))

    best_params_train = {}
    best_params_test = {}
    all_results = {}

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    vec_data, labels = select_dataset(
        dataset_name, n_samples=n_samples, input_dim=input_dim
    )  # input_dim is only argument for uniform. Ignored otherwise

    n = vec_data.shape[0]
    logn = int(np.log2(n))
    for (emb_dim, triplet_multiplier) in product(dimensions_range,
                                                 triplet_multiplier_range):
        setting = (emb_dim, triplet_multiplier)
        all_results[setting] = {}
        # Start from +inf rather than 1: with a strict '<' comparison a run
        # whose error is exactly 1.0 would otherwise never be recorded and
        # the summary at the end would fail with a KeyError.
        best_train_error = float('inf')
        best_test_error = float('inf')

        triplet_num = triplet_multiplier * logn * n * emb_dim

        # Never ask for a batch larger than the number of triplets.
        bs = min(batch_size, triplet_num)

        # The training triplets are shared by all learning rates of a setting.
        train_triplets_dataset = TripletBatchesDataset(vec_data, labels,
                                                       triplet_num, bs, device)
        logger.info('Testing on: ' + dataset_name +
                    '. Embedding dimension is ' + str(emb_dim))
        logger.info(' ')
        for learning_rate in learning_rate_range:

            logger.info(10 * '-' + ' New parameters' + 10 * '-')
            logger.info('Learning Rate: ' + str(learning_rate))
            logger.info('Number of Points: ' + str(n))
            logger.info('Input Dimension: ' + str(input_dim))
            logger.info('Output Dimension: ' + str(emb_dim))
            logger.info('Number of Test Triplets: ' +
                        str(number_of_test_triplets))
            logger.info('Triplet Multiplier: ' + str(triplet_multiplier))
            logger.info('Batch Size: ' + str(batch_size))

            logger.info('Computing tSTE...')

            x, loss_history, triplet_error_history, time_taken, _ = tste.t_ste_adam(
                triplets=train_triplets_dataset.trips_data_indices,
                n=n,
                emb_dim=emb_dim,
                epochs=epochs,
                batch_size=bs,
                learning_rate=learning_rate,
                device=device,
                logger=logger)

            # compute triplet error for train and test data
            train_error = train_triplets_dataset.triplet_error(x)
            logger.info('Triplet Error on Training Triplets: ' +
                        str(train_error))
            # NOTE(review): a new test triplet set is built for every learning
            # rate; if the generator is random, test errors of different
            # learning rates are measured on different triplets - confirm
            # that this is intended.
            test_triplets_dataset = TripletBatchesDataset(
                vec_data, labels, number_of_test_triplets, 1000, device)
            test_error = test_triplets_dataset.triplet_error(x)

            logger.info('Epochs: ' + str(epochs))
            logger.info('Time Taken: ' + str(time_taken) + ' seconds.')
            logger.info('Train Error: ' + str(train_error))
            logger.info('Test Error: ' + str(test_error))

            all_results[setting][learning_rate] = {
                'train_error': train_error,
                'test_error': test_error,
                'loss_history': loss_history,
                'error_history': triplet_error_history,
                'last_embedding': x
            }

            if test_error < best_test_error:
                best_params_test[setting] = {
                    'learning_rate': learning_rate,
                    'optimizer': optimizer,
                    'error': test_error
                }
                best_test_error = test_error
            if train_error < best_train_error:
                best_params_train[setting] = {
                    'learning_rate': learning_rate,
                    'optimizer': optimizer,
                    'error': train_error
                }
                best_train_error = train_error
        result_name = 'tste_convergence_' + \
                      'data_' + dataset_name + \
                      '_input_dim_' + str(input_dim) + \
                      '_n_pts_' + str(n_samples) + \
                      '_output_dim_' + str(emb_dim) + \
                      '_bs_' + str(batch_size) + \
                      '_triplet_number_' + str(triplet_multiplier)
        # NOTE: the labels are stored on the top-level dict only; the pickle
        # written below contains just the per-learning-rate results.
        all_results['labels'] = labels
        joblib.dump(all_results[setting],
                    os.path.join(log_dir, result_name + '.pkl'))

    # print all results as well again
    logger.info(10 * '-' + 'ALL RESULTS ' + 10 * '-')
    for (emb_dim, triplet_multiplier) in product(dimensions_range,
                                                 triplet_multiplier_range):
        results = all_results[(emb_dim, triplet_multiplier)]
        logger.info('Results for emb dimension ' + str(emb_dim) +
                    ' and triplet multiplier ' + str(triplet_multiplier))
        for learning_rate in learning_rate_range:
            logger.info('learning rate ' + str(learning_rate) +
                        ' -- train error: ' +
                        str(results[learning_rate]['train_error']) +
                        ' test error: ' +
                        str(results[learning_rate]['test_error']))

    # print best parameter settings
    for (emb_dim, triplet_multiplier) in product(dimensions_range,
                                                 triplet_multiplier_range):
        logger.info('Best Parameters for emb dimension ' + str(emb_dim) +
                    ' and triplet multiplier ' + str(triplet_multiplier))
        best_on_train = best_params_train.get((emb_dim, triplet_multiplier))
        best_on_test = best_params_test.get((emb_dim, triplet_multiplier))
        if best_on_train is None or best_on_test is None:
            # Only reachable when no learning rate was tried or every run
            # produced a non-comparable (e.g. NaN) error.
            logger.info('no comparable train/test error was recorded '
                        'for this setting')
            continue
        logger.info('achieved ' + str(best_on_train['error']) +
                    ' train error with learning rate: ' +
                    str(best_on_train['learning_rate']))
        logger.info('achieved ' + str(best_on_test['error']) +
                    ' test error with learning rate: ' +
                    str(best_on_test['learning_rate']))
Example #2
0
def main(args):
    """
    Run either the tSTE hyper-parameter search or a single tSTE experiment.

    The configuration is loaded from ``args.config_path``. When
    ``config['hyper_search']['activation']`` is truthy the work is delegated
    to ``run_hyper_search``. Otherwise one embedding is computed with
    ``tste.t_ste_adam`` and evaluated (train/test triplet error, Procrustes
    disparity, kNN classification error). A 2-D t-SNE scatter plot of up to
    500 randomly chosen embedded points is saved as ``<experiment>.png`` and
    all results are dumped to ``<experiment>.pkl`` in the log directory.

    :param args: object with a ``config_path`` attribute.
    :return: None.
    """
    config = load_config(args.config_path)

    # The hyper-search reads its own settings from ``config``, so the
    # single-run keys below are only required when it is disabled.
    if config['hyper_search']['activation']:
        run_hyper_search(config=config)
        return

    dataset_name = config['dataset_selected']
    error_change_threshold = config['error_change_threshold']
    batch_size = config['batch_size']
    learning_rate = config['optimizer_params']['learning_rate']
    epochs = config['nb_epochs']
    input_dim = config['input_dimension']
    embedding_dimension = config['output_dimension']
    n_samples = config['number_of_points']
    number_of_test_triplets = config['n_test_triplets']
    triplet_multiplier = config['triplets_multiplier']
    log_dir = config['log']['path']
    optimizer = config['optimizer']

    vec_data, labels = select_dataset(dataset_name,
                                      n_samples=n_samples,
                                      input_dim=input_dim)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    n = vec_data.shape[0]
    logn = int(np.log2(n))
    triplet_num = triplet_multiplier * logn * n * embedding_dimension

    # Never ask for a batch larger than the number of triplets.
    bs = min(batch_size, triplet_num)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(log_dir, exist_ok=True)

    experiment_name = 'tste_' + \
                      'data_' + dataset_name + \
                      '_error_change_threshold_' + str(error_change_threshold) + \
                      '_input_dim_' + str(input_dim) + \
                      '_output_dim_' + str(embedding_dimension) + \
                      '_originaldimension_' + str(vec_data.shape[1]) + \
                      '_triplet_num_' + str(triplet_multiplier) + \
                      '_n_pts_' + str(n) + \
                      '_lr_' + str(learning_rate) + \
                      '_optimizer_' + str(optimizer) + \
                      '_bs_' + str(batch_size)

    # create a logging file for extensive logging
    logging_path = os.path.join(log_dir, experiment_name + '.log')
    logger = logging_util.my_custom_logger(logger_name=logging_path,
                                           level=logging.INFO)

    logger.info('Name of Experiments: ' + experiment_name)
    logger.info('Logging Path:' + logging_path)
    logger.info('Dataset Name: ' + dataset_name)
    logger.info('Error Change Threshold: ' + str(error_change_threshold))
    logger.info('Epochs: ' + str(epochs))
    logger.info('Learning Rate: ' + str(learning_rate))
    logger.info('Number of Points: ' + str(n))
    logger.info('Input Dimension: ' + str(input_dim))
    logger.info('Output Dimension: ' + str(embedding_dimension))
    logger.info('Number of Test Triplets: ' + str(number_of_test_triplets))
    logger.info('Triplet Multiplier: ' + str(triplet_multiplier))
    logger.info('Batch Size: ' + str(batch_size))

    train_triplets_dataset = TripletBatchesDataset(vec_data, labels,
                                                   triplet_num, bs, device)

    logger.info('Computing TSTE...')

    x, loss_history, triplet_error_history, time_taken, _ = tste.t_ste_adam(
        triplets=train_triplets_dataset.trips_data_indices,
        n=n,
        emb_dim=embedding_dimension,
        epochs=epochs,
        batch_size=bs,
        learning_rate=learning_rate,
        device=device,
        logger=logger,
        error_change_threshold=error_change_threshold)

    logger.info('Evaluating the computed embeddings...')
    # compute triplet error for train and test data
    train_error = train_triplets_dataset.triplet_error(x)
    test_triplets_dataset = TripletBatchesDataset(vec_data, labels,
                                                  number_of_test_triplets,
                                                  1000, device)
    test_error = test_triplets_dataset.triplet_error(x)
    procrustes_error = procrustes_disparity(vec_data, x)
    knn_error_ord_emb, knn_error_true_emb = knn_classification_error(
        x, vec_data, labels)

    # Sample points for the t-SNE visualization. The subsample gets its own
    # name so that the full embedding ``x`` (row-aligned with ``labels``) is
    # what ends up in the results file.
    subsample = np.random.permutation(n)[0:500]
    x_sub = x[subsample, :]
    sub_labels = labels[subsample]

    x_embedded = TSNE(n_components=2, perplexity=15,
                      learning_rate=10).fit_transform(x_sub)
    fig, ax = plt.subplots(1, 1)

    ax.scatter(x_embedded[:, 0], x_embedded[:, 1], s=3, c=sub_labels)
    fig.savefig(os.path.join(log_dir, experiment_name + '.png'))
    plt.close(fig)  # release the figure once it is on disk

    logger.info('Name of Experiments: ' + experiment_name)
    logger.info('Epochs: ' + str(epochs))
    logger.info('Time Taken: ' + str(time_taken) + ' seconds.')
    logger.info('Train Error: ' + str(train_error))
    logger.info('Test Error: ' + str(test_error))
    logger.info('Procrustes Disparity: ' + str(procrustes_error))
    logger.info('kNN Classification Error on ground-truth: ' +
                str(knn_error_true_emb))
    logger.info('kNN Classification Error on embedding: ' +
                str(knn_error_ord_emb))

    results = {
        'train_error': train_error,
        'test_error': test_error,
        'procrustes': procrustes_error,
        'knn_true': knn_error_true_emb,
        'knn_ord_emb': knn_error_ord_emb,
        'labels': labels,
        'loss_history': loss_history,
        'error_history': triplet_error_history,
        'ordinal_embedding': x,
        'time_taken': time_taken
    }
    joblib.dump(results, os.path.join(log_dir, experiment_name + '.pkl'))
def run_method(config, dataset_name, algorithm, n_points, input_dim,
               embedding_dimension, learning_rate, batch_size,
               triplet_multiplier, optimizer, epochs, n_test_triplets, logger,
               error_change_threshold):
    """
    Compute an ordinal embedding with the selected algorithm and evaluate it.

    The dataset is loaded with ``select_dataset``; the number of training
    triplets is ``ceil(triplet_multiplier * n * log2(n) * embedding_dimension)``
    where ``n`` is the number of loaded points, and the batch size is capped
    at that number. After training, the embedding is evaluated on freshly
    generated test triplets, by Procrustes disparity and by kNN
    classification error; all metrics are logged.

    :param config: configuration dict; only read by some algorithms
        (``regularizer`` for 'gnmds'/'ckl', ``mu`` for 'ckl'/'ckl_x',
        ``optimizer_params`` for 'lloe').
    :param dataset_name: name passed to ``select_dataset``.
    :param algorithm: one of 'soe', 'ste', 'tste', 'triplet_loss', 'gnmds',
        'forte', 'ckl', 'ckl_x', 'loe', 'oenn', 'lloe'.
    :param n_points: requested number of samples (replaced by the number of
        points actually loaded).
    :param input_dim: input dimension passed to ``select_dataset``.
    :param embedding_dimension: dimension of the computed embedding.
    :param learning_rate: learning rate handed to the optimisation routine.
    :param batch_size: upper bound for the batch size.
    :param triplet_multiplier: factor in the triplet-count formula above.
    :param optimizer: only consulted for 'soe' ('adam' or 'sgd').
    :param epochs: number of epochs/iterations for the optimisation routine.
    :param n_test_triplets: number of triplets used for the test error.
    :param logger: logger used for progress and result messages.
    :param error_change_threshold: forwarded to the routines that accept it.
    :return: tuple ``(x, train_triplets, labels, train_error, test_error,
        procrustes_error, knn_error_true_emb, knn_error_ord_emb, time_taken,
        loss_history, triplet_error_history, time_history)``.
        ``train_triplets`` stays ``[]`` for 'loe', 'oenn' and 'lloe'; the
        history lists stay empty for 'loe' and 'lloe'; ``train_error`` stays
        ``-1`` for 'lloe'.
    :raises ValueError: if the algorithm/optimizer combination is unsupported.
    """
    vec_data, labels = select_dataset(dataset_name,
                                      n_samples=n_points,
                                      input_dim=input_dim)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    n_points = vec_data.shape[0]

    # Builtin int(): the ``np.int`` alias was removed in NumPy 1.24.
    triplet_num = int(
        np.ceil(triplet_multiplier * n_points * math.log2(n_points) *
                embedding_dimension))
    train_triplets = []
    loss_history, triplet_error_history, time_history = [], [], []

    # Never ask for a batch larger than the number of triplets.
    batch_size = min(batch_size, triplet_num)

    logger.info('Computing Embedding...')
    logger.info('Number of Points: ' + str(n_points))
    logger.info('Number of Triplets: ' + str(triplet_num))
    logger.info('Input Dimension: ' + str(input_dim))
    logger.info('Output Dimension: ' + str(embedding_dimension))
    time_taken = 0
    train_error = -1  # active methods wont have a train error
    if optimizer == 'adam' and algorithm == 'soe':
        train_triplets = _generate_train_triplets(
            vec_data, labels, triplet_num, batch_size, device, logger)

        x, loss_history, triplet_error_history, time_taken, time_history = soe.soe_adam(
            triplets=train_triplets,
            n=n_points,
            dim=embedding_dimension,
            epochs=epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            device=device,
            logger=logger,
            error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, train_triplets)

    elif optimizer == 'sgd' and algorithm == 'soe':
        train_triplets = _generate_train_triplets(
            vec_data, labels, triplet_num, batch_size, device, logger)

        x, loss_history, triplet_error_history, time_taken, time_history = soe.soe_sgd(
            triplets=train_triplets,
            n=n_points,
            dim=embedding_dimension,
            iterations=epochs,
            bs=batch_size,
            lr=learning_rate,
            device=device,
            logger=logger)
        train_error = triplet_error_batches(x, train_triplets)

    elif algorithm == 'ste':
        train_triplets = _generate_train_triplets(
            vec_data, labels, triplet_num, batch_size, device, logger)

        x, loss_history, triplet_error_history, time_taken, time_history = ste.ste_adam(
            triplets=train_triplets,
            n=n_points,
            dim=embedding_dimension,
            epochs=epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            device=device,
            logger=logger,
            error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, train_triplets)

    elif algorithm == 'tste':
        train_triplets = _generate_train_triplets(
            vec_data, labels, triplet_num, batch_size, device, logger)

        x, loss_history, triplet_error_history, time_taken, time_history = tste.t_ste_adam(
            triplets=train_triplets,
            n=n_points,
            emb_dim=embedding_dimension,
            epochs=epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            device=device,
            logger=logger,
            error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, train_triplets)

    elif algorithm == 'triplet_loss':
        train_triplets = _generate_train_triplets(
            vec_data, labels, triplet_num, batch_size, device, logger)

        x, loss_history, triplet_error_history, time_taken, time_history = soe.triplet_loss_adam(
            triplets=train_triplets,
            n=n_points,
            dim=embedding_dimension,
            iterations=epochs,
            batch_size=batch_size,
            lr=learning_rate,
            device=device,
            logger=logger,
            error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, train_triplets)

    elif algorithm == 'gnmds':
        regularizer = config['regularizer']
        train_triplets = _generate_train_triplets(
            vec_data, labels, triplet_num, batch_size, device, logger)

        x, loss_history, triplet_error_history, time_taken, time_history = gnmds.gnmds(
            triplets=train_triplets,
            reg_lbda=regularizer,
            n=n_points,
            dim=embedding_dimension,
            epochs=epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            device=device,
            logger=logger,
            error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, train_triplets)

    elif algorithm == 'forte':
        train_triplets = _generate_train_triplets(
            vec_data, labels, triplet_num, batch_size, device, logger)

        x, loss_history, triplet_error_history, time_taken, time_history = forte.rank_d_pgd(
            triplets=train_triplets,
            n=n_points,
            dim=embedding_dimension,
            epochs=epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            device=device,
            logger=logger,
            error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, train_triplets)

    elif algorithm == 'ckl':
        regularizer = config['regularizer']
        mu = config['mu']
        train_triplets = _generate_train_triplets(
            vec_data, labels, triplet_num, batch_size, device, logger)

        x, loss_history, triplet_error_history, time_taken, time_history = ckl.ckl_k(
            triplets=train_triplets,
            reg_lbda=regularizer,
            mu=mu,
            n=n_points,
            dim=embedding_dimension,
            epochs=epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            device=device,
            logger=logger,
            error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, train_triplets)

    elif algorithm == 'ckl_x':
        mu = config['mu']
        train_triplets = _generate_train_triplets(
            vec_data, labels, triplet_num, batch_size, device, logger)

        x, loss_history, triplet_error_history, time_taken, time_history = ckl.ckl_x(
            triplets=train_triplets,
            mu=mu,
            n=n_points,
            dim=embedding_dimension,
            epochs=epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            device=device,
            logger=logger,
            error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, train_triplets)

    elif algorithm == 'loe':
        # Works on the raw data directly and reports its own train error.
        x, time_taken, train_error = landmark_oe.landmark_oe_with_data(
            data=vec_data,
            dim=embedding_dimension,
            trip_num=triplet_num,
            learning_rate=learning_rate,
            epochs=epochs,
            batch_size=batch_size,
            device=device,
            logger=logger)

    elif algorithm == 'oenn':
        number_of_neighbours = 50  # config['number_of_neighbours']
        metric = 'eu'  # config['metric']
        all_triplets, triplet_loaders = data_utils_oenn.prep_data_for_nn(
            vec_data=vec_data,
            labels=labels,
            triplet_num=triplet_num,
            batch_size=batch_size,
            metric=metric,
            number_of_neighbours=number_of_neighbours)

        # Hidden layer width: 120 + 2 * d * log2(n), truncated to an int.
        hl_size = int(120 + (2 * embedding_dimension * math.log2(n_points)))
        x, loss_history, triplet_error_history, time_taken, time_history = training_routine_v3.create_and_train_triplet_network(
            dataset_name=dataset_name,
            ind_loaders=triplet_loaders,
            n=n_points,
            dim=embedding_dimension,
            layers=3,
            learning_rate=learning_rate,
            epochs=epochs,
            hl_size=hl_size,
            batch_size=batch_size,
            number_of_triplets=triplet_num,
            logger=logger,
            error_change_threshold=error_change_threshold)
        train_error = triplet_error_batches(x, all_triplets)

    elif algorithm == 'lloe':
        lloe_params = config['optimizer_params']
        num_landmarks = lloe_params['num_landmarks']
        subset_size = lloe_params['subset_size']
        phase1_learning_rate = lloe_params['phase1_learning_rate']
        phase2_learning_rate = lloe_params['phase2_learning_rate']
        target_loss = lloe_params['target_loss']

        # num_landmarks is a fraction of the points; at most 100 landmarks.
        number_of_landmarks = min(int(num_landmarks * n_points), 100)
        subset_size = subset_size * number_of_landmarks
        (landmarks, first_phase_indices, first_phase_subset_size,
         first_phase_reconstruction, first_phase_loss,
         first_phase_triplet_error, time_first_phase) = first_phase_soe(
            num_landmarks=number_of_landmarks,
            subset_size=subset_size,
            data=vec_data, dataset_size=n_points,
            embedding_dim=embedding_dimension, epochs=epochs,
            first_phase_lr=phase1_learning_rate,
            device=device,
            target_loss=target_loss,
            batch_size=batch_size,
            logger=logger)
        embedded_indices = first_phase_indices
        embedded_points = first_phase_reconstruction
        non_embedded_indices = list(
            set(range(vec_data.shape[0])).difference(set(embedded_indices)))
        my_oracle = Oracle(data=vec_data)
        logger.info('Second Phase: ')
        logger.info('Oracle Created...')
        logger.info('Computing LLOE - Phase 2...')
        print(time_first_phase)
        # second phase for embedding point by point update
        second_phase_embeddings_index, second_phase_embeddings, \
            time_second_phase = second_phase(
                my_oracle=my_oracle,
                non_embedded_indices=non_embedded_indices,
                embedded_indices=embedded_indices,
                first_phase_embedded_points=embedded_points,
                dim=embedding_dimension,
                lr=phase2_learning_rate, logger=logger)
        # combine the first phase and second phase points and index
        x = np.zeros((vec_data.shape[0], embedding_dimension))
        # phase 1 points
        x[embedded_indices] = embedded_points
        # second phase points
        x[second_phase_embeddings_index] = second_phase_embeddings
        time_taken = time_first_phase + time_second_phase

    else:
        # Without this guard ``x`` would be unbound and the evaluation below
        # would fail with an unrelated-looking UnboundLocalError.
        raise ValueError(
            'Unsupported algorithm/optimizer combination: algorithm=' +
            repr(algorithm) + ', optimizer=' + repr(optimizer))

    logger.info('Time Taken for experiment ' + str(time_taken) + ' seconds.')
    logger.info('Evaluating the computed embeddings...')

    test_triplets_dataset = TripletBatchesDataset(vec_data, labels,
                                                  n_test_triplets, 1000,
                                                  device)
    test_error = test_triplets_dataset.triplet_error(x)
    procrustes_error = procrustes_disparity(vec_data, x)
    knn_error_ord_emb, knn_error_true_emb = knn_classification_error(
        x, vec_data, labels)

    # log the errors
    logger.info('Train Error: ' + str(train_error))
    logger.info('Test Error: ' + str(test_error))
    logger.info('Procrustes Disparity: ' + str(procrustes_error))
    logger.info('kNN Classification Error on ground-truth: ' +
                str(knn_error_true_emb))
    logger.info('kNN Classification Error on embedding: ' +
                str(knn_error_ord_emb))
    return x, train_triplets, labels, train_error, test_error, procrustes_error, knn_error_true_emb, knn_error_ord_emb, time_taken, loss_history, triplet_error_history, time_history


def _generate_train_triplets(vec_data, labels, triplet_num, batch_size,
                             device, logger):
    """Log the generation step and return the index triplets of a new
    ``TripletBatchesDataset`` built from the given data."""
    logger.info('Generating triplets...')
    dataset = TripletBatchesDataset(vec_data, labels, triplet_num, batch_size,
                                    device)
    return dataset.trips_data_indices