def get_common_info(parameters):
    res = {}
    res['dTSNE_mnist'] = generate_data.load_dtsne_mnist(parameters=parameters)
    res['Y_mnist'] = generate_data.load_y_mnist(parameters=parameters)
    res['X_mnist'] = generate_data.load_x_mnist(parameters=parameters)
    res['labels_mnist'] = generate_data.load_labels_mnist(
        parameters=parameters)
    res['picked_neighbors'] = generate_data.load_picked_neighbors(
        parameters=parameters)
    res['picked_neighbors_labels'] = generate_data.load_picked_neighbors_labels(
        parameters=parameters)
    res['accuracy_nn'] = parameters.get("accuracy_nn",
                                        settings.parameters["accuracy_nn"])
    D_Y = distance.squareform(distance.pdist(res['Y_mnist']))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    res['nearest_neighbors_y_dist'] = np.min(
        D_Y, axis=1)  # D_Y is symmetric, so either axis works
    return res
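A note on the recurring pattern above: pdist/squareform gives the full pairwise distance matrix, the diagonal is masked with np.inf so a point is not its own nearest neighbor, and a row-wise min yields each point's nearest-neighbor distance. A minimal standalone sketch of the idea (toy data, illustrative names only):

import numpy as np
from scipy.spatial import distance

# Toy 2-D embedding standing in for Y_mnist.
Y_toy = np.array([[0.0, 0.0], [1.0, 0.0], [5.0, 5.0]])

D = distance.squareform(distance.pdist(Y_toy))  # full pairwise distances
np.fill_diagonal(D, np.inf)                     # mask self-distances
nearest_dist = np.min(D, axis=1)                # symmetric, either axis works
print(nearest_dist)                             # [1.  1.  6.40312424]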

def main(regenerate_model1=False, regenerate_model2=False, regenerate_model3=False,
         parameters=settings.parameters):
    models_and_results = neural_network_commons.train_or_load_models(
        regenerate_model1=regenerate_model1, regenerate_model2=regenerate_model2,
        regenerate_model3=regenerate_model3, parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(parameters=parameters)

    model1, model2, model3 = models_and_results["models"]
    Y_nn1_mnist, Y_nn2_mnist, Y_nn3_mnist = models_and_results["Y_predicted"]

    Y_neighb1_mnist = model1.predict(picked_neighbors)
    Y_neighb2_mnist = model2.predict(picked_neighbors)
    Y_neighb3_mnist = model3.predict(picked_neighbors)

    nn_method_results = [Y_neighb1_mnist, Y_neighb2_mnist, Y_neighb3_mnist]
    nn_models_orig = [Y_nn1_mnist, Y_nn2_mnist, Y_nn3_mnist]
    nn_method_list = ['NN - 2L; 250N; ReLu; D0.25', 'NN - 2L; 500N; ReLu; D0.5',
                      'NN - 1L; 500N; tanh']

    output_file = generate_cluster_results_filename(parameters)

    with open(output_file, 'wb') as f:
        pickle.dump((nn_method_results, nn_models_orig, nn_method_list), f)

def main(parameters=settings.parameters, regenerate=False):
    picked_neighbors = generate_data.load_picked_neighbors(
        parameters=parameters)
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)

    result = dict()

    output_file = \
        cluster_lion_RBF_IDW_commons.generate_cluster_results_filename(output_prefix, parameters)

    if os.path.isfile(output_file) and not regenerate:
        with open(output_file, "rb") as f:
            result = pickle.load(f)
            logging.info("Previous result loaded")
    else:
        logging.info("No previous result or regeneration requested")

    for fname_prefix in original_files_prefixes:
        cluster_results_file = \
            cluster_lion_RBF_IDW_commons.generate_cluster_results_filename(fname_prefix, parameters)
        logging.info("Processing file: %s", cluster_results_file)
        with open(cluster_results_file, 'rb') as f:
            res = pickle.load(f)
        for method in res.keys():
            logging.info("Processing method: %s", method)
            if method not in result or regenerate:
                precision = calc_precision(res[method]["EmbeddedPoints"],
                                           X_mnist, Y_mnist,
                                           picked_neighbors, precision_nn)
                logging.info("%s precision: %f (accuracy was %f)", method,
                             precision, res[method]["Accuracy"])
                result[method] = precision

                with open(output_file, "wb") as fout:
                    pickle.dump(result, fout)
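calc_precision above is imported from elsewhere in this repo and is not shown here. A plausible sketch of such a k-NN precision metric, consistent with the inline versions in the later examples (the name and signature below are assumptions, not the repo's actual code):

import numpy as np

def calc_precision_sketch(embedded_points, X, Y, picked, nn):
    """Average fraction of each item's nn nearest neighbors in X-space
    that are also among its nn nearest neighbors in Y-space."""
    per_sample = np.zeros(len(picked))
    for i in range(len(picked)):
        nn_x = np.argsort(np.sum((X - picked[i]) ** 2, axis=1))[:nn]
        nn_y = np.argsort(np.sum((Y - embedded_points[i]) ** 2, axis=1))[:nn]
        per_sample[i] = len(set(nn_x) & set(nn_y)) / nn
    return np.mean(per_sample)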
Example #4
def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(
        parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    accuracy_nn = parameters.get("accuracy_nn",
                                 settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(
        parameters=parameters)
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)

    # =================== Helpers
    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y)**2, axis=1)
        return np.argsort(y_distances)[:n]

    gd_method_list = [
        r'Closest $Y_{init}$', r'Random $Y_{init}$',
        r'Closest $Y_{init}$; new $\sigma$',
        r'Random $Y_{init}$; new $\sigma$', r'Closest $Y_{init}$; EE',
        r'Random $Y_{init}$; EE', r'Closest $Y_{init}$; new $\sigma$; EE',
        r'Random $Y_{init}$; new $\sigma$; EE'
    ]

    gd_results_file = exp_cluster_attr_test_GD.generate_cluster_results_filename(
        parameters=parameters)
    with open(gd_results_file, 'rb') as f:
        (picked_neighbors_y_gd_transformed,
         picked_neighbors_y_gd_variance_recalc_transformed,
         picked_neighbors_y_gd_transformed_random,
         picked_neighbors_y_gd_variance_recalc_transformed_random,
         picked_neighbors_y_gd_early_exagg_transformed_random,
         picked_neighbors_y_gd_early_exagg_transformed,
         picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
         picked_random_starting_positions,
         picked_neighbors_y_gd_variance_recalc_early_exagg_transformed,
         covered_samples) = pickle.load(f)

    gd_method_results = [
        picked_neighbors_y_gd_transformed,
        picked_neighbors_y_gd_transformed_random,
        picked_neighbors_y_gd_variance_recalc_transformed,
        picked_neighbors_y_gd_variance_recalc_transformed_random,
        picked_neighbors_y_gd_early_exagg_transformed,
        picked_neighbors_y_gd_early_exagg_transformed_random,
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed,
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
    ]

    input_time_file = exp_cluster_attr_test_GD.generate_time_results_filename(
        parameters)
    with open(input_time_file, 'rb') as f:
        picked_neighbors_y_time_gd_transformed, picked_neighbors_y_time_gd_variance_recalc_transformed, \
        picked_neighbors_y_time_gd_transformed_random, \
        picked_neighbors_y_time_gd_variance_recalc_transformed_random, \
        picked_neighbors_y_time_gd_early_exagg_transformed_random, \
        picked_neighbors_y_time_gd_early_exagg_transformed, \
        picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random, \
        picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed, covered_samples = pickle.load(f)

    gd_time = [
        np.mean(picked_neighbors_y_time_gd_transformed),
        np.mean(picked_neighbors_y_time_gd_transformed_random),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_transformed),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_transformed_random),
        np.mean(picked_neighbors_y_time_gd_early_exagg_transformed),
        np.mean(picked_neighbors_y_time_gd_early_exagg_transformed_random),
        np.mean(
            picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed
        ),
        np.mean(
            picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random
        ),
    ]

    gd_accuracy = np.zeros((len(gd_method_list), ))
    gd_precision = np.zeros((len(gd_method_list), ))

    # ============================== Distance percentiles
    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # symmetric, either axis works
    gd_nearest_neighbors_percentiles_matrix = np.zeros(
        (len(picked_neighbors), len(gd_method_list)))
    for i in range(len(picked_neighbors)):
        for j in range(len(gd_method_list)):
            y = gd_method_results[j][i, :]
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y)**2, axis=1)))
            gd_nearest_neighbors_percentiles_matrix[i, j] = \
                stats.percentileofscore(nearest_neighbors_y_dist, nn_dist)
    gd_distance_percentiles = np.mean(gd_nearest_neighbors_percentiles_matrix,
                                      axis=0)
    for j in range(len(gd_method_list)):
        logging.info("%s :\t%f", gd_method_list[j], gd_distance_percentiles[j])

    # ============================== Accuracy and precision
    for j in range(len(gd_method_results)):
        per_sample_accuracy = np.zeros((len(picked_neighbors), ))
        per_sample_precision = np.zeros((len(picked_neighbors), ))
        for i in range(len(picked_neighbors)):
            expected_label = picked_neighbor_labels[i]
            nn_indices = get_nearest_neighbors_in_y(gd_method_results[j][i, :],
                                                    Y_mnist,
                                                    n=accuracy_nn)
            obtained_labels = labels_mnist[nn_indices]
            per_sample_accuracy[i] = sum(
                obtained_labels == expected_label) / len(obtained_labels)

            x = picked_neighbors[i, :]
            y = gd_method_results[j][i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x,
                                                      X_mnist,
                                                      n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y,
                                                      Y_mnist,
                                                      n=precision_nn)
            matching_indices = len(
                [k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = (matching_indices / precision_nn)

        gd_accuracy[j] = np.mean(per_sample_accuracy)
        gd_precision[j] = np.mean(per_sample_precision)
        logging.info("%s :\t%f\t%f", gd_method_list[j], gd_precision[j],
                     gd_accuracy[j])

    gd_kl = np.zeros((len(gd_method_list), len(picked_neighbors)))

    processed_indices = list()

    kl_gd_performance_file = generate_gd_kl_temp_filename(parameters)
    if os.path.isfile(kl_gd_performance_file):
        with open(kl_gd_performance_file, 'rb') as f:
            gd_kl, processed_indices = pickle.load(f)

    # KL divergence increase for all 1000 samples is very slow to calculate;
    # most of the time goes into building the P-matrix.
    for i in range(len(picked_neighbors)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.x_neighbors_selection_parameter_set,
            parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate(
                (X_mnist, picked_neighbors[i, :].reshape((1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods.
        for j in range(len(gd_method_results)):
            new_Y = np.concatenate(
                (Y_mnist, gd_method_results[j][i, :].reshape((1, -1))), axis=0)
            gd_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(
                p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_gd_performance_file, 'wb') as f:
            pickle.dump((gd_kl, processed_indices), f)
    # This should be fast
    gd_avg_kl = np.mean(gd_kl, axis=1)

    output_file = generate_gd_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((gd_method_list, gd_accuracy, gd_precision, gd_time,
                     gd_avg_kl, gd_distance_percentiles), f)

def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    accuracy_nn = parameters.get("accuracy_nn", settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn", settings.parameters["precision_nn"])
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(parameters=parameters)
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)

    # =================== Helpers
    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y) ** 2, axis=1)
        return np.argsort(y_distances)[:n]

    kernelized_results_file = exp_cluster_attr_test_kernelized.generate_cluster_results_filename(parameters)
    with open(kernelized_results_file, 'rb') as f:
        kernelized_detailed_tsne_method_results, kernelized_detailed_tsne_accuracy, \
        kernelized_detailed_tsne_precision, kernelized_detailed_tsne_time, kernelized_detailed_tsne_method_list = pickle.load(f)
    ind = [4, 24, 49]
    kernelized_method_list = [
        kernelized_detailed_tsne_method_list[i][:10] + kernelized_detailed_tsne_method_list[i][-8:]
        for i in ind]
    kernelized_method_results = [kernelized_detailed_tsne_method_results[i] for i in ind]

    kernelized_accuracy = np.zeros((len(kernelized_method_list), ))
    kernelized_precision = np.zeros((len(kernelized_method_list), ))
    kernelized_per_item_time = np.zeros((len(kernelized_method_list), ))

    # ============================== Distance percentiles
    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # symmetric, either axis works
    kernelized_nearest_neighbors_percentiles_matrix = np.zeros((len(picked_neighbors), len(kernelized_method_list)))
    for i in range(len(picked_neighbors)):
        for j in range(len(kernelized_method_list)):
            y = kernelized_method_results[j][i, :]
            kernelized_dist = np.min(np.sqrt(np.sum((Y_mnist - y) ** 2, axis=1)))
            kernelized_nearest_neighbors_percentiles_matrix[i, j] = stats.percentileofscore(nearest_neighbors_y_dist,
                                                                                            kernelized_dist)
    kernelized_distance_percentiles = np.mean(kernelized_nearest_neighbors_percentiles_matrix, axis=0)
    for j in range(len(kernelized_method_list)):
        logging.info("%s %f", kernelized_method_list[j], kernelized_distance_percentiles[j])

    # ============================== Accuracy and precision
    for j in range(len(kernelized_method_results)):
        per_sample_accuracy = np.zeros((len(picked_neighbors),))
        per_sample_precision = np.zeros((len(picked_neighbors),))
        for i in range(len(picked_neighbors)):
            expected_label = picked_neighbor_labels[i]

            y = kernelized_method_results[j][i, :]
            x = picked_neighbors[i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x, X_mnist, n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y, Y_mnist, n=precision_nn)
            matching_indices = len([k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = (matching_indices / precision_nn)

            kernelized_indices = get_nearest_neighbors_in_y(
                kernelized_method_results[j][i, :], Y_mnist, n=accuracy_nn)
            obtained_labels = labels_mnist[kernelized_indices]
            per_sample_accuracy[i] = sum(obtained_labels == expected_label) / len(obtained_labels)
        kernelized_accuracy[j] = np.mean(per_sample_accuracy)
        kernelized_precision[j] = np.mean(per_sample_precision)
        kernelized_per_item_time[j] = kernelized_detailed_tsne_time[j] / len(picked_neighbors)
        logging.info("%s :\t%f\t%f", kernelized_method_list[j], kernelized_precision[j],
                     kernelized_accuracy[j])

    kernelized_kl = np.zeros((len(kernelized_method_list), len(picked_neighbors)))

    processed_indices = list()

    kl_kernelized_performance_file = generate_kernelized_kl_temp_filename(parameters)
    if os.path.isfile(kl_kernelized_performance_file):
        with open(kl_kernelized_performance_file, 'rb') as f:
            kernelized_kl, processed_indices = pickle.load(f)

    # ============================== KL divergence
    # KL divergence increase for all 1000 samples is very slow to calculate;
    # most of the time goes into building the P-matrix.
    for i in range(len(picked_neighbors)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.x_neighbors_selection_parameter_set,
            parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, picked_neighbors[i, :].reshape((1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods.
        for j in range(len(kernelized_method_results)):
            new_Y = np.concatenate(
                (Y_mnist, kernelized_method_results[j][i, :].reshape((1, -1))), axis=0)
            kernelized_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_kernelized_performance_file, 'wb') as f:
            pickle.dump((kernelized_kl, processed_indices), f)
    # This should be fast
    kernelized_avg_kl = np.mean(kernelized_kl, axis=1)

    output_file = generate_kernelized_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((kernelized_method_list, kernelized_accuracy, kernelized_precision,
                     kernelized_avg_kl, kernelized_per_item_time, kernelized_distance_percentiles),f)
Example #6
def generate_idw_power_performance(*,
                                   regenerate=False,
                                   recursive_regenerate=False,
                                   parameters=settings.parameters):
    global_idw_power_performance = dict()  # Start from scratch
    global_idw_power_performance_abs = dict()  # Start from scratch
    global_idw_accuracy = dict()
    global_idw_precision = dict()

    start_time = datetime.datetime.now()
    logging.info("IDW power experiment started: %s", start_time)
    idw_power_performance_file = generate_idw_power_filename(parameters)
    idw_power_plot_file = generate_idw_power_plot_filename(parameters)

    X_mnist = generate_data.load_x_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    Y_mnist = generate_data.load_y_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(
        parameters=parameters)
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(
        parameters=parameters)
    labels_mnist = generate_data.load_labels_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    accuracy_nn = parameters.get("accuracy_nn",
                                 settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])

    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y)**2, axis=1)
        return np.argsort(y_distances)[:n]

    distance_matrix = distance.squareform(distance.pdist(X_mnist))
    np.fill_diagonal(distance_matrix,
                     np.inf)  # We are not interested in distance to itself
    nn_x_distance = np.min(distance_matrix, axis=1)  # Any axis will do
    radius_x = dict()
    for p in idw_percentile_options:
        radius_x[p] = np.percentile(nn_x_distance, p)

    if os.path.isfile(idw_power_performance_file) and not regenerate:
        with open(idw_power_performance_file, 'rb') as f:
            (global_idw_power_performance, global_idw_power_performance_abs,
             global_idw_accuracy, global_idw_precision) = pickle.load(f)
    else:
        logging.info("Regeneration requested")

    for p in idw_power_options:
        if p in global_idw_power_performance:
            logging.info("Loaded p %f", p)
            continue

        logging.info("Processing p %f", p)

        interpolator = dTSNE_mnist.generate_embedding_function(
            embedding_function_type='weighted-inverse-distance',
            function_kwargs={'power': p})

        per_sample_accuracy = np.zeros((len(picked_neighbors), ))
        per_sample_precision = np.zeros((len(picked_neighbors), ))

        for i in range(len(picked_neighbors)):
            expected_label = picked_neighbor_labels[i]
            result = interpolator(picked_neighbors[i], verbose=0)
            nn_indices = get_nearest_neighbors_in_y(result,
                                                    Y_mnist,
                                                    n=accuracy_nn)
            obtained_labels = labels_mnist[nn_indices]
            per_sample_accuracy[i] = sum(
                obtained_labels == expected_label) / len(obtained_labels)

            y = result
            x = picked_neighbors[i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x,
                                                      X_mnist,
                                                      n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y,
                                                      Y_mnist,
                                                      n=precision_nn)
            matching_indices = len(
                [k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = (matching_indices / precision_nn)
        cur_acc = np.mean(per_sample_accuracy)
        cur_prec = np.mean(per_sample_precision)

        y_sum_square_dist = 0.0
        y_sum_abs_dist = 0.0
        y_count = 0.0
        for i in range(len(X_mnist)):
            distances = distance_matrix[i, :]  # diagonal is already np.inf
            # Plain IDW: every other point is a neighbor (no radius cutoff).
            neighbor_indices = list(range(X_mnist.shape[0]))
            neighbor_indices.remove(i)
            weights = 1 / distances[neighbor_indices]**p
            weights = weights / np.sum(weights)
            cur_y_result = weights.dot(Y_mnist[neighbor_indices, :])
            y_sum_square_dist += np.sum((cur_y_result - Y_mnist[i, :])**2)
            y_sum_abs_dist += np.sqrt(np.sum((cur_y_result - Y_mnist[i, :])**2))
            y_count += 1.0

        global_idw_power_performance[p] = y_sum_square_dist / y_count
        global_idw_power_performance_abs[p] = y_sum_abs_dist / y_count
        global_idw_accuracy[p] = cur_acc
        global_idw_precision[p] = cur_prec


        with open(idw_power_performance_file, 'wb') as f:
            pickle.dump((global_idw_power_performance,
                         global_idw_power_performance_abs, global_idw_accuracy,
                         global_idw_precision), f)

    EPS = 1e-5
    y = list()
    x_global = list()
    for cur_power in idw_power_options:
        closest_power = [
            i for i in global_idw_power_performance_abs
            if np.abs(i - cur_power) < EPS
        ]
        if len(closest_power) > 0:
            x_global.append(cur_power)
            y.append(global_idw_power_performance[closest_power[0]])
    idw_optimal_power = x_global[np.argmin(y)]

    with open(idw_power_plot_file, 'wb') as f:
        pickle.dump((x_global, y, idw_optimal_power), f)
    logging.info("IDW optimal power: %f", idw_optimal_power)

    end_time = datetime.datetime.now()
    logging.info("IDW power experiment ended: %s", end_time)
    logging.info("IDW power experiment duration: %s", end_time - start_time)
Example #7
def main(parameters=settings.parameters, regenerate_parameters_cache=False):
    step = 0.01
    choice_K = np.arange(step, 2 + step, step)  # K values to evaluate

    logging.info("Started loading.")
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(
        parameters=parameters)
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(
        parameters=parameters)
    accuracy_nn = parameters.get("accuracy_nn",
                                 settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)
    baseline_accuracy = generate_data.get_baseline_accuracy(
        parameters=parameters)
    logging.info("Loaded everything.")

    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # symmetric, either axis works

    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y)**2, axis=1)
        return np.argsort(y_distances)[:n]

    # Implemented straightforwardly: not the fastest way, but a reliable one.

    kernel_tsne_mapping = kernelized_tsne.generate_kernelized_tsne_mapping_function(
        parameters=parameters,
        regenerate_parameters_cache=regenerate_parameters_cache)

    kernelized_detailed_tsne_method_list = [
        "Kernelized tSNE; K=%.2f" % (k) for k in choice_K
    ]
    kernelized_detailed_tsne_method_results = list()

    kernelized_detailed_tsne_accuracy = np.zeros(
        (len(kernelized_detailed_tsne_method_list), ))
    kernelized_detailed_tsne_precision = np.zeros(
        (len(kernelized_detailed_tsne_method_list), ))
    kernelized_detailed_tsne_time = np.zeros(
        (len(kernelized_detailed_tsne_method_list), ))

    for j in range(len(choice_K)):
        k = choice_K[j]
        logging.info("%f", k)

        embedder_start_time = datetime.datetime.now()
        kernelized_detailed_tsne_method_results.append(
            kernel_tsne_mapping(picked_neighbors, k=k))
        embedder_end_time = datetime.datetime.now()
        kernelized_detailed_tsne_time[j] = (
            embedder_end_time - embedder_start_time).total_seconds()
        logging.info("%f complete", k)
        #kernelized_detailed_tsne_method_results = [kernel_tsne_mapping(picked_neighbors, k=k) for k in choice_K]

        logging.info("%s", kernelized_detailed_tsne_method_list[j])
        per_sample_accuracy = np.zeros((len(picked_neighbors), ))
        per_sample_precision = np.zeros((len(picked_neighbors), ))
        for i in range(len(picked_neighbors)):
            if i % 200 == 0:
                logging.info("%d", i)
            expected_label = picked_neighbor_labels[i]
            y = kernelized_detailed_tsne_method_results[j][i, :]
            x = picked_neighbors[i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x,
                                                      X_mnist,
                                                      n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y,
                                                      Y_mnist,
                                                      n=precision_nn)
            matching_indices = len(
                [idx for idx in nn_x_indices if idx in nn_y_indices])
            per_sample_precision[i] = (matching_indices / precision_nn)

            kernelized_indices = get_nearest_neighbors_in_y(
                kernelized_detailed_tsne_method_results[j][i, :],
                Y_mnist,
                n=accuracy_nn)
            obtained_labels = labels_mnist[kernelized_indices]
            per_sample_accuracy[i] = sum(
                obtained_labels == expected_label) / len(obtained_labels)
        kernelized_detailed_tsne_accuracy[j] = np.mean(per_sample_accuracy)
        kernelized_detailed_tsne_precision[j] = np.mean(per_sample_precision)
        logging.info("%s :\t%f\t%f\t%f s",
                     kernelized_detailed_tsne_method_list[j],
                     kernelized_detailed_tsne_precision[j],
                     kernelized_detailed_tsne_accuracy[j],
                     kernelized_detailed_tsne_time[j])

    # Accuracy-vs-power plot
    legend_list = list()
    f, ax = plt.subplots()
    f.set_size_inches(6, 3)
    x = list(choice_K)  # K values in scan order
    y = kernelized_detailed_tsne_accuracy
    # plt.title("IDW - Accuracy vs Power") # We'd better use figure caption
    # ax.legend([h1,h2,h3,h4,h5,h6], ["Closest Training Set Image"]+idw_method_list)
    plt.plot(x, y, c='blue')
    h = plt.axhline(y=baseline_accuracy, c='black', linestyle='--')
    plt.legend([h], ["Baseline Accuracy (%.4f)" % baseline_accuracy])
    plt.xlabel("Kernelized tSNE: K parameter")
    plt.ylabel("10-NN Accuracy")
    plt.ylim([0, 1])
    plt.xlim([0, 2])
    f.tight_layout()
    plt.savefig("../figures/kernelized-tsne-K-vs-accuracy.png")

    ind = [4, 24, 49]
    kernelized_tsne_method_list = [
        kernelized_detailed_tsne_method_list[i][:10] +
        kernelized_detailed_tsne_method_list[i][-8:] for i in ind
    ]
    kernelized_tsne_method_results = [
        kernelized_detailed_tsne_method_results[i] for i in ind
    ]

    kernelized_tsne_nearest_neighbors_percentiles_matrix = np.zeros(
        (len(picked_neighbors), len(kernelized_tsne_method_list)))
    for i in range(len(picked_neighbors)):
        for j in range(len(kernelized_tsne_method_list)):
            y = kernelized_tsne_method_results[j][i, :]
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y)**2, axis=1)))
            kernelized_tsne_nearest_neighbors_percentiles_matrix[i, j] = \
                stats.percentileofscore(nearest_neighbors_y_dist, nn_dist)
    kernelized_tsne_distance_percentiles = np.mean(
        kernelized_tsne_nearest_neighbors_percentiles_matrix, axis=0)
    for j in range(len(kernelized_tsne_method_list)):
        logging.info("%s %f", kernelized_tsne_method_list[j],
                     kernelized_tsne_distance_percentiles[j])

    output_file = generate_cluster_results_filename(parameters)
    with open(output_file, 'wb') as f:
        pickle.dump(
            (kernelized_detailed_tsne_method_results,
             kernelized_detailed_tsne_accuracy,
             kernelized_detailed_tsne_precision, kernelized_detailed_tsne_time,
             kernelized_detailed_tsne_method_list), f)
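kernelized_tsne.generate_kernelized_tsne_mapping_function is repo-specific and not shown here. The general kernelized t-SNE idea it presumably implements (Gisbrecht et al.) is a normalized Gaussian-kernel regression from X to Y, where the K parameter scanned above scales the kernel bandwidths. A sketch of that general technique, not of the repo's actual internals:

import numpy as np
from scipy.spatial import distance

def fit_kernel_tsne_sketch(X_train, Y_train, sigma):
    """Return a mapping x -> y via normalized Gaussian kernel regression."""
    K = np.exp(-distance.cdist(X_train, X_train, 'sqeuclidean') / (2 * sigma ** 2))
    K /= K.sum(axis=1, keepdims=True)    # normalize kernel rows
    A = np.linalg.pinv(K).dot(Y_train)   # coefficients fitted on training data

    def mapping(X_new):
        k = np.exp(-distance.cdist(X_new, X_train, 'sqeuclidean') / (2 * sigma ** 2))
        k /= k.sum(axis=1, keepdims=True)
        return k.dot(A)

    return mapping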
Example #8
def generate_lion_power_performance(*,
                                    regenerate=False,
                                    recursive_regenerate=False,
                                    parameters=settings.parameters):
    start_time = datetime.datetime.now()
    logging.info("LION power experiment started: %s", start_time)

    accuracy_nn = parameters.get("accuracy_nn",
                                 settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])

    lion_power_performance_data_file = generate_lion_power_performance_filename(
        parameters)
    lion_power_plot_data_file = generate_lion_power_plot_filename(parameters)

    lion_power_performance_data = dict()  # Start from scratch

    X_mnist = generate_data.load_x_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    Y_mnist = generate_data.load_y_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    labels_mnist = generate_data.load_labels_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)

    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y)**2, axis=1)
        return np.argsort(y_distances)[:n]

    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(
        parameters=parameters)
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(
        parameters=parameters)

    distance_matrix = distance.squareform(distance.pdist(X_mnist))
    np.fill_diagonal(distance_matrix,
                     np.inf)  # We are not interested in distance to itself
    nn_x_distance = np.min(distance_matrix, axis=1)  # Any axis will do
    radius_x = dict()
    for p in lion_percentile_options:
        radius_x[p] = np.percentile(nn_x_distance, p)
    logging.info("Radius X: %s", radius_x)

    if os.path.isfile(lion_power_performance_data_file) and not regenerate:
        with open(lion_power_performance_data_file, 'rb') as f:
            lion_power_performance_data = pickle.load(f)

    for perc in lion_percentile_options:
        for p in lion_power_options:
            logging.info("Processing percentile and power: %f, %d", p, perc)
            key = str(perc) + ";" + "%.3f" % (p)
            logging.info("Key: %s", key)
            if key not in lion_power_performance_data:
                lion_power_performance_data[key] = dict()

            if 'Accuracy' not in lion_power_performance_data[key]:
                logging.info(
                    "Accuracy not found for power %f percentile %d. \tCalculating...",
                    p, perc)
                interpolator = dTSNE_mnist.generate_lion_tsne_embedder(
                    verbose=0,
                    random_state=0,
                    function_kwargs={
                        'radius_x_percentile': perc,
                        'power': p
                    })

                per_sample_accuracy = np.zeros((len(picked_neighbors), ))
                per_sample_precision = np.zeros((len(picked_neighbors), ))

                for i in range(len(picked_neighbors)):
                    # if i%100==0:
                    #    print("\tPower: ",p,"Processing:",i)
                    expected_label = picked_neighbor_labels[i]
                    result = interpolator(picked_neighbors[i], verbose=0)
                    nn_indices = get_nearest_neighbors_in_y(result,
                                                            Y_mnist,
                                                            n=accuracy_nn)
                    obtained_labels = labels_mnist[nn_indices]
                    per_sample_accuracy[i] = sum(
                        obtained_labels == expected_label) / len(
                            obtained_labels)

                    y = result
                    x = picked_neighbors[i, :]
                    nn_x_indices = get_nearest_neighbors_in_y(x,
                                                              X_mnist,
                                                              n=precision_nn)
                    nn_y_indices = get_nearest_neighbors_in_y(y,
                                                              Y_mnist,
                                                              n=precision_nn)
                    matching_indices = len(
                        [k for k in nn_x_indices if k in nn_y_indices])
                    per_sample_precision[i] = (matching_indices / precision_nn)

                cur_acc = np.mean(per_sample_accuracy)
                cur_prec = np.mean(per_sample_precision)
                # print('================= ',p,perc, cur_acc)
                lion_power_performance_data[key]['Accuracy'] = cur_acc
                lion_power_performance_data[key]['Precision'] = cur_prec
                with open(lion_power_performance_data_file, 'wb') as f:
                    pickle.dump(lion_power_performance_data, f)
            else:
                logging.info(
                    "Accuracy FOUND for power %f percentile %d. Using loaded.",
                    p, perc)

            if ('PowerSquareDist' not in lion_power_performance_data[key]
                    or regenerate):
                logging.info(
                    "Power performance not found for power %f percentile %d.\tCalculating...",
                    p, perc)

                y_sum_square_dist = 0.0
                y_sum_abs_dist = 0.0
                y_count = 0.0
                for i in range(len(X_mnist)):
                    distances = distance_matrix[i, :].copy()
                    distances[
                        i] = np.inf  # Not interested in distance to itself
                    # Step 1. Find nearest neighbors in the neighborhood.
                    neighbor_indices = np.where(distances <= radius_x[perc])[0]
                    num_neighbors = len(neighbor_indices)
                    if num_neighbors >= 2:  # Below 2? Cannot interpolate
                        # We are good
                        weights = 1 / distances[neighbor_indices]**p
                        weights = weights / np.sum(weights)
                        cur_y_result = weights.dot(
                            Y_mnist[neighbor_indices, :])
                        y_sum_square_dist += np.sum(
                            (cur_y_result - Y_mnist[i, :])**2)
                        y_sum_abs_dist += np.sqrt(
                            np.sum((cur_y_result - Y_mnist[i, :])**2))
                        y_count += 1.0
                new_dict = dict()
                new_dict['PowerSquareDist'] = y_sum_square_dist / y_count
                new_dict['PowerAbsDist'] = y_sum_abs_dist / y_count
                # Just in case it will become unstable due to too few neighbors
                new_dict['PowerSquareDistSum'] = y_sum_square_dist
                new_dict['PowerSquareDistCount'] = y_count
                lion_power_performance_data[key].update(new_dict)

                with open(lion_power_performance_data_file, 'wb') as f:
                    pickle.dump(lion_power_performance_data, f)
            else:
                logging.info(
                    "Power FOUND for power %f percentile %d. Using loaded.", p,
                    perc)

            logging.info("%s %s", key, lion_power_performance_data[key])

    lion_optimal_power = dict()
    lion_power_plot_y = dict()
    for perc in lion_percentile_options:
        y = list()
        for cur_power in lion_power_options:
            key = str(perc) + ";%.3f" % (cur_power)
            # print(cur_power, perc, lion_power_plot_data[key])
            y.append(lion_power_performance_data[key]['PowerSquareDist'])
        lion_power_plot_y[perc] = y
        lion_optimal_power[perc] = lion_power_options[np.argmin(y)]

    with open(lion_power_plot_data_file, 'wb') as f:
        pickle.dump(
            (lion_power_options, lion_power_plot_y, lion_optimal_power), f)
    logging.info("LION optimal power: %s", lion_optimal_power)

    end_time = datetime.datetime.now()
    logging.info("LION power experiment ended: %s", end_time)
    logging.info("LION power experiment duration: %s", end_time - start_time)

def main(regenerate, only_time, parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(parameters=parameters)

    # Doing this from scratch takes a really long time. If possible, save results and pre-load.

    # These are consequences of parallelization
    # input_files = ['gd_results' + str(100 * i) + '_' + str(100 * i + 100) + '.p' for i in range(10)]
    output_file = generate_cluster_results_filename(parameters)
    output_time_file = generate_time_results_filename(parameters)

    first_sample_inc = 0  # Change only in the parallelized copies of this notebook
    last_sample_exclusive = len(picked_neighbors)

    if os.path.isfile(output_file) and not regenerate:
        logging.info("Found previous partially completed test. Starting from there.")
        with open(output_file, 'rb') as f:
            (picked_neighbors_y_gd_transformed, picked_neighbors_y_gd_variance_recalc_transformed,
             picked_neighbors_y_gd_transformed_random,
             picked_neighbors_y_gd_variance_recalc_transformed_random,
             picked_neighbors_y_gd_early_exagg_transformed_random,
             picked_neighbors_y_gd_early_exagg_transformed,
             picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
             picked_random_starting_positions,
             picked_neighbors_y_gd_variance_recalc_early_exagg_transformed, covered_samples) = pickle.load(f)
        with open(output_time_file, 'rb') as f:
            (picked_neighbors_y_time_gd_transformed, picked_neighbors_y_time_gd_variance_recalc_transformed,
             picked_neighbors_y_time_gd_transformed_random,
             picked_neighbors_y_time_gd_variance_recalc_transformed_random,
             picked_neighbors_y_time_gd_early_exagg_transformed_random,
             picked_neighbors_y_time_gd_early_exagg_transformed,
             picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random,
             picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed, covered_samples) = pickle.load(f)
    else:
        logging.info("No previous partially completed test, or regeneration requested. Starting from scratch.")
        covered_samples = list()

        # Let's build all possible combinations. Later we'll decide what to plot
        picked_neighbors_y_gd_transformed = np.zeros((len(picked_neighbors), Y_mnist.shape[1]))
        picked_neighbors_y_gd_variance_recalc_transformed = np.zeros((len(picked_neighbors), Y_mnist.shape[1]))
        picked_neighbors_y_gd_transformed_random = np.zeros((len(picked_neighbors), Y_mnist.shape[1]))
        picked_neighbors_y_gd_variance_recalc_transformed_random = np.zeros((len(picked_neighbors), Y_mnist.shape[1]))

        picked_neighbors_y_gd_early_exagg_transformed_random = np.zeros((len(picked_neighbors), Y_mnist.shape[1]))
        picked_neighbors_y_gd_early_exagg_transformed = np.zeros((len(picked_neighbors), Y_mnist.shape[1]))
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random = np.zeros(
             (len(picked_neighbors), Y_mnist.shape[1]))
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed = np.zeros((len(picked_neighbors), Y_mnist.shape[1]))

        picked_neighbors_y_time_gd_transformed = np.zeros((len(picked_neighbors), ))
        picked_neighbors_y_time_gd_variance_recalc_transformed = np.zeros((len(picked_neighbors), ))
        picked_neighbors_y_time_gd_transformed_random = np.zeros((len(picked_neighbors), ))
        picked_neighbors_y_time_gd_variance_recalc_transformed_random = np.zeros((len(picked_neighbors), ))

        picked_neighbors_y_time_gd_early_exagg_transformed_random = np.zeros((len(picked_neighbors), ))
        picked_neighbors_y_time_gd_early_exagg_transformed = np.zeros((len(picked_neighbors), ))
        picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random = np.zeros((len(picked_neighbors), ))
        picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed = np.zeros((len(picked_neighbors), ))

        picked_random_starting_positions = np.zeros((len(picked_neighbors), Y_mnist.shape[1]))

    for i in range(first_sample_inc, last_sample_exclusive):
         # Reset the random seed on every iteration, keyed to the absolute
         # sample number. With a single seed set before the loop, the random
         # sequence would shift depending on which partial results were loaded
         # from file, and reproducibility would be lost.
         np.random.seed(i)
         logging.info(" ====================== Sample %d\n\n", i)
         if i in covered_samples and not regenerate:
             logging.info("Already loaded.")
         else:
             neighbor = picked_neighbors[i].reshape((1, -1))

             embedder_start_time = datetime.datetime.now()
             picked_neighbors_y_gd_transformed[i, :] = dTSNE_mnist.transform(
                 neighbor, y='closest', verbose=2,
                 optimizer_kwargs={'early_exaggeration': None})
             embedder_end_time = datetime.datetime.now()
             picked_neighbors_y_time_gd_transformed[i] = (embedder_end_time - embedder_start_time).total_seconds()
             logging.info("Time: %f s", picked_neighbors_y_time_gd_transformed[i])

             embedder_start_time = datetime.datetime.now()
             picked_neighbors_y_gd_variance_recalc_transformed[i, :] = dTSNE_mnist.transform(
                 neighbor, keep_sigmas=False, y='closest',
                 verbose=2, optimizer_kwargs={'early_exaggeration': None})
             embedder_end_time = datetime.datetime.now()
             picked_neighbors_y_time_gd_variance_recalc_transformed[i] = \
                 (embedder_end_time - embedder_start_time).total_seconds()
             logging.info("Time (VR): %f s", picked_neighbors_y_time_gd_variance_recalc_transformed[i])

             # Pick a random starting point anywhere in the embedding, not necessarily near the center.
             y_start = np.array([[
                 np.random.uniform(np.min(Y_mnist[:, 0]), np.max(Y_mnist[:, 0])),
                 np.random.uniform(np.min(Y_mnist[:, 1]), np.max(Y_mnist[:, 1]))
             ]])

             picked_random_starting_positions[i, :] = y_start

             embedder_start_time = datetime.datetime.now()
             picked_neighbors_y_gd_transformed_random[i, :] = dTSNE_mnist.transform(
                 neighbor, y=y_start,  # alternative: y='random'
                 verbose=2, optimizer_kwargs={'early_exaggeration': None})
             embedder_end_time = datetime.datetime.now()
             picked_neighbors_y_time_gd_transformed_random[i] = \
                 (embedder_end_time - embedder_start_time).total_seconds()
             logging.info("Time (random): %f s", picked_neighbors_y_time_gd_transformed_random[i])

             embedder_start_time = datetime.datetime.now()
             picked_neighbors_y_gd_variance_recalc_transformed_random[i, :] = dTSNE_mnist.transform(
                 neighbor, keep_sigmas=False,
                 y=y_start,  # alternative: y='random'
                 verbose=2, optimizer_kwargs={'early_exaggeration': None})
             embedder_end_time = datetime.datetime.now()
             picked_neighbors_y_time_gd_variance_recalc_transformed_random[i] = \
                 (embedder_end_time - embedder_start_time).total_seconds()
             logging.info("Time (VR,random): %f s", picked_neighbors_y_time_gd_variance_recalc_transformed_random[i])

             embedder_start_time = datetime.datetime.now()
             picked_neighbors_y_gd_early_exagg_transformed_random[i, :] = dTSNE_mnist.transform(
                 neighbor, y=y_start,  # alternative: y='random'
                 verbose=2)
             embedder_end_time = datetime.datetime.now()
             picked_neighbors_y_time_gd_early_exagg_transformed_random[i] = \
                 (embedder_end_time - embedder_start_time).total_seconds()
             logging.info("Time (EE,random): %f s", picked_neighbors_y_time_gd_early_exagg_transformed_random[i])

             embedder_start_time = datetime.datetime.now()
             picked_neighbors_y_gd_early_exagg_transformed[i, :] = dTSNE_mnist.transform(neighbor, y='closest', verbose=2)
             embedder_end_time = datetime.datetime.now()
             picked_neighbors_y_time_gd_early_exagg_transformed[i] = \
                 (embedder_end_time - embedder_start_time).total_seconds()
             logging.info("Time (EE): %f s", picked_neighbors_y_time_gd_early_exagg_transformed[i])

             embedder_start_time = datetime.datetime.now()
             picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random[i, :] = \
                 dTSNE_mnist.transform(neighbor, y=y_start, keep_sigmas=False, verbose=2)
             embedder_end_time = datetime.datetime.now()
             picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random[i] = \
                 (embedder_end_time - embedder_start_time).total_seconds()
             logging.info("Time (VR, EE, random): %f s",
                          picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random[i])

             embedder_start_time = datetime.datetime.now()
             picked_neighbors_y_gd_variance_recalc_early_exagg_transformed[i, :] = dTSNE_mnist.transform(
                 neighbor, keep_sigmas=False, y='closest', verbose=2)
             embedder_end_time = datetime.datetime.now()
             picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed[i] = \
                 (embedder_end_time - embedder_start_time).total_seconds()
             logging.info("Time (VR, EE): %f s",
                          picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed[i])

             covered_samples.append(i)
         # Re-saving even if it is a loaded sample
         logging.info("Saving...")
         # Gradient descent results take a long while. Let's cache.
         if not only_time:
            with open(output_file, 'wb') as f:
                pickle.dump((picked_neighbors_y_gd_transformed, picked_neighbors_y_gd_variance_recalc_transformed,
                          picked_neighbors_y_gd_transformed_random, picked_neighbors_y_gd_variance_recalc_transformed_random,
                          picked_neighbors_y_gd_early_exagg_transformed_random,
                          picked_neighbors_y_gd_early_exagg_transformed,
                          picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
                          picked_random_starting_positions,
                          picked_neighbors_y_gd_variance_recalc_early_exagg_transformed, covered_samples), f)
         with open(output_time_file, 'wb') as f:
             pickle.dump((picked_neighbors_y_time_gd_transformed, picked_neighbors_y_time_gd_variance_recalc_transformed,
                          picked_neighbors_y_time_gd_transformed_random,
                          picked_neighbors_y_time_gd_variance_recalc_transformed_random,
                          picked_neighbors_y_time_gd_early_exagg_transformed_random,
                          picked_neighbors_y_time_gd_early_exagg_transformed,
                          picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random,
                          picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed, covered_samples), f)
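The seeding comment in the last example deserves a standalone illustration: seeding per absolute sample index makes each sample's random draw independent of which samples were recomputed versus loaded from a checkpoint. A toy sketch:

import numpy as np

def random_start_for_sample(i):
    np.random.seed(i)  # keyed to the absolute sample index...
    return np.random.uniform(-10, 10, size=2)

# ...so sample 5 gets the same starting position whether or not
# samples 0 through 4 were recomputed or loaded from disk.
assert np.allclose(random_start_for_sample(5), random_start_for_sample(5))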