import numpy as np
from scipy.spatial import distance

import generate_data
import settings


def get_common_info(parameters):
    res = {}
    res['dTSNE_mnist'] = generate_data.load_dtsne_mnist(parameters=parameters)
    res['Y_mnist'] = generate_data.load_y_mnist(parameters=parameters)
    res['X_mnist'] = generate_data.load_x_mnist(parameters=parameters)
    res['labels_mnist'] = generate_data.load_labels_mnist(
        parameters=parameters)
    res['picked_neighbors'] = generate_data.load_picked_neighbors(
        parameters=parameters)
    res['picked_neighbors_labels'] = generate_data.load_picked_neighbors_labels(
        parameters=parameters)
    res['accuracy_nn'] = parameters.get("accuracy_nn",
                                        settings.parameters["accuracy_nn"])
    D_Y = distance.squareform(distance.pdist(res['Y_mnist']))
    # Distance from each embedded point to its nearest neighbor...
    np.fill_diagonal(D_Y, np.inf)  # ...excluding the point itself
    res['nearest_neighbors_y_dist'] = np.min(
        D_Y, axis=1)  # D_Y is symmetric, so either axis works
    return res
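
The helper above bundles the data every experiment needs; `nearest_neighbors_y_dist` in particular serves as a reference distribution throughout these examples: a mapped point's distance to its closest embedded neighbor is ranked against it with `scipy.stats.percentileofscore`. A minimal, self-contained sketch of that metric (function name and toy data are illustrative, not from the source):

import numpy as np
from scipy import stats
from scipy.spatial import distance


def nn_distance_percentile(Y, y_new):
    # Reference distribution: each training point's distance to its nearest neighbor.
    D = distance.squareform(distance.pdist(Y))
    np.fill_diagonal(D, np.inf)
    nn_dist = np.min(D, axis=1)
    # Rank the new point's nearest-neighbor distance against it (0..100);
    # ~50 means "as close as a typical training point", ~100 means outlier.
    new_dist = np.min(np.sqrt(np.sum((Y - y_new)**2, axis=1)))
    return stats.percentileofscore(nn_dist, new_dist)


print(nn_distance_percentile(np.random.RandomState(0).normal(size=(100, 2)),
                             np.array([0.1, -0.2])))
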
import matplotlib.pyplot as plt
import generate_data
from matplotlib.font_manager import FontProperties

labels_mnist = generate_data.load_labels_mnist()
Y_mnist = generate_data.load_y_mnist()

plt.figure(dpi=300)
font_properties = FontProperties()
font_properties.set_family('serif')
font_properties.set_name('Times New Roman')
font_properties.set_size(9)

plt.xlim([-180, 180])
plt.ylim([-150, 170])

plt.gcf().set_size_inches(2.5, 2.1)  # Set a figure size that just fits the paper margins
legend_list = list()
for l in sorted(set(labels_mnist)):
    plt.scatter(Y_mnist[labels_mnist == l, 0],
                Y_mnist[labels_mnist == l, 1],
                marker='.',
                s=5)
    legend_list.append(str(l))
#plt.title("MNIST Dataset - TSNE visualization")
#plt.tight_layout()

l = plt.legend(legend_list,
               bbox_to_anchor=(0.99, 1.025),
               markerscale=8,
               prop=font_properties)  # assumption: the FontProperties configured above are passed here
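
The FontProperties configured above are applied per text element in Matplotlib; a short, assumed continuation (the save path is illustrative, following the pattern used elsewhere in these examples):

for tick_label in plt.gca().get_xticklabels() + plt.gca().get_yticklabels():
    tick_label.set_fontproperties(font_properties)
plt.savefig('../figures/mnist-tsne-overview.png')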
Example #3
def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(
        parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    accuracy_nn = parameters.get("accuracy_nn",
                                 settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(
        parameters=parameters)
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)

    # =================== Common helpers
    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y)**2, axis=1)
        return np.argsort(y_distances)[:n]
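    # Brute-force k-NN query: squared Euclidean distances to all points, then
    # argsort and keep the n smallest. O(N) distances plus an O(N log N) sort
    # per query; adequate at this scale, though a KD-tree (e.g.
    # sklearn.neighbors.NearestNeighbors) would be the usual choice for large N.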

    gd_method_list = [
        r'Closest $Y_{init}$', r'Random $Y_{init}$',
        r'Closest $Y_{init}$; new $\sigma$',
        r'Random $Y_{init}$; new $\sigma$', r'Closest $Y_{init}$; EE',
        r'Random $Y_{init}$; EE', r'Closest $Y_{init}$; new $\sigma$; EE',
        r'Random $Y_{init}$; new $\sigma$; EE'
    ]

    gd_results_file = exp_cluster_attr_test_GD.generate_cluster_results_filename(
        parameters=parameters)
    with open(gd_results_file, 'rb') as f:
        (picked_neighbors_y_gd_transformed,
         picked_neighbors_y_gd_variance_recalc_transformed,
         picked_neighbors_y_gd_transformed_random,
         picked_neighbors_y_gd_variance_recalc_transformed_random,
         picked_neighbors_y_gd_early_exagg_transformed_random,
         picked_neighbors_y_gd_early_exagg_transformed,
         picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
         picked_random_starting_positions,
         picked_neighbors_y_gd_variance_recalc_early_exagg_transformed,
         covered_samples) = pickle.load(f)

    gd_method_results = [
        picked_neighbors_y_gd_transformed,
        picked_neighbors_y_gd_transformed_random,
        picked_neighbors_y_gd_variance_recalc_transformed,
        picked_neighbors_y_gd_variance_recalc_transformed_random,
        picked_neighbors_y_gd_early_exagg_transformed,
        picked_neighbors_y_gd_early_exagg_transformed_random,
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed,
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
    ]

    input_time_file = exp_cluster_attr_test_GD.generate_time_results_filename(
        parameters)
    with open(input_time_file, 'rb') as f:
        picked_neighbors_y_time_gd_transformed, picked_neighbors_y_time_gd_variance_recalc_transformed, \
        picked_neighbors_y_time_gd_transformed_random, \
        picked_neighbors_y_time_gd_variance_recalc_transformed_random, \
        picked_neighbors_y_time_gd_early_exagg_transformed_random, \
        picked_neighbors_y_time_gd_early_exagg_transformed, \
        picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random, \
        picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed, covered_samples = pickle.load(f)

    gd_time = [
        np.mean(picked_neighbors_y_time_gd_transformed),
        np.mean(picked_neighbors_y_time_gd_transformed_random),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_transformed),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_transformed_random),
        np.mean(picked_neighbors_y_time_gd_early_exagg_transformed),
        np.mean(picked_neighbors_y_time_gd_early_exagg_transformed_random),
        np.mean(
            picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed
        ),
        np.mean(
            picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random
        ),
    ]

    gd_accuracy = np.zeros((len(gd_method_list),))
    gd_precision = np.zeros((len(gd_method_list),))

    # ============================== Distance percentiles
    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Distance from each embedded point to its nearest neighbor...
    np.fill_diagonal(D_Y, np.inf)  # ...excluding the point itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # D_Y is symmetric, so either axis works
    gd_nearest_neighbors_percentiles_matrix = np.zeros(
        (len(picked_neighbors), len(gd_method_list)))
    for i in range(len(picked_neighbors)):
        for j in range(len(gd_method_list)):
            y = gd_method_results[j][i, :]
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y)**2, axis=1)))
            gd_nearest_neighbors_percentiles_matrix[
                i, j] = stats.percentileofscore(nearest_neighbors_y_dist,
                                                nn_dist)
    gd_distance_percentiles = np.mean(gd_nearest_neighbors_percentiles_matrix,
                                      axis=0)
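    # percentileofscore ranks each sample's nearest-neighbor distance against the
    # training points' own nearest-neighbor distances: values near 50 mean the
    # mapped point sits as close to the embedding as a typical training point,
    # values near 100 mean it landed in empty space far from the data.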
    for j in range(len(gd_method_list)):
        logging.info("%s :\t%f", gd_method_list[j], gd_distance_percentiles[j])

    # ============================== Accuracy and precision
    for j in range(len(gd_method_results)):
        per_sample_accuracy = np.zeros((len(picked_neighbors), ))
        per_sample_precision = np.zeros((len(picked_neighbors), ))
        for i in range(len(picked_neighbors)):
            expected_label = picked_neighbor_labels[i]
            nn_indices = get_nearest_neighbors_in_y(gd_method_results[j][i, :],
                                                    Y_mnist,
                                                    n=accuracy_nn)
            obtained_labels = labels_mnist[nn_indices]
            per_sample_accuracy[i] = sum(
                obtained_labels == expected_label) / len(obtained_labels)

            x = picked_neighbors[i, :]
            y = gd_method_results[j][i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x,
                                                      X_mnist,
                                                      n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y,
                                                      Y_mnist,
                                                      n=precision_nn)
            matching_indices = len(
                [k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = (matching_indices / precision_nn)
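            # Precision = |kNN(x) in X ∩ kNN(y) in Y| / precision_nn: the share
            # of the sample's high-dimensional neighbors preserved in the embedding.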

        gd_accuracy[j] = np.mean(per_sample_accuracy)
        gd_precision[j] = np.mean(per_sample_precision)
        logging.info("%s :\t%f\t%f", gd_method_list[j], gd_precision[j],
                     gd_accuracy[j])

    # ============================== KL divergence
    gd_kl = np.zeros((len(gd_method_list), len(picked_neighbors)))

    processed_indices = list()

    kl_gd_performance_file = generate_gd_kl_temp_filename(parameters)
    if os.path.isfile(kl_gd_performance_file):
        with open(kl_gd_performance_file, 'rb') as f:
            gd_kl, processed_indices = pickle.load(f)

    # KL-divergence increase for all 1000 samples is slow to compute; most of
    # the time goes into building the P-matrix for each sample.
    for i in range(len(picked_neighbors)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set
            | settings.x_neighbors_selection_parameter_set, parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'  # one P-matrix file per sample
        # Cached per sample, so interrupted runs can resume without recomputing every P-matrix.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, picked_neighbors[i, :].reshape(
                (1, -1))),
                                   axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods.
        for j in range(len(gd_method_results)):
            # Append this method's embedding of sample i and recompute KL.
            new_Y = np.concatenate(
                (Y_mnist, gd_method_results[j][i, :].reshape((1, -1))), axis=0)
            gd_kl[j,
                  i], _ = lion_tsne.kl_divergence_and_gradient(p_matrix=new_P,
                                                               y=new_Y)
        processed_indices.append(i)
        with open(kl_gd_performance_file, 'wb') as f:
            pickle.dump((gd_kl, processed_indices), f)
    # Averaging the per-sample KL values is cheap compared to the loop above.
    gd_avg_kl = np.mean(gd_kl, axis=1)

    output_file = generate_gd_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((gd_method_list, gd_accuracy, gd_precision, gd_time,
                     gd_avg_kl, gd_distance_percentiles), f)
Example #4
def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    accuracy_nn = parameters.get("accuracy_nn", settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn", settings.parameters["precision_nn"])
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(parameters=parameters)
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)

    # =================== Common helpers
    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y) ** 2, axis=1)
        return np.argsort(y_distances)[:n]

    kernelized_results_file = exp_cluster_attr_test_kernelized.generate_cluster_results_filename(parameters)
    with open(kernelized_results_file, 'rb') as f:
        kernelized_detailed_tsne_method_results, kernelized_detailed_tsne_accuracy, \
        kernelized_detailed_tsne_precision, kernelized_detailed_tsne_time, kernelized_detailed_tsne_method_list = pickle.load(f)
    ind = [4, 24, 49]
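    # Assuming the K grid of the kernelized-tSNE experiment (np.arange(0.01, 2.01, 0.01)),
    # indices 4, 24 and 49 select K = 0.05, 0.25 and 0.50.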
    kernelized_method_list = [
        kernelized_detailed_tsne_method_list[i][:10] + kernelized_detailed_tsne_method_list[i][-8:]
        for i in ind]
    kernelized_method_results = [kernelized_detailed_tsne_method_results[i] for i in ind]

    kernelized_accuracy = np.zeros((len(kernelized_method_list),))
    kernelized_precision = np.zeros((len(kernelized_method_list),))
    kernelized_per_item_time = np.zeros((len(kernelized_method_list),))

    # ============================== Distance percentiles
    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Distance from each embedded point to its nearest neighbor...
    np.fill_diagonal(D_Y, np.inf)  # ...excluding the point itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # D_Y is symmetric, so either axis works
    kernelized_nearest_neighbors_percentiles_matrix = np.zeros((len(picked_neighbors), len(kernelized_method_list)))
    for i in range(len(picked_neighbors)):
        for j in range(len(kernelized_method_list)):
            y = kernelized_method_results[j][i, :]
            kernelized_dist = np.min(np.sqrt(np.sum((Y_mnist - y) ** 2, axis=1)))
            kernelized_nearest_neighbors_percentiles_matrix[i, j] = stats.percentileofscore(nearest_neighbors_y_dist,
                                                                                            kernelized_dist)
    kernelized_distance_percentiles = np.mean(kernelized_nearest_neighbors_percentiles_matrix, axis=0)
    for j in range(len(kernelized_method_list)):
        logging.info("%s %f", kernelized_method_list[j], kernelized_distance_percentiles[j])

    # ============================== Accuracy and precision
    for j in range(len(kernelized_method_results)):
        per_sample_accuracy = np.zeros((len(picked_neighbors),))
        per_sample_precision = np.zeros((len(picked_neighbors),))
        for i in range(len(picked_neighbors)):
            expected_label = picked_neighbor_labels[i]

            y = kernelized_method_results[j][i, :]
            x = picked_neighbors[i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x, X_mnist, n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y, Y_mnist, n=precision_nn)
            matching_indices = len([k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = (matching_indices / precision_nn)

            kernelized_indices = get_nearest_neighbors_in_y(kernelized_method_results[j][i, :], Y_mnist, n=accuracy_nn)
            obtained_labels = labels_mnist[kernelized_indices]
            per_sample_accuracy[i] = sum(obtained_labels == expected_label) / len(obtained_labels)
        kernelized_accuracy[j] = np.mean(per_sample_accuracy)
        kernelized_precision[j] = np.mean(per_sample_precision)
        kernelized_per_item_time[j] = kernelized_detailed_tsne_time[j] / len(picked_neighbors)
        logging.info("%s :\t%f\t%f", kernelized_method_list[j], kernelized_precision[j],
                     kernelized_accuracy[j])

    kernelized_kl = np.zeros((len(kernelized_method_list), len(picked_neighbors)))

    processed_indices = list()

    kl_kernelized_performance_file = generate_kernelized_kl_temp_filename(parameters)
    if os.path.isfile(kl_kernelized_performance_file):
        with open(kl_kernelized_performance_file, 'rb') as f:
            kernelized_kl, processed_indices = pickle.load(f)

    # ============================== KL divergence
    # KL-divergence increase for all 1000 samples is slow to compute; most of
    # the time goes into building the P-matrix for each sample.
    for i in range(len(picked_neighbors)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.x_neighbors_selection_parameter_set, parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'  # one P-matrix file per sample
        # Cached per sample, so interrupted runs can resume without recomputing every P-matrix.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, picked_neighbors[i, :].reshape((1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods.
        for j in range(len(kernelized_method_results)):
            # Append this method's embedding of sample i and recompute KL.
            new_Y = np.concatenate((Y_mnist, kernelized_method_results[j][i, :].reshape((1, -1))), axis=0)
            kernelized_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_kernelized_performance_file, 'wb') as f:
            pickle.dump((kernelized_kl, processed_indices), f)
    # Averaging the per-sample KL values is cheap compared to the loop above.
    kernelized_avg_kl = np.mean(kernelized_kl, axis=1)

    output_file = generate_kernelized_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((kernelized_method_list, kernelized_accuracy, kernelized_precision,
                     kernelized_avg_kl, kernelized_per_item_time, kernelized_distance_percentiles), f)
Example #5
def generate_idw_power_performance(*,
                                   regenerate=False,
                                   recursive_regenerate=False,
                                   parameters=settings.parameters):
    global_idw_power_performance = dict()  # Start from scratch
    global_idw_power_performance_abs = dict()  # Start from scratch
    global_idw_accuracy = dict()
    global_idw_precision = dict()

    start_time = datetime.datetime.now()
    logging.info("IDW power experiment started: %s", start_time)
    idw_power_performance_file = generate_idw_power_filename(parameters)
    idw_power_plot_file = generate_idw_power_plot_filename(parameters)

    X_mnist = generate_data.load_x_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    Y_mnist = generate_data.load_y_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(
        parameters=parameters)
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(
        parameters=parameters)
    labels_mnist = generate_data.load_labels_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    accuracy_nn = parameters.get("accuracy_nn",
                                 settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])

    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y)**2, axis=1)
        return np.argsort(y_distances)[:n]

    distance_matrix = distance.squareform(distance.pdist(X_mnist))
    np.fill_diagonal(distance_matrix,
                     np.inf)  # We are not interested in distance to itself
    nn_x_distance = np.min(distance_matrix, axis=1)  # symmetric matrix: either axis works
    radius_x = dict()
    for p in idw_percentile_options:
        radius_x[p] = np.percentile(nn_x_distance, p)
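    # radius_x[p] is the p-th percentile of nearest-neighbor distances in X. The
    # LION variant bounds its interpolation neighborhood with it; plain IDW below
    # uses all points, so radius_x appears to be computed only for parity.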

    if os.path.isfile(idw_power_performance_file) and not regenerate:
        with open(idw_power_performance_file, 'rb') as f:
            (global_idw_power_performance, global_idw_power_performance_abs,
             global_idw_accuracy, global_idw_precision) = pickle.load(f)
    else:
        logging.info("Regeneration requested")

    for p in idw_power_options:
        if p in global_idw_power_performance:
            logging.info("Loaded p %f", p)
            continue

        logging.info("Processing p %f", p)

        interpolator = dTSNE_mnist.generate_embedding_function(
            embedding_function_type='weighted-inverse-distance',
            function_kwargs={'power': p})

        per_sample_accuracy = np.zeros((len(picked_neighbors), ))
        per_sample_precision = np.zeros((len(picked_neighbors), ))

        for i in range(len(picked_neighbors)):
            expected_label = picked_neighbor_labels[i]
            result = interpolator(picked_neighbors[i], verbose=0)
            nn_indices = get_nearest_neighbors_in_y(result,
                                                    Y_mnist,
                                                    n=accuracy_nn)
            obtained_labels = labels_mnist[nn_indices]
            per_sample_accuracy[i] = sum(
                obtained_labels == expected_label) / len(obtained_labels)

            y = result
            x = picked_neighbors[i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x,
                                                      X_mnist,
                                                      n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y,
                                                      Y_mnist,
                                                      n=precision_nn)
            matching_indices = len(
                [k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = (matching_indices / precision_nn)
        cur_acc = np.mean(per_sample_accuracy)
        cur_prec = np.mean(per_sample_precision)

        # Leave-one-out check: reconstruct each training point's embedding by
        # IDW over all other points and accumulate the reconstruction error.
        y_sum_square_dist = 0.0
        y_sum_abs_dist = 0.0
        y_count = 0.0
        for i in range(len(X_mnist)):
            distances = distance_matrix[i, :]  # diagonal is np.inf, so point i is excluded
            neighbor_indices = list(range(X_mnist.shape[0]))
            neighbor_indices.remove(i)
            weights = 1 / distances[neighbor_indices]**p
            weights = weights / np.sum(weights)
            cur_y_result = weights.dot(Y_mnist[neighbor_indices, :])
            y_sum_square_dist += np.sum((cur_y_result - Y_mnist[i, :])**2)
            y_sum_abs_dist += np.sqrt(np.sum((cur_y_result - Y_mnist[i, :])**2))
            y_count += 1.0

        global_idw_power_performance[p] = y_sum_square_dist / y_count
        global_idw_power_performance_abs[p] = y_sum_abs_dist / y_count
        global_idw_accuracy[p] = cur_acc
        global_idw_precision[p] = cur_prec

        # Just in case it will become unstable due to too few neighbors
        # lion_power_plot_data[(p, perc)]['PowerSquareDistSum'] = y_sum_square_dist
        # lion_power_plot_data[(p, perc)]['PowerSquareDistCount'] = y_count

        with open(idw_power_performance_file, 'wb') as f:
            pickle.dump((global_idw_power_performance,
                         global_idw_power_performance_abs, global_idw_accuracy,
                         global_idw_precision), f)

    EPS = 1e-5
    y = list()
    x_global = list()
    for cur_power in idw_power_options:
        closest_power = [
            i for i in global_idw_power_performance_abs
            if np.abs(i - cur_power) < EPS
        ]
        if len(closest_power) > 0:
            x_global.append(cur_power)
            y.append(global_idw_power_performance[closest_power[0]])
    idw_optimal_power = x_global[np.argmin(y)]

    with open(idw_power_plot_file, 'wb') as f:
        pickle.dump((x_global, y, idw_optimal_power), f)
    logging.info("IDW optimal power: %f", idw_optimal_power)

    end_time = datetime.datetime.now()
    logging.info("IDW power experiment ended: %s", end_time)
    logging.info("IDW power experiment duration: %s", end_time - start_time)
Example #6
def main(parameters=settings.parameters, regenerate_parameters_cache=False):
    step = 0.01
    choice_K = np.arange(step, 2 + step, step)  # Candidate K values: 0.01, 0.02, ..., 2.00

    logging.info("Started loading.")
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(
        parameters=parameters)
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(
        parameters=parameters)
    accuracy_nn = parameters.get("accuracy_nn",
                                 settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)
    baseline_accuracy = generate_data.get_baseline_accuracy(
        parameters=parameters)
    logging.info("Loaded everything.")

    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Distance from each embedded point to its nearest neighbor...
    np.fill_diagonal(D_Y, np.inf)  # ...excluding the point itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # D_Y is symmetric, so either axis works

    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y)**2, axis=1)
        return np.argsort(y_distances)[:n]

    # Brute-force implementation: not the fastest, but the most reliable.

    kernel_tsne_mapping = kernelized_tsne.generate_kernelized_tsne_mapping_function(
        parameters=parameters,
        regenerate_parameters_cache=regenerate_parameters_cache)

    kernelized_detailed_tsne_method_list = [
        "Kernelized tSNE; K=%.2f" % (k) for k in choice_K
    ]
    kernelized_detailed_tsne_method_results = list()

    kernelized_detailed_tsne_accuracy = np.zeros(
        (len(kernelized_detailed_tsne_method_list), ))
    kernelized_detailed_tsne_precision = np.zeros(
        (len(kernelized_detailed_tsne_method_list), ))
    kernelized_detailed_tsne_time = np.zeros(
        (len(kernelized_detailed_tsne_method_list), ))

    for j in range(len(choice_K)):
        k = choice_K[j]
        logging.info("%f", k)

        embedder_start_time = datetime.datetime.now()
        kernelized_detailed_tsne_method_results.append(
            kernel_tsne_mapping(picked_neighbors, k=k))
        embedder_end_time = datetime.datetime.now()
        kernelized_detailed_tsne_time[j] = (
            embedder_end_time - embedder_start_time).total_seconds()
        logging.info("%f complete", k)
        #kernelized_detailed_tsne_method_results = [kernel_tsne_mapping(picked_neighbors, k=k) for k in choice_K]

        logging.info("%s", kernelized_detailed_tsne_method_list[j])
        per_sample_accuracy = np.zeros((len(picked_neighbors), ))
        per_sample_precision = np.zeros((len(picked_neighbors), ))
        for i in range(len(picked_neighbors)):
            if i % 200 == 0:
                logging.info("%d", i)
            expected_label = picked_neighbor_labels[i]
            y = kernelized_detailed_tsne_method_results[j][i, :]
            x = picked_neighbors[i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x,
                                                      X_mnist,
                                                      n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y,
                                                      Y_mnist,
                                                      n=precision_nn)
            matching_indices = len(
                [m for m in nn_x_indices if m in nn_y_indices])  # avoid reusing k, the kernel parameter
            per_sample_precision[i] = (matching_indices / precision_nn)

            kernelized_indices = get_nearest_neighbors_in_y(
                kernelized_detailed_tsne_method_results[j][i, :],
                Y_mnist,
                n=accuracy_nn)
            obtained_labels = labels_mnist[kernelized_indices]
            per_sample_accuracy[i] = sum(
                obtained_labels == expected_label) / len(obtained_labels)
        kernelized_detailed_tsne_accuracy[j] = np.mean(per_sample_accuracy)
        kernelized_detailed_tsne_precision[j] = np.mean(per_sample_precision)
        logging.info("%s :\t%f\t%f\t%f s",
                     kernelized_detailed_tsne_method_list[j],
                     kernelized_detailed_tsne_precision[j],
                     kernelized_detailed_tsne_accuracy[j],
                     kernelized_detailed_tsne_time[j])

    # Accuracy-vs-K plot
    legend_list = list()
    f, ax = plt.subplots()
    f.set_size_inches(6, 3)
    x = list(choice_K)  # choice_K is already in ascending order
    y = kernelized_detailed_tsne_accuracy
    # plt.title("IDW - Accuracy vs Power") # We'd better use figure caption
    # ax.legend([h1,h2,h3,h4,h5,h6], ["Closest Training Set Image"]+idw_method_list)
    plt.plot(x, y, c='blue')
    h = plt.axhline(y=baseline_accuracy, c='black', linestyle='--')
    plt.legend([h], ["Baseline Accuracy (%.4f)" % baseline_accuracy])
    plt.xlabel("Kernelized tSNE: K parameter")
    plt.ylabel("10-NN Accuracy")
    plt.ylim([0, 1])
    plt.xlim([0, 2])
    f.tight_layout()
    plt.savefig("../figures/kernelized-tsne-K-vs-accuracy.png")

    ind = [4, 24, 49]
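    # choice_K[i] = (i + 1) * 0.01, so indices 4, 24 and 49 select K = 0.05, 0.25 and 0.50.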
    kernelized_tsne_method_list = [
        kernelized_detailed_tsne_method_list[i][:10] +
        kernelized_detailed_tsne_method_list[i][-8:] for i in ind
    ]
    kernelized_tsne_method_results = [
        kernelized_detailed_tsne_method_results[i] for i in ind
    ]

    kernelized_tsne_nearest_neighbors_percentiles_matrix = np.zeros(
        (len(picked_neighbors), len(kernelized_tsne_method_list)))
    for i in range(len(picked_neighbors)):
        for j in range(len(kernelized_tsne_method_list)):
            y = kernelized_tsne_method_results[j][i, :]
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y)**2, axis=1)))
            kernelized_tsne_nearest_neighbors_percentiles_matrix[
                i, j] = stats.percentileofscore(nearest_neighbors_y_dist,
                                                nn_dist)
    kernelized_tsne_distance_percentiles = np.mean(
        kernelized_tsne_nearest_neighbors_percentiles_matrix, axis=0)
    for j in range(len(kernelized_tsne_method_list)):
        logging.info("%s %f", kernelized_tsne_method_list[j],
                     kernelized_tsne_distance_percentiles[j])

    output_file = generate_cluster_results_filename(parameters)
    with open(output_file, 'wb') as f:
        pickle.dump(
            (kernelized_detailed_tsne_method_results,
             kernelized_detailed_tsne_accuracy,
             kernelized_detailed_tsne_precision, kernelized_detailed_tsne_time,
             kernelized_detailed_tsne_method_list), f)
Example #7
def generate_lion_power_performance(*,
                                    regenerate=False,
                                    recursive_regenerate=False,
                                    parameters=settings.parameters):
    start_time = datetime.datetime.now()
    logging.info("LION power experiment started: %s", start_time)

    accuracy_nn = parameters.get("accuracy_nn",
                                 settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])

    lion_power_performance_data_file = generate_lion_power_performance_filename(
        parameters)
    lion_power_plot_data_file = generate_lion_power_plot_filename(parameters)

    lion_power_performance_data = dict()  # Start from scratch

    X_mnist = generate_data.load_x_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    Y_mnist = generate_data.load_y_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    labels_mnist = generate_data.load_labels_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)

    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y)**2, axis=1)
        return np.argsort(y_distances)[:n]

    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(
        parameters=parameters)
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(
        parameters=parameters)

    distance_matrix = distance.squareform(distance.pdist(X_mnist))
    np.fill_diagonal(distance_matrix,
                     np.inf)  # We are not interested in distance to itself
    nn_x_distance = np.min(distance_matrix, axis=1)  # symmetric matrix: either axis works
    radius_x = dict()
    for p in lion_percentile_options:
        radius_x[p] = np.percentile(nn_x_distance, p)
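    # radius_x[perc] bounds the LION interpolation neighborhood: only training
    # points within this radius of a query contribute to its IDW estimate below.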
    logging.info("Radius X: %s", radius_x)

    if os.path.isfile(lion_power_performance_data_file) and not regenerate:
        with open(lion_power_performance_data_file, 'rb') as f:
            lion_power_performance_data = pickle.load(f)

    for perc in lion_percentile_options:
        for p in lion_power_options:
            logging.info("Processing percentile and power: %f, %d", p, perc)
            key = str(perc) + ";" + "%.3f" % (p)
            logging.info("Key: %s", key)
            if key not in lion_power_performance_data:
                lion_power_performance_data[key] = dict()

            if 'Accuracy' not in lion_power_performance_data[key]:
                logging.info(
                    "Accuracy not found for power %f percentile %d. \tCalculating...",
                    p, perc)
                interpolator = dTSNE_mnist.generate_lion_tsne_embedder(
                    verbose=0,
                    random_state=0,
                    function_kwargs={
                        'radius_x_percentile': perc,
                        'power': p
                    })

                per_sample_accuracy = np.zeros((len(picked_neighbors), ))
                per_sample_precision = np.zeros((len(picked_neighbors), ))

                for i in range(len(picked_neighbors)):
                    # if i%100==0:
                    #    print("\tPower: ",p,"Processing:",i)
                    expected_label = picked_neighbor_labels[i]
                    result = interpolator(picked_neighbors[i], verbose=0)
                    nn_indices = get_nearest_neighbors_in_y(result,
                                                            Y_mnist,
                                                            n=accuracy_nn)
                    obtained_labels = labels_mnist[nn_indices]
                    per_sample_accuracy[i] = sum(
                        obtained_labels == expected_label) / len(
                            obtained_labels)

                    y = result
                    x = picked_neighbors[i, :]
                    nn_x_indices = get_nearest_neighbors_in_y(x,
                                                              X_mnist,
                                                              n=precision_nn)
                    nn_y_indices = get_nearest_neighbors_in_y(y,
                                                              Y_mnist,
                                                              n=precision_nn)
                    matching_indices = len(
                        [k for k in nn_x_indices if k in nn_y_indices])
                    per_sample_precision[i] = (matching_indices / precision_nn)

                cur_acc = np.mean(per_sample_accuracy)
                cur_prec = np.mean(per_sample_precision)
                # print('================= ',p,perc, cur_acc)
                lion_power_performance_data[key]['Accuracy'] = cur_acc
                lion_power_performance_data[key]['Precision'] = cur_prec
                with open(lion_power_performance_data_file, 'wb') as f:
                    pickle.dump(lion_power_performance_data, f)
            else:
                logging.info(
                    "Accuracy FOUND for power %f percentile %d. Using loaded.",
                    p, perc)

            if 'PowerSquareDist' not in lion_power_performance_data[
                    key] or regenerate:
                logging.info(
                    "Power performance not found for power %f percentile %d.\tCalculating...",
                    p, perc)

                y_sum_square_dist = 0.0
                y_sum_abs_dist = 0.0
                y_count = 0.0
                for i in range(len(X_mnist)):
                    distances = distance_matrix[i, :].copy()
                    distances[i] = np.inf  # Not interested in distance to itself
                    # Find the neighbors within the LION radius.
                    neighbor_indices = np.where(distances <= radius_x[perc])[0]
                    num_neighbors = len(neighbor_indices)
                    if num_neighbors >= 2:  # Need at least 2 neighbors to interpolate
                        weights = 1 / distances[neighbor_indices]**p
                        weights = weights / np.sum(weights)
                        cur_y_result = weights.dot(
                            Y_mnist[neighbor_indices, :])
                        y_sum_square_dist += np.sum(
                            (cur_y_result - Y_mnist[i, :])**2)
                        y_sum_abs_dist += np.sqrt(
                            np.sum((cur_y_result - Y_mnist[i, :])**2))
                        y_count += 1.0
                new_dict = dict()
                new_dict['PowerSquareDist'] = y_sum_square_dist / y_count
                new_dict['PowerAbsDist'] = y_sum_abs_dist / y_count
                # Just in case it will become unstable due to too few neighbors
                new_dict['PowerSquareDistSum'] = y_sum_square_dist
                new_dict['PowerSquareDistCount'] = y_count
                lion_power_performance_data[key].update(new_dict)

                with open(lion_power_performance_data_file, 'wb') as f:
                    pickle.dump(lion_power_performance_data, f)
            else:
                logging.info(
                    "Power FOUND for power %f percentile %d. Using loaded.", p,
                    perc)

            logging.info("%s %s", key, lion_power_performance_data[key])

    lion_optimal_power = dict()
    lion_power_plot_y = dict()
    for perc in lion_percentile_options:
        y = list()
        for cur_power in lion_power_options:
            key = str(perc) + ";%.3f" % (cur_power)
            # print(cur_power, perc, lion_power_plot_data[key])
            y.append(lion_power_performance_data[key]['PowerSquareDist'])
        lion_power_plot_y[perc] = y
        lion_optimal_power[perc] = lion_power_options[np.argmin(y)]

    with open(lion_power_plot_data_file, 'wb') as f:
        pickle.dump(
            (lion_power_options, lion_power_plot_y, lion_optimal_power), f)
    logging.info("LION optimal power: %s", lion_optimal_power)

    end_time = datetime.datetime.now()
    logging.info("LION power experiment ended: %s", end_time)
    logging.info("LION power experiment duration: %s", end_time - start_time)
import settings
import os
import logging

import generate_data
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

regenerate = False
logging.basicConfig(level=logging.INFO)

for i in ['full']:
    fname = '../figures/PCA_and_tSNE/mnist_tsne_original'+str(i)+'.png'
    if os.path.isfile(fname):
        logging.info("%s exists", fname)
        continue

    parameters = settings.parameters.copy()
    parameters["pca_random_seed"] = i
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)

    plt.figure(dpi=300)
    font_properties = FontProperties()
    font_properties.set_family('serif')
    font_properties.set_name('Times New Roman')
    font_properties.set_size(9)

    plt.gcf().set_size_inches(3.3, 3.3)  # Set a figure size that just fits the paper margins
    legend_list = list()
    for l in sorted(set(labels_mnist)):
        plt.scatter(Y_mnist[labels_mnist == l, 0],
                    Y_mnist[labels_mnist == l, 1],
                    marker='.', s=5)
        legend_list.append(str(l))
    #plt.title("MNIST Dataset - TSNE visualization")
    #plt.tight_layout()
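    # Assumed completion, mirroring the earlier figure scripts: draw the legend
    # and save under the file name checked at the top of the loop.
    plt.legend(legend_list, markerscale=8, prop=font_properties)
    plt.savefig(fname)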