def calc_kl(*, common_info, embedded_neighbors, parameters):
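    """Compute the mean KL divergence after adding each letter sample.

    For every sample, the extended P-matrix (training set plus that one
    sample) is computed once and cached on disk, then reused to score the
    supplied embedded positions.
    """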
    dTSNE_mnist = common_info["dTSNE_mnist"]
    X_mnist = common_info["X_mnist"]
    Y_mnist = common_info["Y_mnist"]
    letter_samples = common_info["letter_samples"]
    per_sample_kl_divergences = list()
    for j in range(len(letter_samples)):
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.letter_parameter_set,
            parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(j) + '.p'
        # Don't store those matrices in a single file. Way too large.

        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            if j % 50 == 0:
                logging.info("\t%d P-matrix file found. Loading.", j)
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            if j % 50 == 0:
                logging.info(
                    "\t%d P-matrix file not found. Creating and saving.", j)
            new_X = np.concatenate((X_mnist, letter_samples[j, :].reshape(
                (1, -1))),
                                   axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            if not os.path.isdir(distance_matrix_dir):
                logging.info('Creating directory: %s', distance_matrix_dir)
                os.mkdir(distance_matrix_dir)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # Each P-matrix lives in its own file. Now use it to calculate the
        # KL divergence.
        new_Y = np.concatenate((Y_mnist, embedded_neighbors[j, :].reshape(
            (1, -1))),
                               axis=0)
        kl, _ = lion_tsne.kl_divergence_and_gradient(p_matrix=new_P, y=new_Y)
        per_sample_kl_divergences.append(kl)
    return np.mean(per_sample_kl_divergences)
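

# A minimal usage sketch for calc_kl; the loader calls follow conventions used
# elsewhere in this file, and the zero-filled embedding is a stand-in for
# whatever method is under evaluation:
def example_calc_kl_usage(parameters=settings.parameters):
    letter_samples, _, _ = generate_data.load_letters(parameters=parameters)
    common_info = {
        "dTSNE_mnist": generate_data.load_dtsne_mnist(parameters=parameters),
        "X_mnist": generate_data.load_x_mnist(parameters=parameters),
        "Y_mnist": generate_data.load_y_mnist(parameters=parameters),
        "letter_samples": letter_samples,
    }
    # Placeholder 2D embeddings, one row per letter sample.
    embedded_neighbors = np.zeros((len(letter_samples), 2))
    return calc_kl(common_info=common_info,
                   embedded_neighbors=embedded_neighbors,
                   parameters=parameters)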

# ================= Example 2 =================

def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(
        parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    accuracy_nn = parameters.get("accuracy_nn",
                                 settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(
        parameters=parameters)
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)

    # =================== ACCURACY
    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y)**2, axis=1)
        return np.argsort(y_distances)[:n]
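    # Illustrative: with Y_mnist = [[0, 0], [1, 1], [5, 5]] and y = [0.9, 0.9],
    # get_nearest_neighbors_in_y(y, Y_mnist, n=2) returns [1, 0] -- the row
    # indices of the two closest points. Squared distances suffice for
    # ranking, so no sqrt is taken.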

    gd_method_list = [
        r'Closest $Y_{init}$', r'Random $Y_{init}$',
        r'Closest $Y_{init}$; new $\sigma$',
        r'Random $Y_{init}$; new $\sigma$', r'Closest $Y_{init}$; EE',
        r'Random $Y_{init}$; EE', r'Closest $Y_{init}$; new $\sigma$; EE',
        r'Random $Y_{init}$; new $\sigma$; EE'
    ]

    gd_results_file = exp_cluster_attr_test_GD.generate_cluster_results_filename(
        parameters=parameters)
    with open(gd_results_file, 'rb') as f:
        (picked_neighbors_y_gd_transformed,
         picked_neighbors_y_gd_variance_recalc_transformed,
         picked_neighbors_y_gd_transformed_random,
         picked_neighbors_y_gd_variance_recalc_transformed_random,
         picked_neighbors_y_gd_early_exagg_transformed_random,
         picked_neighbors_y_gd_early_exagg_transformed,
         picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
         picked_random_starting_positions,
         picked_neighbors_y_gd_variance_recalc_early_exagg_transformed,
         covered_samples) = pickle.load(f)

    gd_method_results = [
        picked_neighbors_y_gd_transformed,
        picked_neighbors_y_gd_transformed_random,
        picked_neighbors_y_gd_variance_recalc_transformed,
        picked_neighbors_y_gd_variance_recalc_transformed_random,
        picked_neighbors_y_gd_early_exagg_transformed,
        picked_neighbors_y_gd_early_exagg_transformed_random,
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed,
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
    ]

    input_time_file = exp_cluster_attr_test_GD.generate_time_results_filename(
        parameters)
    with open(input_time_file, 'rb') as f:
        (picked_neighbors_y_time_gd_transformed,
         picked_neighbors_y_time_gd_variance_recalc_transformed,
         picked_neighbors_y_time_gd_transformed_random,
         picked_neighbors_y_time_gd_variance_recalc_transformed_random,
         picked_neighbors_y_time_gd_early_exagg_transformed_random,
         picked_neighbors_y_time_gd_early_exagg_transformed,
         picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random,
         picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed,
         covered_samples) = pickle.load(f)

    gd_time = [
        np.mean(picked_neighbors_y_time_gd_transformed),
        np.mean(picked_neighbors_y_time_gd_transformed_random),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_transformed),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_transformed_random),
        np.mean(picked_neighbors_y_time_gd_early_exagg_transformed),
        np.mean(picked_neighbors_y_time_gd_early_exagg_transformed_random),
        np.mean(
            picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed
        ),
        np.mean(
            picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random
        ),
    ]

    gd_accuracy = np.zeros((len(gd_method_list),))
    gd_precision = np.zeros((len(gd_method_list),))

    # ============================== Distance percentiles
    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # symmetric, so either axis works
    gd_nearest_neighbors_percentiles_matrix = np.zeros(
        (len(picked_neighbors), len(gd_method_list)))
    for i in range(len(picked_neighbors)):
        for j in range(len(gd_method_list)):
            y = gd_method_results[j][i, :]
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y)**2, axis=1)))
            gd_nearest_neighbors_percentiles_matrix[i, j] = stats.percentileofscore(
                nearest_neighbors_y_dist, nn_dist)
    gd_distance_percentiles = np.mean(gd_nearest_neighbors_percentiles_matrix,
                                      axis=0)
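    # stats.percentileofscore(a, score) reports the percentage of values in
    # `a` at or below `score`, e.g. stats.percentileofscore([1, 2, 3, 4], 2.5)
    # returns 50.0. A mean percentile near 100 means mapped points land much
    # farther from their nearest neighbors than is typical in the training
    # embedding.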
    for j in range(len(gd_method_list)):
        logging.info("%s :\t%f", gd_method_list[j], gd_distance_percentiles[j])

    # ============================== Accuracy and precision
    for j in range(len(gd_method_results)):
        per_sample_accuracy = np.zeros((len(picked_neighbors), ))
        per_sample_precision = np.zeros((len(picked_neighbors), ))
        for i in range(len(picked_neighbors)):
            expected_label = picked_neighbor_labels[i]
            nn_indices = get_nearest_neighbors_in_y(gd_method_results[j][i, :],
                                                    Y_mnist,
                                                    n=accuracy_nn)
            obtained_labels = labels_mnist[nn_indices]
            per_sample_accuracy[i] = sum(
                obtained_labels == expected_label) / len(obtained_labels)

            x = picked_neighbors[i, :]
            y = gd_method_results[j][i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x,
                                                      X_mnist,
                                                      n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y,
                                                      Y_mnist,
                                                      n=precision_nn)
            matching_indices = len(
                [k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = (matching_indices / precision_nn)

        gd_accuracy[j] = np.mean(per_sample_accuracy)
        gd_precision[j] = np.mean(per_sample_precision)
        logging.info("%s :\t%f\t%f", gd_method_list[j], gd_precision[j],
                     gd_accuracy[j])
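    # Note on the metrics above: accuracy is the label agreement among the
    # `accuracy_nn` nearest embedded neighbors, while precision is k-NN
    # neighborhood preservation -- the fraction of the `precision_nn` nearest
    # neighbors in X-space that stay among the `precision_nn` nearest
    # neighbors in the embedding.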

    # ============================== KL divergence
    gd_kl = np.zeros((len(gd_method_list), len(picked_neighbors)))

    processed_indices = list()

    kl_gd_performance_file = generate_gd_kl_temp_filename(parameters)
    if os.path.isfile(kl_gd_performance_file):
        with open(kl_gd_performance_file, 'rb') as f:
            gd_kl, processed_indices = pickle.load(f)

    # Computing the KL divergence increase for all 1000 samples is very slow;
    # most of the time goes into building the extended P-matrix, so P-matrices
    # are cached on disk and per-sample results are checkpointed.
    for i in range(len(picked_neighbors)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set
            | settings.x_neighbors_selection_parameter_set, parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, picked_neighbors[i, :].reshape(
                (1, -1))),
                                   axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods.
        for j in range(len(gd_method_results)):
            # Single file with p matrix
            new_Y = np.concatenate(
                (Y_mnist, gd_method_results[j][i, :].reshape((1, -1))), axis=0)
            gd_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(
                p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_gd_performance_file, 'wb') as f:
            pickle.dump((gd_kl, processed_indices), f)
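    # The loop above checkpoints (gd_kl, processed_indices) after every
    # sample, so an interrupted run resumes from the temp file instead of
    # recomputing the expensive P-matrices.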
    # This should be fast
    gd_avg_kl = np.mean(gd_kl, axis=1)

    output_file = generate_gd_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((gd_method_list, gd_accuracy, gd_precision, gd_time,
                     gd_avg_kl, gd_distance_percentiles), f)
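

# A minimal sketch, assuming the pickle layout matches the dump above, of how
# a reporting step might read the postprocessed gradient-descent results back:
def load_gd_postprocess_results(parameters=settings.parameters):
    with open(generate_gd_postprocess_filename(parameters), 'rb') as f:
        (gd_method_list, gd_accuracy, gd_precision, gd_time,
         gd_avg_kl, gd_distance_percentiles) = pickle.load(f)
    for j in range(len(gd_method_list)):
        logging.info("%s: precision=%f accuracy=%f KL=%f", gd_method_list[j],
                     gd_precision[j], gd_accuracy[j], gd_avg_kl[j])
    return (gd_method_list, gd_accuracy, gd_precision, gd_time, gd_avg_kl,
            gd_distance_percentiles)
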
def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    accuracy_nn = parameters.get("accuracy_nn", settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn", settings.parameters["precision_nn"])
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(parameters=parameters)
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)

    # =================== Setup and helpers
    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y) ** 2, axis=1)
        return np.argsort(y_distances)[:n]

    kernelized_results_file = exp_cluster_attr_test_kernelized.generate_cluster_results_filename(parameters)
    with open(kernelized_results_file, 'rb') as f:
        (kernelized_detailed_tsne_method_results,
         kernelized_detailed_tsne_accuracy, kernelized_detailed_tsne_precision,
         kernelized_detailed_tsne_time,
         kernelized_detailed_tsne_method_list) = pickle.load(f)
    ind = [4, 24, 49]
    kernelized_method_list = [
        kernelized_detailed_tsne_method_list[i][:10] + kernelized_detailed_tsne_method_list[i][-8:]
        for i in ind]
    kernelized_method_results = [kernelized_detailed_tsne_method_results[i] for i in ind]

    kernelized_accuracy = np.zeros((len(kernelized_method_list),))
    kernelized_precision = np.zeros((len(kernelized_method_list),))
    kernelized_per_item_time = np.zeros((len(kernelized_method_list),))

    # ============================== Distance percentiles
    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # symmetric, so either axis works
    kernelized_nearest_neighbors_percentiles_matrix = np.zeros((len(picked_neighbors), len(kernelized_method_list)))
    for i in range(len(picked_neighbors)):
        for j in range(len(kernelized_method_list)):
            y = kernelized_method_results[j][i, :]
            kernelized_dist = np.min(np.sqrt(np.sum((Y_mnist - y) ** 2, axis=1)))
            kernelized_nearest_neighbors_percentiles_matrix[i, j] = stats.percentileofscore(nearest_neighbors_y_dist,
                                                                                            kernelized_dist)
    kernelized_distance_percentiles = np.mean(kernelized_nearest_neighbors_percentiles_matrix, axis=0)
    for j in range(len(kernelized_method_list)):
        logging.info("%s %f", kernelized_method_list[j], kernelized_distance_percentiles[j])

    # ============================== Accuracy and precision
    for j in range(len(kernelized_method_results)):
        per_sample_accuracy = np.zeros((len(picked_neighbors),))
        per_sample_precision = np.zeros((len(picked_neighbors),))
        for i in range(len(picked_neighbors)):
            expected_label = picked_neighbor_labels[i]

            y = kernelized_method_results[j][i,:]
            x = picked_neighbors[i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x, X_mnist, n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y, Y_mnist, n=precision_nn)
            matching_indices = len([k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = (matching_indices / precision_nn)

            kernelized_indices = get_nearest_neighbors_in_y(kernelized_method_results[j][i,:], Y_mnist, n=accuracy_nn)
            obtained_labels = labels_mnist[kernelized_indices]
            per_sample_accuracy[i] = sum(obtained_labels==expected_label) / len(obtained_labels)
        kernelized_accuracy[j] = np.mean(per_sample_accuracy)
        kernelized_precision[j] = np.mean(per_sample_precision)
        kernelized_per_item_time[j] = kernelized_detailed_tsne_time[j] / len(picked_neighbors)
        logging.info("%s :\t%f\t%f", kernelized_method_list[j], kernelized_precision[j],
                     kernelized_accuracy[j])

    kernelized_kl = np.zeros((len(kernelized_method_list), len(picked_neighbors)))

    processed_indices = list()

    kl_kernelized_performance_file = generate_kernelized_kl_temp_filename(parameters)
    if os.path.isfile(kl_kernelized_performance_file):
        with open(kl_kernelized_performance_file, 'rb') as f:
            kernelized_kl, processed_indices = pickle.load(f)

    # ============================== KL divergence
    # Computing the KL divergence increase for all 1000 samples is very slow;
    # most of the time goes into building the extended P-matrix, so results
    # are cached and checkpointed, as above.
    for i in range(len(picked_neighbors)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.x_neighbors_selection_parameter_set, parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, picked_neighbors[i, :].reshape((1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods.
        for j in range(len(kernelized_method_results)):
            # Single file with p matrix
            new_Y = np.concatenate((Y_mnist, kernelized_method_results[j][i, :].reshape((1, -1))), axis=0)
            kernelized_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_kernelized_performance_file, 'wb') as f:
            pickle.dump((kernelized_kl, processed_indices), f)
    # This should be fast
    kernelized_avg_kl = np.mean(kernelized_kl, axis=1)

    output_file = generate_kernelized_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((kernelized_method_list, kernelized_accuracy,
                     kernelized_precision, kernelized_avg_kl,
                     kernelized_per_item_time,
                     kernelized_distance_percentiles), f)


# The def line of this helper was lost from the excerpt; the name below is an
# assumption. It formats a per-item time (in seconds) as a milliseconds table
# cell, reporting only a lower bound for very slow methods.
def format_time_ms(time_s):
    if time_s > 100:
        return ">$10^5$"
    if time_s > 10:
        return ">$10^4$"
    if time_s > 1:
        return ">$10^3$"
    return "%2.2f" % (time_s * 1000)


# ================= GENERATING THE TEX TABLE ==========================

print("\n\nTABLE FOR COPY-PASTING TO LATEX\n\n\n")

s = ""

initial_kl_divergence, _ = lion_tsne.kl_divergence_and_gradient(
    y=dTSNE_mnist.Y, p_matrix=dTSNE_mnist.P_matrix)

s += '''\\begin{table} \\small\\sf\\centering \\caption{Letters placement test: methods comparison.
    Original KL divergence of the dataset is %.5f}  \\label{tab_letter_methods_comparison}
    \\begin{tabular}{ m{0.19\\textwidth}  m{0.07\\textwidth}  m{0.07\\textwidth}  m{0.06\\textwidth} }
        \\toprule
            \\textbf{Method}
            & \\textbf{Distance Perc-le}
            & \\textbf{KL Div.}
            & \\textbf{Time (ms)}
        \\\\ \\midrule''' % (initial_kl_divergence)

s += '\\multicolumn{4}{c}{\\textbf{RBF Interpolation}}\n'
s += '\t\\\\\n'

for j in range(len(rbf_method_list)):
    ...  # per-method row generation truncated in this excerpt
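
# A hedged sketch of what the truncated loop above plausibly appended, matching
# the four table columns; rbf_distance_percentiles, rbf_avg_letters_kl and
# rbf_time are assumed names:
# for j in range(len(rbf_method_list)):
#     s += "%s & %.2f & %.4f & %s \\\\\n" % (
#         rbf_method_list[j], rbf_distance_percentiles[j],
#         rbf_avg_letters_kl[j], format_time_ms(rbf_time[j]))
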
def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    letter_samples, _, _ = generate_data.load_letters(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)

    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # symmetric, so either axis works

    # ============== KL Divergence
    gd_method_list = [
        r'Closest $Y_{init}$', r'Random $Y_{init}$',
        r'Closest $Y_{init}$; new $\sigma$',
        r'Random $Y_{init}$; new $\sigma$', r'Closest $Y_{init}$; EE',
        r'Random $Y_{init}$; EE', r'Closest $Y_{init}$; new $\sigma$; EE',
        r'Random $Y_{init}$; new $\sigma$; EE'
    ]

    gd_results_file = exp_letter_test_GD.generate_letter_results_filename(
        parameters=parameters)
    with open(gd_results_file, 'rb') as f:
        (letters_y_gd_transformed, letters_y_gd_variance_recalc_transformed,
         letters_y_gd_transformed_random,
         letters_y_gd_variance_recalc_transformed_random,
         letters_y_gd_early_exagg_transformed_random,
         letters_y_gd_early_exagg_transformed,
         letters_y_gd_variance_recalc_early_exagg_transformed_random,
         picked_random_starting_positions,
         letters_y_gd_variance_recalc_early_exagg_transformed,
         covered_samples) = pickle.load(f)

    gd_letters_results = [
        letters_y_gd_transformed,
        letters_y_gd_transformed_random,
        letters_y_gd_variance_recalc_transformed,
        letters_y_gd_variance_recalc_transformed_random,
        letters_y_gd_early_exagg_transformed,
        letters_y_gd_early_exagg_transformed_random,
        letters_y_gd_variance_recalc_early_exagg_transformed,
        letters_y_gd_variance_recalc_early_exagg_transformed_random,
    ]

    input_time_file = exp_letter_test_GD.generate_time_results_filename(
        parameters)
    with open(input_time_file, 'rb') as f:
        (letters_y_time_gd_transformed,
         letters_y_time_gd_variance_recalc_transformed,
         letters_y_time_gd_transformed_random,
         letters_y_time_gd_variance_recalc_transformed_random,
         letters_y_time_gd_early_exagg_transformed_random,
         letters_y_time_gd_early_exagg_transformed,
         letters_y_time_gd_variance_recalc_early_exagg_transformed_random,
         letters_y_time_gd_variance_recalc_early_exagg_transformed,
         covered_samples) = pickle.load(f)

    gd_time = [
        np.mean(letters_y_time_gd_transformed),
        np.mean(letters_y_time_gd_transformed_random),
        np.mean(letters_y_time_gd_variance_recalc_transformed),
        np.mean(letters_y_time_gd_variance_recalc_transformed_random),
        np.mean(letters_y_time_gd_early_exagg_transformed),
        np.mean(letters_y_time_gd_early_exagg_transformed_random),
        np.mean(letters_y_time_gd_variance_recalc_early_exagg_transformed),
        np.mean(
            letters_y_time_gd_variance_recalc_early_exagg_transformed_random),
    ]

    gd_letters_kl = np.zeros((len(gd_method_list), len(letter_samples)))

    processed_indices = list()

    kl_gd_letters_performance_file = generate_gd_kl_temp_filename(parameters)
    if os.path.isfile(kl_gd_letters_performance_file):
        with open(kl_gd_letters_performance_file, 'rb') as f:
            gd_letters_kl, processed_indices = pickle.load(f)

    # Computing the KL divergence increase for all 1000 samples is very slow;
    # most of the time goes into building the extended P-matrix, so results
    # are cached and checkpointed.
    for i in range(len(letter_samples)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.letter_parameter_set,
            parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, letter_samples[i, :].reshape(
                (1, -1))),
                                   axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods.
        for j in range(len(gd_letters_results)):
            # Single file with p matrix
            new_Y = np.concatenate(
                (Y_mnist, gd_letters_results[j][i, :].reshape((1, -1))),
                axis=0)
            gd_letters_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(
                p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_gd_letters_performance_file, 'wb') as f:
            pickle.dump((gd_letters_kl, processed_indices), f)
    # This should be fast
    gd_avg_letters_kl = np.mean(gd_letters_kl, axis=1)

    # ============== Distance percentiles
    gd_letters_percentiles_matrix = np.zeros(
        (len(letter_samples), len(gd_method_list)))
    gd_letters_distance_matrix = np.zeros(
        (len(letter_samples), len(gd_method_list)))
    for i in range(len(letter_samples)):
        for j in range(len(gd_method_list)):
            y = gd_letters_results[j][i, :]
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y)**2, axis=1)))
            gd_letters_distance_matrix[i, j] = nn_dist
            gd_letters_percentiles_matrix[i, j] = stats.percentileofscore(
                nearest_neighbors_y_dist, nn_dist)
    gd_letters_distance_percentiles = np.mean(gd_letters_percentiles_matrix,
                                              axis=0)
    gd_letters_distances = np.mean(gd_letters_distance_matrix, axis=0)
    for j in range(len(gd_method_list)):
        logging.info("%s: %f, %f", gd_method_list[j], gd_letters_distances[j],
                     gd_letters_distance_percentiles[j])

    output_file = generate_gd_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((gd_method_list, gd_time, gd_avg_letters_kl,
                     gd_letters_distance_percentiles), f)
def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)

    outlier_samples, _ = generate_data.load_outliers(parameters=parameters)

    nn_results_file = exp_outlier_test_NN.generate_outlier_results_filename(
        parameters)
    with open(nn_results_file, 'rb') as f:
        nn_outliers_results, nn_models_orig, nn_method_list = pickle.load(f)

    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # symmetric, so either axis works

    # ================ KL DIVERGENCE ===================
    nn_outliers_kl = np.zeros((len(nn_method_list), len(outlier_samples)))

    processed_indices = list()

    kl_nn_outliers_performance_file = generate_nn_kl_temp_filename(parameters)
    if os.path.isfile(kl_nn_outliers_performance_file):
        with open(kl_nn_outliers_performance_file, 'rb') as f:
            nn_outliers_kl, processed_indices = pickle.load(f)

    # Computing the KL divergence increase for all 1000 samples is very slow;
    # most of the time goes into building the extended P-matrix, so results
    # are cached and checkpointed.
    for i in range(len(outlier_samples)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.outlier_parameter_set,
            parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, outlier_samples[i, :].reshape(
                (1, -1))),
                                   axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods.
        for j in range(len(nn_outliers_results)):
            # Single file with p matrix
            new_Y = np.concatenate(
                (nn_models_orig[j], nn_outliers_results[j][i, :].reshape(
                    (1, -1))),
                axis=0)
            nn_outliers_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(
                p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_nn_outliers_performance_file, 'wb') as f:
            pickle.dump((nn_outliers_kl, processed_indices), f)
    # This should be fast
    nn_avg_outliers_kl = np.mean(nn_outliers_kl, axis=1)

    # ================ DISTANCE MATRICES ===================
    nn_outliers_percentiles_matrix = np.zeros(
        (len(outlier_samples), len(nn_method_list)))
    nn_outliers_distance_matrix = np.zeros(
        (len(outlier_samples), len(nn_method_list)))
    for i in range(len(outlier_samples)):
        for j in range(len(nn_method_list)):
            y = nn_outliers_results[j][i, :]
            nn_dist = np.min(
                np.sqrt(np.sum((nn_models_orig[j] - y)**2, axis=1)))
            nn_outliers_distance_matrix[i, j] = nn_dist
            nn_outliers_percentiles_matrix[i, j] = stats.percentileofscore(
                nearest_neighbors_y_dist, nn_dist)
    nn_outliers_distance_percentiles = np.mean(nn_outliers_percentiles_matrix,
                                               axis=0)
    nn_outliers_distances = np.mean(nn_outliers_distance_matrix, axis=0)
    for j in range(len(nn_method_list)):
        print(nn_method_list[j], nn_outliers_distances[j],
              nn_outliers_distance_percentiles[j])

    output_file = generate_nn_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((nn_method_list, nn_avg_outliers_kl,
                     nn_outliers_distance_percentiles), f)

# ================= Example 7 =================

def main(parameters=settings.parameters, regenerate=False):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    letter_A_samples, _ = generate_data.load_A_letters(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)

    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # symmetric, so either axis works

    kernelized_results_file = exp_letter_A_test_kernelized.generate_letter_A_results_filename(
        parameters)
    with open(kernelized_results_file, 'rb') as f:
        (kernelized_detailed_method_results, kernelized_detailed_tsne_time,
         kernelized_detailed_method_list) = pickle.load(f)
    ind = [4, 24, 49]

    kernelized_method_list = [
        kernelized_detailed_method_list[i][:10] +
        kernelized_detailed_method_list[i][-8:] for i in ind
    ]
    kernelized_letters_results = [
        kernelized_detailed_method_results[i] for i in ind
    ]

    # =========== DISTANCE PERCENTILES ==========
    kernelized_letters_percentiles_matrix = np.zeros(
        (len(letter_A_samples), len(kernelized_method_list)))
    kernelized_letters_distance_matrix = np.zeros(
        (len(letter_A_samples), len(kernelized_method_list)))
    for i in range(len(letter_A_samples)):
        for j in range(len(kernelized_method_list)):
            y = kernelized_letters_results[j][i, :]
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y)**2, axis=1)))
            kernelized_letters_distance_matrix[i, j] = nn_dist
            kernelized_letters_percentiles_matrix[i, j] = stats.percentileofscore(
                nearest_neighbors_y_dist, nn_dist)
    kernelized_letters_distance_percentiles = np.mean(
        kernelized_letters_percentiles_matrix, axis=0)
    kernelized_letters_distances = np.mean(kernelized_letters_distance_matrix,
                                           axis=0)
    kernelized_per_item_time = kernelized_detailed_tsne_time / len(
        letter_A_samples)
    for j in range(len(kernelized_method_list)):
        logging.info("%s: %f, %f", kernelized_method_list[j],
                     kernelized_letters_distances[j],
                     kernelized_letters_distance_percentiles[j])

    kernelized_letters_kl = np.zeros(
        (len(kernelized_method_list), len(letter_A_samples)))
    processed_indices = list()

    kl_kernelized_tsne_letters_performance_file = generate_kernelized_kl_temp_filename(
        parameters)
    if os.path.isfile(
            kl_kernelized_tsne_letters_performance_file) and not regenerate:
        with open(kl_kernelized_tsne_letters_performance_file, 'rb') as f:
            kernelized_letters_kl, processed_indices = pickle.load(f)

    # =========== KL DIVERGENCE ==========
    # Computing the KL divergence increase for all 1000 samples is very slow;
    # most of the time goes into building the extended P-matrix, so results
    # are cached and checkpointed.
    for i in range(len(letter_A_samples)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.letter_A_parameter_set,
            parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, letter_A_samples[i, :].reshape(
                (1, -1))),
                                   axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods.
        for j in range(len(kernelized_letters_results)):
            # Single file with p matrix
            new_Y = np.concatenate(
                (Y_mnist, kernelized_letters_results[j][i, :].reshape(
                    (1, -1))),
                axis=0)
            kernelized_letters_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(
                p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_kernelized_tsne_letters_performance_file, 'wb') as f:
            pickle.dump((kernelized_letters_kl, processed_indices), f)
    # This should be fast
    kernelized_avg_letters_kl = np.mean(kernelized_letters_kl, axis=1)

    output_file = generate_kernelized_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((kernelized_method_list, kernelized_avg_letters_kl,
                     kernelized_per_item_time,
                     kernelized_letters_distance_percentiles), f)