# Imports assumed by the snippets below; project-specific names such as
# distance_matrix_dir_prefix, nn_model_prefixes, and the exp_* helper modules are
# defined elsewhere in the project.
import logging
import os
import pickle

import numpy as np
from scipy import stats
from scipy.spatial import distance

import generate_data
import lion_tsne
import settings


def calc_kl(*, common_info, embedded_neighbors, parameters):
    dTSNE_mnist = common_info["dTSNE_mnist"]
    X_mnist = common_info["X_mnist"]
    Y_mnist = common_info["Y_mnist"]
    letter_samples = common_info["letter_samples"]
    per_sample_kl_divergences = list()
    for j in range(len(letter_samples)):
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.letter_parameter_set,
            parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(j) + '.p'
        # Don't store those matrices in a single file. Way too large.

        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            if j % 50 == 0:
                logging.info("\t%d P-matrix file found. Loading.", j)
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            if j % 50 == 0:
                logging.info(
                    "\t%d P-matrix file not found. Creating and saving.", j)
            new_X = np.concatenate((X_mnist, letter_samples[j, :].reshape(
                (1, -1))),
                                   axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            if not os.path.isdir(distance_matrix_dir):
                logging.info('Creating directory: %s', distance_matrix_dir)
                os.mkdir(distance_matrix_dir)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # Single file with p matrix.
        # Now use it to calculate KL divergence.
        new_Y = np.concatenate((Y_mnist, embedded_neighbors[j, :].reshape(
            (1, -1))),
                               axis=0)
        kl, _ = lion_tsne.kl_divergence_and_gradient(p_matrix=new_P, y=new_Y)
        per_sample_kl_divergences.append(kl)
    return np.mean(per_sample_kl_divergences)
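

# A minimal, hypothetical usage sketch for calc_kl: the loaders come from the
# project's generate_data module (used elsewhere in these snippets); the zero
# "embedded_neighbors" array is a placeholder for whatever extension method is
# being evaluated, and its shape assumes a 2-dimensional t-SNE embedding.
def _example_calc_kl_usage(parameters=settings.parameters):
    letter_samples, _, _ = generate_data.load_letters(parameters=parameters)
    common_info = {
        "dTSNE_mnist": generate_data.load_dtsne_mnist(parameters=parameters),
        "X_mnist": generate_data.load_x_mnist(parameters=parameters),
        "Y_mnist": generate_data.load_y_mnist(parameters=parameters),
        "letter_samples": letter_samples,
    }
    embedded_neighbors = np.zeros((len(letter_samples), 2))  # placeholder embedding
    return calc_kl(common_info=common_info,
                   embedded_neighbors=embedded_neighbors,
                   parameters=parameters)
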
def generate_gd_kl_temp_filename(parameters):
    output_file_prefix = '../results/letter_A_gd_kl_temp_'
    return output_file_prefix + generate_data.combine_prefixes(
        settings.tsne_parameter_set | settings.letter_A_parameter_set,
        parameters)
def generate_time_results_filename(parameters=settings.parameters):
    outlier_results_file_prefix = '../results/outlier_time_gd_'
    return outlier_results_file_prefix + generate_data.combine_prefixes(
        settings.tsne_parameter_set | settings.outlier_parameter_set, parameters)
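
# The generators above follow the pattern used throughout these scripts: a fixed
# results-file prefix plus generate_data.combine_prefixes(...), which presumably
# encodes the relevant parameter values into the file name so that cached results
# for different parameter sets do not collide.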
def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(
        parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    accuracy_nn = parameters.get("accuracy_nn",
                                 settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(
        parameters=parameters)
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)

    # =================== ACCURACY
    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y)**2, axis=1)
        return np.argsort(y_distances)[:n]
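    # Note: despite the name, this helper is also applied to X_mnist below to get
    # nearest neighbors in the input space; it simply returns the indices of the
    # n closest rows by squared Euclidean distance.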

    gd_method_list = [
        r'Closest $Y_{init}$', r'Random $Y_{init}$',
        r'Closest $Y_{init}$; new $\sigma$',
        r'Random $Y_{init}$; new $\sigma$', r'Closest $Y_{init}$; EE',
        r'Random $Y_{init}$; EE', r'Closest $Y_{init}$; new $\sigma$; EE',
        r'Random $Y_{init}$; new $\sigma$; EE'
    ]

    gd_results_file = exp_cluster_attr_test_GD.generate_cluster_results_filename(
        parameters=parameters)
    with open(gd_results_file, 'rb') as f:
        (picked_neighbors_y_gd_transformed,
         picked_neighbors_y_gd_variance_recalc_transformed,
         picked_neighbors_y_gd_transformed_random,
         picked_neighbors_y_gd_variance_recalc_transformed_random,
         picked_neighbors_y_gd_early_exagg_transformed_random,
         picked_neighbors_y_gd_early_exagg_transformed,
         picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
         picked_random_starting_positions,
         picked_neighbors_y_gd_variance_recalc_early_exagg_transformed,
         covered_samples) = pickle.load(f)

    gd_method_results = [
        picked_neighbors_y_gd_transformed,
        picked_neighbors_y_gd_transformed_random,
        picked_neighbors_y_gd_variance_recalc_transformed,
        picked_neighbors_y_gd_variance_recalc_transformed_random,
        picked_neighbors_y_gd_early_exagg_transformed,
        picked_neighbors_y_gd_early_exagg_transformed_random,
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed,
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
    ]

    input_time_file = exp_cluster_attr_test_GD.generate_time_results_filename(
        parameters)
    with open(input_time_file, 'rb') as f:
        picked_neighbors_y_time_gd_transformed, picked_neighbors_y_time_gd_variance_recalc_transformed, \
        picked_neighbors_y_time_gd_transformed_random, \
        picked_neighbors_y_time_gd_variance_recalc_transformed_random, \
        picked_neighbors_y_time_gd_early_exagg_transformed_random, \
        picked_neighbors_y_time_gd_early_exagg_transformed, \
        picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random, \
        picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed, covered_samples = pickle.load(f)

    gd_time = [
        np.mean(picked_neighbors_y_time_gd_transformed),
        np.mean(picked_neighbors_y_time_gd_transformed_random),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_transformed),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_transformed_random),
        np.mean(picked_neighbors_y_time_gd_early_exagg_transformed),
        np.mean(picked_neighbors_y_time_gd_early_exagg_transformed_random),
        np.mean(
            picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed
        ),
        np.mean(
            picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random
        ),
    ]

    gd_accuracy = np.zeros(len(gd_method_list))
    gd_precision = np.zeros(len(gd_method_list))

    # ============================== Distance percentiles
    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # D_Y is symmetric, so either axis works
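    # percentileofscore below places each test point's nearest-neighbor distance
    # within the distribution of nearest-neighbor distances of the existing
    # embedding; values well above 50 mean the point landed unusually far from
    # the training points.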
    gd_nearest_neighbors_percentiles_matrix = np.zeros(
        (len(picked_neighbors), len(gd_method_list)))
    for i in range(len(picked_neighbors)):
        for j in range(len(gd_method_list)):
            y = gd_method_results[j][i, :]
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y)**2, axis=1)))
            gd_nearest_neighbors_percentiles_matrix[
                i, j] = stats.percentileofscore(nearest_neighbors_y_dist,
                                                nn_dist)
    gd_distance_percentiles = np.mean(gd_nearest_neighbors_percentiles_matrix,
                                      axis=0)
    for j in range(len(gd_method_list)):
        logging.info("%s :\t%f", gd_method_list[j], gd_distance_percentiles[j])

    # ============================== Accuracy and precision
    for j in range(len(gd_method_results)):
        per_sample_accuracy = np.zeros((len(picked_neighbors), ))
        per_sample_precision = np.zeros((len(picked_neighbors), ))
        for i in range(len(picked_neighbors)):
            expected_label = picked_neighbor_labels[i]
            nn_indices = get_nearest_neighbors_in_y(gd_method_results[j][i, :],
                                                    Y_mnist,
                                                    n=accuracy_nn)
            obtained_labels = labels_mnist[nn_indices]
            per_sample_accuracy[i] = sum(
                obtained_labels == expected_label) / len(obtained_labels)

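            # Precision here: the fraction of the sample's nearest neighbors in
            # the input space X that are also among its nearest neighbors in the
            # embedding Y.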
            x = picked_neighbors[i, :]
            y = gd_method_results[j][i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x,
                                                      X_mnist,
                                                      n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y,
                                                      Y_mnist,
                                                      n=precision_nn)
            matching_indices = len(
                [k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = (matching_indices / precision_nn)

        gd_accuracy[j] = np.mean(per_sample_accuracy)
        gd_precision[j] = np.mean(per_sample_precision)
        logging.info("%s :\t%f\t%f", gd_method_list[j], gd_precision[j],
                     gd_accuracy[j])

    # ============================== KL divergence
    gd_kl = np.zeros((len(gd_method_list), len(picked_neighbors)))

    processed_indices = list()

    kl_gd_performance_file = generate_gd_kl_temp_filename(parameters)
    if os.path.isfile(kl_gd_performance_file):
        with open(kl_gd_performance_file, 'rb') as f:
            gd_kl, processed_indices = pickle.load(f)

    # Computing the KL divergence increase for all 1000 samples is very slow;
    # most of that time is spent building the P-matrix.
    per_sample_KL = np.zeros((len(picked_neighbors), ))
    for i in range(len(picked_neighbors)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set
            | settings.x_neighbors_selection_parameter_set, parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, picked_neighbors[i, :].reshape(
                (1, -1))),
                                   axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods.
        for j in range(len(gd_method_results)):
            # Single file with p matrix
            new_Y = np.concatenate(
                (Y_mnist, gd_method_results[j][i, :].reshape((1, -1))), axis=0)
            gd_kl[j,
                  i], _ = lion_tsne.kl_divergence_and_gradient(p_matrix=new_P,
                                                               y=new_Y)
        processed_indices.append(i)
        with open(kl_gd_performance_file, 'wb') as f:
            pickle.dump((gd_kl, processed_indices), f)
    # This should be fast
    gd_avg_kl = np.mean(gd_kl, axis=1)

    output_file = generate_gd_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((gd_method_list, gd_accuracy, gd_precision, gd_time,
                     gd_avg_kl, gd_distance_percentiles), f)
def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    accuracy_nn = parameters.get("accuracy_nn", settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn", settings.parameters["precision_nn"])
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(parameters=parameters)
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)

    # =================== Setup
    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y) ** 2, axis=1)
        return np.argsort(y_distances)[:n]

    kernelized_results_file = exp_cluster_attr_test_kernelized.generate_cluster_results_filename(parameters)
    with open(kernelized_results_file, 'rb') as f:
        kernelized_detailed_tsne_method_results, kernelized_detailed_tsne_accuracy, \
        kernelized_detailed_tsne_precision, kernelized_detailed_tsne_time, kernelized_detailed_tsne_method_list = pickle.load(f)
    ind = [4, 24, 49]
    kernelized_method_list = [
        kernelized_detailed_tsne_method_list[i][:10] + kernelized_detailed_tsne_method_list[i][-8:]
        for i in ind]
    kernelized_method_results = [kernelized_detailed_tsne_method_results[i] for i in ind]

    kernelized_accuracy = np.zeros(len(kernelized_method_list))
    kernelized_precision = np.zeros(len(kernelized_method_list))
    kernelized_per_item_time = np.zeros(len(kernelized_method_list))

    # ============================== Distance percentiles
    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # D_Y is symmetric, so either axis works
    kernelized_nearest_neighbors_percentiles_matrix = np.zeros((len(picked_neighbors), len(kernelized_method_list)))
    for i in range(len(picked_neighbors)):
        for j in range(len(kernelized_method_list)):
            y = kernelized_method_results[j][i, :]
            kernelized_dist = np.min(np.sqrt(np.sum((Y_mnist - y) ** 2, axis=1)))
            kernelized_nearest_neighbors_percentiles_matrix[i, j] = stats.percentileofscore(nearest_neighbors_y_dist,
                                                                                            kernelized_dist)
    kernelized_distance_percentiles = np.mean(kernelized_nearest_neighbors_percentiles_matrix, axis=0)
    for j in range(len(kernelized_method_list)):
        logging.info("%s %f", kernelized_method_list[j], kernelized_distance_percentiles[j])

    # ============================== Accuracy and precision
    for j in range(len(kernelized_method_results)):
        per_sample_accuracy = np.zeros((len(picked_neighbors),))
        per_sample_precision = np.zeros((len(picked_neighbors),))
        for i in range(len(picked_neighbors)):
            expected_label = picked_neighbor_labels[i]

            y = kernelized_method_results[j][i,:]
            x = picked_neighbors[i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x, X_mnist, n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y, Y_mnist, n=precision_nn)
            matching_indices = len([k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = (matching_indices / precision_nn)

            kernelized_indices = get_nearest_neighbors_in_y(kernelized_method_results[j][i,:], Y_mnist, n=accuracy_nn)
            obtained_labels = labels_mnist[kernelized_indices]
            per_sample_accuracy[i] = sum(obtained_labels==expected_label) / len(obtained_labels)
        kernelized_accuracy[j] = np.mean(per_sample_accuracy)
        kernelized_precision[j] = np.mean(per_sample_precision)
        kernelized_per_item_time[j] = kernelized_detailed_tsne_time[j] / len(picked_neighbors)
        logging.info("%s :\t%f\t%f", kernelized_method_list[j], kernelized_precision[j],
                     kernelized_accuracy[j])

    kernelized_kl = np.zeros((len(kernelized_method_list), len(picked_neighbors)))

    processed_indices = list()

    kl_kernelized_performance_file = generate_kernelized_kl_temp_filename(parameters)
    if os.path.isfile(kl_kernelized_performance_file):
        with open(kl_kernelized_performance_file, 'rb') as f:
            kernelized_kl, processed_indices = pickle.load(f)

    # ============================== KL divergence
    # Computing the KL divergence increase for all 1000 samples is very slow;
    # most of that time is spent building the P-matrix.
    per_sample_KL = np.zeros((len(picked_neighbors),))
    for i in range(len(picked_neighbors)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.x_neighbors_selection_parameter_set, parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, picked_neighbors[i, :].reshape((1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods.
        for j in range(len(kernelized_method_results)):
            # Single file with p matrix
            new_Y = np.concatenate((Y_mnist, kernelized_method_results[j][i, :].reshape((1, -1))), axis=0)
            kernelized_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_kernelized_performance_file, 'wb') as f:
            pickle.dump((kernelized_kl, processed_indices), f)
    # This should be fast
    kernelized_avg_kl = np.mean(kernelized_kl, axis=1)

    output_file = generate_kernelized_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((kernelized_method_list, kernelized_accuracy, kernelized_precision,
                     kernelized_avg_kl, kernelized_per_item_time, kernelized_distance_percentiles),f)
def generate_idw_power_plot_filename(parameters=settings.parameters):
    return idw_power_plot_file_prefix + generate_data.combine_prefixes(
        settings.nn_accuracy_parameter_set, parameters)
def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    letter_samples, _, _ = generate_data.load_letters(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)

    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # D_Y is symmetric, so either axis works

    # ============== KL Divergence
    gd_method_list = [
        r'Closest $Y_{init}$', r'Random $Y_{init}$',
        r'Closest $Y_{init}$; new $\sigma$',
        r'Random $Y_{init}$; new $\sigma$', r'Closest $Y_{init}$; EE',
        r'Random $Y_{init}$; EE', r'Closest $Y_{init}$; new $\sigma$; EE',
        r'Random $Y_{init}$; new $\sigma$; EE'
    ]

    gd_results_file = exp_letter_test_GD.generate_letter_results_filename(
        parameters=parameters)
    with open(gd_results_file, 'rb') as f:
        (letters_y_gd_transformed, letters_y_gd_variance_recalc_transformed,
         letters_y_gd_transformed_random,
         letters_y_gd_variance_recalc_transformed_random,
         letters_y_gd_early_exagg_transformed_random,
         letters_y_gd_early_exagg_transformed,
         letters_y_gd_variance_recalc_early_exagg_transformed_random,
         picked_random_starting_positions,
         letters_y_gd_variance_recalc_early_exagg_transformed,
         covered_samples) = pickle.load(f)

    gd_letters_results = [
        letters_y_gd_transformed,
        letters_y_gd_transformed_random,
        letters_y_gd_variance_recalc_transformed,
        letters_y_gd_variance_recalc_transformed_random,
        letters_y_gd_early_exagg_transformed,
        letters_y_gd_early_exagg_transformed_random,
        letters_y_gd_variance_recalc_early_exagg_transformed,
        letters_y_gd_variance_recalc_early_exagg_transformed_random,
    ]

    input_time_file = exp_letter_test_GD.generate_time_results_filename(
        parameters)
    with open(input_time_file, 'rb') as f:
        letters_y_time_gd_transformed, letters_y_time_gd_variance_recalc_transformed, \
        letters_y_time_gd_transformed_random, \
        letters_y_time_gd_variance_recalc_transformed_random, \
        letters_y_time_gd_early_exagg_transformed_random, \
        letters_y_time_gd_early_exagg_transformed, \
        letters_y_time_gd_variance_recalc_early_exagg_transformed_random, \
        letters_y_time_gd_variance_recalc_early_exagg_transformed, covered_samples = pickle.load(f)

    gd_time = [
        np.mean(letters_y_time_gd_transformed),
        np.mean(letters_y_time_gd_transformed_random),
        np.mean(letters_y_time_gd_variance_recalc_transformed),
        np.mean(letters_y_time_gd_variance_recalc_transformed_random),
        np.mean(letters_y_time_gd_early_exagg_transformed),
        np.mean(letters_y_time_gd_early_exagg_transformed_random),
        np.mean(letters_y_time_gd_variance_recalc_early_exagg_transformed),
        np.mean(
            letters_y_time_gd_variance_recalc_early_exagg_transformed_random),
    ]

    gd_letters_kl = np.zeros((len(gd_method_list), len(letter_samples)))

    processed_indices = list()

    kl_gd_letters_performance_file = generate_gd_kl_temp_filename(parameters)
    if os.path.isfile(kl_gd_letters_performance_file):
        with open(kl_gd_letters_performance_file, 'rb') as f:
            gd_letters_kl, processed_indices = pickle.load(f)

    # Computing the KL divergence increase for all 1000 samples is very slow;
    # most of that time is spent building the P-matrix.
    per_sample_KL = np.zeros((len(letter_samples), ))
    for i in range(len(letter_samples)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.letter_parameter_set,
            parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, letter_samples[i, :].reshape(
                (1, -1))),
                                   axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods.
        for j in range(len(gd_letters_results)):
            # Single file with p matrix
            new_Y = np.concatenate(
                (Y_mnist, gd_letters_results[j][i, :].reshape((1, -1))),
                axis=0)
            gd_letters_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(
                p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_gd_letters_performance_file, 'wb') as f:
            pickle.dump((gd_letters_kl, processed_indices), f)
    # This should be fast
    gd_avg_letters_kl = np.mean(gd_letters_kl, axis=1)

    # ============== Distance percentiles
    gd_letters_percentiles_matrix = np.zeros(
        (len(letter_samples), len(gd_method_list)))
    gd_letters_distance_matrix = np.zeros(
        (len(letter_samples), len(gd_method_list)))
    for i in range(len(letter_samples)):
        for j in range(len(gd_method_list)):
            y = gd_letters_results[j][i, :]
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y)**2, axis=1)))
            gd_letters_distance_matrix[i, j] = nn_dist
            gd_letters_percentiles_matrix[i, j] = stats.percentileofscore(
                nearest_neighbors_y_dist, nn_dist)
    gd_letters_distance_percentiles = np.mean(gd_letters_percentiles_matrix,
                                              axis=0)
    gd_letters_distances = np.mean(gd_letters_distance_matrix, axis=0)
    for j in range(len(gd_method_list)):
        logging.info("%s: %f, %f", gd_method_list[j], gd_letters_distances[j],
                     gd_letters_distance_percentiles[j])

    output_file = generate_gd_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((gd_method_list, gd_time, gd_avg_letters_kl,
                     gd_letters_distance_percentiles), f)
def generate_nn_postprocess_filename(parameters):
    output_file_prefix = '../results/cluster_attr_nn_postprocess_'
    return output_file_prefix + generate_data.combine_prefixes(
        neural_network_commons.nn_model_prefixes | settings.x_neighbors_selection_parameter_set, parameters)
def generate_nn_kl_temp_filename(parameters):
    output_file_prefix = '../results/letter_nn_kl_temp_'
    return output_file_prefix + generate_data.combine_prefixes(
        neural_network_commons.nn_model_prefixes
        | settings.letter_parameter_set, parameters)
def generate_letter_results_filename(letter_results_file_prefix,
                                     parameters=settings.parameters):
    return letter_results_file_prefix + generate_data.combine_prefixes(
        settings.tsne_parameter_set | settings.letter_parameter_set,
        parameters)
def generate_lion_power_performance_filename(parameters=settings.parameters):
    return lion_power_performance_prefix + generate_data.combine_prefixes(
        settings.nn_accuracy_parameter_set, parameters)
def train_or_load_models(regenerate_model1=False, regenerate_model2=False, regenerate_model3=False,
         parameters=settings.parameters):
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    keras_random_seed = parameters.get("keras_random_seed", settings.parameters["keras_random_seed"])

    # Reproducibility: parallel threads can bring uncontrolled randomness
    # Luckily, models here are small, no need for parallel threads etc.
    session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
    session = tf.Session(config=session_conf)
    tf.keras.backend.set_session(session)
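    # Note: tf.ConfigProto and tf.Session are TensorFlow 1.x APIs; under TF 2.x
    # the equivalent thread limits would be set via tf.config.threading.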

    model1_weights_file_prefix = '../results/model1'
    model1_json_file_prefix = '../results/model1'
    model2_weights_file_prefix = '../results/model2'
    model2_json_file_prefix = '../results/model2'
    model3_weights_file_prefix = '../results/model3'
    model3_json_file_prefix = '../results/model3'

    model1_weights_file = model1_weights_file_prefix + generate_data.combine_prefixes(nn_model_prefixes, parameters,
                                                                                      postfix='.hd5')
    model1_json_file = model1_json_file_prefix + generate_data.combine_prefixes(nn_model_prefixes, parameters,
                                                                                postfix='.json')
    model2_weights_file = model2_weights_file_prefix + generate_data.combine_prefixes(nn_model_prefixes, parameters,
                                                                                      postfix='.hd5')
    model2_json_file = model2_json_file_prefix + generate_data.combine_prefixes(nn_model_prefixes, parameters,
                                                                                postfix='.json')
    model3_weights_file = model3_weights_file_prefix + generate_data.combine_prefixes(nn_model_prefixes, parameters,
                                                                                      postfix='.hd5')
    model3_json_file = model3_json_file_prefix + generate_data.combine_prefixes(nn_model_prefixes, parameters,
                                                                                postfix='.json')

    if not os.path.isfile(model1_weights_file) or regenerate_model1:
        # 2 layers, 250 nodes per layer, ReLU activation, dropout regularization with rate 0.25.

        set_all_random_seeds(keras_random_seed)
        model1 = keras.models.Sequential()
        model1.add(keras.layers.Dense(250, activation='relu', kernel_initializer='normal', input_dim=X_mnist.shape[1]))
        model1.add(keras.layers.Dropout(0.25))
        model1.add(keras.layers.Dense(250, activation='relu', kernel_initializer='normal', input_dim=X_mnist.shape[1]))
        model1.add(keras.layers.Dropout(0.25))
        model1.add(keras.layers.Dense(Y_mnist.shape[1], kernel_initializer='normal'))
        model1.compile(loss='mean_squared_error', optimizer='adam')
        model1.fit(X_mnist, Y_mnist,
                   epochs=5000,
                   verbose=1,
                   validation_data=(X_mnist, Y_mnist))
        with open(model1_json_file, "w") as f:
            f.write(model1.to_json())
        model1.save_weights(model1_weights_file)
    else:
        with open(model1_json_file, "r") as f:
            model1 = keras.models.model_from_json(f.read())
        model1.load_weights(model1_weights_file)
        model1.compile(loss='mean_squared_error', optimizer='adam')

    Y_nn1_mnist = model1.predict(X_mnist)

    if not os.path.isfile(model2_weights_file) or regenerate_model2:
        # 2 layers, 500 nodes per layer, ReLU activation, dropout regularization with rate 0.5.
        set_all_random_seeds(keras_random_seed)
        model2 = keras.models.Sequential()
        model2.add(keras.layers.Dense(500, activation='relu', kernel_initializer='normal', input_dim=X_mnist.shape[1]))
        model2.add(keras.layers.Dropout(0.5))
        model2.add(keras.layers.Dense(500, activation='relu', kernel_initializer='normal', input_dim=X_mnist.shape[1]))
        model2.add(keras.layers.Dropout(0.5))
        model2.add(keras.layers.Dense(Y_mnist.shape[1], kernel_initializer='normal'))
        model2.compile(loss='mean_squared_error', optimizer='adam')
        model2.fit(X_mnist, Y_mnist,
                   epochs=5000,
                   verbose=1,
                   validation_data=(X_mnist, Y_mnist))
        with open(model2_json_file, "w") as f:
            f.write(model2.to_json())
        model2.save_weights(model2_weights_file)
    else:
        with open(model2_json_file, "r") as f:
            model2 = keras.models.model_from_json(f.read())
        model2.load_weights(model2_weights_file)
        model2.compile(loss='mean_squared_error', optimizer='adam')

    Y_nn2_mnist = model2.predict(X_mnist)

    if not os.path.isfile(model3_weights_file) or regenerate_model3:
        # 1 hidden layer, 500 nodes, tanh activation, no dropout.
        set_all_random_seeds(keras_random_seed)
        model3 = keras.models.Sequential()
        model3.add(keras.layers.Dense(500, activation='tanh', kernel_initializer='normal', input_dim=X_mnist.shape[1]))
        model3.add(keras.layers.Dense(Y_mnist.shape[1], kernel_initializer='normal'))
        model3.compile(loss='mean_squared_error', optimizer='adam')
        model3.fit(X_mnist, Y_mnist,
                   epochs=5000,
                   verbose=1,
                   validation_data=(X_mnist, Y_mnist))
        with open(model3_json_file, "w") as f:
            f.write(model3.to_json())
        model3.save_weights(model3_weights_file)
    else:
        with open(model3_json_file, "r") as f:
            model3 = keras.models.model_from_json(f.read())
        model3.load_weights(model3_weights_file)
        model3.compile(loss='mean_squared_error', optimizer='adam')

    Y_nn3_mnist = model3.predict(X_mnist)
    return {"models" : (model1, model2, model3), "Y_predicted" : (Y_nn1_mnist, Y_nn2_mnist, Y_nn3_mnist)}
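

# A minimal sketch of how train_or_load_models might be consumed (hypothetical
# caller, not part of the original pipeline): the returned dict exposes the three
# trained models and their predicted embeddings.
def _example_train_or_load_usage():
    nn_results = train_or_load_models(parameters=settings.parameters)
    model1, model2, model3 = nn_results["models"]
    Y_nn1_mnist, Y_nn2_mnist, Y_nn3_mnist = nn_results["Y_predicted"]
    logging.info("Predicted embedding shapes: %s, %s, %s",
                 Y_nn1_mnist.shape, Y_nn2_mnist.shape, Y_nn3_mnist.shape)
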
def generate_cache_filename(parameters=settings.parameters):
    cache_file_prefix = '../results/kernelized_tsne_parameters_cache'
    return cache_file_prefix + generate_data.combine_prefixes(
        settings.tsne_parameter_set
        | settings.x_neighbors_selection_parameter_set, parameters)
def generate_nn_postprocess_filename(parameters):
    output_file_prefix = '../results/outlier_nn_postprocess_'
    return output_file_prefix + generate_data.combine_prefixes(
        neural_network_commons.nn_model_prefixes
        | settings.outlier_parameter_set, parameters)
def generate_cluster_results_filename(parameters=settings.parameters):
    cluster_results_file_prefix = '../results/cluster_attr_kernelized_'
    return cluster_results_file_prefix + generate_data.combine_prefixes(
        settings.tsne_parameter_set
        | settings.x_neighbors_selection_parameter_set, parameters)
def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)

    outlier_samples, _ = generate_data.load_outliers(parameters=parameters)

    nn_results_file = exp_outlier_test_NN.generate_outlier_results_filename(
        parameters)
    with open(nn_results_file, 'rb') as f:
        nn_outliers_results, nn_models_orig, nn_method_list = pickle.load(f)

    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # D_Y is symmetric, so either axis works

    # ================ KL DIVERGENCE ===================
    nn_outliers_kl = np.zeros((len(nn_method_list), len(outlier_samples)))

    processed_indices = list()

    kl_nn_outliers_performance_file = generate_nn_kl_temp_filename(parameters)
    # Resume from previously processed samples, as the other post-processing scripts do.
    if os.path.isfile(kl_nn_outliers_performance_file):
        with open(kl_nn_outliers_performance_file, 'rb') as f:
            nn_outliers_kl, processed_indices = pickle.load(f)

    # Computing the KL divergence increase for all 1000 samples is very slow;
    # most of that time is spent building the P-matrix.
    per_sample_KL = np.zeros((len(outlier_samples), ))
    for i in range(len(outlier_samples)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.outlier_parameter_set,
            parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, outlier_samples[i, :].reshape(
                (1, -1))),
                                   axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods.
        for j in range(len(nn_outliers_results)):
            # Single file with p matrix
            new_Y = np.concatenate(
                (nn_models_orig[j], nn_outliers_results[j][i, :].reshape(
                    (1, -1))),
                axis=0)
            nn_outliers_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(
                p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_nn_outliers_performance_file, 'wb') as f:
            pickle.dump((nn_outliers_kl, processed_indices), f)
    # This should be fast
    nn_avg_outliers_kl = np.mean(nn_outliers_kl, axis=1)

    # ================ DISTANCE MATRICES ===================
    nn_outliers_percentiles_matrix = np.zeros(
        (len(outlier_samples), len(nn_method_list)))
    nn_outliers_distance_matrix = np.zeros(
        (len(outlier_samples), len(nn_method_list)))
    for i in range(len(outlier_samples)):
        for j in range(len(nn_method_list)):
            y = nn_outliers_results[j][i, :]
            nn_dist = np.min(
                np.sqrt(np.sum((nn_models_orig[j] - y)**2, axis=1)))
            nn_outliers_distance_matrix[i, j] = nn_dist
            nn_outliers_percentiles_matrix[i, j] = stats.percentileofscore(
                nearest_neighbors_y_dist, nn_dist)
    nn_outliers_distance_percentiles = np.mean(nn_outliers_percentiles_matrix,
                                               axis=0)
    nn_outliers_distances = np.mean(nn_outliers_distance_matrix, axis=0)
    for j in range(len(nn_method_list)):
        logging.info("%s: %f, %f", nn_method_list[j], nn_outliers_distances[j],
                     nn_outliers_distance_percentiles[j])

    output_file = generate_nn_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((nn_method_list, nn_avg_outliers_kl,
                     nn_outliers_distance_percentiles), f)
def generate_letter_results_filename(parameters=settings.parameters):
    letter_results_file_prefix = '../results/letter_nn_'
    return letter_results_file_prefix + generate_data.combine_prefixes(
        neural_network_commons.nn_model_prefixes
        | settings.letter_parameter_set, parameters)
def generate_gd_postprocess_filename(parameters):
    output_file_prefix = '../results/letter_gd_postprocess_'
    return output_file_prefix + generate_data.combine_prefixes(
        settings.tsne_parameter_set | settings.letter_parameter_set,
        parameters)
def main(parameters=settings.parameters, regenerate=False):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    letter_A_samples, _ = generate_data.load_A_letters(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)

    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # D_Y is symmetric, so either axis works

    kernelized_results_file = exp_letter_A_test_kernelized.generate_letter_A_results_filename(
        parameters)
    with open(kernelized_results_file, 'rb') as f:
        kernelized_detailed_method_results, kernelized_detailed_tsne_time, kernelized_detailed_method_list = pickle.load(
            f)
    ind = [4, 24, 49]

    kernelized_method_list = [
        kernelized_detailed_method_list[i][:10] +
        kernelized_detailed_method_list[i][-8:] for i in ind
    ]
    kernelized_letters_results = [
        kernelized_detailed_method_results[i] for i in ind
    ]

    # =========== DISTANCE PERCENTILES ==========
    kernelized_letters_percentiles_matrix = np.zeros(
        (len(letter_A_samples), len(kernelized_method_list)))
    kernelized_letters_distance_matrix = np.zeros(
        (len(letter_A_samples), len(kernelized_method_list)))
    for i in range(len(letter_A_samples)):
        for j in range(len(kernelized_method_list)):
            y = kernelized_letters_results[j][i, :]
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y)**2, axis=1)))
            kernelized_letters_distance_matrix[i, j] = nn_dist
            kernelized_letters_percentiles_matrix[i,
                                                  j] = stats.percentileofscore(
                                                      nearest_neighbors_y_dist,
                                                      nn_dist)
    kernelized_letters_distance_percentiles = np.mean(
        kernelized_letters_percentiles_matrix, axis=0)
    kernelized_letters_distances = np.mean(kernelized_letters_distance_matrix,
                                           axis=0)
    kernelized_per_item_time = kernelized_detailed_tsne_time / len(
        letter_A_samples)
    for j in range(len(kernelized_method_list)):
        logging.info("%s: %f, %f", kernelized_method_list[j],
                     kernelized_letters_distances[j],
                     kernelized_letters_distance_percentiles[j])

    kernelized_letters_kl = np.zeros(
        (len(kernelized_method_list), len(letter_A_samples)))
    processed_indices = list()

    kl_kernelized_tsne_letters_performance_file = generate_kernelized_kl_temp_filename(
        parameters)
    if os.path.isfile(
            kl_kernelized_tsne_letters_performance_file) and not regenerate:
        with open(kl_kernelized_tsne_letters_performance_file, 'rb') as f:
            kernelized_letters_kl, processed_indices = pickle.load(f)

    # =========== KL DIVERGENCE ==========
    # Computing the KL divergence increase for all 1000 samples is very slow;
    # most of that time is spent building the P-matrix.
    per_sample_KL = np.zeros((len(letter_A_samples), ))
    for i in range(len(letter_A_samples)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.letter_A_parameter_set,
            parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, letter_A_samples[i, :].reshape(
                (1, -1))),
                                   axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods.
        for j in range(len(kernelized_letters_results)):
            # Single file with p matrix
            new_Y = np.concatenate(
                (Y_mnist, kernelized_letters_results[j][i, :].reshape(
                    (1, -1))),
                axis=0)
            kernelized_letters_kl[j,
                                  i], _ = lion_tsne.kl_divergence_and_gradient(
                                      p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_kernelized_tsne_letters_performance_file, 'wb') as f:
            pickle.dump((kernelized_letters_kl, processed_indices), f)
    # This should be fast
    kernelized_avg_letters_kl = np.mean(kernelized_letters_kl, axis=1)

    output_file = generate_kernelized_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((kernelized_method_list, kernelized_avg_letters_kl,
                     kernelized_per_item_time,
                     kernelized_letters_distance_percentiles), f)
def generate_idw_power_filename(parameters=settings.parameters):
    return idw_power_performance_file_prefix +\
                    generate_data.combine_prefixes(settings.x_neighbors_selection_parameter_set, parameters)
def generate_time_results_filename(parameters=settings.parameters):
    letter_A_results_file_prefix = '../results/letter_A_time_gd_'
    return letter_A_results_file_prefix + generate_data.combine_prefixes(
        settings.tsne_parameter_set | settings.letter_A_parameter_set,
        parameters)
def generate_kernelized_kl_temp_filename(parameters):
    output_file_prefix = '../results/cluster_attr_kernelized_kl_temp_'
    return output_file_prefix + generate_data.combine_prefixes(
        settings.tsne_parameter_set | settings.x_neighbors_selection_parameter_set, parameters)
import settings
import logging
import numpy as np
import generate_data
import pickle

input_prefixes = (
    './cluster-results/cluster_attr_gd_',
    './outlier-results/outlier_gd_',
    './letter-results/letter_gd_',
    './letter-A-results/letter_A_gd_',
)

output_files = (
    '../results/cluster_attr_gd_' + generate_data.combine_prefixes(
        settings.tsne_parameter_set
        | settings.x_neighbors_selection_parameter_set, settings.parameters),
    '../results/outlier_gd_' + generate_data.combine_prefixes(
        settings.tsne_parameter_set | settings.outlier_parameter_set,
        settings.parameters),
    '../results/letter_gd_' + generate_data.combine_prefixes(
        settings.tsne_parameter_set | settings.letter_parameter_set,
        settings.parameters),
    '../results/letter_A_gd_' + generate_data.combine_prefixes(
        settings.tsne_parameter_set | settings.letter_A_parameter_set,
        settings.parameters),
)

output_time_files = (
    '../results/cluster_attr_time_gd_' + generate_data.combine_prefixes(
        settings.tsne_parameter_set
def generate_gd_postprocess_filename(parameters):
    output_file_prefix = '../results/cluster_attr_gd_postprocess_'
    return output_file_prefix + generate_data.combine_prefixes(
        settings.tsne_parameter_set
        | settings.x_neighbors_selection_parameter_set, parameters)
def generate_kernelized_kl_temp_filename(parameters):
    output_file_prefix = '../results/outlier_kernelized_kl_temp_'
    return output_file_prefix + generate_data.combine_prefixes(
        settings.tsne_parameter_set | settings.outlier_parameter_set,
        parameters)
def generate_outlier_results_filename(parameters=settings.parameters):
    cluster_results_file_prefix = '../results/outlier_kernelized_'
    return cluster_results_file_prefix + generate_data.combine_prefixes(
        settings.tsne_parameter_set | settings.outlier_parameter_set,
        parameters)
def generate_cluster_results_filename(cluster_results_file_prefix,
                                      parameters=settings.parameters):
    return cluster_results_file_prefix + generate_data.combine_prefixes(
        settings.tsne_parameter_set
        | settings.x_neighbors_selection_parameter_set, parameters)