# Shared imports for the postprocessing routines below. Project-local modules
# (generate_data, settings, lion_tsne, exp_*) and per-script helpers such as
# distance_matrix_dir_prefix and the generate_*_filename functions are assumed
# to be defined alongside these scripts.
import logging
import os
import pickle

import numpy as np
from scipy import stats
from scipy.spatial import distance

import generate_data
import lion_tsne
import settings


def calc_kl(*, common_info, embedded_neighbors, parameters):
    dTSNE_mnist = common_info["dTSNE_mnist"]
    X_mnist = common_info["X_mnist"]
    Y_mnist = common_info["Y_mnist"]
    letter_samples = common_info["letter_samples"]
    per_sample_kl_divergences = list()
    for j in range(len(letter_samples)):
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.letter_parameter_set, parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(j) + '.p'
        # Don't store those matrices in a single file: way too large.
        # Make sure they can be loaded one by one.
        if os.path.isfile(distance_matrix_file):
            if j % 50 == 0:
                logging.info("\t%d P-matrix file found. Loading.", j)
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            if j % 50 == 0:
                logging.info("\t%d P-matrix file not found. Creating and saving.", j)
            new_X = np.concatenate((X_mnist, letter_samples[j, :].reshape((1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            if not os.path.isdir(distance_matrix_dir):
                logging.info('Creating directory: %s', distance_matrix_dir)
                os.mkdir(distance_matrix_dir)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)  # One file per P-matrix.
        # Now use the P-matrix to calculate the KL divergence.
        new_Y = np.concatenate((Y_mnist, embedded_neighbors[j, :].reshape((1, -1))), axis=0)
        kl, _ = lion_tsne.kl_divergence_and_gradient(p_matrix=new_P, y=new_Y)
        per_sample_kl_divergences.append(kl)
    return np.mean(per_sample_kl_divergences)
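# --- Illustration (not part of the pipeline) ---------------------------------
# A minimal sketch of the quantity lion_tsne.kl_divergence_and_gradient returns
# (the divergence part), assuming the standard t-SNE formulation: Q is the
# normalized Student-t kernel over the embedding, and KL(P || Q) is summed over
# all pairs. Names are local to this sketch; the lion_tsne implementation may
# differ in details.
def kl_divergence_sketch(p_matrix, y, eps=1e-12):
    d2 = distance.squareform(distance.pdist(y)) ** 2
    q = 1.0 / (1.0 + d2)           # Student-t kernel with one degree of freedom
    np.fill_diagonal(q, 0.0)       # q_ii is excluded by convention
    q /= q.sum()
    p = np.maximum(p_matrix, eps)  # guard the logarithm against zero entries
    return np.sum(p * np.log(p / np.maximum(q, eps)))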
def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    accuracy_nn = parameters.get("accuracy_nn", settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn", settings.parameters["precision_nn"])
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(parameters=parameters)
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)

    # =================== Helpers
    # Despite the name, this is plain Euclidean k-NN and works in any space.
    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y) ** 2, axis=1)
        return np.argsort(y_distances)[:n]

    gd_method_list = [
        r'Closest $Y_{init}$', r'Random $Y_{init}$',
        r'Closest $Y_{init}$; new $\sigma$', r'Random $Y_{init}$; new $\sigma$',
        r'Closest $Y_{init}$; EE', r'Random $Y_{init}$; EE',
        r'Closest $Y_{init}$; new $\sigma$; EE',
        r'Random $Y_{init}$; new $\sigma$; EE'
    ]

    gd_results_file = exp_cluster_attr_test_GD.generate_cluster_results_filename(
        parameters=parameters)
    with open(gd_results_file, 'rb') as f:
        (picked_neighbors_y_gd_transformed,
         picked_neighbors_y_gd_variance_recalc_transformed,
         picked_neighbors_y_gd_transformed_random,
         picked_neighbors_y_gd_variance_recalc_transformed_random,
         picked_neighbors_y_gd_early_exagg_transformed_random,
         picked_neighbors_y_gd_early_exagg_transformed,
         picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
         picked_random_starting_positions,
         picked_neighbors_y_gd_variance_recalc_early_exagg_transformed,
         covered_samples) = pickle.load(f)

    gd_method_results = [
        picked_neighbors_y_gd_transformed,
        picked_neighbors_y_gd_transformed_random,
        picked_neighbors_y_gd_variance_recalc_transformed,
        picked_neighbors_y_gd_variance_recalc_transformed_random,
        picked_neighbors_y_gd_early_exagg_transformed,
        picked_neighbors_y_gd_early_exagg_transformed_random,
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed,
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
    ]

    input_time_file = exp_cluster_attr_test_GD.generate_time_results_filename(parameters)
    with open(input_time_file, 'rb') as f:
        (picked_neighbors_y_time_gd_transformed,
         picked_neighbors_y_time_gd_variance_recalc_transformed,
         picked_neighbors_y_time_gd_transformed_random,
         picked_neighbors_y_time_gd_variance_recalc_transformed_random,
         picked_neighbors_y_time_gd_early_exagg_transformed_random,
         picked_neighbors_y_time_gd_early_exagg_transformed,
         picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random,
         picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed,
         covered_samples) = pickle.load(f)

    gd_time = [
        np.mean(picked_neighbors_y_time_gd_transformed),
        np.mean(picked_neighbors_y_time_gd_transformed_random),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_transformed),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_transformed_random),
        np.mean(picked_neighbors_y_time_gd_early_exagg_transformed),
        np.mean(picked_neighbors_y_time_gd_early_exagg_transformed_random),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random),
    ]

    gd_accuracy = np.zeros(len(gd_method_list))
    gd_precision = np.zeros(len(gd_method_list))

    # ============================== Distance percentiles
    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Find the distance to the closest neighbor ...
    np.fill_diagonal(D_Y, np.inf)  # ... but not to the point itself.
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # D_Y is symmetric, so either axis works.
    gd_nearest_neighbors_percentiles_matrix = np.zeros(
        (len(picked_neighbors), len(gd_method_list)))
    for i in range(len(picked_neighbors)):
        for j in range(len(gd_method_list)):
            y = gd_method_results[j][i, :]
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y) ** 2, axis=1)))
            gd_nearest_neighbors_percentiles_matrix[i, j] = stats.percentileofscore(
                nearest_neighbors_y_dist, nn_dist)
    gd_distance_percentiles = np.mean(gd_nearest_neighbors_percentiles_matrix, axis=0)
    for j in range(len(gd_method_list)):
        logging.info("%s :\t%f", gd_method_list[j], gd_distance_percentiles[j])

    # ============================== Accuracy and precision
    for j in range(len(gd_method_results)):
        per_sample_accuracy = np.zeros((len(picked_neighbors),))
        per_sample_precision = np.zeros((len(picked_neighbors),))
        for i in range(len(picked_neighbors)):
            expected_label = picked_neighbor_labels[i]
            nn_indices = get_nearest_neighbors_in_y(
                gd_method_results[j][i, :], Y_mnist, n=accuracy_nn)
            obtained_labels = labels_mnist[nn_indices]
            per_sample_accuracy[i] = sum(obtained_labels == expected_label) / len(obtained_labels)
            x = picked_neighbors[i, :]
            y = gd_method_results[j][i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x, X_mnist, n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y, Y_mnist, n=precision_nn)
            matching_indices = len([k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = matching_indices / precision_nn
        gd_accuracy[j] = np.mean(per_sample_accuracy)
        gd_precision[j] = np.mean(per_sample_precision)
        logging.info("%s :\t%f\t%f", gd_method_list[j], gd_precision[j], gd_accuracy[j])

    # ============================== KL divergence
    gd_kl = np.zeros((len(gd_method_list), len(picked_neighbors)))
    processed_indices = list()
    kl_gd_performance_file = generate_gd_kl_temp_filename(parameters)
    if os.path.isfile(kl_gd_performance_file):
        with open(kl_gd_performance_file, 'rb') as f:
            gd_kl, processed_indices = pickle.load(f)
    # Calculating the KL divergence increase for every sample is very slow;
    # most of the time goes into building the P-matrix.
    for i in range(len(picked_neighbors)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.x_neighbors_selection_parameter_set,
            parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure the matrices can be loaded one by one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, picked_neighbors[i, :].reshape((1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)  # One file per P-matrix.
        # The P-matrix is shared across all methods; only Y differs.
        for j in range(len(gd_method_results)):
            new_Y = np.concatenate(
                (Y_mnist, gd_method_results[j][i, :].reshape((1, -1))), axis=0)
            gd_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_gd_performance_file, 'wb') as f:
            pickle.dump((gd_kl, processed_indices), f)
    gd_avg_kl = np.mean(gd_kl, axis=1)  # This should be fast.

    output_file = generate_gd_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((gd_method_list, gd_accuracy, gd_precision, gd_time,
                     gd_avg_kl, gd_distance_percentiles), f)
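# --- Illustration (not part of the pipeline) ---------------------------------
# A minimal sketch of reading back the pickle written above for a quick look at
# the metrics; assumes the names defined in this script.
def inspect_gd_postprocess(parameters=settings.parameters):
    with open(generate_gd_postprocess_filename(parameters), 'rb') as f:
        (method_list, accuracy, precision, gd_time, avg_kl,
         distance_percentiles) = pickle.load(f)
    for name, acc, prec, kl, perc in zip(method_list, accuracy, precision,
                                         avg_kl, distance_percentiles):
        print("%-45s acc %.3f  prec %.3f  KL %.5f  perc %.1f" %
              (name, acc, prec, kl, perc))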
def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    accuracy_nn = parameters.get("accuracy_nn", settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn", settings.parameters["precision_nn"])
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(parameters=parameters)
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)

    # =================== Helpers
    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y) ** 2, axis=1)
        return np.argsort(y_distances)[:n]

    kernelized_results_file = exp_cluster_attr_test_kernelized.generate_cluster_results_filename(
        parameters)
    with open(kernelized_results_file, 'rb') as f:
        (kernelized_detailed_tsne_method_results, kernelized_detailed_tsne_accuracy,
         kernelized_detailed_tsne_precision, kernelized_detailed_tsne_time,
         kernelized_detailed_tsne_method_list) = pickle.load(f)
    ind = [4, 24, 49]
    kernelized_method_list = [
        kernelized_detailed_tsne_method_list[i][:10] + kernelized_detailed_tsne_method_list[i][-8:]
        for i in ind
    ]
    kernelized_method_results = [kernelized_detailed_tsne_method_results[i] for i in ind]
    kernelized_accuracy = np.zeros(len(kernelized_method_list))
    kernelized_precision = np.zeros(len(kernelized_method_list))
    kernelized_per_item_time = np.zeros(len(kernelized_method_list))

    # ============================== Distance percentiles
    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Find the distance to the closest neighbor ...
    np.fill_diagonal(D_Y, np.inf)  # ... but not to the point itself.
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # D_Y is symmetric, so either axis works.
    kernelized_nearest_neighbors_percentiles_matrix = np.zeros(
        (len(picked_neighbors), len(kernelized_method_list)))
    for i in range(len(picked_neighbors)):
        for j in range(len(kernelized_method_list)):
            y = kernelized_method_results[j][i, :]
            kernelized_dist = np.min(np.sqrt(np.sum((Y_mnist - y) ** 2, axis=1)))
            kernelized_nearest_neighbors_percentiles_matrix[i, j] = stats.percentileofscore(
                nearest_neighbors_y_dist, kernelized_dist)
    kernelized_distance_percentiles = np.mean(
        kernelized_nearest_neighbors_percentiles_matrix, axis=0)
    for j in range(len(kernelized_method_list)):
        logging.info("%s %f", kernelized_method_list[j], kernelized_distance_percentiles[j])

    # ============================== Accuracy and precision
    for j in range(len(kernelized_method_results)):
        per_sample_accuracy = np.zeros((len(picked_neighbors),))
        per_sample_precision = np.zeros((len(picked_neighbors),))
        for i in range(len(picked_neighbors)):
            expected_label = picked_neighbor_labels[i]
            y = kernelized_method_results[j][i, :]
            x = picked_neighbors[i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x, X_mnist, n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y, Y_mnist, n=precision_nn)
            matching_indices = len([k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = matching_indices / precision_nn
            kernelized_indices = get_nearest_neighbors_in_y(y, Y_mnist, n=accuracy_nn)
            obtained_labels = labels_mnist[kernelized_indices]
            per_sample_accuracy[i] = sum(obtained_labels == expected_label) / len(obtained_labels)
        kernelized_accuracy[j] = np.mean(per_sample_accuracy)
        kernelized_precision[j] = np.mean(per_sample_precision)
        kernelized_per_item_time[j] = kernelized_detailed_tsne_time[j] / len(picked_neighbors)
        logging.info("%s :\t%f\t%f", kernelized_method_list[j],
                     kernelized_precision[j], kernelized_accuracy[j])

    # ============================== KL divergence
    kernelized_kl = np.zeros((len(kernelized_method_list), len(picked_neighbors)))
    processed_indices = list()
    kl_kernelized_performance_file = generate_kernelized_kl_temp_filename(parameters)
    if os.path.isfile(kl_kernelized_performance_file):
        with open(kl_kernelized_performance_file, 'rb') as f:
            kernelized_kl, processed_indices = pickle.load(f)
    # Calculating the KL divergence increase for every sample is very slow;
    # most of the time goes into building the P-matrix.
    for i in range(len(picked_neighbors)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.x_neighbors_selection_parameter_set,
            parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure the matrices can be loaded one by one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, picked_neighbors[i, :].reshape((1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)  # One file per P-matrix.
        # The P-matrix is shared across all methods; only Y differs.
        for j in range(len(kernelized_method_results)):
            new_Y = np.concatenate(
                (Y_mnist, kernelized_method_results[j][i, :].reshape((1, -1))), axis=0)
            kernelized_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(
                p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_kernelized_performance_file, 'wb') as f:
            pickle.dump((kernelized_kl, processed_indices), f)
    kernelized_avg_kl = np.mean(kernelized_kl, axis=1)  # This should be fast.

    output_file = generate_kernelized_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((kernelized_method_list, kernelized_accuracy, kernelized_precision,
                     kernelized_avg_kl, kernelized_per_item_time,
                     kernelized_distance_percentiles), f)
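# --- Illustration (not part of the pipeline) ---------------------------------
# The precision metric used above, in isolation: the fraction of the k nearest
# neighbors of x in input space that are also among the k nearest neighbors of
# its embedding y in output space. A self-contained sketch; names are local.
def knn_overlap(x, y, X, Y, k=10):
    nn_x = np.argsort(np.sum((X - x) ** 2, axis=1))[:k]
    nn_y = np.argsort(np.sum((Y - y) ** 2, axis=1))[:k]
    return len(set(nn_x) & set(nn_y)) / k

# Example on random data: a random linear projection preserves some
# neighborhoods on average.
# rng = np.random.RandomState(0)
# X = rng.normal(size=(500, 50)); W = rng.normal(size=(50, 2)); Y = X @ W
# print(knn_overlap(X[0], Y[0], X, Y))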
def format_time_ms(time_s):  # NOTE: hypothetical name; the 'def' line is missing from this fragment.
    # Times above one second are only bounded from below, in milliseconds.
    if time_s > 100:
        return ">$10^5$"
    if time_s > 10:
        return ">$10^4$"
    if time_s > 1:
        return ">$10^3$"
    return "%2.2f" % (time_s * 1000)


# ================= GENERATING THE TEX TABLE ==========================
print("\n\nTABLE FOR COPY-PASTING TO LATEX\n\n\n")
s = ""
initial_kl_divergence, _ = lion_tsne.kl_divergence_and_gradient(
    y=dTSNE_mnist.Y, p_matrix=dTSNE_mnist.P_matrix)
s += '''\\begin{table}
\\small\\sf\\centering
\\caption{Letters placement test: methods comparison. Original KL divergence of the dataset is %.5f}
\\label{tab_letter_methods_comparison}
\\begin{tabular}{ m{0.19\\textwidth} m{0.07\\textwidth} m{0.07\\textwidth} m{0.06\\textwidth} }
\\toprule
\\textbf{Method} & \\textbf{Distance Perc-le} & \\textbf{KL Div.} & \\textbf{Time (ms)} \\\\
\\midrule''' % (initial_kl_divergence)
s += '\\multicolumn{4}{c}{\\textbf{RBF Interpolation}}\n'
s += '\t\\\\\n'
for j in range(len(rbf_method_list)):
def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    letter_samples, _, _ = generate_data.load_letters(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)

    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Find the distance to the closest neighbor ...
    np.fill_diagonal(D_Y, np.inf)  # ... but not to the point itself.
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # D_Y is symmetric, so either axis works.

    # ============== KL Divergence
    gd_method_list = [
        r'Closest $Y_{init}$', r'Random $Y_{init}$',
        r'Closest $Y_{init}$; new $\sigma$', r'Random $Y_{init}$; new $\sigma$',
        r'Closest $Y_{init}$; EE', r'Random $Y_{init}$; EE',
        r'Closest $Y_{init}$; new $\sigma$; EE',
        r'Random $Y_{init}$; new $\sigma$; EE'
    ]

    gd_results_file = exp_letter_test_GD.generate_letter_results_filename(
        parameters=parameters)
    with open(gd_results_file, 'rb') as f:
        (letters_y_gd_transformed,
         letters_y_gd_variance_recalc_transformed,
         letters_y_gd_transformed_random,
         letters_y_gd_variance_recalc_transformed_random,
         letters_y_gd_early_exagg_transformed_random,
         letters_y_gd_early_exagg_transformed,
         letters_y_gd_variance_recalc_early_exagg_transformed_random,
         picked_random_starting_positions,
         letters_y_gd_variance_recalc_early_exagg_transformed,
         covered_samples) = pickle.load(f)

    gd_letters_results = [
        letters_y_gd_transformed,
        letters_y_gd_transformed_random,
        letters_y_gd_variance_recalc_transformed,
        letters_y_gd_variance_recalc_transformed_random,
        letters_y_gd_early_exagg_transformed,
        letters_y_gd_early_exagg_transformed_random,
        letters_y_gd_variance_recalc_early_exagg_transformed,
        letters_y_gd_variance_recalc_early_exagg_transformed_random,
    ]

    input_time_file = exp_letter_test_GD.generate_time_results_filename(parameters)
    with open(input_time_file, 'rb') as f:
        (letters_y_time_gd_transformed,
         letters_y_time_gd_variance_recalc_transformed,
         letters_y_time_gd_transformed_random,
         letters_y_time_gd_variance_recalc_transformed_random,
         letters_y_time_gd_early_exagg_transformed_random,
         letters_y_time_gd_early_exagg_transformed,
         letters_y_time_gd_variance_recalc_early_exagg_transformed_random,
         letters_y_time_gd_variance_recalc_early_exagg_transformed,
         covered_samples) = pickle.load(f)

    gd_time = [
        np.mean(letters_y_time_gd_transformed),
        np.mean(letters_y_time_gd_transformed_random),
        np.mean(letters_y_time_gd_variance_recalc_transformed),
        np.mean(letters_y_time_gd_variance_recalc_transformed_random),
        np.mean(letters_y_time_gd_early_exagg_transformed),
        np.mean(letters_y_time_gd_early_exagg_transformed_random),
        np.mean(letters_y_time_gd_variance_recalc_early_exagg_transformed),
        np.mean(letters_y_time_gd_variance_recalc_early_exagg_transformed_random),
    ]

    gd_letters_kl = np.zeros((len(gd_method_list), len(letter_samples)))
    processed_indices = list()
    kl_gd_letters_performance_file = generate_gd_kl_temp_filename(parameters)
    if os.path.isfile(kl_gd_letters_performance_file):
        with open(kl_gd_letters_performance_file, 'rb') as f:
            gd_letters_kl, processed_indices = pickle.load(f)
    # Calculating the KL divergence increase for every sample is very slow;
    # most of the time goes into building the P-matrix.
    for i in range(len(letter_samples)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.letter_parameter_set, parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure the matrices can be loaded one by one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, letter_samples[i, :].reshape((1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)  # One file per P-matrix.
        # The P-matrix is shared across all methods; only Y differs.
        for j in range(len(gd_letters_results)):
            new_Y = np.concatenate(
                (Y_mnist, gd_letters_results[j][i, :].reshape((1, -1))), axis=0)
            gd_letters_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(
                p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_gd_letters_performance_file, 'wb') as f:
            pickle.dump((gd_letters_kl, processed_indices), f)
    gd_avg_letters_kl = np.mean(gd_letters_kl, axis=1)  # This should be fast.

    # ============== Distance percentiles
    gd_letters_percentiles_matrix = np.zeros((len(letter_samples), len(gd_method_list)))
    gd_letters_distance_matrix = np.zeros((len(letter_samples), len(gd_method_list)))
    for i in range(len(letter_samples)):
        for j in range(len(gd_method_list)):
            y = gd_letters_results[j][i, :]
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y) ** 2, axis=1)))
            gd_letters_distance_matrix[i, j] = nn_dist
            gd_letters_percentiles_matrix[i, j] = stats.percentileofscore(
                nearest_neighbors_y_dist, nn_dist)
    gd_letters_distance_percentiles = np.mean(gd_letters_percentiles_matrix, axis=0)
    gd_letters_distances = np.mean(gd_letters_distance_matrix, axis=0)
    for j in range(len(gd_method_list)):
        logging.info("%s: %f, %f", gd_method_list[j], gd_letters_distances[j],
                     gd_letters_distance_percentiles[j])

    output_file = generate_gd_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((gd_method_list, gd_time, gd_avg_letters_kl,
                     gd_letters_distance_percentiles), f)
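# --- Illustration (not part of the pipeline) ---------------------------------
# The "distance percentile" above ranks a new point's nearest-neighbor distance
# against the distribution of nearest-neighbor distances within the training
# embedding itself: ~50 means the point sits as close to the data as a typical
# training point, ~100 means it is farther than almost all of them. Toy sketch:
def distance_percentile_demo():
    rng = np.random.RandomState(0)
    Y = rng.normal(size=(100, 2))
    D = distance.squareform(distance.pdist(Y))
    np.fill_diagonal(D, np.inf)
    nn_dist = np.min(D, axis=1)                     # per-point NN distance
    y_far = np.array([5.0, 5.0])                    # a far-away query point
    d_far = np.min(np.sqrt(np.sum((Y - y_far) ** 2, axis=1)))
    print(stats.percentileofscore(nn_dist, d_far))  # close to 100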
def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    outlier_samples, _ = generate_data.load_outliers(parameters=parameters)

    nn_results_file = exp_outlier_test_NN.generate_outlier_results_filename(parameters)
    with open(nn_results_file, 'rb') as f:
        nn_outliers_results, nn_models_orig, nn_method_list = pickle.load(f)

    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Find the distance to the closest neighbor ...
    np.fill_diagonal(D_Y, np.inf)  # ... but not to the point itself.
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # D_Y is symmetric, so either axis works.

    # ================ KL DIVERGENCE ===================
    nn_outliers_kl = np.zeros((len(nn_method_list), len(outlier_samples)))
    processed_indices = list()
    kl_nn_outliers_performance_file = generate_nn_kl_temp_filename(parameters)
    if os.path.isfile(kl_nn_outliers_performance_file):
        with open(kl_nn_outliers_performance_file, 'rb') as f:
            nn_outliers_kl, processed_indices = pickle.load(f)
    # Calculating the KL divergence increase for every sample is very slow;
    # most of the time goes into building the P-matrix.
    for i in range(len(outlier_samples)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.outlier_parameter_set, parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure the matrices can be loaded one by one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, outlier_samples[i, :].reshape((1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)  # One file per P-matrix.
        # The P-matrix is shared across all methods; only Y differs.
        for j in range(len(nn_outliers_results)):
            new_Y = np.concatenate(
                (nn_models_orig[j], nn_outliers_results[j][i, :].reshape((1, -1))), axis=0)
            nn_outliers_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(
                p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_nn_outliers_performance_file, 'wb') as f:
            pickle.dump((nn_outliers_kl, processed_indices), f)
    nn_avg_outliers_kl = np.mean(nn_outliers_kl, axis=1)  # This should be fast.

    # ================ DISTANCE MATRICES ===================
    nn_outliers_percentiles_matrix = np.zeros((len(outlier_samples), len(nn_method_list)))
    nn_outliers_distance_matrix = np.zeros((len(outlier_samples), len(nn_method_list)))
    for i in range(len(outlier_samples)):
        for j in range(len(nn_method_list)):
            y = nn_outliers_results[j][i, :]
            nn_dist = np.min(np.sqrt(np.sum((nn_models_orig[j] - y) ** 2, axis=1)))
            nn_outliers_distance_matrix[i, j] = nn_dist
            nn_outliers_percentiles_matrix[i, j] = stats.percentileofscore(
                nearest_neighbors_y_dist, nn_dist)
    nn_outliers_distance_percentiles = np.mean(nn_outliers_percentiles_matrix, axis=0)
    nn_outliers_distances = np.mean(nn_outliers_distance_matrix, axis=0)
    for j in range(len(nn_method_list)):
        logging.info("%s: %f, %f", nn_method_list[j], nn_outliers_distances[j],
                     nn_outliers_distance_percentiles[j])

    output_file = generate_nn_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((nn_method_list, nn_avg_outliers_kl,
                     nn_outliers_distance_percentiles), f)
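# --- Illustration (not part of the pipeline) ---------------------------------
# The resume-after-interruption pattern used in the KL loops above, in
# isolation: results plus the set of processed indices are pickled after every
# item, so a restarted run skips what is already done. work() is a hypothetical
# placeholder for the expensive per-sample computation.
def run_with_checkpoints(n_items, checkpoint_file, work):
    results, done = {}, set()
    if os.path.isfile(checkpoint_file):
        with open(checkpoint_file, 'rb') as f:
            results, done = pickle.load(f)
    for i in range(n_items):
        if i in done:
            continue  # already computed in a previous run
        results[i] = work(i)
        done.add(i)
        with open(checkpoint_file, 'wb') as f:
            pickle.dump((results, done), f)
    return results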
def main(parameters=settings.parameters, regenerate=False):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    letter_A_samples, _ = generate_data.load_A_letters(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)

    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Find the distance to the closest neighbor ...
    np.fill_diagonal(D_Y, np.inf)  # ... but not to the point itself.
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # D_Y is symmetric, so either axis works.

    kernelized_results_file = exp_letter_A_test_kernelized.generate_letter_A_results_filename(
        parameters)
    with open(kernelized_results_file, 'rb') as f:
        (kernelized_detailed_method_results, kernelized_detailed_tsne_time,
         kernelized_detailed_method_list) = pickle.load(f)
    ind = [4, 24, 49]
    kernelized_method_list = [
        kernelized_detailed_method_list[i][:10] + kernelized_detailed_method_list[i][-8:]
        for i in ind
    ]
    kernelized_letters_results = [kernelized_detailed_method_results[i] for i in ind]

    # =========== DISTANCE PERCENTILES ==========
    kernelized_letters_percentiles_matrix = np.zeros(
        (len(letter_A_samples), len(kernelized_method_list)))
    kernelized_letters_distance_matrix = np.zeros(
        (len(letter_A_samples), len(kernelized_method_list)))
    for i in range(len(letter_A_samples)):
        for j in range(len(kernelized_method_list)):
            y = kernelized_letters_results[j][i, :]
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y) ** 2, axis=1)))
            kernelized_letters_distance_matrix[i, j] = nn_dist
            kernelized_letters_percentiles_matrix[i, j] = stats.percentileofscore(
                nearest_neighbors_y_dist, nn_dist)
    kernelized_letters_distance_percentiles = np.mean(
        kernelized_letters_percentiles_matrix, axis=0)
    kernelized_letters_distances = np.mean(kernelized_letters_distance_matrix, axis=0)
    kernelized_per_item_time = kernelized_detailed_tsne_time / len(letter_A_samples)
    for j in range(len(kernelized_method_list)):
        logging.info("%s: %f, %f", kernelized_method_list[j],
                     kernelized_letters_distances[j],
                     kernelized_letters_distance_percentiles[j])

    # =========== KL DIVERGENCE ==========
    kernelized_letters_kl = np.zeros(
        (len(kernelized_method_list), len(letter_A_samples)))
    processed_indices = list()
    kl_kernelized_tsne_letters_performance_file = generate_kernelized_kl_temp_filename(
        parameters)
    if os.path.isfile(kl_kernelized_tsne_letters_performance_file) and not regenerate:
        with open(kl_kernelized_tsne_letters_performance_file, 'rb') as f:
            kernelized_letters_kl, processed_indices = pickle.load(f)
    # Calculating the KL divergence increase for every sample is very slow;
    # most of the time goes into building the P-matrix.
    for i in range(len(letter_A_samples)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.letter_A_parameter_set, parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure the matrices can be loaded one by one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, letter_A_samples[i, :].reshape((1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)  # One file per P-matrix.
        # The P-matrix is shared across all methods; only Y differs.
        for j in range(len(kernelized_letters_results)):
            new_Y = np.concatenate(
                (Y_mnist, kernelized_letters_results[j][i, :].reshape((1, -1))), axis=0)
            kernelized_letters_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(
                p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_kernelized_tsne_letters_performance_file, 'wb') as f:
            pickle.dump((kernelized_letters_kl, processed_indices), f)
    kernelized_avg_letters_kl = np.mean(kernelized_letters_kl, axis=1)  # This should be fast.

    output_file = generate_kernelized_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((kernelized_method_list, kernelized_avg_letters_kl,
                     kernelized_per_item_time,
                     kernelized_letters_distance_percentiles), f)
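# --- Illustration (not part of the pipeline) ---------------------------------
# What a routine like lion_tsne.get_p_and_sigma has to do, in outline: for each
# point, binary-search a Gaussian bandwidth so that the conditional
# distribution p_{j|i} hits the requested perplexity, then symmetrize. A
# compact sketch; the actual lion_tsne implementation may differ in
# normalization and tolerances.
def p_matrix_sketch(D, perplexity, tol=1e-5, max_iter=50):
    n = D.shape[0]
    target = np.log(perplexity)            # perplexity = exp(Shannon entropy)
    P = np.zeros((n, n))
    for i in range(n):
        d = np.delete(D[i] ** 2, i)        # squared distances, self excluded
        lo, hi, beta = 0.0, np.inf, 1.0    # beta = 1 / (2 * sigma_i ** 2)
        for _ in range(max_iter):
            p = np.exp(-beta * (d - d.min()))  # shift for numerical stability
            p /= p.sum()
            entropy = -np.sum(p * np.log(np.maximum(p, 1e-12)))
            if abs(entropy - target) < tol:
                break
            if entropy > target:           # too flat -> narrower kernel
                lo = beta
                beta = beta * 2 if hi == np.inf else (beta + hi) / 2
            else:                          # too peaked -> wider kernel
                hi = beta
                beta = (lo + beta) / 2
        P[i, np.arange(n) != i] = p
    return (P + P.T) / (2 * n)             # symmetrized joint distribution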