def generate_kernelized_tsne_mapping_function(parameters=settings.parameters, regenerate_parameters_cache=False ):
    """Build a kernel-tSNE mapping closure over the cached embedder coefficients.

    Loads the MNIST input/embedding matrices and the per-k coefficient cache,
    then returns a function that maps new high-dimensional samples into the
    tSNE embedding space via normalized Gaussian kernel regression.

    :param parameters: experiment parameter dict (defaults to settings.parameters).
    :param regenerate_parameters_cache: if True, recompute the coefficient cache.
    :return: callable kernel_tsne_mapping(x, k=1) -> embedded coordinates.
    """
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    kernelized_tsne_parameters_cache = load_all_kernelized_tsne_embedders(
        parameters=parameters,
        regenerate_parameters_cache=regenerate_parameters_cache)

    def kernel_tsne_mapping(x, k=1):
        '''
        Getting kernel tSNE. Starting from scratch, so use all data at once.

        x : 2D array of samples (rows) in the original input space.
        k : sigma multiplier; selects the cache entry keyed by "%.2f" % k.
        Returns a (len(x), Y_dim) array of embedded coordinates.
        '''
        # Let's go for reliable option.
        # Cache keys are the multiplier formatted to two decimals.
        cache = kernelized_tsne_parameters_cache["%.2f" % k]
        y = np.zeros((x.shape[0], Y_mnist.shape[1]))
        for i in range(len(x)):
            # Squared Euclidean distances from sample i to every training point.
            square_distances = np.sum((X_mnist - x[i, :])**2, axis=1).reshape(
                (1, -1))
            # Gaussian kernel with the per-training-point sigmas from the cache.
            kernel_values = np.exp(-square_distances / (2 * cache['sigma']**2))
            # Normalize so the kernel weights sum to 1 (row-stochastic).
            kernel_values = kernel_values / np.sum(kernel_values)
            # Linear combination of the precomputed coefficients gives the embedding.
            y[i, :] = kernel_values.dot(cache['coefs']).reshape(
                (-1, Y_mnist.shape[1]))
        return y

    return kernel_tsne_mapping
def get_common_info(parameters):
    """Gather the datasets shared by the letter experiments.

    Returns a dict with the trained dTSNE model, the input/embedding
    matrices, the letter samples, and each embedded point's distance to
    its nearest neighbor in Y-space.
    """
    dtsne_model = generate_data.load_dtsne_mnist(parameters=parameters)
    x_data = generate_data.load_x_mnist(parameters=parameters)
    y_data = generate_data.load_y_mnist(parameters=parameters)
    samples, _, _ = generate_data.load_letters(parameters=parameters)

    # Pairwise distances in the embedding; a point is not its own neighbor,
    # so the diagonal is masked out before taking the row-wise minimum.
    pairwise_y = distance.squareform(distance.pdist(y_data))
    np.fill_diagonal(pairwise_y, np.inf)

    return {
        'dTSNE_mnist': dtsne_model,
        'X_mnist': x_data,
        'Y_mnist': y_data,
        'letter_samples': samples,
        'nearest_neighbors_y_dist': np.min(pairwise_y, axis=1),
    }
def load_all_kernelized_tsne_embedders(parameters=settings.parameters, regenerate_parameters_cache=False):
    """Compute (or load from cache) kernel-tSNE coefficients for a grid of sigma multipliers.

    For each multiplier k in {0.01, 0.02, ..., 2.00} the per-sample kernel
    width is k times that sample's nearest-neighbor distance in X-space.
    The linear coefficients mapping normalized kernel values to Y_mnist are
    obtained by inverting the row-normalized kernel matrix.

    :param parameters: experiment parameter dict (defaults to settings.parameters).
    :param regenerate_parameters_cache: if True, recompute even if a cache file exists.
    :return: dict keyed by "%.2f" % k, each entry holding 'coefs' and 'sigma'.
    """
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    # Implementing carefully. Not the fastest, but the most reliable way.
    kernelized_tsne_parameters_cache = dict()
    cache_filename = generate_cache_filename(parameters=parameters)
    if not regenerate_parameters_cache and os.path.isfile(cache_filename):
        with open(cache_filename, 'rb') as f:
            kernelized_tsne_parameters_cache = pickle.load(f)
    else:
        D = distance.squareform(distance.pdist(X_mnist))
        step = 0.01
        choice_K = np.arange(step, 2 + step, step)  # Let's try those K.
        np.fill_diagonal(D, np.inf)
        closest_neighbor_dist = np.min(D, axis=1).reshape((1, -1))
        np.fill_diagonal(D, 0)
        # Sigma is a multiplier over closest NN distance.
        for k in choice_K:
            key = "%.2f" % k
            # BUG FIX: the cache is keyed by the formatted string, not the
            # float, so the old check `k not in cache` could never hit an
            # existing entry. Test the actual key instead.
            if key not in kernelized_tsne_parameters_cache or regenerate_parameters_cache:
                kernelized_tsne_parameters_cache[key] = dict()
                # Creating matrix to get coefficients using SLE.
                sigma_matrix = k * np.repeat(
                    closest_neighbor_dist, X_mnist.shape[0], axis=0)
                kernel_matrix = np.exp(-D**2 / (2 * sigma_matrix**2))
                # Normalizing by rows so each row of weights sums to 1.
                kernel_matrix = kernel_matrix / np.sum(
                    kernel_matrix, axis=1).reshape((-1, 1))
                coefs = np.linalg.inv(kernel_matrix).dot(Y_mnist)
                kernelized_tsne_parameters_cache[key]['coefs'] = coefs
                # All rows of sigma_matrix are identical; keep one.
                kernelized_tsne_parameters_cache[key]['sigma'] = sigma_matrix[0, :]
                logging.info("Got coefs for coefficient %f", k)
        with open(cache_filename, 'wb') as f:
            pickle.dump(kernelized_tsne_parameters_cache, f)
    return kernelized_tsne_parameters_cache
def get_common_info(parameters):
    """Collect the datasets and settings shared by the cluster-attribution experiments.

    Returns a dict with the dTSNE model, input/embedding matrices, labels,
    the picked neighbor samples with their labels, the accuracy-NN setting,
    and each embedded point's nearest-neighbor distance in Y-space.
    """
    info = {
        'dTSNE_mnist': generate_data.load_dtsne_mnist(parameters=parameters),
        'Y_mnist': generate_data.load_y_mnist(parameters=parameters),
        'X_mnist': generate_data.load_x_mnist(parameters=parameters),
        'labels_mnist': generate_data.load_labels_mnist(
            parameters=parameters),
        'picked_neighbors': generate_data.load_picked_neighbors(
            parameters=parameters),
        'picked_neighbors_labels': generate_data.load_picked_neighbors_labels(
            parameters=parameters),
        # Fall back to the global default when the parameter dict lacks it.
        'accuracy_nn': parameters.get("accuracy_nn",
                                      settings.parameters["accuracy_nn"]),
    }

    # Distance to the closest *other* point in the embedding: mask the
    # diagonal so a point's zero self-distance never wins the minimum.
    pairwise_y = distance.squareform(distance.pdist(info['Y_mnist']))
    np.fill_diagonal(pairwise_y, np.inf)
    info['nearest_neighbors_y_dist'] = np.min(pairwise_y, axis=1)

    return info
def main(parameters=settings.parameters, regenerate=False):
    """Compute precision for every method found in the cluster result files.

    Loads previously saved cluster-attribution results (one pickle per file
    prefix in the module-level `original_files_prefixes`), computes precision
    for each method via `calc_precision`, and saves the accumulated
    method -> precision dict to the output file for `output_prefix`.

    :param parameters: experiment parameter dict (defaults to settings.parameters).
    :param regenerate: if True, recompute precision even for methods already present
                       in a previous output file.
    """
    picked_neighbors = generate_data.load_picked_neighbors(
        parameters=parameters)
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    result = dict()
    output_file = \
        cluster_lion_RBF_IDW_commons.generate_cluster_results_filename(output_prefix, parameters)
    # Resume from a previous run unless regeneration is requested.
    if os.path.isfile(output_file) and not regenerate:
        with open(output_file, "rb") as f:
            result = pickle.load(f)
        logging.info("Previous result loaded")
    else:
        logging.info("No previous result or regeneration requested")
    for fname_prefix in original_files_prefixes:
        cluster_results_file = \
            cluster_lion_RBF_IDW_commons.generate_cluster_results_filename(fname_prefix, parameters)
        logging.info("Processing file: %s", cluster_results_file)
        with open(cluster_results_file, 'rb') as f:
            res = pickle.load(f)
        # Each key of `res` is a method name; only recompute what is missing.
        for i in res.keys():
            logging.info("Processing method: %s", i)
            if i not in result or regenerate:
                precision = calc_precision(res[i]["EmbeddedPoints"], X_mnist,
                                           Y_mnist, picked_neighbors,
                                           precision_nn)
                logging.info("%s precision: %f (accuracy was %f)", i,
                             precision, res[i]["Accuracy"])
                result[i] = precision
    with open(output_file, "wb") as f:
        pickle.dump(result, f)
# Reproducibility was lost at some point because PCA needed a random state
# that was never locked (some PCA solvers use randomized approximation).
# This script tries a handful of seeds and compares each resulting
# projection against the previously generated one, hoping to recover the
# old seed by eyeballing the max absolute difference. A long shot, but
# cheap to try.
import matplotlib.pyplot as plt
import generate_data
from matplotlib.font_manager import FontProperties
import settings
import os
import logging
import numpy as np

regenerate = False

logging.basicConfig(level=logging.INFO)

# Projection produced before the seed was lost.
baseline_projection = generate_data.load_x_mnist()

for seed in range(10):
    candidate_parameters = settings.parameters.copy()
    candidate_parameters["pca_random_seed"] = seed
    candidate_projection = generate_data.load_x_mnist(parameters=candidate_parameters)
    # Zero (or near-zero) difference would mean this seed reproduces the old run.
    print(seed, np.max(np.abs(baseline_projection - candidate_projection)))
def main(parameters=settings.parameters):
    """Post-process kernelized-tSNE cluster-attribution results.

    Computes distance percentiles, accuracy, precision, per-item time and
    KL-divergence for three selected kernel multipliers, then pickles the
    summary to the postprocess output file.

    BUG FIX: the per-sample P-matrix cache file was selected with the stale
    loop index `j` left over from the accuracy loop, so every sample reused
    (and overwrote) the same file. It now uses the sample index `i`, matching
    the letters version of this pipeline.

    :param parameters: experiment parameter dict (defaults to settings.parameters).
    """
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    accuracy_nn = parameters.get("accuracy_nn", settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn", settings.parameters["precision_nn"])
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(parameters=parameters)
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)

    def get_nearest_neighbors_in_y(y, data, n=10):
        # Indices of the n points of `data` closest to `y` (squared Euclidean).
        # Parameter renamed from Y_mnist to avoid shadowing the outer matrix;
        # it is also called with X_mnist.
        y_distances = np.sum((data - y) ** 2, axis=1)
        return np.argsort(y_distances)[:n]

    kernelized_results_file = exp_cluster_attr_test_kernelized.generate_cluster_results_filename(parameters)
    with open(kernelized_results_file, 'rb') as f:
        (kernelized_detailed_tsne_method_results, kernelized_detailed_tsne_accuracy,
         kernelized_detailed_tsne_precision, kernelized_detailed_tsne_time,
         kernelized_detailed_tsne_method_list) = pickle.load(f)

    # Pick three representative kernel multipliers out of the detailed sweep.
    ind = [4, 24, 49]
    kernelized_method_list = [
        kernelized_detailed_tsne_method_list[i][:10] + kernelized_detailed_tsne_method_list[i][-8:]
        for i in ind]
    kernelized_method_results = [kernelized_detailed_tsne_method_results[i] for i in ind]
    kernelized_accuracy = np.zeros(len(kernelized_method_list))
    kernelized_precision = np.zeros(len(kernelized_method_list))
    kernelized_per_item_time = np.zeros(len(kernelized_method_list))

    # ============================== Distance percentiles
    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Distance to closest neighbor — but not to itself, hence the inf diagonal.
    np.fill_diagonal(D_Y, np.inf)
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # Any axis works: D_Y is symmetric.
    kernelized_nearest_neighbors_percentiles_matrix = np.zeros(
        (len(picked_neighbors), len(kernelized_method_list)))
    for i in range(len(picked_neighbors)):
        for j in range(len(kernelized_method_list)):
            y = kernelized_method_results[j][i, :]
            kernelized_dist = np.min(np.sqrt(np.sum((Y_mnist - y) ** 2, axis=1)))
            kernelized_nearest_neighbors_percentiles_matrix[i, j] = stats.percentileofscore(
                nearest_neighbors_y_dist, kernelized_dist)
    kernelized_distance_percentiles = np.mean(kernelized_nearest_neighbors_percentiles_matrix, axis=0)
    for j in range(len(kernelized_method_list)):
        logging.info("%s %f", kernelized_method_list[j], kernelized_distance_percentiles[j])

    # ============================== Accuracy and precision
    for j in range(len(kernelized_method_results)):
        per_sample_accuracy = np.zeros(len(picked_neighbors))
        per_sample_precision = np.zeros(len(picked_neighbors))
        for i in range(len(picked_neighbors)):
            expected_label = picked_neighbor_labels[i]
            y = kernelized_method_results[j][i, :]
            x = picked_neighbors[i, :]
            # Precision: overlap between NNs in input space and embedding space.
            nn_x_indices = get_nearest_neighbors_in_y(x, X_mnist, n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y, Y_mnist, n=precision_nn)
            matching_indices = len([k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = matching_indices / precision_nn
            # Accuracy: fraction of embedding-space NNs sharing the expected label.
            kernelized_indices = get_nearest_neighbors_in_y(y, Y_mnist, n=accuracy_nn)
            obtained_labels = labels_mnist[kernelized_indices]
            per_sample_accuracy[i] = sum(obtained_labels == expected_label) / len(obtained_labels)
        kernelized_accuracy[j] = np.mean(per_sample_accuracy)
        kernelized_precision[j] = np.mean(per_sample_precision)
        # NOTE(review): this indexes the detailed time list with the reduced
        # index j (0..2); if times are stored per detailed method it should
        # arguably be kernelized_detailed_tsne_time[ind[j]] — confirm against
        # the producer of the results file. Behavior kept as-is.
        kernelized_per_item_time[j] = kernelized_detailed_tsne_time[j] / len(picked_neighbors)
        logging.info("%s :\t%f\t%f", kernelized_method_list[j],
                     kernelized_precision[j], kernelized_accuracy[j])

    # ============================== KL divergence
    kernelized_kl = np.zeros((len(kernelized_method_list), len(picked_neighbors)))
    processed_indices = list()
    kl_kernelized_performance_file = generate_kernelized_kl_temp_filename(parameters)
    if os.path.isfile(kl_kernelized_performance_file):
        with open(kl_kernelized_performance_file, 'rb') as f:
            kernelized_kl, processed_indices = pickle.load(f)
    # KL divergence for all samples is very slow to calculate; the dominant
    # cost is building the P-matrix, which is cached per sample on disk.
    for i in range(len(picked_neighbors)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.x_neighbors_selection_parameter_set,
            parameters, os.sep)
        # BUG FIX: was str(j) — a stale index from the loop above, so every
        # sample mapped to the same cache file. Each sample has its own file.
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, picked_neighbors[i, :].reshape((1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods for this sample.
        for j in range(len(kernelized_method_results)):
            new_Y = np.concatenate(
                (Y_mnist, kernelized_method_results[j][i, :].reshape((1, -1))), axis=0)
            kernelized_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        # Checkpoint after every sample so an interrupted run can resume.
        with open(kl_kernelized_performance_file, 'wb') as f:
            pickle.dump((kernelized_kl, processed_indices), f)

    # This should be fast.
    kernelized_avg_kl = np.mean(kernelized_kl, axis=1)
    output_file = generate_kernelized_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((kernelized_method_list, kernelized_accuracy, kernelized_precision,
                     kernelized_avg_kl, kernelized_per_item_time,
                     kernelized_distance_percentiles), f)
def main(parameters=settings.parameters):
    """Post-process gradient-descent cluster-attribution results.

    Computes distance percentiles, accuracy, precision and KL-divergence for
    the eight GD variants, then pickles the summary to the postprocess file.

    BUG FIX: the per-sample P-matrix cache file was selected with the stale
    loop index `j` from the preceding accuracy loop, so every sample reused
    (and overwrote) the same file. It now uses the sample index `i`, matching
    the letters version of this pipeline.

    :param parameters: experiment parameter dict (defaults to settings.parameters).
    """
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(
        parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    accuracy_nn = parameters.get("accuracy_nn",
                                 settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(
        parameters=parameters)
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)

    def get_nearest_neighbors_in_y(y, data, n=10):
        # Indices of the n points of `data` closest to `y` (squared Euclidean).
        # Parameter renamed from Y_mnist to avoid shadowing; also called with X_mnist.
        y_distances = np.sum((data - y)**2, axis=1)
        return np.argsort(y_distances)[:n]

    gd_method_list = [
        r'Closest $Y_{init}$', r'Random $Y_{init}$',
        r'Closest $Y_{init}$; new $\sigma$', r'Random $Y_{init}$; new $\sigma$',
        r'Closest $Y_{init}$; EE', r'Random $Y_{init}$; EE',
        r'Closest $Y_{init}$; new $\sigma$; EE',
        r'Random $Y_{init}$; new $\sigma$; EE'
    ]
    gd_results_file = exp_cluster_attr_test_GD.generate_cluster_results_filename(
        parameters=parameters)
    with open(gd_results_file, 'rb') as f:
        (picked_neighbors_y_gd_transformed,
         picked_neighbors_y_gd_variance_recalc_transformed,
         picked_neighbors_y_gd_transformed_random,
         picked_neighbors_y_gd_variance_recalc_transformed_random,
         picked_neighbors_y_gd_early_exagg_transformed_random,
         picked_neighbors_y_gd_early_exagg_transformed,
         picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
         picked_random_starting_positions,
         picked_neighbors_y_gd_variance_recalc_early_exagg_transformed,
         covered_samples) = pickle.load(f)
    # Order must match gd_method_list above.
    gd_method_results = [
        picked_neighbors_y_gd_transformed,
        picked_neighbors_y_gd_transformed_random,
        picked_neighbors_y_gd_variance_recalc_transformed,
        picked_neighbors_y_gd_variance_recalc_transformed_random,
        picked_neighbors_y_gd_early_exagg_transformed,
        picked_neighbors_y_gd_early_exagg_transformed_random,
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed,
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
    ]
    input_time_file = exp_cluster_attr_test_GD.generate_time_results_filename(
        parameters)
    with open(input_time_file, 'rb') as f:
        (picked_neighbors_y_time_gd_transformed,
         picked_neighbors_y_time_gd_variance_recalc_transformed,
         picked_neighbors_y_time_gd_transformed_random,
         picked_neighbors_y_time_gd_variance_recalc_transformed_random,
         picked_neighbors_y_time_gd_early_exagg_transformed_random,
         picked_neighbors_y_time_gd_early_exagg_transformed,
         picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random,
         picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed,
         covered_samples) = pickle.load(f)
    gd_time = [
        np.mean(picked_neighbors_y_time_gd_transformed),
        np.mean(picked_neighbors_y_time_gd_transformed_random),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_transformed),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_transformed_random),
        np.mean(picked_neighbors_y_time_gd_early_exagg_transformed),
        np.mean(picked_neighbors_y_time_gd_early_exagg_transformed_random),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random),
    ]
    gd_accuracy = np.zeros(len(gd_method_list))
    gd_precision = np.zeros(len(gd_method_list))

    # ============================== Distance percentiles
    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Distance to the closest neighbor — but not to itself.
    np.fill_diagonal(D_Y, np.inf)
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # D_Y symmetric; any axis works.
    gd_nearest_neighbors_percentiles_matrix = np.zeros(
        (len(picked_neighbors), len(gd_method_list)))
    for i in range(len(picked_neighbors)):
        for j in range(len(gd_method_list)):
            y = gd_method_results[j][i, :]
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y)**2, axis=1)))
            gd_nearest_neighbors_percentiles_matrix[i, j] = \
                stats.percentileofscore(nearest_neighbors_y_dist, nn_dist)
    gd_distance_percentiles = np.mean(gd_nearest_neighbors_percentiles_matrix,
                                      axis=0)
    for j in range(len(gd_method_list)):
        logging.info("%s :\t%f", gd_method_list[j], gd_distance_percentiles[j])

    # ============================== Accuracy and precision
    # (This section header previously read "KL divergence", which was misleading.)
    for j in range(len(gd_method_results)):
        per_sample_accuracy = np.zeros(len(picked_neighbors))
        per_sample_precision = np.zeros(len(picked_neighbors))
        for i in range(len(picked_neighbors)):
            expected_label = picked_neighbor_labels[i]
            # Accuracy: fraction of embedding-space NNs with the expected label.
            nn_indices = get_nearest_neighbors_in_y(gd_method_results[j][i, :],
                                                    Y_mnist, n=accuracy_nn)
            obtained_labels = labels_mnist[nn_indices]
            per_sample_accuracy[i] = sum(
                obtained_labels == expected_label) / len(obtained_labels)
            # Precision: overlap between NNs in input space and embedding space.
            x = picked_neighbors[i, :]
            y = gd_method_results[j][i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x, X_mnist, n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y, Y_mnist, n=precision_nn)
            # Comprehension variable renamed: the original reused the loop name `i`.
            matching_indices = len([k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = matching_indices / precision_nn
        gd_accuracy[j] = np.mean(per_sample_accuracy)
        gd_precision[j] = np.mean(per_sample_precision)
        logging.info("%s :\t%f\t%f", gd_method_list[j], gd_precision[j],
                     gd_accuracy[j])

    # ============================== KL divergence
    gd_kl = np.zeros((len(gd_method_list), len(picked_neighbors)))
    processed_indices = list()
    kl_gd_performance_file = generate_gd_kl_temp_filename(parameters)
    if os.path.isfile(kl_gd_performance_file):
        with open(kl_gd_performance_file, 'rb') as f:
            gd_kl, processed_indices = pickle.load(f)
    # KL divergence for all samples is very slow; the dominant cost is the
    # P-matrix, which is cached per sample on disk.
    for i in range(len(picked_neighbors)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.x_neighbors_selection_parameter_set,
            parameters, os.sep)
        # BUG FIX: was str(j) — a stale index from the loop above, so every
        # sample mapped to the same cache file. Each sample has its own file.
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate(
                (X_mnist, picked_neighbors[i, :].reshape((1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods for this sample.
        for j in range(len(gd_method_results)):
            new_Y = np.concatenate(
                (Y_mnist, gd_method_results[j][i, :].reshape((1, -1))), axis=0)
            gd_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(p_matrix=new_P,
                                                                  y=new_Y)
        processed_indices.append(i)
        # Checkpoint after every sample so an interrupted run can resume.
        with open(kl_gd_performance_file, 'wb') as f:
            pickle.dump((gd_kl, processed_indices), f)

    # This should be fast.
    gd_avg_kl = np.mean(gd_kl, axis=1)
    output_file = generate_gd_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((gd_method_list, gd_accuracy, gd_precision, gd_time,
                     gd_avg_kl, gd_distance_percentiles), f)
def main(parameters=settings.parameters):
    """Post-process letter-sample gradient-descent results.

    Computes per-method KL-divergence (with per-sample on-disk P-matrix
    caching and resumable checkpoints), average per-method times, and
    nearest-neighbor distance percentiles for the eight GD variants, then
    pickles the summary tuple to the postprocess output file.

    :param parameters: experiment parameter dict (defaults to settings.parameters).
    """
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    letter_samples, _, _ = generate_data.load_letters(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # Actually, whatever axis
    # ============== KL Divergence
    gd_method_list = [
        r'Closest $Y_{init}$', r'Random $Y_{init}$',
        r'Closest $Y_{init}$; new $\sigma$', r'Random $Y_{init}$; new $\sigma$',
        r'Closest $Y_{init}$; EE', r'Random $Y_{init}$; EE',
        r'Closest $Y_{init}$; new $\sigma$; EE',
        r'Random $Y_{init}$; new $\sigma$; EE'
    ]
    gd_results_file = exp_letter_test_GD.generate_letter_results_filename(
        parameters=parameters)
    with open(gd_results_file, 'rb') as f:
        (letters_y_gd_transformed, letters_y_gd_variance_recalc_transformed,
         letters_y_gd_transformed_random,
         letters_y_gd_variance_recalc_transformed_random,
         letters_y_gd_early_exagg_transformed_random,
         letters_y_gd_early_exagg_transformed,
         letters_y_gd_variance_recalc_early_exagg_transformed_random,
         picked_random_starting_positions,
         letters_y_gd_variance_recalc_early_exagg_transformed,
         covered_samples) = pickle.load(f)
    # Order must match gd_method_list above.
    gd_letters_results = [
        letters_y_gd_transformed,
        letters_y_gd_transformed_random,
        letters_y_gd_variance_recalc_transformed,
        letters_y_gd_variance_recalc_transformed_random,
        letters_y_gd_early_exagg_transformed,
        letters_y_gd_early_exagg_transformed_random,
        letters_y_gd_variance_recalc_early_exagg_transformed,
        letters_y_gd_variance_recalc_early_exagg_transformed_random,
    ]
    input_time_file = exp_letter_test_GD.generate_time_results_filename(
        parameters)
    with open(input_time_file, 'rb') as f:
        letters_y_time_gd_transformed, letters_y_time_gd_variance_recalc_transformed, \
            letters_y_time_gd_transformed_random, \
            letters_y_time_gd_variance_recalc_transformed_random, \
            letters_y_time_gd_early_exagg_transformed_random, \
            letters_y_time_gd_early_exagg_transformed, \
            letters_y_time_gd_variance_recalc_early_exagg_transformed_random, \
            letters_y_time_gd_variance_recalc_early_exagg_transformed, covered_samples = pickle.load(f)
    # Mean wall-clock time per method, in gd_method_list order.
    gd_time = [
        np.mean(letters_y_time_gd_transformed),
        np.mean(letters_y_time_gd_transformed_random),
        np.mean(letters_y_time_gd_variance_recalc_transformed),
        np.mean(letters_y_time_gd_variance_recalc_transformed_random),
        np.mean(letters_y_time_gd_early_exagg_transformed),
        np.mean(letters_y_time_gd_early_exagg_transformed_random),
        np.mean(letters_y_time_gd_variance_recalc_early_exagg_transformed),
        np.mean(
            letters_y_time_gd_variance_recalc_early_exagg_transformed_random),
    ]
    gd_letters_kl = np.zeros((len(gd_method_list), len(letter_samples)))
    processed_indices = list()
    kl_gd_letters_performance_file = generate_gd_kl_temp_filename(parameters)
    # Resume from checkpoint if a previous run was interrupted.
    if os.path.isfile(kl_gd_letters_performance_file):
        with open(kl_gd_letters_performance_file, 'rb') as f:
            gd_letters_kl, processed_indices = pickle.load(f)
    # KL divergence increase for all 1000 samples is very slow to calculate.
    # Main part of that is calculating P-matrix.
    per_sample_KL = np.zeros((len(letter_samples), ))
    for i in range(len(letter_samples)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.letter_parameter_set,
            parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, letter_samples[i, :].reshape(
                (1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # For all of methods P-matrix is shared.
        for j in range(len(gd_letters_results)):
            # Single file with p matrix
            new_Y = np.concatenate(
                (Y_mnist, gd_letters_results[j][i, :].reshape((1, -1))),
                axis=0)
            gd_letters_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(
                p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        # Checkpoint after every sample so progress survives interruption.
        with open(kl_gd_letters_performance_file, 'wb') as f:
            pickle.dump((gd_letters_kl, processed_indices), f)
    # This should be fast
    gd_avg_letters_kl = np.mean(gd_letters_kl, axis=1)
    # ============== Distance percentiles
    gd_letters_percentiles_matrix = np.zeros(
        (len(letter_samples), len(gd_method_list)))
    gd_letters_distance_matrix = np.zeros(
        (len(letter_samples), len(gd_method_list)))
    for i in range(len(letter_samples)):
        for j in range(len(gd_method_list)):
            y = gd_letters_results[j][i, :]
            # Distance from this embedded letter to its closest training point.
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y)**2, axis=1)))
            gd_letters_distance_matrix[i, j] = nn_dist
            gd_letters_percentiles_matrix[i, j] = stats.percentileofscore(
                nearest_neighbors_y_dist, nn_dist)
    gd_letters_distance_percentiles = np.mean(gd_letters_percentiles_matrix,
                                              axis=0)
    gd_letters_distances = np.mean(gd_letters_distance_matrix, axis=0)
    for j in range(len(gd_method_list)):
        logging.info("%s: %f, %f", gd_method_list[j], gd_letters_distances[j],
                     gd_letters_distance_percentiles[j])
    output_file = generate_gd_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((gd_method_list, gd_time, gd_avg_letters_kl,
                     gd_letters_distance_percentiles), f)
def generate_idw_power_performance(*, regenerate=False, recursive_regenerate=False, parameters=settings.parameters):
    """Sweep IDW powers, measuring leave-one-out error, accuracy and precision.

    For each power p: embeds the picked neighbors with an IDW interpolator to
    get accuracy/precision, and reconstructs every training point from all
    other points (leave-one-out) to measure embedding-space error. Results
    are cached to disk; the optimal power (minimum error) is saved for plotting.

    Bug fixes vs. the previous version:
    - the cache reload unpacked 3 values while the dump writes a 4-tuple
      (precision was missing), raising ValueError on every cached run;
    - the per-point error squared the *sum* of signed coordinate differences
      (np.sum(diff)**2); it now sums squared differences, i.e. the squared
      Euclidean distance, and the "abs" variant is its square root;
    - labels were loaded with hard-wired settings.parameters instead of the
      function's `parameters`.

    :param regenerate: if True, recompute even when a cached result file exists.
    :param recursive_regenerate: forwarded to the data loaders to regenerate inputs.
    :param parameters: experiment parameter dict (defaults to settings.parameters).
    """
    global_idw_power_performance = dict()  # Start from scratch
    global_idw_power_performance_abs = dict()  # Start from scratch
    global_idw_accuracy = dict()
    global_idw_precision = dict()
    start_time = datetime.datetime.now()
    logging.info("IDW power experiment started: %s", start_time)
    idw_power_performance_file = generate_idw_power_filename(parameters)
    idw_power_plot_file = generate_idw_power_plot_filename(parameters)
    X_mnist = generate_data.load_x_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    Y_mnist = generate_data.load_y_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(
        parameters=parameters)
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(
        parameters=parameters)
    # BUG FIX: was parameters=settings.parameters, ignoring the caller's dict.
    labels_mnist = generate_data.load_labels_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    accuracy_nn = parameters.get("accuracy_nn",
                                 settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])

    def get_nearest_neighbors_in_y(y, data, n=10):
        # Indices of the n points of `data` closest to `y` (squared Euclidean).
        y_distances = np.sum((data - y)**2, axis=1)
        return np.argsort(y_distances)[:n]

    distance_matrix = distance.squareform(distance.pdist(X_mnist))
    np.fill_diagonal(distance_matrix, np.inf)  # Not interested in distance to itself
    nn_x_distance = np.min(distance_matrix, axis=1)  # Symmetric; any axis will do
    radius_x = dict()
    for p in idw_percentile_options:
        radius_x[p] = np.percentile(nn_x_distance, p)

    if os.path.isfile(idw_power_performance_file) and not regenerate:
        with open(idw_power_performance_file, 'rb') as f:
            # BUG FIX: the dump below writes a 4-tuple; unpacking only three
            # values raised ValueError whenever a cached file was present.
            (global_idw_power_performance, global_idw_power_performance_abs,
             global_idw_accuracy, global_idw_precision) = pickle.load(f)
    else:
        logging.info("Regeneration requested")

    for p in idw_power_options:
        if p in global_idw_power_performance:
            logging.info("Loaded p %f", p)
            continue
        logging.info("Processing p %f", p)
        interpolator = dTSNE_mnist.generate_embedding_function(
            embedding_function_type='weighted-inverse-distance',
            function_kwargs={'power': p})
        per_sample_accuracy = np.zeros(len(picked_neighbors))
        per_sample_precision = np.zeros(len(picked_neighbors))
        for i in range(len(picked_neighbors)):
            expected_label = picked_neighbor_labels[i]
            result = interpolator(picked_neighbors[i], verbose=0)
            # Accuracy: fraction of embedding-space NNs with the expected label.
            nn_indices = get_nearest_neighbors_in_y(result, Y_mnist,
                                                    n=accuracy_nn)
            obtained_labels = labels_mnist[nn_indices]
            per_sample_accuracy[i] = sum(
                obtained_labels == expected_label) / len(obtained_labels)
            # Precision: overlap between NNs in input space and embedding space.
            x = picked_neighbors[i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x, X_mnist,
                                                      n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(result, Y_mnist,
                                                      n=precision_nn)
            matching_indices = len(
                [k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = matching_indices / precision_nn
        cur_acc = np.mean(per_sample_accuracy)
        cur_prec = np.mean(per_sample_precision)

        # Leave-one-out IDW reconstruction error over the training set.
        y_sum_square_dist = 0.0
        y_sum_abs_dist = 0.0
        y_count = 0.0
        for i in range(len(X_mnist)):
            # All points except i act as IDW sources; the diagonal of
            # distance_matrix is inf but i is excluded anyway.
            neighbor_indices = list(range(X_mnist.shape[0]))
            neighbor_indices.remove(i)
            weights = 1 / distance_matrix[i, neighbor_indices]**p
            weights = weights / np.sum(weights)
            cur_y_result = weights.dot(Y_mnist[neighbor_indices, :])
            # BUG FIX: previously np.sum(diff)**2 squared the sum of signed
            # coordinate differences; we want the squared Euclidean distance.
            squared_dist = np.sum((cur_y_result - Y_mnist[i, :])**2)
            y_sum_square_dist += squared_dist
            y_sum_abs_dist += np.sqrt(squared_dist)
            y_count += 1.0
        global_idw_power_performance[p] = y_sum_square_dist / y_count
        global_idw_power_performance_abs[p] = y_sum_abs_dist / y_count
        global_idw_accuracy[p] = cur_acc
        global_idw_precision[p] = cur_prec
        with open(idw_power_performance_file, 'wb') as f:
            pickle.dump((global_idw_power_performance,
                         global_idw_power_performance_abs, global_idw_accuracy,
                         global_idw_precision), f)

    # Match requested powers to stored keys with a tolerance, then pick the
    # power with the smallest mean squared reconstruction error.
    EPS = 1e-5
    y = list()
    x_global = list()
    for cur_power in idw_power_options:
        closest_power = [
            i for i in global_idw_power_performance_abs
            if np.abs(i - cur_power) < EPS
        ]
        if len(closest_power) > 0:
            x_global.append(cur_power)
            y.append(global_idw_power_performance[closest_power[0]])
    idw_optimal_power = x_global[np.argmin(y)]
    with open(idw_power_plot_file, 'wb') as f:
        pickle.dump((x_global, y, idw_optimal_power), f)
    logging.info("IDW optimal power: %f", idw_optimal_power)
    end_time = datetime.datetime.now()
    logging.info("IDW power experiment ended: %s", end_time)
    logging.info("IDW power experiment duration: %s", end_time - start_time)
def generate_idw_power_performance(*,
                                   regenerate=False,
                                   recursive_regenerate=False,
                                   parameters=settings.parameters):
    """Sweep the IDW interpolation power and measure internal precision.

    For each candidate power p (module-level ``idw_power_options``), every
    training sample is re-embedded from its neighbors by inverse-distance
    weighting, and precision is measured two ways:
      - "by X": overlap between the X-space NN set and the NN set of the
        embedded result in Y-space;
      - "by Y": overlap between the true Y-space NN set and the NN set of
        the embedded result.
    Results are checkpointed to disk after each power so the sweep can be
    resumed; pass regenerate=True to recompute from scratch.
    """
    global_idw_precision_by_y = dict()
    global_idw_precision_by_x = dict()
    start_time = datetime.datetime.now()
    logging.info("IDW internal precision power experiment started: %s",
                 start_time)
    idw_power_performance_file = generate_idw_power_filename(parameters)
    idw_power_plot_file = generate_idw_power_plot_filename(parameters)
    X_mnist = generate_data.load_x_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    Y_mnist = generate_data.load_y_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])
    distance_matrix = distance.squareform(distance.pdist(X_mnist))
    np.fill_diagonal(distance_matrix,
                     np.inf)  # We are not interested in distance to itself
    nn_x_distance = np.min(distance_matrix, axis=1)  # Any axis will do
    # NOTE(review): radius_x is computed here but never used in this
    # function — possibly copied from the LION variant; confirm intent.
    radius_x = dict()
    for p in idw_percentile_options:
        radius_x[p] = np.percentile(nn_x_distance, p)
    # Resume from checkpoint file unless a full regeneration was requested.
    if os.path.isfile(idw_power_performance_file) and not regenerate:
        with open(idw_power_performance_file, 'rb') as f:
            global_idw_precision_by_x, global_idw_precision_by_y = pickle.load(
                f)
    else:
        logging.info("Regeneration requested")
    for p in idw_power_options:
        # Skip powers already present in the loaded checkpoint.
        if p in global_idw_precision_by_x:
            logging.info("Loaded p %f %f %f", p, global_idw_precision_by_x[p],
                         global_idw_precision_by_y[p])
            continue
        logging.info("Processing p %f", p)
        per_sample_precision_x = list()
        per_sample_precision_y = list()
        for i in range(len(X_mnist)):
            distances = distance_matrix[i, :].copy()
            # distances[i] = np.inf #Not interested in distance to itself
            # Step 1. Find nearest neighbors in the neighborhood.
            # All other samples act as neighbors (leave-one-out).
            neighbor_indices = list(range(X_mnist.shape[0]))
            neighbor_indices.remove(i)
            # Inverse-distance weights, normalized to sum to 1.
            weights = 1 / distances[neighbor_indices]**p
            weights = weights / np.sum(weights)
            cur_y_result = weights.dot(Y_mnist[neighbor_indices, :])
            # get_nearest_neighbors is presumably a module-level helper
            # (excludes the sample itself) — defined elsewhere in this file.
            nn_xreal_indices = get_nearest_neighbors(X_mnist[i, :],
                                                     X_mnist,
                                                     n=precision_nn,
                                                     exclude_index=i)
            nn_yreal_indices = get_nearest_neighbors(Y_mnist[i, :],
                                                     Y_mnist,
                                                     n=precision_nn,
                                                     exclude_index=i)
            nn_yembedded_indices = get_nearest_neighbors(cur_y_result,
                                                         Y_mnist,
                                                         n=precision_nn,
                                                         exclude_index=i)
            matching_indices_xreal_yembedded = len(
                [j for j in nn_xreal_indices if j in nn_yembedded_indices])
            matching_indices_yreal_yembedded = len(
                [j for j in nn_yreal_indices if j in nn_yembedded_indices])
            per_sample_precision_x.append(matching_indices_xreal_yembedded /
                                          precision_nn)
            per_sample_precision_y.append(matching_indices_yreal_yembedded /
                                          precision_nn)
        global_idw_precision_by_x[p] = np.mean(per_sample_precision_x)
        global_idw_precision_by_y[p] = np.mean(per_sample_precision_y)
        # Just in case it will become unstable due to too few neighbors
        # Checkpoint after every power so an interrupted run can resume.
        with open(idw_power_performance_file, 'wb') as f:
            pickle.dump((global_idw_precision_by_x,
                         global_idw_precision_by_y), f)
    # Pick the power maximizing precision-by-X. Keys are matched with a
    # tolerance (EPS) because powers are floats.
    EPS = 1e-5
    y = list()
    x_global = list()
    for cur_power in idw_power_options:
        closest_power = [
            i for i in global_idw_precision_by_x
            if np.abs(i - cur_power) < EPS
        ]
        if len(closest_power) > 0:
            x_global.append(cur_power)
            y.append(global_idw_precision_by_x[closest_power[0]])
    idw_optimal_power_precision_by_x = x_global[np.argmax(y)]
    precision_plot_by_x = y
    # Same selection, using precision-by-Y.
    EPS = 1e-5
    y = list()
    x_global = list()
    for cur_power in idw_power_options:
        closest_power = [
            i for i in global_idw_precision_by_y
            if np.abs(i - cur_power) < EPS
        ]
        if len(closest_power) > 0:
            x_global.append(cur_power)
            y.append(global_idw_precision_by_y[closest_power[0]])
    idw_optimal_power_precision_by_y = x_global[np.argmax(y)]
    precision_plot_by_y = y
    with open(idw_power_plot_file, 'wb') as f:
        pickle.dump((x_global, precision_plot_by_x, precision_plot_by_y,
                     idw_optimal_power_precision_by_x,
                     idw_optimal_power_precision_by_y), f)
    logging.info("IDW optimal power (precision by X): %f",
                 idw_optimal_power_precision_by_x)
    logging.info("IDW optimal power (precision by Y): %f",
                 idw_optimal_power_precision_by_y)
    end_time = datetime.datetime.now()
    logging.info("IDW internal precision power experiment ended: %s", end_time)
    logging.info("IDW internal precision power experiment duration: %s",
                 end_time - start_time)
def main(parameters=settings.parameters, regenerate=False):
    """Post-process kernelized-tSNE outlier-test results.

    Computes, for three selected kernel multipliers K, (a) the mean distance
    from each embedded outlier to its nearest training embedding and the
    percentile of that distance among training NN distances, and (b) the KL
    divergence of the embedding extended with each outlier. Results are
    written to the postprocess pickle file.

    Parameters:
        parameters: experiment parameter dict (defaults to settings.parameters).
        regenerate: if True, ignore the cached per-sample KL checkpoint.
    """
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    outlier_samples, _ = generate_data.load_outliers(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # Actually, whatever axis
    kernelized_results_file = \
        exp_outlier_test_kernelized.generate_outlier_results_filename(
            parameters)
    with open(kernelized_results_file, 'rb') as f:
        kernelized_detailed_method_results, kernelized_detailed_tsne_time, \
            kernelized_detailed_method_list = pickle.load(f)
    # Keep only three representative K values out of the detailed sweep.
    ind = [4, 24, 49]
    kernelized_method_list = [
        kernelized_detailed_method_list[i][:10] +
        kernelized_detailed_method_list[i][-8:] for i in ind
    ]
    kernelized_outliers_results = [
        kernelized_detailed_method_results[i] for i in ind
    ]
    # =========== DISTANCE PERCENTILES ==========
    kernelized_outliers_percentiles_matrix = np.zeros(
        (len(outlier_samples), len(kernelized_method_list)))
    kernelized_outliers_distance_matrix = np.zeros(
        (len(outlier_samples), len(kernelized_method_list)))
    for i in range(len(outlier_samples)):
        for j in range(len(kernelized_method_list)):
            y = kernelized_outliers_results[j][i, :]
            # Euclidean distance to the closest training embedding.
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y)**2, axis=1)))
            kernelized_outliers_distance_matrix[i, j] = nn_dist
            kernelized_outliers_percentiles_matrix[
                i, j] = stats.percentileofscore(nearest_neighbors_y_dist,
                                                nn_dist)
    kernelized_outliers_distance_percentiles = np.mean(
        kernelized_outliers_percentiles_matrix, axis=0)
    kernelized_outliers_distances = np.mean(
        kernelized_outliers_distance_matrix, axis=0)
    kernelized_per_item_time = kernelized_detailed_tsne_time / len(
        outlier_samples)
    for j in range(len(kernelized_method_list)):
        logging.info("%s: %f, %f", kernelized_method_list[j],
                     kernelized_outliers_distances[j],
                     kernelized_outliers_distance_percentiles[j])
    kernelized_outliers_kl = np.zeros(
        (len(kernelized_method_list), len(outlier_samples)))
    processed_indices = list()
    kl_kernelized_tsne_outliers_performance_file = \
        generate_kernelized_kl_temp_filename(parameters)
    if os.path.isfile(
            kl_kernelized_tsne_outliers_performance_file) and not regenerate:
        with open(kl_kernelized_tsne_outliers_performance_file, 'rb') as f:
            kernelized_outliers_kl, processed_indices = pickle.load(f)
    # =========== KL DIVERGENCE ==========
    # KL divergence increase for all samples is very slow to calculate.
    # The dominant cost is the P-matrix, so it is cached per sample.
    for i in range(len(outlier_samples)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + \
            generate_data.combine_prefixes(
                settings.tsne_parameter_set | settings.outlier_parameter_set,
                parameters, os.sep)
        # BUG FIX: the cache file must be keyed by the current sample index i.
        # The original used the stale loop variable j left over from the
        # logging loop above, so every sample shared a single P-matrix file
        # and all KL values after the first were computed against the wrong
        # P matrix.
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate(
                (X_mnist, outlier_samples[i, :].reshape((1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # For all of methods P-matrix is shared.
        for j in range(len(kernelized_outliers_results)):
            new_Y = np.concatenate(
                (Y_mnist, kernelized_outliers_results[j][i, :].reshape(
                    (1, -1))), axis=0)
            kernelized_outliers_kl[j, i], _ = \
                lion_tsne.kl_divergence_and_gradient(p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        # Checkpoint after each sample so the slow loop can be resumed.
        with open(kl_kernelized_tsne_outliers_performance_file, 'wb') as f:
            pickle.dump((kernelized_outliers_kl, processed_indices), f)
    # This should be fast
    kernelized_avg_outliers_kl = np.mean(kernelized_outliers_kl, axis=1)
    output_file = generate_kernelized_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((kernelized_method_list, kernelized_avg_outliers_kl,
                     kernelized_per_item_time,
                     kernelized_outliers_distance_percentiles), f)
def train_or_load_models(regenerate_model1=False,
                         regenerate_model2=False,
                         regenerate_model3=False,
                         parameters=settings.parameters):
    """Train (or load from disk) three small X->Y regression networks.

    Model 1: 2x Dense(250, relu) with Dropout(0.25).
    Model 2: 2x Dense(500, relu) with Dropout(0.5).
    Model 3: Dense(500, tanh) single hidden layer.

    Each model is retrained only when its weights file is missing or its
    regenerate flag is set; otherwise architecture+weights are loaded from
    the cached JSON/HD5 files.

    Returns:
        dict with "models": (model1, model2, model3) and
        "Y_predicted": their predictions on X_mnist.
    """
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    keras_random_seed = parameters.get("keras_random_seed",
                                       settings.parameters["keras_random_seed"])
    # Reproducibility: parallel threads can bring uncontrolled randomness.
    # Luckily, models here are small, no need for parallel threads etc.
    session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                                  inter_op_parallelism_threads=1)
    session = tf.Session(config=session_conf)
    tf.keras.backend.set_session(session)

    def _model_files(prefix):
        # Weights (.hd5) and architecture (.json) share the same base prefix.
        weights_file = prefix + generate_data.combine_prefixes(
            nn_model_prefixes, parameters, postfix='.hd5')
        json_file = prefix + generate_data.combine_prefixes(
            nn_model_prefixes, parameters, postfix='.json')
        return json_file, weights_file

    def _train_or_load(build_fn, json_file, weights_file, regenerate):
        # Train from scratch when requested or when no cached weights exist;
        # otherwise restore architecture and weights from disk.
        if not os.path.isfile(weights_file) or regenerate:
            set_all_random_seeds(keras_random_seed)
            model = build_fn()
            model.compile(loss='mean_squared_error', optimizer='adam')
            # NOTE: validation on the training set itself, as in the
            # original experiment setup (no held-out split here).
            model.fit(X_mnist,
                      Y_mnist,
                      epochs=5000,
                      verbose=1,
                      validation_data=(X_mnist, Y_mnist))
            with open(json_file, "w") as f:
                f.write(model.to_json())
            model.save_weights(weights_file)
        else:
            with open(json_file, "r") as f:
                model = keras.models.model_from_json(f.read())
            model.load_weights(weights_file)
            model.compile(loss='mean_squared_error', optimizer='adam')
        return model

    def _build_model1():
        # 2 layers, 250 nodes per layer, ReLu activation, dropout rate 0.25.
        m = keras.models.Sequential()
        m.add(keras.layers.Dense(250, activation='relu',
                                 kernel_initializer='normal',
                                 input_dim=X_mnist.shape[1]))
        m.add(keras.layers.Dropout(0.25))
        # input_dim omitted on non-first layers: Keras ignores it there.
        m.add(keras.layers.Dense(250, activation='relu',
                                 kernel_initializer='normal'))
        m.add(keras.layers.Dropout(0.25))
        m.add(keras.layers.Dense(Y_mnist.shape[1],
                                 kernel_initializer='normal'))
        return m

    def _build_model2():
        # 2 layers, 500 nodes per layer, ReLu activation, dropout rate 0.5.
        m = keras.models.Sequential()
        m.add(keras.layers.Dense(500, activation='relu',
                                 kernel_initializer='normal',
                                 input_dim=X_mnist.shape[1]))
        m.add(keras.layers.Dropout(0.5))
        m.add(keras.layers.Dense(500, activation='relu',
                                 kernel_initializer='normal'))
        m.add(keras.layers.Dropout(0.5))
        m.add(keras.layers.Dense(Y_mnist.shape[1],
                                 kernel_initializer='normal'))
        return m

    def _build_model3():
        # Single hidden layer, 500 nodes, tanh activation, no dropout.
        m = keras.models.Sequential()
        m.add(keras.layers.Dense(500, activation='tanh',
                                 kernel_initializer='normal',
                                 input_dim=X_mnist.shape[1]))
        m.add(keras.layers.Dense(Y_mnist.shape[1],
                                 kernel_initializer='normal'))
        return m

    model1_json_file, model1_weights_file = _model_files('../results/model1')
    model2_json_file, model2_weights_file = _model_files('../results/model2')
    model3_json_file, model3_weights_file = _model_files('../results/model3')

    model1 = _train_or_load(_build_model1, model1_json_file,
                            model1_weights_file, regenerate_model1)
    Y_nn1_mnist = model1.predict(X_mnist)
    model2 = _train_or_load(_build_model2, model2_json_file,
                            model2_weights_file, regenerate_model2)
    Y_nn2_mnist = model2.predict(X_mnist)
    model3 = _train_or_load(_build_model3, model3_json_file,
                            model3_weights_file, regenerate_model3)
    Y_nn3_mnist = model3.predict(X_mnist)

    return {"models": (model1, model2, model3),
            "Y_predicted": (Y_nn1_mnist, Y_nn2_mnist, Y_nn3_mnist)}
def generate_lion_power_performance(*,
                                    regenerate=False,
                                    recursive_regenerate=False,
                                    parameters=settings.parameters):
    """Sweep LION power/percentile and measure internal precision.

    For each (percentile, power) pair, every training sample is re-embedded
    from its in-radius neighbors by inverse-distance weighting and precision
    is measured against its X-space NN set ("ByX") and its true Y-space NN
    set ("ByY"). Results are cached per key and checkpointed to disk.

    Parameters:
        regenerate: recompute entries even if present in the cache file.
        recursive_regenerate: also regenerate the underlying datasets.
        parameters: experiment parameter dict.
    """
    start_time = datetime.datetime.now()
    logging.info("LION power internal precision experiment started: %s",
                 start_time)
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])
    lion_power_performance_data_file = \
        generate_lion_power_performance_filename(parameters)
    lion_power_plot_data_file = generate_lion_power_plot_filename(parameters)
    lion_power_performance_data = dict()  # Start from scratch
    # BUG FIX: honor the caller-supplied `parameters` instead of always
    # loading with the global settings.parameters defaults.
    X_mnist = generate_data.load_x_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    Y_mnist = generate_data.load_y_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)

    def get_nearest_neighbors(y, Y_mnist, n, exclude_index):
        # n nearest rows of Y_mnist to y, never returning exclude_index.
        y_distances = np.sum((Y_mnist - y)**2, axis=1)
        y_distances[exclude_index] = np.inf
        return np.argsort(y_distances)[:n]

    distance_matrix = distance.squareform(distance.pdist(X_mnist))
    np.fill_diagonal(distance_matrix,
                     np.inf)  # We are not interested in distance to itself
    nn_x_distance = np.min(distance_matrix, axis=1)  # Any axis will do
    # Neighborhood radius per percentile of the NN-distance distribution.
    radius_x = dict()
    for p in lion_percentile_options:
        radius_x[p] = np.percentile(nn_x_distance, p)
    logging.info("Radius X: %s", radius_x)
    if os.path.isfile(lion_power_performance_data_file) and not regenerate:
        with open(lion_power_performance_data_file, 'rb') as f:
            lion_power_performance_data = pickle.load(f)
    for perc in lion_percentile_options:
        for p in lion_power_options:
            # Log message fixed: p is the power, perc is the percentile.
            logging.info("Processing power and percentile: %f, %d", p, perc)
            key = str(perc) + ";" + "%.3f" % (p)
            logging.info("Key: %s", key)
            if key not in lion_power_performance_data:
                lion_power_performance_data[key] = dict()
            if 'InternalPrecisionByX' not in lion_power_performance_data[
                    key] or regenerate:
                logging.info(
                    "Power performance not found for power %f percentile %d."
                    "\tCalculating...", p, perc)
                per_sample_precision_x = list()
                per_sample_precision_y = list()
                for i in range(len(X_mnist)):
                    distances = distance_matrix[i, :].copy()
                    distances[i] = np.inf  # Not interested in itself
                    # Step 1. Find nearest neighbors in the neighborhood.
                    neighbor_indices = np.where(
                        distances <= radius_x[perc])[0]
                    num_neighbors = len(neighbor_indices)
                    if num_neighbors >= 2:  # Below 2? Cannot interpolate
                        weights = 1 / distances[neighbor_indices]**p
                        weights = weights / np.sum(weights)
                        cur_y_result = weights.dot(
                            Y_mnist[neighbor_indices, :])
                        nn_xreal_indices = get_nearest_neighbors(
                            X_mnist[i, :], X_mnist, n=precision_nn,
                            exclude_index=i)
                        nn_yreal_indices = get_nearest_neighbors(
                            Y_mnist[i, :], Y_mnist, n=precision_nn,
                            exclude_index=i)
                        nn_yembedded_indices = get_nearest_neighbors(
                            cur_y_result, Y_mnist, n=precision_nn,
                            exclude_index=i)
                        matching_indices_xreal_yembedded = len([
                            j for j in nn_xreal_indices
                            if j in nn_yembedded_indices
                        ])
                        matching_indices_yreal_yembedded = len([
                            j for j in nn_yreal_indices
                            if j in nn_yembedded_indices
                        ])
                        per_sample_precision_x.append(
                            matching_indices_xreal_yembedded / precision_nn)
                        per_sample_precision_y.append(
                            matching_indices_yreal_yembedded / precision_nn)
                new_dict = dict()
                new_dict['InternalPrecisionByX'] = np.mean(
                    per_sample_precision_x)
                new_dict['InternalPrecisionByY'] = np.mean(
                    per_sample_precision_y)
                for ndk in new_dict.keys():
                    lion_power_performance_data[key][ndk] = new_dict[ndk]
                # Checkpoint after every key so the sweep can be resumed.
                with open(lion_power_performance_data_file, 'wb') as f:
                    pickle.dump(lion_power_performance_data, f)
            else:
                logging.info(
                    "Power FOUND for power %f percentile %d. Using loaded.",
                    p, perc)
            logging.info("%s %s", key, lion_power_performance_data[key])
    # Pick, per percentile, the power maximizing each precision flavor.
    lion_optimal_power_x = dict()
    lion_power_plot_x = dict()
    for perc in lion_percentile_options:
        y = list()
        for cur_power in lion_power_options:
            key = str(perc) + ";%.3f" % (cur_power)
            y.append(lion_power_performance_data[key]['InternalPrecisionByX'])
        lion_power_plot_x[perc] = y
        lion_optimal_power_x[perc] = lion_power_options[np.argmax(y)]
    lion_optimal_power_y = dict()
    lion_power_plot_y = dict()
    for perc in lion_percentile_options:
        y = list()
        for cur_power in lion_power_options:
            key = str(perc) + ";%.3f" % (cur_power)
            y.append(lion_power_performance_data[key]['InternalPrecisionByY'])
        lion_power_plot_y[perc] = y
        lion_optimal_power_y[perc] = lion_power_options[np.argmax(y)]
    with open(lion_power_plot_data_file, 'wb') as f:
        pickle.dump(
            (lion_power_options, lion_power_plot_y, lion_optimal_power_y,
             lion_power_plot_x, lion_optimal_power_x), f)
    logging.info("LION optimal power X: %s", lion_optimal_power_x)
    logging.info("LION optimal power Y: %s", lion_optimal_power_y)
    end_time = datetime.datetime.now()
    logging.info("LION power experiment ended: %s", end_time)
    logging.info("LION power experiment duration: %s", end_time - start_time)
def main(parameters=settings.parameters, regenerate_parameters_cache=False):
    """Sweep the kernelized-tSNE K multiplier over (0, 2] and evaluate it.

    For each K, the picked-neighbor samples are embedded with the kernelized
    mapping, then 10-NN accuracy (label agreement) and NN-set precision are
    computed. Produces a K-vs-accuracy figure and pickles the full sweep
    results; also reports distance percentiles for three selected K values.
    """
    step = 0.01
    choice_K = np.arange(step, 2 + step, step)  # Let's try those K.
    logging.info("Started loading.")
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(
        parameters=parameters)
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(
        parameters=parameters)
    accuracy_nn = parameters.get("accuracy_nn",
                                 settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)
    baseline_accuracy = generate_data.get_baseline_accuracy(
        parameters=parameters)
    logging.info("Loaded everything.")
    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # Actually, whatever axis

    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        # Indices of the n nearest rows of Y_mnist to y (squared Euclidean).
        y_distances = np.sum((Y_mnist - y)**2, axis=1)
        return np.argsort(y_distances)[:n]

    # Implementing carefully. Not the fastest, but the most reliable way.
    kernel_tsne_mapping = kernelized_tsne.generate_kernelized_tsne_mapping_function(
        parameters=parameters,
        regenerate_parameters_cache=regenerate_parameters_cache)
    kernelized_detailed_tsne_method_list = [
        "Kernelized tSNE; K=%.2f" % (k) for k in choice_K
    ]
    kernelized_detailed_tsne_method_results = list()
    kernelized_detailed_tsne_accuracy = np.zeros(
        (len(kernelized_detailed_tsne_method_list), ))
    kernelized_detailed_tsne_precision = np.zeros(
        (len(kernelized_detailed_tsne_method_list), ))
    kernelized_detailed_tsne_time = np.zeros(
        (len(kernelized_detailed_tsne_method_list), ))
    for j in range(len(choice_K)):
        k = choice_K[j]
        logging.info("%f", k)
        # Time the embedding of the whole picked-neighbors batch for this K.
        embedder_start_time = datetime.datetime.now()
        kernelized_detailed_tsne_method_results.append(
            kernel_tsne_mapping(picked_neighbors, k=k))
        embedder_end_time = datetime.datetime.now()
        kernelized_detailed_tsne_time[j] = (
            embedder_end_time - embedder_start_time).total_seconds()
        logging.info("%f complete", k)
        logging.info("%s", kernelized_detailed_tsne_method_list[j])
        per_sample_accuracy = np.zeros((len(picked_neighbors), ))
        per_sample_precision = np.zeros((len(picked_neighbors), ))
        for i in range(len(picked_neighbors)):
            if i % 200 == 0:
                logging.info("%d", i)
            expected_label = picked_neighbor_labels[i]
            y = kernelized_detailed_tsne_method_results[j][i, :]
            x = picked_neighbors[i, :]
            # Precision: overlap of the X-space NN set with the Y-space NN
            # set of the embedded point.
            nn_x_indices = get_nearest_neighbors_in_y(x, X_mnist,
                                                      n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y, Y_mnist,
                                                      n=precision_nn)
            matching_indices = len(
                [k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = (matching_indices / precision_nn)
            # Accuracy: fraction of the accuracy_nn nearest embeddings that
            # carry the expected label.
            kernelized_indices = get_nearest_neighbors_in_y(
                kernelized_detailed_tsne_method_results[j][i, :],
                Y_mnist,
                n=accuracy_nn)
            obtained_labels = labels_mnist[kernelized_indices]
            per_sample_accuracy[i] = sum(
                obtained_labels == expected_label) / len(obtained_labels)
        kernelized_detailed_tsne_accuracy[j] = np.mean(per_sample_accuracy)
        kernelized_detailed_tsne_precision[j] = np.mean(per_sample_precision)
        logging.info("%s :\t%f\t%f\t%f s",
                     kernelized_detailed_tsne_method_list[j],
                     kernelized_detailed_tsne_precision[j],
                     kernelized_detailed_tsne_accuracy[j],
                     kernelized_detailed_tsne_time[j])
    # Accuracy-vs-power plot
    legend_list = list()
    f, ax = plt.subplots()
    f.set_size_inches(6, 3)
    x = [k for k in choice_K]  # Ensuring order
    y = kernelized_detailed_tsne_accuracy
    # plt.title("IDW - Accuracy vs Power") # We'd better use figure caption
    plt.plot(x, y, c='blue')
    h = plt.axhline(y=baseline_accuracy, c='black', linestyle='--')
    plt.legend([h], ["Baseline Accuracy (%.4f)" % baseline_accuracy])
    plt.xlabel("Kernelized tSNE: K parameter")
    plt.ylabel("10-NN Accuracy")
    plt.ylim([0, 1])
    plt.xlim([0, 2])
    f.tight_layout()
    plt.savefig("../figures/kernelized-tsne-K-vs-accuracy.png")
    # Three representative K values out of the detailed sweep.
    ind = [4, 24, 49]
    kernelized_tsne_method_list = [
        kernelized_detailed_tsne_method_list[i][:10] +
        kernelized_detailed_tsne_method_list[i][-8:] for i in ind
    ]
    kernelized_tsne_method_results = [
        kernelized_detailed_tsne_method_results[i] for i in ind
    ]
    kernelized_tsne_nearest_neighbors_percentiles_matrix = np.zeros(
        (len(picked_neighbors), len(kernelized_tsne_method_list)))
    for i in range(len(picked_neighbors)):
        for j in range(len(kernelized_tsne_method_list)):
            y = kernelized_tsne_method_results[j][i, :]
            # Distance to the closest training embedding, scored as a
            # percentile among training NN distances.
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y)**2, axis=1)))
            kernelized_tsne_nearest_neighbors_percentiles_matrix[
                i, j] = stats.percentileofscore(nearest_neighbors_y_dist,
                                                nn_dist)
    kernelized_tsne_distance_percentiles = np.mean(
        kernelized_tsne_nearest_neighbors_percentiles_matrix, axis=0)
    for j in range(len(kernelized_tsne_method_list)):
        print(kernelized_tsne_method_list[j],
              kernelized_tsne_distance_percentiles[j])
    output_file = generate_cluster_results_filename(parameters)
    with open(output_file, 'wb') as f:
        pickle.dump(
            (kernelized_detailed_tsne_method_results,
             kernelized_detailed_tsne_accuracy,
             kernelized_detailed_tsne_precision,
             kernelized_detailed_tsne_time,
             kernelized_detailed_tsne_method_list), f)
import matplotlib.pyplot as plt
import generate_data
import numpy as np
import settings
from matplotlib.font_manager import FontProperties
from scipy.spatial import distance

# Step 1. Load all data.
parameters = settings.parameters
radius_y_percentile = 100
Y_mnist = generate_data.load_y_mnist(parameters=parameters)
X_mnist = generate_data.load_x_mnist(parameters=parameters)
dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
# LION embedder with no Y safety margin and the full (100th percentile)
# Y radius.
lion_toy_interp = dTSNE_mnist.generate_lion_tsne_embedder(
    verbose=0,
    random_state=0,
    function_kwargs={
        'y_safety_margin': 0,
        'radius_y_percentile': radius_y_percentile
    })
# Step 2. Generate outliers and embed them
# Usually we separate data generation and figure generation, but here calculations are just too fast and
# data should never be used anywhere else.
n_outl = 123
x_outl = np.zeros([n_outl, X_mnist.shape[1]])
np.random.seed(0)  # Fixed seed for reproducible outlier generation.
for i in range(n_outl):
    # Pick 15 random feature indices (with replacement) out of 30.
    # NOTE(review): the loop body appears truncated at this chunk boundary —
    # `n` is computed but the rest of the outlier construction is not
    # visible here; verify against the full file.
    n = np.random.choice(30, 15)
def generate_lion_power_performance(*,
                                    regenerate=False,
                                    recursive_regenerate=False,
                                    parameters=settings.parameters):
    """Sweep LION power/percentile and measure accuracy and distance error.

    For each (percentile, power) pair this computes:
      - 'Accuracy'/'Precision': label accuracy and NN-set precision of the
        LION embedder on the picked-neighbor samples;
      - 'PowerSquareDist'/'PowerAbsDist': internal re-embedding error of the
        training samples using in-radius inverse-distance weighting.
    Results are cached per key in a pickle file and checkpointed so the
    sweep can be resumed.
    """
    start_time = datetime.datetime.now()
    logging.info("LION power experiment started: %s", start_time)
    accuracy_nn = parameters.get("accuracy_nn",
                                 settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])
    lion_power_performance_data_file = \
        generate_lion_power_performance_filename(parameters)
    lion_power_plot_data_file = generate_lion_power_plot_filename(parameters)
    lion_power_performance_data = dict()  # Start from scratch
    # NOTE(review): the loads below use settings.parameters rather than the
    # `parameters` argument — the argument is effectively ignored for data
    # loading; confirm whether that is intentional.
    X_mnist = generate_data.load_x_mnist(
        parameters=settings.parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    Y_mnist = generate_data.load_y_mnist(
        parameters=settings.parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    labels_mnist = generate_data.load_labels_mnist(
        parameters=settings.parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)

    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        # Indices of the n nearest rows of Y_mnist to y (squared Euclidean).
        y_distances = np.sum((Y_mnist - y)**2, axis=1)
        return np.argsort(y_distances)[:n]

    dTSNE_mnist = generate_data.load_dtsne_mnist(
        parameters=settings.parameters)
    picked_neighbors = generate_data.load_picked_neighbors(
        parameters=settings.parameters)
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(
        parameters=settings.parameters)
    distance_matrix = distance.squareform(distance.pdist(X_mnist))
    np.fill_diagonal(distance_matrix,
                     np.inf)  # We are not interested in distance to itself
    nn_x_distance = np.min(distance_matrix, axis=1)  # Any axis will do
    # Neighborhood radius per percentile of the NN-distance distribution.
    radius_x = dict()
    for p in lion_percentile_options:
        radius_x[p] = np.percentile(nn_x_distance, p)
    logging.info("Radius X: %s", radius_x)
    if os.path.isfile(lion_power_performance_data_file) and not regenerate:
        with open(lion_power_performance_data_file, 'rb') as f:
            lion_power_performance_data = pickle.load(f)
    for perc in lion_percentile_options:
        for p in lion_power_options:
            # NOTE(review): message labels say "percentile and power" but the
            # arguments are (power, percentile) — the values are correct, the
            # labels are swapped.
            logging.info("Processing percentile and power: %f, %d", p, perc)
            key = str(perc) + ";" + "%.3f" % (p)
            logging.info("Key: %s", key)
            if key not in lion_power_performance_data:
                lion_power_performance_data[key] = dict()
            # Accuracy/precision block: computed once per key (note: this
            # branch does not honor `regenerate`, unlike the one below).
            if 'Accuracy' not in lion_power_performance_data[key]:
                logging.info(
                    "Accuracy not found for power %f percentile %d. \tCalculating...",
                    p, perc)
                interpolator = dTSNE_mnist.generate_lion_tsne_embedder(
                    verbose=0,
                    random_state=0,
                    function_kwargs={
                        'radius_x_percentile': perc,
                        'power': p
                    })
                per_sample_accuracy = np.zeros((len(picked_neighbors), ))
                per_sample_precision = np.zeros((len(picked_neighbors), ))
                for i in range(len(picked_neighbors)):
                    expected_label = picked_neighbor_labels[i]
                    result = interpolator(picked_neighbors[i], verbose=0)
                    # Accuracy: label agreement among accuracy_nn nearest
                    # training embeddings.
                    nn_indices = get_nearest_neighbors_in_y(result,
                                                            Y_mnist,
                                                            n=accuracy_nn)
                    obtained_labels = labels_mnist[nn_indices]
                    per_sample_accuracy[i] = sum(
                        obtained_labels == expected_label) / len(
                            obtained_labels)
                    y = result
                    x = picked_neighbors[i, :]
                    # Precision: overlap of X-space and Y-space NN sets.
                    nn_x_indices = get_nearest_neighbors_in_y(x, X_mnist,
                                                              n=precision_nn)
                    nn_y_indices = get_nearest_neighbors_in_y(y, Y_mnist,
                                                              n=precision_nn)
                    matching_indices = len(
                        [k for k in nn_x_indices if k in nn_y_indices])
                    per_sample_precision[i] = (matching_indices /
                                               precision_nn)
                cur_acc = np.mean(per_sample_accuracy)
                cur_prec = np.mean(per_sample_precision)
                lion_power_performance_data[key]['Accuracy'] = cur_acc
                lion_power_performance_data[key]['Precision'] = cur_prec
                # Checkpoint after each key so the sweep can be resumed.
                with open(lion_power_performance_data_file, 'wb') as f:
                    pickle.dump(lion_power_performance_data, f)
            else:
                logging.info(
                    "Accuracy FOUND for power %f percentile %d. Using loaded.",
                    p, perc)
            # Internal re-embedding error block.
            if 'PowerSquareDist' not in lion_power_performance_data[
                    key] or regenerate:
                logging.info(
                    "Power performance not found for power %f percentile %d.\tCalculating...",
                    p, perc)
                y_sum_square_dist = 0.0
                y_sum_abs_dist = 0.0
                y_count = 0.0
                for i in range(len(X_mnist)):
                    distances = distance_matrix[i, :].copy()
                    distances[i] = np.inf  # Not interested in itself
                    # Step 1. Find nearest neighbors in the neighborhood.
                    neighbor_indices = np.where(
                        distances <= radius_x[perc])[0]
                    num_neighbors = len(neighbor_indices)
                    if num_neighbors >= 2:  # Below 2? Cannot interpolate
                        # We are good
                        weights = 1 / distances[neighbor_indices]**p
                        weights = weights / np.sum(weights)
                        cur_y_result = weights.dot(
                            Y_mnist[neighbor_indices, :])
                        # NOTE(review): np.sum(diff)**2 squares the SUM of
                        # coordinate differences, not the sum of squared
                        # differences — likely intended as
                        # np.sum((diff)**2). Left unchanged because cached
                        # results depend on it; confirm before fixing.
                        y_sum_square_dist += np.sum(cur_y_result -
                                                    Y_mnist[i, :])**2
                        y_sum_abs_dist += np.sqrt(
                            np.sum(cur_y_result - Y_mnist[i, :])**2)
                        y_count += 1.0
                new_dict = dict()
                new_dict['PowerSquareDist'] = y_sum_square_dist / y_count
                new_dict['PowerAbsDist'] = y_sum_abs_dist / y_count
                # Just in case it will become unstable due to too few neighbors
                new_dict['PowerSquareDistSum'] = y_sum_square_dist
                new_dict['PowerSquareDistCount'] = y_count
                for ndk in new_dict.keys():
                    lion_power_performance_data[key][ndk] = new_dict[ndk]
                with open(lion_power_performance_data_file, 'wb') as f:
                    pickle.dump(lion_power_performance_data, f)
            else:
                logging.info(
                    "Power FOUND for power %f percentile %d. Using loaded.",
                    p, perc)
            logging.info("%s %s", key, lion_power_performance_data[key])
    # Per percentile, choose the power minimizing the squared distance error.
    lion_optimal_power = dict()
    lion_power_plot_y = dict()
    for perc in lion_percentile_options:
        y = list()
        for cur_power in lion_power_options:
            key = str(perc) + ";%.3f" % (cur_power)
            y.append(lion_power_performance_data[key]['PowerSquareDist'])
        lion_power_plot_y[perc] = y
        lion_optimal_power[perc] = lion_power_options[np.argmin(y)]
    with open(lion_power_plot_data_file, 'wb') as f:
        pickle.dump(
            (lion_power_options, lion_power_plot_y, lion_optimal_power), f)
    logging.info("LION optimal power: %s", lion_optimal_power)
    end_time = datetime.datetime.now()
    logging.info("LION power experiment ended: %s", end_time)
    logging.info("LION power experiment duration: %s", end_time - start_time)