import datetime
import logging
import pickle

import numpy as np

import generate_data
import kernelized_tsne
import settings


def main(parameters=settings.parameters, regenerate_parameters_cache=False):
    step = 0.01
    choice_K = np.arange(step, 2 + step, step)  # Let's try those K.

    outlier_samples, _ = generate_data.load_outliers(parameters=parameters)

    kernel_tsne_mapping = kernelized_tsne.generate_kernelized_tsne_mapping_function(
        parameters=parameters,
        regenerate_parameters_cache=regenerate_parameters_cache)

    kernelized_detailed_tsne_method_list = [
        "Kernelized tSNE; K=%.2f" % (k,) for k in choice_K
    ]
    kernelized_detailed_tsne_outliers_results = list()
    kernelized_detailed_tsne_time = np.zeros(
        (len(kernelized_detailed_tsne_method_list), ))

    for j in range(len(choice_K)):
        k = choice_K[j]
        logging.info("%f", k)
        embedder_start_time = datetime.datetime.now()
        kernelized_detailed_tsne_outliers_results.append(
            kernel_tsne_mapping(outlier_samples, k=k))
        embedder_end_time = datetime.datetime.now()
        kernelized_detailed_tsne_time[j] = (
            embedder_end_time - embedder_start_time).total_seconds()
        logging.info("%f complete: %f s", k, kernelized_detailed_tsne_time[j])

    output_file = generate_outlier_results_filename(parameters=parameters)
    with open(output_file, 'wb') as f:
        pickle.dump((kernelized_detailed_tsne_outliers_results,
                     kernelized_detailed_tsne_time,
                     kernelized_detailed_tsne_method_list), f)
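# The multiplier K above scales the Gaussian kernel bandwidths of kernelized
# t-SNE. For orientation only, here is a minimal sketch of the standard kernel
# t-SNE interpolation (Gisbrecht et al.): fit a linear map A on normalized
# Gaussian kernel rows, then push new points through it. This is an
# illustrative assumption about what kernelized_tsne implements, not the
# project's actual code; kernel_tsne_fit/kernel_tsne_transform are hypothetical.
import numpy as np


def kernel_tsne_fit(X_train, Y_train, sigmas, k=1.0):
    # Gaussian kernel matrix between training points; column j has bandwidth k * sigmas[j].
    sq_dist = np.sum((X_train[:, None, :] - X_train[None, :, :]) ** 2, axis=-1)
    K_mat = np.exp(-sq_dist / (2.0 * (k * sigmas[None, :]) ** 2))
    K_mat /= K_mat.sum(axis=1, keepdims=True)  # Normalize rows to sum to 1.
    A, *_ = np.linalg.lstsq(K_mat, Y_train, rcond=None)  # Least-squares fit of y(x) = K(x) @ A.
    return A


def kernel_tsne_transform(X_new, X_train, A, sigmas, k=1.0):
    # Map new samples with the fitted coefficients.
    sq_dist = np.sum((X_new[:, None, :] - X_train[None, :, :]) ** 2, axis=-1)
    K_new = np.exp(-sq_dist / (2.0 * (k * sigmas[None, :]) ** 2))
    K_new /= K_new.sum(axis=1, keepdims=True)
    return K_new @ A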
def get_common_info(parameters):
    res = {}
    res['dTSNE_mnist'] = generate_data.load_dtsne_mnist(parameters=parameters)
    res['X_mnist'] = generate_data.load_x_mnist(parameters=parameters)
    res['Y_mnist'] = generate_data.load_y_mnist(parameters=parameters)
    outlier_samples, _ = generate_data.load_outliers(parameters=parameters)
    res['outlier_samples'] = outlier_samples

    D_Y = distance.squareform(distance.pdist(res['Y_mnist']))
    # Find the distance from each embedded point to its closest neighbor ...
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself.
    # D_Y is symmetric, so either axis gives the same minima.
    res['nearest_neighbors_y_dist'] = np.min(D_Y, axis=1)
    return res
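# The fill_diagonal trick above deserves a note: setting the diagonal to
# infinity before taking row-wise minima excludes each point's zero distance
# to itself. A small self-contained check on toy data (nothing project-specific):
import numpy as np
from scipy.spatial import distance

Y_toy = np.array([[0.0, 0.0], [1.0, 0.0], [5.0, 0.0]])
D_toy = distance.squareform(distance.pdist(Y_toy))  # Symmetric pairwise distance matrix.
np.fill_diagonal(D_toy, np.inf)                     # A point is not its own neighbor.
print(np.min(D_toy, axis=1))                        # [1. 1. 4.]
# D_toy is symmetric, so axis=0 gives exactly the same result.
assert np.allclose(np.min(D_toy, axis=0), np.min(D_toy, axis=1))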
import pickle

import generate_data
import neural_network_commons
import settings


def main(regenerate_model1=False, regenerate_model2=False,
         regenerate_model3=False, parameters=settings.parameters):
    outlier_samples, _ = generate_data.load_outliers(parameters=parameters)

    models_and_results = neural_network_commons.train_or_load_models(
        regenerate_model1=regenerate_model1,
        regenerate_model2=regenerate_model2,
        regenerate_model3=regenerate_model3,
        parameters=parameters)
    model1, model2, model3 = models_and_results["models"]
    Y_nn1_mnist, Y_nn2_mnist, Y_nn3_mnist = models_and_results["Y_predicted"]

    Y_outl1_mnist = model1.predict(outlier_samples)
    Y_outl2_mnist = model2.predict(outlier_samples)
    Y_outl3_mnist = model3.predict(outlier_samples)

    nn_models_orig = [Y_nn1_mnist, Y_nn2_mnist, Y_nn3_mnist]
    # Labels encode the architecture: layers; neurons per layer; activation;
    # dropout rate (if any).
    nn_method_list = ['NN - 2L; 250N; ReLu; D0.25',
                      'NN - 2L; 500N; ReLu; D0.5',
                      'NN - 1L; 500N; tanh']
    nn_outliers_results = [Y_outl1_mnist, Y_outl2_mnist, Y_outl3_mnist]

    output_file = generate_outlier_results_filename(parameters)
    with open(output_file, 'wb') as f:
        pickle.dump((nn_outliers_results, nn_models_orig, nn_method_list), f)
import logging
import os
import pickle

import numpy as np
from scipy import stats
from scipy.spatial import distance

import exp_outlier_test_NN
import generate_data
import lion_tsne
import settings


def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    outlier_samples, _ = generate_data.load_outliers(parameters=parameters)

    nn_results_file = exp_outlier_test_NN.generate_outlier_results_filename(
        parameters)
    with open(nn_results_file, 'rb') as f:
        nn_outliers_results, nn_models_orig, nn_method_list = pickle.load(f)

    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Find the distance from each embedded point to its closest neighbor ...
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself.
    # D_Y is symmetric, so either axis gives the same minima.
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)

    # ================ KL DIVERGENCE ===================
    nn_outliers_kl = np.zeros((len(nn_method_list), len(outlier_samples)))
    processed_indices = list()
    kl_nn_outliers_performance_file = generate_nn_kl_temp_filename(parameters)
    if os.path.isfile(kl_nn_outliers_performance_file):
        with open(kl_nn_outliers_performance_file, 'rb') as f:
            nn_outliers_kl, processed_indices = pickle.load(f)

    # The KL divergence increase for all 1000 samples is very slow to calculate;
    # the bulk of the cost is building the P-matrix.
    for i in range(len(outlier_samples)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.outlier_parameter_set,
            parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # P-matrices are cached one per sample, so they can be loaded one by one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate(
                (X_mnist, outlier_samples[i, :].reshape((1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods.
        for j in range(len(nn_outliers_results)):
            new_Y = np.concatenate(
                (nn_models_orig[j], nn_outliers_results[j][i, :].reshape(
                    (1, -1))), axis=0)
            nn_outliers_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(
                p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_nn_outliers_performance_file, 'wb') as f:
            pickle.dump((nn_outliers_kl, processed_indices), f)

    nn_avg_outliers_kl = np.mean(nn_outliers_kl, axis=1)  # This part is fast.

    # ================ DISTANCE MATRICES ===================
    nn_outliers_percentiles_matrix = np.zeros(
        (len(outlier_samples), len(nn_method_list)))
    nn_outliers_distance_matrix = np.zeros(
        (len(outlier_samples), len(nn_method_list)))
    for i in range(len(outlier_samples)):
        for j in range(len(nn_method_list)):
            y = nn_outliers_results[j][i, :]
            nn_dist = np.min(
                np.sqrt(np.sum((nn_models_orig[j] - y)**2, axis=1)))
            nn_outliers_distance_matrix[i, j] = nn_dist
            nn_outliers_percentiles_matrix[i, j] = stats.percentileofscore(
                nearest_neighbors_y_dist, nn_dist)
    nn_outliers_distance_percentiles = np.mean(nn_outliers_percentiles_matrix,
                                               axis=0)
    nn_outliers_distances = np.mean(nn_outliers_distance_matrix, axis=0)
    for j in range(len(nn_method_list)):
        logging.info("%s: %f, %f", nn_method_list[j],
                     nn_outliers_distances[j],
                     nn_outliers_distance_percentiles[j])

    output_file = generate_nn_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((nn_method_list, nn_avg_outliers_kl,
                     nn_outliers_distance_percentiles), f)
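# The percentile metric above leans on scipy.stats.percentileofscore: each
# outlier's distance to its nearest embedded neighbor is ranked against the
# distribution of nearest-neighbor distances within the training embedding
# itself, so a value near 100 means the outlier lands unusually far from
# everything. A toy illustration with made-up distances:
import numpy as np
from scipy import stats

nearest_neighbors_y_dist_toy = np.array([0.1, 0.2, 0.25, 0.3, 0.5])
# An embedded outlier 0.4 away from its closest neighbor lands farther out
# than 4 of the 5 training points:
print(stats.percentileofscore(nearest_neighbors_y_dist_toy, 0.4))  # 80.0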
import logging

import matplotlib.pyplot as plt

import generate_data
import settings

logging.basicConfig(level=logging.INFO)

_, outlier_samples_raw = generate_data.load_outliers(
    parameters=settings.parameters)

width = 10  # Total number of outlier images to show.
height = 1
start_index = 0

f, ax = plt.subplots(height, width, dpi=300)
# 3.3 x 1.0 fits 3 rows; 3.3 x 0.66 fits 2 rows; 3.3 x 0.33 fits 1 row.
f.set_size_inches(3.3, 0.33)
f.subplots_adjust()
# f.tight_layout()

if height > 1:
    for i in range(height):
        for j in range(width):
            ax[i, j].imshow(
                outlier_samples_raw[i * width + j, :].reshape(28, 28),
                cmap='gray_r')
            # set_axis_off() would also drop the frame; we want to keep the
            # bounding box, so hide only the ticks and labels.
            ax[i, j].axes.get_xaxis().set_visible(False)
            ax[i, j].axes.get_yaxis().set_visible(False)
else:
    for j in range(width):
        ax[j].imshow(outlier_samples_raw[j, :].reshape(28, 28), cmap='gray_r')
        ax[j].axes.get_xaxis().set_visible(False)
        ax[j].axes.get_yaxis().set_visible(False)
import logging
import os
import pickle

import numpy as np
from scipy import stats
from scipy.spatial import distance

import exp_outlier_test_GD
import generate_data
import lion_tsne
import settings


def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    outlier_samples, _ = generate_data.load_outliers(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)

    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Find the distance from each embedded point to its closest neighbor ...
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself.
    # D_Y is symmetric, so either axis gives the same minima.
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)

    # ============== KL divergence
    gd_method_list = [
        r'Closest $Y_{init}$', r'Random $Y_{init}$',
        r'Closest $Y_{init}$; new $\sigma$', r'Random $Y_{init}$; new $\sigma$',
        r'Closest $Y_{init}$; EE', r'Random $Y_{init}$; EE',
        r'Closest $Y_{init}$; new $\sigma$; EE',
        r'Random $Y_{init}$; new $\sigma$; EE'
    ]

    gd_results_file = exp_outlier_test_GD.generate_outlier_results_filename(
        parameters=parameters)
    with open(gd_results_file, 'rb') as f:
        (outliers_y_gd_transformed,
         outliers_y_gd_variance_recalc_transformed,
         outliers_y_gd_transformed_random,
         outliers_y_gd_variance_recalc_transformed_random,
         outliers_y_gd_early_exagg_transformed_random,
         outliers_y_gd_early_exagg_transformed,
         outliers_y_gd_variance_recalc_early_exagg_transformed_random,
         picked_random_starting_positions,
         outliers_y_gd_variance_recalc_early_exagg_transformed,
         covered_samples) = pickle.load(f)

    # Ordered to match gd_method_list.
    gd_outliers_results = [
        outliers_y_gd_transformed,
        outliers_y_gd_transformed_random,
        outliers_y_gd_variance_recalc_transformed,
        outliers_y_gd_variance_recalc_transformed_random,
        outliers_y_gd_early_exagg_transformed,
        outliers_y_gd_early_exagg_transformed_random,
        outliers_y_gd_variance_recalc_early_exagg_transformed,
        outliers_y_gd_variance_recalc_early_exagg_transformed_random,
    ]

    input_time_file = exp_outlier_test_GD.generate_time_results_filename(
        parameters)
    with open(input_time_file, 'rb') as f:
        (outliers_y_time_gd_transformed,
         outliers_y_time_gd_variance_recalc_transformed,
         outliers_y_time_gd_transformed_random,
         outliers_y_time_gd_variance_recalc_transformed_random,
         outliers_y_time_gd_early_exagg_transformed_random,
         outliers_y_time_gd_early_exagg_transformed,
         outliers_y_time_gd_variance_recalc_early_exagg_transformed_random,
         outliers_y_time_gd_variance_recalc_early_exagg_transformed,
         covered_samples) = pickle.load(f)

    gd_time = [
        np.mean(outliers_y_time_gd_transformed),
        np.mean(outliers_y_time_gd_transformed_random),
        np.mean(outliers_y_time_gd_variance_recalc_transformed),
        np.mean(outliers_y_time_gd_variance_recalc_transformed_random),
        np.mean(outliers_y_time_gd_early_exagg_transformed),
        np.mean(outliers_y_time_gd_early_exagg_transformed_random),
        np.mean(outliers_y_time_gd_variance_recalc_early_exagg_transformed),
        np.mean(
            outliers_y_time_gd_variance_recalc_early_exagg_transformed_random),
    ]

    gd_outliers_kl = np.zeros((len(gd_method_list), len(outlier_samples)))
    processed_indices = list()
    kl_gd_outliers_performance_file = generate_gd_kl_temp_filename(parameters)
    if os.path.isfile(kl_gd_outliers_performance_file):
        with open(kl_gd_outliers_performance_file, 'rb') as f:
            gd_outliers_kl, processed_indices = pickle.load(f)

    # The KL divergence increase for all 1000 samples is very slow to calculate;
    # the bulk of the cost is building the P-matrix.
    for i in range(len(outlier_samples)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.outlier_parameter_set,
            parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # P-matrices are cached one per sample, so they can be loaded one by one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate(
                (X_mnist, outlier_samples[i, :].reshape((1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods.
        for j in range(len(gd_outliers_results)):
            new_Y = np.concatenate(
                (Y_mnist, gd_outliers_results[j][i, :].reshape((1, -1))),
                axis=0)
            gd_outliers_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(
                p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_gd_outliers_performance_file, 'wb') as f:
            pickle.dump((gd_outliers_kl, processed_indices), f)

    gd_avg_outliers_kl = np.mean(gd_outliers_kl, axis=1)  # This part is fast.

    # ============== Distance percentiles
    gd_outliers_percentiles_matrix = np.zeros(
        (len(outlier_samples), len(gd_method_list)))
    gd_outliers_distance_matrix = np.zeros(
        (len(outlier_samples), len(gd_method_list)))
    for i in range(len(outlier_samples)):
        for j in range(len(gd_method_list)):
            y = gd_outliers_results[j][i, :]
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y)**2, axis=1)))
            gd_outliers_distance_matrix[i, j] = nn_dist
            gd_outliers_percentiles_matrix[i, j] = stats.percentileofscore(
                nearest_neighbors_y_dist, nn_dist)
    gd_outliers_distance_percentiles = np.mean(gd_outliers_percentiles_matrix,
                                               axis=0)
    gd_outliers_distances = np.mean(gd_outliers_distance_matrix, axis=0)
    for j in range(len(gd_method_list)):
        logging.info("%s: %f, %f", gd_method_list[j],
                     gd_outliers_distances[j],
                     gd_outliers_distance_percentiles[j])

    output_file = generate_gd_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((gd_method_list, gd_time, gd_avg_outliers_kl,
                     gd_outliers_distance_percentiles), f)
import datetime
import logging
import os
import pickle

import numpy as np

import generate_data
import settings


def main(parameters=settings.parameters, regenerate=False, only_time=False):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    outlier_samples, _ = generate_data.load_outliers(parameters=parameters)

    output_file = generate_outlier_results_filename(parameters)
    output_time_file = generate_time_results_filename(parameters)

    # Change these only in one of the "other notebooks used just for parallelization".
    first_sample_inc = 0
    last_sample_exclusive = len(outlier_samples)

    # Doing it from scratch takes a REALLY long time. If possible, save results and pre-load.
    if os.path.isfile(output_file) and not regenerate:
        logging.info("Found previous partially completed test. Starting from there.")
        with open(output_file, 'rb') as f:
            (outliers_y_gd_transformed,
             outliers_y_gd_variance_recalc_transformed,
             outliers_y_gd_transformed_random,
             outliers_y_gd_variance_recalc_transformed_random,
             outliers_y_gd_early_exagg_transformed_random,
             outliers_y_gd_early_exagg_transformed,
             outliers_y_gd_variance_recalc_early_exagg_transformed_random,
             outliers_random_starting_positions,
             outliers_y_gd_variance_recalc_early_exagg_transformed,
             covered_samples) = pickle.load(f)
        with open(output_time_file, 'rb') as f:
            (outliers_y_time_gd_transformed,
             outliers_y_time_gd_variance_recalc_transformed,
             outliers_y_time_gd_transformed_random,
             outliers_y_time_gd_variance_recalc_transformed_random,
             outliers_y_time_gd_early_exagg_transformed_random,
             outliers_y_time_gd_early_exagg_transformed,
             outliers_y_time_gd_variance_recalc_early_exagg_transformed_random,
             outliers_y_time_gd_variance_recalc_early_exagg_transformed,
             covered_samples) = pickle.load(f)
    else:
        logging.info("No previous partially completed test, or regeneration requested. "
                     "Starting from scratch.")
        covered_samples = list()
        n, d = len(outlier_samples), Y_mnist.shape[1]
        outliers_y_gd_transformed = np.zeros((n, d))
        outliers_y_gd_variance_recalc_transformed = np.zeros((n, d))
        outliers_y_gd_transformed_random = np.zeros((n, d))
        outliers_y_gd_variance_recalc_transformed_random = np.zeros((n, d))
        outliers_y_gd_early_exagg_transformed_random = np.zeros((n, d))
        outliers_y_gd_early_exagg_transformed = np.zeros((n, d))
        outliers_y_gd_variance_recalc_early_exagg_transformed_random = np.zeros((n, d))
        outliers_y_gd_variance_recalc_early_exagg_transformed = np.zeros((n, d))
        outliers_random_starting_positions = np.zeros((n, d))
        outliers_y_time_gd_transformed = np.zeros((n, ))
        outliers_y_time_gd_variance_recalc_transformed = np.zeros((n, ))
        outliers_y_time_gd_transformed_random = np.zeros((n, ))
        outliers_y_time_gd_variance_recalc_transformed_random = np.zeros((n, ))
        outliers_y_time_gd_early_exagg_transformed_random = np.zeros((n, ))
        outliers_y_time_gd_early_exagg_transformed = np.zeros((n, ))
        outliers_y_time_gd_variance_recalc_early_exagg_transformed_random = np.zeros((n, ))
        outliers_y_time_gd_variance_recalc_early_exagg_transformed = np.zeros((n, ))

    for i in range(first_sample_inc, last_sample_exclusive):
        # Reset the random seed for every sample, keyed on the ABSOLUTE sample
        # number. Otherwise, if you load partial results from a file, everything
        # depends on which parts were loaded: the random sequence "shifts"
        # accordingly, and reproducibility is lost. I.e. with seed(0) before the
        # loop and a run from scratch, you get some random sequence [abc] for
        # sample 0, then its continuation [def] for sample 1, etc. But if sample 0
        # was already loaded from a file, you would get [abc] for sample 1, [def]
        # for sample 2, etc. Reproducibility should not depend on which parts are
        # loaded.
        np.random.seed(i)
        logging.info(" ====================== Sample %d \n\n", i)
        if i in covered_samples:
            logging.info("Already loaded.")
        else:
            outlier = outlier_samples[i].reshape((1, -1))

            embedder_start_time = datetime.datetime.now()
            outliers_y_gd_transformed[i, :] = dTSNE_mnist.transform(
                outlier, y='closest', verbose=2,
                optimizer_kwargs={'early_exaggeration': None})
            embedder_end_time = datetime.datetime.now()
            outliers_y_time_gd_transformed[i] = (
                embedder_end_time - embedder_start_time).total_seconds()
            logging.info("Time: %f s", outliers_y_time_gd_transformed[i])

            embedder_start_time = datetime.datetime.now()
            outliers_y_gd_variance_recalc_transformed[i, :] = dTSNE_mnist.transform(
                outlier, keep_sigmas=False, y='closest', verbose=2,
                optimizer_kwargs={'early_exaggeration': None})
            embedder_end_time = datetime.datetime.now()
            outliers_y_time_gd_variance_recalc_transformed[i] = (
                embedder_end_time - embedder_start_time).total_seconds()
            logging.info("Time (VR): %f s",
                         outliers_y_time_gd_variance_recalc_transformed[i])

            # Pick a random start anywhere in the embedding's bounding box,
            # not necessarily near the center.
            y_start = np.array([[
                np.random.uniform(np.min(Y_mnist[:, 0]), np.max(Y_mnist[:, 0])),
                np.random.uniform(np.min(Y_mnist[:, 1]), np.max(Y_mnist[:, 1]))
            ]])
            outliers_random_starting_positions[i, :] = y_start

            embedder_start_time = datetime.datetime.now()
            outliers_y_gd_transformed_random[i, :] = dTSNE_mnist.transform(
                outlier, y=y_start,  # y='random',
                verbose=2, optimizer_kwargs={'early_exaggeration': None})
            embedder_end_time = datetime.datetime.now()
            outliers_y_time_gd_transformed_random[i] = (
                embedder_end_time - embedder_start_time).total_seconds()
            logging.info("Time (random): %f s",
                         outliers_y_time_gd_transformed_random[i])

            embedder_start_time = datetime.datetime.now()
            outliers_y_gd_variance_recalc_transformed_random[i, :] = dTSNE_mnist.transform(
                outlier, keep_sigmas=False, y=y_start,  # y='random',
                verbose=2, optimizer_kwargs={'early_exaggeration': None})
            embedder_end_time = datetime.datetime.now()
            outliers_y_time_gd_variance_recalc_transformed_random[i] = (
                embedder_end_time - embedder_start_time).total_seconds()
            logging.info("Time (VR, random): %f s",
                         outliers_y_time_gd_variance_recalc_transformed_random[i])

            embedder_start_time = datetime.datetime.now()
            outliers_y_gd_early_exagg_transformed_random[i, :] = dTSNE_mnist.transform(
                outlier, y=y_start,  # y='random',
                verbose=2)
            embedder_end_time = datetime.datetime.now()
            outliers_y_time_gd_early_exagg_transformed_random[i] = (
                embedder_end_time - embedder_start_time).total_seconds()
            logging.info("Time (EE, random): %f s",
                         outliers_y_time_gd_early_exagg_transformed_random[i])

            embedder_start_time = datetime.datetime.now()
            outliers_y_gd_early_exagg_transformed[i, :] = dTSNE_mnist.transform(
                outlier, y='closest', verbose=2)
            embedder_end_time = datetime.datetime.now()
            outliers_y_time_gd_early_exagg_transformed[i] = (
                embedder_end_time - embedder_start_time).total_seconds()
logging.info("Time (EE): %f s", outliers_y_time_gd_early_exagg_transformed[i]) embedder_start_time = datetime.datetime.now() outliers_y_gd_variance_recalc_early_exagg_transformed_random[i, :] = dTSNE_mnist.transform(outlier, y=y_start, keep_sigmas=False, verbose=2) embedder_end_time = datetime.datetime.now() outliers_y_time_gd_variance_recalc_early_exagg_transformed_random[i] = \ (embedder_end_time - embedder_start_time).total_seconds() logging.info("Time (VR,EE,random): %f s", outliers_y_time_gd_variance_recalc_early_exagg_transformed_random[i]) embedder_start_time = datetime.datetime.now() outliers_y_gd_variance_recalc_early_exagg_transformed[i, :] = dTSNE_mnist.transform(outlier, keep_sigmas=False, y='closest', verbose=2) embedder_end_time = datetime.datetime.now() outliers_y_time_gd_variance_recalc_early_exagg_transformed[i] = \ (embedder_end_time - embedder_start_time).total_seconds() logging.info("Time (VR,EE): %f s", outliers_y_time_gd_variance_recalc_early_exagg_transformed[i]) covered_samples.append(i) logging.info("Saving...") # Gradient descent results take a long while. Let's cache. if not only_time: with open(output_file, 'wb') as f: pickle.dump((outliers_y_gd_transformed, outliers_y_gd_variance_recalc_transformed, outliers_y_gd_transformed_random, outliers_y_gd_variance_recalc_transformed_random, outliers_y_gd_early_exagg_transformed_random, outliers_y_gd_early_exagg_transformed, outliers_y_gd_variance_recalc_early_exagg_transformed_random, outliers_random_starting_positions, outliers_y_gd_variance_recalc_early_exagg_transformed, covered_samples), f) with open(output_time_file, 'wb') as f: pickle.dump((outliers_y_time_gd_transformed, outliers_y_time_gd_variance_recalc_transformed, outliers_y_time_gd_transformed_random, outliers_y_time_gd_variance_recalc_transformed_random, outliers_y_time_gd_early_exagg_transformed_random, outliers_y_time_gd_early_exagg_transformed, outliers_y_time_gd_variance_recalc_early_exagg_transformed_random, outliers_y_time_gd_variance_recalc_early_exagg_transformed, covered_samples), f)