def calc_kl(*, common_info, embedded_neighbors, parameters):
    """Return the mean KL divergence after embedding each letter sample.

    For every letter sample, builds (or loads from a per-sample cache file)
    the extended P-matrix of the training set plus that single sample, then
    evaluates the KL divergence of the corresponding extended embedding.

    :param common_info: dict with keys "dTSNE_mnist", "X_mnist", "Y_mnist"
        and "letter_samples".
    :param embedded_neighbors: 2-D array, one embedded point per letter sample.
    :param parameters: experiment parameter dict (drives cache file names).
    :return: mean KL divergence over all letter samples.
    """
    dTSNE_mnist = common_info["dTSNE_mnist"]
    X_mnist = common_info["X_mnist"]
    Y_mnist = common_info["Y_mnist"]
    letter_samples = common_info["letter_samples"]
    # The cache directory does not depend on the sample index; compute it
    # once instead of on every loop iteration.
    distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
        settings.tsne_parameter_set | settings.letter_parameter_set, parameters,
        os.sep)
    per_sample_kl_divergences = list()
    for j in range(len(letter_samples)):
        # Don't store those matrices in a single file. Way too large.
        # Make sure you can load them one-by-one.
        distance_matrix_file = distance_matrix_dir + 'item' + str(j) + '.p'
        if os.path.isfile(distance_matrix_file):
            if j % 50 == 0:
                logging.info("\t%d P-matrix file found. Loading.", j)
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            if j % 50 == 0:
                logging.info(
                    "\t%d P-matrix file not found. Creating and saving.", j)
            # Extend the training set with this one sample and recompute
            # the conditional-probability matrix at the trained perplexity.
            new_X = np.concatenate((X_mnist, letter_samples[j, :].reshape(
                (1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            if not os.path.isdir(distance_matrix_dir):
                logging.info('Creating directory: %s', distance_matrix_dir)
                os.mkdir(distance_matrix_dir)
            # Single file with p matrix.
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # Now use it to calculate KL divergence.
        new_Y = np.concatenate((Y_mnist, embedded_neighbors[j, :].reshape(
            (1, -1))), axis=0)
        kl, _ = lion_tsne.kl_divergence_and_gradient(p_matrix=new_P, y=new_Y)
        per_sample_kl_divergences.append(kl)
    return np.mean(per_sample_kl_divergences)
def generate_gd_kl_temp_filename(parameters):
    """Temp-file path caching KL results for the letter-A GD experiment."""
    prefix = '../results/letter_A_gd_kl_temp_'
    suffix = generate_data.combine_prefixes(
        settings.tsne_parameter_set | settings.letter_A_parameter_set,
        parameters)
    return prefix + suffix
def generate_time_results_filename(parameters=settings.parameters):
    """Path of the pickled per-sample GD timing results for the outlier test."""
    prefix = '../results/outlier_time_gd_'
    suffix = generate_data.combine_prefixes(
        settings.tsne_parameter_set | settings.outlier_parameter_set,
        parameters)
    return prefix + suffix
def main(parameters=settings.parameters):
    """Post-process gradient-descent cluster-attribution results.

    For each of the 8 GD embedding variants computes: nearest-neighbor
    accuracy and precision, mean per-sample embedding time, distance
    percentiles of the nearest training neighbor, and mean KL divergence
    (with per-sample on-disk caching of P-matrices and partial results),
    then pickles the summary to the postprocess output file.
    """
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(
        parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    accuracy_nn = parameters.get("accuracy_nn",
                                 settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(
        parameters=parameters)
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)

    # =================== ACCURACY
    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        # Indices of the n points of Y_mnist closest to y (squared Euclidean).
        y_distances = np.sum((Y_mnist - y)**2, axis=1)
        return np.argsort(y_distances)[:n]

    gd_method_list = [
        r'Closest $Y_{init}$', r'Random $Y_{init}$',
        r'Closest $Y_{init}$; new $\sigma$', r'Random $Y_{init}$; new $\sigma$',
        r'Closest $Y_{init}$; EE', r'Random $Y_{init}$; EE',
        r'Closest $Y_{init}$; new $\sigma$; EE',
        r'Random $Y_{init}$; new $\sigma$; EE'
    ]
    gd_results_file = exp_cluster_attr_test_GD.generate_cluster_results_filename(
        parameters=parameters)
    # NOTE: the tuple order below must match what the GD experiment pickled.
    with open(gd_results_file, 'rb') as f:
        (picked_neighbors_y_gd_transformed,
         picked_neighbors_y_gd_variance_recalc_transformed,
         picked_neighbors_y_gd_transformed_random,
         picked_neighbors_y_gd_variance_recalc_transformed_random,
         picked_neighbors_y_gd_early_exagg_transformed_random,
         picked_neighbors_y_gd_early_exagg_transformed,
         picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
         picked_random_starting_positions,
         picked_neighbors_y_gd_variance_recalc_early_exagg_transformed,
         covered_samples) = pickle.load(f)
    # Order here must line up with gd_method_list.
    gd_method_results = [
        picked_neighbors_y_gd_transformed,
        picked_neighbors_y_gd_transformed_random,
        picked_neighbors_y_gd_variance_recalc_transformed,
        picked_neighbors_y_gd_variance_recalc_transformed_random,
        picked_neighbors_y_gd_early_exagg_transformed,
        picked_neighbors_y_gd_early_exagg_transformed_random,
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed,
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
    ]

    input_time_file = exp_cluster_attr_test_GD.generate_time_results_filename(
        parameters)
    with open(input_time_file, 'rb') as f:
        picked_neighbors_y_time_gd_transformed, \
            picked_neighbors_y_time_gd_variance_recalc_transformed, \
            picked_neighbors_y_time_gd_transformed_random, \
            picked_neighbors_y_time_gd_variance_recalc_transformed_random, \
            picked_neighbors_y_time_gd_early_exagg_transformed_random, \
            picked_neighbors_y_time_gd_early_exagg_transformed, \
            picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random, \
            picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed, \
            covered_samples = pickle.load(f)
    gd_time = [
        np.mean(picked_neighbors_y_time_gd_transformed),
        np.mean(picked_neighbors_y_time_gd_transformed_random),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_transformed),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_transformed_random),
        np.mean(picked_neighbors_y_time_gd_early_exagg_transformed),
        np.mean(picked_neighbors_y_time_gd_early_exagg_transformed_random),
        np.mean(
            picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed
        ),
        np.mean(
            picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random
        ),
    ]
    gd_accuracy = np.zeros(len(gd_method_list))
    gd_precision = np.zeros(len(gd_method_list))

    # ============================== Distance percentiles
    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor ...
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # Actually, whatever axis
    gd_nearest_neighbors_percentiles_matrix = np.zeros(
        (len(picked_neighbors), len(gd_method_list)))
    for i in range(len(picked_neighbors)):
        for j in range(len(gd_method_list)):
            y = gd_method_results[j][i, :]
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y)**2, axis=1)))
            gd_nearest_neighbors_percentiles_matrix[
                i, j] = stats.percentileofscore(nearest_neighbors_y_dist,
                                                nn_dist)
    gd_distance_percentiles = np.mean(gd_nearest_neighbors_percentiles_matrix,
                                      axis=0)
    for j in range(len(gd_method_list)):
        logging.info("%s :\t%f", gd_method_list[j], gd_distance_percentiles[j])

    # ============================== Accuracy and precision
    # (the original section header said "KL divergence", which was misleading)
    for j in range(len(gd_method_results)):
        per_sample_accuracy = np.zeros((len(picked_neighbors), ))
        per_sample_precision = np.zeros((len(picked_neighbors), ))
        for i in range(len(picked_neighbors)):
            expected_label = picked_neighbor_labels[i]
            nn_indices = get_nearest_neighbors_in_y(gd_method_results[j][i, :],
                                                    Y_mnist,
                                                    n=accuracy_nn)
            obtained_labels = labels_mnist[nn_indices]
            per_sample_accuracy[i] = sum(
                obtained_labels == expected_label) / len(obtained_labels)
            x = picked_neighbors[i, :]
            y = gd_method_results[j][i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x, X_mnist,
                                                      n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y, Y_mnist,
                                                      n=precision_nn)
            # Renamed comprehension variable: the original used `i`, which
            # shadowed the outer sample index.
            matching_indices = len(
                [k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = matching_indices / precision_nn
        gd_accuracy[j] = np.mean(per_sample_accuracy)
        gd_precision[j] = np.mean(per_sample_precision)
        logging.info("%s :\t%f\t%f", gd_method_list[j], gd_precision[j],
                     gd_accuracy[j])

    # ============================== KL divergence
    gd_kl = np.zeros((len(gd_method_list), len(picked_neighbors)))
    processed_indices = list()
    kl_gd_performance_file = generate_gd_kl_temp_filename(parameters)
    if os.path.isfile(kl_gd_performance_file):
        with open(kl_gd_performance_file, 'rb') as f:
            gd_kl, processed_indices = pickle.load(f)
    # KL divergence increase for all 1000 samples is very slow to calculate.
    # Main part of that is calculating the P-matrix, so P-matrices are cached
    # on disk, one file per sample.
    for i in range(len(picked_neighbors)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set |
            settings.x_neighbors_selection_parameter_set, parameters, os.sep)
        # BUG FIX: the cache file must be indexed by the current sample `i`.
        # The original used `j`, a stale loop variable left over from the
        # accuracy loop above, so all samples hit the same wrong cache file.
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, picked_neighbors[i, :].reshape(
                (1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            # Robustness: the directory may not exist yet on a fresh run.
            os.makedirs(distance_matrix_dir, exist_ok=True)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # For all of the methods the P-matrix is shared.
        for j in range(len(gd_method_results)):
            new_Y = np.concatenate(
                (Y_mnist, gd_method_results[j][i, :].reshape((1, -1))), axis=0)
            gd_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(
                p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        # Checkpoint partial results so an interrupted run can resume.
        with open(kl_gd_performance_file, 'wb') as f:
            pickle.dump((gd_kl, processed_indices), f)
    # This should be fast
    gd_avg_kl = np.mean(gd_kl, axis=1)

    output_file = generate_gd_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((gd_method_list, gd_accuracy, gd_precision, gd_time,
                     gd_avg_kl, gd_distance_percentiles), f)
def main(parameters=settings.parameters):
    """Post-process kernelized-t-SNE cluster-attribution results.

    Selects three representative kernelized methods, computes their
    nearest-neighbor accuracy/precision, per-item time, distance
    percentiles and mean KL divergence (with per-sample caching), then
    pickles the summary to the postprocess output file.
    """
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(
        parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    accuracy_nn = parameters.get("accuracy_nn",
                                 settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(
        parameters=parameters)
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)

    # =================== Some starting stuff
    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        # Indices of the n points of Y_mnist closest to y (squared Euclidean).
        y_distances = np.sum((Y_mnist - y)**2, axis=1)
        return np.argsort(y_distances)[:n]

    kernelized_results_file = exp_cluster_attr_test_kernelized.generate_cluster_results_filename(
        parameters)
    with open(kernelized_results_file, 'rb') as f:
        kernelized_detailed_tsne_method_results, \
            kernelized_detailed_tsne_accuracy, \
            kernelized_detailed_tsne_precision, \
            kernelized_detailed_tsne_time, \
            kernelized_detailed_tsne_method_list = pickle.load(f)
    # Three representative parameter choices out of the detailed sweep.
    ind = [4, 24, 49]
    # Shorten the method names: keep the first 10 and last 8 characters.
    kernelized_method_list = [
        kernelized_detailed_tsne_method_list[i][:10] +
        kernelized_detailed_tsne_method_list[i][-8:] for i in ind
    ]
    kernelized_method_results = [
        kernelized_detailed_tsne_method_results[i] for i in ind
    ]
    kernelized_accuracy = np.zeros(len(kernelized_method_list))
    kernelized_precision = np.zeros(len(kernelized_method_list))
    kernelized_per_item_time = np.zeros(len(kernelized_method_list))

    # ============================== Distance percentiles
    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor ...
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # Actually, whatever axis
    kernelized_nearest_neighbors_percentiles_matrix = np.zeros(
        (len(picked_neighbors), len(kernelized_method_list)))
    for i in range(len(picked_neighbors)):
        for j in range(len(kernelized_method_list)):
            y = kernelized_method_results[j][i, :]
            kernelized_dist = np.min(
                np.sqrt(np.sum((Y_mnist - y)**2, axis=1)))
            kernelized_nearest_neighbors_percentiles_matrix[
                i, j] = stats.percentileofscore(nearest_neighbors_y_dist,
                                                kernelized_dist)
    kernelized_distance_percentiles = np.mean(
        kernelized_nearest_neighbors_percentiles_matrix, axis=0)
    for j in range(len(kernelized_method_list)):
        logging.info("%s %f", kernelized_method_list[j],
                     kernelized_distance_percentiles[j])

    # ============================== Accuracy and precision
    for j in range(len(kernelized_method_results)):
        per_sample_accuracy = np.zeros((len(picked_neighbors), ))
        per_sample_precision = np.zeros((len(picked_neighbors), ))
        for i in range(len(picked_neighbors)):
            expected_label = picked_neighbor_labels[i]
            y = kernelized_method_results[j][i, :]
            x = picked_neighbors[i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x, X_mnist,
                                                      n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y, Y_mnist,
                                                      n=precision_nn)
            matching_indices = len(
                [k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = matching_indices / precision_nn
            kernelized_indices = get_nearest_neighbors_in_y(
                kernelized_method_results[j][i, :], Y_mnist, n=accuracy_nn)
            obtained_labels = labels_mnist[kernelized_indices]
            per_sample_accuracy[i] = sum(
                obtained_labels == expected_label) / len(obtained_labels)
        kernelized_accuracy[j] = np.mean(per_sample_accuracy)
        kernelized_precision[j] = np.mean(per_sample_precision)
        kernelized_per_item_time[j] = kernelized_detailed_tsne_time[j] / len(
            picked_neighbors)
        logging.info("%s :\t%f\t%f", kernelized_method_list[j],
                     kernelized_precision[j], kernelized_accuracy[j])

    # ============================== KL divergence
    kernelized_kl = np.zeros(
        (len(kernelized_method_list), len(picked_neighbors)))
    processed_indices = list()
    kl_kernelized_performance_file = generate_kernelized_kl_temp_filename(
        parameters)
    if os.path.isfile(kl_kernelized_performance_file):
        with open(kl_kernelized_performance_file, 'rb') as f:
            kernelized_kl, processed_indices = pickle.load(f)
    # KL divergence increase for all 1000 samples is very slow to calculate.
    # Main part of that is calculating the P-matrix, so P-matrices are cached
    # on disk, one file per sample.
    for i in range(len(picked_neighbors)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set |
            settings.x_neighbors_selection_parameter_set, parameters, os.sep)
        # BUG FIX: the cache file must be indexed by the current sample `i`.
        # The original used `j`, a stale loop variable left over from the
        # accuracy loop above, so all samples hit the same wrong cache file.
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, picked_neighbors[i, :].reshape(
                (1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            # Robustness: the directory may not exist yet on a fresh run.
            os.makedirs(distance_matrix_dir, exist_ok=True)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # For all of the methods the P-matrix is shared.
        for j in range(len(kernelized_method_results)):
            new_Y = np.concatenate(
                (Y_mnist, kernelized_method_results[j][i, :].reshape((1, -1))),
                axis=0)
            kernelized_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(
                p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        # Checkpoint partial results so an interrupted run can resume.
        with open(kl_kernelized_performance_file, 'wb') as f:
            pickle.dump((kernelized_kl, processed_indices), f)
    # This should be fast
    kernelized_avg_kl = np.mean(kernelized_kl, axis=1)

    output_file = generate_kernelized_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((kernelized_method_list, kernelized_accuracy,
                     kernelized_precision, kernelized_avg_kl,
                     kernelized_per_item_time,
                     kernelized_distance_percentiles), f)
def generate_idw_power_plot_filename(parameters=settings.parameters):
    """File name for the IDW power plot, derived from NN-accuracy parameters."""
    suffix = generate_data.combine_prefixes(settings.nn_accuracy_parameter_set,
                                            parameters)
    return idw_power_plot_file_prefix + suffix
def main(parameters=settings.parameters):
    """Post-process gradient-descent letter-test results.

    Computes, per GD embedding variant: mean embedding time, mean KL
    divergence (with per-sample P-matrix caching and resumable partial
    results), and distance percentiles of the nearest training neighbor;
    pickles the summary to the GD postprocess output file.
    """
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    letter_samples, _, _ = generate_data.load_letters(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # Actually, whatever axis
    # ============== KL Divergence
    gd_method_list = [
        r'Closest $Y_{init}$', r'Random $Y_{init}$',
        r'Closest $Y_{init}$; new $\sigma$', r'Random $Y_{init}$; new $\sigma$',
        r'Closest $Y_{init}$; EE', r'Random $Y_{init}$; EE',
        r'Closest $Y_{init}$; new $\sigma$; EE',
        r'Random $Y_{init}$; new $\sigma$; EE'
    ]
    gd_results_file = exp_letter_test_GD.generate_letter_results_filename(
        parameters=parameters)
    # Tuple order must match what exp_letter_test_GD pickled.
    with open(gd_results_file, 'rb') as f:
        (letters_y_gd_transformed, letters_y_gd_variance_recalc_transformed,
         letters_y_gd_transformed_random,
         letters_y_gd_variance_recalc_transformed_random,
         letters_y_gd_early_exagg_transformed_random,
         letters_y_gd_early_exagg_transformed,
         letters_y_gd_variance_recalc_early_exagg_transformed_random,
         picked_random_starting_positions,
         letters_y_gd_variance_recalc_early_exagg_transformed,
         covered_samples) = pickle.load(f)
    # Order here lines up with gd_method_list.
    gd_letters_results = [
        letters_y_gd_transformed,
        letters_y_gd_transformed_random,
        letters_y_gd_variance_recalc_transformed,
        letters_y_gd_variance_recalc_transformed_random,
        letters_y_gd_early_exagg_transformed,
        letters_y_gd_early_exagg_transformed_random,
        letters_y_gd_variance_recalc_early_exagg_transformed,
        letters_y_gd_variance_recalc_early_exagg_transformed_random,
    ]
    input_time_file = exp_letter_test_GD.generate_time_results_filename(
        parameters)
    with open(input_time_file, 'rb') as f:
        letters_y_time_gd_transformed, letters_y_time_gd_variance_recalc_transformed, \
            letters_y_time_gd_transformed_random, \
            letters_y_time_gd_variance_recalc_transformed_random, \
            letters_y_time_gd_early_exagg_transformed_random, \
            letters_y_time_gd_early_exagg_transformed, \
            letters_y_time_gd_variance_recalc_early_exagg_transformed_random, \
            letters_y_time_gd_variance_recalc_early_exagg_transformed, \
            covered_samples = pickle.load(f)
    gd_time = [
        np.mean(letters_y_time_gd_transformed),
        np.mean(letters_y_time_gd_transformed_random),
        np.mean(letters_y_time_gd_variance_recalc_transformed),
        np.mean(letters_y_time_gd_variance_recalc_transformed_random),
        np.mean(letters_y_time_gd_early_exagg_transformed),
        np.mean(letters_y_time_gd_early_exagg_transformed_random),
        np.mean(letters_y_time_gd_variance_recalc_early_exagg_transformed),
        np.mean(
            letters_y_time_gd_variance_recalc_early_exagg_transformed_random),
    ]
    gd_letters_kl = np.zeros((len(gd_method_list), len(letter_samples)))
    processed_indices = list()
    kl_gd_letters_performance_file = generate_gd_kl_temp_filename(parameters)
    # Resume from previously checkpointed partial KL results, if any.
    if os.path.isfile(kl_gd_letters_performance_file):
        with open(kl_gd_letters_performance_file, 'rb') as f:
            gd_letters_kl, processed_indices = pickle.load(f)
    # KL divergence increase for all 1000 samples is very slow to calculate.
    # Main part of that is calculating P-matrix.
    # NOTE(review): per_sample_KL appears to be unused below.
    per_sample_KL = np.zeros((len(letter_samples), ))
    for i in range(len(letter_samples)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.letter_parameter_set,
            parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            # Extend the training set with this one sample and recompute
            # the P-matrix at the trained perplexity.
            new_X = np.concatenate((X_mnist, letter_samples[i, :].reshape(
                (1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # For all of methods P-matrix is shared.
        for j in range(len(gd_letters_results)):
            # Single file with p matrix
            new_Y = np.concatenate(
                (Y_mnist, gd_letters_results[j][i, :].reshape((1, -1))),
                axis=0)
            gd_letters_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(
                p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        # Checkpoint after every sample so an interrupted run can resume.
        with open(kl_gd_letters_performance_file, 'wb') as f:
            pickle.dump((gd_letters_kl, processed_indices), f)
    # This should be fast
    gd_avg_letters_kl = np.mean(gd_letters_kl, axis=1)
    # ============== Distance percentiles
    gd_letters_percentiles_matrix = np.zeros(
        (len(letter_samples), len(gd_method_list)))
    gd_letters_distance_matrix = np.zeros(
        (len(letter_samples), len(gd_method_list)))
    for i in range(len(letter_samples)):
        for j in range(len(gd_method_list)):
            y = gd_letters_results[j][i, :]
            # Distance from the embedded sample to its closest training point.
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y)**2, axis=1)))
            gd_letters_distance_matrix[i, j] = nn_dist
            gd_letters_percentiles_matrix[i, j] = stats.percentileofscore(
                nearest_neighbors_y_dist, nn_dist)
    gd_letters_distance_percentiles = np.mean(gd_letters_percentiles_matrix,
                                              axis=0)
    gd_letters_distances = np.mean(gd_letters_distance_matrix, axis=0)
    for j in range(len(gd_method_list)):
        logging.info("%s: %f, %f", gd_method_list[j], gd_letters_distances[j],
                     gd_letters_distance_percentiles[j])
    output_file = generate_gd_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((gd_method_list, gd_time, gd_avg_letters_kl,
                     gd_letters_distance_percentiles), f)
def generate_nn_postprocess_filename(parameters):
    """Output path for the cluster-attribution NN post-processing results."""
    combined = generate_data.combine_prefixes(
        neural_network_commons.nn_model_prefixes |
        settings.x_neighbors_selection_parameter_set, parameters)
    return '../results/cluster_attr_nn_postprocess_' + combined
def generate_nn_kl_temp_filename(parameters):
    """Temp-file path caching per-sample KL results for the letter NN test."""
    combined = generate_data.combine_prefixes(
        neural_network_commons.nn_model_prefixes |
        settings.letter_parameter_set, parameters)
    return '../results/letter_nn_kl_temp_' + combined
def generate_letter_results_filename(letter_results_file_prefix,
                                     parameters=settings.parameters):
    """Build a letter-test results file name from the supplied prefix."""
    suffix = generate_data.combine_prefixes(
        settings.tsne_parameter_set | settings.letter_parameter_set,
        parameters)
    return letter_results_file_prefix + suffix
def generate_lion_power_performance_filename(parameters=settings.parameters):
    """File name for cached LION power-performance measurements."""
    suffix = generate_data.combine_prefixes(settings.nn_accuracy_parameter_set,
                                            parameters)
    return lion_power_performance_prefix + suffix
def train_or_load_models(regenerate_model1=False, regenerate_model2=False,
                         regenerate_model3=False,
                         parameters=settings.parameters):
    """Train (or load from cache) the three reference neural networks.

    Each model maps X_mnist to its t-SNE embedding Y_mnist. Architectures:
      * model1: 2 dense ReLU layers, 250 units each, dropout 0.25;
      * model2: 2 dense ReLU layers, 500 units each, dropout 0.5;
      * model3: a single dense tanh layer with 500 units.
    Trained architecture (JSON) and weights (HD5) are cached under
    ../results; pass the corresponding regenerate_modelN=True to force
    retraining. The triplicated train-or-load logic of the original is
    factored into nested helpers; behavior and call order are preserved.

    :return: dict with "models" -> (model1, model2, model3) and
        "Y_predicted" -> each model's prediction on X_mnist.
    """
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    keras_random_seed = parameters.get("keras_random_seed",
                                       settings.parameters["keras_random_seed"])
    # Reproducibility: parallel threads can bring uncontrolled randomness.
    # Luckily, models here are small, no need for parallel threads etc.
    session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                                  inter_op_parallelism_threads=1)
    session = tf.Session(config=session_conf)
    tf.keras.backend.set_session(session)

    def _model_files(prefix):
        # Weights/architecture cache file names for one model. The original
        # used separate (but identical) prefixes for weights and JSON.
        weights_file = prefix + generate_data.combine_prefixes(
            nn_model_prefixes, parameters, postfix='.hd5')
        json_file = prefix + generate_data.combine_prefixes(
            nn_model_prefixes, parameters, postfix='.json')
        return weights_file, json_file

    def _train_or_load(build_model, weights_file, json_file, regenerate):
        # Shared train-or-load logic, identical for all three models.
        if not os.path.isfile(weights_file) or regenerate:
            # Seeds are fixed right before model creation for reproducibility.
            set_all_random_seeds(keras_random_seed)
            model = build_model()
            model.compile(loss='mean_squared_error', optimizer='adam')
            model.fit(X_mnist, Y_mnist, epochs=5000, verbose=1,
                      validation_data=(X_mnist, Y_mnist))
            with open(json_file, "w") as f:
                f.write(model.to_json())
            model.save_weights(weights_file)
        else:
            with open(json_file, "r") as f:
                model = keras.models.model_from_json(f.read())
            model.load_weights(weights_file)
            model.compile(loss='mean_squared_error', optimizer='adam')
        return model

    def _build_model1():
        # 2 layers, 250 nodes per layer, ReLU activation, dropout rate 0.25.
        model = keras.models.Sequential()
        model.add(keras.layers.Dense(250, activation='relu',
                                     kernel_initializer='normal',
                                     input_dim=X_mnist.shape[1]))
        model.add(keras.layers.Dropout(0.25))
        # NOTE: input_dim on a non-first layer is ignored by Keras; kept
        # as in the original.
        model.add(keras.layers.Dense(250, activation='relu',
                                     kernel_initializer='normal',
                                     input_dim=X_mnist.shape[1]))
        model.add(keras.layers.Dropout(0.25))
        model.add(keras.layers.Dense(Y_mnist.shape[1],
                                     kernel_initializer='normal'))
        return model

    def _build_model2():
        # 2 layers, 500 nodes per layer, ReLU activation, dropout rate 0.5.
        model = keras.models.Sequential()
        model.add(keras.layers.Dense(500, activation='relu',
                                     kernel_initializer='normal',
                                     input_dim=X_mnist.shape[1]))
        model.add(keras.layers.Dropout(0.5))
        model.add(keras.layers.Dense(500, activation='relu',
                                     kernel_initializer='normal',
                                     input_dim=X_mnist.shape[1]))
        model.add(keras.layers.Dropout(0.5))
        model.add(keras.layers.Dense(Y_mnist.shape[1],
                                     kernel_initializer='normal'))
        return model

    def _build_model3():
        # Single 500-node tanh layer (the original comment claimed the same
        # architecture as model2, which did not match the code).
        model = keras.models.Sequential()
        model.add(keras.layers.Dense(500, activation='tanh',
                                     kernel_initializer='normal',
                                     input_dim=X_mnist.shape[1]))
        model.add(keras.layers.Dense(Y_mnist.shape[1],
                                     kernel_initializer='normal'))
        return model

    model1_weights_file, model1_json_file = _model_files('../results/model1')
    model2_weights_file, model2_json_file = _model_files('../results/model2')
    model3_weights_file, model3_json_file = _model_files('../results/model3')

    # Same order as the original: train/load each model, then predict,
    # before touching the next one.
    model1 = _train_or_load(_build_model1, model1_weights_file,
                            model1_json_file, regenerate_model1)
    Y_nn1_mnist = model1.predict(X_mnist)
    model2 = _train_or_load(_build_model2, model2_weights_file,
                            model2_json_file, regenerate_model2)
    Y_nn2_mnist = model2.predict(X_mnist)
    model3 = _train_or_load(_build_model3, model3_weights_file,
                            model3_json_file, regenerate_model3)
    Y_nn3_mnist = model3.predict(X_mnist)

    return {"models": (model1, model2, model3),
            "Y_predicted": (Y_nn1_mnist, Y_nn2_mnist, Y_nn3_mnist)}
def generate_cache_filename(parameters=settings.parameters):
    """Cache-file path for tuned kernelized t-SNE parameters."""
    combined = generate_data.combine_prefixes(
        settings.tsne_parameter_set |
        settings.x_neighbors_selection_parameter_set, parameters)
    return '../results/kernelized_tsne_parameters_cache' + combined
def generate_nn_postprocess_filename(parameters):
    """Output path for the outlier-test NN post-processing results."""
    combined = generate_data.combine_prefixes(
        neural_network_commons.nn_model_prefixes |
        settings.outlier_parameter_set, parameters)
    return '../results/outlier_nn_postprocess_' + combined
def generate_cluster_results_filename(parameters=settings.parameters):
    """Results-file path for the kernelized cluster-attribution experiment."""
    combined = generate_data.combine_prefixes(
        settings.tsne_parameter_set |
        settings.x_neighbors_selection_parameter_set, parameters)
    return '../results/cluster_attr_kernelized_' + combined
def main(parameters=settings.parameters):
    """Post-process NN outlier-test results: KL divergence and distances.

    Computes, per NN method: mean KL divergence of the extended embedding
    (with per-sample P-matrix caching and resumable partial results) and
    distance percentiles of the nearest training neighbor; pickles the
    summary to the NN postprocess output file.
    """
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    # BUG FIX: X_mnist was loaded with load_y_mnist (the 2-D embedding).
    # It is concatenated with the high-dimensional outlier samples below to
    # build the input-space distance matrix, so it must be the input data.
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    outlier_samples, _ = generate_data.load_outliers(parameters=parameters)
    nn_results_file = exp_outlier_test_NN.generate_outlier_results_filename(
        parameters)
    with open(nn_results_file, 'rb') as f:
        nn_outliers_results, nn_models_orig, nn_method_list = pickle.load(f)
    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor ...
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # Actually, whatever axis

    # ================ KL DIVERGENCE ===================
    nn_outliers_kl = np.zeros((len(nn_method_list), len(outlier_samples)))
    processed_indices = list()
    kl_nn_outliers_performance_file = generate_nn_kl_temp_filename(parameters)
    # BUG FIX: previously checkpointed results were never reloaded, so the
    # resume-from-cache logic below could not work. Load them the same way
    # the sibling post-processing scripts do.
    if os.path.isfile(kl_nn_outliers_performance_file):
        with open(kl_nn_outliers_performance_file, 'rb') as f:
            nn_outliers_kl, processed_indices = pickle.load(f)
    # KL divergence increase for all samples is very slow to calculate.
    # Main part of that is calculating the P-matrix, so P-matrices are cached
    # on disk, one file per sample.
    for i in range(len(outlier_samples)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.outlier_parameter_set,
            parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, outlier_samples[i, :].reshape(
                (1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            # Robustness: the directory may not exist yet on a fresh run.
            os.makedirs(distance_matrix_dir, exist_ok=True)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # For all of the methods the P-matrix is shared.
        for j in range(len(nn_outliers_results)):
            new_Y = np.concatenate(
                (nn_models_orig[j], nn_outliers_results[j][i, :].reshape(
                    (1, -1))), axis=0)
            nn_outliers_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(
                p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        # Checkpoint partial results so an interrupted run can resume.
        with open(kl_nn_outliers_performance_file, 'wb') as f:
            pickle.dump((nn_outliers_kl, processed_indices), f)
    # This should be fast
    nn_avg_outliers_kl = np.mean(nn_outliers_kl, axis=1)

    # ================ DISTANCE MATRICES ===================
    nn_outliers_percentiles_matrix = np.zeros(
        (len(outlier_samples), len(nn_method_list)))
    nn_outliers_distance_matrix = np.zeros(
        (len(outlier_samples), len(nn_method_list)))
    for i in range(len(outlier_samples)):
        for j in range(len(nn_method_list)):
            y = nn_outliers_results[j][i, :]
            nn_dist = np.min(
                np.sqrt(np.sum((nn_models_orig[j] - y)**2, axis=1)))
            nn_outliers_distance_matrix[i, j] = nn_dist
            nn_outliers_percentiles_matrix[i, j] = stats.percentileofscore(
                nearest_neighbors_y_dist, nn_dist)
    nn_outliers_distance_percentiles = np.mean(nn_outliers_percentiles_matrix,
                                               axis=0)
    nn_outliers_distances = np.mean(nn_outliers_distance_matrix, axis=0)
    for j in range(len(nn_method_list)):
        # Use logging instead of print for consistency with the rest of
        # the pipeline.
        logging.info("%s %f %f", nn_method_list[j], nn_outliers_distances[j],
                     nn_outliers_distance_percentiles[j])

    output_file = generate_nn_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((nn_method_list, nn_avg_outliers_kl,
                     nn_outliers_distance_percentiles), f)
def generate_letter_results_filename(parameters=settings.parameters):
    """Results-file path for the letter NN experiment."""
    combined = generate_data.combine_prefixes(
        neural_network_commons.nn_model_prefixes |
        settings.letter_parameter_set, parameters)
    return '../results/letter_nn_' + combined
def generate_gd_postprocess_filename(parameters):
    """Return the path of the letter gradient-descent postprocess output file."""
    suffix = generate_data.combine_prefixes(
        settings.tsne_parameter_set | settings.letter_parameter_set, parameters)
    return '../results/letter_gd_postprocess_' + suffix
def main(parameters=settings.parameters, regenerate=False):
    """Postprocess kernelized-tSNE letter-A results.

    Computes (1) nearest-neighbor distances of each embedded letter-A sample
    and their percentiles against the MNIST embedding's own NN distances, and
    (2) per-sample KL divergences for a subset of kernelized methods, then
    pickles the aggregated results to the postprocess output file.

    Parameters
    ----------
    parameters : dict
        Experiment parameter set (defaults to ``settings.parameters``).
    regenerate : bool
        If True, ignore any cached per-sample KL results and recompute.
    """
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    letter_A_samples, _ = generate_data.load_A_letters(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)

    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # Actually, whatever axis

    kernelized_results_file = \
        exp_letter_A_test_kernelized.generate_letter_A_results_filename(parameters)
    with open(kernelized_results_file, 'rb') as f:
        kernelized_detailed_method_results, kernelized_detailed_tsne_time, \
            kernelized_detailed_method_list = pickle.load(f)

    # Keep a representative subset of the detailed kernelized methods.
    # NOTE(review): the indices 4, 24, 49 and the [:10] + [-8:] name slicing
    # presumably select particular kernel configurations — confirm upstream.
    ind = [4, 24, 49]
    kernelized_method_list = [
        kernelized_detailed_method_list[i][:10] +
        kernelized_detailed_method_list[i][-8:] for i in ind
    ]
    kernelized_letters_results = [
        kernelized_detailed_method_results[i] for i in ind
    ]

    # =========== DISTANCE PERCENTILES ==========
    kernelized_letters_percentiles_matrix = np.zeros(
        (len(letter_A_samples), len(kernelized_method_list)))
    kernelized_letters_distance_matrix = np.zeros(
        (len(letter_A_samples), len(kernelized_method_list)))
    for i in range(len(letter_A_samples)):
        for j in range(len(kernelized_method_list)):
            y = kernelized_letters_results[j][i, :]
            # Distance from the embedded sample to its nearest MNIST neighbor.
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y)**2, axis=1)))
            kernelized_letters_distance_matrix[i, j] = nn_dist
            kernelized_letters_percentiles_matrix[i, j] = stats.percentileofscore(
                nearest_neighbors_y_dist, nn_dist)
    kernelized_letters_distance_percentiles = np.mean(
        kernelized_letters_percentiles_matrix, axis=0)
    kernelized_letters_distances = np.mean(kernelized_letters_distance_matrix,
                                           axis=0)
    kernelized_per_item_time = kernelized_detailed_tsne_time / len(
        letter_A_samples)
    for j in range(len(kernelized_method_list)):
        logging.info("%s: %f, %f", kernelized_method_list[j],
                     kernelized_letters_distances[j],
                     kernelized_letters_distance_percentiles[j])

    # =========== KL DIVERGENCE ==========
    # KL divergence increase for all 1000 samples is very slow to calculate.
    # Main part of that is calculating P-matrix, so per-sample results are
    # cached and resumed across runs.
    kernelized_letters_kl = np.zeros(
        (len(kernelized_method_list), len(letter_A_samples)))
    processed_indices = list()
    kl_kernelized_tsne_letters_performance_file = \
        generate_kernelized_kl_temp_filename(parameters)
    if os.path.isfile(
            kl_kernelized_tsne_letters_performance_file) and not regenerate:
        with open(kl_kernelized_tsne_letters_performance_file, 'rb') as f:
            kernelized_letters_kl, processed_indices = pickle.load(f)

    for i in range(len(letter_A_samples)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + \
            generate_data.combine_prefixes(
                settings.tsne_parameter_set | settings.letter_A_parameter_set,
                parameters, os.sep)
        # BUG FIX: was str(j) — j was stale from the logging loop above, so
        # every sample read/wrote the same cached P-matrix file. Use the
        # current sample index i (matches the per-sample caching elsewhere).
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Don't store those matrices in a single file; way too large.
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, letter_A_samples[i, :].reshape(
                (1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            # Create the cache directory on first use (same pattern as the
            # sibling KL-calculation code in this project).
            if not os.path.isdir(distance_matrix_dir):
                logging.info('Creating directory: %s', distance_matrix_dir)
                os.mkdir(distance_matrix_dir)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # For all of methods P-matrix is shared.
        for j in range(len(kernelized_letters_results)):
            # Single file with p matrix
            new_Y = np.concatenate(
                (Y_mnist, kernelized_letters_results[j][i, :].reshape(
                    (1, -1))), axis=0)
            kernelized_letters_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(
                p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        # Persist progress after every sample so interrupted runs can resume.
        with open(kl_kernelized_tsne_letters_performance_file, 'wb') as f:
            pickle.dump((kernelized_letters_kl, processed_indices), f)

    # This should be fast
    kernelized_avg_letters_kl = np.mean(kernelized_letters_kl, axis=1)

    output_file = generate_kernelized_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((kernelized_method_list, kernelized_avg_letters_kl,
                     kernelized_per_item_time,
                     kernelized_letters_distance_percentiles), f)
def generate_idw_power_filename(parameters=settings.parameters):
    """Return the path of the IDW power performance file for *parameters*."""
    suffix = generate_data.combine_prefixes(
        settings.x_neighbors_selection_parameter_set, parameters)
    return idw_power_performance_file_prefix + suffix
def generate_time_results_filename(parameters=settings.parameters):
    """Return the path of the letter-A gradient-descent timing results file."""
    suffix = generate_data.combine_prefixes(
        settings.tsne_parameter_set | settings.letter_A_parameter_set,
        parameters)
    return '../results/letter_A_time_gd_' + suffix
def generate_kernelized_kl_temp_filename(parameters):
    """Return the path of the temp file caching kernelized cluster-attr KL results."""
    suffix = generate_data.combine_prefixes(
        settings.tsne_parameter_set |
        settings.x_neighbors_selection_parameter_set, parameters)
    return '../results/cluster_attr_kernelized_kl_temp_' + suffix
import settings import logging import numpy as np import generate_data import pickle input_prefixes = ( './cluster-results/cluster_attr_gd_', './outlier-results/outlier_gd_', './letter-results/letter_gd_', './letter-A-results/letter_A_gd_', ) output_files = ( '../results/cluster_attr_gd_' + generate_data.combine_prefixes( settings.tsne_parameter_set | settings.x_neighbors_selection_parameter_set, settings.parameters), '../results/outlier_gd_' + generate_data.combine_prefixes( settings.tsne_parameter_set | settings.outlier_parameter_set, settings.parameters), '../results/letter_gd_' + generate_data.combine_prefixes( settings.tsne_parameter_set | settings.letter_parameter_set, settings.parameters), '../results/letter_A_gd_' + generate_data.combine_prefixes( settings.tsne_parameter_set | settings.letter_A_parameter_set, settings.parameters), ) output_time_files = ( '../results/cluster_attr_time_gd_' + generate_data.combine_prefixes( settings.tsne_parameter_set
def generate_gd_postprocess_filename(parameters):
    """Return the path of the cluster-attr gradient-descent postprocess file."""
    suffix = generate_data.combine_prefixes(
        settings.tsne_parameter_set |
        settings.x_neighbors_selection_parameter_set, parameters)
    return '../results/cluster_attr_gd_postprocess_' + suffix
def generate_kernelized_kl_temp_filename(parameters):
    """Return the path of the temp file caching kernelized outlier KL results."""
    suffix = generate_data.combine_prefixes(
        settings.tsne_parameter_set | settings.outlier_parameter_set,
        parameters)
    return '../results/outlier_kernelized_kl_temp_' + suffix
def generate_outlier_results_filename(parameters=settings.parameters):
    """Return the path of the kernelized outlier results file for *parameters*."""
    suffix = generate_data.combine_prefixes(
        settings.tsne_parameter_set | settings.outlier_parameter_set,
        parameters)
    return '../results/outlier_kernelized_' + suffix
def generate_cluster_results_filename(cluster_results_file_prefix,
                                      parameters=settings.parameters):
    """Append the combined parameter prefixes to the supplied file prefix."""
    suffix = generate_data.combine_prefixes(
        settings.tsne_parameter_set |
        settings.x_neighbors_selection_parameter_set, parameters)
    return cluster_results_file_prefix + suffix