def get_common_info(parameters):
    res = {}
    res['dTSNE_mnist'] = generate_data.load_dtsne_mnist(parameters=parameters)
    res['Y_mnist'] = generate_data.load_y_mnist(parameters=parameters)
    res['X_mnist'] = generate_data.load_x_mnist(parameters=parameters)
    res['labels_mnist'] = generate_data.load_labels_mnist(parameters=parameters)
    res['picked_neighbors'] = generate_data.load_picked_neighbors(parameters=parameters)
    res['picked_neighbors_labels'] = generate_data.load_picked_neighbors_labels(parameters=parameters)
    res['accuracy_nn'] = parameters.get("accuracy_nn", settings.parameters["accuracy_nn"])

    # Distance from each training embedding to its closest neighbor ...
    D_Y = distance.squareform(distance.pdist(res['Y_mnist']))
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    res['nearest_neighbors_y_dist'] = np.min(D_Y, axis=1)  # D_Y is symmetric, so either axis works
    return res
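# Illustrative sketch (not part of the pipeline): how the precomputed
# res['nearest_neighbors_y_dist'] baseline is used by the postprocessing code further below.
# For an embedded test point we take its distance to the closest training embedding and ask
# what percentile of the training nearest-neighbor distances it falls into. Toy usage only;
# the real code works on Y_mnist and the per-method embedding results.
import numpy as np
from scipy import stats
from scipy.spatial import distance

def distance_percentile(y_new, Y_train, nearest_neighbors_y_dist):
    """Percentile of the new point's nearest-neighbor distance w.r.t. the training baseline."""
    nn_dist = np.min(np.sqrt(np.sum((Y_train - y_new) ** 2, axis=1)))
    return stats.percentileofscore(nearest_neighbors_y_dist, nn_dist)

# Example (toy data):
#   Y_train = np.random.RandomState(0).rand(100, 2)
#   D = distance.squareform(distance.pdist(Y_train)); np.fill_diagonal(D, np.inf)
#   baseline = np.min(D, axis=1)
#   distance_percentile(np.array([0.5, 0.5]), Y_train, baseline)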
def main(regenerate_model1=False, regenerate_model2=False, regenerate_model3=False,
         parameters=settings.parameters):
    models_and_results = neural_network_commons.train_or_load_models(
        regenerate_model1=regenerate_model1, regenerate_model2=regenerate_model2,
        regenerate_model3=regenerate_model3, parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(parameters=parameters)

    model1, model2, model3 = models_and_results["models"]
    Y_nn1_mnist, Y_nn2_mnist, Y_nn3_mnist = models_and_results["Y_predicted"]

    Y_neighb1_mnist = model1.predict(picked_neighbors)
    Y_neighb2_mnist = model2.predict(picked_neighbors)
    Y_neighb3_mnist = model3.predict(picked_neighbors)

    nn_method_results = [Y_neighb1_mnist, Y_neighb2_mnist, Y_neighb3_mnist]
    nn_models_orig = [Y_nn1_mnist, Y_nn2_mnist, Y_nn3_mnist]
    nn_method_list = ['NN - 2L; 250N; ReLu; D0.25',
                      'NN - 2L; 500N; ReLu; D0.5',
                      'NN - 1L; 500N; tanh']

    output_file = generate_cluster_results_filename(parameters)
    with open(output_file, 'wb') as f:
        pickle.dump((nn_method_results, nn_models_orig, nn_method_list), f)
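# Minimal sketch (assumption, not part of the pipeline): reading back the tuple that the
# main() above pickles, e.g. from a postprocessing notebook. The field order must match the
# pickle.dump call exactly; generate_cluster_results_filename is the same helper used above.
import pickle

def load_nn_cluster_results(parameters):
    with open(generate_cluster_results_filename(parameters), 'rb') as f:
        nn_method_results, nn_models_orig, nn_method_list = pickle.load(f)
    # Map method name -> embedded test points for convenience.
    return dict(zip(nn_method_list, nn_method_results)), nn_models_orig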
def main(parameters=settings.parameters, regenerate=False):
    picked_neighbors = generate_data.load_picked_neighbors(parameters=parameters)
    precision_nn = parameters.get("precision_nn", settings.parameters["precision_nn"])
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)

    result = dict()
    output_file = cluster_lion_RBF_IDW_commons.generate_cluster_results_filename(
        output_prefix, parameters)
    if os.path.isfile(output_file) and not regenerate:
        with open(output_file, "rb") as f:
            result = pickle.load(f)
        logging.info("Previous result loaded")
    else:
        logging.info("No previous result or regeneration requested")

    for fname_prefix in original_files_prefixes:
        cluster_results_file = cluster_lion_RBF_IDW_commons.generate_cluster_results_filename(
            fname_prefix, parameters)
        logging.info("Processing file: %s", cluster_results_file)
        with open(cluster_results_file, 'rb') as f:
            res = pickle.load(f)
        for method_name in res.keys():
            logging.info("Processing method: %s", method_name)
            if method_name not in result or regenerate:
                precision = calc_precision(res[method_name]["EmbeddedPoints"], X_mnist, Y_mnist,
                                           picked_neighbors, precision_nn)
                logging.info("%s precision: %f (accuracy was %f)",
                             method_name, precision, res[method_name]["Accuracy"])
                result[method_name] = precision
                # Save after each method so partial progress survives interruptions.
                with open(output_file, "wb") as f:
                    pickle.dump(result, f)
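# For reference, a sketch of the precision metric that calc_precision() is expected to compute
# (the real helper lives in cluster_lion_RBF_IDW_commons; this is an illustration of the metric,
# not the imported function): the fraction of the precision_nn nearest neighbors in X-space that
# are also among the precision_nn nearest neighbors of the embedded point in Y-space, averaged
# over the test points.
import numpy as np

def knn_overlap_precision(embedded_points, X_train, Y_train, x_test, n):
    per_sample = np.zeros(len(x_test))
    for i in range(len(x_test)):
        nn_x = np.argsort(np.sum((X_train - x_test[i]) ** 2, axis=1))[:n]
        nn_y = np.argsort(np.sum((Y_train - embedded_points[i]) ** 2, axis=1))[:n]
        per_sample[i] = len(set(nn_x) & set(nn_y)) / n
    return np.mean(per_sample)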
def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    accuracy_nn = parameters.get("accuracy_nn", settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn", settings.parameters["precision_nn"])
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(parameters=parameters)
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)

    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y) ** 2, axis=1)
        return np.argsort(y_distances)[:n]

    gd_method_list = [r'Closest $Y_{init}$',
                      r'Random $Y_{init}$',
                      r'Closest $Y_{init}$; new $\sigma$',
                      r'Random $Y_{init}$; new $\sigma$',
                      r'Closest $Y_{init}$; EE',
                      r'Random $Y_{init}$; EE',
                      r'Closest $Y_{init}$; new $\sigma$; EE',
                      r'Random $Y_{init}$; new $\sigma$; EE']

    gd_results_file = exp_cluster_attr_test_GD.generate_cluster_results_filename(parameters=parameters)
    with open(gd_results_file, 'rb') as f:
        (picked_neighbors_y_gd_transformed,
         picked_neighbors_y_gd_variance_recalc_transformed,
         picked_neighbors_y_gd_transformed_random,
         picked_neighbors_y_gd_variance_recalc_transformed_random,
         picked_neighbors_y_gd_early_exagg_transformed_random,
         picked_neighbors_y_gd_early_exagg_transformed,
         picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
         picked_random_starting_positions,
         picked_neighbors_y_gd_variance_recalc_early_exagg_transformed,
         covered_samples) = pickle.load(f)

    gd_method_results = [
        picked_neighbors_y_gd_transformed,
        picked_neighbors_y_gd_transformed_random,
        picked_neighbors_y_gd_variance_recalc_transformed,
        picked_neighbors_y_gd_variance_recalc_transformed_random,
        picked_neighbors_y_gd_early_exagg_transformed,
        picked_neighbors_y_gd_early_exagg_transformed_random,
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed,
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
    ]

    input_time_file = exp_cluster_attr_test_GD.generate_time_results_filename(parameters)
    with open(input_time_file, 'rb') as f:
        (picked_neighbors_y_time_gd_transformed,
         picked_neighbors_y_time_gd_variance_recalc_transformed,
         picked_neighbors_y_time_gd_transformed_random,
         picked_neighbors_y_time_gd_variance_recalc_transformed_random,
         picked_neighbors_y_time_gd_early_exagg_transformed_random,
         picked_neighbors_y_time_gd_early_exagg_transformed,
         picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random,
         picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed,
         covered_samples) = pickle.load(f)

    gd_time = [
        np.mean(picked_neighbors_y_time_gd_transformed),
        np.mean(picked_neighbors_y_time_gd_transformed_random),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_transformed),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_transformed_random),
        np.mean(picked_neighbors_y_time_gd_early_exagg_transformed),
        np.mean(picked_neighbors_y_time_gd_early_exagg_transformed_random),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random),
    ]

    gd_accuracy = np.zeros(len(gd_method_list))
    gd_precision = np.zeros(len(gd_method_list))

    # ============================== Distance percentiles
    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Distance to the closest neighbor ...
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # symmetric matrix, either axis works

    gd_nearest_neighbors_percentiles_matrix = np.zeros((len(picked_neighbors), len(gd_method_list)))
    for i in range(len(picked_neighbors)):
        for j in range(len(gd_method_list)):
            y = gd_method_results[j][i, :]
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y) ** 2, axis=1)))
            gd_nearest_neighbors_percentiles_matrix[i, j] = stats.percentileofscore(
                nearest_neighbors_y_dist, nn_dist)
    gd_distance_percentiles = np.mean(gd_nearest_neighbors_percentiles_matrix, axis=0)
    for j in range(len(gd_method_list)):
        logging.info("%s :\t%f", gd_method_list[j], gd_distance_percentiles[j])

    # ============================== Accuracy and precision
    for j in range(len(gd_method_results)):
        per_sample_accuracy = np.zeros(len(picked_neighbors))
        per_sample_precision = np.zeros(len(picked_neighbors))
        for i in range(len(picked_neighbors)):
            expected_label = picked_neighbor_labels[i]
            nn_indices = get_nearest_neighbors_in_y(gd_method_results[j][i, :], Y_mnist, n=accuracy_nn)
            obtained_labels = labels_mnist[nn_indices]
            per_sample_accuracy[i] = sum(obtained_labels == expected_label) / len(obtained_labels)

            x = picked_neighbors[i, :]
            y = gd_method_results[j][i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x, X_mnist, n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y, Y_mnist, n=precision_nn)
            matching_indices = len([k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = matching_indices / precision_nn
        gd_accuracy[j] = np.mean(per_sample_accuracy)
        gd_precision[j] = np.mean(per_sample_precision)
        logging.info("%s :\t%f\t%f", gd_method_list[j], gd_precision[j], gd_accuracy[j])

    # ============================== KL divergence
    gd_kl = np.zeros((len(gd_method_list), len(picked_neighbors)))
    processed_indices = list()

    kl_gd_performance_file = generate_gd_kl_temp_filename(parameters)
    if os.path.isfile(kl_gd_performance_file):
        with open(kl_gd_performance_file, 'rb') as f:
            gd_kl, processed_indices = pickle.load(f)

    # KL divergence increase for all 1000 samples is very slow to calculate.
    # The main cost is building the P-matrix, so it is cached per sample.
    for i in range(len(picked_neighbors)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.x_neighbors_selection_parameter_set,
            parameters, os.sep)
        # One P-matrix per sample i (the original used the stale loop variable j here).
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, picked_neighbors[i, :].reshape((1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(distance_matrix=new_D,
                                                          perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods; only Y changes.
        for j in range(len(gd_method_results)):
            new_Y = np.concatenate((Y_mnist, gd_method_results[j][i, :].reshape((1, -1))), axis=0)
            gd_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_gd_performance_file, 'wb') as f:
            pickle.dump((gd_kl, processed_indices), f)

    gd_avg_kl = np.mean(gd_kl, axis=1)  # This should be fast

    output_file = generate_gd_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((gd_method_list, gd_accuracy, gd_precision, gd_time, gd_avg_kl,
                     gd_distance_percentiles), f)
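# Minimal sketch (assumption, not part of the pipeline): consuming the postprocess file written
# above to print a per-method summary table. The field order matches the pickle.dump call;
# generate_gd_postprocess_filename is the same helper used above.
import pickle

def print_gd_summary(parameters):
    with open(generate_gd_postprocess_filename(parameters), 'rb') as f:
        (method_list, accuracy, precision, time_s, avg_kl, dist_percentiles) = pickle.load(f)
    print("%-40s %8s %9s %8s %10s %12s" %
          ("Method", "Accuracy", "Precision", "Time, s", "Avg KL", "Dist. perc."))
    for j, name in enumerate(method_list):
        print("%-40s %8.4f %9.4f %8.2f %10.5f %12.2f" %
              (name, accuracy[j], precision[j], time_s[j], avg_kl[j], dist_percentiles[j]))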
def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    accuracy_nn = parameters.get("accuracy_nn", settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn", settings.parameters["precision_nn"])
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(parameters=parameters)
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)

    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y) ** 2, axis=1)
        return np.argsort(y_distances)[:n]

    kernelized_results_file = exp_cluster_attr_test_kernelized.generate_cluster_results_filename(parameters)
    with open(kernelized_results_file, 'rb') as f:
        (kernelized_detailed_tsne_method_results, kernelized_detailed_tsne_accuracy,
         kernelized_detailed_tsne_precision, kernelized_detailed_tsne_time,
         kernelized_detailed_tsne_method_list) = pickle.load(f)

    ind = [4, 24, 49]  # K = 0.05, 0.25, 0.50 from the detailed sweep
    kernelized_method_list = [kernelized_detailed_tsne_method_list[i][:10] +
                              kernelized_detailed_tsne_method_list[i][-8:] for i in ind]
    kernelized_method_results = [kernelized_detailed_tsne_method_results[i] for i in ind]

    kernelized_accuracy = np.zeros(len(kernelized_method_list))
    kernelized_precision = np.zeros(len(kernelized_method_list))
    kernelized_per_item_time = np.zeros(len(kernelized_method_list))

    # ============================== Distance percentiles
    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Distance to the closest neighbor ...
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # symmetric matrix, either axis works

    kernelized_nearest_neighbors_percentiles_matrix = np.zeros(
        (len(picked_neighbors), len(kernelized_method_list)))
    for i in range(len(picked_neighbors)):
        for j in range(len(kernelized_method_list)):
            y = kernelized_method_results[j][i, :]
            kernelized_dist = np.min(np.sqrt(np.sum((Y_mnist - y) ** 2, axis=1)))
            kernelized_nearest_neighbors_percentiles_matrix[i, j] = stats.percentileofscore(
                nearest_neighbors_y_dist, kernelized_dist)
    kernelized_distance_percentiles = np.mean(kernelized_nearest_neighbors_percentiles_matrix, axis=0)
    for j in range(len(kernelized_method_list)):
        logging.info("%s %f", kernelized_method_list[j], kernelized_distance_percentiles[j])

    # ============================== Accuracy and precision
    for j in range(len(kernelized_method_results)):
        per_sample_accuracy = np.zeros(len(picked_neighbors))
        per_sample_precision = np.zeros(len(picked_neighbors))
        for i in range(len(picked_neighbors)):
            expected_label = picked_neighbor_labels[i]
            y = kernelized_method_results[j][i, :]
            x = picked_neighbors[i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x, X_mnist, n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y, Y_mnist, n=precision_nn)
            matching_indices = len([k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = matching_indices / precision_nn
            kernelized_indices = get_nearest_neighbors_in_y(y, Y_mnist, n=accuracy_nn)
            obtained_labels = labels_mnist[kernelized_indices]
            per_sample_accuracy[i] = sum(obtained_labels == expected_label) / len(obtained_labels)
        kernelized_accuracy[j] = np.mean(per_sample_accuracy)
        kernelized_precision[j] = np.mean(per_sample_precision)
        # The detailed time array is indexed by the full K sweep, so use ind[j] for the selected method.
        kernelized_per_item_time[j] = kernelized_detailed_tsne_time[ind[j]] / len(picked_neighbors)
        logging.info("%s :\t%f\t%f", kernelized_method_list[j], kernelized_precision[j],
                     kernelized_accuracy[j])

    # ============================== KL divergence
    kernelized_kl = np.zeros((len(kernelized_method_list), len(picked_neighbors)))
    processed_indices = list()

    kl_kernelized_performance_file = generate_kernelized_kl_temp_filename(parameters)
    if os.path.isfile(kl_kernelized_performance_file):
        with open(kl_kernelized_performance_file, 'rb') as f:
            kernelized_kl, processed_indices = pickle.load(f)

    # KL divergence increase for all 1000 samples is very slow to calculate.
    # The main cost is building the P-matrix, so it is cached per sample.
    for i in range(len(picked_neighbors)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.x_neighbors_selection_parameter_set,
            parameters, os.sep)
        # One P-matrix per sample i (the original used the stale loop variable j here).
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, picked_neighbors[i, :].reshape((1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(distance_matrix=new_D,
                                                          perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods; only Y changes.
        for j in range(len(kernelized_method_results)):
            new_Y = np.concatenate((Y_mnist, kernelized_method_results[j][i, :].reshape((1, -1))),
                                   axis=0)
            kernelized_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_kernelized_performance_file, 'wb') as f:
            pickle.dump((kernelized_kl, processed_indices), f)

    kernelized_avg_kl = np.mean(kernelized_kl, axis=1)  # This should be fast

    output_file = generate_kernelized_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((kernelized_method_list, kernelized_accuracy, kernelized_precision,
                     kernelized_avg_kl, kernelized_per_item_time,
                     kernelized_distance_percentiles), f)
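# The GD and kernelized postprocessing mains above repeat the same load-or-create logic for the
# per-sample P-matrix cache. A refactoring sketch under the same assumptions (module-level
# distance_matrix_dir_prefix, generate_data, settings, and the lion_tsne helpers used above);
# it is not wired into the pipeline, just an illustration of the shared pattern.
import os
import pickle
import numpy as np
from scipy.spatial import distance

def load_or_create_p_matrix(i, x_new, X_mnist, dTSNE_mnist, parameters):
    """Return the P-matrix for training data plus one new sample, cached as item<i>.p."""
    distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
        settings.tsne_parameter_set | settings.x_neighbors_selection_parameter_set,
        parameters, os.sep)
    distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
    if os.path.isfile(distance_matrix_file):
        with open(distance_matrix_file, 'rb') as f:
            new_P, _ = pickle.load(f)
        return new_P
    new_X = np.concatenate((X_mnist, x_new.reshape((1, -1))), axis=0)
    new_D = distance.squareform(distance.pdist(new_X))
    new_P, new_sigmas = lion_tsne.get_p_and_sigma(distance_matrix=new_D,
                                                  perplexity=dTSNE_mnist.perplexity)
    with open(distance_matrix_file, 'wb') as f:
        pickle.dump((new_P, new_sigmas), f)
    return new_P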
def generate_idw_power_performance(*, regenerate=False, recursive_regenerate=False,
                                   parameters=settings.parameters):
    global_idw_power_performance = dict()  # Start from scratch
    global_idw_power_performance_abs = dict()  # Start from scratch
    global_idw_accuracy = dict()
    global_idw_precision = dict()

    start_time = datetime.datetime.now()
    logging.info("IDW power experiment started: %s", start_time)

    idw_power_performance_file = generate_idw_power_filename(parameters)
    idw_power_plot_file = generate_idw_power_plot_filename(parameters)

    X_mnist = generate_data.load_x_mnist(parameters=parameters, regenerate=recursive_regenerate,
                                         recursive_regenerate=recursive_regenerate)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters, regenerate=recursive_regenerate,
                                         recursive_regenerate=recursive_regenerate)
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(parameters=parameters)
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(parameters=parameters)
    # Use the passed-in parameters consistently (was hard-coded to settings.parameters).
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters,
                                                   regenerate=recursive_regenerate,
                                                   recursive_regenerate=recursive_regenerate)
    accuracy_nn = parameters.get("accuracy_nn", settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn", settings.parameters["precision_nn"])

    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y) ** 2, axis=1)
        return np.argsort(y_distances)[:n]

    distance_matrix = distance.squareform(distance.pdist(X_mnist))
    np.fill_diagonal(distance_matrix, np.inf)  # We are not interested in distance to itself
    nn_x_distance = np.min(distance_matrix, axis=1)  # Any axis will do
    radius_x = dict()
    for p in idw_percentile_options:
        radius_x[p] = np.percentile(nn_x_distance, p)  # not used below

    if os.path.isfile(idw_power_performance_file) and not regenerate:
        with open(idw_power_performance_file, 'rb') as f:
            (global_idw_power_performance, global_idw_power_performance_abs,
             global_idw_accuracy, global_idw_precision) = pickle.load(f)  # matches the dump below
    else:
        logging.info("Regeneration requested")

    for p in idw_power_options:
        if p in global_idw_power_performance:
            logging.info("Loaded p %f", p)
            continue
        logging.info("Processing p %f", p)
        interpolator = dTSNE_mnist.generate_embedding_function(
            embedding_function_type='weighted-inverse-distance', function_kwargs={'power': p})

        per_sample_accuracy = np.zeros(len(picked_neighbors))
        per_sample_precision = np.zeros(len(picked_neighbors))
        for i in range(len(picked_neighbors)):
            expected_label = picked_neighbor_labels[i]
            result = interpolator(picked_neighbors[i], verbose=0)
            nn_indices = get_nearest_neighbors_in_y(result, Y_mnist, n=accuracy_nn)
            obtained_labels = labels_mnist[nn_indices]
            per_sample_accuracy[i] = sum(obtained_labels == expected_label) / len(obtained_labels)

            y = result
            x = picked_neighbors[i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x, X_mnist, n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y, Y_mnist, n=precision_nn)
            matching_indices = len([k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = matching_indices / precision_nn
        cur_acc = np.mean(per_sample_accuracy)
        cur_prec = np.mean(per_sample_precision)

        # Leave-one-out IDW reconstruction error on the training set.
        y_sum_square_dist = 0.0
        y_sum_abs_dist = 0.0
        y_count = 0.0
        for i in range(len(X_mnist)):
            distances = distance_matrix[i, :].copy()
            # The diagonal is already np.inf, so the point itself gets zero weight.
            # Step 1. Find nearest neighbors in the neighborhood (here: all other training points).
            neighbor_indices = list(range(X_mnist.shape[0]))
            neighbor_indices.remove(i)
            weights = 1 / distances[neighbor_indices] ** p
            weights = weights / np.sum(weights)
            cur_y_result = weights.dot(Y_mnist[neighbor_indices, :])
            # Squared and absolute Euclidean distances to the true embedding.
            square_dist = np.sum((cur_y_result - Y_mnist[i, :]) ** 2)
            y_sum_square_dist += square_dist
            y_sum_abs_dist += np.sqrt(square_dist)
            y_count += 1.0

        global_idw_power_performance[p] = y_sum_square_dist / y_count
        global_idw_power_performance_abs[p] = y_sum_abs_dist / y_count
        global_idw_accuracy[p] = cur_acc
        global_idw_precision[p] = cur_prec

        with open(idw_power_performance_file, 'wb') as f:
            pickle.dump((global_idw_power_performance, global_idw_power_performance_abs,
                         global_idw_accuracy, global_idw_precision), f)

    EPS = 1e-5
    y = list()
    x_global = list()
    for cur_power in idw_power_options:
        closest_power = [i for i in global_idw_power_performance_abs if np.abs(i - cur_power) < EPS]
        if len(closest_power) > 0:
            x_global.append(cur_power)
            y.append(global_idw_power_performance[closest_power[0]])
    idw_optimal_power = x_global[np.argmin(y)]

    with open(idw_power_plot_file, 'wb') as f:
        pickle.dump((x_global, y, idw_optimal_power), f)
    logging.info("IDW optimal power: %f", idw_optimal_power)

    end_time = datetime.datetime.now()
    logging.info("IDW power experiment ended: %s", end_time)
    logging.info("IDW power experiment duration: %s", end_time - start_time)
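# The leave-one-out loop above is inverse-distance-weighting (IDW) interpolation evaluated at
# each training point with that point held out: y(x) = sum_i w_i * y_i with w_i proportional to
# d(x, x_i)^(-p). A standalone sketch of the same formula for a new x (illustrative only; the
# experiment itself uses dTSNE_mnist.generate_embedding_function above). Assumes x_new does not
# coincide exactly with a training point, otherwise the weight would be infinite.
import numpy as np

def idw_embed(x_new, X_train, Y_train, power):
    d = np.sqrt(np.sum((X_train - x_new) ** 2, axis=1))
    weights = 1.0 / d ** power
    weights = weights / np.sum(weights)
    return weights.dot(Y_train)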
def main(parameters=settings.parameters, regenerate_parameters_cache=False):
    step = 0.01
    choice_K = np.arange(step, 2 + step, step)  # Let's try those K.

    logging.info("Started loading.")
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(parameters=parameters)
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(parameters=parameters)
    accuracy_nn = parameters.get("accuracy_nn", settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn", settings.parameters["precision_nn"])
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)
    baseline_accuracy = generate_data.get_baseline_accuracy(parameters=parameters)
    logging.info("Loaded everything.")

    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Distance to the closest neighbor ...
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # symmetric matrix, either axis works

    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y) ** 2, axis=1)
        return np.argsort(y_distances)[:n]

    # Implemented carefully. Not the fastest, but the most reliable way.
    kernel_tsne_mapping = kernelized_tsne.generate_kernelized_tsne_mapping_function(
        parameters=parameters, regenerate_parameters_cache=regenerate_parameters_cache)
    kernelized_detailed_tsne_method_list = ["Kernelized tSNE; K=%.2f" % (k) for k in choice_K]
    kernelized_detailed_tsne_method_results = list()
    kernelized_detailed_tsne_accuracy = np.zeros(len(kernelized_detailed_tsne_method_list))
    kernelized_detailed_tsne_precision = np.zeros(len(kernelized_detailed_tsne_method_list))
    kernelized_detailed_tsne_time = np.zeros(len(kernelized_detailed_tsne_method_list))

    for j in range(len(choice_K)):
        k = choice_K[j]
        logging.info("%f", k)
        embedder_start_time = datetime.datetime.now()
        kernelized_detailed_tsne_method_results.append(kernel_tsne_mapping(picked_neighbors, k=k))
        embedder_end_time = datetime.datetime.now()
        kernelized_detailed_tsne_time[j] = (embedder_end_time - embedder_start_time).total_seconds()
        logging.info("%f complete", k)
        logging.info("%s", kernelized_detailed_tsne_method_list[j])

        per_sample_accuracy = np.zeros(len(picked_neighbors))
        per_sample_precision = np.zeros(len(picked_neighbors))
        for i in range(len(picked_neighbors)):
            if i % 200 == 0:
                logging.info("%d", i)
            expected_label = picked_neighbor_labels[i]
            y = kernelized_detailed_tsne_method_results[j][i, :]
            x = picked_neighbors[i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x, X_mnist, n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y, Y_mnist, n=precision_nn)
            matching_indices = len([m for m in nn_x_indices if m in nn_y_indices])
            per_sample_precision[i] = matching_indices / precision_nn
            kernelized_indices = get_nearest_neighbors_in_y(y, Y_mnist, n=accuracy_nn)
            obtained_labels = labels_mnist[kernelized_indices]
            per_sample_accuracy[i] = sum(obtained_labels == expected_label) / len(obtained_labels)
        kernelized_detailed_tsne_accuracy[j] = np.mean(per_sample_accuracy)
        kernelized_detailed_tsne_precision[j] = np.mean(per_sample_precision)
        logging.info("%s :\t%f\t%f\t%f s", kernelized_detailed_tsne_method_list[j],
                     kernelized_detailed_tsne_precision[j], kernelized_detailed_tsne_accuracy[j],
                     kernelized_detailed_tsne_time[j])

    # Accuracy-vs-K plot
    f, ax = plt.subplots()
    f.set_size_inches(6, 3)
    x = [k for k in choice_K]  # Ensuring order
    y = kernelized_detailed_tsne_accuracy
    # No plot title; we'd better use the figure caption.
    plt.plot(x, y, c='blue')
    h = plt.axhline(y=baseline_accuracy, c='black', linestyle='--')
    plt.legend([h], ["Baseline Accuracy (%.4f)" % baseline_accuracy])
    plt.xlabel("Kernelized tSNE: K parameter")
    plt.ylabel("10-NN Accuracy")
    plt.ylim([0, 1])
    plt.xlim([0, 2])
    f.tight_layout()
    plt.savefig("../figures/kernelized-tsne-K-vs-accuracy.png")

    ind = [4, 24, 49]  # K = 0.05, 0.25, 0.50
    kernelized_tsne_method_list = [kernelized_detailed_tsne_method_list[i][:10] +
                                   kernelized_detailed_tsne_method_list[i][-8:] for i in ind]
    kernelized_tsne_method_results = [kernelized_detailed_tsne_method_results[i] for i in ind]

    kernelized_tsne_nearest_neighbors_percentiles_matrix = np.zeros(
        (len(picked_neighbors), len(kernelized_tsne_method_list)))
    for i in range(len(picked_neighbors)):
        for j in range(len(kernelized_tsne_method_list)):
            y = kernelized_tsne_method_results[j][i, :]
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y) ** 2, axis=1)))
            kernelized_tsne_nearest_neighbors_percentiles_matrix[i, j] = stats.percentileofscore(
                nearest_neighbors_y_dist, nn_dist)
    kernelized_tsne_distance_percentiles = np.mean(
        kernelized_tsne_nearest_neighbors_percentiles_matrix, axis=0)
    for j in range(len(kernelized_tsne_method_list)):
        logging.info("%s %f", kernelized_tsne_method_list[j],
                     kernelized_tsne_distance_percentiles[j])

    output_file = generate_cluster_results_filename(parameters)
    with open(output_file, 'wb') as f:
        pickle.dump((kernelized_detailed_tsne_method_results, kernelized_detailed_tsne_accuracy,
                     kernelized_detailed_tsne_precision, kernelized_detailed_tsne_time,
                     kernelized_detailed_tsne_method_list), f)
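# A small usage sketch (assumption, not part of the pipeline): picking the K with the best 10-NN
# accuracy from the sweep above and embedding a single new sample with it. kernel_tsne_mapping
# expects a 2D array of samples, as in the calls above, so the new sample is reshaped.
import numpy as np

def embed_with_best_k(kernel_tsne_mapping, choice_K, detailed_accuracy, x_new):
    best_k = choice_K[int(np.argmax(detailed_accuracy))]
    return kernel_tsne_mapping(x_new.reshape((1, -1)), k=best_k)[0]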
def generate_lion_power_performance(*, regenerate=False, recursive_regenerate=False,
                                    parameters=settings.parameters):
    start_time = datetime.datetime.now()
    logging.info("LION power experiment started: %s", start_time)

    accuracy_nn = parameters.get("accuracy_nn", settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn", settings.parameters["precision_nn"])

    lion_power_performance_data_file = generate_lion_power_performance_filename(parameters)
    lion_power_plot_data_file = generate_lion_power_plot_filename(parameters)

    lion_power_performance_data = dict()  # Start from scratch

    # Use the passed-in parameters consistently (was hard-coded to settings.parameters).
    X_mnist = generate_data.load_x_mnist(parameters=parameters, regenerate=recursive_regenerate,
                                         recursive_regenerate=recursive_regenerate)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters, regenerate=recursive_regenerate,
                                         recursive_regenerate=recursive_regenerate)
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters,
                                                   regenerate=recursive_regenerate,
                                                   recursive_regenerate=recursive_regenerate)

    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y) ** 2, axis=1)
        return np.argsort(y_distances)[:n]

    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(parameters=parameters)
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(parameters=parameters)

    distance_matrix = distance.squareform(distance.pdist(X_mnist))
    np.fill_diagonal(distance_matrix, np.inf)  # We are not interested in distance to itself
    nn_x_distance = np.min(distance_matrix, axis=1)  # Any axis will do
    radius_x = dict()
    for p in lion_percentile_options:
        radius_x[p] = np.percentile(nn_x_distance, p)
    logging.info("Radius X: %s", radius_x)

    if os.path.isfile(lion_power_performance_data_file) and not regenerate:
        with open(lion_power_performance_data_file, 'rb') as f:
            lion_power_performance_data = pickle.load(f)

    for perc in lion_percentile_options:
        for p in lion_power_options:
            logging.info("Processing power %f, percentile %d", p, perc)
            key = str(perc) + ";" + "%.3f" % (p)
            logging.info("Key: %s", key)
            if key not in lion_power_performance_data:
                lion_power_performance_data[key] = dict()

            if 'Accuracy' not in lion_power_performance_data[key]:
                logging.info("Accuracy not found for power %f percentile %d.\tCalculating...", p, perc)
                interpolator = dTSNE_mnist.generate_lion_tsne_embedder(
                    verbose=0, random_state=0,
                    function_kwargs={'radius_x_percentile': perc, 'power': p})
                per_sample_accuracy = np.zeros(len(picked_neighbors))
                per_sample_precision = np.zeros(len(picked_neighbors))
                for i in range(len(picked_neighbors)):
                    expected_label = picked_neighbor_labels[i]
                    result = interpolator(picked_neighbors[i], verbose=0)
                    nn_indices = get_nearest_neighbors_in_y(result, Y_mnist, n=accuracy_nn)
                    obtained_labels = labels_mnist[nn_indices]
                    per_sample_accuracy[i] = sum(obtained_labels == expected_label) / len(obtained_labels)

                    y = result
                    x = picked_neighbors[i, :]
                    nn_x_indices = get_nearest_neighbors_in_y(x, X_mnist, n=precision_nn)
                    nn_y_indices = get_nearest_neighbors_in_y(y, Y_mnist, n=precision_nn)
                    matching_indices = len([k for k in nn_x_indices if k in nn_y_indices])
                    per_sample_precision[i] = matching_indices / precision_nn
                cur_acc = np.mean(per_sample_accuracy)
                cur_prec = np.mean(per_sample_precision)
                lion_power_performance_data[key]['Accuracy'] = cur_acc
                lion_power_performance_data[key]['Precision'] = cur_prec
                with open(lion_power_performance_data_file, 'wb') as f:
                    pickle.dump(lion_power_performance_data, f)
            else:
                logging.info("Accuracy FOUND for power %f percentile %d. Using loaded.", p, perc)

            if 'PowerSquareDist' not in lion_power_performance_data[key] or regenerate:
                logging.info("Power performance not found for power %f percentile %d.\tCalculating...",
                             p, perc)
                y_sum_square_dist = 0.0
                y_sum_abs_dist = 0.0
                y_count = 0.0
                for i in range(len(X_mnist)):
                    distances = distance_matrix[i, :].copy()
                    distances[i] = np.inf  # Not interested in distance to itself
                    # Step 1. Find nearest neighbors within the radius neighborhood.
                    neighbor_indices = np.where(distances <= radius_x[perc])[0]
                    num_neighbors = len(neighbor_indices)
                    if num_neighbors >= 2:  # Below 2? Cannot interpolate
                        weights = 1 / distances[neighbor_indices] ** p
                        weights = weights / np.sum(weights)
                        cur_y_result = weights.dot(Y_mnist[neighbor_indices, :])
                        # Squared and absolute Euclidean distances to the true embedding.
                        square_dist = np.sum((cur_y_result - Y_mnist[i, :]) ** 2)
                        y_sum_square_dist += square_dist
                        y_sum_abs_dist += np.sqrt(square_dist)
                        y_count += 1.0
                new_dict = dict()
                new_dict['PowerSquareDist'] = y_sum_square_dist / y_count
                new_dict['PowerAbsDist'] = y_sum_abs_dist / y_count
                # Keep the raw sums and counts in case too few neighbors make the averages unstable.
                new_dict['PowerSquareDistSum'] = y_sum_square_dist
                new_dict['PowerSquareDistCount'] = y_count
                for ndk in new_dict.keys():
                    lion_power_performance_data[key][ndk] = new_dict[ndk]
                with open(lion_power_performance_data_file, 'wb') as f:
                    pickle.dump(lion_power_performance_data, f)
            else:
                logging.info("Power FOUND for power %f percentile %d. Using loaded.", p, perc)

            logging.info("%s %s", key, lion_power_performance_data[key])

    lion_optimal_power = dict()
    lion_power_plot_y = dict()
    for perc in lion_percentile_options:
        y = list()
        for cur_power in lion_power_options:
            key = str(perc) + ";%.3f" % (cur_power)
            y.append(lion_power_performance_data[key]['PowerSquareDist'])
        lion_power_plot_y[perc] = y
        lion_optimal_power[perc] = lion_power_options[np.argmin(y)]

    with open(lion_power_plot_data_file, 'wb') as f:
        pickle.dump((lion_power_options, lion_power_plot_y, lion_optimal_power), f)
    logging.info("LION optimal power: %s", lion_optimal_power)

    end_time = datetime.datetime.now()
    logging.info("LION power experiment ended: %s", end_time)
    logging.info("LION power experiment duration: %s", end_time - start_time)
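# Minimal sketch (assumption, not part of the pipeline): reading back the plot data saved above
# and reporting, for each radius percentile, the power with the smallest leave-one-out squared
# error. Field order matches the pickle.dump call; generate_lion_power_plot_filename is the same
# helper used above.
import pickle

def report_lion_optimal_powers(parameters):
    with open(generate_lion_power_plot_filename(parameters), 'rb') as f:
        lion_power_options, lion_power_plot_y, lion_optimal_power = pickle.load(f)
    for perc in sorted(lion_optimal_power.keys()):
        print("Percentile %s: optimal power %.3f, PowerSquareDist %.5f" %
              (perc, lion_optimal_power[perc], min(lion_power_plot_y[perc])))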
def main(regenerate, only_time, parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(parameters=parameters)

    # Doing it from scratch takes a REALLY long time. If possible, save results and pre-load.
    # These are consequences of parallelization:
    # input_files = ['gd_results' + str(100 * i) + '_' + str(100 * i + 100) + '.p' for i in range(10)]
    output_file = generate_cluster_results_filename(parameters)
    output_time_file = generate_time_results_filename(parameters)

    first_sample_inc = 0  # Change only if it is one of "other notebooks just for parallelization"
    last_sample_exclusive = len(picked_neighbors)

    if os.path.isfile(output_file) and not regenerate:
        logging.info("Found previous partially completed test. Starting from there.")
        with open(output_file, 'rb') as f:
            (picked_neighbors_y_gd_transformed,
             picked_neighbors_y_gd_variance_recalc_transformed,
             picked_neighbors_y_gd_transformed_random,
             picked_neighbors_y_gd_variance_recalc_transformed_random,
             picked_neighbors_y_gd_early_exagg_transformed_random,
             picked_neighbors_y_gd_early_exagg_transformed,
             picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
             picked_random_starting_positions,
             picked_neighbors_y_gd_variance_recalc_early_exagg_transformed,
             covered_samples) = pickle.load(f)
        with open(output_time_file, 'rb') as f:
            (picked_neighbors_y_time_gd_transformed,
             picked_neighbors_y_time_gd_variance_recalc_transformed,
             picked_neighbors_y_time_gd_transformed_random,
             picked_neighbors_y_time_gd_variance_recalc_transformed_random,
             picked_neighbors_y_time_gd_early_exagg_transformed_random,
             picked_neighbors_y_time_gd_early_exagg_transformed,
             picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random,
             picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed,
             covered_samples) = pickle.load(f)
    else:
        logging.info("No previous partially completed test, or regeneration requested. "
                     "Starting from scratch.")
        covered_samples = list()
        # Let's build all possible combinations. Later we'll decide what to plot.
        picked_neighbors_y_gd_transformed = np.zeros((len(picked_neighbors), Y_mnist.shape[1]))
        picked_neighbors_y_gd_variance_recalc_transformed = np.zeros((len(picked_neighbors), Y_mnist.shape[1]))
        picked_neighbors_y_gd_transformed_random = np.zeros((len(picked_neighbors), Y_mnist.shape[1]))
        picked_neighbors_y_gd_variance_recalc_transformed_random = np.zeros(
            (len(picked_neighbors), Y_mnist.shape[1]))
        picked_neighbors_y_gd_early_exagg_transformed_random = np.zeros(
            (len(picked_neighbors), Y_mnist.shape[1]))
        picked_neighbors_y_gd_early_exagg_transformed = np.zeros((len(picked_neighbors), Y_mnist.shape[1]))
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random = np.zeros(
            (len(picked_neighbors), Y_mnist.shape[1]))
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed = np.zeros(
            (len(picked_neighbors), Y_mnist.shape[1]))

        picked_neighbors_y_time_gd_transformed = np.zeros((len(picked_neighbors),))
        picked_neighbors_y_time_gd_variance_recalc_transformed = np.zeros((len(picked_neighbors),))
        picked_neighbors_y_time_gd_transformed_random = np.zeros((len(picked_neighbors),))
        picked_neighbors_y_time_gd_variance_recalc_transformed_random = np.zeros((len(picked_neighbors),))
        picked_neighbors_y_time_gd_early_exagg_transformed_random = np.zeros((len(picked_neighbors),))
        picked_neighbors_y_time_gd_early_exagg_transformed = np.zeros((len(picked_neighbors),))
        picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random = np.zeros(
            (len(picked_neighbors),))
        picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed = np.zeros(
            (len(picked_neighbors),))

        picked_random_starting_positions = np.zeros((len(picked_neighbors), Y_mnist.shape[1]))

    for i in range(first_sample_inc, last_sample_exclusive):
        # We reset the random seed every time, keyed on the ABSOLUTE sample number. Otherwise, if
        # you load partial results from file, everything depends on which parts were loaded: the
        # random sequence "shifts" accordingly and reproducibility is lost. I.e. with seed(0) set
        # once before the loop and a run from scratch, sample 0 gets some random sequence [abc]
        # and sample 1 gets its continuation [def]; but if sample 0 was already loaded from file,
        # sample 1 would get [abc], sample 2 would get [def], and so on. Reproducibility should
        # not depend on what was loaded, hence a per-sample seed.
        np.random.seed(i)
        logging.info(" ====================== Sample %d\n\n", i)
        if i in covered_samples and not regenerate:
            logging.info("Already loaded.")
        else:
            neighbor = picked_neighbors[i].reshape((1, -1))

            embedder_start_time = datetime.datetime.now()
            picked_neighbors_y_gd_transformed[i, :] = dTSNE_mnist.transform(
                neighbor, y='closest', verbose=2, optimizer_kwargs={'early_exaggeration': None})
            embedder_end_time = datetime.datetime.now()
            picked_neighbors_y_time_gd_transformed[i] = \
                (embedder_end_time - embedder_start_time).total_seconds()
            logging.info("Time: %f s", picked_neighbors_y_time_gd_transformed[i])

            embedder_start_time = datetime.datetime.now()
            picked_neighbors_y_gd_variance_recalc_transformed[i, :] = dTSNE_mnist.transform(
                neighbor, keep_sigmas=False, y='closest', verbose=2,
                optimizer_kwargs={'early_exaggeration': None})
            embedder_end_time = datetime.datetime.now()
            picked_neighbors_y_time_gd_variance_recalc_transformed[i] = \
                (embedder_end_time - embedder_start_time).total_seconds()
            logging.info("Time (VR): %f s", picked_neighbors_y_time_gd_variance_recalc_transformed[i])

            # Let's pick random starts at any point, not necessarily near the center.
            y_start = np.array([[np.random.uniform(np.min(Y_mnist[:, 0]), np.max(Y_mnist[:, 0])),
                                 np.random.uniform(np.min(Y_mnist[:, 1]), np.max(Y_mnist[:, 1]))]])
            picked_random_starting_positions[i, :] = y_start

            embedder_start_time = datetime.datetime.now()
            picked_neighbors_y_gd_transformed_random[i, :] = dTSNE_mnist.transform(
                neighbor, y=y_start, verbose=2, optimizer_kwargs={'early_exaggeration': None})
            embedder_end_time = datetime.datetime.now()
            picked_neighbors_y_time_gd_transformed_random[i] = \
                (embedder_end_time - embedder_start_time).total_seconds()
            logging.info("Time (random): %f s", picked_neighbors_y_time_gd_transformed_random[i])

            embedder_start_time = datetime.datetime.now()
            picked_neighbors_y_gd_variance_recalc_transformed_random[i, :] = dTSNE_mnist.transform(
                neighbor, keep_sigmas=False, y=y_start, verbose=2,
                optimizer_kwargs={'early_exaggeration': None})
            embedder_end_time = datetime.datetime.now()
            picked_neighbors_y_time_gd_variance_recalc_transformed_random[i] = \
                (embedder_end_time - embedder_start_time).total_seconds()
            logging.info("Time (VR,random): %f s",
                         picked_neighbors_y_time_gd_variance_recalc_transformed_random[i])

            embedder_start_time = datetime.datetime.now()
            picked_neighbors_y_gd_early_exagg_transformed_random[i, :] = dTSNE_mnist.transform(
                neighbor, y=y_start, verbose=2)
            embedder_end_time = datetime.datetime.now()
            picked_neighbors_y_time_gd_early_exagg_transformed_random[i] = \
                (embedder_end_time - embedder_start_time).total_seconds()
            logging.info("Time (EE,random): %f s",
                         picked_neighbors_y_time_gd_early_exagg_transformed_random[i])

            embedder_start_time = datetime.datetime.now()
            picked_neighbors_y_gd_early_exagg_transformed[i, :] = dTSNE_mnist.transform(
                neighbor, y='closest', verbose=2)
            embedder_end_time = datetime.datetime.now()
            picked_neighbors_y_time_gd_early_exagg_transformed[i] = \
                (embedder_end_time - embedder_start_time).total_seconds()
            logging.info("Time (EE): %f s", picked_neighbors_y_time_gd_early_exagg_transformed[i])

            embedder_start_time = datetime.datetime.now()
            picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random[i, :] = \
                dTSNE_mnist.transform(neighbor, y=y_start, keep_sigmas=False, verbose=2)
            embedder_end_time = datetime.datetime.now()
            picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random[i] = \
                (embedder_end_time - embedder_start_time).total_seconds()
            logging.info("Time (VR, EE, random): %f s",
                         picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random[i])

            embedder_start_time = datetime.datetime.now()
            picked_neighbors_y_gd_variance_recalc_early_exagg_transformed[i, :] = dTSNE_mnist.transform(
                neighbor, keep_sigmas=False, y='closest', verbose=2)
            embedder_end_time = datetime.datetime.now()
            picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed[i] = \
                (embedder_end_time - embedder_start_time).total_seconds()
            logging.info("Time (VR, EE): %f s",
                         picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed[i])

            covered_samples.append(i)

        # Re-saving even if it is a loaded sample.
        logging.info("Saving...")
        # Gradient descent results take a long while. Cache them after every sample.
        if not only_time:
            with open(output_file, 'wb') as f:
                pickle.dump((picked_neighbors_y_gd_transformed,
                             picked_neighbors_y_gd_variance_recalc_transformed,
                             picked_neighbors_y_gd_transformed_random,
                             picked_neighbors_y_gd_variance_recalc_transformed_random,
                             picked_neighbors_y_gd_early_exagg_transformed_random,
                             picked_neighbors_y_gd_early_exagg_transformed,
                             picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
                             picked_random_starting_positions,
                             picked_neighbors_y_gd_variance_recalc_early_exagg_transformed,
                             covered_samples), f)
        with open(output_time_file, 'wb') as f:
            pickle.dump((picked_neighbors_y_time_gd_transformed,
                         picked_neighbors_y_time_gd_variance_recalc_transformed,
                         picked_neighbors_y_time_gd_transformed_random,
                         picked_neighbors_y_time_gd_variance_recalc_transformed_random,
                         picked_neighbors_y_time_gd_early_exagg_transformed_random,
                         picked_neighbors_y_time_gd_early_exagg_transformed,
                         picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random,
                         picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed,
                         covered_samples), f)