def cluster_tsne(self, fVectors, n_components, perplexity):
    """Compute a t-SNE embedding of the given feature vectors.

    Parameters
    ----------
    fVectors: the data handed to ``fit_transform``.
    n_components: forwarded to the t-SNE constructor.
    perplexity: forwarded to the t-SNE constructor.

    Returns
    -------
    The embedding returned by ``fit_transform``.
    """
    # The cosine metric is used here; self.hierarchcal_distance was an
    # alternative metric that is currently disabled.
    reducer = t_sne(
        n_components=n_components,
        perplexity=perplexity,
        metric='cosine',
    )
    return reducer.fit_transform(fVectors)
def clusterBasedOnTime(self):
    """Embed the binned edit-time profile of every session with t-SNE and plot it.

    The code trees of each session are fetched from ``self.dataProxy``,
    converted to time deltas, binned into 100 bins (scaled by the largest time
    delta over all sessions) and projected to 2D with a euclidean t-SNE. Each
    point is colored through ``self.colorMapSessions``: sessions whose
    timestamp is listed in ``self.sessionDates`` get the color of their
    group, every other session gets the ``"other"`` color.

    Nothing is returned; the scatter plot is displayed with ``plt.show()``.
    """
    codeEditsPerSession = self.dataProxy.getCodeTreesPerSession()
    timeDeltasPerSession = self.convertSessionsToTimeDeltas(codeEditsPerSession)

    # Generate session labels: the i-th entry of sessionDates is associated
    # with the i-th key of colorMapSessions. The key list is built once
    # instead of once per date.
    labelNames = list(self.colorMapSessions.keys())
    dateLabelMap = {}
    for index, datesForSession in enumerate(self.sessionDates):
        for date in datesForSession:
            dateLabelMap[date] = labelNames[index]

    colorLabels = [
        self.colorMapSessions[dateLabelMap.get(session["_id"]["timestamp"], "other")]
        for session in codeEditsPerSession
    ]

    # Convert to time bins, using one common scale for all sessions
    maxTimeDelta = max(map(max, timeDeltasPerSession))
    binnedTimes = [
        self.convertTimeDeltaListToBins(deltas, maxTimeDelta, 100)
        for deltas in timeDeltasPerSession
    ]

    # NOTE(review): this presumes every binned list has the same length so
    # that the result is a regular 2D array - confirm against
    # convertTimeDeltaListToBins.
    binnedTimesPerSession = np.array(binnedTimes)

    tsne = t_sne(n_components=2, perplexity=8, metric='euclidean')
    embedding = tsne.fit_transform(binnedTimesPerSession)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(embedding[:, 0], embedding[:, 1], s=100, c=colorLabels)
    plt.show()
def learning_rates(folder, modification='', create=False,
                   learning_rates=np.arange(5, 1000, 5)):
    """
    Generate or load t-SNE transformations for a range of learning rates.

    All pickles are named ``<folder>/<stem><modification>.pkl``. With
    ``create=True`` the dataset ``X`` is read, one t-SNE is fitted per learning
    rate, and the results are written to the ``l_Z_tsne``, ``learning_rates``,
    ``l_times`` and ``l_kl_divergence`` pickles. With ``create=False`` those
    four pickles are read back instead. In both cases the ``X_2d`` pickle is
    read to compute the differences.

    Parameters
    -------------
    folder: The name of the folder the pickles should be put in / are in
    modification: the modification done to the dataset, a string used
                    in the name of the corresponding pickles.
    create: True if you want to create the transformations,
            False if you want to load them
    learning_rates: the learning rates you want to create
                    transformations of (ignored and replaced by the
                    pickled values when create is False)

    Output
    -------------
    l_Z: A list of the t-SNE transformations
    learning_rates: a vector with the corresponding values of learning rate
    l_times: a vector with the corresponding computation times in seconds
    l_kl_divergence: a vector with the corresponding values of kl divergence
    l_differences: the result of HL.get_differences(X_2d, l_Z)
    """
    def _path(stem):
        return folder + "/" + stem + modification + ".pkl"

    def _load(stem):
        # NOTE: unpickling can execute arbitrary code; only point this
        # function at folders whose content you trust.
        with open(_path(stem), "rb") as handle:
            return pickle.load(handle)

    def _dump(obj, stem):
        with open(_path(stem), "wb") as handle:
            pickle.dump(obj, handle)

    if create:
        l_Z = []
        l_times = np.zeros(len(learning_rates))
        l_kl_divergence = np.zeros(len(learning_rates))
        X = _load("X")
        for i, rate in enumerate(learning_rates):
            # Fixed random_state so runs differ only by the learning rate.
            tsne = t_sne(learning_rate=rate, random_state=123)
            start_time = time.time()
            l_Z.append(tsne.fit_transform(X))
            l_times[i] = time.time() - start_time
            l_kl_divergence[i] = tsne.kl_divergence_
        _dump(l_Z, "l_Z_tsne")
        _dump(learning_rates, "learning_rates")
        _dump(l_times, "l_times")
        _dump(l_kl_divergence, "l_kl_divergence")
    else:
        l_Z = _load("l_Z_tsne")
        learning_rates = _load("learning_rates")
        l_times = _load("l_times")
        l_kl_divergence = _load("l_kl_divergence")

    X_2d_tsne = _load("X_2d")
    l_differences = HL.get_differences(X_2d_tsne, l_Z)
    return l_Z, learning_rates, l_times, l_kl_divergence, l_differences
def early_exaggeration(folder, modification='', create=False,
                       early_exaggeration=np.arange(1, 80, 1)):
    """
    Generate or load t-SNE transformations for a range of early exaggeration
    values.

    All pickles are named ``<folder>/<stem><modification>.pkl``. With
    ``create=True`` the dataset ``X`` is read, one t-SNE is fitted per early
    exaggeration value, and the results are written to the ``e_Z_tsne``,
    ``early_exaggeration``, ``e_times`` and ``e_kl_divergence`` pickles. With
    ``create=False`` those four pickles are read back instead. In both cases
    the ``X_2d`` pickle is read to compute the differences.

    Parameters
    -------------
    folder: The name of the folder the pickles should be put in / are in
    modification: the modification done to the dataset, a string used
                    in the name of the corresponding pickles.
    create: True if you want to create the transformations,
            False if you want to load them
    early_exaggeration: the early exaggeration values you want to create
                    transformations of (ignored and replaced by the
                    pickled values when create is False)

    Output
    -------------
    e_Z: A list of the t-SNE transformations
    early_exaggeration: a vector with the corresponding values of
                    early_exaggeration
    e_times: a vector with the corresponding computation times in seconds
    e_kl_divergence: a vector with the corresponding values of kl divergence
    e_differences: the result of HL.get_differences(X_2d, e_Z)
    """
    def _path(stem):
        return folder + "/" + stem + modification + ".pkl"

    def _load(stem):
        # NOTE: unpickling can execute arbitrary code; only point this
        # function at folders whose content you trust.
        with open(_path(stem), "rb") as handle:
            return pickle.load(handle)

    def _dump(obj, stem):
        with open(_path(stem), "wb") as handle:
            pickle.dump(obj, handle)

    if create:
        e_Z = []
        e_times = np.zeros(len(early_exaggeration))
        e_kl_divergence = np.zeros(len(early_exaggeration))
        X = _load("X")
        for i, factor in enumerate(early_exaggeration):
            # Fixed random_state so runs differ only by the exaggeration.
            tsne = t_sne(early_exaggeration=factor, random_state=123)
            start_time = time.time()
            e_Z.append(tsne.fit_transform(X))
            e_times[i] = time.time() - start_time
            e_kl_divergence[i] = tsne.kl_divergence_
        _dump(e_Z, "e_Z_tsne")
        _dump(early_exaggeration, "early_exaggeration")
        _dump(e_times, "e_times")
        _dump(e_kl_divergence, "e_kl_divergence")
    else:
        e_Z = _load("e_Z_tsne")
        early_exaggeration = _load("early_exaggeration")
        e_times = _load("e_times")
        e_kl_divergence = _load("e_kl_divergence")

    X_2d_tsne = _load("X_2d")
    e_differences = HL.get_differences(X_2d_tsne, e_Z)
    return e_Z, early_exaggeration, e_times, e_kl_divergence, e_differences
def calculateSessionStats(self, eventsPerSession):
    """Plot event statistics of the sessions and a t-SNE of their event counts.

    One event histogram is computed per session with
    ``self.caclulateEventHistogramForSession``. Three figures are then shown:

    1. workspace changes against runs ("runClicked" + "simStart") with a
       fitted regression line,
    2. a 10 x 13 grid of bar charts with the per-session counts of the
       events in ``eventNames`` (at most the first 130 sessions),
    3. a 2D t-SNE embedding of those per-session count vectors.

    Parameters
    ----------
    eventsPerSession: iterable with one entry per session, each entry being
        whatever ``caclulateEventHistogramForSession`` accepts.

    Nothing is returned.
    """
    def count(hist, name):
        # Events that never occurred in a session are absent from its histogram.
        return hist[name] if name in hist else 0

    stats = list(map(self.caclulateEventHistogramForSession, eventsPerSession))

    # Per session: [number of workspace changes, number of runs of either kind]
    changeVsRuns = [
        [count(hist, "changedWorkspace"),
         count(hist, "runClicked") + count(hist, "simStart")]
        for hist in stats
    ]
    print(changeVsRuns)
    changeVsRuns = np.array(changeVsRuns)

    eventNames = ["blocklyBlockCreate", "blocklyBlockDelete", "blocklyBlockMove",
                  "blocklyChange", "changedWorkspace", "runClicked", "simStart"]
    barColors = ["red", "green", "blue", "yellow", "orange", "purple", "brown"]

    # calculate regression line
    # NOTE(review): q is used as intercept and m as slope, which presumes
    # polyfit returns coefficients in ascending order (as
    # numpy.polynomial.polynomial.polyfit does) - confirm against the import.
    q, m = polyfit(changeVsRuns[:, 0], changeVsRuns[:, 1], 1)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(changeVsRuns[:, 0], changeVsRuns[:, 1], '.')
    ax.plot(changeVsRuns[:, 0], q + m * changeVsRuns[:, 0], '-')
    ax.set_xlabel('#changedWorkspace')
    ax.set_ylabel('#runClicked')
    plt.show()

    n_rows, n_cols = 10, 13
    fig, ax = plt.subplots(n_rows, n_cols, sharey=True)
    freqtables = []
    for i in range(n_rows):
        for j in range(n_cols):
            index = i * n_cols + j
            if index >= len(stats):
                # Fewer sessions than grid cells: hide the unused panel
                # instead of failing with an IndexError.
                ax[i, j].set_visible(False)
                continue
            freq_table = [count(stats[index], name) for name in eventNames]
            freqtables.append(freq_table)
            ax[i, j].bar(range(len(freq_table)), freq_table, color=barColors)

    custom_lines = [Line2D([0], [0], color=color, lw=4) for color in barColors]
    fig.legend(custom_lines, eventNames)
    plt.show()

    freqtables = np.array(freqtables)

    tsne = t_sne(n_components=2, perplexity=10, metric='euclidean')
    embedding = tsne.fit_transform(freqtables)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(embedding[:, 0], embedding[:, 1], s=100)
    plt.show()
def threshold(folder, modification='', create=False,
              threshold=np.logspace(-14, -1, 50)):
    """
    Generate or load t-SNE transformations for a range of thresholds
    (passed to t-SNE as min_grad_norm).

    All pickles are named ``<folder>/<stem><modification>.pkl``. With
    ``create=True`` the dataset ``X`` is read, one t-SNE is fitted per
    threshold, and the results are written to the ``t_Z_tsne``, ``threshold``,
    ``t_times`` and ``t_kl_divergence`` pickles. With ``create=False`` those
    four pickles are read back instead. In both cases the ``X_2d`` pickle is
    read to compute the differences.

    Parameters
    -------------
    folder: The name of the folder the pickles should be put in / are in
    modification: the modification done to the dataset, a string used
                    in the name of the corresponding pickles.
    create: True if you want to create the transformations,
            False if you want to load them
    threshold: the thresholds you want to create transformations of
                    (ignored and replaced by the pickled values when
                    create is False)

    Output
    -------------
    t_Z: A list of the t-SNE transformations
    threshold: a vector with the corresponding values of threshold
    t_times: a vector with the corresponding computation times in seconds
    t_kl_divergence: a vector with the corresponding values of kl divergence
    t_differences: the result of HL.get_differences(X_2d, t_Z)
    """
    def _path(stem):
        return folder + "/" + stem + modification + ".pkl"

    def _load(stem):
        # NOTE: unpickling can execute arbitrary code; only point this
        # function at folders whose content you trust.
        with open(_path(stem), "rb") as handle:
            return pickle.load(handle)

    def _dump(obj, stem):
        with open(_path(stem), "wb") as handle:
            pickle.dump(obj, handle)

    if create:
        t_Z = []
        t_times = np.zeros(len(threshold))
        t_kl_divergence = np.zeros(len(threshold))
        X = _load("X")
        for i, min_grad_norm in enumerate(threshold):
            # Fixed random_state so runs differ only by the threshold.
            tsne = t_sne(min_grad_norm=min_grad_norm, random_state=123)
            start_time = time.time()
            t_Z.append(tsne.fit_transform(X))
            t_times[i] = time.time() - start_time
            t_kl_divergence[i] = tsne.kl_divergence_
        _dump(t_Z, "t_Z_tsne")
        _dump(threshold, "threshold")
        _dump(t_times, "t_times")
        _dump(t_kl_divergence, "t_kl_divergence")
    else:
        t_Z = _load("t_Z_tsne")
        threshold = _load("threshold")
        t_times = _load("t_times")
        t_kl_divergence = _load("t_kl_divergence")

    X_2d_tsne = _load("X_2d")
    t_differences = HL.get_differences(X_2d_tsne, t_Z)
    return t_Z, threshold, t_times, t_kl_divergence, t_differences
def cluster_tsne(self, affinity_matrix, color_labels, title="TSNE", n_components=2, perplexity=30):
    """Compute a t-SNE embedding from a precomputed pairwise matrix.

    Parameters
    ----------
    affinity_matrix: the matrix handed to ``fit_transform``; it is
        declared to t-SNE as ``metric="precomputed"``.
    color_labels: not used by this method.
    title: not used by this method.
    n_components: forwarded to the t-SNE constructor.
    perplexity: forwarded to the t-SNE constructor.

    Returns
    -------
    The embedding returned by ``fit_transform``.
    """
    # NOTE(review): if t_sne is scikit-learn's TSNE, a precomputed metric is
    # interpreted as pairwise distances - confirm the callers do not pass a
    # similarity matrix here.
    tsne_options = {
        "n_components": n_components,
        "metric": "precomputed",
        "perplexity": perplexity,
    }
    return t_sne(**tsne_options).fit_transform(affinity_matrix)
def main():
    """Cluster the correlation matrix of activation features and plot the results.

    Steps:
      1. load the data and build the Pearson correlation matrix of the
         firing feature (``firing_feature(spike_nums, 50)``),
      2. run HDBSCAN on that matrix, print the cluster statistics and save
         the matrix reordered by cluster label as a heatmap,
      3. run t-SNE on the matrix and save three scatter plots: uncolored,
         colored by the HDBSCAN labels of step 2, and colored by a second
         HDBSCAN run on the t-SNE coordinates.

    All figures are written as PDF into the hard-coded ``path_results``
    directory. Nothing is returned.
    """
    path_results = "D:/Robin/data_hne/data/p41/p41_19_04_30_a000/clawson_battaglia_paper"
    save_formats = ["pdf"]

    def _save_figure(figure, file_stem):
        # Write the figure once per requested format.
        for save_format in save_formats:
            figure.savefig(f'{path_results}/{file_stem}.{save_format}',
                           format=save_format,
                           facecolor=figure.get_facecolor())

    def _new_clusterer(metric):
        # Both HDBSCAN runs share every setting except the metric.
        return hdbscan.HDBSCAN(algorithm='best',
                               alpha=1.0,
                               approx_min_span_tree=True,
                               gen_min_span_tree=False,
                               leaf_size=40,
                               metric=metric,
                               min_cluster_size=3,
                               min_samples=None,
                               p=None)

    def _save_tsne_scatter(coords, point_labels, file_stem, colored):
        # Scatter plot of the 2D t-SNE coordinates, optionally colored by label.
        df_subset = pd.DataFrame()
        df_subset['tsne-2d-one'] = coords[:, 0]
        df_subset['tsne-2d-two'] = coords[:, 1]
        df_subset['color'] = point_labels
        plt.figure(figsize=(16, 10))
        plot_kwargs = dict(x="tsne-2d-one", y="tsne-2d-two", data=df_subset,
                           legend="full", alpha=1)
        if colored:
            # labels start at -1 (unclustered points), hence max() + 2 colors
            plot_kwargs.update(hue="color",
                               palette=sns.color_palette("hls", point_labels.max() + 2))
        scatter_axes = sns.scatterplot(**plot_kwargs)
        _save_figure(scatter_axes.get_figure(), file_stem)
        plt.close()

    spike_nums_dur = load_data()
    spike_nums = build_spike_nums_and_peak_nums(spike_nums_dur)[0]
    nb_activation = firing_feature(spike_nums, 50)
    corr_matrix = get_pearson_correlation_matrix(nb_activation)

    ## DO HDBSCAN CLUSTERING ON CORRELATION MATRIX ( ACTIVATION FEATURE) ##
    # NOTE(review): the correlation matrix is passed as a precomputed
    # metric; HDBSCAN presumably treats such input as distances - confirm
    # that feeding similarities is intended.
    clusterer = _new_clusterer('precomputed')
    clusterer.fit(corr_matrix)
    labels = clusterer.labels_
    print(f"N clusters hdbscan: {labels.max()+1}")
    print(f"labels: {labels}")
    print(f"With no clusters hdbscan: {len(np.where(labels == -1)[0])}")

    n_clusters = 0
    if labels.max() + 1 > 0:
        n_clusters = labels.max() + 1

    if n_clusters > 0:
        n_epoch_by_cluster = [
            len(np.where(labels == x)[0]) for x in np.arange(n_clusters)
        ]
        print(
            f"Number of epochs by clusters hdbscan: {' '.join(map(str, n_epoch_by_cluster))}"
        )

    # Reorder rows and columns so that members of a cluster are adjacent.
    labels_indices_sorted = np.argsort(labels)
    corr_matrix_order = corr_matrix[labels_indices_sorted, :][:, labels_indices_sorted]

    # Mean correlation inside every cluster (diagonal included). The member
    # positions come from the unsorted labels, so they must index the
    # unsorted matrix; every cluster, including the last one, is processed.
    mean_corr_values = np.zeros(n_clusters)
    for cluster_id in range(n_clusters):
        members = np.where(labels == cluster_id)[0]
        mean_corr_values[cluster_id] = np.mean(corr_matrix[np.ix_(members, members)])
    print(f" {mean_corr_values}")

    # Generate figure: correlation matrix ordered by cluster
    svm = sns.heatmap(corr_matrix_order)
    svm.set_yticklabels(labels_indices_sorted)
    svm.set_xticklabels(labels_indices_sorted)
    fig = svm.get_figure()
    _save_figure(fig, "test_hdbscan")
    # Release the heatmap figure before the scatter plots are created.
    plt.close(fig)

    ## DO T-SNE CLUSTERING ON CORRELATION MATRIX ##
    tsne = t_sne(n_components=2, verbose=1, perplexity=40, n_iter=300)
    tsne_results = tsne.fit_transform(corr_matrix)

    # first figure: plot t-sne without color
    _save_tsne_scatter(tsne_results, labels, "tsne_cluster", colored=False)

    # second figure: plot t-sne with color from previous hdbscan result
    _save_tsne_scatter(tsne_results, labels,
                       "tsne_colors_from_previous_hdbscan_clustering",
                       colored=True)

    # DO CLUSTERING ON T-SNE RESULTS TO COLOR THE T-SNE FIGURE ##
    clusterer = _new_clusterer('euclidean')
    clusterer.fit(tsne_results)
    labels_hdbscan_on_tsne = clusterer.labels_
    print(
        f"N clusters hdbscan on t-sne results: {labels_hdbscan_on_tsne.max()+1}"
    )
    _save_tsne_scatter(tsne_results, labels_hdbscan_on_tsne,
                       "tsne_colors_from_post_tsne_clustering",
                       colored=True)
def spotdist_function(ms, param): ################################################################################################################### # CHOOSE METHOD TO USE # method_homemade = True # The one to use if want to run on traces be careful with risk of memory error method_battaglia = False # Run faster for raster_dur / raster but EMD is not normalized # DECIDE ON WHICH DATA TO WORK data_to_use = "traces" possible_data_to_use = [ "raster_dur", "raster", "traces", "artificial_raster" ] if data_to_use not in possible_data_to_use: data_to_use = "raster_dur" raise Exception( "Can not run SpotDist on this data, by default use of raster_dur") # DECIDE EPOCH LENGTH ONLY FOR NON ARTIFICIAL DATA len_epoch = 250 # If you want to work on artificial data random_pattern_order = True known_pattern_order = False # This option is obsolete, do not use use_one_shuffle_per_pattern = True do_general_shuffling_on_full_raster = False fuse_raster_with_noise = True # SET SAVING PATH path_results = param.path_results time_str = param.time_str ################################################################################################################### ################################ # IF WORK ON ARTIFICIAL DATA # ################################ if data_to_use == "artificial_raster": # DEFINE RASTER # n_cells = 50 len_pattern = 100 if random_pattern_order: n_epochs = 100 # Put something that 4 can divide n_frames = len_pattern * n_epochs if known_pattern_order: n_epochs = 12 # Do not change, only 12 epochs are generated n_frames = len_pattern * n_epochs art_raster_dur = np.zeros((n_cells, n_frames), dtype="int8") art_raster_dur_noise = np.zeros((n_cells, n_frames), dtype="int8") rand_art_raster_dur = np.zeros((n_cells, n_frames), dtype="int8") art_raster_dur_pattern_shuffle = np.zeros((n_cells, n_frames), dtype="int8") noise_matrix = np.zeros((n_cells, n_frames), dtype="int8") rand_art_raster_dur_noise = np.zeros((n_cells, n_frames), dtype="int8") n_epochs = n_frames // 
len_pattern # to make things easy for now, the number of frames should be divisible by the length of epochs if (n_frames % len_pattern) != 0: raise Exception( "number of frames {n_frames} not divisible by {len_epoch}") ############################################ # CREATE PATTERNS ASSEMBLIES AND SEQUENCES # ############################################ # create pattern#1 = sequence in order pattern1 = np.zeros((n_cells, len_pattern)) for i in range(n_cells): pattern1[i, i] = 1 pattern1[i, i + 50] = 1 # create pattern#1 shuffle = sequence in a shuffle order pattern1_shuffle = np.copy(pattern1) np.random.shuffle(pattern1_shuffle) # create pattern#2 = assemblies in order pattern2 = np.zeros((n_cells, len_pattern)) pattern2[13:26, 2:4] = 1 pattern2[0:13, 14:16] = 1 pattern2[39:50, 26:28] = 1 pattern2[26:39, 38:40] = 1 pattern2[13:26, 50:52] = 1 pattern2[39:50, 62:64] = 1 pattern2[26:39, 74:76] = 1 pattern2[0:13, 86:88] = 1 # create pattern#2 shuffle = assemblies in shuffle order pattern2_shuffle = np.copy(pattern2) np.random.shuffle(pattern2_shuffle) # create pattern#3 = sequence together with noise pattern3 = np.zeros((n_cells, len_pattern)) n_cells_in_sequence = 40 noisy_cells = n_cells - n_cells_in_sequence for i in range(n_cells_in_sequence): pattern3[i, i:i + 2] = 1 pattern3[i, 20 + i:i + 22] = 1 pattern3[n_cells_in_sequence:n_cells, :] = generate_poisson_pattern( noisy_cells, len_pattern, 10, 50, 1, 2) # create pattern#3 shuffle pattern3_shuffle = np.copy(pattern3) np.random.shuffle(pattern3_shuffle) # create pattern#4 = assemblies together with noise pattern4 = np.zeros((n_cells, len_pattern)) cells_in_assemblies = 41 cells_with_noise = n_cells - cells_in_assemblies pattern4[11:22, 2:4] = 1 pattern4[0:11, 14:16] = 1 pattern4[36:41, 26:28] = 1 pattern4[22:36, 38:40] = 1 pattern4[11:22, 50:52] = 1 pattern4[36:41, 62:64] = 1 pattern4[22:36, 74:76] = 1 pattern4[0:11, 86:88] = 1 pattern4[41:50, :] = generate_poisson_pattern(cells_with_noise, len_pattern, 10, 50, 1, 2) 
# create pattern#2 shuffle = assemblies in shuffle order pattern4_shuffle = np.copy(pattern4) np.random.shuffle(pattern4_shuffle) ######################################### # USE PATTERNS ASSEMBLIES AND SEQUENCES # ######################################### if known_pattern_order: # CREATE ARTIFICIAL RASTER FROM KNOWN COMBINATION OF PATTERN art_raster_dur[:, 0:100] = pattern1 art_raster_dur[:, 100:200] = generate_poisson_pattern( n_cells, len_pattern, 20, 50, 1, 2) art_raster_dur[:, 200:300] = pattern2 art_raster_dur[:, 300:400] = pattern1 art_raster_dur[:, 400:500] = generate_poisson_pattern( n_cells, len_pattern, 10, 50, 1, 2) art_raster_dur[:, 500:600] = generate_poisson_pattern( n_cells, len_pattern, 10, 50, 1, 2) art_raster_dur[:, 600:700] = pattern1 art_raster_dur[:, 700:800] = pattern2 art_raster_dur[:, 800:900] = generate_poisson_pattern( n_cells, len_pattern, 10, 50, 1, 2) art_raster_dur[:, 900:1000] = generate_poisson_pattern( n_cells, len_pattern, 10, 50, 1, 2) art_raster_dur[:, 1000:1100] = pattern2 art_raster_dur[:, 1100:1200] = generate_poisson_pattern( n_cells, len_pattern, 10, 50, 1, 2) if random_pattern_order: # CREATE ARTIFICIAL RASTER COMBINATION OF THESE ASSEMBLIES SEQUENCES PLUS NOISE # Half of the epochs are noise pattern, the other half if equally divided in patterns n_patterns = 2 n_epochs_noise = n_epochs // 2 n_epochs_pattern = n_epochs - n_epochs_noise # n_epochs_pattern = int(n_epochs_pattern) n_epochs_pattern1 = n_epochs_pattern // n_patterns n_epochs_pattern2 = n_epochs_pattern // n_patterns pattern_id = np.zeros(n_epochs) pattern_id[0:n_epochs_noise] = 0 pattern_id[n_epochs_noise:(n_epochs_noise + n_epochs_pattern1)] = 1 pattern_id[(n_epochs_noise + n_epochs_pattern1):(n_epochs_noise + n_epochs_pattern1 + n_epochs_pattern2)] = 2 np.random.shuffle(pattern_id) for i in range(n_epochs): if pattern_id[i] == 0: art_raster_dur[:, np.arange((i * len_pattern), (i * len_pattern) + len_pattern )] = generate_poisson_pattern( n_cells, len_pattern, 
10, 50, 1, 2) art_raster_dur_pattern_shuffle[:, np.arange( (i * len_pattern), (i * len_pattern) + len_pattern )] = generate_poisson_pattern( n_cells, len_pattern, 10, 50, 1, 2) if pattern_id[i] == 1: art_raster_dur[:, np.arange((i * len_pattern), (i * len_pattern) + len_pattern)] = pattern3 art_raster_dur_pattern_shuffle[:, np.arange( (i * len_pattern), (i * len_pattern) + len_pattern )] = pattern3_shuffle if pattern_id[i] == 2: art_raster_dur[:, np.arange((i * len_pattern), (i * len_pattern) + len_pattern)] = pattern4 art_raster_dur_pattern_shuffle[:, np.arange( (i * len_pattern), (i * len_pattern) + len_pattern )] = pattern4_shuffle if use_one_shuffle_per_pattern is False: rand_art_raster_dur = np.copy(art_raster_dur) elif use_one_shuffle_per_pattern is True: rand_art_raster_dur = np.copy(art_raster_dur_pattern_shuffle) if do_general_shuffling_on_full_raster is True: np.random.shuffle(rand_art_raster_dur) rand_art_raster_dur.astype(int) # CREATE ARTIFICIAL RASTER COMBINATION OF NOISE ONLY for i in np.arange(n_epochs): tmp_patt_noise = np.zeros((n_cells, len_pattern)) patt_num_noise = np.random.randint(3) n_cells_to_clear = np.random.randint(np.round(n_cells / 5), n_cells) cell_to_clear_indices = np.random.randint(0, n_cells, size=n_cells_to_clear) # print(f"pattern number is {patt_num}") if patt_num_noise == 0: tmp_patt_noise = generate_poisson_pattern( n_cells, len_pattern, 10, 40, 1, 2) tmp_patt_noise[cell_to_clear_indices, :] = 0 if patt_num_noise == 1: tmp_patt_noise = generate_poisson_pattern( n_cells, len_pattern, 25, 40, 1, 2) tmp_patt_noise[cell_to_clear_indices, :] = 0 if patt_num_noise == 2: tmp_patt_noise = generate_poisson_pattern( n_cells, len_pattern, 30, 40, 1, 2) tmp_patt_noise[cell_to_clear_indices, :] = 0 noise_matrix[:, np.arange( (i * len_pattern), (i * len_pattern) + len_pattern)] = tmp_patt_noise if fuse_raster_with_noise is True: art_raster_dur_noise = np.copy(art_raster_dur) art_raster_dur_noise[np.where(noise_matrix == 1)] = 1 
rand_art_raster_dur_noise = np.copy(rand_art_raster_dur) rand_art_raster_dur_noise[np.where(noise_matrix == 1)] = 1 rand_art_raster_dur = rand_art_raster_dur_noise data = rand_art_raster_dur # PLOT ALL THESE RASTER # # noise-only pattern plot_spikes_raster( spike_nums=noise_matrix, param=None, file_name=f"poisson_noise_raster", # y_ticks_labels=np.arange(n_cells), # y_ticks_labels_size=2, save_raster=True, show_raster=False, without_activity_sum=True, path_results=path_results, save_formats=["pdf", "png"]) # Artificial raster dur ordered plot_spikes_raster( spike_nums=art_raster_dur, param=None, file_name=f"ordered_raster_with_patterns", # y_ticks_labels=np.arange(n_cells), # y_ticks_labels_size=2, save_raster=True, show_raster=False, without_activity_sum=True, path_results=path_results, save_formats=["pdf", "png"]) # Artificial raster dur with intra pattern shuffle of cell order plot_spikes_raster( spike_nums=art_raster_dur_pattern_shuffle, param=None, file_name=f"raster_with_one_shuffle_per_pattern", # y_ticks_labels=np.arange(n_cells), # y_ticks_labels_size=2, save_raster=True, show_raster=False, without_activity_sum=True, path_results=path_results, save_formats=["pdf", "png"]) # Add an additional shuflle on the order of all cell plot_spikes_raster( spike_nums=rand_art_raster_dur, param=None, file_name=f"raster_with_patterns_full_shuffle", # y_ticks_labels=np.arange(n_cells), # y_ticks_labels_size=2, save_raster=True, show_raster=False, without_activity_sum=True, path_results=path_results, save_formats=["pdf", "png"]) # Artificial raster dur ordered with random noise plot_spikes_raster( spike_nums=art_raster_dur_noise, param=None, file_name=f"ordered_raster_with_patterns_and_noise", # y_ticks_labels=np.arange(n_cells), # y_ticks_labels_size=2, save_raster=True, show_raster=False, without_activity_sum=True, path_results=path_results, save_formats=["pdf", "png"]) # Artificial raster dur shuffled with random noise plot_spikes_raster( 
spike_nums=rand_art_raster_dur_noise, param=None, file_name=f"raster_with_patterns_full_shuffle_and_noise", # y_ticks_labels=np.arange(n_cells), # y_ticks_labels_size=2, save_raster=True, show_raster=False, without_activity_sum=True, path_results=path_results, save_formats=["pdf", "png"]) ################################# # WORK ON REAL DATA: RASTER_DUR # ################################# if data_to_use == "raster_dur": print(f"Loading raster_dur") spike_nums_dur = load_data_rasterdur(ms) # automatic way # spike_nums_dur = spike_nums_dur[:20, :2500] # TO TEST THE CODE n_cells, n_frames = spike_nums_dur.shape print(f"spike_nums_dur has {n_cells} cells and {n_frames} frames") data = spike_nums_dur ############################# # WORK ON REAL DATA: RASTER # ############################# if data_to_use == "raster": print(f"Loading raster") spike_nums = load_data_raster(ms) # automatic way # spike_nums = spike_nums[:20, :2500] # TO TEST THE CODE n_cells, n_frames = spike_nums.shape print(f"spike_nums has {n_cells} cells and {n_frames} frames") data = spike_nums ############################# # WORK ON REAL DATA: TRACES # ############################# if data_to_use == "traces": print(f"Loading traces") traces = load_data_traces(ms) # automatic way traces = traces[:100, :10000] # TO TEST THE CODE n_cells, n_frames = traces.shape print(f"traces has {n_cells} cells and {n_frames} frames") data = traces ##################### # COMPUTE DISTANCES # ##################### n_epochs = n_frames // len_epoch # to make things easy for now, the number of frames should be divisible by the length of epochs if (n_frames % len_epoch) != 0: raise Exception( "number of frames {n_frames} not divisible by {len_epoch}") if method_battaglia: method = "battaglia" distances = SPOT_Dist_Battaglia(data, len_epoch=len_epoch)[0] if method_homemade: method = "homemade" distances = SPOT_Dist_JD_RD(data, len_epoch=len_epoch, distance_metric="EMD_Battaglia") # Plot Distance matrix # ax = 
sns.heatmap(distances, annot=True) ax = sns.heatmap(distances) fig = ax.get_figure() save_formats = ["pdf", "png"] if isinstance(save_formats, str): save_formats = [save_formats] for save_format in save_formats: fig.savefig( f'{path_results}/{ms.description}_distances_matrix_{method}_SPOTDist_on_{data_to_use}_with_{len_epoch}_frame_epochs' f'.{save_format}', format=f"{save_format}", facecolor=fig.get_facecolor()) plt.close() ################## ### CLUSTERING ### ################## # HDBSCAN is supposed to be be blind to Inf value, replace missing values by np.Inf for clustering distances[np.where(np.isnan(distances))] = np.Inf # DO HDBSCAN ON DISTANCES MATRIX - CONSIDER PRECOMPUTED DISTANCES clusterer = hdbscan.HDBSCAN(algorithm='best', alpha=1.0, approx_min_span_tree=True, gen_min_span_tree=False, leaf_size=40, metric='precomputed', min_cluster_size=2, min_samples=None, p=None) # metric='precomputed' euclidean clusterer.fit(distances) labels = clusterer.labels_ # print(f"labels.shape: {labels.shape}") print(f"N clusters hdbscan: {labels.max()+1}") print(f"labels: {labels}") print(f"With no clusters hdbscan: {len(np.where(labels == -1)[0])}") n_clusters = 0 if labels.max() + 1 > 0: n_clusters = labels.max() + 1 if n_clusters > 0: n_epoch_by_cluster = [[len(np.where(labels == x)[0])] for x in np.arange(n_clusters)] print( f"Number of epochs by clusters hdbscan: {' '.join(map(str, n_epoch_by_cluster))}" ) distances_order = np.copy(distances) labels_indices_sorted = np.argsort(labels) distances_order = distances_order[labels_indices_sorted, :] distances_order = distances_order[:, labels_indices_sorted] # Generate figure: dissimilarity matrice ordered by cluster # Replace Inf values by NaN for better visualization distances_order[np.where(np.isinf(distances_order))] = np.nan # svm = sns.heatmap(distances_order, annot=True) # if you want the value svm = sns.heatmap(distances_order) svm.set_yticklabels(labels_indices_sorted) svm.set_xticklabels(labels_indices_sorted) fig 
= svm.get_figure() # plt.show() save_formats = ["pdf", "png"] if isinstance(save_formats, str): save_formats = [save_formats] path_results = path_results for save_format in save_formats: fig.savefig( f'{path_results}/distances_matrix_hdbscan_ordered' f'.{save_format}', format=f"{save_format}", facecolor=fig.get_facecolor()) plt.close() coords = [] color = [] for i in range(n_epochs): coords.append([[i * len_epoch, i * len_epoch + len_epoch]]) color.append( cm.nipy_spectral( float(labels[i] + 2) / (len(np.unique(labels)) + 2))) if data_to_use == "artificial_raster": plot_spikes_raster( spike_nums=art_raster_dur_noise, param=None, file_name=f"raster_with_patterns_colored", # y_ticks_labels=np.arange(n_cells), # y_ticks_labels_size=2, save_raster=True, show_raster=False, without_activity_sum=True, span_area_coords=coords, span_area_colors=color, path_results=path_results, save_formats=["pdf", "png"]) if data_to_use == "raster_dur" or data_to_use == "raster": plot_spikes_raster( spike_nums=data, param=None, file_name=f"raster_with_patterns_colored", # y_ticks_labels=np.arange(n_cells), # y_ticks_labels_size=2, save_raster=True, show_raster=False, without_activity_sum=True, span_area_coords=coords, span_area_colors=color, path_results=path_results, save_formats=["pdf", "png"]) # IF NO NaN DO T-SNE CLUSTERING ON DISTANCES VALUES - EUCLIDEAN DISTANCES # todo: find a way to do t-SNE anyway missing_values = np.isnan(distances) inf_values = np.isinf(distances) if (not np.any(missing_values)) and (not np.any(inf_values)): do_tsne_clustering = True print(f" do tsne clustering is {do_tsne_clustering}") elif bool(np.any(missing_values)) or bool(np.any(inf_values)): do_tsne_clustering = False print(f" do tsne clustering is {do_tsne_clustering}") if do_tsne_clustering is True: tsne = t_sne(n_components=2, verbose=1, perplexity=40, n_iter=300) tsne_results = tsne.fit_transform(distances) # first figure: plot t-sne without color df_subset = pd.DataFrame() df_subset['tsne-2d-one'] = 
tsne_results[:, 0] df_subset['tsne-2d-two'] = tsne_results[:, 1] df_subset['color'] = labels plt.figure(figsize=(16, 10)) svm = sns.scatterplot(x="tsne-2d-one", y="tsne-2d-two", data=df_subset, legend="full", alpha=1) fig = svm.get_figure() path_results = path_results for save_format in save_formats: fig.savefig(f'{path_results}/tsne_cluster' f'.{save_format}', format=f"{save_format}", facecolor=fig.get_facecolor()) plt.close() # second figure: plot t-sne with color from previous hdbscan result df_subset = pd.DataFrame() df_subset['tsne-2d-one'] = tsne_results[:, 0] df_subset['tsne-2d-two'] = tsne_results[:, 1] df_subset['color'] = labels plt.figure(figsize=(16, 10)) svm = sns.scatterplot(x="tsne-2d-one", y="tsne-2d-two", hue="color", palette=sns.color_palette( "hls", len(np.unique(labels))), data=df_subset, legend="full", alpha=1) fig = svm.get_figure() path_results = path_results for save_format in save_formats: fig.savefig( f'{path_results}/tsne_colors_from_previous_hdbscan_clustering' f'.{save_format}', format=f"{save_format}", facecolor=fig.get_facecolor()) plt.close() # DO CLUSTERING ON T-SNE RESULTS TO COLOR THE T-SNE FIGURE ## clusterer = hdbscan.HDBSCAN(algorithm='best', alpha=1.0, approx_min_span_tree=True, gen_min_span_tree=False, leaf_size=40, metric='euclidean', min_cluster_size=3, min_samples=None, p=None) clusterer.fit(tsne_results) labels_hdbscan_on_tsne = clusterer.labels_ print( f"N clusters hdbscan on t-sne results: {labels_hdbscan_on_tsne.max()+1}" ) # print(f"labels: {labels_hdbscan_on_tsne}") df_subset = pd.DataFrame() df_subset['tsne-2d-one'] = tsne_results[:, 0] df_subset['tsne-2d-two'] = tsne_results[:, 1] df_subset['color'] = labels_hdbscan_on_tsne plt.figure(figsize=(16, 10)) svm = sns.scatterplot(x="tsne-2d-one", y="tsne-2d-two", hue="color", palette=sns.color_palette( "hls", len(np.unique(labels_hdbscan_on_tsne))), data=df_subset, legend="full", alpha=1) # plt.show() fig = svm.get_figure() path_results = path_results for save_format in 
save_formats: fig.savefig( f'{path_results}/tsne_colors_from_post_tsne_clustering' f'.{save_format}', format=f"{save_format}", facecolor=fig.get_facecolor()) plt.close() ############################################### ######## RETRIEVE GOOD ORDER OF RASTER ######## ############################################### # Get the number of "true clusters" epochs with label = -1 are not in cluster if labels.max() + 1 > 0: n_clusters = labels.max() + 1 if n_clusters > 0: n_epoch_by_cluster = [[len(np.where(labels == x)[0])] for x in np.arange(n_clusters)] # Keep all the epochs belongings to the same clusters kept_epochs = [] for i in range(n_clusters): kept_epochs.append(np.where(labels == i)) # print(f"Epochs kept to represent the clusters are {kept_epochs}") # Get 1 raster per cluster corresponding to the sum of all rasters from the epochs of the cluster patterns_raster = np.zeros((n_cells, len_epoch, n_clusters)) for i in range(n_clusters): raster_cluster_i = np.zeros((n_cells, len_epoch)) epochs_in_cluster_i = kept_epochs[i] epochs_in_cluster_i = epochs_in_cluster_i[0] n_epoch_in_cluster_i = n_epoch_by_cluster[i] n_epoch_in_cluster_i = n_epoch_in_cluster_i[0] # print(f" Epochs in cluster {i} are {epochs_in_cluster_i}") # print(f" Cluster {i} contain {n_epoch_in_cluster_i} epochs") for j in range(n_epoch_in_cluster_i): start_epoch = epochs_in_cluster_i[j] * len_epoch end_epoch = epochs_in_cluster_i[j] * len_epoch + len_epoch # print(f" Epoch {j} of cluster {i} starts at {start_epoch} ends at {end_epoch}") raster_cluster_i = raster_cluster_i + data[:, start_epoch:end_epoch] raster_cluster_i = np.true_divide(raster_cluster_i, n_epoch_in_cluster_i) patterns_raster[:, :, i] = raster_cluster_i for i in range(n_clusters): pattern_i = patterns_raster[:, :, i] # Plot this raster plot_spikes_raster( spike_nums=pattern_i, param=None, file_name=f"raster_pattern_{i}", # y_ticks_labels=np.arange(n_cells), # y_ticks_labels_size=2, save_raster=True, show_raster=False, 
without_activity_sum=True, plot_with_amplitude=True, path_results=path_results, save_formats=["pdf", "png"]) for i in range(n_clusters): pattern_i = patterns_raster[:, :, i] max_values_vector = np.amax(pattern_i, axis=1) order = np.argsort(-max_values_vector) sorted_pattern_i = np.copy(pattern_i) sorted_pattern_i = sorted_pattern_i[order, :] # Plot this raster plot_spikes_raster( spike_nums=sorted_pattern_i, param=None, file_name=f"raster_ordered_pattern_{i}", # y_ticks_labels=np.arange(n_cells), # y_ticks_labels_size=2, save_raster=True, show_raster=False, without_activity_sum=True, plot_with_amplitude=True, path_results=path_results, save_formats=["pdf", "png"])
def myTSNE(X, y):
    """Time a 4-component, PCA-initialised t-SNE fit of ``X``.

    Parameters
    ----------
    X : the data handed to ``fit_transform``.
    y : unused; kept so existing callers that pass labels keep working.

    Returns
    -------
    float
        Elapsed seconds spent building the estimator and running
        ``fit_transform``. The embedding itself is discarded.
    """
    # The original called a bare ``clock()`` (presumably ``time.clock``, which
    # was removed in Python 3.8). ``perf_counter`` is the documented clock for
    # measuring elapsed time; imported locally so the block is self-contained.
    from time import perf_counter

    started = perf_counter()
    clf = manifold.t_sne(n_components=4, init='pca', random_state=0)
    clf.fit_transform(X)
    return perf_counter() - started
def analyze(self, dataset_id, f_analyzer, s_analyzer, log_id="log",
            save_results=False, embedding_dims=2, func_method="hadamard",
            func_incremental=False, clustering_method="t-sne"):
    """Embed a dataset using a combined functional/structural distance matrix.

    The functional and structural distance matrices are each divided by
    their ``scipy.linalg.norm``, displayed, merged into one matrix and then
    embedded with t-SNE or UMAP on the precomputed distances. Three
    ``plt.show()`` calls are issued along the way (functional, structural,
    combined).

    Parameters
    ----------
    dataset_id : forwarded to both analyzers.
    f_analyzer : object whose ``analyze(...)`` returns the 5-tuple
        ``(embedding, code_trees, labels, steplabels, vectors)``.
    s_analyzer : object whose ``analyze(...)`` returns
        ``(embedding, distance_matrix)``.
    log_id : forwarded to both analyzers.
    save_results : when true, the embedding is stored through
        ``AnalysisUtils.save_experiment`` under ``self.experimentId``.
    embedding_dims : forwarded to both analyzers and used as
        ``n_components`` for t-SNE (it is not passed to the UMAP reducer).
    func_method, func_incremental : forwarded to ``f_analyzer.analyze`` as
        ``method`` and ``incremental``.
    clustering_method : ``"t-sne"`` or ``"umap"``; also forwarded to
        ``f_analyzer.analyze``. Any other value skips the embedding step.

    Raises
    ------
    ValueError
        If ``save_results`` is true but ``clustering_method`` is neither
        ``"t-sne"`` nor ``"umap"``, so there is no embedding to save.
    """
    # Functional side: the analyzer's own embedding is not used here, only
    # the metadata and the feature vectors the distances are built from.
    _, f_code_trees, f_labels, f_steplabels, f_vectors = f_analyzer.analyze(
        dataset_id,
        log_id=log_id,
        method=func_method,
        incremental=func_incremental,
        save_results=False,
        embedding_dims=embedding_dims,
        clustering_method=clustering_method)
    f_dmat = pairwise_distances(f_vectors, metric="cosine")
    f_dmat = np.divide(f_dmat, scipy.linalg.norm(f_dmat))

    plt.title("functional distance matrix")
    plt.imshow(f_dmat, cmap='gist_ncar', interpolation='none')
    plt.show()

    # Structural side: again only the distance matrix is kept.
    _, s_dmat = s_analyzer.analyze(dataset_id,
                                   log_id=log_id,
                                   save_results=False,
                                   embedding_dims=embedding_dims)
    s_dmat = np.divide(s_dmat, scipy.linalg.norm(s_dmat))

    plt.title("structural distance matrix")
    plt.imshow(s_dmat, cmap='gist_ncar', interpolation='none')
    plt.show()

    # Each matrix is weighted by the *other* matrix's share of the sum, so
    # the result is element-wise 2*f*s / (f + s). Where both distances are
    # zero the ratio is 0/0; nan_to_num turns that into a zero weight, so
    # the warnings for those expected cases are silenced.
    divisor = np.add(s_dmat, f_dmat)
    with np.errstate(divide="ignore", invalid="ignore"):
        f_weights = np.nan_to_num(np.divide(s_dmat, divisor))
        s_weights = np.nan_to_num(np.divide(f_dmat, divisor))
    dmat = np.add(np.multiply(f_dmat, f_weights),
                  np.multiply(s_dmat, s_weights))

    plt.title("combined dmat")
    plt.imshow(dmat, cmap='gist_ncar', interpolation='none')
    plt.show()

    if clustering_method == "t-sne":
        tsne = t_sne(n_components=embedding_dims, metric="precomputed",
                     perplexity=10)
        embedding = tsne.fit_transform(dmat)
    elif clustering_method == "umap":
        reducer = umap.UMAP(metric="precomputed", min_dist=0.99,
                            n_neighbors=100)
        embedding = reducer.fit_transform(dmat)
    else:
        # Previously left unbound, which surfaced as an UnboundLocalError
        # only once saving was attempted.
        embedding = None

    if save_results:
        if embedding is None:
            raise ValueError(
                f"cannot save results: unsupported clustering_method "
                f"{clustering_method!r} (expected 't-sne' or 'umap')")
        utils = AnalysisUtils()
        utils.save_experiment(self.experimentId, embedding, f_code_trees,
                              f_labels, f_steplabels)
    print("done")
def perplexity(folder=None, modification='', per=np.arange(2, 150, 2),
               create=False, pkl=True, X=None, X_2d_tsne=None):
    """
    Generate or load t-SNE transformations for a range of perplexities.

    Parameters
    -------------
    folder: the name of the folder the pickles should be put in / are in.
        Required whenever anything has to be read from or written to disk.
    modification: the modification done to the dataset, a string used in the
        name of the corresponding pickles.
    per: the perplexities you want to create transformations of
    create: true if you want to create the transformations, false if you
        want to load them
    pkl: when creating, true if the results should be pickled into `folder`
        once all perplexities have been processed, otherwise false
    X: the data you want to transform; loaded from `folder` when None and
        `create` is true
    X_2d_tsne: the original 2D version of X; loaded from `folder` when None

    Output
    -------------
    p_Z: a list of the t-SNE transformations
    per: a vector with the corresponding values of perplexity
    p_times: a vector with the corresponding values of computational time
    p_kl_divergence: a vector with the corresponding values of kl divergence
    p_differences: the result of HL.get_differences(X_2d_tsne, p_Z)
        (per the original documentation: the difference in 2d distance)
    """

    def _pickle_path(stem):
        # Same naming scheme as before: <folder>/<stem><modification>.pkl
        return folder + "/" + stem + modification + ".pkl"

    def _load(stem):
        # NOTE: unpickling can execute arbitrary code; only point `folder`
        # at files this project produced itself.
        with open(_pickle_path(stem), "rb") as handle:
            return pickle.load(handle)

    def _dump(obj, stem):
        # The context manager guarantees the file is flushed and closed.
        with open(_pickle_path(stem), "wb") as handle:
            pickle.dump(obj, handle)

    if create:
        p_Z = []
        p_times = np.zeros(len(per))
        p_kl_divergence = np.zeros(len(per))
        if X is None:
            X = _load("X")
        for i, p in enumerate(per):
            tsne = t_sne(perplexity=p, random_state=123)
            # perf_counter is monotonic, unlike time.time, so the measured
            # duration cannot be skewed by system clock adjustments.
            start_time = time.perf_counter()
            p_Z.append(tsne.fit_transform(X))
            p_times[i] = time.perf_counter() - start_time
            p_kl_divergence[i] = tsne.kl_divergence_
        if pkl:
            _dump(p_Z, "p_Z_tsne")
            _dump(per, "per")
            _dump(p_times, "p_times")
            _dump(p_kl_divergence, "p_kl_divergence")
    else:
        p_Z = _load("p_Z_tsne")
        per = _load("per")
        p_times = _load("p_times")
        p_kl_divergence = _load("p_kl_divergence")

    if X_2d_tsne is None:
        X_2d_tsne = _load("X_2d")
    p_differences = HL.get_differences(X_2d_tsne, p_Z)
    return p_Z, per, p_times, p_kl_divergence, p_differences