import os
import random

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import SpectralBiclustering


def biclustering(dist, genes_1, genes_2, x_label, y_label, out_file,
                 experiment, id_convertor, n_clusters=3, percent_visualize=0.1):
    model = SpectralBiclustering(n_clusters=n_clusters, n_components=12,
                                 n_best=6, init='random', random_state=1)
    m, n = dist.shape
    assert m == len(genes_1) and n == len(genes_2)
    model.fit(dist)

    # Select rows (note: random.choices samples with replacement)
    rows = [(idx, clust_id) for idx, clust_id in enumerate(model.row_labels_)]
    selected_rows = random.choices(rows, k=int(percent_visualize * len(rows)))
    selected_rows_names = [genes_1[idx] for idx, _ in selected_rows]
    selected_rows_clust_ids = [clust_id for _, clust_id in selected_rows]
    selected_rows_indices = [idx for idx, _ in selected_rows]

    # Select columns
    cols = [(idx, clust_id) for idx, clust_id in enumerate(model.column_labels_)]
    selected_cols = random.choices(cols, k=int(percent_visualize * len(cols)))
    selected_cols_names = [genes_2[idx] for idx, _ in selected_cols]
    selected_cols_clust_ids = [clust_id for _, clust_id in selected_cols]
    selected_cols_indices = [idx for idx, _ in selected_cols]

    # Restrict the distance matrix to the sampled rows and columns
    selected_dist = dist[selected_rows_indices][:, selected_cols_indices]

    # Sort rows by cluster label
    sorted_rows_indices = np.argsort(selected_rows_clust_ids)
    selected_dist = selected_dist[sorted_rows_indices, :]
    selected_rows_names = [selected_rows_names[i] for i in sorted_rows_indices]

    # Sort columns by cluster label
    sorted_cols_indices = np.argsort(selected_cols_clust_ids)
    selected_dist = selected_dist[:, sorted_cols_indices]
    selected_cols_names = [selected_cols_names[i] for i in sorted_cols_indices]

    # Use the row names sorted above, so labels stay aligned with the data
    result = pd.DataFrame(selected_dist, columns=selected_cols_names,
                          index=selected_rows_names)
    ax = sns.heatmap(result, cmap="Greens_r", square=True)
    plt.title("Biclustering Results")
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(left=False, bottom=False)
    ax.set_ylabel('{} genes'.format(x_label))
    ax.set_xlabel('{} genes'.format(y_label))
    figure = ax.get_figure()
    figure.savefig(out_file)
    plt.close()

    # Write the row genes and column genes of each bicluster to a CSV
    for bic in range(n_clusters * n_clusters):
        r = list(model.rows_[bic])
        rows = [i for (i, b) in zip(genes_1, r) if b]
        c = list(model.columns_[bic])
        columns = [i for (i, b) in zip(genes_2, c) if b]
        rows = id_convertor.ints2ids([int(k) for k in rows])
        columns = id_convertor.ints2ids([int(k) for k in columns])
        cluster_path = os.path.join(
            experiment, f'{bic}_{x_label}_{y_label}_biclustering.csv')
        with open(cluster_path, 'w') as fout:
            fout.write(','.join(rows))
            fout.write("\n")
            fout.write(','.join(columns))
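A hedged invocation sketch for the function above. The random distance matrix, the string gene lists, the `results_dir` path, and the `IdConvertor` stub are hypothetical stand-ins, not part of the original project; any object exposing an `ints2ids` method would do.

import os
import numpy as np

class IdConvertor:
    # hypothetical stand-in for the project's real id_convertor
    def ints2ids(self, ints):
        return [f"GENE_{i}" for i in ints]

dist = np.random.rand(100, 80)             # synthetic gene-gene distances
genes_1 = [str(i) for i in range(100)]     # row gene ids (as strings)
genes_2 = [str(i) for i in range(80)]      # column gene ids (as strings)
os.makedirs("results_dir", exist_ok=True)  # the function writes CSVs here
biclustering(dist, genes_1, genes_2, "human", "mouse",
             "heatmap.png", "results_dir", IdConvertor())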
def get_bicluster(self, data):
    # Biclustering: as many clusters per axis as there are columns
    model = SpectralBiclustering(n_clusters=data.shape[1], random_state=0)
    print(data.sum(axis=0))
    print(data.sum(axis=1))
    model.fit(data.fillna(0))
    # reorder rows, then columns, by their cluster labels
    fit_data = data.iloc[np.argsort(model.row_labels_)]
    fit_data = fit_data.iloc[:, np.argsort(model.column_labels_)]
    return fit_data
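Since `self` is never used in the method body, a quick hedged check can call it as a free function. The random DataFrame below is an illustrative stand-in; eight columns are used so the estimator's default `n_components=6` SVD fits comfortably.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.random((20, 8)), columns=list("abcdefgh"))
# self is unused, so None stands in for it here
reordered = get_bicluster(None, df)
print(reordered.head())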
from sklearn.cluster import SpectralBiclustering


def spectral_biclust(E, ngenes=3, nconditions=1, spectral_method="bistochastic",
                     n=6, n_best_ratio=0.5, **kwargs):
    """
    Note:
    - SpectralBiclustering was moved from sklearn.cluster.bicluster
      to sklearn.cluster
    """
    n_best = max(int(n * n_best_ratio), 1)
    spectral = SpectralBiclustering(n_clusters=(nconditions, ngenes),
                                    method=spectral_method,
                                    n_components=n, n_best=n_best)
    # 'standardize' is a preprocessing helper defined elsewhere in the project
    spectral.fit(standardize(E))

    # 'Bicluster' is the project's own container for one bicluster
    bics = []
    for columns, rows in zip(spectral.columns_, spectral.rows_):
        genes = E.columns[columns]
        conditions = E.index[rows]
        bics.append(Bicluster(genes, conditions))
    return bics
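`standardize` and `Bicluster` come from elsewhere in that project; below is a hedged sketch with plausible stand-ins, purely for illustration. Min-max scaling is used in place of z-scoring so the entries stay nonnegative and the bistochastic normalization's square roots of row/column sums stay real.

import numpy as np
import pandas as pd
from collections import namedtuple

# Hypothetical stand-ins; the real project defines its own versions.
Bicluster = namedtuple("Bicluster", ["genes", "conditions"])

def standardize(E):
    # scale each column into [0, 1]; keeps the matrix nonnegative
    return (E - E.min(axis=0)) / (E.max(axis=0) - E.min(axis=0))

rng = np.random.default_rng(0)
E = pd.DataFrame(rng.random((30, 12)),
                 index=[f"cond{i}" for i in range(30)],
                 columns=[f"gene{j}" for j in range(12)])
for bic in spectral_biclust(E, ngenes=3, nconditions=1):
    print(len(bic.genes), "genes x", len(bic.conditions), "conditions")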
import numpy as np
import pytest
from numpy.testing import assert_array_equal
from scipy.sparse import csr_matrix, issparse
from sklearn.cluster import SpectralBiclustering
from sklearn.datasets import make_checkerboard
from sklearn.metrics import consensus_score


def test_spectral_biclustering():
    # Test Kluger methods on a checkerboard dataset.
    S, rows, cols = make_checkerboard((30, 30), 3, noise=0.5, random_state=0)
    non_default_params = {'method': ['scale', 'log'],
                          'svd_method': ['arpack'],
                          'n_svd_vecs': [20],
                          'mini_batch': [True]}

    for mat in (S, csr_matrix(S)):
        for param_name, param_values in non_default_params.items():
            for param_value in param_values:
                model = SpectralBiclustering(
                    n_clusters=3,
                    n_init=3,
                    init='k-means++',
                    random_state=0,
                )
                model.set_params(**{param_name: param_value})

                if issparse(mat) and model.get_params().get('method') == 'log':
                    # cannot take log of sparse matrix
                    with pytest.raises(ValueError):
                        model.fit(mat)
                    continue
                else:
                    model.fit(mat)
                    assert model.rows_.shape == (9, 30)
                    assert model.columns_.shape == (9, 30)
                    assert_array_equal(model.rows_.sum(axis=0),
                                       np.repeat(3, 30))
                    assert_array_equal(model.columns_.sum(axis=0),
                                       np.repeat(3, 30))
                    assert consensus_score(model.biclusters_,
                                           (rows, cols)) == 1

                # helper defined alongside this test in the same module
                _test_shape_indices(model)
from sklearn.cluster import SpectralBiclustering
from sklearn.datasets import make_checkerboard
from sklearn.metrics import consensus_score


def test_perfect_checkerboard():
    # XXX Previously failed on build bot (not reproducible)
    model = SpectralBiclustering(3, svd_method="arpack", random_state=0)

    S, rows, cols = make_checkerboard((30, 30), 3, noise=0, random_state=0)
    model.fit(S)
    assert consensus_score(model.biclusters_, (rows, cols)) == 1

    S, rows, cols = make_checkerboard((40, 30), 3, noise=0, random_state=0)
    model.fit(S)
    assert consensus_score(model.biclusters_, (rows, cols)) == 1

    S, rows, cols = make_checkerboard((30, 40), 3, noise=0, random_state=0)
    model.fit(S)
    assert consensus_score(model.biclusters_, (rows, cols)) == 1
from unittest import SkipTest  # raising this makes pytest skip the test

from sklearn.cluster import SpectralBiclustering
from sklearn.datasets import make_checkerboard
from sklearn.metrics import consensus_score


def test_perfect_checkerboard():
    # XXX test always skipped
    raise SkipTest("This test is failing on the buildbot, but cannot"
                   " reproduce. Temporarily disabling it until it can be"
                   " reproduced and fixed.")

    model = SpectralBiclustering(3, svd_method="arpack", random_state=0)

    S, rows, cols = make_checkerboard((30, 30), 3, noise=0, random_state=0)
    model.fit(S)
    assert consensus_score(model.biclusters_, (rows, cols)) == 1

    S, rows, cols = make_checkerboard((40, 30), 3, noise=0, random_state=0)
    model.fit(S)
    assert consensus_score(model.biclusters_, (rows, cols)) == 1

    S, rows, cols = make_checkerboard((30, 40), 3, noise=0, random_state=0)
    model.fit(S)
    assert consensus_score(model.biclusters_, (rows, cols)) == 1
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import SpectralBiclustering
from sklearn.metrics import consensus_score

# Assumed from earlier in the script: 'data', 'rows', 'columns' generated by
# sklearn.datasets.make_checkerboard, and 'n_clusters' the checkerboard shape.
plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Original dataset")

# shuffle clusters
rng = np.random.RandomState(0)
row_idx = rng.permutation(data.shape[0])
col_idx = rng.permutation(data.shape[1])
data = data[row_idx][:, col_idx]

plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Shuffled dataset")

model = SpectralBiclustering(n_clusters=n_clusters, method='log',
                             random_state=0)
model.fit(data)
score = consensus_score(model.biclusters_,
                        (rows[:, row_idx], columns[:, col_idx]))
print("consensus score: {:.1f}".format(score))

fit_data = data[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclusters")

plt.matshow(np.outer(np.sort(model.row_labels_) + 1,
                     np.sort(model.column_labels_) + 1),
            cmap=plt.cm.Blues)
plt.title("Checkerboard structure of rearranged data")
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import SpectralBiclustering
from sklearn.metrics import consensus_score

# As above, 'data', 'rows', 'columns', and 'n_clusters' are assumed to come
# from an earlier make_checkerboard step.
plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Original dataset")

# Shuffle clusters
rng = np.random.RandomState(0)
row_idx = rng.permutation(data.shape[0])
col_idx = rng.permutation(data.shape[1])
data = data[row_idx][:, col_idx]

plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Shuffled dataset")

model = SpectralBiclustering(n_clusters=n_clusters, method='log',
                             random_state=0)
model.fit(data)
score = consensus_score(model.biclusters_,
                        (rows[:, row_idx], columns[:, col_idx]))
print("consensus score: {:.1f}".format(score))

fit_data = data[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclusters")

plt.matshow(np.outer(np.sort(model.row_labels_) + 1,
                     np.sort(model.column_labels_) + 1),
            cmap=plt.cm.Blues)
import numpy as np
import pytest
from sklearn.cluster import SpectralBiclustering


def test_wrong_shape():
    model = SpectralBiclustering()
    data = np.arange(27).reshape((3, 3, 3))
    with pytest.raises(ValueError):
        model.fit(data)
import numpy as np
import pytest
from sklearn.cluster import SpectralBiclustering


def test_errors(args):
    data = np.arange(25).reshape((5, 5))

    model = SpectralBiclustering(**args)
    with pytest.raises(ValueError):
        model.fit(data)
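The `args` argument implies this test is driven by a `@pytest.mark.parametrize` list that is not shown. A hedged sketch of what such a parametrization could look like; the two invalid settings below are illustrative guesses, not the original list.

import numpy as np
import pytest
from sklearn.cluster import SpectralBiclustering

@pytest.mark.parametrize("args", [
    {"method": "unknown"},      # not one of 'bistochastic', 'scale', 'log'
    {"svd_method": "unknown"},  # not 'randomized' or 'arpack'
])
def test_errors(args):
    data = np.arange(25).reshape((5, 5))
    model = SpectralBiclustering(**args)
    with pytest.raises(ValueError):
        model.fit(data)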
# Calculate biclusters via spectral biclustering
# NOTE THAT YOUR DATA NEEDS TO BE NORMALIZED
from sklearn.cluster import SpectralCoclustering, SpectralBiclustering

nClusters = 3
# clus = SpectralCoclustering(n_clusters=nClusters)
clus = SpectralBiclustering(n_clusters=nClusters)
clus.fit(data)
clabel = clus.column_labels_.astype(float)
rlabel = clus.row_labels_.astype(float)
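The normalization the comment demands is not shown in the snippet itself. A minimal sketch of one plausible choice, min-max scaling, which keeps entries nonnegative for the default bistochastic normalization; the random `data` is a stand-in for the real matrix.

import numpy as np
from sklearn.preprocessing import MinMaxScaler

rng = np.random.default_rng(0)
data = rng.normal(size=(60, 40))           # stand-in for the real matrix
data = MinMaxScaler().fit_transform(data)  # rescale each column into [0, 1]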
import numpy as np
import pytest
from sklearn.cluster import SpectralBiclustering


def test_spectralbiclustering_parameter_validation(params, type_err, err_msg):
    """Check parameter validation in `SpectralBiclustering`."""
    data = np.arange(25).reshape((5, 5))

    model = SpectralBiclustering(**params)
    with pytest.raises(type_err, match=err_msg):
        model.fit(data)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import SpectralBiclustering

# 'lognormalize' (log & center, as in SpectralBiclustering(method='log'))
# is assumed to be defined elsewhere in this module.


class SpectBiclustUser():
    """hold data, model, and visualization methods for SpectralBiclustering
    on twitter user-term matrix

    parameters:
        matrix: user-term matrix
        cluster_args: optional arguments for SpectralBiclustering model, as dict
        vectorizer: sklearn.feature_extraction.text vectorizer
        df: DataFrame w/ 'text', 'handle', and 'description' fields
        handles: list/array of document identifiers
    """

    def __init__(self, matrix, cluster_args, vectorizer, df, handles):
        self.model = SpectralBiclustering(**cluster_args)
        self.matrix = matrix
        self.vectorizer = vectorizer
        self.df = df
        self.handles = handles

    def fit(self):
        self.model.fit(self.matrix)

    # works for bi- or co-clustering!
    # but hardly shows up for sparse data
    def draw_matrix(self, filename, lognorm=False):
        """Draw heatmap over data matrix sorted by cluster.

        params:
            lognorm: log & center data first as in
                SpectralBiclustering(method='log')"""
        if lognorm:
            data = lognormalize(self.matrix)
        else:
            data = self.matrix
        # sort
        data = data[np.argsort(self.model.row_labels_)]
        data = data[:, np.argsort(self.model.column_labels_)]
        try:
            plt.matshow(data, cmap=plt.cm.Blues)
        except ValueError:
            plt.matshow(data.todense(), cmap=plt.cm.Blues)
        plt.savefig(filename, dpi=600)

    # best to call w/ axis from plt.subplots(tight_layout=True)
    def draw_image_matrix(self, filename, lognorm=False, percentile=False):
        """Draw heatmap over a reduced matrix where rows are row clusters and
        columns column clusters. Cells are shaded based on average values
        within that cluster x cluster block.

        params:
            lognorm: log & center data first as in
                SpectralBiclustering(method='log')
            percentile: instead of average, color based on some percentile
                of block values
        """
        image, counts = self.get_image_matrix(lognorm, percentile)
        # image = image.transpose()
        _, ax = plt.subplots(tight_layout=True)
        ax.matshow(image, cmap=plt.cm.Blues)

        # set tick labels
        yticks = np.array(range(image.shape[0]))
        ylabels = list(
            map(lambda x: "\n".join(x),
                self.get_handles_by_cluster(3, descriptions=True)))
        try:  # artist-style
            ax.set_yticks(yticks)
            ax.set_yticklabels(ylabels, size=3)
        except AttributeError:  # scripting-style
            ax.yticks(yticks, labels=[])

        xticks = np.array(range(image.shape[1]))
        if self.vectorizer is not None:  # only label terms given a vectorizer
            xlabels = list(
                map(lambda x: "\n".join(x), self.get_terms_by_cluster(3)))
        else:
            xlabels = None
        try:  # scripting-style
            ax.xticks(ticks=xticks, labels=xlabels, size=4)
        except AttributeError:  # artist-style
            ax.set_xticks(xticks)
            ax.set_xticklabels(xlabels, rotation=90, size=4)

        # annotate w/ counts
        for i in range(counts.shape[0]):
            for j in range(counts.shape[1]):
                # don't transpose 'counts' b/c plt.matshow orients axes funny
                ax.annotate(counts[i, j], (j, i), size=3, ha='center')
        plt.savefig(filename, dpi=500)

    def get_image_matrix(self, lognorm=False, percentile=False):
        if lognorm:
            data = lognormalize(self.matrix)
        else:
            data = self.matrix
        clusters = [
            pd.unique(self.model.row_labels_),
            pd.unique(self.model.column_labels_)
        ]
        dim = list(map(lambda x: len(x), clusters))
        image = np.zeros(shape=dim)
        counts = np.full(shape=dim, fill_value='', dtype=object)
        for i in clusters[0]:
            for j in clusters[1]:
                submat = self.get_bicluster_submatrix(i, j)
                if percentile is False:
                    image[i, j] = np.mean(submat)
                else:
                    image[i, j] = np.percentile(submat, percentile)
                counts[i, j] = f"{submat.shape[0]}x{submat.shape[1]}"
        return image, counts

    def get_bicluster_submatrix(self, i, j):
        rows = np.where(np.equal(self.model.row_labels_, i))[0]
        columns = np.where(np.equal(self.model.column_labels_, j))[0]
        return self.matrix[rows][:, columns]

    def print_by_cluster(self, n_terms, words=True):
        if words:
            top_terms = self.get_terms_by_cluster(n_terms)
        else:
            top_terms = self.get_handles_by_cluster(n_terms)
        for i in range(len(top_terms)):
            print(i)
            # loop thru handles
            for h in top_terms[i]:
                # print in full
                desc = self.get_handle_description(h, 1000)
                desc = "; ".join(desc.split("\n"))
                print(f" {h} -- {desc}")

    def get_terms_by_cluster(self, n_terms=5):
        """Get list of top terms for each term/column cluster"""
        # get cluster indices
        col_clusters = pd.unique(self.model.column_labels_)
        col_clusters = np.sort(col_clusters)
        # get term frequencies
        freq = np.sum(self.matrix, axis=0)
        # grr matrices
        if len(freq.shape) > 1:
            freq = np.array(freq)
            freq = freq[0]
        # get int->string vocabulary
        # (get_feature_names() was removed in scikit-learn 1.2;
        #  newer versions call it get_feature_names_out())
        words = self.vectorizer.get_feature_names()
        top_terms = []
        for c in col_clusters:
            # get term indices
            term_inds = np.where(np.equal(self.model.column_labels_, c))[0]
            # get frequencies
            term_freqs = freq[term_inds]
            # get top frequencies
            top_inds = term_inds[np.argsort(term_freqs)[-1 * n_terms:]]
            top_cluster_terms = [words[i] for i in top_inds]
            top_cluster_terms.reverse()
            top_terms.append(top_cluster_terms)
        return top_terms

    def get_handles_by_cluster(self, n, descriptions=False):
        """Get list of top terms for each user/row cluster.
        'descriptions=True' prints descriptions in lieu of handles."""
        # get cluster indices
        row_clusters = pd.unique(self.model.row_labels_)
        row_clusters = np.sort(row_clusters)
        # get usage frequencies
        freq = np.sum(self.matrix, axis=1)
        # grr matrices
        if len(freq.shape) > 1:
            freq = np.array(freq)
            freq = freq[:, 0]
        # get int->string vocabulary (here it's self.handles)
        top_handles = []
        for c in row_clusters:
            # get term indices
            handle_inds = np.where(np.equal(self.model.row_labels_, c))[0]
            # get frequencies
            handle_freqs = freq[handle_inds]
            # get top frequencies
            top_inds = handle_inds[np.argsort(handle_freqs)[-1 * n:]]
            if descriptions:
                top_cluster_handles = [
                    self.get_handle_description(self.handles[i])
                    for i in top_inds
                ]
            else:
                top_cluster_handles = [self.handles[i] for i in top_inds]
            top_cluster_handles.reverse()
            top_handles.append(top_cluster_handles)
        return top_handles

    def get_handle_description(self, handle, nchar=24):
        description = self.df.query('handle == @handle').iloc[0, :].description
        if pd.isna(description):
            description = str(description)
        else:
            description = description[:nchar]
        return description

    # 'bags' should be list of lemmatized tweets, not aggregated by handle
    def print_tweets_by_block(self, row, col, n, word_bags):
        """Print a sample of tweets corresponding to an intersection of row
        and column clusters -- i.e. a block in the image matrix.

        params:
            row, col: cluster indices
            n: # tweets to print
            word_bags: list of lemmatized tweets (not aggregated by handle)
        """
        if word_bags is None:
            # TODO could get lemmas from dataframe...
            pass
        # get row/col indices from clusterer
        rows = np.where(np.equal(self.model.row_labels_, row))[0]
        cols = np.where(np.equal(self.model.column_labels_, col))[0]
        # resolve row indices to users via self.handles
        users = [self.handles[i] for i in rows]
        # get df row indices from users
        df_user_inds = np.where(np.isin(self.df.handle.values, users))[0]
        # resolve col indices to lemmas from vectorizer
        vocabulary = self.vectorizer.get_feature_names()
        lemmas = np.array([vocabulary[i] for i in cols], dtype=object)
        # sanity check
        print(f"example terms: {list(lemmas[:10])}\n")
        # get df row indices by finding lemmas in list of word bags
        df_inds = []
        for i in df_user_inds:
            b = np.array(word_bags[i], dtype=object)
            if len(np.intersect1d(b, lemmas)) > 0:
                df_inds.append(i)
        # print sample of 'n' tweets
        df_inds = np.random.choice(df_inds, size=min(n, len(df_inds)),
                                   replace=False)
        for i in df_inds:
            handle = self.df.iloc[i, :].handle
            descr = self.df.iloc[i, :].description
            print(f"@{handle} - {descr}")
            print(" " + self.df.iloc[i, :].text + "\n")
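A hedged end-to-end sketch of how this class might be driven. The toy vocabulary, texts, handles, and DataFrame are illustrative stand-ins, and the sketch assumes an older scikit-learn where `CountVectorizer.get_feature_names()` still exists (newer versions renamed it to `get_feature_names_out()`).

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

vocab = ["spectral", "rock", "blocks", "music",
         "tonight", "structure", "methods", "finds"]
rng = np.random.default_rng(0)
texts = [" ".join(rng.choice(vocab, size=8)) for _ in range(30)]
handles = [f"user{i}" for i in range(len(texts))]
df = pd.DataFrame({"handle": handles, "text": texts,
                   "description": ["demo account"] * len(texts)})

vec = CountVectorizer()
matrix = vec.fit_transform(texts).toarray()  # dense user-term matrix

sb = SpectBiclustUser(matrix, {"n_clusters": 2, "random_state": 0},
                      vec, df, handles)
sb.fit()
print(sb.get_terms_by_cluster(n_terms=3))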
import numpy as np
import simplejson
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.cluster import SpectralBiclustering

# Assumed defined earlier in the script: 'band' (per-channel signal arrays),
# 'channels', 'start', 'end', 'length_of_intervals', and 'pearson_data'
# (a list with one empty list per interval).
interval = 0
while end <= len(band[0]):
    print(interval, end=' ')
    for i in channels:
        for j in range(63):
            if i <= j:
                pearson_data[interval].append(
                    pearsonr(band[i][start:end], band[j][start:end])[0])
    start = end
    end += length_of_intervals
    interval += 1

p = np.array(pearson_data)

spectral_model = SpectralBiclustering()
spectral_model.fit(p)
fit_data = p[np.argsort(spectral_model.row_labels_)]
fit_data = fit_data[:, np.argsort(spectral_model.column_labels_)]

plt.matshow(p, cmap=plt.cm.Blues)
plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.matshow(np.outer(np.sort(spectral_model.row_labels_) + 1,
                     np.sort(spectral_model.column_labels_) + 1),
            cmap=plt.cm.Blues)

with open('media/pearson_30sec_bandpassMedian_clipped_2016.json', 'w+') as f:
    pearson_data_r = np.array(pearson_data)
    p = [[float(column) for column in row] for row in pearson_data_r]
    f.write(simplejson.dumps({'name': 's5d2nap', 'data': p}))

plt.show()