def random_distribution(n):
    """Cluster a random ``n`` x ``n`` matrix and display it as a dendro-heatmap.

    The matrix is Gaussian noise (``scale=n``) whose upper-left quadrant is
    shifted by 75 and whose lower-right quadrant is replaced by Poisson
    samples (``lam=n``).  Rows and columns are clustered independently and
    the matrix is permuted into leaf order before plotting.

    :param n: side length of the square matrix.
    """
    # Integer division: float slice bounds raise TypeError on Python 3.
    half = n // 2

    # make up some data
    data = np.random.normal(scale=n, size=(n, n))
    data[0:half, 0:half] += 75
    data[half:, half:] = np.random.poisson(lam=n, size=data[half:, half:].shape)

    # cluster the rows; linkage() takes the *condensed* distance vector, a
    # square matrix would be treated as raw observations
    row_Z = sch.linkage(ssd.pdist(data))
    row_idxing = sch.leaves_list(row_Z)

    # cluster the columns
    col_Z = sch.linkage(ssd.pdist(data.T))
    col_idxing = sch.leaves_list(col_Z)

    # labels are permuted the same way as the data they describe
    row_labels = ['bar{}'.format(i) for i in row_idxing]
    col_labels = ['foo{}'.format(i) for i in col_idxing]

    # make the dendrogram
    data = data[:, col_idxing][row_idxing, :]
    heatmap = pdh.DendroHeatMap(heat_map_data=data,
                                left_dendrogram=row_Z,
                                top_dendrogram=col_Z,
                                heatmap_colors=("#ffeda0", "#feb24c", "#f03b20"),
                                window_size="auto",
                                color_legend_displayed=False,
                                label_color="#777777")
    heatmap.row_labels = row_labels
    heatmap.col_labels = col_labels
    heatmap.title = 'An example heatmap'
    heatmap.show()  # heatmap.save("example.png")
def get_clustdist_path(self, feature_ids=None, labeling_name=None,
                       class_ids=None, vmin=-3.0, vmax=3.0, root_dir='.'):
    """Render the clustered feature-matrix heatmap and return its file path.

    The feature matrix returned by ``self.get_dataset`` is reordered along
    both axes by the leaf order of ``self.clust_object`` / ``self.clust_feat``
    and handed to ``heatmap.heatmap_labeled_fig``.

    :param feature_ids: forwarded to ``self.get_dataset``.
    :param labeling_name: labeling to use; falsy values select ``'one_class'``.
    :param class_ids: forwarded to ``self.get_dataset``.
    :param vmin: lower colour limit forwarded to the plotting helper.
    :param vmax: upper colour limit forwarded to the plotting helper.
    :param root_dir: directory under which ``self.HEATMAP_D`` is created.
    :returns: path of the written ``fm_clustered.png`` file.
    """
    if not labeling_name:
        labeling_name = 'one_class'

    (fm, sample_names, feature_names, target, target_names) = \
        self.get_dataset(feature_ids, labeling_name, class_ids)

    d = os.path.join(root_dir, self.HEATMAP_D)
    if not os.path.exists(d):
        os.makedirs(d)

    img_format = 'png'
    file_path = os.path.join(d, 'fm_clustered.%s' % (img_format))

    # reorder feature matrix rows (objects)
    object_indices = hierarchy.leaves_list(self.clust_object(fm))
    fm = fm[object_indices, :]

    # reorder standardized feature matrix columns (feats)
    feat_indices = hierarchy.leaves_list(self.clust_feat(fm))
    fm = fm[:, feat_indices]

    # labels of the selected labeling, reordered like the objects
    lablists = [[target[i] for i in object_indices]]
    class_names = [target_names]

    # reorder the feature and object ids; each object id must be looked up
    # by index (the previous code repeated the whole sample_names list)
    fs = [feature_names[i] for i in feat_indices]
    gs = [sample_names[i] for i in object_indices]

    heatmap.heatmap_labeled_fig(fm, fs, gs, lablists, class_names,
                                file_path, vmin=vmin, vmax=vmax)
    return file_path
def cluster(df, metric="euclidean", method="single", row=True, column=True):
    """Reorder a data frame by hierarchical clustering of rows and/or columns.

    :param df: data frame to reorder.
    :param metric: distance metric handed to ``pdist``.
    :param method: linkage method handed to ``linkage``.
    :param row: cluster and reorder the rows when true.
    :param column: cluster and reorder the columns when true.
    :returns: ``(reordered_df, row_linkage, column_linkage)``; a linkage is
        ``None`` for an axis that was not clustered.
    """
    row_linkmat = None
    col_linkmat = None

    if row:
        row_linkmat = hier.linkage(dist.pdist(df, metric), method)
        df = df.iloc[hier.leaves_list(row_linkmat), :]

    if column:
        # columns are clustered as the rows of the transposed frame
        transposed = df.T
        col_linkmat = hier.linkage(dist.pdist(transposed, metric), method)
        df = transposed.iloc[hier.leaves_list(col_linkmat), :].T

    return df, row_linkmat, col_linkmat
def reorder(C):
    """Reorder a consensus matrix by average-linkage clustering.

    Similarities are turned into distances (``1 - C``), clustered, and the
    matrix is permuted on both axes into reversed leaf order.

    :param C: square, symmetric consensus (similarity) matrix.
    :returns: ``C`` with rows and columns permuted.
    """
    # local import keeps this function independent of the file's import block
    from scipy.spatial.distance import squareform

    print('reorder...')
    # linkage() expects the condensed distance vector; a square matrix would
    # be interpreted as a set of observation vectors instead.
    # checks=False tolerates round-off on the diagonal.
    Y = squareform(1 - C, checks=False)
    Z = linkage(Y, method='average')
    ivl = leaves_list(Z)[::-1]
    return C[:, ivl][ivl, :]
def check_leaves_list_iris(self, method):
    """Check ``leaves_list(Z)`` against the tree pre-order on the Iris data.

    :param method: linkage method name handed to ``linkage``.
    """
    X = eo['iris']
    # The linkage is built from the raw observations; the pairwise distance
    # vector that used to be computed here was never used.
    Z = linkage(X, method)
    node = to_tree(Z)
    assert_equal(node.pre_order(), leaves_list(Z))
def make_cdt_file(basename, data, clusters=None, sep_col=True):
    """Write ``data`` as a tab-separated CDT table.

    Seven annotation columns (GID, FBgn, NAME, CHROMOSOME, ARM, POSITION,
    GWEIGHT) are inserted in front of the data columns.  Rows found in
    ``fbgn_map`` get their chromosome, arm and position parsed from it.

    :param basename: output path handed to ``DataFrame.to_csv``.
    :param data: data frame indexed by gene; it is copied, not modified.
    :param clusters: optional linkage matrix; when given, rows are written
        in its leaf order.
    :param sep_col: add an empty ``<prefix>_sep`` column per column prefix
        (text before ``"_sl"``) and sort the columns.
    """
    data = data.copy()
    if sep_col:
        prefixes = set(col[: col.find("_sl")] for col in data.columns)
        for prefix in prefixes:
            # all-NaN spacer column, written as an empty field
            data[prefix + "_sep"] = float("nan")
        data = data.sort_index(axis=1)

    data.insert(0, "GID", "NONE")
    data.insert(1, "FBgn", data.index)
    data.insert(2, "NAME", data.index)
    data.insert(3, "CHROMOSOME", "NONE")
    data.insert(4, "ARM", "L")
    data.insert(5, "POSITION", 0)
    data.insert(6, "GWEIGHT", 1.0)

    # .ix was removed from pandas; these are label-based assignments (.loc)
    for i, row in enumerate(data.index):
        data.loc[row, "GID"] = "GENE{}X".format(i)
        data.loc[row, "FBgn"] = fbgn_lookup.get(row, "???")
        if row in fbgn_map:
            pos = fbgn_map[row].split("..")[0]
            chrom, pos = pos.split(":")
            arm = "R" if chrom.endswith("R") else "L"
            if chrom[-1] in "RL":
                chrom = chrom[:-1]
            data.loc[row, "CHROMOSOME"] = chrom
            data.loc[row, "ARM"] = arm
            data.loc[row, "POSITION"] = int(pos)

    if clusters is not None:
        # leaves_list yields row positions, hence positional indexing
        data = data.iloc[hierarchy.leaves_list(clusters)]
    data.to_csv(basename, sep="\t", index=False, float_format="%.5f")
def rearrange(X, optimal = True, method = "average"):
    """Return a leaf ordering for the square distance matrix ``X``.

    :param X: square distance matrix.
    :param optimal: refine the ordering with ``barjoseph.optimal`` when true;
        otherwise the plain dendrogram leaf order is returned.
    :param method: linkage method.
    :returns: a permutation of ``range(len(X))``.
    """
    condensed = squareform(X, force="tovector")

    # plain-Python copy of the linkage rows, merge heights clamped at zero
    Z = []
    for (l, r, d, n) in linkage(condensed, method=method, metric=None):
        Z.append((int(l), int(r), max(0., d), int(n)))

    leaves = list(leaves_list(Z))
    N = len(leaves)
    assert len(X) == N

    # bar-joseph optimal ordering
    if optimal:
        import barjoseph

        # node ids below N are leaves; id N + k is the k-th merge of Z
        root = len(Z) + N - 1
        callbacks = {
            "S": lambda i, j: exp(-X[i][j]),
            "left": lambda i: None if i < N else Z[i-N][0],
            "right": lambda i: None if i < N else Z[i-N][1],
            "is_leaf": lambda i: i < N,
            "is_empty": lambda v: v is None,
        }
        leaves = barjoseph.optimal(root, **callbacks)

    assert list(sorted(leaves)) == list(range(N))
    return leaves
def make_cdt_file(basename, data, clusters=None, sep_col = True):
    """Write ``data`` as a tab-separated CDT table.

    Seven annotation columns (GID, FBgn, NAME, CHROMOSOME, ARM, POSITION,
    GWEIGHT) are inserted in front of the data columns.  Rows found in
    ``fbgn_map`` get their chromosome, arm and position parsed from it.

    :param basename: output path handed to ``DataFrame.to_csv``.
    :param data: data frame indexed by gene; it is copied, not modified.
    :param clusters: optional linkage matrix; when given, rows are written
        in its leaf order.
    :param sep_col: add an empty ``<prefix>_sep`` column per column prefix
        (text before ``'_sl'``) and sort the columns.
    """
    data = data.copy()
    if sep_col:
        prefixes = set(col[:col.find('_sl')] for col in data.columns)
        for prefix in prefixes:
            # all-NaN spacer column, written as an empty field
            data[prefix+"_sep"] = float('nan')
        data = data.sort_index(axis=1)

    data.insert(0, 'GID', 'NONE')
    data.insert(1, 'FBgn', data.index)
    data.insert(2, 'NAME', data.index)
    data.insert(3, 'CHROMOSOME', 'NONE')
    data.insert(4, 'ARM', 'L')
    data.insert(5, 'POSITION', 0)
    data.insert(6, 'GWEIGHT', 1.0)

    # .ix was removed from pandas; these are label-based assignments (.loc)
    for i, row in enumerate(data.index):
        data.loc[row, 'GID'] = 'GENE{}X'.format(i)
        data.loc[row, 'FBgn'] = fbgn_lookup.get(row, '???')
        if row in fbgn_map:
            pos = fbgn_map[row].split('..')[0]
            chrom, pos = pos.split(':')
            arm = 'R' if chrom.endswith('R') else 'L'
            if chrom[-1] in 'RL':
                chrom = chrom[:-1]
            data.loc[row, 'CHROMOSOME'] = chrom
            data.loc[row, 'ARM'] = arm
            data.loc[row, 'POSITION'] = int(pos)

    if clusters is not None:
        # leaves_list yields row positions, hence positional indexing
        data = data.iloc[hierarchy.leaves_list(clusters)]
    data.to_csv(basename, sep='\t', index=False, float_format='%.5f')
def get_factor_reorder(self, c, rotate='oblimin'):
    """Return an ordering of the ``c`` factors based on their correlations.

    The ``'Phi'`` attribute of the stored solution is converted to a distance
    (``1 - phi``, rounded to 3 decimals) and clustered; the reversed leaf
    order is returned.  Without ``'Phi'`` the identity order is returned.

    :param c: number of factors (key into the stored results).
    :param rotate: rotation name used to look up the results.
    :returns: list of factor indices.
    """
    solution = self.results['factor_tree_Rout_%s' % rotate][c]
    phi = get_attr(solution, 'Phi')
    if phi is not None:
        distances = squareform(np.round(1-phi,3))
        leaf_order = list(leaves_list(linkage(distances)))
        # reversing because it works better for task EFA
        leaf_order.reverse()
        return leaf_order
    return list(range(c))
def to_dict(self, correlation_matrix, linkage_matrix):
    """Describe every merge of a linkage matrix as a dictionary.

    :param correlation_matrix: not used by this method.
    :param linkage_matrix: SciPy linkage matrix.
    :returns: dict keyed by 1-based merge number; each value holds
        ``'datasets'`` (sorted 1-based leaf numbers under the merge) and
        ``'height'`` (third column of the corresponding linkage row).
    """
    from scipy.cluster import hierarchy

    tree = hierarchy.to_tree(linkage_matrix, rd=False)
    n_merges = len(linkage_matrix)

    d = {}
    # http://w3facility.org/question/scipy-dendrogram-to-json-for-d3-js-tree-visualisation/
    # https://gist.github.com/mdml/7537455
    # Explicit stack instead of recursion, so that very deep (chain-like)
    # trees cannot hit the interpreter's recursion limit.
    stack = [tree]
    while stack:
        node = stack.pop()
        if node.is_leaf():
            continue
        # leaves occupy ids 0..n_merges; merge k has id n_merges + 1 + k
        cluster_id = node.get_id() - n_merges - 1
        d[cluster_id+1] = {
            'datasets': [i+1 for i in sorted(node.pre_order())],
            'height': linkage_matrix[cluster_id][2],
        }
        # push right first so the left subtree is visited first
        if node.right is not None:
            stack.append(node.right)
        if node.left is not None:
            stack.append(node.left)
    return d
def plot_correlations(booklist):
    """Plot a clustered heatmap of the dot products between book vectors.

    Each book is turned into a numeric vector with ``gematriaze(words(book))``;
    vectors are truncated to the shortest one, their Gram matrix is computed
    and its rows/columns are put into hierarchical-clustering leaf order.

    :param booklist: books to plot; when empty, every unique value of
        ``tanach['book']`` is used.
    """
    from mpl_toolkits.axes_grid1 import make_axes_locatable

    fig, ax = plt.subplots(figsize=(20,20))
    books = booklist if len(booklist)>0 else np.unique(np.array(tanach['book']))

    mesh = [gematriaze(words(b)) for b in books]
    # min() over an iterable also works for a single book; min(*[x]) did not
    minsize = min(len(vec) for vec in mesh)
    meshnum = np.array([vec[0:minsize] for vec in mesh])
    plot_matr = np.dot(meshnum, meshnum.T)

    if len(mesh) > 1:
        leaves = sch.leaves_list(sch.linkage(plot_matr))
    else:
        # nothing to cluster with a single book
        leaves = np.arange(len(mesh))
    plot_matr = plot_matr[leaves][:,leaves]

    ax.set_yticks(np.arange(len(books))+0.5)
    ax.set_yticklabels(np.array(books)[leaves], fontsize=20)
    ax.set_xticks(np.arange(len(books))+0.5)
    ax.set_xticklabels(np.array(books)[leaves], rotation='vertical',fontsize=20)
    pc = ax.pcolormesh(plot_matr)

    # colour bar in a thin axes to the right of the heatmap
    div = make_axes_locatable(ax)
    cax = div.append_axes("right", size="2%", pad=0.05)
    plt.colorbar(pc, cax=cax)
    fig.tight_layout()
def classify_by_scores(M, threshold, loci, return_file_names=None):
    """Split loci into clusters and singletons by average-linkage clustering.

    ``M`` is converted with ``ssd.squareform`` and clustered.  Walking the
    dendrogram leaves left to right, each leaf is lifted to its highest
    ancestor whose parent's ``dist`` is still >= ``threshold``; the leaves
    returned by ``get_leaves`` for that ancestor form one group.

    :param M: pairwise matrix handed to ``ssd.squareform``
        (presumably a square distance matrix - confirm against callers).
    :param threshold: parent distance at which the upward walk stops.
    :param loci: sequence indexed by leaf id; only ``file_name`` is read.
    :param return_file_names: when truthy, return base file names instead
        of leaf ids.
    :returns: ``(singles, clusters)``: ids (or names) of one-member groups
        and the multi-member groups sorted by size, largest first.
    """
    M_array = ssd.squareform(M)
    Z = linkage(M_array, method='average')
    root = to_tree(Z)
    # assumes clone_graph returns nodes exposing .parent/.id/.count/.dist
    # (those attributes are read below) - TODO confirm against the helper
    root = clone_graph(root)
    nodes = get_nodes(root)
    id2node = {node.id: node for node in nodes}
    leaf_ids = leaves_list(Z)
    cnt = 0
    i = 0
    total_count = 1
    pool = []
    while True:
        cur_node = id2node[leaf_ids[i]]
        parent_dist = cur_node.parent.dist
        # climb while the next merge is still below the threshold
        # NOTE(review): there is no visible guard for reaching the root;
        # confirm that cur_node.parent cannot be None here.
        while parent_dist < threshold:
            cur_node = cur_node.parent
            parent_dist = cur_node.parent.dist
        cur_leaf_ids = get_leaves(cur_node)
        pool.append([id for id in cur_leaf_ids])
        total_count += cur_node.count
        # skip past the leaves that were just grouped
        i += len(cur_leaf_ids)
        # NOTE(review): with `>= len(leaf_ids)-1` the loop also stops when
        # exactly one leaf is left, so that leaf is never added to pool -
        # looks like an off-by-one, confirm the intended behaviour.
        if i >= len(leaf_ids)-1:
            break
        cnt += 1
    clusters = [l for l in pool if len(l) > 1]
    singles = [l[0] for l in pool if len(l) == 1]
    clusters = sorted(clusters, key=lambda x: len(x), reverse=True)
    if return_file_names:
        clusters_fn = []
        for cluster in clusters:
            clusters_fn.append([os.path.basename(loci[i].file_name) for i in cluster])
        singles_fn = [ os.path.basename(loci[i].file_name) for i in singles]
        return singles_fn, clusters_fn
    else:
        return singles, clusters
def _get_cluster(components, my_inds=None):
    """Return the complete-linkage leaf order of the selected components.

    :param components: mapping from index to component vector.
    :param my_inds: keys to cluster; defaults to every key of ``components``.
    :returns: leaf order as positions into the list of selected keys.
    """
    selected = list(components.keys()) if my_inds is None else my_inds
    vectors = [components[key] for key in selected]
    tree = hierarchy.complete(distance.pdist(vectors))
    return hierarchy.leaves_list(tree)
def hierarchial_cluster(self, method, metric):
    """Reorder the identity/alignment matrices and genomes by clustering.

    ``self.perc_ids`` is clustered row-wise; the resulting leaf order is
    applied to the rows and columns of ``self.perc_ids`` and
    ``self.perc_aln`` and to ``self.genomes``.

    :param method: linkage method.
    :param metric: distance metric used by ``linkage``.
    """
    tree = hierarchy.linkage(self.perc_ids, method=method, metric=metric)
    leaf_order = hierarchy.leaves_list(tree)

    # same permutation on both axes of each square matrix
    for attr in ('perc_ids', 'perc_aln'):
        by_rows = getattr(self, attr)[leaf_order, :]
        setattr(self, attr, by_rows[:, leaf_order])

    self.genomes = self.genomes[leaf_order]
def cluster_rows(self, method="ward"):
    """Store the hierarchical-clustering leaf order of the displayed rows.

    Sets ``self.row_order``; nothing happens with fewer than 2 rows.

    :param method: linkage method.
    """
    # clustering needs at least two rows
    if len(self.display_data) >= 2:
        tree = linkage(self.display_data, method)
        self.row_order = leaves_list(tree)
def cluster(matrix) :
    """Permute a square matrix into average-linkage leaf order.

    :param matrix: square array; its rows are what ``linkage`` clusters.
    :returns: ``(leaves, newmat)``: the leaf order and the matrix with that
        order applied to rows and columns.
    """
    tree = hier.linkage(matrix, method='average')
    order = hier.leaves_list(tree)
    permuted = matrix[order, :][:, order]
    return order, permuted
def plot_zmatrix(ax, zmatrix):
    """Draw ``zmatrix`` on ``ax`` with rows and columns in clustered order.

    :param ax: matplotlib axes to draw on.
    :param zmatrix: square matrix; its rows are clustered with ``linkage``.
    :returns: the leaf order that was applied to both axes.
    """
    from matplotlib import pylab

    tree = hier.linkage(zmatrix)
    leaf_order = np.array(hier.leaves_list(tree))
    permuted = zmatrix[leaf_order][:, leaf_order]
    ax.imshow(permuted, interpolation='nearest', cmap=pylab.cm.Greys)
    return leaf_order
def __cluster_columns__(self, column_distance, column_linkage):
    """Cluster the columns of ``self.data`` and reorder the data accordingly.

    Stores the linkage in ``self.column_clustering`` and the leaf order in
    ``self.data_order``; ``self.data``, ``self.original_data`` and, when
    present, ``self.header`` are reordered with ``self.__reorder_data__``.

    :param column_distance: distance metric for ``fastcluster.linkage``.
    :param column_linkage: linkage method for ``fastcluster.linkage``.
    """
    # Materialise the transposed rows: on Python 3 zip() returns a one-shot
    # iterator, which cannot be converted into an observation matrix.
    columns = list(zip(*self.data))
    self.column_clustering = fastcluster.linkage(columns,
                                                 method=column_linkage,
                                                 metric=column_distance)
    self.data_order = hcluster.leaves_list(self.column_clustering)
    self.data = self.__reorder_data__(self.data, self.data_order)
    self.original_data = self.__reorder_data__(self.original_data,
                                               self.data_order)
    if self.header:
        # the header is a single row, so wrap and unwrap it
        self.header = self.__reorder_data__([self.header],
                                            self.data_order)[0]
    return
def heatmap_plot_zscore_bigneuron(df_zscore_features, df_all, output_dir, title=None):
    """Save a clustered heatmap of z-scored features and return the linkage.

    Samples (rows of ``df_zscore_features``) are clustered with Ward linkage
    and shown as heatmap columns, coloured by ``df_all['nt_type']``.
    Features are put in Ward leaf order beforehand.  The figure is written
    to ``<output_dir>/zscore_feature_heatmap.png``.

    :param df_zscore_features: samples x features data frame of z-scores.
    :param df_all: data frame providing the ``'nt_type'`` column.
    :param output_dir: directory the PNG is written to.
    :param title: optional figure title.
    :returns: the sample linkage matrix.
    """
    # print() calls instead of Python 2 print statements
    print("heatmap plot:bigneuron")

    # taiwan
    metric = 'nt_type'
    mtypes = np.unique(df_all[metric])
    print(mtypes)
    mtypes_pal = sns.color_palette("hls", len(mtypes))
    mtypes_lut = dict(zip(mtypes, mtypes_pal))
    mtypes_colors = df_all[metric].map(mtypes_lut)

    linkage = hierarchy.linkage(df_zscore_features, method='ward', metric='euclidean')

    # order the features (rows after transposing) by their own clustering
    data = df_zscore_features.transpose()
    row_linkage = hierarchy.linkage(data, method='ward', metric='euclidean')
    feature_order = hierarchy.leaves_list(row_linkage)
    matchIndex = [data.index[x] for x in feature_order]
    data = data.reindex(matchIndex)

    pl.figure()
    g = sns.clustermap(data, row_cluster = False, col_linkage=linkage,
                       method='ward', metric='euclidean', linewidths = 0.0,
                       col_colors = [mtypes_colors],
                       cmap = sns.cubehelix_palette(light=1, as_cmap=True),
                       figsize=(40,10))
    pl.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    pl.setp(g.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
    pl.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.95)  # !!!!!
    if title:
        pl.title(title)

    location = "best"
    num_cols = 1
    # Legend for row and col colors: zero-size bars only carry the labels
    for label in mtypes:
        g.ax_row_dendrogram.bar(0, 0, color=mtypes_lut[label], label=label, linewidth=0.0)
    g.ax_row_dendrogram.legend(loc=location, ncol=num_cols,borderpad=0)

    filename = output_dir + '/zscore_feature_heatmap.png'
    pl.savefig(filename, dpi=300)
    print("save zscore matrix heatmap figure to :" + filename)
    pl.close()
    print("done clustering and heatmap plotting")
    return linkage
def matrix_tree(data,color):
    """Cluster both axes of ``data`` and hand the result to the plotter.

    :param data: data frame with conditions as rows and features as columns.
    :param color: forwarded unchanged to ``plot_matrix_tree``.
    """
    values = data.values

    condition_link = linkage(values)
    feature_link = linkage(values.T)
    condition_order = leaves_list(condition_link)
    feature_order = leaves_list(feature_link)

    # features x conditions, both axes in leaf order
    color_matrix = values.T[feature_order, :][:, condition_order]

    plot_matrix_tree(color_matrix,
                     condition_link,
                     feature_link,
                     data.index.values[condition_order],
                     data.columns.values[feature_order],
                     color)
def reorder(C):
    """
    Reorder consensus matrix.

    Similarities are converted to distances, clustered with average linkage,
    and both axes are permuted into reversed leaf order.

    :param C: Consensus matrix.
    :type C: `numpy.ndarray`
    """
    distances = squareform(1 - C)
    tree = linkage(distances, method='average')
    order = leaves_list(tree)[::-1]
    return C[:, order][order, :]
def hclustering(dataArray, method, p = None):
    """Cluster the rows of ``dataArray`` and reorder it for a heatmap.

    :param dataArray: 2-D array; the leaf order of its rows is applied to
        its columns and then to its rows.
    :param method: distance metric name handed to ``pdist``.
    :param p: optional ``p`` parameter for metrics that accept it
        (e.g. ``'minkowski'``).
    :returns: ``(heatmapOrder, orderedDataMatrix, distanceSquareMatrix)``.
    """
    if p is not None:
        # metric parameters are keyword-only in current SciPy
        distanceMatrix = pdist(dataArray, method, p=p)
    else:
        distanceMatrix = pdist(dataArray, method)
    distanceSquareMatrix = squareform(distanceMatrix)

    # linkage() takes the condensed vector; the square form would be
    # clustered as if its rows were observation vectors
    linkageMatrix = hier.linkage(distanceMatrix)
    heatmapOrder = hier.leaves_list(linkageMatrix)

    orderedDataMatrix = dataArray[:, heatmapOrder]
    orderedDataMatrix = orderedDataMatrix[heatmapOrder, :]
    return heatmapOrder, orderedDataMatrix, distanceSquareMatrix
def plot_polar(self, data, n_top=3, overplot=False, labels=None, palette='husl'):
    """Draw one polar (radar) plot per column of ``data``.

    :param data: data frame; rows are the plotted labels, columns the panels.
    :param n_top: when ``labels`` is None, the ``n_top`` highest rows of
        every column are selected.
    :param overplot: draw all columns into a single axes instead of one
        panel per column.
    :param labels: row labels to plot; derived from ``n_top`` when None.
    :param palette: seaborn palette name, one colour per column.
    :returns: the ``plt`` module.
    """
    n_panels = data.shape[1]

    if labels is None:
        labels = []
        for i in range(n_panels):
            # Series.order() was removed from pandas; sort_values() replaces it
            labels.extend(data.iloc[:, i].sort_values(ascending=False)
                          .index[:n_top])
        labels = np.unique(labels)

    data = data.loc[labels, :]

    # Use hierarchical clustering to order
    from scipy.spatial.distance import pdist
    from scipy.cluster.hierarchy import linkage, leaves_list
    dists = pdist(data, metric='correlation')
    pairs = linkage(dists)
    order = leaves_list(pairs)
    data = data.iloc[order, :]
    labels = [labels[i] for i in order]

    theta = np.linspace(0.0, 2 * np.pi, len(labels), endpoint=False)
    if overplot:
        fig, ax = plt.subplots(1, 1, subplot_kw=dict(polar=True))
        fig.set_size_inches(10, 10)
    else:
        # squeeze=False always yields a 2-D array, so a single panel can be
        # indexed like any other
        fig, axes = plt.subplots(1, n_panels, sharex=False, sharey=False,
                                 subplot_kw=dict(polar=True), squeeze=False)
        axes = axes[0]
        fig.set_size_inches((6 * n_panels, 6))

    # A bit silly to import seaborn just for this...
    # should extract just the color_palette functionality.
    import seaborn as sns
    colors = sns.color_palette(palette, n_panels)

    for i in range(n_panels):
        if overplot:
            alpha = 0.2
        else:
            ax = axes[i]
            alpha = 0.8
        ax.set_ylim(data.values.min(), data.values.max())
        d = data.iloc[:, i].values
        # translucent body first, then the outline on top
        ax.fill(theta, d, color=colors[i], alpha=alpha, ec='k', linewidth=0)
        ax.fill(theta, d, alpha=1.0, ec=colors[i], linewidth=2, fill=False)
        ax.set_xticks(theta)
        ax.set_xticklabels(labels, fontsize=18)
        for lab in ax.get_yticklabels():
            lab.set_fontsize(18)
        ax.set_title('Cluster %d' % i, fontsize=22, y=1.12)

    plt.tight_layout()
    return plt
def plot_distances(model):
    """Show the document distance matrix in clustered order.

    :param model: object exposing a ``doc_topic_`` array (one row per
        document).
    :returns: the document leaf order.
    """
    from matplotlib import pyplot as plt
    from scipy.cluster import hierarchy
    from scipy.spatial.distance import pdist, squareform

    tree = hierarchy.linkage(pdist(model.doc_topic_))
    doc_order = hierarchy.leaves_list(tree)

    # distances are recomputed on the reordered rows
    reordered = squareform(pdist(model.doc_topic_[doc_order, :]))
    plt.imshow(reordered, interpolation='none')
    plt.colorbar()
    plt.show()
    return doc_order
def plot_delta(x,deltas,mean=True,probability=False,cluster=False,plot_cluster=False,cluster_kwargs=None,ytick_filter=lambda x: x):
    """Plot a heatmap with one row per entry of ``deltas``.

    :param x: array of x positions (one per heatmap column); used for the
        x tick labels.
    :param deltas: mapping ``key -> (mu, var)``.
    :param mean: plot ``mu`` when true; otherwise plot
        ``1 - norm.cdf(0, mu, sqrt(var))``, with every value within 0.475
        of 0.5 set to 0.5.
    :param probability: with ``plot_cluster``, draw a labelled 3-tick
        colour bar.
    :param cluster: reorder the rows by hierarchical clustering.
    :param plot_cluster: use the 6-column layout that leaves room for a
        dendrogram; the dendrogram is drawn only when ``cluster`` is also set.
    :param cluster_kwargs: keyword arguments for ``linkage``.
    :param ytick_filter: maps each key to its y tick label.
    """
    # None instead of a shared mutable default dict
    if cluster_kwargs is None:
        cluster_kwargs = {}

    p = len(deltas.keys())
    n = x.shape[0]
    a = np.zeros((p,n))
    yticks = [ytick_filter(k) for k in deltas.keys()]

    for i,k in enumerate(deltas.keys()):
        mu,var = deltas[k]
        if mean:
            a[i,:] = mu
        else:
            a[i,:] = 1-scipy.stats.norm.cdf(0,mu,np.sqrt(var))
    if not mean:
        # keep only values outside the central band; the rest become 0.5
        a[np.abs(a-.5) < .475] = 0.5

    if cluster:
        link = linkage(a,**cluster_kwargs)
        ind = leaves_list(link)
        a = a[ind,:]
        yticks = [yticks[j] for j in ind]
        if plot_cluster:
            ax = plt.subplot2grid((1,6),(0,0),colspan=1,rowspan=1)
            dendrogram(link,no_labels=True,orientation='left',ax=ax)

    if mean:
        # symmetric colour range around zero
        lim = np.max(np.abs(a))
        vmin = -lim
        vmax = lim
    else:
        vmin = 0
        vmax = 1

    if plot_cluster:
        ax = plt.subplot2grid((1,6),(0,1),colspan=4,rowspan=1)
    else:
        ax = plt.subplot2grid((1,5),(0,0),colspan=4,rowspan=1)
    plt.imshow(a,cmap="RdBu",interpolation="none",vmin=vmin,vmax=vmax,origin='lower',aspect="auto")
    plt.yticks(range(p),yticks)

    tick_pos = np.arange(0,n,1.*n/5)
    # array indices must be integers; the tick positions stay fractional
    plt.xticks(tick_pos,[x[int(j)].round(2) for j in tick_pos])

    if plot_cluster:
        if probability:
            cbarAx,kwargs = mpl.colorbar.make_axes(ax)
            cbar = mpl.colorbar.ColorbarBase(cbarAx,cmap='RdBu',ticks=[0,.5,1],**kwargs)
            cbar.ax.set_yticklabels(['p(less\n than parent)\n>97.5%', 'no difference', 'p(greater\n than parent)\n>97.5%'],fontsize=15)
        else:
            plt.colorbar()
    else:
        plt.colorbar()
def ClusterSimilarityMatrix(sim_mat, method='average'):
    """Seriate a similarity matrix by hierarchical clustering.

    :param sim_mat: square similarity matrix; ``1.0 - sim_mat`` is used as
        the distance.
    :param method: linkage method.
    :returns: ``(seriated_sim, res_order, res_linkage)``.
    """
    n = len(sim_mat)
    condensed = ssd.squareform(1.0-sim_mat)
    res_linkage = hcluster.linkage(condensed, method=method)
    res_order = hcluster.leaves_list(res_linkage)

    seriated_sim = np.zeros((n,n))
    rows, cols = np.triu_indices(n,k=1)
    # upper triangle from the permuted matrix, mirrored into the lower one
    seriated_sim[rows, cols] = sim_mat[res_order[rows], res_order[cols]]
    seriated_sim[cols, rows] = seriated_sim[rows, cols]
    # the diagonal is copied position by position (not permuted)
    for k in range(n):
        seriated_sim[k, k] = sim_mat[k, k]

    return seriated_sim, res_order, res_linkage
def plot_heatmap(X, X_hat, mask, filename, data_transform, value_name):
    """Save a stacked heatmap of the original and the inferred matrix.

    Rows and columns are ordered by average-linkage clustering of the
    correlation profiles of ``X``.  The title reports availability,
    observed fraction, RMSE, r^2, matrix size and an approximate rank.

    :param X: original data frame (may contain NaN).
    :param X_hat: inferred data frame with the same labels as ``X``.
    :param mask: boolean array marking observed entries; those cells are
        hidden in the heatmap.
    :param filename: output path for ``plt.savefig``.
    :param data_transform: text used in the axis name.
    :param value_name: text used in the axis name.
    """
    def _leaf_order(frame):
        # Cluster the columns of `frame` by their correlation profiles.
        # fillna() returns a new frame, so nothing is written into an array
        # that may be a read-only view.
        correlations = np.asarray(frame.corr().fillna(0))
        tree = linkage(distance.pdist(correlations), method='average')
        return leaves_list(tree)

    available = np.invert(np.isnan(X.values))
    rmse = calc_unobserved_rmse(X, X_hat.values, mask)
    r2 = calc_unobserved_r2(X, X_hat.values, mask)

    # only the singular values are needed
    s = np.linalg.svd(X_hat - X_hat.values.mean(), compute_uv=False)
    # number of components needed to exceed 95% of the squared spectrum
    approx_rank = np.where(np.cumsum(s**2) > (s**2).sum() * 0.95)[0][0] + 1

    col_order = _leaf_order(X)
    row_order = _leaf_order(X.T)

    X_reorder = X.reindex(X.index[row_order])[X.columns[col_order]]
    Xhat_reorder = X_hat.reindex(X.index[row_order])[X.columns[col_order]]
    df = pd.concat([X_reorder, Xhat_reorder], keys=['original', 'inferred'])
    try:
        df = df.rename_axis(
            ['Unobserved', '%s %s' % (data_transform, value_name)])
    except Exception:
        # naming the axes is best effort only, but KeyboardInterrupt and
        # SystemExit must not be swallowed
        pass
    df_mask = np.vstack([mask, mask])

    fig, ax = plt.subplots(figsize=(8, 12))
    _ = plt.title(
        '%.1f%% of entries available; %.1f%% observed; RMSE=%.3f; r^2=%.1f%%\nsize: %d x %d; approx. rank: %d'
        % (np.average(available) * 100, np.average(mask) * 100, rmse,
           r2 * 100, X.shape[0], X.shape[1], approx_rank),
        fontsize=10)
    ax = sns.heatmap(df, mask=df_mask, ax=ax, cmap=sns.cm.rocket_r)
    # line between the original (top) and the inferred (bottom) block
    _ = ax.axhline(X.shape[0], color='blue')
    _ = plt.tight_layout()
    bottom, top = ax.get_ylim()
    ax.set_ylim(
        bottom + 0.5, top - 0.5
    )  # sorry, this may cut off the bottom row...some sort of matplotlib bug
    plt.savefig(filename)
    plt.close()
def cluster_kmer_dists(kmers_dists, kmers_scores, kmers, out_pdf):
    ''' Plot a clustered heatmap of k-mer distances and scores.

    The distance matrix is clustered row-wise (single linkage, euclidean);
    the leaf order is applied to both of its axes and to the scores and
    k-mers before they are passed to ``plot_kmer_dists``.
    '''
    # cluster
    tree = hierarchy.linkage(kmers_dists, method='single', metric='euclidean')
    leaf_order = hierarchy.leaves_list(tree)

    # re-order distance matrix
    reordered = kmers_dists[leaf_order,:][:,leaf_order]

    # plot
    plot_kmer_dists(reordered, kmers_scores[leaf_order], kmers[leaf_order], out_pdf)
def clustering(data):
    """Cluster samples with complete linkage and show diagnostic figures.

    Draws (1) a distance heatmap framed by two dendrograms, (2) a dendrogram
    coloured at the cut threshold and (3) a scatter plot of the first two
    principal components coloured by flat cluster, then prints the cluster
    assignment and the leaf order.

    :param data: samples x features table (a DataFrame or array-like).
    """
    # Create the distance matrix for the array of sample vectors.
    # Look up 'squareform' if you want to submit your own distance matrices
    # as they need to be translated into reduced matrices
    reduced_data = PCA(n_components=2).fit_transform(data)
    # DataFrame.as_matrix() was removed from pandas; asarray gives the values
    data = np.asarray(data)
    data_dist = pdist(data, metric='euclidean')  # computing the distance
    Y = linkage(data_dist, method='complete')

    fig = plt.figure(figsize=(8, 8))
    # x ywidth height
    ax1 = fig.add_axes([0.05, 0.1, 0.2, 0.6])
    Z1 = dendrogram(Y, orientation='right')
    ax1.set_xticks([])

    # Compute and plot second dendrogram.
    ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
    Z2 = dendrogram(Y)
    ax2.set_xticks([])
    ax2.set_yticks([])

    # Compute and plot the heatmap
    axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
    idx1 = Z1['leaves']
    idx2 = Z2['leaves']
    D = squareform(data_dist)
    D = D[idx1, :]
    D = D[:, idx2]
    im = axmatrix.matshow(D, aspect='auto', origin='lower', cmap=plt.cm.RdYlGn)
    axmatrix.set_xticks([])
    axmatrix.set_yticks([])

    # Plot colorbar.
    axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.6])
    plt.colorbar(im, cax=axcolor)

    # From the heatmap it is evident there are about 4 clusters.
    thres = 26
    plt.figure(figsize=(20, 12))
    dendrogram(Y, color_threshold=thres, show_leaf_counts=True)
    plt.yticks(np.arange(0, 35, step=0.5))
    plt.xticks([])

    clusters1 = fcluster(Y, t=thres, criterion='distance')
    # cluster ids scaled into (0, 1] and used as scatter colours
    col = np.array(clusters1)
    col = col / float(np.max(col))
    col = np.array([round(x, 2) for x in col])
    print(col)

    plt.figure()
    plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=col)
    plt.show()
    print("cluster1", clusters1)
    print("leaves", leaves_list(Y))
    return
def heatmap(self):
    """Save a column dendrogram and a row-clustered heatmap of ``self.values``.

    Uses the module-level ``genes`` and ``timepoints`` sequences as labels
    and asks for both plot titles on standard input.  Writes
    ``dendrogram.png`` and ``Heatmap_clustered.png``.
    """
    print(genes)
    print(timepoints)
    matrix = self.values
    Z = linkage(matrix, "ward")
    ZT = linkage(matrix.T, "ward")

    fig, ax = plt.subplots()
    title = input("Please input Clustered Dendrogram plot title: ")
    plt.title(title)
    plt.xlabel("sample")
    plt.ylabel("distance")
    dendrogram(ZT,
               labels=timepoints,
               show_leaf_counts=False,
               leaf_rotation=90.,
               leaf_font_size=12.,
               show_contracted=True)
    fig.savefig("dendrogram.png")
    plt.close(fig)

    # Rows follow the clustering; columns keep their input order.  All
    # columns are used rather than a hard-coded range(10).
    idx_rows = leaves_list(Z)
    data = matrix[idx_rows, :]

    # column-wise z-score
    X = (data - np.average(data, axis=0)) / np.std(data, axis=0)
    m = np.max(np.abs(X))

    fig, ax = plt.subplots()
    title = input("Please input Heatmap title: ")
    ax.set_title(title)
    im = ax.pcolor(X, cmap="viridis", vmin=-m, vmax=m - 0.5)
    ax.grid(False)
    ax.set_xticks(np.arange(0.5, X.shape[1] + 0.5), )
    ax.set_xticklabels(timepoints, rotation=50)

    # Label every 5th row.  The labels are taken in the clustered row order
    # so that each one names the row it sits next to, and there is exactly
    # one label per tick.
    ordered_genes = [genes[i] for i in idx_rows]
    tick_rows = np.arange(0, len(ordered_genes), 5)
    ax.set_yticks(tick_rows + 0.5)
    ax.set_yticklabels([ordered_genes[i] for i in tick_rows])

    cbar = fig.colorbar(im, ax=ax)
    fig.subplots_adjust(left=0.05, bottom=0.15, right=1.0, top=0.95)
    fig.savefig("Heatmap_clustered.png")  # Save the image
    plt.close(fig)  # Close the canvas
    print("Clustered Heatmap for Complete...")
def clustdist_json(self, feature_ids=None, labeling_name=None, class_ids=None):
    """Return the clustered feature matrix as a JSON string.

    The matrix from ``self.get_dataset`` is reordered along both axes by
    the leaf order of ``self.clust_object`` / ``self.clust_feat``.

    :param feature_ids: forwarded to ``self.get_dataset``.
    :param labeling_name: labeling to use; falsy values select ``'one_class'``.
    :param class_ids: forwarded to ``self.get_dataset``.
    :returns: JSON object with ``'feature-names'``, ``'object-labels'``,
        ``'class-names'``, ``'max-value'``, ``'min-value'`` and one list of
        values per feature name.
    """
    if not labeling_name:
        labeling_name = 'one_class'

    # fm is normalized!
    (fm, sample_names, feature_names, target, target_names) = \
        self.get_dataset(feature_ids, labeling_name, class_ids)

    # reorder feature matrix rows (objects)
    object_indices = hierarchy.leaves_list(self.clust_object(fm))
    fm = fm[object_indices, :]

    # reorder standardized feature matrix columns (feats)
    feat_indices = hierarchy.leaves_list(self.clust_feat(fm))
    fm = fm[:, feat_indices]

    # labels of the selected labeling, reordered like the objects
    lablist = [target[i] for i in object_indices]

    # reorder the feature ids; object names are not yet needed on the
    # client side, so they are not computed
    fs = [feature_names[i] for i in feat_indices]

    json_data = {}
    json_data['feature-names'] = fs
    json_data['object-labels'] = lablist
    json_data['class-names'] = target_names
    # .item() / .tolist() yield native Python numbers; NumPy scalars such
    # as float32 are rejected by json.dumps
    json_data['max-value'] = fm.max().item()
    json_data['min-value'] = fm.min().item()

    # add feature list per feature-name
    for index, item in enumerate(fs):
        json_data[item] = fm[:, index].tolist()

    return json.dumps(json_data)
def plot_heatmap(X, obs_frac, filename, combined_datasets=False):
    """Complete ``X`` from a random mask and save an original/inferred heatmap.

    Rows and columns are ordered by average-linkage clustering of the
    correlation profiles of ``X``.

    :param X: original data frame (may contain NaN).
    :param obs_frac: forwarded to ``get_mask``.
    :param filename: output path for ``plt.savefig``.
    :param combined_datasets: name three index levels instead of two.
    """
    def _leaf_order(frame):
        # Cluster the columns of `frame` by their correlation profiles.
        # fillna() returns a new frame, so nothing is written into an array
        # that may be a read-only view.
        correlations = np.asarray(frame.corr().fillna(0))
        tree = linkage(distance.pdist(correlations), method='average')
        return leaves_list(tree)

    mask = get_mask(X, obs_frac)
    X_hat = pd.DataFrame(complete_matrix(X, mask, offset=True),
                         index=X.index,
                         columns=X.columns)
    rmse = calc_unobserved_rmse(X, X_hat.values, mask)
    r2 = calc_unobserved_r2(X, X_hat.values, mask)

    col_order = _leaf_order(X)
    row_order = _leaf_order(X.T)

    X_reorder = X.reindex(X.index[row_order])[X.columns[col_order]]
    Xhat_reorder = X_hat.reindex(X.index[row_order])[X.columns[col_order]]
    df = pd.concat([X_reorder, Xhat_reorder], keys=['original', 'inferred'])
    if combined_datasets:
        df = df.rename_axis(
            ['Unobserved', 'VaccineStatus', 'Neutralization[Titers]'])
    else:
        df = df.rename_axis(['Unobserved', 'Neutralization[Titers]'])
    df_mask = np.vstack([mask, mask])

    fig, ax = plt.subplots(figsize=(8, 12))
    _ = plt.title(
        '%.1f%% of available entries unobserved; RMSE=%.3f; r^2=%.1f%%' %
        (100 - np.average(mask) * 100, rmse, r2 * 100))
    ax = sns.heatmap(df, mask=df_mask, ax=ax, cmap=sns.cm.rocket_r)
    # line between the original (top) and the inferred (bottom) block
    _ = ax.axhline(X.shape[0], color='blue')
    _ = plt.tight_layout()
    bottom, top = ax.get_ylim()
    ax.set_ylim(
        bottom + 0.5, top - 0.5
    )  # sorry, this may cut off the bottom row...some sort of matplotlib bug
    plt.savefig(filename)
    plt.close()
def hierarchicalReorder(inMatrix, useCorr = True):
    """Reorder both axes of a matrix by hierarchical clustering.

    Columns are clustered first, then rows of the column-reordered matrix.

    :param inMatrix: 2-D array-like.
    :param useCorr: cluster on the correlation matrix of the vectors instead
        of the vectors themselves.
    :returns: ``(reordered, row_order, column_order)``.
    """
    current = np.array(inMatrix)
    orders = []
    # Each pass transposes, then clusters and permutes the rows: pass 1
    # therefore handles the original columns, pass 2 the original rows.
    for _ in range(2):
        current = current.T
        features = np.corrcoef(current) if useCorr else np.array(current)
        leaf_order = hierarchy.leaves_list(hierarchy.linkage(features))
        orders.append(leaf_order)
        current = current[leaf_order]

    col_order, row_order = orders
    return current, row_order, col_order
def GetOptimalOrder(matrix):
    '''
    Get an order for the annotation matrix and the names from the leaf
    index of a hierarchical clustering.

    The Yule distance matrix is made symmetric and its diagonal is zeroed
    before average-linkage clustering.

    return: idxOpt: leaf order as an int32 array
    '''
    raw = yule_distance(matrix)
    symmetric = (raw + raw.T) / 2.
    hollow = symmetric - np.diag(np.diag(symmetric))
    tree = linkage(squareform(hollow), method='average', metric='precomputed')
    return leaves_list(tree).astype(np.int32)
def cluster_symmetric(self, method='average'):
    """Reorder both axes of a square matrix by hierarchical clustering.

    ``1 - self.matrix`` is used as the distance.  Matrices with fewer than
    3 rows are left untouched (a message is printed).

    :param method: linkage method handed to ``fastcluster.linkage``.
    :raises ValueError: if the matrix is not square or is empty.
    """
    if self.shape[0] != self.shape[1]:
        raise ValueError('data matrix not square')
    if self.shape[0] == 0:
        raise ValueError('no rows or columns in data matrix')
    if self.shape[0] < 3:
        print('less than 3 rows and 3 columns. no clustering performed.')
        return

    # checks=False: convert without validating symmetry / zero diagonal
    condensed = distance.squareform(np.float64(1) - self.matrix, checks=False)
    tree = fastcluster.linkage(condensed, method)
    si = hierarchy.leaves_list(tree).astype('int64')
    self.reorder(si, axis=0)
    self.reorder(si, axis=1)
def sort(self, method=None):
    """Reorder the columns of the table in place.

    :param method: one of

        - ``"none"``: keep the current order;
        - ``"similarity"``: Bray-Curtis clustering of the columns scaled by
          their column sums;
        - ``"usimilarity"``: Bray-Curtis clustering of the raw columns;
        - ``"dominant"``: group columns by their dominant row, rows ranked
          by their maximum value;
        - ``"sum"``: decreasing column sum;
        - ``"metadata"``: increasing ``self.metarow`` value.

      Any other value terminates the program through ``sys.exit``.
    """
    columns = range(self.ncols)

    if method == "none":
        order = columns
    elif method in ("similarity", "usimilarity"):
        if method == "similarity":
            profile = self.data / (self.colsums + c_epsilon * np.ones(self.ncols))
        else:
            profile = self.data
        # note: linkage assumes things to cluster = rows; we want cols
        order = sch.leaves_list(sch.linkage(
            profile.transpose(), metric="braycurtis"))
    elif method == "dominant":
        # rank the rows by their maximum value (ties broken by row index)
        by_peak = sorted((max(self.data[r, :]), r) for r in range(self.nrows))
        ranks = [None] * len(by_peak)
        for position, (_, r) in enumerate(by_peak):
            ranks[r] = position
        # dominant row of each column; the last element of a stable sort
        # picks the highest row index among ties
        argmax = [
            sorted(range(self.nrows), key=lambda r, c=c: self.data[r][c])[-1]
            for c in columns
        ]
        order = sorted(columns,
                       key=lambda c: (ranks[argmax[c]], self.data[argmax[c], c]),
                       reverse=True)
    elif method == "sum":
        order = sorted(columns, key=lambda c: self.colsums[c], reverse=True)
    elif method == "metadata":
        order = sorted(columns, key=lambda c: self.metarow[c])
    else:
        sys.exit("Can't sort with method: %s" % (method))

    self.data = self.data[:, order]
    self.colheads = subseq(self.colheads, order)
    if self.metarow is not None:
        self.metarow = subseq(self.metarow, order)
    self.update()
def run():
    """Cluster phage 4-mer profiles and export them as a dendro-heatmap.

    Reads the count matrix and header file from the working directory,
    keeps the first 50 rows and columns, clusters both axes and writes
    ``phage_heatmap.png`` before showing the figure.
    """
    counts = 'phage_kmer_count_k4_c0_s2255.csv'
    headers = 'phage_kmer_headers_k4_c0_s2255.txt'

    data = normalize_rows(np.loadtxt(counts, delimiter=','))
    # close the header file deterministically instead of leaking the handle
    with open(headers, 'r') as handle:
        header_lines = handle.readlines()[1:]
    row_labels = np.array([header.split('|')[3] for header in header_lines])
    col_labels = np.array(kmers(4))

    N = 50
    data = data[:N,:N]
    row_labels = row_labels[:N]
    col_labels = col_labels[:N]

    # cluster the rows; linkage() takes the *condensed* distance vector, a
    # square matrix would be treated as raw observations
    row_Z = sch.linkage(ssd.pdist(data))
    row_idxing = sch.leaves_list(row_Z)

    # cluster the columns
    col_Z = sch.linkage(ssd.pdist(data.T))
    col_idxing = sch.leaves_list(col_Z)

    # make the dendrogram
    data = data[:,col_idxing][row_idxing,:]
    row_labels = list(row_labels[np.array(row_idxing)])
    col_labels = list(col_labels[np.array(col_idxing)])

    heatmap = pdh.DendroHeatMap(heat_map_data=data, left_dendrogram=row_Z, top_dendrogram=col_Z)
    heatmap.colormap = heatmap.redBlackBlue
    heatmap.row_labels = row_labels
    heatmap.col_labels = col_labels
    heatmap.title = 'Bacteirophage 4-mer hierarchical clustering'
    heatmap.export('phage_heatmap.png')
    heatmap.show()
def reorder(C):
    """
    Reorder consensus matrix.

    :param C: Consensus matrix.
    :type C: `numpy.matrix`
    """
    # Strict upper triangle in row-major order, i.e. the condensed form that
    # linkage() expects.  This replaces a nested xrange() loop, which does
    # not exist on Python 3.
    dense = np.asarray(C)
    c_vec = dense[np.triu_indices(dense.shape[0], k=1)]
    # convert similarities to distances
    Y = 1 - c_vec
    Z = linkage(Y, method="average")
    # get node ids as they appear in the tree from left to right
    # (corresponding to observation vector idx)
    ivl = leaves_list(Z)
    ivl = ivl[::-1]
    return C[:, ivl][ivl, :]
def genomes_hclust(dist_dict, args):
    """Genomes hierarchical clustering and visualisation.

    Depending on the flags in ``args`` this plots a dendrogram, writes
    ``<prefix>.clstr`` (genome / cluster id) and ``<prefix>.repr.clstr``
    (one representative genome per cluster) and plots a heatmap.

    :param dist_dict: pairwise genome distances; its keys are the names.
    :param args: namespace providing ``plot_dendrogram``, ``print_clusters``,
        ``cluster_threshold``, ``prefix``, ``checkm_file`` and ``heatmap``.
    """
    logger.info("Clustering genomes")
    dist_arr = array(dist_dict_to_2dlist(dist_dict))
    names = array(list(dist_dict))
    lm = linkage(squareform(dist_arr), method="single", optimal_ordering=True)

    if args.plot_dendrogram:
        plot_dendrogram(lm, names, args.prefix)

    if args.print_clusters:
        cluster_ids = fcluster(lm, t=args.cluster_threshold, criterion="distance")
        by_cluster = sorted(zip(names, cluster_ids), key=lambda pair: pair[1])

        # write genome -> cluster and collect the members of every cluster
        cluster_dict = {}
        with open("%s.clstr" % args.prefix, 'w') as handle:
            for genome, cluster_id in by_cluster:
                handle.write("%s\t%s\n" % (genome, cluster_id))
                cluster_dict.setdefault(cluster_id, []).append(genome)

        metrics = None
        if args.checkm_file:
            with open(args.checkm_file, 'r') as f:
                # the first three lines and the last line are skipped
                metrics = dict(map(genome_metric, f.readlines()[3:-1]))

        with open("%s.repr.clstr" % args.prefix, 'w') as handle:
            for cluster_id, genomes in cluster_dict.items():
                if len(genomes) == 1 or metrics is None:
                    representative = genomes[0]
                else:
                    # the member with the highest metric represents the cluster
                    scored = {x: metrics[os.path.splitext(x)[0]] for x in genomes}
                    ranked = sorted(scored.items(), key=lambda item: -item[1])
                    representative = ranked[0][0]
                handle.write("%s\t%s\n" % (representative, cluster_id))

    if args.heatmap:
        order = leaves_list(lm)
        dist_arr = dist_arr[order, ][:, order]
        names = names[order]
        plot_heatmap(args, names, dist_arr)
def sort_by_clust(data, n_clusters=10, ncomp=3, output='labels'):
    """
    Given data (n_samples, n_features), reduce dimensionality by SVD
    and do agglomerative clustering with Ward linkage.
    If output is `sort`, return leaves of the clustering tree.
    If output is `labels`, convert to flat clusters and return labels.

    Input:
    -------
      - data: data points, 2D array (n_samples, n_features)
      - n_clusters [10]: if output is `labels` cut the agglomerative
        clustering tree at this number of clusters; if `None`, pick the
        number of clusters with the highest Calinski-Harabasz score (slow)
      - ncomp [3]: number of SVD components to use in the dimensionality
        reduction step
      - output [labels]: if output is `labels`, return flat clusters, if
        output contains `sort`, return leaves of the agglomerative
        clustering tree as a 1D array.
    """
    u, s, vh = linalg.svd(data, full_matrices=False)
    u = u[:, :ncomp]
    nsamples = len(u)
    Z = sp_hierarchy.linkage(u, method='ward')

    if 'sort' in output:
        return sp_hierarchy.leaves_list(Z)

    if n_clusters is not None:
        return sp_hierarchy.fcluster(Z, n_clusters, criterion='maxclust')

    # this is just a dumb guess. must be tested though
    # calc labels for several n_clusters, find max CH score. (nclust>=2)
    nsignals_per_cluster = range(2, 50, 2)
    nc_acc = []
    ch_acc = []
    for nsc in nsignals_per_cluster:
        # np.int was removed from NumPy; the builtin int is the same thing
        nc = max(2, int(np.ceil(nsamples / nsc)))
        labels = sp_hierarchy.fcluster(Z, nc, criterion='maxclust')
        ch_acc.append(skmetrics.calinski_harabasz_score(u, labels))
        nc_acc.append(nc)
    k = np.argmax(ch_acc)
    return sp_hierarchy.fcluster(Z, nc_acc[k], criterion='maxclust')
def hierarchical_cluster(df, compute_dist=True, pdist_kws=None,
                         method='average', min_cluster_size=3,
                         cluster_kws=None):
    """
    Hierarchically cluster the rows of df and cut the tree into clusters.

    :df: data whose rows are clustered, or (when compute_dist is False) a
        square distance matrix.
    :compute_dist: if True, compute pairwise distances between the rows of
        df; otherwise df is used directly as the distance matrix.
    :pdist_kws: keyword arguments for pdist; defaults to
        {'metric': 'correlation'}. The metrics 'abscorrelation' and
        'sqcorrelation' are handled here instead of being passed to pdist.
    :method: linkage method.
    :min_cluster_size: minimum cluster size used for the default
        cluster_kws; ignored when cluster_kws is supplied.
    :cluster_kws: keyword arguments forwarded to dynamicTreeCut.
    :returns: dict with keys 'linkage', 'distance_df', 'clustered_df',
        'reorder_vec' and 'labels'.
    """
    if compute_dist:
        if pdist_kws is None:
            pdist_kws = {'metric': 'correlation'}
        # .get() so that pdist_kws without a 'metric' entry falls through to
        # pdist's own default instead of raising KeyError.
        metric = pdist_kws.get('metric')
        if metric == 'abscorrelation':
            # convert to absolute correlations
            dist_vec = abs_pdist(df)
        elif metric == 'sqcorrelation':
            # convert to squared correlations
            dist_vec = squareform(1 - df.T.corr() ** 2)
        else:
            dist_vec = pdist(df, **pdist_kws)
        dist_df = pd.DataFrame(squareform(dist_vec),
                               index=df.index, columns=df.index)
    else:
        assert df.shape[0] == df.shape[1]
        dist_df = df
        dist_vec = squareform(df.values)

    # clustering. This works the same as hclust
    link = linkage(dist_vec, method=method)

    # same as order.dendrogram(as.dendrogram(hclust output)) in R
    reorder_vec = leaves_list(link)
    clustered_df = dist_df.iloc[reorder_vec, reorder_vec]

    if cluster_kws is None:
        # Honour the min_cluster_size argument (previously hard-coded to 3).
        cluster_kws = {'minClusterSize': min_cluster_size,
                       'verbose': 0,
                       'pamStage': False}
    labels = dynamicTreeCut(dist_df, func='hybrid', method=method,
                            **cluster_kws)
    labels = reorder_labels(labels, link)

    return {'linkage': link,
            'distance_df': dist_df,
            'clustered_df': clustered_df,
            'reorder_vec': reorder_vec,
            'labels': labels}
def cluster_sp_agglomerative(content):
    """Run agglomerative clustering on content['data'] and wrap the result.

    Keys read from ``content``: 'transpose' (when equal to 1 the data is
    transposed and stored back into ``content['data']``), 'data',
    'sp_method', 'sp_metric' and 'sp_ordering' (optimal leaf ordering is
    enabled when equal to 1).

    The JSON payload passed to ``httpWrapper`` holds the rows in leaf order
    ('result'), the leaf order itself ('order') and the dendrogram
    description ('dendo').
    """
    if content['transpose'] == 1:
        content['data'] = [list(column) for column in zip(*content['data'])]

    matrix = numpy.array(content['data'])
    tree = hier.linkage(
        matrix,
        method=content['sp_method'],
        metric=content['sp_metric'],
        optimal_ordering=content['sp_ordering'] == 1,
    )
    leaf_order = hier.leaves_list(tree)

    payload = {
        'result': matrix[leaf_order, :].tolist(),
        'order': leaf_order.tolist(),
        'dendo': hier.dendrogram(tree, no_plot=True),
    }
    # NOTE(review): ignore_nan is not a stdlib json option; presumably `json`
    # is simplejson here - confirm against the file's imports.
    return httpWrapper(json.dumps(payload, ignore_nan=True))
def cluster_kmer_dists(kmers_dists, kmers_scores, kmers, out_pdf):
    '''Cluster k-mers and plot their reordered distances and scores.

    The rows of kmers_dists are clustered with single linkage, the matrix is
    permuted on both axes by the resulting leaf order, and the reordered
    matrix, scores and k-mers are handed to plot_kmer_dists.
    '''
    # NOTE(review): kmers_dists is passed to linkage as-is together with a
    # metric; if it is a square distance matrix, its rows are being treated
    # as observation vectors - confirm this is intended.
    tree = hierarchy.linkage(kmers_dists, method='single', metric='euclidean')
    leaf_order = hierarchy.leaves_list(tree)

    # Apply the same permutation to rows and columns.
    reordered = kmers_dists[leaf_order, :][:, leaf_order]

    plot_kmer_dists(reordered, kmers_scores[leaf_order], kmers[leaf_order],
                    out_pdf)
def reorder_labels(labels, link):
    """
    Renumber cluster labels by their first appearance in the dendrogram.

    Walking the leaves of `link` from left to right, the first cluster label
    encountered becomes 1, the next new one 2, and so on. The returned array
    has the same order as `labels`, with every label replaced by its new
    number.
    """
    leaf_order = leaves_list(link)

    new_id = {}
    for old_label in labels[leaf_order]:
        # Only the first sighting of a label assigns it a number.
        new_id.setdefault(old_label, len(new_id) + 1)

    return np.array([new_id[old_label] for old_label in labels])
def sort_distance_matrix(distance_matrix, embeddings, names,
                         method='complete'):
    """Reorder a distance matrix, embeddings and names by clustering.

    Args:
        distance_matrix: square numpy array of pairwise distances. Its
            diagonal is set to 0 in place before clustering.
        embeddings: sequence indexable by position, reordered like the rows.
        names: sequence indexable by position, reordered like the rows.
        method: linkage method; one of 'ward', 'single', 'average',
            'complete'.

    Returns:
        Tuple ``(distance_matrix, embeddings, names, res_order)`` where the
        matrix is a reordered copy with NaN on its diagonal, embeddings and
        names are reordered lists, and res_order is the leaf order used.
    """
    assert method in ['ward', 'single', 'average', 'complete']

    np.fill_diagonal(distance_matrix, 0.)
    cond_distance_matrix = squareform(distance_matrix, checks=False)
    # Use the requested method (the call previously hard-coded 'complete',
    # silently ignoring the argument).
    linkage_matrix = hierarchy.linkage(cond_distance_matrix, method=method,
                                       optimal_ordering=True)
    res_order = hierarchy.leaves_list(linkage_matrix)

    # Fancy indexing yields a copy, so the NaN diagonal below does not touch
    # the caller's matrix.
    distance_matrix = distance_matrix[res_order][:, res_order]
    embeddings = [embeddings[i] for i in res_order]
    names = [names[i] for i in res_order]
    np.fill_diagonal(distance_matrix, np.nan)
    return distance_matrix, embeddings, names, res_order
def _cluster_samples(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Reorder the columns of a dataframe by hierarchical clustering.

    Args:
        df: dataframe whose columns are clustered (single linkage on the
            pairwise distances between columns).

    Returns:
        The dataframe with its columns arranged in dendrogram leaf order.
    """
    column_distances = distance.pdist(df.T)
    tree = hierarchy.linkage(
        column_distances,
        method="single",
        metric="euclidean",
        optimal_ordering=False,
    )
    return df.iloc[:, hierarchy.leaves_list(tree)]
def __cluster_columns__(self, column_distance, column_linkage):
    """Cluster the columns of self.data and reorder data/header to match.

    The data is transposed so that columns become rows, clustered with
    fastcluster using the given distance metric and linkage method, then
    transposed back and reordered by the resulting leaf order. When
    self.missing_values is not False, missing values are imputed before
    clustering and restored afterwards. self.original_data and (if set)
    self.header are reordered with the same leaf order.
    """
    # Work on the transpose: one list per column.
    self.data = [list(column) for column in zip(*self.data)]

    if self.missing_values is not False:
        self.data, missing_values_indexes = \
            self.__impute_missing_values__(self.data)

    self.column_clustering = fastcluster.linkage(
        self.data, method=column_linkage, metric=column_distance)
    self.data_order = hcluster.leaves_list(self.column_clustering)

    if self.missing_values is not False:
        self.data = self.__return_missing_values__(
            self.data, missing_values_indexes)

    # Back to row-major layout, then apply the column permutation.
    self.data = list(zip(*self.data))
    self.data = self.__reorder_data__(self.data, self.data_order)
    self.original_data = self.__reorder_data__(
        self.original_data, self.data_order)

    if self.header:
        reordered = self.__reorder_data__([self.header], self.data_order)
        self.header = reordered[0]
def plot_clustermap(self, rank, subgroups, index_with_cluster, linkage, d,
                    nmf_method):
    """Draw a seaborn clustermap of ``d`` coloured by subgroup and subtype.

    ``linkage`` is used for both the row and column dendrograms. Dendrogram
    branches are coloured from the subgroup colour map, looked up through
    ``index_with_cluster`` in leaf order; column colour bars come from the
    subtype colour information of ``self.filtered_reddy_dataset``. The row
    dendrogram is hidden, a title and axis labels are set, and subtype
    legends are added.
    """
    # generate colours for n subgroups
    colmap = self.generate_colmap_for_subgroups(rank, subgroups)

    # hacky way to get round null but need to sort this
    subtype_colour_info_list = []
    for entry in self.filtered_reddy_dataset.subtypes.subtype_colour_info_list:
        first, second = entry
        if first is not None and second is not None:
            subtype_colour_info_list.append((first, second))
    subtype_colourmaps = [pair[0] for pair in subtype_colour_info_list]

    # subgroup assignments in dendrogram order, used to colour the tree
    index_to_cluster = dict(index_with_cluster)
    branch_colours = [
        colmap[index_to_cluster[leaf]] for leaf in sch.leaves_list(linkage)
    ]

    g = sns.clustermap(
        d,
        metric='euclidean',
        row_linkage=linkage,
        col_linkage=linkage,
        col_colors=subtype_colourmaps,
        yticklabels=False,
        xticklabels=False,
        dendrogram_ratio=(0.1, 0.1),
        cbar_pos=(1, .5, .03, .3),
        cbar_kws={'label': 'Distance between patients in consensus matrix'},
        tree_kws={'colors': branch_colours},
    )

    # don't show row dendrogram
    g.ax_row_dendrogram.set_visible(False)

    # title and labels
    title = "".join([
        "Clustermap on ",
        nmf_method,
        " Consensus Matrix of ",
        str(self.subset_required[0]),
        " Patients (",
        str(self.subset_required[1]),
        " Genes) \n\n",
    ])
    g.fig.suptitle(title, y=1.02)
    g.ax_heatmap.set_xlabel("Patients")
    g.ax_heatmap.set_ylabel("Patients")

    # legends
    self.create_all_subtype_legends(g, subtype_colour_info_list)
def fig2_region_blocks(self):
    """Return three stacked-bar panels: phyla, pathways and AMR classes.

    Phyla and AMR tables are reduced with group_small_cols, annotated with
    sample and continent, melted to long form and stripped of missing rows
    and of rows whose continent is the string "Nan". Each panel is faceted
    by continent.
    """

    def to_long_by_continent(tbl):
        # Annotate in place, then melt to one row per (sample, variable).
        tbl['sample'] = tbl.index
        tbl['continent'] = self.meta['continent']
        long_tbl = tbl.melt(id_vars=['sample', 'continent'])
        long_tbl = long_tbl.dropna()
        return long_tbl.query('continent != "Nan"')

    def my_plot(tbl, label):
        panel_theme = theme(
            text=element_text(size=20),
            panel_grid_major=element_blank(),
            panel_grid_minor=element_blank(),
            legend_position='bottom',
            axis_text_x=element_blank(),
            axis_title_x=element_blank(),
            axis_text_y=element_blank(),
            axis_title_y=element_blank(),
            panel_border=element_rect(colour="black", size=2),
            figure_size=(32, 4),
        )
        plot = ggplot(
            tbl,
            aes(x='sample', y='value', fill='variable', group='continent'))
        components = (
            geom_col(),
            facet_grid('.~continent', scales="free"),
            scale_color_brewer(type='qualitative', palette=3, direction=1),
            theme_minimal(),
            scale_y_sqrt(expand=(0, 0)),
            labs(fill=label),
            panel_theme,
        )
        for component in components:
            plot = plot + component
        return plot

    phyla = group_small_cols(self.wide_phyla_rel, top=4)
    # NOTE(review): this sample ordering is computed but never used below.
    js_distances = self.tabler.beta_diversity(phyla, metric='jensenshannon')
    sample_order = phyla.index[
        leaves_list(linkage(squareform(js_distances), 'average'))]
    phyla = to_long_by_continent(phyla)

    amrs = to_long_by_continent(group_small_cols(self.amrs, top=5))

    return [
        my_plot(phyla, 'Phyla'),
        my_plot(self.function_groups, 'Pathways'),
        my_plot(amrs, 'AMR Class'),
    ]
def _reorder_dendrogram(z, dists, leaf_ordering):
    """Return the leaf order of linkage ``z`` under the requested strategy.

    ``leaf_ordering`` is one of 'optimal' (uses optimal_leaf_ordering with
    ``dists``), 'count_sort_ascending', 'count_sort_descending',
    'distance_sort_ascending' or 'distance_sort_descending' (all computed by
    dendrogram without plotting). Any other value raises ValueError.
    """
    if leaf_ordering == 'optimal':
        return leaves_list(optimal_leaf_ordering(z, dists))

    sort_modes = (
        ('count_sort_ascending', {'count_sort': 'ascending'}),
        ('count_sort_descending', {'count_sort': 'descending'}),
        ('distance_sort_ascending', {'distance_sort': 'ascending'}),
        ('distance_sort_descending', {'distance_sort': 'descending'}),
    )
    for mode_name, sort_kwargs in sort_modes:
        if leaf_ordering == mode_name:
            layout = dendrogram(z, get_leaves=True, no_plot=True,
                                no_labels=True, show_leaf_counts=False,
                                **sort_kwargs)
            return layout['leaves']

    raise ValueError('Unsupported leaf ordering')
def get_hierarchical_clustering_order(reads_filename, chromosomes=None):
    """Return a mapping from cell id to its position in a clustering order.

    The reads file is loaded in chunks, each chunk is pivoted to a
    cell-by-bin table of ``state`` values, and the chunks are concatenated
    and summed per cell. Missing values are replaced by -1 before the rows
    are clustered (ward linkage on cityblock distances). The result maps
    each row label to its rank among the dendrogram leaves.
    """
    chunksize = 10 ** 5
    pivoted_chunks = []
    reader = csvutils.read_csv_and_yaml(reads_filename, chunksize=chunksize)
    for chunk in reader:
        chunk["bin"] = list(zip(chunk.chr, chunk.start, chunk.end))
        # for some reason pivot doesnt like an Int64 state col
        chunk['state'] = chunk['state'].astype('float')
        pivoted_chunks.append(
            chunk.pivot(index='cell_id', columns='bin', values='state'))

    # merge chunks, sum cells that get split across chunks
    table = pd.concat(pivoted_chunks)
    table = table.groupby(table.index).sum()

    bins = pd.DataFrame(table.columns.values.tolist(),
                        columns=['chr', 'start', 'end'])
    bins['chr'] = bins['chr'].astype(str)
    bins = sort_bins(bins, chromosomes)
    table = table.sort_values(bins, axis=0)

    data_mat = np.array(table.values)
    data_mat[np.isnan(data_mat)] = -1

    cell_distances = sp.distance.pdist(data_mat, 'cityblock')
    leaves = hc.leaves_list(hc.linkage(cell_distances, method='ward'))

    cell_ids = table.index
    return {cell_ids[leaf]: rank for rank, leaf in enumerate(leaves)}
def tree_sort(old_l, distances, return_leaves=True):
    """Sort a list by the leaf order of an average-linkage tree.

    ``distances`` is a square distance matrix with one row per item of
    ``old_l``. A single-item list is returned unchanged (leaf order ``[0]``).
    Returns the reordered list, plus the leaf order when ``return_leaves``
    is true.
    """
    assert len(distances) == len(old_l)

    if len(old_l) == 1:
        leaves = [0]
    else:
        condensed = distance.squareform(distances, checks=True)
        leaves = hierarchy.leaves_list(hierarchy.average(condensed))

    new_l = [old_l[position] for position in leaves]
    if return_leaves:
        return new_l, leaves
    return new_l
def reorder(C):
    """
    Reorder consensus matrix.

    The strict upper triangle of C is converted from similarities to
    distances (1 - value), clustered with average linkage, and the rows and
    columns of C are permuted by the reversed leaf order of the tree.

    :param C: Consensus matrix.
    :type C: `numpy.matrix`
    """
    # Row-major walk over the strict upper triangle gives the condensed form.
    similarities = np.array([
        C[row, col]
        for row in range(C.shape[0] - 1)
        for col in range(row + 1, C.shape[1])
    ])
    tree = linkage(1 - similarities, method='average')

    # Leaves from right to left.
    permutation = leaves_list(tree)[::-1]
    return C[:, permutation][permutation, :]
def hierarchical_clustering(norms, labels, font):
    """Cluster ``norms`` with complete linkage and draw the dendrogram.

    The metric is the module-level ``dist``. The dendrogram is drawn on a
    new 25x5 figure with ``labels`` on the x axis at font size ``font``;
    the figure is not shown here. Returns the leaf order and the linkage
    matrix.
    """
    Z = hac.linkage(norms, method='complete', metric=dist)

    plt.figure(figsize=(25, 5))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.ylabel('Distance')

    dendrogram_options = dict(
        labels=labels,
        leaf_rotation=90.,     # rotates the x axis labels
        leaf_font_size=font,   # font size for the x axis labels
    )
    hac.dendrogram(Z, **dendrogram_options)

    return leaves_list(Z), Z
def _compute_cluster_label_order(self, values, labels,
                                 dist_metric='euclidean',
                                 linkage_method='ward'):
    """Return ``labels`` arranged in hierarchical-clustering leaf order.

    ``values`` holds one observation per label. A single label is returned
    as-is without clustering; otherwise a new list is returned.
    """
    if len(labels) == 1:
        return labels

    tree = linkage(pdist(values, metric=dist_metric), method=linkage_method)
    return [labels[position] for position in leaves_list(tree)]
def cluster_patterns(patterns, n_clusters=9, cluster_track='seq_ic'):
    """Cluster patterns and build the corresponding pattern table.

    Computes the pattern similarity matrix on ``cluster_track``, builds a
    ward linkage (optimal leaf ordering) from ``1 - similarity``, cuts it
    into ``n_clusters`` clusters and derives each pattern's rank in leaf
    order.

    Returns:
        (similarity matrix, linkage matrix, cluster assignment per pattern,
        leaf rank per pattern, pattern table)
    """
    sim = similarity_matrix(patterns, track=cluster_track)

    tree = linkage(1 - sim, 'ward', optimal_ordering=True)
    assignments = cut_tree(tree, n_clusters=n_clusters)[:, 0]
    # argsort of the leaf list gives, for each pattern, its leaf position.
    leaf_rank = np.argsort(leaves_list(tree))

    table_options = dict(
        align_track='contrib/mean',
        logo_len=70,
        seqlogo_kwargs=dict(width=320),
        footprint_width=320,
        footprint_kwargs=dict(),
    )
    pattern_table = create_pattern_table(tree, leaf_rank, assignments,
                                         **table_options)
    return sim, tree, assignments, leaf_rank, pattern_table
def recipes_hie(nutrition_data, algr):
    """Show a ward dendrogram of the standardized data, then exit.

    The data is standardized, a ward linkage is built from its pairwise
    distances, the leaf order is printed and the dendrogram is shown. The
    function then calls exit(), so the agglomerative clustering and PCA
    scatter plot that follow are never executed.
    """
    scaled_data = StandardScaler().fit_transform(nutrition_data)

    # Dendrogram used to decide on the number of clusters.
    plt.figure(figsize=(25, 10))
    plt.title("Clusters determined by hierarchy - " + 'ward')
    ward_tree = shc.ward(pdist(scaled_data))
    print(shc.leaves_list(ward_tree))
    shc.dendrogram(ward_tree)
    plt.show()

    # NOTE(review): exit() stops the interpreter here; everything below is
    # unreachable. Looks like a debugging leftover - confirm before removing.
    exit()

    # Group the data points into a fixed number of clusters.
    model = AgglomerativeClustering(n_clusters=7, affinity='euclidean',
                                    linkage=algr)
    model.fit(scaled_data)
    y_pred = model.fit_predict(scaled_data)

    # Reduce to two principal components for visualization.
    pca = PCA(n_components=2)
    principalComponents = pca.fit_transform(scaled_data)
    principalDf = pd.DataFrame(
        data=principalComponents,
        columns=['principal component 1', 'principal component 2'])
    print(principalDf)

    fig = plt.figure(figsize=(20, 7))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title("Clusters determined by hierarchy - " + algr, fontsize=20)
    ax.scatter(principalComponents[:, 0], principalComponents[:, 1],
               c=y_pred, cmap='Paired')
    ax.grid()
    fig.show()
def reorder_consensus_matrix(M: np.array):
    """Reorder a consensus matrix by average-linkage clustering.

    The matrix is turned into distances (1 - M), clustered with average
    linkage, and its rows and columns are permuted by the reversed leaf
    order of the tree.

    Args:
        M (np.array): Input matrix

    Returns:
        np.array: Reordered output matrix
    """
    frame = pd.DataFrame(M)
    tree = linkage(squareform(1 - frame), method="average")
    permutation = leaves_list(tree)[::-1]

    axis_labels = frame.columns[permutation]
    permuted_values = frame.values[:, permutation][permutation, :]
    reordered = pd.DataFrame(permuted_values,
                             index=axis_labels,
                             columns=axis_labels)
    return reordered.values
def _order_rows(dataframe):
    '''
    Return the row labels in ward-linkage dendrogram leaf order.

    A dataframe with at most one row is returned in its existing order.

    >>> df = pd.DataFrame({
    ...     'cell-1': {'a':8, 'b':1, 'c': 7, 'd': 2},
    ...     'cell-2': {'a':1, 'b':1, 'c': 1, 'd': 1},
    ...     'cell-3': {'a':9, 'b':1, 'c': 8, 'd': 2},
    ...     'cell-4': {'a':1, 'b':2, 'c': 1, 'd': 1}
    ... })
    >>> _order_rows(df)
    ['a', 'c', 'b', 'd']
    '''
    row_labels = dataframe.index.tolist()
    # Clustering needs at least two rows.
    if len(row_labels) <= 1:
        return row_labels

    leaf_positions = leaves_list(linkage(dataframe, 'ward')).tolist()
    return [row_labels[position] for position in leaf_positions]
def _cluster_markers(markers, marker_labels, marker_groups_order, s, c):
    """Order the markers of each group by hierarchical clustering.

    For every group in ``marker_groups_order``, each marker selected by
    ``markers[group]`` is described by the concatenation of its ``s`` and
    ``c`` entries (those whose label in ``marker_labels`` equals the
    marker). The feature matrix is scaled by its overall norm, clustered,
    and the group is indexed by the resulting leaf order. The per-group
    results are concatenated and returned.
    """
    label_array = np.array(marker_labels)

    ordered_groups = []
    for marker_group in marker_groups_order:
        feature_rows = []
        for marker in markers[marker_group]:
            selected = label_array == marker
            feature_rows.append(np.concatenate([s[selected], c[selected]]))
        features = np.array(feature_rows)

        # normalize by the norm of the whole feature matrix
        features = features / np.sqrt(np.sum(features ** 2))

        leaf_order = hierarchy.leaves_list(hierarchy.linkage(features))
        # NOTE(review): this indexes the group key itself, not the marker
        # names; it only works if each group is an index array - confirm.
        ordered_groups.append(marker_group[leaf_order])

    return np.concatenate(ordered_groups)