Example #1
1
def random_distribution(n):
    """Build a random ``n`` x ``n`` matrix, cluster it and show a dendro-heatmap.

    The top-left quadrant is shifted by 75 and the bottom-right quadrant is
    replaced with Poisson samples. Rows and columns are then clustered
    hierarchically, the matrix is reordered by the two leaf orders, and the
    result is displayed through ``pdh.DendroHeatMap``.

    :param n: Number of rows and of columns of the generated matrix.
    """
    # Floor division keeps the slice bounds integral on Python 3 as well.
    half = n // 2

    # make up some data
    data = np.random.normal(scale=n, size=(n, n))
    data[0:half, 0:half] += 75
    data[half:, half:] = np.random.poisson(lam=n, size=data[half:, half:].shape)

    # cluster the rows
    # linkage() wants the condensed vector returned by pdist(); a square
    # distance matrix would be interpreted as raw observation vectors.
    row_Z = sch.linkage(ssd.pdist(data))
    row_idxing = sch.leaves_list(row_Z)

    row_labels = ['bar{}'.format(i) for i in range(n)]

    # cluster the columns
    col_Z = sch.linkage(ssd.pdist(data.T))
    col_idxing = sch.leaves_list(col_Z)

    col_labels = ['foo{}'.format(i) for i in range(n)]

    # reorder columns first, then rows, to match the dendrogram leaf orders
    data = data[:, col_idxing][row_idxing, :]

    # make the dendrogram
    heatmap = pdh.DendroHeatMap(heat_map_data=data,left_dendrogram=row_Z, top_dendrogram=col_Z, heatmap_colors=("#ffeda0", "#feb24c", "#f03b20"), window_size="auto", color_legend_displayed=False, label_color="#777777")
    heatmap.row_labels = row_labels
    heatmap.col_labels = col_labels
    heatmap.title = 'An example heatmap'
    heatmap.show()#heatmap.save("example.png")
Example #2
1
    def get_clustdist_path(self, feature_ids=None, labeling_name=None,
                           class_ids=None, vmin=-3.0, vmax=3.0, root_dir='.'):
        """Render the clustered feature-matrix heatmap and return its file path.

        The dataset is fetched with ``self.get_dataset``, rows and columns of
        the feature matrix are reordered by the leaf orders of
        ``self.clust_object`` / ``self.clust_feat``, and the figure is written
        by ``heatmap.heatmap_labeled_fig``.

        :param feature_ids: Forwarded to ``self.get_dataset``.
        :param labeling_name: Labeling to use; falsy values become 'one_class'.
        :param class_ids: Forwarded to ``self.get_dataset``.
        :param vmin: Forwarded to ``heatmap.heatmap_labeled_fig``.
        :param vmax: Forwarded to ``heatmap.heatmap_labeled_fig``.
        :param root_dir: Directory under which ``self.HEATMAP_D`` is created.
        :returns: Path of the written image file.
        """
        if not labeling_name:
            labeling_name = 'one_class'

        (fm, sample_names, feature_names, target, target_names) =\
            self.get_dataset(feature_ids, labeling_name, class_ids)

        d = os.path.join(root_dir, self.HEATMAP_D)
        if not os.path.exists(d):
            os.makedirs(d)
        img_format = 'png'
        file_path = os.path.join(d, 'fm_clustered.%s' % (img_format))

        # reorder feature matrix rows (objects)
        object_indices = hierarchy.leaves_list(self.clust_object(fm))
        fm = fm[object_indices, :]

        # reorder standardized feature matrix columns (feats)
        feat_indices = hierarchy.leaves_list(self.clust_feat(fm))
        fm = fm[:, feat_indices]

        # labels of the selected labeling, reordered like the matrix rows
        lablists = [[target[i] for i in object_indices]]
        class_names = [target_names]

        # reorder the feature and object ids
        fs = [feature_names[i] for i in feat_indices]
        # One name per reordered row; previously the whole sample_names list
        # was repeated for every row instead of being indexed.
        gs = [sample_names[i] for i in object_indices]

        heatmap.heatmap_labeled_fig(fm, fs, gs, lablists, class_names,
                                    file_path, vmin=vmin, vmax=vmax)

        return file_path
Example #3
0
def cluster(df, metric="euclidean", method="single", row=True, column=True):
    """Reorder a DataFrame by hierarchical clustering of its rows and/or columns.

    :param df: Data frame to cluster.
    :param metric: Distance metric handed to ``dist.pdist``.
    :param method: Linkage method handed to ``hier.linkage``.
    :param row: Cluster and reorder the rows when true.
    :param column: Cluster and reorder the columns when true.
    :returns: ``(reordered_df, row_linkage, column_linkage)``; a linkage is
        ``None`` when the corresponding axis was not clustered.
    """
    row_linkmat = None
    col_linkmat = None

    if row:
        row_linkmat = hier.linkage(dist.pdist(df, metric), method)
        row_order = hier.leaves_list(row_linkmat)
        df = df.iloc[row_order, :]

    if column:
        # Columns are clustered by working on the transpose, then flipping back.
        transposed = df.T
        col_linkmat = hier.linkage(dist.pdist(transposed, metric), method)
        col_order = hier.leaves_list(col_linkmat)
        df = transposed.iloc[col_order, :].T

    return df, row_linkmat, col_linkmat
def reorder(C):
    """Reorder a consensus (similarity) matrix by average-linkage clustering.

    Similarities are turned into distances with ``1 - C``; the strict upper
    triangle is handed to ``linkage`` as a condensed distance vector, and the
    reversed leaf order is applied to both axes of ``C``.

    :param C: Square, symmetric similarity matrix.
    :returns: ``C`` with rows and columns permuted by the reversed leaf order.
    """
    import numpy as np

    print('reorder...')
    distances = 1 - np.asarray(C)
    # linkage() expects a condensed vector; passing the square matrix would
    # cluster its rows as observation vectors instead of using the distances.
    condensed = distances[np.triu_indices(distances.shape[0], k=1)]
    Z = linkage(condensed, method='average')
    ivl = leaves_list(Z)[::-1]
    return C[:, ivl][ivl, :]
Example #5
0
 def check_leaves_list_iris(self, method):
     """Check ``leaves_list(Z)`` against the tree pre-order on the Iris data set.

     :param method: Linkage method name passed to ``linkage``.
     """
     X = eo['iris']
     # linkage() receives the observations directly, so the separate (and
     # previously unused) pdist() result is not needed.
     Z = linkage(X, method)
     node = to_tree(Z)
     assert_equal(node.pre_order(), leaves_list(Z))
Example #6
0
def make_cdt_file(basename, data, clusters=None, sep_col=True):
    """Write ``data`` as a tab-separated CDT-style table to ``basename``.

    A copy of ``data`` gets the annotation columns GID, FBgn, NAME,
    CHROMOSOME, ARM, POSITION and GWEIGHT inserted in front. Per-row values
    come from the module-level ``fbgn_lookup`` and ``fbgn_map``.

    :param basename: Output path handed to ``DataFrame.to_csv``.
    :param data: DataFrame whose index holds the row identifiers.
    :param clusters: Optional linkage matrix; when given, rows are written in
        its leaf order.
    :param sep_col: When true, add an empty ``<prefix>_sep`` column per column
        prefix (text before ``"_sl"``) and sort the columns by name.
    """
    data = data.copy()
    if sep_col:
        # NOTE(review): for a column without "_sl", find() returns -1 and the
        # prefix becomes the name minus its last character - confirm intent.
        prefixes = set(col[: col.find("_sl")] for col in data.columns)
        for prefix in prefixes:
            # Explicit dtype: a dtype-less empty Series is ambiguous across
            # pandas versions; the column is all-NaN either way.
            data[prefix + "_sep"] = pd.Series(dtype=float)
        data = data.sort_index(axis=1)

    data.insert(0, "GID", "NONE")
    data.insert(1, "FBgn", data.index)
    data.insert(2, "NAME", data.index)
    data.insert(3, "CHROMOSOME", "NONE")
    data.insert(4, "ARM", "L")
    data.insert(5, "POSITION", 0)
    data.insert(6, "GWEIGHT", 1.0)

    # .ix was removed from pandas; label-based access is .loc.
    for i, row in enumerate(data.index):
        data.loc[row, "GID"] = "GENE{}X".format(i)
        data.loc[row, "FBgn"] = fbgn_lookup.get(row, "???")
        if row in fbgn_map:
            # The map value is split as "<chrom>:<start>..<end>".
            pos = fbgn_map[row].split("..")[0]
            chrom, pos = pos.split(":")
            arm = "R" if chrom.endswith("R") else "L"
            if chrom[-1] in "RL":
                chrom = chrom[:-1]
            data.loc[row, "CHROMOSOME"] = chrom
            data.loc[row, "ARM"] = arm
            data.loc[row, "POSITION"] = int(pos)

    if clusters is not None:
        # leaves_list() yields row positions, so select positionally.
        data = data.iloc[hierarchy.leaves_list(clusters)]
    data.to_csv(basename, sep="\t", index=False, float_format="%.5f")
Example #7
0
def rearrange(X, optimal = True, method = "average"):
    """Return a leaf ordering for the items of a square distance matrix.

    ``X`` is condensed with ``squareform`` and clustered with ``linkage``.
    When ``optimal`` is true the order is refined by ``barjoseph.optimal``,
    using ``exp(-X[i][j])`` as the pairwise score.

    :param X: Square distance matrix (indexable as ``X[i][j]``).
    :param optimal: Apply the Bar-Joseph optimal leaf ordering.
    :param method: Linkage method passed to ``linkage``.
    :returns: List of leaf indices, a permutation of ``range(len(X))``.
    :raises AssertionError: If the leaf count differs from ``len(X)`` or the
        result is not a permutation.
    """
    Y = squareform(X, force="tovector")
    # Plain tuples with merge distances clamped at zero.
    Z = [(int(l), int(r), max(0., d), int(n))
         for (l, r, d, n) in linkage(Y, method=method, metric=None)]

    leaves = list(leaves_list(Z))
    N      = len(leaves)
    # The last merge has id (number of merges) + N - 1.
    root   = len(Z)+N-1

    assert len(X) == N

    # bar-joseph optimal ordering
    if optimal:
        import barjoseph
        # Ids below N are leaves; id i >= N refers to merge row Z[i - N].
        leaves = barjoseph.optimal(root, **{
            "S":        lambda i, j: exp(-X[i][j]),
            "left":     lambda i: None if i < N else Z[i-N][0],
            "right":    lambda i: None if i < N else Z[i-N][1],
            "is_leaf":  lambda i: i < N,
            "is_empty": lambda v: v is None,
        })

    assert list(sorted(leaves)) == list(range(N))

    return leaves
Example #8
0
def make_cdt_file(basename, data, clusters=None, sep_col = True):
    """Write ``data`` as a tab-separated CDT-style table to ``basename``.

    A copy of ``data`` gets the annotation columns GID, FBgn, NAME,
    CHROMOSOME, ARM, POSITION and GWEIGHT inserted in front. Per-row values
    come from the module-level ``fbgn_lookup`` and ``fbgn_map``.

    :param basename: Output path handed to ``DataFrame.to_csv``.
    :param data: DataFrame whose index holds the row identifiers.
    :param clusters: Optional linkage matrix; when given, rows are written in
        its leaf order.
    :param sep_col: When true, add an empty ``<prefix>_sep`` column per column
        prefix (text before ``'_sl'``) and sort the columns by name.
    """
    data = data.copy()
    if sep_col:
        prefixes = set(col[:col.find('_sl')] for col in data.columns)
        for prefix in prefixes:
            # Explicit dtype keeps the all-NaN column float across pandas
            # versions.
            data[prefix+"_sep"] = pd.Series(dtype=float)
        data = data.sort_index(axis=1)

    data.insert(0, 'GID', 'NONE')
    data.insert(1, 'FBgn', data.index)
    data.insert(2, 'NAME', data.index)
    data.insert(3, 'CHROMOSOME', 'NONE')
    data.insert(4, 'ARM', 'L')
    data.insert(5, 'POSITION', 0)
    data.insert(6, 'GWEIGHT', 1.0)

    # DataFrame.ix no longer exists in pandas; use label-based .loc here.
    for i, row in enumerate(data.index):
        data.loc[row, 'GID'] = 'GENE{}X'.format(i)
        data.loc[row, 'FBgn'] = fbgn_lookup.get(row, '???')
        if row in fbgn_map:
            # Keep the start coordinate of "<chrom>:<start>..<end>".
            pos = fbgn_map[row].split('..')[0]
            chrom, pos = pos.split(':')
            arm = 'R' if chrom.endswith('R') else 'L'
            if chrom[-1] in 'RL':
                chrom = chrom[:-1]
            data.loc[row, 'CHROMOSOME'] = chrom
            data.loc[row, 'ARM'] = arm
            data.loc[row, 'POSITION'] = int(pos)

    if clusters is not None:
        # Leaf order is a list of row positions, hence positional .iloc.
        data = data.iloc[hierarchy.leaves_list(clusters)]
    data.to_csv(basename, sep='\t', index=False, float_format='%.5f')
 def get_factor_reorder(self, c, rotate='oblimin'):
     """Return an ordering of the ``c`` factors based on their correlation matrix.

     The 'Phi' attribute of the stored rotation output is converted to
     distances (``1 - phi`` rounded to 3 decimals) and clustered; the leaf
     order is returned reversed. Without a Phi matrix the identity order
     ``[0, ..., c-1]`` is returned.
     """
     rotation_output = self.results['factor_tree_Rout_%s' % rotate][c]
     phi = get_attr(rotation_output, 'Phi')
     if phi is None:
         return list(range(c))

     condensed = squareform(np.round(1-phi,3))
     order = list(leaves_list(linkage(condensed)))
     # reversing because it works better for task EFA
     order.reverse()
     return order
Example #10
0
  def to_dict(self, correlation_matrix, linkage_matrix):
    """Describe every merge of ``linkage_matrix`` as a dictionary.

    :param correlation_matrix: Not used by this method.
    :param linkage_matrix: Linkage matrix as accepted by
        ``scipy.cluster.hierarchy.to_tree``.
    :returns: Dict keyed by 1-based merge number. Each value holds
        'datasets' (sorted 1-based leaf ids under the merge) and 'height'
        (third column of the matching linkage row).
    """
    from scipy.cluster import hierarchy
    tree = hierarchy.to_tree(linkage_matrix, rd=False)

    d = {}

    # http://w3facility.org/question/scipy-dendrogram-to-json-for-d3-js-tree-visualisation/
    # https://gist.github.com/mdml/7537455

    def add_node(node):
      if node.is_leaf(): return
      # Merge ids start right after the leaf ids (len(linkage_matrix) + 1
      # leaves), so this offset gives the merge's row in the linkage matrix.
      cluster_id = node.get_id() - len(linkage_matrix) - 1
      row = linkage_matrix[cluster_id]
      d[cluster_id+1] = {
        'datasets': [i+1 for i in sorted(node.pre_order())],
        'height': row[2],
      }

      # Recursively add the current node's children
      if node.left: add_node(node.left)
      if node.right: add_node(node.right)

    add_node(tree)

    return d
def plot_correlations(booklist):
    """Plot a clustered heatmap of the Gram matrix of per-book value vectors.

    Each book is turned into a numeric vector via ``gematriaze(words(book))``,
    all vectors are truncated to the shortest one, and their pairwise dot
    products are clustered and drawn with ``pcolormesh``.

    :param booklist: Books to include; when empty, every unique value of
        ``tanach['book']`` is used.
    """
    from mpl_toolkits.axes_grid1 import make_axes_locatable
    fig, ax = plt.subplots(figsize=(20,20))

    books = booklist if len(booklist)>0 else np.unique(np.array(tanach['book']))
    mesh = [gematriaze(words(b)) for b in books]

    # A generator is used because min(*[x]) with a single element would try
    # to iterate over an int and raise TypeError.
    minsize = min(len(row) for row in mesh)
    mesh = [row[0:minsize] for row in mesh]
    meshnum = np.array(mesh)

    plot_matr = np.dot(meshnum, meshnum.T)

    # NOTE(review): the rows of the Gram matrix are clustered as observation
    # vectors here; confirm that this is the intended distance.
    Z = sch.linkage(plot_matr)
    leaves = sch.leaves_list(Z)

    plot_matr = plot_matr[leaves][:,leaves]

    ax.set_yticks(np.arange(len(books))+0.5)
    ax.set_yticklabels(np.array(books)[leaves], fontsize=20)

    ax.set_xticks(np.arange(len(books))+0.5)
    ax.set_xticklabels(np.array(books)[leaves], rotation='vertical',fontsize=20)
    pc = ax.pcolormesh(plot_matr)
    div = make_axes_locatable(ax)
    cax = div.append_axes("right", size="2%", pad=0.05)
    plt.colorbar(pc, cax=cax)

    fig.tight_layout()
Example #12
0
def classify_by_scores(M, threshold, loci, return_file_names=None):
    """Group items into clusters and singletons by cutting an average-linkage tree.

    ``M`` is passed through ``ssd.squareform`` (presumably a square distance
    matrix - confirm against callers) and clustered. Walking the leaves in
    dendrogram order, each leaf is lifted to its highest ancestor whose
    parent distance is still below ``threshold``; the leaves under that
    ancestor form one group.

    :param M: Distance data accepted by ``ssd.squareform``.
    :param threshold: Merge distance below which nodes are grouped together.
    :param loci: Indexable by leaf id; items expose ``file_name`` (only used
        when ``return_file_names`` is truthy).
    :param return_file_names: When truthy, return file base names instead of
        leaf ids.
    :returns: ``(singles, clusters)``; clusters are sorted by size, largest
        first.
    """

    M_array = ssd.squareform(M)

    Z = linkage(M_array, method='average')

    root = to_tree(Z)
    # clone_graph is defined elsewhere; the loop below relies on the returned
    # nodes having .parent, .dist, .count and .id attributes.
    root = clone_graph(root)

    nodes = get_nodes(root)
    id2node = {node.id: node for node in nodes}

    leaf_ids = leaves_list(Z)

    # cnt and total_count are updated below but never read in this function.
    cnt = 0
    i = 0
    total_count = 1

    pool = []

    while True:
        cur_node = id2node[leaf_ids[i]]
        parent_dist = cur_node.parent.dist

        # Climb while the parent merge is still below the threshold.
        # NOTE(review): nothing here guards against reaching the root; this
        # assumes the root's parent is handled by clone_graph - confirm.
        while parent_dist < threshold:
            cur_node = cur_node.parent
            parent_dist = cur_node.parent.dist

        cur_leaf_ids = get_leaves(cur_node)

        pool.append([id for id in cur_leaf_ids])

        total_count += cur_node.count

        # Skip past every leaf that was just grouped.
        i += len(cur_leaf_ids)

        # NOTE(review): with "len(leaf_ids)-1" the loop also stops when exactly
        # one leaf is left unprocessed - possible off-by-one, confirm intent.
        if i >= len(leaf_ids)-1:
            break
        cnt += 1

    clusters = [l for l in pool if len(l) > 1]
    singles = [l[0] for l in pool if len(l) == 1]

    clusters = sorted(clusters, key=lambda x: len(x), reverse=True)

    if return_file_names:

        clusters_fn = []

        for cluster in clusters:

            clusters_fn.append([os.path.basename(loci[i].file_name) for i in cluster])

        singles_fn = [ os.path.basename(loci[i].file_name) for i in singles]

        return singles_fn, clusters_fn

    else:

        return singles, clusters
Example #13
0
def _get_cluster(components, my_inds=None):
    if my_inds is None:
        my_inds = list(components.keys())
    dist = distance.pdist([components[ind] for ind in my_inds])
    hcomp = hierarchy.complete(dist)
    ll = hierarchy.leaves_list(hcomp)
    return ll
Example #14
0
 def hierarchial_cluster(self, method, metric):
     """Reorder ``perc_ids``, ``perc_aln`` and ``genomes`` by clustering ``perc_ids``.

     The rows of ``self.perc_ids`` are clustered with the given linkage
     ``method`` and ``metric``; the leaf order is applied to both axes of
     the two matrices and to ``self.genomes``.
     """
     tree = hierarchy.linkage(self.perc_ids, method=method, metric=metric)
     order = hierarchy.leaves_list(tree)
     # Rows first, then columns, for both square matrices.
     self.perc_ids = self.perc_ids[order, :][:, order]
     self.perc_aln = self.perc_aln[order, :][:, order]
     self.genomes = self.genomes[order]
Example #15
0
 def cluster_rows(self, method="ward"):
     """Store the hierarchical-clustering leaf order of ``display_data`` rows.

     Sets ``self.row_order``; does nothing when there are fewer than two rows.
     """
     if len(self.display_data) < 2:
         # don't attempt to cluster less than 2 rows
         return
     self.row_order = leaves_list(linkage(self.display_data, method))
Example #16
0
def cluster(matrix):
    """Reorder a square matrix by average-linkage clustering of its rows.

    :param matrix: Square array; its rows are the clustered observations.
    :returns: ``(leaves, reordered)`` where ``reordered`` has the leaf order
        applied to the rows and then to the columns.
    """
    tree = hier.linkage(matrix, method='average')
    leaves = hier.leaves_list(tree)
    reordered = matrix[leaves, :][:, leaves]
    return leaves, reordered
Example #17
0
def plot_zmatrix(ax, zmatrix):
    """Draw ``zmatrix`` on ``ax`` with rows and columns in clustered order.

    :param ax: Axes-like object providing ``imshow``.
    :param zmatrix: Square array; its rows are clustered with ``hier.linkage``.
    :returns: The leaf order that was applied to both axes.
    """
    from matplotlib import pylab

    tree = hier.linkage(zmatrix)
    leaf_order = np.array(hier.leaves_list(tree))

    reordered = zmatrix[leaf_order][:, leaf_order]
    ax.imshow(reordered, interpolation='nearest', cmap=pylab.cm.Greys)
    return leaf_order
Example #18
0
 def __cluster_columns__(self, column_distance, column_linkage):
     """Cluster the columns of ``self.data`` and reorder all column-wise state.

     Stores the linkage in ``self.column_clustering`` and the leaf order in
     ``self.data_order``. Then ``self.data``, ``self.original_data`` and, when
     present, ``self.header`` are reordered via ``self.__reorder_data__``.

     :param column_distance: Metric name passed to ``fastcluster.linkage``.
     :param column_linkage: Linkage method passed to ``fastcluster.linkage``.
     """
     # zip() is a one-shot iterator on Python 3; materialise the transposed
     # rows so the clustering routine receives a real sequence.
     columns = list(zip(*self.data))
     self.column_clustering = fastcluster.linkage(columns, method=column_linkage, metric=column_distance)
     self.data_order = hcluster.leaves_list(self.column_clustering)
     self.data = self.__reorder_data__(self.data, self.data_order)
     self.original_data = self.__reorder_data__(self.original_data, self.data_order)
     if self.header:
         # The header is a single row, so wrap it to reuse the row-wise helper.
         self.header = self.__reorder_data__([self.header], self.data_order)[0]
def heatmap_plot_zscore_bigneuron(df_zscore_features, df_all, output_dir, title=None):
    """Save a clustered z-score feature heatmap and return the sample linkage.

    Samples (rows of ``df_zscore_features``) are clustered with Ward linkage
    and shown as heatmap columns, coloured by ``df_all['nt_type']``. Features
    are ordered by their own Ward clustering. The figure is written to
    ``<output_dir>/zscore_feature_heatmap.png``.

    :param df_zscore_features: DataFrame of z-scored features (samples x
        features).
    :param df_all: DataFrame providing the 'nt_type' column used for colours.
    :param output_dir: Directory prefix of the output image.
    :param title: Optional figure title.
    :returns: Linkage matrix of the samples.
    """
    # Single-argument print() calls behave the same on Python 2 and 3.
    print("heatmap plot:bigneuron")

    #taiwan
    metric ='nt_type'
    mtypes = np.unique(df_all[metric])
    print(mtypes)
    mtypes_pal = sns.color_palette("hls", len(mtypes))

    mtypes_lut = dict(zip(mtypes, mtypes_pal))
    mtypes_colors = df_all[metric].map(mtypes_lut)

    sample_linkage = hierarchy.linkage(df_zscore_features, method='ward', metric='euclidean')

    # Order the features (rows after transposing) by their own clustering.
    data = df_zscore_features.transpose()
    row_linkage = hierarchy.linkage(data, method='ward', metric='euclidean')
    feature_order = hierarchy.leaves_list(row_linkage)

    matchIndex = [data.index[x] for x in feature_order]
    data = data.reindex(matchIndex)

    pl.figure()
    # row_cluster=False keeps the feature order computed above.
    g = sns.clustermap(data, row_cluster = False, col_linkage=sample_linkage, method='ward', metric='euclidean',
                       linewidths = 0.0,col_colors = [mtypes_colors],
                       cmap = sns.cubehelix_palette(light=1, as_cmap=True),figsize=(40,10))

    pl.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    pl.setp(g.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
    pl.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.95)  # !!!!!

    if title:
        pl.title(title)

    location ="best"
    num_cols=1
    # Legend for the column colours: zero-size bars only act as legend handles.
    for label in mtypes:
        g.ax_row_dendrogram.bar(0, 0, color=mtypes_lut[label], label=label, linewidth=0.0)
    if len(mtypes) > 0:
        # One legend call after all handles exist (was rebuilt per label).
        g.ax_row_dendrogram.legend(loc=location, ncol=num_cols,borderpad=0)

    filename = output_dir + '/zscore_feature_heatmap.png'
    pl.savefig(filename, dpi=300)
    print("save zscore matrix heatmap figure to :" + filename)
    pl.close()
    print("done clustering and heatmap plotting")
    return sample_linkage
Example #20
0
def matrix_tree(data,color):
    """Cluster rows and columns of ``data`` and hand the result to ``plot_matrix_tree``.

    :param data: DataFrame; rows are treated as conditions, columns as
        features.
    :param color: Passed through unchanged to ``plot_matrix_tree``.
    """
    values = data.values

    condition_link = linkage(values)
    feature_link = linkage(values.T)
    condition_order = leaves_list(condition_link)
    feature_order = leaves_list(feature_link)

    # Features end up on the rows and conditions on the columns.
    color_matrix = values.T[feature_order, :][:, condition_order]
    condition_names = data.index.values[condition_order]
    feature_names = data.columns.values[feature_order]

    plot_matrix_tree(color_matrix, condition_link, feature_link,
                     condition_names, feature_names, color)
Example #21
0
def reorder(C):
    """
    Permute a consensus matrix so that similar items become adjacent.

    Distances ``1 - C`` are condensed with ``squareform`` and clustered with
    average linkage; the reversed leaf order is applied to both axes.

    :param C: Consensus matrix.
    :type C: `numpy.ndarray`
    """
    condensed = squareform(1 - C)
    tree = linkage(condensed, method='average')
    order = leaves_list(tree)[::-1]
    return C[:, order][order, :]
def hclustering(dataArray, method, p = None):
    """Cluster the rows of ``dataArray`` and reorder both of its axes.

    :param dataArray: 2-D array; rows are the observations. The leaf order is
        applied to columns and rows, so it needs at least as many columns as
        rows.
    :param method: Distance metric name passed to ``pdist``.
    :param p: Optional ``p`` parameter for the metric (e.g. Minkowski order).
    :returns: ``(heatmapOrder, orderedDataMatrix, distanceSquareMatrix)``.
    """
    if p is not None:
        # p must be a keyword argument; current scipy rejects extra
        # positional arguments to pdist().
        distanceMatrix = pdist(dataArray, method, p=p)
    else:
        distanceMatrix = pdist(dataArray, method)
    distanceSquareMatrix = squareform(distanceMatrix)
    # linkage() takes the condensed vector; the square form would be treated
    # as a set of observation vectors rather than as distances.
    linkageMatrix = hier.linkage(distanceMatrix)
    heatmapOrder = hier.leaves_list(linkageMatrix)
    orderedDataMatrix = dataArray[:, heatmapOrder]
    orderedDataMatrix = orderedDataMatrix[heatmapOrder, :]
    return heatmapOrder, orderedDataMatrix, distanceSquareMatrix
Example #23
0
    def plot_polar(self, data, n_top=3, overplot=False, labels=None,
                   palette='husl'):
        """Draw one polar area plot per column of ``data``.

        :param data: DataFrame; each column becomes one panel (or one overlaid
            layer) and each selected row one angular position.
        :param n_top: When ``labels`` is None, the ``n_top`` highest rows of
            every column are selected.
        :param overplot: Draw all columns on a single polar axis.
        :param labels: Row labels to plot; derived from ``n_top`` when None.
        :param palette: Seaborn palette name for the panel colours.
        :returns: The ``plt`` module, after ``tight_layout``.
        """
        n_panels = data.shape[1]

        if labels is None:
            labels = []
            for i in range(n_panels):
                # Series.order() was removed from pandas; sort_values() is
                # its replacement.
                labels.extend(data.iloc[:, i].sort_values(ascending=False)
                              .index[:n_top])
            labels = np.unique(labels)

        data = data.loc[labels, :]

        # Use hierarchical clustering to order
        from scipy.spatial.distance import pdist
        from scipy.cluster.hierarchy import linkage, leaves_list
        dists = pdist(data, metric='correlation')
        pairs = linkage(dists)
        order = leaves_list(pairs)
        data = data.iloc[order, :]
        labels = [labels[i] for i in order]

        theta = np.linspace(0.0, 2 * np.pi, len(labels), endpoint=False)
        if overplot:
            fig, ax = plt.subplots(1, 1, subplot_kw=dict(polar=True))
            fig.set_size_inches(10, 10)
        else:
            fig, axes = plt.subplots(1, n_panels, sharex=False, sharey=False,
                                     subplot_kw=dict(polar=True))
            # With a single panel plt.subplots returns a bare Axes, not an
            # array; normalise so axes[i] works for any n_panels.
            axes = np.atleast_1d(axes)
            fig.set_size_inches((6 * n_panels, 6))
        # A bit silly to import seaborn just for this...
        # should extract just the color_palette functionality.
        import seaborn as sns
        colors = sns.color_palette(palette, n_panels)
        for i in range(n_panels):
            if overplot:
                alpha = 0.2
            else:
                ax = axes[i]
                alpha = 0.8
            ax.set_ylim(data.values.min(), data.values.max())
            d = data.iloc[:, i].values
            # Filled area first, then an unfilled outline in the same colour.
            ax.fill(theta, d, color=colors[i], alpha=alpha, ec='k',
                    linewidth=0)
            ax.fill(theta, d, alpha=1.0, ec=colors[i],
                    linewidth=2, fill=False)
            ax.set_xticks(theta)
            ax.set_xticklabels(labels, fontsize=18)
            for lab in ax.get_yticklabels():
                lab.set_fontsize(18)
            ax.set_title('Cluster %d' % i, fontsize=22, y=1.12)
        plt.tight_layout()
        return plt
def plot_distances(model):
    """Show the pairwise document distance matrix in clustered order.

    Rows of ``model.doc_topic_`` are clustered; distances are recomputed on
    the reordered rows and displayed with ``imshow``.

    :param model: Object exposing a 2-D ``doc_topic_`` array.
    :returns: The leaf order applied to the documents.
    """
    from matplotlib import pyplot as plt
    from scipy.cluster import hierarchy
    from scipy.spatial.distance import pdist, squareform

    tree = hierarchy.linkage(pdist(model.doc_topic_))
    doc_order = hierarchy.leaves_list(tree)

    reordered_distances = pdist(model.doc_topic_[doc_order, :])
    plt.imshow(squareform(reordered_distances), interpolation='none')
    plt.colorbar()
    plt.show()
    return doc_order
Example #25
0
def plot_delta(x,deltas,mean=True,probability=False,cluster=False,plot_cluster=False,cluster_kwargs={},ytick_filter=lambda x: x):
    """Plot a heatmap with one row per entry of ``deltas``.

    :param x: 1-D array of x positions; its length sets the heatmap width and
        five of its values label the x ticks.
    :param deltas: Mapping ``key -> (mu, var)``.
    :param mean: Plot ``mu`` when true. Otherwise plot
        ``1 - norm.cdf(0, mu, sqrt(var))``, with values within 0.475 of 0.5
        flattened to 0.5.
    :param probability: With ``plot_cluster``, draw a labelled probability
        colour bar instead of the default one.
    :param cluster: Reorder the rows by hierarchical clustering.
    :param plot_cluster: Reserve a panel for the dendrogram (drawn only when
        ``cluster`` is also true).
    :param cluster_kwargs: Extra keyword arguments for ``linkage`` (only read,
        never modified).
    :param ytick_filter: Maps each key of ``deltas`` to its y tick label.
    """
    keys = list(deltas.keys())
    p = len(keys)
    n = x.shape[0]
    a = np.zeros((p,n))
    yticks = [ytick_filter(k) for k in keys]

    for row, k in enumerate(keys):
        mu,var = deltas[k]

        if mean:
            a[row,:] = mu
        else:
            a[row,:] = 1-scipy.stats.norm.cdf(0,mu,np.sqrt(var))
            # Probabilities in (0.025, 0.975) are flattened to 0.5.
            a[np.abs(a-.5) < .475] = 0.5

    if cluster:
        l = linkage(a,**cluster_kwargs)
        ind = leaves_list(l)
        a = a[ind,:]
        yticks = [yticks[j] for j in ind]

        if plot_cluster:
            ax = plt.subplot2grid((1,6),(0,0),colspan=1,rowspan=1)
            dendrogram(l,no_labels=True,orientation='left',ax=ax)

    if mean:
        # Symmetric colour range around zero.
        lim = np.max(np.abs(a))
        vmin = -lim
        vmax = lim
    else:
        vmin = 0
        vmax = 1

    if plot_cluster:
        ax = plt.subplot2grid((1,6),(0,1),colspan=4,rowspan=1)
    else:
        ax = plt.subplot2grid((1,5),(0,0),colspan=4,rowspan=1)

    plt.imshow(a,cmap="RdBu",interpolation="none",vmin=vmin,vmax=vmax,origin='lower',aspect="auto")
    plt.yticks(range(p),yticks)
    tick_positions = np.arange(0,n,1.*n/5)
    # The positions are floats; cast to int before indexing x, since numpy
    # rejects float indices.
    plt.xticks(tick_positions,[x[int(j)].round(2) for j in tick_positions])

    if plot_cluster:
        if probability:
            cbarAx,kwargs = mpl.colorbar.make_axes(ax)
            cbar = mpl.colorbar.ColorbarBase(cbarAx,cmap='RdBu',ticks=[0,.5,1],**kwargs)
            cbar.ax.set_yticklabels(['p(less\n than parent)\n>97.5%', 'no difference', 'p(greater\n than parent)\n>97.5%'],fontsize=15)
        else:
            plt.colorbar()
    else:
        plt.colorbar()
def ClusterSimilarityMatrix(sim_mat, method='average'):
   """Seriate a similarity matrix by hierarchical clustering.

   ``1.0 - sim_mat`` is condensed and clustered. The upper triangle of the
   output is read from ``sim_mat`` in leaf order and mirrored below; the
   diagonal is copied from ``sim_mat`` unchanged.

   :param sim_mat: Square similarity array.
   :param method: Linkage method passed to ``hcluster.linkage``.
   :returns: ``(seriated_sim, res_order, res_linkage)``.
   """
   size = len(sim_mat)
   condensed = ssd.squareform(1.0-sim_mat)
   res_linkage = hcluster.linkage(condensed, method=method)
   res_order = hcluster.leaves_list(res_linkage)

   upper_rows, upper_cols = np.triu_indices(size, k=1)
   source_rows = [res_order[r] for r in upper_rows]
   source_cols = [res_order[c] for c in upper_cols]

   seriated_sim = np.zeros((size, size))
   seriated_sim[upper_rows, upper_cols] = sim_mat[source_rows, source_cols]
   # Mirror the upper triangle into the lower one.
   seriated_sim[upper_cols, upper_rows] = seriated_sim[upper_rows, upper_cols]
   for k in range(size):
      seriated_sim[k, k] = sim_mat[k, k]

   return seriated_sim, res_order, res_linkage
def plot_heatmap(X, X_hat, mask, filename, data_transform, value_name):
    """Save a stacked heatmap of the original matrix and its completion.

    Rows and columns are ordered by average-linkage clustering of the
    correlation matrices of ``X``. The title reports availability, observed
    fraction, RMSE, r^2, matrix size and an approximate rank of ``X_hat``.

    :param X: DataFrame of original values (NaN where unavailable).
    :param X_hat: DataFrame of inferred values, indexed like ``X``.
    :param mask: Boolean array passed as the heatmap mask (stacked twice) and
        to the error metrics.
    :param filename: Output path for ``plt.savefig``.
    :param data_transform: Text used in the axis name.
    :param value_name: Text used in the axis name.
    """
    available = np.invert(np.isnan(X.values))
    rmse = calc_unobserved_rmse(X, X_hat.values, mask)
    r2 = calc_unobserved_r2(X, X_hat.values, mask)
    # Approximate rank: number of singular values needed to exceed 95% of the
    # squared spectrum of the mean-centred X_hat.
    _, s, _ = np.linalg.svd(X_hat - X_hat.values.mean())
    approx_rank = np.where(np.cumsum(s**2) > (s**2).sum() * 0.95)[0][0] + 1
    correlations = np.asarray(X.corr())
    correlations[np.isnan(correlations)] = 0
    col_linkage = linkage(distance.pdist(correlations), method='average')
    col_order = leaves_list(col_linkage)
    correlations = np.asarray(X.T.corr())
    correlations[np.isnan(correlations)] = 0
    row_linkage = linkage(distance.pdist(correlations), method='average')
    row_order = leaves_list(row_linkage)
    X_reorder = X.reindex(X.index[row_order])[X.columns[col_order]]
    Xhat_reorder = X_hat.reindex(X.index[row_order])[X.columns[col_order]]
    df = pd.concat([X_reorder, Xhat_reorder], keys=['original', 'inferred'])
    try:
        df = df.rename_axis(
            ['Unobserved',
             '%s %s' % (data_transform, value_name)])
    except Exception:
        # Axis naming is cosmetic and best-effort; unlike a bare "except:",
        # this lets KeyboardInterrupt and SystemExit propagate.
        pass
    df_mask = np.vstack([mask, mask])
    fig, ax = plt.subplots(figsize=(8, 12))
    _ = plt.title(
        '%.1f%% of entries available; %.1f%% observed; RMSE=%.3f; r^2=%.1f%%\nsize: %d x %d; approx. rank: %d'
        % (np.average(available) * 100, np.average(mask) * 100, rmse, r2 * 100,
           X.shape[0], X.shape[1], approx_rank),
        fontsize=10)
    ax = sns.heatmap(df, mask=df_mask, ax=ax, cmap=sns.cm.rocket_r)
    # Horizontal line between the original block and the inferred block.
    _ = ax.axhline(X.shape[0], color='blue')
    _ = plt.tight_layout()
    bottom, top = ax.get_ylim()
    ax.set_ylim(
        bottom + 0.5, top - 0.5
    )  # sorry, this may cut off the bottom row...some sort of matplotlib bug
    plt.savefig(filename)
    plt.close()
Example #28
0
def cluster_kmer_dists(kmers_dists, kmers_scores, kmers, out_pdf):
    ''' Cluster the k-mer distance matrix and plot it in leaf order.

    The reordered distances, scores and k-mers are handed to
    plot_kmer_dists together with out_pdf.
    '''
    # single-linkage clustering of the matrix rows
    tree = hierarchy.linkage(kmers_dists, method='single', metric='euclidean')
    leaf_order = hierarchy.leaves_list(tree)

    # apply the leaf order to rows, then to columns
    reordered = kmers_dists[leaf_order,:][:,leaf_order]

    plot_kmer_dists(reordered, kmers_scores[leaf_order], kmers[leaf_order], out_pdf)
Example #29
0
def clustering(data):
    """Cluster the rows of ``data`` and show dendrograms, a heatmap and a PCA scatter.

    Complete-linkage clustering is run on Euclidean distances. A figure with
    two dendrograms and the reordered distance matrix is built, then a
    thresholded dendrogram, then a 2-component PCA scatter coloured by the
    flat clusters cut at distance 26. Cluster ids and the leaf order are
    printed.

    :param data: DataFrame of sample vectors (one row per sample).
    """
    reduced_data = PCA(n_components=2).fit_transform(data)
    # DataFrame.as_matrix() was removed from pandas; .values is equivalent.
    data = data.values
    data_dist = pdist(data, metric='euclidean')  # computing the distance
    Y = linkage(data_dist, method='complete')
    fig = plt.figure(figsize=(8, 8))
    # x ywidth height
    ax1 = fig.add_axes([0.05, 0.1, 0.2, 0.6])
    Z1 = dendrogram(Y, orientation='right')
    ax1.set_xticks([])

    # Compute and plot second dendrogram.
    ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
    Z2 = dendrogram(Y)
    ax2.set_xticks([])
    ax2.set_yticks([])

    #Compute and plot the heatmap
    axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
    idx1 = Z1['leaves']
    idx2 = Z2['leaves']
    D = squareform(data_dist)
    D = D[idx1, :]
    D = D[:, idx2]
    im = axmatrix.matshow(D, aspect='auto', origin='lower', cmap=plt.cm.RdYlGn)
    axmatrix.set_xticks([])
    axmatrix.set_yticks([])
    # Plot colorbar.
    axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.6])
    plt.colorbar(im, cax=axcolor)

    #From the heatmap it is evident there are about 4 clusters.
    thres = 26
    plt.figure(figsize=(20, 12))
    dendrogram(Y, color_threshold=thres, show_leaf_counts=True)
    plt.yticks(np.arange(0, 35, step=0.5))
    plt.xticks([])
    clusters1 = fcluster(Y, t=thres, criterion='distance')
    # Scale the cluster ids into (0, 1] to use them as scatter colours.
    col = np.array(clusters1)
    col = col / float(np.max(col))
    col = np.array([round(x, 2) for x in col])
    # Single-argument print() gives the same output on Python 2 and 3.
    print(col)
    plt.figure()
    plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=col)
    plt.show()
    print("cluster1 %s" % (clusters1,))
    print("leaves %s" % (leaves_list(Y),))
Example #30
0
    def heatmap(self):
        """Save a column dendrogram and a row-clustered, standardized heatmap.

        Prompts on stdin for the two plot titles and writes "dendrogram.png"
        and "Heatmap_clustered.png". ``genes`` and ``timepoints`` are not
        defined in this method; presumably module-level sequences of row and
        column labels - verify.
        """
        print(genes)
        print(timepoints)
        matrix = self.values
        # Ward linkage over the rows (Z) and over the columns (ZT).
        Z = linkage(matrix, "ward")
        ZT = linkage(matrix.T, "ward")

        fig, ax = plt.subplots()

        title = input("Please input Clustered Dendrogram plot title: ")
        plt.title(title)

        plt.xlabel("sample")
        plt.ylabel("distance")
        dendrogram(ZT,
                   labels=timepoints,
                   show_leaf_counts=False,
                   leaf_rotation=90.,
                   leaf_font_size=12.,
                   show_contracted=True)

        fig.savefig("dendrogram.png")
        plt.close(fig)

        # Rows follow the clustering; columns keep their original order.
        idx_rows = leaves_list(Z)
        data = matrix[idx_rows, :]
        # idx_columns = leaves_list(ZT)
        # NOTE(review): hard-coded to the first 10 columns - confirm the
        # matrix always has exactly 10.
        idx_columns = range(10)
        data = data[:, idx_columns]

        # Standardize each column to zero mean and unit standard deviation.
        X = (data - np.average(data, axis=0)) / np.std(data, axis=0)

        m = np.max(np.abs(X))
        # y is not used below.
        y = np.arange(0, 100)

        fig, ax = plt.subplots()

        title = input("Please input Heatmap title: ")

        ax.set_title(title)
        im = ax.pcolor(X, cmap="viridis", vmin=-m, vmax=m - 0.5)
        ax.grid(False)
        ax.set_xticks(np.arange(0.5, X.shape[1] + 0.5), )
        ax.set_xticklabels(timepoints, rotation=50)
        # NOTE(review): the meaning of set_yticks' second positional argument
        # differs between matplotlib versions, and ticks are placed every 5
        # rows while all of `genes` is passed as labels - verify.
        ax.set_yticks(np.arange(0.5, len(genes), 5), genes)
        ax.set_yticklabels(genes)
        cbar = fig.colorbar(im, ax=ax)
        fig.subplots_adjust(left=0.05, bottom=0.15, right=1.0, top=0.95)
        fig.savefig("Heatmap_clustered.png")  # Save the image
        plt.close(fig)  # Close the canvas
        print("Clustered Heatmap for Complete...")
Example #31
0
    def clustdist_json(self, feature_ids=None, labeling_name=None,
                       class_ids=None):
        """Return the clustered feature matrix as a JSON string.

        The dataset comes from ``self.get_dataset``; rows and columns of the
        feature matrix are reordered by the leaf orders of
        ``self.clust_object`` / ``self.clust_feat``.

        :param feature_ids: Forwarded to ``self.get_dataset``.
        :param labeling_name: Labeling to use; falsy values become 'one_class'.
        :param class_ids: Forwarded to ``self.get_dataset``.
        :returns: JSON object with 'feature-names', 'object-labels',
            'class-names', 'max-value', 'min-value' and one value list per
            feature name.
        """
        if not labeling_name:
            labeling_name = 'one_class'

        # fm is normalized!
        (fm, sample_names, feature_names, target, target_names) =\
            self.get_dataset(feature_ids, labeling_name, class_ids)

        # reorder feature matrix rows (objects)
        object_indices = hierarchy.leaves_list(self.clust_object(fm))
        fm = fm[object_indices, :]

        # reorder standardized feature matrix columns (feats)
        feat_indices = hierarchy.leaves_list(self.clust_feat(fm))
        fm = fm[:, feat_indices]

        # labels of the selected labeling, reordered like the matrix rows
        lablist = [target[i] for i in object_indices]

        # reorder the feature ids
        fs = [feature_names[i] for i in feat_indices]
        # Object names are not needed on the client side yet. The removed
        # list repeated all of sample_names per row; if it is reinstated,
        # index it: [sample_names[i] for i in object_indices].

        json_data = {}
        json_data['feature-names'] = fs
        json_data['object-labels'] = lablist
        json_data['class-names'] = target_names
        json_data['max-value'] = fm.max()
        json_data['min-value'] = fm.min()
        # add feature list per feature-name
        for index, item in enumerate(fs):
            json_data[item] = list(fm[:, index])

        return json.dumps(json_data)
Example #32
0
def plot_heatmap(X, obs_frac, filename, combined_datasets=False):
    """Complete ``X`` from a random observation mask and save a stacked heatmap.

    A mask is drawn with ``get_mask``, the matrix is completed with
    ``complete_matrix``, and the original and inferred matrices are stacked
    in clustered row/column order.

    :param X: DataFrame of original values.
    :param obs_frac: Passed to ``get_mask``.
    :param filename: Output path for ``plt.savefig``.
    :param combined_datasets: Use the three-level axis naming when true.
    """
    available = np.invert(np.isnan(X.values))
    mask = get_mask(X, obs_frac)
    completed = complete_matrix(X, mask, offset=True)
    X_hat = pd.DataFrame(completed, index=X.index, columns=X.columns)
    rmse = calc_unobserved_rmse(X, X_hat.values, mask)
    r2 = calc_unobserved_r2(X, X_hat.values, mask)

    def _leaf_order(frame):
        # Average-linkage leaf order of the frame's correlation matrix,
        # with NaN correlations treated as zero.
        corr = np.asarray(frame.corr())
        corr[np.isnan(corr)] = 0
        tree = linkage(distance.pdist(corr), method='average')
        return leaves_list(tree)

    col_order = _leaf_order(X)
    row_order = _leaf_order(X.T)

    ordered_rows = X.index[row_order]
    ordered_cols = X.columns[col_order]
    X_reorder = X.reindex(ordered_rows)[ordered_cols]
    Xhat_reorder = X_hat.reindex(ordered_rows)[ordered_cols]
    df = pd.concat([X_reorder, Xhat_reorder], keys=['original', 'inferred'])

    if combined_datasets:
        axis_names = ['Unobserved', 'VaccineStatus', 'Neutralization[Titers]']
    else:
        axis_names = ['Unobserved', 'Neutralization[Titers]']
    df = df.rename_axis(axis_names)

    df_mask = np.vstack([mask, mask])
    fig, ax = plt.subplots(figsize=(8, 12))
    unobserved_pct = 100 - np.average(mask) * 100
    _ = plt.title(
        '%.1f%% of available entries unobserved; RMSE=%.3f; r^2=%.1f%%' %
        (unobserved_pct, rmse, r2 * 100))
    ax = sns.heatmap(df, mask=df_mask, ax=ax, cmap=sns.cm.rocket_r)
    # Horizontal line between the original block and the inferred block.
    _ = ax.axhline(X.shape[0], color='blue')
    _ = plt.tight_layout()
    bottom, top = ax.get_ylim()
    # sorry, this may cut off the bottom row...some sort of matplotlib bug
    ax.set_ylim(bottom + 0.5, top - 0.5)
    plt.savefig(filename)
    plt.close()
Example #33
0
def hierarchicalReorder(inMatrix, useCorr = True):
    """Reorder the columns and then the rows of a matrix by clustering.

    Each pass transposes the working matrix, clusters its rows (on their
    correlation matrix when ``useCorr`` is true, otherwise on the raw rows)
    and applies the leaf order. After two passes the matrix is back in its
    original orientation.

    :param inMatrix: 2-D array-like.
    :param useCorr: Cluster on ``np.corrcoef`` of the rows rather than on the
        rows themselves.
    :returns: ``(reordered, row_order, column_order)``.
    """
    work = np.array(inMatrix)
    orders = []
    for _ in range(2):
        work = work.T
        features = np.corrcoef(work) if useCorr else np.array(work)
        order = hierarchy.leaves_list(hierarchy.linkage(features))
        orders.append(order)
        work = work[order]
    # The first pass handled the columns, the second the rows.
    column_order, row_order = orders
    return work, row_order, column_order
Example #34
0
def GetOptimalOrder(matrix):
    '''
        Get an ordering for the annotation matrix and the names from the
        leaf order of a hierarchical clustering.

        The Yule distance matrix is symmetrised and its diagonal zeroed
        before it is condensed and clustered with average linkage.

        return:
            idxOpt: leaf order as an int32 array
    '''
    raw = yule_distance(matrix)
    symmetric = (raw + raw.T) / 2.
    hollow = symmetric - np.diag(np.diag(symmetric))
    tree = linkage(squareform(hollow), method='average', metric='precomputed')
    order = leaves_list(tree)
    return order.astype(np.int32)
Example #35
0
 def cluster_symmetric(self, method='average'):
     """Reorder both axes of a square matrix by hierarchical clustering.

     ``1 - self.matrix`` is condensed and clustered with
     ``fastcluster.linkage``; the leaf order is applied to axis 0 and axis 1
     through ``self.reorder``. Matrices with fewer than 3 rows are left
     untouched and a message is printed.

     :param method: Linkage method name.
     :raises ValueError: If the matrix is not square or is empty.
     """
     n_rows = self.shape[0]
     if n_rows != self.shape[1]:
         raise ValueError('data matrix not square')
     if n_rows == 0:
         raise ValueError('no rows or columns in data matrix')
     if n_rows < 3:
         print('less than 3 rows and 3 columns. no clustering performed.')
         return

     # checks=False skips squareform's symmetry and zero-diagonal validation.
     condensed = distance.squareform(np.float64(1) - self.matrix,
                                     checks=False)
     tree = fastcluster.linkage(condensed, method)
     si = hierarchy.leaves_list(tree).astype('int64')
     self.reorder(si, axis=0)
     self.reorder(si, axis=1)
Example #36
0
 def sort(self, method=None):
     """Reorder the columns of the table according to ``method``.

     Supported methods: "none" (keep order), "similarity" (Bray-Curtis
     clustering of column-sum-normalised columns), "usimilarity" (the same on
     raw columns), "dominant" (by each column's dominant row), "sum" (by
     column sum, descending) and "metadata" (by metadata value). Any other
     value ends the program through ``sys.exit``.

     Updates ``self.data``, ``self.colheads`` and, when present,
     ``self.metarow``, then calls ``self.update()``.
     """
     if method == "none":
         order = range(self.ncols)
     elif method == "similarity":
         scale = self.colsums + c_epsilon * np.ones(self.ncols)
         normalized = self.data / scale
         # note: linkage assumes things to cluster = rows; we want cols
         tree = sch.linkage(normalized.transpose(), metric="braycurtis")
         order = sch.leaves_list(tree)
     elif method == "usimilarity":
         # note: linkage assumes things to cluster = rows; we want cols
         tree = sch.linkage(self.data.transpose(), metric="braycurtis")
         order = sch.leaves_list(tree)
     elif method == "dominant":
         # Rank the rows by their maximum value (ascending).
         row_maxes = sorted(
             (max(self.data[r, :]), r) for r in range(self.nrows))
         ranks = [None] * len(row_maxes)
         for rank, (_, row) in enumerate(row_maxes):
             ranks[row] = rank
         # For every column, the row holding its largest value (the last
         # such row on ties, because sorted() is stable).
         top_row = []
         for col in range(self.ncols):
             by_value = sorted(range(self.nrows),
                               key=lambda r: self.data[r][col])
             top_row.append(by_value[-1])
         order = sorted(
             range(self.ncols),
             key=lambda c: (ranks[top_row[c]], self.data[top_row[c], c]),
             reverse=True)
     elif method == "sum":
         order = sorted(range(self.ncols),
                        key=lambda c: self.colsums[c], reverse=True)
     elif method == "metadata":
         order = sorted(range(self.ncols), key=lambda c: self.metarow[c])
     else:
         sys.exit("Can't sort with method: %s" % (method))

     self.data = self.data[:, order]
     self.colheads = subseq(self.colheads, order)
     if self.metarow is not None:
         self.metarow = subseq(self.metarow, order)
     self.update()
Example #37
0
def run():
    """Cluster the first 50 rows/columns of a phage 4-mer table and plot it.

    Reads the k-mer counts and the sequence headers from fixed file names in
    the working directory, clusters rows and columns hierarchically, writes
    the heat map to ``phage_heatmap.png`` and then shows it.
    """
    counts = 'phage_kmer_count_k4_c0_s2255.csv'
    headers = 'phage_kmer_headers_k4_c0_s2255.txt'

    data = normalize_rows(np.loadtxt(counts, delimiter=','))
    # Close the header file deterministically instead of leaking the handle.
    with open(headers, 'r') as header_file:
        header_lines = header_file.readlines()[1:]  # first line is skipped
    row_labels = np.array([header.split('|')[3] for header in header_lines])
    col_labels = np.array(kmers(4))

    N = 50

    data = data[:N, :N]
    row_labels = row_labels[:N]
    col_labels = col_labels[:N]

    # cluster the rows
    # NOTE(review): linkage receives the *square* distance matrix here, which
    # scipy treats as observation vectors rather than as distances -- confirm
    # this is intended before changing it (it affects the resulting order).
    row_dist = ssd.squareform(ssd.pdist(data))
    row_Z = sch.linkage(row_dist)
    row_idxing = sch.leaves_list(row_Z)

    # cluster the columns
    col_dist = ssd.squareform(ssd.pdist(data.T))
    col_Z = sch.linkage(col_dist)
    col_idxing = sch.leaves_list(col_Z)

    # reorder data and labels to follow the dendrogram leaves
    data = data[:, col_idxing][row_idxing, :]
    row_labels = list(row_labels[np.array(row_idxing)])
    col_labels = list(col_labels[np.array(col_idxing)])

    heatmap = pdh.DendroHeatMap(heat_map_data=data, left_dendrogram=row_Z, top_dendrogram=col_Z)
    heatmap.colormap = heatmap.redBlackBlue
    heatmap.row_labels = row_labels
    heatmap.col_labels = col_labels
    heatmap.title = 'Bacteirophage 4-mer hierarchical clustering'
    heatmap.export('phage_heatmap.png')
    heatmap.show()
Example #38
0
def reorder(C):
    """
    Reorder consensus matrix.

    The strict upper triangle of ``C`` is read as pairwise similarities,
    converted to distances (``1 - similarity``) and clustered with average
    linkage; rows and columns are then permuted by the reversed leaf order.

    :param C: Consensus matrix.
    :type C: `numpy.matrix`
    :returns: ``C`` with rows and columns permuted identically.
    """
    # ``range`` instead of the Python-2-only ``xrange`` (NameError on Python 3).
    c_vec = np.array([C[i, j]
                      for i in range(C.shape[0] - 1)
                      for j in range(i + 1, C.shape[1])])
    # convert similarities to distances
    Y = 1 - c_vec
    Z = linkage(Y, method="average")
    # get node ids as they appear in the tree from left to right
    # (corresponding to observation vector idx)
    ivl = leaves_list(Z)
    ivl = ivl[::-1]
    return C[:, ivl][ivl, :]
def genomes_hclust(dist_dict, args):
    """Genomes hierarchical clustering and vizualisation

    Builds a single-linkage tree (with optimal leaf ordering) from the
    pairwise distances in ``dist_dict`` and, depending on the flags on
    ``args``, plots the dendrogram, writes ``<prefix>.clstr`` and
    ``<prefix>.repr.clstr``, and/or plots a heat map in leaf order.
    """
    logger.info("Clustering genomes")
    dist_arr = array(dist_dict_to_2dlist(dist_dict))
    names = array(list(dist_dict))
    lm = linkage(squareform(dist_arr), method="single", optimal_ordering=True)

    if args.plot_dendrogram:
        plot_dendrogram(lm, names, args.prefix)

    if args.print_clusters:
        flat_ids = fcluster(lm,
                            t=args.cluster_threshold,
                            criterion="distance")
        by_cluster = sorted(zip(names, flat_ids), key=lambda pair: pair[1])

        # one "<genome>\t<cluster>" line per genome, grouped by cluster id
        members = {}
        with open("%s.clstr" % args.prefix, 'w') as out:
            for genome, cid in by_cluster:
                out.write("%s\t%s\n" % (genome, cid))
                members.setdefault(cid, []).append(genome)

        # optional per-genome metric parsed from the CheckM table body
        metrics = None
        if args.checkm_file:
            with open(args.checkm_file, 'r') as checkm:
                metrics = dict(map(genome_metric, checkm.readlines()[3:-1]))

        # one representative per cluster: the best-scoring member when
        # metrics are available, otherwise simply the first member
        with open("%s.repr.clstr" % args.prefix, 'w') as out:
            for cid, genomes in members.items():
                if len(genomes) > 1 and metrics is not None:
                    scored = {g: metrics[os.path.splitext(g)[0]]
                              for g in genomes}
                    ranked = sorted(scored.items(), key=lambda kv: -kv[1])
                    representative = ranked[0][0]
                else:
                    representative = genomes[0]
                out.write("%s\t%s\n" % (representative, cid))

    if args.heatmap:
        leaf_order = leaves_list(lm)
        dist_arr = dist_arr[leaf_order][:, leaf_order]
        names = names[leaf_order]
        plot_heatmap(args, names, dist_arr)
Example #40
0
def sort_by_clust(data, n_clusters=10, ncomp=3, output='labels'):
    """
    Given data (n_samples, n_features), reduce dimensionality by SVD and do
    agglomerative clustering with ward linkage. If output is `sort`,
    return leaves of the clustering tree. If output is `labels`, convert to
    flat clusters and return labels.

    Input:
    -------
     - data: data points, 2D array (n_samples, n_features)
     - n_clusters [10]: if output is `labels` cut the agglomerative clustering
       tree at this number of clusters (`maxclust` criterion); if `None`, pick
       the candidate cluster count with the highest Calinski-Harabasz score
       (slow)
     - ncomp [3]: number of SVD components to use in dimensionality reduction step
     - output [labels]: if output contains `sort`, return leaves of the
       agglomerative clustering tree as a 1D array; otherwise return flat
       cluster labels.
    """
    u, s, vh = linalg.svd(data, False)
    u = u[:, :ncomp]
    nsamples = len(u)
    Z = sp_hierarchy.linkage(u, method='ward')
    if 'sort' in output:
        return sp_hierarchy.leaves_list(Z)

    if n_clusters is not None:
        return sp_hierarchy.fcluster(Z, n_clusters, criterion='maxclust')

    # Heuristic search: try cluster counts that correspond to 2, 4, ..., 48
    # samples per cluster and keep the one with the best CH score.
    nc_acc = []
    ch_acc = []
    for nsc in range(2, 50, 2):
        # built-in int: the ``np.int`` alias was removed in NumPy 1.24
        nc = max(2, int(np.ceil(nsamples / nsc)))
        labels = sp_hierarchy.fcluster(Z, nc, criterion='maxclust')
        ch_acc.append(skmetrics.calinski_harabasz_score(u, labels))
        nc_acc.append(nc)
    k = np.argmax(ch_acc)
    return sp_hierarchy.fcluster(Z, nc_acc[k], criterion='maxclust')
def hierarchical_cluster(df, compute_dist=True,  pdist_kws=None, 
                         method='average', min_cluster_size=3,
                         cluster_kws=None):
    """
    Hierarchically cluster ``df`` and cut the tree into labelled clusters.

    :df: data whose rows are clustered, or (when ``compute_dist`` is False)
         a square distance matrix
    compute_dist: if True, compute pairwise distances between the rows of
                  ``df``; otherwise ``df`` is taken to be a distance matrix
    pdist_kws: keyword arguments for ``pdist``; defaults to correlation
               distance. The metrics 'abscorrelation' and 'sqcorrelation'
               are handled specially.
    method: linkage method passed to ``linkage`` and ``dynamicTreeCut``
    min_cluster_size: minimum cluster size used when ``cluster_kws`` is not
                      given
    cluster_kws: keyword arguments for ``dynamicTreeCut``; when given they
                 replace the defaults entirely (including ``min_cluster_size``)

    Returns a dict with keys 'linkage', 'distance_df', 'clustered_df',
    'reorder_vec' and 'labels'.
    """
    if compute_dist:
        if pdist_kws is None:
            pdist_kws = {'metric': 'correlation'}
        # .get(): a pdist_kws without 'metric' falls through to plain pdist
        metric = pdist_kws.get('metric')
        if metric == 'abscorrelation':
            # convert to absolute correlations
            dist_vec = abs_pdist(df)
        elif metric == 'sqcorrelation':
            # convert to squared correlations
            dist_vec = squareform(1-df.T.corr()**2)
        else:
            dist_vec = pdist(df, **pdist_kws)
        dist_df = pd.DataFrame(squareform(dist_vec), 
                               index=df.index, 
                               columns=df.index)
    else:
        assert df.shape[0] == df.shape[1]
        dist_df = df
        dist_vec = squareform(df.values)
    # clustering. This works the same as hclust
    link = linkage(dist_vec, method=method)
    # leaf order; same as order.dendrogram(as.dendrogram(hclust output)) in R
    reorder_vec = leaves_list(link)
    clustered_df = dist_df.iloc[reorder_vec, reorder_vec]
    # cut the tree into clusters
    if cluster_kws is None:
        # honour the caller's min_cluster_size (previously hard-coded to 3)
        cluster_kws = {'minClusterSize': min_cluster_size,
                       'verbose': 0,
                       'pamStage': False}
    labels = dynamicTreeCut(dist_df, func='hybrid', method=method,  **cluster_kws)
    labels = reorder_labels(labels, link)
    return {'linkage': link, 
            'distance_df': dist_df, 
            'clustered_df': clustered_df,
            'reorder_vec': reorder_vec,
            'labels': labels}
Example #42
0
def cluster_sp_agglomerative(content):
    """ Agglomerative Clustering

    Clusters the rows of ``content['data']`` (transposed in place first when
    ``content['transpose'] == 1``) and returns, wrapped by ``httpWrapper``,
    a JSON document with the reordered rows, the leaf order and the
    dendrogram description.
    """
    if content['transpose'] == 1:
        content['data'] = [list(column) for column in zip(*content['data'])]
    matrix = numpy.array(content['data'])
    tree = hier.linkage(
        matrix,
        method=content['sp_method'],
        metric=content['sp_metric'],
        optimal_ordering=content['sp_ordering'] == 1,
    )
    leaf_order = hier.leaves_list(tree)
    reordered = matrix[leaf_order, :]
    payload = {
        'result': reordered.tolist(),
        'order': leaf_order.tolist(),
        'dendo': hier.dendrogram(tree, no_plot=True),
    }
    return httpWrapper(json.dumps(payload, ignore_nan=True))
Example #43
0
def cluster_kmer_dists(kmers_dists, kmers_scores, kmers, out_pdf):
    ''' Plot a clustered heatmap of k-mer distances and scores.

    ``kmers_dists`` is clustered with single linkage; its rows and columns,
    ``kmers_scores`` and ``kmers`` are put in leaf order and handed to
    ``plot_kmer_dists`` together with ``out_pdf``.
    '''
    tree = hierarchy.linkage(kmers_dists, method='single', metric='euclidean')
    leaf_order = hierarchy.leaves_list(tree)

    # apply the same permutation to both axes of the distance matrix
    reordered = kmers_dists[leaf_order, :][:, leaf_order]

    plot_kmer_dists(reordered, kmers_scores[leaf_order], kmers[leaf_order],
                    out_pdf)
Example #44
0
def reorder_labels(labels, link):
    """Renumber cluster labels by dendrogram position.

    Walks the labels in the leaf order of ``link`` and assigns new ids
    1, 2, 3, ... in order of first appearance, so that cluster ids follow
    their left-to-right position in the dendrogram.  Returns an array with
    the new id of every entry of ``labels``, in the original order.
    """
    leaf_order = leaves_list(link)
    renumber = {}
    for old_id in labels[leaf_order]:
        if old_id in renumber:
            continue
        renumber[old_id] = len(renumber) + 1
    return np.array([renumber[old_id] for old_id in labels])
Example #45
0
def sort_distance_matrix(distance_matrix,
                         embeddings,
                         names,
                         method='complete'):
    """Order a distance matrix and its companions by hierarchical clustering.

    Args:
        distance_matrix: square array of pairwise distances.  Its diagonal
            is set to 0 in place before clustering.
        embeddings: sequence indexed like the matrix rows.
        names: sequence indexed like the matrix rows.
        method: linkage method; one of 'ward', 'single', 'average',
            'complete'.

    Returns:
        ``(distance_matrix, embeddings, names, res_order)`` where the first
        three are reordered by the optimally ordered leaves ``res_order``;
        the returned matrix is a copy whose diagonal is NaN.
    """
    assert method in ['ward', 'single', 'average', 'complete']
    np.fill_diagonal(distance_matrix, 0.)
    cond_distance_matrix = squareform(distance_matrix, checks=False)
    # use the requested method (it was validated but previously ignored in
    # favour of a hard-coded 'complete')
    linkage_matrix = hierarchy.linkage(cond_distance_matrix,
                                       method=method,
                                       optimal_ordering=True)
    res_order = hierarchy.leaves_list(linkage_matrix)
    distance_matrix = distance_matrix[res_order][:, res_order]
    embeddings = [embeddings[i] for i in res_order]
    names = [names[i] for i in res_order]
    # blank the diagonal of the reordered copy
    np.fill_diagonal(distance_matrix, np.nan)
    return distance_matrix, embeddings, names, res_order
Example #46
0
    def _cluster_samples(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Reorder the columns (samples) of a dataframe by hierarchical clustering.

        Args:
            df: dataframe whose columns are clustered

        Returns:
            ``df`` with its columns in dendrogram leaf order
        """
        # pairwise distances between columns, then single-linkage tree
        sample_distances = distance.pdist(df.T)
        tree = hierarchy.linkage(
            sample_distances,
            method="single",
            metric="euclidean",
            optimal_ordering=False,
        )
        leaf_order = hierarchy.leaves_list(tree)
        return df.iloc[:, leaf_order]
Example #47
0
    def __cluster_columns__(self, column_distance, column_linkage):
        """Cluster the columns of ``self.data`` and reorder the data by the result.

        The data is transposed so columns become observations, clustered
        with ``fastcluster.linkage`` (stored in ``self.column_clustering``),
        transposed back, and then ``self.data``, ``self.original_data`` and
        the header (if any) are reordered by ``self.data_order``.  Unless
        ``self.missing_values`` is ``False``, missing values are imputed for
        the clustering step and restored afterwards.
        """
        # transpose: one inner list per column
        self.data = [list(column) for column in zip(*self.data)]
        if self.missing_values is not False:
            imputed = self.__impute_missing_values__(self.data)
            self.data, missing_values_indexes = imputed

        self.column_clustering = fastcluster.linkage(
            self.data, method=column_linkage, metric=column_distance)
        self.data_order = hcluster.leaves_list(self.column_clustering)

        if self.missing_values is not False:
            self.data = self.__return_missing_values__(
                self.data, missing_values_indexes)

        # transpose back, then apply the leaf order everywhere
        self.data = list(zip(*self.data))
        leaf_order = self.data_order
        self.data = self.__reorder_data__(self.data, leaf_order)
        self.original_data = self.__reorder_data__(self.original_data, leaf_order)
        if self.header:
            self.header = self.__reorder_data__([self.header], leaf_order)[0]
Example #48
0
 def plot_clustermap(self, rank, subgroups, index_with_cluster, linkage, d,
                     nmf_method):
     """Draw a seaborn clustermap of ``d`` using a precomputed linkage.

     The same ``linkage`` is used for rows and columns, dendrogram branches
     are coloured by subgroup, subtype colour bars are drawn above the
     columns, and the subtype legends are added afterwards.
     """
     # one colour per subgroup
     colmap = self.generate_colmap_for_subgroups(rank, subgroups)

     # subtype colour info, dropping entries where either half is missing
     # (hacky way to get round nulls; still needs a proper fix)
     raw_info = self.filtered_reddy_dataset.subtypes.subtype_colour_info_list
     subtype_colour_info_list = []
     for patient_cols, colourmap in raw_info:
         if patient_cols is None or colourmap is None:
             continue
         subtype_colour_info_list.append((patient_cols, colourmap))
     subtype_colourmaps = [pair[0] for pair in subtype_colour_info_list]

     # subgroup colour for every leaf, in dendrogram order
     cluster_of = dict(index_with_cluster)
     leaf_order = sch.leaves_list(linkage)
     branch_colours = [colmap[cluster_of[leaf]] for leaf in leaf_order]

     g = sns.clustermap(
         d,
         metric='euclidean',
         row_linkage=linkage,
         col_linkage=linkage,
         col_colors=subtype_colourmaps,
         yticklabels=False,
         xticklabels=False,
         dendrogram_ratio=(0.1, 0.1),
         cbar_pos=(1, .5, .03, .3),
         cbar_kws={
             'label': 'Distance between patients in consensus matrix'
         },
         tree_kws={'colors': branch_colours})
     # to hide the row dendrogram: g.ax_row_dendrogram.set_visible(False)

     # title and axis labels
     title = ("Clustermap on " + nmf_method + " Consensus Matrix of "
              + str(self.subset_required[0]) + " Patients ("
              + str(self.subset_required[1]) + " Genes) \n\n")
     g.fig.suptitle(title, y=1.02)
     g.ax_heatmap.set_xlabel("Patients")
     g.ax_heatmap.set_ylabel("Patients")

     # legends
     self.create_all_subtype_legends(g, subtype_colour_info_list)
Example #49
0
    def fig2_region_blocks(self):
        """Return three panels showing taxa, function, and amr for all samples."""

        def to_long(tbl):
            # tag each row with its sample and continent, melt to long form,
            # and drop rows without a usable continent
            tbl['sample'] = tbl.index
            tbl['continent'] = self.meta['continent']
            tbl = tbl.melt(id_vars=['sample', 'continent'])
            tbl = tbl.dropna()
            return tbl.query('continent != "Nan"')

        phyla = group_small_cols(self.wide_phyla_rel, top=4)
        # NOTE(review): sample_order is computed but not used further in
        # this method -- confirm whether the panels were meant to use it.
        js_distances = self.tabler.beta_diversity(phyla, metric='jensenshannon')
        sample_order = phyla.index[
            leaves_list(linkage(squareform(js_distances), 'average'))]
        phyla = to_long(phyla)

        amrs = to_long(group_small_cols(self.amrs, top=5))

        def my_plot(tbl, label):
            panel_theme = theme(
                text=element_text(size=20),
                panel_grid_major=element_blank(),
                panel_grid_minor=element_blank(),
                legend_position='bottom',
                axis_text_x=element_blank(),
                axis_title_x=element_blank(),
                axis_text_y=element_blank(),
                axis_title_y=element_blank(),
                panel_border=element_rect(colour="black", size=2),
                figure_size=(32, 4),
            )
            return (
                ggplot(
                    tbl,
                    aes(x='sample', y='value', fill='variable',
                        group='continent'))
                + geom_col()
                + facet_grid('.~continent', scales="free")
                + scale_color_brewer(
                    type='qualitative', palette=3, direction=1)
                + theme_minimal()
                + scale_y_sqrt(expand=(0, 0))
                + labs(fill=label)
                + panel_theme
            )

        return [
            my_plot(phyla, 'Phyla'),
            my_plot(self.function_groups, 'Pathways'),
            my_plot(amrs, 'AMR Class'),
        ]
Example #50
0
def _reorder_dendrogram(z, dists, leaf_ordering):
    """Return the leaf order of linkage ``z`` under the requested ordering.

    ``leaf_ordering`` is one of 'optimal' (uses ``optimal_leaf_ordering``
    with ``dists`` and returns the ``leaves_list`` array) or
    'count_sort_ascending', 'count_sort_descending',
    'distance_sort_ascending', 'distance_sort_descending' (returns the
    ``'leaves'`` list of a non-plotted dendrogram).  Anything else raises
    ``ValueError``.
    """
    if leaf_ordering == 'optimal':
        return leaves_list(optimal_leaf_ordering(z, dists))

    # dendrogram keyword that implements each remaining ordering
    sort_options = {
        'count_sort_ascending': {'count_sort': 'ascending'},
        'count_sort_descending': {'count_sort': 'descending'},
        'distance_sort_ascending': {'distance_sort': 'ascending'},
        'distance_sort_descending': {'distance_sort': 'descending'},
    }
    if leaf_ordering not in sort_options:
        raise ValueError('Unsupported leaf ordering')

    layout = dendrogram(z,
                        get_leaves=True,
                        no_plot=True,
                        no_labels=True,
                        show_leaf_counts=False,
                        **sort_options[leaf_ordering])
    return layout['leaves']
Example #51
0
def get_hierarchical_clustering_order(
        reads_filename, chromosomes=None):
    """Map each cell id to its position in a hierarchical clustering.

    Reads ``reads_filename`` in chunks, pivots it into a cell x bin table of
    ``state`` values, clusters the cells (cityblock distances, ward linkage)
    and returns ``{cell_id: rank}`` where rank is the leaf position.
    """
    chunksize = 10 ** 5
    pivots = []
    for chunk in csvutils.read_csv_and_yaml(
            reads_filename, chunksize=chunksize):
        chunk["bin"] = list(zip(chunk.chr, chunk.start, chunk.end))

        # for some reason pivot doesnt like an Int64 state col
        chunk['state'] = chunk['state'].astype('float')

        pivots.append(
            chunk.pivot(index='cell_id', columns='bin', values='state'))

    # merge chunks, sum cells that get split across chunks
    table = pd.concat(pivots)
    table = table.groupby(table.index).sum()

    bins = pd.DataFrame(table.columns.values.tolist(),
                        columns=['chr', 'start', 'end'])
    bins['chr'] = bins['chr'].astype(str)
    bins = sort_bins(bins, chromosomes)

    table = table.sort_values(bins, axis=0)

    # missing states become -1 before computing distances
    data_mat = np.array(table.values)
    data_mat[np.isnan(data_mat)] = -1

    cell_distances = sp.distance.pdist(data_mat, 'cityblock')
    row_linkage = hc.linkage(cell_distances, method='ward')
    leaves = hc.leaves_list(row_linkage)

    cell_ids = table.index
    return {cell_ids[leaf]: rank for rank, leaf in enumerate(leaves)}
Example #52
0
def tree_sort(old_l, distances, return_leaves=True):  ## average linkage
    """Reorder ``old_l`` by the leaf order of an average-linkage tree.

    ``distances`` is a square distance matrix with one row per element of
    ``old_l``.  Returns the reordered list, together with the leaf indices
    when ``return_leaves`` is true.  A single-element list is returned as is.
    """
    assert len(distances) == len(old_l)

    if len(old_l) != 1:
        condensed = distance.squareform(distances, checks=True)
        leaves = hierarchy.leaves_list(hierarchy.average(condensed))
    else:
        # nothing to cluster
        leaves = [0]

    reordered = [old_l[idx] for idx in leaves]
    return (reordered, leaves) if return_leaves else reordered
Example #53
0
def reorder(C):
    """
    Reorder consensus matrix.

    Entries above the diagonal are read as similarities, turned into
    distances and clustered with average linkage; the matrix is returned
    with rows and columns permuted by the reversed leaf order.

    :param C: Consensus matrix.
    :type C: `numpy.matrix`
    """
    n_rows, n_cols = C.shape[0], C.shape[1]
    upper_pairs = ((i, j)
                   for i in range(n_rows - 1)
                   for j in range(i + 1, n_cols))
    similarities = np.array([C[pair] for pair in upper_pairs])
    # similarity -> distance
    tree = linkage(1 - similarities, method='average')
    # leaves as they appear in the tree from left to right, then reversed
    leaf_order = leaves_list(tree)[::-1]
    return C[:, leaf_order][leaf_order, :]
Example #54
0
def hierarchical_clustering(norms, labels, font):
    """Cluster ``norms`` with complete linkage and draw the dendrogram.

    The module-level ``dist`` is used as the distance metric.  The dendrogram
    is drawn on a new figure (not shown here).  Returns ``(index, Z)``: the
    leaf order and the linkage matrix.
    """
    tree = hac.linkage(norms, method='complete', metric=dist)

    # Plot dendogram
    plt.figure(figsize=(25, 5))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.ylabel('Distance')
    dendrogram_options = dict(
        labels=labels,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=font,  # font size for the x axis labels
    )
    hac.dendrogram(tree, **dendrogram_options)

    return leaves_list(tree), tree
Example #55
0
    def _compute_cluster_label_order(self,
                                     values,
                                     labels,
                                     dist_metric='euclidean',
                                     linkage_method='ward'):
        """Return ``labels`` in the leaf order of a clustering of ``values``.

        ``values`` holds one observation per label.  A single label is
        returned unchanged without clustering.
        """
        if len(labels) == 1:
            return labels

        tree = linkage(pdist(values, metric=dist_metric),
                       method=linkage_method)
        return [labels[leaf] for leaf in leaves_list(tree)]
Example #56
0
def cluster_patterns(patterns, n_clusters=9, cluster_track='seq_ic'):
    """Cluster patterns

    Computes the pattern similarity matrix on ``cluster_track``, builds a
    ward linkage (optimal leaf ordering) from ``1 - similarity``, cuts it
    into ``n_clusters`` and builds the pattern table.

    Returns ``(sim, linkage_matrix, cluster, cluster_order, pattern_table)``.
    """
    sim = similarity_matrix(patterns, track=cluster_track)

    # cluster
    dissimilarity = 1 - sim
    lm_nte_seq = linkage(dissimilarity, 'ward', optimal_ordering=True)
    cluster = cut_tree(lm_nte_seq, n_clusters=n_clusters)[:, 0]

    # position of every pattern within the leaf order
    leaves = leaves_list(lm_nte_seq)
    cluster_order = np.argsort(leaves)

    table_options = dict(
        align_track='contrib/mean',
        logo_len=70,
        seqlogo_kwargs=dict(width=320),
        footprint_width=320,
        footprint_kwargs=dict(),
    )
    pattern_table_nte_seq = create_pattern_table(
        lm_nte_seq, cluster_order, cluster, **table_options)
    return sim, lm_nte_seq, cluster, cluster_order, pattern_table_nte_seq
def recipes_hie(nutrition_data, algr):
    """Show a ward dendrogram of the standardised nutrition data, then exit.

    The data is standardised, a ward linkage is built on its pairwise
    distances, the leaf order is printed and the dendrogram is shown.

    NOTE(review): the function then calls ``exit()`` unconditionally, so the
    agglomerative clustering / PCA scatter plot below it never runs and
    ``algr`` is effectively unused. This looks like a leftover debugging
    stop -- confirm before removing it.
    """
    scaled_data = StandardScaler().fit_transform(nutrition_data)

    # use the scipy.cluster library to get the dendrogram of the data, which
    # helps to determine the number of clusters
    plt.figure(figsize=(25, 10))
    # the title hard-codes 'ward', matching shc.ward below (not ``algr``)
    plt.title("Clusters determined by hierarchy - " + 'ward')

    # the second method to get the dendrogram
    Z = shc.ward(pdist(scaled_data))
    print(shc.leaves_list(Z))
    dn = shc.dendrogram(Z)

    # the first method to get the dendrogram
    #dend = shc.dendrogram(shc.linkage(scaled_data, method=algr))

    plt.show()
    # terminates the interpreter; everything below is unreachable
    exit()

    # use the sklearn.cluster library to group the data points into this
    # number of clusters
    cluster = AgglomerativeClustering(n_clusters=7,
                                      affinity='euclidean',
                                      linkage=algr)
    cluster.fit(scaled_data)
    y_pred = cluster.fit_predict(scaled_data)

    # use PCA to reduce the data to two dimensions, and visualize
    pca = PCA(n_components=2)
    principalComponents = pca.fit_transform(scaled_data)
    principalDf = pd.DataFrame(
        data=principalComponents,
        columns=['principal component 1', 'principal component 2'])
    print(principalDf)

    # scatter plot of the two components, coloured by predicted cluster
    fig = plt.figure(figsize=(20, 7))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title("Clusters determined by hierarchy - " + algr, fontsize=20)
    ax.scatter(principalComponents[:, 0],
               principalComponents[:, 1],
               c=y_pred,
               cmap='Paired')
    ax.grid()
    fig.show()
Example #58
0
File: nmf.py Project: thenmf/bignmf
    def reorder_consensus_matrix(M: np.array):
        """Reorder a consensus matrix by average-linkage clustering.

        ``1 - M`` is used as the distance matrix; rows and columns are then
        permuted by the reversed dendrogram leaf order.

        Args:
                M (np.array): Input matrix

        Returns:
                np.array: Reordered output matrix
        """
        frame = pd.DataFrame(M)
        tree = linkage(squareform(1 - frame), method="average")
        leaf_order = leaves_list(tree)[::-1]

        permuted = frame.values[:, leaf_order][leaf_order, :]
        labels = frame.columns[leaf_order]
        return pd.DataFrame(permuted, index=labels, columns=labels).values
Example #59
0
def _order_rows(dataframe):
    '''
    Return the row labels of ``dataframe`` in ward-linkage leaf order.

    A dataframe with at most one row is returned in its existing order.

    >>> df = pd.DataFrame({
    ...   'cell-1': {'a':8, 'b':1, 'c': 7, 'd': 2},
    ...   'cell-2': {'a':1, 'b':1, 'c': 1, 'd': 1},
    ...   'cell-3': {'a':9, 'b':1, 'c': 8, 'd': 2},
    ...   'cell-4': {'a':1, 'b':2, 'c': 1, 'd': 1}
    ... })
    >>> _order_rows(df)
    ['a', 'c', 'b', 'd']

    '''
    row_labels = dataframe.index.tolist()
    if len(row_labels) <= 1:
        # too few rows to cluster
        return row_labels
    leaf_order = leaves_list(linkage(dataframe, 'ward')).tolist()
    return [row_labels[position] for position in leaf_order]
Example #60
0
def _cluster_markers(markers, marker_labels, marker_groups_order, s, c):
    """Order the markers of each group by hierarchical clustering.

    For every group in ``marker_groups_order`` the markers listed in
    ``markers[group]`` are described by the concatenation of their entries in
    ``s`` and ``c`` (selected where ``marker_labels`` equals the marker),
    normalised, and clustered; the marker names are emitted in leaf order.
    Groups with fewer than two markers are emitted unchanged, since there is
    nothing to cluster.

    Returns a single array with the ordered marker names of all groups,
    concatenated in the order of ``marker_groups_order``.
    """
    marker_labels = np.asarray(marker_labels)  # hoisted out of the loops
    markers_order = []
    for marker_group in marker_groups_order:
        # array form so the names can be indexed by the leaf order
        marker_names = np.asarray(markers[marker_group])
        if len(marker_names) < 2:
            markers_order.append(marker_names)
            continue
        marker_features = np.array([
            np.concatenate([s[marker_labels == marker],
                            c[marker_labels == marker]])
            for marker in marker_names
        ])
        # normalize
        marker_features = marker_features / \
            np.sqrt(np.sum(marker_features ** 2))
        marker_group_order = hierarchy.leaves_list(
            hierarchy.linkage(marker_features))
        # reorder the marker *names*; the group key itself was indexed
        # here before, which cannot produce the markers
        markers_order.append(marker_names[marker_group_order])
    markers_order = np.concatenate(markers_order)
    return markers_order