Example #1
0
def compare_clusters(args):
    """Compare every matrix in ``args['all']`` against a reference matrix.

    For each candidate file a Mantel test against the reference is run and
    the cophenetic distances of the two average-linkage trees are correlated
    with Pearson's r.  One tab-separated row per candidate is written to
    ``args['output']``.

    Args:
        args: mapping with the keys
            ``'ref'``    -- path of the tab-separated reference matrix,
            ``'all'``    -- iterable of paths of matrices to compare,
            ``'output'`` -- path of the report file to (over)write.

    Returns:
        None.  A candidate whose comparison raises is reported with the
        literal value ``Failed`` in all four statistic columns.
    """

    def _load_matrix(path):
        # ``.values`` is the supported replacement for ``as_matrix()``,
        # which is no longer available in current pandas releases.
        return pd.read_table(path, sep='\t', skipinitialspace=True,
                             index_col=0).values

    ref_df = _load_matrix(args['ref'])
    check_symmetry(ref_df)
    # NOTE(review): the rows of the loaded matrix are handed to linkage/pdist
    # as observation vectors -- confirm this is the intended treatment.
    linkage_ref = linkage(ref_df, 'average')
    _, coph_dists_ref = cophenet(linkage_ref, pdist(ref_df))

    # ``with`` guarantees the report is flushed and closed even when loading
    # or validating one of the candidate files raises.
    with open(args['output'], "w") as outfile:
        # NOTE(review): "Manter_P-value" looks like a typo for "Mantel"; the
        # header is left untouched because consumers may match on it.
        outfile.write("Tree_cluster\tMantel_Correlation_Coefficient\tManter_P-value\tCophenetic_Pearson\tCophenetic_P-value\n")

        for i in args['all']:
            fst_df = _load_matrix(i)
            check_symmetry(fst_df)
            try:
                mantel_coeff, p_value_mantel, _ = mantel(ref_df, fst_df)
                linkage_fst = linkage(fst_df, 'average')
                _, coph_dists_fst = cophenet(linkage_fst, pdist(fst_df))
                cophenetic_pearson, p_value_cophenetic = pearsonr(
                    coph_dists_ref, coph_dists_fst)
            except Exception as e:
                # Best-effort: report the failure and keep processing the
                # remaining candidates.
                print("Error : %s" % str(e))
                mantel_coeff = "Failed"
                p_value_mantel = "Failed"
                cophenetic_pearson = "Failed"
                p_value_cophenetic = "Failed"

            outfile.write(i+"\t"+str(mantel_coeff)+"\t"+str(p_value_mantel)+"\t"+str(cophenetic_pearson)+"\t"+str(p_value_cophenetic)+"\n")
def measure_cluster_accuracy(hier, data):
    """
    Print and return the cophenetic correlation of a hierarchical clustering.

    The closer the value is to 1, the better the clustering preserves the
    original pairwise distances.

    :param hier: linkage matrix of the clustering
    :param data: observations; pairwise distances are taken with ``pdist``
    :return: the cophenetic correlation coefficient (unrounded)
    """
    score, _ = cophenet(hier, pdist(data))
    # NOTE(review): `cat` is not defined inside this function; presumably a
    # module-level label set by the caller -- confirm.
    print('\n', 'Cophenet distance for ', cat, '==> ', round(score, 2))
    # Hand the score back so callers can use it instead of parsing stdout.
    return score
def cengci(data):
    """Ward-cluster ``data``, print the cophenetic correlation, draw the tree.

    :param data: observations accepted by ``linkage`` / ``pdist``
    :return: None
    """
    # Pairwise distances are needed once only; reuse them for cophenet.
    dist_matrix = pdist(data)
    Z = linkage(data, 'ward')
    c, _ = cophenet(Z, dist_matrix)
    # Parenthesised single-argument print gives the same output on
    # Python 2 and Python 3.
    print(c)
    dendrogram(Z)
Example #4
0
def Hierarchical_cluster_part(
        csvFile,
        save_path='/Users/CeciliaLee/Dropbox/Intren/HKIA/2/Dendrogram_Tree(part).png'):
    """Ward-cluster the rows of a CSV file and plot the full dendrogram.

    Args:
        csvFile: path of the CSV file; its first column is dropped before
            clustering.
        save_path: where the dendrogram image is written.  Defaults to the
            location that used to be hard-coded.

    Returns:
        (c, Z): the cophenetic correlation coefficient and the linkage
        matrix.
    """
    df = pd.read_csv(csvFile)
    # ``.values`` replaces ``as_matrix()``, which current pandas no longer has.
    data = df.values[:, 1:]

    # generate the linkage matrix
    Z = linkage(data, 'ward')
    c, _ = cophenet(Z, pdist(data))
    # Single-argument print(): same output on Python 2 and Python 3.
    print(c)

    # Plot the full (untruncated) dendrogram.  A truncated variant
    # (fancy_dendrogram, truncate_mode='lastp', p=18, max_d=6000) was
    # sketched here previously but never enabled.
    plt.figure(figsize=(140, 60))
    plt.title('Hierarchical Clustering Dendrogram(part)')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    dendrogram(
        Z,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=2.,  # font size for the x axis labels
    )
    plt.savefig(save_path)
    plt.show()

    return c, Z
Example #5
0
def hierarchical_clustering(
        df: Union[pd.DataFrame, np.ndarray],
        method: str = "ward") -> Union[HierCluster, None, ValueError]:
    """Cluster a correlation table hierarchically with scipy.

    The result is packaged in a ``HierCluster`` instance (see
    prestools.classes).

    See Also:
        https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html

    Args:
        df: input dataframe (or array) of correlations
        method: linkage method, one of 'ward', 'single', 'complete',
            'average', 'weighted', 'centroid', 'median' (default: 'ward')

    Returns:
        * a ``ValueError`` *instance* (returned, not raised) when ``method``
          is not one of the supported names;
        * ``None`` when ``df`` has shape (0, 0) or (1, 1);
        * otherwise a ``HierCluster`` whose ``linkage``, ``pair_dist``,
          ``coph_dist`` and ``coph_matr`` attributes are filled in.
    """
    supported = ("ward", "single", "complete", "average", "weighted",
                 "centroid", "median")
    if method not in supported:
        return ValueError("Method not valid!")

    # Nothing to cluster for an empty or single-cell table.
    if df.shape in ((0, 0), (1, 1)):
        return None

    result = HierCluster()
    result.linkage = sch.linkage(df, method=method)
    result.pair_dist = ssd.pdist(df)
    # cophenet(Z, Y) yields (correlation coefficient, condensed cophenetic
    # distances); they are stored in that order.
    coph = sch.cophenet(result.linkage, result.pair_dist)
    result.coph_dist, result.coph_matr = coph
    return result
Example #6
0
 def _get_doc_clusters(self, paint=False):
     """Cluster the LDA document vectors and persist the assignment.

     Each entry of ``self.lda_d2v`` is expanded to a dense, L1-normalised
     vector, the vectors are clustered with average linkage on Euclidean
     distances, and the tree is cut into ``self.num_doc_clusters`` flat
     clusters (number of documents floor-divided by
     ``cluster_conf['num_clusters_factor']``).

     :param paint: when true, ``self._paint`` is called with the dense
         vectors, the labels and the linkage matrix
     :return: the array of cluster labels, or ``None`` when fewer than two
         clusters were requested (nothing is written in that case)
     """
     started = time.time()
     self.logger.info("start make doc cluster...")

     n_topics = self.topic_conf['num_topics']
     full_d2v = np.empty(shape=(len(self.lda_d2v), n_topics))
     for row, sparse_vec in enumerate(self.lda_d2v):
         dense_vec = matutils.sparse2full(sparse_vec, n_topics)
         full_d2v[row] = matutils.unitvec(dense_vec, norm='l1')

     dist_matrix = sch.distance.pdist(full_d2v, 'euclidean')
     link_matrix = sch.linkage(dist_matrix, method='average')
     coph_corr = sch.cophenet(link_matrix, dist_matrix)[0]
     self.logger.info("doc cluster cophenet is [%s]" % coph_corr)

     factor = self.cluster_conf['num_clusters_factor']
     self.num_doc_clusters = len(self.lda_d2v) // factor
     self.logger.info("doc cluster number is [%d]" % self.num_doc_clusters)

     labels = None
     if self.num_doc_clusters < 2:
         self.logger.error("too small doc cluster number")
     else:
         labels = sch.fcluster(link_matrix,
                               t=self.num_doc_clusters,
                               criterion='maxclust')
         # One label per line, in document order.
         with open(self.d2c_file, "w") as fo:
             fo.write("\n".join(str(label) for label in labels))
         if paint:
             self._paint(full_d2v, labels, link_matrix)

     elapsed = time.time() - started
     self.logger.info("end make doc cluster cost %ds" % elapsed)
     return labels
Example #7
0
def create_linkage(vecs, metric="cosine", order=True):
    """Build a linkage for ``vecs`` and report how well it fits.

    :param vecs: observation vectors
    :param metric: distance metric, forwarded to ``linkfun`` and ``pdist``
    :param order: forwarded unchanged to ``linkfun``
    :return: the linkage produced by ``linkfun``
    """
    link = linkfun(vecs, metric, order)

    # Only the correlation coefficient is of interest; the cophenetic
    # distances themselves are dropped.
    coph_corr = cophenet(link, pdist(vecs, metric))[0]
    print("Cophenet Distance between linkage and original vecs: " + str(coph_corr))

    return link
Example #8
0
 def test_linkage_cophenet_tdist_Z(self):
     """cophenet(Z) on the tdist single-linkage fixture gives the known distances."""
     linkage_matrix = hierarchy_test_data.linkage_ytdist_single
     expected = np.array([268, 295, 255, 255, 295, 295, 268, 268, 295, 295,
                          295, 138, 219, 295, 295])
     # Without Y, cophenet returns only the condensed cophenetic distances.
     assert_allclose(cophenet(linkage_matrix), expected, atol=1e-10)
Example #9
0
 def test_linkage_cophenet_tdist_Z(self):
     """Check cophenet(Z) against the reference distances of the tdist data set."""
     reference = np.array([268, 295, 255, 255, 295, 295, 268, 268, 295, 295,
                           295, 138, 219, 295, 295])
     computed = cophenet(hierarchy_test_data.linkage_ytdist_single)
     assert_allclose(computed, reference, atol=1e-10)
Example #10
0
    def copheneticCorrelationCoeff(self):
        """Return the cophenetic correlation of ``self.Z`` w.r.t. ``self.X``.

        ``self.Z`` is used as the linkage matrix and the pairwise distances
        of ``self.X`` (``pdist`` defaults) as the reference distances.
        """
        from scipy.cluster.hierarchy import cophenet
        from scipy.spatial.distance import pdist

        # Index 0 is the coefficient; index 1 (cophenetic distances) is unused.
        return cophenet(self.Z, pdist(self.X))[0]
Example #11
0
  def dendrogram(self, X, metric='Euclidean', linkage='ward', x_label='Patterns'):
      """Generate hierarchical dendrogram.

      The 3-D input is flattened to one row per sample, clustered, and the
      full dendrogram is drawn on a new 20x10 figure labelled with
      ``self.labels``.

      Keyword arguments:
      X         -- 3-D array; axes 1 and 2 are merged into one feature axis
      metric    -- distance metric used for the reference distances and the
                   y-axis label; default value = 'Euclidean'
      linkage   -- linkage method passed to scipy; default value = 'ward'
      x_label   -- x-axis label; default value = 'Patterns'"""
      X = X.reshape((X.shape[0], X.shape[1] * X.shape[2]))
      Z = sch.linkage(X, linkage)
      # NOTE(review): the linkage above uses scipy's default metric while the
      # reference distances use `metric` -- they agree only for Euclidean.
      c, _ = sch.cophenet(Z, pdist(X, metric))
      # Cophenetic Correlation Coefficient of clustering: correlates the
      # actual pairwise distances with those implied by the tree.  The closer
      # to 1, the better the clustering preserves the original distances.
      # A single pre-formatted string prints identically on Python 2 and 3.
      print('Cophenetic correlation coefficient of clustering (the closer to 1, the better): %s' % (c,))

      # calculate full dendrogram
      fig = plt.figure(figsize=(20, 10))
      ax = fig.add_subplot(111)
      plt.title('Hierarchical Clustering Dendrogram')
      plt.xlabel(x_label)
      plt.ylabel(metric + ' Distance')
      sch.dendrogram(
          Z,
          leaf_rotation=90,  # rotates the x axis labels
          leaf_font_size=25,  # font size for the x axis labels
          labels=self.labels
      )
      ax.set_ylim(bottom=-0.5)
      ax.tick_params(labelsize=25)
Example #12
0
def cophenetic(M):
    """
    Return the cophenetic correlation coefficient of an average-linkage
    clustering of ``M``, as a measure of clustering quality.
    """
    tree = linkage(M, method='average')
    # cophenet(Z, Y) -> (coefficient, cophenetic distances); keep the former.
    return cophenet(tree, pdist(M))[0]
Example #13
0
def Hierarchical_cluster_part(
        csvFile,
        save_path='/Users/CeciliaLee/Dropbox/Intren/HKIA/2/Dendrogram_Tree(part).png'):
    """Cluster the rows of a CSV file with Ward linkage and plot the tree.

    Args:
        csvFile: path of the CSV file to read; the first column is
            discarded before clustering.
        save_path: destination of the rendered dendrogram.  The default is
            the path that was previously hard-coded.

    Returns:
        (c, Z): cophenetic correlation coefficient and linkage matrix.
    """
    df = pd.read_csv(csvFile)
    # ``as_matrix()`` is gone from current pandas; ``.values`` is equivalent.
    data = df.values
    data = data[:, 1:]

    # generate the linkage matrix
    Z = linkage(data, 'ward')
    c, _ = cophenet(Z, pdist(data))
    # print(c) behaves the same under Python 2 and Python 3.
    print(c)

    # Plot the full dendrogram (a truncated fancy_dendrogram variant with
    # truncate_mode='lastp', p=18, max_d=6000 was left disabled here).
    plt.figure(figsize=(140, 60))
    plt.title('Hierarchical Clustering Dendrogram(part)')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    dendrogram(
        Z,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=2.,  # font size for the x axis labels
    )
    plt.savefig(save_path)
    plt.show()

    return c, Z
Example #14
0
    def dendrogram(self,
                   X,
                   metric='Euclidean',
                   linkage='ward',
                   x_label='Patterns'):
        """Generate hierarchical dendrogram.

        The 3-D input is reshaped to one flat feature row per sample,
        clustered, and drawn in full on a new 20x10 figure whose leaves are
        labelled with ``self.labels``.

        Keyword arguments:
        X         -- 3-D array; axes 1 and 2 are merged into one feature axis
        metric    -- distance metric for the reference distances and the
                     y-axis label; default value = 'Euclidean'
        linkage   -- linkage method passed to scipy; default value = 'ward'
        x_label   -- x-axis label; default value = 'Patterns'"""
        X = X.reshape((X.shape[0], X.shape[1] * X.shape[2]))
        Z = sch.linkage(X, linkage)
        # NOTE(review): linkage uses scipy's default metric, the reference
        # distances use `metric`; the two match only for Euclidean.
        c, _ = sch.cophenet(Z, pdist(X, metric))
        # Cophenetic correlation: how faithfully the tree preserves the
        # original pairwise distances (closer to 1 is better).
        # Formatting into one string keeps the output identical on
        # Python 2 and Python 3.
        print('Cophenetic correlation coefficient of clustering (the closer to 1, the better): %s' % (c,))

        # calculate full dendrogram
        fig = plt.figure(figsize=(20, 10))
        ax = fig.add_subplot(111)
        plt.title('Hierarchical Clustering Dendrogram')
        plt.xlabel(x_label)
        plt.ylabel(metric + ' Distance')
        sch.dendrogram(
            Z,
            leaf_rotation=90,  # rotates the x axis labels
            leaf_font_size=25,  # font size for the x axis labels
            labels=self.labels)
        ax.set_ylim(bottom=-0.5)
        ax.tick_params(labelsize=25)
Example #15
0
def _hierarchical_cluster_consensus_matrix(consensus_matrix,
                                           force_diagonal=True,
                                           method='ward'):
    """
    Cluster a consensus matrix and score the tree with a cophenetic correlation.

    A dissimilarity table ``1 - consensus_matrix`` is derived (optionally
    with its diagonal forced to 0).  The linkage is computed on
    ``consensus_matrix`` itself, and the coefficient is Pearson's r between
    ``pdist`` of the dissimilarity table and the tree's cophenetic distances.
    The input is not modified.

    :param consensus_matrix: DataFrame;
    :param force_diagonal: bool; zero the diagonal of the dissimilarity table
    :param method: str; method parameter for scipy.cluster.hierarchy.linkage
    :return: ndarray float; linkage (Z) and cophenetic correlation coefficient
    """
    # NOTE(review): the tree is built from the consensus matrix rather than
    # from the dissimilarity table -- confirm that this is intended.
    dissimilarity = 1 - consensus_matrix
    if force_diagonal:
        n_rows = dissimilarity.shape[0]
        for k in range(n_rows):
            dissimilarity.iloc[k, k] = 0

    tree = linkage(consensus_matrix, method=method)

    observed = pdist(dissimilarity)
    implied = cophenet(tree)
    coefficient = pearsonr(observed, implied)[0]

    return tree, coefficient
Example #16
0
def cengci(data):
    """Run Ward clustering on ``data``, print its cophenetic correlation
    coefficient and draw the dendrogram.

    :param data: observations accepted by ``linkage`` / ``pdist``
    :return: None
    """
    # Compute the pairwise distances once and reuse them below.
    distMatrix = pdist(data)
    Z = linkage(data, 'ward')
    c, _ = cophenet(Z, distMatrix)
    # print(c) yields the same output on Python 2 and Python 3.
    print(c)
    dendrogram(Z)
Example #17
0
def get_zx(start_time, method="single", fname="", Zf=False, **kwas):
    '''
    Compute (or reload) the linkage over the distances between clients.

    :param start_time: int indicating the earliest query the window should include
    :param method: the linkage method to be used
    :param fname: string appended to the pickle file name
    :param Zf: boolean -> if True, will load from file (see code for file name).
        NOTE: the file holds the most recent version that was calculated. Make
        sure the right version of the file exists before setting Zf to true
    :param **kwas: keyword arguments for vv.get_svl()
    :return: (Z, X) -- the linkage matrix and the svl list it was built from
    '''
    # Single definition of the cache location used by both branches.
    pickle_path = plotsdir+'pickles/'+'Z_'+method+fname+'.pickle'

    if Zf is False:
        kwas['start_time'] = start_time
        X, _, _, ccache = vv.get_svl(**kwas)
        logger.warning("svl len: "+str(len(X)))

        # Fill a condensed distance vector in the pair order
        # (0,1), (0,2), ..., (n-2,n-1); distance = 1 - cached closeness.
        n = len(X)
        dm = np.zeros((n * (n - 1)) // 2, dtype=np.double)
        k = 0
        # range() instead of xrange() so the loop also runs on Python 3.
        for i in range(n - 1):
            for j in range(i + 1, n):
                dm[k] = 1.0 - ccache[X[i]][X[j]]
                k += 1
        ccache.dump()
        Z = linkage(dm, method)
        df.pickleout(pickle_path, (Z, dm, X))
        logger.warning('dumped Z to ' + pickle_path)
    else:
        Z, dm, X = df.picklein(pickle_path)
        logger.warning('loaded Z from ' + pickle_path)

    # The cophenetic correlation that used to be computed here was never
    # used or returned, so the call has been dropped.
    return Z, X
Example #18
0
    def computeLinkage(self, printDendogram=False):
        """Ward-cluster the building averages and optionally plot the tree.

        Stores the observation matrix in ``self.X`` (one row per value of
        ``self.buildingAverages``) and the linkage matrix in ``self.Z``.

        :param printDendogram: when true, draw the full dendrogram
        :return: the linkage matrix ``self.Z``
        """
        # list(...) is required on Python 3, where dict.values() is a view
        # that numpy would otherwise wrap as a single 0-d object.
        self.X = array(list(self.buildingAverages.values()))

        # generate the linkage matrix
        self.Z = linkage(self.X, 'ward')

        # The cophenetic correlation previously computed here was discarded;
        # the unused call has been removed.

        if printDendogram:
            # calculate full dendrogram
            plt.figure(figsize=(25, 10))

            plt.title('Hierarchical Clustering Dendrogram (truncated)')
            plt.xlabel('Dendogram of Dartmouth campus buildings clusters')
            plt.ylabel('distance')
            dendrogram(
                self.Z,
                # truncation (truncate_mode='lastp', p=20) is disabled
                show_leaf_counts=True,  # otherwise numbers in brackets are counts
                leaf_rotation=90.,
                leaf_font_size=12.,
                show_contracted=True,  # to get a distribution impression in truncated branches
            )
            plt.show()

        return self.Z
Example #19
0
def get_optimal_hc_params(mouse_day):
    """
    Returns a list of 2: [method, dist]
        method: {'ward', 'average', 'complete'}
        dist: {'cityblock', 'euclidean', 'chebychev'}

    Every allowed (method, dist) pair is used to cluster the feature
    columns; the pair with the highest cophenetic correlation coefficient
    wins (the first one on ties).  'ward' is only tried with 'euclidean'.

    Parameters
    ----------
    mouse_day: a 170 * M numpy array,
        column 0 : strain,
        column 1: mouse,
        other columns corresponding to feature avg/std of a mouse over 16 days

    Returns
    -------
    method_distance: list
        [method, dist]
    """
    methods = ['ward', 'average', 'complete']
    dists = ['cityblock', 'euclidean', 'chebychev']

    method_dists = [(method, dist) for method in methods for dist in dists
                    if method != 'ward' or dist == 'euclidean']

    features = mouse_day[:, 2:]
    cs = []
    for method, dist in method_dists:
        Z = linkage(features, method=method, metric=dist)
        # The reference distances must use the same metric the linkage was
        # built from; otherwise non-Euclidean trees are scored against
        # Euclidean distances.
        c, _ = cophenet(Z, pdist(features, metric=dist))
        cs.append(c)

    # determine the distance method
    method, dist = method_dists[np.argmax(cs)]
    return [method, dist]
def main():
    """Cluster the distance matrix named on the command line and print it.

    The file given as first command-line argument is read with
    ``fetchDistMat``, clustered with average linkage, and the cophenetic
    distance matrix is handed, together with the tree, to
    ``droPyt_distMat_dendrogram_sciPy``.
    """
    # fetch the (square) distance matrix from the specified input file
    dist_mat_path = sys.argv[1]
    _names, square_dists, n_items = fetchDistMat(dist_mat_path)

    # Most scipy routines work on 'condensed' distance matrices (the upper
    # triangle as a flat vector); squareform converts between the full
    # square form and the condensed form in either direction.
    condensed_dists = ssd.squareform(square_dists)

    # Average linkage: the distance between two clusters is the average of
    # the distances between their members.
    tree = sch.linkage(condensed_dists, method='average')

    # cophenet returns the cophenetic correlation and the condensed
    # cophenetic distance matrix; only the latter is used here.
    _corr, coph_condensed = sch.cophenet(tree, condensed_dists)
    coph_square = ssd.squareform(coph_condensed)

    # print dendrogram on top of the cophenetic distance matrix
    droPyt_distMat_dendrogram_sciPy(coph_square, tree, n_items)
Example #21
0
def bestCOCLUSTER(df):
    """Pick the linkage method with the best cophenetic fit and plot a clustermap.

    ``df`` is transposed first.  Every scipy linkage method is tried on the
    pairwise distances of the transposed rows, the methods are ranked by
    cophenetic correlation coefficient (CCC), and the best one is used for
    both the row and the column linkage of a seaborn clustermap.

    :param df: DataFrame to co-cluster
    :return: linkage of the transposed rows computed with scipy's *default*
        method (see the note at the return statement)
    """
    from scipy.spatial import distance

    df = df.T
    values = np.asarray(df)
    linkmethod = [
        'single', 'complete', 'average', 'weighted', 'centroid', 'median',
        'ward'
    ]
    paraDF = pd.DataFrame(columns=['method', 'CCC'], index=linkmethod)
    paraDF.loc[:, 'method'] = linkmethod

    # The row distances do not depend on the linkage method: compute them
    # once instead of once per method and twice more afterwards.
    row_dists = distance.pdist(values)
    for iter_m in linkmethod:
        print(row_dists.shape)
        Z = hierarchy.linkage(row_dists, method=iter_m)
        c, _ = hierarchy.cophenet(Z, row_dists)
        paraDF.loc[iter_m, 'CCC'] = c
    paraDF.sort_values(by='CCC', ascending=False, inplace=True)

    # After sorting, row 0 / column 'method' holds the best-scoring method.
    best_method = paraDF.iloc[0, 0]
    row_linkage = hierarchy.linkage(row_dists, method=best_method)
    col_linkage = hierarchy.linkage(distance.pdist(values.T),
                                    method=best_method)

    sns.clustermap(df,
                   row_linkage=row_linkage,
                   col_linkage=col_linkage,
                   figsize=(13, 13))
    plt.show()
    # NOTE(review): this returns a linkage built with scipy's default method,
    # not `best_method` -- confirm whether `row_linkage` was intended.
    return hierarchy.linkage(row_dists)
def make_dendogram(Amatrix,formation_stats,t=80,method='ward',dendogram=True):
    """Cluster formation observations and map each one to its cluster.

    :param Amatrix: square matrix of distances between formation
        observations (converted to condensed form with squareform)
    :param formation_stats: iterable of records ``f`` where ``f[0]`` is an
        identifier and ``f[1]``..``f[4]`` are observation counts for the
        four (team, type) slots ('H','A'), ('A','D'), ('A','A'), ('H','D')
    :param t: distance threshold used to cut the tree into flat clusters
    :param method: linkage method
    :param dendogram: when true, draw a truncated dendrogram
    :return: list of tuples ``(index, f[0], team, type, cluster)`` sorted by
        cluster; observations within a cluster keep their index order
    """
    d = sch.distance.squareform(Amatrix)
    L = sch.linkage(d, method=method)
    c = sch.cophenet(L)
    rho_p = stats.pearsonr(c, d)[0]
    rho_s = stats.spearmanr(c, d).correlation
    print ("Cophenetic distance = %1.2f (spearman), %1.2f (pearson)" % (rho_s,rho_p))
    if dendogram:
        plt.subplots(figsize=(25, 10))
        fancy_dendrogram(L,truncate_mode='lastp',p=200,leaf_rotation=90.,leaf_font_size=12.,show_contracted=True,annotate_above=10)

    # group clusters
    fcl = sch.fcluster(L, t=t, criterion='distance', depth=2)

    # now map formations to clusters; slot i of a record corresponds to
    # teams[i-1] / tmat[i-1]
    teams = ('H', 'A', 'A', 'H')
    tmat = ('A', 'D', 'A', 'D')
    assignments = []
    count = 0
    for f in formation_stats:
        for slot, (team, kind) in enumerate(zip(teams, tmat), start=1):
            for _ in range(f[slot]):
                assignments.append((count, f[0], team, kind, fcl[count]))
                count += 1

    # One stable sort at the end gives the same order as re-sorting after
    # every record, without the repeated work.
    assignments.sort(key=lambda x: x[4])
    return assignments
def calculate_cophenetic_correlation(connmat):
    """Cophenetic correlation of an average-linkage tree built from ``1 - connmat``.

    :param connmat: square connectivity matrix; ``1 - connmat`` is converted
        to condensed form with ``squareform`` and used as distances
    :return: tuple ``(c, d)`` -- the correlation coefficient and the
        condensed cophenetic distances
    """
    condensed = squareform(1 - connmat)
    tree = linkage(condensed, method='average')
    return cophenet(tree, condensed)
def get_cophenetic_scipy(A, k, n_iter, alg, start):
    """
    Cophenetic correlation coefficient for NMF (variant ``alg``) with k metagenes.

    The averaged connectivity matrix from ``get_avg_con_mats`` is clustered
    with average linkage and the tree is compared with the pairwise
    distances (``pdist``) of that matrix.

        A : data-set to decompose
        k (int): number of metagenes
        n_iter (int): number of different decompositions to average
        alg (string) : which variant of SNMF to perform; options are
            'base'         : simultaneous NMF
            'sorth_W'      : simultaneous NMF with semi-orthogonal W
            'sorth_H'      : simultaneous NMF with semi-orthogonal H
            'norm_sorth_W' : simultaneous NMF with semi-orthogonal W where
                             columns of W are normalized every iteration
            'norm_sorth_H' : simultaneous NMF with semi-orthogonal H where
                             rows of H are normalized every iteration
            'aff_sorth_W'  : simultaneous affine NMF with semi-orthogonal W
                             where columns of W are normalized every iteration
            'aff_sorth_H'  : simultaneous affine NMF with semi-orthogonal H
                             where rows of H are normalized every iteration
        start (string): how to initialize the matrices; options are
            'rand'    : random initialization
            'sorth_W' : semi-orthogonal W
            'sorth_H' : semi-orthogonal H

        returns (float): cophenetic correlation coefficient for simultaneous
            NMF with k metagenes
    """
    consensus = get_avg_con_mats(A, k, n_iter, alg,start)
    tree = linkage(consensus, method='average')
    coefficient, _ = cophenet(tree, pdist(consensus))
    return coefficient
Example #25
0
def get_optimal_hc_params(mouse_day):
    """
    Returns a list of 2: [method, dist]
        method: {'ward', 'average', 'complete'}
        dist: {'cityblock', 'euclidean', 'chebychev'}

    The feature columns are clustered once per allowed (method, dist) pair
    and the pair whose tree has the highest cophenetic correlation
    coefficient is returned (first pair on ties).  'ward' is combined with
    'euclidean' only.

    Parameters
    ----------
    mouse_day: a 170 * M numpy array,
        column 0 : strain,
        column 1: mouse,
        other columns corresponding to feature avg/std of a mouse over 16 days

    Returns
    -------
    method_distance: list
        [method, dist]
    """
    methods = ['ward', 'average', 'complete']
    dists = ['cityblock', 'euclidean', 'chebychev']

    method_dists = [(method, dist)
                    for method in methods
                    for dist in dists
                    if method != 'ward' or dist == 'euclidean']

    features = mouse_day[:, 2:]

    def _score(method, dist):
        # Score the tree against distances in the metric it was built
        # from, not against the Euclidean default of pdist.
        Z = linkage(features, method=method, metric=dist)
        return cophenet(Z, pdist(features, metric=dist))[0]

    cs = [_score(method, dist) for method, dist in method_dists]

    # determine the distance method
    method, dist = method_dists[np.argmax(cs)]
    return ([method, dist])
Example #26
0
def agglomerative(embeds,names,viz=True):
    """Complete-linkage clustering of ``embeds`` on standardized Euclidean distance.

    :param embeds: observation vectors to cluster
    :param names: not used by this function
    :param viz: when true, show the full dendrogram followed by a truncated
        ``minimal_dendrogram`` view
    :return: the linkage matrix

    The cophenetic correlation of the tree is printed as a quality check.
    """
    metric = 'seuclidean'
    l = linkage(embeds, method='complete', metric=metric)
    if viz:
        plt.figure(figsize=(25, 10))
        plt.title('Hierarchical Clustering Dendrogram')
        plt.ylabel('word')
        plt.xlabel('distance')

        dendrogram(
            l,
            leaf_rotation=90.,  # rotates the x axis labels
            leaf_font_size=0.,  # font size for the x axis labels
            orientation='top',
        )
        plt.show()

        minimal_dendrogram(
            l,
            truncate_mode='lastp',
            p=12,
            leaf_rotation=90.,
            leaf_font_size=12.,
            show_contracted=True,
            annotate_above=10,
            city='Sydney'
        )
        plt.show()

    # The reference distances must be in the metric the tree was built
    # from; pdist's Euclidean default would not match the linkage above.
    corr, _ = cophenet(l, pdist(embeds, metric=metric))
    print('\nCophenetic correlation:', corr,'\n')

    return l
Example #27
0
def clus_agglome(dat, meth, order):
    """Agglomerative clustering of ``dat['x']`` with plots and quality figures.

    :param dat: mapping with ``'x'`` (feature DataFrame exposing the columns
        ``s_length`` and ``s_width``) and ``'vari'`` (DataFrame exposing
        ``variety`` and ``variety_num``)
    :param meth: linkage method, e.g. 'ward', 'complete', 'single'
    :param order: row order applied to the crosstab with ``reindex``
    :return: dict with ``'accuracy'`` (success ratio in percent) and
        ``'cophe'`` (cophenetic correlation coefficient)

    Side effects: saves ``dendogram_<meth>.png`` and shows two figures.
    """
    features = dat['x']
    varieties = dat['vari']

    # Generate the linkage matrix with the requested method
    Z = linkage(features.values, method=meth)

    # Generate the dendrogram (and save)
    f = plt.figure(figsize=(12, 5))
    plt.title('Hierarchical Clustering Dendrogram with link=' + meth)
    plt.ylabel('Distance in the space four dimensions')
    dendrogram(
        Z,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=8.,  # font size for the x axis labels
    )
    plt.savefig('dendogram_' + meth + '.png')
    plt.show()
    plt.close(f)

    # Create the cluster object and assign the elements to groups
    clus_obj = AgglomerativeClustering(n_clusters=3, linkage=meth)
    group = clus_obj.fit_predict(features)

    # Evaluate the success ratio of the clustering
    d = pd.crosstab(varieties.variety,
                    group,
                    margins=True,
                    margins_name="Total")
    d = d.reindex(order)
    truth = sum(np.diag(d))
    success = 100 * truth / (np.shape(features)[0])

    # Cophenetic correlation coefficient to assess the quality of clusters
    c, _ = cophenet(Z, pdist(features))

    # Plot real vs. predicted groups side by side
    plt.figure()
    plt.subplot(121)
    # The colours come from dat['vari'], like the crosstab above, instead of
    # a name (`vari`) that is not defined in this function.
    # NOTE(review): assumes dat['vari'] carries `variety_num` -- confirm.
    plt.scatter(features.s_length, features.s_width, s=10,
                c=varieties.variety_num)
    plt.title("Real groups")
    plt.xticks(())
    plt.yticks(())

    plt.subplot(122)
    plt.scatter(features.s_length, features.s_width, s=10,
                c=group)  # predicted
    plt.title('Predicted groups ' + meth)
    plt.xticks(())
    plt.yticks(())
    plt.show()

    resul = dict()
    resul['accuracy'] = success
    resul['cophe'] = c

    return resul
Example #28
0
 def test_linkage_cophenet_tdist_Z_Y(self):
     """cophenet(Z, Y) on the tdist fixture gives the known coefficient and distances."""
     expected_coeff = 0.639931296433393415057366837573
     expected_dists = np.array([268, 295, 255, 255, 295, 295, 268, 268, 295, 295, 295, 138, 219, 295, 295])
     result = cophenet(hierarchy_test_data.linkage_ytdist_single,
                       hierarchy_test_data.ytdist)
     # With Y supplied the result is (correlation, cophenetic distances).
     coeff, dists = result
     assert_allclose(coeff, expected_coeff, atol=1e-10)
     assert_allclose(dists, expected_dists, atol=1e-10)
Example #29
0
def coph_cor(A, idx=None):
    """Cophenetic correlation of an average-linkage tree built from ``1 - A``.

    The entries above the diagonal of ``A`` are collected row by row,
    turned into distances with ``1 - value`` and clustered.

    :param A: 2-D array indexed as ``A[i, j]``
    :param idx: not used by this function
    :return: the cophenetic correlation coefficient
    """
    upper = []
    for row in range(A.shape[0] - 1):
        for col in range(row + 1, A.shape[1]):
            upper.append(A[row, col])
    dissim = 1 - np.array(upper)
    tree = linkage(dissim, method='average')
    return cophenet(tree, dissim)[0]
Example #30
0
def nmf_sigs(x, k_start, k_end, trial_lower, trial_upper, trial_tol, N_iter,
             dist_metric, hierarchy_method):
    """Run repeated JNMF trials for every rank k and collect the models.

    For each k in ``[k_start, k_end]`` fresh models are fitted until the
    running standard deviation of the cophenetic agreement has settled:
    trials continue while ``num_trials <= trial_lower``, or while the mean
    of the recent std changes exceeds ``trial_tol`` and fewer than
    ``trial_upper`` trials have run.

    :param x: object providing ``group_cluster`` (reference linkage) and
        ``storage`` (input handed to ``JNMF_model``)
    :param k_start: first rank to try (inclusive)
    :param k_end: last rank to try (inclusive)
    :param trial_lower: trial count below which no convergence test is made;
        also the length of the std-change window
    :param trial_upper: trial cap once past ``trial_lower``
    :param trial_tol: tolerance on the mean std change
    :param N_iter: multiplicative updates per trial
    :param dist_metric: ``pdist`` metric applied to ``model.w``
    :param hierarchy_method: linkage method for the per-trial tree
    :return: dict mapping k to a list of ``(corr, error, model)`` tuples
    """
    Z1 = x.group_cluster
    # The reference tree never changes, so its cophenetic distances are
    # computed once instead of once per trial.
    reference_coph = cophenet(Z1)

    model_d = {}
    for k in range(k_start, k_end + 1):
        print('k = {}'.format(k))
        num_trials = 0

        corr_avg = []
        current_std = 0.0
        std_diff = []
        error_avg = []

        # `or` short-circuits on the first pass, so np.mean never sees the
        # still-empty std_diff list.
        while num_trials <= trial_lower or (
                np.mean(std_diff) > trial_tol and num_trials < trial_upper):
            prev_std = current_std
            model = JNMF_model(x.storage)
            model.init_wh(k=k)

            for _ in range(N_iter):
                model.mult_update()

            # NOTE(review): assumes model.error() is deterministic for a
            # fitted model, so one call serves both uses -- confirm.
            err = model.error()
            error_avg.append(err)
            Z2 = linkage(minmax_scale(pdist(model.w, dist_metric)),
                         method=hierarchy_method)
            corr = cophenet_corr(reference_coph, cophenet(Z2))
            corr_avg.append(corr)
            current_std = np.std(corr_avg)
            std_diff.append(abs(prev_std - current_std))
            # Keep only the most recent `trial_lower` std changes.
            if len(std_diff) > trial_lower:
                std_diff = std_diff[1:]
            num_trials += 1
            # setdefault replaces a bare `except:` that was used to create
            # the list and would have hidden unrelated errors.
            model_d.setdefault(k, []).append((corr, err, model))

        print('num trials = {}'.format(num_trials))
        print('avg corr = {}'.format(np.mean(corr_avg)))
        print('avg error = {}'.format(np.mean(error_avg)))
        print()

    return model_d
def visualize(filepath, ceiling=1000, ward=None):
    """Render dendrograms of rhyme clustering

    For each of the first 11 stanzas the line text (markup stripped) is
    printed with its rhyme word and zone, and complete- and Ward-linkage
    dendrograms are drawn side by side, each titled with its cophenetic
    correlation coefficient.

    Parameters:
        filepath (str): path to XML file with poem, required
        ceiling (int): maximum number of stanzas to return (useful for sampling long poems),
            defaults to high value
        ward (boolean): show Ward dendrogram separately (improves legibility of long stanzas),
            defaults to None; forwarded to ``explore``

    Return: No return; prints text and renders dendrograms directly
    """

    df = explore(filepath, ceiling, ward)
    stanzas = df.groupby(level=[0, 1])
    for i, (_stanza_id, lines) in enumerate(stanzas):
        # Only the first 11 stanzas are rendered; stop instead of walking
        # the remaining groups for nothing.
        if i >= 11:
            break
        # regex=True is explicit because recent pandas treats the pattern
        # as a literal string by default, which would leave the tags in.
        plain_text = lines["Text"].str.replace(r"<[^>]+?>", "", regex=True)
        print(pd.concat([plain_text, lines[["RhymeWord", "RhymeZone"]]],
                        axis=1))  # diagnostic

        data = lines.copy().filter(regex=r"^token\d_")  # only one-hot features
        labelList = list(range(1, len(lines) + 1))  # line numbers within stanza
        # scale to avoid tyranny of proximity
        data.loc[:, "LineNo"] = [2 * n / len(labelList) for n in labelList]

        distances = pdist(data)
        complete = linkage(data, method="complete")
        complete_c, _ = cophenet(complete, distances)
        # Separate name so the `ward` argument is not overwritten.
        ward_linkage = linkage(data, method="ward")
        ward_c, _ = cophenet(ward_linkage, distances)

        plt.figure(figsize=(12, 4))
        plt.subplot(1, 2, 1)
        plt.title("Complete: " + str(complete_c))
        dendrogram(complete, labels=labelList)
        plt.subplot(1, 2, 2)
        plt.title("Ward: " + str(ward_c))
        dendrogram(ward_linkage, labels=labelList)
    plt.show()
Example #32
0
def FormCluster(X):
    """Record the cophenetic correlation of ``X`` for every linkage method.

    For 'single', 'complete', 'average', 'weighted', 'centroid', 'median'
    and 'ward' (in that order) the coefficient is appended to the
    module-level lists ``cl1`` .. ``cl7`` respectively.

    :param X: observations accepted by ``linkage`` / ``pdist``
    :return: None
    """
    # The pairwise distances are the same for every method: compute once.
    distances = pdist(X)
    sinks = (
        ('single', cl1),
        ('complete', cl2),
        ('average', cl3),
        ('weighted', cl4),
        ('centroid', cl5),
        ('median', cl6),
        ('ward', cl7),
    )
    for method, sink in sinks:
        coefficient, _ = cophenet(linkage(X, method), distances)
        sink.append(coefficient)
def get_linkage(id):
    """Average-linkage clustering of one batch of ``configs``.

    Each selected row contributes its first element as label and the
    remaining elements as observation vector.  The cophenetic correlation
    coefficient of the tree is printed.

    :param id: batch number; rows ``16*id`` up to (not including)
        ``16*(id+1)-1`` of the module-level ``configs`` are used
    :return: tuple ``(Z, X, labels)``
    """
    first = 16 * id
    stop = 16 * (id + 1) - 1
    # NOTE(review): with an exclusive stop of 16*(id+1)-1 the batch holds 15
    # rows, not 16 -- confirm the last row is skipped on purpose.
    labels = []
    X = []
    for row in configs[first:stop]:
        labels.append(row[0])
        X.append(row[1:])

    Z = linkage(X, 'average')
    coefficient = cophenet(Z, pdist(X))[0]
    print(coefficient)
    return Z,X,labels
Example #34
0
 def plot_dendrogram(self):
     """Draw a Ward dendrogram of the transposed ``self.neurons`` array.

     The cophenetic correlation coefficient of the tree is printed and the
     figure is shown without blocking.
     """
     samples = self.neurons.T
     Z = linkage(samples, "ward")
     # Only the coefficient is reported; the cophenetic distances are unused.
     coefficient = cophenet(Z, pdist(samples))[0]
     plt.figure()
     dendrogram(Z)
     message = "Plotted dendrogram with cophenetic distance of {:.2f}"
     print(message.format(coefficient))
     plt.show(block=False)
def plot_clustered_heatmap(df, genes_list, cancer, output_path, scale='binary'):
    """Cluster genes hierarchically and write a plotly heatmap in dendrogram order.

    Args:
        df: DataFrame with one column per gene in ``genes_list`` plus the
            ``patient_id`` and ``somatic_mutations_count`` columns.
        genes_list: names of the gene columns to cluster.
        cancer: label used in the output file name.
        output_path: prefix of the output file name.
        scale: ``'binary'`` (complete linkage on Hamming distance) or
            ``'logarithmic'`` (Ward linkage, stepped colour scale).

    Raises:
        ValueError: if ``scale`` is not one of the two supported values.

    Side effects:
        Prints the cophenetic correlation coefficient and the matrix shape,
        logs the gene order, draws a dendrogram on the current matplotlib
        axes, and writes ``<output_path>_<cancer>_heatmap_clustered.html``.
    """
    # Transposed so that each row is a gene and each column is a df row.
    # ``.values`` replaces ``.as_matrix()``, which pandas 1.0 removed.
    X = df[genes_list].values.transpose()

    if scale == 'binary':
        Z = linkage(X, method='complete', metric='hamming')
        colorscale = [[0, "rgb(111, 168, 220)"], [1, "rgb(5, 10, 172)"]]
        colorbar = {'tick0': 0, 'dtick': 1}
    elif scale == 'logarithmic':
        Z = linkage(X, method='ward')
        X_max = X.max()
        colorscale = [[0, 'rgb(250, 250, 250)'],
                      [1./X_max, 'rgb(200, 200, 200)'],
                      [5./X_max, 'rgb(150, 150, 200)'],
                      [20./X_max, 'rgb(100, 100, 200)'],
                      [100./X_max, 'rgb(50, 50, 200)'],
                      [1., 'rgb(0, 0, 200)']]
        colorbar = {'tick0': 0,
                    'tickmode': 'array',
                    'tickvals': [0, 1, 5, 20, 100, X_max]}
    else:
        # Previously fell through and failed later with an unbound-name error.
        raise ValueError(
            "scale must be 'binary' or 'logarithmic', got %r" % (scale,))

    c, coph_dists = cophenet(Z, pdist(X))
    # Single-argument print() behaves the same on Python 2 and 3.
    print("Cophenetic Correlation Coefficient: %s" % (c,))

    # The dendrogram is used for its leaf order; it also draws on the current axes.
    den = dendrogram(Z, orientation='left')
    idx = den['leaves']
    X = X[idx, :]
    print("X shape: %s" % (X.shape,))
    genes_ordered = [genes_list[i] for i in idx]
    logger.info("ordered genes: %s", str(genes_ordered))

    # Heatmap (without the hierarchy) stacked under a mutation-load bar chart.
    heatmap_trace = go.Heatmap(z=X.tolist(), x=df.patient_id, y=genes_ordered, showscale=True, colorscale=colorscale, colorbar=colorbar)
    mutation_load_trace = go.Bar(x=df.patient_id, y=df.somatic_mutations_count/30.0)
    # 29-row grid: bar chart spans rows 1-5, heatmap spans rows 6-29.
    fig = tls.make_subplots(rows=29, cols=1, specs=[[{'rowspan':5, 'colspan' : 1}]] + [[None]] * 4 + [[{'rowspan' : 24, 'colspan' : 1}]] + [[None]] * 23)
    fig.append_trace(mutation_load_trace, 1, 1)
    fig.append_trace(heatmap_trace, 6, 1)
    fig['layout']['xaxis1'].update(showticklabels=False)
    fig['layout']['xaxis1'].update(zeroline=False, showgrid=False)
    fig['layout']['yaxis1'].update(zeroline=False, showgrid=False, tickfont=dict(family='Arial', size=4))
    fig['layout']['xaxis2'].update(showticklabels=False)
    fig['layout']['xaxis2'].update(zeroline=False, showgrid=False)
    fig['layout']['yaxis2'].update(zeroline=False, showgrid=False, tickfont=dict(family='Arial', size=4))
    plot(fig, auto_open=False, filename="%s_%s_heatmap_clustered.html" % (output_path, cancer))
Example #36
0
def hycluster(X, link, metr, datatype):
    """Cluster ``X`` with the ``link`` method and draw an annotated dendrogram.

    The 2-cluster and 10-cluster labels of every sample are appended to the
    module-level lists ``clus2`` and ``clus10``. The cophenetic correlation
    is computed with ``X`` itself passed as the distance argument and printed
    together with ``metr``. ``metr`` and ``datatype`` are only used in the
    printed line and the plot title.
    """
    global clus2, clus10

    def fancy_dendrogram(*args, **kwargs):
        # Wrapper-only options are popped so they never reach dendrogram().
        max_d = kwargs.pop('max_d', None)
        if max_d and 'color_threshold' not in kwargs:
            kwargs['color_threshold'] = max_d
        annotate_above = kwargs.pop('annotate_above', 0)
        ptitle = kwargs.pop('plttitle', 0)

        ddata = dendrogram(*args, **kwargs)
        if kwargs.get('no_plot', False):
            return ddata

        plt.title(ptitle)
        plt.xlabel('sample index or (cluster size)')
        plt.ylabel('distance')
        merges = zip(ddata['icoord'], ddata['dcoord'], ddata['color_list'])
        for xs, ys, colour in merges:
            centre = 0.5 * sum(xs[1:3])
            height = ys[1]
            if not height > annotate_above:
                continue
            # Mark the merge point and label it with its height.
            plt.plot(centre, height, 'o', c=colour)
            plt.annotate("%.3g" % height, (centre, height),
                         xytext=(0, -5),
                         textcoords='offset points',
                         va='top',
                         ha='center')
        if max_d:
            plt.axhline(y=max_d, c='k')
        return ddata

    # Linkage matrix for the whole data set.
    Z = linkage(X, link)

    # One row per sample: (label in the 2-cluster cut, label in the 10-cluster cut).
    for two_way, ten_way in cut_tree(Z, n_clusters=[2, 10]):
        clus2.append(two_way)
        clus10.append(ten_way)

    coph_corr, _ = cophenet(Z, X)
    print('Cophenet:', metr, coph_corr)
    titl = 'Hierarchical Clustering ' + datatype + ',' + link + ',' + metr

    # Full (untruncated) dendrogram on a fresh figure.
    plt.figure()
    fancy_dendrogram(
        Z,
        leaf_rotation=90.,
        leaf_font_size=12.,
        show_contracted=True,
        annotate_above=10,  # keeps annotations from overlapping in small plots
        plttitle=titl)
Example #37
0
 def test_linkage_cophenet_tdist_Z_Y(self):
     # cophenet(Z, Y) on the tdist data set must reproduce the reference
     # correlation coefficient and cophenetic distances.
     expected_corr = 0.639931296433393415057366837573
     expected_dists = np.array([268, 295, 255, 255, 295, 295, 268, 268, 295,
                                295, 295, 138, 219, 295, 295])
     tree = hierarchy_test_data.linkage_ytdist_single
     corr, dists = cophenet(tree, hierarchy_test_data.ytdist)
     assert_allclose(corr, expected_corr, atol=1e-10)
     assert_allclose(dists, expected_dists, atol=1e-10)
Example #38
0
def buildtree(featuresvector, method):
    '''Build a hierarchical-clustering tree from peptide features.

    The features are passed through ``removeprevlabel`` and ``scale`` before
    being clustered with the given linkage ``method``. The cophenetic
    correlation of the result is printed.

    Returns a pair: the root node from ``to_tree`` and the linkage matrix.
    '''
    unlabelled = removeprevlabel(featuresvector)
    x_scaled, _scaler = scale(unlabelled)

    print('Building linkage matrix using {} algorithm ...'.format(method))
    linkage_matrix = linkage(x_scaled, method)

    condensed = pdist(x_scaled)
    coph = cophenet(linkage_matrix, condensed)[0]
    print('Cophenet parameter (values close to 1 are good): {}'.format(coph))

    root = to_tree(linkage_matrix)
    return root, linkage_matrix
Example #39
0
def cal_cophenetic(C):
    """Return the cophenetic correlation coefficient of a clustering of ``C``.

    The hierarchy comes from ``fc.linkage_vector(C)``; its cophenetic
    distances are correlated with ``fc.pdist(C)`` using ``np.corrcoef``.
    """
    print("=== calculate cophenetic correlation coefficient ===")
    # Alternative kept from the original: Z = linkage(X)
    tree = fc.linkage_vector(C)          # clustering
    original_dists = fc.pdist(C)         # pairwise distances between observations
    cophenetic_dists = cophenet(tree)    # cophenetic distances between observations
    correlation = np.corrcoef(original_dists, cophenetic_dists)
    return correlation[0, 1]
Example #40
0
def compare_clusters(args):
    """Compare every matrix in ``args['all']`` against the reference matrix.

    For each candidate file, one tab-separated row is written to
    ``args['output']`` holding the Mantel correlation and p-value (from
    ``mantel_test``) and the Pearson correlation and p-value between the
    cophenetic distances of the reference and candidate average-linkage
    trees.

    Args:
        args: mapping with keys ``'ref'`` (reference matrix path), ``'all'``
            (iterable of candidate matrix paths) and ``'output'`` (result
            file path). Matrices are tab-separated with an index column.

    If any step fails for a candidate, the error is printed and all four
    values of that row are written as ``"Failed"``.
    """
    # NOTE(review): the matrices pass check_symmetry yet are handed to
    # linkage()/pdist() as observation rows - confirm that is intended.
    # ``.values`` replaces ``.as_matrix()``, which pandas 1.0 removed.
    ref_df = pd.read_table(args['ref'],
                           sep='\t',
                           skipinitialspace=True,
                           index_col=0).values
    check_symmetry(ref_df)
    linkage_ref = linkage(ref_df, 'average')
    _, coph_dists_ref = cophenet(linkage_ref, pdist(ref_df))

    # The context manager closes the file even if a candidate raises
    # outside the guarded section (e.g. while being read).
    with open(args['output'], "w") as outfile:
        outfile.write(
            "Tree_cluster\tMantel_Correlation_Coefficient\tManter_P-value\tCophenetic_Pearson\tCophenetic_P-value\n"
        )

        for i in args['all']:
            fst_df = pd.read_table(i, sep='\t', skipinitialspace=True,
                                   index_col=0).values
            check_symmetry(fst_df)
            try:
                mantel_coeff, p_value_mantel, _ = mantel_test(ref_df, fst_df)
                linkage_fst = linkage(fst_df, 'average')
                _, coph_dists_fst = cophenet(linkage_fst, pdist(fst_df))
                cophenetic_pearson, p_value_cophenetic = pearsonr(
                    coph_dists_ref, coph_dists_fst)
            except Exception as e:
                # Best effort: report and mark the whole row as failed. The
                # original misspelled ``p_value_mantel`` here, so the
                # Mantel p-value column kept its 0.0 placeholder.
                print("Error : %s" % str(e))
                mantel_coeff = "Failed"
                p_value_mantel = "Failed"
                cophenetic_pearson = "Failed"
                p_value_cophenetic = "Failed"

            outfile.write("\t".join([i,
                                     str(mantel_coeff),
                                     str(p_value_mantel),
                                     str(cophenetic_pearson),
                                     str(p_value_cophenetic)]) + "\n")
Example #41
0
def cophenetic_best(condensedD, methods=('single', 'complete', 'average', 'weighted')):
    """Rank linkage methods by cophenetic correlation on a condensed distance matrix.

    Returns a pair: a Series of coefficients indexed by method name, sorted
    from best to worst, and the name of the best method.

    'centroid', 'median' and 'ward' are left out of the defaults on purpose:
    they do not make sense with dice, since they need a Euclidean distance
    matrix - in fact the original observations rather than distances (so
    change the API if ever considering them).
    """
    scores = pd.Series({
        name: hierarchy.cophenet(linkage(condensedD, method=name), condensedD)[0]
        for name in methods
    })
    ranking = scores.sort_values(ascending=False)
    return ranking, scores.idxmax()
def create_dendrogram(dist):
    """Cluster the module-level ``points`` and show two dendrograms.

    ``dist`` is passed to ``linkage`` as its second argument. The full
    dendrogram is shown first, then one truncated to the last 3 merged
    clusters.

    Returns a pair: the cophenetic correlation coefficient and the flat
    cluster labels for at most 3 clusters.
    """
    global points
    tree = linkage(points, dist)
    coph_corr, _ = cophenet(tree, pdist(points))

    plt.figure(figsize=(25, 10))
    plt.title('Dendogram')
    plt.xlabel('Points')
    plt.ylabel('Distance')

    # Full tree.
    dendrogram(tree, show_contracted=True)
    plt.show()

    # Same tree truncated to its last 3 merged clusters.
    dendrogram(tree, show_contracted=True, truncate_mode='lastp', p=3)
    plt.show()

    labels = fcluster(tree, 3, criterion='maxclust')
    return coph_corr, labels
Example #43
0
def evaluate_cluster_w(TopicData, LinkageMatrix, GroundTruth):
    """Score a hierarchical clustering against the data and the ground truth.

    The cophenetic correlation compares ``LinkageMatrix`` with
    ``pdist(TopicData)``. Flat clusters are cut with ``fcluster`` at a fixed
    threshold of 2 and compared with ``GroundTruth``. All five scores are
    printed.

    Returns:
        (cophenetic correlation, adjusted Rand index, homogeneity,
        completeness, V-measure)
    """
    cut_threshold = 2

    # How well the tree preserves the original pairwise distances.
    corr_coeff, _ = cophenet(LinkageMatrix, pdist(TopicData))

    # Flat clusters, scored against the known labels.
    flat_labels = fcluster(LinkageMatrix, cut_threshold)
    ari = metrics.adjusted_rand_score(GroundTruth, flat_labels)
    homogeneity = metrics.homogeneity_score(GroundTruth, flat_labels)
    completeness = metrics.completeness_score(GroundTruth, flat_labels)
    v_measure = metrics.v_measure_score(GroundTruth, flat_labels)

    print("Evaluation metrics with threshold "+str(cut_threshold))
    report = (
        ("CorrCoeff:", corr_coeff),
        ("adjustedRI:", ari),
        ("Homogeneity:", homogeneity),
        ("Completeness:", completeness),
        ("V-Measure:", v_measure),
    )
    for caption, value in report:
        print(caption, value)
    return corr_coeff, ari, homogeneity, completeness, v_measure
Example #44
0
def _make_cluster_variants(gps, samples, max_k, variants=None):
    """Compute hierarchical clusterings of ``samples`` for many settings.

    Every combination of scaling (raw / ``StandardScaler``), metric
    (euclidean, cityblock, cosine) and linkage method is tried. Each result
    is stored as a ``HierarchicalCluster`` under the name
    ``'<scale><metric>_<method>'`` together with its cophenetic correlation.

    Args:
        gps, max_k: passed through unchanged to ``HierarchicalCluster``.
        samples: feature vectors to cluster.
        variants: optional collection of variant names; when truthy, only
            those names are computed.

    Returns:
        dict mapping variant name to ``HierarchicalCluster``. Combinations
        that are skipped or that raise ``ValueError`` are absent.
    """
    res = {}
    # TODO: performance: don't recompute hierarchical clusterings for diff max_k
    # TODO: include sklearn hierarchical clustering (AgglomerativeClustering)
    # TODO: include non hierarchical variants (e.g. AffinityPropagation)

    metrics = ['euclidean', 'cityblock', 'cosine']
    methods = [
        'single', 'complete', 'weighted', 'average',
        'centroid', 'median', 'ward',
    ]

    for scale in ['', 'scaled_']:
        ssamples = samples
        if scale:
            ssamples = StandardScaler().fit_transform(samples)

        for metric in metrics:
            cdist = pdist(ssamples, metric)
            if metric == 'cosine':
                # see https://github.com/scipy/scipy/issues/5208
                np.clip(cdist, 0, 1, out=cdist)

            for method in methods:
                name = '%s%s_%s' % (scale, metric, method)
                logger.debug('computing clustering %s', name)
                try:
                    if variants and name not in variants:
                        # could skip earlier but would make code more complex
                        continue
                    if method in ['ward', 'centroid', 'median']:
                        # method needs raw feature vectors in euclidean space
                        if metric != 'euclidean':
                            continue
                        cluster_hierarchy = linkage(ssamples, method=method)
                    else:
                        # The remaining methods work on the precomputed
                        # distances. The original also had a branch for
                        # methods outside both groups; no entry of
                        # ``methods`` could reach it, so it was removed.
                        cluster_hierarchy = linkage(cdist, method=method)

                    c, coph_dists = cophenet(cluster_hierarchy, cdist)

                    res[name] = HierarchicalCluster(
                        name, gps, samples, max_k, cluster_hierarchy, c)
                    logger.info('clustering %s computed with c: %0.3f', name, c)
                except ValueError:
                    logger.warning(
                        'The following exception occurred during clustering '
                        'with variant %s:\nException:',
                        name,
                        exc_info=1,  # appends exception to message
                    )
    logger.info('computed %d clustering variants', len(res))
    return res
Example #45
0
# Script fragment: finish and save the PCA scatter plot, then run Ward
# clustering on X and prepare the figure for its dendrogram.
# `pca`, `fig`, `label`, `X`, `sch` and `pdist` come from earlier in the script.
#for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
plt.legend()
plt.title('Phoneme PCA')
plt.xlabel('First Principal Component (explained variance ratio = '+str(np.around(pca.explained_variance_ratio_[0], decimals = 3))+')')
plt.ylabel('Second Principal Component (explained variance ratio = '+str(np.around(pca.explained_variance_ratio_[1], decimals = 3))+')')
# Save the current PCA figure in raster and vector form.
fig.savefig('pca.png', bbox_inches='tight')
fig.savefig('pca.pdf', bbox_inches='tight')


#print phoneme
#print label
label = list(label)
#X = np.asarray(phoneme)
# Generate the linkage matrix.
Z = sch.linkage(X, 'ward')
c, coph_dists = sch.cophenet(Z, pdist(X, 'euclidean'))
# c, coph_dists = sch.cophenet(Z, pdist(X))
# Cophenetic correlation coefficient of the clustering.
# This compares (correlates) the actual pairwise distances of all samples with those implied by the hierarchical clustering.
# The closer the value is to 1, the better the clustering preserves the original distances.
print label, type(label[0])
print c

# Set up the figure for the full dendrogram (`fig` is rebound to a new figure here).
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(111)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('International Phonetic Alphabet Phoneme')
plt.ylabel('Distance')
sch.dendrogram(
    Z,
def distance_patients_from_consensus_file(
    result_folder, distance_patients, ppi_data, mut_type,
    influence_weight, simplification,
    alpha, tol, keep_singletons, ngh_max, min_mutation, max_mutation,
    n_components, n_permutations, lambd, tol_nmf, linkage_method):
    """Cluster patients hierarchically and save the plots and a .mat file.

    Output goes to
    ``<result_folder>hierarchical_clustering/<mut_type>/<gnmf|nmf>/``; the
    sub-directory is ``gnmf/`` when ``lambd > 0`` and ``nmf/`` otherwise.
    Paths are built by plain string concatenation, so ``result_folder`` must
    end with a path separator. If the .mat file for this parameter set
    already exists, a message is printed and nothing else is done.

    Args:
        result_folder: root output folder.
        distance_patients: 2-D array; it is passed to ``linkage`` and
            ``pdist`` and plotted re-ordered by the dendrogram leaves.
        ppi_data: network name, shown in the plot title only.
        mut_type: mutation type; a directory level and shown in the title.
        n_components: maximum number of flat clusters (``maxclust``).
        lambd: selects the ``gnmf``/``nmf`` sub-directory.
        linkage_method: method passed to ``linkage``.
        influence_weight, simplification, alpha, tol, keep_singletons,
        ngh_max, min_mutation, max_mutation, n_permutations, tol_nmf:
            used only to build file names and the plot title.

    Returns:
        None. Writes ``.pdf`` and ``.svg`` figures and a compressed ``.mat``
        file, and prints the cophenetic correlation.
    """
    # Creating the deepest directory also creates its parents.
    factorization = 'gnmf/' if lambd > 0 else 'nmf/'
    hierarchical_factorization_directory = (
        result_folder + 'hierarchical_clustering/' + mut_type + '/' +
        factorization)
    os.makedirs(hierarchical_factorization_directory, exist_ok=True)

    hierarchical_clustering_file = (
        hierarchical_factorization_directory +
        'hierarchical_clustering_Patients_weight={}_simp={}_alpha={}_tol={}_singletons={}_ngh={}_minMut={}_maxMut={}_comp={}_permut={}_lambd={}_tolNMF={}_method={}.mat'
        .format(influence_weight, simplification, alpha, tol, keep_singletons,
                ngh_max, min_mutation, max_mutation, n_components,
                n_permutations, lambd, tol_nmf, linkage_method))

    if os.path.exists(hierarchical_clustering_file):
        print(' **** Same parameters file of hierarchical clustering already exists')
        return

    # NOTE(review): linkage() treats a 2-D input as observation vectors, not
    # as a precomputed distance matrix - confirm this is intended for
    # distance_patients (pdist below makes the same assumption).
    Z = linkage(distance_patients, method=linkage_method)

    # Plot setting
    matplotlib.rcParams.update({'font.size': 14})
    fig = plt.figure(figsize=(20, 20))
    fig.suptitle(
        'Hierarchical clustering\n\nPatients', fontsize=30, x=0.13, y=0.95)

    # Compute and plot dendrogram
    ax_dendro = fig.add_axes([0, 0.71, 0.6, 0.15])
    P = dendrogram(Z, count_sort='ascending', no_labels=True)
    ax_dendro.set_xticks([])
    ax_dendro.set_yticks([])

    # Plot the matrix with rows and columns in dendrogram leaf order.
    ax_matrix = fig.add_axes([0, 0.1, 0.6, 0.6])
    idx = np.array(P['leaves'])
    D = distance_patients[idx, :][:, idx]
    im = ax_matrix.imshow(D, interpolation='nearest', cmap=cm.viridis)
    ax_matrix.set_xticks([])
    ax_matrix.set_yticks([])

    # Plot colorbar.
    ax_color = fig.add_axes([0.62, 0.1, 0.02, 0.6])
    ax_color.set_xticks([])
    plt.colorbar(im, cax=ax_color)

    # Flat clusters from Z, at most n_components of them.
    clust_nb = fcluster(Z, n_components, criterion='maxclust')
    # Cophenetic correlation coefficient and cophenetic distances.
    coph_dist, coph_matrix = cophenet(Z, pdist(distance_patients))
    print(' cophenetic correlation distance = ', coph_dist)

    ax_dendro.set_title(
        'network = {}\nalpha = {}\nmutation type = {}\ninfluence weight = {}\nsimplification = {}\ncomponent number = {}\nlambda = {}\nmethod = {}\ncophenetic corr = {}\n'
        .format(ppi_data, alpha, mut_type,
                influence_weight, simplification,
                n_components, lambd, linkage_method,
                format(coph_dist, '.2f')), loc='right')

    plot_name = "similarity_matrix_Patients" + (
        '_alpha={}_tol={}_singletons={}_ngh={}_minMut={}_maxMut={}_comp={}_permut={}_lambd={}_tolNMF={}_method={}'
        .format(alpha, tol, keep_singletons, ngh_max, min_mutation,
                max_mutation, n_components, n_permutations, lambd, tol_nmf,
                linkage_method))
    plt.savefig('{}{}.pdf'.format(hierarchical_factorization_directory,
                                  plot_name), bbox_inches='tight')
    plt.savefig('{}{}.svg'.format(hierarchical_factorization_directory,
                                  plot_name), bbox_inches='tight')

    savemat(hierarchical_clustering_file,
            {'Z_linkage_matrix': Z,
             'dendrogram_data_dictionary': P,
             'dendrogram_index': idx,
             'flat_cluster_number': clust_nb,
             'cophenetic_correlation_distance': coph_dist,
             'cophenetic_correlation_matrix': coph_matrix},
            do_compression=True)
Example #47
0
    ax.annotate(txt, (X_r[i, 0], X_r[i, 1]), horizontalalignment='center', verticalalignment='top',size = 14)
# Script fragment: title the PCA scatter plot, then run Ward clustering on
# the PCA coordinates and prepare the figure for its dendrogram.
# `pca`, `X_r`, `label`, `sch` and `pdist` come from earlier in the script.
#for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
plt.legend()
plt.title('Phoneme PCA')
plt.xlabel('First Principal Component (explained variance ratio = '+str(np.around(pca.explained_variance_ratio_[0], decimals = 3))+')')
plt.ylabel('Second Principal Component (explained variance ratio = '+str(np.around(pca.explained_variance_ratio_[1], decimals = 3))+')')
#plt.show()
#fig.savefig('pca.jpg', bbox_inches='tight')
#fig.savefig('pca.pdf', bbox_inches='tight')


# Generate the linkage matrix.
X = X_r
# Single-argument print() behaves the same on Python 2 and 3.
print('\n\nX =  %s' % (X,))
Z = sch.linkage(X, 'ward')
# Ward linkage works on Euclidean distances, so the cophenetic correlation
# is measured against Euclidean distances as well. The original compared the
# tree with Jaccard distances, which are not meaningful for real-valued
# PCA coordinates.
c, coph_dists = sch.cophenet(Z, pdist(X, 'euclidean'))
# Cophenetic correlation coefficient of the clustering.
# This compares (correlates) the actual pairwise distances of all samples with those implied by the hierarchical clustering.
# The closer the value is to 1, the better the clustering preserves the original distances.
print('%s %s' % (label, type(label[0])))
print(c)

# Set up the figure for the full dendrogram.
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Photos')
plt.ylabel('Distance')
sch.dendrogram(
    Z,
Example #48
0
def _truncate_to_two_decimals(value):
    """Cut ``str(value)`` to the width of its ``%.2f`` rendering and parse it back."""
    width = len('%.*f' % (2, value))
    return float(str(value)[:width])


def silhouette_score(dendroMatrix, distance_metric, linkage_method, labels):
    """
    Generate the silhouette score based on hierarchical clustering.

    The threshold and the flat-clustering criterion are read from
    ``request.form['threshold']`` and ``request.form['criterion']``.

    Args:
        dendroMatrix: list, occurrence of words in different files
        distance_metric: string, style of distance metric in the dendrogram
        linkage_method: string, style of linkage method in the dendrogram
        labels: list, file names

    Returns:
        silhouetteScore: string, containing the result of the silhouette score
        silhouetteAnnotation: string, annotation of the silhouette score
        score: float, silhouette score (a message string when it is invalid)
        inconsistentMax: float, upper bound of threshold if using the Inconsistent criterion
        maxclustMax: integer, upper bound of threshold if using the Maxclust criterion
        distanceMax: float, upper bound of threshold if using the Distance criterion
        distanceMin: float, lower bound of threshold if using the Distance criterion
        monocritMax: float, upper bound of threshold if using the Monocrit criterion
        monocritMin: float, lower bound of threshold if using the Monocrit criterion
        threshold: float/integer/string, threshold (t) value actually used
        All the bounds and the threshold are 'N/A' when no score can be computed.

    Raises:
        ValueError: if the requested criterion is not one of 'maxclust',
            'distance', 'inconsistent' or 'monocrit'.
    """
    activeFiles = len(labels) - 1
    if activeFiles > 2:  # since "number of labels should be more than 2 and less than n_samples - 1"
        Y = metrics.pairwise.pairwise_distances(dendroMatrix, metric=distance_metric)
        # NOTE(review): Y is a square distance matrix, but scipy's linkage
        # treats a 2-D input as observation vectors - confirm whether a
        # condensed matrix (squareform(Y)) was intended.
        Z = hierarchy.linkage(Y, method=linkage_method)

        monocrit = None

        # 'maxclust' range
        maxclustMax = len(labels) - 1

        # 'inconsistent' range
        R = hierarchy.inconsistent(Z, 2)
        inconsistentMax = _truncate_to_two_decimals(R[-1][-1])

        # 'distance' range
        d = hierarchy.cophenet(Z)
        distanceMax = _truncate_to_two_decimals(d.max())
        distanceMin = _truncate_to_two_decimals(d.min() + 0.01)

        # 'monocrit' range
        MR = hierarchy.maxRstat(Z, R, 0)
        monocritMax = _truncate_to_two_decimals(MR.max())
        monocritMin = _truncate_to_two_decimals(MR.min() + 0.01)

        # A blank field stays '' so each criterion can fall back to its default.
        threshold = request.form['threshold']
        if threshold == '':
            threshold = str(threshold)
        else:
            threshold = float(threshold)

        requested = request.form['criterion']
        if requested == 'maxclust':
            criterion = 'maxclust'
            if (threshold == '') or (threshold > maxclustMax):
                threshold = len(labels) - 1
            else:
                threshold = round(float(threshold))
        elif requested == 'distance':
            criterion = 'distance'
            if (threshold == '') or (threshold > distanceMax) or (threshold < distanceMin):
                threshold = distanceMax
        elif requested == 'inconsistent':
            criterion = 'inconsistent'
            if (threshold == '') or (threshold > inconsistentMax):
                threshold = inconsistentMax
        elif requested == 'monocrit':
            criterion = 'monocrit'
            monocrit = MR
            if (threshold == '') or (threshold > monocritMax) or (threshold < monocritMin):
                threshold = monocritMax
        else:
            # Previously fell through and failed on the unbound ``criterion``.
            raise ValueError('unsupported clustering criterion: %r' % (requested,))
        scoreLabel = hierarchy.fcluster(Z, t=threshold, criterion=criterion, monocrit=monocrit)

        if len(set(scoreLabel)) <= 1:  # all files ended up in a single cluster
            silhouetteScore = "Silhouette Score: invalid for only 1 cluster."
            silhouetteAnnotation = "because your file are too similar to each other, program classify all of them in the same cluster"
            score = 'invalid for only 1 cluster'
            inconsistentMax = maxclustMax = distanceMax = distanceMin = monocritMax = monocritMin = threshold = 'N/A'
        else:
            score = metrics.silhouette_score(Y, labels=scoreLabel, metric='precomputed')
            score = round(score, constants.ROUND_DIGIT)
            # Unicode escape for the less-than-or-equal sign; valid on both
            # Python 2 and 3 (the original str.decode call was Python 2 only).
            inequality = u'\u2264'
            silhouetteScore = "Silhouette Score: " + str(
                score) + "\n(-1 " + inequality + " Silhouette Score " + inequality + " 1)"
            silhouetteAnnotation = "The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar."

    else:
        silhouetteScore = "Silhouette Score: invalid for less than or equal to 2 files."
        silhouetteAnnotation = ""
        score = 'invalid for less than or equal to 2 files.'
        threshold = inconsistentMax = maxclustMax = distanceMax = distanceMin = monocritMax = monocritMin = 'N/A'

    return silhouetteScore, silhouetteAnnotation, score, inconsistentMax, maxclustMax, distanceMax, distanceMin, monocritMax, monocritMin, threshold
Example #49
0
            #  We stick with average-linkage
            #  Maybe we should check how original medoid order affects the clustering
            print('Clustering the medoids')
            D = dicedist_metric(medoids_df)  # And this is as cool as for spitting back a pandas dataframe
            condensedD = squareform(D)

            # Clustering time is irrelevant, report the qualities all at once
            cophenetic_ranking, best_method = cophenetic_best(condensedD)
            print('Cophenetic ranking\n%s\nbest: %s' % (cophenetic_ranking, best_method))

            linkage_method = 'average'
            print('Linkage: %s' % linkage_method)

            # --- Perform the linkage calculation
            Z = linkage(condensedD, method=linkage_method)
            cophenetic_correlation, _ = hierarchy.cophenet(Z, condensedD)

            # --- Save clustering to a json file for web-ingestion

            fn = get_hierarchy_file_prefix(dataset=dataset,
                region=neuropil, cluster_type=cluster_type)

            def save_hierarchy_json():
                # To keep the json small we should probably reduce the digits we save
                #   http://stackoverflow.com/questions/1447287/format-floats-with-standard-json-module
                # And of course, remove spaces, and maybe, use much shorter keys...
                # So make a function out of this, with parameter "small", and coordinate with the js world
                print('Saving json')
                tree = hierarchy2dictionary(Z, dendrogram=False, base=1)
                hdict = {
                    'dataset': dataset,
# Script fragment: TF-IDF -> truncated SVD -> Ward clustering -> dendrogram.
# `iiTD`, `featureIndex` and `MedianUniqueKeys` come from earlier in the script.
# Documents are processed in sorted-key order.
iiTD_ordered = OrderedDict(sorted(iiTD.items()))
tfidfV = TfidfVectorizer(stop_words='english',vocabulary=featureIndex,sublinear_tf=True)
tfs = tfidfV.fit_transform(iiTD_ordered.values())  # shape: (doc#, feature#)
print('TD matrix dimensions :', tfs.shape)

# --- SVD: reduce the term-document matrix to MedianUniqueKeys components.
svd = TruncatedSVD(n_components=MedianUniqueKeys,algorithm="arpack")   # Could also try: some % of the number of features instead of a fixed n_components, or change algorithm (arpack/randomized)
svd.fit(tfs)
Sigma=svd.transform(tfs)
print('Reduced Dimensions of TD Matrix', Sigma.shape)



# --- Clustering: Ward linkage on the reduced matrix.
Clustering_Order = linkage(Sigma,method='ward', metric='euclidean')  # Could also provide a connectivity constraint by making a graph of activities linked by category values. See http://scikit-learn.org/stable/tutorial/statistical_inference/unsupervised_learning.html
# Cophenetic correlation: how well the tree preserves the pairwise distances.
c, coph_dists = cophenet(Clustering_Order, pdist(Sigma))
print(str(c))
#print(Clustering_Order)
plt.figure(figsize=(25, 10))
plt.title('Clustering Dendrogram')
plt.xlabel('ID_space')
plt.ylabel('Distance')
dendrogram(
    Clustering_Order,
    #truncate_mode='lastp',
    #p=20,
    #show_leaf_counts=False,
    leaf_rotation=90.,
    leaf_font_size=10.,
    #show_contracted=True,
)
# Lesson script: Ward clustering of the `lang` data set, with its
# cophenetic correlation and dendrogram.
# Load the data.
lang = pd.read_csv('https://raw.githubusercontent.com/generalassembly-studio/dsi-course-materials/master/curriculum/04-lessons/week-07/3.2-lesson/assets/datasets/lang.csv?token=ANUte4ku6wHT_-2xOgUxMM_08YUJ0RB6ks5XWWISwA%3D%3D')
# Bare expression: only displays output in an interactive session/notebook.
lang.head()

# Scatter plot to guess the clusters by eye.
plt.scatter(lang['country'], lang['english'])
plt.show()

# Convert the data to a matrix to pass to the clustering algorithm - the matrix makes it easier for the algorithm to compute distances:
X = lang.as_matrix(columns=None)

# Run the actual clustering using the Ward method:
Z = linkage(X, 'ward')

# Calculate the cophenetic correlation coefficient to see how well the clustering preserves the distances between the points:
c, coph_dists = cophenet(Z, pdist(X))

# Display the coefficient (bare expression: interactive sessions only).
c

# Now draw the dendrogram.
plt.title('Dendrogram')
plt.xlabel('Index Numbers')
plt.ylabel('Distance')
dendrogram(
    Z,
    leaf_rotation=90.,  
    leaf_font_size=8.,
)
plt.show()
#data


# In[ ]:

# Linkage methods tried, with the cophenetic coefficient each one gave.
# NOTE(review): every assignment below is commented out, so `clusterInfo`
# must already be defined by an earlier cell - confirm which method is live.
#clusterInfo = linkage(data, 'ward')      # c=0.62
#clusterInfo = linkage(data, 'centroid')  # c=0.89
#clusterInfo = linkage(data, 'weighted')  # c=0.86
#clusterInfo = linkage(data, 'average')   # c=0.91
#clusterInfo = linkage(data, 'complete')  # c=0.90
#clusterInfo = linkage(data, 'single')    # c=0.78

# The cophenetic correlation coefficient measures
# how faithfully a dendrogram preserves the pairwise
# distances between the original data points:
(c, coph_dists) = cophenet(clusterInfo, pdist(data))
c


# In[ ]:

# Show the first 20 rows of the linkage matrix as a table.
pandas.DataFrame(clusterInfo[:20],columns=['feature1', 'feature2', 'distance', 'clusterSize'])


# In[ ]:

# Set up the figure for the full dendrogram.
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Survey Question')
plt.ylabel('Distance')
# Script fragment: print the averaged similarity matrices, run k-means for
# k = 2..9, then Ward clustering with a dendrogram.
# `cosDistsFeatures`, `cosDistsCirc`, `jsfea`, `jsCirc`, `circlepeople` and
# `feats` come from earlier in the script.
print("Similarity matrix according to cos distances")
print((np.matrix(cosDistsFeatures)+np.matrix(cosDistsCirc[:][:]))/2)

print("Similarity matrix according to jaccard distances")
print((np.matrix(jsfea)+np.matrix(jsCirc))/2)
# K-means clustering ------------------------
# Join the two arrays column-wise, so each row holds both sets of values.
concatted=np.concatenate((np.array(circlepeople), np.array(feats)), axis=1)
for num in range(2,10):
	print("k=")
	print(num)
	
	codebook, distortion = kmeans(concatted, num)
	code, dist = vq(concatted, codebook)
	print(code)

#centroids, labels = kmeans([ys2,circles,feats], 3)
# Hierarchical clustering

Z = linkage(concatted, 'ward')
# NOTE: `c` and `coph_dists` are not used again in the visible fragment.
c, coph_dists = cophenet(Z, pdist(concatted))
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
dendrogram(
    Z,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
)
plt.show()
Example #54
0
# Script fragment: score each document against the first LSI topic's tokens,
# then run Ward clustering on `matrix3` and draw its dendrogram.
# `tokens_txt`, `tokens_lsi`, `a0`, `norm`, `documents`, `lsi` and `matrix3`
# come from earlier in the script.
# For every document, sum the counts of the tokens listed in tokens_lsi[0].
for i in range(0,len(tokens_txt)):
    a0.append(np.sum([Counter(tokens_txt[i])[x] for x in tokens_lsi[0]]))
topic1=norm(a0)

# Print the documents whose topic score exceeds the threshold.
threshold=0.3
[print(topic1[i],documents[i]) for i in np.where(topic1>threshold)[0]]
lsi.print_topics(1)

from scipy.cluster.hierarchy import dendrogram, linkage

P = linkage(matrix3, 'ward')

from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

# Cophenetic correlation of the Ward tree against the pairwise distances.
corr, coph_distances = cophenet(P, pdist(matrix3))
# Bare expression: only displays output in an interactive session/notebook.
corr

plt.figure(figsize=(9,4))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('PARAGRAPH')
plt.ylabel('DISTANCE')
dendrogram(P,
    leaf_rotation=0.,
    leaf_font_size=12.,)
plt.show()

model00=[]
for i in range(0,len(sentences)):
    tokens = word_tokenize(str(sentences[i]))
    
from scipy.stats import pearsonr

# Condensed pairwise distances between the rows of X, and the single-linkage
# hierarchy built on top of them.
distance = scipy.spatial.distance.pdist(X)
single_hierarchy = scipy.cluster.hierarchy.single(distance)

# max_d = 1.05
# prediction = fcluster(single_hierarchy, max_d, criterion='distance')
# prediction_single_hier = prediction

# The cophenetic distances of each hierarchical algorithm are meant to be
# compared:
#   1: against the original distance matrix
#   2: against each other
single_cophenet = cophenet(single_hierarchy)

distance_metrics = [distance, single_cophenet]
n_metrics = len(distance_metrics)

# comparisons[i, j] is the Pearson correlation between metric i and metric j.
comparisons = np.zeros((n_metrics, n_metrics))
for i, j in itertools.product(np.arange(n_metrics), repeat=2):
    comparisons[i, j] = pearsonr(distance_metrics[i], distance_metrics[j])[0]


######
# to get all through the different data types:
#Happened Correctly


# In[43]:

#Clustering based on Cosine Metric and Average linkage Method
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram    
from scipy.cluster.hierarchy import cophenet

# Work on every row except the first one.
checkx = diabetic_patients_binary_ddup[1:]

# Cosine distance between rows, using every column except the last.
# DataFrame.ix was removed in pandas 1.0, so the columns are selected
# positionally with .iloc instead.
# NOTE(review): this assumes the column labels are not integers; with integer
# labels the old .ix slice would have been label-based -- confirm.
X = pdist(
    checkx.iloc[:, 0:len(diabetic_patients_binary_ddup.columns) - 1],
    metric='cosine',
)

# Average-linkage clustering on the condensed distances, then the cophenetic
# correlation between the dendrogram and those distances.
Z = linkage(X, method='average')
c, cd = cophenet(Z, X)
c  # bare expression: displayed as the cell output when run in a notebook


# In[44]:

## Code to make the dendrogram

# plt.title('Hierarchical Clustering Dendrogram (truncated)')
# plt.xlabel('Patient Group')
# plt.ylabel(' Cosine distance')
# dendrogram(
#     Z,
#     truncate_mode='level',  # show only the last p merged clusters
#     p=100,  # show only the last p merged clusters
#     show_leaf_counts=True,  # otherwise numbers in brackets are counts
Example #57
0
def coph_cor(A, idx=None):
    """Return the cophenetic correlation coefficient derived from matrix A.

    The entries of A strictly above the main diagonal are read row by row,
    turned into distances as ``1 - value`` and clustered with average
    linkage; the result is the cophenetic correlation between that
    hierarchy and the distances it was built from.

    A must support ``A[i, j]`` indexing and expose ``shape``.
    ``idx`` is accepted but not used by this function.
    """
    upper_entries = []
    for row in range(A.shape[0] - 1):
        for col in range(row + 1, A.shape[1]):
            upper_entries.append(A[row, col])

    # 1-D input to linkage is interpreted as a condensed distance matrix.
    distances = 1 - np.array(upper_entries)
    hierarchy = linkage(distances, method='average')

    coefficient = cophenet(hierarchy, distances)[0]
    return coefficient
Example #58
0
    
                    subsetProp[i, 0] = density[index]
                    subsetProp[i, 1] = temperature[index]
                    subsetProp[i, 2] = snII[index]
                    

                # Generate the linkage matrix
                # Use the Ward variance minimization algorithm
                time1 = time.time()
                z = sh.linkage(subsetLoc, 'ward')
                time2 = time.time()
                print 'Duration of linkage = {0:f} seconds'.format(time2-time1)

                # Determine how well the clustering preserves the
                # original distance
                c, coph_dist = sh.cophenet(z, sd.pdist(subsetLoc))
                print c


                # Create the fancy dendrogram and save
                fancy_dendrogram(z, truncate_mode='lastp', p=12,
                                leaf_rotation=90, leaf_font_size=12,
                                show_contracted=True, annotate_above=10)

                figname = '{0:s}_{1:s}_{2:s}_abscells_dendrogram.png'.format(ion,galID,expn)
                plt.savefig(figname, bbox_inches='tight')
                plt.cla()
                plt.clf()

                k = num_clusters(z)
                print 'Number of clusters = {0:d}'.format(k)
Example #59
0
def main():
    """Cluster genes by their RPKM profiles, report the clusters and plot them.

    Inputs are taken from the module-level ``args``:

    * ``args.gff_file``   -- GFF3 annotation parsed with ``Gff3Parser``;
      every entry whose feature is not ``"source"`` is kept.
    * ``args.input_file`` -- tab-separated table; only rows whose first
      column is ``"sense"`` and whose fourth column is ``"CDS"`` or
      ``"sRNA"`` are used, with columns 10-23 read as floats.
    * ``args.max_d``      -- distance cut-off handed to ``fcluster`` and
      drawn on the dendrogram.

    The cluster sizes are printed, followed by each cluster id and the
    ``info`` of every annotation entry matching a gene of that cluster.
    The dendrogram is written to ``hierarchical_tree.png``.
    """
    all_rpkms = {"names": [], "rpkms": []}
    srna_rpkms = {"names": [], "rpkms": []}
    gene_rpkms = {"names": [], "rpkms": []}

    # Context managers guarantee both input files are closed, even on error.
    with open(args.gff_file, "r") as gff_f:
        genes = [entry for entry in Gff3Parser().entries(gff_f)
                 if entry.feature != "source"]

    with open(args.input_file, "r") as fh:
        for row in csv.reader(fh, delimiter='\t'):
            # Only "sense" rows are used; this also skips the header row,
            # whose first column starts with "Orientation".
            if row[0] != "sense":
                continue
            gene_name = get_name(row)
            # Columns 10..23 hold the 14 expression values of the row
            # (presumably one per library in the commented-out `labels`
            # list below -- confirm).
            rpkm_row = [float(row[col]) for col in range(10, 24)]
            if row[3] == "CDS":
                subset = gene_rpkms
            elif row[3] == "sRNA":
                subset = srna_rpkms
            else:
                continue
            for table in (all_rpkms, subset):
                table["names"].append(gene_name)
                table["rpkms"].append(rpkm_row)

    data = np.array(all_rpkms["rpkms"])
    Z = linkage(data, method='ward', metric='euclidean')
    # Cophenetic correlation of the hierarchy; not used further below.
    c, coph_dists = cophenet(Z, pdist(data))
    clusters = fcluster(Z, args.max_d, criterion='distance')

    # Group gene names and expression rows by their flat cluster id.
    nums = {}
    names = {}
    c_genes = {}
    for cluster_id, clustered_name, clustered_row in zip(
            clusters, all_rpkms["names"], all_rpkms["rpkms"]):
        nums[cluster_id] = nums.get(cluster_id, 0) + 1
        names.setdefault(cluster_id, []).append(clustered_name)
        c_genes.setdefault(cluster_id, []).append(clustered_row)
    print(nums)

    # Disabled: per-cluster line plots of the expression profiles.
#    x = np.arange(14)
#    labels = ["TSB_OD_0.2", "TSB_OD_0.5", "TSB_OD_1", "TSB_t0", "TSB_t1", "TSB_t2", "TSB_ON",
#              "pMEM_OD_0.2", "pMEM_OD_0.5", "pMEM_OD_1", "pMEM_t0", "pMEM_t1", "pMEM_t2", "pMEM_ON"]
#    color_list = list(six.iteritems(colors.cnames))
#    for index, gene_list in c_genes.items():
#        plt.figure(figsize=(12.5, 8))
#        srna_detect = False
#        srna_num = 1
#        color_num = 0
#        for i in range(len(gene_list)):
#            if "sRNA" in names[index][i]:
#                srna_detect = True
#                if ":" in names[index][i]:
#                    srna_name = names[index][i].split(":")[-1]
#                else:
#                    srna_name = "novel_" + str(srna_num)
#                    srna_num += 1
#                if ("grey" not in color_list[color_num][0]) and (
#                    "gray" not in color_list[color_num][0]) and (
#                    "white" not in color_list[color_num][0]) and (
#                    "snow" not in color_list[color_num][0]) and (
#                    color_list[color_num][0] != "w"):
#                    plt.plot(x,gene_list[i], color=color_list[color_num][0], label=srna_name)
#                    color_num += 1
#            else:
#                plt.plot(x,gene_list[i],color='lightgrey')
##        plt.axhline(y=0, linewidth=2, color='red')
#        plt.ylabel("log2 fold change", fontsize=10)
#        plt.xticks(x,labels,rotation=45, fontsize=8)
#        if srna_detect:
#            plt.legend(loc=9, bbox_to_anchor=(1.065, 1), fontsize=8)
#        plt.savefig("test_" + str(index) + ".png")

    # Print the annotation entry of every clustered gene, cluster by cluster.
    for index, gene_names in names.items():
        print(index)
        for name in gene_names:
            for gene in genes:
                if "locus_tag" in gene.attributes:
                    if name == gene.attributes["locus_tag"]:
                        print(gene.info)
                elif "sRNA_hit" in gene.attributes:
                    # These entries are matched on a "Name|start|end|strand"
                    # name; names with fewer than four fields cannot match
                    # and must not raise IndexError.
                    infos = name.split("|")
                    if (len(infos) >= 4) and (
                        infos[0] == gene.attributes["Name"]) and (
                        infos[1] == str(gene.start)) and (
                        infos[2] == str(gene.end)) and (
                        infos[3] == gene.strand):
                        print(gene.info)

    plt.style.use('ggplot')
    plt.figure(figsize=(25, 10))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('Genes')
    plt.ylabel('distance')
    fancy_dendrogram(
        Z,
#        truncate_mode='lastp',
#        p=12,
        leaf_rotation=90.,
        leaf_font_size=12.,
#        show_contracted=True,
#        annotate_above=10,
        no_labels=True,
        show_leaf_counts=False,
        max_d=args.max_d,  # plot a horizontal cut-off line
    )
    plt.savefig("hierarchical_tree.png")