def progressive_msa_and_tree(sequences,
                             pairwise_aligner,
                             metric=kmer_distance,
                             guide_tree=None,
                             display_aln=False,
                             display_tree=False):
    """ Perform progressive msa of sequences and build a UPGMA tree
    Parameters
    ----------
    sequences : list of skbio.Sequence objects (or subclasses)
        The sequences to be aligned.
    pairwise_aligner : function
        Function that should be used to perform the pairwise alignments,
        for example skbio.alignment.global_pairwise_align_nucleotide. Must
        support skbio.Sequence objects or skbio.TabularMSA objects
        as input.
    metric : function, optional
        Function that returns a single distance value when given a pair of
        skbio.Sequence objects. This will be used to build a guide tree if one
        is not provided.
    guide_tree : skbio.TreeNode, optional
        The tree that should be used to guide the alignment process.
    display_aln : bool, optional
        Print the alignment before returning.
    display_tree : bool, optional
        Print the tree before returning.

    Returns
    -------
    skbio.TabularMSA
    skbio.TreeNode

    """
    if guide_tree is None:
        guide_dm = DistanceMatrix.from_iterable(
                        sequences, metric=metric, key='id')
        guide_lm = average(guide_dm.condensed_form())
        guide_tree = TreeNode.from_linkage_matrix(guide_lm, guide_dm.ids)

    msa = progressive_msa(sequences, guide_tree,
                          pairwise_aligner=pairwise_aligner)

    if display_aln:
        print(msa)

    msa_dm = DistanceMatrix.from_iterable(msa, metric=metric, key='id')
    msa_lm = average(msa_dm.condensed_form())
    msa_tree = TreeNode.from_linkage_matrix(msa_lm, msa_dm.ids)
    if display_tree:
        print("\nOutput tree:")
        d = dendrogram(msa_lm, labels=msa_dm.ids, orientation='right',
                   link_color_func=lambda x: 'black', leaf_font_size=24)
    return msa, msa_tree
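
The function above chains DistanceMatrix -> condensed form -> average() -> TreeNode. The same UPGMA pipeline can be exercised with scipy alone; a minimal sketch on made-up data, with scipy's ClusterNode standing in for skbio's TreeNode:

import numpy as np
from scipy.cluster.hierarchy import average, to_tree
from scipy.spatial.distance import pdist

# four toy "sequences" embedded as 1-D points (made-up data)
points = np.array([[0.0], [1.0], [5.0], [6.0]])
condensed = pdist(points)  # condensed distances, like guide_dm.condensed_form()
lm = average(condensed)    # UPGMA linkage matrix
root = to_tree(lm)         # cluster-tree root
print(root.get_count())    # 4 leaves
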
def progressive_msa_and_tree(
    sequences,
    pairwise_aligner,
    sequence_distance_fn=kmer_distance,
    guide_tree=None,
    display_aln=False,
    display_tree=False,
):
    """ Perform progressive msa of sequences and build a UPGMA tree
    Parameters
    ----------
    sequences : skbio.SequenceCollection
        The sequences to be aligned.
    pairwise_aligner : function
        Function that should be used to perform the pairwise alignments,
        for example skbio.alignment.global_pairwise_align_nucleotide. Must
        support skbio.BiologicalSequence objects or skbio.Alignment objects
        as input.
    sequence_distance_fn : function, optional
        Function that returns an skbio.DistanceMatrix when given an
        skbio.SequenceCollection. This will be used to build a guide tree if
        one is not provided.
    guide_tree : skbio.TreeNode, optional
        The tree that should be used to guide the alignment process.
    display_aln : bool, optional
        Print the alignment before returning.
    display_tree : bool, optional
        Print the tree before returning.

    Returns
    -------
    skbio.Alignment
    skbio.TreeNode

    """
    if guide_tree is None:
        guide_dm = sequences.distances(sequence_distance_fn)
        guide_lm = average(guide_dm.condensed_form())
        guide_tree = TreeNode.from_linkage_matrix(guide_lm, guide_dm.ids)

    msa = progressive_msa(sequences, guide_tree, pairwise_aligner=pairwise_aligner)
    if display_aln:
        print(msa)

    msa_dm = msa.distances()
    msa_lm = average(msa_dm.condensed_form())
    msa_tree = TreeNode.from_linkage_matrix(msa_lm, msa_dm.ids)
    if display_tree:
        print("\nOutput tree:")
        d = dendrogram(
            msa_lm, labels=msa_dm.ids, orientation="right", link_color_func=lambda x: "black", leaf_font_size=24
        )
    return msa, msa_tree
Example #3
def hierclust(dataset,k):
    f = open('biclusters/'+dataset+'M.bic')
    bics = []
    for l in f:
        bics.append( json.loads(l) )
    f.close()

    dist = []
    for i,b1 in enumerate(bics):
        for b2 in bics[i+1:]:
            dist.append(Jaccard(b1,b2))

    clusters = average(dist)    
    clustdict = {i:[i] for i in xrange(len(clusters)+1)}
    for i in xrange(len(clusters)-k+1):
        clust1= int(clusters[i][0])
        clust2= int(clusters[i][1])
        clustdict[max(clustdict)+1] = clustdict[clust1] + clustdict[clust2]
        del clustdict[clust1], clustdict[clust2]

    newbics = []
    for clusts in clustdict.values():        
        objs = reduce(lambda x,y: x+y,[bics[idx]['objs'] for idx in clusts])
        feats = reduce(lambda x,y: x+y,[bics[idx]['feats'] for idx in clusts])
        newbics.append({'objs': list(set(objs)), 'feats':list(set(feats))})
        
    fw = open('biclusters/'+dataset+'.bic','w')
    for bic in newbics:
        fw.write(json.dumps(bic)+'\n')
    fw.close()
    def linkage_along_graph(self):
        """
        Return the UPGMA linkage matrix for the distances along the graph.
        """
        if getattr(self, '_dist_linkage', None) is None:
            self._dist_linkage = average(squareform(self.graph_distances))
        return self._dist_linkage
def cluster_alchemy(dataset, gamma=None, filter=False):
    doc_proc = dp.DocumentsProcessor(dataset)
    if gamma:
        tfidf_matrix, f_score_dict, params = doc_proc.get_data_with_alchemy(gamma=gamma, filter=filter)
    else:
        tfidf_matrix, f_score_dict, params = doc_proc.get_data_with_alchemy()

    print 'starting clustering: found %s document and %s features' \
          % (tfidf_matrix.shape[0], tfidf_matrix.shape[1])

    linkage_matrix = hr.average(tfidf_matrix.toarray())

    t = hr.to_tree(linkage_matrix, rd=True)

    clusters = {}

    for node in t[1]:
        if not node.is_leaf():
            l = []
            clusters[node.get_id()] = collect_leaf_nodes(node, l)

    f = f_score(clusters, f_score_dict)

    l = print_f_score_dict(f)

    params['avg_f_score'] = average_f_score(f, tfidf_matrix.shape[0])
    params['all_fscore'] = l

    print 'average f_score: %s' % params['avg_f_score']
    return params
def guide_tree_from_sequences(sequences,
                              distance_fn=kmer_distance,
                              display_tree = False):
    """ Build a UPGMA tree by applying distance_fn to sequences

    Parameters
    ----------
    sequences : skbio.SequenceCollection
      The sequences to be represented in the resulting guide tree.
    distance_fn : function
      Function that returns an skbio.DistanceMatrix when given an
      skbio.SequenceCollection.
    display_tree : bool, optional
      Print the tree before returning.

    Returns
    -------
    skbio.TreeNode

    """
    guide_dm = sequences.distances(distance_fn)
    guide_lm = average(guide_dm.condensed_form())
    guide_tree = to_tree(guide_lm)
    if display_tree:
        guide_d = dendrogram(guide_lm, labels=guide_dm.ids, orientation='right',
               link_color_func=lambda x: 'black')
    return guide_tree
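
kmer_distance is the default metric in several of these examples but is not defined here; a plausible stand-in (an assumption, not the repositories' exact code) is the Jaccard distance between the two sequences' k-mer sets:

def kmer_distance(seq1, seq2, k=3):
    # hypothetical sketch: 1 - |shared k-mers| / |all k-mers|
    s1, s2 = str(seq1), str(seq2)
    kmers1 = {s1[i:i + k] for i in range(len(s1) - k + 1)}
    kmers2 = {s2[i:i + k] for i in range(len(s2) - k + 1)}
    return 1.0 - len(kmers1 & kmers2) / len(kmers1 | kmers2)

print(kmer_distance("ACGTACGT", "ACGTTGCA"))  # 0.75
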
def scipy_algo(dataset, abstract=False):
    doc_proc = dp.DocumentsProcessor(dataset)
    tfidf_matrix, f_score_dict = doc_proc.get_data(abstract)

    svd = TruncatedSVD(tfidf_matrix.shape[0])
    lsa = make_pipeline(svd, Normalizer(copy=False))

    #tfidf_matrix = lsa.fit_transform(tfidf_matrix)

    print 'starting clustering after lsa: found %s document and %s features' \
          % (tfidf_matrix.shape[0], tfidf_matrix.shape[1])

    linkage_matrix = hr.average(tfidf_matrix.toarray())
    #linkage_matrix = hr.average(tfidf_matrix)

    t = hr.to_tree(linkage_matrix, rd=True)

    clusters = {}

    for node in t[1]:
        if not node.is_leaf():
            l = []
            clusters[node.get_id()] = collect_leaf_nodes(node, l)

    f = f_score(clusters, f_score_dict)

    print_f_score_dict(f)

    avg_f_score = average_f_score(f, tfidf_matrix.shape[0])
    print 'average f_score: %s' % avg_f_score
    return avg_f_score
def guide_tree_from_sequences(sequences,
                              metric=kmer_distance,
                              display_tree = False):
    """ Build a UPGMA tree by applying metric to sequences

    Parameters
    ----------
    sequences : list of skbio.Sequence objects (or subclasses)
      The sequences to be represented in the resulting guide tree.
    metric : function
      Function that returns a single distance value when given a pair of
      skbio.Sequence objects.
    display_tree : bool, optional
      Print the tree before returning.

    Returns
    -------
    skbio.TreeNode

    """
    guide_dm = DistanceMatrix.from_iterable(
                    sequences, metric=metric, key='id')
    guide_lm = average(guide_dm.condensed_form())
    guide_tree = to_tree(guide_lm)
    if display_tree:
        guide_d = dendrogram(guide_lm, labels=guide_dm.ids, orientation='right',
               link_color_func=lambda x: 'black')
    return guide_tree
def cluster_dandelion_2(dataset, gamma=0.91, filter=False):
    # duplicate: only needed to return the linkage_matrix
    doc_proc = dp.DocumentsProcessor(dataset)
    if gamma:
        tfidf_matrix, f_score_dict, params = doc_proc.get_data_with_dandelion(
            gamma=gamma, filter=filter)
    else:
        tfidf_matrix, f_score_dict, params = doc_proc.get_data_with_dandelion()

    svd = TruncatedSVD(tfidf_matrix.shape[0])
    lsa = make_pipeline(svd, Normalizer(copy=False))

    tfidf_matrix = lsa.fit_transform(tfidf_matrix)

    #linkage_matrix = hr.average(tfidf_matrix.toarray())
    linkage_matrix = hr.average(tfidf_matrix)

    t = hr.to_tree(linkage_matrix, rd=True)

    clusters = {}

    for node in t[1]:
        if not node.is_leaf():
            l = []
            clusters[node.get_id()] = collect_leaf_nodes(node, l)

    f = f_score(clusters, f_score_dict)

    l = print_f_score_dict(f)

    params['avg_f_score'] = average_f_score(f, tfidf_matrix.shape[0])
    params['all_fscore'] = l

    return linkage_matrix
Example #10
    def hcluster(self):
        """

        .. plot::
            :include-source:
            :width: 50%

            from cno import XCNOGraph, cnodata
            c = XCNOGraph(cnodata("PKN-ToyPB.sif"), cnodata("MD-ToyPB.csv"))
            c.hcluster()

        .. warning:: experimental
        """
        from scipy.cluster import hierarchy
        from scipy.spatial import distance
        path_length=nx.all_pairs_shortest_path_length(self.to_undirected())
        n = len(self.nodes())
        distances=np.zeros((n,n))
        nodes = self.nodes()
        for u,p in path_length.iteritems():
            for v,d in p.iteritems():
                distances[nodes.index(u)-1][nodes.index(v)-1] = d
        sd = distance.squareform(distances)
        hier = hierarchy.average(sd)
        pylab.clf();
        hierarchy.dendrogram(hier)

        pylab.xticks(pylab.xticks()[0], nodes)
    def linkage_in_structure(self):
        """
        Return the UPGMA linkage matrix based on the correlation structure of
        the topslam embedding MST
        """
        if getattr(self, '_struct_linkage', None) is None:
            self._struct_linkage = average(pdist(self.distances_in_structure, metric='correlation'))
        return self._struct_linkage
Example #12
def make_tree(X, C, method='single'):
    if method == 'single':
        tree = to_tree(single(C))
    elif method == 'ward':
        tree = to_tree(ward(X))
    elif method == 'average':
        tree = to_tree(average(C))
    return Tree(root=construct_node(tree))
def group_tuples(items=None, val_ind=None, dist_thresh = 0.1, distance_matrix=None, 
                 metric='jaccard', linkage='complete', sp_areas=None):
    '''
    items: a dict or list of tuples
    val_ind: the index of the item of interest within each tuple
    '''
    
    if distance_matrix is not None:
        if items is not None:
            if isinstance(items, dict):
                keys = items.keys()
                values = items.values()
            elif isinstance(items, list):
                keys = range(len(items))
                if isinstance(items[0], tuple):
                    values = map(itemgetter(val_ind), items)
                else:
                    values = items
    else:
        if isinstance(items, dict):
            keys = items.keys()
            values = items.values()
        elif isinstance(items, list):
            keys = range(len(items))
            if isinstance(items[0], tuple):
                values = map(itemgetter(val_ind), items)
            else:
                values = items
        else:
            raise Exception('clusters is not the right type')

        assert items is not None, 'items must be provided'
        distance_matrix = compute_pairwise_distances(values, metric, sp_areas=sp_areas)
    
    if items is None:
        assert distance_matrix is not None, 'distance_matrix must be provided.'    
        
    if linkage=='complete':
        lk = complete(squareform(distance_matrix))
    elif linkage=='average':
        lk = average(squareform(distance_matrix))
    elif linkage=='single':
        lk = single(squareform(distance_matrix))

    # T = fcluster(lk, 1.15, criterion='inconsistent')
    T = fcluster(lk, dist_thresh, criterion='distance')
    
    n_groups = len(set(T))
    groups = [None] * n_groups

    for group_id in range(n_groups):
        groups[group_id] = np.where(T == group_id+1)[0]

    index_groups = [[keys[i] for i in g] for g in groups if len(g) > 0]
    item_groups = [[items[i] for i in g] for g in groups if len(g) > 0]
    
    return index_groups, item_groups, distance_matrix
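
The heart of group_tuples is the squareform -> linkage -> fcluster chain; a self-contained toy run (hypothetical distances, not data from the source repo):

import numpy as np
from scipy.cluster.hierarchy import complete, fcluster
from scipy.spatial.distance import squareform

D = np.array([[0.0, 0.05, 0.9],
              [0.05, 0.0, 0.8],
              [0.9, 0.8, 0.0]])
lk = complete(squareform(D))                 # square matrix -> condensed -> linkage
T = fcluster(lk, 0.1, criterion='distance')  # cut at dist_thresh
print(T)                                     # e.g. [1 1 2]: the close pair is grouped
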
def cluster_fabio(db, dataset, gamma=None, with_lsa=False, ranking_metric='r'):
    doc_proc = dp.DocumentsProcessor(dataset, db=db)
    if gamma:
        tfidf_matrix, f_score_dict, params = doc_proc.get_data_fabio(
            rank_metric=ranking_metric, gamma=gamma)
    else:
        tfidf_matrix, f_score_dict, params = doc_proc.get_data_fabio(
            rank_metric=ranking_metric)

    doc, features = tfidf_matrix.shape

    print 'starting clustering: found %s document and %s features' \
          % (doc, features)

    if with_lsa:
        svd = TruncatedSVD(tfidf_matrix.shape[0])
        lsa = make_pipeline(svd, Normalizer(copy=False))

        tfidf_matrix = lsa.fit_transform(tfidf_matrix)
        linkage_matrix = hr.average(tfidf_matrix)
    else:
        linkage_matrix = hr.average(tfidf_matrix.toarray())

    t = hr.to_tree(linkage_matrix, rd=True)

    clusters = {}

    for node in t[1]:
        if not node.is_leaf():
            l = []
            clusters[node.get_id()] = collect_leaf_nodes(node, l)

    f = f_score(clusters, f_score_dict)

    l = print_f_score_dict(f)

    params['avg_f_score'] = average_f_score(f, tfidf_matrix.shape[0])
    params['all_fscore'] = l

    print 'average f_score: %s' % params['avg_f_score']
    return params
def cluster_matrix(matrix, labels, dpi, by_cols, algorithm):
    """
    From a matrix, generate a distance matrix & perform hierarchical clustering

    :param matrix: a numpy matrix of scores
    :param labels: the ids for all row elements or column elements
    :param dpi: the resolution to save the diagram at
    :param by_cols: whether to perform the clustering by row similarity
                    (default) or column similarity.
    :param algorithm: the clustering algorithm: False (default) for scipy's
                      generic linkage, True for UPGMA (average linkage)

    :type matrix: numpy matrix
    :type labels: list
    :type dpi: int
    :type by_cols: boolean (default == False)
    :type algorithm: boolean

    :returns: a tuple of the updated (clustered) matrix & the updated labels
    """
    if by_cols:
        matrix = matrix.transpose()
    print "\nClustering the matrix"
    # Clear any matplotlib formatting
    plt.clf()
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Hide x labels/ticks
    ax.set_yticklabels([])
    ax.set_yticks([])
    plt.xticks(fontsize=6)
    Y = pdist(matrix)
    if not algorithm:
        Z = linkage(Y)
        print "Linkage algorithm\n"
    else:
        Z = average(Y)
        print "UPGMA algorithm\n"
    dend = dendrogram(Z, labels=labels, link_color_func=None)
    plt.savefig("dendrogram.png", dpi=dpi)
    # Reshape
    ordered_index = dend['leaves']
    updated_labels = dend['ivl']
    tmp = []
    for i in range(0, len(ordered_index)):
        tmp.append(list(matrix[ordered_index[i], :]))
    matrix = np.array(tmp)
    if by_cols:
        matrix = matrix.transpose()
    return matrix, updated_labels
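
The reshape loop at the end of cluster_matrix simply permutes rows into dendrogram leaf order; numpy fancy indexing does the same in one step (a sketch on made-up data):

import numpy as np
from scipy.cluster.hierarchy import average, dendrogram
from scipy.spatial.distance import pdist

matrix = np.array([[0.0, 1.0], [5.0, 5.0], [0.0, 2.0]])
Z = average(pdist(matrix))                     # UPGMA on row distances
order = dendrogram(Z, no_plot=True)['leaves']  # leaf order without drawing
print(order, matrix[order, :].tolist())        # rows permuted into leaf order
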
Example #16
def __apply_cluster_alg(cluster_data=[], alg="kmean", prior_cluster_num=2, t=0.155):
    """clustering"""
    if alg == "kmean":
        from scipy.cluster.vq import whiten

        cluster_data = whiten(cluster_data)
        from scipy.cluster.vq import kmeans, vq

        centroids, _ = kmeans(cluster_data, prior_cluster_num, iter=250)
        idx, dist = vq(cluster_data, centroids)
        return idx, prior_cluster_num
    elif alg == "spec":
        from sklearn import cluster
        from sklearn.preprocessing import StandardScaler

        X = cluster_data
        X = StandardScaler().fit_transform(X)
        spectral = cluster.SpectralClustering(n_clusters=prior_cluster_num, eigen_solver="arpack")
        spectral.fit(X)
        import numpy as N

        idx = spectral.labels_.astype(N.int)
        return idx, prior_cluster_num
    else:
        """hierarchical clustering
		   http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html"""
        import scipy.cluster.hierarchy as hcluster

        """needs distance matrix: 
		   http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.distance.pdist.html"""
        import scipy.spatial.distance as dist

        distmat = dist.pdist(cluster_data, "minkowski")  #'euclidean')
        if alg == "hflat":
            link = hcluster.linkage(distmat)
        elif alg == "hcomp":
            link = hcluster.complete(distmat)
        elif alg == "hweight":
            link = hcluster.weighted(distmat)
        elif alg == "havg":
            link = hcluster.average(distmat)
        idx = hcluster.fcluster(link, t=t, criterion="distance")
        import numpy as N

        post_cluster_num = len(N.unique(idx))
        print "# of channels established:", post_cluster_num
        assert post_cluster_num < 64, "number of cluster too large to be biological meaningful"
        return idx, post_cluster_num
def main(): 
    URL = 'C:\Users\NYU\Desktop\\'
    ListofInputFiles = URL + "A31" + "\*"
    DendogramImage = "C:\Users\NYU\Desktop\Dendogram" + ".png"
    FieldNames = []
    ReadGroups = []
    CulturalHoleMatrix = [] 
    ArraysOfFileReads = []
    CleanedContent = defaultdict()
    FileContent = defaultdict()
    CulturalHole = defaultdict(list)
    Preprocessed = defaultdict(list)
    FileReads = CreateList(ListofInputFiles)
    with open(FileReads[1]) as f:
        for l in f:
            FieldNames.append(l.strip().split(",")[1])    
    with open(FileReads[0]) as f:
        for l in f:
            ArraysOfFileReads.append(l.strip().split("\t"))
    for i in range(len(ArraysOfFileReads)):
        FileContent[ArraysOfFileReads[i][0]] = ArraysOfFileReads[i][1]
    with open(FileReads[2]) as f1:
        for l in f1:
            ReadGroups.append(l.strip().split("\t"))
        NewList = ReadGroups[1:]
    for i,f in enumerate(NewList[1:]):
        if FileContent[NewList[i][0]] == 'null':
            continue
        else:
            Preprocessed[NewList[i][1]].append(FileContent[NewList[i][0]]) 
    for k,v in Preprocessed.items():
        Words = (word for word in str(v).split() if word.isalpha() and len(word)>1) #Remove all single-letter & alpha-numeric entries
        CleanedContent[k] = StopWords(Words)
    KeyList = sorted([int(i) for i in CleanedContent])
    for i in itertools.product(KeyList, repeat=2):
        Writer = str(i[0])
        Reader = str(i[1])
        CulturalHole[i[0]].append(1 - Calculate_CH(CleanedContent[Writer],CleanedContent[Reader],(CleanedContent[Writer] + " " + CleanedContent[Reader])))
    CulturalHoleMatrix = [CulturalHole[key] for key in CulturalHole]
    UPGMAMatrix = np.array(CulturalHoleMatrix)
    #UPGMA Clustering 
    UPGMACluster = UPGMA.average(UPGMAMatrix) 
    fig = plt.figure(figsize=(20,10))  
    plt.title("Document Jargon Distance/Relation")
    #Dendrogram Plotting
    UPGMA.dendrogram(UPGMACluster, labels=np.array(FieldNames))
    plt.xlabel("Group Names")
    plt.savefig(DendogramImage)    
def create_dendrogram(g):
    """
    create_dendrogram(g)
    create a dendrogram (tree structure) from the graph, from the lowest to the highest level
    :param g:   source graph
    :return:    hier - hierarchy
    """
    logging.info(cs_ref, 'create graph')
    path_length = nx.all_pairs_shortest_path_length(g)
    n = len(g.nodes())
    distances = np.zeros((n, n))
    for u, p in path_length.iteritems():
        for v, d in p.iteritems():
            distances[int(u) - 1][int(v) - 1] = d
    sd = distance.squareform(distances)
    hier = hierarchy.average(sd)
    return hier
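
create_dendrogram above targets the old networkx 1.x API (iteritems, a dict return value); under networkx 2.x all_pairs_shortest_path_length returns an iterator, so an equivalent sketch (assuming nodes labeled 0..n-1) looks like this:

import networkx as nx
import numpy as np
from scipy.cluster import hierarchy
from scipy.spatial import distance

g = nx.path_graph(4)  # toy graph with nodes 0..3
n = g.number_of_nodes()
distances = np.zeros((n, n))
for u, p in dict(nx.all_pairs_shortest_path_length(g)).items():
    for v, d in p.items():
        distances[u][v] = d
hier = hierarchy.average(distance.squareform(distances))
print(hier)
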
def plotHaplotypes(chr, startPos, endPos):
    snpsd = dataParsers.parseCSVData(
        "/Network/Data/250k/dataFreeze_011209/250K_192_043009.csv")[chr - 1]
    import scipy as sp
    import scipy.cluster.hierarchy as hc
    import Emma
    snpsd = snpsd.getSnpsData()
    newSnps = []
    positions = []
    for i in range(0, len(snpsd.positions)):
        pos = snpsd.positions[i]
        if pos > endPos:
            break
        elif pos >= startPos:
            newSnps.append(snpsd.snps[i])
            positions.append(snpsd.positions[i])

    print "calculating the kinship"
    K = Emma.calcKinship(newSnps)
    #print "K:",K
    Z = hc.average(K)
    #print "Z:",Z
    import pylab
    #hc.leaders(Z)
    dend_dict = hc.dendrogram(Z, labels=snpsd.accessions)
    new_acc_order = dend_dict['ivl']
    print new_acc_order
    print snpsd.accessions
    pylab.savefig("/Users/bjarni/tmp/FRI_tree.pdf", format='pdf')
    #cluster to get ordering??

    acc_mapping = []
    for acc in snpsd.accessions:
        i = new_acc_order.index(acc)
        acc_mapping.append(i)

    snps = []
    for snp in newSnps:
        newSNP = [0] * len(snp)
        for (nt, i) in zip(snp, acc_mapping):
            newSNP[i] = nt
        snps.append(newSNP)

    snps = sp.array(snps)
    pylab.matshow(snps.transpose())
    pylab.savefig("/Users/bjarni/tmp/FRI_haplotype.pdf", format='pdf')
def write_tree(cluster_method):
    import scipy.spatial.distance as ssd
    dmx = pd.read_csv("distance_matrix", index_col=0, sep="\t")
    ids = dmx.index.tolist()
    triu = np.square(dmx.values)
    distArray = ssd.squareform(triu)
    if cluster_method == "average":
        hclust = average(distArray)
    elif cluster_method == "weighted":
        hclust = weighted(distArray)
    else:
        print("invalid cluster method chosen")
        sys.exit()
    t = TreeNode.from_linkage_matrix(hclust, ids)
    nw = t.__str__().replace("'", "")
    outfile = open("bsr_matrix.tree", "w")
    outfile.write(nw)
    outfile.close()
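
A self-contained run of write_tree, creating the expected tab-separated distance_matrix file first (assumes the snippet's module-level imports: pandas as pd, numpy as np, scipy's average/weighted, and skbio's TreeNode):

import pandas as pd

ids = ["a", "b", "c"]
D = [[0.0, 0.1, 0.9],
     [0.1, 0.0, 0.8],
     [0.9, 0.8, 0.0]]
pd.DataFrame(D, index=ids, columns=ids).to_csv("distance_matrix", sep="\t")
write_tree("average")  # leaves the Newick string in bsr_matrix.tree
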
Example #21
def dendrogram(data, by_cols):
    dist_mat = data.to_distance_matrix(by_cols)
    clusters = hierarchy.average(dist_mat)
    tree = hierarchy.to_tree(clusters, rd=False)
    leaves = hierarchy.leaves_list(clusters)  # leaf indices in dendrogram order; was undefined in the snippet
    leaf_labels = []
    for i in leaves:
        if by_cols:
            leaf_labels.append(data.col_names[i])
        else:
            leaf_labels.append(data.row_names[i])

    response_output = {
        'name': data.name,
        'key': data.key,
        'tree': dict_node(tree, leaf_labels, 'root'),
        'labels': leaf_labels
    }
    return response_output
Example #22
    def run(self):
        try:
            g = networkx.Graph()
            sewer = self.getData("Sewer")
            CostsTotal = 0
            LengthTot = 0
            names = sewer.getNamesOfComponentsInView(self.conduits)
            pointnamelist = []
            for nc in names:
                c = sewer.getEdge(nc)
                startNode = c.getStartpointName()
                endNode = c.getEndpointName()
                if startNode not in pointnamelist:
                    pointnamelist.append(startNode)
                if endNode not in pointnamelist:
                    pointnamelist.append(endNode)
                g.add_edge(pointnamelist.index(startNode),
                           pointnamelist.index(endNode))
            path_length = networkx.all_pairs_shortest_path_length(g)
            n = len(g.nodes())
            distances = numpy.zeros((n, n))
            for u, p in path_length.iteritems():
                for v, d in p.iteritems():
                    distances[int(u) - 1][int(v) - 1] = d
            sd = distance.squareform(distances)

            hier = hierarchy.average(sd)
            hierarchy.dendrogram(hier)
            matplotlib.pylab.savefig("tree.png", format="png")

            partition = community.best_partition(g)
            print partition

            for i in set(partition.values()):
                print "Community", i
                members = list_nodes = [
                    nodes for nodes in partition.keys()
                    if partition[nodes] == i
                ]
            print members
        except Exception, e:
            print e
            print "Unexpected error:"
Example #23
def cluster_dandelion_entities(dataset, gamma=None, filter=False):
    doc_proc = dp.DocumentsProcessor(dataset)
    if gamma:
        tfidf_matrix, f_score_dict, params = doc_proc.get_data_only_with_entities(
            gamma=gamma, filter=filter)
    else:
        tfidf_matrix, f_score_dict, params = doc_proc.get_data_only_with_entities()

    doc, features = tfidf_matrix.shape

    print 'starting clustering: found %s document and %s features' \
          % (doc, features)

    svd = TruncatedSVD(tfidf_matrix.shape[0])
    lsa = make_pipeline(svd, Normalizer(copy=False))

    tfidf_matrix = lsa.fit_transform(tfidf_matrix)

    print 'starting clustering: found %s document and %s features after LSA' \
         % (tfidf_matrix.shape[0], tfidf_matrix.shape[1])

    #linkage_matrix = hr.average(tfidf_matrix.toarray())
    linkage_matrix = hr.average(tfidf_matrix)

    t = hr.to_tree(linkage_matrix, rd=True)

    clusters = {}

    for node in t[1]:
        if not node.is_leaf():
            l = []
            clusters[node.get_id()] = collect_leaf_nodes(node, l)

    f = f_score(clusters, f_score_dict)

    l = print_f_score_dict(f)

    params['avg_f_score'] = average_f_score(f, tfidf_matrix.shape[0])
    params['all_fscore'] = l

    print 'average f_score: %s' % params['avg_f_score']
    return params
Example #24
def guide_tree_from_query_sequences(query_sequences, 
                                    distance_fn=three_mer_distance,
                                    display_tree = False):
    guide_dm = []
    seq_ids = []
    for seq_id1, seq1 in query_sequences:
        seq_ids.append(seq_id1)
        row = []
        for seq_id2, seq2 in query_sequences:
            row.append(kmer_distance(seq1, seq2, k=3))
        guide_dm.append(row)
    
    guide_dm = DistanceMatrix(guide_dm, seq_ids)
    guide_lm = average(guide_dm.condensed_form())
    guide_tree = to_tree(guide_lm)
    if display_tree:
        guide_d = dendrogram(guide_lm, labels=guide_dm.ids, orientation='right', 
               link_color_func=lambda x: 'black')
    return guide_tree
def write_tree(cluster_method):
    import scipy.spatial.distance as ssd
    dmx = pd.read_csv("distance_matrix", index_col=0, sep="\t")
    ids = dmx.index.tolist()
    #triu = np.square(dmx.as_matrix())
    triu = np.square(dmx.values)
    distArray = ssd.squareform(triu)
    if cluster_method == "average":
        hclust = average(distArray)
    elif cluster_method == "weighted":
        hclust = weighted(distArray)
    else:
        print("invalid cluster method chosen")
        sys.exit()
    t = TreeNode.from_linkage_matrix(hclust, ids)
    nw = t.__str__().replace("'", "")
    outfile = open("bsr_matrix.tree", "w")
    outfile.write(nw)
    outfile.close()
Example #26
def progressive_msa_and_tree(query_sequences, gap_open_penalty=8, gap_extend_penalty=1, 
                             substitution_matrix=nt_substitution_matrix,
                             msa_distance_fn=compute_aligned_sequence_distances,
                             guide_tree=None, display_aln=False, display_tree=False):
    msa, guide_tree = progressive_msa(query_sequences, guide_tree, 
                                      gap_open_penalty, gap_extend_penalty, substitution_matrix)
    if display_aln:
        print "Multiple sequence alignment:\n"
        for seq_id, seq in msa:
            print seq, "(%s)" % seq_id
    
    dm = msa_distance_fn(msa)
    lm = average(dm.condensed_form())
    tree = to_tree(lm)
    if display_tree:
        print "\nOutput tree:"
        d = dendrogram(lm, labels=dm.ids, orientation='right', 
                   link_color_func=lambda x: 'black', leaf_font_size=24)
    return msa, tree
Example #27
def compute_distance_matrix(covers):
    # Compute stochastic clusters
    num_results = len(covers)
    distance_matrix= np.zeros((num_results,num_results))
    print('Calculating distance matrix ... ')
    for i in range(num_results):
        for j in range(i+1,num_results):
            #score = metrics.omega_index(results['vc'][i].membership,results['vc'][j].membership)
            #score = skmetrics.f1_score(to_crisp_membership(results['vc'][i].membership),
            #                           to_crisp_membership(results['vc'][j].membership))
            score = skmetrics.adjusted_rand_score(to_crisp_membership(covers[i].membership),
                                                  to_crisp_membership(covers[j].membership))
            distance_matrix[i,j] = 1-score
            distance_matrix[j,i] = 1-score
    distance_matrix = np.matrix(distance_matrix)

    y = squareform(distance_matrix)
    Z = average(y)
    return distance_matrix, y, Z
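
to_crisp_membership is not shown in this snippet; for overlapping covers in which each element carries a list of community ids, a plausible stand-in (an assumption, not the authors' code) keeps only the first community so adjusted_rand_score sees one label per element:

def to_crisp_membership(membership):
    # hypothetical helper: overlapping membership -> one crisp label per element
    return [communities[0] for communities in membership]

print(to_crisp_membership([[0, 2], [1], [0]]))  # [0, 1, 0]
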
def guide_tree_from_query_sequences(query_sequences, 
                                    distance_fn=three_mer_distance,
                                    display_tree = False):
    guide_dm = []
    seq_ids = []
    for seq_id1, seq1 in query_sequences:
        seq_ids.append(seq_id1)
        row = []
        for seq_id2, seq2 in query_sequences:
            row.append(kmer_distance(seq1, seq2, k=3))
        guide_dm.append(row)
    
    guide_dm = SymmetricDistanceMatrix(guide_dm, seq_ids)
    guide_lm = average(guide_dm.condensed_form())
    guide_tree = to_tree(guide_lm)
    if display_tree:
        guide_d = dendrogram(guide_lm, labels=guide_dm.ids, orientation='right', 
               link_color_func=lambda x: 'black')
    return guide_tree
def cluster_dandelion_abstract(dataset, gamma=None, filter=False):
    doc_proc = dp.DocumentsProcessor(dataset)
    if gamma:
        tfidf_matrix, f_score_dict, params = doc_proc.get_data_only_with_abstract(
            gamma=gamma, filter=filter)
    else:
        tfidf_matrix, f_score_dict, params = doc_proc.get_data_only_with_abstract(min_df=2, relevance_threshold=0.95)

    doc, features = tfidf_matrix.shape

    print 'starting clustering: found %s document and %s features' \
          % (doc, features)

    svd = TruncatedSVD(1300)
    lsa = make_pipeline(svd, Normalizer(copy=False))

    tfidf_matrix = lsa.fit_transform(tfidf_matrix)

    print 'starting clustering: found %s document and %s features after LSA' \
         % (tfidf_matrix.shape[0], tfidf_matrix.shape[1])

    #linkage_matrix = hr.average(tfidf_matrix.toarray())
    linkage_matrix = hr.average(tfidf_matrix)

    t = hr.to_tree(linkage_matrix, rd=True)

    clusters = {}

    for node in t[1]:
        if not node.is_leaf():
            l = []
            clusters[node.get_id()] = collect_leaf_nodes(node, l)

    f = f_score(clusters, f_score_dict)

    l = print_f_score_dict(f)

    params['avg_f_score'] = average_f_score(f, tfidf_matrix.shape[0])
    params['all_fscore'] = l

    print 'average f_score: %s' % params['avg_f_score']
    return params
def demo_elbow_method(multiplexity_matrix):
    """
    Performs agglomarative clustering with different cut-off levels,
    display silhouette scores.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy.cluster import hierarchy
    from sklearn.metrics import silhouette_score
    from scipy.spatial.distance import squareform

    sns.set_style("whitegrid")
    sns.set_context("notebook", font_scale=1.2, rc={"lines.linewidth": 2.5})

    src_dist_matrix = multiplexity_matrix.max() - multiplexity_matrix
    num_objects = len(src_dist_matrix)

    # Converting (possibly) asymmetric distance matrix to symmetric
    # pairwise distance array as expected by scipy clustering
    pdist_array = [(src_dist_matrix[i, j] + src_dist_matrix[j, i]) / 2
                   for i in range(num_objects)
                   for j in range(i + 1, num_objects)]
    pdist_matrix = squareform(pdist_array)

    linkage = hierarchy.average(pdist_array)
    nn = np.arange(2, 31)
    scores = []
    for n in nn:
        labels = hierarchy.fcluster(linkage, n, criterion='maxclust')
        scores.append(
            silhouette_score(pdist_matrix, labels, metric='precomputed'))
    scores = pd.DataFrame({
        'Number of clusters': nn,
        'Silhouette score': scores
    })

    plt.title('Agglomerative clustering')
    sns.lineplot(data=scores,
                 x='Number of clusters',
                 y='Silhouette score',
                 markers=False,
                 dashes=True)
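
A minimal invocation on synthetic data (assumes numpy as np and pandas as pd are imported at module level, as the function body requires):

import numpy as np

rng = np.random.RandomState(0)
M = rng.rand(40, 40) * 10  # made-up multiplexity matrix over 40 objects
demo_elbow_method(M)       # draws silhouette score vs. number of clusters
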
def scipy_algo(dataset):
    doc_proc = dp.DocumentsProcessor(dataset)
    tfidf_matrix, f_score_dict = doc_proc.get_data()

    linkage_matrix = hr.average(tfidf_matrix.toarray())

    t = hr.to_tree(linkage_matrix, rd=True)

    clusters = {}

    for node in t[1]:
        if not node.is_leaf():
            l = []
            clusters[node.get_id()] = collect_leaf_nodes(node, l)

    f = f_score(clusters, f_score_dict)

    print_f_score_dict(f)

    print 'average f_score: %s' % average_f_score(f, tfidf_matrix.shape[0])
Example #35
def create_correlation_tree(corr_matrix, method="single"):
    """ Creates hierarchical clustering (correlation tree)
    from a correlation matrix

    Parameters
    ----------
    corr_matrix : np.ndarray
        ``(p, p)``-shaped correlation matrix
    method : str
        the method of hierarchical clustering:
        'single', 'average', 'fro', or 'complete'.
        Defaults to 'single'.
    
    Returns
    -------
    link : np.ndarray
        The `link` of the correlation tree, as in scipy
    """

    # Distance matrix for tree method
    if method == "fro":
        dist_matrix = np.around(1 - np.power(corr_matrix, 2), decimals=7)
    else:
        dist_matrix = np.around(1 - np.abs(corr_matrix), decimals=7)
    dist_matrix -= np.diagflat(np.diag(dist_matrix))

    condensed_dist_matrix = ssd.squareform(dist_matrix)

    # Create linkage
    if method == "single":
        link = hierarchy.single(condensed_dist_matrix)
    elif method == "average" or method == "fro":
        link = hierarchy.average(condensed_dist_matrix)
    elif method == "complete":
        link = hierarchy.complete(condensed_dist_matrix)
    else:
        raise ValueError(
            f'Only "single", "complete", "average", "fro" are valid methods, not {method}'
        )

    return link
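
A quick check of create_correlation_tree on a hand-made correlation matrix (assumes the module imports the function relies on: numpy as np, scipy.spatial.distance as ssd, scipy.cluster.hierarchy as hierarchy):

import numpy as np

corr_matrix = np.array([[1.0, 0.9, 0.1],
                        [0.9, 1.0, 0.2],
                        [0.1, 0.2, 1.0]])
link = create_correlation_tree(corr_matrix, method="average")
print(link)  # p - 1 = 2 merge rows for a 3x3 matrix
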
Example #36
def create_hc(G, t=1.2):
    """
    从距离矩阵中创造一个图G的分层聚类
    马克西姆注:对带有标签的图进行聚类的前处理和后处理,并返回聚类的结果
    参数化门槛值之后,其取值范围应该通过对每个数据进行尝试的基础上确定
    """
    """在对德鲁•康威(Drew Conway)编写的代码进行优化的基础上而来"""
    ## 创造最短路径距离矩阵,但是保留节点标签
    labels = list(G.nodes())
    indx = {}

    for ind in range(len(labels)):
        word = labels[ind]
        indx[word] = ind

    path_length = dict(nx.all_pairs_shortest_path_length(G))  # dict() needed for networkx >= 2, which returns an iterator

    distances = numpy.zeros((len(G), len(G)))
    for i in range(len(labels)):
        for j in range(len(labels)):
            distances[i][j] = 10000

    for u, p in path_length.items():
        uind = indx[u]
        for v, d in p.items():
            vind = indx[v]
            #u and v are both words
            distances[uind][vind] = d

    # build the hierarchical clustering
    Y = distance.squareform(distances)
    #Z=hierarchy.single(Y)
    Z = hierarchy.average(Y)
    print("caonima", Z.shape)
    # this choice of partition is arbitrary, purely for illustration
    membership = list(hierarchy.fcluster(Z, t=t))

    partition = defaultdict(list)
    for n, p in zip(list(range(len(G))), membership):
        partition[p].append(labels[n])
    return list(partition.values())
Example #39
def get_clusters_average():
    x = np.array(df)
    print(len(x))
    linkage_array = average(x)
    print(linkage_array)
    print(len(linkage_array))
    dendrogram(linkage_array)
    ax = plt.gca()
    bounds = ax.get_xbound()
    ax.plot(bounds, [2500, 2500], '--', c='k')
    ax.plot(bounds, [850, 850], '--', c='k')
    ax.text(bounds[1],
            2500,
            ' two clusters',
            va='center',
            fontdict={'size': 5})
    ax.text(bounds[1], 850, ' three clusters', va='center', fontdict={'size': 5})
    plt.xlabel("Observation index")
    plt.ylabel("Cluster distance")
    plt.xlim(2567, 3000)
    plt.ylim(0, 10)
    plt.show()
Example #40
def hierarchy_clustering(df):
    fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(50, 18))

    for linkage, cluster, ax in zip([
            hierarchy.complete(df),
            hierarchy.average(df),
            hierarchy.single(df),
            hierarchy.ward(df)
    ], ['c1', 'c2', 'c3', 'c4'], [ax1, ax2, ax3, ax4]):
        cluster = hierarchy.dendrogram(linkage,
                                       labels=df.index,
                                       p=12,
                                       truncate_mode="lastp",
                                       orientation='top',
                                       color_threshold=0,
                                       leaf_font_size=10,
                                       distance_sort=True,
                                       ax=ax)
    ax1.set_title('Complete Linkage')
    ax2.set_title('Average Linkage')
    ax3.set_title('Single Linkage')
    ax4.set_title('Ward')
    plt.show()
Example #41
    def clustering(self):
        counts = np.log10(self.counts().transpose() + 1)
        labels = counts.index.values
        # calculate correlation distance
        dist = pdist(counts, 'correlation')
        # calculate correlation from distance
        corr = 1 - dist
        dist = np.clip(dist, 0, 1)
        # cluster
        clustering = average(dist)
        #import matplotlib.pyplot as plt
        d = dendrogram(clustering,
                       labels=labels,
                       get_leaves=True,
                       no_plot=True)
        #plt.savefig("/tmp/ddg.pdf")
        leaves = d["leaves"]

        # calculate correlation matrix
        corr = np.round(squareform(corr), 2)
        # fix diagonal, that will contain zeros because squareform expects a dist
        np.fill_diagonal(corr, 1)
        return corr, leaves, labels, d
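
The correlation-distance round trip used by clustering() above, in isolation (toy counts, scipy only):

import numpy as np
from scipy.cluster.hierarchy import average, dendrogram
from scipy.spatial.distance import pdist, squareform

counts = np.log10(np.random.RandomState(1).poisson(20.0, size=(5, 50)) + 1)
dist = pdist(counts, 'correlation')          # 1 - Pearson r per row pair
d = dendrogram(average(dist), no_plot=True)  # UPGMA without drawing
corr = np.round(squareform(1 - dist), 2)     # back to a square correlation matrix
np.fill_diagonal(corr, 1)                    # squareform leaves a zero diagonal
print(d["leaves"], corr[0])
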
Example #42
def arrangeClusters(pairwiseDistanceMatrix):
    orderedCluster = []  # one list of point indices per flat cluster; was undefined in the snippet
    y = spatial.distance.squareform(pairwiseDistanceMatrix)
    # print(y)
    Z = average(y)

    clusters = fcluster(Z, 0.8, criterion='distance')
    numberOfClusters = max(clusters)

    for point in range(0, (numberOfClusters)):
        orderedCluster.append([])

    for point in range(0, len(clusters)):
        orderedCluster[clusters[point] - 1].append(point)

    for element in orderedCluster:
        print(element)

    # Z = linkage(y, 'average')
    # print(Z)
    #fig = plt.figure(figsize=(25, 10))
    dn = dendrogram(Z)
    plt.savefig('result.png')
    plt.show()
Example #43
    def plot_dendrogram(self, topology_type='litho', path=None):
        '''
        Plots a dendrogram summarising the topological differences between the model
        realisations that are part of this experiment.

        **Arguments**
         - *topology_type* = The type of topology you are interested in. This should be either 'litho'
                             or 'struct'
         - *path* = A path to save the image to. If left as None the image is drawn to the screen.
        '''
        #get difference matrix (NB. squareform converts it to a condensed matrix for scipy)
        import scipy.spatial.distance as dist
        import scipy.cluster.hierarchy as clust

        m_dif = dist.squareform(self.get_difference_matrix(topology_type), force='tovector')

        if len(m_dif) > 2:
            #generate dendrogram using UPGMA
            Z = clust.average(m_dif)

            #generate plot
            clust.dendrogram(Z)
        else:  #we can't build a tree with only one topology...
            print "Error: only a single unique topology of this type has been found"
Example #44
def cluster_with_mpear(X, max_clusters=None):
    '''
    Args:
        X : (array) An array with as many rows as (post-burnin) MCMC iterations and columns as data points.
    '''
    X = np.array(X).T

    dist_mat = pdist(X, metric='hamming')

    sim_mat = 1 - squareform(dist_mat)

    Z = average(dist_mat)

    max_pear = 0

    best_cluster_labels = _get_flat_clustering(Z, 1)

    if max_clusters is None:
        max_clusters = len(X) + 1

    else:
        max_clusters = min(max_clusters, len(X))

    max_clusters = max(max_clusters, 1)

    for i in range(2, max_clusters + 1):
        cluster_labels = _get_flat_clustering(Z, i)

        pear = _compute_mpear(cluster_labels, sim_mat)

        if pear > max_pear:
            max_pear = pear

            best_cluster_labels = cluster_labels

    return best_cluster_labels
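
cluster_with_mpear calls _get_flat_clustering, which is not part of this snippet; a plausible one-liner (an assumption consistent with how Z is used above) cuts the average-linkage tree into a fixed number of flat clusters:

from scipy.cluster.hierarchy import fcluster

def _get_flat_clustering(Z, number_of_clusters):
    # hypothetical helper: cut linkage Z into the requested number of clusters
    return fcluster(Z, number_of_clusters, criterion='maxclust')
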
Example #46
    def CalculateClusterTree(self):
        fullMatrix = self.GenerateFullMatrix(self.results)
        dissMatrix = []
        labels = fullMatrix.keys()
        for i in xrange(0, len(labels)):
            sampleNameI = labels[i]
            for j in xrange(i + 1, len(labels)):
                sampleNameJ = labels[j]
                dissMatrix.append(fullMatrix[sampleNameI][sampleNameJ])

        # calculate hierarchical cluster tree
        if self.radioSingleLinkage.GetValue():
            linkageMatrix = single(dissMatrix)
        elif self.radioUPGMA.GetValue():
            linkageMatrix = average(dissMatrix)
        elif self.radioCompleteLinkage.GetValue():
            linkageMatrix = complete(dissMatrix)
        elif self.radioWeighted.GetValue():
            linkageMatrix = weighted(dissMatrix)

        root = to_tree(linkageMatrix)

        # create Newick string
        return self.CreateNewickString(root, labels) + ';'
Example #47
def cluster_zips(area_features, linkage, t, return_dist=False):
    """ Clusters zip codes using a hierachial method with euclidean distance
        and the inputted feature vector.
    """
    if type(area_features) == str:
        features = json.load(
            open('data/{}/census/features.json'.format(area_features), 'r'))
    elif type(area_features) == dict:
        features = area_features
    feat = []
    for i in features.values():
        feat.append(i)
    y = pdist(np.matrix(feat), 'euclidean')
    if linkage == 'single':
        Z = hierarchy.single(y)
    elif linkage == 'average':
        Z = hierarchy.average(y)
    elif linkage == 'complete':
        Z = hierarchy.complete(y)
    f = hierarchy.fcluster(Z, criterion='distance', t=t)
    if return_dist == True:
        return (squareform(y), f)
    else:
        return f
Example #48
nx.draw_networkx(z,pos)
plt.draw()
plt.savefig('Karate_graph.pdf')

#-----------------------------------------------------------------------------
# 3: Hierarchical clustering:

path_length=nx.all_pairs_shortest_path_length(z) 
n = len(z.nodes())
distances=np.zeros((n,n))

for u,p in path_length.iteritems(): 
	for v,d in p.iteritems():
		distances[u][v] = d 

hier = hierarchy.average(distances)
plt.figure(2)
hierarchy.dendrogram(hier)
plt.savefig('h_clustering.pdf')

#-----------------------------------------------------------------------------
# 4: Spectral clustering:

def spectralClustering(W, k):
	# Create degree matrix
	D = diag(sum(W, axis=0))
	# Create Laplacian matrix
	L = D - W
	eigval, eigvec = linalg.eig(L) # Calculate eigenvalues and eigenvectors
	eigval = eigval.real # Keep the real part
	eigvec = eigvec.real # Keep the real part
Example #49
print("#", cmd_args)

num_clusters = args.num_clusters
if args.index_file is not None:
    index_filename = args.index_file
else:
    index_filename = None

# read the RMSD file and
# convert to condensed upper triangular
rmsd = numpy.loadtxt(args.rmsd_file)
upper = squareform(rmsd)


link = sch.average(upper)

if args.link:
    numpy.savetxt(args.prefix + ".link", link, header=cmd_args)

assignments = sch.fcluster(link, num_clusters, criterion='maxclust')

# Read the index file if one was supplied
indices = {}
if index_filename:
    with(open(index_filename)) as index_file:
        for line in index_file.readlines():
            (first, last, filename) = line.split()
            first = int(first)
            last = int(last)
            t = basename(filename)
Example #50
# print (m)



# --------------
# dm = DistanceMatrix(X, labels)
# sys.exit(1)

# tree = nj(dm)
# nj()
# print(tree.ascii_art())
sys.exit(1)
# ---------------------

# calculating UPGMA
x = average(X)

file_1 = open('results/clustered_data2.txt','w')
for i in x:
    file_1.write(f'{int(i[0])}\t{int(i[1])}\t{i[2]}\t{int(i[3])}\n')


print("Done avg")
# fig = plt.figure(figsize=(350,120),  dpi=100)
fig = plt.figure()

# figsize=(200, 200)
dn = dendrogram(x, labels=labels, orientation='left')
plt.xticks(rotation='horizontal')
plt.yticks(rotation='horizontal')
#single link
H = h.single(X)
print H.shape
#these are all the merges performed (#samples - 1); for each we get the
#pair of clusters merged, their distance, and #samples in the new cluster
h.dendrogram(H)
pl.show()
#the dendrogram is elongated because of the chaining effect, the typical single-link problem

#complete link
H = h.complete(X)
h.dendrogram(H)
pl.show()

#average link
H = h.average(X)
h.dendrogram(H)
pl.show()

#centroid link
H = h.centroid(X)
h.dendrogram(H)
pl.show()
#there are inversions because this distance is not monotonic

#to extract flat clusters we need to choose a distance threshold
H = h.average(X)
C = h.fcluster(H, 1.9,
               criterion='distance')  #the 3.5 threshold seemed good from the plot
#to see the number of clusters:
print "n cluster:", len(np.unique(C))
    def setupUi(self, Form,Matrix,list_d, c):
        yourarray=Matrix
      

        center=yourarray.mean(axis=1)
        min_mean=center[0]
        mean_tree=[]
        for r in range(0,len(center)):
            if(center[r]<min_mean):
                
                mean_tree=[]
                mean_tree.append(r+1)
                min_mean=center[r]
                #print(mean_tree,min_mean)
            elif(center[r]==min_mean):
                mean_tree.append(r+1)
                #print(mean_tree,min_mean)
        print(mean_tree)
           

        min=0
        first_row=yourarray[0]
        for i in range(0,len(first_row)):
                min+= math.sqrt((first_row[i]-center[i])**2)

        min_tree=[]
        for r in range(len(yourarray)):
            row=yourarray[r]
            sum=0
            for i in range(0,len(row)):
                sum+= math.sqrt((row[i]-center[i])**2) 
                
                if(sum>min):break
            if(sum<min):
                
                min_tree=[]
                min_tree.append(r+1)
                min=sum
            elif(sum==min):
                min_tree.append(r+1)
        print(min_tree)        
        text="----------------------------------------------------------------------------------------\n"
        #text+="Cluster ID:"+str(key)+"\n"
        #text+="Cluster tree set:"+ str(clusters[key])+"\n"
        text+="Center tree-approach #1:"+str(mean_tree)+"\n"
        text+="Center tree-approach #2:"+ str(min_tree)+"\n"  
        text+="----------------------------------------------------------------------------------------\n"
        if(c=="Y"):
            distances=Matrix
            distArray = ssd.squareform(distances)
            arr =list_d

            linkage_matrix = average(distArray)
          
            fc= hier.fcluster(linkage_matrix, 6, criterion='maxclust')
            clusters = defaultdict(lambda:[])
            for pos in range(0,len(fc)):
                clusters[fc[pos]].append(pos+1)
            for key in clusters:    
                Cluster_matrix=[]
                Cluster_distances=[]
                tempSim=[]
                array=clusters[key]
               
                for x in range (0,len(array)):
                   
                    temp=[]
                    for y in range (0,len(array)):
                        temp.append(distances[x,y])
                        
                    tempSim.append(temp)

                Cluster_distances=np.array(tempSim)
                center=Cluster_distances.mean(axis=1)
                min_mean= float("inf")
                mean_tree=[]
                for r in range(0,len(center)):
                    if(center[r]<min_mean):
                        
                        mean_tree=[]
                        mean_tree.append(array[r])
                        min_mean=center[r]
                       
                    elif(center[r]==min_mean):
                        mean_tree.append(array[r])
                
                min=0
                first_row=Cluster_distances[0]
                for i in range(0,len(first_row)):
                        min+= math.sqrt((first_row[i]-center[i])**2)

                min_tree=[]
                for r in range(len(Cluster_distances)):
                    row=Cluster_distances[r]
                    sum=0
                    for i in range(0,len(row)):
                        sum+= math.sqrt((row[i]-center[i])**2) 
                        
                        if(sum>min):break
                    if(sum<min):
                        
                        min_tree=[]
                        min_tree.append(array[r])
                        min=sum
                    elif(sum==min):
                        min_tree.append(array[r])
                
                
                text+="Cluster ID:"+str(key)+"\n"
                text+="Cluster tree set:"+ str(clusters[key])+"\n"
                text+="Center tree-approach #1:"+str(mean_tree)+"\n"
                text+="Center tree-approach #2:"+ str(min_tree)+"\n"    
                text+="----------------------------------------------------------------------------------------\n"
        self.text=text
        Form.setObjectName("Form")
        Form.resize(587, 515)
        self.textEdit = QtWidgets.QTextEdit(Form)
        self.textEdit.setGeometry(QtCore.QRect(30, 60, 531, 391))
        self.textEdit.setObjectName("textEdit")
        self.label = QtWidgets.QLabel(Form)
        self.label.setGeometry(QtCore.QRect(40, 20, 221, 21))
        font = QtGui.QFont()
        font.setPointSize(12)
        self.label.setFont(font)
        self.label.setObjectName("label")
        self.lineEdit = QtWidgets.QLineEdit(Form)
        self.lineEdit.setGeometry(QtCore.QRect(30, 480, 251, 20))
        self.lineEdit.setObjectName("lineEdit")
        self.pushButton_2 = QtWidgets.QPushButton(Form)
        self.pushButton_2.setGeometry(QtCore.QRect(290, 480, 71, 23))
        self.pushButton_2.setObjectName("pushButton_2")
        self.label_2 = QtWidgets.QLabel(Form)
        self.label_2.setGeometry(QtCore.QRect(30, 460, 111, 16))
        self.label_2.setObjectName("label_2")
        self.textEdit.setPlainText(text)
        self.retranslateUi(Form)
        QtCore.QMetaObject.connectSlotsByName(Form)
        Form.show()
        self.pushButton_2.clicked.connect(self.write_to_file)
Example #53
plt.figure(figsize=(15, 10))
h.dendrogram(result)
plt.show()

flat_single = h.fcluster(result, 5588, criterion='distance')

adjusted_rand_score(y.flatten(), flat_single)

adjusted_mutual_info_score(y.flatten(), flat_single)
"""### **Average-Link (Group-Link)**

This similarity measure computes the distance between two clusters as the average of the distances between their individual members. This criterion is a middle ground between *single-link* and *complete-link*.
"""

result = h.average(X)

plt.figure(figsize=(15, 10))
h.dendrogram(result)
plt.show()

flat_average = h.fcluster(result, 1394, criterion='distance')

adjusted_rand_score(y.flatten(), flat_average)

adjusted_mutual_info_score(y.flatten(), flat_average)
"""### **Centroid**

For each cluster a *centroid* representing its mean is computed, and the clusters whose centroids are most similar to each other are merged. Such clusters are merged pairwise.
"""
# In[8]:

# Normalizing and centering the table
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data)
data.loc[:, :] = scaled_values
print(data)
datos = data

# In[9]:

ward_res = ward(datos)  # Ward
single_res = single(datos)  # Single linkage (minimum jump)
complete_res = complete(datos)  # Complete linkage (maximum jump)
average_res = average(datos)  # Average linkage

# ### b) Run a hierarchical clustering with complete (maximum jump), single (minimum jump), average, and Ward linkage. Plot the dendrogram with cuts for two and three clusters.

# In[10]:

plt.figure(figsize=(13, 10))
dendrogram(average_res, labels=datos.index.tolist())
plt.figure(figsize=(13, 10))
dendrogram(complete_res, labels=datos.index.tolist())
plt.figure(figsize=(13, 10))
dendrogram(single_res, labels=datos.index.tolist())
plt.figure(figsize=(13, 10))
dendrogram(ward_res, labels=datos.index.tolist())

# Add cuts for two and three clusters on the Ward dendrogram (completing the
# step this comment describes). The cut heights are derived from the linkage
# matrix: any height between the last k-1 and k merges yields k clusters.
ax = plt.gca()
cut_2 = (ward_res[-2, 2] + ward_res[-1, 2]) / 2  # cutting here gives 2 clusters
cut_3 = (ward_res[-3, 2] + ward_res[-2, 2]) / 2  # cutting here gives 3 clusters
ax.axhline(cut_2, color='red', linestyle='--')
ax.axhline(cut_3, color='blue', linestyle='--')
hclust_model = cluster.AgglomerativeClustering(n_clusters = 2, linkage = 'average')
hclust_model.fit(X)
print('Cluster labels: {}\n'.format(hclust_model.labels_))

hclust_model = cluster.AgglomerativeClustering(n_clusters = 2, linkage = 'complete')
hclust_model.fit(X)
print('Cluster labels: {}\n'.format(hclust_model.labels_))


print('''
*********************************************************************************************************************
                                 scipy: dendrogram
*********************************************************************************************************************
''')

# from: https://github.com/JWarmenhoven/ISLR-python/blob/master/Notebooks/Chapter%2010.ipynb

fig, (ax1,ax2,ax3) = plt.subplots(3,1, figsize=(15,18))

for linkage, ax in zip([hierarchy.complete(X), hierarchy.average(X), hierarchy.single(X)],
                       [ax1, ax2, ax3]):
    hierarchy.dendrogram(linkage, ax=ax, color_threshold=0)

ax1.set_title('Complete Linkage')
ax2.set_title('Average Linkage')
ax3.set_title('Single Linkage')

plt.show()

Example #56
import pandas as pd


def classifyCluster(isolateList, spacermatch):

    lenIsolate = {}
    pairscore = {}

    with open(spacermatch) as fl:
        curIsolate = ''
        for line in fl:
            array = line.strip().split('\t')
            qmatch = array[0].split('||')[0]
            smatch = array[1].split('||')[0]
            if curIsolate != qmatch:
                # New query isolate: reset the running spacer count.
                n = 1
                curIsolate = qmatch
            else:
                n += 1
            lenIsolate[qmatch] = n
            pair = qmatch + '||' + smatch
            if pair in pairscore:
                pairscore[pair] += 1
            else:
                pairscore[pair] = 1
        for it in lenIsolate:
            pairscore[it + '||' + it] = lenIsolate[it]
    with open(spacermatch + '.score', 'w') as fl:
        for pair in pairscore:
            fl.write(
                '%s\t%s\t%i\n' %
                (pair.split('||')[0], pair.split('||')[1], pairscore[pair]))

    df = pd.read_table(spacermatch + '.score',
                       sep='\t',
                       names=['qmatch', 'smatch', 'score'])

    df_matrix = df.pivot(index='qmatch', columns='smatch', values='score')
    df_matrix_adjusted = df_matrix.fillna(0)

    from skbio.stats.distance import DistanceMatrix
    from numpy import zeros

    def bray_curtis_distance(table, sample1_id, sample2_id):
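        # Bray-Curtis dissimilarity: sum(|x_i - y_i|) / sum(x_i + y_i) over
        # the two samples' count vectors; 0 means identical, 1 means disjoint.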
        numerator = 0
        denominator = 0
        sample1_counts = table[sample1_id]
        sample2_counts = table[sample2_id]
        for sample1_count, sample2_count in zip(sample1_counts,
                                                sample2_counts):
            numerator += abs(sample1_count - sample2_count)
            denominator += sample1_count + sample2_count
        return numerator / denominator

    def table_to_distances(table, pairwise_distance_fn):
        sample_ids = table.columns
        num_samples = len(sample_ids)
        data = zeros((num_samples, num_samples))
        for i, sample1_id in enumerate(sample_ids):
            for j, sample2_id in enumerate(sample_ids[:i]):
                data[i, j] = data[j, i] = pairwise_distance_fn(
                    table, sample1_id, sample2_id)
        return DistanceMatrix(data, sample_ids)

    bc_dm = table_to_distances(df_matrix_adjusted, bray_curtis_distance)

    from scipy.cluster.hierarchy import average, dendrogram
    lm = average(bc_dm.condensed_form())
    d = dendrogram(lm,
                   labels=bc_dm.ids,
                   orientation='right',
                   link_color_func=lambda x: 'black')
    orderedIsolates = d['ivl']
    return orderedIsolates
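# Hypothetical usage (the file name below is illustrative, not from the
# source): `spacermatch` is a tab-separated match table whose first two
# columns hold 'isolate||spacer' identifiers.
# orderedIsolates = classifyCluster(isolates, 'spacer_matches.tsv')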
Example #57
import numpy as np
from scipy.cluster.hierarchy import average, dendrogram
from matplotlib import pyplot as plt

#import data
fname = './HumanAgeandFatness.csv'
datalist = np.loadtxt(fname, skiprows=1, delimiter=',')
print(datalist)
print('first row')
print(datalist[0, :])

X = datalist
print(X.shape)  # (n_samples, n_features)
#print(X)

# generate the linkage matrix
Z = average(X)
print(Z)
print(Z.shape)
print(Z[0])
#row format [idx1, idx2, dist, sample_count].

plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
dendrogram(
    Z,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
)
plt.show()
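# To make the row-format comment above concrete: a minimal sketch decoding
# the first merge (indices >= n_samples refer to clusters created by
# earlier merges).
idx1, idx2, dist, count = Z[0]
print('merged %d and %d at distance %.3f (%d leaves)'
      % (int(idx1), int(idx2), dist, int(count)))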
# Calculate the mean of the pairwise distances.
ii = np.tril_indices(distmat.shape[0], -1)
pwise = distmat[ii]
mdist = np.mean(pwise)
print(mdist)

# Generate a histogram of the pairwise distances.
plt.clf()
plt.hist(pwise, 20, color='lightblue')
plt.xlabel("Distance", size=17)
plt.ylabel("Frequency", size=17)
pdf.savefig()


# Do the clustering
h = clust.average(distmat)
print(h)
print(len(h))

# Plot the dendrogram
plt.figure(figsize=(16,10))
#plt.figure(linewidth=100)
plt.clf()
ax = plt.axes()
for pos in 'right','bottom','top':
    ax.spines[pos].set_color('none')
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')
#ax.spines['left'].set_position(('outward', 10))
clust.dendrogram(h, get_leaves=True, count_sort=True,
                 show_leaf_counts=False, color_threshold=1.5)
# no_labels=True
Example #59
def similarity_matrix_plot(config_file,
                           plot_file,
                           burnin=0,
                           max_clusters=None,
                           min_cluster_size=0,
                           samples=None,
                           thin=1):

    sb.set_style('whitegrid')

    labels = post_process.cluster_pyclone_trace(config_file,
                                                burnin,
                                                thin,
                                                max_clusters=max_clusters)

    labels = labels.set_index('mutation_id')

    labels = labels['cluster_id']

    color_map = utils.get_clusters_color_map(labels)

    cluster_sizes = labels.value_counts()

    used_clusters = cluster_sizes[cluster_sizes >= min_cluster_size].index

    labels = labels[labels.isin(used_clusters)]

    used_loci = labels.index

    trace_file = paths.get_labels_trace_file(config_file)

    labels_trace = trace.load_cluster_labels_trace(trace_file, burnin, thin)

    labels_trace = labels_trace[used_loci]

    dist_mat = pdist(labels_trace.values.T, 'hamming')

    Z = average(dist_mat)

    dist_mat = pd.DataFrame(squareform(dist_mat),
                            index=labels_trace.columns,
                            columns=labels_trace.columns)

    sim_mat = 1 - dist_mat

    N = sim_mat.shape[0]

    cluster_colors = labels.map(color_map)

    size = 0.12 * N

    g = sb.clustermap(sim_mat,
                      cmap='Blues',
                      col_colors=cluster_colors,
                      row_colors=cluster_colors,
                      col_linkage=Z,
                      row_linkage=Z,
                      figsize=(size, size))

    ax = g.ax_heatmap

    utils.set_tick_label_font_sizes(ax, defaults.small_tick_label_font_size)

    utils.set_tick_label_rotations(ax)

    ax.set_xlabel('Loci', fontsize=defaults.axis_label_font_size)

    ax.set_ylabel('Loci', fontsize=defaults.axis_label_font_size)

    g.fig.savefig(plot_file, bbox_inches='tight')
Example #60
    # NOTE: the opening of this snippet (apparently a k-means elbow sweep)
    # was truncated in the source; the loop header below is a hypothetical
    # reconstruction consistent with the surviving expression, and assumes
    # `from sklearn.cluster import KMeans` and
    # `from scipy.spatial.distance import cdist` at module level.
    K = range(1, 11)
    error = []
    for k in K:
        kmeans = KMeans(n_clusters=k).fit(all_data)
        error.append(
            sum(
                np.min(cdist(all_data.values, kmeans.cluster_centers_,
                             'euclidean'),
                       axis=1)))

    plt.figure()
    plt.plot(K, error, 'bx-')
    plt.xlabel('K')
    plt.ylabel('Error')
    plt.savefig('elbow.png')
    plt.close()

    plt.figure()
    single_linkage = hierarchy.single(all_data)
    dn = hierarchy.dendrogram(single_linkage)
    plt.savefig('single.png')
    plt.close()

    plt.figure()
    complete_linkage = hierarchy.complete(all_data)
    dn = hierarchy.dendrogram(complete_linkage)
    plt.savefig('complete_linkage.png')
    plt.close()

    plt.figure()
    average_linkage = hierarchy.average(all_data)
    dn = hierarchy.dendrogram(average_linkage)
    plt.savefig('average_linkage.png')
    plt.close()

    print('done')