def progressive_msa_and_tree(sequences,
                             pairwise_aligner,
                             metric=kmer_distance,
                             guide_tree=None,
                             display_aln=False,
                             display_tree=False):
    """Progressively align ``sequences`` and build a UPGMA tree from the result.

    Parameters
    ----------
    sequences : skbio.SequenceCollection
        The sequences to be aligned.
    pairwise_aligner : function
        Callable forwarded to ``progressive_msa`` for the pairwise alignment
        steps, for example
        skbio.alignment.global_pairwise_align_nucleotide. Must support
        skbio.Sequence objects or skbio.TabularMSA objects as input.
    metric : function, optional
        Callable returning a single distance value for a pair of sequences.
        It is used for the guide tree (only when ``guide_tree`` is None) and
        for the distance matrix of the finished alignment.
    guide_tree : skbio.TreeNode, optional
        Tree that guides the alignment order. Built with UPGMA when omitted.
    display_aln : bool, optional
        Print the alignment before returning.
    display_tree : bool, optional
        Draw a dendrogram of the output tree before returning.

    Returns
    -------
    msa
        The alignment returned by ``progressive_msa``.
    skbio.TreeNode
        UPGMA tree computed from the alignment.
    """
    if guide_tree is None:
        # No tree supplied: derive one from the unaligned sequences.
        unaligned_dm = DistanceMatrix.from_iterable(
            sequences, metric=metric, key='id')
        unaligned_lm = average(unaligned_dm.condensed_form())
        guide_tree = TreeNode.from_linkage_matrix(unaligned_lm,
                                                  unaligned_dm.ids)

    msa = progressive_msa(sequences, guide_tree,
                          pairwise_aligner=pairwise_aligner)
    if display_aln:
        print(msa)

    aligned_dm = DistanceMatrix.from_iterable(msa, metric=metric, key='id')
    aligned_lm = average(aligned_dm.condensed_form())
    msa_tree = TreeNode.from_linkage_matrix(aligned_lm, aligned_dm.ids)

    if display_tree:
        print("\nOutput tree:")
        dendrogram(aligned_lm,
                   labels=aligned_dm.ids,
                   orientation='right',
                   link_color_func=lambda _link: 'black',
                   leaf_font_size=24)
    return msa, msa_tree
def progressive_msa_and_tree(
    sequences,
    pairwise_aligner,
    sequence_distance_fn=kmer_distance,
    guide_tree=None,
    display_aln=False,
    display_tree=False,
):
    """Progressively align ``sequences`` and build a UPGMA tree from the result.

    Parameters
    ----------
    sequences : skbio.SequenceCollection
        The sequences to be aligned.
    pairwise_aligner : function
        Callable forwarded to ``progressive_msa`` for the pairwise alignment
        steps, for example
        skbio.Alignment.global_pairwise_align_nucleotide. Must support
        skbio.BiologicalSequence objects or skbio.Alignment objects as input.
    sequence_distance_fn : function
        Passed to ``sequences.distances`` to obtain the distance matrix for
        the guide tree. Only used when ``guide_tree`` is None.
    guide_tree : skbio.TreeNode, optional
        Tree that guides the alignment order. Built with UPGMA when omitted.
    display_aln : bool, optional
        Print the alignment before returning.
    display_tree : bool, optional
        Draw a dendrogram of the output tree before returning.

    Returns
    -------
    msa
        The alignment returned by ``progressive_msa``.
    skbio.TreeNode
        UPGMA tree computed from ``msa.distances()``.
    """
    if guide_tree is None:
        # No tree supplied: derive one from the unaligned sequences.
        unaligned_dm = sequences.distances(sequence_distance_fn)
        unaligned_lm = average(unaligned_dm.condensed_form())
        guide_tree = TreeNode.from_linkage_matrix(unaligned_lm,
                                                  unaligned_dm.ids)

    msa = progressive_msa(sequences, guide_tree,
                          pairwise_aligner=pairwise_aligner)
    if display_aln:
        print(msa)

    # The output tree uses the alignment's own default distance.
    aligned_dm = msa.distances()
    aligned_lm = average(aligned_dm.condensed_form())
    msa_tree = TreeNode.from_linkage_matrix(aligned_lm, aligned_dm.ids)

    if display_tree:
        print("\nOutput tree:")
        dendrogram(
            aligned_lm,
            labels=aligned_dm.ids,
            orientation="right",
            link_color_func=lambda _link: "black",
            leaf_font_size=24,
        )
    return msa, msa_tree
def hierclust(dataset, k):
    """Merge the biclusters of ``dataset`` into ``k`` groups using UPGMA.

    Reads one JSON bicluster per line from ``biclusters/<dataset>M.bic``,
    clusters them with average linkage on the pairwise ``Jaccard`` values,
    replays the first merges of the linkage until ``k`` clusters remain and
    writes one merged bicluster per line to ``biclusters/<dataset>.bic``.

    Parameters
    ----------
    dataset : str
        Base name of the bicluster files.
    k : int
        Number of clusters to keep.
    """
    # Context managers guarantee the files are closed even if parsing fails.
    with open('biclusters/' + dataset + 'M.bic') as f:
        bics = [json.loads(l) for l in f]

    # Condensed (upper-triangle, row-major) vector expected by scipy.
    # NOTE(review): assumes Jaccard() returns a distance, not a similarity.
    dist = [Jaccard(b1, b2)
            for i, b1 in enumerate(bics)
            for b2 in bics[i + 1:]]
    clusters = average(dist)

    # Start with singletons; every replayed merge gets the next free id,
    # matching the ids scipy uses in later linkage rows.
    clustdict = {i: [i] for i in range(len(clusters) + 1)}
    for i in range(len(clusters) - k + 1):
        clust1 = int(clusters[i][0])
        clust2 = int(clusters[i][1])
        clustdict[max(clustdict) + 1] = clustdict[clust1] + clustdict[clust2]
        del clustdict[clust1], clustdict[clust2]

    newbics = []
    for members in clustdict.values():
        objs = [o for idx in members for o in bics[idx]['objs']]
        feats = [ft for idx in members for ft in bics[idx]['feats']]
        newbics.append({'objs': list(set(objs)), 'feats': list(set(feats))})

    with open('biclusters/' + dataset + '.bic', 'w') as fw:
        for bic in newbics:
            fw.write(json.dumps(bic) + '\n')
def linkage_along_graph(self):
    """
    Return the UPGMA linkage matrix for the distances along the graph.

    The matrix is computed from ``self.graph_distances`` on first use and
    cached on the instance as ``_dist_linkage``.
    """
    cached = getattr(self, '_dist_linkage', None)
    if cached is None:
        # squareform turns the square distance matrix into condensed form.
        cached = average(squareform(self.graph_distances))
        self._dist_linkage = cached
    return cached
def cluster_alchemy(dataset, gamma=None, filter=False):
    """Cluster the documents of ``dataset`` with UPGMA and score the result.

    The tf-idf matrix comes from ``get_data_with_alchemy``; ``gamma`` and
    ``filter`` are forwarded only when ``gamma`` is truthy. Returns the
    ``params`` dict extended with 'avg_f_score' and 'all_fscore'.
    """
    processor = dp.DocumentsProcessor(dataset)
    if not gamma:
        tfidf_matrix, f_score_dict, params = processor.get_data_with_alchemy()
    else:
        tfidf_matrix, f_score_dict, params = processor.get_data_with_alchemy(
            gamma=gamma, filter=filter)

    print('starting clustering: found %s document and %s features'
          % (tfidf_matrix.shape[0], tfidf_matrix.shape[1]))

    # A 2-D array is treated by scipy as observation vectors.
    linkage_matrix = hr.average(tfidf_matrix.toarray())
    _, node_list = hr.to_tree(linkage_matrix, rd=True)

    # Map every internal node id to the leaves collected below it.
    clusters = {}
    for node in node_list:
        if node.is_leaf():
            continue
        clusters[node.get_id()] = collect_leaf_nodes(node, [])

    f = f_score(clusters, f_score_dict)
    all_scores = print_f_score_dict(f)
    params['avg_f_score'] = average_f_score(f, tfidf_matrix.shape[0])
    params['all_fscore'] = all_scores
    print('average f_score: %s' % params['avg_f_score'])
    return params
def guide_tree_from_sequences(sequences,
                              distance_fn=kmer_distance,
                              display_tree=False):
    """Build a UPGMA tree by applying ``distance_fn`` to ``sequences``.

    Parameters
    ----------
    sequences : skbio.SequenceCollection
        The sequences to be represented in the resulting guide tree.
    distance_fn : function
        Passed to ``sequences.distances`` to obtain the distance matrix.
    display_tree : bool, optional
        Draw a dendrogram of the tree before returning.

    Returns
    -------
    The tree object produced by ``to_tree`` for the UPGMA linkage
    (presumably scipy's ``ClusterNode`` -- confirm against the import).
    """
    distance_matrix = sequences.distances(distance_fn)
    linkage_matrix = average(distance_matrix.condensed_form())
    if display_tree:
        dendrogram(linkage_matrix,
                   labels=distance_matrix.ids,
                   orientation='right',
                   link_color_func=lambda _link: 'black')
    return to_tree(linkage_matrix)
def scipy_algo(dataset, abstract=False):
    """Cluster the documents of ``dataset`` with UPGMA; return the mean f-score.

    ``abstract`` is forwarded to ``DocumentsProcessor.get_data``.
    """
    processor = dp.DocumentsProcessor(dataset)
    tfidf_matrix, f_score_dict = processor.get_data(abstract)

    # The LSA pipeline is built but not applied: the fit_transform call is
    # disabled, so clustering runs on the raw tf-idf matrix.
    svd = TruncatedSVD(tfidf_matrix.shape[0])
    lsa = make_pipeline(svd, Normalizer(copy=False))
    #tfidf_matrix = lsa.fit_transform(tfidf_matrix)

    print('starting clustering after lsa: found %s document and %s features'
          % (tfidf_matrix.shape[0], tfidf_matrix.shape[1]))

    linkage_matrix = hr.average(tfidf_matrix.toarray())
    _, node_list = hr.to_tree(linkage_matrix, rd=True)

    # Map every internal node id to the leaves collected below it.
    clusters = {}
    for node in node_list:
        if node.is_leaf():
            continue
        clusters[node.get_id()] = collect_leaf_nodes(node, [])

    f = f_score(clusters, f_score_dict)
    print_f_score_dict(f)
    avg_f_score = average_f_score(f, tfidf_matrix.shape[0])
    print('average f_score: %s' % avg_f_score)
    return avg_f_score
def guide_tree_from_sequences(sequences,
                              metric=kmer_distance,
                              display_tree=False):
    """Build a UPGMA tree by applying ``metric`` to ``sequences``.

    Parameters
    ----------
    sequences : list of skbio.Sequence objects (or subclasses)
        The sequences to be represented in the resulting guide tree.
    metric : function
        Function that returns a single distance value when given a pair of
        skbio.Sequence objects.
    display_tree : bool, optional
        Draw a dendrogram of the tree before returning.

    Returns
    -------
    The tree object produced by ``to_tree`` for the UPGMA linkage
    (presumably scipy's ``ClusterNode`` -- confirm against the import).
    """
    distance_matrix = DistanceMatrix.from_iterable(
        sequences, metric=metric, key='id')
    linkage_matrix = average(distance_matrix.condensed_form())
    if display_tree:
        dendrogram(linkage_matrix,
                   labels=distance_matrix.ids,
                   orientation='right',
                   link_color_func=lambda _link: 'black')
    return to_tree(linkage_matrix)
def cluster_dandelion_2(dataset, gamma=0.91, filter=False):
    """Cluster LSA-reduced Dandelion features and return the linkage matrix.

    Duplicate of another clustering routine, kept only because it returns
    the linkage matrix. The f-scores are still computed and stored in
    ``params``, but ``params`` is not returned.
    """
    processor = dp.DocumentsProcessor(dataset)
    if not gamma:
        tfidf_matrix, f_score_dict, params = processor.get_data_with_dandelion()
    else:
        tfidf_matrix, f_score_dict, params = processor.get_data_with_dandelion(
            gamma=gamma, filter=filter)

    # LSA: truncated SVD followed by normalisation.
    svd = TruncatedSVD(tfidf_matrix.shape[0])
    lsa = make_pipeline(svd, Normalizer(copy=False))
    tfidf_matrix = lsa.fit_transform(tfidf_matrix)

    linkage_matrix = hr.average(tfidf_matrix)
    _, node_list = hr.to_tree(linkage_matrix, rd=True)

    # Map every internal node id to the leaves collected below it.
    clusters = {}
    for node in node_list:
        if node.is_leaf():
            continue
        clusters[node.get_id()] = collect_leaf_nodes(node, [])

    f = f_score(clusters, f_score_dict)
    all_scores = print_f_score_dict(f)
    params['avg_f_score'] = average_f_score(f, tfidf_matrix.shape[0])
    params['all_fscore'] = all_scores
    return linkage_matrix
def hcluster(self):
    """Plot a UPGMA dendrogram of the nodes based on shortest-path lengths.

    .. plot::
        :include-source:
        :width: 50%

        from cno import XCNOGraph, cnodata
        c = XCNOGraph(cnodata("PKN-ToyPB.sif"), cnodata("MD-ToyPB.csv"))
        c.hcluster()

    .. warning:: experimental

    Path lengths are taken on the undirected version of the graph. Pairs
    with no connecting path keep a distance of 0 in the matrix.
    """
    from scipy.cluster import hierarchy
    from scipy.spatial import distance

    nodes = list(self.nodes())
    # One lookup table instead of a linear list.index() per matrix cell.
    position = dict((node, i) for i, node in enumerate(nodes))
    n = len(nodes)
    distances = np.zeros((n, n))

    # dict() accepts both a mapping and an iterator of (node, lengths)
    # pairs, whichever the installed networkx returns.
    path_length = dict(nx.all_pairs_shortest_path_length(self.to_undirected()))
    for u, lengths in path_length.items():
        row = position[u]
        for v, d in lengths.items():
            # Row/column i is node i; the former "- 1" shift rotated the
            # matrix by one node.
            distances[row][position[v]] = d

    sd = distance.squareform(distances)
    hier = hierarchy.average(sd)
    pylab.clf()
    # Let scipy label the leaves so labels follow the dendrogram order,
    # not the original node order.
    hierarchy.dendrogram(hier, labels=nodes)
def linkage_in_structure(self):
    """
    Return the UPGMA linkage matrix based on the correlation structure of
    the topslam embedding MST.

    Computed from ``self.distances_in_structure`` with the 'correlation'
    metric on first use and cached on the instance as ``_struct_linkage``.
    """
    cached = getattr(self, '_struct_linkage', None)
    if cached is None:
        correlation_dist = pdist(self.distances_in_structure,
                                 metric='correlation')
        cached = average(correlation_dist)
        self._struct_linkage = cached
    return cached
def make_tree(X, C, method='single'):
    """Build a ``Tree`` from a hierarchical clustering.

    Parameters
    ----------
    X
        Input passed to ``ward`` when ``method == 'ward'``.
    C
        Input passed to ``single`` / ``average`` for those methods.
    method : str
        One of 'single', 'ward' or 'average'.

    Raises
    ------
    ValueError
        If ``method`` is not a supported linkage name (this previously
        surfaced as an UnboundLocalError).
    """
    if method == 'single':
        linkage_matrix = single(C)
    elif method == 'ward':
        linkage_matrix = ward(X)
    elif method == 'average':
        linkage_matrix = average(C)
    else:
        raise ValueError(
            "method must be 'single', 'ward' or 'average', not %r" % (method,))
    return Tree(root=construct_node(to_tree(linkage_matrix)))
def group_tuples(items=None, val_ind=None, dist_thresh = 0.1, distance_matrix=None, metric='jaccard', linkage='complete', sp_areas=None):
    '''
    Group items by hierarchical clustering cut at a distance threshold.

    items: a dict or list of tuples
    val_ind: the index of the item of interest within each tuple
    dist_thresh: cut-off passed to fcluster with criterion='distance'
    distance_matrix: square pairwise distance matrix; computed with
        compute_pairwise_distances(values, metric, sp_areas=sp_areas)
        when not given
    linkage: 'complete', 'average' or 'single'

    Returns (index_groups, item_groups, distance_matrix).
    '''

    # NOTE(review): the branch nesting below was reconstructed from a
    # whitespace-collapsed source -- confirm against the original file.
    if distance_matrix is not None:
        if items is not None:
            if isinstance(items, dict):
                keys = items.keys()
                values = items.values()
            elif isinstance(items, list):
                keys = range(len(items))
                if isinstance(items[0], tuple):
                    values = map(itemgetter(val_ind), items)
                else:
                    values = items
    else:
        if isinstance(items, dict):
            keys = items.keys()
            values = items.values()
        elif isinstance(items, list):
            keys = range(len(items))
            if isinstance(items[0], tuple):
                values = map(itemgetter(val_ind), items)
            else:
                values = items
        else:
            raise Exception('clusters is not the right type')

        assert items is not None, 'items must be provided'
        distance_matrix = compute_pairwise_distances(values, metric, sp_areas=sp_areas)

    if items is None:
        assert distance_matrix is not None, 'distance_matrix must be provided.'

    # NOTE(review): any other `linkage` value leaves `lk` unassigned and the
    # fcluster call below fails with a NameError.
    if linkage=='complete':
        lk = complete(squareform(distance_matrix))
    elif linkage=='average':
        lk = average(squareform(distance_matrix))
    elif linkage=='single':
        lk = single(squareform(distance_matrix))

    # T = fcluster(lk, 1.15, criterion='inconsistent')
    T = fcluster(lk, dist_thresh, criterion='distance')

    n_groups = len(set(T))
    groups = [None] * n_groups
    for group_id in range(n_groups):
        # group ids are compared against group_id+1, i.e. 1-based labels
        groups[group_id] = np.where(T == group_id+1)[0]

    # NOTE(review): when only distance_matrix is supplied (items is None),
    # `keys` is never assigned and `items[i]` indexes None -- confirm that
    # callers always pass items.
    index_groups = [[keys[i] for i in g] for g in groups if len(g) > 0]
    item_groups = [[items[i] for i in g] for g in groups if len(g) > 0]
    return index_groups, item_groups, distance_matrix
def cluster_fabio(db, dataset, gamma=None, with_lsa=False, ranking_metric='r'):
    """Cluster the documents of ``dataset`` with UPGMA and score the result.

    Features come from ``get_data_fabio``; ``gamma`` is forwarded only when
    truthy. With ``with_lsa`` the tf-idf matrix is reduced by truncated SVD
    plus normalisation before clustering. Returns the ``params`` dict
    extended with 'avg_f_score' and 'all_fscore'.
    """
    processor = dp.DocumentsProcessor(dataset, db=db)
    if not gamma:
        tfidf_matrix, f_score_dict, params = processor.get_data_fabio(
            rank_metric=ranking_metric)
    else:
        tfidf_matrix, f_score_dict, params = processor.get_data_fabio(
            rank_metric=ranking_metric, gamma=gamma)

    doc, features = tfidf_matrix.shape
    print('starting clustering: found %s document and %s features'
          % (doc, features))

    if not with_lsa:
        linkage_matrix = hr.average(tfidf_matrix.toarray())
    else:
        svd = TruncatedSVD(tfidf_matrix.shape[0])
        lsa = make_pipeline(svd, Normalizer(copy=False))
        tfidf_matrix = lsa.fit_transform(tfidf_matrix)
        linkage_matrix = hr.average(tfidf_matrix)

    _, node_list = hr.to_tree(linkage_matrix, rd=True)

    # Map every internal node id to the leaves collected below it.
    clusters = {}
    for node in node_list:
        if node.is_leaf():
            continue
        clusters[node.get_id()] = collect_leaf_nodes(node, [])

    f = f_score(clusters, f_score_dict)
    all_scores = print_f_score_dict(f)
    params['avg_f_score'] = average_f_score(f, tfidf_matrix.shape[0])
    params['all_fscore'] = all_scores
    print('average f_score: %s' % params['avg_f_score'])
    return params
def cluster_matrix(matrix, labels, dpi, by_cols, algorithm):
    """
    Cluster a score matrix hierarchically and reorder it by dendrogram leaf

    Pairwise distances between rows are computed with ``pdist``, a
    dendrogram is drawn and saved to ``dendrogram.png``, and the rows are
    rearranged to follow the dendrogram leaf order.

    :param matrix: a numpy matrix of scores
    :param labels: the ids for all row elements or column elements
    :param dpi: the resolution to save the diagram at
    :param by_cols: cluster columns instead of rows (the matrix is
                    transposed before and after)
    :param algorithm: falsy uses ``linkage`` with its defaults, truthy
                      uses UPGMA (``average``)

    :type matrix: numpy matrix
    :type labels: list
    :type dpi: int
    :type by_cols: boolean
    :type algorithm: boolean

    :returns: a tuple of the updated (clustered) matrix & the updated labels
    """
    if by_cols:
        matrix = matrix.transpose()
    print("\nClustering the matrix")

    # Start from a clean figure and hide the y axis labels/ticks.
    plt.clf()
    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.set_yticklabels([])
    axes.set_yticks([])
    plt.xticks(fontsize=6)

    condensed = pdist(matrix)
    if algorithm:
        Z = average(condensed)
        print("UPGMA algorithm\n")
    else:
        Z = linkage(condensed)
        print("Linkage algorithm\n")

    dend = dendrogram(Z, labels=labels, link_color_func=None)
    plt.savefig("dendrogram.png", dpi=dpi)

    # Rebuild the matrix with rows in dendrogram leaf order.
    updated_labels = dend['ivl']
    matrix = np.array([list(matrix[row, :]) for row in dend['leaves']])
    if by_cols:
        matrix = matrix.transpose()
    return matrix, updated_labels
def __apply_cluster_alg(cluster_data=[], alg="kmean", prior_cluster_num=2, t=0.155):
    """Cluster ``cluster_data`` with the algorithm named by ``alg``.

    Parameters
    ----------
    cluster_data : array-like
        One observation per row. The default list is never mutated.
    alg : str
        "kmean" (whitened k-means), "spec" (spectral clustering on
        standardised data) or one of the hierarchical variants "hflat"
        (``linkage`` defaults), "hcomp" (complete), "hweight" (weighted),
        "havg" (average).
    prior_cluster_num : int
        Number of clusters for "kmean" and "spec".
    t : float
        Distance threshold used to cut the hierarchy (hierarchical variants).

    Returns
    -------
    (idx, n_clusters)
        Cluster label per observation and the number of clusters: the
        requested number for "kmean"/"spec", the number found otherwise.

    Raises
    ------
    ValueError
        If ``alg`` is not one of the supported names (previously an
        unbound-variable error).
    AssertionError
        If a hierarchical run yields 64 or more clusters.
    """
    if alg == "kmean":
        from scipy.cluster.vq import kmeans, vq, whiten
        cluster_data = whiten(cluster_data)
        centroids, _ = kmeans(cluster_data, prior_cluster_num, iter=250)
        idx, dist = vq(cluster_data, centroids)
        return idx, prior_cluster_num

    if alg == "spec":
        from sklearn import cluster
        from sklearn.preprocessing import StandardScaler
        X = StandardScaler().fit_transform(cluster_data)
        spectral = cluster.SpectralClustering(n_clusters=prior_cluster_num,
                                              eigen_solver="arpack")
        spectral.fit(X)
        # Builtin int replaces the removed numpy alias N.int.
        idx = spectral.labels_.astype(int)
        return idx, prior_cluster_num

    # Hierarchical clustering, see
    # http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html
    import numpy as N
    import scipy.cluster.hierarchy as hcluster
    import scipy.spatial.distance as dist

    linkage_by_alg = {
        "hflat": hcluster.linkage,
        "hcomp": hcluster.complete,
        "hweight": hcluster.weighted,
        "havg": hcluster.average,
    }
    if alg not in linkage_by_alg:
        raise ValueError("unknown clustering algorithm: %r" % (alg,))

    # The linkage functions need a condensed distance matrix, see
    # http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.distance.pdist.html
    distmat = dist.pdist(cluster_data, "minkowski")
    link = linkage_by_alg[alg](distmat)
    idx = hcluster.fcluster(link, t=t, criterion="distance")

    post_cluster_num = len(N.unique(idx))
    print("# of channels established: %s" % post_cluster_num)
    assert post_cluster_num < 64, "number of cluster too large to be biological meaningful"
    return idx, post_cluster_num
def main():
    """Cluster document groups by jargon distance and save a dendrogram.

    Reads three input files found by CreateList, collects the text of each
    group, computes 1 - Calculate_CH(...) for every ordered pair of groups,
    runs UPGMA on the resulting matrix and saves the dendrogram as a PNG.
    """
    # NOTE(review): these non-raw literals contain "\U"; Python 3 rejects
    # that escape, so the block only parses under Python 2.
    URL = 'C:\Users\NYU\Desktop\\'
    ListofInputFiles = URL + "A31" + "\*"
    DendogramImage = "C:\Users\NYU\Desktop\Dendogram" + ".png"
    FieldNames = []
    ReadGroups = []
    CulturalHoleMatrix = []
    ArraysOfFileReads = []
    CleanedContent = defaultdict()
    FileContent = defaultdict()
    CulturalHole = defaultdict(list)
    Preprocessed = defaultdict(list)
    FileReads = CreateList(ListofInputFiles)
    # Second file: comma separated, the second column holds the field name.
    with open(FileReads[1]) as f:
        for l in f:
            FieldNames.append(l.strip().split(",")[1])
    # First file: tab separated; column 0 is used as key, column 1 as content.
    with open(FileReads[0]) as f:
        for l in f:
            ArraysOfFileReads.append(l.strip().split("\t"))
    for i in range(len(ArraysOfFileReads)):
        FileContent[ArraysOfFileReads[i][0]] = ArraysOfFileReads[i][1]
    # Third file: tab separated; column 0 is a content key, column 1 a group.
    with open(FileReads[2]) as f1:
        for l in f1:
            ReadGroups.append(l.strip().split("\t"))
    # Drop the first row (presumably a header -- TODO confirm).
    NewList = ReadGroups[1:]
    # NOTE(review): i counts from 0 over NewList[1:] but indexes NewList, so
    # entries 0..len-2 are visited and the last entry never is -- confirm.
    for i,f in enumerate(NewList[1:]):
        if FileContent[NewList[i][0]] == 'null':
            continue
        else:
            Preprocessed[NewList[i][1]].append(FileContent[NewList[i][0]])
    for k,v in Preprocessed.items():
        # Remove all single-letter and alpha-numeric entries
        Words = (word for word in str(v).split() if word.isalpha() and len(word)>1)
        CleanedContent[k] = StopWords(Words)
    KeyList = sorted([int(i) for i in CleanedContent])
    # One row per writer group, one column per reader group (sorted keys).
    for i in itertools.product(KeyList, repeat=2):
        Writer = str(i[0])
        Reader = str(i[1])
        CulturalHole[i[0]].append(1 - Calculate_CH(CleanedContent[Writer],CleanedContent[Reader],(CleanedContent[Writer] + " " + CleanedContent[Reader])))
    # NOTE(review): rows follow the dict's iteration order, which need not
    # match the sorted column order on every Python version -- confirm.
    CulturalHoleMatrix = [CulturalHole[key] for key in CulturalHole]
    UPGMAMatrix = np.array(CulturalHoleMatrix)
    # UPGMA clustering
    # NOTE(review): a 2-D array is treated by scipy as observation vectors,
    # not as a precomputed distance matrix -- confirm this is intended.
    UPGMACluster = UPGMA.average(UPGMAMatrix)
    fig = plt.figure(figsize=(20,10))
    plt.title("Document Jargon Distance/Relation")
    # Dendrogram plotting
    UPGMA.dendrogram(UPGMACluster, labels=np.array(FieldNames))
    plt.xlabel("Group Names")
    plt.savefig(DendogramImage)
def create_dendrogram(g):
    """
    create_dendrogram(g)
    Build the UPGMA hierarchy of a graph from its shortest-path lengths.

    Node ``u`` is stored at row/column ``int(u) - 1`` of the distance matrix.

    :param g: source graph
    :return: hier - linkage matrix of the hierarchy
    """
    logging.info(cs_ref, 'create graph')
    all_lengths = nx.all_pairs_shortest_path_length(g)
    size = len(g.nodes())
    dist_matrix = np.zeros((size, size))
    for source, targets in all_lengths.iteritems():
        row = int(source) - 1
        for target, hops in targets.iteritems():
            dist_matrix[row][int(target) - 1] = hops
    condensed = distance.squareform(dist_matrix)
    return hierarchy.average(condensed)
def plotHaplotypes(chr, startPos, endPos):
    """Plot the haplotypes of one chromosome window, ordered by a UPGMA tree.

    Loads SNP data for chromosome ``chr``, keeps the SNPs whose position lies
    in [startPos, endPos], clusters the accessions on the kinship matrix,
    saves the dendrogram, then saves a matrix plot of the SNPs with the
    accessions rearranged to follow the dendrogram order.
    """
    snpsd = dataParsers.parseCSVData(
        "/Network/Data/250k/dataFreeze_011209/250K_192_043009.csv")[chr - 1]
    import scipy as sp
    import scipy.cluster.hierarchy as hc
    import Emma
    snpsd = snpsd.getSnpsData()

    # Collect the SNPs inside the window; stops at the first position
    # beyond endPos.
    newSnps = []
    positions = []
    for i, pos in enumerate(snpsd.positions):
        if pos > endPos:
            break
        if pos >= startPos:
            newSnps.append(snpsd.snps[i])
            positions.append(pos)

    print("calculating the kinship")
    K = Emma.calcKinship(newSnps)
    Z = hc.average(K)
    import pylab
    dend_dict = hc.dendrogram(Z, labels=snpsd.accessions)
    new_acc_order = dend_dict['ivl']
    print(new_acc_order)
    print(snpsd.accessions)
    pylab.savefig("/Users/bjarni/tmp/FRI_tree.pdf", format='pdf')

    # Position of every accession in the dendrogram ordering.
    acc_mapping = [new_acc_order.index(acc) for acc in snpsd.accessions]

    snps = []
    for snp in newSnps:
        reordered = [0] * len(snp)
        for nt, target in zip(snp, acc_mapping):
            reordered[target] = nt
        snps.append(reordered)
    snps = sp.array(snps)
    pylab.matshow(snps.transpose())
    pylab.savefig("/Users/bjarni/tmp/FRI_haplotype.pdf", format='pdf')
def write_tree(cluster_method):
    """Cluster the matrix in ``distance_matrix`` and write the tree to disk.

    The tab-separated file ``distance_matrix`` is read with its first column
    as index, its values are squared, converted to condensed form and
    clustered. The resulting tree is written, with single quotes removed,
    to ``bsr_matrix.tree``.

    Parameters
    ----------
    cluster_method : str
        "average" (UPGMA) or "weighted" (WPGMA).

    Raises
    ------
    SystemExit
        With status 1 when ``cluster_method`` is not supported. The check
        runs before any file is read.
    """
    import scipy.spatial.distance as ssd

    linkage_by_method = {"average": average, "weighted": weighted}
    if cluster_method not in linkage_by_method:
        print("invalid cluster method chosen")
        # Non-zero status so callers and shells can detect the failure.
        sys.exit(1)

    dmx = pd.read_csv("distance_matrix", index_col=0, sep="\t")
    ids = dmx.index.tolist()
    squared = np.square(dmx.values)
    distArray = ssd.squareform(squared)
    hclust = linkage_by_method[cluster_method](distArray)

    t = TreeNode.from_linkage_matrix(hclust, ids)
    nw = str(t).replace("'", "")
    # The context manager closes the file even if the write fails.
    with open("bsr_matrix.tree", "w") as outfile:
        outfile.write(nw)
def dendrogram(data, by_cols):
    """Build a serialisable dendrogram description for ``data``.

    Clusters the distance matrix returned by
    ``data.to_distance_matrix(by_cols)`` with UPGMA and returns a dict with
    the dataset name and key, the tree converted by ``dict_node`` and the
    leaf labels (column names when ``by_cols`` is truthy, row names
    otherwise).
    """
    dist_mat = data.to_distance_matrix(by_cols)
    clusters = hierarchy.average(dist_mat)
    # rd=False: only the root node is returned
    tree = hierarchy.to_tree(clusters, rd=False)
    leaf_labels = []
    # NOTE(review): `leaves` is not assigned anywhere in this function; unless
    # it is a module-level name this loop raises NameError. Presumably the
    # leaf order of `clusters` was intended -- confirm.
    for i in leaves:
        if by_cols:
            leaf_labels.append(data.col_names[i])
        else:
            leaf_labels.append(data.row_names[i])
    response_output = {
        'name': data.name,
        'key': data.key,
        'tree': dict_node(tree, leaf_labels, 'root'),
        'labels': leaf_labels
    }
    return response_output
def run(self):
    """Cluster the sewer network hierarchically and report its communities.

    Builds an undirected graph whose nodes are 0-based indices of the
    conduit end points (in order of first appearance), saves a UPGMA
    dendrogram of the shortest-path lengths to ``tree.png`` and prints the
    partition found by ``community.best_partition``. Any exception is
    caught and printed; nothing is returned.
    """
    try:
        g = networkx.Graph()
        sewer = self.getData("Sewer")
        names = sewer.getNamesOfComponentsInView(self.conduits)

        # Point name -> node index; a dict avoids rescanning a list for
        # every end point.
        node_index = {}
        for nc in names:
            c = sewer.getEdge(nc)
            start = node_index.setdefault(c.getStartpointName(), len(node_index))
            end = node_index.setdefault(c.getEndpointName(), len(node_index))
            g.add_edge(start, end)

        # dict() accepts both a mapping and an iterator of (node, lengths)
        # pairs, whichever the installed networkx returns.
        path_length = dict(networkx.all_pairs_shortest_path_length(g))
        n = len(g.nodes())
        distances = numpy.zeros((n, n))
        for u, p in path_length.items():
            for v, d in p.items():
                # Nodes are already 0-based; the former "- 1" shift rotated
                # the matrix so dendrogram leaf ids were off by one.
                distances[u][v] = d
        # NOTE(review): pairs without a connecting path keep distance 0.

        sd = distance.squareform(distances)
        hier = hierarchy.average(sd)
        hierarchy.dendrogram(hier)
        matplotlib.pylab.savefig("tree.png", format="png")

        partition = community.best_partition(g)
        print(partition)
        for i in set(partition.values()):
            print("Community %s" % (i,))
            members = [node for node in partition.keys()
                       if partition[node] == i]
            print(members)
    except Exception as e:
        # Best-effort module: report the failure instead of propagating it.
        print(e)
        print("Unexpected error:")
def cluster_dandelion_entities(dataset, gamma=None, filter=False):
    """Cluster entity-only features with LSA + UPGMA and score the result.

    ``gamma`` and ``filter`` are forwarded to
    ``get_data_only_with_entities`` only when ``gamma`` is truthy. Returns
    the ``params`` dict extended with 'avg_f_score' and 'all_fscore'.
    """
    processor = dp.DocumentsProcessor(dataset)
    if not gamma:
        tfidf_matrix, f_score_dict, params = \
            processor.get_data_only_with_entities()
    else:
        tfidf_matrix, f_score_dict, params = \
            processor.get_data_only_with_entities(gamma=gamma, filter=filter)

    doc, features = tfidf_matrix.shape
    print('starting clustering: found %s document and %s features'
          % (doc, features))

    # LSA: truncated SVD followed by normalisation.
    svd = TruncatedSVD(tfidf_matrix.shape[0])
    lsa = make_pipeline(svd, Normalizer(copy=False))
    tfidf_matrix = lsa.fit_transform(tfidf_matrix)
    print('starting clustering: found %s document and %s features after LSA'
          % (tfidf_matrix.shape[0], tfidf_matrix.shape[1]))

    linkage_matrix = hr.average(tfidf_matrix)
    _, node_list = hr.to_tree(linkage_matrix, rd=True)

    # Map every internal node id to the leaves collected below it.
    clusters = {}
    for node in node_list:
        if node.is_leaf():
            continue
        clusters[node.get_id()] = collect_leaf_nodes(node, [])

    f = f_score(clusters, f_score_dict)
    all_scores = print_f_score_dict(f)
    params['avg_f_score'] = average_f_score(f, tfidf_matrix.shape[0])
    params['all_fscore'] = all_scores
    print('average f_score: %s' % params['avg_f_score'])
    return params
def guide_tree_from_query_sequences(query_sequences,
                                    distance_fn=three_mer_distance,
                                    display_tree=False):
    """Build a UPGMA guide tree from pairwise sequence distances.

    Parameters
    ----------
    query_sequences : sequence of (seq_id, sequence) pairs
        Iterated more than once, so it must be re-iterable (e.g. a list).
    distance_fn : function, optional
        Called as ``distance_fn(seq1, seq2)`` for every ordered pair and
        expected to return a single distance. This argument used to be
        ignored in favour of a hard-coded ``kmer_distance(..., k=3)``.
        NOTE(review): the default ``three_mer_distance`` is assumed to be
        that same 3-mer distance -- confirm.
    display_tree : bool, optional
        Draw a dendrogram of the tree before returning.

    Returns
    -------
    The tree object produced by ``to_tree`` for the UPGMA linkage.
    """
    seq_ids = [seq_id for seq_id, _ in query_sequences]
    rows = [[distance_fn(seq1, seq2) for _, seq2 in query_sequences]
            for _, seq1 in query_sequences]
    guide_dm = DistanceMatrix(rows, seq_ids)
    guide_lm = average(guide_dm.condensed_form())
    guide_tree = to_tree(guide_lm)
    if display_tree:
        dendrogram(guide_lm,
                   labels=guide_dm.ids,
                   orientation='right',
                   link_color_func=lambda _link: 'black')
    return guide_tree
def write_tree(cluster_method):
    """Cluster the matrix in ``distance_matrix`` and write the tree to disk.

    The tab-separated file ``distance_matrix`` is read with its first column
    as index, its values are squared, converted to condensed form and
    clustered. The resulting tree is written, with single quotes removed,
    to ``bsr_matrix.tree``.

    Parameters
    ----------
    cluster_method : str
        "average" (UPGMA) or "weighted" (WPGMA).

    Raises
    ------
    SystemExit
        With status 1 when ``cluster_method`` is not supported. The check
        runs before any file is read.
    """
    import scipy.spatial.distance as ssd

    linkage_by_method = {"average": average, "weighted": weighted}
    if cluster_method not in linkage_by_method:
        print("invalid cluster method chosen")
        # Non-zero status so callers and shells can detect the failure.
        sys.exit(1)

    dmx = pd.read_csv("distance_matrix", index_col=0, sep="\t")
    ids = dmx.index.tolist()
    # .values is used here in place of the older dmx.as_matrix() call.
    squared = np.square(dmx.values)
    distArray = ssd.squareform(squared)
    hclust = linkage_by_method[cluster_method](distArray)

    t = TreeNode.from_linkage_matrix(hclust, ids)
    nw = str(t).replace("'", "")
    # The context manager closes the file even if the write fails.
    with open("bsr_matrix.tree", "w") as outfile:
        outfile.write(nw)
def progressive_msa_and_tree(query_sequences, gap_open_penalty=8, gap_extend_penalty=1, substitution_matrix=nt_substitution_matrix, msa_distance_fn=compute_aligned_sequence_distances, guide_tree=None, display_aln=False, display_tree=False): msa, guide_tree = progressive_msa(query_sequences, guide_tree, gap_open_penalty, gap_extend_penalty, substitution_matrix) if display_aln: print "Multiple sequence alignment:\n" for seq_id, seq in msa: print seq, "(%s)" % seq_id dm = msa_distance_fn(msa) lm = average(dm.condensed_form()) tree = to_tree(lm) if display_tree: print "\nOutput tree:" d = dendrogram(lm, labels=dm.ids, orientation='right', link_color_func=lambda x: 'black', leaf_font_size=24) return msa, tree
def compute_distance_matrix(covers):
    """Compute pairwise cover distances and their UPGMA linkage.

    The distance between two covers is 1 minus the adjusted Rand score of
    their crisp memberships.

    Returns the square distance matrix (as ``np.matrix``), its condensed
    form and the average-linkage matrix.
    """
    count = len(covers)
    pairwise = np.zeros((count, count))
    print('Calculating distance matrix ... ')
    for first in range(count):
        for second in range(first + 1, count):
            score = skmetrics.adjusted_rand_score(
                to_crisp_membership(covers[first].membership),
                to_crisp_membership(covers[second].membership))
            # Fill both triangles so the matrix stays symmetric.
            pairwise[first, second] = 1 - score
            pairwise[second, first] = 1 - score
    distance_matrix = np.matrix(pairwise)
    y = squareform(distance_matrix)
    Z = average(y)
    return distance_matrix, y, Z
def guide_tree_from_query_sequences(query_sequences,
                                    distance_fn=three_mer_distance,
                                    display_tree=False):
    """Build a UPGMA guide tree from pairwise sequence distances.

    Parameters
    ----------
    query_sequences : sequence of (seq_id, sequence) pairs
        Iterated more than once, so it must be re-iterable (e.g. a list).
    distance_fn : function, optional
        Called as ``distance_fn(seq1, seq2)`` for every ordered pair and
        expected to return a single distance. This argument used to be
        ignored in favour of a hard-coded ``kmer_distance(..., k=3)``.
        NOTE(review): the default ``three_mer_distance`` is assumed to be
        that same 3-mer distance -- confirm.
    display_tree : bool, optional
        Draw a dendrogram of the tree before returning.

    Returns
    -------
    The tree object produced by ``to_tree`` for the UPGMA linkage.
    """
    seq_ids = [seq_id for seq_id, _ in query_sequences]
    rows = [[distance_fn(seq1, seq2) for _, seq2 in query_sequences]
            for _, seq1 in query_sequences]
    guide_dm = SymmetricDistanceMatrix(rows, seq_ids)
    guide_lm = average(guide_dm.condensed_form())
    guide_tree = to_tree(guide_lm)
    if display_tree:
        dendrogram(guide_lm,
                   labels=guide_dm.ids,
                   orientation='right',
                   link_color_func=lambda _link: 'black')
    return guide_tree
def cluster_dandelion_abstract(dataset, gamma=None, filter=False):
    """Cluster abstract-only features with LSA + UPGMA and score the result.

    With a truthy ``gamma`` the data is loaded with ``gamma`` and ``filter``;
    otherwise with ``min_df=2`` and ``relevance_threshold=0.95``. The
    tf-idf matrix is reduced with a 1300-component truncated SVD before
    clustering. Returns the ``params`` dict extended with 'avg_f_score' and
    'all_fscore'.
    """
    processor = dp.DocumentsProcessor(dataset)
    if not gamma:
        tfidf_matrix, f_score_dict, params = \
            processor.get_data_only_with_abstract(min_df=2,
                                                  relevance_threshold=0.95)
    else:
        tfidf_matrix, f_score_dict, params = \
            processor.get_data_only_with_abstract(gamma=gamma, filter=filter)

    doc, features = tfidf_matrix.shape
    print('starting clustering: found %s document and %s features'
          % (doc, features))

    # LSA: truncated SVD followed by normalisation.
    svd = TruncatedSVD(1300)
    lsa = make_pipeline(svd, Normalizer(copy=False))
    tfidf_matrix = lsa.fit_transform(tfidf_matrix)
    print('starting clustering: found %s document and %s features after LSA'
          % (tfidf_matrix.shape[0], tfidf_matrix.shape[1]))

    linkage_matrix = hr.average(tfidf_matrix)
    _, node_list = hr.to_tree(linkage_matrix, rd=True)

    # Map every internal node id to the leaves collected below it.
    clusters = {}
    for node in node_list:
        if node.is_leaf():
            continue
        clusters[node.get_id()] = collect_leaf_nodes(node, [])

    f = f_score(clusters, f_score_dict)
    all_scores = print_f_score_dict(f)
    params['avg_f_score'] = average_f_score(f, tfidf_matrix.shape[0])
    params['all_fscore'] = all_scores
    print('average f_score: %s' % params['avg_f_score'])
    return params
def demo_elbow_method(multiplexity_matrix):
    """
    Performs agglomerative (average linkage) clustering at 2..30 clusters
    and plots the silhouette score for each cluster count.

    Distances are ``multiplexity_matrix.max() - multiplexity_matrix``,
    symmetrised by averaging entries (i, j) and (j, i).
    """
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy.cluster import hierarchy
    from sklearn.metrics import silhouette_score
    from scipy.spatial.distance import squareform

    sns.set_style("whitegrid")
    sns.set_context("notebook", font_scale=1.2, rc={"lines.linewidth": 2.5})

    src_dist_matrix = multiplexity_matrix.max() - multiplexity_matrix
    num_objects = len(src_dist_matrix)

    # Collapse the (possibly) asymmetric matrix into the symmetric condensed
    # pairwise array scipy expects.
    pdist_array = []
    for i in range(num_objects):
        for j in range(i + 1, num_objects):
            mean_dist = (src_dist_matrix[i, j] + src_dist_matrix[j, i]) / 2
            pdist_array.append(mean_dist)
    pdist_matrix = squareform(pdist_array)
    linkage = hierarchy.average(pdist_array)

    nn = np.arange(2, 31)
    scores = [
        silhouette_score(
            pdist_matrix,
            hierarchy.fcluster(linkage, n, criterion='maxclust'),
            metric='precomputed')
        for n in nn
    ]
    scores = pd.DataFrame({
        'Number of clusters': nn,
        'Silhouette score': scores
    })

    plt.title('Agglomerative clustering')
    sns.lineplot(data=scores,
                 x='Number of clusters',
                 y='Silhouette score',
                 markers=False,
                 dashes=True)
def compute_distance_matrix(covers):
    """Compute pairwise cover distances and their UPGMA linkage.

    The distance between two covers is 1 minus the adjusted Rand score of
    their crisp memberships.

    Returns the square distance matrix (as ``np.matrix``), its condensed
    form and the average-linkage matrix.
    """
    count = len(covers)
    pairwise = np.zeros((count, count))
    print('Calculating distance matrix ... ')
    for first in range(count):
        for second in range(first + 1, count):
            score = skmetrics.adjusted_rand_score(
                to_crisp_membership(covers[first].membership),
                to_crisp_membership(covers[second].membership))
            # Fill both triangles so the matrix stays symmetric.
            pairwise[first, second] = 1 - score
            pairwise[second, first] = 1 - score
    distance_matrix = np.matrix(pairwise)
    y = squareform(distance_matrix)
    Z = average(y)
    return distance_matrix, y, Z
def scipy_algo(dataset):
    """Cluster the documents of ``dataset`` with UPGMA and print the f-scores.

    Nothing is returned; the per-cluster scores and their average are
    printed.
    """
    processor = dp.DocumentsProcessor(dataset)
    tfidf_matrix, f_score_dict = processor.get_data()

    linkage_matrix = hr.average(tfidf_matrix.toarray())
    _, node_list = hr.to_tree(linkage_matrix, rd=True)

    # Map every internal node id to the leaves collected below it.
    clusters = {}
    for node in node_list:
        if node.is_leaf():
            continue
        clusters[node.get_id()] = collect_leaf_nodes(node, [])

    f = f_score(clusters, f_score_dict)
    print_f_score_dict(f)
    print('average f_score: %s' % average_f_score(f, tfidf_matrix.shape[0]))
def create_correlation_tree(corr_matrix, method="single"):
    """Creates hierarchical clustering (correlation tree) from a correlation matrix

    Parameters
    ----------
    corr_matrix : np.ndarray
        ``(p, p)``-shaped correlation matrix
    method : str
        the method of hierarchical clustering: 'single', 'average', 'fro',
        or 'complete'. Defaults to 'single'. 'fro' uses ``1 - corr**2`` as
        the distance and average linkage; every other method uses
        ``1 - |corr|``.

    Returns
    -------
    link : np.ndarray
        The `link` of the correlation tree, as in scipy

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Distance matrix for tree method
    if method != "fro":
        raw_dist = 1 - np.abs(corr_matrix)
    else:
        raw_dist = 1 - np.power(corr_matrix, 2)
    dist_matrix = np.around(raw_dist, decimals=7)
    # Remove the diagonal so squareform accepts the matrix.
    dist_matrix -= np.diagflat(np.diag(dist_matrix))
    condensed = ssd.squareform(dist_matrix)

    # Create linkage
    if method in ("average", "fro"):
        return hierarchy.average(condensed)
    if method == "single":
        return hierarchy.single(condensed)
    if method == "complete":
        return hierarchy.complete(condensed)
    raise ValueError(
        f'Only "single", "complete", "average", "fro" are valid methods, not {method}'
    )
def create_hc(G, t=1.2):
    """Create a hierarchical clustering of graph G from its distance matrix.

    Pre- and post-processes a labelled graph for clustering and returns the
    clustering result. The threshold is a parameter; a suitable range for
    it should be found by experimenting on each data set.

    Based on code written by Drew Conway, with optimisations.

    Parameters
    ----------
    G : networkx graph
        Graph whose nodes are clustered by shortest-path length.
    t : float
        Threshold passed to ``fcluster`` (with its default criterion).

    Returns
    -------
    list of list
        One list of node labels per cluster.
    """
    # Build the shortest-path distance matrix while keeping node labels.
    labels = list(G.nodes())
    indx = dict((word, ind) for ind, word in enumerate(labels))

    # Pairs with no connecting path keep the large default distance.
    distances = numpy.full((len(G), len(G)), 10000.0)

    # dict() accepts both a mapping and an iterator of (node, lengths)
    # pairs, whichever the installed networkx returns.
    path_length = dict(nx.all_pairs_shortest_path_length(G))
    for u, p in path_length.items():
        uind = indx[u]
        for v, d in p.items():
            distances[uind][indx[v]] = d  # u and v are both words

    # Create the hierarchical clustering.
    Y = distance.squareform(distances)
    Z = hierarchy.average(Y)

    # This choice of partition is arbitrary, for illustration only.
    membership = list(hierarchy.fcluster(Z, t=t))
    partition = defaultdict(list)
    for n, p in zip(list(range(len(G))), membership):
        partition[p].append(labels[n])
    return list(partition.values())
def create_hc(G, t=1.2):
    """Create a hierarchical clustering of graph G from its distance matrix.

    Pre- and post-processes a labelled graph for clustering and returns the
    clustering result. The threshold is a parameter; a suitable range for
    it should be found by experimenting on each data set.

    Based on code written by Drew Conway, with optimisations.

    Parameters
    ----------
    G : networkx graph
        Graph whose nodes are clustered by shortest-path length.
    t : float
        Threshold passed to ``fcluster`` (with its default criterion).

    Returns
    -------
    list of list
        One list of node labels per cluster.
    """
    # Build the shortest-path distance matrix while keeping node labels.
    labels = list(G.nodes())
    indx = dict((word, ind) for ind, word in enumerate(labels))

    # Pairs with no connecting path keep the large default distance.
    distances = numpy.full((len(G), len(G)), 10000.0)

    # dict() accepts both a mapping and an iterator of (node, lengths)
    # pairs, whichever the installed networkx returns.
    path_length = dict(nx.all_pairs_shortest_path_length(G))
    for u, p in path_length.items():
        uind = indx[u]
        for v, d in p.items():
            distances[uind][indx[v]] = d  # u and v are both words

    # Create the hierarchical clustering.
    Y = distance.squareform(distances)
    Z = hierarchy.average(Y)

    # This choice of partition is arbitrary, for illustration only.
    membership = list(hierarchy.fcluster(Z, t=t))
    partition = defaultdict(list)
    for n, p in zip(list(range(len(G))), membership):
        partition[p].append(labels[n])
    return list(partition.values())
def run(self):
    """Cluster the sewer network hierarchically and report its communities.

    Builds an undirected graph whose nodes are 0-based indices of the
    conduit end points (in order of first appearance), saves a UPGMA
    dendrogram of the shortest-path lengths to ``tree.png`` and prints the
    partition found by ``community.best_partition``. Any exception is
    caught and printed; nothing is returned.
    """
    try:
        g = networkx.Graph()
        sewer = self.getData("Sewer")
        names = sewer.getNamesOfComponentsInView(self.conduits)

        # Point name -> node index; a dict avoids rescanning a list for
        # every end point.
        node_index = {}
        for nc in names:
            c = sewer.getEdge(nc)
            start = node_index.setdefault(c.getStartpointName(), len(node_index))
            end = node_index.setdefault(c.getEndpointName(), len(node_index))
            g.add_edge(start, end)

        # dict() accepts both a mapping and an iterator of (node, lengths)
        # pairs, whichever the installed networkx returns.
        path_length = dict(networkx.all_pairs_shortest_path_length(g))
        n = len(g.nodes())
        distances = numpy.zeros((n, n))
        for u, p in path_length.items():
            for v, d in p.items():
                # Nodes are already 0-based; the former "- 1" shift rotated
                # the matrix so dendrogram leaf ids were off by one.
                distances[u][v] = d
        # NOTE(review): pairs without a connecting path keep distance 0.

        sd = distance.squareform(distances)
        hier = hierarchy.average(sd)
        hierarchy.dendrogram(hier)
        matplotlib.pylab.savefig("tree.png", format="png")

        partition = community.best_partition(g)
        print(partition)
        for i in set(partition.values()):
            print("Community %s" % (i,))
            members = [node for node in partition.keys()
                       if partition[node] == i]
            print(members)
    except Exception as e:
        # Best-effort module: report the failure instead of propagating it.
        print(e)
        print("Unexpected error:")
def get_clusters_average():
    """Plot an average-linkage dendrogram of the module-level ``df``.

    Prints the number of observations, the linkage matrix and its length,
    then shows the dendrogram with two dashed horizontal guide lines at
    heights 2500 and 850, captioned (in Russian) "two clusters" and
    "three clusters". The view is restricted to x in [2567, 3000] and
    y in [0, 10].
    """
    observations = np.array(df)
    print(len(observations))

    linkage_array = average(observations)
    print(linkage_array)
    print(len(linkage_array))

    dendrogram(linkage_array)
    axes = plt.gca()
    x_limits = axes.get_xbound()

    # (height, caption) of each guide line; all lines are drawn before
    # any caption is added.
    cut_lines = ((2500, ' два кластера'), (850, ' три кластера'))
    for height, _ in cut_lines:
        axes.plot(x_limits, [height, height], '--', c='k')
    for height, caption in cut_lines:
        axes.text(x_limits[1], height, caption, va='center',
                  fontdict={'size': 5})

    plt.xlabel("Индекс наблюдения")
    plt.ylabel("Кластерное расстояние")
    plt.xlim(2567, 3000)
    plt.ylim(0, 10)
    plt.show()
def hierarchy_clustering(df):
    """Show complete, average, single and Ward dendrograms of ``df`` side by side.

    All four linkage matrices are computed first; each is then drawn on its
    own axis of a 1x4 figure, truncated to the last 12 merged clusters and
    labelled with ``df.index``.
    """
    _, axes = plt.subplots(1, 4, figsize=(50, 18))

    linkage_matrices = [
        hierarchy.complete(df),
        hierarchy.average(df),
        hierarchy.single(df),
        hierarchy.ward(df),
    ]
    for linkage_matrix, axis in zip(linkage_matrices, axes):
        hierarchy.dendrogram(
            linkage_matrix,
            labels=df.index,
            p=12,
            truncate_mode="lastp",
            orientation='top',
            color_threshold=0,
            leaf_font_size=10,
            distance_sort=True,
            ax=axis,
        )

    titles = ('Complete Linkage', 'Average Linkage', 'Single Linkage', 'Ward')
    for axis, title in zip(axes, titles):
        axis.set_title(title)
    plt.show()
def clustering(self):
    """Cluster the rows of the transposed, log-scaled count table.

    Returns
    -------
    tuple
        ``(corr, leaves, labels, d)``: the correlation matrix rounded to two
        decimals with a unit diagonal, the dendrogram leaf order, the row
        labels of the transposed table, and the full dict returned by
        ``dendrogram`` (computed with ``no_plot=True``).
    """
    log_counts = np.log10(self.counts().transpose() + 1)
    row_labels = log_counts.index.values

    # Correlation distance; the correlation itself is recovered from the
    # unclipped values, while only the clipped distances are clustered.
    raw_dist = pdist(log_counts, 'correlation')
    corr_condensed = 1 - raw_dist
    linkage_matrix = average(np.clip(raw_dist, 0, 1))

    tree = dendrogram(linkage_matrix, labels=row_labels, get_leaves=True,
                      no_plot=True)

    # squareform() assumes a distance vector and therefore leaves zeros on
    # the diagonal; a correlation matrix needs ones there.
    corr_matrix = np.round(squareform(corr_condensed), 2)
    np.fill_diagonal(corr_matrix, 1)
    return corr_matrix, tree["leaves"], row_labels, tree
def arrangeClusters(pairwiseDistanceMatrix):
    """Group point indices by flat cluster and plot the dendrogram.

    The square distance matrix is condensed, clustered with average linkage
    and cut at distance 0.8. One bucket per flat cluster is appended to
    ``orderedCluster`` and each point index is placed in the bucket of its
    cluster. Every bucket is printed, and the dendrogram is saved to
    ``result.png`` and shown.

    NOTE(review): ``orderedCluster`` is not defined in this function, so it
    is presumably a module-level list; buckets would accumulate across
    repeated calls - confirm.
    """
    condensed = spatial.distance.squareform(pairwiseDistanceMatrix)
    Z = average(condensed)
    assignments = fcluster(Z, 0.8, criterion='distance')

    # Cluster ids start at 1, so the largest id equals the cluster count.
    orderedCluster.extend([] for _ in range(0, max(assignments)))
    for point, cluster_id in enumerate(assignments):
        orderedCluster[cluster_id - 1].append(point)

    for element in orderedCluster:
        print(element)

    dn = dendrogram(Z)
    plt.savefig('result.png')
    plt.show()
def plot_dendrogram(self,topology_type='litho',path=None):
    '''
    Draws a UPGMA (average linkage) dendrogram from the difference matrix
    returned by get_difference_matrix for the given topology type.

    **Arguments**
     - *topology_type* = The type of topology you are interested in. This
       should be either 'litho' or 'struct'
     - *path* = Intended as a path to save the image to.
       NOTE(review): this argument is currently not used by the method; the
       dendrogram is only drawn on the current matplotlib axes and is
       neither saved nor shown here.

    If the condensed difference vector has two or fewer entries, an error
    message is printed instead and nothing is drawn.
    '''
    import scipy.spatial.distance as dist
    import scipy.cluster.hierarchy as clust

    # squareform converts the square difference matrix to the condensed
    # vector form expected by scipy's linkage functions.
    m_dif = dist.squareform(self.get_difference_matrix(topology_type),
                            force='tovector')

    # NOTE(review): a condensed vector longer than 2 means at least three
    # items, so two items also end up in the error branch even though the
    # message speaks of a single topology - confirm the intended threshold.
    if len(m_dif) > 2:
        # generate dendrogram using UPGMA
        Z = clust.average(m_dif)

        # generate plot
        clust.dendrogram(Z)
    else:
        # a tree cannot be built from a single topology
        print("Error: only a single unique topology of this type has been found")
def cluster_with_mpear(X, max_clusters=None):
    '''Pick the flat clustering with the highest ``_compute_mpear`` score.

    Args:
        X : (array) An array with as many rows as (post-burnin) MCMC
            iterations and columns as data points.
        max_clusters : (int or None) Upper bound on the number of clusters
            tried. ``None`` tries up to one more than the number of data
            points; otherwise the bound is capped at the number of data
            points. The bound is never below 1.

    Returns:
        The labels of the best scoring clustering. The single-cluster
        labelling is returned when no candidate scores above 0.
    '''
    # Transpose so that each row is one data point.
    X = np.array(X).T
    n_points = len(X)

    dist_mat = pdist(X, metric='hamming')
    sim_mat = 1 - squareform(dist_mat)
    Z = average(dist_mat)

    limit = n_points + 1 if max_clusters is None else min(max_clusters, n_points)
    limit = max(limit, 1)

    best_score = 0
    best_cluster_labels = _get_flat_clustering(Z, 1)
    for n_clusters in range(2, limit + 1):
        candidate = _get_flat_clustering(Z, n_clusters)
        score = _compute_mpear(candidate, sim_mat)
        if score > best_score:
            best_score, best_cluster_labels = score, candidate
    return best_cluster_labels
def CalculateClusterTree(self):
    """Build a hierarchical cluster tree and return it as a Newick string.

    The dissimilarities come from ``self.GenerateFullMatrix(self.results)``,
    a mapping of sample name to a mapping of sample name to dissimilarity.
    The linkage method is chosen by whichever radio button is selected
    (single, UPGMA/average, complete or weighted).

    Returns
    -------
    str
        Result of ``self.CreateNewickString(root, labels)`` followed by ';'.

    Raises
    ------
    ValueError
        If none of the linkage radio buttons is selected.
    """
    fullMatrix = self.GenerateFullMatrix(self.results)

    # list() keeps the labels indexable on Python 3, where keys() is a view.
    labels = list(fullMatrix.keys())

    # Upper triangle in row-major order, i.e. the condensed distance form
    # expected by the scipy linkage functions.
    dissMatrix = []
    for i, sampleNameI in enumerate(labels):
        for sampleNameJ in labels[i + 1:]:
            dissMatrix.append(fullMatrix[sampleNameI][sampleNameJ])

    # calculate hierarchical cluster tree
    if self.radioSingleLinkage.GetValue():
        linkageMatrix = single(dissMatrix)
    elif self.radioUPGMA.GetValue():
        linkageMatrix = average(dissMatrix)
    elif self.radioCompleteLinkage.GetValue():
        linkageMatrix = complete(dissMatrix)
    elif self.radioWeighted.GetValue():
        linkageMatrix = weighted(dissMatrix)
    else:
        # Previously this fell through to an unbound local variable.
        raise ValueError("No linkage method is selected")

    root = to_tree(linkageMatrix)

    # create Newick string
    return self.CreateNewickString(root, labels) + ';'
def CalculateClusterTree(self):
    """Build a hierarchical cluster tree and return it as a Newick string.

    The dissimilarities come from ``self.GenerateFullMatrix(self.results)``,
    a mapping of sample name to a mapping of sample name to dissimilarity.
    The linkage method is chosen by whichever radio button is selected
    (single, UPGMA/average, complete or weighted).

    Returns
    -------
    str
        Result of ``self.CreateNewickString(root, labels)`` followed by ';'.

    Raises
    ------
    ValueError
        If none of the linkage radio buttons is selected.
    """
    fullMatrix = self.GenerateFullMatrix(self.results)

    # list() keeps the labels indexable on Python 3, where keys() is a view.
    labels = list(fullMatrix.keys())

    # Upper triangle in row-major order, i.e. the condensed distance form
    # expected by the scipy linkage functions.
    dissMatrix = []
    for i, sampleNameI in enumerate(labels):
        for sampleNameJ in labels[i + 1:]:
            dissMatrix.append(fullMatrix[sampleNameI][sampleNameJ])

    # calculate hierarchical cluster tree
    if self.radioSingleLinkage.GetValue():
        linkageMatrix = single(dissMatrix)
    elif self.radioUPGMA.GetValue():
        linkageMatrix = average(dissMatrix)
    elif self.radioCompleteLinkage.GetValue():
        linkageMatrix = complete(dissMatrix)
    elif self.radioWeighted.GetValue():
        linkageMatrix = weighted(dissMatrix)
    else:
        # Previously this fell through to an unbound local variable.
        raise ValueError("No linkage method is selected")

    root = to_tree(linkageMatrix)

    # create Newick string
    return self.CreateNewickString(root, labels) + ';'
def cluster_zips(area_features, linkage, t, return_dist=False):
    """Cluster zip codes hierarchically using Euclidean distance.

    Parameters
    ----------
    area_features : str or dict
        Either a mapping whose values are the feature vectors, or an area
        name; in the latter case the mapping is loaded from
        ``data/<area_features>/census/features.json``.
    linkage : {'single', 'average', 'complete'}
        Linkage method used to build the hierarchy.
    t : float
        Distance threshold passed to ``fcluster`` (``criterion='distance'``).
    return_dist : bool, optional
        When true, also return the square pairwise distance matrix.

    Returns
    -------
    numpy.ndarray or tuple
        The flat cluster label of every feature vector, in the iteration
        order of the mapping's values; ``(distances, labels)`` when
        ``return_dist`` is true.

    Raises
    ------
    TypeError
        If ``area_features`` is neither a string nor a dict.
    ValueError
        If ``linkage`` is not one of the supported methods.
    """
    if isinstance(area_features, str):
        feature_path = 'data/{}/census/features.json'.format(area_features)
        # Context manager so the file handle is always released.
        with open(feature_path, 'r') as handle:
            features = json.load(handle)
    elif isinstance(area_features, dict):
        features = area_features
    else:
        raise TypeError(
            "area_features must be a str or a dict, got {}".format(
                type(area_features).__name__))

    linkage_methods = {
        'single': hierarchy.single,
        'average': hierarchy.average,
        'complete': hierarchy.complete,
    }
    if linkage not in linkage_methods:
        raise ValueError("Unsupported linkage method: {!r}".format(linkage))

    # atleast_2d keeps the 2-D shape that np.matrix used to guarantee.
    feat = np.atleast_2d(np.asarray(list(features.values())))
    y = pdist(feat, 'euclidean')
    Z = linkage_methods[linkage](y)
    f = hierarchy.fcluster(Z, criterion='distance', t=t)

    if return_dist:
        return (squareform(y), f)
    return f
# Draw graph `z` at the positions `pos` and save the figure as a PDF.
nx.draw_networkx(z,pos)
plt.draw()
plt.savefig('Karate_graph.pdf')

#-----------------------------------------------------------------------------
# 3: Hierarchical clustering:
# Fill an n x n matrix with the shortest-path length between every node pair.
# NOTE(review): .iteritems() exists only on Python 2 dicts and requires
# all_pairs_shortest_path_length to return a dict - confirm the Python and
# networkx versions in use.
path_length=nx.all_pairs_shortest_path_length(z)
n = len(z.nodes())
distances=np.zeros((n,n))
for u,p in path_length.iteritems():
    for v,d in p. iteritems ():
        # assumes node ids are integers in 0..n-1 so they can index the
        # matrix directly - TODO confirm
        distances[u][v] = d

# NOTE(review): `distances` is a square matrix; scipy treats 2-D input to
# average() as observation vectors rather than as a distance matrix -
# confirm whether a condensed form (squareform) was intended.
hier = hierarchy.average( distances )
plt.figure(2)
hierarchy.dendrogram(hier)
plt.savefig('h_clustering.pdf')

#-----------------------------------------------------------------------------
# 4: Spectral clustering:
# NOTE(review): the visible body stops after the eigen-decomposition; `k` is
# not used and nothing is returned in the part shown here.
def spectralClustering(W, k):
    # Create degree matrix
    D = diag(sum(W, axis=0))
    # Create Laplacian matrix
    L = D - W
    eigval, eigvec = linalg.eig(L) # Calculate eigenvalues and eigenvectors
    eigval = eigval.real # Keep the real part
    eigvec = eigvec.real # Keep the real part
# Echo the command-line arguments as a '#'-prefixed header line.
print("#", cmd_args)

num_clusters = args.num_clusters
# The index file is optional; None means no index was supplied.
if args.index_file is not None:
    index_filename = args.index_file
else:
    index_filename = None

# read the RMSD file and
# convert to condensed upper triangular
rmsd = numpy.loadtxt(args.rmsd_file)
upper = squareform(rmsd)
# Average-linkage hierarchy over the condensed distances.
link = sch.average(upper)
# Optionally save the linkage matrix, with the command line as its header.
if args.link:
    numpy.savetxt(args.prefix + ".link", link, header=cmd_args)

# Cut the hierarchy into at most `num_clusters` flat clusters.
assignments = sch.fcluster(link, num_clusters, criterion='maxclust')

# Read the index file if one was supplied
# Each line is split into three whitespace-separated fields:
# first, last and filename.
indices = {}
if index_filename:
    with(open(index_filename)) as index_file:
        for line in index_file.readlines():
            (first, last, filename) = line.split()
            first = int(first)
            last = int(last)
            t = basename(filename)
# print (m) # -------------- # dm = DistanceMatrix(X, labels) # sys.exit(1) # tree = nj(dm) # nj() # print(tree.ascii_art()) sys.exit(1) # --------------------- # calculating UPGMA x = average(X) # average (X) file_1 = open('results/clustered_data2.txt','w') for i in x: file_1.write(f'{int(i[0])}\t{int(i[1])}\t{i[2]}\t{int(i[3])}\n') print("Done avg") # fig = plt.figure(figsize=(350,120), dpi=100) fig = plt.figure() # figsize=(200, 200) dn = dendrogram(x, labels=labels, orientation='left') plt.xticks(rotation='horizontal') plt.yticks(rotation='horizontal')
#single link H = h.single(X) print H.shape #sono tutti i link effettuati (#esempi-1) e per ciascuno abbiamo # coppie di cluster uniti, distanza e #esempi contenuti in nuovo cluster h.dendrogram(H) pl.show() #il dendogramma e' lungo perche' c'e' chain effect tipico problema del single link #comlpete link H = h.complete(X) h.dendrogram(H) pl.show() #average link H = h.average(X) h.dendrogram(H) pl.show() #centroid link H = h.centroid(X) h.dendrogram(H) pl.show() #ci sono delle inversioni perche' la distanza qui non e' monotona #per ottenere un cluster devo definire una distanza H = h.average(X) C = h.fcluster(H, 1.9, criterion='distance') #la soglia 3.5 sembra buona dal grafico #per vedere il numero di cluster: print "n cluster:", len(np.unique(C))
def setupUi(self, Form,Matrix,list_d, c):
    """Compute "center" trees for a distance matrix and build the result form.

    Two heuristics are reported, first for the whole matrix and, when
    ``c == "Y"``, again for each of up to six average-linkage clusters:

    * approach #1: the tree(s) whose row mean is smallest;
    * approach #2: the tree(s) whose row has the smallest summed absolute
      deviation from the vector of row means.

    Tree ids in the report are 1-based row numbers of ``Matrix``.

    Parameters
    ----------
    Form : Qt widget
        Widget on which the text box, labels and button are created.
    Matrix : numpy.ndarray
        Matrix of pairwise distances between trees; it is passed to
        ``ssd.squareform`` when clustering is requested.
        NOTE(review): approach #2 indexes the row means by column position,
        which presumably relies on the matrix being square - confirm.
    list_d
        Not used by this method.
    c : str
        ``"Y"`` enables the per-cluster analysis.
    """
    separator = "----------------------------------------------------------------------------------------\n"
    yourarray = Matrix

    # Approach #1 on the whole matrix: rows with the smallest mean.
    center = yourarray.mean(axis=1)
    min_mean = center[0]
    mean_tree = []
    for r in range(0, len(center)):
        if center[r] < min_mean:
            mean_tree = [r + 1]
            min_mean = center[r]
        elif center[r] == min_mean:
            mean_tree.append(r + 1)
    print(mean_tree)

    # Approach #2 on the whole matrix. The first row provides the initial
    # score to beat; sqrt(x**2) is the absolute value of x.
    best = 0
    first_row = yourarray[0]
    for i in range(0, len(first_row)):
        best += math.sqrt((first_row[i] - center[i]) ** 2)
    min_tree = []
    for r in range(len(yourarray)):
        row = yourarray[r]
        total = 0
        for i in range(0, len(row)):
            total += math.sqrt((row[i] - center[i]) ** 2)
            # Stop early: this row can no longer beat the current best.
            if total > best:
                break
        if total < best:
            min_tree = [r + 1]
            best = total
        elif total == best:
            min_tree.append(r + 1)
    print(min_tree)

    text = separator
    text += "Center tree-approach #1:" + str(mean_tree) + "\n"
    text += "Center tree-approach #2:" + str(min_tree) + "\n"
    text += separator

    if c == "Y":
        distances = Matrix
        distArray = ssd.squareform(distances)
        linkage_matrix = average(distArray)
        fc = hier.fcluster(linkage_matrix, 6, criterion='maxclust')

        # cluster id -> 1-based ids of the trees assigned to it
        clusters = defaultdict(list)
        for pos in range(0, len(fc)):
            clusters[fc[pos]].append(pos + 1)

        for key in clusters:
            members = clusters[key]

            # Distances among the members of this cluster only. The rows and
            # columns are selected by the members' own (0-based) positions;
            # indexing by position within the cluster always picked the
            # top-left block of the matrix.
            rows = [member - 1 for member in members]
            Cluster_distances = np.array(
                [[distances[i, j] for j in rows] for i in rows])

            # Approach #1 within the cluster.
            center = Cluster_distances.mean(axis=1)
            min_mean = float("inf")
            mean_tree = []
            for r in range(0, len(center)):
                if center[r] < min_mean:
                    mean_tree = [members[r]]
                    min_mean = center[r]
                elif center[r] == min_mean:
                    mean_tree.append(members[r])

            # Approach #2 within the cluster.
            best = 0
            first_row = Cluster_distances[0]
            for i in range(0, len(first_row)):
                best += math.sqrt((first_row[i] - center[i]) ** 2)
            min_tree = []
            for r in range(len(Cluster_distances)):
                row = Cluster_distances[r]
                total = 0
                for i in range(0, len(row)):
                    total += math.sqrt((row[i] - center[i]) ** 2)
                    if total > best:
                        break
                if total < best:
                    min_tree = [members[r]]
                    best = total
                elif total == best:
                    min_tree.append(members[r])

            text += "Cluster ID:" + str(key) + "\n"
            text += "Cluster tree set:" + str(clusters[key]) + "\n"
            text += "Center tree-approach #1:" + str(mean_tree) + "\n"
            text += "Center tree-approach #2:" + str(min_tree) + "\n"
            text += separator

    self.text = text

    # Build the widgets and show the report in the text box.
    Form.setObjectName("Form")
    Form.resize(587, 515)
    self.textEdit = QtWidgets.QTextEdit(Form)
    self.textEdit.setGeometry(QtCore.QRect(30, 60, 531, 391))
    self.textEdit.setObjectName("textEdit")
    self.label = QtWidgets.QLabel(Form)
    self.label.setGeometry(QtCore.QRect(40, 20, 221, 21))
    font = QtGui.QFont()
    font.setPointSize(12)
    self.label.setFont(font)
    self.label.setObjectName("label")
    self.lineEdit = QtWidgets.QLineEdit(Form)
    self.lineEdit.setGeometry(QtCore.QRect(30, 480, 251, 20))
    self.lineEdit.setObjectName("lineEdit")
    self.pushButton_2 = QtWidgets.QPushButton(Form)
    self.pushButton_2.setGeometry(QtCore.QRect(290, 480, 71, 23))
    self.pushButton_2.setObjectName("pushButton_2")
    self.label_2 = QtWidgets.QLabel(Form)
    self.label_2.setGeometry(QtCore.QRect(30, 460, 111, 16))
    self.label_2.setObjectName("label_2")
    self.textEdit.setPlainText(text)
    self.retranslateUi(Form)
    QtCore.QMetaObject.connectSlotsByName(Form)
    Form.show()
    self.pushButton_2.clicked.connect(self.write_to_file)
# Plot the dendrogram of the linkage held in `result`.
plt.figure(figsize=(15, 10))
h.dendrogram(result)
plt.show()

# Cut the hierarchy at distance 5588 and compare the flat clusters with `y`.
# The two scores are computed as bare expressions: they are neither stored
# nor printed here.
flat_single = h.fcluster(result, 5588, criterion='distance')
adjusted_rand_score(y.flatten(), flat_single)
adjusted_mutual_info_score(y.flatten(), flat_single)

"""### **Average-Link (Group-Link)** Questa misura di similarità calcola la distanza tra i due cluster come la media delle distanze tra i singoli elementi. Questo criterio rappresenta una soluzione intermedia tra il *single-link* e il *complete-link*. """

# Same steps for average linkage, cut at distance 1394.
# The name `flat_single` is reused for the average-linkage result.
result = h.average(X)
plt.figure(figsize=(15, 10))
h.dendrogram(result)
plt.show()

flat_single = h.fcluster(result, 1394, criterion='distance')
adjusted_rand_score(y.flatten(), flat_single)
adjusted_mutual_info_score(y.flatten(), flat_single)

"""### **Centroid** Per ogni cluster viene calcolato un *centroide* che rappresenta la media. I cluster vengono uniti in base a i centroidi più simili tra loro. Tali cluster vengono uniti a due a due. """
# In[8]: # Normalizando y centrando la tabla from sklearn.preprocessing import StandardScaler scaler = StandardScaler() scaled_values = scaler.fit_transform(data) data.loc[:, :] = scaled_values print(data) datos = data # In[9]: ward_res = ward(datos) #Ward single_res = single(datos) #Salto mínimo complete_res = complete(datos) #Salto Máxim average_res = average(datos) #Promedio # ### b) Ejecute un Clustering Jerarquico con la agregacion del Salto Maximo, Salto Mınimo, Promedio y Ward. Grafique el dendograma con cortes para dos y tres clusteres. # In[10]: dendrogram(average_res, labels=datos.index.tolist()) plt.figure(figsize=(13, 10)) dendrogram(complete_res, labels=datos.index.tolist()) plt.figure(figsize=(13, 10)) dendrogram(single_res, labels=datos.index.tolist()) plt.figure(figsize=(13, 10)) dendrogram(ward_res, labels=datos.index.tolist()) # Agrega cortes con 2 y 3 clústeres con agregación de Ward ax = plt.gca()
# Fit two-cluster agglomerative models with average and then complete
# linkage, printing the resulting labels each time.
hclust_model = cluster.AgglomerativeClustering(n_clusters = 2, linkage = 'average')
hclust_model.fit(X)
print('Cluster labels: {}\n'.format(hclust_model.labels_))
hclust_model = cluster.AgglomerativeClustering(n_clusters = 2, linkage = 'complete')
hclust_model.fit(X)
print('Cluster labels: {}\n'.format(hclust_model.labels_))

# Section banner (Python 2 print statement).
print ''' ********************************************************************************************************************* scipy: dendrogram ********************************************************************************************************************* '''

# from: https://github.com/JWarmenhoven/ISLR-python/blob/master/Notebooks/Chapter%2010.ipynb
# One dendrogram per linkage method, stacked in three rows.
fig, (ax1,ax2,ax3) = plt.subplots(3,1, figsize=(15,18))
# NOTE(review): the loop variable `cluster` rebinds the name used above for
# `cluster.AgglomerativeClustering`; after this loop `cluster` refers to the
# value returned by dendrogram().
for linkage, cluster, ax in zip([hierarchy.complete(X), hierarchy.average(X), hierarchy.single(X)], ['c1','c2','c3'], [ax1,ax2,ax3]):
    cluster = hierarchy.dendrogram(linkage, ax=ax, color_threshold=0)
ax1.set_title('Complete Linkage')
ax2.set_title('Average Linkage')
ax3.set_title('Single Linkage')
plt.show()
def classifyCluster(isolateList, spacermatch):
    """Order isolates by average-linkage clustering of their match scores.

    Reads a tab-separated match file, counts how often each
    (query isolate, subject isolate) pair occurs, writes the counts to
    ``<spacermatch>.score``, computes Bray-Curtis distances between the
    columns of the resulting score table and returns the leaf order of the
    average-linkage dendrogram.

    Parameters
    ----------
    isolateList
        Not used by this function.
    spacermatch : str
        Path of the match file. The first two tab-separated fields of each
        line are read; the isolate name is the part of a field before the
        first ``||``.

    Returns
    -------
    list
        Isolate ids in dendrogram leaf order (``dendrogram(...)['ivl']``).
    """
    from numpy import zeros
    from scipy.cluster.hierarchy import average, dendrogram
    from skbio.stats.distance import DistanceMatrix

    lenIsolate = {}
    pairscore = {}
    with open(spacermatch) as fl:
        curIsolate = ''
        n = 0
        for line in fl:
            array = line.strip().split('\t')
            qmatch = array[0].split('||')[0]
            smatch = array[1].split('||')[0]
            # Count the length of the current run of lines with the same
            # query isolate. The previous code compared against the string
            # literal 'qmatch' and never updated curIsolate, so every count
            # stayed at 1.
            if curIsolate != qmatch:
                curIsolate = qmatch
                n = 1
            else:
                n += 1
            lenIsolate[qmatch] = n
            pair = qmatch + '||' + smatch
            pairscore[pair] = pairscore.get(pair, 0) + 1

    # The self-score of an isolate is its own line count.
    for it in lenIsolate:
        pairscore[it + '||' + it] = lenIsolate[it]

    score_path = spacermatch + '.score'
    with open(score_path, 'w') as fl:
        for pair in pairscore:
            query, subject = pair.split('||')
            fl.write('%s\t%s\t%i\n' % (query, subject, pairscore[pair]))

    # Reading by path lets pandas open and close the file itself.
    df = pd.read_table(score_path, sep='\t',
                       names=['qmatch', 'smatch', 'score'])
    df_matrix = df.pivot(index='qmatch', columns='smatch', values='score')
    df_matrix_adjusted = df_matrix.fillna(0)

    def bray_curtis_distance(table, sample1_id, sample2_id):
        """Return the Bray-Curtis distance between two columns of ``table``."""
        numerator = 0
        denominator = 0
        sample1_counts = table[sample1_id]
        sample2_counts = table[sample2_id]
        for sample1_count, sample2_count in zip(sample1_counts,
                                                sample2_counts):
            numerator += abs(sample1_count - sample2_count)
            denominator += sample1_count + sample2_count
        return numerator / denominator

    def table_to_distances(table, pairwise_distance_fn):
        """Build a symmetric DistanceMatrix over the columns of ``table``."""
        sample_ids = table.columns
        num_samples = len(sample_ids)
        data = zeros((num_samples, num_samples))
        for i, sample1_id in enumerate(sample_ids):
            # Only the lower triangle is computed; it is mirrored.
            for j, sample2_id in enumerate(sample_ids[:i]):
                data[i, j] = data[j, i] = pairwise_distance_fn(
                    table, sample1_id, sample2_id)
        return DistanceMatrix(data, sample_ids)

    bc_dm = table_to_distances(df_matrix_adjusted, bray_curtis_distance)
    lm = average(bc_dm.condensed_form())
    d = dendrogram(lm, labels=bc_dm.ids, orientation='right',
                   link_color_func=lambda x: 'black')
    orderedIsolates = d['ivl']
    return orderedIsolates
from matplotlib import pyplot as plt #import data file = './HumanAgeandFatness.csv' target = open(file, 'r') datalist = np.loadtxt(file, skiprows=1, delimiter=',') print(datalist) print('first row') print(datalist[0, :]) X = datalist print(X.shape) # 150 samples with 2 dimensions #print(X) # generate the linkage matrix Z = average(X) print(Z) print(Z.shape) print(Z[0]) #row format [idx1, idx2, dist, sample_count]. plt.figure(figsize=(25, 10)) plt.title('Hierarchical Clustering Dendrogram') plt.xlabel('sample index') plt.ylabel('distance') dendrogram( Z, leaf_rotation=90., # rotates the x axis labels leaf_font_size=8., # font size for the x axis labels ) plt.show()
# Calculate the mean of the pairwise similarities. ii = np.tril_indices(distmat.shape[0], -1) pwise = distmat[ii] mdist = np.mean(pwise) print mdist # Generate a historgram of the pairwise similarities. plt.clf() plt.hist(pwise, 20, color='lightblue') plt.xlabel("Similarity", size=17) plt.ylabel("Frequency", size=17) pdf.savefig() # Do the clustering h = clust.average(distmat) print h print len(h) # Plot the dendrogram plt.figure(figsize=(16,10)) #plt.figure(linewidth=100) plt.clf() ax = plt.axes() for pos in 'right','bottom','top': ax.spines[pos].set_color('none') ax.xaxis.set_ticks_position('none') ax.yaxis.set_ticks_position('none') #ax.spines['left'].set_position(('outward', 10)) clust.dendrogram(h, get_leaves="true", count_sort="true",show_leaf_counts="false", color_threshold=1.5) #no_labels="true")
def similarity_matrix_plot(config_file, plot_file, burnin=0, max_clusters=None, min_cluster_size=0, samples=None, thin=1):
    """Save a clustered heat map of pairwise similarity between loci.

    Similarity is one minus the Hamming distance between the columns of the
    cluster-label trace. Loci whose cluster has fewer than
    ``min_cluster_size`` members are left out. Rows and columns are ordered
    by average linkage and annotated with their cluster colour. The figure
    is written to ``plot_file``.

    ``samples`` is accepted but not used by this function.
    """
    sb.set_style('whitegrid')

    # cluster id of every mutation, indexed by mutation id
    clustering = post_process.cluster_pyclone_trace(
        config_file, burnin, thin, max_clusters=max_clusters)
    labels = clustering.set_index('mutation_id')['cluster_id']

    color_map = utils.get_clusters_color_map(labels)

    # Keep only loci that belong to sufficiently large clusters.
    sizes = labels.value_counts()
    kept_clusters = sizes[sizes >= min_cluster_size].index
    labels = labels[labels.isin(kept_clusters)]

    trace_file = paths.get_labels_trace_file(config_file)
    labels_trace = trace.load_cluster_labels_trace(trace_file, burnin, thin)
    labels_trace = labels_trace[labels.index]

    condensed = pdist(labels_trace.values.T, 'hamming')
    Z = average(condensed)

    loci = labels_trace.columns
    sim_mat = 1 - pd.DataFrame(squareform(condensed), index=loci, columns=loci)

    cluster_colors = labels.map(color_map)

    # The figure grows with the number of loci.
    size = 0.12 * sim_mat.shape[0]

    g = sb.clustermap(
        sim_mat,
        cmap='Blues',
        col_colors=cluster_colors,
        row_colors=cluster_colors,
        col_linkage=Z,
        row_linkage=Z,
        figsize=(size, size),
    )

    heatmap_ax = g.ax_heatmap
    utils.set_tick_label_font_sizes(heatmap_ax, defaults.small_tick_label_font_size)
    utils.set_tick_label_rotations(heatmap_ax)
    heatmap_ax.set_xlabel('Loci', fontsize=defaults.axis_label_font_size)
    heatmap_ax.set_ylabel('Loci', fontsize=defaults.axis_label_font_size)

    g.fig.savefig(plot_file, bbox_inches='tight')
np.min(cdist(all_data.values, kmeans.cluster_centers_, 'euclidean'), axis=1)))
# NOTE(review): the line above closes a statement that begins before this
# excerpt.

# Plot the `error` values against `K` and save the figure as elbow.png.
plt.figure()
plt.plot(K, error, 'bx-')
plt.xlabel('K')
plt.ylabel('Error')
plt.savefig('elbow.png')
plt.close()

# Single-linkage dendrogram of all_data.
plt.figure()
single_linkage = hierarchy.single(all_data)
dn = hierarchy.dendrogram(single_linkage)
plt.savefig('single.png')
plt.close()

# Complete-linkage dendrogram of all_data.
plt.figure()
complete_linkage = hierarchy.complete(all_data)
dn = hierarchy.dendrogram(complete_linkage)
plt.savefig('complete_linkage.png')
plt.close()

# Average-linkage dendrogram of all_data.
plt.figure()
average_linkage = hierarchy.average(all_data)
dn = hierarchy.dendrogram(average_linkage)
plt.savefig('average_linkage.png')
plt.close()
print('done')