def process_hierarchy(inf, h, method):
    df = pd.read_csv(inf, header=0, index_col=0)
    df = df.fillna(0)
    strains = df.index
    # convert percent identity to a distance in [0, 1]
    df = 1 - (df / 100)
    # flatten matrix to condensed distance vector
    df_v = ssd.squareform(df, force='tovector', checks=False)
    if method == 'single':
        li = sch.single(df_v)
    elif method == 'complete':
        li = sch.complete(df_v)
    elif method == 'average':
        li = sch.average(df_v)
    elif method == 'weighted':
        li = sch.weighted(df_v)
    else:
        print('\nERROR: Please enter a valid clustering method\n')
        sys.exit()
    # cut the dendrogram at the given height (percent ID as a decimal, for example) to cluster OFUs
    hclus = cut_tree(li, height=h)
    hclus = pd.DataFrame(hclus, index=strains)
    # cut_tree names the first cluster "0"; bump all cluster IDs by 1
    hclus.iloc[:, 0] += 1
    return hclus
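# Hedged usage sketch for process_hierarchy: the module-level imports below are the
# names the function assumes (pd, ssd, sch, cut_tree, sys); 'ani_matrix.csv' and
# h=0.1 are made-up examples of a square percent-identity CSV and a cut height.
import sys
import pandas as pd
import scipy.spatial.distance as ssd
import scipy.cluster.hierarchy as sch
from scipy.cluster.hierarchy import cut_tree

ofu_table = process_hierarchy('ani_matrix.csv', h=0.1, method='weighted')
print(ofu_table.head())  # one OFU (cluster) ID per strain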
def get_centroids(train_pack):
    # unpack x_train, the distance/cluster threshold, and the clustering mode
    x_train = train_pack[0]
    distance_threshold = train_pack[1]
    clustering_type = train_pack[2]
    if clustering_type == 'Agglomerative':
        dist_mat = pdist(x_train, metric='euclidean')
        Z = weighted(dist_mat)
        dn = hierarchy.dendrogram(Z)
        labels = fcluster(Z, t=distance_threshold, criterion='distance')
        total_number = [0 for x in range(max(labels))]
        # accumulate cluster sums as numpy arrays so += adds elementwise
        centroids = [np.zeros(len(x_train[0])) for y in range(max(labels))]
        for j in range(len(x_train)):
            centroids[labels[j] - 1] += x_train[j]
            total_number[labels[j] - 1] += 1
        for j in range(len(centroids)):
            centroids[j] = np.divide(centroids[j], total_number[j])
    elif clustering_type == 'Agg_Var':
        if len(x_train) > 0:
            # initialize centroids with the first training point
            centroids = [x_train[0]]
            total_num = [1]
            for i in range(1, len(x_train)):
                distances = []
                indices = []
                for j in range(len(centroids)):
                    d = find_distance(x_train[i], centroids[j], distance_metric)
                    if d < distance_threshold:
                        distances.append(d)
                        indices.append(j)
                if len(distances) == 0:
                    # no centroid within the threshold: start a new cluster
                    centroids.append(x_train[i])
                    total_num.append(1)
                else:
                    # fold the point into the closest centroid as a running mean
                    min_d = np.argmin(distances)
                    centroids[indices[min_d]] = np.add(np.multiply(total_num[indices[min_d]], centroids[indices[min_d]]), x_train[i])
                    total_num[indices[min_d]] += 1
                    centroids[indices[min_d]] = np.divide(centroids[indices[min_d]], total_num[indices[min_d]])
        else:
            centroids = []
    elif clustering_type == 'k_means':
        # for k-means the threshold slot carries the number of clusters
        kmeans = KMeans(n_clusters=distance_threshold, random_state=0).fit(x_train)
        centroids = kmeans.cluster_centers_
    elif clustering_type == 'NCM':
        # nearest class mean: a single centroid at the average of the training data
        centroids = [np.average(x_train, 0)]
    return centroids
def write_tree():
    dmx = pd.read_csv("distance_matrix", index_col=0, sep="\t")
    ids = dmx.index.tolist()
    triu = np.square(dmx.values)
    hclust = weighted(triu)
    t = TreeNode.from_linkage_matrix(hclust, ids)
    nw = t.__str__().replace("'", "")
    outfile = open("bsr_matrix.tree", "w")
    outfile.write(nw)
    outfile.close()
def detectHierarchical(G, numClusters, sites, unipartite, fast):
    numNodes = G.number_of_nodes()
    # load cached pairwise weights when fast is requested, otherwise recompute and cache them
    if unipartite:
        if fast:
            W = pickle.load(open("weightsUnipartite.p", "rb"))
        else:
            W = getStartingWeights(G, numNodes, True)
            pickle.dump(W, open("weightsUnipartite.p", "wb"))
    else:
        if fast:
            W = pickle.load(open("weightsBipartite.p", "rb"))
        else:
            W = getStartingWeights(G, numNodes, False)
            pickle.dump(W, open("weightsBipartite.p", "wb"))
    if unipartite:
        Z = hierarchy.weighted(W)
        # pickle.dump(Z, open("ZUnipartite.p", "wb"))
        # Z = pickle.load(open("ZUnipartite.p", "rb"))
    else:
        Z = hierarchy.weighted(W)
        # pickle.dump(Z, open("ZBipartite.p", "wb"))
        # Z = pickle.load(open("ZBipartite.p", "rb"))
    membership = list(hierarchy.fcluster(Z, numClusters, 'maxclust'))
    # print("number of distinct clusters: ", len(set(membership)))
    # for i in range(len(set(membership))):
    #     k = 0
    #     for j in range(len(membership)):
    #         if membership[j] == i + 1:
    #             k += 1
    #     print(k, "nodes in cluster number", i + 1)
    #     k = 0
    clusters = {}
    for i in range(len(membership)):
        if i in sites:
            clusters[i] = membership[i]
    return clusters
def __apply_cluster_alg(cluster_data=[], alg="kmean", prior_cluster_num=2, t=0.155):
    """clustering"""
    if alg == "kmean":
        from scipy.cluster.vq import whiten, kmeans, vq
        cluster_data = whiten(cluster_data)
        centroids, _ = kmeans(cluster_data, prior_cluster_num, iter=250)
        idx, dist = vq(cluster_data, centroids)
        return idx, prior_cluster_num
    elif alg == "spec":
        from sklearn import cluster
        from sklearn.preprocessing import StandardScaler
        X = StandardScaler().fit_transform(cluster_data)
        spectral = cluster.SpectralClustering(n_clusters=prior_cluster_num, eigen_solver="arpack")
        spectral.fit(X)
        idx = spectral.labels_.astype(int)
        return idx, prior_cluster_num
    else:
        # hierarchical clustering
        # http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html
        import scipy.cluster.hierarchy as hcluster
        # needs a condensed distance matrix:
        # http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.distance.pdist.html
        import scipy.spatial.distance as dist
        distmat = dist.pdist(cluster_data, "minkowski")  # 'euclidean'
        if alg == "hflat":
            link = hcluster.linkage(distmat)
        elif alg == "hcomp":
            link = hcluster.complete(distmat)
        elif alg == "hweight":
            link = hcluster.weighted(distmat)
        elif alg == "havg":
            link = hcluster.average(distmat)
        idx = hcluster.fcluster(link, t=t, criterion="distance")
        import numpy as N
        post_cluster_num = len(N.unique(idx))
        print("# of channels established:", post_cluster_num)
        assert post_cluster_num < 64, "number of clusters too large to be biologically meaningful"
        return idx, post_cluster_num
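# Hypothetical call exercising the weighted-linkage branch of __apply_cluster_alg;
# the 50x4 random matrix and the t=0.155 distance cut-off are illustrative
# assumptions only, not values from the source.
import numpy as np

demo_data = np.random.rand(50, 4)
idx, n_clusters = __apply_cluster_alg(demo_data, alg="hweight", t=0.155)
print(n_clusters, "clusters;", np.bincount(idx)[1:], "members each")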
def write_tree(cluster_method):
    import scipy.spatial.distance as ssd
    dmx = pd.read_csv("distance_matrix", index_col=0, sep="\t")
    ids = dmx.index.tolist()
    triu = np.square(dmx.values)
    distArray = ssd.squareform(triu)
    if cluster_method == "average":
        hclust = average(distArray)
    elif cluster_method == "weighted":
        hclust = weighted(distArray)
    else:
        print("invalid cluster method chosen")
        sys.exit()
    t = TreeNode.from_linkage_matrix(hclust, ids)
    nw = t.__str__().replace("'", "")
    outfile = open("bsr_matrix.tree", "w")
    outfile.write(nw)
    outfile.close()
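# Self-contained sketch (hypothetical 3x3 matrix) of the squareform -> weighted step
# that write_tree relies on: the square distance matrix is condensed to a flat vector
# before being handed to the linkage function.
import numpy as np
import scipy.spatial.distance as ssd
from scipy.cluster.hierarchy import weighted

square = np.array([[0.0, 0.2, 0.5],
                   [0.2, 0.0, 0.4],
                   [0.5, 0.4, 0.0]])
condensed = ssd.squareform(square)  # -> [0.2, 0.5, 0.4]
print(weighted(condensed))          # (n-1) x 4 linkage matrix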
def get_tree(self):
    from ete3.coretype.tree import TreeError
    import numpy as np
    from skbio.tree import TreeNode
    from scipy.cluster.hierarchy import weighted
    ids = self.dmx.index.tolist()
    triu = np.triu(self.dmx.values)
    hclust = weighted(triu)
    t = TreeNode.from_linkage_matrix(hclust, ids)
    nw = t.__str__().replace("'", "")
    self.tree = Tree(nw)
    try:
        # midpoint root tree
        self.tree.set_outgroup(self.tree.get_midpoint_outgroup())
    except TreeError:
        self.log.error("Unable to midpoint root tree")
    self.tree.write(outfile=self.nw_path)
def write_tree(cluster_method):
    import scipy.spatial.distance as ssd
    dmx = pd.read_csv("distance_matrix", index_col=0, sep="\t")
    ids = dmx.index.tolist()
    # triu = np.square(dmx.as_matrix())
    triu = np.square(dmx.values)
    distArray = ssd.squareform(triu)
    if cluster_method == "average":
        hclust = average(distArray)
    elif cluster_method == "weighted":
        hclust = weighted(distArray)
    else:
        print("invalid cluster method chosen")
        sys.exit()
    t = TreeNode.from_linkage_matrix(hclust, ids)
    nw = t.__str__().replace("'", "")
    outfile = open("bsr_matrix.tree", "w")
    outfile.write(nw)
    outfile.close()
def get_tree(self):
    # Use decorator instead of if statement
    if self.tree_complete is False:
        from ete3.coretype.tree import TreeError
        import numpy as np
        # import matplotlib as mpl
        # mpl.use('TkAgg')
        from skbio.tree import TreeNode
        from scipy.cluster.hierarchy import weighted
        ids = ['{}.fasta'.format(i) for i in self.dmx.index.tolist()]
        triu = np.triu(self.dmx.values)
        hclust = weighted(triu)
        t = TreeNode.from_linkage_matrix(hclust, ids)
        nw = t.__str__().replace("'", "")
        self.tree = Tree(nw)
        # midpoint root tree
        try:
            self.tree.set_outgroup(self.tree.get_midpoint_outgroup())
        except TreeError as e:
            self.log.exception(e)
        self.tree.write(outfile=self.nw_path)
def CalculateClusterTree(self):
    fullMatrix = self.GenerateFullMatrix(self.results)
    dissMatrix = []
    labels = list(fullMatrix.keys())
    # build condensed dissimilarity matrix (upper triangle, row by row)
    for i in range(len(labels)):
        sampleNameI = labels[i]
        for j in range(i + 1, len(labels)):
            sampleNameJ = labels[j]
            dissMatrix.append(fullMatrix[sampleNameI][sampleNameJ])
    # calculate hierarchical cluster tree
    if self.radioSingleLinkage.GetValue():
        linkageMatrix = single(dissMatrix)
    elif self.radioUPGMA.GetValue():
        linkageMatrix = average(dissMatrix)
    elif self.radioCompleteLinkage.GetValue():
        linkageMatrix = complete(dissMatrix)
    elif self.radioWeighted.GetValue():
        linkageMatrix = weighted(dissMatrix)
    root = to_tree(linkageMatrix)
    # create Newick string
    return self.CreateNewickString(root, labels) + ';'
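# Tiny sketch of the weighted -> to_tree step used above, with a hand-made condensed
# dissimilarity list for three samples; CreateNewickString is the class's own helper
# and is not reproduced here.
from scipy.cluster.hierarchy import weighted, to_tree

dissMatrix = [0.3, 0.7, 0.5]          # upper triangle, row by row, for 3 samples
root = to_tree(weighted(dissMatrix))  # root ClusterNode of the dendrogram
print(root.get_count())               # 3 leaves under the root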
def testSelfRecruitment(pathWork, pathRai, numSpecies, sizeChop, numDB, numSeq):
    import os
    import random
    import numpy as np
    import scipy.cluster.vq as scv
    import scipy.cluster.hierarchy as sch
    #import networkx as nx
    #import community as cm
    #import matplotlib.pyplot as plt
    if pathWork[-1] != '/':
        pathWork = pathWork + '/'
    if pathRai[-1] != '/':
        pathRai = pathRai + '/'
    allList = os.listdir(pathWork)
    genomeList = []
    for file in allList:
        if file[-4:] == ".fna":
            genomeList.append(file)
    subset = random.sample(genomeList, numSpecies)
    for file in subset:
        # make sequences to be matched
        ensureDir(pathWork + "Sequences/")
        chopRandom(pathWork + file, pathWork + "Sequences/", sizeChop, sizeChop // 10, numSeq)
        # make sequences to be matched to
        ensureDir(pathWork + "DataBase/")
        chopRandom(pathWork + file, pathWork + "DataBase/", sizeChop, sizeChop // 10, numDB)
    # Make RAI databases
    os.system("{!s}raiphy -e .fna -m 2 -I {!s}Sequences/ -d {!s}seqs".format(pathRai, pathWork, pathWork))
    os.system("{!s}raiphy -e .fna -m 2 -I {!s}DataBase/ -d {!s}db".format(pathRai, pathWork, pathWork))
    # Data sets for further evaluation
    namesSq, RaiSq = rai2Numpy(pathWork + "seqs")
    namesDb, RaiDb = rai2Numpy(pathWork + "db")
    # namesAll = namesSq + namesDb
    RaiAll = np.concatenate([RaiSq, RaiDb])
    csvout = open("{!s}seqs.csv".format(pathWork), 'w')
    for r in RaiAll:
        csvout.write(",".join(str(x) for x in r) + "\n")
    csvout.close()
    # Run RAIphy
    os.system("{!s}raiphy -e .fna -m 0 -I {!s}Sequences/ -d {!s}db -o {!s}output".format(pathRai, pathWork, pathWork, pathWork))
    # Evaluate RAIphy results
    raiDict = {}
    for k in namesDb:
        raiDict[k] = [k]
    raiRes = open("{!s}output".format(pathWork), 'r')
    raiRes.readline()
    buf = raiRes.readline().rstrip()
    while buf:
        key = buf[3:]
        buf = raiRes.readline().rstrip()
        sq = buf[1:]
        raiDict[key].append(sq)
        buf = raiRes.readline().rstrip()
    raiList = []
    for k in raiDict:
        raiList.append(raiDict[k])
    print("RAIphy cluster list: {!s}".format(raiList))
    os.system("rm -r {!s}Sequences/".format(pathWork))
    os.system("rm -r {!s}seqs".format(pathWork))
    os.system("rm -r {!s}DataBase/".format(pathWork))
    os.system("rm -r {!s}db".format(pathWork))
    os.system("rm -r {!s}output".format(pathWork))
    print("Files removed")
    # K-means
    whitened = newWhiten(RaiAll)
    centroidsNoSeeds, _ = scv.kmeans(whitened, numSpecies)
    idsNoSeeds, _ = scv.vq(whitened, centroidsNoSeeds)
    # Want to implement k-means with initial seeds, but holy moly things go wrong in spectacular fashion.
    # TODO: k-means with initial seeds. Might have to code my own version.
    # Compute distance matrix and graph for other clusterings
    D = np.zeros((len(RaiAll), len(RaiAll)), dtype=float)
    for i in range(len(RaiAll)):
        for j in range(i):
            dist = np.linalg.norm(RaiAll[i] - RaiAll[j])
            D[i][j] = dist
            D[j][i] = dist
    #max = numpy.max(D)
    #G = nx.Graph()
    #for i in range(len(D)):
    #    for j in range(i):
    #        G.add_edge(namesAll[i], namesAll[j], weight = max-D[i][j])
    # hierarchy-based clusterings
    WeightedLink = sch.weighted(D)
    # mylen = len(namesDb)
    clustWeightedLink = sch.fcluster(WeightedLink, numSpecies, criterion='maxclust')
    hist, bins = np.histogram(clustWeightedLink, bins=numSpecies)
    print(hist)
#print a
distArray = ssd.squareform(matrix)
#z = linkage(b, method='single', metric='Y')
#print z
print(distArray)
z = linkage(distArray)  # euclidean distance and single linkage by default
print(z)
x = single(distArray)
print(x)
y = complete(distArray)
print(y)
a = average(distArray)
print(a)
b = weighted(distArray)
print(b)
"""
c = centroid(distArray)
print(c)
m = median(distArray)
print(m)
w = ward(distArray)
print(w)
"""
#d = dendrogram(z, labels=labels)
#d1 = dendrogram(x, labels=labels)
d2 = dendrogram(b, labels=labels)  # weighted linkage
plt.figure(1)
plt.title(
euclid_data = pdist(data, 'euclidean')
logging.info("Time: %s" % (time.time() - start))
logging.info("Clustering start")
start = time.time()
Z = hierarchy.complete(euclid_data)
worker.hierarchy_draw(Z, niks, 'study_complete_euclid', 0.4)
logging.info("Time complete: %s" % (time.time() - start))
start = time.time()
Z = hierarchy.average(euclid_data)
worker.hierarchy_draw(Z, niks, 'study_average_euclid', 0.25)
logging.info("Time average: %s" % (time.time() - start))
start = time.time()
Z = hierarchy.weighted(euclid_data)
worker.hierarchy_draw(Z, niks, 'study_weighted_euclid', 0.25)
logging.info("Time weighted: %s" % (time.time() - start))
logging.info("\nSecondStep")
logging.info("Distance other")
start = time.time()
sqeuclid_data = pdist(data, 'sqeuclidean')
cityblock_data = pdist(data, 'cityblock')
logging.info("Time: %s" % (time.time() - start))
logging.info("Clustering start")
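# Sketch of the pdist -> hierarchy.weighted -> dendrogram pipeline logged above, with
# made-up data and labels standing in for `data` and `niks`, and matplotlib in place
# of the custom worker.hierarchy_draw helper.
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist
from scipy.cluster import hierarchy

demo = np.random.rand(20, 5)
demo_labels = ['s%d' % i for i in range(20)]
Z = hierarchy.weighted(pdist(demo, 'euclidean'))
hierarchy.dendrogram(Z, labels=demo_labels, color_threshold=0.25)
plt.savefig('study_weighted_euclid.png')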