def test_somcluster(module): if module=='Bio.Cluster': from Bio.Cluster import somcluster elif module=='Pycluster': from Pycluster import somcluster else: raise 'Unknown module name', module print "test_somcluster:" # First data set weight1 = [ 1,1,1,1,1 ] data1 = array([[ 1.1, 2.2, 3.3, 4.4, 5.5], [ 3.1, 3.2, 1.3, 2.4, 1.5], [ 4.1, 2.2, 0.3, 5.4, 0.5], [ 12.1, 2.0, 0.0, 5.0, 0.0]]) mask1 = array([[ 1, 1, 1, 1, 1], [ 1, 1, 1, 1, 1], [ 1, 1, 1, 1, 1], [ 1, 1, 1, 1, 1]]) # Second data set weight2 = [ 1,1 ] data2 = array([[ 1.1, 1.2 ], [ 1.4, 1.3 ], [ 1.1, 1.5 ], [ 2.0, 1.5 ], [ 1.7, 1.9 ], [ 1.7, 1.9 ], [ 5.7, 5.9 ], [ 5.7, 5.9 ], [ 3.1, 3.3 ], [ 5.4, 5.3 ], [ 5.1, 5.5 ], [ 5.0, 5.5 ], [ 5.1, 5.2 ]]) mask2 = array([[ 1, 1 ], [ 1, 1 ], [ 1, 1 ], [ 1, 1 ], [ 1, 1 ], [ 1, 1 ], [ 1, 1 ], [ 1, 1 ], [ 1, 1 ], [ 1, 1 ], [ 1, 1 ], [ 1, 1 ], [ 1, 1 ]]) print "First data set:" clusterid, celldata = somcluster(data=data1, mask=mask1, weight=weight1, transpose=0, nxgrid=10, nygrid=10, inittau=0.02, niter=100, dist='e') print "Number of cluster ids is %d (should be %d)" % (len(clusterid), len(data1)) print "Grid is %d-dimensional (should be 2-dimensional)" % len(clusterid[0]) print "Second data set:" clusterid, celldata = somcluster(data=data2, mask=mask2, weight=weight2, transpose=0, nxgrid=10, nygrid=10, inittau=0.02, niter=100, dist='e') print "Number of cluster ids is %d (should be %d)" % (len(clusterid), len(data2)) print "Grid is %d-dimensional (should be 2-dimensional)" % len(clusterid[0]) print
def test_somcluster(self):
    """Run somcluster on two small matrices and sanity-check the result."""
    if TestCluster.module == 'Bio.Cluster':
        from Bio.Cluster import somcluster
    elif TestCluster.module == 'Pycluster':
        from Pycluster import somcluster
    # Each case is (weights, data matrix, mask); masks are all-ones, so
    # every value participates.
    first = ([1, 1, 1, 1, 1],
             numpy.array([[ 1.1, 2.2, 3.3, 4.4, 5.5],
                          [ 3.1, 3.2, 1.3, 2.4, 1.5],
                          [ 4.1, 2.2, 0.3, 5.4, 0.5],
                          [12.1, 2.0, 0.0, 5.0, 0.0]]),
             numpy.ones((4, 5), int))
    second = ([1, 1],
              numpy.array([[1.1, 1.2], [1.4, 1.3], [1.1, 1.5], [2.0, 1.5],
                           [1.7, 1.9], [1.7, 1.9], [5.7, 5.9], [5.7, 5.9],
                           [3.1, 3.3], [5.4, 5.3], [5.1, 5.5], [5.0, 5.5],
                           [5.1, 5.2]]),
              numpy.ones((13, 2), int))
    for weight, data, mask in (first, second):
        # 10x10 self-organizing map, Euclidean distance.
        clusterid, celldata = somcluster(data=data, mask=mask, weight=weight,
                                         transpose=0, nxgrid=10, nygrid=10,
                                         inittau=0.02, niter=100, dist='e')
        # One grid assignment per data row, each a 2-D (x, y) cell.
        self.assertEqual(len(clusterid), len(data))
        self.assertEqual(len(clusterid[0]), 2)
def test_somcluster(self):
    """Check somcluster output dimensions for two small data sets."""
    if TestCluster.module == 'Bio.Cluster':
        from Bio.Cluster import somcluster
    elif TestCluster.module == 'Pycluster':
        from Pycluster import somcluster

    def _check(weight, data, mask):
        # 10x10 SOM with Euclidean distance; every row must be assigned
        # to one (x, y) cell of the 2-D grid.
        clusterid, celldata = somcluster(data=data, mask=mask, weight=weight,
                                         transpose=0, nxgrid=10, nygrid=10,
                                         inittau=0.02, niter=100, dist='e')
        self.assertEqual(len(clusterid), len(data))
        self.assertEqual(len(clusterid[0]), 2)

    # First data set: 4 rows x 5 columns, nothing masked out.
    data1 = numpy.array([[ 1.1, 2.2, 3.3, 4.4, 5.5],
                         [ 3.1, 3.2, 1.3, 2.4, 1.5],
                         [ 4.1, 2.2, 0.3, 5.4, 0.5],
                         [12.1, 2.0, 0.0, 5.0, 0.0]])
    _check([1] * 5, data1, numpy.ones(data1.shape, int))
    # Second data set: 13 rows x 2 columns, nothing masked out.
    data2 = numpy.array([[1.1, 1.2], [1.4, 1.3], [1.1, 1.5], [2.0, 1.5],
                         [1.7, 1.9], [1.7, 1.9], [5.7, 5.9], [5.7, 5.9],
                         [3.1, 3.3], [5.4, 5.3], [5.1, 5.5], [5.0, 5.5],
                         [5.1, 5.2]])
    _check([1] * 2, data2, numpy.ones(data2.shape, int))
def test_somcluster(module): if module == 'Bio.Cluster': from Bio.Cluster import somcluster elif module == 'Pycluster': from Pycluster import somcluster else: raise 'Unknown module name', module print "test_somcluster:" # First data set weight1 = [1, 1, 1, 1, 1] data1 = array([[1.1, 2.2, 3.3, 4.4, 5.5], [3.1, 3.2, 1.3, 2.4, 1.5], [4.1, 2.2, 0.3, 5.4, 0.5], [12.1, 2.0, 0.0, 5.0, 0.0]]) mask1 = array([[1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]) # Second data set weight2 = [1, 1] data2 = array([[1.1, 1.2], [1.4, 1.3], [1.1, 1.5], [2.0, 1.5], [1.7, 1.9], [1.7, 1.9], [5.7, 5.9], [5.7, 5.9], [3.1, 3.3], [5.4, 5.3], [5.1, 5.5], [5.0, 5.5], [5.1, 5.2]]) mask2 = array([[1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1]]) print "First data set:" clusterid, celldata = somcluster(data=data1, mask=mask1, weight=weight1, transpose=0, nxgrid=10, nygrid=10, inittau=0.02, niter=100, dist='e') print "Number of cluster ids is %d (should be %d)" % (len(clusterid), len(data1)) print "Grid is %d-dimensional (should be 2-dimensional)" % len( clusterid[0]) print "Second data set:" clusterid, celldata = somcluster(data=data2, mask=mask2, weight=weight2, transpose=0, nxgrid=10, nygrid=10, inittau=0.02, niter=100, dist='e') print "Number of cluster ids is %d (should be %d)" % (len(clusterid), len(data2)) print "Grid is %d-dimensional (should be 2-dimensional)" % len( clusterid[0]) print
def test_somcluster(self):
    """somcluster must reject a bad distance name and cluster two data sets."""
    if TestCluster.module == 'Bio.Cluster':
        from Bio.Cluster import somcluster
    elif TestCluster.module == 'Pycluster':
        from Pycluster import somcluster
    # First data set: 4 rows x 5 columns, fully unmasked, unit weights.
    weight = [1, 1, 1, 1, 1]
    data = numpy.array([[1.1, 2.2, 3.3, 4.4, 5.5],
                        [3.1, 3.2, 1.3, 2.4, 1.5],
                        [4.1, 2.2, 0.3, 5.4, 0.5],
                        [12.1, 2.0, 0.0, 5.0, 0.0]])
    mask = numpy.array([[1, 1, 1, 1, 1],
                        [1, 1, 1, 1, 1],
                        [1, 1, 1, 1, 1],
                        [1, 1, 1, 1, 1]], int)
    # TODO - Use a context manager here once we drop Python 2.6
    # Distance should be one letter; assertRaises forwards keyword
    # arguments to the callable directly, so the previous "**{...}" dict
    # was unnecessary.
    self.assertRaises(ValueError, somcluster,
                      data=data, mask=mask, weight=weight, transpose=0,
                      nxgrid=10, nygrid=10, inittau=0.02, niter=100,
                      dist="euclidean")
    clusterid, celldata = somcluster(data=data, mask=mask, weight=weight,
                                     transpose=0, nxgrid=10, nygrid=10,
                                     inittau=0.02, niter=100, dist='e')
    # One (x, y) grid cell per data row.
    self.assertEqual(len(clusterid), len(data))
    self.assertEqual(len(clusterid[0]), 2)
    # Second data set: 13 rows x 2 columns, fully unmasked, unit weights.
    weight = [1, 1]
    data = numpy.array([[1.1, 1.2], [1.4, 1.3], [1.1, 1.5], [2.0, 1.5],
                        [1.7, 1.9], [1.7, 1.9], [5.7, 5.9], [5.7, 5.9],
                        [3.1, 3.3], [5.4, 5.3], [5.1, 5.5], [5.0, 5.5],
                        [5.1, 5.2]])
    mask = numpy.array([[1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1],
                        [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1],
                        [1, 1]], int)
    clusterid, celldata = somcluster(data=data, mask=mask, weight=weight,
                                     transpose=0, nxgrid=10, nygrid=10,
                                     inittau=0.02, niter=100, dist='e')
    self.assertEqual(len(clusterid), len(data))
    self.assertEqual(len(clusterid[0]), 2)
def main(): args = ParseArg() frag_l = args.frag_l # determin if intervals or bams is multiple if len(args.intervals) == 1 and not args.intervalnames: interval_names = ['interval'] else: interval_names = args.intervalnames if len(args.bams) == 1 and not args.bamnames: bam_names = ['bam'] else: bam_names = args.bamnames if len(args.bams) != len(bam_names) or len( args.intervals) != len(interval_names): print >> sys.stderr, "length of names are not matching length of files, please check your command" sys.exit(0) if len(args.bams) != len(frag_l): print >> sys.stderr, "number of BAMs are not matching number of frag_l, assign all to 300bp" frag_l = np.array([300] * len(args.bams)) # store bam files with indexing and count information bams = {} read_numbers = {} print >> sys.stderr, "## Starting read and index BAM files: " for i in range(len(args.bams)): temp_name = bam_names[i] print >> sys.stderr, " ## Indexing for bam file of '" + temp_name + "'" bams[temp_name] = pysam.Samfile(args.bams[i], 'rb') print >> sys.stderr, " ## counting total reads number <slow>" ss = 0 for chr in bams[temp_name].references: ss += bams[temp_name].count(chr) read_numbers[temp_name] = ss print >> sys.stderr # store interval files intervals = {} print >> sys.stderr, "## Starting reading intervals:" for i in range(len(args.intervals)): temp_name = interval_names[i] print >> sys.stderr, " ## Reading for interval file of '" + temp_name + "'\r", intervals[temp_name] = TableIO.parse(args.intervals[i], 'bed') print >> sys.stderr resol = args.resolution leng = args.length # draw heatmap if args.Heatmap: print >> sys.stderr, "## Start count reads" collects = {} interval_n = [0] order = {} for k, nab in enumerate(bam_names): collect = [] for l, name in enumerate(interval_names): if k >= 1: intervals[name] = TableIO.parse(args.intervals[l], 'bed') print >> sys.stderr, " ## counting for bam[" + nab + "] - interval[" + name + "]" H_counts = get_count(intervals[name], bams[nab], resol, leng, frag_l[k], 
args.direction, args.win_l) H_counts = H_counts * 5E7 / read_numbers[nab] H_counts = np.log(H_counts + 1) #H_counts=feature_scale(H_counts) if k == 0: if args.method_c == 'kmeans': centroids, _ = kmeans(H_counts, 5) idx, _ = vq(H_counts, centroids) print >> sys.stderr, " ## size of clusters using kmeansfor bam[" + nab + "] - interval[" + name + "]: " cluster_size = Counter(idx) for c in cluster_size: print >> sys.stderr, " Cluster[%d]:%d" % ( c, cluster_size[c]) order[name] = [ i[0] for i in sorted(enumerate(idx), key=lambda x: x[1]) ] elif args.method_c == 'somcluster': clusterid, _ = somcluster(data=H_counts, nxgrid=5, nygrid=5) order[name] = np.lexsort( (clusterid[:, 1], clusterid[:, 0])) elif args.method_c == 'hcluster': distMatrix = dist.pdist(H_counts) distSquareMatrix = dist.squareform(distMatrix) linkageMatrix = hier.linkage(distSquareMatrix) dendro = hier.dendrogram(linkageMatrix) order[name] = dendro['leaves'] interval_n.append(H_counts.shape[0]) H_counts = H_counts[order[name], :] collect.append(H_counts) collect = np.vstack(collect) collects[nab] = collect cum_interval_n = np.cumsum(interval_n) fig = pylab.figure(figsize=(3 * len(bam_names), 8)) print >> sys.stderr, "## Start draw heatmap for intereval" #heatmap #yticks aylabel = fig.add_axes([0.1, 0.1, 0.0, 0.8]) aylabel.set_yticks(cum_interval_n) aylabel.set_xticks([]) j = 0 width = 0.8 / len(bam_names) for name in bam_names: heatmap_oneBam(collects[name], fig, 0.15 + j * width, 0.7 * width, cum_interval_n, leng, name) j = j + 1 fig.savefig('heatmap_' + args.output) # print clustering information for first bam cluster_info = open("interval_with_cluster.txt", 'w') name = interval_names[-1] intervals[name] = TableIO.parse(args.intervals[l], 'bed') n = 0 for l in intervals[name]: if 'random' in l.chr: continue try: print >> cluster_info, '\t'.join( str(f) for f in [l.chr, l.start, l.stop, idx[n]]) except: print "Error: the number of intervals are not consistent" n = n + 1 # draw averge patterns if 
args.Average: print >> sys.stderr, "## Start count reads" collect = {} y_max = 0 for k, nab in enumerate(bam_names): for j, name in enumerate(interval_names): if k >= 1: intervals[name] = TableIO.parse(args.intervals[j], 'bed') print >> sys.stderr, " ## counting for bam[" + nab + "] - interval[" + name + "]" H_counts = get_count(intervals[name], bams[nab], resol, leng, frag_l[k], args.direction, args.win_l) print H_counts.shape[0] if name == 'interval': collect[nab] = np.sum( H_counts, axis=0) / H_counts.shape[0] * 5E7 / read_numbers[nab] y_max = max(y_max, max(collect[nab])) else: collect[(nab, name)] = np.sum( H_counts, axis=0) / H_counts.shape[0] * 5E7 / read_numbers[nab] y_max = max(y_max, max(collect[nab, name])) fig = plt.figure(figsize=(3 * len(bam_names), 4)) for i, nab in enumerate(bam_names): if len(interval_names) > 1: ax = plt.subplot2grid((1, len(bam_names)), (0, i)) for j, name in enumerate(interval_names): col = matplotlib.cm.Paired( (j + 1) * 1.0 / (len(interval_names) + 2), 1) ax.plot(np.array(range(-leng, leng, resol)) + resol / 2.0, collect[(nab, name)], color=col) ax.legend(interval_names, loc='upper right') ax.set_ylim(0, y_max + 1) ax.set_title(nab) else: col = matplotlib.cm.Paired( (i + 1) * 1.0 / (len(bam_names) + 2), 1) plt.plot(np.array(range(-leng, leng, resol)) + resol / 2.0, collect[nab], color=col) if len(interval_names) == 1: plt.legend(bam_names, bbox_to_anchor=(0., 0.95, 1., .100), loc=3, ncol=len(bam_names), mode="expand", borderaxespad=0., fontsize=5) #,prop={'size':15},fontsize=10) plt.ylim(0, y_max + 1) plt.xlabel('Distance to center') plt.ylabel('Average coverage for 5E7 reads') plt.tight_layout() fig.savefig('average_' + args.output)
from Bio.Cluster import somcluster
import numpy as np

# A small 4x4 demo matrix to feed the self-organizing map.
data = np.array([[1, 2, 3, 4],
                 [5, 6, 7, 8],
                 [9, 10, 11, 12],
                 [0, 1, 2, 3]])
clusterid, celldata = somcluster(data)
print("clusterid:", clusterid, "\n", "celldata:", celldata)
# Something like SOM clustering... ah, this is hard.
def main(): args=ParseArg() frag_l=args.frag_l # determin if intervals or bams is multiple if len(args.intervals)==1 and not args.intervalnames: interval_names=['interval'] else: interval_names=args.intervalnames if len(args.bams)==1 and not args.bamnames: bam_names=['bam'] else: bam_names=args.bamnames if len(args.bams)!=len(bam_names) or len(args.intervals)!=len(interval_names): print >>sys.stderr,"length of names are not matching length of files, please check your command" sys.exit(0) if len(args.bams)!=len(frag_l): print >>sys.stderr,"number of BAMs are not matching number of frag_l, assign all to 300bp" frag_l=np.array([300]*len(args.bams)) # store bam files with indexing and count information bams={} read_numbers={} print >> sys.stderr, "## Starting read and index BAM files: " for i in range(len(args.bams)): temp_name=bam_names[i] print >> sys.stderr," ## Indexing for bam file of '"+temp_name+"'" bams[temp_name]=pysam.Samfile(args.bams[i],'rb') print >> sys.stderr," ## counting total reads number <slow>" ss=0 for chr in bams[temp_name].references: ss+=bams[temp_name].count(chr) read_numbers[temp_name]=ss print >> sys.stderr # store interval files intervals={} print >> sys.stderr, "## Starting reading intervals:" for i in range(len(args.intervals)): temp_name=interval_names[i] print >> sys.stderr," ## Reading for interval file of '"+temp_name+"'\r", intervals[temp_name]=TableIO.parse(args.intervals[i],'bed') print >> sys.stderr resol=args.resolution leng=args.length # draw heatmap if args.Heatmap: print >> sys.stderr,"## Start count reads" collects={} interval_n=[0] order={} for k,nab in enumerate(bam_names): collect=[] for l,name in enumerate(interval_names): if k>=1: intervals[name]=TableIO.parse(args.intervals[l],'bed') print >> sys.stderr, " ## counting for bam["+nab+"] - interval["+name+"]" H_counts=get_count(intervals[name],bams[nab],resol,leng,frag_l[k],args.direction,args.win_l) H_counts=H_counts*5E7/read_numbers[nab] H_counts=np.log(H_counts+1) 
#H_counts=feature_scale(H_counts) if k==0: if args.method_c=='kmeans': centroids,_=kmeans(H_counts,5) idx,_=vq(H_counts,centroids) print >> sys.stderr," ## size of clusters using kmeansfor bam["+nab+"] - interval["+name+"]: " cluster_size=Counter(idx) for c in cluster_size: print >> sys.stderr," Cluster[%d]:%d"%(c,cluster_size[c]) order[name]=[i[0] for i in sorted(enumerate(idx), key=lambda x:x[1])] elif args.method_c=='somcluster': clusterid,_=somcluster(data=H_counts,nxgrid=5,nygrid=5) order[name] = np.lexsort((clusterid[:,1],clusterid[:,0])) elif args.method_c=='hcluster': distMatrix = dist.pdist(H_counts) distSquareMatrix = dist.squareform(distMatrix) linkageMatrix = hier.linkage(distSquareMatrix) dendro = hier.dendrogram(linkageMatrix) order[name] = dendro['leaves'] interval_n.append(H_counts.shape[0]) H_counts=H_counts[order[name],:] collect.append(H_counts) collect=np.vstack(collect) collects[nab]=collect cum_interval_n=np.cumsum(interval_n) fig=pylab.figure(figsize=(3*len(bam_names),8)) print >> sys.stderr,"## Start draw heatmap for intereval" #heatmap #yticks aylabel=fig.add_axes([0.1,0.1,0.0,0.8]) aylabel.set_yticks(cum_interval_n) aylabel.set_xticks([]) j=0 width=0.8/len(bam_names) for name in bam_names: heatmap_oneBam(collects[name],fig,0.15+j*width,0.7*width,cum_interval_n,leng,name) j=j+1 fig.savefig('heatmap_'+args.output) # print clustering information for first bam cluster_info=open("interval_with_cluster.txt",'w') name=interval_names[-1] intervals[name]=TableIO.parse(args.intervals[l],'bed') n=0 for l in intervals[name]: if 'random' in l.chr: continue try: print >>cluster_info,'\t'.join(str(f) for f in [l.chr,l.start,l.stop,idx[n]]) except: print "Error: the number of intervals are not consistent" n=n+1 # draw averge patterns if args.Average: print >> sys.stderr,"## Start count reads" collect={} y_max=0 for k,nab in enumerate(bam_names): for j,name in enumerate(interval_names): if k>=1: intervals[name]=TableIO.parse(args.intervals[j],'bed') print 
>> sys.stderr, " ## counting for bam["+nab+"] - interval["+name+"]" H_counts=get_count(intervals[name],bams[nab],resol,leng,frag_l[k],args.direction,args.win_l) print H_counts.shape[0] if name=='interval': collect[nab]=np.sum(H_counts,axis=0)/H_counts.shape[0]*5E7/read_numbers[nab] y_max=max(y_max,max(collect[nab])) else: collect[(nab,name)]=np.sum(H_counts,axis=0)/H_counts.shape[0]*5E7/read_numbers[nab] y_max=max(y_max,max(collect[nab,name])) fig=plt.figure(figsize=(3*len(bam_names),4)) for i,nab in enumerate(bam_names): if len(interval_names)>1: ax = plt.subplot2grid((1,len(bam_names)),(0,i)) for j,name in enumerate(interval_names): col=matplotlib.cm.Paired((j+1)*1.0/(len(interval_names)+2),1) ax.plot(np.array(range(-leng,leng,resol))+resol/2.0,collect[(nab,name)],color=col) ax.legend(interval_names,loc='upper right') ax.set_ylim(0,y_max+1) ax.set_title(nab) else: col=matplotlib.cm.Paired((i+1)*1.0/(len(bam_names)+2),1) plt.plot(np.array(range(-leng,leng,resol))+resol/2.0,collect[nab],color=col) if len(interval_names)==1: plt.legend(bam_names,bbox_to_anchor=(0., 0.95, 1., .100), loc=3, ncol=len(bam_names), mode="expand", borderaxespad=0.,fontsize=5)#,prop={'size':15},fontsize=10) plt.ylim(0,y_max+1) plt.xlabel('Distance to center') plt.ylabel('Average coverage for 5E7 reads') plt.tight_layout() fig.savefig('average_'+args.output)