def get_overlaping_clusters(self, region, overlap=1):
    """Collect the reads of the BAM file that overlap ``region``.

    The starting position is obtained from ``self.get_bam_tell(region)``.
    The file is only scanned when that position is truthy or when
    ``region.start`` is below ``LINEAR_SIZE*4``; otherwise nothing is read.

    Scanning stops at the first invalid line, or at the first read that
    does not overlap enough and either starts after ``region.end`` or has
    a different name than the region.

    Debug information about the result is printed to stdout.

    :param region: region to look for; its ``name``, ``start`` and ``end``
        are used.
    :param overlap: minimum value of ``read.overlap(region)`` for a read
        to be kept.
    :return: list of the matching ``Cluster`` objects, in file order.
    """
    found = []
    bam_tell, read_start = self.get_bam_tell(region)
    print("TELL %s %s" % (bam_tell, read_start))

    if bam_tell or region.start < LINEAR_SIZE*4:
        reader = BamReader(self.bam_path, self.logger, bam_tell, read_start, self.chr_dict_inv)
        for raw_line in reader:
            read = Cluster(read=SAM, cached=False, read_half_open=self.read_half_open, rounding=self.rounding)
            try:
                read.read_line(raw_line)
            except InvalidLine:
                print("Invalid line, .bam or .bai corrupt")
                break

            if read.overlap(region) >= overlap:
                found.append(read)
            elif read.start > region.end or read.name != region.name:
                # Past the region (or on another chromosome): nothing else can match.
                break

    # Debug summary of what was collected.
    if not found:
        print("No clusters found!")
    else:
        print("Num clusters %s" % len(found))
        first = found[0]
        print("first: %s %s" % (first.start, first.end))
        if len(found) > 1:
            last = found[-1]
            print("end: %s %s" % (last.start, last.end))

    print("")
    return found
def test_extend_bug(self):
    """Adding a '+' and a '-' BED read, each extended by 100, gives length 200."""
    plus_read = Cluster(read=BED, write=PK)
    plus_read.read_line('chr3 1 35 666 noname +')
    minus_read = Cluster(read=BED, write=PK)
    minus_read.read_line('chr3 156 200 666 noname -')

    for read in (plus_read, minus_read):
        read.extend(100)

    merged = plus_read + minus_read
    self.assertEqual(200, len(merged))
def test_split_subtract_result(self):
    """Check the lines written by the pieces of ``absolute_split(threshold=0)``.

    The input profile contains zero and negative levels; every piece that
    is returned is compared against the expected line at the same index.
    """
    sub_result = Cluster(write_half_open=True, cached=True)
    sub_result.read_line('chr4 1 300 20:1|40:0|20:3|20:0.3|10:-6|80:1|10:0')
    pieces = sub_result.absolute_split(threshold=0)

    expected = [
        'chr4\t0\t20\t20:1.00\t1.0\t.\t10\t20.0\n',
        'chr4\t60\t100\t20:3.00|20:0.30\t3.0\t.\t70\t66.0\n',
        'chr4\t110\t190\t80:1.00\t1.0\t.\t150\t80.0\n',
    ]

    for position, piece in enumerate(pieces):
        self.assertEqual(piece.write_line(), expected[position])
def setUp(self):
    """Create the cluster fixtures stored on the test case instance."""
    # Two overlapping tags built directly from coordinates.
    self.tag_list_short = [Cluster('chr1', 1, 10, read=PK), Cluster('chr1', 5, 14, read=PK)]

    # Short profile read from a pk-style line.
    short = Cluster(rounding=True, read=PK)
    short.read_line('chr1 1 14 4:1|6:2|4:1')
    self.cluster_short = short

    # Longer profile that is both read and written as pk.
    pk = Cluster(rounding=True, read=PK, write=PK)
    pk.read_line('chr1 1 15 4:1|1:2|2:1|3:4|2:5|2:2|1:1')
    self.pk_cluster = pk

    # Single bed read.
    bed = Cluster(read=BED, write=BED, rounding=True)
    bed.read_line('chr1 1 100 hola 666 +')
    self.bed_tag = bed

    # Two clusters left without any line read into them.
    self.cluster1 = Cluster(rounding=True, read=PK)
    self.cluster2 = Cluster(rounding=True, read=PK)
def test_split(self):
    """``split(0.01)`` must return, in order, the three expected sub-clusters."""
    double_cluster = Cluster(rounding=True)
    double_cluster.read_line('chr1 100 215 5:1|10:5|5:7|5:80|5:1|5:40|15:1|10:2|5:3|5:8|10:6|10:5|10:4|10:3|6:2')
    results = double_cluster.split(0.01)

    expected_lines = (
        'chr1 100 125 5:1|10:5|5:7|5:80|2:1',
        'chr1 128 141 2:1|5:40|7:1',
        'chr1 143 215 7:1|10:2|5:3|5:8|10:6|10:5|10:4|10:3|6:2',
    )
    correct_clusters = [Cluster(rounding=True) for _ in expected_lines]
    for wanted, text in zip(correct_clusters, expected_lines):
        wanted.read_line(text)

    for index, wanted in enumerate(correct_clusters):
        self.assertEqual(results[index].write_line(), wanted.write_line())
def test_wig_read_write(self):
    """Read three wig lines, then write them half-open and after ``write_as(WIG, False)``."""
    cluster = Cluster(read=WIG, write=WIG, read_half_open=True, write_half_open=True, rounding=True)
    for wig_line in ('chr2 1 10 1', 'chr2 10 16 2', 'chr2 16 26 1'):
        cluster.read_line(wig_line)

    half_open_output = 'chr2\t1\t10\t1\nchr2\t10\t16\t2\nchr2\t16\t26\t1\n'
    self.assertEqual(cluster.write_line(), half_open_output)

    # Switch the writer with the second argument set to False.
    cluster.write_as(WIG, False)
    closed_output = 'chr2\t2\t10\t1\nchr2\t11\t16\t2\nchr2\t17\t26\t1\n'
    self.assertEqual(cluster.write_line(), closed_output)
def test_simple_ucsc_representation(self):
    """A single half-open bed read written as half-open wig.

    Confirmed visually at the UCSC browser:

        track name=simple_read visibility=full
        chr3 101 200 noname 555 +

        track type=wiggle_0 name=the_test visibility=full
        chr3 101 200 1
    """
    options = dict(read=BED, write=WIG, read_half_open=True, write_half_open=True,
                   rounding=True, cached=True)
    cluster = Cluster(**options)
    cluster.read_line('chr3 101 200 noname 555 +')
    written = cluster.write_line()
    self.assertEqual(written, 'chr3\t101\t200\t1\n')
def test_eq(self):
    """Clusters read from the same line are equal; a shorter profile is not."""
    reference = Cluster(read=PK)
    same = Cluster(read=PK)
    shorter = Cluster(read=PK)

    full_line = 'chr1 1 15 4:1|1:2|2:1|3:4|2:5|2:2|1:1'
    reference.read_line(full_line)
    same.read_line(full_line)
    # Same coordinates, but the last level of the profile is missing.
    shorter.read_line('chr1 1 15 4:1|1:2|2:1|3:4|2:5|2:2')

    self.assertEqual(reference, same)
    self.assertNotEqual(reference, shorter)
def test_add2(self):
    """Reading four bed lines into one cluster accumulates their profile."""
    cluster = Cluster(read=BED)
    # Three identical reads followed by one starting 1000 positions later.
    for _ in range(3):
        cluster.read_line('chr1 1 20000 666 hola +')
    cluster.read_line('chr1 1001 20000 666 hola +')

    expected = 'chr1\t1\t20000\t1000:3.00|19000:4.00\t4.0\t+\t10500\t79000.0\n'
    self.assertEqual(cluster.write_line(), expected)
def read_and_extend(self, cluster, line, extension):
    """Read ``line``, extend it by ``extension`` and accumulate it in ``cluster``.

    When ``cluster`` is empty the line is read straight into it. Otherwise
    the line is read into a scratch half-open bed cluster, extended, and
    added in place to ``cluster``.

    :return: ``cluster`` (after the in-place addition, when one happened).
    """
    scratch = Cluster(read=BED, write=BED, read_half_open=True, write_half_open=True, rounding=True)

    # Choose where the new read lands: the accumulator itself or the scratch.
    receiver = cluster if cluster.is_empty() else scratch
    receiver.read_line(line)
    receiver.extend(extension)

    if receiver is scratch:
        cluster += scratch
    return cluster
def test_bed_to_half_open_wig(self):
    """Two overlapping half-open bed reads written as half-open wig.

    Confirmed visually at the UCSC browser:

        track name=simple_cluster visibility=full
        chr1 1 100 hola 666 +
        chr1 10 130 hola 666 +

        track type=wiggle_0 name=the_test visibility=full
        chr1 1 10 1
        chr1 10 100 2
        chr1 100 130 1
    """
    cluster = Cluster(read=BED, write=WIG, read_half_open=True, write_half_open=True,
                      rounding=True, cached=True)
    for bed_line in ('chr1 1 100 hola 666 +', 'chr1 10 130 hola 666 +'):
        cluster.read_line(bed_line)

    expected = 'chr1\t1\t10\t1\nchr1\t10\t100\t2\nchr1\t100\t130\t1\n'
    self.assertEqual(cluster.write_line(), expected)
def test_is_significant(self):
    """Exercise ``is_significant`` with the "numreads" mode and with the default mode."""
    cluster = Cluster(rounding=True)
    # The profile below has an area of 35.
    cluster.read_line('chr1 1 15 4:1|1:2|2:1|3:4|2:5|2:2|1:1')

    # Explicit "numreads" mode.
    self.assertTrue(cluster.is_significant(5, "numreads"))
    self.assertTrue(cluster.is_significant(34, "numreads"))
    self.assertFalse(cluster.is_significant(36, "numreads"))

    # Default mode.
    self.assertFalse(cluster.is_significant(20))
    self.assertTrue(cluster.is_significant(1))
    self.assertTrue(cluster.is_significant(5))
def test_comparison(self):
    """Ordering operators between a Region and Clusters on several chromosomes."""
    region = Region("chr1", 1, 100)

    same_coords = Cluster(read=BED)
    same_coords.read_line("chr1 1 100")

    on_chr4 = Cluster(read=BED)
    on_chr4.read_line("chr4 1000 1010")

    on_chr5 = Cluster(read=BED)
    on_chr5.read_line("chr5 3 103")

    self.assertTrue(region < on_chr4)
    self.assertTrue(on_chr4 < on_chr5)
    self.assertFalse(region > on_chr5)
    # Same coordinates: not strictly lower, but lower-or-equal.
    self.assertFalse(region < same_coords)
    self.assertTrue(region <= same_coords)
def test_bug_contiguous_peaks(self):
    """A cluster ending at 850408 and one starting at 850408 can be added and intersect."""
    left = Cluster(rounding=True, read=PK, write=PK)
    right = Cluster(rounding=True, read=PK, write=PK)
    left.read_line('chr1 849917 850408 8:2|10:4|80:5|23:6|29:7|8:5|10:3|39:2|12:3|29:4|5:3|18:4|41:3|30:4|15:5|12:4|34:3|59:2|30:1')
    right.read_line('chr1 850408 850648 66:2|25:3|59:4|66:2|25:1 +')

    # The sum is not inspected; the addition only has to complete.
    _ = left + right
    self.assertTrue(left.intersects(right))
def test_sub_and_print(self):
    """Subtracting two clusters and writing the result yields three lines."""
    minuend = Cluster()
    subtrahend = Cluster(write_half_open=False)
    minuend.read_line('chr1 1 1000 10:2|10:4|80:5|500:7|100:7|100:5')
    subtrahend.read_line('chr1 11 1000 10:4|80:5|500:6|100:7|99:5|1:4.99')

    difference = minuend - subtrahend

    expected = ('chr1\t1\t10\t10:2.00\t2.0\t.\t5\t20.0\n'
                'chr1\t101\t600\t500:1.00\t1.0\t.\t350\t500.0\n'
                'chr1\t800\t800\t1:0.01\t0.01\t.\t800\t0.01\n')
    self.assertEqual(difference.write_line(), expected)
def test_is_contiguous_wig(self):
    """Half-open wig clusters where one ends where the other starts are contiguous."""
    upstream = Cluster(read=WIG, read_half_open=True)
    upstream.read_line('chr1 1599888 1599949 1.77')

    downstream = Cluster(read=WIG, write=WIG, read_half_open=True)
    downstream.read_line('chr1 1599949 1600001 2.65')

    self.assertTrue(upstream.is_contiguous(downstream))
def test_normalized_counts(self):
    """Check ``Region.normalized_counts`` for several normalization options.

    Five identical reads are added to a plain region and to a region
    created with ``exome_size=200``.
    """
    total_number_reads = 1e7
    region = Region("chr1", 1, 300)
    region_bed12 = Region("chr1", 1, 300, exome_size=200)

    read = Cluster(read=BED)
    for _ in range(0, 5):
        read.read_line("chr1 1 100")
        for target in (region, region_bed12):
            target.add_tags(read, True)
        read.clear()

    # Simple counts.
    self.assertEqual(region.normalized_counts(), 5.)
    # rpkm.
    rpkm = region.normalized_counts(region_norm=True, total_n_norm=True, total_reads=total_number_reads)
    self.assertEqual(rpkm, 1.666666666666667)
    # rpkm using the exome size.
    rpkm_exome = region_bed12.normalized_counts(region_norm=True, total_n_norm=True, total_reads=total_number_reads)
    self.assertEqual(rpkm_exome, 2.5)
    # With pseudocounts.
    self.assertEqual(region.normalized_counts(pseudocount=True), 6.)
    everything = region.normalized_counts(region_norm=True, total_n_norm=True, total_reads=total_number_reads,
                                          regions_analyzed=10000, pseudocount=True)
    self.assertEqual(everything, 1.998001998001998)
def test_sub_fast(self):
    """In-place subtraction of a control cluster from an experiment cluster."""
    # Experiment line taken from random/experiment.pk
    experiment = Cluster(rounding=True)
    experiment.read_line("chr1 1 1107 101:2|7:1 2.0 . 263 238.0")

    control = Cluster()
    control.read_line("chr1 46 1222 47:1|54:2|47:1 2.0 . 71331 202.0")

    experiment -= control

    # Expected line: chr1 1 92 45:2|47:1 2.0 . 23 137.0
    expected = 'chr1\t1\t92\t45:2|47:1\t2.0\t.\t23\t137.0\n'
    self.assertEqual(experiment.write_line(), expected)
def test_intersects(self):
    """``intersects`` for touching, different-chromosome and overlapping clusters."""
    # Sharing position 10 counts as intersecting.
    touching = Cluster('chr1', 1, 10).intersects(Cluster('chr1', 10, 14))
    self.assertTrue(touching)
    # Different chromosomes never intersect.
    other_chromosome = Cluster('chr2', 1, 10).intersects(Cluster('chr1', 4, 14))
    self.assertFalse(other_chromosome)
    overlapping = Cluster('chr1', 1, 10).intersects(Cluster('chr1', 4, 14))
    self.assertTrue(overlapping)

    left = Cluster(read=PK, rounding=True)
    left.read_line('chr1 1 100 100:1')
    right = Cluster(read=PK, rounding=True)
    right.read_line('chr1 100 199 100:1')

    # The sum is not inspected; the addition only has to complete.
    _ = left + right
    self.assertTrue(left.intersects(right))
def test_add_pk(self):
    """The ``+`` operator on two pk clusters merges their profiles."""
    longer = Cluster(read=PK)
    shorter = Cluster(read=PK)
    longer.read_line('chr1\t1\t145\t9:2.00|41:3.00|50:2.00|45:1.00\n')
    shorter.read_line('chr1\t1\t125\t9:4.00|41:3.00|30:2.00|45:1.00\n')

    total = longer + shorter

    expected = 'chr1\t1\t145\t50:6.00|30:4.00|20:3.00|25:2.00|20:1.00\t6.0\t.\t25\t550.0\n'
    self.assertEqual(total.write_line(), expected)
def test_subtract_with_gaps(self):
    """In-place subtraction when the subtracted profile contains zero levels."""
    minuend = Cluster()
    subtrahend = Cluster()
    minuend.read_line("chr2 1 100 30:1|50:2|40:1|3000:3")
    subtrahend.read_line("chr2 1 100 30:1|50:0|40:1|200:0|5000:1")

    minuend -= subtrahend

    expected_profile = [[50, 2.0], [40, 0.0], [200, 3.0], [2800, 2.0]]
    self.assertEqual(minuend._profile, expected_profile)
def get_overlaping_clusters(self, region, overlap=1):
    """Collect the reads that ``samtools view`` reports for ``region``.

    Runs ``samtools view <bam_path> <name>:<start>-<end>`` and parses every
    non-empty output line as a SAM read. Parsing stops at the first invalid
    line, or at the first read that does not overlap enough and either
    starts after ``region.end`` or has a different name than the region.

    samtools is invoked with an argument list and without a shell, so a
    ``bam_path`` or region name containing spaces or shell metacharacters
    is passed through verbatim instead of being interpreted by the shell.

    :param region: region to query; its ``name``, ``start`` and ``end``
        are used.
    :param overlap: minimum value of ``read.overlap(region)`` for a read
        to be kept.
    :return: list of the matching ``Cluster`` objects, in samtools output
        order. Empty when samtools produced no output or could not be
        launched.
    """
    clusters = []
    self.logger.debug('Launching Samtools for %s...'%region)
    command = [
        "samtools",
        "view",
        "%s" % self.bam_path,
        "%s:%s-%s" % (region.name, region.start, region.end),
    ]
    try:
        proc = subprocess.Popen(command, stdout=subprocess.PIPE)
    except OSError as launch_error:
        # With a shell, a missing samtools binary simply produced no output;
        # keep returning an empty result instead of raising to the caller.
        self.logger.debug('Could not launch samtools: %s' % launch_error)
        return clusters

    out, _ = proc.communicate()
    self.logger.debug('... done')

    lines = [text for text in out.split("\n") if text]
    self.logger.debug('Numlines in %s: %s'%(region, len(lines)))

    for line in lines:
        c = Cluster(read=SAM, cached=False, read_half_open=self.read_half_open, rounding=self.rounding)
        try:
            c.read_line(line)
        except InvalidLine:
            print("Invalid line, .bam or .bai corrupt")
            break

        if c.overlap(region) >= overlap:
            clusters.append(c)
        elif c.start > region.end or c.name != region.name:
            # Past the region (or on another chromosome): nothing else can match.
            break

    return clusters
def test_get_profile(self):
    """The metacluster of a region with two separated reads has a zero level between them."""
    region = Region(start=1, end=1999)

    first_read = Cluster(read=BED)
    first_read.read_line('chr4 1 40')
    region.add_tags(first_read, True)

    second_read = Cluster(read=BED, read_half_open=True)
    second_read.read_line('chr4 400 500')
    region.add_tags(second_read, True)

    meta = region.get_metacluster()
    expected_levels = [[40, 1.0], [360, 0.0], [100, 1.0]]
    self.assertEqual(meta._levels, expected_levels)
def test_FDR(self):
    """``get_FDR_clusters`` returns one cluster for 50 stacked tags plus a distant one."""
    region = Region('', 1, 1999)

    # Fifty tags, each shifted one position from the previous one.
    stacked = []
    for offset in range(0, 50):
        tag = Cluster()
        tag.read_line('chr4 %s %s 20:1'%(offset, offset+50))
        stacked.append(tag)
    region.add_tags(stacked, True)

    # One extra tag far away from the stack.
    distant = Cluster()
    distant.read_line('chr4 55555 55558 7:1')
    region.add_tags(distant)

    significant = region.get_FDR_clusters()
    self.assertEqual(len(significant), 1)
def test_is_empty(self):
    """``is_empty`` before and after reading a line, and after subtracting an identical cluster."""
    pk_line = 'chr1 1 15 4:1|1:2|2:1|3:4|2:5|2:2|1:1'

    # bed reader
    bed_cluster = Cluster(read=BED)
    self.assertEqual(bed_cluster.is_empty(), True)
    bed_cluster.read_line('chr1 10 130 666 hola +')
    self.assertEqual(bed_cluster.is_empty(), False)

    # pk reader
    pk_cluster = Cluster(read=PK)
    self.assertEqual(pk_cluster.is_empty(), True)
    pk_cluster.read_line(pk_line)
    self.assertEqual(pk_cluster.is_empty(), False)

    # A cluster minus an identical one is empty.
    twin = Cluster(read=PK)
    twin.read_line(pk_line)
    leftover = pk_cluster - twin
    self.assertEqual(leftover.is_empty(), True)
def DifferentChromosome(self):
    """Add two clusters with the same profile that lie on chr1 and chr2.

    The result of the addition is discarded; nothing is returned.
    """
    profile = '1 15 4:1|1:2|2:10|3:4|2:15|2:2|1:1'

    on_chr1 = Cluster(read=PK)
    on_chr1.read_line('chr1 ' + profile)
    on_chr2 = Cluster(read=PK)
    on_chr2.read_line('chr2 ' + profile)

    _ = on_chr1 + on_chr2
def read_invalid_lines(self):
    """Feed the same pk-formatted line 100 times to a cluster that reads SAM."""
    cluster = Cluster(read=SAM)
    pk_line = 'chr1 1 15 4:1|1:2|2:10|3:4|2:15|2:2|1:1'
    attempts = 0
    while attempts < 100:
        cluster.read_line(pk_line)
        attempts += 1
def test_cluster_pk(self):
    """Reading two pk lines into the same cluster merges their profiles."""
    cluster = Cluster(read=PK)
    pk_lines = (
        'chr1\t1\t145\t9:2.00|41:3.00|50:2.00|45:1.00\n',
        'chr1\t1\t125\t9:4.00|41:3.00|30:2.00|45:1.00\n',
    )
    for pk_line in pk_lines:
        cluster.read_line(pk_line)

    expected = 'chr1\t1\t145\t50:6.00|30:4.00|20:3.00|25:2.00|20:1.00\t6.0\t.\t25\t550.0\n'
    self.assertEqual(cluster.write_line(), expected)
def test_add_bed(self):
    """The ``+`` operator on bed clusters with several overlap layouts.

    The sum with the 1-200 read is computed but its output is not checked.
    """
    bed_lines = (
        'chr1 1 100 666 hola +',
        'chr1 1 100 666 hola +',
        'chr1 1 50 666 hola +',
        'chr1 10 145 666 hola +',
        'chr1 45 95 666 hola +',
        'chr1 1 200 666 hola +',
    )
    reads = [Cluster(read=BED) for _ in bed_lines]
    for read, bed_line in zip(reads, bed_lines):
        read.read_line(bed_line)
    base, duplicate, shorter, shifted, inner, longer = reads

    same_span = base + duplicate
    shared_start = base + shorter
    shifted_right = shifted + base
    #result4 = cluster1 + cluster3 + cluster4
    contained = base + inner
    _ = base + longer

    self.assertEqual(same_span.write_line(), 'chr1\t1\t100\t100:2.00\t2.0\t+\t50\t200.0\n')
    self.assertEqual(shared_start.write_line(), 'chr1\t1\t100\t50:2.00|50:1.00\t2.0\t+\t25\t150.0\n')
    self.assertEqual(shifted_right.write_line(), 'chr1\t1\t145\t9:1.00|91:2.00|45:1.00\t2.0\t+\t55\t236.0\n')
    #self.assertEqual(result4.write_line(), 'chr1\t1\t145\t9:2.00|41:3.00|50:2.00|45:1.00\n')
    self.assertEqual(contained.write_line(), 'chr1\t1\t100\t44:1.00|51:2.00|5:1.00\t2.0\t+\t70\t151.0\n')
def test_max_height(self):
    """``max_height`` returns the highest level of the profile (15 here)."""
    cluster = Cluster(read=PK)
    cluster.read_line('chr1 1 15 4:1|1:2|2:10|3:4|2:15|2:2|1:1')
    tallest = cluster.max_height()
    self.assertEqual(tallest, 15)