def do_add_single_exon_fuzzy_gpd(self, fuz2):
    """Try to merge the single-exon fuzzy genepred ``fuz2`` into a copy of this one.

    The two entries are compared using the mean of their start payloads and
    the mean of their end payloads.  The merge is refused (``False``) when
    any of these hold:

    * ``params["do_add_single_exon"]`` is falsy,
    * the entries are not on one and the same chromosome,
    * either mean length is below ``single_exon_minimum_length`` (or < 1),
    * the overlap is below ``single_exon_minimum_overlap_bases``,
    * the overlap fraction of the less-covered entry is below
      ``single_exon_minimum_overlap_fraction``,
    * the mean starts or mean ends differ by more than
      ``single_exon_maximum_endpoint_distance``.

    Args:
        fuz2: the other fuzzy genepred; must expose ``start``, ``end`` and
            ``gpds`` like ``self`` does.

    Returns:
        A copy of ``self`` with merged start/end ranges, concatenated
        payloads and ``fuz2``'s gpds appended, or ``False`` if the merge is
        not allowed.  ``self`` and ``fuz2`` are not reassigned here.
    """
    if not self.params["do_add_single_exon"]:
        return False  # make sure we are allowed to be doing this

    chr1 = self.start.chr
    if chr1 != self.end.chr:
        return False  # shouldn't happen
    # The original compared self.start.chr with self.end.chr only, so the
    # candidate's chromosome was never looked at.  Compare against fuz2.
    chr2 = fuz2.start.chr
    if chr2 != fuz2.end.chr or chr1 != chr2:
        return False

    # build the bounds from the average start and end
    s1 = mean(self.start.get_payload())
    e1 = mean(self.end.get_payload())
    s2 = mean(fuz2.start.get_payload())
    e2 = mean(fuz2.end.get_payload())
    l1 = e1 - s1 + 1
    l2 = e2 - s2 + 1
    min_length = self.params["single_exon_minimum_length"]
    if l1 < min_length or l2 < min_length:
        return False
    if l1 < 1 or l2 < 1:
        return False  # shouldn't happen

    # Both ranges are built with self.dir, exactly as before.
    r1 = Bed(chr1, s1 - 1, e1, self.dir)
    r2 = Bed(chr2, s2 - 1, e2, self.dir)
    over = r1.overlap_size(r2)
    if over < self.params["single_exon_minimum_overlap_bases"]:
        return False
    # fraction covered, judged from the less-covered entry's point of view
    cov = min(float(over) / float(l1), float(over) / float(l2))
    if cov < self.params["single_exon_minimum_overlap_fraction"]:
        return False
    max_dist = self.params["single_exon_maximum_endpoint_distance"]
    if abs(e1 - e2) > max_dist:
        return False
    if abs(s1 - s2) > max_dist:
        return False

    # If we're still here, we can add result
    output = self.copy()

    newstart = output.start.merge(fuz2.start)
    newstart.set_payload([])
    newstart.get_payload().extend(output.start.get_payload())
    newstart.get_payload().extend(fuz2.start.get_payload())

    newend = output.end.merge(fuz2.end)
    newend.set_payload([])
    newend.get_payload().extend(output.end.get_payload())
    newend.get_payload().extend(fuz2.end.get_payload())

    output.start = newstart
    output.end = newend
    for gpd in fuz2.gpds:
        output.gpds.append(gpd)
        sjun = get_simple_junction(gpd)
        if sjun:
            # NOTE(review): the gpd itself is added here, not ``sjun`` --
            # kept as in the original; confirm which one the set should hold.
            output.simple_junction_set.add(gpd)
    return output
def do_add_single_exon_fuzzy_gpd(self, fuz2):
    """Merge single-exon fuzzy genepred ``fuz2`` into a copy of ``self`` if compatible.

    Compatibility is judged on the mean start and mean end stored in each
    entry's start/end payloads.  ``False`` is returned when:

    * ``params['do_add_single_exon']`` is falsy,
    * the two entries do not share a single chromosome,
    * either mean length is under ``single_exon_minimum_length`` (or < 1),
    * the overlap is under ``single_exon_minimum_overlap_bases``,
    * the smaller of the two overlap fractions is under
      ``single_exon_minimum_overlap_fraction``,
    * mean starts or mean ends are further apart than
      ``single_exon_maximum_endpoint_distance``.

    Args:
        fuz2: the other fuzzy genepred (needs ``start``, ``end``, ``gpds``).

    Returns:
        A copy of ``self`` whose start/end are the merged ranges carrying
        both entries' payload values and whose ``gpds`` also include
        ``fuz2.gpds``; ``False`` when the merge is refused.
    """
    if not self.params['do_add_single_exon']:
        return False  # make sure we are allowed to be doing this

    chr1 = self.start.chr
    if chr1 != self.end.chr:
        return False  # shouldn't happen
    # Previously chr2 was taken from self.end, so fuz2's chromosome was
    # never checked; an entry on another chromosome could be merged.
    chr2 = fuz2.start.chr
    if chr2 != fuz2.end.chr or chr1 != chr2:
        return False

    # build the bounds from the average start and end
    s1 = mean(self.start.get_payload())
    e1 = mean(self.end.get_payload())
    s2 = mean(fuz2.start.get_payload())
    e2 = mean(fuz2.end.get_payload())
    l1 = e1 - s1 + 1
    l2 = e2 - s2 + 1
    shortest_allowed = self.params['single_exon_minimum_length']
    if l1 < shortest_allowed or l2 < shortest_allowed:
        return False
    if l1 < 1 or l2 < 1:
        return False  # shouldn't happen

    # Both ranges carry self.dir, as in the original code.
    r1 = Bed(chr1, s1 - 1, e1, self.dir)
    r2 = Bed(chr2, s2 - 1, e2, self.dir)
    over = r1.overlap_size(r2)
    if over < self.params['single_exon_minimum_overlap_bases']:
        return False
    # use the worse (smaller) of the two coverage fractions
    cov = min(float(over) / float(l1), float(over) / float(l2))
    if cov < self.params['single_exon_minimum_overlap_fraction']:
        return False
    furthest_allowed = self.params['single_exon_maximum_endpoint_distance']
    if abs(e1 - e2) > furthest_allowed:
        return False
    if abs(s1 - s2) > furthest_allowed:
        return False

    # If we're still here, we can add result
    output = self.copy()

    newstart = output.start.merge(fuz2.start)
    newstart.set_payload([])
    newstart.get_payload().extend(output.start.get_payload())
    newstart.get_payload().extend(fuz2.start.get_payload())

    newend = output.end.merge(fuz2.end)
    newend.set_payload([])
    newend.get_payload().extend(output.end.get_payload())
    newend.get_payload().extend(fuz2.end.get_payload())

    output.start = newstart
    output.end = newend
    for gpd in fuz2.gpds:
        output.gpds.append(gpd)
        sjun = get_simple_junction(gpd)
        if sjun:
            # NOTE(review): adds the gpd, not ``sjun`` -- preserved from the
            # original; confirm what simple_junction_set is meant to contain.
            output.simple_junction_set.add(gpd)
    return output
def query_overlap_size(self, psl2):
    """Return the total overlap, on the query sequence, between two PSL entries.

    Every block of ``self`` is compared with every block of ``psl2`` and the
    pairwise ``Bed.overlap_size`` results are summed.  Block coordinates are
    taken from ``qStarts_actual`` and ``blockSizes``.

    Args:
        psl2: the other PSL entry (must provide the same ``value`` accessor).

    Returns:
        The summed overlap size; ``0`` when the two entries have different
        ``qName`` values.
    """
    own_name = self.value('qName')
    other_name = psl2.value('qName')
    if own_name != other_name:
        return 0  # not on the same query

    # Build each side's block ranges once instead of once per (i, j) pair.
    own_starts = self.value('qStarts_actual')
    own_sizes = self.value('blockSizes')
    own_blocks = [
        Bed(own_name, own_starts[i], own_starts[i] + own_sizes[i])
        for i in range(self.value('blockCount'))
    ]
    other_starts = psl2.value('qStarts_actual')
    other_sizes = psl2.value('blockSizes')
    other_blocks = [
        Bed(other_name, other_starts[j], other_starts[j] + other_sizes[j])
        for j in range(psl2.value('blockCount'))
    ]

    return sum(
        own.overlap_size(other)
        for own in own_blocks
        for other in other_blocks
    )
def target_overlap_size(self, psl2, use_direction=False):
    """Return the total overlap, on the target sequence, between two PSL entries.

    Every block of ``self`` is compared with every block of ``psl2`` and the
    pairwise ``Bed.overlap_size`` results are summed.  Block coordinates are
    taken from ``tStarts`` and ``blockSizes``.

    Args:
        psl2: the other PSL entry (must provide the same ``value`` accessor).
        use_direction: when true, entries whose ``strand`` values differ are
            treated as non-overlapping.

    Returns:
        The summed overlap size; ``0`` when the ``tName`` values differ, or
        when ``use_direction`` is set and the strands differ.
    """
    own_name = self.value('tName')
    other_name = psl2.value('tName')
    if own_name != other_name:
        return 0  # not on the same chromosome
    if use_direction and self.value('strand') != psl2.value('strand'):
        return 0

    # Build each side's block ranges once instead of once per (i, j) pair.
    own_starts = self.value('tStarts')
    own_sizes = self.value('blockSizes')
    own_blocks = [
        Bed(own_name, own_starts[i], own_starts[i] + own_sizes[i])
        for i in range(self.value('blockCount'))
    ]
    other_starts = psl2.value('tStarts')
    other_sizes = psl2.value('blockSizes')
    other_blocks = [
        Bed(other_name, other_starts[j], other_starts[j] + other_sizes[j])
        for j in range(psl2.value('blockCount'))
    ]

    return sum(
        own.overlap_size(other)
        for own in own_blocks
        for other in other_blocks
    )
def _tally_region_depth(curr, regions, finished_depths, depth,
                        use_off_regions, on_region_finished=None):
    """Retire regions that ``curr`` has passed, then credit ``depth`` to the front one.

    ``regions`` is a deque of ranges whose payload is a list of per-base
    depths.  While ``curr.cmp(front) > 0`` the front region is popped; its
    average is appended to ``finished_depths`` unless its payload is empty
    and ``use_off_regions`` is false.  ``on_region_finished`` (if given) is
    called after each appended average.  If the remaining front region
    compares equal (overlaps) to ``curr``, ``depth`` is added to its payload
    once per overlapping base.
    """
    # Check for emptiness BEFORE indexing; the original indexed regions[0]
    # first and raised IndexError once the last region had been popped.
    while regions and curr.cmp(regions[0]) > 0:  # we've passed the region
        passed = regions.popleft()
        if len(passed.get_payload()) == 0 and not use_off_regions:
            continue
        # NOTE(review): average() receives the region here but a payload
        # list in main()'s final flush -- kept as in the original; confirm
        # which argument type average() expects.
        finished_depths.append(average(passed))
        if on_region_finished is not None:
            on_region_finished()
    if regions and curr.cmp(regions[0]) == 0:  # overlaps the front region
        size = curr.overlap_size(regions[0])
        regions[0].get_payload().extend([depth] * size)


def main():
    """Collect read depth over exonic, intronic and intergenic windows.

    Reads a genepred file to derive merged gene, exon, intron and (padded)
    intergenic ranges, then streams the output of ``sam_to_bed_depth.py`` run
    on the BAM input.  Each streamed line is expected to hold tab-separated
    chromosome, start, end and depth fields.  Depths are accumulated per
    window and an average is recorded once the stream has moved past a
    window; ``display`` is called each time an intergenic window is recorded.

    NOTE(review): the streaming logic only ever looks at the first pending
    window of each kind, so it presumably relies on the depth stream being
    ordered like ``sort_ranges`` orders the windows -- confirm.
    """
    # Local import so this function does not depend on a file-level import.
    from collections import deque

    parser = argparse.ArgumentParser()
    parser.add_argument('gpd_input')
    parser.add_argument('bam_input')
    parser.add_argument('--intergenic_buffer', default=10000, type=int)
    parser.add_argument('--window_size', default=10000, type=int)
    parser.add_argument('--bin_size', default=1000, type=int)
    parser.add_argument('--use_off_regions', action='store_true',
                        help="Use a region even if there is no reads mapped to it.")
    parser.add_argument('--get_exons', action='store_true')
    args = parser.parse_args()

    chr_beds = {}
    gene_beds = []
    exon_beds = []
    sys.stderr.write("Reading genepred file\n")
    asum = 0
    atot = 0
    with open(args.gpd_input) as inf:
        for line in inf:
            g = GenePredEntry(line)
            asum += g.length()
            atot += 1
            grng = g.get_bed()
            grng.direction = None
            # one range per chromosome, grown to span every gene seen on it
            if grng.chr not in chr_beds:
                chr_beds[grng.chr] = grng.copy()
            chr_beds[grng.chr] = chr_beds[grng.chr].merge(grng)
            gene_beds.append(grng)
            for i in range(0, g.get_exon_count()):
                erng = Bed(g.value('chrom'),
                           g.value('exonStarts')[i],
                           g.value('exonEnds')[i])
                exon_beds.append(erng)
    avglen = float(asum) / float(atot)

    sys.stderr.write("Sorting gene bed\n")
    gene_beds = sort_ranges(gene_beds)
    gene_beds = merge_ranges(gene_beds, already_sorted=True)
    sys.stderr.write("Sorting chromosome beds\n")
    chr_beds = sort_ranges(list(chr_beds.values()))
    sys.stderr.write("Sorting exon beds\n")
    exon_beds = sort_ranges(exon_beds)
    sys.stderr.write("Get padded genes\n")
    padded_gene_beds = pad_ranges(gene_beds, args.intergenic_buffer, chr_beds)
    padded_gene_beds = merge_ranges(padded_gene_beds, already_sorted=True)
    sys.stderr.write("Get intergenic regions\n")
    intergenic_beds = subtract_ranges(chr_beds, padded_gene_beds, already_sorted=True)
    intergenic_beds = merge_ranges(intergenic_beds, already_sorted=True)
    intergenic_beds = window_break(intergenic_beds, args.window_size)
    sys.stderr.write("Get merged exons\n")
    exon_beds = merge_ranges(exon_beds)
    sys.stderr.write("Get introns\n")
    intron_beds = subtract_ranges(gene_beds, exon_beds, already_sorted=True)
    intron_beds = merge_ranges(intron_beds, already_sorted=True)
    intron_beds = window_break(intron_beds, args.window_size)

    sys.stderr.write("Going through short reads\n")
    # Pass the path as its own argv element so BAM paths containing
    # whitespace are not split into several arguments.
    p = Popen(["sam_to_bed_depth.py", args.bam_input], stdout=PIPE)

    # payloads are read depths
    for x in intron_beds:
        x.set_payload([])
    for x in intergenic_beds:
        x.set_payload([])
    for x in exon_beds:
        x.set_payload([])

    introndepth = []
    intergenicdepth = []
    exondepth = []
    pseudoreadcount = 0

    # Windows are consumed from the front; deque makes that O(1) per pop.
    exon_beds = deque(exon_beds) if args.get_exons else deque()
    intron_beds = deque(intron_beds)
    intergenic_beds = deque(intergenic_beds)

    section_count = 0
    while True:
        section_count += 1
        line = p.stdout.readline()
        if not line:
            break
        f = line.split("\t")
        depth = int(f[3])
        curr = Bed(f[0], int(f[1]), int(f[2]))
        if section_count % 100 == 0:
            sys.stderr.write(curr.get_range_string() + " \r")
        pseudoreadcount += depth

        _tally_region_depth(curr, exon_beds, exondepth, depth,
                            args.use_off_regions)
        _tally_region_depth(curr, intron_beds, introndepth, depth,
                            args.use_off_regions)
        _tally_region_depth(
            curr, intergenic_beds, intergenicdepth, depth,
            args.use_off_regions,
            on_region_finished=lambda: display(
                curr, introndepth, intergenicdepth, pseudoreadcount, avglen))

    if args.use_off_regions:
        # Flush windows the stream never moved past.  Exon averages go to
        # exondepth (the original appended them to introndepth).
        for x in exon_beds:
            exondepth.append(average(x.get_payload()))
        for x in intron_beds:
            introndepth.append(average(x.get_payload()))
        for x in intergenic_beds:
            intergenicdepth.append(average(x.get_payload()))
    p.communicate()
def _tally_region_depth(curr, regions, finished_depths, depth,
                        use_off_regions, on_region_finished=None):
    """Retire regions that ``curr`` has passed, then credit ``depth`` to the front one.

    ``regions`` is a deque of ranges whose payload is a list of per-base
    depths.  While ``curr.cmp(front) > 0`` the front region is popped; its
    average is appended to ``finished_depths`` unless its payload is empty
    and ``use_off_regions`` is false.  ``on_region_finished`` (if given) is
    called after each appended average.  If the remaining front region
    compares equal (overlaps) to ``curr``, ``depth`` is added to its payload
    once per overlapping base.
    """
    # Test for emptiness before touching regions[0]: the original evaluated
    # regions[0] first and hit IndexError after popping the final region.
    while regions and curr.cmp(regions[0]) > 0:  # we've passed the region
        passed = regions.popleft()
        if len(passed.get_payload()) == 0 and not use_off_regions:
            continue
        # NOTE(review): average() is given the region here but a payload
        # list in main()'s final flush -- preserved from the original;
        # confirm which argument type average() expects.
        finished_depths.append(average(passed))
        if on_region_finished is not None:
            on_region_finished()
    if regions and curr.cmp(regions[0]) == 0:  # overlaps the front region
        size = curr.overlap_size(regions[0])
        regions[0].get_payload().extend([depth] * size)


def main():
    """Collect read depth over exonic, intronic and intergenic windows.

    Steps:
      1. Parse the genepred input into gene ranges, exon ranges and one
         spanning range per chromosome, tracking the mean entry length.
      2. Derive merged exons, windowed introns (genes minus exons) and
         windowed intergenic regions (chromosome spans minus padded genes).
      3. Stream ``sam_to_bed_depth.py`` output for the BAM input; each line
         is expected to hold tab-separated chromosome, start, end, depth.
         Depth is added to the window it overlaps, and a window's average is
         recorded once the stream has moved past it.  ``display`` is called
         whenever an intergenic window is recorded.

    NOTE(review): only the first pending window of each kind is examined per
    line, so the depth stream is presumably ordered the same way
    ``sort_ranges`` orders windows -- confirm.
    """
    # Local import so this function does not depend on a file-level import.
    from collections import deque

    parser = argparse.ArgumentParser()
    parser.add_argument('gpd_input')
    parser.add_argument('bam_input')
    parser.add_argument('--intergenic_buffer', default=10000, type=int)
    parser.add_argument('--window_size', default=10000, type=int)
    parser.add_argument('--bin_size', default=1000, type=int)
    parser.add_argument(
        '--use_off_regions',
        action='store_true',
        help="Use a region even if there is no reads mapped to it.")
    parser.add_argument('--get_exons', action='store_true')
    args = parser.parse_args()

    chr_beds = {}
    gene_beds = []
    exon_beds = []
    sys.stderr.write("Reading genepred file\n")
    asum = 0
    atot = 0
    with open(args.gpd_input) as inf:
        for line in inf:
            g = GenePredEntry(line)
            asum += g.length()
            atot += 1
            grng = g.get_bed()
            grng.direction = None
            # keep one range per chromosome that spans all genes seen on it
            if grng.chr not in chr_beds:
                chr_beds[grng.chr] = grng.copy()
            chr_beds[grng.chr] = chr_beds[grng.chr].merge(grng)
            gene_beds.append(grng)
            for i in range(0, g.get_exon_count()):
                erng = Bed(g.value('chrom'),
                           g.value('exonStarts')[i],
                           g.value('exonEnds')[i])
                exon_beds.append(erng)
    avglen = float(asum) / float(atot)

    sys.stderr.write("Sorting gene bed\n")
    gene_beds = sort_ranges(gene_beds)
    gene_beds = merge_ranges(gene_beds, already_sorted=True)
    sys.stderr.write("Sorting chromosome beds\n")
    chr_beds = sort_ranges(list(chr_beds.values()))
    sys.stderr.write("Sorting exon beds\n")
    exon_beds = sort_ranges(exon_beds)
    sys.stderr.write("Get padded genes\n")
    padded_gene_beds = pad_ranges(gene_beds, args.intergenic_buffer, chr_beds)
    padded_gene_beds = merge_ranges(padded_gene_beds, already_sorted=True)
    sys.stderr.write("Get intergenic regions\n")
    intergenic_beds = subtract_ranges(
        chr_beds, padded_gene_beds, already_sorted=True)
    intergenic_beds = merge_ranges(intergenic_beds, already_sorted=True)
    intergenic_beds = window_break(intergenic_beds, args.window_size)
    sys.stderr.write("Get merged exons\n")
    exon_beds = merge_ranges(exon_beds)
    sys.stderr.write("Get introns\n")
    intron_beds = subtract_ranges(gene_beds, exon_beds, already_sorted=True)
    intron_beds = merge_ranges(intron_beds, already_sorted=True)
    intron_beds = window_break(intron_beds, args.window_size)

    sys.stderr.write("Going through short reads\n")
    # Hand the path over as a single argv element; splitting a command
    # string on whitespace broke BAM paths that contain spaces.
    p = Popen(["sam_to_bed_depth.py", args.bam_input], stdout=PIPE)

    # payloads are read depths
    for x in intron_beds:
        x.set_payload([])
    for x in intergenic_beds:
        x.set_payload([])
    for x in exon_beds:
        x.set_payload([])

    introndepth = []
    intergenicdepth = []
    exondepth = []
    pseudoreadcount = 0

    # Windows are consumed front-first; a deque pops from the left in O(1).
    exon_beds = deque(exon_beds) if args.get_exons else deque()
    intron_beds = deque(intron_beds)
    intergenic_beds = deque(intergenic_beds)

    section_count = 0
    while True:
        section_count += 1
        line = p.stdout.readline()
        if not line:
            break
        f = line.split("\t")
        depth = int(f[3])
        curr = Bed(f[0], int(f[1]), int(f[2]))
        if section_count % 100 == 0:
            sys.stderr.write(curr.get_range_string() + " \r")
        pseudoreadcount += depth

        _tally_region_depth(curr, exon_beds, exondepth, depth,
                            args.use_off_regions)
        _tally_region_depth(curr, intron_beds, introndepth, depth,
                            args.use_off_regions)
        _tally_region_depth(
            curr, intergenic_beds, intergenicdepth, depth,
            args.use_off_regions,
            on_region_finished=lambda: display(
                curr, introndepth, intergenicdepth, pseudoreadcount, avglen))

    if args.use_off_regions:
        # Flush windows the stream never moved past.  Exon averages belong
        # in exondepth (the original put them into introndepth).
        for x in exon_beds:
            exondepth.append(average(x.get_payload()))
        for x in intron_beds:
            introndepth.append(average(x.get_payload()))
        for x in intergenic_beds:
            intergenicdepth.append(average(x.get_payload()))
    p.communicate()