def get_loci(transcripts_genepred):
    """Group the transcripts of a genepred file into overlapping loci.

    Every line that does not start with ``#`` is parsed as a genepred entry
    and wrapped in its own single-member ``Locus``; ``Loci.update_loci`` is
    then called to reorganise them.  Loci are numbered from 1 in the order
    they appear in ``loci.loci`` after that call.

    Args:
        transcripts_genepred: path of the genepred file to read.

    Returns:
        A two-item list ``[locus2name, name2locus]``.  ``locus2name`` maps a
        locus number to the set of transcript names in that locus and
        ``name2locus`` maps each transcript name to its locus number.
    """
    collection = Loci()
    collection.verbose = True
    with open(transcripts_genepred) as handle:
        for raw in handle:
            if raw.startswith('#'):
                continue
            entry = GenePredEntry(raw.rstrip())
            span = Bed(entry.value('chrom'),
                       entry.value('txStart'),
                       entry.value('txEnd'))
            # The transcript name rides along as the range payload.
            span.set_payload(entry.value('name'))
            single = Locus()
            single.add_member(span)
            collection.add_locus(single)

    sys.stderr.write("Organizing genepred data into overlapping loci\n")
    sys.stderr.write("Started with %s loci\n" % str(len(collection.loci)))
    collection.update_loci()
    sys.stderr.write("Ended with %s loci\n" % str(len(collection.loci)))

    locus2name = {}
    name2locus = {}
    for number, locus in enumerate(collection.loci, 1):
        for member in locus.members:
            transcript = member.get_payload()
            locus2name.setdefault(number, set()).add(transcript)
            name2locus[transcript] = number
    return [locus2name, name2locus]
def read_first(self, ingpd):
    """Seed this fuzzy genepred from its first genepred entry.

    Records ``ingpd`` as a contributing entry, remembers its simple
    junction key (when ``get_simple_junction`` returns one), optionally
    adopts its strand, creates one ``FuzzyJunction`` per pair of adjacent
    exons and initialises the start/end ranges with their supporting
    positions.

    Args:
        ingpd: genepred entry exposing ``value(field)``.
    """
    self.gpds.append(ingpd)
    simple = get_simple_junction(ingpd)
    if simple:
        self.simple_junction_set.add(simple)
    if self.params["use_dir"]:
        self.dir = ingpd.value("strand")

    chrom = ingpd.value("chrom")
    exon_starts = ingpd.value("exonStarts")
    exon_ends = ingpd.value("exonEnds")
    tx_start = ingpd.value("txStart")
    tx_end = ingpd.value("txEnd")

    # add fuzzy junctions: end of exon i paired with start of exon i + 1
    for idx in range(len(exon_starts) - 1):
        self.fuzzy_junctions.append(
            FuzzyJunction(chrom, exon_ends[idx], exon_starts[idx + 1] + 1, self.dir)
        )

    if len(exon_starts) > 1:  # we have junctions
        # The outermost junctions also remember the transcript start/end.
        first_left = self.fuzzy_junctions[0].left.get_payload()
        first_left["start"] = Bed(chrom, tx_start, tx_start + 1, self.dir)
        first_left["start"].set_payload([])
        first_left["start"].get_payload().append(tx_start + 1)

        last_right = self.fuzzy_junctions[-1].right.get_payload()
        last_right["end"] = Bed(chrom, tx_end - 1, tx_end, self.dir)
        last_right["end"].set_payload([])
        last_right["end"].get_payload().append(tx_end)

    # add fuzzy starts
    self.start = Bed(chrom, tx_start, tx_start + 1, self.dir)
    self.start.set_payload([])
    self.start.get_payload().append(tx_start + 1)

    self.end = Bed(chrom, tx_end - 1, tx_end, self.dir)
    self.end.set_payload([])
    self.end.get_payload().append(tx_end)
def get_loci(transcripts_genepred):
    """Organise genepred transcripts into loci and index them both ways.

    Each non-``#`` line of the file becomes a ``Bed`` range (chrom, txStart,
    txEnd) carrying the transcript name as payload, placed alone in a new
    ``Locus``.  After ``Loci.update_loci`` has run, the resulting loci are
    numbered starting at 1.

    Args:
        transcripts_genepred: path of the genepred file to read.

    Returns:
        ``[locus2name, name2locus]`` where ``locus2name[n]`` is the set of
        transcript names in locus ``n`` and ``name2locus[name]`` is the
        number of the locus holding ``name``.
    """
    loci = Loci()
    loci.verbose = True
    with open(transcripts_genepred) as inf:
        for text in inf:
            if text.startswith('#'):
                continue
            gpd = GenePredEntry(text.rstrip())
            rng = Bed(gpd.value('chrom'), gpd.value('txStart'), gpd.value('txEnd'))
            rng.set_payload(gpd.value('name'))
            lone_locus = Locus()
            lone_locus.add_member(rng)
            loci.add_locus(lone_locus)

    report = sys.stderr.write
    report("Organizing genepred data into overlapping loci\n")
    report("Started with " + str(len(loci.loci)) + " loci\n")
    loci.update_loci()
    report("Ended with " + str(len(loci.loci)) + " loci\n")

    locus2name = {}
    name2locus = {}
    for index, locus in enumerate(loci.loci, 1):
        for member in locus.members:
            name = member.get_payload()
            locus2name.setdefault(index, set()).add(name)
            name2locus[name] = index
    return [locus2name, name2locus]
def get_mode(self):
    """Return the modal left and right junction positions as ``Bed`` ranges.

    For each side the mode ``m`` of the recorded ``'junc'`` values is taken
    and wrapped as ``Bed(self.chr, m - 1, m, self.dir)``.

    Returns:
        A two-item list ``[left_bed, right_bed]``.
    """
    modal_beds = []
    for side in (self.left, self.right):
        position = mode(side.get_payload()['junc'])
        modal_beds.append(Bed(self.chr, position - 1, position, self.dir))
    return modal_beds
def copy(self):
    """Return a duplicate of this fuzzy genepred.

    Parameters, contributing genepreds (re-parsed from their lines), the
    direction, fuzzy junctions (via their own ``copy``), the simple junction
    set and the start/end ranges with their payload lists are all carried
    over into a fresh ``FuzzyGenePred``.
    """
    duplicate = FuzzyGenePred()  # start from a blank one
    # settings
    duplicate.params.update(self.params)
    # contributing genepreds are rebuilt from their text lines
    duplicate.gpds.extend(GenePredEntry(entry.get_line()) for entry in self.gpds)
    # direction
    duplicate.dir = self.dir
    # fuzzy junctions
    duplicate.fuzzy_junctions.extend(junc.copy() for junc in self.fuzzy_junctions)
    # simple junction set
    duplicate.simple_junction_set.update(self.simple_junction_set)
    # start range and its supporting values
    if self.start:
        src = self.start
        duplicate.start = Bed(src.chr, src.start - 1, src.end, src.direction)
        duplicate.start.set_payload([])
        duplicate.start.get_payload().extend(src.get_payload())
    # end range and its supporting values
    if self.end:
        src = self.end
        duplicate.end = Bed(src.chr, src.start - 1, src.end, src.direction)
        duplicate.end.set_payload([])
        duplicate.end.get_payload().extend(src.get_payload())
    return duplicate
def do_add_single_exon_fuzzy_gpd(self, fuz2):
    """Try to merge the single-exon fuzzy genepred ``fuz2`` into a copy of self.

    The bounds of both sides are taken as the mean of their recorded start
    and end values.  The merge is refused (``False`` is returned) when
    single-exon merging is disabled, when either side is shorter than
    ``single_exon_minimum_length``, when the two are on different
    chromosomes, when the overlap is below ``single_exon_minimum_overlap_bases``
    or below ``single_exon_minimum_overlap_fraction`` of either side, or when
    the starts or ends differ by more than
    ``single_exon_maximum_endpoint_distance``.

    Args:
        fuz2: the other fuzzy genepred (expected to be single exon).

    Returns:
        A new merged fuzzy genepred (``self`` is not modified), or ``False``
        when the two cannot be combined.
    """
    params = self.params
    if not params["do_add_single_exon"]:
        return False  # make sure we are allowed to be doing this

    # build the bounds from the average start and end
    s1 = mean(self.start.get_payload())
    e1 = mean(self.end.get_payload())
    s2 = mean(fuz2.start.get_payload())
    e2 = mean(fuz2.end.get_payload())
    l1 = e1 - s1 + 1
    l2 = e2 - s2 + 1
    if l1 < params["single_exon_minimum_length"]:
        return False
    if l2 < params["single_exon_minimum_length"]:
        return False
    if l1 < 1 or l2 < 1:
        return False  # shouldn't happen

    # Compare our chromosome against the OTHER genepred's chromosome; the
    # previous code compared self.start.chr with self.end.chr, which never
    # looked at fuz2 at all.
    chr1 = self.start.chr
    chr2 = fuz2.start.chr
    if chr1 != chr2:
        return False  # shouldn't happen

    r1 = Bed(chr1, s1 - 1, e1, self.dir)
    r2 = Bed(chr2, s2 - 1, e2, self.dir)
    over = r1.overlap_size(r2)
    if over < params["single_exon_minimum_overlap_bases"]:
        return False
    # reciprocal coverage: the overlap must be a large enough share of both
    cov = min(float(over) / float(l1), float(over) / float(l2))
    if cov < params["single_exon_minimum_overlap_fraction"]:
        return False
    if abs(e1 - e2) > params["single_exon_maximum_endpoint_distance"]:
        return False
    if abs(s1 - s2) > params["single_exon_maximum_endpoint_distance"]:
        return False

    # If we're still here, we can add result
    output = self.copy()

    newstart = output.start.merge(fuz2.start)
    newstart.set_payload(
        list(output.start.get_payload()) + list(fuz2.start.get_payload()))
    newend = output.end.merge(fuz2.end)
    newend.set_payload(
        list(output.end.get_payload()) + list(fuz2.end.get_payload()))
    output.start = newstart
    output.end = newend

    for gpd in fuz2.gpds:
        output.gpds.append(gpd)
        sjun = get_simple_junction(gpd)
        if sjun:
            # Store the junction key itself (as read_first does), not the
            # genepred object it was computed from.
            output.simple_junction_set.add(sjun)
    return output
def do_add_single_exon_fuzzy_gpd(self, fuz2):
    """Try to merge the single-exon fuzzy genepred ``fuz2`` into a copy of self.

    Bounds are the mean of each side's recorded start and end values.
    ``False`` is returned when single-exon merging is switched off, either
    side is shorter than ``single_exon_minimum_length``, the chromosomes
    differ, the overlap is smaller than ``single_exon_minimum_overlap_bases``
    or covers less than ``single_exon_minimum_overlap_fraction`` of either
    side, or the starts/ends are further apart than
    ``single_exon_maximum_endpoint_distance``.

    Args:
        fuz2: the other fuzzy genepred (expected to be single exon).

    Returns:
        A new merged fuzzy genepred (``self`` is left untouched), or
        ``False`` if the two are not compatible.
    """
    if not self.params['do_add_single_exon']:
        return False  # make sure we are allowed to be doing this

    min_length = self.params['single_exon_minimum_length']
    max_endpoint_gap = self.params['single_exon_maximum_endpoint_distance']

    # build the bounds from the average start and end
    s1 = mean(self.start.get_payload())
    e1 = mean(self.end.get_payload())
    s2 = mean(fuz2.start.get_payload())
    e2 = mean(fuz2.end.get_payload())
    l1 = e1 - s1 + 1
    l2 = e2 - s2 + 1
    if l1 < min_length or l2 < min_length:
        return False
    if l1 < 1 or l2 < 1:
        return False  # shouldn't happen

    # The second chromosome must come from fuz2; reading it from
    # self.end.chr (as before) made this check blind to fuz2's location.
    chr1 = self.start.chr
    chr2 = fuz2.start.chr
    if chr1 != chr2:
        return False  # shouldn't happen

    r1 = Bed(chr1, s1 - 1, e1, self.dir)
    r2 = Bed(chr2, s2 - 1, e2, self.dir)
    over = r1.overlap_size(r2)
    if over < self.params['single_exon_minimum_overlap_bases']:
        return False
    # the overlap has to be a big enough fraction of BOTH ranges
    cov = min(float(over) / float(l1), float(over) / float(l2))
    if cov < self.params['single_exon_minimum_overlap_fraction']:
        return False
    if abs(e1 - e2) > max_endpoint_gap:
        return False
    if abs(s1 - s2) > max_endpoint_gap:
        return False

    # If we're still here, we can add result
    output = self.copy()

    newstart = output.start.merge(fuz2.start)
    newstart.set_payload(
        list(output.start.get_payload()) + list(fuz2.start.get_payload()))
    newend = output.end.merge(fuz2.end)
    newend.set_payload(
        list(output.end.get_payload()) + list(fuz2.end.get_payload()))
    output.start = newstart
    output.end = newend

    for gpd in fuz2.gpds:
        output.gpds.append(gpd)
        sjun = get_simple_junction(gpd)
        if sjun:
            # The set holds junction keys (see read_first); adding the
            # genepred object here was a mistake.
            output.simple_junction_set.add(sjun)
    return output
def get_info_string(self):
    """Build a multi-line text report describing this fuzzy genepred.

    The report lists the number of contributing genepreds, the total
    bounds, the support count / mean / mode / range of the start and of the
    end, and then one section per fuzzy junction (modal positions, ranges,
    read support and any start/end ranges attached to the junction).

    Returns:
        The report as a single string, every line terminated by a newline.
    """
    start_values = self.start.get_payload()
    end_values = self.end.get_payload()
    bounds = Bed(self.start.chr, self.start.start - 1, self.end.end, self.start.direction)

    pieces = [
        "== FUZZY GENEPRED INFO ==\n",
        str(len(self.gpds)) + " total GPDs\n",
        bounds.get_range_string() + " total bounds\n",
        "---- start ----\n",
        str(len(start_values)) + " reads supporting start\n",
        " " + str(mean(start_values)) + " mean\n",
        " " + str(mode(start_values)) + " mode\n",
        " " + self.start.get_range_string() + " start range\n",
        "---- end ----\n",
        str(len(end_values)) + " reads supporting end\n",
        " " + str(mean(end_values)) + " mean\n",
        " " + str(mode(end_values)) + " mode\n",
        " " + self.end.get_range_string() + " end range\n",
        "---- junctions ----\n",
        str(len(self.fuzzy_junctions)) + " total fuzzy junctions\n",
    ]

    for number, junction in enumerate(self.fuzzy_junctions, 1):
        left_info = junction.left.get_payload()
        right_info = junction.right.get_payload()
        pieces.append(
            " " + str(number) + ". " + str(mode(left_info["junc"]))
            + " ^ " + str(mode(right_info["junc"])) + "\n"
        )
        pieces.append(
            " " + junction.left.get_range_string()
            + " ^ " + junction.right.get_range_string() + "\n"
        )
        pieces.append(" " + str(len(left_info["junc"])) + " read support\n")
        # Only junctions that carry a transcript start/end get these lines.
        if left_info["start"]:
            pieces.append(" ---starts----\n")
            pieces.append(
                " " + str(len(left_info["start"].get_payload()))
                + " starts at " + left_info["start"].get_range_string() + "\n"
            )
        if right_info["end"]:
            pieces.append(" ---ends----\n")
            pieces.append(
                " " + str(len(right_info["end"].get_payload()))
                + " ends at " + right_info["end"].get_range_string() + "\n"
            )
    return "".join(pieces)
def target_distance(self, psl_entry, use_direction=False):
    """Return the gap between this alignment's target range and another's.

    Args:
        psl_entry: the other PSL entry (exposes ``value`` and ``entry``).
        use_direction: when true, entries on different strands are treated
            as not comparable.

    Returns:
        ``-1`` if the entries are on different targets (or, with
        ``use_direction``, on different strands); ``0`` if their target
        ranges overlap; otherwise the difference between the nearer
        range boundaries minus one.

    Raises:
        SystemExit: with status 1 if the ranges neither overlap nor lie on
            either side of each other, which is not expected to happen.
    """
    if self.value('tName') != psl_entry.value('tName'):
        return -1
    if use_direction and self.value('strand') != psl_entry.value('strand'):
        return -1
    b1 = Bed(self.entry['tName'], self.entry['tStart'], self.entry['tEnd'])
    b2 = Bed(psl_entry.entry['tName'], psl_entry.entry['tStart'],
             psl_entry.entry['tEnd'])
    if b1.overlaps(b2):
        return 0
    if b1.end < b2.start:
        return b2.start - b1.end - 1
    if b1.start > b2.end:
        return b1.start - b2.end - 1
    sys.stderr.write("ERROR un accounted for state\n")
    # Exit non-zero: a bare sys.exit() would report success to the caller's
    # shell even though we just printed an error.
    sys.exit(1)
def set_conversion_string(self, conversion_string):
    """Store the conversion string and derive the ARS name and bounds from it.

    ``conversion_string`` is split on ``/``; each part must look like
    ``<chrom>,<start>-<end>|<strand>`` where start and end are digits and
    strand is ``+`` or ``-``.  One ``Bed`` per part is appended to
    ``self.bounds`` in order.

    Args:
        conversion_string: the encoded list of bounds.

    Raises:
        ValueError: if a part does not have the expected form.
    """
    self.conversion_string = conversion_string
    self.ars_name = encode_ars_name(conversion_string, self.name)
    self.bounds = []
    # Raw string so the regex escapes are not read as (invalid) string
    # escapes; compiled once instead of once per part.
    part_pattern = re.compile(r'^([^,]+),(\d+)-(\d+)\|([+-])$')
    for part in conversion_string.split('/'):
        m = part_pattern.match(part)
        if m is None:
            # Fail with a readable message instead of an AttributeError on
            # None further down.
            raise ValueError(
                "malformed conversion string part: " + repr(part))
        self.bounds.append(
            Bed(m.group(1), int(m.group(2)), int(m.group(3)), m.group(4)))
def get_info_string(self):
    """Return a human-readable, multi-line description of this fuzzy genepred.

    Sections, in order: header and genepred count, total bounds, start
    statistics (support, mean, mode, range), end statistics, then for each
    fuzzy junction its modal positions, ranges, read support and any
    attached start/end ranges.
    """
    report = []
    emit = report.append

    emit("== FUZZY GENEPRED INFO ==" + "\n")
    emit(str(len(self.gpds)) + ' total GPDs' + "\n")
    totalbounds = Bed(self.start.chr, self.start.start - 1, self.end.end,
                      self.start.direction)
    emit(totalbounds.get_range_string() + " total bounds\n")

    # start and end share the same layout
    starts = self.start.get_payload()
    emit('---- start ----' + "\n")
    emit(str(len(starts)) + " reads supporting start" + "\n")
    emit(' ' + str(mean(starts)) + ' mean' + "\n")
    emit(' ' + str(mode(starts)) + ' mode' + "\n")
    emit(' ' + self.start.get_range_string() + " start range\n")

    ends = self.end.get_payload()
    emit('---- end ----' + "\n")
    emit(str(len(ends)) + " reads supporting end" + "\n")
    emit(' ' + str(mean(ends)) + ' mean' + "\n")
    emit(' ' + str(mode(ends)) + ' mode' + "\n")
    emit(' ' + self.end.get_range_string() + " end range\n")

    emit('---- junctions ----' + "\n")
    emit(str(len(self.fuzzy_junctions)) + ' total fuzzy junctions' + "\n")
    for position, j in enumerate(self.fuzzy_junctions, 1):
        left = j.left.get_payload()
        right = j.right.get_payload()
        emit(' ' + str(position) + '. ' + str(mode(left['junc'])) + " ^ " +
             str(mode(right['junc'])) + "\n")
        emit(" " + j.left.get_range_string() + " ^ " +
             j.right.get_range_string() + "\n")
        emit(" " + str(len(left['junc'])) + " read support" + "\n")
        attached_start = left['start']
        if attached_start:
            emit(" " + "---starts----" + "\n")
            emit(" " + str(len(attached_start.get_payload())) +
                 " starts at " + attached_start.get_range_string() + "\n")
        attached_end = right['end']
        if attached_end:
            emit(" " + "---ends----" + "\n")
            emit(" " + str(len(attached_end.get_payload())) +
                 " ends at " + attached_end.get_range_string() + "\n")
    return ''.join(report)
def get_beds_from_entry(entry, use_direction=False):
    """Build per-block query and target ``Bed`` ranges from a PSL entry dict.

    Args:
        entry: mapping with ``blockCount``, ``blockSizes``, ``tName``,
            ``tStarts``, ``qName``, ``qStarts_actual`` and (when
            ``use_direction`` is true) ``strand``.
        use_direction: when true, target beds carry the entry's strand.

    Returns:
        A two-item list ``[query_beds, target_beds]`` with one ``Bed`` per
        alignment block in each list, in block order.
    """
    query_beds = []
    target_beds = []
    for i in range(entry['blockCount']):
        size = entry['blockSizes'][i]
        t_start = entry['tStarts'][i]
        if use_direction:
            tb = Bed(entry['tName'], t_start, t_start + size, entry['strand'])
        else:
            tb = Bed(entry['tName'], t_start, t_start + size)
        target_beds.append(tb)
        q_start = entry['qStarts_actual'][i]
        qb = Bed(entry['qName'], q_start, q_start + size)
        # Append the QUERY bed here; the old code appended the target bed
        # again, so query_beds was just a copy of target_beds.
        query_beds.append(qb)
    return [query_beds, target_beds]
def window_break(inranges, window_size):
    """Cut each input range into consecutive windows of ``window_size``.

    For every range, windows are emitted from ``range.start`` in steps of
    ``window_size`` for as long as ``start + window_size`` is still below
    ``range.end``; whatever remains after the last such window is not
    emitted.

    Args:
        inranges: iterable of ranges exposing ``chr``, ``start`` and ``end``.
        window_size: step between window starts.

    Returns:
        A list of ``Bed(chr, start, start + window_size - 1)`` windows.
    """
    windows = []
    for source in inranges:
        left = source.start
        limit = source.end
        while left + window_size < limit:
            windows.append(Bed(source.chr, left, left + window_size - 1))
            left += window_size
    return windows
def get_query_bed(self):
    """Return the span of the query covered by this alignment as a ``Bed``.

    By default the span runs from the first ``qStarts_actual`` value to the
    last one plus the last block size.  On the ``-`` strand both ends are
    instead derived from ``qStarts`` through
    ``convert_coordinate_query_to_actual_query``.
    """
    actual_starts = self.value('qStarts_actual')
    block_sizes = self.value('blockSizes')
    low = actual_starts[0]
    high = actual_starts[-1] + block_sizes[-1]
    if self.value('strand') == '-':
        raw_starts = self.value('qStarts')
        to_actual = self.convert_coordinate_query_to_actual_query
        low = to_actual(raw_starts[-1] + block_sizes[-1]) - 1
        high = to_actual(raw_starts[0] + 1)
    return Bed(self.value('qName'), low, high)
def add_junction(self, inchr, inleft, inright, indir=None):
    """Add one observed junction to this fuzzy junction.

    The first junction initialises ``self.left`` and ``self.right`` with
    ranges ``(pos - 1, pos)`` whose payloads record the position under
    ``"junc"`` and leave ``"start"`` / ``"end"`` unset.  Any later junction
    is wrapped in a new ``FuzzyJunction`` and merged through
    ``add_fuzzy_junction``.

    Args:
        inchr: chromosome name.
        inleft: left junction position.
        inright: right junction position.
        indir: optional direction passed on to the ranges.
    """
    if not self.left:  # this is our first one
        self.left = Bed(inchr, inleft - 1, inleft, indir)
        self.left.set_payload({"junc": [inleft], "start": None})
        self.right = Bed(inchr, inright - 1, inright, indir)
        self.right.set_payload({"junc": [inright], "end": None})
        return
    # Lets add this one to our current one.  (The chromosome argument was
    # previously misspelled ``inchar``, which raised NameError here.)
    newfuz = FuzzyJunction(inchr, inleft, inright, indir)
    self.add_fuzzy_junction(newfuz)
def query_overlap_size(self, psl2):
    """Sum the query overlap between every block of self and of ``psl2``.

    Args:
        psl2: the other PSL entry.

    Returns:
        ``0`` if the two entries have different query names; otherwise the
        sum of ``overlap_size`` over all pairs of query blocks.
    """
    if self.value('qName') != psl2.value('qName'):
        return 0

    def query_block(psl, index):
        # Range covered on the query by alignment block ``index``.
        begin = psl.value('qStarts_actual')[index]
        return Bed(psl.value('qName'), begin,
                   begin + psl.value('blockSizes')[index])

    # on same query
    total = 0
    for i in range(self.value('blockCount')):
        for j in range(psl2.value('blockCount')):
            total += query_block(self, i).overlap_size(query_block(psl2, j))
    return total
def target_overlap_size(self, psl2, use_direction=False):
    """Sum the target overlap between every block of self and of ``psl2``.

    Args:
        psl2: the other PSL entry.
        use_direction: when true, entries on different strands do not
            overlap.

    Returns:
        ``0`` if the target names differ (or the strands differ while
        ``use_direction`` is set); otherwise the sum of ``overlap_size``
        over all pairs of target blocks.
    """
    if self.value('tName') != psl2.value('tName'):
        return 0
    if use_direction and self.value('strand') != psl2.value('strand'):
        return 0

    def target_block(psl, index):
        # Range covered on the target by alignment block ``index``.
        begin = psl.value('tStarts')[index]
        return Bed(psl.value('tName'), begin,
                   begin + psl.value('blockSizes')[index])

    # on same chromosome
    total = 0
    for i in range(self.value('blockCount')):
        for j in range(psl2.value('blockCount')):
            total += target_block(self, i).overlap_size(target_block(psl2, j))
    return total
def copy(self):
    """Return a duplicate of this fuzzy junction.

    The left and right ranges are rebuilt, their ``'junc'`` lists are
    copied item by item, and any attached ``'start'`` (left side) or
    ``'end'`` (right side) range is rebuilt together with its payload list.
    """
    def rebuild(bed):
        # New Bed covering the same span as ``bed`` (payload not copied).
        return Bed(bed.chr, bed.start - 1, bed.end, bed.direction)

    duplicate = FuzzyJunction()
    duplicate.chr = self.chr

    # left side
    source_left = self.left.get_payload()
    duplicate.left = rebuild(self.left)
    duplicate.left.set_payload({'junc': [], 'start': None})
    duplicate.left.get_payload()['junc'].extend(source_left['junc'])

    duplicate.right = rebuild(self.right)

    # copy any starts for the junction
    if source_left['start']:
        start_copy = rebuild(source_left['start'])
        duplicate.left.get_payload()['start'] = start_copy
        start_copy.set_payload([])
        start_copy.get_payload().extend(source_left['start'].get_payload())

    # right side
    source_right = self.right.get_payload()
    duplicate.right.set_payload({'junc': [], 'end': None})
    duplicate.right.get_payload()['junc'].extend(source_right['junc'])

    # copy any ends for the junction
    if source_right['end']:
        end_copy = rebuild(source_right['end'])
        duplicate.right.get_payload()['end'] = end_copy
        end_copy.set_payload([])
        end_copy.get_payload().extend(source_right['end'].get_payload())

    return duplicate
def main():
    """Write one artificial reference sequence per genepred line as fasta.

    Command line: ``gpd_file reference_fasta [-o OUTPUT]``.  For every line
    of the genepred file an ``ARS`` is built from the entry's exons, named
    after the entry, filled from the reference sequences and written to the
    output (``OUTPUT`` if given, otherwise standard output).
    """
    parser = argparse.ArgumentParser(
        description='Create artifical reference sequences from a genepred')
    parser.add_argument('gpd_file')
    parser.add_argument('reference_fasta')
    parser.add_argument(
        '-o', '--output',
        help="output file to write to or STDOUT if not set")
    args = parser.parse_args()

    of = sys.stdout
    if args.output:
        of = open(args.output, 'w')
    try:
        f = read_fasta_into_hash(args.reference_fasta)
        with open(args.gpd_file) as inf:
            for line in inf:
                gpd = GenePredBasics.GenePredEntry()
                gpd.line_to_entry(line.rstrip())
                ars = ARS()
                beds = []
                for i in range(0, gpd.value('exonCount')):
                    beds.append(Bed(gpd.value('chrom'),
                                    gpd.value('exonStarts')[i],
                                    gpd.value('exonEnds')[i],
                                    gpd.value('strand')))
                ars.set_bounds(beds)
                ars.set_name(gpd.value('name'))
                ars.set_sequence_from_original_reference_hash(f)
                of.write(ars.get_fasta())
    finally:
        # Close (and thereby flush) the file we opened ourselves, even when
        # processing fails; never close sys.stdout.
        if args.output:
            of.close()
def add_junction(self, inchr, inleft, inright, indir=None):
    """Record one observed junction in this fuzzy junction.

    When nothing has been recorded yet, ``self.left`` and ``self.right``
    are created as ranges ``(pos - 1, pos)`` with payload dicts holding the
    position under ``'junc'`` and ``None`` under ``'start'`` / ``'end'``.
    Otherwise the junction is turned into a ``FuzzyJunction`` and merged in
    with ``add_fuzzy_junction``.

    Args:
        inchr: chromosome name.
        inleft: left junction position.
        inright: right junction position.
        indir: optional direction passed on to the ranges.
    """
    if not self.left:  # this is our first one
        left_payload = {'junc': [inleft], 'start': None}
        right_payload = {'junc': [inright], 'end': None}
        self.left = Bed(inchr, inleft - 1, inleft, indir)
        self.left.set_payload(left_payload)
        self.right = Bed(inchr, inright - 1, inright, indir)
        self.right.set_payload(right_payload)
        return
    # Lets add this one to our current one.  The first argument used to be
    # the undefined name ``inchar`` (NameError on every second junction).
    self.add_fuzzy_junction(FuzzyJunction(inchr, inleft, inright, indir))
def read_first(self, ingpd):
    """Initialise this fuzzy genepred from the first genepred it receives.

    The entry is stored, its simple junction key (if any) is remembered,
    the strand is adopted when ``use_dir`` is set, a ``FuzzyJunction`` is
    created between each pair of neighbouring exons, and the start and end
    ranges are created with the transcript start/end as their first
    supporting value.

    Args:
        ingpd: genepred entry exposing ``value(field)``.
    """
    self.gpds.append(ingpd)
    junction_key = get_simple_junction(ingpd)
    if junction_key:
        self.simple_junction_set.add(junction_key)
    if self.params['use_dir']:
        self.dir = ingpd.value('strand')

    chrom = ingpd.value('chrom')
    starts = ingpd.value('exonStarts')
    ends = ingpd.value('exonEnds')
    tx_start = ingpd.value('txStart')
    tx_end = ingpd.value('txEnd')

    def supported(low, high, support):
        # Range on ``chrom`` whose payload is a list holding one value.
        bed = Bed(chrom, low, high, self.dir)
        bed.set_payload([])
        bed.get_payload().append(support)
        return bed

    # add fuzzy junctions
    for i in range(len(starts) - 1):
        self.fuzzy_junctions.append(
            FuzzyJunction(chrom, ends[i], starts[i + 1] + 1, self.dir))

    if len(starts) > 1:  # we have junctions
        # first junction keeps the transcript start, last one the end
        self.fuzzy_junctions[0].left.get_payload()['start'] = supported(
            tx_start, tx_start + 1, tx_start + 1)
        self.fuzzy_junctions[-1].right.get_payload()['end'] = supported(
            tx_end - 1, tx_end, tx_end)

    # add fuzzy starts
    self.start = supported(tx_start, tx_start + 1, tx_start + 1)
    self.end = supported(tx_end - 1, tx_end, tx_end)
def get_range(self):
    """Return the reference span of this alignment as a ``Bed``.

    The span starts at ``pos - 1`` and is extended by the length of every
    CIGAR operation whose code starts with one of ``M``, ``D``, ``N``,
    ``X`` or ``=``.
    """
    begin = self.value('pos') - 1
    span = sum(
        op['val']
        for op in self.value('cigar_array')
        if re.match('[MDNX=]', op['op'])
    )
    return Bed(self.value('rname'), begin, begin + span, self.strand())
def _tally_depth(curr, regions, finished_depths, depth, use_off_regions,
                 on_finished=None):
    """Fold one depth interval into the front of a sorted region list.

    Regions that ``curr`` has already passed are popped and their average
    depth is appended to ``finished_depths`` (regions with no recorded
    depth are dropped unless ``use_off_regions`` is set).  If ``curr``
    overlaps the region now at the front, ``depth`` is recorded once per
    overlapping base in that region's payload.
    """
    # Check for emptiness BEFORE indexing: the old condition indexed
    # regions[0] first and raised IndexError once the list was drained.
    while regions and curr.cmp(regions[0]) > 0:  # we've passed the region
        done = regions.pop(0)
        if len(done.get_payload()) == 0 and not use_off_regions:
            continue
        # NOTE(review): regions are passed to average() as objects here but
        # as payload lists at the end of main() - confirm which is intended.
        finished_depths.append(average(done))
        if on_finished is not None:
            on_finished()
    if regions and curr.cmp(regions[0]) == 0:  # overlaps the front region
        size = curr.overlap_size(regions[0])
        regions[0].get_payload().extend([depth] * size)


def main():
    """Estimate read depth in intronic and intergenic windows.

    Reads a genepred file to derive gene, exon, intron and intergenic
    ranges, then streams the output of ``sam_to_bed_depth.py`` for the
    given BAM file and accumulates the depth observed in each window.
    Progress is written to standard error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('gpd_input')
    parser.add_argument('bam_input')
    parser.add_argument('--intergenic_buffer', default=10000, type=int)
    parser.add_argument('--window_size', default=10000, type=int)
    parser.add_argument('--bin_size', default=1000, type=int)
    parser.add_argument(
        '--use_off_regions',
        action='store_true',
        help="Use a region even if there is no reads mapped to it.")
    parser.add_argument('--get_exons', action='store_true')
    args = parser.parse_args()

    chr_beds = {}
    gene_beds = []
    exon_beds = []
    sys.stderr.write("Reading genepred file\n")
    asum = 0
    atot = 0
    with open(args.gpd_input) as inf:
        for line in inf:
            g = GenePredEntry(line)
            asum += g.length()
            atot += 1
            grng = g.get_bed()
            grng.direction = None
            if grng.chr not in chr_beds:
                chr_beds[grng.chr] = grng.copy()
            chr_beds[grng.chr] = chr_beds[grng.chr].merge(grng)
            gene_beds.append(grng)
            for i in range(0, g.get_exon_count()):
                erng = Bed(g.value('chrom'), g.value('exonStarts')[i],
                           g.value('exonEnds')[i])
                exon_beds.append(erng)
    if atot == 0:
        # Fail with a message instead of a ZeroDivisionError traceback.
        sys.stderr.write("ERROR: no genepred entries found in " +
                         args.gpd_input + "\n")
        sys.exit(1)
    avglen = float(asum) / float(atot)

    sys.stderr.write("Sorting gene bed\n")
    gene_beds = sort_ranges(gene_beds)
    gene_beds = merge_ranges(gene_beds, already_sorted=True)
    sys.stderr.write("Sorting chromosome beds\n")
    chr_beds = sort_ranges([chr_beds[x] for x in chr_beds.keys()])
    sys.stderr.write("Sorting exon beds\n")
    exon_beds = sort_ranges(exon_beds)
    sys.stderr.write("Get padded genes\n")
    padded_gene_beds = pad_ranges(gene_beds, args.intergenic_buffer, chr_beds)
    padded_gene_beds = merge_ranges(padded_gene_beds, already_sorted=True)
    sys.stderr.write("Get intergenic regions\n")
    intergenic_beds = subtract_ranges(chr_beds, padded_gene_beds,
                                      already_sorted=True)
    intergenic_beds = merge_ranges(intergenic_beds, already_sorted=True)
    intergenic_beds = window_break(intergenic_beds, args.window_size)
    sys.stderr.write("Get merged exons\n")
    exon_beds = merge_ranges(exon_beds)
    sys.stderr.write("Get introns\n")
    intron_beds = subtract_ranges(gene_beds, exon_beds, already_sorted=True)
    intron_beds = merge_ranges(intron_beds, already_sorted=True)
    intron_beds = window_break(intron_beds, args.window_size)

    sys.stderr.write("Going through short reads\n")
    # Pass the arguments as a list so a BAM path containing whitespace is
    # not split into several arguments.
    p = Popen(["sam_to_bed_depth.py", args.bam_input], stdout=PIPE)
    for x in intron_beds:
        x.set_payload([])  # payloads are read depths
    for x in intergenic_beds:
        x.set_payload([])  # payloads are read depths
    for x in exon_beds:
        x.set_payload([])  # payloads are read depths
    introndepth = []
    intergenicdepth = []
    exondepth = []
    pseudoreadcount = 0
    if not args.get_exons:
        exon_beds = []

    section_count = 0
    while True:
        section_count += 1
        line = p.stdout.readline()
        if not line:
            break
        f = line.split("\t")
        depth = int(f[3])
        curr = Bed(f[0], int(f[1]), int(f[2]))
        if section_count % 100 == 0:
            sys.stderr.write(curr.get_range_string() + " \r")
        pseudoreadcount += depth
        _tally_depth(curr, exon_beds, exondepth, depth, args.use_off_regions)
        _tally_depth(curr, intron_beds, introndepth, depth,
                     args.use_off_regions)
        # Progress is reported each time an intergenic window is completed.
        _tally_depth(
            curr, intergenic_beds, intergenicdepth, depth,
            args.use_off_regions,
            on_finished=lambda: display(curr, introndepth, intergenicdepth,
                                        pseudoreadcount, avglen))

    if args.use_off_regions:
        # Flush whatever regions were never passed by the depth stream.
        for x in exon_beds:
            # exon depths belong in exondepth (they used to be appended to
            # introndepth, skewing the intron statistics)
            exondepth.append(average(x.get_payload()))
        for x in intron_beds:
            introndepth.append(average(x.get_payload()))
        for x in intergenic_beds:
            intergenicdepth.append(average(x.get_payload()))
    p.communicate()
def get_target_bed(self):
    """Return the target span of this alignment as a stranded ``Bed``.

    The range is built from the entry's ``tName``, ``tStart``, ``tEnd`` and
    ``strand`` values, in that order.
    """
    fields = [self.value(key) for key in ('tName', 'tStart', 'tEnd', 'strand')]
    return Bed(*fields)
class FuzzyGenePred: #set use_dir true if you want to use direction and make it direction specific #set proper_set false if you want to do awesome extending that doesn't really work yet def __init__(self, ingpd=None, params=None, juntol=10): # Here is the basic data self.fuzzy_junctions = [] self.gpds = [] #contributing member genepreds self.start = None self.end = None self.dir = None # Higher level data self.simple_junction_set = set( ) # quickly search for if a multi exon gene has been added #Here is the parameters self.params = {} self.params['use_dir'] = False self.params['junction_tolerance'] = juntol #Not fully implemented. Do we require a full length match self.params['proper_set'] = True # Define thresholds for overlapping single exons self.params['do_add_single_exon'] = True self.params['single_exon_minimum_length'] = 200 self.params[ 'single_exon_minimum_overlap_fraction'] = 0.8 #reciprocal ... must be this fraction or more on both self.params[ 'single_exon_minimum_overlap_bases'] = 1 #minimum number of bases self.params['single_exon_maximum_endpoint_distance'] = 1000 if params: for pname in params: self.params[pname] = params[pname] if ingpd: self.add_gpd(ingpd) def get_genepred_line(self, end_select='extremes', junction_select='mode', name=None): if not name: name = 'fuzGPD_' + random_string(8) + '_' + str( len(self.fuzzy_junctions) + 1) + '_' + str(len(self.gpds)) ostr = '' ostr += name + "\t" ostr += name + "\t" ostr += self.start.chr + "\t" ostr += self.gpds[0].value('strand') + "\t" ostr += str(self.start.start - 1) + "\t" ostr += str(self.end.end) + "\t" ostr += str(self.start.start - 1) + "\t" ostr += str(self.end.end) + "\t" ostr += str(len(self.fuzzy_junctions) + 1) + "\t" exonstarts = [] exonends = [] exonstarts.append(self.start.start - 1) for j in self.fuzzy_junctions: exonends.append(mode(j.left.get_payload()['junc'])) exonstarts.append(mode(j.right.get_payload()['junc']) - 1) exonends.append(self.end.end) ostr += ','.join([str(x) for x in 
exonstarts]) + ',' + "\t" ostr += ','.join([str(x) for x in exonends]) + ',' return ostr # Return a copy of the fuzzy geneprep def copy(self): g = FuzzyGenePred() # start with a blank one why not # get the settings for pname in self.params: g.params[pname] = self.params[pname] # copy the genepreds for orig in self.gpds: g.gpds.append(GenePredEntry(orig.get_line())) #store direction g.dir = self.dir # copy the fuzzy junctions for orig in self.fuzzy_junctions: g.fuzzy_junctions.append(orig.copy()) # copy the simple junction set for orig in self.simple_junction_set: g.simple_junction_set.add(orig) # copy the start if self.start: g.start = Bed(self.start.chr,\ self.start.start-1,\ self.start.end,\ self.start.direction) g.start.set_payload([]) for v in self.start.get_payload(): g.start.get_payload().append(v) # copy the end if self.end: g.end = Bed(self.end.chr, self.end.start - 1, self.end.end, self.end.direction) g.end.set_payload([]) for v in self.end.get_payload(): g.end.get_payload().append(v) return g def exon_count(self): return len(self.fuzzy_junctions) + 1 def gpd_count(self): return len(self.gpds) def get_bed(self): return Bed(self.start.chr, self.start.start - 1, self.end.end, self.start.direction) #This is an inspection tool for a fuzzy gpd def get_info_string(self): ostr = '' ostr += "== FUZZY GENEPRED INFO ==" + "\n" ostr += str(len(self.gpds)) + ' total GPDs' + "\n" totalbounds = Bed(self.start.chr, self.start.start - 1, self.end.end, self.start.direction) ostr += totalbounds.get_range_string() + " total bounds\n" ostr += '---- start ----' + "\n" ostr += str(len( self.start.get_payload())) + " reads supporting start" + "\n" ostr += ' ' + str(mean(self.start.get_payload())) + ' mean' + "\n" ostr += ' ' + str(mode(self.start.get_payload())) + ' mode' + "\n" ostr += ' ' + self.start.get_range_string() + " start range\n" ostr += '---- end ----' + "\n" ostr += str(len( self.end.get_payload())) + " reads supporting end" + "\n" ostr += ' ' + 
str(mean(self.end.get_payload())) + ' mean' + "\n" ostr += ' ' + str(mode(self.end.get_payload())) + ' mode' + "\n" ostr += ' ' + self.end.get_range_string() + " end range\n" ostr += '---- junctions ----' + "\n" ostr += str(len( self.fuzzy_junctions)) + ' total fuzzy junctions' + "\n" cnt = 0 for j in self.fuzzy_junctions: cnt += 1 ostr += ' ' + str(cnt) + '. ' + str( mode(j.left.get_payload()['junc'])) + " ^ " + str( mode(j.right.get_payload()['junc'])) + "\n" ostr += " " + j.left.get_range_string( ) + " ^ " + j.right.get_range_string() + "\n" ostr += " " + str(len( j.left.get_payload()['junc'])) + " read support" + "\n" if j.left.get_payload()['start']: ostr += " " + "---starts----" + "\n" ostr += " " + str( len(j.left.get_payload()['start'].get_payload()) ) + " starts at " + j.left.get_payload( )['start'].get_range_string() + "\n" if j.right.get_payload()['end']: ostr += " " + "---ends----" + "\n" ostr += " " + str( len(j.right.get_payload()['end'].get_payload()) ) + " ends at " + j.right.get_payload( )['end'].get_range_string() + "\n" return ostr #Add a new gpd return true if successful #Return false if it didn't work, return the new combined if it worked def add_gpd(self, ingpd): if len(self.gpds) == 0: # first one self.read_first(ingpd) return self #return ourself if we are adding our first # more difficult situation where we must try to combine # See if it can match first before actually adding stuff to it #if self. 
newfuz = FuzzyGenePred(ingpd, params=self.params) output = self.add_fuzzy_gpd(newfuz) return output # combine together compatible overlapping sets def concat_fuzzy_gpd(self, fuz2): if len(fuz2.fuzzy_junctions) == 0 and len(self.fuzzy_junctions) != 0: return False if len(fuz2.fuzzy_junctions) != 0 and len(self.fuzzy_junctions) == 0: return False # Lets work combine the single exon step and exit if len(fuz2.fuzzy_junctions) == 0 and len(self.fuzzy_junctions) == 0: return self.do_add_single_exon_fuzzy_gpd(fuz2) # For now don't add them if one is single exon if len(self.fuzzy_junctions) == 0 or len(fuz2.fuzzy_junctions) == 0: return False # See if its already a subset easy_subset = False for simplejunction in fuz2.simple_junction_set: if simplejunction in self.simple_junction_set: easy_subset = True # If its not already a subset look deeper #1. First we need perfect junctions for a run of them if not easy_subset: if not self.compatible_overlap(fuz2): return False # still here. we will work on combining these output = self.copy() # first lets put add any overlapping junctions for i in range(0, len(output.fuzzy_junctions)): for j in range(0, len(fuz2.fuzzy_junctions)): if output.fuzzy_junctions[i].overlaps( fuz2.fuzzy_junctions[j], fuz2.params['junction_tolerance']): output.fuzzy_junctions[i].add_fuzzy_junction( fuz2.fuzzy_junctions[j]) if j == 0: # put the start in too if not output.fuzzy_junctions[i].left.get_payload( )['start']: output.fuzzy_junctions[i].left.get_payload( )['start'] = fuz2.start.copy() else: # merge starts = output.fuzzy_junctions[ i].left.get_payload()['start'].get_payload() for v in fuz2.start.get_payload(): starts.append(v) nrange = output.fuzzy_junctions[ i].left.get_payload()['start'].merge( fuz2.start) nrange.set_payload(starts[:]) output.fuzzy_junctions[i].left.get_payload( )['start'] = nrange if j == len( fuz2.fuzzy_junctions) - 1: # put the end in too if not output.fuzzy_junctions[i].right.get_payload( )['end']: 
output.fuzzy_junctions[i].right.get_payload( )['end'] = fuz2.end.copy() else: # merge ends = output.fuzzy_junctions[i].right.get_payload( )['end'].get_payload() for v in fuz2.end.get_payload(): ends.append(v) nrange = output.fuzzy_junctions[ i].right.get_payload()['end'].merge(fuz2.end) nrange.set_payload(ends[:]) output.fuzzy_junctions[i].right.get_payload( )['end'] = nrange # see if we should build onto the left leftnum = -1 leftmost = self.fuzzy_junctions[0] if fuz2.fuzzy_junctions[0].right.end < leftmost.left.start: for i in range(0, len(fuz2.fuzzy_junctions)): if fuz2.fuzzy_junctions[i].overlaps( leftmost, fuz2.params['junction_tolerance']): leftnum = i break #leftnum is now -1 if no additions to the left zero if it starts on the same if leftnum > 0: for i in reversed(range(0, leftnum)): output.fuzzy_junctions.insert(0, fuz2.fuzzy_junctions[i].copy()) output.start = fuz2.start.copy() rightnum = -1 # get the right point ... our first one comes after this rightmost = self.fuzzy_junctions[-1] if fuz2.fuzzy_junctions[-1].left.start > rightmost.right.end: for i in reversed(range(0, len(fuz2.fuzzy_junctions))): if fuz2.fuzzy_junctions[i].overlaps( rightmost, fuz2.params['junction_tolerance']): rightnum = i break if rightnum != -1: rightnum += 1 if rightnum < len(fuz2.fuzzy_junctions): for i in range(rightnum, len(fuz2.fuzzy_junctions)): output.fuzzy_junctions.append( fuz2.fuzzy_junctions[i].copy()) output.end = fuz2.end.copy() #print leftnum #print rightnum #print fuz2.params['junction_tolerance'] #print 'combining' return output # add together subsets def add_fuzzy_gpd(self, fuz2): # see if we can add this fuzzy gpd to another # We treat single exon genes seprately so if only one of them is # single exon we can't compare them if len(fuz2.fuzzy_junctions) == 0 and len(self.fuzzy_junctions) != 0: return False if len(fuz2.fuzzy_junctions) != 0 and len(self.fuzzy_junctions) == 0: return False # Lets work combine the single exon step and exit if 
len(fuz2.fuzzy_junctions) == 0 and len(self.fuzzy_junctions) == 0: return self.do_add_single_exon_fuzzy_gpd(fuz2) # For now don't add them if one is single exon if len(self.fuzzy_junctions) == 0 or len(fuz2.fuzzy_junctions) == 0: return False # See if its already a subset easy_subset = False for simplejunction in fuz2.simple_junction_set: if simplejunction in self.simple_junction_set: easy_subset = True # If its not already a subset look deeper #1. First we need perfect junctions for a run of them if not easy_subset: if not self.compatible_overlap(fuz2): return False # still here. we will work on combining these output = self.copy() #switch over to working on the output now # If we are still here we can add the two of them together # If they have the same starting junction we can add their starting points together if output.fuzzy_junctions[0].overlaps( fuz2.fuzzy_junctions[0], output.params['junction_tolerance']): #print 'samestart' newstart = output.start.merge(fuz2.start) newstart.set_payload(output.start.get_payload()) for s in fuz2.start.get_payload(): newstart.get_payload().append(s) output.start = newstart # Check if the other one is new start elif mode(fuz2.fuzzy_junctions[0].left.get_payload()['junc']) < mode( output.fuzzy_junctions[0].left.get_payload()['junc']): #print "2 start" output.start = fuz2.start elif mode(fuz2.fuzzy_junctions[0].left.get_payload()['junc']) > mode( output.fuzzy_junctions[0].left.get_payload()['junc']): True # #print "1 start" # #we're good to go else: sys.stderr.write("WARNING: strange start case abort merge\n") return False # lets work the ends now if output.fuzzy_junctions[-1].overlaps( fuz2.fuzzy_junctions[-1], output.params['junction_tolerance']): #print 'sameend' newend = output.end.merge(fuz2.end) newend.set_payload(output.end.get_payload()) for s in fuz2.end.get_payload(): newend.get_payload().append(s) output.end = newend # Check if the other one is new start elif mode(fuz2.fuzzy_junctions[-1].right.get_payload()['junc']) 
> mode( output.fuzzy_junctions[-1].right.get_payload()['junc']): #print "2 end" output.end = fuz2.end elif mode(fuz2.fuzzy_junctions[-1].right.get_payload()['junc']) < mode( output.fuzzy_junctions[-1].right.get_payload()['junc']): True # #print "1 end" # #we're good to go else: sys.stderr.write("WARNING: strange end case abort merge\n") u1 = mode(output.fuzzy_junctions[-1].left.get_payload()['junc']) u2 = mode(fuz2.fuzzy_junctions[-1].left.get_payload()['junc']) v1 = mode(output.fuzzy_junctions[-1].right.get_payload()['junc']) v2 = mode(fuz2.fuzzy_junctions[-1].right.get_payload()['junc']) sys.stderr.write(str(u1) + "\t" + str(u2) + "\n") sys.stderr.write(str(v1) + "\t" + str(v2) + "\n") return False # now the starts and ends have been updated in output. # iterate through the junctions. # check for a left overhang. numfuz2left = 0 numoutleft = 0 if not output.fuzzy_junctions[0].overlaps( fuz2.fuzzy_junctions[0], output.params['junction_tolerance']): # see if we need to add sequences from fuz2 if mode(fuz2.fuzzy_junctions[0].left.get_payload()['junc']) < mode( output.fuzzy_junctions[0].left.get_payload()['junc']): #print 'left over2' i = 0 while not output.fuzzy_junctions[0].overlaps( fuz2.fuzzy_junctions[i], output.params['junction_tolerance']) and i < len( fuz2.fuzzy_junctions): i += 1 numfuz2left = i # number to push on from the fuz2 and increment in #print numfuz2left elif mode( fuz2.fuzzy_junctions[0].left.get_payload()['junc']) > mode( output.fuzzy_junctions[0].left.get_payload()['junc']): #print 'left over1' i = 0 while not output.fuzzy_junctions[i].overlaps( fuz2.fuzzy_junctions[0], output.params['junction_tolerance']) and i < len( output.fuzzy_junctions): i += 1 numoutleft = i # number to increment in from output #print numoutleft else: sys.stderr.write("WARNING: strange case \n") return False # next we can check how long we have a run of the same ind1 = numoutleft ind2 = numfuz2left overlap_size = 0 while ind1 < len(output.fuzzy_junctions) and ind2 < 
len(fuz2.fuzzy_junctions) \ and output.fuzzy_junctions[ind1].overlaps(fuz2.fuzzy_junctions[ind2],output.params['junction_tolerance']): overlap_size += 1 ind1 += 1 ind2 += 1 #print 'overlap size '+str(overlap_size) numoutright = len(output.fuzzy_junctions) - overlap_size - numoutleft numfuz2right = len(fuz2.fuzzy_junctions) - overlap_size - numfuz2left if min(numoutright, numfuz2right) != 0: sys.stderr.write("WARNING: expected one of them to be zero\n") #print self.get_info_string() #print '=====================' #print fuz2.get_info_string() #sys.exit() return False if min(numoutleft, numfuz2left) != 0: sys.stderr.write("WARNING: expected one of them to be zero\n") return False #print numoutright #print numfuz2right #print output.fuzzy_junctions[numoutleft].overlaps(fuz2.fuzzy_junctions[numfuz2left],output.junction_tolerance) #print 'add' #Now we have what we need to go through and do some updating #Lets just make new fuzzy junctions newjuncs = [] for i in range(0, numfuz2left): newjuncs.append(fuz2.fuzzy_junctions[i]) for i in range(0, numoutleft): newjuncs.append(output.fuzzy_junctions[i]) #Now we do both down the center range1 = range(numoutleft, overlap_size + numoutleft) range2 = range(numfuz2left, overlap_size + numfuz2left) for i in range(0, len(range1)): newjuncs.append(output.fuzzy_junctions[range1[i]]) newjuncs[-1].add_fuzzy_junction(fuz2.fuzzy_junctions[range2[i]]) #print i #Make the right size for i in range(overlap_size + numfuz2left, overlap_size + numfuz2left + numfuz2right): newjuncs.append(fuz2.fuzzy_junctions[i]) for i in range(overlap_size + numoutleft, overlap_size + numoutleft + numoutright): newjuncs.append(output.fuzzy_junctions[i]) output.fuzzy_junctions = newjuncs #print 'adding gpd '+str(len(fuz2.gpds))+' entries' for g in fuz2.gpds: output.gpds.append(g) sjun = get_simple_junction(g) if sjun: output.simple_junction_set.add(sjun) #print 'new entry' #print self.get_info_string() return output def do_add_single_exon_fuzzy_gpd(self, fuz2): if 
not self.params['do_add_single_exon']: return False # make sure we are allowed to be doing this #build the bounds from the average start and end s1 = mean(self.start.get_payload()) e1 = mean(self.end.get_payload()) s2 = mean(fuz2.start.get_payload()) e2 = mean(fuz2.end.get_payload()) l1 = e1 - s1 + 1 l2 = e2 - s2 + 1 if l1 < self.params['single_exon_minimum_length']: return False if l2 < self.params['single_exon_minimum_length']: return False if l1 < 1 or l2 < 1: return False #shouldn't happen chr1 = self.start.chr chr2 = self.end.chr if chr1 != chr2: return False #shouldn't happen r1 = Bed(chr1, s1 - 1, e1, self.dir) r2 = Bed(chr2, s2 - 1, e2, self.dir) over = r1.overlap_size(r2) if over < self.params['single_exon_minimum_overlap_bases']: return False #print r1.get_range_string() #print r2.get_range_string() cov = min(float(over) / float(l1), float(over) / float(l2)) if cov < self.params['single_exon_minimum_overlap_fraction']: return False if abs(e1 - e2) > self.params['single_exon_maximum_endpoint_distance']: return False if abs(s1 - s2) > self.params['single_exon_maximum_endpoint_distance']: return False #If we're still here, we can add result output = self.copy() newstart = output.start.merge(fuz2.start) newstart.set_payload([]) for s in output.start.get_payload(): newstart.get_payload().append(s) for s in fuz2.start.get_payload(): newstart.get_payload().append(s) newend = output.end.merge(fuz2.end) newend.set_payload([]) for e in output.end.get_payload(): newend.get_payload().append(e) for e in fuz2.end.get_payload(): newend.get_payload().append(e) output.start = newstart output.end = newend for gpd in fuz2.gpds: output.gpds.append(gpd) sjun = get_simple_junction(gpd) if sjun: output.simple_junction_set.add(gpd) return output #Return true if these fuzzy genepreds can be added together def compatible_overlap(self, fingpd): f1 = self f2 = fingpd #### Forget about trying zero exon cases for now if len(f1.fuzzy_junctions) == 0 or len(f2.fuzzy_junctions) == 0: 
return False #Find all matches matches = [] for i in range(0, len(f1.fuzzy_junctions)): for j in range(0, len(f2.fuzzy_junctions)): if f1.fuzzy_junctions[i].overlaps( f2.fuzzy_junctions[j], self.params['junction_tolerance']): matches.append([i, j]) # This is our matched junctions in f1 and f2 if len(matches) == 0: return False # Nothing matched.. certainly no overlap # This is the number of extra exons it would take in the middle of the run (shifts) if len(set([x[0] - x[1] for x in matches])) != 1: return False # Lets make sure all our exons are consecutive if len(matches) > 1: consec1 = list( set([ matches[i + 1][0] - matches[i][0] for i in range(0, len(matches) - 1) ])) consec2 = list( set([ matches[i + 1][1] - matches[i][1] for i in range(0, len(matches) - 1) ])) if len(consec1) != 1: return False if len(consec2) != 1: return False if consec1[0] != 1: return False if consec2[0] != 1: return False # one of them should be zero if not (matches[0][1] == 0 or matches[0][0] == 0): return False # and one of our last matches should be the last junction if not (len(f1.fuzzy_junctions) - 1 == matches[-1][0] or len(f2.fuzzy_junctions) - 1 == matches[-1][1]): return False #### most of the time we will probably be looking for a proper set #### unless we are extending the long read for isoform prediction if self.params['proper_set']: # check those last overhangs # one of the two needs to have the start and end points in the consecutive matches if (matches[0][0] == 0 and len(f1.fuzzy_junctions)-1 == matches[-1][0]) or \ (matches[0][1] == 0 and len(f2.fuzzy_junctions)-1 == matches[-1][1]): return True return False return True def read_first(self, ingpd): self.gpds.append(ingpd) sjun = get_simple_junction(ingpd) if sjun: self.simple_junction_set.add(sjun) if self.params['use_dir']: self.dir = ingpd.value('strand') # add fuzzy junctions chr = ingpd.value('chrom') for i in range(0, len(ingpd.value('exonStarts')) - 1): self.fuzzy_junctions.append( FuzzyJunction(chr, 
ingpd.value('exonEnds')[i], ingpd.value('exonStarts')[i + 1] + 1, self.dir)) if len(ingpd.value('exonStarts')) > 1: # we have junctions self.fuzzy_junctions[0].left.get_payload()['start'] = Bed( chr, ingpd.value('txStart'), ingpd.value('txStart') + 1, self.dir) self.fuzzy_junctions[0].left.get_payload()['start'].set_payload([]) self.fuzzy_junctions[0].left.get_payload()['start'].get_payload( ).append(ingpd.value('txStart') + 1) self.fuzzy_junctions[-1].right.get_payload()['end'] = Bed( chr, ingpd.value('txEnd') - 1, ingpd.value('txEnd'), self.dir) self.fuzzy_junctions[-1].right.get_payload()['end'].set_payload([]) self.fuzzy_junctions[-1].right.get_payload()['end'].get_payload( ).append(ingpd.value('txEnd')) # add fuzzy starts self.start = Bed(ingpd.value('chrom'), ingpd.value('txStart'), ingpd.value('txStart') + 1, self.dir) self.start.set_payload([]) self.start.get_payload().append(ingpd.value('txStart') + 1) self.end = Bed(ingpd.value('chrom'), ingpd.value('txEnd') - 1, ingpd.value('txEnd'), self.dir) self.end.set_payload([]) self.end.get_payload().append(ingpd.value('txEnd')) # Have finished reading in the first case # Pre: another fuzzy gpd # Post: True if they are all overlapping junctions def is_equal_fuzzy(self, fuz2, use_direction=False): if use_direction: if self.dir != fuz2.dir: return False if len(self.fuzzy_junctions) < 0: return False if len(fuz2.fuzzy_junctions) < 0: return False if len(self.fuzzy_junctions) != len(fuz2.fuzzy_junctions): return False for i in range(0, len(self.fuzzy_junctions)): if not self.fuzzy_junctions[i].overlaps( fuz2.fuzzy_junctions[i], self.params['junction_tolerance']): return False return True
def _collect_gene_junctions(transcripts):
    """Gather the outer bounds and junctions of one gene's transcripts.

    The chromosome and strand of the first transcript define the gene;
    transcripts on any other strand or chromosome are ignored.

    Returns a tuple ``(chrom, strand, starts, ends, junctions)`` where
    ``starts`` holds each kept transcript's first ``exonStarts`` value,
    ``ends`` holds its last ``exonEnds`` value, and ``junctions`` is the set
    of everything yielded by ``calculate_junctions()``.
    """
    chrom = transcripts[0].value('chrom')
    strand = transcripts[0].value('strand')
    starts = []
    ends = []
    junctions = set()
    for gpd in transcripts:
        if gpd.value('strand') != strand or gpd.value('chrom') != chrom:
            continue
        starts.append(gpd.value('exonStarts')[0])
        ends.append(gpd.value('exonEnds')[-1])
        junctions.update(gpd.calculate_junctions())
    return chrom, strand, starts, ends, junctions


def get_random_gpds_from_pair(pair, genes, ref):
    """Build fused gene predictions from a random junction in each gene of a pair.

    Args:
        pair: two-element sequence of keys into ``genes``; ``pair[0]`` is the
            left partner and ``pair[1]`` the right partner.
        genes: mapping from key to a non-empty list of genepred entries.
        ref: passed straight through to ``ARS(ref=...)``.

    Returns:
        ``[gpds, ars]`` where ``gpds`` is the result of
        ``make_new_genepreds`` and ``ars`` is the ``ARS`` object built here.

    Raises:
        IndexError: if either gene yields no junctions (the shuffled list is
            empty).
        AttributeError: if a junction string does not match the expected
            ``<name>:<digits>,<name>:<digits>`` layout (``re.match`` returns
            ``None``).
    """
    # NOTE(review): 500 looks like a flank, in bases, added around each
    # partner's bounds -- confirm units against Bed.
    flank = 500

    j1chrom, j1strand, j1starts, j1ends, j1s = _collect_gene_junctions(genes[pair[0]])
    j2chrom, j2strand, j2starts, j2ends, j2s = _collect_gene_junctions(genes[pair[1]])

    # Shuffle order (gene 1, then gene 2) is kept so seeded runs stay
    # reproducible.
    j1shuf = list(j1s)
    shuffle(j1shuf)
    j2shuf = list(j2s)
    shuffle(j2shuf)

    # Raw strings: '\d' in a normal literal is an invalid escape sequence.
    if j1strand == '+':
        m = re.match(r'[^:]+:(\d+)', j1shuf[0])
        fsite1 = int(m.group(1))
        left = Bed(j1chrom, min(j1starts) - flank, fsite1 + flank, j1strand)
    else:
        m = re.match(r'[^:]+:(\d+),[^:]+:(\d+)', j1shuf[0])
        fsite1 = int(m.group(2))
        left = Bed(j1chrom, fsite1 - flank, max(j1ends) + flank, j1strand)

    if j2strand == '+':
        m = re.match(r'[^:]+:(\d+),[^:]+:(\d+)', j2shuf[0])
        fsite2 = int(m.group(2))
        right = Bed(j2chrom, fsite2 - flank, max(j2ends) + flank, j2strand)
    else:
        m = re.match(r'[^:]+:(\d+),[^:]+:(\d+)', j2shuf[0])
        fsite2 = int(m.group(1))
        right = Bed(j2chrom, min(j2starts) - flank, fsite2 + flank, j2strand)

    [leftcomp, rightcomp] = get_compatible_transcripts(
        genes[pair[0]], fsite1, genes[pair[1]], fsite2)

    acf = ACF()
    acf.add_bounds(left)
    acf.add_bounds(right)

    ln = leftcomp[0].value('gene_name')
    rn = rightcomp[0].value('gene_name')
    site_string = (leftcomp[0].value('chrom') + ":" + str(fsite1)
                   + leftcomp[0].value('strand') + '/'
                   + rightcomp[0].value('chrom') + ":" + str(fsite2)
                   + rightcomp[0].value('strand'))
    ars = ARS(ref=ref,
              conversion_string=acf.get_conversion_string(),
              name=ln + "," + rn + "," + site_string)
    gpds = make_new_genepreds(leftcomp, fsite1, rightcomp, fsite2, ars)
    return [gpds, ars]
def _record_depth(curr, regions, depth, averages, use_off_regions, on_average=None):
    """Fold one depth interval into the front of a sorted queue of regions.

    Regions at the front of ``regions`` for which ``curr.cmp(region) > 0``
    are popped; each one with a non-empty payload (or every one when
    ``use_off_regions`` is set) has ``average(region)`` appended to
    ``averages``, after which ``on_average`` is called if given.
    If ``curr.cmp`` of the new front region is 0, ``depth`` is appended to
    that region's payload ``curr.overlap_size(region)`` times.

    Args:
        curr: range for the current depth record.
        regions: ``collections.deque`` of ranges with list payloads.
            NOTE(review): assumed sorted in the same order as the depth
            stream -- confirm.
        depth: integer depth for ``curr``.
        averages: list receiving the per-region averages.
        use_off_regions: keep regions that never received any depth.
        on_average: optional zero-argument callback run after each append.
    """
    # Test emptiness BEFORE indexing: the queue can drain inside the loop.
    while regions and curr.cmp(regions[0]) > 0:
        finished = regions.popleft()
        if len(finished.get_payload()) == 0 and not use_off_regions:
            continue
        averages.append(average(finished))
        if on_average is not None:
            on_average()
    if regions and curr.cmp(regions[0]) == 0:
        overlap = curr.overlap_size(regions[0])
        regions[0].get_payload().extend([depth] * overlap)


def main():
    """Stream read depth over exonic, intronic and intergenic windows.

    Reads a genepred file, derives merged gene, exon, intron and padded
    intergenic ranges from it, then reads depth records from the
    ``sam_to_bed_depth.py`` subprocess and accumulates them per region.
    Progress is written to stderr; ``display`` is called each time an
    intergenic window is completed.
    """
    from collections import deque

    parser = argparse.ArgumentParser()
    parser.add_argument('gpd_input')
    parser.add_argument('bam_input')
    parser.add_argument('--intergenic_buffer', default=10000, type=int)
    parser.add_argument('--window_size', default=10000, type=int)
    parser.add_argument('--bin_size', default=1000, type=int)
    parser.add_argument('--use_off_regions', action='store_true',
                        help="Use a region even if there is no reads mapped to it.")
    parser.add_argument('--get_exons', action='store_true')
    args = parser.parse_args()

    chr_beds = {}
    gene_beds = []
    exon_beds = []
    sys.stderr.write("Reading genepred file\n")
    asum = 0
    atot = 0
    with open(args.gpd_input) as inf:
        for line in inf:
            g = GenePredEntry(line)
            asum += g.length()
            atot += 1
            grng = g.get_bed()
            grng.direction = None
            if grng.chr not in chr_beds:
                chr_beds[grng.chr] = grng.copy()
            chr_beds[grng.chr] = chr_beds[grng.chr].merge(grng)
            gene_beds.append(grng)
            for i in range(0, g.get_exon_count()):
                erng = Bed(g.value('chrom'), g.value('exonStarts')[i], g.value('exonEnds')[i])
                exon_beds.append(erng)
    # An empty genepred file must not die with ZeroDivisionError.
    avglen = float(asum) / float(atot) if atot else 0.0

    sys.stderr.write("Sorting gene bed\n")
    gene_beds = sort_ranges(gene_beds)
    gene_beds = merge_ranges(gene_beds, already_sorted=True)
    sys.stderr.write("Sorting chromosome beds\n")
    chr_beds = sort_ranges(list(chr_beds.values()))
    sys.stderr.write("Sorting exon beds\n")
    exon_beds = sort_ranges(exon_beds)
    sys.stderr.write("Get padded genes\n")
    padded_gene_beds = pad_ranges(gene_beds, args.intergenic_buffer, chr_beds)
    padded_gene_beds = merge_ranges(padded_gene_beds, already_sorted=True)
    sys.stderr.write("Get intergenic regions\n")
    intergenic_beds = subtract_ranges(chr_beds, padded_gene_beds, already_sorted=True)
    intergenic_beds = merge_ranges(intergenic_beds, already_sorted=True)
    intergenic_beds = window_break(intergenic_beds, args.window_size)
    sys.stderr.write("Get merged exons\n")
    exon_beds = merge_ranges(exon_beds)
    sys.stderr.write("Get introns\n")
    intron_beds = subtract_ranges(gene_beds, exon_beds, already_sorted=True)
    intron_beds = merge_ranges(intron_beds, already_sorted=True)
    intron_beds = window_break(intron_beds, args.window_size)

    sys.stderr.write("Going through short reads\n")
    # Argument list instead of splitting a string, so BAM paths containing
    # whitespace survive. Text mode so lines are str on Python 3 as well.
    p = Popen(["sam_to_bed_depth.py", args.bam_input], stdout=PIPE,
              universal_newlines=True)

    # Payloads are per-base read depths.
    for region in intron_beds:
        region.set_payload([])
    for region in intergenic_beds:
        region.set_payload([])
    for region in exon_beds:
        region.set_payload([])

    introndepth = []
    intergenicdepth = []
    exondepth = []
    pseudoreadcount = 0
    if not args.get_exons:
        exon_beds = []

    # deque gives O(1) removal from the front; list.pop(0) is O(n) and the
    # window lists can be genome-sized.
    exon_queue = deque(exon_beds)
    intron_queue = deque(intron_beds)
    intergenic_queue = deque(intergenic_beds)

    section_count = 0
    while True:
        section_count += 1
        line = p.stdout.readline()
        if not line:
            break
        f = line.split("\t")
        depth = int(f[3])
        curr = Bed(f[0], int(f[1]), int(f[2]))
        if section_count % 100 == 0:
            sys.stderr.write(curr.get_range_string() + " \r")
        pseudoreadcount += depth

        _record_depth(curr, exon_queue, depth, exondepth, args.use_off_regions)
        _record_depth(curr, intron_queue, depth, introndepth, args.use_off_regions)
        # The callback runs synchronously inside this call, so it sees the
        # current iteration's values.
        _record_depth(
            curr, intergenic_queue, depth, intergenicdepth, args.use_off_regions,
            on_average=lambda: display(curr, introndepth, intergenicdepth,
                                       pseudoreadcount, avglen))

    if args.use_off_regions:
        # NOTE(review): the streaming path calls average(region) while this
        # flush calls average(payload); kept as found -- confirm which form
        # `average` expects.
        # Exon averages belong in exondepth (they were previously appended to
        # introndepth).
        for region in exon_queue:
            exondepth.append(average(region.get_payload()))
        for region in intron_queue:
            introndepth.append(average(region.get_payload()))
        for region in intergenic_queue:
            intergenicdepth.append(average(region.get_payload()))
    p.communicate()
class FuzzyGenePred: # set use_dir true if you want to use direction and make it direction specific # set proper_set false if you want to do awesome extending that doesn't really work yet def __init__(self, ingpd=None, params=None, juntol=10): # Here is the basic data self.fuzzy_junctions = [] self.gpds = [] # contributing member genepreds self.start = None self.end = None self.dir = None # Higher level data self.simple_junction_set = set() # quickly search for if a multi exon gene has been added # Here is the parameters self.params = {} self.params["use_dir"] = False self.params["junction_tolerance"] = juntol # Not fully implemented. Do we require a full length match self.params["proper_set"] = True # Define thresholds for overlapping single exons self.params["do_add_single_exon"] = True self.params["single_exon_minimum_length"] = 200 self.params[ "single_exon_minimum_overlap_fraction" ] = 0.8 # reciprocal ... must be this fraction or more on both self.params["single_exon_minimum_overlap_bases"] = 1 # minimum number of bases self.params["single_exon_maximum_endpoint_distance"] = 1000 if params: for pname in params: self.params[pname] = params[pname] if ingpd: self.add_gpd(ingpd) def get_genepred_line(self, end_select="extremes", junction_select="mode", name=None): if not name: name = "fuzGPD_" + random_string(8) + "_" + str(len(self.fuzzy_junctions) + 1) + "_" + str(len(self.gpds)) ostr = "" ostr += name + "\t" ostr += name + "\t" ostr += self.start.chr + "\t" ostr += self.gpds[0].value("strand") + "\t" ostr += str(self.start.start - 1) + "\t" ostr += str(self.end.end) + "\t" ostr += str(self.start.start - 1) + "\t" ostr += str(self.end.end) + "\t" ostr += str(len(self.fuzzy_junctions) + 1) + "\t" exonstarts = [] exonends = [] exonstarts.append(self.start.start - 1) for j in self.fuzzy_junctions: exonends.append(mode(j.left.get_payload()["junc"])) exonstarts.append(mode(j.right.get_payload()["junc"]) - 1) exonends.append(self.end.end) ostr += ",".join([str(x) for 
x in exonstarts]) + "," + "\t" ostr += ",".join([str(x) for x in exonends]) + "," return ostr # Return a copy of the fuzzy geneprep def copy(self): g = FuzzyGenePred() # start with a blank one why not # get the settings for pname in self.params: g.params[pname] = self.params[pname] # copy the genepreds for orig in self.gpds: g.gpds.append(GenePredEntry(orig.get_line())) # store direction g.dir = self.dir # copy the fuzzy junctions for orig in self.fuzzy_junctions: g.fuzzy_junctions.append(orig.copy()) # copy the simple junction set for orig in self.simple_junction_set: g.simple_junction_set.add(orig) # copy the start if self.start: g.start = Bed(self.start.chr, self.start.start - 1, self.start.end, self.start.direction) g.start.set_payload([]) for v in self.start.get_payload(): g.start.get_payload().append(v) # copy the end if self.end: g.end = Bed(self.end.chr, self.end.start - 1, self.end.end, self.end.direction) g.end.set_payload([]) for v in self.end.get_payload(): g.end.get_payload().append(v) return g def exon_count(self): return len(self.fuzzy_junctions) + 1 def gpd_count(self): return len(self.gpds) def get_bed(self): return Bed(self.start.chr, self.start.start - 1, self.end.end, self.start.direction) # This is an inspection tool for a fuzzy gpd def get_info_string(self): ostr = "" ostr += "== FUZZY GENEPRED INFO ==" + "\n" ostr += str(len(self.gpds)) + " total GPDs" + "\n" totalbounds = Bed(self.start.chr, self.start.start - 1, self.end.end, self.start.direction) ostr += totalbounds.get_range_string() + " total bounds\n" ostr += "---- start ----" + "\n" ostr += str(len(self.start.get_payload())) + " reads supporting start" + "\n" ostr += " " + str(mean(self.start.get_payload())) + " mean" + "\n" ostr += " " + str(mode(self.start.get_payload())) + " mode" + "\n" ostr += " " + self.start.get_range_string() + " start range\n" ostr += "---- end ----" + "\n" ostr += str(len(self.end.get_payload())) + " reads supporting end" + "\n" ostr += " " + 
str(mean(self.end.get_payload())) + " mean" + "\n" ostr += " " + str(mode(self.end.get_payload())) + " mode" + "\n" ostr += " " + self.end.get_range_string() + " end range\n" ostr += "---- junctions ----" + "\n" ostr += str(len(self.fuzzy_junctions)) + " total fuzzy junctions" + "\n" cnt = 0 for j in self.fuzzy_junctions: cnt += 1 ostr += ( " " + str(cnt) + ". " + str(mode(j.left.get_payload()["junc"])) + " ^ " + str(mode(j.right.get_payload()["junc"])) + "\n" ) ostr += " " + j.left.get_range_string() + " ^ " + j.right.get_range_string() + "\n" ostr += " " + str(len(j.left.get_payload()["junc"])) + " read support" + "\n" if j.left.get_payload()["start"]: ostr += " " + "---starts----" + "\n" ostr += ( " " + str(len(j.left.get_payload()["start"].get_payload())) + " starts at " + j.left.get_payload()["start"].get_range_string() + "\n" ) if j.right.get_payload()["end"]: ostr += " " + "---ends----" + "\n" ostr += ( " " + str(len(j.right.get_payload()["end"].get_payload())) + " ends at " + j.right.get_payload()["end"].get_range_string() + "\n" ) return ostr # Add a new gpd return true if successful # Return false if it didn't work, return the new combined if it worked def add_gpd(self, ingpd): if len(self.gpds) == 0: # first one self.read_first(ingpd) return self # return ourself if we are adding our first # more difficult situation where we must try to combine # See if it can match first before actually adding stuff to it # if self. 
newfuz = FuzzyGenePred(ingpd, params=self.params) output = self.add_fuzzy_gpd(newfuz) return output # combine together compatible overlapping sets def concat_fuzzy_gpd(self, fuz2): if len(fuz2.fuzzy_junctions) == 0 and len(self.fuzzy_junctions) != 0: return False if len(fuz2.fuzzy_junctions) != 0 and len(self.fuzzy_junctions) == 0: return False # Lets work combine the single exon step and exit if len(fuz2.fuzzy_junctions) == 0 and len(self.fuzzy_junctions) == 0: return self.do_add_single_exon_fuzzy_gpd(fuz2) # For now don't add them if one is single exon if len(self.fuzzy_junctions) == 0 or len(fuz2.fuzzy_junctions) == 0: return False # See if its already a subset easy_subset = False for simplejunction in fuz2.simple_junction_set: if simplejunction in self.simple_junction_set: easy_subset = True # If its not already a subset look deeper # 1. First we need perfect junctions for a run of them if not easy_subset: if not self.compatible_overlap(fuz2): return False # still here. we will work on combining these output = self.copy() # first lets put add any overlapping junctions for i in range(0, len(output.fuzzy_junctions)): for j in range(0, len(fuz2.fuzzy_junctions)): if output.fuzzy_junctions[i].overlaps(fuz2.fuzzy_junctions[j], fuz2.params["junction_tolerance"]): output.fuzzy_junctions[i].add_fuzzy_junction(fuz2.fuzzy_junctions[j]) if j == 0: # put the start in too if not output.fuzzy_junctions[i].left.get_payload()["start"]: output.fuzzy_junctions[i].left.get_payload()["start"] = fuz2.start.copy() else: # merge starts = output.fuzzy_junctions[i].left.get_payload()["start"].get_payload() for v in fuz2.start.get_payload(): starts.append(v) nrange = output.fuzzy_junctions[i].left.get_payload()["start"].merge(fuz2.start) nrange.set_payload(starts[:]) output.fuzzy_junctions[i].left.get_payload()["start"] = nrange if j == len(fuz2.fuzzy_junctions) - 1: # put the end in too if not output.fuzzy_junctions[i].right.get_payload()["end"]: 
output.fuzzy_junctions[i].right.get_payload()["end"] = fuz2.end.copy() else: # merge ends = output.fuzzy_junctions[i].right.get_payload()["end"].get_payload() for v in fuz2.end.get_payload(): ends.append(v) nrange = output.fuzzy_junctions[i].right.get_payload()["end"].merge(fuz2.end) nrange.set_payload(ends[:]) output.fuzzy_junctions[i].right.get_payload()["end"] = nrange # see if we should build onto the left leftnum = -1 leftmost = self.fuzzy_junctions[0] if fuz2.fuzzy_junctions[0].right.end < leftmost.left.start: for i in range(0, len(fuz2.fuzzy_junctions)): if fuz2.fuzzy_junctions[i].overlaps(leftmost, fuz2.params["junction_tolerance"]): leftnum = i break # leftnum is now -1 if no additions to the left zero if it starts on the same if leftnum > 0: for i in reversed(range(0, leftnum)): output.fuzzy_junctions.insert(0, fuz2.fuzzy_junctions[i].copy()) output.start = fuz2.start.copy() rightnum = -1 # get the right point ... our first one comes after this rightmost = self.fuzzy_junctions[-1] if fuz2.fuzzy_junctions[-1].left.start > rightmost.right.end: for i in reversed(range(0, len(fuz2.fuzzy_junctions))): if fuz2.fuzzy_junctions[i].overlaps(rightmost, fuz2.params["junction_tolerance"]): rightnum = i break if rightnum != -1: rightnum += 1 if rightnum < len(fuz2.fuzzy_junctions): for i in range(rightnum, len(fuz2.fuzzy_junctions)): output.fuzzy_junctions.append(fuz2.fuzzy_junctions[i].copy()) output.end = fuz2.end.copy() # print leftnum # print rightnum # print fuz2.params['junction_tolerance'] # print 'combining' return output # add together subsets def add_fuzzy_gpd(self, fuz2): # see if we can add this fuzzy gpd to another # We treat single exon genes seprately so if only one of them is # single exon we can't compare them if len(fuz2.fuzzy_junctions) == 0 and len(self.fuzzy_junctions) != 0: return False if len(fuz2.fuzzy_junctions) != 0 and len(self.fuzzy_junctions) == 0: return False # Lets work combine the single exon step and exit if len(fuz2.fuzzy_junctions) 
== 0 and len(self.fuzzy_junctions) == 0: return self.do_add_single_exon_fuzzy_gpd(fuz2) # For now don't add them if one is single exon if len(self.fuzzy_junctions) == 0 or len(fuz2.fuzzy_junctions) == 0: return False # See if its already a subset easy_subset = False for simplejunction in fuz2.simple_junction_set: if simplejunction in self.simple_junction_set: easy_subset = True # If its not already a subset look deeper # 1. First we need perfect junctions for a run of them if not easy_subset: if not self.compatible_overlap(fuz2): return False # still here. we will work on combining these output = self.copy() # switch over to working on the output now # If we are still here we can add the two of them together # If they have the same starting junction we can add their starting points together if output.fuzzy_junctions[0].overlaps(fuz2.fuzzy_junctions[0], output.params["junction_tolerance"]): # print 'samestart' newstart = output.start.merge(fuz2.start) newstart.set_payload(output.start.get_payload()) for s in fuz2.start.get_payload(): newstart.get_payload().append(s) output.start = newstart # Check if the other one is new start elif mode(fuz2.fuzzy_junctions[0].left.get_payload()["junc"]) < mode( output.fuzzy_junctions[0].left.get_payload()["junc"] ): # print "2 start" output.start = fuz2.start elif mode(fuz2.fuzzy_junctions[0].left.get_payload()["junc"]) > mode( output.fuzzy_junctions[0].left.get_payload()["junc"] ): True # #print "1 start" # #we're good to go else: sys.stderr.write("WARNING: strange start case abort merge\n") return False # lets work the ends now if output.fuzzy_junctions[-1].overlaps(fuz2.fuzzy_junctions[-1], output.params["junction_tolerance"]): # print 'sameend' newend = output.end.merge(fuz2.end) newend.set_payload(output.end.get_payload()) for s in fuz2.end.get_payload(): newend.get_payload().append(s) output.end = newend # Check if the other one is new start elif mode(fuz2.fuzzy_junctions[-1].right.get_payload()["junc"]) > mode( 
output.fuzzy_junctions[-1].right.get_payload()["junc"] ): # print "2 end" output.end = fuz2.end elif mode(fuz2.fuzzy_junctions[-1].right.get_payload()["junc"]) < mode( output.fuzzy_junctions[-1].right.get_payload()["junc"] ): True # #print "1 end" # #we're good to go else: sys.stderr.write("WARNING: strange end case abort merge\n") u1 = mode(output.fuzzy_junctions[-1].left.get_payload()["junc"]) u2 = mode(fuz2.fuzzy_junctions[-1].left.get_payload()["junc"]) v1 = mode(output.fuzzy_junctions[-1].right.get_payload()["junc"]) v2 = mode(fuz2.fuzzy_junctions[-1].right.get_payload()["junc"]) sys.stderr.write(str(u1) + "\t" + str(u2) + "\n") sys.stderr.write(str(v1) + "\t" + str(v2) + "\n") return False # now the starts and ends have been updated in output. # iterate through the junctions. # check for a left overhang. numfuz2left = 0 numoutleft = 0 if not output.fuzzy_junctions[0].overlaps(fuz2.fuzzy_junctions[0], output.params["junction_tolerance"]): # see if we need to add sequences from fuz2 if mode(fuz2.fuzzy_junctions[0].left.get_payload()["junc"]) < mode( output.fuzzy_junctions[0].left.get_payload()["junc"] ): # print 'left over2' i = 0 while not output.fuzzy_junctions[0].overlaps( fuz2.fuzzy_junctions[i], output.params["junction_tolerance"] ) and i < len(fuz2.fuzzy_junctions): i += 1 numfuz2left = i # number to push on from the fuz2 and increment in # print numfuz2left elif mode(fuz2.fuzzy_junctions[0].left.get_payload()["junc"]) > mode( output.fuzzy_junctions[0].left.get_payload()["junc"] ): # print 'left over1' i = 0 while not output.fuzzy_junctions[i].overlaps( fuz2.fuzzy_junctions[0], output.params["junction_tolerance"] ) and i < len(output.fuzzy_junctions): i += 1 numoutleft = i # number to increment in from output # print numoutleft else: sys.stderr.write("WARNING: strange case \n") return False # next we can check how long we have a run of the same ind1 = numoutleft ind2 = numfuz2left overlap_size = 0 while ( ind1 < len(output.fuzzy_junctions) and ind2 < 
len(fuz2.fuzzy_junctions) and output.fuzzy_junctions[ind1].overlaps(fuz2.fuzzy_junctions[ind2], output.params["junction_tolerance"]) ): overlap_size += 1 ind1 += 1 ind2 += 1 # print 'overlap size '+str(overlap_size) numoutright = len(output.fuzzy_junctions) - overlap_size - numoutleft numfuz2right = len(fuz2.fuzzy_junctions) - overlap_size - numfuz2left if min(numoutright, numfuz2right) != 0: sys.stderr.write("WARNING: expected one of them to be zero\n") # print self.get_info_string() # print '=====================' # print fuz2.get_info_string() # sys.exit() return False if min(numoutleft, numfuz2left) != 0: sys.stderr.write("WARNING: expected one of them to be zero\n") return False # print numoutright # print numfuz2right # print output.fuzzy_junctions[numoutleft].overlaps(fuz2.fuzzy_junctions[numfuz2left],output.junction_tolerance) # print 'add' # Now we have what we need to go through and do some updating # Lets just make new fuzzy junctions newjuncs = [] for i in range(0, numfuz2left): newjuncs.append(fuz2.fuzzy_junctions[i]) for i in range(0, numoutleft): newjuncs.append(output.fuzzy_junctions[i]) # Now we do both down the center range1 = range(numoutleft, overlap_size + numoutleft) range2 = range(numfuz2left, overlap_size + numfuz2left) for i in range(0, len(range1)): newjuncs.append(output.fuzzy_junctions[range1[i]]) newjuncs[-1].add_fuzzy_junction(fuz2.fuzzy_junctions[range2[i]]) # print i # Make the right size for i in range(overlap_size + numfuz2left, overlap_size + numfuz2left + numfuz2right): newjuncs.append(fuz2.fuzzy_junctions[i]) for i in range(overlap_size + numoutleft, overlap_size + numoutleft + numoutright): newjuncs.append(output.fuzzy_junctions[i]) output.fuzzy_junctions = newjuncs # print 'adding gpd '+str(len(fuz2.gpds))+' entries' for g in fuz2.gpds: output.gpds.append(g) sjun = get_simple_junction(g) if sjun: output.simple_junction_set.add(sjun) # print 'new entry' # print self.get_info_string() return output def 
def do_add_single_exon_fuzzy_gpd(self, fuz2):
    """Merge two single-exon fuzzy genepreds when their extents agree.

    The bounds of each side are taken as the mean of the recorded start and
    end observations.  The merge is refused (``False`` is returned) when
    single-exon merging is disabled in ``self.params``, when either exon is
    shorter than ``single_exon_minimum_length``, when the two are on
    different chromosomes, when they overlap by too few bases or too small
    a fraction, or when their endpoints are further apart than
    ``single_exon_maximum_endpoint_distance``.

    Returns:
        A copy of ``self`` extended with ``fuz2``'s start/end observations
        and gpd entries, or ``False`` when the two cannot be merged.
    """
    params = self.params
    if not params["do_add_single_exon"]:
        return False  # make sure we are allowed to be doing this

    # build the bounds from the average start and end
    s1 = mean(self.start.get_payload())
    e1 = mean(self.end.get_payload())
    s2 = mean(fuz2.start.get_payload())
    e2 = mean(fuz2.end.get_payload())
    l1 = e1 - s1 + 1
    l2 = e2 - s2 + 1
    min_length = params["single_exon_minimum_length"]
    if l1 < min_length or l2 < min_length:
        return False
    if l1 < 1 or l2 < 1:
        return False  # shouldn't happen

    chr1 = self.start.chr
    # Compare against the OTHER entry's chromosome; reading it from self
    # made this check a no-op and let cross-chromosome merges through.
    chr2 = fuz2.start.chr
    if chr1 != chr2:
        return False  # shouldn't happen

    # Both ranges are built with self.dir, as before.
    r1 = Bed(chr1, s1 - 1, e1, self.dir)
    r2 = Bed(chr2, s2 - 1, e2, self.dir)
    over = r1.overlap_size(r2)
    if over < params["single_exon_minimum_overlap_bases"]:
        return False
    # coverage is measured against the longer of the two exons
    cov = min(float(over) / float(l1), float(over) / float(l2))
    if cov < params["single_exon_minimum_overlap_fraction"]:
        return False
    max_distance = params["single_exon_maximum_endpoint_distance"]
    if abs(e1 - e2) > max_distance:
        return False
    if abs(s1 - s2) > max_distance:
        return False

    # If we're still here, we can add result
    output = self.copy()
    newstart = output.start.merge(fuz2.start)
    newstart.set_payload(list(output.start.get_payload()) + list(fuz2.start.get_payload()))
    newend = output.end.merge(fuz2.end)
    newend.set_payload(list(output.end.get_payload()) + list(fuz2.end.get_payload()))
    output.start = newstart
    output.end = newend
    for gpd in fuz2.gpds:
        output.gpds.append(gpd)
        sjun = get_simple_junction(gpd)
        if sjun:
            # store the junction key itself, not the gpd entry
            output.simple_junction_set.add(sjun)
    return output


def compatible_overlap(self, fingpd):
    """Return True if these fuzzy genepreds can be added together.

    Every junction of ``self`` is compared with every junction of
    ``fingpd``.  The matched pairs must form a single consecutive run with
    a constant index shift, the run must begin at the first junction of at
    least one side and finish at the last junction of at least one side.
    When ``self.params["proper_set"]`` is set, one of the two must
    additionally be fully contained in the run.
    """
    f1 = self
    f2 = fingpd
    n1 = len(f1.fuzzy_junctions)
    n2 = len(f2.fuzzy_junctions)
    # Forget about trying zero exon cases for now
    if n1 == 0 or n2 == 0:
        return False

    # Find all matches: index pairs [i, j] of overlapping junctions
    tolerance = self.params["junction_tolerance"]
    matches = [
        [i, j]
        for i, junc1 in enumerate(f1.fuzzy_junctions)
        for j, junc2 in enumerate(f2.fuzzy_junctions)
        if junc1.overlaps(junc2, tolerance)
    ]
    if not matches:
        return False  # Nothing matched.. certainly no overlap

    # A single shift means no extra exons in the middle of the run
    if len({i - j for i, j in matches}) != 1:
        return False

    # Lets make sure all our exons are consecutive
    for previous, current in zip(matches, matches[1:]):
        if current[0] - previous[0] != 1 or current[1] - previous[1] != 1:
            return False

    first_i, first_j = matches[0]
    last_i, last_j = matches[-1]
    # one of the runs has to begin at the first junction
    if first_i != 0 and first_j != 0:
        return False
    # and one of our last matches should be the last junction
    if last_i != n1 - 1 and last_j != n2 - 1:
        return False

    # most of the time we will probably be looking for a proper set
    # unless we are extending the long read for isoform prediction
    if self.params["proper_set"]:
        # one of the two needs to have both its first and last junction
        # inside the consecutive matches
        f1_contained = first_i == 0 and last_i == n1 - 1
        f2_contained = first_j == 0 and last_j == n2 - 1
        return f1_contained or f2_contained
    return True


def read_first(self, ingpd):
    """Initialise this fuzzy genepred from its first genepred entry.

    Records the entry, its simple junction (when it has one), the strand
    (only when ``self.params["use_dir"]`` is set), one ``FuzzyJunction``
    per intron, and the one-base start and end ranges whose payloads list
    the observed start/end coordinates.
    """
    self.gpds.append(ingpd)
    sjun = get_simple_junction(ingpd)
    if sjun:
        self.simple_junction_set.add(sjun)
    if self.params["use_dir"]:
        self.dir = ingpd.value("strand")

    chrom = ingpd.value("chrom")
    exon_starts = ingpd.value("exonStarts")
    exon_ends = ingpd.value("exonEnds")
    tx_start = ingpd.value("txStart")
    tx_end = ingpd.value("txEnd")

    # add fuzzy junctions: one per pair of neighbouring exons
    for i in range(len(exon_starts) - 1):
        self.fuzzy_junctions.append(
            FuzzyJunction(chrom, exon_ends[i], exon_starts[i + 1] + 1, self.dir)
        )

    if len(exon_starts) > 1:  # we have junctions
        # the outermost junctions also remember the transcript start / end
        junction_start = Bed(chrom, tx_start, tx_start + 1, self.dir)
        junction_start.set_payload([tx_start + 1])
        self.fuzzy_junctions[0].left.get_payload()["start"] = junction_start
        junction_end = Bed(chrom, tx_end - 1, tx_end, self.dir)
        junction_end.set_payload([tx_end])
        self.fuzzy_junctions[-1].right.get_payload()["end"] = junction_end

    # add fuzzy starts
    self.start = Bed(chrom, tx_start, tx_start + 1, self.dir)
    self.start.set_payload([tx_start + 1])
    self.end = Bed(chrom, tx_end - 1, tx_end, self.dir)
    self.end.set_payload([tx_end])
    # Have finished reading in the first case


# Pre: another fuzzy gpd
# Post: True if they are all overlapping junctions
def is_equal_fuzzy(self, fuz2, use_direction=False):
    """Return True when both entries have pairwise-overlapping junctions.

    The two must have the same number of junctions, and the junctions at
    each index must overlap within ``self.params["junction_tolerance"]``.
    Two entries without any junctions therefore compare equal.  When
    ``use_direction`` is true the ``dir`` attributes must match as well.
    """
    if use_direction and self.dir != fuz2.dir:
        return False
    # A length can never be negative, so the former `len(...) < 0` guards
    # were unreachable and have been dropped.
    if len(self.fuzzy_junctions) != len(fuz2.fuzzy_junctions):
        return False
    return all(
        mine.overlaps(theirs, self.params["junction_tolerance"])
        for mine, theirs in zip(self.fuzzy_junctions, fuz2.fuzzy_junctions)
    )
class FuzzyJunction:
    """A splice junction built up from one or more observed junctions.

    ``left`` and ``right`` are ``Bed`` ranges.  The payload of ``left`` is
    a dict with the keys ``"junc"`` (list of observed left coordinates) and
    ``"start"`` (``None`` or a ``Bed`` whose payload lists observed
    starts).  The payload of ``right`` has the keys ``"junc"`` and
    ``"end"`` with the same layout.
    """

    # Pre: inleft is 1-indexed last exonic base on the left
    #      inright is 1-indexed first exonic base on the right
    #      direction doesn't need to be used
    def __init__(self, inchr=None, inleft=None, inright=None, indir=None):
        self.chr = inchr
        self.left = None  # range with payloads being the actual left and rights
        self.right = None
        self.dir = indir
        # an empty junction is created when any of the three is missing
        if inchr and inleft and inright:
            self.add_junction(inchr, inleft, inright, indir)

    @staticmethod
    def _clone_endpoint(rng):
        """Return a new Bed equal to ``rng`` with a copy of its list payload."""
        clone = Bed(rng.chr, rng.start - 1, rng.end, rng.direction)
        clone.set_payload(list(rng.get_payload()))
        return clone

    def copy(self):
        """Return a new FuzzyJunction with copied ranges and payload lists."""
        newjunc = FuzzyJunction()
        newjunc.chr = self.chr
        # Carry the direction over too; get_mode() and overlaps() read it,
        # so a copy without it would no longer match its own source.
        newjunc.dir = self.dir

        left_payload = self.left.get_payload()
        newjunc.left = Bed(self.left.chr, self.left.start - 1, self.left.end, self.left.direction)
        newjunc.left.set_payload({"junc": list(left_payload["junc"]), "start": None})
        # copy any starts for the junction
        if left_payload["start"]:
            newjunc.left.get_payload()["start"] = self._clone_endpoint(left_payload["start"])

        right_payload = self.right.get_payload()
        newjunc.right = Bed(self.right.chr, self.right.start - 1, self.right.end, self.right.direction)
        newjunc.right.set_payload({"junc": list(right_payload["junc"]), "end": None})
        # copy any ends for the junction
        if right_payload["end"]:
            newjunc.right.get_payload()["end"] = self._clone_endpoint(right_payload["end"])
        return newjunc

    def get_mode(self):
        """Return ``[left, right]`` one-base Bed ranges at the mode coordinates.

        ``mode`` is defined elsewhere in the file and is applied to the
        ``"junc"`` observation lists of both sides.
        """
        m1 = mode(self.left.get_payload()["junc"])
        m2 = mode(self.right.get_payload()["junc"])
        return [Bed(self.chr, m1 - 1, m1, self.dir), Bed(self.chr, m2 - 1, m2, self.dir)]

    def overlaps(self, fjun2, juntol):
        """Return True when both mode sides lie within ``juntol`` of ``fjun2``'s.

        Chromosome and direction of the two junctions must be equal, and
        both the left and the right mode ranges must overlap once padded
        with ``juntol``.
        """
        m1 = self.get_mode()
        m2 = fjun2.get_mode()
        if m1[0].chr != m2[0].chr:
            return False
        if m1[0].direction != m2[0].direction:
            return False  # usually they are both off
        if not m1[0].overlaps_with_padding(m2[0], juntol):
            return False
        if not m1[1].overlaps_with_padding(m2[1], juntol):
            return False
        return True

    # Right now assumes these are overlap verified prior to calling
    def add_junction(self, inchr, inleft, inright, indir=None):
        """Add one observed junction given by its 1-indexed left/right bases.

        The first observation creates the ``left`` and ``right`` ranges;
        later ones are wrapped in a FuzzyJunction and merged in through
        ``add_fuzzy_junction``.
        """
        if not self.left:  # this is our first one
            # A junction created empty has no chromosome/direction yet;
            # adopt them from the first observation so get_mode() works.
            if self.chr is None:
                self.chr = inchr
            if self.dir is None:
                self.dir = indir
            self.left = Bed(inchr, inleft - 1, inleft, indir)
            self.left.set_payload({"junc": [inleft], "start": None})
            self.right = Bed(inchr, inright - 1, inright, indir)
            self.right.set_payload({"junc": [inright], "end": None})
            return
        # Lets add this one to our current one
        # (the chromosome argument used to be misspelled `inchar`, which
        # raised NameError as soon as a second junction was added)
        newfuz = FuzzyJunction(inchr, inleft, inright, indir)
        self.add_fuzzy_junction(newfuz)

    def add_fuzzy_junction(self, newfuz):
        """Merge the ranges and observations of ``newfuz`` into this junction."""
        own_left = self.left.get_payload()
        own_right = self.right.get_payload()
        other_left = newfuz.left.get_payload()
        other_right = newfuz.right.get_payload()

        # the merged ranges keep (and keep extending) our own payload dicts
        mergeleft = self.left.merge(newfuz.left)
        mergeleft.set_payload(own_left)
        mergeright = self.right.merge(newfuz.right)
        mergeright.set_payload(own_right)
        merged_left = mergeleft.get_payload()
        merged_right = mergeright.get_payload()
        merged_left["junc"].extend(list(other_left["junc"]))
        merged_right["junc"].extend(list(other_right["junc"]))

        # fix the starts
        own_start = own_left["start"]
        other_start = other_left["start"]
        if other_start and not own_start:
            merged_left["start"] = other_start
        elif other_start and own_start:
            newrange = own_start.merge(other_start)
            newrange.set_payload(list(own_start.get_payload()) + list(other_start.get_payload()))
            merged_left["start"] = newrange

        # fix the ends
        own_end = own_right["end"]
        other_end = other_right["end"]
        if other_end and not own_end:
            merged_right["end"] = other_end
        elif other_end and own_end:
            # merge order (other first) is kept as it was
            newrange = other_end.merge(own_end)
            newrange.set_payload(list(own_end.get_payload()) + list(other_end.get_payload()))
            merged_right["end"] = newrange

        # We finished the changes
        self.left = mergeleft
        self.right = mergeright
class FuzzyJunction:
    """A splice junction built up from one or more observed junctions.

    ``left`` and ``right`` are ``Bed`` ranges.  The payload of ``left`` is
    a dict with the keys ``"junc"`` (list of observed left coordinates) and
    ``"start"`` (``None`` or a ``Bed`` whose payload lists observed
    starts).  The payload of ``right`` has the keys ``"junc"`` and
    ``"end"`` with the same layout.
    """

    # Pre: inleft is 1-indexed last exonic base on the left
    #      inright is 1-indexed first exonic base on the right
    #      direction doesn't need to be used
    def __init__(self, inchr=None, inleft=None, inright=None, indir=None):
        self.chr = inchr
        self.left = None  # range with payloads being the actual left and rights
        self.right = None
        self.dir = indir
        # an empty junction is created when any of the three is missing
        if inchr and inleft and inright:
            self.add_junction(inchr, inleft, inright, indir)

    @staticmethod
    def _clone_endpoint(rng):
        """Return a new Bed equal to ``rng`` with a copy of its list payload."""
        clone = Bed(rng.chr, rng.start - 1, rng.end, rng.direction)
        clone.set_payload(list(rng.get_payload()))
        return clone

    def copy(self):
        """Return a new FuzzyJunction with copied ranges and payload lists."""
        newjunc = FuzzyJunction()
        newjunc.chr = self.chr
        # Carry the direction over too; get_mode() and overlaps() read it,
        # so a copy without it would no longer match its own source.
        newjunc.dir = self.dir

        left_payload = self.left.get_payload()
        newjunc.left = Bed(self.left.chr, self.left.start - 1, self.left.end, self.left.direction)
        newjunc.left.set_payload({"junc": list(left_payload["junc"]), "start": None})
        # copy any starts for the junction
        if left_payload["start"]:
            newjunc.left.get_payload()["start"] = self._clone_endpoint(left_payload["start"])

        right_payload = self.right.get_payload()
        newjunc.right = Bed(self.right.chr, self.right.start - 1, self.right.end, self.right.direction)
        newjunc.right.set_payload({"junc": list(right_payload["junc"]), "end": None})
        # copy any ends for the junction
        if right_payload["end"]:
            newjunc.right.get_payload()["end"] = self._clone_endpoint(right_payload["end"])
        return newjunc

    def get_mode(self):
        """Return ``[left, right]`` one-base Bed ranges at the mode coordinates.

        ``mode`` is defined elsewhere in the file and is applied to the
        ``"junc"`` observation lists of both sides.
        """
        m1 = mode(self.left.get_payload()["junc"])
        m2 = mode(self.right.get_payload()["junc"])
        return [Bed(self.chr, m1 - 1, m1, self.dir), Bed(self.chr, m2 - 1, m2, self.dir)]

    def overlaps(self, fjun2, juntol):
        """Return True when both mode sides lie within ``juntol`` of ``fjun2``'s.

        Chromosome and direction of the two junctions must be equal, and
        both the left and the right mode ranges must overlap once padded
        with ``juntol``.
        """
        m1 = self.get_mode()
        m2 = fjun2.get_mode()
        if m1[0].chr != m2[0].chr:
            return False
        if m1[0].direction != m2[0].direction:
            return False  # usually they are both off
        if not m1[0].overlaps_with_padding(m2[0], juntol):
            return False
        if not m1[1].overlaps_with_padding(m2[1], juntol):
            return False
        return True

    # Right now assumes these are overlap verified prior to calling
    def add_junction(self, inchr, inleft, inright, indir=None):
        """Add one observed junction given by its 1-indexed left/right bases.

        The first observation creates the ``left`` and ``right`` ranges;
        later ones are wrapped in a FuzzyJunction and merged in through
        ``add_fuzzy_junction``.
        """
        if not self.left:  # this is our first one
            # A junction created empty has no chromosome/direction yet;
            # adopt them from the first observation so get_mode() works.
            if self.chr is None:
                self.chr = inchr
            if self.dir is None:
                self.dir = indir
            self.left = Bed(inchr, inleft - 1, inleft, indir)
            self.left.set_payload({"junc": [inleft], "start": None})
            self.right = Bed(inchr, inright - 1, inright, indir)
            self.right.set_payload({"junc": [inright], "end": None})
            return
        # Lets add this one to our current one
        # (the chromosome argument used to be misspelled `inchar`, which
        # raised NameError as soon as a second junction was added)
        newfuz = FuzzyJunction(inchr, inleft, inright, indir)
        self.add_fuzzy_junction(newfuz)

    def add_fuzzy_junction(self, newfuz):
        """Merge the ranges and observations of ``newfuz`` into this junction."""
        own_left = self.left.get_payload()
        own_right = self.right.get_payload()
        other_left = newfuz.left.get_payload()
        other_right = newfuz.right.get_payload()

        # the merged ranges keep (and keep extending) our own payload dicts
        mergeleft = self.left.merge(newfuz.left)
        mergeleft.set_payload(own_left)
        mergeright = self.right.merge(newfuz.right)
        mergeright.set_payload(own_right)
        merged_left = mergeleft.get_payload()
        merged_right = mergeright.get_payload()
        merged_left["junc"].extend(list(other_left["junc"]))
        merged_right["junc"].extend(list(other_right["junc"]))

        # fix the starts
        own_start = own_left["start"]
        other_start = other_left["start"]
        if other_start and not own_start:
            merged_left["start"] = other_start
        elif other_start and own_start:
            newrange = own_start.merge(other_start)
            newrange.set_payload(list(own_start.get_payload()) + list(other_start.get_payload()))
            merged_left["start"] = newrange

        # fix the ends
        own_end = own_right["end"]
        other_end = other_right["end"]
        if other_end and not own_end:
            merged_right["end"] = other_end
        elif other_end and own_end:
            # merge order (other first) is kept as it was
            newrange = other_end.merge(own_end)
            newrange.set_payload(list(own_end.get_payload()) + list(other_end.get_payload()))
            merged_right["end"] = newrange

        # We finished the changes
        self.left = mergeleft
        self.right = mergeright
def get_bed(self):
    """Return a single Bed covering this entry from its start to its end.

    The chromosome and direction are taken from ``self.start``; the range
    runs from ``self.start.start - 1`` to ``self.end.end``.
    """
    first = self.start
    last = self.end
    chrom = first.chr
    begin = first.start - 1
    finish = last.end
    return Bed(chrom, begin, finish, first.direction)