def do_reduction(subset, args, nrfuzzykey, location):
    """Build the genepred output for the families produced by subset reduction.

    Families are taken from ``get_subset_evidence(subset, nrfuzzykey, args)``.
    Every entry of ``nrfuzzykey`` whose key appears neither as a key of
    ``subset`` nor inside one of its values is then appended as a family of
    its own.

    Args:
        subset: mapping; its keys and the members of its values are compared
            against the keys of ``nrfuzzykey``.
        args: parsed options; ``junction_tolerance`` and
            ``output_original_table`` are read here.
        nrfuzzykey: mapping from key to fuzzy genepred object.
        location: range accumulated so far, or a falsy value, in which case
            the first emitted entry seeds it.

    Returns:
        ``[gpdlines, tablelines, locstring]``: the newline-terminated genepred
        lines, the tab-separated "output name / original name" lines (empty
        unless ``args.output_original_table`` is set), and the range string of
        the final location ('' when the location is falsy).
    """
    seen = set()
    for i in subset:
        seen.add(i)
        seen.update(subset[i])
    singles = [num for num in nrfuzzykey if num not in seen]

    families = get_subset_evidence(subset, nrfuzzykey, args)
    # Entries that never appear in the subset graph stand alone.
    families.extend(nrfuzzykey[num] for num in singles)

    gpd_parts = []
    table_parts = []
    for fz in families:
        gpdline = fz.get_genepred_line()
        gpd = GenePredEntry(gpdline)
        if not gpd.is_valid():
            sys.stderr.write("WARNING: invalid genepred entry generated\n" +
                             gpdline + "\n" + fz.get_info_string() + "\n")
            # Fall back to a single member entry: the one with the most exons.
            gpd = sorted(fz.gpds, key=lambda x: x.get_exon_count(),
                         reverse=True)[0]
            fz = FuzzyGenePred(gpd, juntol=args.junction_tolerance * 2)
            gpdline = fz.get_genepred_line()
            if not gpd.is_valid():
                sys.stderr.write("WARNING: still problem skilling\n")
                continue
        gpd_parts.append(gpdline + "\n")
        if args.output_original_table:
            name = gpd.entry['name']
            table_parts.extend(name + "\t" + g.entry['name'] + "\n"
                               for g in fz.gpds)
        grng = gpd.get_bed()
        grng.direction = None
        if not location:
            location = grng
        location = location.merge(grng)

    locstring = location.get_range_string() if location else ''
    return ["".join(gpd_parts), "".join(table_parts), locstring]
def do_reduction(subset, args, nrfuzzykey, location):
    """Build the genepred output for the families produced by subset reduction.

    Families are taken from ``get_subset_evidence(subset, nrfuzzykey, args)``.
    Every entry of ``nrfuzzykey`` whose key appears neither as a key of
    ``subset`` nor inside one of its values is then appended as a family of
    its own.

    Args:
        subset: mapping; its keys and the members of its values are compared
            against the keys of ``nrfuzzykey``.
        args: parsed options; ``junction_tolerance`` and
            ``output_original_table`` are read here.
        nrfuzzykey: mapping from key to fuzzy genepred object.
        location: range accumulated so far, or a falsy value, in which case
            the first emitted entry seeds it.

    Returns:
        ``[gpdlines, tablelines, locstring]``: the newline-terminated genepred
        lines, the tab-separated "output name / original name" lines (empty
        unless ``args.output_original_table`` is set), and the range string of
        the final location ('' when the location is falsy).
    """
    seen = set()
    for i in subset:
        seen.add(i)
        seen.update(subset[i])
    singles = [num for num in nrfuzzykey if num not in seen]

    families = get_subset_evidence(subset, nrfuzzykey, args)
    # Entries that never appear in the subset graph stand alone.
    families.extend(nrfuzzykey[num] for num in singles)

    gpd_parts = []
    table_parts = []
    for fz in families:
        gpdline = fz.get_genepred_line()
        gpd = GenePredEntry(gpdline)
        if not gpd.is_valid():
            sys.stderr.write("WARNING: invalid genepred entry generated\n" +
                             gpdline + "\n" + fz.get_info_string() + "\n")
            # Fall back to a single member entry: the one with the most exons.
            gpd = sorted(fz.gpds, key=lambda x: x.get_exon_count(),
                         reverse=True)[0]
            fz = FuzzyGenePred(gpd, juntol=args.junction_tolerance * 2)
            gpdline = fz.get_genepred_line()
            if not gpd.is_valid():
                sys.stderr.write("WARNING: still problem skilling\n")
                continue
        gpd_parts.append(gpdline + "\n")
        if args.output_original_table:
            name = gpd.entry['name']
            table_parts.extend(name + "\t" + g.entry['name'] + "\n"
                               for g in fz.gpds)
        grng = gpd.get_bed()
        grng.direction = None
        if not location:
            location = grng
        location = location.merge(grng)

    locstring = location.get_range_string() if location else ''
    return ["".join(gpd_parts), "".join(table_parts), locstring]
def _consume_depth(curr, depth, regions, depths, use_off_regions,
                   on_record=None):
    """Advance one sorted region list past ``curr`` and record its depth.

    Regions at the head of ``regions`` that ``curr`` has moved beyond
    (``curr.cmp(region) > 0``) are popped; each one's ``average`` is appended
    to ``depths`` unless its payload is empty and ``use_off_regions`` is
    false. ``on_record`` (if given) is called after every appended value.
    If ``curr`` then overlaps the new head region (``cmp == 0``), ``depth`` is
    added to that region's payload once per overlapping position.
    """
    # Check for emptiness *before* indexing: the list can be drained here.
    while regions and curr.cmp(regions[0]) > 0:
        passed = regions.pop(0)
        if not passed.get_payload() and not use_off_regions:
            continue
        depths.append(average(passed))
        if on_record is not None:
            on_record()
    if regions and curr.cmp(regions[0]) == 0:
        size = curr.overlap_size(regions[0])
        regions[0].get_payload().extend([depth] * size)


def main():
    """Collect short-read depth over intronic, intergenic and exonic windows.

    Reads a genepred file (``gpd_input``) to derive merged gene, exon, intron
    and buffered intergenic ranges, then streams the output of
    ``sam_to_bed_depth.py <bam_input>`` (tab-separated chrom, start, end,
    depth) and accumulates per-window depth. ``display`` is called every time
    an intergenic window is recorded. Exonic windows are only tracked when
    ``--get_exons`` is given. Nothing is returned.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('gpd_input')
    parser.add_argument('bam_input')
    parser.add_argument('--intergenic_buffer', default=10000, type=int)
    parser.add_argument('--window_size', default=10000, type=int)
    parser.add_argument('--bin_size', default=1000, type=int)
    parser.add_argument(
        '--use_off_regions', action='store_true',
        help="Use a region even if there is no reads mapped to it.")
    parser.add_argument('--get_exons', action='store_true')
    args = parser.parse_args()

    chr_beds = {}
    gene_beds = []
    exon_beds = []
    sys.stderr.write("Reading genepred file\n")
    asum = 0
    atot = 0
    with open(args.gpd_input) as inf:
        for line in inf:
            g = GenePredEntry(line)
            asum += g.length()
            atot += 1
            grng = g.get_bed()
            grng.direction = None
            # Grow one covering range per chromosome.
            if grng.chr not in chr_beds:
                chr_beds[grng.chr] = grng.copy()
            chr_beds[grng.chr] = chr_beds[grng.chr].merge(grng)
            gene_beds.append(grng)
            chrom = g.value('chrom')
            starts = g.value('exonStarts')
            ends = g.value('exonEnds')
            for i in range(g.get_exon_count()):
                exon_beds.append(Bed(chrom, starts[i], ends[i]))
    if atot == 0:
        # Without entries the average length below would divide by zero.
        sys.stderr.write("ERROR: no genepred entries found in " +
                         args.gpd_input + "\n")
        sys.exit(1)
    avglen = float(asum) / float(atot)

    sys.stderr.write("Sorting gene bed\n")
    gene_beds = sort_ranges(gene_beds)
    gene_beds = merge_ranges(gene_beds, already_sorted=True)
    sys.stderr.write("Sorting chromosome beds\n")
    chr_beds = sort_ranges(list(chr_beds.values()))
    sys.stderr.write("Sorting exon beds\n")
    exon_beds = sort_ranges(exon_beds)
    sys.stderr.write("Get padded genes\n")
    padded_gene_beds = pad_ranges(gene_beds, args.intergenic_buffer, chr_beds)
    padded_gene_beds = merge_ranges(padded_gene_beds, already_sorted=True)
    sys.stderr.write("Get intergenic regions\n")
    intergenic_beds = subtract_ranges(chr_beds, padded_gene_beds,
                                      already_sorted=True)
    intergenic_beds = merge_ranges(intergenic_beds, already_sorted=True)
    intergenic_beds = window_break(intergenic_beds, args.window_size)
    sys.stderr.write("Get merged exons\n")
    exon_beds = merge_ranges(exon_beds)
    sys.stderr.write("Get introns\n")
    intron_beds = subtract_ranges(gene_beds, exon_beds, already_sorted=True)
    intron_beds = merge_ranges(intron_beds, already_sorted=True)
    intron_beds = window_break(intron_beds, args.window_size)

    sys.stderr.write("Going through short reads\n")
    # Pass an argument list so paths containing whitespace stay intact, and
    # read text (not bytes) so the "\t" split below works on Python 3 as well.
    p = Popen(["sam_to_bed_depth.py", args.bam_input], stdout=PIPE,
              universal_newlines=True)

    # Payloads are the per-position read depths seen inside each window.
    for x in intron_beds:
        x.set_payload([])
    for x in intergenic_beds:
        x.set_payload([])
    for x in exon_beds:
        x.set_payload([])
    introndepth = []
    intergenicdepth = []
    exondepth = []
    pseudoreadcount = 0
    if not args.get_exons:
        exon_beds = []

    def report():
        # Reads the enclosing variables at call time, i.e. the current record.
        display(curr, introndepth, intergenicdepth, pseudoreadcount, avglen)

    section_count = 0
    while True:
        section_count += 1
        line = p.stdout.readline()
        if not line:
            break
        f = line.split("\t")
        depth = int(f[3])
        curr = Bed(f[0], int(f[1]), int(f[2]))
        if section_count % 100 == 0:
            sys.stderr.write(curr.get_range_string() + "          \r")
        pseudoreadcount += depth
        _consume_depth(curr, depth, exon_beds, exondepth,
                       args.use_off_regions)
        _consume_depth(curr, depth, intron_beds, introndepth,
                       args.use_off_regions)
        _consume_depth(curr, depth, intergenic_beds, intergenicdepth,
                       args.use_off_regions, on_record=report)

    if args.use_off_regions:
        # Windows never passed by a read are still counted in this mode.
        # NOTE(review): average() receives the payload list here but the
        # range object inside the streaming loop -- confirm which it expects.
        for x in exon_beds:
            exondepth.append(average(x.get_payload()))
        for x in intron_beds:
            introndepth.append(average(x.get_payload()))
        for x in intergenic_beds:
            intergenicdepth.append(average(x.get_payload()))
    p.communicate()
def do_prediction(compatible, args, nrfuzzykey, location):
    """Build the genepred output for predicted (concatenated) families.

    Every entry of ``nrfuzzykey`` starts as its own family and has
    ``params['proper_set']`` set to False. For each pair ``(i, j)`` listed in
    ``compatible``, the two entries are joined with ``concat_fuzzy_gpd`` and
    the result added as a further family, but only when they share at least
    one identical gpd line. Families judged equal by ``is_equal_fuzzy`` are
    then merged, and each family's ``gpds`` list is rebuilt without repeated
    lines.

    NOTE(review): the pair filter skips pairs that do NOT share a line; the
    original comment ("see if its already in there") could be read as wanting
    the opposite. Behaviour is kept as found -- confirm the intent.

    Args:
        compatible: mapping from a key of ``nrfuzzykey`` to an iterable of
            keys it may be concatenated with.
        args: parsed options; ``junction_tolerance`` and
            ``output_original_table`` are read here.
        nrfuzzykey: mapping from key to fuzzy genepred object (mutated: see
            above).
        location: range accumulated so far, or a falsy value, in which case
            the first emitted entry seeds it.

    Returns:
        ``[gpdlines, tablelines, locstring]``: the newline-terminated genepred
        lines, the tab-separated "output name / original name" lines (empty
        unless ``args.output_original_table`` is set), and the range string of
        the final location ('' when the location is falsy).
    """
    families = []
    for num in nrfuzzykey:
        families.append(nrfuzzykey[num])
        nrfuzzykey[num].params['proper_set'] = False  # partial overlap is enough

    for i in compatible:
        for j in compatible[i]:
            g1lines = set(g1.get_line() for g1 in nrfuzzykey[i].gpds)
            shares_line = any(g2.get_line() in g1lines
                              for g2 in nrfuzzykey[j].gpds)
            if not shares_line:
                continue
            together = nrfuzzykey[i].concat_fuzzy_gpd(nrfuzzykey[j])
            if together:
                families.append(together)

    # Combine families that are equal to one another.
    newfam = []
    while families:
        fam = families.pop(0)
        remaining = []
        for other in families:
            if fam.is_equal_fuzzy(other):
                added = fam.add_fuzzy_gpd(other)
                if added:
                    fam = added
                    continue
                # A failed merge must not replace fam with a falsy value;
                # keep both families instead of losing one (or crashing).
                sys.stderr.write("WARNING NOT SURE WHY NOT ADDED EQUAL\n")
            remaining.append(other)
        families = remaining
        newfam.append(fam)
    families = newfam

    # Rebuild each family's members so no gpd line is used twice, keeping the
    # first-seen order so the output is reproducible.
    # This may damage the fuzzy object.
    for fam in families:
        used = set()
        unique = []
        for g in fam.gpds:
            gline = g.get_line()
            if gline not in used:
                used.add(gline)
                unique.append(GenePredEntry(gline))
        fam.gpds = unique

    gpd_parts = []
    table_parts = []
    for fz in families:
        gpdline = fz.get_genepred_line()
        gpd = GenePredEntry(gpdline)
        if not gpd.is_valid():
            sys.stderr.write("WARNING: invalid genepred entry generated\n" +
                             gpdline + "\n" + fz.get_info_string() + "\n")
            # Fall back to a single member entry: the one with the most exons.
            gpd = sorted(fz.gpds, key=lambda x: x.get_exon_count(),
                         reverse=True)[0]
            fz = FuzzyGenePred(gpd, juntol=args.junction_tolerance * 2)
            gpdline = fz.get_genepred_line()
            if not gpd.is_valid():
                sys.stderr.write("WARNING: still problem skilling\n")
                continue
        gpd_parts.append(gpdline + "\n")
        if args.output_original_table:
            name = gpd.entry['name']
            table_parts.extend(name + "\t" + g.entry['name'] + "\n"
                               for g in fz.gpds)
        grng = gpd.get_bed()
        grng.direction = None
        if not location:
            location = grng
        location = location.merge(grng)

    locstring = location.get_range_string() if location else ''
    return ["".join(gpd_parts), "".join(table_parts), locstring]
def do_prediction(compatible, args, nrfuzzykey, location):
    """Build the genepred output for predicted (concatenated) families.

    Every entry of ``nrfuzzykey`` starts as its own family and has
    ``params['proper_set']`` set to False. For each pair ``(i, j)`` listed in
    ``compatible``, the two entries are joined with ``concat_fuzzy_gpd`` and
    the result added as a further family, but only when they share at least
    one identical gpd line. Families judged equal by ``is_equal_fuzzy`` are
    then merged, and each family's ``gpds`` list is rebuilt without repeated
    lines.

    NOTE(review): the pair filter skips pairs that do NOT share a line; the
    original comment ("see if its already in there") could be read as wanting
    the opposite. Behaviour is kept as found -- confirm the intent.

    Args:
        compatible: mapping from a key of ``nrfuzzykey`` to an iterable of
            keys it may be concatenated with.
        args: parsed options; ``junction_tolerance`` and
            ``output_original_table`` are read here.
        nrfuzzykey: mapping from key to fuzzy genepred object (mutated: see
            above).
        location: range accumulated so far, or a falsy value, in which case
            the first emitted entry seeds it.

    Returns:
        ``[gpdlines, tablelines, locstring]``: the newline-terminated genepred
        lines, the tab-separated "output name / original name" lines (empty
        unless ``args.output_original_table`` is set), and the range string of
        the final location ('' when the location is falsy).
    """
    families = []
    for num in nrfuzzykey:
        families.append(nrfuzzykey[num])
        nrfuzzykey[num].params['proper_set'] = False  # partial overlap is enough

    for i in compatible:
        for j in compatible[i]:
            g1lines = set(g1.get_line() for g1 in nrfuzzykey[i].gpds)
            shares_line = any(g2.get_line() in g1lines
                              for g2 in nrfuzzykey[j].gpds)
            if not shares_line:
                continue
            together = nrfuzzykey[i].concat_fuzzy_gpd(nrfuzzykey[j])
            if together:
                families.append(together)

    # Combine families that are equal to one another.
    newfam = []
    while families:
        fam = families.pop(0)
        remaining = []
        for other in families:
            if fam.is_equal_fuzzy(other):
                added = fam.add_fuzzy_gpd(other)
                if added:
                    fam = added
                    continue
                # A failed merge must not replace fam with a falsy value;
                # keep both families instead of losing one (or crashing).
                sys.stderr.write("WARNING NOT SURE WHY NOT ADDED EQUAL\n")
            remaining.append(other)
        families = remaining
        newfam.append(fam)
    families = newfam

    # Rebuild each family's members so no gpd line is used twice, keeping the
    # first-seen order so the output is reproducible.
    # This may damage the fuzzy object.
    for fam in families:
        used = set()
        unique = []
        for g in fam.gpds:
            gline = g.get_line()
            if gline not in used:
                used.add(gline)
                unique.append(GenePredEntry(gline))
        fam.gpds = unique

    gpd_parts = []
    table_parts = []
    for fz in families:
        gpdline = fz.get_genepred_line()
        gpd = GenePredEntry(gpdline)
        if not gpd.is_valid():
            sys.stderr.write("WARNING: invalid genepred entry generated\n" +
                             gpdline + "\n" + fz.get_info_string() + "\n")
            # Fall back to a single member entry: the one with the most exons.
            gpd = sorted(fz.gpds, key=lambda x: x.get_exon_count(),
                         reverse=True)[0]
            fz = FuzzyGenePred(gpd, juntol=args.junction_tolerance * 2)
            gpdline = fz.get_genepred_line()
            if not gpd.is_valid():
                sys.stderr.write("WARNING: still problem skilling\n")
                continue
        gpd_parts.append(gpdline + "\n")
        if args.output_original_table:
            name = gpd.entry['name']
            table_parts.extend(name + "\t" + g.entry['name'] + "\n"
                               for g in fz.gpds)
        grng = gpd.get_bed()
        grng.direction = None
        if not location:
            location = grng
        location = location.merge(grng)

    locstring = location.get_range_string() if location else ''
    return ["".join(gpd_parts), "".join(table_parts), locstring]
def _consume_depth(curr, depth, regions, depths, use_off_regions,
                   on_record=None):
    """Advance one sorted region list past ``curr`` and record its depth.

    Regions at the head of ``regions`` that ``curr`` has moved beyond
    (``curr.cmp(region) > 0``) are popped; each one's ``average`` is appended
    to ``depths`` unless its payload is empty and ``use_off_regions`` is
    false. ``on_record`` (if given) is called after every appended value.
    If ``curr`` then overlaps the new head region (``cmp == 0``), ``depth`` is
    added to that region's payload once per overlapping position.
    """
    # Check for emptiness *before* indexing: the list can be drained here.
    while regions and curr.cmp(regions[0]) > 0:
        passed = regions.pop(0)
        if not passed.get_payload() and not use_off_regions:
            continue
        depths.append(average(passed))
        if on_record is not None:
            on_record()
    if regions and curr.cmp(regions[0]) == 0:
        size = curr.overlap_size(regions[0])
        regions[0].get_payload().extend([depth] * size)


def main():
    """Collect short-read depth over intronic, intergenic and exonic windows.

    Reads a genepred file (``gpd_input``) to derive merged gene, exon, intron
    and buffered intergenic ranges, then streams the output of
    ``sam_to_bed_depth.py <bam_input>`` (tab-separated chrom, start, end,
    depth) and accumulates per-window depth. ``display`` is called every time
    an intergenic window is recorded. Exonic windows are only tracked when
    ``--get_exons`` is given. Nothing is returned.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('gpd_input')
    parser.add_argument('bam_input')
    parser.add_argument('--intergenic_buffer', default=10000, type=int)
    parser.add_argument('--window_size', default=10000, type=int)
    parser.add_argument('--bin_size', default=1000, type=int)
    parser.add_argument(
        '--use_off_regions', action='store_true',
        help="Use a region even if there is no reads mapped to it.")
    parser.add_argument('--get_exons', action='store_true')
    args = parser.parse_args()

    chr_beds = {}
    gene_beds = []
    exon_beds = []
    sys.stderr.write("Reading genepred file\n")
    asum = 0
    atot = 0
    with open(args.gpd_input) as inf:
        for line in inf:
            g = GenePredEntry(line)
            asum += g.length()
            atot += 1
            grng = g.get_bed()
            grng.direction = None
            # Grow one covering range per chromosome.
            if grng.chr not in chr_beds:
                chr_beds[grng.chr] = grng.copy()
            chr_beds[grng.chr] = chr_beds[grng.chr].merge(grng)
            gene_beds.append(grng)
            chrom = g.value('chrom')
            starts = g.value('exonStarts')
            ends = g.value('exonEnds')
            for i in range(g.get_exon_count()):
                exon_beds.append(Bed(chrom, starts[i], ends[i]))
    if atot == 0:
        # Without entries the average length below would divide by zero.
        sys.stderr.write("ERROR: no genepred entries found in " +
                         args.gpd_input + "\n")
        sys.exit(1)
    avglen = float(asum) / float(atot)

    sys.stderr.write("Sorting gene bed\n")
    gene_beds = sort_ranges(gene_beds)
    gene_beds = merge_ranges(gene_beds, already_sorted=True)
    sys.stderr.write("Sorting chromosome beds\n")
    chr_beds = sort_ranges(list(chr_beds.values()))
    sys.stderr.write("Sorting exon beds\n")
    exon_beds = sort_ranges(exon_beds)
    sys.stderr.write("Get padded genes\n")
    padded_gene_beds = pad_ranges(gene_beds, args.intergenic_buffer, chr_beds)
    padded_gene_beds = merge_ranges(padded_gene_beds, already_sorted=True)
    sys.stderr.write("Get intergenic regions\n")
    intergenic_beds = subtract_ranges(chr_beds, padded_gene_beds,
                                      already_sorted=True)
    intergenic_beds = merge_ranges(intergenic_beds, already_sorted=True)
    intergenic_beds = window_break(intergenic_beds, args.window_size)
    sys.stderr.write("Get merged exons\n")
    exon_beds = merge_ranges(exon_beds)
    sys.stderr.write("Get introns\n")
    intron_beds = subtract_ranges(gene_beds, exon_beds, already_sorted=True)
    intron_beds = merge_ranges(intron_beds, already_sorted=True)
    intron_beds = window_break(intron_beds, args.window_size)

    sys.stderr.write("Going through short reads\n")
    # Pass an argument list so paths containing whitespace stay intact, and
    # read text (not bytes) so the "\t" split below works on Python 3 as well.
    p = Popen(["sam_to_bed_depth.py", args.bam_input], stdout=PIPE,
              universal_newlines=True)

    # Payloads are the per-position read depths seen inside each window.
    for x in intron_beds:
        x.set_payload([])
    for x in intergenic_beds:
        x.set_payload([])
    for x in exon_beds:
        x.set_payload([])
    introndepth = []
    intergenicdepth = []
    exondepth = []
    pseudoreadcount = 0
    if not args.get_exons:
        exon_beds = []

    def report():
        # Reads the enclosing variables at call time, i.e. the current record.
        display(curr, introndepth, intergenicdepth, pseudoreadcount, avglen)

    section_count = 0
    while True:
        section_count += 1
        line = p.stdout.readline()
        if not line:
            break
        f = line.split("\t")
        depth = int(f[3])
        curr = Bed(f[0], int(f[1]), int(f[2]))
        if section_count % 100 == 0:
            sys.stderr.write(curr.get_range_string() + "          \r")
        pseudoreadcount += depth
        _consume_depth(curr, depth, exon_beds, exondepth,
                       args.use_off_regions)
        _consume_depth(curr, depth, intron_beds, introndepth,
                       args.use_off_regions)
        _consume_depth(curr, depth, intergenic_beds, intergenicdepth,
                       args.use_off_regions, on_record=report)

    if args.use_off_regions:
        # Windows never passed by a read are still counted in this mode.
        # NOTE(review): average() receives the payload list here but the
        # range object inside the streaming loop -- confirm which it expects.
        for x in exon_beds:
            exondepth.append(average(x.get_payload()))
        for x in intron_beds:
            introndepth.append(average(x.get_payload()))
        for x in intergenic_beds:
            intergenicdepth.append(average(x.get_payload()))
    p.communicate()