def _read_first_column(path):
    """Return the set of first tab-delimited fields found in the file at path."""
    with open(path) as handle:
        return set(row.rstrip().split("\t")[0] for row in handle)


def main():
    """Filter genepred lines by transcript length and/or name lists.

    Reads genepred lines from the positional ``input`` file (or STDIN when it
    is '-'), skips lines starting with '#', and writes every line that passes
    all requested filters to STDOUT.  With ``-v/--invert`` the lines that
    fail at least one filter are written instead.

    Filters (each is only applied when its option is given):
      --min_length  drop entries whose ``length()`` is below the value
      --max_length  drop entries whose ``length()`` is above the value
      --names       keep entries whose 'name' is in the first column of the file
      --gene_names  keep entries whose 'gene_name' is in the first column of
                    the file
    """
    parser = argparse.ArgumentParser(
        description="Filter a genepred by transcript length")
    # The input is read, so '-' selects STDIN (the old help text said STDOUT).
    parser.add_argument('input', help="Input '-' for STDIN")
    parser.add_argument('--min_length', type=int,
                        help="Minimum transcript length")
    parser.add_argument('--max_length', type=int,
                        help="Maximum transcript length")
    parser.add_argument('--names', help="filter on a name list")
    parser.add_argument('--gene_names', help="filter on a gene name list")
    parser.add_argument('-v', '--invert', action='store_true',
                        help='Invert search result')
    args = parser.parse_args()

    name_list = _read_first_column(args.names) if args.names else set()
    gene_name_list = (_read_first_column(args.gene_names)
                      if args.gene_names else set())

    inf = sys.stdin if args.input == '-' else open(args.input)
    try:
        for line in inf:
            if line.startswith('#'):
                continue
            is_good = True
            g = GPD(line.rstrip())
            tot = g.length()
            # A length limit of 0 is treated the same as "not set".
            if args.min_length and tot < args.min_length:
                is_good = False
            if args.max_length and tot > args.max_length:
                is_good = False
            if args.names and g.value('name') not in name_list:
                is_good = False
            # Fixed: the lookup set is the local gene_name_list; the parsed
            # args object has no 'gene_name_list' attribute.
            if args.gene_names and g.value('gene_name') not in gene_name_list:
                is_good = False
            # Normal mode emits passing lines, inverted mode emits failures.
            if is_good != args.invert:
                sys.stdout.write(line.rstrip() + "\n")
    finally:
        if inf is not sys.stdin:
            inf.close()
def main():
    """Feed fixed-size fragments of every genepred transcript through HISAT.

    For each genepred line whose ``length()`` is at least ``--fragment_size``,
    the transcript sequence is fetched from the reference and every
    ``fragment_size``-long window is written as a FASTA record to a ``hisat``
    subprocess.  HISAT's stdout is piped into
    ``genepred_counts_to_mappability.py`` (located next to this script), which
    receives the --threads, -k, --perbase, --output and --type options.

    Each FASTA header is ``encode_name`` applied to the tab-joined fields:
    name, gene_name, 1-based input line number, sequence length, fragment
    offset.
    """
    parser = argparse.ArgumentParser(
        description="For every genepred entry report its alignability",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input',
                        help="Genepred can be gzipped or - for STDIN")
    parser.add_argument('-r', '--reference', required=True,
                        help="Reference fasta")
    parser.add_argument('-k', '--fragment_size', default=100, type=int,
                        help="Fragment size to try to align")
    parser.add_argument('-x', '--hisat_index', required=True,
                        help="HISAT index base name")
    parser.add_argument('--threads', type=int, default=cpu_count(),
                        help="number of threads")
    parser.add_argument('--type', choices=['mean', 'median'], default='mean',
                        help="How to bring together overlapping reads")
    parser.add_argument('--perbase', action='store_true')
    parser.add_argument('--output', '-o',
                        help="output file or leave unset for STDOUT")
    args = parser.parse_args()

    if args.input == '-':
        args.input = sys.stdin
    elif args.input.endswith('.gz'):
        args.input = gzip.open(args.input)
    else:
        args.input = open(args.input)

    # Commands are built as argument lists rather than split strings so that
    # paths containing whitespace reach the child processes intact.
    udir = os.path.dirname(os.path.realpath(__file__))
    cmd2 = [udir + '/genepred_counts_to_mappability.py', '-',
            '--threads', str(args.threads),
            '-k', str(args.fragment_size)]
    if args.perbase:
        cmd2.append('--perbase')
    if args.output:
        cmd2.extend(['--output', args.output])
    if args.type:
        cmd2.extend(['--type', args.type])
    p2 = Popen(cmd2, stdin=PIPE)

    ref = read_fasta_into_hash(args.reference)
    cmd1 = ['hisat', '-x', args.hisat_index, '-U', '-', '-f', '--reorder',
            '-p', str(args.threads)]
    # NOTE(review): `null` is not defined in this function; presumably a
    # module-level handle on the null device -- confirm.
    p1 = Popen(cmd1, stdin=PIPE, stdout=p2.stdin, stderr=null)

    # NOTE(review): records are written to the pipe as str, which matches
    # Python 2 pipes; confirm the target interpreter before porting.
    try:
        for line_number, line in enumerate(args.input, 1):
            gpd = GPD(line.rstrip())
            if gpd.length() < args.fragment_size:
                continue
            seq = gpd.get_sequence(ref)
            # Everything except the fragment offset is constant per transcript.
            prefix = "\t".join([gpd.value('name'), gpd.value('gene_name'),
                                str(line_number), str(len(seq))])
            for i in range(0, len(seq) - args.fragment_size + 1):
                einfo = encode_name(prefix + "\t" + str(i))
                p1.stdin.write('>' + einfo + "\n")
                p1.stdin.write(seq[i:i + args.fragment_size] + "\n")
    finally:
        if args.input is not sys.stdin:
            args.input.close()
    # communicate() closes each child's stdin and waits for it to finish.
    p1.communicate()
    p2.communicate()
def _drain_passed_regions(curr, beds, depths, use_off_regions, on_record=None):
    """Pop regions that curr has moved past and record their average depth."""
    # Emptiness is tested before indexing so an exhausted queue ends the loop
    # instead of raising IndexError.
    while beds and curr.cmp(beds[0]) > 0:
        finished = beds.popleft()
        if not finished.get_payload() and not use_off_regions:
            continue
        depths.append(average(finished))
        if on_record is not None:
            on_record()


def _add_overlap_depth(curr, depth, beds):
    """Append depth once per overlapping base to the front region's payload."""
    if beds and curr.cmp(beds[0]) == 0:
        size = curr.overlap_size(beds[0])
        beds[0].get_payload().extend([depth] * size)


def main():
    """Collect read depth over exon, intron and intergenic windows.

    Reads a genepred file to build merged gene, exon and per-chromosome
    ranges, derives intron windows (genes minus exons) and intergenic windows
    (chromosome span minus genes padded by --intergenic_buffer), both broken
    into --window_size pieces.  It then streams the output of
    ``sam_to_bed_depth.py <bam_input>`` (tab-separated chrom, start, end,
    depth) and, for each depth interval, adds the depth to the payload of the
    window it overlaps.  When the stream moves past a window its average is
    appended to the matching depth list; ``display`` is called each time an
    intergenic window is recorded.

    Exon windows are only tracked with --get_exons.  Windows with no payload
    are skipped unless --use_off_regions is given.
    """
    from collections import deque

    parser = argparse.ArgumentParser()
    parser.add_argument('gpd_input')
    parser.add_argument('bam_input')
    parser.add_argument('--intergenic_buffer', default=10000, type=int)
    parser.add_argument('--window_size', default=10000, type=int)
    parser.add_argument('--bin_size', default=1000, type=int)
    parser.add_argument(
        '--use_off_regions', action='store_true',
        help="Use a region even if there is no reads mapped to it.")
    parser.add_argument('--get_exons', action='store_true')
    args = parser.parse_args()

    chr_beds = {}
    gene_beds = []
    exon_beds = []
    sys.stderr.write("Reading genepred file\n")
    asum = 0
    atot = 0
    with open(args.gpd_input) as inf:
        for line in inf:
            g = GenePredEntry(line)
            asum += g.length()
            atot += 1
            grng = g.get_bed()
            grng.direction = None
            # Grow one range per chromosome that spans every gene seen on it.
            if grng.chr not in chr_beds:
                chr_beds[grng.chr] = grng.copy()
            chr_beds[grng.chr] = chr_beds[grng.chr].merge(grng)
            gene_beds.append(grng)
            for i in range(0, g.get_exon_count()):
                exon_beds.append(Bed(g.value('chrom'),
                                     g.value('exonStarts')[i],
                                     g.value('exonEnds')[i]))
    avglen = float(asum) / float(atot)

    sys.stderr.write("Sorting gene bed\n")
    gene_beds = sort_ranges(gene_beds)
    gene_beds = merge_ranges(gene_beds, already_sorted=True)
    sys.stderr.write("Sorting chromosome beds\n")
    chr_beds = sort_ranges(list(chr_beds.values()))
    sys.stderr.write("Sorting exon beds\n")
    exon_beds = sort_ranges(exon_beds)
    sys.stderr.write("Get padded genes\n")
    padded_gene_beds = pad_ranges(gene_beds, args.intergenic_buffer, chr_beds)
    padded_gene_beds = merge_ranges(padded_gene_beds, already_sorted=True)
    sys.stderr.write("Get intergenic regions\n")
    intergenic_beds = subtract_ranges(chr_beds, padded_gene_beds,
                                      already_sorted=True)
    intergenic_beds = merge_ranges(intergenic_beds, already_sorted=True)
    intergenic_beds = window_break(intergenic_beds, args.window_size)
    sys.stderr.write("Get merged exons\n")
    exon_beds = merge_ranges(exon_beds)
    sys.stderr.write("Get introns\n")
    intron_beds = subtract_ranges(gene_beds, exon_beds, already_sorted=True)
    intron_beds = merge_ranges(intron_beds, already_sorted=True)
    intron_beds = window_break(intron_beds, args.window_size)

    sys.stderr.write("Going through short reads\n")
    # Argument list (not a split string) so a path with spaces stays intact.
    p = Popen(["sam_to_bed_depth.py", args.bam_input], stdout=PIPE)

    # Payloads are per-base read depths collected for each region.
    for x in intron_beds:
        x.set_payload([])
    for x in intergenic_beds:
        x.set_payload([])
    for x in exon_beds:
        x.set_payload([])

    # Regions are consumed from the front; deques make that O(1) per pop.
    intron_beds = deque(intron_beds)
    intergenic_beds = deque(intergenic_beds)
    exon_beds = deque(exon_beds) if args.get_exons else deque()

    introndepth = []
    intergenicdepth = []
    exondepth = []
    pseudoreadcount = 0
    section_count = 0
    while True:
        section_count += 1
        line = p.stdout.readline()
        if not line:
            break
        f = line.split("\t")
        depth = int(f[3])
        curr = Bed(f[0], int(f[1]), int(f[2]))
        if section_count % 100 == 0:
            sys.stderr.write(curr.get_range_string() + "         \r")
        pseudoreadcount += depth

        _drain_passed_regions(curr, exon_beds, exondepth,
                              args.use_off_regions)
        _add_overlap_depth(curr, depth, exon_beds)

        _drain_passed_regions(curr, intron_beds, introndepth,
                              args.use_off_regions)
        _add_overlap_depth(curr, depth, intron_beds)

        # Progress is reported every time an intergenic window is recorded.
        _drain_passed_regions(
            curr, intergenic_beds, intergenicdepth, args.use_off_regions,
            on_record=lambda: display(curr, introndepth, intergenicdepth,
                                      pseudoreadcount, avglen))
        _add_overlap_depth(curr, depth, intergenic_beds)

    if args.use_off_regions:
        # NOTE(review): average() receives the payload list here but the
        # region object in _drain_passed_regions, as in the original code --
        # confirm which form average() expects.
        for x in exon_beds:
            # Fixed: leftover exon regions belong in exondepth, not
            # introndepth.
            exondepth.append(average(x.get_payload()))
        for x in intron_beds:
            introndepth.append(average(x.get_payload()))
        for x in intergenic_beds:
            intergenicdepth.append(average(x.get_payload()))
    p.communicate()
def _drain_passed_regions(curr, beds, depths, use_off_regions, on_record=None):
    """Pop regions that curr has moved past and record their average depth."""
    # Emptiness is tested before indexing so an exhausted queue ends the loop
    # instead of raising IndexError.
    while beds and curr.cmp(beds[0]) > 0:
        finished = beds.popleft()
        if not finished.get_payload() and not use_off_regions:
            continue
        depths.append(average(finished))
        if on_record is not None:
            on_record()


def _add_overlap_depth(curr, depth, beds):
    """Append depth once per overlapping base to the front region's payload."""
    if beds and curr.cmp(beds[0]) == 0:
        size = curr.overlap_size(beds[0])
        beds[0].get_payload().extend([depth] * size)


def main():
    """Collect read depth over exon, intron and intergenic windows.

    Reads a genepred file to build merged gene, exon and per-chromosome
    ranges, derives intron windows (genes minus exons) and intergenic windows
    (chromosome span minus genes padded by --intergenic_buffer), both broken
    into --window_size pieces.  It then streams the output of
    ``sam_to_bed_depth.py <bam_input>`` (tab-separated chrom, start, end,
    depth) and, for each depth interval, adds the depth to the payload of the
    window it overlaps.  When the stream moves past a window its average is
    appended to the matching depth list; ``display`` is called each time an
    intergenic window is recorded.

    Exon windows are only tracked with --get_exons.  Windows with no payload
    are skipped unless --use_off_regions is given.
    """
    from collections import deque

    parser = argparse.ArgumentParser()
    parser.add_argument('gpd_input')
    parser.add_argument('bam_input')
    parser.add_argument('--intergenic_buffer', default=10000, type=int)
    parser.add_argument('--window_size', default=10000, type=int)
    parser.add_argument('--bin_size', default=1000, type=int)
    parser.add_argument(
        '--use_off_regions', action='store_true',
        help="Use a region even if there is no reads mapped to it.")
    parser.add_argument('--get_exons', action='store_true')
    args = parser.parse_args()

    chr_beds = {}
    gene_beds = []
    exon_beds = []
    sys.stderr.write("Reading genepred file\n")
    asum = 0
    atot = 0
    with open(args.gpd_input) as inf:
        for line in inf:
            g = GenePredEntry(line)
            asum += g.length()
            atot += 1
            grng = g.get_bed()
            grng.direction = None
            # Grow one range per chromosome that spans every gene seen on it.
            if grng.chr not in chr_beds:
                chr_beds[grng.chr] = grng.copy()
            chr_beds[grng.chr] = chr_beds[grng.chr].merge(grng)
            gene_beds.append(grng)
            for i in range(0, g.get_exon_count()):
                exon_beds.append(Bed(g.value('chrom'),
                                     g.value('exonStarts')[i],
                                     g.value('exonEnds')[i]))
    avglen = float(asum) / float(atot)

    sys.stderr.write("Sorting gene bed\n")
    gene_beds = sort_ranges(gene_beds)
    gene_beds = merge_ranges(gene_beds, already_sorted=True)
    sys.stderr.write("Sorting chromosome beds\n")
    chr_beds = sort_ranges(list(chr_beds.values()))
    sys.stderr.write("Sorting exon beds\n")
    exon_beds = sort_ranges(exon_beds)
    sys.stderr.write("Get padded genes\n")
    padded_gene_beds = pad_ranges(gene_beds, args.intergenic_buffer, chr_beds)
    padded_gene_beds = merge_ranges(padded_gene_beds, already_sorted=True)
    sys.stderr.write("Get intergenic regions\n")
    intergenic_beds = subtract_ranges(chr_beds, padded_gene_beds,
                                      already_sorted=True)
    intergenic_beds = merge_ranges(intergenic_beds, already_sorted=True)
    intergenic_beds = window_break(intergenic_beds, args.window_size)
    sys.stderr.write("Get merged exons\n")
    exon_beds = merge_ranges(exon_beds)
    sys.stderr.write("Get introns\n")
    intron_beds = subtract_ranges(gene_beds, exon_beds, already_sorted=True)
    intron_beds = merge_ranges(intron_beds, already_sorted=True)
    intron_beds = window_break(intron_beds, args.window_size)

    sys.stderr.write("Going through short reads\n")
    # Argument list (not a split string) so a path with spaces stays intact.
    p = Popen(["sam_to_bed_depth.py", args.bam_input], stdout=PIPE)

    # Payloads are per-base read depths collected for each region.
    for x in intron_beds:
        x.set_payload([])
    for x in intergenic_beds:
        x.set_payload([])
    for x in exon_beds:
        x.set_payload([])

    # Regions are consumed from the front; deques make that O(1) per pop.
    intron_beds = deque(intron_beds)
    intergenic_beds = deque(intergenic_beds)
    exon_beds = deque(exon_beds) if args.get_exons else deque()

    introndepth = []
    intergenicdepth = []
    exondepth = []
    pseudoreadcount = 0
    section_count = 0
    while True:
        section_count += 1
        line = p.stdout.readline()
        if not line:
            break
        f = line.split("\t")
        depth = int(f[3])
        curr = Bed(f[0], int(f[1]), int(f[2]))
        if section_count % 100 == 0:
            sys.stderr.write(curr.get_range_string() + "         \r")
        pseudoreadcount += depth

        _drain_passed_regions(curr, exon_beds, exondepth,
                              args.use_off_regions)
        _add_overlap_depth(curr, depth, exon_beds)

        _drain_passed_regions(curr, intron_beds, introndepth,
                              args.use_off_regions)
        _add_overlap_depth(curr, depth, intron_beds)

        # Progress is reported every time an intergenic window is recorded.
        _drain_passed_regions(
            curr, intergenic_beds, intergenicdepth, args.use_off_regions,
            on_record=lambda: display(curr, introndepth, intergenicdepth,
                                      pseudoreadcount, avglen))
        _add_overlap_depth(curr, depth, intergenic_beds)

    if args.use_off_regions:
        # NOTE(review): average() receives the payload list here but the
        # region object in _drain_passed_regions, as in the original code --
        # confirm which form average() expects.
        for x in exon_beds:
            # Fixed: leftover exon regions belong in exondepth, not
            # introndepth.
            exondepth.append(average(x.get_payload()))
        for x in intron_beds:
            introndepth.append(average(x.get_payload()))
        for x in intergenic_beds:
            intergenicdepth.append(average(x.get_payload()))
    p.communicate()
def main():
    """Feed fixed-size fragments of every genepred transcript through HISAT.

    For each genepred line whose ``length()`` is at least ``--fragment_size``,
    the transcript sequence is fetched from the reference and every
    ``fragment_size``-long window is written as a FASTA record to a ``hisat``
    subprocess.  HISAT's stdout is piped into
    ``genepred_counts_to_mappability.py`` (located next to this script), which
    receives the --threads, -k, --perbase, --output and --type options.

    Each FASTA header is ``encode_name`` applied to the tab-joined fields:
    name, gene_name, 1-based input line number, sequence length, fragment
    offset.
    """
    parser = argparse.ArgumentParser(
        description="For every genepred entry report its alignability",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input',
                        help="Genepred can be gzipped or - for STDIN")
    parser.add_argument('-r', '--reference', required=True,
                        help="Reference fasta")
    parser.add_argument('-k', '--fragment_size', default=100, type=int,
                        help="Fragment size to try to align")
    parser.add_argument('-x', '--hisat_index', required=True,
                        help="HISAT index base name")
    parser.add_argument('--threads', type=int, default=cpu_count(),
                        help="number of threads")
    parser.add_argument('--type', choices=['mean', 'median'], default='mean',
                        help="How to bring together overlapping reads")
    parser.add_argument('--perbase', action='store_true')
    parser.add_argument('--output', '-o',
                        help="output file or leave unset for STDOUT")
    args = parser.parse_args()

    if args.input == '-':
        args.input = sys.stdin
    elif args.input.endswith('.gz'):
        args.input = gzip.open(args.input)
    else:
        args.input = open(args.input)

    # Commands are built as argument lists rather than split strings so that
    # paths containing whitespace reach the child processes intact.
    udir = os.path.dirname(os.path.realpath(__file__))
    cmd2 = [udir + '/genepred_counts_to_mappability.py', '-',
            '--threads', str(args.threads),
            '-k', str(args.fragment_size)]
    if args.perbase:
        cmd2.append('--perbase')
    if args.output:
        cmd2.extend(['--output', args.output])
    if args.type:
        cmd2.extend(['--type', args.type])
    p2 = Popen(cmd2, stdin=PIPE)

    ref = read_fasta_into_hash(args.reference)
    cmd1 = ['hisat', '-x', args.hisat_index, '-U', '-', '-f', '--reorder',
            '-p', str(args.threads)]
    # NOTE(review): `null` is not defined in this function; presumably a
    # module-level handle on the null device -- confirm.
    p1 = Popen(cmd1, stdin=PIPE, stdout=p2.stdin, stderr=null)

    # NOTE(review): records are written to the pipe as str, which matches
    # Python 2 pipes; confirm the target interpreter before porting.
    try:
        for line_number, line in enumerate(args.input, 1):
            gpd = GPD(line.rstrip())
            if gpd.length() < args.fragment_size:
                continue
            seq = gpd.get_sequence(ref)
            # Everything except the fragment offset is constant per transcript.
            prefix = "\t".join([gpd.value('name'), gpd.value('gene_name'),
                                str(line_number), str(len(seq))])
            for i in range(0, len(seq) - args.fragment_size + 1):
                einfo = encode_name(prefix + "\t" + str(i))
                p1.stdin.write('>' + einfo + "\n")
                p1.stdin.write(seq[i:i + args.fragment_size] + "\n")
    finally:
        if args.input is not sys.stdin:
            args.input.close()
    # communicate() closes each child's stdin and waits for it to finish.
    p1.communicate()
    p2.communicate()