def main():
    """Write an expression-weighted exon coverage table for an IDP output folder.

    Command-line arguments:
      input     -- IDP output folder; ``isoform.gpd`` and ``isoform.exp`` are
                   read from it.
      --offset  -- integer added to every scaled expression value.
      --mult    -- integer every expression value is multiplied by.
      -o        -- output path (gzip-opened when it ends in ``.gz``);
                   STDOUT when omitted.

    Each transcript's exon ranges are repeated
    ``int(expression * mult + offset)`` times, the pooled ranges are passed
    to ``ranges_to_coverage`` and every returned range is written as
    ``chr <tab> start-1 <tab> end <tab> payload``.

    Raises:
      KeyError: a transcript in ``isoform.exp`` is absent from ``isoform.gpd``.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="IDP output folder")
    parser.add_argument('--offset', type=int, default=1,
                        help="add this much to all expressions")
    parser.add_argument('--mult', type=int, default=10,
                        help="multiply all expressions by this much")
    parser.add_argument('-o', '--output',
                        help="OUTPUT file or nothing for STDOUT")
    args = parser.parse_args()
    args.input = args.input.rstrip('/')

    # Transcript name -> list of exon ranges.  A name that occurs on several
    # lines keeps accumulating exons in the same list.
    txs = {}
    with open(args.input + '/isoform.gpd') as inf:
        sys.stderr.write("Reading isoform.gpd\n")
        for line in inf:
            gpd = GPD(line)
            txs.setdefault(gpd.get_transcript_name(), []).extend(
                exon.get_range() for exon in gpd.exons)

    sys.stderr.write("Reading isoform.exp file\n")
    vals = []
    with open(args.input + '/isoform.exp') as inf:
        for line in inf:
            if not line.strip():
                # A blank (e.g. trailing) line has no expression column.
                continue
            f = line.rstrip().split("\t")
            copies = int((float(f[1]) * args.mult) + args.offset)
            # list * n is empty for n <= 0, like the loop it replaces.
            vals.extend(txs[f[0]] * copies)

    sys.stderr.write("Generating coverage file " + str(len(vals)) + "\n")
    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')
    try:
        for v in ranges_to_coverage(vals):
            of.write(v.chr + "\t" + str(v.start - 1) + "\t" + str(v.end) +
                     "\t" + str(v.get_payload()) + "\n")
    finally:
        of.close()
def do_multi_round_locus(gpds, args):
    """Run ``process_locus`` repeatedly until the merged count stops changing.

    After a first merge, every merged entry is expanded back into up to
    ``min(evidence, max(minimum_support + 1, minimum_junction_end_support + 1))``
    fresh GPD objects and the set is merged again.  The loop ends when two
    consecutive rounds return the same number of entries.

    Args:
      gpds: entries for one locus, passed straight to ``process_locus``.
      args: options; reads ``threads``, ``minimum_support``,
        ``minimum_junction_end_support`` and ``gene_names``.

    Returns:
      The result of the last ``process_locus`` call.
    """
    def _log(message):
        # Progress is only reported in single-threaded runs.
        if args.threads == 1:
            sys.stderr.write(message)

    _log("processing " + str(len(gpds)) + " gpds\n")
    merged = process_locus(gpds, args)
    _log("merged to " + str(len(merged)) + " gpds\n")

    previous_count = -1
    round_number = 1
    while previous_count != len(merged):
        round_number += 1
        previous_count = len(merged)
        rebuilt_entries = []
        for entry in merged:
            copies = min(entry['evidence'],
                         max(args.minimum_support + 1,
                             args.minimum_junction_end_support + 1))
            for _ in range(0, copies):
                candidate = GPD(entry['tx'].get_fake_gpd_line())
                if not candidate.validate():
                    _log("WARNING: 1. failed to make valid gpd. losing candidate\n")
                    continue
                fresh = GPD(candidate.get_fake_gpd_line())
                # replace the gene name if we know it
                if args.gene_names:
                    fresh.set_gene_name(entry['tx'].get_gene_name())
                rebuilt_entries.append(fresh)
        gpds = rebuilt_entries
        merged = process_locus(gpds, args)
        _log("round " + str(round_number) + " merged to " +
             str(len(merged)) + " gpds\n")
    return merged
def do_buffer(gpd_lines, fasta, args):
    """Cut fixed-length fragments out of transcripts and convert them with ``get_sam``.

    Transcripts shorter than ``args.length`` are skipped.

    * ``args.short_reads`` set: one fragment of ``args.length`` per
      transcript, starting at 0 or (with probability 0.5) at
      ``length_of_transcript % args.length``.
    * otherwise: ``args.coverage`` passes per transcript; each pass picks a
      random start in ``[0, args.length)`` and emits every consecutive,
      non-overlapping fragment that fits completely.

    Args:
      gpd_lines: iterable of GenePred lines.
      fasta: reference handed unchanged to ``get_sam``.
      args: options; reads ``length``, ``short_reads`` and ``coverage``.

    Returns:
      List of ``get_sam`` results, in generation order.
    """
    results = []
    for gpd_line in gpd_lines:
        gpd = GPD(gpd_line)
        tx_len = gpd.get_length()
        if tx_len < args.length:
            continue
        if args.short_reads:
            offset = 0
            if random.random() < 0.5:
                offset = tx_len % args.length
            gsub = gpd.subset(offset, args.length + offset)
            results.append(get_sam(gsub, fasta))
            continue
        for _ in range(0, args.coverage):
            # tx_len >= args.length here, so at least one whole fragment
            # fits and the start is always drawn from [0, args.length).
            # The former "shorter than one fragment" branch could never run.
            init = random.randrange(args.length)
            # Last admissible start is tx_len - args.length.
            for j in range(init, tx_len - args.length + 1, args.length):
                gsub = gpd.subset(j, j + args.length)
                results.append(get_sam(gsub, fasta))
    return results
def do_buffer(gpd_lines,fasta,args): results = [] for gpd_line in gpd_lines: gpd = GPD(gpd_line) l = gpd.get_length() if l < args.length: continue num = int(float(l)/float(args.length)) rem = l % args.length #print 'rem : '+str(rem) extra = 0 offset = 0 #if space > 1: # we have room to make multiple passes # #print '---' # #print 'length: '+str(l) # #print 'strand: '+gpd.get_strand() # if random.random() < 0.5: extra = rem # offset = int(float(args.length)/float(args.coverage)) #else: # offset = int(float(rem)/float(args.coverage)) if args.short_reads: offset = 0 if random.random() < 0.5: offset = rem gsub = gpd.subset(offset,args.length+offset) #print gsub.get_gpd_line() val = get_sam(gsub,fasta) results.append(val) #continue else:# not short reads for i in range(0,args.coverage): init = 0 if num == 0 and rem > 0: init = random.choice(range(0,rem)) elif num > 0: init = random.choice(range(0,args.length)) #start = (i*offset+extra) % args.length #while start+args.length <= l: for j in range(init,l,args.length): if j + args.length > l: break #print str(start)+" "+str(start+args.length) gsub = gpd.subset(j,j+args.length) val = get_sam(gsub,fasta) results.append(val) #print gsub.get_sequence(fasta) #start += args.length #print gsub.get_strand() #print space #print rem #print gpd return results
def main():
    """Write an expression-weighted exon coverage table for an IDP output folder.

    Command-line arguments:
      input     -- IDP output folder; ``isoform.gpd`` and ``isoform.exp`` are
                   read from it.
      --offset  -- integer added to every scaled expression value.
      --mult    -- integer every expression value is multiplied by.
      -o        -- output path (gzip-opened when it ends in ``.gz``);
                   STDOUT when omitted.

    Each transcript's exon ranges are repeated
    ``int(expression * mult + offset)`` times, the pooled ranges are passed
    to ``ranges_to_coverage`` and every returned range is written as
    ``chr <tab> start-1 <tab> end <tab> payload``.

    Raises:
      KeyError: a transcript in ``isoform.exp`` is absent from ``isoform.gpd``.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="IDP output folder")
    parser.add_argument('--offset', type=int, default=1,
                        help="add this much to all expressions")
    parser.add_argument('--mult', type=int, default=10,
                        help="multiply all expressions by this much")
    parser.add_argument('-o', '--output',
                        help="OUTPUT file or nothing for STDOUT")
    args = parser.parse_args()
    args.input = args.input.rstrip('/')

    # Transcript name -> exon ranges; repeated names extend the same list.
    txs = {}
    with open(args.input + '/isoform.gpd') as inf:
        sys.stderr.write("Reading isoform.gpd\n")
        for line in inf:
            gpd = GPD(line)
            tx = gpd.get_transcript_name()
            if tx not in txs:
                txs[tx] = []
            txs[tx].extend(exon.get_range() for exon in gpd.exons)

    sys.stderr.write("Reading isoform.exp file\n")
    vals = []
    with open(args.input + '/isoform.exp') as inf:
        for line in inf:
            if not line.strip():
                # Blank lines carry no transcript/expression pair.
                continue
            f = line.rstrip().split("\t")
            repeat = int((float(f[1]) * args.mult) + args.offset)
            # A non-positive repeat contributes nothing (list * n == []).
            vals.extend(txs[f[0]] * repeat)

    sys.stderr.write("Generating coverage file " + str(len(vals)) + "\n")
    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')
    try:
        for v in ranges_to_coverage(vals):
            of.write(v.chr + "\t" + str(v.start - 1) + "\t" + str(v.end) +
                     "\t" + str(v.get_payload()) + "\n")
    finally:
        of.close()
def main(args):
    """Annotate input GenePred lines against a reference using a worker pool.

    The reference GenePred is loaded into the module-level ``txome`` dict
    (chromosome -> list of ranges whose payload is the reference GPD) BEFORE
    the pool is created; ``annotate_line`` reads that global in the workers.
    Items produced by ``generate_tx(inf, args)`` are mapped through
    ``annotate_line`` and every non-empty result is written to the output.

    Args:
      args: options; reads ``output`` (path or falsy for STDOUT),
        ``reference`` (path, ``.gz`` allowed), ``input`` (path or ``-`` for
        STDIN, ``.gz`` allowed) and ``threads``.
    """
    global txome

    of = sys.stdout
    if args.output:
        if re.search(r'\.gz$', args.output):
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')

    # read the reference gpd
    txome = {}
    if re.search(r'\.gz$', args.reference):
        rinf = gzip.open(args.reference)
    else:
        rinf = open(args.reference)
    sys.stderr.write("Reading in reference\n")
    z = 0
    try:
        # populate txome with reference transcripts for each chromosome
        for line in rinf:
            z += 1
            gpd = GPD(line)
            gpd.set_payload(z)  # 1-based line number of the reference entry
            if z % 100 == 0:
                sys.stderr.write(str(z) + " \r")
            r = gpd.get_range()
            r.set_payload(gpd)
            txome.setdefault(gpd.value('chrom'), []).append(r)
    finally:
        rinf.close()
    sys.stderr.write(str(z) + " \r")
    sys.stderr.write("\n")

    inf = sys.stdin
    if args.input != '-':
        if re.search(r'\.gz$', args.input):
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)

    sys.stderr.write("annotating\n")
    p = Pool(processes=args.threads)
    try:
        results2 = p.imap(func=annotate_line,
                          iterable=generate_tx(inf, args),
                          chunksize=100)
        for res in results2:
            if not res:
                continue
            of.write(res)
        p.close()
    finally:
        # After a clean close() every result has been consumed, so this
        # only matters on error, where it stops workers still running.
        p.terminate()
        p.join()
        if inf is not sys.stdin:
            inf.close()
    of.close()
def __init__(self, gpd_file=None, ref_fasta=None):
    """Start with an empty transcript list, optionally filled from a file.

    Args:
      gpd_file: path to a GenePred file; every line becomes one ``GPD``
        appended to ``self.transcripts``.  Nothing is loaded when falsy.
      ref_fasta: when truthy, ``get_sequence(ref_fasta)`` is called on each
        loaded transcript.
    """
    self.transcripts = []
    if not gpd_file:
        # Nothing to load; ref_fasta would have no transcripts to act on.
        return
    from Bio.Format.GPD import GPD
    with open(gpd_file) as handle:
        for entry_line in handle:
            self.transcripts.append(GPD(entry_line))
    if ref_fasta:
        for transcript in self.transcripts:
            transcript.get_sequence(ref_fasta)
def do_buffer(buffer, txome, args):
    """Annotate buffered reads and format each hit as a tab-separated row.

    Args:
      buffer: iterable of ``[line, z]`` pairs (GenePred line and its id).
      txome: reference lookup handed to ``annotate_line``.
      args: options handed to ``annotate_line``.

    Returns:
      List of newline-terminated rows.  Columns: id, read gene name,
      reference gene name, reference transcript name, ``full``/``partial``,
      annotation items 2..8 in order, read range string, reference range
      string, reference payload.  Reads with a falsy annotation are skipped.
    """
    rows = []
    for item in buffer:
        read_id = item[1]
        gpd = GPD(item[0])
        hit = annotate_line(gpd, txome, args)
        if not hit:
            continue
        match_kind = 'full' if hit[0] else 'partial'
        reference = hit[9]
        columns = [
            str(read_id),
            gpd.get_gene_name(),
            reference.get_gene_name(),
            reference.get_transcript_name(),
            match_kind,
            str(hit[2]),  # exon_count
            str(hit[3]),  # most_consecutive_exons
            str(hit[4]),  # read_exon_count
            str(hit[5]),  # tx_exon_count
            str(hit[6]),  # overlap_size
            str(hit[7]),  # read_length
            str(hit[8]),  # tx_length
            gpd.get_range().get_range_string(),
            reference.get_range().get_range_string(),
            str(reference.get_payload()),
        ]
        rows.append("\t".join(columns) + "\n")
    return rows
def main(args):
    """Serialize a transcriptome plus weighting scheme into one encoded line.

    Loads the reference fasta, attaches it to every entry of the reference
    GenePred, and stores ``Transcriptome.dump_serialized()`` together with a
    weight type and optional per-transcript weights in a dict.  The dict is
    pickled, zlib-compressed and base64-encoded, then written as one line.

    Args:
      args: options; reads ``reference_fasta``, ``reference_gpd``,
        ``expression_table`` (optional path, ``.gz`` allowed, two
        tab-separated columns: name and weight),
        ``exponential_distribution``, ``output`` (path or falsy for STDOUT),
        ``specific_tempdir`` and ``tempdir``.
    """
    sys.stderr.write("Reading reference fasta\n")
    with open(args.reference_fasta, 'rb') as fasta_handle:
        ref_genome = FastaData(fasta_handle.read())

    sys.stderr.write("Reading in transcriptome\n")
    output = {}
    txome = Transcriptome()
    z = 0
    with open(args.reference_gpd) as inf:
        for line in inf:
            z += 1
            if z % 1000 == 0:
                sys.stderr.write(str(z) + "       \r")
            gpd = GPD(line)
            gpd.set_sequence(ref_genome)
            txome.add_transcript(gpd)
    sys.stderr.write("\n")

    sys.stderr.write("Serializing transcriptome\n")
    output['txome'] = txome.dump_serialized()

    txweights = {}
    weight_type = 'uniform_distribution'  # default
    if args.expression_table:
        weight_type = 'expression_table'
        if args.expression_table[-3:] == '.gz':
            inf = gzip.open(args.expression_table)
        else:
            inf = open(args.expression_table)
        try:
            for line in inf:
                if not line.strip():
                    # A blank line has no weight column to parse.
                    continue
                f = line.rstrip().split("\t")
                txweights[f[0]] = float(f[1])
        finally:
            inf.close()
    elif args.exponential_distribution:
        weight_type = 'exponential_distribution'
    output['weight_type'] = weight_type
    output['weights'] = txweights  # only matters for expression based

    encoded = base64.b64encode(zlib.compress(pickle.dumps(output)))
    if not isinstance(encoded, str):
        # b64encode returns bytes on Python 3; the handle is text-mode.
        encoded = encoded.decode('ascii')
    of = sys.stdout
    if args.output:
        of = open(args.output, 'w')
    try:
        of.write(encoded + "\n")
    finally:
        of.close()

    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def main(args):
    """Annotate input GenePred lines against a reference using a worker pool.

    The reference GenePred is loaded into the module-level ``txome`` dict
    (chromosome -> list of ranges whose payload is the reference GPD) BEFORE
    the pool is created; ``annotate_line`` reads that global in the workers.
    Items produced by ``generate_tx(inf, args)`` are mapped through
    ``annotate_line`` and every non-empty result is written to the output.

    Args:
      args: options; reads ``output`` (path or falsy for STDOUT),
        ``reference`` (path, ``.gz`` allowed), ``input`` (path or ``-`` for
        STDIN, ``.gz`` allowed) and ``threads``.
    """
    global txome

    of = sys.stdout
    if args.output:
        if re.search(r'\.gz$', args.output):
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')

    # read the reference gpd
    txome = {}
    if re.search(r'\.gz$', args.reference):
        rinf = gzip.open(args.reference)
    else:
        rinf = open(args.reference)
    sys.stderr.write("Reading in reference\n")
    z = 0
    try:
        # populate txome with reference transcripts for each chromosome
        for line in rinf:
            z += 1
            gpd = GPD(line)
            gpd.set_payload(z)  # 1-based line number of the reference entry
            if z % 100 == 0:
                sys.stderr.write(str(z) + " \r")
            chrom = gpd.value('chrom')
            if chrom not in txome:
                txome[chrom] = []
            r = gpd.get_range()
            r.set_payload(gpd)
            txome[chrom].append(r)
    finally:
        rinf.close()
    sys.stderr.write(str(z) + " \r")
    sys.stderr.write("\n")

    inf = sys.stdin
    if args.input != '-':
        if re.search(r'\.gz$', args.input):
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)

    sys.stderr.write("annotating\n")
    p = Pool(processes=args.threads)
    try:
        results2 = p.imap(func=annotate_line,
                          iterable=generate_tx(inf, args),
                          chunksize=100)
        for res in results2:
            if not res:
                continue
            of.write(res)
        p.close()
    finally:
        # A no-op for finished work after close(); on error it stops the
        # workers so join() cannot wait on abandoned tasks.
        p.terminate()
        p.join()
        if inf is not sys.stdin:
            inf.close()
    of.close()
def process_locus(igpds, rgpds, args):
    """Report, per input transcript, junction pairs not matched in the reference.

    Args:
      igpds: input GenePred lines of the locus.
      rgpds: reference GenePred lines of the locus.
      args: options forwarded to ``get_consecutive_junctions`` and
        ``junction_match``.

    Returns:
      One tab-separated string per input entry: gene name, transcript name,
      number of junction items, number of unsupported pairs, and the
      unsupported pairs joined by ``;`` (each pair rendered ``a~~b``).
    """
    input_entries = [GPD(line) for line in igpds]
    reference_entries = [GPD(line) for line in rgpds]
    input_junctions = get_consecutive_junctions(input_entries, args)
    reference_junctions = get_consecutive_junctions(reference_entries, args)

    # consolidate reference junctions into one flat list
    pooled_reference = []
    for _ref_gpd, ref_juncs in reference_junctions:
        pooled_reference.extend(ref_juncs)

    report = []
    for in_gpd, juncs in input_junctions:  # one gpd at a time
        unsupported = junction_match(juncs, pooled_reference, args)
        columns = [
            in_gpd.get_gene_name(),
            in_gpd.get_transcript_name(),
            str(len(juncs)),
            str(len(unsupported)),
            ";".join([pair[0].get_string() + "~~" + pair[1].get_string()
                      for pair in unsupported]),
        ]
        report.append("\t".join(columns))
    return report
def main():
    """Turn a bed depth track into normalized per-transcript coverage profiles.

    Steps, all driven by the options returned from ``do_inputs()``:

    1. Read the reference genepred, grouping transcripts by gene.
    2. Drop genes with more than ``maximum_isoforms`` transcripts (when > 0)
       and transcripts shorter than ``minimum_length``.
    3. Write the remaining exons as a sorted bed file in ``tempdir`` and
       intersect the depth track (``bed_depth``) with it via
       ``bedtools intersect -wo``.
    4. Per transcript passing ``minimum_average_depth`` and
       ``minimum_length`` of covered bases, build a position -> depth map
       over its exons.
    5. Bin positions into up to 100 bins, average each bin, divide by the
       largest bin, reverse for ``-`` strand, and write
       ``name <tab> values...`` to ``args.output``.

    The temp directory is removed unless ``specific_tempdir`` is set.
    """
    # do our inputs
    args = do_inputs()

    sys.stderr.write("Reading reference genepred\n")
    ref = {}
    tx_strand = {}
    z = 0
    with open(args.reference_genepred) as inf:
        for line in inf:
            gpd = GPD(line)
            gname = gpd.get_gene_name()
            tname = gpd.get_transcript_name()
            tx_strand[tname] = gpd.get_strand()
            ref.setdefault(gname, []).append(gpd)
            z += 1
    sys.stderr.write("Read " + str(len(ref)) + " genes and " + str(z) +
                     " transcripts\n")

    if args.maximum_isoforms > 0:
        sys.stderr.write("Removing genes with more than " +
                         str(args.maximum_isoforms) + " isoforms.\n")
        # Snapshot the keys: deleting while iterating the live key view
        # raises RuntimeError on Python 3.
        for gname in list(ref.keys()):
            if len(ref[gname]) > args.maximum_isoforms:
                del ref[gname]
        sys.stderr.write("Now have " + str(len(ref)) + " genes and " +
                         str(sum([len(ref[x]) for x in ref])) +
                         " transcripts\n")

    sys.stderr.write("Filtering by length " + str(args.minimum_length) +
                     " bp\n")
    for gname in list(ref.keys()):
        passing = [gpd for gpd in ref[gname]
                   if not gpd.get_length() < args.minimum_length]
        if passing:
            ref[gname] = passing
        else:
            del ref[gname]
    sys.stderr.write("Now have " + str(len(ref)) + " genes and " +
                     str(sum([len(ref[x]) for x in ref])) + " transcripts\n")

    sys.stderr.write("Converting gpd into exon bed\n")
    beds = []
    for gname in ref:
        for gpd in ref[gname]:
            tname = gpd.get_transcript_name()
            for i, ex in enumerate(gpd.exons):
                beds.append(ex.get_range().get_bed_array() + [gname, tname, i])
    gpd_bed_path = args.tempdir + '/gpd.bed'
    with open(gpd_bed_path, 'w') as of:
        for bed in sorted(beds, key=lambda x: (x[0], x[1], x[2], x[3], x[4], x[5])):
            of.write("\t".join([str(x) for x in bed]) + "\n")

    sys.stderr.write("intersecting with bed depth\n")
    intersect_path = args.tempdir + '/intersect.bed'
    # Argument list instead of a split string, so a temp directory that
    # contains whitespace does not break the command.
    cmd = ['bedtools', 'intersect', '-wo', '-a', '-', '-b', gpd_bed_path]
    with open(intersect_path, 'w') as of:
        p = Popen(cmd, stdin=args.bed_depth, stdout=of)
        p.communicate()
    if p.returncode != 0:
        sys.stderr.write("WARNING: bedtools intersect exited with status " +
                         str(p.returncode) + "\n")

    coverage = {}
    sys.stderr.write("Reading the intersection\n")
    with open(intersect_path) as inf:
        for line in inf:
            f = line.rstrip().split("\t")
            gname = f[7]
            tname = f[8]
            depth = int(f[3])
            bed1 = Bed(f[0], int(f[1]), int(f[2]))  # depth interval
            bed2 = Bed(f[4], int(f[5]), int(f[6]))  # exon interval
            bed = bed1.union(bed2)
            bed.set_payload(depth)
            coverage.setdefault(gname, {}).setdefault(tname, []).append(bed)

    transcript_depths = {}
    for gname in coverage:
        for tname in coverage[gname]:
            covered = coverage[gname][tname]
            ref_gpd = [x for x in ref[gname]
                       if x.get_transcript_name() == tname][0]
            rlen = ref_gpd.get_length()
            bases_covered = sum([x.length() for x in covered])
            bases_area = sum([x.length() * x.get_payload() for x in covered])
            avg_depth = float(bases_area) / float(rlen)
            if avg_depth < args.minimum_average_depth:
                continue
            if bases_covered < args.minimum_length:
                continue
            # Every exonic position starts at depth 0 (zero indexed), then
            # covered positions are overwritten with their depth.
            total_positions = {}
            for ex in ref_gpd.exons:
                b = ex.get_range().get_bed_array()
                for i in range(b[1], b[2]):
                    total_positions[i] = 0
            for b in covered:
                depth = b.get_payload()
                barr = b.get_bed_array()
                for i in range(barr[1], barr[2]):
                    total_positions[i] = depth
            transcript_depths[tname] = total_positions

    sys.stderr.write("have information needed to plot from " +
                     str(len(transcript_depths)) + " transcripts\n")
    for tname in transcript_depths:
        depths = transcript_depths[tname]
        positions = sorted(depths.keys())
        tx_len = len(positions)
        bins = {}
        for i in range(0, tx_len):
            # Percentile bin (0..99) of the i-th transcript position.
            bin_index = int(100 * float(i) / float(tx_len))
            bins.setdefault(bin_index, []).append(depths[positions[i]])
        if not bins:
            # No positions at all; nothing to normalize.
            continue
        for bin_index in bins:
            bins[bin_index] = average(bins[bin_index])
        biggest = float(max(bins.values()))
        if biggest == 0:
            # All-zero profile cannot be scaled to its maximum.
            continue
        tx_array = [float(bins[x]) / biggest for x in sorted(bins.keys())]
        if tx_strand[tname] == '-':
            tx_array.reverse()
        args.output.write(tname + "\t" +
                          "\t".join([str(x) for x in tx_array]) + "\n")
    args.output.close()

    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def main(args):
    """Convert GenePred entries into BED12 lines, with an optional track header.

    Args:
      args: options; reads ``input`` (path, ``.gz`` allowed, or ``-`` for
        STDIN), ``output`` (path, gzip-opened when it ends in ``.gz``, or
        falsy for STDOUT), ``color`` (blue/green/orange/purple/red, anything
        else is black), ``noheader``, ``headername``, ``headerdescription``,
        ``minintron`` (when set, entries are rebuilt from
        ``smooth_gaps(minintron)``) and ``namefield`` (1 selects
        ``gene_name``, otherwise ``name``).

    Each output line has twelve tab-separated fields: chrom, start, end,
    name, ``1000``, strand, start, end, color, exon count, comma-terminated
    exon sizes, comma-terminated exon starts relative to the first exon.
    """
    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            of = gzip.open(args.output, 'w')
        else:
            # Without this branch a plain output path was ignored and the
            # result went to STDOUT.
            of = open(args.output, 'w')

    palette = {
        'blue': '67,162,202',
        'green': '49,163,84',
        'orange': '254,178,76',
        'purple': '136,86,167',
        'red': '240,59,32',
    }
    color = palette.get(args.color, '0,0,0')

    # set up the header if one is desired
    if not args.noheader:
        newname = 'longreads'
        m = re.search(r'([^/]+)$', args.input)  # last path component
        if m:
            newname = m.group(1)
        newname = re.sub(r'[\s]+', '_', newname)
        if args.headername:
            newname = args.headername
        elif args.input == '-':
            newname = 'STDIN'
        description = newname + ' GenePred Entries'
        if args.headerdescription:
            description = args.headerdescription
        header = "track\tname=" + newname + "\t"
        header += 'description="' + description + '"' + "\t"
        header += 'itemRgb="On"'
        of.write(header + "\n")

    gpd_handle = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            gpd_handle = gzip.open(args.input)
        else:
            gpd_handle = open(args.input)

    for gpd in GPDStream(gpd_handle):
        if args.minintron:
            gpd = GPD(gpd.smooth_gaps(args.minintron).get_gpd_line())
        exoncount = gpd.get_exon_count()
        starts = gpd.value('exonStarts')
        ends = gpd.value('exonEnds')
        first = str(starts[0])
        last = str(ends[exoncount - 1])
        if args.namefield == 1:
            name = gpd.value('gene_name')
        else:
            name = gpd.value('name')
        sizes = ''.join([str(ends[i] - starts[i]) + ','
                         for i in range(0, exoncount)])
        offsets = ''.join([str(starts[i] - starts[0]) + ','
                           for i in range(0, exoncount)])
        # Joining guarantees a tab after the name in both namefield modes;
        # previously the 'name' branch ran straight into the score column.
        fields = [gpd.value('chrom'), first, last, name, '1000',
                  gpd.value('strand'), first, last, color, str(exoncount),
                  sizes, offsets]
        of.write("\t".join(fields) + "\n")
    gpd_handle.close()
    of.close()
def main(args):
    """Annotate reads chromosome by chromosome, optionally in a worker pool.

    The reference GenePred is indexed by chromosome, the reads are buffered
    per chromosome together with their 1-based line number, and
    ``do_buffer`` is run once per chromosome that also exists in the
    reference.  All resulting rows are written to the module-level ``of``.

    Args:
      args: options; reads ``output`` (path, ``.gz`` allowed),
        ``reference`` (path, ``.gz`` allowed), ``input`` (path or ``-`` for
        STDIN, ``.gz`` allowed) and ``threads``.

    Raises:
      ValueError: a non-blank input line has no third tab-separated field.

    NOTE(review): when ``args.output`` is falsy the pre-existing global
    ``of`` is used as is -- presumably initialised to STDOUT at module
    level; confirm.
    """
    global of
    global chrtotal

    if args.output:
        if re.search(r'\.gz$', args.output):
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')

    # read the reference gpd
    txome = {}
    if re.search(r'\.gz$', args.reference):
        rinf = gzip.open(args.reference)
    else:
        rinf = open(args.reference)
    sys.stderr.write("Reading in reference\n")
    z = 0
    try:
        for line in rinf:
            z += 1
            gpd = GPD(line)
            gpd.set_payload(z)
            if z % 100 == 0:
                sys.stderr.write(str(z) + "          \r")
            r = gpd.get_range()
            r.set_payload(gpd)
            txome.setdefault(gpd.value('chrom'), []).append(r)
    finally:
        rinf.close()
    sys.stderr.write(str(z) + "          \r")
    sys.stderr.write("\n")

    inf = sys.stdin
    if args.input != '-':
        if re.search(r'\.gz$', args.input):
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)

    # Third tab-separated column is the chromosome; compiled once, not per line.
    chrom_pattern = re.compile(r'[^\t]*\t[^\t]*\t([^\t]+)')
    z = 0
    chroms = {}
    sys.stderr.write("Buffering reads\n")
    for line in inf:
        z += 1  # counted before any skip so read ids stay line numbers
        if not line.strip():
            continue
        m = chrom_pattern.match(line)
        if not m:
            raise ValueError("input line " + str(z) +
                             " has no chromosome column")
        chrom = m.group(1)
        if z % 100 == 0:
            sys.stderr.write(str(z) + "          \r")
        chroms.setdefault(chrom, []).append([line, z])
    sys.stderr.write("\n")
    sys.stderr.write("Finished buffering reads\n")

    if args.threads > 1:
        p = Pool(processes=args.threads)
    results = []
    chrtotal = len(chroms)
    for chrom in chroms:
        if chrom not in txome:
            continue
        # Each job only receives the reference entries of its chromosome.
        chrom_txome = {chrom: txome[chrom]}
        if args.threads > 1:
            v = p.apply_async(do_buffer,
                              args=(chroms[chrom], chrom_txome, args),
                              callback=do_out)
            results.append(v)
        else:
            v = do_buffer(chroms[chrom], chrom_txome, args)
            # Wrapped so it exposes .get() like the async results.
            results.append(Queue(v))
            do_out(v)
    if args.threads > 1:
        p.close()
        p.join()
    sys.stderr.write("\n")
    for res in [x.get() for x in results]:
        for oline in res:
            of.write(oline)
    inf.close()
    of.close()
def main(args):
    """Convert GenePred entries into BED12 lines, with an optional track header.

    Args:
      args: options; reads ``input`` (path, ``.gz`` allowed, or ``-`` for
        STDIN), ``output`` (path, gzip-opened when it ends in ``.gz``, or
        falsy for STDOUT), ``color`` (blue/green/orange/purple/red, anything
        else is black), ``noheader``, ``headername``, ``headerdescription``,
        ``minintron`` (when set, entries are rebuilt from
        ``smooth_gaps(minintron)``) and ``namefield`` (1 selects
        ``gene_name``, otherwise ``name``).

    Each output line has twelve tab-separated fields: chrom, start, end,
    name, ``1000``, strand, start, end, color, exon count, comma-terminated
    exon sizes, comma-terminated exon starts relative to the first exon.
    """
    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            of = gzip.open(args.output, 'w')
        else:
            # A plain (non-gzip) path must be honoured too; it used to be
            # ignored, sending everything to STDOUT.
            of = open(args.output, 'w')

    named_colors = {
        'blue': '67,162,202',
        'green': '49,163,84',
        'orange': '254,178,76',
        'purple': '136,86,167',
        'red': '240,59,32',
    }
    color = named_colors.get(args.color, '0,0,0')

    # set up the header if one is desired
    if not args.noheader:
        newname = 'longreads'
        m = re.search(r'([^/]+)$', args.input)  # last path component
        if m:
            newname = m.group(1)
        newname = re.sub(r'[\s]+', '_', newname)
        if args.headername:
            newname = args.headername
        elif args.input == '-':
            newname = 'STDIN'
        description = newname + ' GenePred Entries'
        if args.headerdescription:
            description = args.headerdescription
        header = ("track\tname=" + newname + "\t" +
                  'description="' + description + '"' + "\t" +
                  'itemRgb="On"')
        of.write(header + "\n")

    gpd_handle = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            gpd_handle = gzip.open(args.input)
        else:
            gpd_handle = open(args.input)

    gs = GPDStream(gpd_handle)
    for gpd in gs:
        if args.minintron:
            gpd = GPD(gpd.smooth_gaps(args.minintron).get_gpd_line())
        exoncount = gpd.get_exon_count()
        exon_starts = gpd.value('exonStarts')
        exon_ends = gpd.value('exonEnds')
        chrom_start = str(exon_starts[0])
        chrom_end = str(exon_ends[exoncount - 1])
        if args.namefield == 1:
            label = gpd.value('gene_name')
        else:
            label = gpd.value('name')
        block_sizes = ''.join([str(exon_ends[i] - exon_starts[i]) + ','
                               for i in range(0, exoncount)])
        block_starts = ''.join([str(exon_starts[i] - exon_starts[0]) + ','
                                for i in range(0, exoncount)])
        # Every field is tab-joined, so the name is always separated from
        # the score; the 'name' branch used to omit that tab.
        of.write("\t".join([
            gpd.value('chrom'), chrom_start, chrom_end, label, '1000',
            gpd.value('strand'), chrom_start, chrom_end, color,
            str(exoncount), block_sizes, block_starts,
        ]) + "\n")
    gpd_handle.close()
    of.close()
def main():
    """Write exon coverage weighted by how often each transcript was annotated.

    Command-line arguments:
      input       -- annotation table (``-`` for STDIN, ``.gz`` allowed);
                     column 3 is the gene, column 4 the transcript and
                     column 5 the match type.
      genepred    -- genepred file supplying each transcript's exons.
      --min_exons -- skip transcripts with fewer exons than this.
      --full      -- only count rows whose match type is ``full``.
      -o          -- output path (gzip-opened when it ends in ``.gz``);
                     STDOUT when omitted.

    Each transcript's exon ranges are repeated once per annotation row,
    pooled, passed to ``ranges_to_coverage`` and written as
    ``chr <tab> start-1 <tab> end <tab> payload``.

    Raises:
      KeyError: an annotated transcript is missing from the genepred file.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('genepred', help="the genepred used for this alignqc")
    parser.add_argument('--min_exons', type=int, default=1,
                        help="At least this number of exons")
    parser.add_argument('--full', action='store_true',
                        help="only use full matches")
    parser.add_argument('-o', '--output',
                        help="OUTPUT file or nothing for STDOUT")
    args = parser.parse_args()

    inf = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)

    # gene -> {transcript: number of annotation rows}
    genes = {}
    sys.stderr.write("Reading annotation file\n")
    try:
        for line in inf:
            if not line.strip():
                # Blank lines have no gene/transcript columns.
                continue
            f = line.rstrip().split("\t")
            gene = f[2]
            tx = f[3]
            match_type = f[4]
            if args.full and match_type != 'full':
                continue
            tx_counts = genes.setdefault(gene, {})
            tx_counts[tx] = tx_counts.get(tx, 0) + 1
    finally:
        inf.close()

    txs = {}
    sys.stderr.write("Reading genepred file\n")
    z = 0
    with open(args.genepred) as inf:
        for line in inf:
            z += 1
            if z % 1000 == 0:
                sys.stderr.write(str(z) + "      \r")
            gpd = GPD(line)
            txs[gpd.get_transcript_name()] = [ex.get_range()
                                              for ex in gpd.exons]
    sys.stderr.write("\n")

    vals = []
    sys.stderr.write("Traversing annotation file\n")
    for gene in genes:
        for tx in genes[gene]:
            exons = txs[tx]
            if len(exons) < args.min_exons:
                continue
            # One copy of the exon list per annotation row.
            vals.extend(exons * genes[gene][tx])

    sys.stderr.write("Generating coverage file " + str(len(vals)) + "\n")
    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')
    try:
        for v in ranges_to_coverage(vals):
            of.write(v.chr + "\t" + str(v.start - 1) + "\t" + str(v.end) +
                     "\t" + str(v.get_payload()) + "\n")
    finally:
        of.close()
def annotate_line(inputs):
    """Pick the best overlapping reference transcript for one read.

    Args:
      inputs: tuple ``(line, z, args)`` -- a GenePred line, its id, and the
        options object (not used here).

    Returns:
      A newline-terminated, tab-separated row, or ``None`` when the read's
      chromosome is not in the module-level ``txome`` or no reference
      transcript yields a truthy exon overlap.  Columns: id, read transcript
      name, reference gene name, reference transcript name,
      ``full``/``partial``, matched exon count, consecutive exon count, read
      exon count, reference exon count, overlap size, read length,
      reference length, read range, reference range, reference payload.

    Candidates are ranked by: full overlap, subset, consecutive exons,
    matched exons, then the smaller of overlap/read-length and
    overlap/reference-length -- each descending.  Ties keep the candidate
    seen first.
    """
    global txome
    (line, z, args) = inputs
    gpd = GPD(line)
    gpd.set_payload(z)
    read_range = gpd.get_range()
    if read_range.chr not in txome:
        return None
    possible = [x.get_payload() for x in txome[read_range.chr]
                if x.overlaps(read_range)]
    if not possible:
        return None

    candidates = []
    for tx in possible:
        if tx.get_exon_count() == 1 or gpd.get_exon_count() == 1:
            eo = gpd.exon_overlap(tx, single_minover=100, single_frac=0.5)
        else:
            eo = gpd.exon_overlap(tx, multi_minover=10, multi_endfrac=0,
                                  multi_midfrac=0.8, multi_consec=False)
        full = bool(eo.is_full_overlap())
        subset = bool(eo.is_subset())
        if not eo:
            continue
        econsec = eo.consecutive_exon_count()
        ecnt = eo.match_exon_count()
        osize = gpd.overlap_size(tx)
        candidates.append([full, subset, ecnt, econsec,
                           gpd.get_exon_count(), tx.get_exon_count(),
                           osize, gpd.get_length(), tx.get_length(), tx])
    if not candidates:
        return None

    # min() returns the first smallest element, i.e. the same candidate the
    # head of a stable sort would give, without sorting the whole list.
    best = min(candidates,
               key=lambda x: (-x[0], -x[1], -x[3], -x[2],
                              -min(float(x[6]) / float(x[7]),
                                   float(x[6]) / float(x[8]))))

    ### we have the annotation
    z = gpd.get_payload()
    match_type = 'full' if best[0] else 'partial'
    reference = best[9]
    columns = [
        str(z),
        gpd.get_transcript_name(),
        reference.get_gene_name(),
        reference.get_transcript_name(),
        match_type,
        str(best[2]),  # exon_count
        str(best[3]),  # most_consecutive_exons
        str(best[4]),  # read_exon_count
        str(best[5]),  # tx_exon_count
        str(best[6]),  # overlap_size
        str(best[7]),  # read_length
        str(best[8]),  # tx_length
        gpd.get_range().get_range_string(),
        reference.get_range().get_range_string(),
        str(reference.get_payload()),
    ]
    return "\t".join(columns) + "\n"
def annotate_line(inputs):
    """Pick the best overlapping reference transcript for one read.

    Args:
      inputs: tuple ``(line, z, args)`` -- a GenePred line, its id, and the
        options object (not used here).

    Returns:
      A newline-terminated, tab-separated row, or ``None`` when the read's
      chromosome is not in the module-level ``txome`` or no reference
      transcript yields a truthy exon overlap.  Columns: id, read transcript
      name, reference gene name, reference transcript name,
      ``full``/``partial``, matched exon count, consecutive exon count, read
      exon count, reference exon count, overlap size, read length,
      reference length, read range, reference range, reference payload.

    Candidates are ranked by: full overlap, subset, consecutive exons,
    matched exons, then the smaller of overlap/read-length and
    overlap/reference-length -- each descending.  Ties keep the candidate
    seen first.
    """
    global txome
    (line, z, args) = inputs
    gpd = GPD(line)
    gpd.set_payload(z)
    query = gpd.get_range()
    if query.chr not in txome:
        return None
    possible = [x.get_payload() for x in txome[query.chr]
                if x.overlaps(query)]
    if not possible:
        return None

    def _rank(cand):
        # Negated so the smallest key is the strongest candidate.
        reciprocal = min(float(cand[6]) / float(cand[7]),
                         float(cand[6]) / float(cand[8]))
        return (-cand[0], -cand[1], -cand[3], -cand[2], -reciprocal)

    candidates = []
    for tx in possible:
        single_exon = tx.get_exon_count() == 1 or gpd.get_exon_count() == 1
        if single_exon:
            eo = gpd.exon_overlap(tx, single_minover=100, single_frac=0.5)
        else:
            eo = gpd.exon_overlap(tx, multi_minover=10, multi_endfrac=0,
                                  multi_midfrac=0.8, multi_consec=False)
        full = bool(eo.is_full_overlap())
        subset = bool(eo.is_subset())
        if not eo:
            continue
        econsec = eo.consecutive_exon_count()
        ecnt = eo.match_exon_count()
        osize = gpd.overlap_size(tx)
        candidates.append([full, subset, ecnt, econsec,
                           gpd.get_exon_count(), tx.get_exon_count(),
                           osize, gpd.get_length(), tx.get_length(), tx])
    if not candidates:
        return None

    # The first minimum equals the head of a stable sort; no need to order
    # the whole list, and the chosen list is never empty or falsy.
    v = min(candidates, key=_rank)

    ### we have the annotation
    z = gpd.get_payload()
    overlap_kind = 'partial'
    if v[0]:
        overlap_kind = 'full'
    exon_count = v[2]
    most_consecutive_exons = v[3]
    read_exon_count = v[4]
    tx_exon_count = v[5]
    overlap_size = v[6]
    read_length = v[7]
    tx_length = v[8]
    return "\t".join([
        str(z),
        gpd.get_transcript_name(),
        v[9].get_gene_name(),
        v[9].get_transcript_name(),
        overlap_kind,
        str(exon_count),
        str(most_consecutive_exons),
        str(read_exon_count),
        str(tx_exon_count),
        str(overlap_size),
        str(read_length),
        str(tx_length),
        gpd.get_range().get_range_string(),
        v[9].get_range().get_range_string(),
        str(v[9].get_payload()),
    ]) + "\n"
def main():
    """Write exon coverage weighted by how often each transcript was annotated.

    Command-line arguments:
      input       -- annotation table (``-`` for STDIN, ``.gz`` allowed);
                     column 3 is the gene, column 4 the transcript and
                     column 5 the match type.
      genepred    -- genepred file supplying each transcript's exons.
      --min_exons -- skip transcripts with fewer exons than this.
      --full      -- only count rows whose match type is ``full``.
      -o          -- output path (gzip-opened when it ends in ``.gz``);
                     STDOUT when omitted.

    Each transcript's exon ranges (the ``range`` attribute of its exons) are
    repeated once per annotation row, pooled, passed to
    ``ranges_to_coverage`` and written as
    ``chr <tab> start-1 <tab> end <tab> payload``.

    Raises:
      KeyError: an annotated transcript is missing from the genepred file.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('genepred', help="the genepred used for this alignqc")
    parser.add_argument('--min_exons', type=int, default=1,
                        help="At least this number of exons")
    parser.add_argument('--full', action='store_true',
                        help="only use full matches")
    parser.add_argument('-o', '--output',
                        help="OUTPUT file or nothing for STDOUT")
    args = parser.parse_args()

    inf = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)

    # gene -> {transcript: number of annotation rows}
    genes = {}
    sys.stderr.write("Reading annotation file\n")
    try:
        for line in inf:
            if not line.strip():
                # Blank lines have no gene/transcript columns.
                continue
            f = line.rstrip().split("\t")
            gene = f[2]
            tx = f[3]
            match_type = f[4]
            if args.full and match_type != 'full':
                continue
            if gene not in genes:
                genes[gene] = {}
            genes[gene][tx] = genes[gene].get(tx, 0) + 1
    finally:
        inf.close()

    txs = {}
    sys.stderr.write("Reading genepred file\n")
    z = 0
    with open(args.genepred) as inf:
        for line in inf:
            z += 1
            if z % 1000 == 0:
                sys.stderr.write(str(z) + "      \r")
            gpd = GPD(line)
            txs[gpd.get_transcript_name()] = [ex.range for ex in gpd.exons]
    sys.stderr.write("\n")

    vals = []
    sys.stderr.write("Traversing annotation file\n")
    for gene in genes:
        for tx in genes[gene]:
            exons = txs[tx]
            if len(exons) < args.min_exons:
                continue
            # One copy of the exon list per annotation row.
            vals.extend(exons * genes[gene][tx])

    sys.stderr.write("Generating coverage file " + str(len(vals)) + "\n")
    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')
    try:
        for v in ranges_to_coverage(vals):
            of.write(v.chr + "\t" + str(v.start - 1) + "\t" + str(v.end) +
                     "\t" + str(v.get_payload()) + "\n")
    finally:
        of.close()