def querys(listx):
    """Run ``query`` on every element of *listx* and collect the results.

    One BAM handle (from ``args.bam``) and one genome handle (from
    ``args.genome``) are opened up front and shared by all calls; ``args``
    is not a parameter and is looked up outside this function.

    Returns a list holding one ``query(...)`` result per input element, in
    input order.
    """
    bam_handle = DBI.init(args.bam, "bam")
    genome_handle = DBI.init(args.genome, "genome")
    return [query(item, bam_handle, genome_handle) for item in listx]
def run(args):
    """Query a tabix database with every record of the input table.

    For each record parsed from ``args.input`` (format ``args.format``) a
    ``QR`` line is written to ``args.output``, followed by one numbered
    ``HT_<k>`` line (k starts at 1) per hit returned by the database.

    When ``args.type`` is a key of ``hclass``, the mapped class is passed to
    ``DBI.init`` as ``cls``; otherwise the database is opened without it.
    """
    out = IO.fopen(args.output, "w")
    # Membership test instead of dict.has_key(): has_key() was removed in
    # Python 3, while `in` behaves the same on Python 2 and 3.
    if args.type in hclass:
        dbi = DBI.init(args.db, "tabix", cls=hclass[args.type])
    else:
        dbi = DBI.init(args.db, "tabix")
    for record in TableIO.parse(IO.fopen(args.input, "r"), args.format):
        print("QR", record, file=out)
        for rank, hit in enumerate(dbi.query(record), 1):
            print("HT_{k}\t{ht}".format(k=rank, ht=hit), file=out)
def run(args):
    """Compare each input BED12 record against an indexed BED12 database.

    The database ``args.bed`` is opened through tabix when a ``.tbi`` file
    sits next to it, and through an in-memory bin index otherwise.  For each
    input record a ``QR`` line is written; each database hit is then written
    as ``HT`` (translated with ``_translate_to_meta``) when
    ``compatible_with_transcript`` accepts it, or as ``OP`` when it does not
    and ``args.hit`` is false.
    """
    backend = "tabix" if os.path.isfile(args.bed + ".tbi") else "binindex"
    dbi = DBI.init(args.bed, backend, cls=BED12)
    out = IO.fopen(args.output, "w")
    for transcript in TableIO.parse(IO.fopen(args.input, "r"), "bed12"):
        print("QR\t", transcript, file=out)
        for hit in dbi.query(transcript):
            if compatible_with_transcript(hit, transcript):
                print("HT\t{}".format(_translate_to_meta(transcript, hit)), file=out)
                continue
            if not args.hit:
                print("OP\t{}".format(hit), file=out)
        # NOTE(review): written once per query record as a separator; the
        # nesting level of this statement should be confirmed.
        print("", file=out)
def run(args):
    """Write the genome sequence of every input BED record, FASTA-like.

    Records are parsed from ``args.input`` as ``bed<N>`` where N is
    ``args.bed_column_number``.  For each one a ``>`` header line built from
    the record id and ``args.method`` is written to ``args.output``, then
    the sequence returned by the genome query, passed through
    ``seq_wrapper``.
    """
    table_format = "bed" + str(args.bed_column_number)
    genome = DBI.init(args.genome, "genome")
    out = IO.fopen(args.output, "w")
    regions = TableIO.parse(IO.fopen(args.input, "r"), table_format)
    for region in regions:
        title = region.id + "_" + args.method
        print(">", title, file=out)
        sequence = genome.query(region, method=args.method)
        print(seq_wrapper(sequence), file=out)
def run(args):
    """Query a bigWig file with every record of the input table.

    For each record parsed from ``args.input`` (format ``args.format``) the
    values yielded by the bigWig query (``method=args.method``) are gathered
    into a list, then a ``QR`` line with the record and an ``HT`` line with
    that list are written to ``args.output``.
    """
    bigwig = DBI.init(args.bw, "bigwig")
    out = IO.fopen(args.output, "w")
    for region in TableIO.parse(IO.fopen(args.input, "r"), args.format):
        # The query is fully consumed before anything is written.
        values = list(bigwig.query(region, method=args.method))
        print("QR", region, file=out)
        print("HT", values, file=out)
def run(args):
    """Query a BAM file with every input BED record.

    Records are parsed from ``args.input`` as ``bed<N>`` where N is
    ``args.bed_column_number``.  Each record is written to ``args.output``
    as a ``QR`` line, followed by one ``HT`` line per item yielded by the
    BAM query (``method=args.method``).
    """
    table_format = "bed" + str(args.bed_column_number)
    bam = DBI.init(args.bam, "bam")
    out = IO.fopen(args.output, "w")
    regions = TableIO.parse(IO.fopen(args.input, "r"), table_format)
    for region in regions:
        print("QR", region, file=out)
        for hit in bam.query(region, method=args.method):
            print("HT", hit, file=out)
        # NOTE(review): written once per query record as a separator; the
        # nesting level of this statement should be confirmed.
        print("", file=out)
def run(args):
    """Write the genome sequence of every input BED record, FASTA-like.

    Records are parsed from ``args.input`` as ``bed<N>`` where N is
    ``args.bed_column_number``.  Records whose queried sequence has length
    zero produce no output.  For the others a ``>`` header (record id, an
    underscore, ``args.method``) is written to ``args.output``, then the
    ``seq_wrapper`` output with no newline added by ``print``.
    """
    table_format = "bed" + str(args.bed_column_number)
    genome = DBI.init(args.genome, "genome")
    out = IO.fopen(args.output, "w")
    regions = TableIO.parse(IO.fopen(args.input, "r"), table_format)
    for region in regions:
        sequence = genome.query(region, method=args.method)
        if len(sequence) != 0:
            title = region.id + "_" + args.method
            print(">{}".format(title), file=out)
            print(seq_wrapper(sequence), file=out, end="")
def run(local_args):
    """Annotate every input interval with the gene features it overlaps.

    The gene table ``args.genetab`` is loaded as BED12 and split into six
    derived feature sets (upstream and downstream flanks, exons, introns,
    3' UTRs, 5' UTRs), each placed in its own bin index.  Every record of
    ``args.input`` is then written to ``args.output`` followed by seven
    columns: ``toIDs`` of the overlapping genes and of the overlapping
    members of each derived set.

    Side effects: rebinds the module globals ``args`` and ``out``, and
    replaces ``args.format`` when it was ``"guess"``.
    """
    global args, out
    args = local_args
    out = IO.fopen(args.output, "w")
    fin = IO.fopen(args.input, "r")

    # Provenance banner.
    print("# This data was generated by program ", sys.argv[0],
          " (version: %s)" % VERSION, file=out)
    print("# in bam2x ( https://github.com/nimezhu/bam2x )", file=out)
    print("# Date: ", time.asctime(), file=out)
    print("# The command line is :", file=out)
    print("#\t", " ".join(sys.argv), file=out)

    gene = DBI.init(args.genetab, "binindex", cls="bed12")
    up_flanks, down_flanks = [], []
    exon_pool, intron_pool = [], []
    utr3_pool, utr5_pool = [], []
    for transcript in gene:
        up_flanks.append(transcript.upstream(args.upstream))
        down_flanks.append(transcript.downstream(args.downstream))
        exon_pool.extend(transcript.Exons())
        intron_pool.extend(transcript.Introns())
        if transcript.utr3() is not None:
            utr3_pool.append(transcript.utr3())
        if transcript.utr5() is not None:
            utr5_pool.append(transcript.utr5())

    # Track order defines the order of the annotation columns.
    tracks = [gene]
    for features in (up_flanks, down_flanks, exon_pool, intron_pool,
                     utr3_pool, utr5_pool):
        tracks.append(DBI.init(features, "binindex", cls="bed6"))

    if args.format == "guess":
        args.format = IO.guess_format(args.input)

    bed6_columns = "#chr\tstart\tend\tname\tscore\tstrand"
    bed12_columns = "\tthick_start\tthick_end\titem_rgb\tblock_count\tblock_sizes\tblock_starts"
    annotation_columns = "\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5"

    for row_number, record in enumerate(TableIO.parse(fin, args.format)):
        if row_number == 0:
            # The header layout follows the type of the first record.
            if isinstance(record, Bed12):
                print(bed6_columns + bed12_columns + annotation_columns, file=out)
            else:
                print(bed6_columns + annotation_columns, file=out)
        print(record, file=out, end="")
        for track in tracks[:-1]:
            print("\t", toIDs(track.query(record)), file=out, end="")
        # The last column also terminates the line.
        print("\t", toIDs(tracks[-1].query(record)), file=out)
def run(args):
    """Report the BAM reads compatible with each input BED12 transcript.

    For each record of ``args.input`` a ``QR`` line is written to
    ``args.output``.  Every read returned by the BAM query (method
    ``"bam1"``, ``strand=args.strand``) is then written as ``HT``
    (translated with ``_translate_to_meta``) when
    ``compatible_with_transcript`` accepts it, or as ``OP`` when it does not
    and ``args.hit`` is false.
    """
    # Enable for verbose diagnostics:
    # logging.basicConfig(level=logging.DEBUG)
    bam = DBI.init(args.bam, "bam")
    out = IO.fopen(args.output, "w")
    for transcript in TableIO.parse(IO.fopen(args.input, "r"), "bed12"):
        print("QR\t", transcript, file=out)
        for read in bam.query(transcript, method="bam1", strand=args.strand):
            if compatible_with_transcript(read, transcript):
                print("HT\t{}".format(_translate_to_meta(transcript, read)), file=out)
                continue
            if not args.hit:
                print("OP\t{}".format(read), file=out)
        # NOTE(review): written once per query record as a separator; the
        # nesting level of this statement should be confirmed.
        print("", file=out)
def run(args_local):
    """Drive peak calling over every reference sequence of a BAM file.

    Steps, as performed below:
      1. open the BAM (``args.input``) and the output, write a banner;
      2. run ``process_chrom`` on every reference through a process pool;
      3. derive the mean coverage from the per-reference coverages and the
         summed reference lengths (doubled, for the two strands);
      4. raise the exon cutoff from 1 until
         ``prob.poisson_cdf(cutoff, coverage, False)`` drops below
         ``args.pvalue``;
      5. run ``call_peaks_star`` on every reference through the pool and
         hand the result to ``output``.

    Side effects: rebinds the module globals ``args``, ``out``,
    ``exon_cutoff``, ``intron_cutoff`` and ``hasGenome``.

    This block uses Python 2 syntax (``print >>``, ``long``,
    ``itertools.izip``).
    """
    global args, out, exon_cutoff, intron_cutoff, hasGenome
    args = args_local
    dbi = DBI.init(args.input, "bam")
    out = IO.fopen(args.output, "w")
    hasGenome = bool(args.genome)

    # Banner; the trailing comma keeps the first two prints on one line.
    print >>out, "# This positive_data was generated by program ", sys.argv[1], " (version: %s)" % VERSION,
    print >>out, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out, "# Date: ", time.asctime()
    print >>out, "# The command line is :"
    print >>out, "#\t", " ".join(sys.argv)

    chrs = list(dbi.bamfiles[0].references)
    lengths = list(dbi.bamfiles[0].lengths)

    p = mp.Pool(processes=args.num_cpus)
    per_chrom = p.map(process_chrom, chrs)
    # Each entry is indexed as [0] -> coverage, [1] -> bedgraph.
    coverages = [entry[0] for entry in per_chrom]
    bedgraphs = [entry[1] for entry in per_chrom]

    covered = sum(coverages, 0.0)
    total_length = sum(lengths, long(0)) * 2  # Double Strand
    coverage = covered / total_length * 1000.0

    threshold = 1
    while not (prob.poisson_cdf(threshold, coverage, False) < args.pvalue):
        threshold += 1
    exon_cutoff = threshold
    intron_cutoff = 2  # TODO revise it

    print >>out, "# MEAN COVERAGE:", coverage
    print >>out, "# EXON COVERAGE CUTOFF:", exon_cutoff

    jobs = itertools.izip(chrs, bedgraphs, itertools.repeat(exon_cutoff))
    peaks = p.map(call_peaks_star, jobs)
    output(chrs, peaks)
def process_chrom(chrom):
    """Build the strand-specific coverage segments of one reference.

    A private BAM handle is opened from ``args.input`` and closed before
    returning.  Every read queried for *chrom* contributes ON/OFF codes for
    its span and BLOCKON/BLOCKOFF codes for its exons to the exon array of
    its strand ("+" and "." count as positive, everything else as negative),
    and BLOCKON/BLOCKOFF codes for its introns to the matching intron array.

    Returns ``(coverage, segments)``: ``segments`` is a list of
    ``(field0, field1, field2, strand_code, group_code)`` tuples sorted on
    the first three fields, and ``coverage`` sums
    ``(field1 - field0) * field2 / 1000.0`` over the exon segments of both
    strands.
    """
    local_dbi = DBI.init(args.input, "bam")
    plus_exons = TuringSortingArray()
    minus_exons = TuringSortingArray()
    plus_introns = TuringSortingArray()
    minus_introns = TuringSortingArray()

    for read in local_dbi.query(chrom, method="bam1", strand=args.strand):
        if read.strand == "+" or read.strand == ".":
            exon_codes, intron_codes = plus_exons, plus_introns
        else:
            exon_codes, intron_codes = minus_exons, minus_introns
        exon_codes.append(TuringCode(read.start, cb.ON))
        exon_codes.append(TuringCode(read.stop, cb.OFF))
        for block in read.Exons():
            exon_codes.append(TuringCode(block.start, cb.BLOCKON))
            exon_codes.append(TuringCode(block.stop, cb.BLOCKOFF))
        for spliced in read.Introns():
            intron_codes.append(TuringCode(spliced.start, cb.BLOCKON))
            intron_codes.append(TuringCode(spliced.stop, cb.BLOCKOFF))

    segments = []
    coverage = 0.0
    cutoff = args.cutoff
    # Positive strand first: the final sort is stable, so append order
    # decides ties on the sort key.
    for strand_code, codes in ((POSITIVE_STRAND, plus_exons),
                               (NEGATIVE_STRAND, minus_exons)):
        for seg in codesToBedGraph(codes.iter()):
            if seg[SCORE_INDEX] >= cutoff:
                segments.append((seg[0], seg[1], seg[2], strand_code, EXON_GROUP_CODE))
            # NOTE(review): accumulated for every segment, not only those
            # passing the cutoff; confirm the intended nesting.
            coverage += float(seg[1] - seg[0]) * seg[2] / 1000.0

    INTRON_CUTOFF = 1.0
    for strand_code, codes in ((POSITIVE_STRAND, plus_introns),
                               (NEGATIVE_STRAND, minus_introns)):
        for seg in codesToBedGraph(codes.iter()):
            if seg[SCORE_INDEX] >= INTRON_CUTOFF:
                segments.append((seg[0], seg[1], seg[2], strand_code, INTRON_GROUP_CODE))

    segments.sort(key=itemgetter(0, 1, 2))  # TODO how to sort!
    local_dbi.close()
    return coverage, segments
def run(args):
    """Print per-base, fractionally weighted read coverage per transcript.

    The BED12 records of ``args.input`` are sorted and grouped with
    ``iter_cluster``.  Every transcript of a cluster is extended by
    ``args.bp`` on both sides (``up_down_coordinate``).  Each read queried
    from the cluster region (also extended by ``args.bp``) adds
    ``1.0 / NC / NM`` to every position it covers in each extended
    transcript it overlaps and is compatible with, where ``NC`` is the
    number of such transcripts and ``NM`` is ``getNM(read)``.  The extended
    transcript and its value list are printed to standard output.

    NOTE(review): ``args.output`` is opened but nothing visible here writes
    to it.  The source also ended with an unterminated triple-quoted string
    (apparently a truncated commented-out block), which is omitted.
    """
    logging.basicConfig(level=logging.DEBUG)
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")
    bam = DBI.init(args.bam, "bam")
    beds = [i for i in TableIO.parse(fin, "bed12")]
    beds.sort()
    bp = args.bp
    print("mapped:{}".format(bam.mapped))
    print("unmapped:{}".format(bam.unmapped))
    data = {}
    for i, x in enumerate(iter_cluster(beds)):
        print("{}\t{}:{}-{}".format(i + 1, x["chr"], x["start"] + 1, x["stop"]))
        coords = [up_down_coordinate(gene, bp, bp) for gene in x["beds"]]
        for y in coords:
            data[y.id] = {
                "coord": y,
                "values": [0.0 for _ in range(y.cdna_length())],
            }
        # Not read again in the visible part of this function; kept because
        # the tail of the function is not visible.
        coord_beds = [_translate(coord, bed) for coord, bed in itertools.izip(coords, x["beds"])]
        reads = bam.query(method="bam1", chr=x["chr"], start=x["start"] - bp,
                          stop=x["stop"] + bp, strand=args.strand)
        for read in reads:
            NM = getNM(read)  # number of hits
            # Indices of the transcripts this read overlaps and is
            # compatible with.
            c_coords = [k for k, coord in enumerate(coords)
                        if overlap(read, coord) and compatible(read, coord)]
            NC = len(c_coords)  # number of compatible
            for c in c_coords:
                coord = coords[c]
                # Clip into a separate name.  Rebinding `read` here made
                # every later transcript in this loop see a read already
                # truncated to an earlier transcript's bounds.
                clipped = read
                if read.start < coord.start or read.stop > coord.stop:
                    start = max(read.start, coord.start)
                    stop = min(read.stop, coord.stop)
                    clipped = read._slice(start, stop)
                read_in_coord = _translate(coord, clipped)
                for l in xrange(read_in_coord.start, read_in_coord.stop):
                    data[coord.id]["values"][l] += 1.0 / NC / NM
        for y in coords:
            print(data[y.id]["coord"])
            print(data[y.id]["values"])
def run(args):
    """Compute binned statistics over transcripts and flanks in parallel.

    The BED12 records of ``args.input`` are dealt round-robin into
    ``args.num_cpus`` lists.  Three pool passes are then run, each passed to
    ``output`` with its label: upstream flanks (``"UP"``), transcript bodies
    (``"TR"``) and downstream flanks (``"DN"``).  A header line is written
    to ``args.output`` before the first pass.

    Side effects: rebinds the module globals ``bam`` and ``out``.
    """
    global bam, out
    logging.basicConfig(level=logging.INFO)
    bam = DBI.init(args.bam, "bam")
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")
    # The pool is created only after `bam` and `out` have been bound.
    pool = mp.Pool(processes=args.num_cpus)

    chunks = [[] for _ in xrange(args.num_cpus)]
    for i0, bed in enumerate(TableIO.parse(fin, "bed12")):
        chunks[i0 % args.num_cpus].append(bed)
    # NOTE(review): `i0` is unbound when the input holds no record, so an
    # empty input raises NameError here.
    gene_num = i0 + 1

    print("bin_id\tmean\tentropy\treverse_strand_mean\treverse_strand_entropy", file=out)

    def flank_jobs(flag):
        # One (beds, bp, strand, flag) tuple per chunk.
        return itertools.izip(chunks,
                              itertools.repeat(args.bp),
                              itertools.repeat(args.strand),
                              itertools.repeat(flag))

    up_results = pool.map(count_flank_star, flank_jobs(True))
    output(up_results, args.bp, gene_num, "UP")

    body_jobs = itertools.izip(chunks,
                               itertools.repeat(args.bin_num),
                               itertools.repeat(args.strand))
    results = pool.map(count_list_star, body_jobs)
    output(results, args.bin_num, gene_num, "TR")

    down_results = pool.map(count_flank_star, flank_jobs(False))
    output(down_results, args.bp, gene_num, "DN")
def run(args):
    """Write an RPKM value for every BED12 transcript of the input.

    For each transcript, the reads returned by the BAM query that pass
    ``compatible_with_transcript`` are counted: each counts 1.0 when
    ``args.uniq`` is true, otherwise ``1.0 / nh`` where ``nh`` is the first
    of the three comma-separated fields of the read's ``itemRgb``.  The
    count is scaled by ``1e6 / mapped`` and ``1e3 / cdna_length`` and
    written after the transcript id under a ``Gene\\tRPKM`` header.
    """
    # Enable for verbose diagnostics:
    # logging.basicConfig(level=logging.DEBUG)
    bam = DBI.init(args.bam, "bam")
    mapped = bam.mapped
    out = IO.fopen(args.output, "w")
    print("Gene\tRPKM", file=out)
    for transcript in TableIO.parse(IO.fopen(args.input, "r"), "bed12"):
        print(transcript.id, "\t", end="", file=out)
        weighted_reads = 0.0
        length = transcript.cdna_length()
        reads = bam.query(transcript, method="bam1", strand=args.strand, uniq=args.uniq)
        for read in reads:
            if not compatible_with_transcript(read, transcript):
                continue
            if args.uniq:
                weighted_reads += 1.0
            else:
                # itemRgb must split into exactly three fields.
                nh, _, _ = read.itemRgb.split(",")
                weighted_reads += 1.0 / int(nh)
        rpkm = float(weighted_reads) * (1000000.0 / mapped) * (1000.0 / float(length))
        print(rpkm, file=out)
def run(args):
    """Aggregate read coverage around the TSS (or TTS) of every record.

    For each record of ``args.input`` the window from ``args.up`` bases
    before to ``args.down`` bases after the reference position
    (``bed.tts()`` when ``args.tts`` is set, else ``bed.tss()``) is queried
    in the BAM file, and the per-base coverage of the exons of the
    translated reads is counted.  For every window position the mean
    coverage over all records and the ``gini_coefficient`` of the
    per-record coverages are written to ``args.output``.

    A plot is then saved to ``args.output + ".png"`` on a best-effort
    basis: a failure there (for instance matplotlib missing) is logged as a
    warning and does not abort the run.
    """
    logging.basicConfig(level=logging.INFO)
    up = args.up
    down = args.down
    bp_num = up + down
    offset = -up
    bam = DBI.init(args.bam, "bam")
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")

    bin_sum = [0] * bp_num
    bin_e = [0.0] * bp_num
    bin_dis = [[] for _ in xrange(bp_num)]
    for i0, bed in enumerate(TableIO.parse(fin, args.format)):
        bed_bin = [0] * bp_num
        pos = bed.tts() if args.tts else bed.tss()
        pos_flank = get_flank_region(pos, up, down)
        for read in bam.query(pos_flank, "bam1", strand="read1"):
            a = translate_coordinates(pos, read)
            for e in a.Exons():
                # Shift into window indices and clamp to the window.
                start = max(e.start - offset, 0)
                end = min(e.stop - offset, bp_num)
                for j in xrange(start, end):
                    bed_bin[j] += 1
        for i in xrange(bp_num):
            bin_sum[i] += bed_bin[i]
            bin_dis[i].append(bed_bin[i])
    # NOTE(review): `i0` is unbound when the input holds no record, so an
    # empty input raises NameError here.
    bed_num = i0 + 1

    for i in xrange(bp_num):
        bin_e[i] = gini_coefficient(bin_dis[i])

    label = "tts" if args.tts else "tss"
    print("pos_to_{}\taggregation_mean\tgini_coefficient".format(label), file=out)
    for i in xrange(bp_num):
        print("{bin}\t{aggregation}\t{E}".format(bin=i + offset,
                                                 aggregation=float(bin_sum[i]) / bed_num,
                                                 E=bin_e[i]), file=out)

    try:
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        matplotlib.rcParams.update({'font.size': 9})
        ax1 = plt.subplot2grid((7, 1), (6, 0))
        plt.ylabel('gini coeffecient')
        plt.fill_between(range(-up, down), bin_e, color="r", alpha=0.2, y2=0)
        ax1.set_ylim(0, 1)
        ax1.set_xlim(-up, down)
        ax1.axes.get_xaxis().set_visible(False)
        plt.axvline(x=0, linewidth=1, color='y')
        ax2 = plt.subplot2grid((7, 1), (0, 0), rowspan=5)
        ax2.set_xlim(-up, down)
        plt.plot(range(-up, down), [float(i) / bed_num for i in bin_sum])
        plt.ylabel('mean coverage')
        plt.xlabel('pos to {} (bp)'.format(label))
        plt.axvline(x=0, linewidth=1, color='y')
        plt.grid(True)
        plt.savefig(args.output + ".png")
    except Exception as err:
        # Plotting stays optional, but the failure is now reported, and
        # KeyboardInterrupt/SystemExit are no longer swallowed as they were
        # by the former bare `except`.
        logging.warning("aggregation plot %s.png was not written: %s", args.output, err)
def call_peaks(chrom,bedgraph,exon_cutoff): #TODO
    '''
    Group the bedgraph entries of one reference into per-strand peaks.

    Entries are walked in the order given.  On each strand, exon-group
    entries scoring at least exon_cutoff are chained while they start less
    than `gap` bases after the furthest stop seen so far; a larger distance
    turns the current chain into a peak (bedsToPeak) and starts a new one.
    Intron-group entries join the current chain and can extend its stop.

    When the free variable hasGenome is true, each peak is also passed
    through filter_low_complexity, which can trim it or drop it.

    chrom        reference name, used for the genome sequence queries
    bedgraph     iterable of entries indexed with the *_INDEX constants
    exon_cutoff  minimum score for an exon-group entry to be used

    Returns the list of peaks, sorted.
    '''
    if hasGenome:
        genome=DBI.init(args.genome,"genome")
    def filter_low_complexity(peak):
        # Keep only the exons of `peak` whose sequence complexity exceeds
        # CPLX_CUTOFF.  Returns the rebuilt peak as a tuple, or None when no
        # exon is kept.
        new_exon_starts=[]
        new_exon_sizes=[]
        local_peak=list(peak)
        offset=local_peak[START_INDEX]
        for exon_start,exon_size in itertools.izip(local_peak[EXONSTARTS_INDEX],local_peak[EXONSIZES_INDEX]):
            # Exon starts are relative to the peak start.
            exon_s=offset+exon_start
            exon_e=offset+exon_start+exon_size
            seq=genome.query(BED3(chrom,exon_s,exon_e),method="seq")
            # Complexity is measured on at most the first 100 bases and
            # divided by the number of bases measured.
            # NOTE(review): an empty `seq` would divide by zero below.
            if len(seq) > 100:
                cplx=float(complexity(seq[0:100]))/100
            else:
                cplx=float(complexity(seq))/len(seq)
            if cplx > CPLX_CUTOFF:
                new_exon_starts.append(exon_start)
                new_exon_sizes.append(exon_size)
        if len(new_exon_sizes) > 0:
            local_peak[EXONSIZES_INDEX]=tuple(new_exon_sizes)
            # The peak now spans from the first to the last kept exon.
            local_peak[START_INDEX]=new_exon_starts[0]+offset
            local_peak[STOP_INDEX]=new_exon_starts[-1]+offset+new_exon_sizes[-1]
            # Re-base the exon starts on the new peak start.
            shift_start=new_exon_starts[0]
            for i0 in range(len(new_exon_starts)):
                new_exon_starts[i0]-=shift_start
            local_peak[EXONSTARTS_INDEX]=tuple(new_exon_starts)
            # NOTE(review): only start, stop, exon starts and exon sizes are
            # rewritten; confirm no other field of the peak depends on the
            # number of exons.
            return tuple(local_peak)
        else:
            return None
    gap=10  # largest distance bridged between chained exon entries
    pos_beds=[]
    neg_beds=[]
    peaks=[]
    i_p=0  # counters used to name the positive / negative strand peaks
    i_n=0
    last_pos_stop=0  # furthest stop seen in the current chain, per strand
    last_neg_stop=0
    for i in bedgraph:
        if i[STRAND_INDEX]==POSITIVE_STRAND:
            if i[GROUP_INDEX]==EXON_GROUP_CODE:
                if i[SCORE_INDEX] >= exon_cutoff:
                    if len(pos_beds)>0:
                        if i[START_INDEX]-last_pos_stop < gap or last_pos_stop==0:
                            # Close enough: extend the current chain.
                            pos_beds.append(i)
                            if last_pos_stop < i[STOP_INDEX]:
                                last_pos_stop=i[STOP_INDEX]
                        else:
                            # Too far: emit the chain and start a new one.
                            peak=bedsToPeak(pos_beds,"p_"+str(i_p))
                            if peak is not None:
                                if hasGenome:
                                    peak=filter_low_complexity(peak)
                                    if peak is not None:
                                        peaks.append(peak)
                                        i_p+=1
                                else:
                                    peaks.append(peak)
                                    i_p+=1
                            pos_beds=[i]
                            last_pos_stop=i[STOP_INDEX]
                    else:
                        last_pos_stop=i[STOP_INDEX]
                        pos_beds.append(i)
            elif i[GROUP_INDEX]==INTRON_GROUP_CODE:
                # NOTE(review): the append is taken to be unconditional
                # (not nested under the stop comparison); confirm.
                if last_pos_stop < i[STOP_INDEX]:
                    last_pos_stop=i[STOP_INDEX]
                pos_beds.append(i)
        else:
            # Every entry not on the positive strand is handled here, with
            # the same logic on the negative-strand state.
            if i[GROUP_INDEX]==EXON_GROUP_CODE:
                if i[SCORE_INDEX] >= exon_cutoff:
                    if len(neg_beds)>0:
                        if i[START_INDEX]-last_neg_stop < gap or last_neg_stop==0:
                            neg_beds.append(i)
                            if last_neg_stop < i[STOP_INDEX]:
                                last_neg_stop=i[STOP_INDEX]
                        else:
                            peak=bedsToPeak(neg_beds,"n_"+str(i_n))
                            if peak is not None:
                                if hasGenome:
                                    peak=filter_low_complexity(peak)
                                    if peak is not None:
                                        peaks.append(peak)
                                        i_n+=1
                                else:
                                    peaks.append(peak)
                                    i_n+=1
                            neg_beds=[i]
                            last_neg_stop=i[STOP_INDEX]
                    else:
                        neg_beds.append(i)
                        last_neg_stop=i[STOP_INDEX]
            elif i[GROUP_INDEX]==INTRON_GROUP_CODE:
                # NOTE(review): same nesting assumption as on the positive
                # strand.
                if last_neg_stop < i[STOP_INDEX]:
                    last_neg_stop=i[STOP_INDEX]
                neg_beds.append(i)
    # Emit whatever chain is still open on each strand.
    if len(pos_beds)>0:
        peak=bedsToPeak(pos_beds,"p_"+str(i_p))
        if peak is not None:
            if hasGenome:
                peak=filter_low_complexity(peak)
                if peak is not None:
                    peaks.append(peak)
            else:
                peaks.append(peak)
    if len(neg_beds)>0:
        peak=bedsToPeak(neg_beds,"n_"+str(i_n))
        if peak is not None:
            if hasGenome:
                peak=filter_low_complexity(peak)
                if peak is not None:
                    peaks.append(peak)
            else:
                peaks.append(peak)
    peaks.sort()
    return peaks