def run(args): bedformat="bed"+str(args.bed_column_number) dbi=DBI.init(args.genome,"genome") out=IO.fopen(args.output,"w") for i in TableIO.parse(IO.fopen(args.input,"r"),bedformat): print (">",i.id+"_"+args.method,file=out) print (seq_wrapper(dbi.query(i,method=args.method)),file=out)
def run(args):
    schema_template = schema_templates[args.input_format]
    SQL_template = insert_templates[args.input_format]
    db_filename = args.db
    out = IO.fopen(args.output, "w")
    if db_filename == "guess":
        # str.strip(".gz") strips characters, not the suffix; remove the
        # extension explicitly instead
        if args.input.endswith(".gz"):
            db_filename = args.input[:-3] + ".db"
        else:
            db_filename = args.input + ".db"
    db_is_new = not os.path.exists(db_filename)
    print("Database file : %s" % db_filename, file=out)
    with sqlite3.connect(db_filename) as conn:
        cursor = conn.cursor()
        S = schema_template.substitute({"table_name": args.table_name})
        if db_is_new:
            print('Creating table %s if not exists\n________________________________' % args.table_name, file=out)
            print(S, file=out)
            print("_______________________________", file=out)
        cursor.execute(S)
        S1 = SQL_template.substitute({"table_name": args.table_name})
        print(S1, file=out)
        s = TableIO.parse(IO.fopen(args.input, "r"), "simple")
        cursor.executemany(S1, s)
        conn.commit()
        print("loaded", file=out)
def Main():
    ''' IO TEMPLATE '''
    global args, out
    args = ParseArg()
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")
    # chromosome -> list of db reads; the original factory name `init` is
    # assumed to produce an empty list, since entries are appended below
    G2RFP = defaultdict(list)
    ''' END OF IO TEMPLATE '''
    m = [0.0 for i in xrange(200)]
    for b in TableIO.parse(IO.fopen(args.db, "r"), "bed6"):
        G2RFP[b.chr].append(b)
    total_reads = 0
    for i0, i in enumerate(TableIO.parse(fin, "bed6")):
        spectral = [0 for j in xrange(200)]
        for j in G2RFP[i.chr]:
            dis = i.start - j.start
            if dis >= -100 and dis < 100:
                spectral[dis + 100] += j.score
        spectral = norm(spectral)
        total_reads += i.score
        m = [a * i.score + b for a, b in itertools.izip(spectral, m)]
        if i0 % 100 == 0:
            print("{} processed\r".format(i0), file=sys.stderr)
    print("pos\tvalue", file=out)
    for i, x in enumerate(m):
        print("{}\t{}".format(i, float(x) / total_reads), file=out)
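# norm() is an assumed helper above; a minimal sketch under the assumption
# that it L1-normalizes the spectral vector into a per-query profile (the
# real implementation may differ):
def norm(v):
    s = float(sum(v))
    if s == 0.0:
        return v  # leave an all-zero profile unchanged to avoid dividing by zero
    return [x / s for x in v]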
def run(args):
    db_filename = args.db
    out = IO.fopen(args.output, "w")
    if os.path.exists(args.input):
        fin = IO.fopen(args.input, "r")
    else:
        # treat the argument itself as a single query name
        fin = (args.input,)
    if not os.path.exists(db_filename):
        print("can't find database %s" % db_filename, file=sys.stderr)
        exit(1)
    print("# Database file : %s" % db_filename, file=out)
    with sqlite3.connect(db_filename) as conn:
        conn.row_factory = factories[args.db_format]
        cursor = conn.cursor()
        for i in fin:
            i = i.strip()
            print("# query %s" % i, file=out)
            s = template.substitute({"table_name": args.table_name, "name": i})
            print("# " + s, file=out)
            cursor.execute(s)
            r = cursor.fetchone()
            print(r, file=out)
def run(args):
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")
    r = []
    m = 0
    ideograms = []
    qr = ""
    for i in TableIO.parse(fin, sep=","):
        if len(i) == 1:
            a = i[0].split("\t")
            if len(a) == 2:
                if args.query == "all" or args.query == qr:
                    if m > 0:
                        ideograms.append({"id": qr, "length": m})
                qr = a[1].strip()
        else:
            if args.query == "all" or args.query == qr:
                r.append({"chr": qr, "start": i[0], "length": i[2], "value": i[1]})
                m = int(i[0]) + int(i[2])
    if args.query == "all" or args.query == qr:
        if m > 0:
            ideograms.append({"id": qr, "length": m})
    j = {
        "ideograms": ideograms,
        "tracks": [
            {
                "name": args.input,
                "type": "bedgraph",
                "values": r
            }
        ]
    }
    print(json.dumps(j, indent=4), file=out)
def run(args):
    dbi = DBI.init(args.bw, "bigwig")
    out = IO.fopen(args.output, "w")
    for i in TableIO.parse(IO.fopen(args.input, "r"), args.format):
        ht = [j for j in dbi.query(i, method=args.method)]
        print("QR", i, file=out)
        print("HT", ht, file=out)
def run(args):
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")
    if args.format == "guess":
        args.format = IO.guess_format(args.input)
    # parse the handle we already opened instead of reopening by name
    s = TableIO.parse(fin, args.format)
    for i in s:
        print(i, file=out)
def run(local_args):
    ''' IO TEMPLATE '''
    global args, out
    args = local_args
    out = IO.fopen(args.output, "w")
    fin = IO.fopen(args.input, "r")
    print("# This data was generated by program ", sys.argv[0], " (version: %s)" % VERSION, file=out)
    print("# in bam2x ( https://github.com/nimezhu/bam2x )", file=out)
    print("# Date: ", time.asctime(), file=out)
    print("# The command line is :", file=out)
    print("#\t", " ".join(sys.argv), file=out)
    gene = DBI.init(args.genetab, "binindex", cls="bed12")
    upstream_list = []
    downstream_list = []
    exons_list = []
    introns_list = []
    utr3_list = []
    utr5_list = []
    for g in gene:
        upstream_list.append(g.upstream(args.upstream))
        downstream_list.append(g.downstream(args.downstream))
        for e in g.Exons():
            exons_list.append(e)
        for i in g.Introns():
            introns_list.append(i)
        if g.utr3() is not None:
            utr3_list.append(g.utr3())
        if g.utr5() is not None:
            utr5_list.append(g.utr5())
    upstream = DBI.init(upstream_list, "binindex", cls="bed6")
    downstream = DBI.init(downstream_list, "binindex", cls="bed6")
    exons = DBI.init(exons_list, "binindex", cls="bed6")
    introns = DBI.init(introns_list, "binindex", cls="bed6")
    utr3 = DBI.init(utr3_list, "binindex", cls="bed6")
    utr5 = DBI.init(utr5_list, "binindex", cls="bed6")
    if args.format == "guess":
        args.format = IO.guess_format(args.input)
    for (i0, i) in enumerate(TableIO.parse(fin, args.format)):
        if i0 == 0:
            if isinstance(i, Bed12):
                print("#chr\tstart\tend\tname\tscore\tstrand\tthick_start\tthick_end\titem_rgb\tblock_count\tblock_sizes\tblock_starts\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5", file=out)
            else:
                print("#chr\tstart\tend\tname\tscore\tstrand\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5", file=out)
        print(i, file=out, end="")
        print("\t", toIDs(gene.query(i)), file=out, end="")
        print("\t", toIDs(upstream.query(i)), file=out, end="")
        print("\t", toIDs(downstream.query(i)), file=out, end="")
        print("\t", toIDs(exons.query(i)), file=out, end="")
        print("\t", toIDs(introns.query(i)), file=out, end="")
        print("\t", toIDs(utr3.query(i)), file=out, end="")
        print("\t", toIDs(utr5.query(i)), file=out)
def run(args): bedformat="bed"+str(args.bed_column_number) dbi=DBI.init(args.bam,"bam") out=IO.fopen(args.output,"w") for i in TableIO.parse(IO.fopen(args.input,"r"),bedformat): print("QR",i,file=out) for j in dbi.query(i,method=args.method): print("HT",j,file=out) print("",file=out)
def run(args): bedformat = "bed" + str(args.bed_column_number) dbi = DBI.init(args.genome, "genome") out = IO.fopen(args.output, "w") for i in TableIO.parse(IO.fopen(args.input, "r"), bedformat): seq = dbi.query(i, method=args.method) if len(seq) == 0: continue print(">{}".format(i.id + "_" + args.method), file=out) print(seq_wrapper(seq), file=out, end="")
def smart_init(handle, **dict):
    ''' test version '''
    if isinstance(handle, str):
        fn, ext = os.path.splitext(handle)
        if ext == ".bam":
            dbi = FormatToDBI["bam"]
            return dbi(handle, **dict)
        elif ext == ".gz":
            # NOTE: the original used IO.guess() and concatenated an int here;
            # assume extension + column count naming as in the other branches
            fn1, ext1 = os.path.splitext(fn)
            if ext1 == ".bed":
                t = ext1 + str(IO.get_col_number(handle))
            else:
                t = ext1
            t = t[1:]
            if os.path.isfile(handle + ".tbi"):
                if t in TableIO.hclass:
                    return TabixI(handle, cls=t, **dict)
                return TabixI(handle, **dict)
            if t in TableIO.hclass:
                return BinIndexI(handle, cls=t, **dict)
            return BinIndexI(handle, **dict)
        elif ext == ".bed":
            col_num = IO.get_col_number(handle)
            t = ext[1:] + str(col_num)  # drop the dot so the key matches hclass
            if t in TableIO.hclass:
                return BinIndexI(handle, cls=t, **dict)
            # fall back to the nearest known BED class
            if col_num >= 12:
                return BinIndexI(handle, cls="bed12", **dict)
            elif col_num >= 6:
                return BinIndexI(handle, cls="bed6", **dict)
            return BinIndexI(handle, cls="bed3", **dict)
        else:
            # the original left t undefined in this branch; assume the bare
            # extension names the class, with a plain BinIndexI fallback
            t = ext[1:]
            if t in TableIO.hclass:
                return BinIndexI(handle, cls=t, **dict)
            return BinIndexI(handle, **dict)
    elif isinstance(handle, file):
        return smart_init(handle.name, **dict)
    else:
        return BinIndex(handle, **dict)
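# Hypothetical usage of smart_init(), assuming files named as the branches
# above expect:
#   smart_init("hits.bam")       # -> BAM-backed DBI
#   smart_init("anno.bed.gz")    # -> TabixI when anno.bed.gz.tbi exists, else BinIndexI
#   smart_init("anno.bed")       # -> BinIndexI with cls inferred from the column count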
def run(args):
    logging.basicConfig(level=logging.INFO)
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")
    for qr, hits, overlap in iterate(fin):
        l = qr.cdna_length()
        pileup = [0.0 for i in xrange(l)]
        for i in hits:
            for j in xrange(i.start, i.stop):
                pileup[j] += nh(i)
        print("QR\t{id}".format(id=qr.id), file=out)
        print("PILEUP\n{value}".format(value=rep(pileup)), file=out)
def run(args):
    # logging.basicConfig(level=logging.DEBUG)
    dbi = DBI.init(args.bam, "bam")
    out = IO.fopen(args.output, "w")
    for i in TableIO.parse(IO.fopen(args.input, "r"), "bed12"):
        print("QR\t", i, file=out)
        for j in dbi.query(i, method="bam1", strand=args.strand):
            if compatible_with_transcript(j, i):
                print("HT\t{}".format(_translate_to_meta(i, j)), file=out)
            elif not args.hit:
                print("OP\t{}".format(j), file=out)
        print("", file=out)
def run(args):
    out = IO.fopen(args.output, "w")
    if args.type in hclass:
        cls = hclass[args.type]
        dbi = DBI.init(args.db, "tabix", cls=cls)
    else:
        dbi = DBI.init(args.db, "tabix")
    for i in TableIO.parse(IO.fopen(args.input, "r"), args.format):
        print("QR", i, file=out)
        for j, ht in enumerate(dbi.query(i)):
            print("HT_{k}\t{ht}".format(k=j + 1, ht=ht), file=out)
def run(args):
    if os.path.isfile(args.bed + ".tbi"):
        dbi = DBI.init(args.bed, "tabix", cls=BED12)
    else:
        dbi = DBI.init(args.bed, "binindex", cls=BED12)
    out = IO.fopen(args.output, "w")
    for i in TableIO.parse(IO.fopen(args.input, "r"), "bed12"):
        print("QR\t", i, file=out)
        for j in dbi.query(i):
            if compatible_with_transcript(j, i):
                print("HT\t{}".format(_translate_to_meta(i, j)), file=out)
            elif not args.hit:
                print("OP\t{}".format(j), file=out)
        print("", file=out)
def run(args):
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")
    beds = [i for i in TableIO.parse(fin, "bed12")]
    beds.sort()
    for i, x in enumerate(iter_cluster(beds)):
        id = find_prefix_consensus([i0.id for i0 in x[1]])
        strand = find_consensus_strand([i0.strand for i0 in x[1]])
        print("REGION\tCL_{index}\t{chr}\t{start}\t{end}\t{id}\t{score}\t{strand}".format(strand=strand, score=len(x[1]), chr=x[1][0].chr, start=x[1][0].start, end=x[0], index=str(i + 1), id=id), file=out)
        for j, y in enumerate(greedy_iter_compatible_group(x[1])):
            print("\tGROUP{j}\t{bed}".format(j=j + 1, bed=merge_beds(y, id="CL.{i}_GP.{j}".format(i=i + 1, j=j + 1))), file=out)
            for k, z in enumerate(sorted(y, key=lambda x0: x0.cdna_length(), reverse=True)):
                print("\t\tCL.{i}_GP.{j}_TR.{k}\t{l}\t{z}".format(i=i + 1, j=j + 1, k=k + 1, l=z.cdna_length(), z=z), file=out)
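# find_prefix_consensus() and find_consensus_strand() are assumed helpers;
# minimal sketches (the fallback id "NONAME" is hypothetical):
import os.path

def find_prefix_consensus(ids):
    prefix = os.path.commonprefix(ids)
    return prefix if prefix else "NONAME"

def find_consensus_strand(strands):
    # majority vote over the cluster's strands
    return "+" if strands.count("+") >= strands.count("-") else "-"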
def run(args):
    fin = IO.fopen(args.input, "r")
    outfile = args.input
    if not args.sorted:
        l = [i for i in TableIO.parse(fin, args.format)]
        l.sort()
        name = splitext(args.input)
        outfile = "{name[0]}.sorted{name[1]}".format(name=name)
        out = IO.fopen(outfile, "w")
        for i in l:
            print(i, file=out)
        out.close()
    format = args.format.translate(None, digits)  # e.g. "bed12" -> "bed"
    tabix_index(outfile, preset=format)
def run(args):
    out = IO.fopen(args.output, "w")
    # parse once and dispatch per record; the original repeated the parse loop
    # in every branch and duplicated the "cds" branch verbatim
    for i in TableIO.parse(IO.fopen(args.input, "r"), "bed12"):
        if args.annotation == "exon":
            for j in i.Exons():
                print(j, file=out)
        elif args.annotation == "intron":
            for j in i.Introns():
                print(j, file=out)
        elif args.annotation == "cds":
            j = i.cds()
            if j is not None and j.cdna_length() > 0:
                print(j, file=out)
        elif args.annotation == "utr5":
            j = i.utr5()
            if j is not None and j.cdna_length() > 0:
                print(j, file=out)
        elif args.annotation == "utr3":
            j = i.utr3()
            if j is not None and j.cdna_length() > 0:
                print(j, file=out)
        elif args.annotation == "utr":
            for j in (i.utr5(), i.utr3()):
                if j is not None and j.cdna_length() > 0:
                    print(j, file=out)
        elif args.annotation == "upstream":
            print(i.upstream(args.bp), file=out)
        elif args.annotation == "downstream":
            print(i.downstream(args.bp), file=out)
def run(args):
    logging.basicConfig(level=logging.DEBUG)
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")
    bam = DBI.init(args.bam, "bam")
    beds = [i for i in TableIO.parse(fin, "bed12")]
    beds.sort()
    bp = args.bp
    print("mapped:{}".format(bam.mapped))
    print("unmapped:{}".format(bam.unmapped))
    data = {}
    for i, x in enumerate(iter_cluster(beds)):
        print("{}\t{}:{}-{}".format(i + 1, x["chr"], x["start"] + 1, x["stop"]))
        '''
        cds=[z.cds() for z in x["beds"] if z.cds()]
        utr3=[z.utr3() for z in x["beds"] if z.utr3()]
        utr5=[z.utr5() for z in x["beds"] if z.utr5()]
        '''
        coords = [up_down_coordinate(gene, args.bp, args.bp) for gene in x["beds"]]
        for j, y in enumerate(coords):
            data[y.id] = {}
            data[y.id]["coord"] = y
            data[y.id]["values"] = [0.0 for l in range(y.cdna_length())]
        coord_beds = [_translate(coord, bed) for coord, bed in itertools.izip(coords, x["beds"])]
        for j, read in enumerate(bam.query(method="bam1", chr=x["chr"], start=x["start"] - args.bp, stop=x["stop"] + args.bp, strand=args.strand)):
            NM = getNM(read)  # number of hits
            NC = 0  # number of compatible coordinates
            c_coords = []
            for k, coord in enumerate(coords):
                # don't consider the reads extending out of coords
                if overlap(read, coord) and compatible(read, coord):
                    NC += 1
                    c_coords.append(k)
            for k, c in enumerate(c_coords):
                coord = coords[c]
                clipped = read
                if read.start < coord.start or read.stop > coord.stop:
                    # clip to the coordinate so the translation stays in range;
                    # use a local copy so later coordinates still see the full read
                    start = max(read.start, coord.start)
                    stop = min(read.stop, coord.stop)
                    clipped = read._slice(start, stop)
                read_in_coord = _translate(coord, clipped)
                for l in xrange(read_in_coord.start, read_in_coord.stop):
                    data[coord.id]["values"][l] += 1.0 / NC / NM
        for j, y in enumerate(coords):
            print(data[y.id]["coord"])
            print(data[y.id]["values"])
def run(args):
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")
    if args.format == "guess":
        args.format = IO.guess_format(args.input)
    s = TableIO.parse(fin, args.format)
    l = []
    for i, x in enumerate(s):
        if i % 10000 == 0:  # was i/10000==0, which only logged the first 10000 entries
            logging.info("reading %s entries in %s", i, args.input)
        l.append(x)
    logging.info("begin sorting")
    l.sort()
    logging.info("sorting done")
    for i in l:
        print(i, file=out)
    logging.info("completed")
def _generate_db(filename, db_filename, table_name):
    with sqlite3.connect(db_filename) as conn:
        cursor = conn.cursor()
        S = schema_t.substitute({"table_name": table_name})
        cursor.execute(S)
        LOAD_S = insert_t.substitute({"table_name": table_name})
        s = TableIO.parse(IO.fopen(filename, "r"), "simple")
        cursor.executemany(LOAD_S, s)
        conn.commit()
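# Hypothetical usage, matching how the translator run() below falls back to
# building a database on the fly when given a plain text table:
#   _generate_db("genes.tab", "genes.tab.db", "genes")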
def run(args):
    logging.basicConfig(level=logging.INFO)
    global bam, out
    bam = DBI.init(args.bam, "bam")
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")
    p = mp.Pool(processes=args.num_cpus)
    beds_list = [[] for i in xrange(args.num_cpus)]
    gene_num = 0  # initialized so empty input does not raise NameError below
    for i0, bed in enumerate(TableIO.parse(fin, "bed12")):
        beds_list[i0 % args.num_cpus].append(bed)
        gene_num = i0 + 1
    print("bin_id\tmean\tentropy\treverse_strand_mean\treverse_strand_entropy", file=out)
    up_results = p.map(count_flank_star, itertools.izip(beds_list, itertools.repeat(args.bp), itertools.repeat(args.strand), itertools.repeat(True)))
    output(up_results, args.bp, gene_num, "UP")
    results = p.map(count_list_star, itertools.izip(beds_list, itertools.repeat(args.bin_num), itertools.repeat(args.strand)))
    output(results, args.bin_num, gene_num, "TR")
    down_results = p.map(count_flank_star, itertools.izip(beds_list, itertools.repeat(args.bp), itertools.repeat(args.strand), itertools.repeat(False)))
    output(down_results, args.bp, gene_num, "DN")
def run(local_args):
    logging.basicConfig(level=logging.WARNING)
    global args, out, dbi_bam, g, MIN_INTRON_LENGTH, MIN_SPLICING_SITES_SCORE, MIN_FPK_RATIO, query_num
    MIN_INTRON_LENGTH = 10
    MIN_SPLICING_SITES_SCORE = 2
    ''' IO TEMPLATE '''
    args = local_args
    MIN_FPK_RATIO = args.min_uniq_fpk_increase  # TO TEST
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")
    ''' END OF IO TEMPLATE '''
    print("# This data was generated by program ", sys.argv[0], " (version: %s)" % VERSION, end=" ", file=out)
    print("in bam2x ( https://github.com/nimezhu/bam2x )", file=out)
    print("# Date: ", time.asctime(), file=out)
    print("# The command line is :", file=out)
    print("#\t", " ".join(sys.argv), file=out)
    reader = TableIO.parse(fin, args.format)
    query_lists = [[] for i in range(args.num_cpus)]
    query_num = 0
    for i, x in enumerate(reader):
        query_lists[i % args.num_cpus].append(x)
        query_num = i + 1
    pool = Pool(processes=args.num_cpus)
    results = pool.map(querys, query_lists)
    output(results)
def run(args):
    logging.basicConfig(level=logging.DEBUG)
    db_filename = args.translator
    t_name, t_ext = splitext(args.translator)
    # generate the db file if the translator is not already one;
    # splitext keeps the dot, so compare against ".db" (the original
    # compared against "db" and always regenerated)
    if t_ext != ".db":
        possible_db = args.translator + ".db"
        if not os.path.exists(possible_db):
            _generate_db(args.translator, possible_db, args.table_name)
        db_filename = possible_db
    # query the db file
    out = IO.fopen(args.output, "w")
    with sqlite3.connect(db_filename) as conn:
        conn.row_factory = lambda conn, x: Bed12._make(Bed12._types(x[1:]))
        cursor = conn.cursor()
        for i in TableIO.parse(IO.fopen(args.input, "r"), "bed"):
            s = template.substitute({"table_name": args.table_name, "name": i.chr.strip()})
            logging.debug(s)
            cursor.execute(s)
            gene = cursor.fetchone()
            if gene is None:
                logging.warning("can't find gene %s" % i.chr)
                continue
            logging.debug(i)
            logging.debug(i.cdna_length())
            logging.debug(gene)
            logging.debug(gene.cdna_length())
            assert gene.cdna_length() > i.cdna_length()
            print(reverse_translate(gene, i), file=out)
def run(args_local):
    ''' IO TEMPLATE '''
    global args, out, exon_cutoff, intron_cutoff, hasGenome
    args = args_local
    dbi = DBI.init(args.input, "bam")
    out = IO.fopen(args.output, "w")
    hasGenome = bool(args.genome)
    ''' END OF IO TEMPLATE '''
    # header convention as in the other commands; the original used sys.argv[1]
    print("# This positive_data was generated by program ", sys.argv[0], " (version: %s)" % VERSION, end=" ", file=out)
    print("in bam2x ( https://github.com/nimezhu/bam2x )", file=out)
    print("# Date: ", time.asctime(), file=out)
    print("# The command line is :", file=out)
    print("#\t", " ".join(sys.argv), file=out)
    chrs = []
    lengths = []
    for i in dbi.bamfiles[0].references:
        chrs.append(i)
    for i in dbi.bamfiles[0].lengths:
        lengths.append(i)
    p = mp.Pool(processes=args.num_cpus)
    coverage_bedgraphs = p.map(process_chrom, chrs)
    bedgraphs = []
    coverages = []
    for i in range(len(chrs)):
        bedgraphs.append(coverage_bedgraphs[i][1])
        coverages.append(coverage_bedgraphs[i][0])
    s = 0.0
    l = long(0)
    for i in range(len(chrs)):
        s += coverages[i]
        l += lengths[i]
    l = l * 2  # double strand
    coverage = s / l * 1000.0
    threshold = 1
    while prob.poisson_cdf(threshold, coverage, False) >= args.pvalue:
        threshold += 1
    exon_cutoff = threshold
    intron_cutoff = 2  # TODO: revise this heuristic
    print("# MEAN COVERAGE:", coverage, file=out)
    print("# EXON COVERAGE CUTOFF:", exon_cutoff, file=out)
    peaks = p.map(call_peaks_star, itertools.izip(chrs, bedgraphs, itertools.repeat(exon_cutoff)))
    output(chrs, peaks)
def run(args):
    # logging.basicConfig(level=logging.DEBUG)
    dbi = DBI.init(args.bam, "bam")
    mapped = dbi.mapped
    out = IO.fopen(args.output, "w")
    print("Gene\tRPKM", file=out)
    for i in TableIO.parse(IO.fopen(args.input, "r"), "bed12"):
        print(i.id, "\t", end="", file=out)
        s = 0.0
        l = i.cdna_length()
        if args.uniq:
            for j in dbi.query(i, method="bam1", strand=args.strand, uniq=args.uniq):
                if compatible_with_transcript(j, i):
                    s += 1.0
        else:
            for j in dbi.query(i, method="bam1", strand=args.strand, uniq=args.uniq):
                if compatible_with_transcript(j, i):
                    (nh, _, _) = j.itemRgb.split(",")
                    s += 1.0 / int(nh)
        rpkm = float(s) * (1000000.0 / mapped) * (1000.0 / float(l))
        print(rpkm, file=out)
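# RPKM = reads * (1e6 / mapped) * (1e3 / cdna_length), as computed above.
# Worked example: 500 compatible reads, 20M mapped reads, a 2 kb transcript:
#   500 * (1e6 / 2e7) * (1e3 / 2000) = 500 * 0.05 * 0.5 = 12.5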
def parse_simple(handle, **dict):
    sep = dict.get("sep", "\t")
    if isinstance(handle, str):
        try:
            handle = IO.fopen(handle, "r")
            for i in csv.reader(handle, delimiter=sep):
                if len(i) == 0 or i[0].strip()[0:1] == "#":  # skip blank and comment lines
                    continue
                yield tuple(i)
            handle.close()
        except IOError as e:
            print("I/O error({0}): {1}".format(e.errno, e.strerror), file=sys.stderr)
    else:
        for i in csv.reader(handle, delimiter=sep):
            if len(i) == 0 or i[0].strip()[0:1] == "#":
                continue
            yield tuple(i)
def __init__(self, file, **dict):
    '''
    Wrapped in bam2x.DBI.init()
        BinIndex(file, cls=inherited_namedtuplecls)
    inherited_namedtuplecls should have _make and _types functions;
    or, assuming the entries in the container are already formatted:
        BinIndex(container)
    '''
    if isinstance(file, str):
        file = csv.reader(IO.fopen(file, "r"), delimiter="\t")
    if "cls" in dict:
        cls = dict["cls"]
        if isinstance(cls, str):
            if cls in hclass:
                cls = hclass[cls]
            else:
                print("UNKNOWN FORMAT %s IN BININDEX DATA STRUCT" % cls, file=sys.stderr)
        self.data = binindex(file, cls=cls)
    else:
        self.data = binindex(file)
def run(args):
    logging.basicConfig(level=logging.INFO)
    up = args.up
    down = args.down
    bp_num = up + down
    offset = -up
    bam = DBI.init(args.bam, "bam")
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")
    bin_sum = [0 for i in xrange(bp_num)]
    bin_e = [0.0 for i in xrange(bp_num)]
    bin_dis = [[] for i in xrange(bp_num)]
    bed_num = 0
    for i0, bed in enumerate(TableIO.parse(fin, args.format)):
        bed_bin = [0 for i in xrange(bp_num)]
        if args.tts:
            pos = bed.tts()
        else:
            pos = bed.tss()
        pos_flank = get_flank_region(pos, up, down)
        for read in bam.query(pos_flank, "bam1", strand="read1"):
            a = translate_coordinates(pos, read)
            for e in a.Exons():
                start = e.start - offset
                end = e.stop - offset
                if start < 0:
                    start = 0
                if end > bp_num:
                    end = bp_num
                for j in xrange(start, end):
                    bed_bin[j] += 1
        for i in xrange(bp_num):
            bin_sum[i] += bed_bin[i]
            bin_dis[i].append(bed_bin[i])
        bed_num = i0 + 1
    for i in xrange(bp_num):
        bin_e[i] = gini_coefficient(bin_dis[i])
    if args.tts:
        print("pos_to_tts\taggregation_mean\tgini_coefficient", file=out)
    else:
        print("pos_to_tss\taggregation_mean\tgini_coefficient", file=out)
    for i in xrange(bp_num):
        print("{bin}\t{aggregation}\t{E}".format(bin=i + offset, aggregation=float(bin_sum[i]) / bed_num, E=bin_e[i]), file=out)
    try:
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        matplotlib.rcParams.update({'font.size': 9})
        ax1 = plt.subplot2grid((7, 1), (6, 0))
        plt.ylabel('gini coefficient')
        plt.fill_between(range(-up, down), bin_e, color="r", alpha=0.2, y2=0)
        ax1.set_ylim(0, 1)
        ax1.set_xlim(-up, down)
        ax1.axes.get_xaxis().set_visible(False)
        plt.axvline(x=0, linewidth=1, color='y')
        ax2 = plt.subplot2grid((7, 1), (0, 0), rowspan=5)
        ax2.set_xlim(-up, down)
        plt.plot(range(-up, down), [float(i) / bed_num for i in bin_sum])
        plt.ylabel('mean coverage')
        if args.tts:
            plt.xlabel('pos to tts (bp)')
        else:
            plt.xlabel('pos to tss (bp)')
        plt.axvline(x=0, linewidth=1, color='y')
        plt.grid(True)
        plt.savefig(args.output + ".png")
    except:
        # plotting is optional; any failure here is deliberately ignored
        pass
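# gini_coefficient() is an assumed helper (used above and in the grouping
# run() below); a minimal sketch using the standard sorted-rank formula
# G = 2*sum(i*x_i)/(n*sum(x)) - (n+1)/n over sorted values:
def gini_coefficient(values):
    xs = sorted(values)
    n = len(xs)
    total = float(sum(xs))
    if n == 0 or total == 0.0:
        return 0.0
    weighted = sum((i + 1) * x for i, x in enumerate(xs))
    return 2.0 * weighted / (n * total) - (n + 1.0) / n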
def run(args):
    logging.basicConfig(level=logging.INFO)

    def process():
        if len(buff) == 1:
            return 0
        total_score = 0.0
        e = []
        for i in buff:
            total_score += i.score
            e.append(i.score)
        e = [i / total_score for i in e]
        gini = gini_coefficient(e)
        if total_score < args.min_reads_number:
            return 0
        record = {}
        meta = BED6(buff[0].chr, buff[0].start, buff[-1].stop, args.prefix + "." + str(group_id), total_score, buff[0].strand)
        peak = max(buff, key=lambda x: x.score)
        record["peak"] = peak._replace(score=peak.score / total_score)
        record["meta"] = meta._replace(strand=peak.strand)
        record["gini"] = gini
        records.append(record)
        return 1

    def simple_output():
        print("# formats: bayes_prob_model2, gini, [ region bed, score is total reads], [peak bed , score is proportion ]", file=out)
        for i, x in enumerate(records):
            print("{p2}\t{gini}\t{meta}\t".format(p2=p2[i], meta=x["meta"], gini=x["gini"]), end="", file=out)
            print(x["peak"], file=out)

    def bed12_output():
        print("# formats: bed12 , [R,G,B] are corresponding to [ TTS_GINI_PVALUE*200, TSS_GINI_PVALUE*200, PROPORTION_OF_PEAK*200 ]", file=out)
        for i, x in enumerate(records):
            if args.tts:
                g = 0
                r = int(p2[i] * 200)
            else:
                g = int(p2[i] * 200)
                r = 0
            b = int(x["gini"] * 200)
            if p2[i] > 0.5:
                meta = x["meta"]._replace(id=x["meta"].id + ".end")
            else:
                meta = x["meta"]
            rgb = "{r},{g},{b}".format(r=r, g=g, b=b)
            print("{bed6}\t{thickStart}\t{thickEnd}\t{itemRgb}\t{blockCount}\t{blockSizes}\t{blockStarts}".format(bed6=meta, thickStart=x["peak"].start, thickEnd=x["peak"].end, itemRgb=rgb, blockSizes=x["meta"].stop - x["meta"].start, blockCount=1, blockStarts=0), file=out)

    records = []
    GAP = args.gap
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")
    iterator = TableIO.parse(fin, "bed6")
    last = next(iterator)
    last_stop = last.stop
    group_id = 0
    buff = [last]
    last_chr = last.chr
    for x, i in enumerate(iterator):
        if x % 10000 == 0:
            logging.info("processing {x} reads".format(x=x))
        if i.chr != last_chr or i.start - last_stop > GAP:
            group_id += process()
            buff = [i]
            last_chr = i.chr
            last_stop = i.stop
        else:
            buff.append(i)
            if i.stop > last_stop:
                last_stop = i.stop
    process()
    gini = array([i["gini"] for i in records])
    model = fit_two_peaks_EM(gini)
    p2 = bayes_p2(gini, model)
    print("# Date: ", time.asctime(), file=out)
    print("# Program Version ", VERSION, file=out)
    print("# The command line is :", file=out)
    print("#\t", " ".join(sys.argv), file=out)
    print("# learning model:", file=out)
    print("#", model_str(model), file=out)
    # simple_output()
    bed12_output()
def run(args):
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")
    for i in TableIO.parse(fin, "bed12"):
        print(remove_small_introns(i, args.cutoff), file=out)