예제 #1
0
def run(args):
    """Write the genome sequence of every BED record as a FASTA-style entry.

    args.input is parsed as "bed<args.bed_column_number>"; each record is
    looked up in the genome index opened from args.genome with
    method=args.method, and a ">name" line followed by the wrapped sequence
    is written to args.output.
    """
    bedformat="bed"+str(args.bed_column_number)
    dbi=DBI.init(args.genome,"genome")
    out=IO.fopen(args.output,"w")
    for i in TableIO.parse(IO.fopen(args.input,"r"),bedformat):
        # sep="" keeps the record name attached to ">"; with the default
        # separator the header became "> name", i.e. an empty sequence id.
        print(">",i.id+"_"+args.method,sep="",file=out)
        print(seq_wrapper(dbi.query(i,method=args.method)),file=out)
예제 #2
0
def run(args):
    """Load a tab-separated table into a SQLite database.

    The schema and insert statements are taken from schema_templates and
    insert_templates for args.input_format and filled in with
    args.table_name.  When args.db is "guess" the database name is derived
    from args.input (a trailing ".gz" is dropped, then ".db" is appended).
    Progress messages are written to args.output.
    """
    schema_template=schema_templates[args.input_format]
    SQL_template=insert_templates[args.input_format]
    db_filename=args.db
    out=IO.fopen(args.output,"w")
    if db_filename=="guess":
        # str.strip(".gz") removes any of the characters ".", "g", "z" from
        # BOTH ends (e.g. "genes.tab.gz" -> "enes.tab"); drop the suffix only.
        base=args.input
        if base.endswith(".gz"):
            base=base[:-len(".gz")]
        db_filename=base+".db"
    db_is_new = not os.path.exists(db_filename)
    print("Database file : %s"%db_filename,file=out)
    names={"table_name":args.table_name}
    with sqlite3.connect(db_filename) as conn:
        cursor=conn.cursor()
        if db_is_new:
            print ('Creating table %s if not exists\n________________________________'%args.table_name,file=out)
        S=schema_template.substitute(names)
        if db_is_new:
            # the schema is only echoed for a freshly created database file
            print (S,file=out)
            print ("_______________________________",file=out)
        # executed in both cases, exactly as before
        cursor.execute(S)

        S1=SQL_template.substitute(names)
        print(S1,file=out)
        # TableIO.parse is given the file name, as in the original; the
        # previously opened (and never used) input handle is no longer created.
        s=TableIO.parse(args.input,"simple")
        cursor.executemany(S1,s)
        conn.commit()
        print("loaded",file=out)
예제 #3
0
def Main():
    '''
    Accumulate a 200-slot score profile around the start of every input record.

    Records from args.db (bed6) are grouped by chromosome.  For each bed6
    record in args.input, the scores of the grouped records whose start lies
    in [-100, 100) of the record's start are collected, passed through
    norm(), weighted by the record's score and added to the running profile.
    The profile divided by the summed input scores is written to args.output.

    Sets the module-level names args and out.
    '''
    global args,out
    args=ParseArg()
    fin=IO.fopen(args.input,"r")
    out=IO.fopen(args.output,"w")
    by_chr=defaultdict(init)
    profile=[0.0]*200
    for feature in TableIO.parse(IO.fopen(args.db,"r"),"bed6"):
        by_chr[feature.chr].append(feature)
    total_reads=0
    for count,query in enumerate(TableIO.parse(fin,"bed6")):
        window=[0]*200
        for feature in by_chr[query.chr]:
            dis=query.start-feature.start
            if -100 <= dis < 100:
                # slot 100 corresponds to distance 0
                window[dis+100]+=feature.score
        window=norm(window)
        total_reads+=query.score
        profile=[w*query.score+acc for w,acc in itertools.izip(window,profile)]
        if count%100==0:
            print("{} processed\r".format(count),file=sys.stderr)
    print("pos\tvalue",file=out)
    for pos,value in enumerate(profile):
        print("{}\t{}".format(pos,float(value)/total_reads),file=out)
예제 #4
0
파일: query_db.py 프로젝트: HaoKuo/bam2x
def run(args):
    """Look up names in a SQLite table and print the first matching row.

    args.input is either a file with one name per line or, when no such
    file exists, a single name.  For every name the module-level SQL
    template is filled in with args.table_name and the name, executed
    against args.db, and the first row (built by factories[args.db_format])
    is written to args.output.  Exits with status 1 when args.db is missing.
    """
    db_filename=args.db
    out=IO.fopen(args.output,"w")
    if os.path.exists(args.input):
        fin=IO.fopen(args.input,"r")
    else:
        # not a file: treat the argument itself as the one query
        fin=(args.input,)
    if not os.path.exists(db_filename):
        print("can't find database %s"%db_filename,file=sys.stderr)
        # sys.exit instead of the site-provided exit(), which is meant for
        # the interactive interpreter and may be absent
        sys.exit(1)
    print("# Database file : %s"%db_filename,file=out)
    with sqlite3.connect(db_filename) as conn:
        conn.row_factory=factories[args.db_format]
        cursor=conn.cursor()
        for i in fin:
            # strip() already removes spaces; the second strip(" ") was redundant
            i=i.strip()
            print("# query %s"%i,file=out)
            # NOTE(review): the statement is built by string substitution,
            # so a crafted name can inject SQL; the template is defined
            # outside this function and would need "?" placeholders.
            s=template.substitute({"table_name":args.table_name,"name":i})
            print("# "+s,file=out)
            cursor.execute(s)
            # prints None when nothing matched; errors propagate unchanged
            print(cursor.fetchone(),file=out)
예제 #5
0
def run(args):
    """Convert a sectioned comma-separated table into an ideogram/track JSON.

    Rows with a single field that splits into two tab-separated parts start
    a new section named by the second part.  Every other row contributes a
    value {"chr", "start", "length", "value"} taken from fields 0, 2 and 1.
    Only sections matching args.query (or all of them for "all") are kept.
    The length reported for a section is start+length of its last kept row.
    """
    fin=IO.fopen(args.input,"r")
    out=IO.fopen(args.output,"w")
    r=[]
    m=0
    ideograms=[]
    qr=""

    for i in TableIO.parse(fin,sep=","):
        if len(i)==1:
            a=i[0].split("\t")
            if len(a)==2:
                # close the section that just ended
                if (args.query=="all" or args.query==qr) and m > 0:
                    ideograms.append({"id":qr,"length":m})
                qr=a[1].strip()
                # reset, otherwise a section without data rows would be
                # reported with the previous section's length
                m=0
        elif args.query=="all" or args.query==qr:
            r.append({"chr":qr,"start":i[0],"length":i[2],"value":i[1]})
            m=int(i[0])+int(i[2])
    # close the last section
    if (args.query=="all" or args.query==qr) and m > 0:
        ideograms.append({"id":qr,"length":m})
    j={
        "ideograms":ideograms,
        "tracks":
        [
         {
             "name":args.input,
             "type":"bedgraph",
             "values":r
         }
        ]
    }
    print(json.dumps(j,indent=4),file=out)
예제 #6
0
파일: query_bw.py 프로젝트: HaoKuo/bam2x
def run(args):
    """Query a bigwig index with every input record.

    For each record parsed from args.input (format args.format) a "QR" line
    with the record and an "HT" line with the list of results returned by
    the index opened from args.bw are written to args.output.
    """
    bigwig=DBI.init(args.bw,"bigwig")
    out=IO.fopen(args.output,"w")
    records=TableIO.parse(IO.fopen(args.input,"r"),args.format)
    for record in records:
        hits=list(bigwig.query(record,method=args.method))
        print("QR",record,file=out)
        print("HT",hits,file=out)
예제 #7
0
파일: read.py 프로젝트: HaoKuo/bam2x
def run(args):
    """Parse args.input and print every record to args.output.

    When args.format is "guess" it is replaced by IO.guess_format(args.input).
    """
    # The input is handed to TableIO.parse by name; the extra handle that
    # used to be opened here was never read or closed.
    out=IO.fopen(args.output,"w")
    if args.format=="guess":
        args.format=IO.guess_format(args.input)
    for i in TableIO.parse(args.input,args.format):
        print(i,file=out)
예제 #8
0
파일: cmpgene.py 프로젝트: HaoKuo/bam2x
def run(local_args):
    '''
    Annotate every input record with the genes and gene parts it overlaps.

    Bin indexes are built for the bed12 genes in args.genetab and for their
    upstream/downstream flanks, exons, introns and UTRs.  Each record of
    args.input is written to args.output followed by one tab-separated
    column per index holding toIDs() of the query result.

    local_args is stored in the module-level name args and the output
    handle in the module-level name out.
    '''
    global args,out
    args=local_args
    out=IO.fopen(args.output,"w")
    fin=IO.fopen(args.input,"r")
    print("# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION,file=out)
    print("# in bam2x ( https://github.com/nimezhu/bam2x )",file=out)
    print("# Date: ",time.asctime(),file=out)
    print("# The command line is :",file=out)
    print("#\t"," ".join(sys.argv),file=out)
    gene=DBI.init(args.genetab,"binindex",cls="bed12")
    upstream_list=[]
    downstream_list=[]
    exons_list=[]
    introns_list=[]
    utr3_list=[]
    utr5_list=[]
    for g in gene:
        upstream_list.append(g.upstream(args.upstream))
        downstream_list.append(g.downstream(args.downstream))
        exons_list.extend(g.Exons())
        introns_list.extend(g.Introns())
        # each UTR is computed once per gene (it used to be computed twice)
        utr=g.utr3()
        if utr is not None:
            utr3_list.append(utr)
        utr=g.utr5()
        if utr is not None:
            utr5_list.append(utr)
    # column order: gene, upstream, downstream, exon, intron, utr3, utr5
    indexes=[gene]
    for parts in (upstream_list,downstream_list,exons_list,introns_list,utr3_list,utr5_list):
        indexes.append(DBI.init(parts,"binindex",cls="bed6"))

    if args.format=="guess":
        args.format=IO.guess_format(args.input)
    for (i0,i) in enumerate(TableIO.parse(fin,args.format)):
        if i0==0:
            # the header depends on the type of the first record
            if isinstance(i,Bed12):
                print("#chr\tstart\tend\tname\tscore\tstrand\tthick_start\tthick_end\titem_rgb\tblock_count\tblock_sizes\tblock_starts\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5",file=out)
            else:
                print("#chr\tstart\tend\tname\tscore\tstrand\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5",file=out)
        # Join with a bare tab: print("\t", x) inserted a space after every
        # tab, so the data columns did not line up with the header.
        fields=[str(i)]
        for index in indexes:
            fields.append(str(toIDs(index.query(i))))
        print("\t".join(fields),file=out)
예제 #9
0
파일: query_bam.py 프로젝트: HaoKuo/bam2x
def run(args):
    """Query a BAM index with every BED record of args.input.

    Writes a "QR" line per record, one "HT" line per result returned by the
    index opened from args.bam (method=args.method), then an empty line.
    """
    bam_index=DBI.init(args.bam,"bam")
    out=IO.fopen(args.output,"w")
    bedformat="bed{}".format(args.bed_column_number)
    for record in TableIO.parse(IO.fopen(args.input,"r"),bedformat):
        print("QR",record,file=out)
        hits=bam_index.query(record,method=args.method)
        for hit in hits:
            print("HT",hit,file=out)
        print("",file=out)
예제 #10
0
def run(args):
    """Write the genome sequence of every BED record, skipping empty results.

    Records of args.input ("bed<args.bed_column_number>") are looked up in
    the genome index opened from args.genome; results of length zero are
    dropped, the rest are written as a ">name" line plus the wrapped
    sequence (printed without an extra newline).
    """
    genome = DBI.init(args.genome, "genome")
    out = IO.fopen(args.output, "w")
    bedformat = "bed{}".format(args.bed_column_number)
    records = TableIO.parse(IO.fopen(args.input, "r"), bedformat)
    for record in records:
        seq = genome.query(record, method=args.method)
        if len(seq) != 0:
            name = record.id + "_" + args.method
            print(">{}".format(name), file=out)
            print(seq_wrapper(seq), file=out, end="")
예제 #11
0
파일: __init__.py 프로젝트: HaoKuo/bam2x
def smart_init(handle,**dict):
    '''
    Pick an index class for handle from its file name (test version).

    handle may be
      * a file name: ".bam" uses FormatToDBI["bam"]; "<name>.<fmt>.gz" uses
        TabixI when "<handle>.tbi" exists and BinIndexI otherwise; ".bed"
        and any other extension use BinIndexI;
      * an open file object: dispatched again on handle.name;
      * anything else: passed to BinIndexI unchanged.
    The record class name is "<fmt>" from the extension ("bed<columns>" for
    BED files) and is passed as cls only when TableIO.hclass knows it.
    Extra keyword arguments are forwarded to the index constructor.
    '''
    if isinstance(handle,str):
        fn,ext=os.path.splitext(handle)
        if ext==".bam":
            dbi=FormatToDBI["bam"]
            return dbi(handle,**dict)
        if ext==".gz":
            # the table format is the extension underneath ".gz"
            ext1=os.path.splitext(fn)[1]
            if ext1==".bed":
                # same column probe for both backends; the non-tabix branch
                # used IO.guess and concatenated its result without str()
                t="bed"+str(IO.get_col_number(handle))
            else:
                t=ext1[1:]
            if os.path.isfile(handle+".tbi"):
                index_cls=TabixI
            else:
                index_cls=BinIndexI
            if t in TableIO.hclass:
                return index_cls(handle,cls=t,**dict)
            return index_cls(handle,**dict)
        if ext==".bed":
            col_num=IO.get_col_number(handle)
            # "bed12", not ".bed12": with the leading dot the lookup below
            # could never succeed
            t="bed"+str(col_num)
            if t in TableIO.hclass:
                return BinIndexI(handle,cls=t,**dict)
            # unknown column count: fall back to the widest known layout
            if col_num >=12 :
                return BinIndexI(handle,cls="bed12",**dict)
            if col_num>=6:
                return BinIndexI(handle,cls="bed6",**dict)
            return BinIndexI(handle,cls="bed3",**dict)
        # any other extension; t used to be undefined here (NameError)
        t=ext[1:]
        if t in TableIO.hclass:
            return BinIndexI(handle,cls=t,**dict)
        return BinIndexI(handle,**dict)
    elif isinstance(handle,file):
        # "file" is the Python 2 built-in file type
        return smart_init(handle.name,**dict)
    else:
        # NOTE(review): the original called "BinIndex" here while every
        # other branch uses BinIndexI; assumed to be a typo - confirm.
        return BinIndexI(handle,**dict)
예제 #12
0
def run(args):
    """Write a per-position pileup for every query yielded by iterate().

    For each (query, hits, overlap) triple the pileup has
    query.cdna_length() slots; every hit adds nh(hit) to the slots
    hit.start .. hit.stop-1.  A "QR" line and a "PILEUP" block are written
    to args.output.
    """
    logging.basicConfig(level=logging.INFO)
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")
    for qr, hits, overlap in iterate(fin):
        coverage = [0.0] * qr.cdna_length()
        for hit in hits:
            for pos in xrange(hit.start, hit.stop):
                coverage[pos] += nh(hit)
        print("QR\t{id}".format(id=qr.id), file=out)
        print("PILEUP\n{value}".format(value=rep(coverage)), file=out)
예제 #13
0
def run(args):
    """Report BAM reads for every bed12 transcript of args.input.

    Reads compatible with the transcript are written as "HT" lines in
    transcript coordinates; the others are written as "OP" lines unless
    args.hit is set.  Each transcript starts with a "QR" line and ends with
    an empty line.
    """
    bam_index = DBI.init(args.bam, "bam")
    out = IO.fopen(args.output, "w")
    transcripts = TableIO.parse(IO.fopen(args.input, "r"), "bed12")
    for transcript in transcripts:
        print("QR\t", transcript, file=out)
        for read in bam_index.query(transcript, method="bam1", strand=args.strand):
            if compatible_with_transcript(read, transcript):
                line = "HT\t{}".format(_translate_to_meta(transcript, read))
            elif args.hit:
                # only compatible reads are wanted
                continue
            else:
                line = "OP\t{}".format(read)
            print(line, file=out)
        print("", file=out)
예제 #14
0
파일: query_tabix.py 프로젝트: HaoKuo/bam2x
def run(args):
    """Query a tabix index with every record of args.input.

    The index is opened from args.db; when args.type is a key of hclass the
    matching class is passed as cls.  Each record produces a "QR" line and
    numbered "HT_<n>" lines (starting at 1) in args.output.
    """
    out=IO.fopen(args.output,"w")
    if args.type in hclass:
        dbi=DBI.init(args.db,"tabix",cls=hclass[args.type])
    else:
        dbi=DBI.init(args.db,"tabix")
    records=TableIO.parse(IO.fopen(args.input,"r"),args.format)
    for record in records:
        print("QR",record,file=out)
        for rank,ht in enumerate(dbi.query(record),1):
            print("HT_{k}\t{ht}".format(k=rank,ht=ht),file=out)
예제 #15
0
def run(args):
    """Report indexed BED12 entries for every bed12 transcript of args.input.

    args.bed is opened as a tabix index when "<args.bed>.tbi" exists and as
    a bin index otherwise.  Entries compatible with the transcript are
    written as "HT" lines in transcript coordinates; the others as "OP"
    lines unless args.hit is set.
    """
    backend = "tabix" if os.path.isfile(args.bed + ".tbi") else "binindex"
    dbi = DBI.init(args.bed, backend, cls=BED12)
    out = IO.fopen(args.output, "w")
    transcripts = TableIO.parse(IO.fopen(args.input, "r"), "bed12")
    for transcript in transcripts:
        print("QR\t", transcript, file=out)
        for entry in dbi.query(transcript):
            if compatible_with_transcript(entry, transcript):
                line = "HT\t{}".format(_translate_to_meta(transcript, entry))
            elif args.hit:
                # only compatible entries are wanted
                continue
            else:
                line = "OP\t{}".format(entry)
            print(line, file=out)
        print("", file=out)
예제 #16
0
파일: group_bed.py 프로젝트: HaoKuo/bam2x
def run(args):
    """Cluster sorted bed12 records and print clusters, groups and members.

    Each item from iter_cluster() is read as (end, members).  A "REGION"
    line is written per cluster, a "GROUP" line with the merged record per
    group from greedy_iter_compatible_group(), and one line per member
    ordered by decreasing cdna_length().
    """
    fin=IO.fopen(args.input,"r")
    out=IO.fopen(args.output,"w")
    beds=sorted(TableIO.parse(fin,"bed12"))
    for cluster_no,cluster in enumerate(iter_cluster(beds),1):
        members=cluster[1]
        consensus_id=find_prefix_consensus([bed.id for bed in members])
        consensus_strand=find_consensus_strand([bed.strand for bed in members])
        region=members[0]
        print("REGION\tCL_{index}\t{chr}\t{start}\t{end}\t{id}\t{score}\t{strand}".format(
            index=str(cluster_no),chr=region.chr,start=region.start,end=cluster[0],
            id=consensus_id,score=len(members),strand=consensus_strand),file=out)

        for group_no,group in enumerate(greedy_iter_compatible_group(members),1):
            merged_id="CL.{i}_GP.{j}".format(i=cluster_no,j=group_no)
            print("\tGROUP{j}\t{bed}".format(j=group_no,bed=merge_beds(group,id=merged_id)),file=out)
            by_length=sorted(group,key=lambda bed:bed.cdna_length(),reverse=True)
            for rank,bed in enumerate(by_length,1):
                print("\t\tCL.{i}_GP.{j}_TR.{k}\t{l}\t{z}".format(
                    i=cluster_no,j=group_no,k=rank,l=bed.cdna_length(),z=bed),file=out)
예제 #17
0
파일: tabix.py 프로젝트: HaoKuo/bam2x
def run(args):
    """Build a tabix index for args.input, sorting it first if needed.

    Unless args.sorted is set, the records are parsed with args.format,
    sorted and written to "<name>.sorted<ext>", and that file is indexed
    instead.  The tabix preset is args.format with its digits removed
    (e.g. "bed12" -> "bed").
    """
    outfile=args.input
    if not args.sorted:
        # the input is only opened when it actually has to be read
        fin=IO.fopen(args.input,"r")
        try:
            l=sorted(TableIO.parse(fin,args.format))
        finally:
            fin.close()
        name=splitext(args.input)
        outfile = "{name[0]}.sorted{name[1]}".format(name=name)
        out = IO.fopen(outfile,"w")
        try:
            for i in l:
                print(i,file=out)
        finally:
            # the file must be complete on disk before it is indexed
            out.close()
    # str.translate(None, digits) exists only on Python 2 byte strings
    preset="".join(c for c in args.format if c not in digits)
    tabix_index(outfile,preset=preset)
예제 #18
0
파일: getanno.py 프로젝트: HaoKuo/bam2x
def run(args):
    """Print the requested annotation parts of every bed12 record.

    args.annotation selects what is written to args.output:
      exon / intron          every item of Exons() / Introns()
      cds / utr5 / utr3      the part, if it exists and has cdna_length() > 0
      utr                    utr5 then utr3, under the same condition
      upstream / downstream  the flank of args.bp, unconditionally
    Any other value writes nothing and leaves args.input unopened.
    """
    out=IO.fopen(args.output,"w")

    def non_empty(parts):
        # keep parts that exist and cover at least one base
        return [p for p in parts if p is not None and p.cdna_length() > 0]

    # one extractor per annotation; the original if/elif chain also had a
    # second, unreachable "cds" branch
    extractors={
        "exon":lambda bed:bed.Exons(),
        "intron":lambda bed:bed.Introns(),
        "cds":lambda bed:non_empty([bed.cds()]),
        "utr5":lambda bed:non_empty([bed.utr5()]),
        "utr3":lambda bed:non_empty([bed.utr3()]),
        "utr":lambda bed:non_empty([bed.utr5(),bed.utr3()]),
        "upstream":lambda bed:[bed.upstream(args.bp)],
        "downstream":lambda bed:[bed.downstream(args.bp)],
    }
    extract=extractors.get(args.annotation)
    if extract is None:
        return
    for i in TableIO.parse(IO.fopen(args.input,"r"),"bed12"):
        for j in extract(i):
            print(j,file=out)
예제 #19
0
파일: bam2dis.py 프로젝트: HaoKuo/bam2x
def run(args):
    """Print per-base read coverage for every gene, extended by args.bp flanks.

    The bed12 records of args.input are sorted and clustered.  For every
    cluster the reads of args.bam in the cluster region (widened by args.bp)
    are fetched; each read adds 1/NC/NM to the positions it covers in every
    coordinate system it overlaps and is compatible with, where NC is the
    number of such coordinate systems and NM is getNM(read).  Coordinates
    and values are printed to standard output.
    """
    logging.basicConfig(level=logging.DEBUG) 
    fin=IO.fopen(args.input,"r")
    out=IO.fopen(args.output,"w")
    bam=DBI.init(args.bam,"bam");
    beds=[i for i in TableIO.parse(fin,"bed12")]
    beds.sort()
    bp=args.bp
    print("mapped:{}".format(bam.mapped))
    print("unmapped:{}".format(bam.unmapped))
    data={}
    for i,x in enumerate(iter_cluster(beds)):
        print("{}\t{}:{}-{}".format(i+1,x["chr"],x["start"]+1,x["stop"]))
        
        coords = [ up_down_coordinate(gene,args.bp,args.bp) for gene in x["beds"] ]
        for j,y in enumerate(coords):
            data[y.id]={}
            data[y.id]["coord"]=y
            data[y.id]["values"]=[0.0 for l in range(y.cdna_length())];
        coord_beds = [ _translate(coord,bed) for coord,bed in itertools.izip(coords,x["beds"])]
        for j,read in enumerate(bam.query(method="bam1",chr=x["chr"],start=x["start"]-args.bp,stop=x["stop"]+args.bp,strand=args.strand)):
            NM=getNM(read)  # number of hits
            NC=0            # number of compatible 
            c_coords=[]
            for k,coord in enumerate(coords):
                if overlap(read,coord) and compatible(read,coord): # don't consider the reads extend out of coords.
                    NC+=1
                    c_coords.append(k)
            for k,c in enumerate(c_coords):
                coord=coords[c]
                # Clip into a separate name: assigning the slice back to
                # "read" truncated the read for every later coordinate
                # system in this loop.
                clipped=read
                if read.start < coord.start or read.stop > coord.stop:
                    start=max(read.start,coord.start)
                    stop=min(read.stop,coord.stop)
                    clipped=read._slice(start,stop)
                read_in_coord = _translate(coord,clipped)
                for l in xrange(read_in_coord.start,read_in_coord.stop):
                    data[coord.id]["values"][l]+=1.0/NC/NM
        for j,y in enumerate(coords):
            print(data[y.id]["coord"])
            print(data[y.id]["values"])

    '''
예제 #20
0
파일: sort.py 프로젝트: HaoKuo/bam2x
def run(args):
    """Sort the records of args.input and print them to args.output.

    When args.format is "guess" it is replaced by IO.guess_format(args.input).
    Progress is reported through logging every 10000 records.
    """
    # The input is handed to TableIO.parse by name; the extra handle that
    # used to be opened here was never read or closed.
    out=IO.fopen(args.output,"w")
    if args.format=="guess":
        args.format=IO.guess_format(args.input)
    s=TableIO.parse(args.input,args.format)
    l=[]
    for i,x in enumerate(s):
        # modulo, not division: "i/10000==0" was true for each of the
        # first 10000 records and never afterwards
        if i%10000==0:
            logging.info("reading %s entrys in %s",i,args.input)
        l.append(x)
    logging.info("begin sorting")
    l.sort()
    logging.info("sorting done")
    for i in l:
        print(i,file=out)
    logging.info("completed")
예제 #21
0
파일: translator.py 프로젝트: HaoKuo/bam2x
def _generate_db(filename,db_filename,table_name):
    """Create table_name in the SQLite file db_filename and load filename into it.

    The schema and insert statements come from the module-level templates
    schema_t and insert_t; rows are read from filename in "simple" format.
    """
    names={"table_name":table_name}
    with sqlite3.connect(db_filename) as conn:
        cursor=conn.cursor()
        cursor.execute(schema_t.substitute(names))
        load_statement=insert_t.substitute(names)
        rows=TableIO.parse(IO.fopen(filename,"r"),"simple")
        cursor.executemany(load_statement,rows)
        conn.commit()
예제 #22
0
def run(args):
    """Count reads upstream of, inside and downstream of every bed12 gene.

    The genes of args.input are dealt round-robin into args.num_cpus lists
    which are processed by a worker pool: count_flank_star for the flanks of
    args.bp and count_list_star for args.bin_num bins.  Results are handed
    to output() under the labels "UP", "TR" and "DN".

    Sets the module-level names bam and out.
    """
    logging.basicConfig(level=logging.INFO)
    global bam,out
    bam=DBI.init(args.bam,"bam")
    fin=IO.fopen(args.input,"r")
    out=IO.fopen(args.output,"w")
    # created after bam/out are set, as in the original
    p=mp.Pool(processes=args.num_cpus)
    beds_list=[[] for i in xrange(args.num_cpus)]
    # counted explicitly: "i0+1" after the loop raised NameError when the
    # input had no records
    gene_num=0
    for bed in TableIO.parse(fin,"bed12"):
        beds_list[gene_num%args.num_cpus].append(bed)
        gene_num+=1
    if gene_num==0:
        logging.warning("no bed12 records found in %s",args.input)
    print("bin_id\tmean\tentropy\treverse_strand_mean\treverse_strand_entropy",file=out)
    try:
        up_results=p.map(count_flank_star,itertools.izip(beds_list,itertools.repeat(args.bp),itertools.repeat(args.strand),itertools.repeat(True)))
        output(up_results,args.bp,gene_num,"UP")
        results = p.map(count_list_star,itertools.izip(beds_list,itertools.repeat(args.bin_num),itertools.repeat(args.strand)))
        output(results,args.bin_num,gene_num,"TR")
        down_results=p.map(count_flank_star,itertools.izip(beds_list,itertools.repeat(args.bp),itertools.repeat(args.strand),itertools.repeat(False)))
        output(down_results,args.bp,gene_num,"DN")
    finally:
        # release the worker processes
        p.close()
        p.join()
예제 #23
0
파일: isoforms.py 프로젝트: HaoKuo/bam2x
def run(local_args):
    '''
    Run querys() over all input records with a pool of worker processes.

    The records of args.input (format args.format) are dealt round-robin
    into args.num_cpus lists, each list is passed to querys() in a worker,
    and the collected results are handed to output().  A comment header
    with program, date and command line is written to args.output first.

    Sets the module-level names args, out, MIN_INTRON_LENGTH,
    MIN_SPLICING_SITES_SCORE, MIN_FPK_RATIO and query_num.
    '''
    logging.basicConfig(level=logging.WARNING)
    global args,out,dbi_bam,g, MIN_INTRON_LENGTH, MIN_SPLICING_SITES_SCORE, MIN_FPK_RATIO,query_num
    MIN_INTRON_LENGTH=10
    MIN_SPLICING_SITES_SCORE=2
    args=local_args
    MIN_FPK_RATIO=args.min_uniq_fpk_increase #TO TEST
    fin=IO.fopen(args.input,"r")
    out=IO.fopen(args.output,"w")
    print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :"
    print >>out,"#\t"," ".join(sys.argv)
    reader=TableIO.parse(fin,args.format)
    query_lists=[[] for i in range(args.num_cpus)]
    # counted explicitly: "i+1" after the loop raised NameError when the
    # input had no records
    query_num=0
    for x in reader:
        query_lists[query_num%args.num_cpus].append(x)
        query_num+=1
    # the pool is created after the globals above are set, as in the original
    pool=Pool(processes=args.num_cpus)
    try:
        results=pool.map(querys,query_lists)
    finally:
        # release the worker processes
        pool.close()
        pool.join()
    output(results)
예제 #24
0
def run(args):
    """Translate records back through the gene they are expressed on.

    args.translator is either a SQLite ".db" file or a table from which
    "<translator>.db" is generated on first use.  For every record of
    args.input the gene named by the record's chr field is fetched from
    args.table_name and reverse_translate(gene, record) is written to
    args.output.  Records whose gene is not in the database are skipped
    with a warning.
    """
    logging.basicConfig(level=logging.DEBUG)
    db_filename=args.translator
    t_ext=splitext(args.translator)[1]
    # splitext keeps the dot, so the old comparison with "db" was always
    # true and an existing "x.db" was looked up as "x.db.db"
    if t_ext!=".db":
        possible_db=args.translator+".db"
        print(possible_db)
        if not os.path.exists(possible_db):
            _generate_db(args.translator,possible_db,args.table_name)
        db_filename=possible_db

    out=IO.fopen(args.output,"w")
    with sqlite3.connect(db_filename) as conn:
        # drop the first column of the row and build a Bed12 from the rest
        conn.row_factory=lambda conn,x: Bed12._make(Bed12._types(x[1:]))
        cursor=conn.cursor()
        for i in TableIO.parse(IO.fopen(args.input,"r"),"bed"):
            # NOTE(review): the statement is built by string substitution;
            # the template is defined outside this function.
            s=template.substitute({"table_name":args.table_name,"name":i.chr.strip()})
            print(s)
            cursor.execute(s)
            gene=cursor.fetchone()
            if gene is None:
                # previously this case crashed on gene.cdna_length(); the
                # warning and "continue" sat unreachable behind a bare raise
                logging.warning("can't find gene %s"%i.chr)
                continue
            logging.debug(i)
            logging.debug(i.cdna_length())
            logging.debug(gene)
            logging.debug(gene.cdna_length())
            assert gene.cdna_length() > i.cdna_length()
            print(reverse_translate(gene,i),file=out) 
예제 #25
0
파일: bam2peak.py 프로젝트: HaoKuo/bam2x
def run(args_local):
    '''
    Call peaks on every reference sequence of a BAM file.

    process_chrom() is mapped over the reference names in a worker pool and
    yields (coverage, bedgraph) per reference.  The mean coverage is
    sum(coverage) / (2 * sum(reference lengths)) * 1000.  The exon cutoff is
    the smallest threshold >= 1 for which
    prob.poisson_cdf(threshold, mean, False) drops below args.pvalue.
    call_peaks_star() is then mapped over (reference, bedgraph, cutoff) and
    the result handed to output().

    Sets the module-level names args, out, exon_cutoff, intron_cutoff and
    hasGenome.
    '''
    global args,out,exon_cutoff,intron_cutoff,hasGenome
    args=args_local
    dbi=DBI.init(args.input,"bam")
    out=IO.fopen(args.output,"w")
    hasGenome=bool(args.genome)

    print >>out,"# This positive_data was generated by program ",sys.argv[1]," (version: %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :"
    print >>out,"#\t"," ".join(sys.argv)

    chrs=list(dbi.bamfiles[0].references)
    lengths=list(dbi.bamfiles[0].lengths)

    p=mp.Pool(processes=args.num_cpus)
    coverage_bedgraphs=p.map(process_chrom,chrs)
    coverages=[pair[0] for pair in coverage_bedgraphs]
    bedgraphs=[pair[1] for pair in coverage_bedgraphs]

    s=sum(coverages,0.0)
    # both strands are counted
    l=sum(lengths,long(0))*2
    coverage=s/l*1000.0

    threshold=1
    while not prob.poisson_cdf(threshold,coverage,False) < args.pvalue:
        threshold+=1
    exon_cutoff=threshold
    intron_cutoff=2 #TODO revise it
    print >>out,"# MEAN COVERAGE:",coverage
    print >>out,"# EXON COVERAGE CUTOFF:",exon_cutoff

    peaks=p.map(call_peaks_star,itertools.izip(chrs,bedgraphs,itertools.repeat(exon_cutoff)))
    output(chrs,peaks)
예제 #26
0
def run(args):
    """Write an RPKM value for every bed12 transcript of args.input.

    Reads of args.bam that are compatible with the transcript are counted:
    each counts 1 when args.uniq is set, otherwise 1/NH where NH is the
    first of the three comma-separated fields of the read's itemRgb.
    RPKM = count * (1e6 / mapped reads) * (1e3 / cdna_length).
    """
    dbi=DBI.init(args.bam,"bam")
    mapped=dbi.mapped
    out=IO.fopen(args.output,"w")
    print("Gene\tRPKM",file=out)
    for i in TableIO.parse(IO.fopen(args.input,"r"),"bed12"):
        s=0.0
        l=i.cdna_length()
        # one loop for both modes; only the weight of a read differs
        for j in dbi.query(i,method="bam1",strand=args.strand,uniq=args.uniq):
            if not compatible_with_transcript(j,i):
                continue
            if args.uniq:
                s+=1.0
            else:
                (nh,_,_)=j.itemRgb.split(",")
                s+=1.0/int(nh)
        rpkm=float(s)*(1000000.0/mapped)*(1000.0/float(l))
        # sep="\t" writes "id<TAB>rpkm"; the old two-step print left a
        # space between the id and the tab
        print(i.id,rpkm,sep="\t",file=out)
예제 #27
0
def parse_simple(handle,**dict):
    """Yield the rows of a delimited text table as tuples of strings.

    handle is a file name (opened with IO.fopen and closed again here) or
    an iterable of lines such as an open file.  The keyword argument sep
    sets the delimiter (default: tab).  Blank rows and rows whose first
    field starts with "#" after stripping whitespace are skipped.

    When handle is a file name, an IOError is reported on standard error
    and ends the iteration instead of propagating.
    """
    sep=dict.get("sep","\t")

    def data_rows(lines):
        # rows of the table without blank rows and "#" comment rows
        for i in csv.reader(lines,delimiter=sep):
            # a blank line gives [] and an empty first field gives "";
            # indexing [0] on either used to raise IndexError
            if not i or i[0].strip().startswith("#"):
                continue
            yield tuple(i)

    if isinstance(handle,str):
        try:
            handle=IO.fopen(handle,"r")
            try:
                for row in data_rows(handle):
                    yield row
            finally:
                # also runs when the consumer stops early or reading fails
                handle.close()
        except IOError as e:
            sys.stderr.write("I/O error({0}): {1}\n".format(e.errno, e.strerror))
    else:
        for row in data_rows(handle):
            yield row
예제 #28
0
파일: DB.py 프로젝트: adamwespiser/bam2x
 def __init__(self,file,**dict):
     '''
     Build the bin index stored in self.data.

     file is a file name (read as a tab-separated table) or an iterable
     that is handed to binindex() unchanged.

     The optional keyword cls is a record class or the name of one in
     hclass.  An unknown name is reported on standard error and passed on
     as given.
     '''
     source=file
     if isinstance(source,str):
         source=csv.reader(IO.fopen(source,"r"),delimiter="\t")
     if "cls" not in dict:
         self.data=binindex(source)
         return
     cls=dict["cls"]
     if isinstance(cls,str):
         if cls in hclass:
             cls=hclass[cls]
         else:
             print >>sys.stderr,"UNKNOWN FORMAT %s IN BININDEX DATA STRUCT"%cls
     self.data=binindex(source,cls=cls)
예제 #29
0
파일: aggregation.py 프로젝트: HaoKuo/bam2x
def run(args):
    """Aggregate read coverage around the TSS (or TTS) of every record.

    For each record of args.input the reads of args.bam in the window from
    args.up bases before to args.down bases after the site are translated
    into site coordinates and counted per position.  Per position the mean
    count over all records and gini_coefficient() of the counts are written
    to args.output.  A plot is saved to "<args.output>.png" on a best-effort
    basis.  Nothing is written when the input has no records.
    """
    logging.basicConfig(level=logging.INFO)
    up=args.up
    down=args.down
    bp_num=up+down
    offset=-up
    bam=DBI.init(args.bam,"bam")
    fin=IO.fopen(args.input,"r")
    out=IO.fopen(args.output,"w")
    bin_sum=[0 for i in xrange(bp_num)]
    bin_dis=[[] for i in xrange(bp_num)]
    # counted explicitly: "i0+1" after the loop raised NameError when the
    # input had no records
    bed_num=0
    for bed in TableIO.parse(fin,args.format):
        bed_num+=1
        bed_bin=[0 for i in xrange(bp_num)]
        if args.tts:
            pos=bed.tts()
        else:
            pos=bed.tss()
        pos_flank=get_flank_region(pos,up,down)
        for read in bam.query(pos_flank,"bam1",strand="read1"):
            a=translate_coordinates(pos,read)
            for e in a.Exons():
                # shift into window indices and clip to the window
                start=max(e.start-offset,0)
                end=min(e.stop-offset,bp_num)
                for j in xrange(start,end):
                    bed_bin[j]+=1
        for i in xrange(bp_num):
            bin_sum[i]+=bed_bin[i]
            bin_dis[i].append(bed_bin[i])
    if bed_num==0:
        logging.warning("no records found in %s",args.input)
        return
    bin_e=[gini_coefficient(dis) for dis in bin_dis]
    if args.tts:
        print("pos_to_tts\taggregation_mean\tgini_coefficient",file=out)
    else:
        print("pos_to_tss\taggregation_mean\tgini_coefficient",file=out)
    for i in xrange(bp_num):
        print("{bin}\t{aggregation}\t{E}".format(bin=i+offset,aggregation=float(bin_sum[i])/bed_num,E=bin_e[i]),file=out)

    try:
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        matplotlib.rcParams.update({'font.size':9})
        ax1=plt.subplot2grid((7,1),(6,0))
        plt.ylabel('gini coeffecient')
        plt.fill_between(range(-up,down),bin_e,color="r",alpha=0.2,y2=0)
        ax1.set_ylim(0,1)
        ax1.set_xlim(-up,down)
        ax1.axes.get_xaxis().set_visible(False)
        plt.axvline(x=0,linewidth=1, color='y')
        ax2=plt.subplot2grid((7,1),(0,0),rowspan=5)
        ax2.set_xlim(-up,down)
        plt.plot(range(-up,down),[float(i)/bed_num for i in bin_sum])
        plt.ylabel('mean coverage')
        if args.tts:
            plt.xlabel('pos to tts (bp)')
        else:
            plt.xlabel('pos to tss (bp)')
        plt.axvline(x=0,linewidth=1, color='y')
        plt.grid(True)
        plt.savefig(args.output+".png")
    except Exception as e:
        # The plot stays optional (matplotlib may be missing), but the
        # reason is reported, and KeyboardInterrupt/SystemExit are no
        # longer swallowed as they were by the bare "except".
        logging.warning("plot %s.png not written: %s",args.output,e)
예제 #30
0
def run(args):
    """Group neighbouring bed6 records and write one bed12 line per kept group.

    Records of args.input are read in order.  A record starts a new group
    when its chromosome differs from the previous one or when its start is
    more than args.gap beyond the largest stop seen in the current group.
    Each finished group is summarised by process(); the gini values of all
    kept groups are fitted with fit_two_peaks_EM() and scored with
    bayes_p2() before the result is written to args.output.
    """
    logging.basicConfig(level=logging.INFO)
    def process():
        """Summarise the current group in buff.

        Returns 1 when a record was appended to records and 0 otherwise
        (the group has a single entry, or its total score is below
        args.min_reads_number).
        """
        if len(buff)==1: return 0
        max_score=0.0  # never read
        total_score=0.0
        e=[]
        for i in buff:
            total_score+=i.score
            e.append(i.score)
        # scores as fractions of the group total
        e=[i/total_score for i in e]
        gini=gini_coefficient(e)
        if total_score < args.min_reads_number:
            return 0
        record={}
        # the region ends at the stop of the last buffered entry (not at the
        # running maximum last_stop); its score is the total score
        meta=BED6(buff[0].chr,buff[0].start,buff[-1].stop,args.prefix+"."+str(group_id),total_score,buff[0].strand)
        peak=max(buff,key=lambda x:x.score)
        # the peak's score becomes its share of the group total
        record["peak"]=peak._replace(score=peak.score/total_score)
        # the region takes the strand of the peak
        record["meta"]=meta._replace(strand=peak.strand)
        record["gini"]=gini
        records.append(record)
        return 1
    
    
    def simple_output():
        """Write p2, gini, region and peak of every record (currently not called)."""
        print("# formats: bayes_prob_model2, gini, [ region bed, score is total reads], [peak bed , score is proportion ]",file=out)
        for i,x in enumerate(records):
            print("{p2}\t{gini}\t{meta}\t".format(p2=p2[i],meta=x["meta"],gini=x["gini"]),end="",file=out)
            print(x["peak"],file=out)
    def bed12_output():
        """Write every record as one bed12 line with a single block."""
        print("# formats: bed12 , [R,G,B] are corresponding to [ TTS_GINI_PVALUE*200, TSS_GINI_PALUE*200, PROPORTION_OF_PEAK*200 ]",file=out)
        for i,x in enumerate(records):
            # p2 goes into the red channel with args.tts, else into green
            if args.tts:
                g=0
                r=int(p2[i]*200)
            else:
                g=int(p2[i]*200)
                r=0
            # blue is computed from the record's gini value
            b=int(x["gini"]*200)
            # ids of records with p2 above 0.5 get the suffix ".end"
            if p2[i]>0.5:
                meta=x["meta"]._replace(id=x["meta"].id+".end")
            else:
                meta=x["meta"]
            rgb="{r},{g},{b}".format(r=r,g=g,b=b)
            # NOTE(review): the peak is read through ".end" here while the
            # rest of this function uses ".stop" - confirm BED6 offers both.
            print("{bed6}\t{thickStart}\t{thickEnd}\t{itemRgb}\t{blockCount}\t{blockSizes}\t{blockStarts}".format(bed6=meta,thickStart=x["peak"].start,thickEnd=x["peak"].end,itemRgb=rgb,blockSizes=x["meta"].stop-x["meta"].start,blockCount=1,blockStarts=0),file=out)

    
    records=[]
    GAP=args.gap
    fin=IO.fopen(args.input,"r")
    out=IO.fopen(args.output,"w")
    iterator=TableIO.parse(fin,"bed6")
    # the first record seeds the first group; .next() is the Python 2
    # iterator method
    last=iterator.next()
    last_stop=last.stop
    group_id=0
    buff=[last]
    last_chr=last.chr
    for x,i in enumerate(iterator):
        if x%10000==0: logging.info("processing {x} reads".format(x=x));
        if i.chr!=last_chr or i.start-last_stop > GAP:
            # close the current group; group_id only advances for kept groups
            group_id+=process()
            buff=[i]
            last_chr=i.chr
            last_stop=i.stop
        else:
            buff.append(i)
            if i.stop>last_stop:
                last_stop=i.stop

    # close the last group (its return value is not needed any more)
    process()
    gini=array([i["gini"] for i in records])
    model=fit_two_peaks_EM(gini)
    p2=bayes_p2(gini,model)
    print("# Date: ",time.asctime(),file=out)
    print("# Program Version ",VERSION,file=out)
    print("# The command line is :",file=out)
    print("#\t"," ".join(sys.argv),file=out)
    print("# learning model:",file=out)
    print("#",model_str(model),file=out)
    #simple_output()
    bed12_output()
예제 #31
0
def run(args):
    """Print remove_small_introns(record, args.cutoff) for every bed12 record.

    Records are read from args.input and the results written to args.output.
    """
    source=IO.fopen(args.input,"r")
    out=IO.fopen(args.output,"w")
    cleaned=(remove_small_introns(bed,args.cutoff) for bed in TableIO.parse(source,"bed12"))
    for bed in cleaned:
        print(bed,file=out)