Beispiel #1
0
def querys(listx):
    """Run ``query`` on every element of *listx* and return the results in order.

    One BAM handle and one genome handle are opened through ``DBI.init``
    (from the module-level ``args``) on each call and are shared by all
    elements of the batch.
    """
    bam_handle = DBI.init(args.bam, "bam")
    genome_handle = DBI.init(args.genome, "genome")
    return [query(item, bam_handle, genome_handle) for item in listx]
Beispiel #2
0
def run(args):
    """Query a tabix database with every record of the input table.

    For each record parsed from ``args.input`` (in ``args.format``) a
    ``QR`` line is written to ``args.output``, followed by one
    ``HT_<n>`` line per hit returned by the database, numbered from 1.

    If ``args.type`` is a key of ``hclass`` the mapped class is passed to
    ``DBI.init`` as ``cls``; otherwise ``DBI.init`` is called without a
    ``cls`` argument.
    """
    out = IO.fopen(args.output, "w")
    # Membership test with ``in``: dict.has_key() was removed in Python 3,
    # while ``in`` behaves the same on Python 2 and 3.
    if args.type in hclass:
        dbi = DBI.init(args.db, "tabix", cls=hclass[args.type])
    else:
        dbi = DBI.init(args.db, "tabix")
    for record in TableIO.parse(IO.fopen(args.input, "r"), args.format):
        print("QR", record, file=out)
        for rank, hit in enumerate(dbi.query(record), 1):
            print("HT_{k}\t{ht}".format(k=rank, ht=hit), file=out)
def run(args):
    """Report database entries hit by each BED12 query.

    The database ``args.bed`` is opened as "tabix" when a ``.tbi`` file sits
    next to it, otherwise as "binindex". For every BED12 record of
    ``args.input`` a ``QR`` line is written, then one ``HT`` line for each
    hit that is compatible with the query (translated with
    ``_translate_to_meta``). Incompatible hits are written as ``OP`` lines
    unless ``args.hit`` is set. A blank line ends each query.
    """
    index_kind = "tabix" if os.path.isfile(args.bed + ".tbi") else "binindex"
    database = DBI.init(args.bed, index_kind, cls=BED12)
    out = IO.fopen(args.output, "w")
    for query_bed in TableIO.parse(IO.fopen(args.input, "r"), "bed12"):
        print("QR\t", query_bed, file=out)
        for hit in database.query(query_bed):
            if compatible_with_transcript(hit, query_bed):
                print("HT\t{}".format(_translate_to_meta(query_bed, hit)), file=out)
                continue
            if not args.hit:
                print("OP\t{}".format(hit), file=out)
        print("", file=out)
Beispiel #4
0
def run(args):
    """Write the genome sequence of every BED record as a FASTA entry.

    Records are parsed from ``args.input`` using the format
    ``"bed" + str(args.bed_column_number)``. For each record the genome
    database ``args.genome`` is queried with ``method=args.method`` and
    the result, passed through ``seq_wrapper``, is written to
    ``args.output`` below a ``>`` header line named
    ``<record id>_<method>``.
    """
    bedformat = "bed" + str(args.bed_column_number)
    dbi = DBI.init(args.genome, "genome")
    out = IO.fopen(args.output, "w")
    for record in TableIO.parse(IO.fopen(args.input, "r"), bedformat):
        # Build the header as a single string. Passing ">" and the name as
        # separate print() arguments inserts a space after ">", which
        # leaves the FASTA identifier empty for parsers that take the ID
        # as the text immediately following ">".
        print(">{}".format(record.id + "_" + args.method), file=out)
        print(seq_wrapper(dbi.query(record, method=args.method)), file=out)
Beispiel #5
0
def run(args):
    """Query a bigWig database for every input record.

    For each record parsed from ``args.input`` (format ``args.format``)
    all values yielded by the bigWig query (``method=args.method``) are
    collected first; then a ``QR`` line with the record and an ``HT`` line
    with the collected list are written to ``args.output``.
    """
    bigwig = DBI.init(args.bw, "bigwig")
    out = IO.fopen(args.output, "w")
    records = TableIO.parse(IO.fopen(args.input, "r"), args.format)
    for record in records:
        # Materialise the hits before anything is written for this record.
        hits = list(bigwig.query(record, method=args.method))
        print("QR", record, file=out)
        print("HT", hits, file=out)
Beispiel #6
0
def run(args):
    """Query a BAM database with every BED record of the input.

    The input format is ``"bed" + str(args.bed_column_number)``. Each
    record is written as a ``QR`` line, followed by one ``HT`` line per
    item yielded by the BAM query (``method=args.method``) and a blank
    separator line.
    """
    bed_format = "bed" + str(args.bed_column_number)
    bam = DBI.init(args.bam, "bam")
    out = IO.fopen(args.output, "w")
    records = TableIO.parse(IO.fopen(args.input, "r"), bed_format)
    for record in records:
        print("QR", record, file=out)
        for hit in bam.query(record, method=args.method):
            print("HT", hit, file=out)
        print("", file=out)
Beispiel #7
0
def run(args):
    """Export the genome sequence of each BED record in FASTA layout.

    Records whose query result has length zero are skipped. For all others
    a ``>`` header named ``<record id>_<method>`` is written, followed by
    the ``seq_wrapper`` output, which is printed without an additional
    trailing newline.
    """
    bed_format = "bed" + str(args.bed_column_number)
    genome = DBI.init(args.genome, "genome")
    out = IO.fopen(args.output, "w")
    for record in TableIO.parse(IO.fopen(args.input, "r"), bed_format):
        sequence = genome.query(record, method=args.method)
        if len(sequence) > 0:
            header = ">{}".format(record.id + "_" + args.method)
            print(header, file=out)
            print(seq_wrapper(sequence), file=out, end="")
Beispiel #8
0
def run(local_args):
    '''
    Annotate every input record with the gene features it hits.

    Publishes ``local_args`` and the output handle as the module globals
    ``args`` and ``out``, writes a comment banner, and loads the gene
    table ``args.genetab`` into a "binindex" database. From each gene the
    upstream and downstream regions, exons, introns and (when not None)
    the 3' and 5' UTRs are collected into separate "binindex" databases.

    Each record of ``args.input`` is then written followed by seven
    columns holding ``toIDs`` of the query result against the gene,
    upstream, downstream, exon, intron, utr3 and utr5 databases. A column
    header (12-column variant for ``Bed12`` records) precedes the first
    record. ``args.format == "guess"`` is resolved with
    ``IO.guess_format`` and stored back on ``args``.
    '''
    global args,out
    args=local_args
    out=IO.fopen(args.output,"w")
    fin=IO.fopen(args.input,"r")

    # Provenance banner.
    print("# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION,file=out)
    print("# in bam2x ( https://github.com/nimezhu/bam2x )",file=out)
    print("# Date: ",time.asctime(),file=out)
    print("# The command line is :",file=out)
    print("#\t"," ".join(sys.argv),file=out)

    gene=DBI.init(args.genetab,"binindex",cls="bed12")
    upstream_list,downstream_list=[],[]
    exons_list,introns_list=[],[]
    utr3_list,utr5_list=[],[]
    for g in gene:
        upstream_list.append(g.upstream(args.upstream))
        downstream_list.append(g.downstream(args.downstream))
        exons_list.extend(g.Exons())
        introns_list.extend(g.Introns())
        if g.utr3() is not None:
            utr3_list.append(g.utr3())
        if g.utr5() is not None:
            utr5_list.append(g.utr5())

    # Databases in output-column order; the gene index comes first.
    feature_lists=(upstream_list,downstream_list,exons_list,introns_list,utr3_list,utr5_list)
    feature_indexes=[gene]
    for features in feature_lists:
        feature_indexes.append(DBI.init(features,"binindex",cls="bed6"))
    last_column=len(feature_indexes)-1

    if args.format=="guess":
        args.format=IO.guess_format(args.input)
    for (row_number,record) in enumerate(TableIO.parse(fin,args.format)):
        if row_number==0:
            if isinstance(record,Bed12):
                print("#chr\tstart\tend\tname\tscore\tstrand\tthick_start\tthick_end\titem_rgb\tblock_count\tblock_sizes\tblock_starts\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5",file=out)
            else:
                print("#chr\tstart\tend\tname\tscore\tstrand\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5",file=out)

        print(record,file=out,end="")
        # Query and write one column at a time; only the last column
        # terminates the line.
        for column,index in enumerate(feature_indexes):
            line_end="\n" if column==last_column else ""
            print("\t",toIDs(index.query(record)),file=out,end=line_end)
Beispiel #9
0
def run(args):
    """Report BAM reads overlapping each BED12 transcript.

    For every transcript of ``args.input`` a ``QR`` line is written, then
    the BAM database is queried (``method="bam1"``, ``strand=args.strand``).
    Reads compatible with the transcript are written as ``HT`` lines after
    ``_translate_to_meta``; other reads are written as ``OP`` lines unless
    ``args.hit`` is set. A blank line closes each transcript.
    """
    bam = DBI.init(args.bam, "bam")
    out = IO.fopen(args.output, "w")
    for transcript in TableIO.parse(IO.fopen(args.input, "r"), "bed12"):
        print("QR\t", transcript, file=out)
        for read in bam.query(transcript, method="bam1", strand=args.strand):
            if compatible_with_transcript(read, transcript):
                print("HT\t{}".format(_translate_to_meta(transcript, read)), file=out)
                continue
            if not args.hit:
                print("OP\t{}".format(read), file=out)
        print("", file=out)
Beispiel #10
0
def run(args_local):
    '''
    Drive genome-wide peak calling on a BAM file.

    Publishes ``args_local`` and the derived settings as module globals
    (``args``, ``out``, ``exon_cutoff``, ``intron_cutoff``, ``hasGenome``),
    writes a comment banner to ``args.output``, maps ``process_chrom`` over
    every reference sequence of the first BAM file in a process pool, derives
    a mean coverage and an exon coverage cutoff from the results, maps
    ``call_peaks_star`` over the per-chromosome bedgraphs and hands the peaks
    to ``output``.

    NOTE(review): the ``print >>out, ...`` statements are Python 2
    print-chevron syntax, and ``long`` / ``itertools.izip`` exist only in
    Python 2.
    '''
    global args,out,exon_cutoff,intron_cutoff,hasGenome
    args=args_local
    dbi=DBI.init(args.input,"bam")
    out=IO.fopen(args.output,"w")
    # hasGenome mirrors the truthiness of args.genome.
    hasGenome=False
    if args.genome:
        hasGenome=True
    
    '''
    END OF IO TEMPLATE 
    '''
    # The trailing comma suppresses the newline (Python 2), so the next
    # print continues on the same output line.
    print >>out,"# This positive_data was generated by program ",sys.argv[1]," (version: %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :"
    print >>out,"#\t"," ".join(sys.argv)
    # Copy reference names and lengths of the first BAM file into lists.
    chrs=[]
    lengths=[]
    for i in dbi.bamfiles[0].references:
        chrs.append(i)
    for i in dbi.bamfiles[0].lengths:
        lengths.append(i)
    # The pool is created after the globals above are assigned.
    # NOTE(review): presumably so that worker processes see them - confirm.
    p=mp.Pool(processes=args.num_cpus)    
    # Each result is indexed below as [0] = coverage, [1] = bedgraph.
    coverage_bedgraphs=p.map(process_chrom,chrs)
    bedgraphs=[]
    coverages=[]
    for i in range(len(chrs)):
        bedgraphs.append(coverage_bedgraphs[i][1])
        coverages.append(coverage_bedgraphs[i][0])
    s=0.0  # 1000.0
    l=long(0)
    # Total coverage and total reference length over all chromosomes.
    for i in range(len(chrs)):
        s+=coverages[i]
        l+=lengths[i]
    l=l*2 # Double Strand
    # NOTE(review): raises ZeroDivisionError when the total length is 0
    # (e.g. a BAM without reference sequences).
    coverage=s/l*1000.0
    # Raise the threshold until poisson_cdf drops below args.pvalue.
    # NOTE(review): the meaning of the third argument (False) is not
    # visible here - presumably it selects the upper tail; confirm.
    threshold=1
    while 1:
        if prob.poisson_cdf(threshold,coverage,False) < args.pvalue: break
        threshold+=1
    exon_cutoff=threshold
    intron_cutoff=2 #TODO revise it
    print >>out,"# MEAN COVERAGE:",coverage
    print >>out,"# EXON COVERAGE CUTOFF:",exon_cutoff
    
    #call_peaks(bedgraphs[0],1) #debug
    # One task per chromosome: (chromosome name, its bedgraph, exon cutoff).
    peaks=p.map(call_peaks_star,itertools.izip(chrs,bedgraphs,itertools.repeat(exon_cutoff)))
    output(chrs,peaks)
Beispiel #11
0
def process_chrom(chrom):
    """Build the coverage bedgraph of one chromosome.

    Opens its own BAM handle from the module-level ``args.input`` and
    queries all reads of *chrom* (``method="bam1"``, ``strand=args.strand``).
    Reads whose strand is "+" or "." are collected as positive-strand
    data, all other reads as negative-strand data. Exon and intron
    blocks are tracked separately.

    Returns:
        A tuple ``(coverage, entries)``. ``coverage`` is the sum of
        ``(stop - start) * score / 1000.0`` over every exon bedgraph
        interval of both strands, regardless of the cutoff. ``entries``
        is a list of ``(start, stop, score, strand_code, group_code)``
        tuples sorted by their first three fields. It holds the exon
        intervals scoring at least ``args.cutoff`` and the intron
        intervals scoring at least 1.0.
    """
    INTRON_CUTOFF = 1.0
    local_dbi = DBI.init(args.input, "bam")
    # try/finally releases the BAM handle even when the query or the
    # bedgraph conversion raises.
    try:
        positive_data = TuringSortingArray()
        negative_data = TuringSortingArray()
        positive_intron_data = TuringSortingArray()
        negative_intron_data = TuringSortingArray()

        for read in local_dbi.query(chrom, method="bam1", strand=args.strand):
            # Unstranded reads (".") are counted on the positive strand.
            if read.strand == "+" or read.strand == ".":
                exon_codes, intron_codes = positive_data, positive_intron_data
            else:
                exon_codes, intron_codes = negative_data, negative_intron_data
            exon_codes.append(TuringCode(read.start, cb.ON))
            exon_codes.append(TuringCode(read.stop, cb.OFF))
            for block in read.Exons():
                exon_codes.append(TuringCode(block.start, cb.BLOCKON))
                exon_codes.append(TuringCode(block.stop, cb.BLOCKOFF))
            for block in read.Introns():
                intron_codes.append(TuringCode(block.start, cb.BLOCKON))
                intron_codes.append(TuringCode(block.stop, cb.BLOCKOFF))

        cutoff = args.cutoff
        coverage = 0.0
        retv = []
        exon_tracks = (
            (positive_data, POSITIVE_STRAND),
            (negative_data, NEGATIVE_STRAND),
        )
        for codes, strand_code in exon_tracks:
            for x in codesToBedGraph(codes.iter()):
                if x[SCORE_INDEX] >= cutoff:
                    retv.append((x[0], x[1], x[2], strand_code, EXON_GROUP_CODE))
                # Coverage counts every interval, not only those kept.
                coverage += float(x[1] - x[0]) * x[2] / 1000.0
        intron_tracks = (
            (positive_intron_data, POSITIVE_STRAND),
            (negative_intron_data, NEGATIVE_STRAND),
        )
        for codes, strand_code in intron_tracks:
            for x in codesToBedGraph(codes.iter()):
                if x[SCORE_INDEX] >= INTRON_CUTOFF:
                    retv.append((x[0], x[1], x[2], strand_code, INTRON_GROUP_CODE))
        # TODO: the tie-breaking order between exon and intron entries with
        # equal (start, stop, score) is left to the stable sort.
        retv.sort(key=itemgetter(0, 1, 2))
        return coverage, retv
    finally:
        local_dbi.close()
Beispiel #12
0
def run(args):
    """Compute per-base read coverage over clusters of BED12 genes.

    All BED12 records of ``args.input`` are loaded, sorted and grouped with
    ``iter_cluster``. For every gene of a cluster a coordinate system is
    built with ``up_down_coordinate`` (extended by ``args.bp`` on both
    sides) together with a zero-filled value list of its cDNA length. Each
    read of the cluster region adds ``1.0/NC/NM`` to every covered position
    of every coordinate system it overlaps and is compatible with, where NC
    is the number of such coordinate systems and NM is ``getNM(read)``.

    NOTE(review): in the visible part of this function every ``print``
    goes to standard output; ``out`` is opened but not written to.
    """
    logging.basicConfig(level=logging.DEBUG) 
    fin=IO.fopen(args.input,"r")
    out=IO.fopen(args.output,"w")
    bam=DBI.init(args.bam,"bam");
    beds=[i for i in TableIO.parse(fin,"bed12")]
    beds.sort()
    bp=args.bp
    print("mapped:{}".format(bam.mapped))
    print("unmapped:{}".format(bam.unmapped))
    # Maps coordinate id -> {"coord": coordinate, "values": per-base list}.
    data={}
    for i,x in enumerate(iter_cluster(beds)):
        # Cluster index is printed 1-based and the start is shifted by +1.
        print("{}\t{}:{}-{}".format(i+1,x["chr"],x["start"]+1,x["stop"]))
        '''
        cds=[z.cds() for z in x["beds"] if z.cds()]
        utr3=[z.utr3() for z in x["beds"] if z.utr3()]
        utr5=[z.utr5() for z in x["beds"] if z.utr5()]
        '''
        
        coords = [ up_down_coordinate(gene,args.bp,args.bp) for gene in x["beds"] ]
        for j,y in enumerate(coords):
            data[y.id]={}
            data[y.id]["coord"]=y
            data[y.id]["values"]=[0.0 for l in range(y.cdna_length())];
        # NOTE(review): coord_beds is not used in the visible part of this
        # function.
        coord_beds = [ _translate(coord,bed) for coord,bed in itertools.izip(coords,x["beds"])]
        # Query the cluster region widened by args.bp on both sides.
        for j,read in enumerate(bam.query(method="bam1",chr=x["chr"],start=x["start"]-args.bp,stop=x["stop"]+args.bp,strand=args.strand)):
            NM=getNM(read)  # number of hits
            NC=0            # number of compatible 
            c_coords=[]
            # First pass: collect the indices of all coordinate systems the
            # read overlaps and is compatible with.
            for k,coord in enumerate(coords):
                if overlap(read,coord) and compatible(read,coord): # don't consider the reads extend out of coords.
                    NC+=1
                    c_coords.append(k)
            # Second pass: spread the read's weight over those systems.
            for k,c in enumerate(c_coords):
                coord=coords[c]
                if read.start < coord.start or read.stop > coord.stop:
                    # Clip the read to the coordinate system's extent.
                    # NOTE(review): this rebinds ``read``, so later
                    # iterations of this loop see the clipped read -
                    # confirm that this is intended.
                    start=max(read.start,coord.start)
                    stop=min(read.stop,coord.stop)
                    read=read._slice(start,stop)
                read_in_coord = _translate(coord,read)
                for l in xrange(read_in_coord.start,read_in_coord.stop):
                    data[coord.id]["values"][l]+=1.0/NC/NM
        # Dump each coordinate system and its per-base values.
        for j,y in enumerate(coords):
            print(data[y.id]["coord"])
            print(data[y.id]["values"])

    '''
Beispiel #13
0
def run(args):
    """Compute binned coverage profiles over genes and their flanks.

    Publishes the BAM handle and the output handle as the module globals
    ``bam`` and ``out``. The BED12 records of ``args.input`` are dealt
    round-robin into ``args.num_cpus`` lists. A process pool then runs
    ``count_flank_star`` for the upstream flank, ``count_list_star`` for
    the transcript body and ``count_flank_star`` again for the downstream
    flank. Each result set is passed to ``output`` with the tags "UP",
    "TR" and "DN".

    When the input holds no records only the column header is written and
    a warning is logged.
    """
    logging.basicConfig(level=logging.INFO)
    global bam, out
    bam = DBI.init(args.bam, "bam")
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")

    beds_list = [[] for _ in range(args.num_cpus)]
    # Count explicitly: relying on the loop variable after the loop raises
    # NameError when the input is empty.
    gene_num = 0
    for bed in TableIO.parse(fin, "bed12"):
        beds_list[gene_num % args.num_cpus].append(bed)
        gene_num += 1

    print("bin_id\tmean\tentropy\treverse_strand_mean\treverse_strand_entropy", file=out)
    if gene_num == 0:
        logging.warning("no BED12 records found in input; nothing to count")
        return

    # The pool is created after ``bam`` and ``out`` have been assigned.
    p = mp.Pool(processes=args.num_cpus)
    try:
        up_results = p.map(
            count_flank_star,
            zip(beds_list, itertools.repeat(args.bp),
                itertools.repeat(args.strand), itertools.repeat(True)))
        output(up_results, args.bp, gene_num, "UP")
        results = p.map(
            count_list_star,
            zip(beds_list, itertools.repeat(args.bin_num),
                itertools.repeat(args.strand)))
        output(results, args.bin_num, gene_num, "TR")
        down_results = p.map(
            count_flank_star,
            zip(beds_list, itertools.repeat(args.bp),
                itertools.repeat(args.strand), itertools.repeat(False)))
        output(down_results, args.bp, gene_num, "DN")
    finally:
        # Release the worker processes even when a task or output() fails.
        p.close()
        p.join()
Beispiel #14
0
def run(args):
    """Write an RPKM value for every BED12 transcript.

    For each transcript of ``args.input`` the BAM database is queried
    (``method="bam1"``, ``strand=args.strand``, ``uniq=args.uniq``) and
    the reads compatible with the transcript are counted. With
    ``args.uniq`` set every read counts 1.0. Otherwise a read counts
    ``1/nh``, where ``nh`` is the first of the three comma-separated
    fields of the read's ``itemRgb``.

    The value written is
    ``count * (1000000.0 / mapped) * (1000.0 / cdna_length)``, one
    ``<id>\\t<rpkm>`` row per transcript below a ``Gene\\tRPKM`` header.
    No guard is applied for a zero mapped-read count or a zero cDNA
    length.
    """
    dbi = DBI.init(args.bam, "bam")
    mapped = dbi.mapped
    out = IO.fopen(args.output, "w")
    print("Gene\tRPKM", file=out)
    for transcript in TableIO.parse(IO.fopen(args.input, "r"), "bed12"):
        s = 0.0
        length = transcript.cdna_length()
        for read in dbi.query(transcript, method="bam1", strand=args.strand, uniq=args.uniq):
            if not compatible_with_transcript(read, transcript):
                continue
            if args.uniq:
                s += 1.0
            else:
                nh, _, _ = read.itemRgb.split(",")
                s += 1.0 / int(nh)
        rpkm = s * (1000000.0 / mapped) * (1000.0 / float(length))
        # Write the row as one formatted string: printing the id and "\t"
        # as separate arguments inserted a space before the tab, so the
        # gene column did not match the tab-separated header.
        print("{}\t{}".format(transcript.id, rpkm), file=out)
Beispiel #15
0
def run(args):
    """Aggregate read coverage around the TSS (or TTS) of every record.

    For each record of ``args.input`` the anchor position is ``bed.tts()``
    when ``args.tts`` is set, else ``bed.tss()``. Reads from the flank
    region (``args.up`` before, ``args.down`` after, queried with
    ``strand="read1"``) are translated relative to the anchor and their
    exon blocks are counted per base.

    One row per position is written to ``args.output`` with the relative
    position, the mean count over all records and the
    ``gini_coefficient`` of the per-record counts. A plot is saved to
    ``args.output + ".png"`` on a best-effort basis; a failure there is
    logged and does not abort the run.

    When the input holds no records only the header is written and a
    warning is logged.
    """
    logging.basicConfig(level=logging.INFO)
    up = args.up
    down = args.down
    bp_num = up + down
    offset = -up
    bam = DBI.init(args.bam, "bam")
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")
    bin_sum = [0] * bp_num
    bin_dis = [[] for _ in range(bp_num)]
    # Count explicitly: relying on the loop variable after the loop raises
    # NameError when the input is empty.
    bed_num = 0
    for bed in TableIO.parse(fin, args.format):
        bed_num += 1
        bed_bin = [0] * bp_num
        pos = bed.tts() if args.tts else bed.tss()
        pos_flank = get_flank_region(pos, up, down)
        for read in bam.query(pos_flank, "bam1", strand="read1"):
            a = translate_coordinates(pos, read)
            for e in a.Exons():
                # Shift into 0-based bin indices and clip to the window.
                start = max(e.start - offset, 0)
                end = min(e.stop - offset, bp_num)
                for j in range(start, end):
                    bed_bin[j] += 1
        for k in range(bp_num):
            bin_sum[k] += bed_bin[k]
            bin_dis[k].append(bed_bin[k])

    bin_e = [gini_coefficient(dis) for dis in bin_dis] if bed_num else []
    if args.tts:
        print("pos_to_tts\taggregation_mean\tgini_coefficient", file=out)
    else:
        print("pos_to_tss\taggregation_mean\tgini_coefficient", file=out)
    if bed_num == 0:
        logging.warning("no records found in input; only the header was written")
        return
    for k in range(bp_num):
        print("{bin}\t{aggregation}\t{E}".format(bin=k + offset, aggregation=float(bin_sum[k]) / bed_num, E=bin_e[k]), file=out)

    # Plotting is optional: matplotlib may be missing or unable to render.
    try:
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        matplotlib.rcParams.update({'font.size': 9})
        ax1 = plt.subplot2grid((7, 1), (6, 0))
        plt.ylabel('gini coeffecient')
        plt.fill_between(range(-up, down), bin_e, color="r", alpha=0.2, y2=0)
        ax1.set_ylim(0, 1)
        ax1.set_xlim(-up, down)
        ax1.axes.get_xaxis().set_visible(False)
        plt.axvline(x=0, linewidth=1, color='y')
        ax2 = plt.subplot2grid((7, 1), (0, 0), rowspan=5)
        ax2.set_xlim(-up, down)
        plt.plot(range(-up, down), [float(total) / bed_num for total in bin_sum])
        plt.ylabel('mean coverage')
        if args.tts:
            plt.xlabel('pos to tts (bp)')
        else:
            plt.xlabel('pos to tss (bp)')
        plt.axvline(x=0, linewidth=1, color='y')
        plt.grid(True)
        plt.savefig(args.output + ".png")
    except Exception as err:
        # Still best-effort, but no longer silent, and KeyboardInterrupt /
        # SystemExit are no longer swallowed.
        logging.warning("could not write plot %s.png: %s", args.output, err)
Beispiel #16
0
def call_peaks(chrom,bedgraph,exon_cutoff):
    '''
    Merge the bedgraph entries of one chromosome into peaks.

    Entries are read in the given order and kept in two accumulators:
    entries whose strand field equals POSITIVE_STRAND go to the positive
    one, every other entry to the negative one. An exon entry scoring at
    least ``exon_cutoff`` joins the current accumulator when it starts
    less than ``gap`` (10) after the furthest stop seen so far; otherwise
    the accumulator is turned into a peak with ``bedsToPeak`` and restarted
    with that entry. Intron entries are always appended and can push the
    furthest stop outward. Leftover accumulators are flushed at the end.

    When the module-level ``hasGenome`` is true, each peak additionally
    passes through ``filter_low_complexity``, which can trim or drop it.

    Returns the list of peaks after ``peaks.sort()``.
    '''
    #TODO
    
    # ``genome`` is only bound when hasGenome is true; the nested filter
    # that reads it is only called under the same condition.
    if hasGenome:
        genome=DBI.init(args.genome,"genome")
    def filter_low_complexity(peak):
        # Keep only the exons of ``peak`` whose sequence complexity exceeds
        # CPLX_CUTOFF; return the rebuilt peak tuple, or None when no exon
        # survives.
        new_exon_starts=[]
        new_exon_sizes=[]
        local_peak=list(peak)
        offset=local_peak[START_INDEX]
        for exon_start,exon_size in itertools.izip(local_peak[EXONSTARTS_INDEX],local_peak[EXONSIZES_INDEX]):
            # Exon starts are relative to the peak start.
            exon_s=offset+exon_start
            exon_e=offset+exon_start+exon_size
            seq=genome.query(BED3(chrom,exon_s,exon_e),method="seq")
            # Complexity is normalised by the number of bases examined,
            # which is capped at the first 100.
            # NOTE(review): an empty ``seq`` would raise ZeroDivisionError
            # in the else branch.
            if len(seq) > 100:
                cplx=float(complexity(seq[0:100]))/100
            else:
                cplx=float(complexity(seq))/len(seq)
            if cplx > CPLX_CUTOFF:
                new_exon_starts.append(exon_start)
                new_exon_sizes.append(exon_size)
        if len(new_exon_sizes) > 0:
            local_peak[EXONSIZES_INDEX]=tuple(new_exon_sizes)
            # New extent: from the first kept exon to the end of the last.
            local_peak[START_INDEX]=new_exon_starts[0]+offset
            local_peak[STOP_INDEX]=new_exon_starts[-1]+offset+new_exon_sizes[-1]
            # Re-base the exon starts on the new peak start.
            shift_start=new_exon_starts[0]
            for i0 in range(len(new_exon_starts)):
                new_exon_starts[i0]-=shift_start
            local_peak[EXONSTARTS_INDEX]=tuple(new_exon_starts)
            return tuple(local_peak)
        else:
            return None
    gap=10
    pos_beds=[]
    neg_beds=[]
    peaks=[]
    # Counters used to name peaks "p_<n>" / "n_<n>"; they advance only when
    # a peak is actually appended inside the loop.
    i_p=0
    i_n=0
    # Furthest stop seen so far per strand; 0 doubles as "nothing yet".
    last_pos_stop=0
    last_neg_stop=0
    for i in bedgraph:
        if i[STRAND_INDEX]==POSITIVE_STRAND:
            if i[GROUP_INDEX]==EXON_GROUP_CODE:
                if i[SCORE_INDEX] >= exon_cutoff:
                    if len(pos_beds)>0:
                        if  i[START_INDEX]-last_pos_stop < gap or last_pos_stop==0:
                            # Close enough: extend the current accumulator.
                            pos_beds.append(i)
                            if last_pos_stop < i[STOP_INDEX]:
                                last_pos_stop=i[STOP_INDEX]
                        else:
                            
                            # Gap reached: flush the accumulator as a peak
                            # and start a new one with this entry.
                            peak=bedsToPeak(pos_beds,"p_"+str(i_p))
                            if peak is not None:
                                if hasGenome:
                                    peak=filter_low_complexity(peak)
                                    if peak is not None:
                                        peaks.append(peak)
                                        i_p+=1
                                else:
                                    peaks.append(peak)
                                    i_p+=1
                            pos_beds=[i]
                            last_pos_stop=i[STOP_INDEX]
                    else:
                        # First entry of a fresh accumulator.
                        last_pos_stop=i[STOP_INDEX]
                        pos_beds.append(i)
            elif i[GROUP_INDEX]==INTRON_GROUP_CODE:
                # Introns are appended without a score or gap test.
                if last_pos_stop < i[STOP_INDEX]:
                    last_pos_stop=i[STOP_INDEX]
                pos_beds.append(i)
        else:
            # Same procedure for every entry that is not positive-strand.
            if i[GROUP_INDEX]==EXON_GROUP_CODE:
                if i[SCORE_INDEX] >= exon_cutoff:
                    if len(neg_beds)>0:
                        if  i[START_INDEX]-last_neg_stop < gap or last_neg_stop==0:
                            neg_beds.append(i)
                            if last_neg_stop < i[STOP_INDEX]:
                                last_neg_stop=i[STOP_INDEX]
                        else:
                            peak=bedsToPeak(neg_beds,"n_"+str(i_n))
                            if peak is not None:
                                if hasGenome:
                                    peak=filter_low_complexity(peak)
                                    if peak is not None:
                                        peaks.append(peak)
                                        i_n+=1
                                else:
                                    peaks.append(peak)
                                    i_n+=1
                            neg_beds=[i]
                            last_neg_stop=i[STOP_INDEX]
                    else:
                        neg_beds.append(i)
                        last_neg_stop=i[STOP_INDEX]
            elif i[GROUP_INDEX]==INTRON_GROUP_CODE:
                if last_neg_stop < i[STOP_INDEX]:
                    last_neg_stop=i[STOP_INDEX]
                neg_beds.append(i)

    # Flush whatever is still accumulated on each strand.
    if len(pos_beds)>0:
        peak=bedsToPeak(pos_beds,"p_"+str(i_p))
        if peak is not None:
            if hasGenome:
                peak=filter_low_complexity(peak)
                if peak is not None:        
                    peaks.append(peak)
            else:
                peaks.append(peak)
    if len(neg_beds)>0:
        peak=bedsToPeak(neg_beds,"n_"+str(i_n))
        if peak is not None:
            if hasGenome:
                peak=filter_low_complexity(peak)
                if peak is not None:        
                    peaks.append(peak)
            
            else:
                peaks.append(peak)
    peaks.sort()
    return peaks