Example #1
0
def Main():
    '''
    IO TEMPLATE
    '''
    global args,out
    args=ParseArg()
    fin=IO.fopen(args.input,"r")
    out=IO.fopen(args.output,"w")
    '''
    END OF IO TEMPLATE 
    '''
    print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :"
    print >>out,"#\t"," ".join(sys.argv)
    dbi=DBI.init(args.db,Tools.guess_format(args.db))
    references=dbi.bamfiles[0].references
    for i in TableIO.parse(fin,args.format):
        print i
        n=0
        c_count=0
        reads=dbi.query(i,args.method)
        for read in reads:
            compatible=Tools.compatible_with_transcript(read,i,references=references,strand=args.strand)
            print "HT:"
            for i0,r in enumerate(TableIO.parse(read.reads,"bam2bed12",references=references)):
                print "READ"+str(i0)+"\t",r
            print "COMPATIBLE:",compatible,"\n\n"
            if compatible: c_count+=1
            n+=1
        print "COMPATIBLE / ALL OVERLAP READS =  ",c_count,"/",n
        print "RATIO\t%.4f"%float(c_count)/n
Example #2
0
def BamToBed12Iterator(handle,**kwargs):
    '''
    Yield one Bed12 record per aligned read of a BAM source.

    handle is either a BAM file name (opened with pysam.Samfile) or an
    iterable of aligned reads.

    Keyword arguments:
      references  a chromosome name (str) used for every read, or a
                  sequence indexed by read.tid. Without it
                  handle.references is tried and "chr" is used when that
                  lookup fails.
      strand      "read1" or "firstMate" (also the behaviour when the key
                  is absent): reads flagged is_read2 get their strand
                  reversed. Any other value: reads flagged is_read1 are
                  reversed instead.

    Reads with tid<0 and reads whose cigar is None are skipped.
    '''
    if isinstance(handle,str):
        handle=pysam.Samfile(handle,"rb")
    has_references="references" in kwargs
    # The mate convention does not depend on the read: decide it once.
    read1=kwargs.get("strand","read1") in ("read1","firstMate")
    for i in handle:
        if i.tid<0: continue
        if i.cigar is None: continue # IGNORE THIS READS?
        if has_references:
            if isinstance(kwargs["references"],str):
                chrom=kwargs["references"]
            else:
                chrom=kwargs["references"][i.tid]
        else:
            try:
                chrom=handle.references[i.tid]
            except (AttributeError,IndexError,KeyError,TypeError):
                # handle offers no usable reference table: use a placeholder
                chrom="chr"
        strand="+"
        if i.is_reverse:
            strand="-"
        if (i.is_read1 and not read1) or (i.is_read2 and read1):
            strand=Tools.reverse_strand(strand)
        start=i.pos
        (block_starts,block_sizes)=Tools.cigar_to_coordinates(i.cigar)
        # Both cds fields carry the read start; the colour is fixed to black.
        yield Bed12([chrom,start,i.aend,i.qname,i.mapq,strand,start,start,"0,0,0",len(block_sizes),block_sizes,block_starts])
Example #3
0
def BamToBed12Iterator(handle, **kwargs):
    '''
    Turn the aligned reads of a BAM source into Bed12 records.

    handle: a BAM file name (opened with pysam.Samfile) or an iterable of
        aligned reads.
    references (keyword): a sequence indexed by read.tid, or a single
        chromosome name given as a str and used for every read. When the
        keyword is absent handle.references is tried, and "chr" is the
        name used if that lookup fails.
    strand (keyword): with "read1" or "firstMate", or without the keyword,
        reads flagged is_read2 have their strand reversed; with any other
        value reads flagged is_read1 are reversed instead.

    Reads with tid < 0 or with a cigar of None produce no record.
    '''
    if isinstance(handle, str):
        handle = pysam.Samfile(handle, "rb")

    # Neither keyword changes from read to read, so resolve both up front.
    if "strand" in kwargs:
        read1 = kwargs["strand"] == "read1" or kwargs["strand"] == "firstMate"
    else:
        read1 = True
    use_kwarg = "references" in kwargs
    # A str would otherwise be indexed by tid and give a single character.
    fixed_name = use_kwarg and isinstance(kwargs["references"], str)

    for i in handle:
        if i.tid < 0 or i.cigar is None:  # IGNORE THIS READS?
            continue

        if fixed_name:
            chrom = kwargs["references"]
        elif use_kwarg:
            chrom = kwargs["references"][i.tid]
        else:
            try:
                chrom = handle.references[i.tid]
            except (AttributeError, IndexError, KeyError, TypeError):
                # no usable reference table on the handle
                chrom = "chr"

        strand = "+"
        if i.is_reverse:
            strand = "-"
        if i.is_read1 and not read1:
            strand = Tools.reverse_strand(strand)
        elif i.is_read2 and read1:
            strand = Tools.reverse_strand(strand)

        start = i.pos
        (block_starts, block_sizes) = Tools.cigar_to_coordinates(i.cigar)
        # cds_start and cds_end are both set to the read start.
        yield Bed12([
            chrom, start, i.aend, i.qname, i.mapq, strand, start, start,
            "0,0,0", len(block_sizes), block_sizes, block_starts
        ])
Example #4
0
File: IO.py Project: nimezhu/xplib
def fopen(file,mode="r",**kwargs):
    '''
    Open *file* in the given mode and return the handle.

    mode "r": input that Tools.guess_format reports as "bam" is opened with
              pysam.Samfile, anything else with open_input.
    mode "w": the file is opened with open_output.
    Any other mode gives None. Extra keyword arguments are accepted but
    not used.
    '''
    # The format is guessed first, whatever the mode turns out to be.
    is_bam=Tools.guess_format(file)=="bam"
    if mode=="r":
        if is_bam:
            return pysam.Samfile(file,"rb")
        return open_input(file)
    elif mode=="w":
        return open_output(file)
    return None
Example #5
0
File: IO.py Project: sterding/bam2x
def fopen(file, mode="r", **kwargs):
    '''
    Return a handle on *file* for reading ("r") or writing ("w").

    Writing is delegated to open_output. Reading uses pysam.Samfile when
    Tools.guess_format(file) is "bam" and open_input otherwise. None is
    returned for every other mode. **kwargs is accepted and ignored.
    '''
    detected = Tools.guess_format(file)  # evaluated before the mode is looked at
    if mode == "w":
        return open_output(file)
    if mode != "r":
        return None
    if detected == "bam":
        return pysam.Samfile(file, "rb")
    return open_input(file)
Example #6
0
def Main():
    '''
    IO TEMPLATE
    '''
    global args, out
    args = ParseArg()
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")
    '''
    END OF IO TEMPLATE 
    '''
    print >> out, "# This data was generated by program ", sys.argv[
        0], " (version: %s)" % VERSION,
    print >> out, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >> out, "# Date: ", time.asctime()
    print >> out, "# The command line is :"
    print >> out, "#\t", " ".join(sys.argv)
    dbi = DBI.init(args.db, Tools.guess_format(args.db))
    references = dbi.bamfiles[0].references
    for i in TableIO.parse(fin, args.format):
        print i
        n = 0
        c_count = 0
        reads = dbi.query(i, args.method)
        for read in reads:
            compatible = Tools.compatible_with_transcript(
                read, i, references=references, strand=args.strand)
            print "HT:"
            for i0, r in enumerate(
                    TableIO.parse(read.reads,
                                  "bam2bed12",
                                  references=references)):
                print "READ" + str(i0) + "\t", r
            print "COMPATIBLE:", compatible, "\n\n"
            if compatible: c_count += 1
            n += 1
        print "COMPATIBLE / ALL OVERLAP READS =  ", c_count, "/", n
        print "RATIO\t%.4f" % float(c_count) / n
Example #7
0
def Main():
    '''
    Test driver for TableIO.parse(file.bam,"bam2bed"): parse the input and
    write every parsed record to the output, one per line.

    Sets the globals `args` and `out`. A format of "guess" is replaced in
    args.format by what Tools.guess_format returns for the input name.
    '''
    global args,out
    args=ParseArg()
    # NOTE(review): fin is opened here but nothing reads from it; the parser
    # below is handed args.input itself. Confirm this is intended.
    fin=IO.fopen(args.input,"r")
    out=IO.fopen(args.output,"w")
    if args.format=="guess":
        args.format=Tools.guess_format(args.input)
    for record in TableIO.parse(args.input,args.format):
        print >>out,record
Example #8
0
def Main():
    '''
    Query a database with every entry of an input table and report the hits.

    Written to the global `out`: a provenance header, then per query a "QR"
    line followed by its "HT" hit lines (left out when args.silence is
    set). Results that are neither array, list nor str are iterated; they
    also get an "HN" hit-count line and, when at least one Bed12 hit is
    compatible with a Bed12 query, a "CP" line. Three summary lines close
    the report. Progress is printed to stderr every 10 queries.

    Sets the globals `args` and `out`; sys.argv[0] is reduced in place to
    its last "/"-separated part.
    '''
    global args,out
    args=ParseArg()
    query_kwargs={}

    # Output handle; stdout doubles as the fallback.
    out=sys.stdout
    if args.output!="stdout":
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout

    argv=sys.argv
    argv[0]=argv[0].split("/")[-1]
    print >>out,"# This data was generated by program ",argv[0],"(version %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :\n#\t"," ".join(argv)

    if args.query_method:
        query_kwargs["method"]=args.query_method
    dbi=DBI.init(args.db,args.dbformat)

    source=args.input
    if source=="stdin":
        source=sys.stdin

    queries_with_hits=0
    query_count=0
    query_length=0
    hits_number=0
    for (idx,x) in enumerate(TableIO.parse(source,args.input_format)):
        if idx%10==0:
            print >>sys.stderr,"query ",idx," entries\r",
        print >>out,"QR\t",x
        hit=0
        query_count+=1
        query_length+=len(x)
        results=dbi.query(x,**query_kwargs)
        compatible=0
        verbose=not args.silence

        if isinstance(results,(numpy.ndarray,list,str)):
            # A whole array/list/str answer counts as a single hit.
            if verbose:
                print >>out,"HT\t",
                if isinstance(results,str):
                    print >>out,results
                else:
                    for value in results:
                        print >>out,str(value)+",",
                    print >>out,""
            hit=1
            hits_number+=1
        else:
            this_query_hits=0
            for j in results:
                if verbose:
                    print >>out,"HT\t",j,
                hit=1
                hits_number+=1
                this_query_hits+=1
                if isinstance(j,xplib.Annotation.Bed12) and isinstance(x,xplib.Annotation.Bed12):
                    compatible_binary=Tools.compatible_with_transcript(j,x)
                    if verbose:
                        print >>out,"\tCompatible:",compatible_binary
                    if compatible_binary:
                        compatible+=1
                elif verbose:
                    # close the "HT" line left open above
                    print >>out,""
            print >>out,"HN\t",this_query_hits
            if compatible>0:
                print >>out,"CP\t",compatible

        if args.dbformat=="tabix":
            # Second pass with "chr" removed from the chromosome name.
            x.chr=x.chr.replace("chr","")
            for j in dbi.query(x,**query_kwargs):
                print >>out,"HT\t",j
                hit=1
                hits_number+=1
        queries_with_hits+=hit

    print >>out,"# Query Number:",query_count,"\n# Query Have Hits:",queries_with_hits
    print >>out,"# Query Length:",query_length
    print >>out,"# Hits Number:",hits_number
Example #9
0
def Main():
    '''
    Query a database with every entry of an input table and list the hits.

    Written to the global `out`: a provenance header, a "QR" line per
    query followed by its "HT" lines, and three summary lines. Progress is
    printed to stderr every 100 queries.

    A database format of "guess" is resolved first: a "gz" suffix selects
    "tabix" (with args.tabix_format guessed from the name), anything else
    takes the guessed format directly. The same applies to an input format
    of "guess". Sets the globals `args` and `out`; sys.argv[0] is reduced
    in place to its last "/"-separated part.
    '''
    global args, out
    args = ParseArg()
    query_kwargs = {}

    # Output handle; stdout doubles as the fallback.
    out = sys.stdout
    if args.output != "stdout":
        try:
            out = open(args.output, "w")
        except IOError:
            print >> sys.stderr, "can't open file ", args.output, "to write. Using stdout instead"
            out = sys.stdout

    argv = sys.argv
    argv[0] = argv[0].split("/")[-1]
    print >> out, "# This data was generated by program ", argv[
        0], "(version %s)" % VERSION,
    print >> out, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >> out, "# Date: ", time.asctime()
    print >> out, "# The command line is :\n#\t", " ".join(argv)

    init_kwargs = {}
    if args.dbformat == "guess":
        if Tools.suffix(args.db) == "gz":
            args.dbformat = "tabix"
            args.tabix_format = Tools.guess_format(args.db)
        else:
            args.dbformat = Tools.guess_format(args.db)

    if args.query_method:
        query_kwargs["method"] = args.query_method
    if args.tabix_format:
        init_kwargs["tabix"] = args.tabix_format

    dbi = DBI.init(args.db, args.dbformat, **init_kwargs)

    source = args.input
    if source == "stdin":
        source = sys.stdin

    queries_with_hits = 0
    query_count = 0
    query_length = 0
    hits_number = 0
    if args.input_format == "guess":
        args.input_format = Tools.guess_format(args.input)

    for (idx, x) in enumerate(TableIO.parse(source, args.input_format)):
        if idx % 100 == 0:
            print >> sys.stderr, "query ", idx, " entries\r",
        print >> out, "QR\t", x
        hit = 0
        query_count += 1
        query_length += len(x)
        results = dbi.query(x, **query_kwargs)

        if isinstance(results, (numpy.ndarray, list, str)):
            # A whole array/list/str answer counts as a single hit.
            print >> out, "HT\t",
            if isinstance(results, str):
                print >> out, results
            else:
                for value in results:
                    print >> out, str(value) + ",",
                print >> out, ""
            hit = 1
            hits_number += 1
        else:
            for j in results:
                print >> out, "HT\t", j
                hit = 1
                hits_number += 1

        if args.dbformat == "tabix":
            # Second pass with "chr" removed from the chromosome name.
            x.chr = x.chr.replace("chr", "")
            for j in dbi.query(x, **query_kwargs):
                print >> out, "HT\t", j
                hit = 1
                hits_number += 1
        queries_with_hits += hit

    print >> out, "# Query Number:", query_count, "\n# Query Have Hits:", queries_with_hits
    print >> out, "# Query Length:", query_length
    print >> out, "# Hits Number:", hits_number
Example #10
0
def compare_reads(isoforms):
    '''
    Estimate which share of the reads each isoform of one group explains
    and print the result to the global `out`.

    isoforms is an indexable group of annotations; their chr, start, stop
    and score attributes are read. Reads are fetched from the global `dbi`
    (method "fetch12") over the span of the whole group. Every read is
    reduced to a bit code recording which isoforms it is compatible with
    (Tools.compatible_with_transcript). Proportions start uniform and are
    re-estimated until two successive estimates are closer than 1e-05
    according to distance(). With args.BYY the soft iteration runs for at
    most 11 rounds and is followed by a hard-cut iteration in which every
    code is credited entirely to its currently largest compatible isoform.

    Output: a "REGION" and an "ISOFORM_INPUT_NUMBER" line, then one "HT"
    line per isoform whose proportion exceeds args.threshold and, unless
    args.hits_only is set, one "NT" line for each of the others, then
    "//". A printed isoform whose score is 0.0 receives its proportion as
    score first. An empty group produces no output.
    '''
    # global dbi,out
    if not isoforms:
        return
    isoforms_set=list(isoforms)
    chrom=isoforms_set[0].chr
    min_start=min([i.start for i in isoforms_set])
    max_stop=max([i.stop for i in isoforms_set])
    transcript_region=Bed([chrom,min_start,max_stop])
    print >>out,"REGION\t",chrom,"\t",min_start,"\t",max_stop
    print >>out,"ISOFORM_INPUT_NUMBER\t",len(isoforms_set)

    # All the reads of the transcript region.
    reads_set=list(dbi.query(transcript_region,method="fetch12"))

    # Compare the two sets: one bit per isoform, first isoform in the
    # highest bit; count how many reads share each code.
    l=len(isoforms_set)
    bincodes={}
    for read in reads_set:
        bincode=0
        for isoform in isoforms_set:
            bincode<<=1
            if Tools.compatible_with_transcript(read,isoform):
                bincode+=1
        bincodes[bincode]=bincodes.get(bincode,0)+1
    total=len(reads_set)
    if total==0: total=0.001  # keeps the divisions below defined

    # The isoforms selected by a code never change: decode each code once.
    members={}
    for code in bincodes:
        members[code]=[j for j in range(l) if get_bit_n(j,l,code)]

    # Safety net against a pair of estimates that never gets close enough.
    max_rounds=1000

    # EM, starting from uniform proportions.
    proportion=[1.0/l for i in range(l)]
    iterate_time=0
    while True:
        # E step
        totals=[0.0 for i in range(l)]
        for code in bincodes:
            row_total=0.0
            for j in members[code]:
                row_total+=proportion[j]
            if row_total<=0.0:
                # no weight on any compatible isoform: nothing to share out
                continue
            for j in members[code]:
                totals[j]+=bincodes[code]*proportion[j]/row_total
        # M step. A new list every round: updating one list in place would
        # alias it with `proportion` and make the distance below always 0.
        new_proportion=[t/total for t in totals]
        dis=distance(proportion,new_proportion)
        proportion=new_proportion
        iterate_time+=1
        if dis<1e-05: break
        if args.BYY and iterate_time>10: break
        if iterate_time>=max_rounds: break

    # BYY hard-cut algorithm.
    hard_rounds=0
    while args.BYY:
        totals=[0.0 for i in range(l)]
        for code in bincodes:
            maxj=-1
            for j in members[code]:
                if maxj==-1 or proportion[j]>proportion[maxj]:
                    maxj=j
            if maxj!=-1: totals[maxj]+=bincodes[code]
        # M step, again into a new list (see above).
        new_proportion=[t/total for t in totals]
        dis=distance(proportion,new_proportion)
        proportion=new_proportion
        hard_rounds+=1
        if dis<1e-05: break
        if hard_rounds>=max_rounds: break

    # Print the isoforms.
    for i,x in enumerate(isoforms_set):
        if proportion[i]>args.threshold:
            label="HT\t"
        elif not args.hits_only:
            label="NT\t"
        else:
            continue
        if x.score==0.0:
            x.score=proportion[i]
        print >>out,label,x,"\t",proportion[i]
    print >>out,"//"
Example #11
0
def Main():
    '''
    Annotate every entry of the input with the genes and gene-derived
    regions it hits.

    The gene table (args.genetab) is loaded and one database each is built
    from its upstream regions, downstream regions, exons, introns, utr3
    and utr5 records. Each input entry is then printed to the global `out`
    followed by seven tab-separated columns: toIDs() of its query result
    against the gene table and against each of those six databases. A
    provenance header and a column header precede the data.

    Sets the globals `args` and `out`. An output that cannot be opened is
    replaced by stdout, an input that cannot be opened by stdin.
    '''
    global args,out
    args=ParseArg()

    # Output handle.
    out=sys.stdout
    if args.output!="stdout":
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout

    # Input handle; a name ending in ".gz" is read through gzip.
    if args.input=="stdin":
        fin=sys.stdin
    else:
        try:
            if args.input.split(".")[-1]=="gz":
                fin=gzip.open(args.input,"r")
            else:
                fin=open(args.input,"r")
        except IOError:
            print >>sys.stderr,"can't read file",args.input
            fin=sys.stdin

    print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :"
    print >>out,"#\t"," ".join(sys.argv)

    # Collect the derived regions of every gene.
    gene=DBI.init(args.genetab,args.gene_format)
    upstream_list,downstream_list=[],[]
    exons_list,introns_list=[],[]
    utr3_list,utr5_list=[],[]
    for g in gene:
        upstream_list.append(g.upstream(args.upstream))
        downstream_list.append(g.downstream(args.downstream))
        exons_list.extend(g.Exons())
        introns_list.extend(g.Introns())
        if g.utr3() is not None:
            utr3_list.append(g.utr3())
        if g.utr5() is not None:
            utr5_list.append(g.utr5())
    upstream=DBI.init(upstream_list,"bed")
    downstream=DBI.init(downstream_list,"bed")
    exons=DBI.init(exons_list,"bed")
    introns=DBI.init(introns_list,"bed")
    utr3=DBI.init(utr3_list,"genebed")
    utr5=DBI.init(utr5_list,"genebed")

    if args.format=="guess":
        args.format=Tools.guess_format(args.input)
    for (i0,i) in enumerate(TableIO.parse(fin,args.format)):
        if i0==0:
            # The column header follows the type of the first entry.
            if isinstance(i,Bed12):
                header="#chr\tstart\tend\tname\tscore\tstrand\tthick_start\tthick_end\titem_rgb\tblock_count\tblock_sizes\tblock_starts\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5"
            elif isinstance(i,GeneBed):
                header="#name\tchr\tstrand\tstart\tend\tcds_start\texon_count\texon_starts\texont_ends\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5"
            else:
                header="#chr\tstart\tend\tname\tscore\tstrand\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5"
            print >>out,header

        # The entry and its seven columns share one line: every print but
        # the last keeps the line open.
        print >>out,i,
        for db in (gene,upstream,downstream,exons,introns,utr3):
            print >>out,"\t",toIDs(db.query(i)),
        print >>out,"\t",toIDs(utr5.query(i))
Example #12
0
def Main():
    '''
    Run every entry of the input table as a query against a database and
    write a hit report to the global `out`.

    Report layout: provenance header; per query a "QR" line and its "HT"
    lines (omitted when args.silence is set); for results that are neither
    array, list nor str an "HN" line with the number of hits and, if any
    Bed12 hit is compatible with a Bed12 query, a "CP" line with their
    count; finally three summary lines. A progress note goes to stderr
    every 10 queries.

    Sets the globals `args` and `out`; sys.argv[0] is shortened in place
    to its last "/"-separated part.
    '''
    global args, out
    args = ParseArg()
    options = {}

    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            print >> sys.stderr, "can't open file ", args.output, "to write. Using stdout instead"
            out = sys.stdout

    argv = sys.argv
    argv[0] = argv[0].split("/")[-1]
    print >> out, "# This data was generated by program ", argv[
        0], "(version %s)" % VERSION,
    print >> out, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >> out, "# Date: ", time.asctime()
    print >> out, "# The command line is :\n#\t", " ".join(argv)

    if args.query_method:
        options["method"] = args.query_method
    dbi = DBI.init(args.db, args.dbformat)

    if args.input == "stdin":
        table = sys.stdin
    else:
        table = args.input

    n_with_hits = 0
    n_queries = 0
    query_length = 0
    hits_number = 0
    bed12 = xplib.Annotation.Bed12
    for (i0, x) in enumerate(TableIO.parse(table, args.input_format)):
        if i0 % 10 == 0:
            print >> sys.stderr, "query ", i0, " entries\r",
        print >> out, "QR\t", x
        n_queries += 1
        query_length += len(x)
        results = dbi.query(x, **options)
        show = not args.silence
        found = 0
        compatible = 0

        if isinstance(results, (numpy.ndarray, list)):
            # The values are joined on one "HT" line and count as one hit.
            if show:
                print >> out, "HT\t",
                for value in results:
                    print >> out, str(value) + ",",
                print >> out, ""
            found = 1
            hits_number += 1
        elif isinstance(results, str):
            if show:
                print >> out, "HT\t",
                print >> out, results
            found = 1
            hits_number += 1
        else:
            this_query_hits = 0
            for j in results:
                if show:
                    print >> out, "HT\t", j,
                found = 1
                hits_number += 1
                this_query_hits += 1
                if not (isinstance(j, bed12) and isinstance(x, bed12)):
                    # nothing to compare: just end the open "HT" line
                    if show:
                        print >> out, ""
                    continue
                compatible_binary = Tools.compatible_with_transcript(j, x)
                if show:
                    print >> out, "\tCompatible:", compatible_binary
                if compatible_binary:
                    compatible += 1
            print >> out, "HN\t", this_query_hits
            if compatible > 0:
                print >> out, "CP\t", compatible

        if args.dbformat == "tabix":
            # Query once more after removing "chr" from the chromosome name.
            x.chr = x.chr.replace("chr", "")
            for j in dbi.query(x, **options):
                print >> out, "HT\t", j
                found = 1
                hits_number += 1
        n_with_hits += found

    print >> out, "# Query Number:", n_queries, "\n# Query Have Hits:", n_with_hits
    print >> out, "# Query Length:", query_length
    print >> out, "# Hits Number:", hits_number
Example #13
0
def Main():
    """
    Print every input entry together with what it hits in a gene table.

    Six databases are derived from the genes of args.genetab (upstream,
    downstream, exons, introns, utr3, utr5). For each entry parsed from
    the input, one line is written to the global `out`: the entry, then
    toIDs() of its query result against the gene table and against each
    derived database, separated by tabs. The data is preceded by a
    provenance header and by a column header chosen from the type of the
    first entry.

    Sets the globals `args` and `out`. Falls back to stdout / stdin when
    the output / input file cannot be opened.
    """
    global args, out
    args = ParseArg()

    # ---- IO template -------------------------------------------------
    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            print >>sys.stderr, "can't open file ", args.output, "to write. Using stdout instead"
            out = sys.stdout
    if args.input == "stdin":
        fin = sys.stdin
    else:
        try:
            suffix = args.input.split(".")[-1]
            if suffix == "gz":
                fin = gzip.open(args.input, "r")
            else:
                fin = open(args.input, "r")
        except IOError:
            print >>sys.stderr, "can't read file", args.input
            fin = sys.stdin
    # ---- end of IO template ------------------------------------------

    print >> out, "# This data was generated by program ", sys.argv[0], " (version: %s)" % VERSION,
    print >> out, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >> out, "# Date: ", time.asctime()
    print >> out, "# The command line is :"
    print >> out, "#\t", " ".join(sys.argv)

    gene = DBI.init(args.genetab, args.gene_format)
    regions = {
        "upstream": [],
        "downstream": [],
        "exons": [],
        "introns": [],
        "utr3": [],
        "utr5": [],
    }
    for g in gene:
        regions["upstream"].append(g.upstream(args.upstream))
        regions["downstream"].append(g.downstream(args.downstream))
        for exon in g.Exons():
            regions["exons"].append(exon)
        for intron in g.Introns():
            regions["introns"].append(intron)
        # A missing utr is reported as None and left out.
        if g.utr3() is not None:
            regions["utr3"].append(g.utr3())
        if g.utr5() is not None:
            regions["utr5"].append(g.utr5())
    upstream = DBI.init(regions["upstream"], "bed")
    downstream = DBI.init(regions["downstream"], "bed")
    exons = DBI.init(regions["exons"], "bed")
    introns = DBI.init(regions["introns"], "bed")
    utr3 = DBI.init(regions["utr3"], "genebed")
    utr5 = DBI.init(regions["utr5"], "genebed")

    if args.format == "guess":
        args.format = Tools.guess_format(args.input)
    first = True
    for i in TableIO.parse(fin, args.format):
        if first:
            first = False
            if isinstance(i, Bed12):
                print >> out, "#chr\tstart\tend\tname\tscore\tstrand\tthick_start\tthick_end\titem_rgb\tblock_count\tblock_sizes\tblock_starts\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5"
            elif isinstance(i, GeneBed):
                print >> out, "#name\tchr\tstrand\tstart\tend\tcds_start\texon_count\texon_starts\texont_ends\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5"
            else:
                print >> out, "#chr\tstart\tend\tname\tscore\tstrand\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5"

        # One output line per entry: the trailing commas keep it open
        # until the last column has been written.
        print >> out, i,
        for database in (gene, upstream, downstream, exons, introns, utr3):
            print >> out, "\t", toIDs(database.query(i)),
        print >> out, "\t", toIDs(utr5.query(i))
Example #14
0
    def query(self,x,method='pileup'):
        '''
        Generator over the data of self.bamfiles that falls in the region
        x (x.chr, x.start, x.stop).

        method:
          'fetch'       one Bed per read: reference name, pos, aend,
                        qname, mapq, strand.
          'fetch12'     one Bed12 per read, its blocks computed from the
                        cigar by Tools.cigar_to_coordinates (test version).
          'paired_end'  the records TableIO.parse produces in
                        "bam2fragment" format for the fetched reads.
          'pileup'      (default) one row of four counters per position of
                        the region, summed over all bam files; the counter
                        of a base is selected through BamI.hNtToNum.
        Any other method yields nothing.

        'fetch' and 'fetch12' skip reads with tid<0 or mapq==0; 'fetch12'
        also skips reads without a cigar. When a pileup cannot be started
        a message is printed to stderr and the generator ends without
        yielding any row.
        '''
        if method=='fetch':
            for bamfile in self.bamfiles:
                for read in bamfile.fetch(x.chr,x.start,x.stop):
                    if read.tid<0:continue
                    if read.mapq==0:continue
                    strand='+'
                    if read.is_reverse:
                        strand='-'
                    yield Bed([bamfile.references[read.tid],read.pos,read.aend,read.qname,read.mapq,strand])
        elif method=='fetch12':
            for bamfile in self.bamfiles:
                for read in bamfile.fetch(x.chr,x.start,x.stop):
                    if read.tid<0:continue
                    if read.mapq==0:continue
                    # no cigar, no blocks to compute
                    if read.cigar is None:continue
                    strand='+'
                    if read.is_reverse:
                        strand='-'
                    start=read.pos
                    (block_starts,block_sizes)=Tools.cigar_to_coordinates(read.cigar)
                    # cds_start and cds_end are both the read start
                    yield Bed12([bamfile.references[read.tid],start,read.aend,read.qname,read.mapq,strand,start,start,"0,0,0",len(block_sizes),block_sizes,block_starts])
        elif method=="paired_end":
            for bamfile in self.bamfiles:
                for fragment in TableIO.parse(bamfile.fetch(x.chr,x.start,x.stop),"bam2fragment",bam=bamfile):
                    yield fragment
        elif method=='pileup':
            width=x.stop-x.start
            s=[[0,0,0,0] for row in range(width)]
            for bamfile in self.bamfiles:
                try:
                    A=bamfile.pileup(x.chr,x.start,x.stop)
                except Exception:
                    print >>sys.stderr,"Can't pile up",x.chr,x.start,x.stop
                    # A plain return ends a generator; raising StopIteration
                    # inside one is turned into an error by PEP 479.
                    return
                for pileupcolumn in A:
                    j=pileupcolumn.pos-x.start
                    if j<0: continue
                    # s has rows 0..width-1, so j==width is already outside
                    if j>=width: break
                    for pileupread in pileupcolumn.pileups:
                        try:
                            if pileupread.is_del: continue
                            if pileupread.indel!=0: continue
                            nt=pileupread.alignment.seq[pileupread.qpos]
                            if nt in BamI.hNtToNum:
                                s[j][BamI.hNtToNum[nt]]+=1
                        except Exception:
                            # best effort: a read that cannot be decoded is
                            # left out of the counts
                            continue
            for i in s:
                yield i
Example #15
0
def Main():
    '''
    Read the isoforms of the input, group the overlapping ones and hand
    every group to compare().

    The input is parsed as "bed" and sorted; a group is a maximal run of
    sorted entries that overlap (Tools.overlap) the span accumulated so
    far. A provenance header is written to the global `out` first and a
    progress note goes to stderr every 10 entries. The program exits with
    status 1 when the input holds no entry.

    Sets the globals `args`, `out` and `dbi` (the database opened from
    args.bam). Falls back to stdout / stdin when the output / input file
    cannot be opened.
    '''
    import copy
    global args,out,isoforms_set,selected_isoforms_set,reads_set,selected_reads_set,dbi
    args=ParseArg()
    if args.output=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    if args.input=="stdin":
        fin=sys.stdin
    else:
        try:
            # a name ending in ".gz" is read through gzip
            if args.input.split(".")[-1]=="gz":
                fin=gzip.open(args.input,"r")
            else:
                fin=open(args.input,"r")
        except IOError:
            print >>sys.stderr,"can't read file",args.input
            fin=sys.stdin

    print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :"
    print >>out,"#\t"," ".join(sys.argv)

    dbi=DBI.init(args.bam,args.format)

    # Read all the isoforms.
    beds=list(TableIO.parse(fin,"bed"))
    beds.sort()
    if not beds:
        print >>sys.stderr,"error in reading file",args.input
        sys.exit(1)

    isoforms=[]
    # Span of the current group. It is a copy because its stop is pushed
    # out as the group grows; working on the isoform object itself would
    # rewrite the stop of the first isoform of every group.
    bed=copy.copy(beds[0])
    for j,i in enumerate(beds):
        if (j+1)%10==0: print >>sys.stderr,"processed %d entries\r"%(j+1),
        if Tools.overlap(bed,i):
            if bed.stop < i.stop:
                bed.stop=i.stop
            isoforms.append(i)
        else:
            # i opens a new group: process the finished one first
            compare(isoforms)
            isoforms=[i]
            bed=copy.copy(i)
    if isoforms:
        compare(isoforms)
Example #16
0
def Main():
    '''
    Query a database with every entry of the input and report the hits.

    The output starts with a provenance header; each query is then echoed
    as a "QR" line followed by its results as "HT" lines, and the run ends
    with summary counters (queries, queries with hits, accumulated query
    length, number of hits).  Progress goes to stderr every 100 queries.

    args and out are published as module globals; nothing is returned.
    '''
    global args, out
    args = ParseArg()
    query_opts = {}   # keyword arguments forwarded to every dbi.query() call

    # Resolve the output stream, falling back to stdout when the file
    # cannot be opened.
    if args.output != "stdout":
        try:
            out = open(args.output, "w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out = sys.stdout
    else:
        out = sys.stdout

    # Report the program by its base name (this rewrites sys.argv[0] in place).
    sys.argv[0] = sys.argv[0].split("/")[-1]
    print >>out,"# This data was generated by program ",sys.argv[0],"(version %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :\n#\t"," ".join(sys.argv)

    init_opts = {}    # keyword arguments for DBI.init()
    if args.dbformat == "guess":
        if Tools.suffix(args.db) != "gz":
            args.dbformat = Tools.guess_format(args.db)
        else:
            # A gzipped database is treated as tabix; the guessed format
            # becomes the tabix record format.
            args.dbformat = "tabix"
            args.tabix_format = Tools.guess_format(args.db)

    if args.query_method:
        query_opts["method"] = args.query_method
    if args.tabix_format:
        init_opts["tabix"] = args.tabix_format

    dbi = DBI.init(args.db, args.dbformat, **init_opts)

    source = sys.stdin if args.input == "stdin" else args.input
    if args.input_format == "guess":
        args.input_format = Tools.guess_format(args.input)

    queries_with_hits = 0
    query_count = 0
    total_query_length = 0
    hit_count = 0
    for idx, region in enumerate(TableIO.parse(source, args.input_format)):
        if idx % 100 == 0:
            print >>sys.stderr,"query ",idx," entries\r",
        print >>out,"QR\t",region
        found = 0
        query_count += 1
        total_query_length += len(region)
        results = dbi.query(region, **query_opts)

        if isinstance(results, (numpy.ndarray, list)):
            # Array-like result: a single comma separated HT line.
            print >>out,"HT\t",
            for value in results:
                print >>out,str(value)+",",
            print >>out,""
            found = 1
            hit_count += 1
        elif isinstance(results, str):
            print >>out,"HT\t",
            print >>out,results
            found = 1
            hit_count += 1
        else:
            # Any other iterable: one HT line per yielded record.
            for record in results:
                print >>out,"HT\t",record
                found = 1
                hit_count += 1

        if args.dbformat == "tabix":
            # Query a second time with "chr" removed from the chromosome name.
            region.chr = region.chr.replace("chr", "")
            for record in dbi.query(region, **query_opts):
                print >>out,"HT\t",record
                found = 1
                hit_count += 1
        queries_with_hits += found

    print >>out,"# Query Number:",query_count,"\n# Query Have Hits:",queries_with_hits
    print >>out,"# Query Length:",total_query_length
    print >>out,"# Hits Number:",hit_count
Example #17
0
File: DB.py Project: nimezhu/xplib
    def query(self, x=None, method="pileup", **kwargs):
        """Query the BAM files in ``self.bamfiles``; results are yielded lazily.

        This is a generator: nothing is read until it is iterated.

        Args:
            x: the region to query, given as one of
                * a string ``"chrom"`` or ``"chrom:start-end"``; ``start``
                  is decremented by one (presumably converting a 1-based
                  start to 0-based -- confirm with callers);
                * a tuple indexed by the module constants ``CHROM``,
                  ``CHROMSTART`` and ``CHROMEND``;
                * any other object exposing ``chr``, ``start`` and ``stop``;
                * ``None``, which leaves chrom/start/end as ``None``.
            method: what to produce
                * ``"fetch"``: a ``Bed`` per read with ``tid >= 0`` and
                  ``mapq != 0``;
                * ``"fetch12"``: a ``Bed12`` per such read, blocks taken
                  from ``Tools.cigar_to_coordinates(read.cigar)``;
                * ``"bam1"`` / ``"bam1tuple"``: records produced by
                  ``TableIO.parse`` in ``"bam2bed12"`` /
                  ``"bam2bed12tuple"`` format;
                * ``"paired_end"``: fragments produced by ``TableIO.parse``
                  in ``"bam2fragment"`` format;
                * ``"bam2"`` / ``"bam2tuple"`` / ``"bam2tuple_fast"``: each
                  fragment converted with ``toBed12`` / ``toBed12Tuple``;
                * ``"pileup"``: one 4-element count list per position of
                  ``[start, end)``, indexed through ``BamI.hNtToNum``;
                  needs numeric ``start`` and ``end``;
                * ``"count"``: one number, the sum of ``bamfile.count``;
                * ``"count_fragment"``: one number, the fragment total;
                * ``"references"`` / ``"lengths"``: the entries of the first
                  BAM file's ``references`` / ``lengths``.
            **kwargs: ``strand`` is honoured by the ``bam1*`` methods
                (default ``"read1"``) and by the ``bam2*`` methods (only
                passed on when given).
        """
        # Region-free methods ("references", "lengths") and whole-file
        # fetches must not depend on unbound locals.
        chrom = start = end = None
        if isinstance(x, str):
            fields = x.split(":")
            chrom = fields[0]
            if len(fields) > 1:
                bounds = fields[1].split("-")
                if len(bounds) == 2:
                    start = int(bounds[0]) - 1
                    end = int(bounds[1])
        elif isinstance(x, tuple):
            chrom = x[CHROM]
            start = x[CHROMSTART]
            end = x[CHROMEND]
        elif x is not None:
            chrom = x.chr
            start = x.start
            end = x.stop

        if method == "fetch":
            for bamfile in self.bamfiles:
                for read in bamfile.fetch(chrom, start, end):
                    if read.tid < 0 or read.mapq == 0:
                        continue
                    strand = "-" if read.is_reverse else "+"
                    yield Bed(
                        [bamfile.references[read.tid], read.pos, read.aend, read.qname, read.mapq, strand]
                    )
        elif method == "fetch12":
            # Test version: still exercising Tools.cigar_to_coordinates.
            for bamfile in self.bamfiles:
                for read in bamfile.fetch(chrom, start, end):
                    if read.tid < 0 or read.mapq == 0:
                        continue
                    strand = "-" if read.is_reverse else "+"
                    # Keep the read coordinates in their own names: reusing
                    # start/end here made every BAM file after the first be
                    # fetched over the last read's span, not the query region.
                    read_start = read.pos
                    (block_starts, block_sizes) = Tools.cigar_to_coordinates(read.cigar)
                    yield Bed12(
                        [
                            bamfile.references[read.tid],
                            read_start,
                            read.aend,
                            read.qname,
                            read.mapq,
                            strand,
                            read_start,  # cds_start
                            read_start,  # cds_end, deliberately equal to cds_start
                            "0,0,0",  # itemRgb
                            len(block_sizes),
                            block_sizes,
                            block_starts,
                        ]
                    )
        elif method == "bam1":
            # fetch read from paired end with strand information
            # TODO: if bamfiles have different read1 or read2 ?
            strand = kwargs.get("strand", "read1")
            for bamfile in self.bamfiles:
                for bed in TableIO.parse(
                    bamfile.fetch(chrom, start, end), "bam2bed12", references=chrom, strand=strand
                ):
                    yield bed

        elif method == "paired_end":
            for bamfile in self.bamfiles:
                for fragment in TableIO.parse(bamfile.fetch(chrom, start, end), "bam2fragment", bam=bamfile):
                    yield fragment
        elif method == "bam2":  # yield bed12
            for bamfile in self.bamfiles:
                for fragment in TableIO.parse(bamfile.fetch(chrom, start, end), "bam2fragment", bam=bamfile):
                    if "strand" in kwargs:
                        yield fragment.toBed12(chr=chrom, strand=kwargs["strand"])
                    else:
                        yield fragment.toBed12(chr=chrom)
        elif method == "bam1tuple":
            # TODO: if bamfiles have different read1 or read2 ?
            strand = kwargs.get("strand", "read1")
            for bamfile in self.bamfiles:
                for bed in TableIO.parse(
                    bamfile.fetch(chrom, start, end), "bam2bed12tuple", references=chrom, strand=strand
                ):
                    yield bed
        elif method == "bam2tuple":
            for bamfile in self.bamfiles:
                for fragment in TableIO.parse(bamfile.fetch(chrom, start, end), "bam2fragment", bam=bamfile):
                    if "strand" in kwargs:
                        yield fragment.toBed12Tuple(chr=chrom, strand=kwargs["strand"])
                    else:
                        yield fragment.toBed12Tuple(chr=chrom)

        elif method == "bam2tuple_fast":
            # Same as "bam2tuple" but the parser is not handed the BAM file.
            for bamfile in self.bamfiles:
                for fragment in TableIO.parse(bamfile.fetch(chrom, start, end), "bam2fragment"):
                    if "strand" in kwargs:
                        yield fragment.toBed12Tuple(chr=chrom, strand=kwargs["strand"])
                    else:
                        yield fragment.toBed12Tuple(chr=chrom)
        elif method == "pileup":
            width = end - start
            s = [[0, 0, 0, 0] for _ in range(width)]
            for bamfile in self.bamfiles:
                try:
                    columns = bamfile.pileup(chrom, start, end)
                except Exception:
                    sys.stderr.write("Can't pile up %s %s %s\n" % (chrom, start, end))
                    # A plain return ends the generator; raising StopIteration
                    # inside a generator is a RuntimeError under PEP 479.
                    return
                for pileupcolumn in columns:
                    j = pileupcolumn.pos - start
                    if j < 0:
                        continue
                    # >= : j == width is already outside s (it used to be
                    # caught only by the blanket except below).
                    if j >= width:
                        break
                    for pileupread in pileupcolumn.pileups:
                        try:
                            if pileupread.is_del or pileupread.indel != 0:
                                continue
                            nt = pileupread.alignment.seq[pileupread.qpos]
                            if nt in BamI.hNtToNum:
                                s[j][BamI.hNtToNum[nt]] += 1
                        except Exception:
                            # Best effort: a read that cannot be decoded is
                            # skipped rather than aborting the whole pileup.
                            continue
            for i in s:
                yield i
        elif method == "count":
            s = 0
            for bamfile in self.bamfiles:
                s += bamfile.count(chrom, start, end)
            yield s
        elif method == "count_fragment":
            s = 0
            for bamfile in self.bamfiles:
                s += sum(
                    1 for _ in TableIO.parse(bamfile.fetch(chrom, start, end), "bam2fragment", bam=bamfile)
                )
            yield s
        elif method == "references":
            for i in self.bamfiles[0].references:
                yield i
        elif method == "lengths":
            for i in self.bamfiles[0].lengths:
                yield i