Esempio n. 1
0
def Main():
    """Query every interval in the input against one bamlist DBI per
    cell-line/histone-mark pair, printing query (QR) and hit (HT) records.

    Reads CLI options via ParseArg(); writes to args.output or stdout.
    Python 2 style: uses ``print >>`` redirection throughout.
    """
    global args,out
    # Hard-coded cell lines and histone marks ("input" = control track).
    CellLine=["H1"]
    HM=("input","H3K27ac","H3K27me3","H3K36me3","H3K4me1","H3K4me3","H3K9me3")
    marks=[]
    dbi={}

    args=ParseArg()
    if args.output=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.output,"w")
        except IOError:
            # Fall back to stdout rather than aborting on an unwritable path.
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    # One DBI handle per cell/mark combination, from fixed bamlist paths.
    # NOTE(review): the /data/zhuxp/... prefix is machine-specific — confirm.
    for cell in CellLine:
        for hm in HM:
            mark=cell+"_"+hm
            marks.append(mark)
            dbi[mark]=DBI.init("/data/zhuxp/bam2x/data/bamlist/"+mark+".bamlist","bamlist")
    for i,x in enumerate(TableIO.parse(args.input,args.input_format)):
        print >>out,"QR\t",x
        # Progress indicator on stderr; '\r' keeps it on one line.
        if i%100==0: print >>sys.stderr,"query %d entries\r"%i,
        for mark in marks:
            print >>out,mark,"\t"
            for j in DBI.query(x,dbi[mark]):
                print >>out,"HT\t",j
Esempio n. 2
0
def Main():
    """Annotate each input interval with RNA type/name/subtype and print one
    line per record, followed by per-type '#'-prefixed count summary lines.

    Output columns: chr, start, stop, id, name, strand, type, subtype.
    """
    global args,out
    args=ParseArg()
    if args.output=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.output,"w")
        except IOError:
            # Fall back to stdout rather than aborting on an unwritable path.
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout

    count={}
    dbi1=DBI.init(args.db,"bed") # the DBI init file for bed6 file of all kinds of RNA
    dbi2=DBI.init(args.db_detail,"bed") # the DBI init file for bed12 file of lincRNA and mRNA with intron, exon, UTR
    genome=Genome('mouse', Release=67, account=None)
    for bed in TableIO.parse(args.input,args.format):
        [typ,name,subtype]=annotation(bed,dbi1,dbi2,genome)
        # dict.get replaces the deprecated dict.has_key (removed in Python 3).
        count[typ]=count.get(typ,0)+1
        print >>out, "\t".join (str(f) for f in [bed.chr,bed.start,bed.stop,bed.id,name,bed.strand,typ, subtype])

    print >>out, "\n".join ("#"+typ+"\t%d"%(count[typ]) for typ in count.keys())
Esempio n. 3
0
def Main():
    '''
    IO TEMPLATE
    '''
    global args, out
    args = ParseArg()
    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            # Fall back to stdout rather than aborting on an unwritable path.
            print >> sys.stderr, "can't open file ", args.output, "to write. Using stdout instead"
            out = sys.stdout
    if args.input == "stdin":
        fin = sys.stdin
    else:
        try:
            x = args.input.split(".")
            if x[-1] == "gz":
                # Transparently read gzip-compressed input.
                fin = gzip.open(args.input, "r")
            else:
                fin = open(args.input, "r")
        except IOError:
            print >> sys.stderr, "can't read file", args.input
            fin = sys.stdin
    '''
    END OF IO TEMPLATE 
    '''
    # Provenance header: program name, version, date and full command line.
    print >> out, "# This data was generated by program ", sys.argv[
        0], " (version: %s)" % VERSION,
    print >> out, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >> out, "# Date: ", time.asctime()
    print >> out, "# The command line is :"
    print >> out, "#\t", " ".join(sys.argv)
    # Tabix-backed DBIs; the header file name is derived from the .gz path.
    enhancer_dbi = DBI.init(args.enhancer_tabix,
                            "tabix",
                            tabix="metabed",
                            header=re.sub(".gz$", ".header",
                                          args.enhancer_tabix))
    promoter_dbi = DBI.init(args.promoter_tabix,
                            "tabix",
                            tabix="metabed",
                            header=re.sub(".gz$", ".header",
                                          args.promoter_tabix))

    for i in TableIO.parse(fin, args.format):
        # Window of +/- args.size around each transcription start site.
        tss = i.tss()
        tss.start -= args.size
        tss.stop += args.size
        if tss.start < 0: tss.start = 0
        tss.id += "_near" + str(args.size)
        # NOTE(review): QR/EH/PM records go to stdout, not `out`, unlike the
        # header above — confirm whether that is intentional.
        print "QR\t", tss
        for e in enhancer_dbi.query(tss):
            print "EH\t", e
        for p in promoter_dbi.query(tss):
            print "PM\t", p
Esempio n. 4
0
def Main():
    '''
    IO TEMPLATE
    '''
    global args,out
    args=ParseArg()
    if args.output=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.output,"w")
        except IOError:
            # Fall back to stdout rather than aborting on an unwritable path.
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    if args.input=="stdin":
        fin=sys.stdin
    else:
        try:
            x=args.input.split(".")
            if x[-1]=="gz":
                # Transparently read gzip-compressed input.
                fin=gzip.open(args.input,"r")
            else:
                fin=open(args.input,"r")
        except IOError:
            print >>sys.stderr,"can't read file",args.input
            fin=sys.stdin
    '''
    END OF IO TEMPLATE 
    '''
    # Provenance header: program name, version, date and full command line.
    print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :"
    print >>out,"#\t"," ".join(sys.argv)
    # Tabix-backed DBIs; the header file name is derived from the .gz path.
    enhancer_dbi=DBI.init(args.enhancer_tabix,"tabix",tabix="metabed",header=re.sub(".gz$",".header",args.enhancer_tabix))
    promoter_dbi=DBI.init(args.promoter_tabix,"tabix",tabix="metabed",header=re.sub(".gz$",".header",args.promoter_tabix))

    for i in TableIO.parse(fin,args.format):
        # Window of +/- args.size around each transcription start site.
        tss=i.tss()
        tss.start-=args.size
        tss.stop+=args.size
        if tss.start<0: tss.start=0
        tss.id+="_near"+str(args.size)
        # NOTE(review): QR/EH/PM records go to stdout, not `out`, unlike the
        # header above — confirm whether that is intentional.
        print "QR\t",tss
        for e in enhancer_dbi.query(tss):
            print "EH\t",e
        for p in promoter_dbi.query(tss):
            print "PM\t",p
Esempio n. 5
0
def Main():
    '''
    IO TEMPLATE
    '''
    global args,out
    args=ParseArg()
    fin=IO.fopen(args.input,"r")
    out=IO.fopen(args.output,"w")
    '''
    END OF IO TEMPLATE 
    '''
    print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :"
    print >>out,"#\t"," ".join(sys.argv)
    dbi=DBI.init(args.db,Tools.guess_format(args.db))
    references=dbi.bamfiles[0].references
    for i in TableIO.parse(fin,args.format):
        print i
        n=0
        c_count=0
        reads=dbi.query(i,args.method)
        for read in reads:
            compatible=Tools.compatible_with_transcript(read,i,references=references,strand=args.strand)
            print "HT:"
            for i0,r in enumerate(TableIO.parse(read.reads,"bam2bed12",references=references)):
                print "READ"+str(i0)+"\t",r
            print "COMPATIBLE:",compatible,"\n\n"
            if compatible: c_count+=1
            n+=1
        print "COMPATIBLE / ALL OVERLAP READS =  ",c_count,"/",n
        print "RATIO\t%.4f"%float(c_count)/n
Esempio n. 6
0
def Main():
    '''
    IO TEMPLATE
    '''
    global args,out
    args=ParseArg()
    fin=IO.fopen(args.input,"r")
    out=IO.fopen(args.output,"w")
    '''
    END OF IO TEMPLATE 
    '''
    # Provenance header: program name, version, date and full command line.
    print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :"
    print >>out,"#\t"," ".join(sys.argv)

    # One bam DBI per sample; record each file name in the header.
    dbi=[];
    for i,bam in enumerate(args.bams):
        print >>out,"# SAMPLE_"+str(i+1)+" BAM File:",bam
        dbi.append(DBI.init(bam,"bam"))
    # Column header: VCF columns followed by one column per sample.
    print >>out,"#",VCF.header(),
    for i,bam in enumerate(args.bams):
        print >>out,"\t","Sample_"+str(i+1),
    print >>out,""
    for i,vcf in enumerate(TableIO.parse(fin,"vcf")):
        # Optionally prefix chromosome names (e.g. "chr") to match the bams.
        vcf.chr=args.chr_prefix+vcf.chr
        if(i%100==0):
            # Progress indicator on stderr; '\r' keeps it on one line.
            print >>sys.stderr,"processing",i,"vcf\r",
        print >>out,vcf,
        for d in dbi:
            print >>out,"\t",
            for r in d.query(vcf):
                print >>out,format(r),
        print >>out,""
Esempio n. 7
0
def Main():
    """Annotate paired interactions with per-sample RPKM values.

    Reads one data file per sample name (bam or bed), totals reads for RPKM
    normalization, then appends RPKM columns for both anchors of every
    interaction line and writes the result to args.output.
    """
    args = ParseArg()

    if len(args.data) != len(args.name):
        print >> sys.stderr, "ERROR: Number of data is not the same as number of names!"
        # NOTE(review): exits with status 0 on an error path — confirm intended.
        sys.exit(0)

    # store data information
    data = {}
    total_reads = {}
    for i in range(len(args.data)):
        temp_name = args.name[i]
        print >> sys.stderr, "\n Reading data file:" + temp_name + "..."
        total_reads[temp_name] = 0
        if args.format[i] == "bam":
            # Sum mapped-read counts from samtools idxstats (column 3).
            total_reads[temp_name] = reduce(lambda x, y: x + y, [
                int(l.rstrip('\n').split('\t')[2])
                for l in pysam.idxstats(args.data[i])
            ])
        else:
            Format = "bed"
            for b in TableIO.parse(args.data[i], Format):
                total_reads[temp_name] += 1
                if total_reads[temp_name] % 50000 == 0:
                    print >> sys.stderr, "  reading %d reads..\r" % (
                        total_reads[temp_name]),
        data[temp_name] = DBI.init(args.data[i], args.format[i])

    output = open(args.output, 'w')

    Input = open(args.input, 'r')
    lines = Input.read().split("\n")

    # header
    header = ["chr", "start", "end", "type", "name", "subtype", "count"
              ] + data.keys()
    print >> output, "\t".join(g + "_%d" % (f) for f in [1, 2]
                               for g in header) + "\tinteraction\tp-value"

    num = 0
    print >> sys.stderr, "Start process interactions:"
    for l in lines:
        if l.strip() == '': continue
        l = l.strip().split('\t')
        num = num + 1
        # Skip mitochondrial interactions on either anchor.
        if l[0] == "chrM" or l[7] == "chrM": continue
        C1 = Bed([l[0], int(l[1]), int(l[2])])
        C2 = Bed([l[7], int(l[8]), int(l[9])])
        rpkm1 = "\t".join(
            str(f) for f in
            [RPKM(C1, data[n], total_reads[n], n) for n in data.keys()])
        rpkm2 = "\t".join(
            str(f) for f in
            [RPKM(C2, data[n], total_reads[n], n) for n in data.keys()])
        print >> output, "\t".join(
            str(f) for f in l[:7] + [rpkm1] + l[7:14] + [rpkm2, l[14], l[15]])
        if num % 1000 == 0:
            print >> sys.stderr, "  Output interaction: %d\r" % (num),
Esempio n. 8
0
def Main():
    """Classify each input interval against gene annotations (5-UTR, 3-UTR,
    exon/intron via judge_exon, MiRNA, Non-coding, or Intergenic).

    Writes one annotated line per (interval, gene) pair to args.output and a
    per-category count summary to <output-prefix>.cisStat.
    """
    global args
    args=ParseArg()
    if args.output=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.output,"w")
        except IOError:
            # Fall back to stdout rather than aborting on an unwritable path.
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout

    dbi=DBI.init(args.db,"genebed")
    count={}
    count["Intergenic"]=0
    for x in TableIO.parse(args.input,args.input_format):
        flag=0
        gene=""
        for hit in dbi.query(x):
            flag=1
            # Collapse consecutive hits from the same gene.
            if hit.align_id==gene:
                continue
            gene=hit.align_id
            # cds_start == cds_stop marks a non-coding transcript.
            if (hit.cds_start==hit.cds_stop):
                if hit.align_id[0:3]=="Mir":
                    loc="MiRNA"
                else:
                    loc="Non-coding"
            elif hit.strand=="+":
                if x.stop<=hit.cds_start:
                    loc="5-UTR"
                elif x.start>=hit.cds_stop:
                    loc="3-UTR"
                else:
                    loc=judge_exon(x,hit)
            else:
                # Minus strand: the UTR sides are swapped.
                if x.stop<=hit.cds_start:
                    loc="3-UTR"
                elif x.start>=hit.cds_stop:
                    loc="5-UTR"
                else:
                    loc=judge_exon(x,hit)
            print >>out,"\t".join (str(f) for f in [x.chr,x.start,x.stop,x.id,x.score,x.strand,hit.align_id,loc])
            # dict.get replaces the deprecated dict.has_key (removed in Py3).
            count[loc]=count.get(loc,0)+1

        if flag==0:
            print >>out, "\t".join (str(f) for f in [x.chr,x.start,x.stop,x.id,x.score,x.strand,"None","Intergenic"])
            count["Intergenic"]+=1

    # NOTE(review): when args.output == "stdout" this writes "stdout.cisStat".
    out2=open(args.output.split(".")[0]+".cisStat","w")
    for key in sorted(count.keys()):
        print >>out2,key+"\t"+str(count[key])
Esempio n. 9
0
File: Query.py Project: yu68/bam2x
def Main():
    """Print each query interval (QR) followed by its database hits (HT).

    Each interval is queried twice: once as-is and once with the leading
    'chr' stripped from the chromosome name, to match databases using either
    chromosome-naming convention.
    """
    global args,out
    args=ParseArg()
    if args.output=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.output,"w")
        except IOError:
            # Fall back to stdout rather than aborting on an unwritable path.
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    dbi=DBI.init(args.db,args.dbformat)
    for x in TableIO.parse(args.input,args.input_format):
        # NOTE(review): results go to stdout even though `out` is set up
        # above — confirm whether printing to `out` was intended.
        print "QR\t",x
        for j in DBI.query(x,dbi):
                print "HT\t",j
        x.chr=x.chr.replace("chr","")
        for j in DBI.query(x,dbi):
                print "HT\t",j
Esempio n. 10
0
def ReadHistones(fhist_name):
    """Read a histone list file and build two groups of bed DBIs.

    The file layout is: one header line (discarded), then one bed path per
    line for the first group up to a line containing '@', then one bed path
    per line for the second group up to a blank line.  Only the first
    whitespace-separated field of each line is used as the path.
    """
    def _collect(handle, is_end):
        # Accumulate DBI handles until the section terminator is seen.
        group = []
        while True:
            entry = handle.readline().strip()
            if is_end(entry):
                break
            group.append(DBI.init(entry.split()[0], "bed"))
        return group

    with open(fhist_name, "r") as fhist:
        fhist.readline()  # discard the header line
        sp1 = _collect(fhist, lambda s: "@" in s)
        sp2 = _collect(fhist, lambda s: s == "")
    return sp1, sp2
Esempio n. 11
0
def Main():
    """Annotate paired interactions with per-sample RPKM values.

    Reads one data file per sample name (bam or bed), totals reads for RPKM
    normalization, then appends RPKM columns for both anchors of every
    interaction line and writes the result to args.output.
    """
    args=ParseArg()

    if len(args.data)!=len(args.name):
        print >> sys.stderr, "ERROR: Number of data is not the same as number of names!"
        sys.exit(0)

    # store data information
    data={}
    total_reads={}
    for i in range(len(args.data)):
        temp_name=args.name[i]
        print >> sys.stderr, "\n Reading data file:"+temp_name+"..."
        total_reads[temp_name]=0
        if args.format[i]=="bam":
            # Sum mapped-read counts from samtools idxstats (column 3).
            total_reads[temp_name] = reduce(lambda x, y: x + y, [ int(l.rstrip('\n').split('\t')[2]) for l in pysam.idxstats(args.data[i])])
        else:
            Format="bed"
            for b in TableIO.parse(args.data[i],Format):
                total_reads[temp_name]+=1
                if total_reads[temp_name]%50000==0:
                    print >> sys.stderr, "  reading %d reads..\r"%(total_reads[temp_name]),
        data[temp_name]=DBI.init(args.data[i],args.format[i])

    output=open(args.output,'w')

    Input=open(args.input,'r')
    lines=Input.read().split("\n")

    # header
    header=["chr","start","end","type","name","subtype","count"]+data.keys()
    print >> output, "\t".join(g+"_%d"%(f) for f in [1,2] for g in header)+"\tinteraction\tp-value"

    num=0
    print >> sys.stderr, "Start process interactions:"
    for l in lines:
        if l.strip()=='': continue
        l=l.strip().split('\t')
        num=num+1
        # Skip mitochondrial interactions on either anchor.
        if l[0]=="chrM" or l[7]=="chrM": continue
        C1=Bed([l[0],int(l[1]),int(l[2])])
        C2=Bed([l[7],int(l[8]),int(l[9])])
        rpkm1="\t".join (str(f) for f in [RPKM(C1,data[n],total_reads[n],n) for n in data.keys()])
        rpkm2="\t".join (str(f) for f in [RPKM(C2,data[n],total_reads[n],n) for n in data.keys()])
        print >> output, "\t".join(str(f) for f in l[:7]+[rpkm1]+l[7:14]+[rpkm2,l[14],l[15]])
        # BUG FIX: this line was indented with a literal tab amid the
        # surrounding spaces (TabError under Python 3, fragile in Python 2);
        # normalized to 8 spaces to match the loop body.
        if num%1000==0:
            print >> sys.stderr, "  Output interaction: %d\r"%(num),
Esempio n. 12
0
def Main():
    """Count weighted fragment coverage around each nucleosome center for
    every bed file and append one count column per sample to the Danpos
    nucleosome table.
    """
    args = ParseArg()

    #store bed files with indexing and count information:
    bed = {}

    print >> sys.stderr, "Starting index bed files:"
    for i in range(len(args.beds)):
        temp_name = args.name[i]
        print >> sys.stderr, "  #Indexing for bed file of", temp_name, "\r",
        bed[temp_name] = DBI.init(args.beds[i], 'bed')

    half_len = int(args.len)
    print >> sys.stderr
    print >> sys.stderr, "Reading nucleosome peak xls file from Danpos."
    nucleosomes = TableIO.parse(args.nucleosome, 'metabed', header=True)

    print >> sys.stderr, "Start Counting..."
    count_matrix = []

    out = open(args.output, "w")
    # Echo the Danpos header and append one column per sample name.
    line_head = open(args.nucleosome, 'r').readline().strip()
    line_head = line_head + "\t" + "\t".join(str(f) for f in args.name)
    print >> out, line_head
    for i in nucleosomes:
        chrom = i.chr

        # Autosomes only: skip sex and mitochondrial chromosomes.
        if chrom == 'chrY' or chrom == 'chrX' or chrom == 'chrM':
            continue
        # Integer midpoint of the peak (Python 2 '/' truncates for ints).
        center = int(i.start + i.end) / 2
        count = np.zeros(len(args.beds), dtype="float")
        line = str(i)
        for k, name in enumerate(bed.keys()):
            # NOTE(review): `ma` is not defined in this function — presumably
            # a module-level window constant (see the /25.0 taper below);
            # confirm.
            for j in bed[name].query(
                    Bed([
                        chrom, center - ma - (half_len - 75),
                        center + ma + (half_len - 75)
                    ])):
                j_center = find_center(j, half_len)
                # Weight is 1 near the center, tapering linearly to 0 over
                # 25 bp beyond the `ma` window.
                weight = max(min(1, (ma - abs(j_center - center)) / 25.0), 0)
                count[k] += weight
        line = line + "\t" + "\t".join(str(f) for f in count)
        print >> out, line
        count_matrix.append(count)
Esempio n. 13
0
def Main():
    """Partition the input queries into args.thread slices and launch one
    fquery() worker thread per slice.

    NOTE(review): workers started with thread.start_new_thread are never
    joined — Main returns immediately, so the process may exit before the
    workers finish. Confirm whether the caller synchronizes elsewhere.
    """
    global args,out
    args=ParseArg()
    if args.output=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.output,"w")
        except IOError:
            # Fall back to stdout rather than aborting on an unwritable path.
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    argv=sys.argv
    argv[0]=argv[0].split("/")[-1]
    # Provenance header: program name, version, date and full command line.
    print >>out,"# This data was generated by program ",argv[0],"(version %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :\n#\t"," ".join(argv)
   
    dbi=DBI.init(args.db,args.dbformat)
    hits=0
    query=0
    if args.input=="stdin":
        input=sys.stdin
    else:
        input=args.input

    query_length=0
    hits_number=0
    # Materialize all queries so they can be sliced per thread.
    query_sets=[]
    for x in TableIO.parse(input,args.input_format):
        query_sets.append(x)
    length=len(query_sets)
    
    # Integer slice size; the last thread absorbs any remainder.
    size=length/args.thread
    #results=[[] for i in range(args.thread)]
    for i in range(args.thread):
        end= (i+1)*size if (i+1)*size < length else length
        end= end if (i+1)!=args.thread else length
        querys=query_sets[i*size:end]
     #   print i*size,end
     #   for j in querys: print j
        thread.start_new_thread(fquery,(querys,dbi,i))
Esempio n. 14
0
def Main():
    """Count weighted fragment coverage around each nucleosome center for
    every bed file and append one count column per sample to the Danpos
    nucleosome table.
    """
    args=ParseArg()

    #store bed files with indexing and count information:
    bed={}

    print >>sys.stderr,"Starting index bed files:"
    for i in range(len(args.beds)):
        temp_name=args.name[i]
        print >>sys.stderr,"  #Indexing for bed file of",temp_name,"\r",
        bed[temp_name]=DBI.init(args.beds[i],'bed')

    half_len=int(args.len)
    print >>sys.stderr
    print >>sys.stderr,"Reading nucleosome peak xls file from Danpos."
    nucleosomes=TableIO.parse(args.nucleosome,'metabed',header=True)

    print >>sys.stderr,"Start Counting..."
    count_matrix=[]


    out=open(args.output,"w")
    # Echo the Danpos header and append one column per sample name.
    line_head=open(args.nucleosome,'r').readline().strip()
    line_head=line_head+"\t"+"\t".join(str(f) for f in args.name)
    print >>out,line_head
    for i in nucleosomes:
        chrom=i.chr
      
        # Autosomes only: skip sex and mitochondrial chromosomes.
        if chrom == 'chrY' or chrom == 'chrX' or chrom == 'chrM':
            continue
        # Integer midpoint of the peak (Python 2 '/' truncates for ints).
        center=int(i.start+i.end)/2
        count=np.zeros(len(args.beds),dtype="float")
        line=str(i)
        for k,name in enumerate(bed.keys()):
            # NOTE(review): `ma` is not defined in this function — presumably
            # a module-level window constant (see the /25.0 taper below);
            # confirm.
            for j in bed[name].query(Bed([chrom,center-ma-(half_len-75),center+ma+(half_len-75)])):
                j_center=find_center(j,half_len)
                # Weight is 1 near the center, tapering to 0 over 25 bp.
                weight = max(min(1,(ma-abs(j_center-center))/25.0),0)
                count[k]+=weight
        line = line + "\t" + "\t".join(str(f) for f in count)
        print >>out,line
        count_matrix.append(count)
Esempio n. 15
0
def Main():
    """For each input region, extend +/- ext_dis around its midpoint and
    report nearby genes from the annotation DBI.

    Output: one tab-separated line per gene: region id, gene name, distance.
    """
    args = ParseArg()

    anno = DBI.init(args.annotation, "bed")
    ext_dis = args.ext_dis
    target_num = args.target_num

    with open(args.input, "r") as fin, open(args.output, "w") as fout:
        for line in fin:
            bed_region = Bed(line.strip().split())
            # Integer midpoint (Python 2 '/' truncates for ints).
            mid_point = (bed_region.start + bed_region.stop) / 2
            ori_start = bed_region.start
            ori_stop = bed_region.stop
            # Re-center the region on the midpoint, +/- ext_dis.
            bed_region.start = mid_point - ext_dis
            bed_region.stop = mid_point + ext_dis

            # NOTE(review): gene tuples appear to be (distance, name) given
            # the print order below — confirm against findNearbyGene.
            gene_list = findNearbyGene(bed_region, anno, ori_start, ori_stop,
                                       target_num)
            for gene in gene_list:
                print >> fout, "\t".join(
                    [bed_region.id, gene[1],
                     str(gene[0])])
Esempio n. 16
0
def Main():
    '''
    IO TEMPLATE
    '''
    global args, out
    args = ParseArg()
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")
    '''
    END OF IO TEMPLATE 
    '''
    print >> out, "# This data was generated by program ", sys.argv[
        0], " (version: %s)" % VERSION,
    print >> out, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >> out, "# Date: ", time.asctime()
    print >> out, "# The command line is :"
    print >> out, "#\t", " ".join(sys.argv)
    dbi = DBI.init(args.db, Tools.guess_format(args.db))
    references = dbi.bamfiles[0].references
    for i in TableIO.parse(fin, args.format):
        print i
        n = 0
        c_count = 0
        reads = dbi.query(i, args.method)
        for read in reads:
            compatible = Tools.compatible_with_transcript(
                read, i, references=references, strand=args.strand)
            print "HT:"
            for i0, r in enumerate(
                    TableIO.parse(read.reads,
                                  "bam2bed12",
                                  references=references)):
                print "READ" + str(i0) + "\t", r
            print "COMPATIBLE:", compatible, "\n\n"
            if compatible: c_count += 1
            n += 1
        print "COMPATIBLE / ALL OVERLAP READS =  ", c_count, "/", n
        print "RATIO\t%.4f" % float(c_count) / n
Esempio n. 17
0
def Main():
    """Print per-position coverage over a +/-1kb core-promoter window for
    each input entry; values are reversed for minus-strand entries so the
    positions read 5'->3'."""
    global args, out
    args = ParseArg()
    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            # Fall back to stdout rather than aborting on an unwritable path.
            print >> sys.stderr, "can't open file ", args.output, "to write. Using stdout instead"
            out = sys.stdout
    argv = sys.argv
    argv[0] = argv[0].split("/")[-1]
    # Provenance header: program name, version, date and full command line.
    print >> out, "# This data was generated by program ", argv[
        0], "(version %s)" % VERSION,
    print >> out, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >> out, "# Date: ", time.asctime()
    print >> out, "# The command line is :\n#\t", " ".join(argv)

    dbi = DBI.init(args.bam, "bam")
    if args.input == "stdin":
        input = sys.stdin
    else:
        input = args.input

    for x in TableIO.parse(input, args.input_format):
        # 1kb upstream + 1kb downstream window around the core promoter.
        promoter = x.core_promoter(1000, 1000)
        print >> out, x
        print >> out, promoter
        retv = []
        for (i, r) in enumerate(dbi.query(promoter)):
            # NOTE(review): sum(r) presumably collapses one bin of per-base
            # values — confirm against the bam DBI's return shape.
            retv.append(sum(r))
        if x.strand == "-":
            # Reverse so values run 5'->3' on the minus strand.
            retv = retv[::-1]
        for i in retv:
            print >> out, i,
        print >> out, ""
Esempio n. 18
0
def Main():
    """Print per-position coverage over a +/-1kb core-promoter window for
    each input entry; values are reversed for minus-strand entries so the
    positions read 5'->3'."""
    global args,out
    args=ParseArg()
    if args.output=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.output,"w")
        except IOError:
            # Fall back to stdout rather than aborting on an unwritable path.
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    argv=sys.argv
    argv[0]=argv[0].split("/")[-1]
    # Provenance header: program name, version, date and full command line.
    print >>out,"# This data was generated by program ",argv[0],"(version %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :\n#\t"," ".join(argv)
   
    dbi=DBI.init(args.bam,"bam")
    if args.input=="stdin":
        input=sys.stdin
    else:
        input=args.input

    for x in TableIO.parse(input,args.input_format):
        # 1kb upstream + 1kb downstream window around the core promoter.
        promoter=x.core_promoter(1000,1000)
        print >>out,x
        print >>out,promoter
        retv=[]
        for (i,r) in enumerate(dbi.query(promoter)):
            # NOTE(review): sum(r) presumably collapses one bin of per-base
            # values — confirm against the bam DBI's return shape.
            retv.append(sum(r))
        if x.strand=="-":
            # Reverse so values run 5'->3' on the minus strand.
            retv=retv[::-1]
        for i in retv:
            print >>out,i,
        print >>out,""
Esempio n. 19
0
def Main():
    """Query each input interval against a DBI database and print QR/HT/HN/CP
    records plus summary counts.

    Handles three result shapes from dbi.query(): arrays/lists (per-position
    values), plain strings, and iterables of hit annotations.  For tabix
    databases the query is repeated with the 'chr' prefix stripped.
    """
    global args, out
    args = ParseArg()
    # NOTE(review): `dict` shadows the builtin; kept as-is.
    dict = {}
    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            # Fall back to stdout rather than aborting on an unwritable path.
            print >> sys.stderr, "can't open file ", args.output, "to write. Using stdout instead"
            out = sys.stdout
    argv = sys.argv
    argv[0] = argv[0].split("/")[-1]
    # Provenance header: program name, version, date and full command line.
    print >> out, "# This data was generated by program ", argv[
        0], "(version %s)" % VERSION,
    print >> out, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >> out, "# Date: ", time.asctime()
    print >> out, "# The command line is :\n#\t", " ".join(argv)
    if args.query_method:
        dict["method"] = args.query_method
    dbi = DBI.init(args.db, args.dbformat)
    hits = 0
    query = 0
    if args.input == "stdin":
        input = sys.stdin
    else:
        input = args.input

    query_length = 0
    hits_number = 0
    for (i0, x) in enumerate(TableIO.parse(input, args.input_format)):
        if i0 % 10 == 0:
            print >> sys.stderr, "query ", i0, " entries\r",
        print >> out, "QR\t", x
        hit = 0
        query += 1
        query_length += len(x)
        results = dbi.query(x, **dict)
        compatible = 0
        #print >>sys.stderr,type(results)
        if isinstance(results, numpy.ndarray) or isinstance(results, list):
            # Array-like result: print values as one comma-separated HT line.
            if not args.silence:
                print >> out, "HT\t",
                for value in results:
                    print >> out, str(value) + ",",
                print >> out, ""
            hit = 1
            hits_number += 1
        elif isinstance(results, str):
            if not args.silence:
                print >> out, "HT\t",
                print >> out, results
            hit = 1
            hits_number += 1

        else:
            # Iterable of annotations: one HT line per hit, plus Bed12
            # transcript-compatibility checks.
            this_query_hits = 0
            for j in results:

                if not args.silence:
                    print >> out, "HT\t", j,
                hit = 1
                hits_number += 1
                this_query_hits += 1
                if isinstance(j, xplib.Annotation.Bed12) and isinstance(
                        x, xplib.Annotation.Bed12):
                    compatible_binary = Tools.compatible_with_transcript(j, x)
                    if not args.silence:
                        print >> out, "\tCompatible:", compatible_binary
                    if compatible_binary:
                        compatible += 1
                else:
                    if not args.silence:
                        print >> out, ""
            print >> out, "HN\t", this_query_hits
            if compatible > 0:
                print >> out, "CP\t", compatible

        if args.dbformat == "tabix":
            # Retry with 'chr' stripped to match tabix chromosome naming.
            x.chr = x.chr.replace("chr", "")
            for j in dbi.query(x, **dict):
                print >> out, "HT\t", j
                hit = 1
                hits_number += 1
        hits += hit
    print >> out, "# Query Number:", query, "\n# Query Have Hits:", hits
    print >> out, "# Query Length:", query_length
    print >> out, "# Hits Number:", hits_number
Esempio n. 20
0
def Main():
    """Query each input interval against a DBI database (formats auto-guessed
    when requested) and print QR/HT records plus summary counts.

    Handles three result shapes from dbi.query(): arrays/lists, plain
    strings, and iterables of hit annotations.  For tabix databases the
    query is repeated with the 'chr' prefix stripped.
    """
    global args, out
    args = ParseArg()
    # NOTE(review): `dict` shadows the builtin; kept as-is.
    dict = {}
    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            # Fall back to stdout rather than aborting on an unwritable path.
            print >> sys.stderr, "can't open file ", args.output, "to write. Using stdout instead"
            out = sys.stdout
    argv = sys.argv
    argv[0] = argv[0].split("/")[-1]
    # Provenance header: program name, version, date and full command line.
    print >> out, "# This data was generated by program ", argv[
        0], "(version %s)" % VERSION,
    print >> out, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >> out, "# Date: ", time.asctime()
    print >> out, "# The command line is :\n#\t", " ".join(argv)
    init_dict = {}
    # Guess database format from the file name; .gz implies a tabix database.
    if args.dbformat == "guess":
        if Tools.suffix(args.db) == "gz":
            args.dbformat = "tabix"
            args.tabix_format = Tools.guess_format(args.db)
        else:
            args.dbformat = Tools.guess_format(args.db)

    if args.query_method:
        dict["method"] = args.query_method
    if args.tabix_format:
        init_dict["tabix"] = args.tabix_format

    dbi = DBI.init(args.db, args.dbformat, **init_dict)
    hits = 0
    query = 0
    if args.input == "stdin":
        input = sys.stdin
    else:
        input = args.input

    query_length = 0
    hits_number = 0
    if (args.input_format == "guess"):
        args.input_format = Tools.guess_format(args.input)
    for (i0, x) in enumerate(TableIO.parse(input, args.input_format)):
        if i0 % 100 == 0:
            print >> sys.stderr, "query ", i0, " entries\r",
        print >> out, "QR\t", x
        hit = 0
        query += 1
        query_length += len(x)
        #print dbi;#debug
        results = dbi.query(x, **dict)
        #results=dbi.query(x) #DEBUG
        #print >>sys.stderr,type(results)
        if isinstance(results, numpy.ndarray) or isinstance(results, list):
            # Array-like result: print values as one comma-separated HT line.
            print >> out, "HT\t",
            for value in results:
                print >> out, str(value) + ",",
            print >> out, ""
            hit = 1
            hits_number += 1
        elif isinstance(results, str):
            print >> out, "HT\t",
            print >> out, results
            hit = 1
            hits_number += 1

        else:
            # Iterable of annotations: one HT line per hit.
            for j in results:
                print >> out, "HT\t", j
                hit = 1
                hits_number += 1

        if args.dbformat == "tabix":
            # Retry with 'chr' stripped to match tabix chromosome naming.
            x.chr = x.chr.replace("chr", "")
            for j in dbi.query(x, **dict):
                print >> out, "HT\t", j
                hit = 1
                hits_number += 1
        hits += hit
    print >> out, "# Query Number:", query, "\n# Query Have Hits:", hits
    print >> out, "# Query Length:", query_length
    print >> out, "# Hits Number:", hits_number
Esempio n. 21
0
def Main():
    """Query each input interval against a DBI database and print QR/HT/HN/CP
    records plus summary counts.

    Handles three result shapes from dbi.query(): arrays/lists (per-position
    values), plain strings, and iterables of hit annotations.  For tabix
    databases the query is repeated with the 'chr' prefix stripped.
    """
    global args,out
    args=ParseArg()
    # NOTE(review): `dict` shadows the builtin; kept as-is.
    dict={}
    if args.output=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.output,"w")
        except IOError:
            # Fall back to stdout rather than aborting on an unwritable path.
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    argv=sys.argv
    argv[0]=argv[0].split("/")[-1]
    # Provenance header: program name, version, date and full command line.
    print >>out,"# This data was generated by program ",argv[0],"(version %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :\n#\t"," ".join(argv)
    if args.query_method:
        dict["method"]=args.query_method
    dbi=DBI.init(args.db,args.dbformat)
    hits=0
    query=0
    if args.input=="stdin":
        input=sys.stdin
    else:
        input=args.input

    query_length=0
    hits_number=0
    for (i0,x) in enumerate(TableIO.parse(input,args.input_format)):
        if i0%10==0:
            print >>sys.stderr,"query ",i0," entries\r",
        print >>out,"QR\t",x
        hit=0
        query+=1
        query_length+=len(x)
        results=dbi.query(x,**dict)
        compatible=0
        #print >>sys.stderr,type(results)
        if isinstance(results,numpy.ndarray) or isinstance(results,list):
            # Array-like result: print values as one comma-separated HT line.
            if not args.silence:
                print >>out,"HT\t",
                for value in results:
                    print >>out,str(value)+",",
                print >>out,""
            hit=1
            hits_number+=1
        elif isinstance(results,str):
            if not args.silence:
                print >>out,"HT\t",
                print >>out,results
            hit=1
            hits_number+=1

        else:
            # Iterable of annotations: one HT line per hit, plus Bed12
            # transcript-compatibility checks.
            this_query_hits=0
            for j in results:
                
                if not args.silence:
                    print >>out,"HT\t",j,
                hit=1
                hits_number+=1
                this_query_hits+=1
                if isinstance(j,xplib.Annotation.Bed12) and isinstance(x,xplib.Annotation.Bed12):
                    compatible_binary=Tools.compatible_with_transcript(j,x)
                    if not args.silence:
                        print >>out,"\tCompatible:",compatible_binary
                    if compatible_binary:
                        compatible+=1
                else:
                    if not args.silence:
                        print >>out,""
            print >>out,"HN\t",this_query_hits
            if compatible>0:
                print >>out,"CP\t",compatible

        if args.dbformat=="tabix":
            # Retry with 'chr' stripped to match tabix chromosome naming.
            x.chr=x.chr.replace("chr","")
            for j in dbi.query(x,**dict):
                print >>out,"HT\t",j
                hit=1
                hits_number+=1
        hits+=hit
    print >>out,"# Query Number:",query,"\n# Query Have Hits:",hits
    print >>out,"# Query Length:",query_length
    print >>out,"# Hits Number:",hits_number
Esempio n. 22
0
def Main():
    '''
    Cluster mutually-overlapping transcript isoforms from a sorted bed input
    and run compare() on each cluster; a bam DBI handle (global `dbi`) is
    initialized for compare() to query reads against the isoforms.
    '''
    global args,out,isoforms_set,selected_isoforms_set,reads_set,selected_reads_set,dbi
    args=ParseArg()
    # --- IO template: resolve the output stream (file or stdout fallback) ---
    if args.output=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    # --- IO template: resolve the input stream (stdin, plain file or .gz) ---
    if args.input=="stdin":
        fin=sys.stdin
    else:
        try:
            x=args.input.split(".")
            if x[-1]=="gz":
                fin=gzip.open(args.input,"r")
            else:
                fin=open(args.input,"r")
        except IOError:
            print >>sys.stderr,"can't read file",args.input
            fin=sys.stdin
    '''
    END OF IO TEMPLATE 
    '''
    # Provenance header written to the output.
    print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :"
    print >>out,"#\t"," ".join(sys.argv)

    dbi=DBI.init(args.bam,args.format)
    '''
    reading all the isoforms
    '''
    isoforms=[]
    


    # Read every bed entry into memory and sort so overlapping isoforms are
    # adjacent; the single sweep below depends on this ordering.
    iterator=TableIO.parse(fin,"bed")
    beds=[]
    for i in iterator:
        beds.append(i)
    beds.sort()
    if len(beds)==0:
        print >>sys.stderr,"error in reading file",args.input
        exit(1)

    # Sweep: `bed` tracks the running merged interval of the current cluster;
    # any entry overlapping it joins the cluster (extending bed.stop as
    # needed), otherwise the finished cluster is handed to compare().
    bed=beds[0]
    chr=bed.chr
    min_start=bed.start
    max_stop=bed.stop
    j=0
    for i in beds:
        j+=1
        if (j%10==0): print >>sys.stderr,"processed %d entries\r"%j,
        if Tools.overlap(bed,i):
            # extend the cluster's right edge if this isoform reaches further
            if bed.stop < i.stop:
                bed.stop=i.stop
            isoforms.append(i)
        else:
            compare(isoforms)
            isoforms=[i]
            bed=i
    # Flush the final cluster (the loop only emits on a gap).
    if len(isoforms)>0:
        compare(isoforms)
Esempio n. 23
0
def Main():
    # For every nucleosome peak (Danpos xls) passing significance filters,
    # count reads near the nucleosome center in each input track and find the
    # best center offset; output the original line extended with one count
    # column and one offset column per track.
    args=ParseArg()

    #store bed files with indexing and count information:
    bam={}

    print >>sys.stderr,"Starting index bed files:"
    for i in range(len(args.bams)):
        temp_name=args.name[i]
        print >>sys.stderr,"  #Indexing for bed file of",temp_name,"\r",
        bam[temp_name]=DBI.init(args.bams[i],args.fmt)

    print >>sys.stderr
    print >>sys.stderr,"Reading nucleosome peak xls file from Danpos."
    nucleosomes=TableIO.parse(args.nucleosome,'metabed',header=True)

    print >>sys.stderr,"Initial output files..."

    out=open(args.output,"w")
    # -- for verbose ---
    # one extra bed output per track, holding the offset-shifted nucleosomes
    if args.verbose:
        out_mark=[]
        for n in args.name:
            out_mark.append(open(n+'_shift_nucleosomes.bed','w'))
    # ------------------ 
    # Header: original Danpos header + per-track count and offset columns.
    line_head=open(args.nucleosome,'r').readline().strip()
    line_head=line_head+"\t"+"\t".join(str(f) for f in args.name)+'\t'+"\t".join(str(f)+'_off' for f in args.name)
    print >>out,line_head
    
    print >>sys.stderr,"Start Counting..."
    num=0
    t0 = time()
    for i in nucleosomes:
        chrom=i.chr
        if i.smt_pval>0.01 or i.fuzziness_pval>0.01: continue # only choose nucleosomes with high value and low fuzziness   
        # sex and mitochondrial chromosomes are excluded
        if chrom == 'chrY' or chrom == 'chrX' or chrom == 'chrM':
            continue
        num=num+1
        center=int(i.start+i.end)/2
        count=np.zeros(len(args.bams),dtype="float")
        offset=np.zeros(len(args.bams),dtype='int')
        line=str(i)
        for k,name in enumerate(bam.keys()):
            # NOTE(review): query window is center +- (ma + (half_len-75) + rangeS);
            # ma/half_len/rangeS look like module-level constants defined
            # outside this function — confirm their values before reuse.
            if args.fmt=='bam':
                query=bam[name].query(Bed([chrom,center-ma-(half_len-75)-rangeS,center+ma+(half_len-75)+rangeS]),method='fetch')
            else:
                query=bam[name].query(Bed([chrom,center-ma-(half_len-75)-rangeS,center+ma+(half_len-75)+rangeS]))
            read_centers=[]
            for j in query:
                read_centers.append(find_center(j,args.fmt))
            # getCount returns the best offset and the read count at that offset
            [o,c]=getCount(read_centers,center)
            count[k]=c
            offset[k]=o
            # -- for verbose ---
            if args.verbose:
                print >>out_mark[k],chrom+'\t%d\t%d'%(i.start+o,i.end+o)
            # ------------------
        line = line + "\t" + "\t".join(str(f) for f in count) + '\t' + "\t".join(str(f) for f in offset)
        if num%20000==0:
            t1 = time()
            print >>sys.stderr,"processing %dth nucleosome..., time: %.2fs."%(num,t1-t0),'\r',
            t0 = time()    
        print >>out,line
    print
    out.close()
    
    # -- for verbose ---
    if args.verbose:
        for k in out_mark:
            k.close()
def Main():
    # Visualize single-nucleosome chromatin-state assignments in a query
    # region: nucleosome track, per-population chromatin-state tracks, a gene
    # track and per-nucleosome histone-pattern heatmaps (matplotlib).
    # With --bed, instead dump a UCSC BED9 file of states and exit.
    args=ParseArg()

    hist_n=args.hist_n
    clu_n=args.clu_n
    File=args.input


    #read emission matrix and store in Rpy2
    print "#Reading emission matrix from"
    emission=args.emission
    print '\t'+emission
    robjects.r("emission=read.table('"+emission+"',header=T,sep='\t')")
    # reorder rows (states) and columns (marks) into a fixed display order
    robjects.r("emission=emission[c(12,11,13,8,7,10,6,9,4,5,2,1,3,15,14),match(c('H3K4me3','H3K4me2','H3K4me1','H3K27me3','H3K36me3','H3K27ac','H2AZ'),colnames(emission))]")
    state_n=robjects.r("dim(emission)[1]")[0] # number of chromatin state
    
    # display colour for each chromatin state, indexed by (state - 1)
    color_state=['red','pink','purple','DarkOrange','Orange','Gold','yellow','DeepSkyBlue','ForestGreen','Green','Lime','GreenYellow','LightCyan','white','white']


    #Find overall distribution of all chromatin states
    print "Counting distribution of chromatin states..."
    chromHMM_segment = TableIO.parse(args.segment,'bed')
    #count represent overall probability distribution of all chromatin states
    count=np.zeros(state_n)
    num=0
    for segment in chromHMM_segment:
        num=num+1
        # numeric state parsed from the segment id, skipping its leading
        # character (ids presumably look like "E12" — TODO confirm)
        i=int(segment.id[1:])
        # segment length in 200bp units (ChromHMM bin size, presumably)
        count[i-1]+=(segment.stop-segment.start)/200
        print 'Reading %d segments... [for distribution of chromatin states]'%(num),'\r',
    print


    ## read and index histone pattern data for single nucleosomes in all populations
    print "Indexing histone pattern data for single nucleosomes in all populations..."
    data=TableIO.parse(File,'metabed',header=True)


    ## generate bed file for chromatin states in nucleosomes to be uploaded in UCSC genome browser
    if args.bed:
        name=os.path.basename(File).split('.')[0]
        outbed=open(name+"_State_browser.bed",'w')
        print "## Start generate BED9 file for uploading..."
        print >>outbed,'track name="ChromatinState" description="'+name+'" visibility=2 itemRgb="On"'
        #print >>outbed,'chr\tstart\tend\t'+'\t'.join('P_%d'%(s+1) for s in range(clu_n))

        for n,i in enumerate(data):
            matrix=np.array(str(i).split('\t')[8:(8+hist_n*clu_n)],dtype="int").reshape(hist_n,clu_n,order="F")  # matrix of histone patterns, row: histone, column: population
            if n % 50000 == 0:
                print "\tWriting %dth nucleosomes into BED9 file,\r"%(n),
            line='\t'.join (str(f) for f in [i.chr,i.start,i.stop])
            for k in range(clu_n):
                # map this population's histone pattern to a chromatin state
                state=histone2state(matrix.T[k],count)
                # state colour as an "R,G,B" string for the BED itemRgb field
                color_code=','.join (str(int(f)) for f in np.array(matplotlib.colors.colorConverter.to_rgb(color_state[state-1]))*255)
                print >>outbed,'\t'.join (str(f) for f in [i.chr,i.start,i.stop,'P_%d_%d'%(k+1,state),0,'.',i.start,i.stop,color_code])
                line=line+'\t%d'%(state)
            #print >>outbed,line
        outbed.close()
        # NOTE(review): exits with status 1 on the success path of --bed;
        # exit(0) would be conventional — confirm callers don't depend on it.
        sys.exit(1)


    # read region information (format "chrom:start-end")
    region=args.region
    chro=region.split(":")[0]
    start=int(region.split(":")[1].split("-")[0])
    end=int(region.split(":")[1].split("-")[1])
    print "#Query region:["+chro+": %d-%d]"%(start,end)


    y_nucle=0.47 #location of nucleosome line

    
    ## query data in region
    dbi=binindex(data)
    query=dbi.query(Bed([chro,start,end]))

    ## initialize figure 
    fig=plt.figure(figsize=(10,6))

    # frameless axes; the left 1/6 margin is reserved for track labels
    ax = plt.subplot(111,frameon=False,yticks=[])
    ax.set_xlim(start-(end-start)/6,end)
    n=0
    print "##Start draw nucleosomes:"

    #################################################
    ## draw genes from y = y_nucle+0.04*(clu_n+1) 
    
    #### index the gene.tab file

    print "  ## drawing gene track ..."
    print "    ## Indexing gene.tab ..."
    gene_dbi=DBI.init(args.genetab,'genebed')


    print "    ## query regions from gene.tab"
    query_gene=gene_dbi.query(Bed([chro,start,end]))
    #### determine height of gene track    
    # Greedy interval packing: bottoms[index] holds the rightmost stop placed
    # on row `index`; a gene goes to the first row it doesn't overlap.
    bottoms=[0 for i in range(100)]
    max_index=0
    for i in query_gene:
        index=0
        while(1):
            if i.start > bottoms[index]:
                bottoms[index]=i.stop
                if max_index<index: max_index=index
                break
            else:
                index+=1
    gene_track_number=max_index+1
    gene_track_height=0.03*gene_track_number+0.02
    ax.set_ylim(0.05,1+gene_track_height+0.01) 
    
    print "    ## start draw gene track"
    # add frame for gene track
    rect=matplotlib.patches.Rectangle((start,y_nucle+0.04),end-start, gene_track_height, edgecolor='black',fill=False)
    ax.add_patch(rect)
    
    # Second pass with the same packing to actually draw each gene on its row.
    bottoms=[0 for i in range(100)]
    for i in gene_dbi.query(Bed([chro,start,end])):
        index=0
        while(1):
            if i.start > bottoms[index]:
                addGeneToFig(i,ax,start,end,1,0.03*index+y_nucle+0.05)
                bottoms[index]=i.stop
                break
            index+=1

 
    ################################################# 
    
    top_heatmap_y = 0.71+gene_track_height # the y axis value for bottom of top heatmaps 

    print "##  Draw nucleosome tracks..."
    for i in query:
        n=n+1
        print "  Nucleosome %d\t at "%(n)+chro+": %d-%d"%(i.start,i.stop)
        matrix=np.array(str(i).split('\t')[8:(8+hist_n*clu_n)],dtype="int").reshape(hist_n,clu_n,order="F")  # matrix of histone patterns, row: histone, column: population
        # per-mark confidence probabilities follow the pattern columns
        prob=np.array(str(i).split('\t')[(8+hist_n*clu_n):],dtype=float)

        ax.plot([i.smt_pos,i.smt_pos],[y_nucle+0.03,y_nucle],color='r') #red nucleosome midpoint
        rect=matplotlib.patches.Rectangle((i.start,y_nucle), i.stop-i.start, 0.03, color='#EB70AA') #pink nucleosome region
        ax.add_patch(rect)

        # one state-coloured rectangle per population above the nucleosome
        for j in range(clu_n):
            state=histone2state(matrix.T[j],count)
            state_rect=matplotlib.patches.Rectangle((i.start,y_nucle+0.04*(j+1)+gene_track_height+0.01), i.stop-i.start, 0.03, color=color_state[state-1])
            ax.add_patch(state_rect)

    
        im = OffsetImage(matrix, interpolation='nearest',zoom=10/(1+gene_track_height+0.01),cmap=plt.cm.binary,alpha=0.5)
        

        # Heatmap placement: first 9 nucleosomes above the tracks, next 9
        # below; more than 18 cannot be laid out and stops the loop.
        if n<=9:
            xybox=((n+0.5)/10.0,top_heatmap_y)
            xy = [i.smt_pos,y_nucle+0.04*clu_n+0.03+gene_track_height+0.01]
            xytext=((n+0.7)/10.0,top_heatmap_y)
            c_style="bar,angle=180,fraction=-0.1"
        elif n<=18:
            xybox=((n-9+0.5)/10.0,0.2)
            xy = [i.smt_pos,y_nucle]
            xytext = ((n-9+0.7)/10.0,0.40)
            c_style="bar,angle=180,fraction=-0.1"
        else:
            print "WARN: nucleosome number larger than 18 in this region, only plot the pattern for first 18 nucleosomes"
            break

        ab = AnnotationBbox(im, xy,
                            xybox=xybox,
                            xycoords='data',
                            boxcoords=("axes fraction", "data"),
                            box_alignment=(0.,0.),
                            pad=0.1)
        ax.annotate("",xy,
                    xytext=xytext,
                    xycoords='data',
                    textcoords=("axes fraction", "data"),
                    arrowprops=dict(arrowstyle="->",connectionstyle=c_style))
                        #arrowprops=None)
    
        ax.add_artist(ab)
        
        # add mark for histone mark and regions with low confidence
        # NOTE(review): this loop reuses `i`, shadowing the nucleosome loop
        # variable; harmless here because `i` is reassigned by the outer
        # iterator, but fragile if code is added after this loop.
        for i in range(hist_n):
            if prob[i]<0.6:
                xy_star=tuple(map(sum,zip(xybox,(0.065,0.03*(hist_n-1-i)-0.01))))
                ax.annotate("*",xy=xy_star,xycoords=("axes fraction", "data"),color='red')


    # track labels in the left margin
    ax.annotate('Nucleosome:', xy=(start-(end-start)/6, y_nucle),  xycoords='data',size=12)
    ax.annotate('Epigenetic Pattern:', xy=(start-(end-start)/6, 0.23+top_heatmap_y),  xycoords='data',size=12)
    ax.annotate(chro, xy=(start-(end-start)/6, 0.1),  xycoords='data',size=12)

    # histone-mark names taken from the input file's header columns
    name=open(File).readline().split('\t')[8:(8+hist_n)]
    for n,i in enumerate(name):
        ax.annotate(i.split("_")[0],xy=(start-(end-start)/8, top_heatmap_y+0.03*(hist_n-1-n)),xycoords='data',size=10)
        ax.annotate(i.split("_")[0],xy=(start-(end-start)/8, 0.2+0.03*(hist_n-1-n)),xycoords='data',size=10)

    # flame for nucleosome and chromatin state tracks
    rect=matplotlib.patches.Rectangle((start,y_nucle),end-start, 0.03, edgecolor='black',fill=False)
    ax.add_patch(rect)    
    for k in range(clu_n):
        rect=matplotlib.patches.Rectangle((start,y_nucle+0.04*(k+1)+gene_track_height+0.01),end-start, 0.03, edgecolor='grey',fill=False)
        ax.add_patch(rect)
        ax.annotate('Population%d'%(k+1),xy=(start-(end-start)/6, y_nucle+0.04*(k+1)+gene_track_height+0.01),xycoords='data',size=12)

    # chromatin state legend
    for s in range(state_n):
        dist=(end-start)*1.0/state_n 
        length=dist*0.75
        rect=matplotlib.patches.Rectangle((start+dist*s,0.1), length, 0.03, color=color_state[s])
        ax.add_patch(rect)
        ax.annotate(s+1,xy=(start+dist*s+length/3,0.075),xycoords='data',size=10) 
    ax.annotate("Chromatin states:",xy=(start,0.14),xycoords='data',size=12)      
    ax.add_patch(matplotlib.patches.Rectangle((start-length/6,0.07),end-start, 0.1, edgecolor='grey',fill=False))

    plt.title("Region: ["+chro+": %d-%d]"%(start,end),size=14)
    plt.savefig(args.output)
    plt.close()
Esempio n. 25
0
def _store_annotation(newdict, record, curr_anno_arr, mapq_thred):
    # Store one annotation line for a read: uniquely mapped hits stay a plain
    # string; multi-mapped hits are promoted to a list of strings.
    anno = '\t'.join(curr_anno_arr)
    if record.qname not in newdict:
        newdict[record.qname] = anno
        if not Included(record, True, mapq_thred):
            # not uniquely mapped: keep a list so later hits can be appended
            newdict[record.qname] = [anno]
    else:
        if type(newdict[record.qname]) is str:
            newdict[record.qname] = [newdict[record.qname]]
        newdict[record.qname].append(anno)


def genome_annotation(outputbam,
                      annotationfile,
                      detail,
                      annotationRepeat,
                      mapq_thred,
                      strandenforced=False,
                      posstrand=True,
                      requireUnique=False,
                      results_dict=None):
    """
    Annotate aligned records against genome annotation databases and merge
    the results into a copy of results_dict.

    Parameters:
        outputbam        -- iterable of pysam-style aligned records; must also
                            provide .getrname() when annotationfile is given.
        annotationfile   -- bed annotation file; falsy disables DBI lookups.
        detail           -- db_detail bed file (gene models).
        annotationRepeat -- repeat-region bed file.
        mapq_thred       -- MAPQ threshold forwarded to Included().
        strandenforced   -- if True, keep only 'ProperStrand' annotations.
        posstrand        -- True when the library is forward-stranded.
        requireUnique    -- require unique mapping for a record to be used.
        results_dict     -- previously accumulated results; never mutated.

    Returns a new dict mapping read name -> tab-joined annotation string
    (unique hit) or list of such strings (multi-mapped hits); entries from
    this run override same-named entries in results_dict.
    """
    # Explicit None default instead of a shared mutable default argument;
    # behaviorally identical for callers that passed nothing.
    if results_dict is None:
        results_dict = dict()

    if annotationfile:
        dbi1 = DBI.init(annotationfile, "bed")
        dbi2 = DBI.init(detail, "bed")
        dbi3 = DBI.init(annotationRepeat, "bed")

    newdict = dict()

    for record in outputbam:
        # Unmapped records carry no CIGAR string; skip them rather than
        # crash on the substring test below (original raised TypeError).
        if record.cigarstring is None:
            continue
        if "N" not in record.cigarstring:
            # No splice junction: the alignment is one contiguous interval.
            anno_start = record.pos
            anno_end = record.aend
            bed_start = record.pos
            bed_end = record.aend
        else:
            # Spliced alignment: per-exon intervals joined as CSV strings.
            bed_list, anno_start, anno_end = Exon_junction(record)
            bed_start = ",".join([str(f[0]) for f in bed_list])
            bed_end = ",".join([str(f[1]) for f in bed_list])

        if Included(record, requireUnique, mapq_thred):
            strandactual = ("+" if posstrand else "-")
            strand = "+"
            if record.is_reverse:
                strandactual = ("-" if posstrand else "+")
                strand = "-"
            if annotationfile:
                bed = Bed([
                    outputbam.getrname(record.tid), anno_start, anno_end, '.',
                    0.0, strandactual
                ])
                [typ, name, subtype,
                 strandcol] = annotation(bed, dbi1, dbi2, dbi3)
                if (not strandenforced) or strandcol == 'ProperStrand':
                    curr_anno_arr = (str(f) for f in [
                        outputbam.getrname(
                            record.tid), bed_start, bed_end, strand,
                        record.seq, 'genome', typ, name, subtype, strandcol
                    ])
                    _store_annotation(newdict, record, curr_anno_arr,
                                      mapq_thred)
            else:
                strandcol = '.'
                curr_anno_arr = (str(f) for f in [
                    outputbam.getrname(record.tid), record.aend - record.alen +
                    1, record.aend, strand, record.seq, 'genome', strandcol
                ])
                _store_annotation(newdict, record, curr_anno_arr, mapq_thred)

    # Merge via copy+update instead of the Python-2-only idiom
    # dict(a.items() + b.items()), which raises TypeError on Python 3.
    newanno = dict(results_dict)
    newanno.update(newdict)
    return newanno
Esempio n. 26
0
def Main():
    # Plot one chromatin interaction (two genomic parts) with surrounding
    # gene tracks, phyloP conservation wigs, other interactions in range and
    # individual linked read pairs; relies on external sort/bgzip/tabix.
    args=ParseArg()
    distance=args.distance*1000   # kb -> bp search window around each part
    pair_dist=args.pair_dist
    
    # NOTE(review): os.system calls below build shell strings from CLI paths;
    # paths containing spaces or shell metacharacters will break or be unsafe.
    print "\nChecking if linkedPair file is tabixed..."
    if not os.path.isfile(args.linkedPair):
        print "LinkedPair file is not exist, please check!!"
        sys.exit(0)
    if not os.path.isfile(args.linkedPair+".tbi"):
        print "  tabix-ing..."
        os.system("sort -k1,1 -k2,2n "+args.linkedPair+" > temp_linkedPair.txt")
        os.system("bgzip temp_linkedPair.txt")
        os.system("tabix -p bed temp_linkedPair.txt.gz")
        linkedPair='temp_linkedPair.txt.gz'
    else:
        linkedPair=args.linkedPair
    print "  linkedPair file is tabixed."

    print "\nTabixing the interaction file..."
    os.system("sort -k1,1 -k2,2n "+args.interaction+" > temp_interaction.txt")
    os.system("bgzip temp_interaction.txt")
    os.system("tabix -p bed temp_interaction.txt.gz")
    print "  interaction file is tabixed."


    # start column number for second regions
    # s1 for interaction file and s2 for linkedPair file
    (s1,s2)=args.start

    # Resolve the two interacting parts either from the n-th interaction
    # line or from two explicit region strings.
    print "\nExtracting interaction information..."
    if args.n:
        Interactions=open(args.interaction,'r')
        l=Interactions.read().split('\n')[args.n-1].split('\t')
        part1=Bed(l[0:6])
        part2=Bed(l[s1:(s1+6)])
    elif len(args.r)==2:
        part1=read_region(args.r[0])
        part2=read_region(args.r[1])
    else:
        # NOTE(review): this branch only prints the error; part1/part2 are
        # then undefined and the next statement raises NameError. An explicit
        # exit here would fail more clearly.
        print >> sys.stderr, "need to specify two regions using '-r'"
    
    if "chr" not in part1.chr or "chr" not in part2.chr:
        print >> sys.stderr, "This program only works for genomic regions."
        exit(0)    
    
    # Expand each part by `distance` on both sides.
    start1=part1.start-distance
    end1=part1.stop+distance
    start2=part2.start-distance
    end2=part2.stop+distance
    # if the searched regions for part1 and part2 are overlapped, using the same regions for both part
    if part1.overlap(part2,-2*distance):
        start1=min(start1,start2)
        start2=min(start1,start2)
        end1=max(end1,end2)
        end2=max(end1,end2)

    
    # initialize figure: twin x-axes, ax1 spans part1's window, ax2 part2's
    print "\n Start plot interaction: "+part1.str_region()+" <-> "+part2.str_region()
    col1="#4F81BD"
    col2="#C0504D"
    fig = plt.figure(figsize=(8,4))
    ax1 = plt.subplot(111,frameon=False,yticks=[])
    plt.tick_params(axis="y",which="both",left="off",right="off",labelleft="off")  # remove y ticks
    plt.subplots_adjust(top=0.75)
    ax2 = ax1.twiny()
    ax1.set_xlim(start1,end1)
    ax2.set_xlim(start2,end2)
    ax1.set_ylim(0,1)
    ax2.set_ylim(0,1)

    #set x ticks withour offset
    locs=ax1.get_xticks()
    ax1.set_xticklabels(map(lambda x: "%i"%x, locs),fontsize=8)
    locs=ax2.get_xticks()
    ax2.set_xticklabels(map(lambda x: "%i"%x, locs),fontsize=8)
    
    
    # input bigWig file for phyloP score
    bw_phyloP = BigWigFile(open(args.phyloP_wig))

    print "\nStart draw gene track"
    gene_dbi=DBI.init(args.genebed,"bed")
    print "  genebed indexed!"
    print "  Plot gene track for Part1"
    # Genetrack/Wigtrack return the y coordinate of the top of what they drew,
    # which the next track uses as its baseline.
    gene1_top=Genetrack(Bed([part1.chr,start1,end1]),gene_dbi,ax1,0.08)
    wig1_top=Wigtrack(Bed([part1.chr,start1,end1]), bw_phyloP, ax1, gene1_top,col1)
    
    y_1=wig1_top+0.1
    y_2=y_1+0.2
    
    print "  Plot gene track for Part2"
    gene2_top=Genetrack(Bed([part2.chr,start2,end2]),gene_dbi,ax2,y_2+0.08)    
    wig2_top=Wigtrack(Bed([part2.chr,start2,end2]), bw_phyloP, ax2, gene2_top,col2)
    # Draw all interactions whose first anchor falls in part1's window and
    # whose second anchor overlaps part2's window, as coloured polygons.
    print "\nQuery interactions within +-%dkbp of interaction"%(distance/1000)
    os.system("tabix temp_interaction.txt.gz %s:%i-%i > temp2.txt"%(part1.chr,start1,end1))
    print "\nList of interactions plotted: "
    k=1
    cmap=cm.get_cmap('Paired', 10)
    cmap=cmap(range(10))
    for b in read_interaction("temp2.txt",s1):
        #if args.Slim and b[0].overlap(b[1],-pair_dist): continue
        if Bed([part2.chr,start2,end2]).overlap(b[1],0):
            k+=1
            # project part1 coordinates into ax2's coordinate system
            x1_2_start=transform(b[0].start,start1,end1,start2,end2)
            x1_2_end=transform(b[0].stop,start1,end1,start2,end2)
            ax2.add_patch(matplotlib.patches.Polygon([[x1_2_start,y_1+0.04],[x1_2_end,y_1+0.04],[b[1].stop,y_2],[b[1].start,y_2]],color=cmap[k%10],alpha=0.4,lw=0.5))
            ax1.add_patch(matplotlib.patches.Rectangle((b[0].start,y_1),b[0].stop-b[0].start,0.04,color=col1,lw=0.5))
            ax2.add_patch(matplotlib.patches.Rectangle((b[1].start,y_2),b[1].stop-b[1].start,0.04,color=col2,lw=0.5))
            print "  "+b[0].str_region()+" <-> "+b[1].str_region()


    # baseline for each part's track
    ax1.plot([start1,end1],[y_1+0.02,y_1+0.02],color=col1,linewidth=1,alpha=0.7)
    ax2.plot([start2,end2],[y_2+0.02,y_2+0.02],color=col2,linewidth=1,alpha=0.7)


    # Draw individual linked read pairs; pairs from a single fragment are
    # green (or skipped entirely with --Slim), others black.
    print "\nQuery linkedPairs within +-%dkbp of interaction"%(distance/1000)
    os.system("tabix "+linkedPair+" %s:%i-%i > temp2.txt"%(part1.chr,start1,end1))
    print "\nList of linked pairs plotted: "
    for b in read_interaction("temp2.txt",s2):
        col='k'
        if args.Slim and SingleFragment(b[0],b[1],pair_dist): continue
        if SingleFragment(b[0],b[1],pair_dist): col='#03C03C'
        if part1.overlap(b[0],-distance) and part2.overlap(b[1],-distance):    
            x1_2_start=transform(b[0].start,start1,end1,start2,end2)
            x1_2_end=transform(b[0].stop,start1,end1,start2,end2)
            # connect from the strand-appropriate end of each mate
            if b[0].strand=='-':
                connect1=x1_2_start
            else:
                connect1=x1_2_end
            if b[1].strand=="-":
                connect2=b[1].start
            else:
                connect2=b[1].stop
            ax2.plot([connect1,connect2],[y_1+0.02,y_2+0.02],color=col,alpha=0.3,lw=0.5)
            ax1.plot([b[0].start,b[0].stop],[y_1+0.02,y_1+0.02],color=col,alpha=0.3,lw=0.8)
            ax2.plot([b[1].start,b[1].stop],[y_2+0.02,y_2+0.02],color=col,alpha=0.3,lw=0.8)
           # print "  "+b[0].str_region()+" <-> "+b[1].str_region()
    plt.text(0.5, 1.15, part1.str_region()+" <-> "+part2.str_region(),
         horizontalalignment='center',
         fontsize=10,
         transform = ax1.transAxes)
    plt.text(0.5, 1.10, "Distance: +-%dkbp of interaction"%(distance/1000),
         horizontalalignment='center',
         fontsize=8,
         transform = ax1.transAxes)
    ax1.text(part1.center,y_1-0.03,"|".join([part1.type,part1.name,part1.subtype]),
             verticalalignment='center', horizontalalignment='center',fontsize=8,color=col1)
    ax2.text(part2.center,y_2+0.07,"|".join([part2.type,part2.name,part2.subtype]),
             verticalalignment='center', horizontalalignment='center',fontsize=8,color=col2)
    ax1.set_ylim(0,wig2_top+0.1)
    ax2.set_ylim(0,wig2_top+0.1)
    ax1.text(start1, 0.05, part1.chr,horizontalalignment='left',fontsize=8)
    ax2.text(start2, wig2_top+0.04, part2.chr,horizontalalignment='left',fontsize=8)
    plt.savefig(args.output)
    plt.show()
     
    # remove temp file
    os.system("rm temp_interaction.txt.gz*")
    if not os.path.isfile(args.linkedPair+".tbi"):
        os.system("rm temp_linkedPair.txt.gz*")
    os.system("rm temp2.txt")
Esempio n. 27
0
def Main():
    """
    Scan spliced alignments from the input BAM stream, tally splice donor
    and acceptor sites, and print each site as a scored, sequentially-named
    Bed entry.  When a genome index is supplied (--genome), the upper-cased
    genomic sequence of each site is appended; without it, sites are still
    reported (the original crashed with AttributeError in that case).
    """
    global args, out
    args = ParseArg()
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")
    # --- provenance header ---
    print >> out, "# This data was generated by program ", sys.argv[0], " (version: %s)" % VERSION,
    print >> out, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >> out, "# Date: ", time.asctime()
    print >> out, "# The command line is :"
    print >> out, "#\t", " ".join(sys.argv)

    donorSites = {}     # "chr\tstart\tstop\tstrand" -> supporting read count
    acceptorSites = {}
    # The genome index is optional; None simply disables sequence output.
    if args.genome is not None:
        genome = DBI.init(args.genome, "genome")
    else:
        genome = None

    for j, i in enumerate(TableIO.parse(fin, "bam2bed12", references=fin.references, strand=args.strand)):
        if j % 1000 == 0:
            print >>sys.stderr, "processing ", j, "reads               \r",
        for intron in i.Introns():
            # ignore tiny alignment gaps unlikely to be real introns
            if len(intron) < args.intron_min_length:
                continue
            donorID = bedToID(intron.head())
            donorSites[donorID] = donorSites.get(donorID, 0) + 1
            acceptorID = bedToID(intron.tail())
            acceptorSites[acceptorID] = acceptorSites.get(acceptorID, 0) + 1

    _report_sites(donorSites, "donor", genome)
    _report_sites(acceptorSites, "acceptor", genome)


def _report_sites(sites, kind, genome):
    # Convert a {"chr\tstart\tstop\tstrand": count} tally into sorted Bed
    # entries named "<kind>_<i>" (score = read count) and print them to the
    # module-level `out`, with genomic sequence when `genome` is available.
    beds = []
    for key in sites.keys():
        a = key.split("\t")
        beds.append(Bed([a[0], a[1], a[2], "noname_" + kind, sites[key], a[3]]))
    beds.sort()
    for i, x in enumerate(beds):
        x.id = kind + "_" + str(i)
        if genome is not None:
            print >> out, x, "\t", genome.query(x).upper()
        else:
            # BUG FIX: the original dereferenced `genome` unconditionally
            # and crashed whenever --genome was not supplied.
            print >> out, x
Esempio n. 28
0
def _merge_read_annotation(newdict, record, curr_anno_arr, mapq_thred):
    """Record one annotation for a read: uniquely mapped hits are stored as
    a plain string, multi-mapped hits as a list of strings."""
    anno = "\t".join(curr_anno_arr)
    if record.qname not in newdict:
        newdict[record.qname] = anno
        if not Included(record, True, mapq_thred):
            # not uniquely mapped: promote to a list so later hits can append
            newdict[record.qname] = [anno]
    else:
        if type(newdict[record.qname]) is str:
            newdict[record.qname] = [newdict[record.qname]]
        newdict[record.qname].append(anno)


def genome_annotation(
    outputbam,
    annotationfile,
    detail,
    annotationRepeat,
    mapq_thred,
    strandenforced=False,
    posstrand=True,
    requireUnique=False,
    results_dict=None,
):
    """Annotate aligned records against genome annotation databases.

    Args:
        outputbam: iterable of pysam-style aligned records; must also provide
            .getrname() when annotationfile is given.
        annotationfile: bed annotation file; falsy disables DBI lookups.
        detail: db_detail bed file (gene models).
        annotationRepeat: repeat-region bed file.
        mapq_thred: MAPQ threshold forwarded to Included().
        strandenforced: if True, keep only 'ProperStrand' annotations.
        posstrand: True when the library is forward-stranded.
        requireUnique: require unique mapping for a record to be used.
        results_dict: previously accumulated results; never mutated.

    Returns:
        A new dict mapping read name -> tab-joined annotation string (unique
        hit) or list of such strings (multi-mapped); entries from this run
        override same-named entries in results_dict.
    """
    # Explicit None default instead of a shared mutable default argument.
    if results_dict is None:
        results_dict = dict()

    if annotationfile:
        dbi1 = DBI.init(annotationfile, "bed")
        dbi2 = DBI.init(detail, "bed")
        dbi3 = DBI.init(annotationRepeat, "bed")

    newdict = dict()

    for record in outputbam:
        # Unmapped records carry no CIGAR string; skip them rather than
        # crash on the substring test below (original raised TypeError).
        if record.cigarstring is None:
            continue
        if "N" not in record.cigarstring:
            # No splice junction: the alignment is one contiguous interval.
            anno_start = record.pos
            anno_end = record.aend
            bed_start = record.pos
            bed_end = record.aend
        else:
            # Spliced alignment: per-exon intervals joined as CSV strings.
            bed_list, anno_start, anno_end = Exon_junction(record)
            bed_start = ",".join([str(f[0]) for f in bed_list])
            bed_end = ",".join([str(f[1]) for f in bed_list])

        if Included(record, requireUnique, mapq_thred):
            strandactual = "+" if posstrand else "-"
            strand = "+"
            if record.is_reverse:
                strandactual = "-" if posstrand else "+"
                strand = "-"
            if annotationfile:
                bed = Bed([outputbam.getrname(record.tid), anno_start, anno_end, ".", 0.0, strandactual])
                [typ, name, subtype, strandcol] = annotation(bed, dbi1, dbi2, dbi3)
                if (not strandenforced) or strandcol == "ProperStrand":
                    curr_anno_arr = (
                        str(f)
                        for f in [
                            outputbam.getrname(record.tid),
                            bed_start,
                            bed_end,
                            strand,
                            record.seq,
                            "genome",
                            typ,
                            name,
                            subtype,
                            strandcol,
                        ]
                    )
                    _merge_read_annotation(newdict, record, curr_anno_arr, mapq_thred)
            else:
                strandcol = "."
                curr_anno_arr = (
                    str(f)
                    for f in [
                        outputbam.getrname(record.tid),
                        record.aend - record.alen + 1,
                        record.aend,
                        strand,
                        record.seq,
                        "genome",
                        strandcol,
                    ]
                )
                _merge_read_annotation(newdict, record, curr_anno_arr, mapq_thred)

    # Merge via copy+update instead of the Python-2-only idiom
    # dict(a.items() + b.items()), which raises TypeError on Python 3.
    newanno = dict(results_dict)
    newanno.update(newdict)
    return newanno
Esempio n. 29
0
def Main():
    """Simulate paired-end reads for a barcoded RNA-linker ligation library.

    Fragment classes (linker-only, single-RNA/RNA1-RNA2, RNA1-linker,
    linker-RNA2, RNA1-linker-RNA2) are drawn from the discrete distribution
    given by ``args.parameter``; each fragment is assembled from randomly
    sampled RNA regions, linker copies and a barcode, then written out as a
    FASTQ read pair plus one truth-table line per fragment.
    """
    args = ParseArg()
    # Paired FASTQ outputs for R1/R2 of every simulated fragment.
    fastq1 = open("simulated_" + str(args.num) + "_read_R1.fastq", "w")
    fastq2 = open("simulated_" + str(args.num) + "_read_R2.fastq", "w")

    RNA = TableIO.parse(args.annotation, 'bed')

    # create a dictionary for all RNAs pools except rRNA
    # (chrM / chrNT entries are excluded as well; key = prefix of bed id)
    RNAs = {}
    for b in RNA:
        if b.id.startswith('rRNA'): continue
        if b.chr.startswith('chrM') or b.chr.startswith('chrNT'): continue
        Type = b.id.split(".")[0]
        if Type in RNAs:
            RNAs[Type].append(b)
        else:
            RNAs[Type] = [b]

    #---------------- read linker seq ------------------
    linkers = []
    for i in open(args.linker, 'r'):
        i = i.strip()
        linkers.append(i)
    #---------------------------------------------------

    #---------------- read barcode ---------------------
    barcodes = []
    for i in open(args.barcode, 'r'):
        i = i.strip()
        barcodes.append(i)
    #---------------------------------------------------

    # sample different classes: LinkerOnly, Nolinker, RNA1-linker, linker-RNA2, RNA1-linker-RNA2
    xk = range(5)
    pk = args.parameter
    custm = stats.rv_discrete(name='custm', values=(xk, pk))
    Class_index = custm.rvs(size=args.num)
    # specify output (truth table describing each simulated fragment)
    out = open(args.output, 'w')

    # initiate the annotation database
    if args.db_detail:
        print >> sys.stderr, " # Index for the annotation database"
        dbi1 = DBI.init(args.annotation, "bed")
        dbi2 = DBI.init(args.db_detail, "bed")
        # NOTE(review): hard-coded repeat annotation path — not portable.
        dbi3 = DBI.init(
            "/home/yu68/bharat-interaction/new_lincRNA_data/mouse.repeat.txt",
            "bed")

    print >> sys.stderr, " # Start to simulate reads"
    t0 = time.time()
    for i in range(0, args.num):
        pair_id = "read_" + str(i)
        # barcode = 4 random nt + fixed barcode + 2 random nt
        randSeq = "".join([random.choice("ACGT") for x in range(6)])
        barcode = randSeq[0:4] + barcodes[0] + randSeq[4:6]

        index = Class_index[i]  # index for different classes of fragments
        # Sample RNA1 and RNA2 (random region of 15-149 nt from the pools)
        RNA1_len = random.randrange(15, 150)
        b, Type = randRegion(RNA1_len, RNAs)
        RNA1_seq = fetchSeq(b.chr, b.start, b.stop, b.strand, args.genomeFa,
                            args.spath)
        if args.db_detail:
            [name1, typ1, subtype1] = annotation(b, dbi1, dbi2, dbi3)
            RNA1_str = "\t".join(
                str(f) for f in
                [b.chr, b.start, b.stop, b.strand, name1, typ1, subtype1])
        else:
            RNA1_str = "\t".join(
                str(f) for f in [b.chr, b.start, b.stop, b.strand, Type])
        RNA2_len = random.randrange(15, 150)
        b, Type = randRegion(RNA2_len, RNAs)
        RNA2_seq = fetchSeq(b.chr, b.start, b.stop, b.strand, args.genomeFa,
                            args.spath)
        if args.db_detail:
            [name2, typ2, subtype2] = annotation(b, dbi1, dbi2, dbi3)
            RNA2_str = "\t".join(
                str(f) for f in
                [b.chr, b.start, b.stop, b.strand, name2, typ2, subtype2])
        else:
            RNA2_str = "\t".join(
                str(f) for f in [b.chr, b.start, b.stop, b.strand, Type])

        # fragment is the recovered cDNA fragment
        if index == 1:  # single RNA or RNA1-RNA2
            # NOTE(review): the two inline comments here were swapped in the
            # original; they now match the record type actually emitted.
            if random.choice([0, 1]) == 0:  # RNA1-RNA2 chimera (no linker)
                fragment = barcode + RNA1_seq + RNA2_seq
                print >> out, pair_id + "\t%d\tRNA1-RNA2\t0" % (
                    len(fragment)) + "\t" + RNA1_str + '\t' + RNA2_str
            else:  # single RNA
                fragment = barcode + RNA1_seq
                print >> out, pair_id + "\t%d\tsingleRNA\t0" % (
                    len(fragment)) + "\t" + RNA1_str
        else:
            linker_n = random.choice([1, 2])  # number of linkers in fragment
            linker = "".join([linkers[0]] * linker_n)
            if index == 0:
                fragment = barcode + linker
                print >> out, pair_id + "\t%d\tlinkerOnly\t%d" % (
                    len(fragment), linker_n)
            elif index == 2:
                fragment = barcode + RNA1_seq + linker
                print >> out, pair_id + "\t%d\tRNA1-linker\t%d" % (
                    len(fragment), linker_n) + "\t" + RNA1_str
            elif index == 3:
                fragment = barcode + linker + RNA2_seq
                print >> out, pair_id + "\t%d\tlinker-RNA2\t%d" % (
                    len(fragment), linker_n) + "\t" + RNA2_str
            elif index == 4:
                fragment = barcode + RNA1_seq + linker + RNA2_seq
                print >> out, pair_id + "\t%d\tRNA1-linker-RNA2\t%d" % (len(
                    fragment), linker_n) + "\t" + RNA1_str + "\t" + RNA2_str

        # sequencing error model and per-base qualities (Phred 10-39)
        read1, read2 = generatePairs(fragment, args.len, args.errorRate)
        score = []
        for j in range(0, args.len):
            score.append(random.randrange(10, 40))
        record1 = SeqRecord(Seq(read1, generic_dna), id=pair_id)
        record1.letter_annotations["phred_quality"] = score
        record2 = SeqRecord(Seq(read2, generic_dna), id=pair_id)
        record2.letter_annotations["phred_quality"] = score
        SeqIO.write(record1, fastq1, "fastq")
        SeqIO.write(record2, fastq2, "fastq")

        if i % 100 == 0:
            print >> sys.stderr, "generate pairs %d\r" % (i),
    fastq1.close()
    fastq2.close()
    out.close()
    print time.time() - t0
def Main():
    """Count inter- and intra-RNA interactions from paired segment input.

    Reads tab-separated segment pairs from ``args.input``, resolves each
    segment to an RNA name via ``GetAnnotationName()``, accumulates counts
    per RNA--RNA combination, then reports combinations passing the
    minimum-interaction / minimum-cluster-size filters and a hypergeometric
    p-value cutoff.  Inter-RNA results go to ``args.output``,
    self-interactions to ``args.output_intra``.
    """
    t1 = time()
    args = ParseArg()
    inp = open(args.input, 'r')
    min_clusterS = args.min_clusterS
    min_interaction = args.min_interaction
    p_value = args.p_value
    output = open(args.output, 'w')
    outputIntra = open(args.output_intra, 'w')

    hasAnnotation = False
    if args.annotation:
        dbi = DBI.init(args.annotation, "bed")
        hasAnnotation = True
    else:
        dbi = False

    # BUG FIX: hasAnnotationRepeat was assigned only when
    # args.annotation_repeat was set; otherwise GetAnnotationName() raised
    # NameError for every pair (silently swallowed by the broad except
    # below, so no interaction was ever counted).  Initialize it here.
    hasAnnotationRepeat = False
    if args.annotation_repeat:
        dbirepeat = DBI.init(args.annotation_repeat, "bed")
        hasAnnotationRepeat = True
    else:
        dbirepeat = False

    # store count of RNA occurrences (parts 1 and 2 share one counter)
    part = {}

    k = 0
    sgcount = 0  # single fragment count

    print >> sys.stderr, "# Inputing data..."
    interaction = {}  # store number of interactions for different RNA
    selfinteraction = {}

    for line in inp.read().split('\n'):
        if line == '': continue
        line = line.strip().split('\t')
        p1 = annotated_bed_proper(line[0:10], id=k, cluster=1)
        p2 = annotated_bed_proper(line[11:], id=k, cluster=1)
        # multi-block entries carry list coordinates; flatten to outer span
        if isinstance(p1.start, list):
            p1.start = int(p1.start[0])
            p1.end = int(p1.end[-1])
        if isinstance(p2.start, list):
            p2.start = int(p2.start[0])
            p2.end = int(p2.end[-1])

        if SingleFragment(p1, p2):
            sgcount += 1
            continue
        k += 1
        try:
            p1_name = GetAnnotationName(p1, hasAnnotation, dbi,
                                        hasAnnotationRepeat, dbirepeat)
            if p1_name not in part:
                part[p1_name] = 1
            else:
                part[p1_name] += 1
            p2_name = GetAnnotationName(p2, hasAnnotation, dbi,
                                        hasAnnotationRepeat, dbirepeat)
            if not p1_name == p2_name:  # count once for self-interaction
                if p2_name not in part:
                    part[p2_name] = 1
                else:
                    part[p2_name] += 1
            if p1_name == p2_name:
                # self-interaction: merge both spans into one record
                if p1_name not in selfinteraction:
                    selfinteraction[p1_name] = copy.deepcopy(p1)
                else:
                    selfinteraction[p1_name].Update(p1.start, p1.end)
                    selfinteraction[p1_name].Update(p2.start, p2.end)
                    selfinteraction[p1_name].cluster += 1
            else:
                # canonical ordering so "A--B" and "B--A" share one key
                if p1_name > p2_name:
                    p1, p2 = p2, p1
                    p1_name, p2_name = p2_name, p1_name
                inter_name = p1_name + "--" + p2_name
                if inter_name not in interaction:
                    interaction[inter_name] = [
                        copy.deepcopy(p1),
                        copy.deepcopy(p2)
                    ]
                else:
                    interaction[inter_name][0].Update(p1.start, p1.end)
                    interaction[inter_name][1].Update(p2.start, p2.end)
                    interaction[inter_name][0].cluster += 1
        except Exception as e:
            print >> sys.stderr, e
        if k % 20000 == 0:
            print >> sys.stderr, "  Reading %d pairs of segments\r" % (k),
    print >> sys.stdout, "Get total %d pairs." % (k)
    print >> sys.stdout, "Single fragment count: %d." % (sgcount)

    print >> sys.stdout, "   number of different RNAs is %d          " % (
        len(part))

    total = k  # total pairs used
    n = 0
    k = 0  # record number of strong interactions
    for i in interaction:
        n += 1
        count = interaction[i][0].cluster
        if count < min_interaction: continue
        p1_name = i.split("--")[0]
        p2_name = i.split("--")[1]
        P1 = interaction[i][0]
        P2 = interaction[i][1]
        P1.cluster = part[p1_name]
        P2.cluster = part[p2_name]
        if part[p1_name] < min_clusterS or part[p2_name] < min_clusterS:
            continue
        # upper-tail hypergeometric test for interaction enrichment
        real_p = 1 - hypergeom.cdf(count, total, part[p1_name], part[p2_name])
        if real_p <= p_value:
            k = k + 1
            try:
                log_p = math.log(real_p)
            except:
                log_p = -float("Inf")  # real_p rounded down to 0
            print >> output, str(P1) + '\t' + str(P2) + '\t%d\t%.4f' % (count,
                                                                        log_p)
        if n % 500 == 0:
            print >> sys.stderr, "  Progress ( %d / %d )\r" % (
                n, len(interaction)),
    k1 = 0
    for i in selfinteraction:
        n += 1
        count = selfinteraction[i].cluster
        if count < min_interaction: continue
        p1_name = i
        P1 = selfinteraction[i]
        P1.cluster = part[p1_name]
        if part[p1_name] < min_clusterS: continue
        k1 = k1 + 1
        print >> outputIntra, str(P1) + '\t%d' % (count)
        if n % 500 == 0:
            print >> sys.stderr, "  Progress ( %d / %d )\r" % (
                n, len(interaction)),
    print >> sys.stdout, "# Find %d strong and %d self interactions. Cost time: %.2f s" % (
        k, k1, time() - t1)
    # close handles explicitly so buffered results are flushed to disk
    inp.close()
    output.close()
    outputIntra.close()
Esempio n. 31
0
def Main():
    """Cluster paired interaction segments per chromosome (in parallel via
    Parallel Python), then call strong cluster--cluster interactions with a
    hypergeometric test and optionally estimate FDR by permutation.
    """
    t1=time()
    
    global min_interaction, p_value
    args=ParseArg()
    inp = open(args.input, 'r')
    min_clusterS=args.min_clusterS
    min_interaction=args.min_interaction
    p_value=args.p_value
    output=open(args.output,'w')
    ncpus=args.parallel


    #store genomic location of part1 and part2
    part1=[]
    part2=[]


    k=0
    
    print >> sys.stderr,"# Inputing data..."

    chr_list=[]
    for line in inp.read().split('\n'):
        if line=='': continue
        line=line.strip().split('\t')
        p1=annotated_bed(line[0:10],id=k)
        p2=annotated_bed(line[11:],id=k)
        # multi-block entries carry list coordinates; flatten to outer span
        if isinstance(p1.start, list):
            p1.start=int(p1.start[0])
            p1.end=int(p1.end[-1])
        if isinstance(p2.start, list):
            p2.start=int(p2.start[0])
            p2.end=int(p2.end[-1])
        if SingleFragment(p1,p2): continue
        k+=1
        part1.append(p1)
        part2.append(p2)
        if p1.chr not in chr_list: chr_list.append(p1.chr)
        if p2.chr not in chr_list: chr_list.append(p2.chr)
        if k%20000==0: 
            print >> sys.stderr,"  Reading %d pairs of segments\r"%(k),
    print >> sys.stderr,"Get total %d pairs."%(k)
    
    if len(part1)!=len(part2):
        # parts must stay index-paired; bail out if parsing desynchronized
        # NOTE(review): exits with status 0 on error — confirm intent.
        print >> sys.stderr, "## ERROR: number of regions in two part not match!!"
        sys.exit(0)

    # sort in genomic order, easy for clustering
    part1=sorted(part1, key=attrgetter('start'))
    part1=sorted(part1, key=attrgetter('chr'))
    part2=sorted(part2, key=attrgetter('start'))
    part2=sorted(part2, key=attrgetter('chr'))

    # for parallel computing 
    print >>sys.stderr,"# Generating clusters for two parts..."
    # tuple of all parallel python servers to connect with
    ppservers = ()
    job_server = pp.Server(ncpus, ppservers=ppservers)
    jobs1=[]
    jobs2=[]
    # one clustering job per chromosome and per part
    for chro in chr_list:
        part1_temp=filter(lambda p: p.chr==chro, part1)
        if len(part1_temp)>0:
            jobs1.append(job_server.submit(cluster_regions,(part1_temp,min_clusterS),(annotated_bed,),("UnionFind","copy",)))
        part2_temp=filter(lambda p: p.chr==chro, part2)
        if len(part2_temp)>0:
            jobs2.append(job_server.submit(cluster_regions,(part2_temp,min_clusterS),(annotated_bed,),("UnionFind","copy",)))
        

    # gather job results: job()[0] = cluster pool, job()[1] = updated parts
    cluster_pool1={}
    part1=[]
    for job in jobs1: 
        try:
            part1=part1+job()[1]
            cluster_pool1.update(job()[0])
        except:
            print >> sys.stderr, "Wrong in %s, part1"%(job()[2])
            continue
    cluster_pool2={}
    part2=[]
    for job in jobs2:
        try:
            part2=part2+job()[1]
            cluster_pool2.update(job()[0])
        except:
            continue


    print >>sys.stderr,"   cluster number for part1 is %d          "%(len(cluster_pool1))
    print >>sys.stderr,"   cluster number for part2 is %d          "%(len(cluster_pool2))

    # sort back to pair two parts
    part1=sorted(part1, key=attrgetter('id'))
    part2=sorted(part2, key=attrgetter('id'))

    print >> sys.stderr,"size of part1&2:",len(part1),len(part2)

    # count pairs per cluster--cluster combination
    c_interaction={}
    for i in range(len(part1)):
        region1=str(part1[i])
        region2=str(part2[i])
        try:
            inter=part1[i].cluster+"--"+part2[i].cluster
        except:
            print >> sys.stderr,i,part1[i].cluster,part2[i].cluster
            sys.exit()
        if c_interaction.has_key(inter):
            c_interaction[inter]+=1
        else:
            c_interaction[inter]=1

    # annotation file
    print >> sys.stderr,"# Indexing annotation files"
    dbi_all=DBI.init(args.annotation,"bed")
    dbi_detail=DBI.init(args.db_detail,"bed")
    # NOTE(review): hard-coded repeat annotation path — not portable.
    dbi_repeat=DBI.init("/home/yu68/bharat-interaction/new_lincRNA_data/mouse.repeat.txt","bed")


    print >> sys.stderr,"# finding strong interactions from clusters..."
    k=0 # record for strong interactions
    n=0

    # annotation file

    for interaction in c_interaction:
        n=n+1
        count=c_interaction[interaction]
        if count<min_interaction: continue
        i=interaction.split("--")[0]
        j=interaction.split("--")[1]
        try:  # we select clusters with size no less than 5, so some interactions cannot be found in clusters
            count1=cluster_pool1[i].cluster
            count2=cluster_pool2[j].cluster
        except:
            continue
        # upper-tail hypergeometric test for enrichment of this pairing
        real_p=1-hypergeom.cdf(count,len(part1),count1,count2)
        if real_p<=p_value:
            k=k+1
            cluster_pool1[i].Annotate(dbi_all,dbi_detail,dbi_repeat)
            cluster_pool2[j].Annotate(dbi_all,dbi_detail,dbi_repeat)
            try:
                log_p = math.log(real_p)
            except:
                log_p = -float("Inf")  # real_p rounded down to 0
            print >> output,str(cluster_pool1[i])+'\t'+str(cluster_pool2[j])+'\t%d\t%.4f'%(count,log_p)
        if n%1000==0: print >> sys.stderr, "  Progress ( %d / %d )\r"%(n,len(c_interaction)),

    print >> sys.stderr,"# Find %d strong interactions. Cost time: %.2f s"%(k,time()-t1)

    if args.FDR:
        # permutation-based FDR estimate: shuffle part2, re-call interactions
        print >> sys.stderr, "# Permutated results:"
        for i in range(10):
            shuffle(part2)
            [n_r_I,n_r_SI]=Random_strongInteraction(part1,part2,cluster_pool1,cluster_pool2)
            print >> sys.stderr, "  ",i, n_r_I, n_r_SI, n_r_SI*1.0/n_r_I
def Main():
    """Count inter- and intra-RNA interactions from paired segment input.

    Unformatted twin of the previous example: reads tab-separated segment
    pairs, names each segment via ``GetAnnotationName()``, accumulates
    counts per RNA--RNA combination, and reports combinations that pass
    the count/size filters and a hypergeometric p-value cutoff.
    """
    t1 = time()
    args = ParseArg()
    inp = open(args.input, 'r')
    min_clusterS = args.min_clusterS
    min_interaction = args.min_interaction
    p_value = args.p_value
    output = open(args.output, 'w')
    outputIntra = open(args.output_intra, 'w')

    hasAnnotation = False
    if args.annotation:
        dbi = DBI.init(args.annotation, "bed")
        hasAnnotation = True
    else:
        dbi = False

    # BUG FIX: hasAnnotationRepeat was assigned only in the truthy branch,
    # so without --annotation_repeat every call to GetAnnotationName()
    # raised NameError (silently swallowed by the broad except below).
    hasAnnotationRepeat = False
    if args.annotation_repeat:
        dbirepeat = DBI.init(args.annotation_repeat, "bed")
        hasAnnotationRepeat = True
    else:
        dbirepeat = False

    # store count of RNA occurrences (parts 1 and 2 share one counter)
    part = {}

    k = 0
    sgcount = 0  # single fragment count

    print >> sys.stderr, "# Inputing data..."
    interaction = {}  # store number of interactions for different RNA
    selfinteraction = {}

    for line in inp.read().split('\n'):
        if line == '': continue
        line = line.strip().split('\t')
        p1 = annotated_bed_proper(line[0:10], id=k, cluster=1)
        p2 = annotated_bed_proper(line[11:], id=k, cluster=1)
        # multi-block entries carry list coordinates; flatten to outer span
        if isinstance(p1.start, list):
            p1.start = int(p1.start[0])
            p1.end = int(p1.end[-1])
        if isinstance(p2.start, list):
            p2.start = int(p2.start[0])
            p2.end = int(p2.end[-1])

        if SingleFragment(p1, p2):
            sgcount += 1
            continue
        k += 1
        try:
            p1_name = GetAnnotationName(p1, hasAnnotation, dbi,
                                        hasAnnotationRepeat, dbirepeat)
            if p1_name not in part:
                part[p1_name] = 1
            else:
                part[p1_name] += 1
            p2_name = GetAnnotationName(p2, hasAnnotation, dbi,
                                        hasAnnotationRepeat, dbirepeat)
            if not p1_name == p2_name:  # count once for self-interaction
                if p2_name not in part:
                    part[p2_name] = 1
                else:
                    part[p2_name] += 1
            if p1_name == p2_name:
                # self-interaction: merge both spans into one record
                if p1_name not in selfinteraction:
                    selfinteraction[p1_name] = copy.deepcopy(p1)
                else:
                    selfinteraction[p1_name].Update(p1.start, p1.end)
                    selfinteraction[p1_name].Update(p2.start, p2.end)
                    selfinteraction[p1_name].cluster += 1
            else:
                # canonical ordering so "A--B" and "B--A" share one key
                if p1_name > p2_name:
                    p1, p2 = p2, p1
                    p1_name, p2_name = p2_name, p1_name
                inter_name = p1_name + "--" + p2_name
                if inter_name not in interaction:
                    interaction[inter_name] = [copy.deepcopy(p1),
                                               copy.deepcopy(p2)]
                else:
                    interaction[inter_name][0].Update(p1.start, p1.end)
                    interaction[inter_name][1].Update(p2.start, p2.end)
                    interaction[inter_name][0].cluster += 1
        except Exception as e:
            print >> sys.stderr, e
        if k % 20000 == 0:
            print >> sys.stderr, "  Reading %d pairs of segments\r" % (k),
    print >> sys.stdout, "Get total %d pairs." % (k)
    print >> sys.stdout, "Single fragment count: %d." % (sgcount)

    print >> sys.stdout, "   number of different RNAs is %d          " % (
        len(part))

    total = k  # total pairs used
    n = 0
    k = 0  # record number of strong interactions
    for i in interaction:
        n += 1
        count = interaction[i][0].cluster
        if count < min_interaction: continue
        p1_name = i.split("--")[0]
        p2_name = i.split("--")[1]
        P1 = interaction[i][0]
        P2 = interaction[i][1]
        P1.cluster = part[p1_name]
        P2.cluster = part[p2_name]
        if part[p1_name] < min_clusterS or part[p2_name] < min_clusterS:
            continue
        # upper-tail hypergeometric test for interaction enrichment
        real_p = 1 - hypergeom.cdf(count, total, part[p1_name], part[p2_name])
        if real_p <= p_value:
            k = k + 1
            try:
                log_p = math.log(real_p)
            except:
                log_p = -float("Inf")  # real_p rounded down to 0
            print >> output, str(P1) + '\t' + str(P2) + '\t%d\t%.4f' % (count,
                                                                        log_p)
        if n % 500 == 0:
            print >> sys.stderr, "  Progress ( %d / %d )\r" % (
                n, len(interaction)),
    k1 = 0
    for i in selfinteraction:
        n += 1
        count = selfinteraction[i].cluster
        if count < min_interaction: continue
        p1_name = i
        P1 = selfinteraction[i]
        P1.cluster = part[p1_name]
        if part[p1_name] < min_clusterS: continue
        k1 = k1 + 1
        print >> outputIntra, str(P1) + '\t%d' % (count)
        if n % 500 == 0:
            print >> sys.stderr, "  Progress ( %d / %d )\r" % (
                n, len(interaction)),
    print >> sys.stdout, "# Find %d strong and %d self interactions. Cost time: %.2f s" % (
        k, k1, time() - t1)
    # close handles explicitly so buffered results are flushed to disk
    inp.close()
    output.close()
    outputIntra.close()
Esempio n. 33
0
def genome_annotation(outputbam, annotationfile, detail, readfilename,
                      unmapfilename, strandenforced=False, posstrand=True,
                      requireUnique=False, results_dict=None):
    """Annotate genome-mapped BAM records and collect unmapped reads.

    For each record in ``outputbam`` that passes ``Included()``, build a
    tab-separated annotation line (with RNA type/name/subtype when
    ``annotationfile`` is given) and store it in a dict keyed by read name;
    non-unique reads are stored as a list of lines.  Records that fail the
    filters are written to ``unmapfilename`` as FASTA.

    annotationfile -- bed annotation database path (falsy to skip annotation)
    detail         -- db_detail file for the detailed bed database
    readfilename   -- unused here; kept for interface compatibility
    results_dict   -- optional dict of previous results to merge into the
                      return value (default: fresh empty dict)

    Returns the merged {read name: annotation line(s)} dict.
    """
    # Avoid the shared-mutable-default pitfall: build a fresh dict per call.
    if results_dict is None:
        results_dict = dict()

    if annotationfile:
        dbi1 = DBI.init(annotationfile, "bed")
        dbi2 = DBI.init(detail, "bed")
        # NOTE(review): hard-coded repeat annotation path — not portable.
        dbi3 = DBI.init("/home/yu68/bharat-interaction/new_lincRNA_data/mouse.repeat.txt", "bed")

    newdict = dict()
    funmap = open(unmapfilename, 'w')

    def _store(qname, anno_line, is_unique):
        # Record one annotation line; keep a plain string for the first hit
        # of a unique read, a list once a read is non-unique or repeated.
        if not qname in newdict:
            newdict[qname] = anno_line
            if not is_unique:
                # not unique
                newdict[qname] = [newdict[qname]]
        else:
            if type(newdict[qname]) is str:
                newdict[qname] = [newdict[qname]]
            newdict[qname].append(anno_line)

    for record in outputbam:
        IsMapped = False

        if Included(record, requireUnique):
            strandactual = ("+" if posstrand else "-")
            strand = "+"
            if record.is_reverse:
                strandactual = ("-" if posstrand else "+")
                strand = "-"
            if annotationfile:
                bed = Bed([outputbam.getrname(record.tid), record.pos, record.aend, '.', 0.0, strandactual])
                [typ, name, subtype, strandcol] = annotation(bed, dbi1, dbi2, dbi3)
                if (not strandenforced) or strandcol == 'ProperStrand':
                    curr_anno_arr = (str(f) for f in [outputbam.getrname(record.tid), record.pos, record.aend, strand, record.seq, 'genome', typ, name, subtype, strandcol])
                    _store(record.qname, '\t'.join(curr_anno_arr), Included(record, True))
                    IsMapped = True
            else:
                strandcol = '.'
                curr_anno_arr = (str(f) for f in [outputbam.getrname(record.tid), record.aend - record.alen + 1, record.aend, strand, record.seq, 'genome', strandcol])
                _store(record.qname, '\t'.join(curr_anno_arr), Included(record, True))
                IsMapped = True

        if not IsMapped:
            # output all pairs that cannot be mapped on both sides as unmaped pairs into two fasta file
            seq = record.seq
            if record.is_reverse:
                seq = revcomp(record.seq, rev_table)
            unmap_rec = SeqRecord(Seq(seq, IUPAC.unambiguous_dna), id=record.qname, description='')
            SeqIO.write(unmap_rec, funmap, "fasta")

    funmap.close()

    # merge with any previously accumulated results (Python 2 dict merge)
    newanno = dict(results_dict.items() + newdict.items())
    return newanno
Esempio n. 34
0
def Main():
    """Classify query intervals against a genebed database.

    Each input interval is labelled per overlapping gene as MiRNA,
    Non-coding, 5-UTR, 3-UTR or an exon/intron category (via judge_exon);
    intervals with no hit are Intergenic.  Per-interval lines go to the
    main output; category totals go to '<prefix>.cisStat'.
    """
    global args
    args = ParseArg()
    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            print >> sys.stderr, "can't open file ", args.output, "to write. Using stdout instead"
            out = sys.stdout

    dbi = DBI.init(args.db, "genebed")
    count = {}
    count["Intergenic"] = 0
    for x in TableIO.parse(args.input, args.input_format):
        flag = 0  # set to 1 once any overlapping gene is found
        gene = ""
        for hit in dbi.query(x):
            flag = 1
            # skip duplicate hits of the gene just processed
            if hit.align_id == gene:
                continue
            gene = hit.align_id
            #print hit
            #print hit.cds_start,hit.cds_stop
            # zero-length CDS: non-coding gene (MiRNA if id starts "Mir")
            if (hit.cds_start == hit.cds_stop):
                if hit.align_id[0:3] == "Mir":
                    loc = "MiRNA"
                else:
                    loc = "Non-coding"
            elif hit.strand == "+":
                if x.stop <= hit.cds_start:
                    loc = "5-UTR"
                elif x.start >= hit.cds_stop:
                    loc = "3-UTR"
                else:
                    loc = judge_exon(x, hit)

            else:
                # minus strand: UTR sides are mirrored
                if x.stop <= hit.cds_start:
                    loc = "3-UTR"
                elif x.start >= hit.cds_stop:
                    loc = "5-UTR"
                else:
                    loc = judge_exon(x, hit)
            print >> out, "\t".join(
                str(f) for f in [
                    x.chr, x.start, x.stop, x.id, x.score, x.strand,
                    hit.align_id, loc
                ])
            if count.has_key(loc):
                count[loc] += 1
            else:
                count[loc] = 1

        if flag == 0:
            print >> out, "\t".join(
                str(f) for f in [
                    x.chr, x.start, x.stop, x.id, x.score, x.strand, "None",
                    "Intergenic"
                ])
            count["Intergenic"] += 1

    # NOTE(review): when args.output == "stdout" this writes "stdout.cisStat";
    # out2 is also never closed explicitly — confirm intent.
    out2 = open(args.output.split(".")[0] + ".cisStat", "w")
    for key in sorted(count.keys()):
        print >> out2, key + "\t" + str(count[key])
Esempio n. 35
0
def Main():
    """Query two bamlist databases (A and B) over regions and print results.

    Regions come from, in priority order: an explicit -r region, an
    annotation file, or 1 Mb bins tiling a chromSize file.
    """
    global args,chrs,lengths,out
    args=ParseArg()
    if args.out=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.out,"w")
        except IOError:
            print >>sys.stderr,"can't open file",args.out,"to write, using stdout instead"
            out=sys.stdout

    # bamlist files take precedence over single bam arguments
    if args.bamlistA:
        dbi_A=DBI.init(args.bamlistA,"bamlist")
    else:
        dbi_A=DBI.init(args.bamA,"bamlist")
    if args.bamlistB:
        dbi_B=DBI.init(args.bamlistB,"bamlist")
    else:
        dbi_B=DBI.init(args.bamB,"bamlist")

    print_header()
    '''
    Priority:
    Region > Annotations > chromSize
    '''
    if args.region:
        '''
        Query Only Region
        '''
        i=parseRegion(args.region)
        for aps in QueryBed(i,dbi_A,dbi_B):
                print >>out,aps
    elif args.annotations:
        '''
        Query Regions in Bed file or VCF file etc.
        '''
        for i in TableIO.parse(args.annotations,args.annotation_format):
            for aps in QueryBed(i,dbi_A,dbi_B):
                print >>out,aps
    elif args.chromsize:
        '''
        Query Whole Genome
        Chromsize File Example:
        chr1    249250621
        chr2    243199373
        chr3    198022430
        .
        .
        .

        '''
        # tile each chromosome with fixed 1 Mb bins (last bin truncated)
        for x in TableIO.parse(args.chromsize):
            (chr,size)=x
            binsize=1000000
            chr=chr.strip()
            for i in xrange(0,size,binsize):
                start=i
                stop=i+binsize
                if stop>size: stop=size
                bed=Bed([chr,start,stop,".",".","."])
                for aps in QueryBed(bed,dbi_A,dbi_B):
                    print >>out,aps
    else:
        # no region source given; nothing to query
        print >>sys.stderr," at least one of the options -r,-g,-a are required"
        exit(0)
    s: start column number for second part of interaction
    '''
    a=open(File,'r')
    for l in a.read().split('\n'):
        if l.strip()=="": continue
        lsep=l.split('\t')
        if lsep[3] in ['+','-']:
            bed1=Bed(lsep[0:3],strand=lsep[3])
            bed2=Bed(lsep[s:(s+3)],strand=lsep[s+3])
        else:
            bed1=Bed(lsep[0:3])
            bed2=Bed(lsep[s:(s+3)])
        yield (bed1,bed2,lsep)

# annotation files
# Script body: re-annotate both halves of each interaction line read from
# sys.argv[1] (columns 3-5 and 10-12) and print the updated lines to stdout.
# NOTE(review): hard-coded annotation paths — not portable across machines.
db="/home/yu68/bharat-interaction/new_lincRNA_data/all_RNAs-rRNA_repeat.txt"
db_detail="/home/yu68/bharat-interaction/new_lincRNA_data/Ensembl_mm9.genebed"
db_repeat="/home/yu68/bharat-interaction/new_lincRNA_data/mouse.repeat.txt"
print >>sys.stderr, "Indexing annotation files..."
ref_allRNA=DBI.init(db,"bed") # the DBI init file for bed6 file of all kinds of RNA
ref_detail=DBI.init(db_detail,"bed") # the DBI init file for bed12 file of lincRNA and mRNA with intron, exon, UTR
ref_repeat=DBI.init(db_repeat,"bed")

print >>sys.stderr, "Start to update..."
# read_interaction yields (bed1, bed2, raw columns); 7 = start column of part2
for l in read_interaction(sys.argv[1],7):
    l[2][3:6] = annotation(l[0],ref_allRNA,ref_detail,ref_repeat)
    l[2][10:13] = annotation(l[1],ref_allRNA,ref_detail,ref_repeat)
    print "\t".join(l[2])

    
Esempio n. 37
0
def Main():
    """Simulate paired-end reads for a barcoded RNA-linker ligation library
    (unformatted twin of the earlier simulator example): sample fragment
    classes from args.parameter, assemble fragments from random RNA regions,
    linkers and a barcode, and emit FASTQ pairs plus a truth table.
    """
    args=ParseArg()
    # Paired FASTQ outputs for R1/R2 of every simulated fragment.
    fastq1=open("simulated_"+str(args.num)+"_read_R1.fastq","w")
    fastq2=open("simulated_"+str(args.num)+"_read_R2.fastq","w")


    RNA=TableIO.parse(args.annotation,'bed')

    # create a dictionary for all RNAs pools except rRNA
    # (chrM / chrNT entries are excluded as well; key = prefix of bed id)
    RNAs = {}
    for b in RNA:
        if b.id.startswith('rRNA'): continue
        if b.chr.startswith('chrM') or b.chr.startswith('chrNT'): continue
        Type = b.id.split(".")[0]
        if Type in RNAs:
            RNAs[Type].append(b)
        else:
            RNAs[Type]=[b]
    
    #---------------- read linker seq ------------------
    linkers=[]
    for i in open(args.linker,'r'):
        i=i.strip()
        linkers.append(i)
    #---------------------------------------------------

    #---------------- read barcode ---------------------
    barcodes=[]
    for i in open(args.barcode,'r'):
        i=i.strip()
        barcodes.append(i)
    #---------------------------------------------------
    
 
    # sample different classes: LinkerOnly, Nolinker, RNA1-linker, linker-RNA2, RNA1-linker-RNA2
    xk = range(5)
    pk = args.parameter
    custm = stats.rv_discrete(name='custm', values=(xk, pk))
    Class_index = custm.rvs(size=args.num)
    # specify output (truth table describing each simulated fragment)
    out = open(args.output,'w')


    # initiate the annotation database
    if args.db_detail:
        print >> sys.stderr, " # Index for the annotation database"
        dbi1=DBI.init(args.annotation,"bed")
        dbi2=DBI.init(args.db_detail,"bed")
        # NOTE(review): hard-coded repeat annotation path — not portable.
        dbi3=DBI.init("/home/yu68/bharat-interaction/new_lincRNA_data/mouse.repeat.txt","bed")

    print >> sys.stderr, " # Start to simulate reads"
    t0 = time.time()
    for i in range(0,args.num):
        pair_id = "read_"+str(i)
        # barcode = 4 random nt + fixed barcode + 2 random nt
        randSeq = "".join([random.choice("ACGT") for x in range(6)])
        barcode = randSeq[0:4]+barcodes[0]+randSeq[4:6]

        index = Class_index[i]  # index for different classes of fragments
        # Sample RNA1 and RNA2 (random region of 15-149 nt from the pools)
        RNA1_len = random.randrange(15,150)
        b,Type = randRegion(RNA1_len,RNAs)
        RNA1_seq = fetchSeq(b.chr,b.start,b.stop,b.strand,args.genomeFa,args.spath)
        if args.db_detail:
            [name1,typ1,subtype1]=annotation(b,dbi1,dbi2,dbi3)
            RNA1_str = "\t".join(str(f) for f in [b.chr,b.start,b.stop,b.strand,name1,typ1,subtype1])
        else:
            RNA1_str = "\t".join(str(f) for f in [b.chr,b.start,b.stop,b.strand,Type])
        RNA2_len = random.randrange(15,150)
        b,Type = randRegion(RNA2_len,RNAs)
        RNA2_seq = fetchSeq(b.chr,b.start,b.stop,b.strand,args.genomeFa,args.spath)
        if args.db_detail:
            [name2,typ2,subtype2]=annotation(b,dbi1,dbi2,dbi3)
            RNA2_str = "\t".join(str(f) for f in [b.chr,b.start,b.stop,b.strand,name2,typ2,subtype2])
        else:
            RNA2_str = "\t".join(str(f) for f in [b.chr,b.start,b.stop,b.strand,Type])
       
        # fragment is the recovered cDNA fragment       
        if index == 1:  # single RNA or RNA1-RNA2
            # NOTE(review): the two inline comments here were swapped in the
            # original; they now match the record type actually emitted.
            if random.choice([0,1])==0:  # RNA1-RNA2 chimera (no linker)
                fragment = barcode+RNA1_seq+RNA2_seq  
                print >> out, pair_id+"\t%d\tRNA1-RNA2\t0"%(len(fragment))+"\t"+RNA1_str+'\t'+RNA2_str
            else:  # single RNA
                fragment = barcode+RNA1_seq
                print >> out, pair_id+"\t%d\tsingleRNA\t0"%(len(fragment))+"\t"+RNA1_str
        else:
            linker_n = random.choice([1,2])  # number of linkers in fragment
            linker = "".join([linkers[0]]*linker_n)
            if index == 0:
                fragment = barcode+linker
                print >> out, pair_id+"\t%d\tlinkerOnly\t%d"%(len(fragment),linker_n)
            elif index == 2:
                fragment = barcode+RNA1_seq+linker
                print >> out, pair_id+"\t%d\tRNA1-linker\t%d"%(len(fragment),linker_n)+"\t"+RNA1_str
            elif index == 3:
                fragment = barcode+linker+RNA2_seq
                print >> out, pair_id+"\t%d\tlinker-RNA2\t%d"%(len(fragment),linker_n)+"\t"+RNA2_str
            elif index == 4:
                fragment = barcode+RNA1_seq+linker+RNA2_seq
                print >> out, pair_id+"\t%d\tRNA1-linker-RNA2\t%d"%(len(fragment),linker_n)+"\t"+RNA1_str+"\t"+RNA2_str

        # sequencing error model and per-base qualities (Phred 10-39)
        read1,read2 = generatePairs(fragment,args.len,args.errorRate)
        score=[]
        for j in range(0, args.len):
            score.append(random.randrange(10,40))
        record1 = SeqRecord(Seq(read1,generic_dna),id=pair_id)
        record1.letter_annotations["phred_quality"] = score
        record2 = SeqRecord(Seq(read2,generic_dna),id=pair_id)
        record2.letter_annotations["phred_quality"] = score
        SeqIO.write(record1,fastq1,"fastq")
        SeqIO.write(record2,fastq2,"fastq")
     
        if i%100==0:
            print >>sys.stderr, "generate pairs %d\r"%(i),
    fastq1.close()
    fastq2.close()
    out.close()
    print time.time()-t0
Esempio n. 38
0
def Main():
    """
    IO TEMPLATE

    Annotate each interval read from args.input with the gene features it
    overlaps: for every query record, print the record followed by the IDs
    of overlapping genes, upstream/downstream regions, exons, introns and
    3'/5' UTRs (one tab-separated column per feature class).
    """
    global args, out
    args = ParseArg()
    # Output: fall back to stdout when the requested file cannot be opened.
    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            print >>sys.stderr, "can't open file ", args.output, "to write. Using stdout instead"
            out = sys.stdout
    # Input: transparently handle gzip-compressed files (by ".gz" suffix).
    if args.input == "stdin":
        fin = sys.stdin
    else:
        try:
            x = args.input.split(".")
            if x[-1] == "gz":
                fin = gzip.open(args.input, "r")
            else:
                fin = open(args.input, "r")
        except IOError:
            print >>sys.stderr, "can't read file", args.input
            fin = sys.stdin
    """
    END OF IO TEMPLATE 
    """
    # Provenance header: program name, date and the full command line.
    print >> out, "# This data was generated by program ", sys.argv[0], " (version: %s)" % VERSION,
    print >> out, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >> out, "# Date: ", time.asctime()
    print >> out, "# The command line is :"
    print >> out, "#\t", " ".join(sys.argv)
    # Build one queryable index per feature class from the gene table.
    gene = DBI.init(args.genetab, args.gene_format)
    upstream_list = []
    downstream_list = []
    exons_list = []
    introns_list = []
    utr3_list = []
    utr5_list = []
    for g in gene:
        upstream_list.append(g.upstream(args.upstream))
        downstream_list.append(g.downstream(args.downstream))
        for e in g.Exons():
            exons_list.append(e)
        for i in g.Introns():
            introns_list.append(i)
        # utr3()/utr5() may return None; skip those genes.
        if not (g.utr3() is None):
            utr3_list.append(g.utr3())
        if not (g.utr5() is None):
            utr5_list.append(g.utr5())
    upstream = DBI.init(upstream_list, "bed")
    downstream = DBI.init(downstream_list, "bed")
    exons = DBI.init(exons_list, "bed")
    introns = DBI.init(introns_list, "bed")
    utr3 = DBI.init(utr3_list, "genebed")
    utr5 = DBI.init(utr5_list, "genebed")

    if args.format == "guess":
        args.format = Tools.guess_format(args.input)
    for (i0, i) in enumerate(TableIO.parse(fin, args.format)):
        if i0 == 0:
            # Header columns depend on the record type of the first entry.
            if isinstance(i, Bed12):
                print >> out, "#chr\tstart\tend\tname\tscore\tstrand\tthick_start\tthick_end\titem_rgb\tblock_count\tblock_sizes\tblock_starts\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5"
            elif isinstance(i, GeneBed):
                print >> out, "#name\tchr\tstrand\tstart\tend\tcds_start\texon_count\texon_starts\texont_ends\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5"
            else:
                print >> out, "#chr\tstart\tend\tname\tscore\tstrand\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5"

        # One output row per query: record, then hit IDs per feature class.
        print >> out, i,
        print >> out, "\t", toIDs(gene.query(i)),

        print >> out, "\t", toIDs(upstream.query(i)),
        print >> out, "\t", toIDs(downstream.query(i)),
        print >> out, "\t", toIDs(exons.query(i)),
        print >> out, "\t", toIDs(introns.query(i)),
        print >> out, "\t", toIDs(utr3.query(i)),
        print >> out, "\t", toIDs(utr5.query(i))
Esempio n. 39
0
def Main():
    '''
    IO TEMPLATE

    Compare two VCF files.  Each entry of A is classified against B:
    A11/B11 = same position and same entry, A12/B12 = same position but
    different entry, A10 = only in A; a second pass reports B01 = only in B.
    '''
    global args,out
    args=ParseArg()
    # Output: fall back to stdout when the requested file cannot be opened.
    if args.output=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    '''
    END OF IO TEMPLATE 
    '''
    # Legend for the output categories.
    print >>out,"# QUERY (VCF A):",args.VCF_A
    print >>out,"# DATA  (VCF B):",args.VCF_B
    print >>out,"# A11 VCF in A and B, and alt nt is the same : A VCF entry" 
    print >>out,"# B11 VCF in A and B, and alt nt is the same : B VCF entry" 
    print >>out,"# A12 VCF position in A and B, and alt nt is not the same : A VCF entry" 
    print >>out,"# B12 VCF position in A and B, and alt nt is not the same : B VCF entry" 
    print >>out,"# A10 VCF, only exists in A"
    print >>out,"# B01 VCF, only exists in B"
    print >>sys.stderr,"Initialize data: reading ",args.VCF_A
    VCF_A_DBI=DBI.init(args.VCF_A,"vcf")
    print >>sys.stderr,"Initialize data: reading ",args.VCF_B
    VCF_B_DBI=DBI.init(args.VCF_B,"vcf")
    
    # Category counters.
    A11=0
    A12=0
    A10=0
    B01=0
    
    i0=0  # (unused in the remainder of this function)
    print >>sys.stderr,"Query ",args.VCF_A
    for (x,i) in enumerate(VCF_A_DBI):
        if x%1000==0: print >>sys.stderr,x," entries\r",
        # flag: 0 = no overlap, 1 = exact match found, 2 = position-only match.
        flag=0
        hit=None
        for j in VCF_B_DBI.query(i):
            if i==j: 
                hit=j
                flag=1
                # NOTE(review): 'continue' keeps scanning, so a later
                # non-identical hit overwrites flag=1 with flag=2 below;
                # 'break' may have been intended — confirm.
                continue
            else:
                hit=j
                flag=2
        if flag==1:
            print >>out,"A11_%08d\t"%A11,i
            print >>out,"B11_%08d\t"%A11,hit
            print >>out,""
            A11+=1
        elif flag==2:
            print >>out,"A12_%08d\t"%A12,i
            print >>out,"B12_%08d\t"%A12,hit
            print >>out,""
            A12+=1
        else:
            print >>out,"A10_%08d\t"%A10,i
            print >>out,""
            print >>out,""
            A10+=1
    # Second pass: entries of B with no overlap in A.
    print >>sys.stderr,"Query ",args.VCF_B
    for (x,i) in enumerate(VCF_B_DBI):
        if x%1000==0: print >>sys.stderr,x," entries\r",
        flag=0
        for j in VCF_A_DBI.query(i):
            flag=1
        if flag==0:
            print >>out,"B01_%08d\t"%B01,i
            print >>out,""
            print >>out,""
            B01+=1
    # Summary counts.
    print >>out,"# [AB]11 number:",A11
    print >>out,"# [AB]12 number:",A12
    print >>out,"# A10 number:",A10
    print >>out,"# B01 number:",B01
Esempio n. 40
0
def Main():
    """
    Query each input interval against several annotation databases and
    report, per query, which databases it overlaps.  Builds a per-query
    "code" string (e.g. "@101") with one 0/1 digit per database, and
    prints per-database and per-code summary counts at the end.
    """
    global args,out
    args=ParseArg()
    # Output: fall back to stdout when the requested file cannot be opened.
    if args.output=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout

    # Normalize db_format: default everything to "bed"; a single format
    # is broadcast to all databases; otherwise counts must match.
    db_format=args.db_format
    if len(db_format)==0:
        db_format=["bed" for i in range(len(args.db))]
    if len(db_format)==1:
        db_format=[db_format[0] for i in range(len(args.db))]
    if len(db_format)!=len(args.db):
        print >>sys.stderr,"the number of annotation files is not same with the number of annotation formats"
        print >>sys.stderr,"db format",db_format
        print >>sys.stderr,"db ",args.db
        exit(0)
       
    print >>out,"# Input:",args.input
    dbis=[]
    hits=[]  #count the hits
    hcode={}  # code string -> number of queries with that overlap pattern
    for i,f in enumerate(args.db):
        print >>out,"# Features File No."+str(i+1),":",f
        dbis.append(DBI.init(f,db_format[i]))
        hits.append(0)
    query_num=0
    for bed in TableIO.parse(args.input,args.input_format):
        # args.m suppresses the verbose per-hit output (summary-only mode).
        if not args.m:
            print >>out,"QR\t",bed
        query_num+=1
        code="@"
        for i,dbi in enumerate(dbis):
            flag=0  # 1 iff this query hit database i at least once
            for hit in dbi.query(bed):
                if not args.m:
                    print >>out,"\tDB"+str(i+1)+" HT\t",hit
                flag=1
            hits[i]+=flag
            code+=str(flag)
        if hcode.has_key(code):
            hcode[code]+=1
        else:
            hcode[code]=1
        if not args.m:
            print >>out,"CD "+code,"\t",bed
            print >>out,""
            print >>out,""
        else:
            print >>out,bed,"\t","CD "+code

    # Per-database overlap counts followed by per-code counts.
    for i,x in enumerate(hits):
        print >>out,"#",x,"/",query_num,"overlap with No."+str(i+1),args.db[i]

    for key in sorted(hcode.keys()):
        print >>out,"# code:"+key,"\t",hcode[key]
Esempio n. 41
0
def Main():
    """
    IO TEMPLATE

    Compare two VCF files (reformatted duplicate of the compact version
    above).  Each entry of A is classified against B: A11/B11 = same
    position and same entry, A12/B12 = same position but different entry,
    A10 = only in A; a second pass reports B01 = only in B.
    """
    global args, out
    args = ParseArg()
    # Output: fall back to stdout when the requested file cannot be opened.
    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            print >>sys.stderr, "can't open file ", args.output, "to write. Using stdout instead"
            out = sys.stdout
    """
    END OF IO TEMPLATE 
    """
    # Legend for the output categories.
    print >> out, "# QUERY (VCF A):", args.VCF_A
    print >> out, "# DATA  (VCF B):", args.VCF_B
    print >> out, "# A11 VCF in A and B, and alt nt is the same : A VCF entry"
    print >> out, "# B11 VCF in A and B, and alt nt is the same : B VCF entry"
    print >> out, "# A12 VCF position in A and B, and alt nt is not the same : A VCF entry"
    print >> out, "# B12 VCF position in A and B, and alt nt is not the same : B VCF entry"
    print >> out, "# A10 VCF, only exists in A"
    print >> out, "# B01 VCF, only exists in B"
    print >>sys.stderr, "Initialize data: reading ", args.VCF_A
    VCF_A_DBI = DBI.init(args.VCF_A, "vcf")
    print >>sys.stderr, "Initialize data: reading ", args.VCF_B
    VCF_B_DBI = DBI.init(args.VCF_B, "vcf")

    # Category counters.
    A11 = 0
    A12 = 0
    A10 = 0
    B01 = 0

    i0 = 0  # (unused in the remainder of this function)
    print >>sys.stderr, "Query ", args.VCF_A
    for (x, i) in enumerate(VCF_A_DBI):
        if x % 1000 == 0:
            print >>sys.stderr, x, " entries\r",
        # flag: 0 = no overlap, 1 = exact match found, 2 = position-only match.
        flag = 0
        hit = None
        for j in VCF_B_DBI.query(i):
            if i == j:
                hit = j
                flag = 1
                # NOTE(review): 'continue' keeps scanning, so a later
                # non-identical hit overwrites flag=1 with flag=2 below;
                # 'break' may have been intended — confirm.
                continue
            else:
                hit = j
                flag = 2
        if flag == 1:
            print >> out, "A11_%08d\t" % A11, i
            print >> out, "B11_%08d\t" % A11, hit
            print >> out, ""
            A11 += 1
        elif flag == 2:
            print >> out, "A12_%08d\t" % A12, i
            print >> out, "B12_%08d\t" % A12, hit
            print >> out, ""
            A12 += 1
        else:
            print >> out, "A10_%08d\t" % A10, i
            print >> out, ""
            print >> out, ""
            A10 += 1
    # Second pass: entries of B with no overlap in A.
    print >>sys.stderr, "Query ", args.VCF_B
    for (x, i) in enumerate(VCF_B_DBI):
        if x % 1000 == 0:
            print >>sys.stderr, x, " entries\r",
        flag = 0
        for j in VCF_A_DBI.query(i):
            flag = 1
        if flag == 0:
            print >> out, "B01_%08d\t" % B01, i
            print >> out, ""
            print >> out, ""
            B01 += 1
    # Summary counts.
    print >> out, "# [AB]11 number:", A11
    print >> out, "# [AB]12 number:", A12
    print >> out, "# A10 number:", A10
    print >> out, "# B01 number:", B01
Esempio n. 42
0
def Main():
    '''
    IO TEMPLATE

    Annotate each interval read from args.input with the gene features it
    overlaps (compact duplicate of the reformatted version above): prints
    each record followed by IDs of overlapping genes, upstream/downstream
    regions, exons, introns and 3'/5' UTRs.
    '''
    global args,out
    args=ParseArg()
    # Output: fall back to stdout when the requested file cannot be opened.
    if args.output=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    # Input: transparently handle gzip-compressed files (by ".gz" suffix).
    if args.input=="stdin":
        fin=sys.stdin
    else:
        try:
            x=args.input.split(".")
            if x[-1]=="gz":
                fin=gzip.open(args.input,"r")
            else:
                fin=open(args.input,"r")
        except IOError:
            print >>sys.stderr,"can't read file",args.input
            fin=sys.stdin
    '''
    END OF IO TEMPLATE 
    '''
    # Provenance header: program name, date and the full command line.
    print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :"
    print >>out,"#\t"," ".join(sys.argv)
    # Build one queryable index per feature class from the gene table.
    gene=DBI.init(args.genetab,args.gene_format);
    upstream_list=[]
    downstream_list=[]
    exons_list=[]
    introns_list=[]
    utr3_list=[]
    utr5_list=[]
    for g in gene:
        upstream_list.append(g.upstream(args.upstream));
        downstream_list.append(g.downstream(args.downstream));
        for e in g.Exons():
            exons_list.append(e)
        for i in g.Introns():
            introns_list.append(i)
        # utr3()/utr5() may return None; skip those genes.
        if not (g.utr3() is None):
            utr3_list.append(g.utr3())
        if not (g.utr5() is None):
            utr5_list.append(g.utr5())
    upstream=DBI.init(upstream_list,"bed")
    downstream=DBI.init(downstream_list,"bed")
    exons=DBI.init(exons_list,"bed")
    introns=DBI.init(introns_list,"bed")
    utr3=DBI.init(utr3_list,"genebed")
    utr5=DBI.init(utr5_list,"genebed")



    if args.format=="guess":
        args.format=Tools.guess_format(args.input)
    for (i0,i) in enumerate(TableIO.parse(fin,args.format)):
        if i0==0:
            # Header columns depend on the record type of the first entry.
            if isinstance(i,Bed12):
                print >>out,"#chr\tstart\tend\tname\tscore\tstrand\tthick_start\tthick_end\titem_rgb\tblock_count\tblock_sizes\tblock_starts\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5"
            elif isinstance(i,GeneBed):
                print >>out,"#name\tchr\tstrand\tstart\tend\tcds_start\texon_count\texon_starts\texont_ends\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5"
            else:
                print >>out,"#chr\tstart\tend\tname\tscore\tstrand\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5"
        


        # One output row per query: record, then hit IDs per feature class.
        print >>out,i,
        print >>out,"\t",toIDs(gene.query(i)),

        print >>out,"\t",toIDs(upstream.query(i)),
        print >>out,"\t",toIDs(downstream.query(i)),
        print >>out,"\t",toIDs(exons.query(i)),
        print >>out,"\t",toIDs(introns.query(i)),
        print >>out,"\t",toIDs(utr3.query(i)),
        print >>out,"\t",toIDs(utr5.query(i))
Esempio n. 43
0
def Main():
    '''
    Collect splice-site evidence from spliced alignments.

    Parses the input as bam2bed12 records, tallies how many reads support
    each intron donor (head) and acceptor (tail) site, then prints the
    sites sorted in genomic order as BED entries whose score is the read
    count.  When a genome is supplied via args.genome, the site sequence
    is appended to each output line.
    '''
    global args,out
    args=ParseArg()
    fin=IO.fopen(args.input,"r")
    out=IO.fopen(args.output,"w")
    # Provenance header: program name, date and the full command line.
    print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :"
    print >>out,"#\t"," ".join(sys.argv)

    donorSites={}       # "chr\tstart\tend\tstrand" -> supporting read count
    acceptorSites={}
    # The genome is optional; sequence columns are only printed when present.
    if args.genome is not None:
        genome=DBI.init(args.genome,"genome")
    else:
        genome=None

    for j,i in enumerate(TableIO.parse(fin,"bam2bed12",references=fin.references,strand=args.strand)):
        if j%1000==0: print >>sys.stderr,"processing ",j,"reads               \r",
        for intron in i.Introns():
            # Ignore implausibly short gaps.
            if len(intron)< args.intron_min_length: continue
            donor=intron.head()
            donorID=bedToID(donor)
            if donorSites.has_key(donorID):
                donorSites[donorID]+=1
            else:
                donorSites[donorID]=1
            acceptor=intron.tail()
            acceptorID=bedToID(acceptor)
            if acceptorSites.has_key(acceptorID):
                acceptorSites[acceptorID]+=1
            else:
                acceptorSites[acceptorID]=1

    # Emit donor sites sorted in genomic order, renamed donor_<i>.
    donors=[]
    for key in donorSites.keys():
        a=key.split("\t")
        donors.append(Bed([a[0],a[1],a[2],"noname_donor",donorSites[key],a[3]]))
    donors.sort()
    for i,x in enumerate(donors):
        x.id="donor_"+str(i)
        # BUG FIX: the original called genome.query() unconditionally and
        # crashed with AttributeError when --genome was not supplied.
        if genome is not None:
            print >>out,x,"\t",genome.query(x).upper()
        else:
            print >>out,x

    # Emit acceptor sites sorted in genomic order, renamed acceptor_<i>.
    acceptors=[]
    for key in acceptorSites.keys():
        a=key.split("\t")
        acceptors.append(Bed([a[0],a[1],a[2],"noname_acceptor",acceptorSites[key],a[3]]))
    acceptors.sort()
    for i,x in enumerate(acceptors):
        x.id="acceptor_"+str(i)
        if genome is not None:
            print >>out,x,"\t",genome.query(x).upper()
        else:
            print >>out,x
def Main():
    """
    Count reads around high-confidence nucleosome centers from Danpos
    output.  For each nucleosome passing the p-value filters, query every
    bam/bed track in a window around the center, compute a count and a
    positional offset via getCount(), and append both per track to the
    output line.  With --verbose also writes shifted nucleosome BED files.
    """
    args=ParseArg()

    #store bed files with indexing and count information:
    bam={}  # track name -> DBI index over the corresponding bam/bed file

    print >>sys.stderr,"Starting index bam/bed files:"
    for i in range(len(args.bams)):
        temp_name=args.name[i]
        print >>sys.stderr,"  #Indexing for bam/bed file of",temp_name,"\r",
        bam[temp_name]=DBI.init(args.bams[i],args.fmt)
    
    print >>sys.stderr
    print >>sys.stderr,"Reading nucleosome peak xls file from Danpos."
    nucleosomes=TableIO.parse(args.nucleosome,'metabed',header=True)

    print >>sys.stderr,"Initial output files..."

    out=open(args.output,"w")
    # -- for verbose ---
    if args.verbose:
        out_mark=[]
        for n in args.name:
            out_mark.append(open(n+'_shift_nucleosomes.bed','w'))
    # ------------------ 
    # Re-emit the Danpos header with extra per-track count/offset columns.
    line_head=open(args.nucleosome,'r').readline().strip()
    line_head=line_head+"\t"+"\t".join(str(f) for f in args.name)+'\t'+"\t".join(str(f)+'_off' for f in args.name)
    print >>out,line_head
    
    print >>sys.stderr,"Start Counting..."
    num=0
    t0 = time()
    for i in nucleosomes:
        chrom=i.chr
        if i.smt_pval>0.01 or i.fuzziness_pval>0.01: continue # only choose nucleosomes with high value and low fuzziness   
        # Skip unplaced contigs and the mitochondrial chromosome.
        if "random" in chrom or chrom == 'chrM':
            continue
        num=num+1
        center=int(i.start+i.end)/2
        count=np.zeros(len(args.bams),dtype="float")
        offset=np.zeros(len(args.bams),dtype='int')
        line=str(i)
        for k,name in enumerate(args.name):
            # NOTE(review): ma, half_len and rangeS are module-level globals
            # defined outside this function — confirm their values there.
            if args.fmt=='bam':
                query=bam[name].query(Bed([chrom,center-ma-(half_len-75)-rangeS,center+ma+(half_len-75)+rangeS]),method='fetch')
            else:
                query=bam[name].query(Bed([chrom,center-ma-(half_len-75)-rangeS,center+ma+(half_len-75)+rangeS]))
            read_centers=[]
            for j in query:
                read_centers.append(find_center(j,args.fmt))
            # getCount returns (offset, count) for this track at this center.
            [o,c]=getCount(read_centers,center)
            count[k]=c
            offset[k]=o
            # -- for verbose ---
            if args.verbose:
                print >>out_mark[k],chrom+'\t%d\t%d'%(i.start+o,i.end+o)
            # ------------------
        line = line + "\t" + "\t".join(str(f) for f in count) + '\t' + "\t".join(str(f) for f in offset)
        if num%20000==0:
            t1 = time()
            print >>sys.stderr,"processing %dth nucleosome..., time: %.2fs."%(num,t1-t0),'\r',
            t0 = time()    
        print >>out,line
    print
    out.close()
    
    # -- for verbose ---
    if args.verbose:
        for k in out_mark:
            k.close()
Esempio n. 45
0
def Main():
    """
    Generic query tool: look up every entry of args.input in a single
    annotation database (args.db) and print QR/HT records, plus summary
    counts of queries, queries with hits, total query length and total
    hit number.  Formats are guessed from file suffixes when requested.
    """
    global args,out
    args=ParseArg()
    dict={}  # extra keyword arguments forwarded to dbi.query()
    # Output: fall back to stdout when the requested file cannot be opened.
    if args.output=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    # Provenance header (program basename, date, command line).
    argv=sys.argv
    argv[0]=argv[0].split("/")[-1]
    print >>out,"# This data was generated by program ",argv[0],"(version %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :\n#\t"," ".join(argv)
    init_dict={}
    # Guess the database format; ".gz" implies a tabix-indexed file whose
    # inner format is guessed separately.
    if args.dbformat=="guess":
        if Tools.suffix(args.db)=="gz": 
            args.dbformat="tabix"
            args.tabix_format=Tools.guess_format(args.db)
        else:
            args.dbformat=Tools.guess_format(args.db)

    if args.query_method:
        dict["method"]=args.query_method
    if args.tabix_format:
        init_dict["tabix"]=args.tabix_format

    dbi=DBI.init(args.db,args.dbformat,**init_dict)
    hits=0
    query=0
    if args.input=="stdin":
        input=sys.stdin
    else:
        input=args.input

    query_length=0
    hits_number=0
    if (args.input_format=="guess"):
        args.input_format=Tools.guess_format(args.input)
    for (i0,x) in enumerate(TableIO.parse(input,args.input_format)):
        if i0%100==0:
            print >>sys.stderr,"query ",i0," entries\r",
        print >>out,"QR\t",x
        hit=0
        query+=1
        query_length+=len(x)
        #print dbi;#debug
        results=dbi.query(x,**dict)
        #results=dbi.query(x) #DEBUG
        #print >>sys.stderr,type(results)
        # Three result shapes: array/list of values, a plain string, or an
        # iterable of records; each is printed as HT lines.
        if isinstance(results,numpy.ndarray) or isinstance(results,list):
            print >>out,"HT\t",
            for value in results:
                print >>out,str(value)+",",
            print >>out,""
            hit=1
            hits_number+=1
        elif isinstance(results,str):
            print >>out,"HT\t",
            print >>out,results
            hit=1
            hits_number+=1

        else:
            for j in results:
                print >>out,"HT\t",j
                hit=1
                hits_number+=1

        # For tabix databases, retry with the "chr" prefix stripped.
        # NOTE(review): this mutates x.chr in place for the rest of the loop
        # body — confirm downstream consumers expect the stripped name.
        if args.dbformat=="tabix":
            x.chr=x.chr.replace("chr","")
            for j in dbi.query(x,**dict):
                print >>out,"HT\t",j
                hit=1
                hits_number+=1
        hits+=hit
    # Summary statistics.
    print >>out,"# Query Number:",query,"\n# Query Have Hits:",hits
    print >>out,"# Query Length:",query_length
    print >>out,"# Hits Number:",hits_number
Esempio n. 46
0
def Main():
    '''
    IO TEMPLATE

    Annotate each interval read from args.input with overlapping gene
    features (variant without header/type guessing): prints a fixed BED6
    header, then each record followed by IDs of overlapping genes,
    upstream/downstream regions, exons, introns and 3'/5' UTRs.
    '''
    global args,out
    args=ParseArg()
    # Output: fall back to stdout when the requested file cannot be opened.
    if args.output=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    # Input: transparently handle gzip-compressed files (by ".gz" suffix).
    if args.input=="stdin":
        fin=sys.stdin
    else:
        try:
            x=args.input.split(".")
            if x[-1]=="gz":
                fin=gzip.open(args.input,"r")
            else:
                fin=open(args.input,"r")
        except IOError:
            print >>sys.stderr,"can't read file",args.input
            fin=sys.stdin
    '''
    END OF IO TEMPLATE 
    '''
    # Provenance header: program name, date and the full command line.
    print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :"
    print >>out,"#\t"," ".join(sys.argv)
    # Build one queryable index per feature class from the gene table.
    gene=DBI.init(args.genetab,args.gene_format);
    upstream_list=[]
    downstream_list=[]
    exons_list=[]
    introns_list=[]
    utr3_list=[]
    utr5_list=[]
    for g in gene:
        upstream_list.append(g.upstream(args.upstream));
        downstream_list.append(g.downstream(args.downstream));
        for e in g.Exons():
            exons_list.append(e)
        for i in g.Introns():
            introns_list.append(i)
        # utr3()/utr5() may return None; skip those genes.
        if not (g.utr3() is None):
            utr3_list.append(g.utr3())
        if not (g.utr5() is None):
            utr5_list.append(g.utr5())
    upstream=DBI.init(upstream_list,"bed")
    downstream=DBI.init(downstream_list,"bed")
    exons=DBI.init(exons_list,"bed")
    introns=DBI.init(introns_list,"bed")
    utr3=DBI.init(utr3_list,"genebed")
    utr5=DBI.init(utr5_list,"genebed")


    print >>out,"#chr\tstart\tend\tname\tscore\tstrand\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5"
    for i in TableIO.parse(fin,args.format):
        # One output row per query: record, then hit IDs per feature class.
        print >>out,i,
        print >>out,"\t",toIDs(gene.query(i)),

        print >>out,"\t",toIDs(upstream.query(i)),
        print >>out,"\t",toIDs(downstream.query(i)),
        print >>out,"\t",toIDs(exons.query(i)),
        print >>out,"\t",toIDs(introns.query(i)),
        print >>out,"\t",toIDs(utr3.query(i)),
        print >>out,"\t",toIDs(utr5.query(i))
Esempio n. 47
0
def Main():
    """
    Find statistically strong interactions between paired genomic regions.

    Reads pairs of annotated segments, clusters part1 and part2 regions
    per chromosome in parallel (parallel-python), counts how often each
    cluster pair co-occurs, and reports pairs whose hypergeometric
    p-value is below the threshold.  With --FDR, repeats the analysis on
    ten shuffled pairings to estimate the false-discovery rate.
    """
    t1 = time()

    global min_interaction, p_value
    args = ParseArg()
    inp = open(args.input, 'r')
    min_clusterS = args.min_clusterS
    min_interaction = args.min_interaction
    p_value = args.p_value
    output = open(args.output, 'w')
    ncpus = args.parallel

    #store genomic location of part1 and part2
    part1 = []
    part2 = []

    k = 0  # number of retained (non-single-fragment) pairs; also the pair id

    print >> sys.stderr, "# Inputing data..."

    chr_list = []
    for line in inp.read().split('\n'):
        if line == '': continue
        line = line.strip().split('\t')
        p1 = annotated_bed(line[0:10], id=k)
        p2 = annotated_bed(line[11:], id=k)
        # Flatten multi-block coordinates to a single start/end span.
        if isinstance(p1.start, list):
            p1.start = int(p1.start[0])
            p1.end = int(p1.end[-1])
        if isinstance(p2.start, list):
            p2.start = int(p2.start[0])
            p2.end = int(p2.end[-1])
        # Drop pairs that look like one continuous fragment.
        if SingleFragment(p1, p2): continue
        k += 1
        part1.append(p1)
        part2.append(p2)
        if p1.chr not in chr_list: chr_list.append(p1.chr)
        if p2.chr not in chr_list: chr_list.append(p2.chr)
        if k % 20000 == 0:
            print >> sys.stderr, "  Reading %d pairs of segments\r" % (k),
    print >> sys.stderr, "Get total %d pairs." % (k)

    if len(part1) != len(part2):
        print >> sys.stderr, "## ERROR: number of regions in two part not match!!"
        sys.exit(0)

    # sort in genomic order, easy for clustering
    part1 = sorted(part1, key=attrgetter('start'))
    part1 = sorted(part1, key=attrgetter('chr'))
    part2 = sorted(part2, key=attrgetter('start'))
    part2 = sorted(part2, key=attrgetter('chr'))

    # for parallel computing
    print >> sys.stderr, "# Generating clusters for two parts..."
    # tuple of all parallel python servers to connect with
    ppservers = ()
    job_server = pp.Server(ncpus, ppservers=ppservers)
    # One clustering job per chromosome and per part.
    jobs1 = []
    jobs2 = []
    for chro in chr_list:
        part1_temp = filter(lambda p: p.chr == chro, part1)
        if len(part1_temp) > 0:
            jobs1.append(
                job_server.submit(cluster_regions, (part1_temp, min_clusterS),
                                  (annotated_bed, ), (
                                      "UnionFind",
                                      "copy",
                                  )))
        part2_temp = filter(lambda p: p.chr == chro, part2)
        if len(part2_temp) > 0:
            jobs2.append(
                job_server.submit(cluster_regions, (part2_temp, min_clusterS),
                                  (annotated_bed, ), (
                                      "UnionFind",
                                      "copy",
                                  )))

    # Collect job results: job() returns (cluster_pool, regions, ...).
    cluster_pool1 = {}
    part1 = []
    for job in jobs1:
        try:
            part1 = part1 + job()[1]
            cluster_pool1.update(job()[0])
        except:
            # NOTE(review): bare except hides the real failure; at minimum
            # the exception should be logged.
            print >> sys.stderr, "Wrong in %s, part1" % (job()[2])
            continue
    cluster_pool2 = {}
    part2 = []
    for job in jobs2:
        try:
            part2 = part2 + job()[1]
            cluster_pool2.update(job()[0])
        except:
            # NOTE(review): silently swallows part2 job failures — confirm.
            continue

    print >> sys.stderr, "   cluster number for part1 is %d          " % (
        len(cluster_pool1))
    print >> sys.stderr, "   cluster number for part2 is %d          " % (
        len(cluster_pool2))

    # sort back to pair two parts
    part1 = sorted(part1, key=attrgetter('id'))
    part2 = sorted(part2, key=attrgetter('id'))

    print >> sys.stderr, "size of part1&2:", len(part1), len(part2)

    # Count co-occurrences of each (cluster1, cluster2) pair.
    c_interaction = {}
    for i in range(len(part1)):
        region1 = str(part1[i])
        region2 = str(part2[i])
        try:
            inter = part1[i].cluster + "--" + part2[i].cluster
        except:
            print >> sys.stderr, i, part1[i].cluster, part2[i].cluster
            sys.exit()
        if c_interaction.has_key(inter):
            c_interaction[inter] += 1
        else:
            c_interaction[inter] = 1

    # annotation file
    print >> sys.stderr, "# Indexing annotation files"
    dbi_all = DBI.init(args.annotation, "bed")
    dbi_detail = DBI.init(args.db_detail, "bed")
    dbi_repeat = DBI.init(
        "/home/yu68/bharat-interaction/new_lincRNA_data/mouse.repeat.txt",
        "bed")

    print >> sys.stderr, "# finding strong interactions from clusters..."
    k = 0  # record for strong interactions
    n = 0

    # annotation file

    for interaction in c_interaction:
        n = n + 1
        count = c_interaction[interaction]
        if count < min_interaction: continue
        i = interaction.split("--")[0]
        j = interaction.split("--")[1]
        try:  # we select clusters with size no less than 5, so some interactions cannot be found in clusters
            # NOTE(review): '.cluster' here is used as a size for the
            # hypergeometric test — confirm the attribute holds the cluster
            # size at this point.
            count1 = cluster_pool1[i].cluster
            count2 = cluster_pool2[j].cluster
        except:
            continue
        # P(X >= count) under the hypergeometric null.
        real_p = 1 - hypergeom.cdf(count, len(part1), count1, count2)
        if real_p <= p_value:
            k = k + 1
            cluster_pool1[i].Annotate(dbi_all, dbi_detail, dbi_repeat)
            cluster_pool2[j].Annotate(dbi_all, dbi_detail, dbi_repeat)
            try:
                log_p = math.log(real_p)
            except:
                # log(0) — report negative infinity instead of crashing.
                log_p = -float("Inf")
            print >> output, str(cluster_pool1[i]) + '\t' + str(
                cluster_pool2[j]) + '\t%d\t%.4f' % (count, log_p)
        if n % 1000 == 0:
            print >> sys.stderr, "  Progress ( %d / %d )\r" % (
                n, len(c_interaction)),

    print >> sys.stderr, "# Find %d strong interactions. Cost time: %.2f s" % (
        k, time() - t1)

    if args.FDR:
        # Estimate FDR by shuffling part2 and recounting strong interactions.
        print >> sys.stderr, "# Permutated results:"
        for i in range(10):
            shuffle(part2)
            [n_r_I,
             n_r_SI] = Random_strongInteraction(part1, part2, cluster_pool1,
                                                cluster_pool2)
            print >> sys.stderr, "  ", i, n_r_I, n_r_SI, n_r_SI * 1.0 / n_r_I
Esempio n. 48
0
def Main():
    """
    Plot a triangular heatmap of linked read pairs over a genomic region,
    with a gene track underneath; optionally overlay called interactions.

    The linkedPair and interaction files are sorted/bgzipped/tabixed as
    needed, queried for the requested region (by coordinates via -r or by
    gene name via -n), binned at args.step resolution, and rendered with
    matplotlib.  Temporary files are removed on exit.
    """
    args=ParseArg()
    pair_dist=args.pair_dist
    step=args.step
    
    # Ensure the linkedPair file has a tabix index (build one if missing).
    # NOTE(review): file paths are interpolated into shell commands via
    # os.system — unsafe if paths contain spaces or shell metacharacters.
    print "\nChecking if linkedPair file is tabixed..."
    if not os.path.isfile(args.linkedPair):
        print "LinkedPair file is not exist, please check!!"
        sys.exit(0)
    if not os.path.isfile(args.linkedPair+".tbi"):
        print "  tabix-ing..."
        os.system("sort -k1,1 -k2,2n "+args.linkedPair+" > temp_linkedPair.txt")
        os.system("bgzip temp_linkedPair.txt")
        os.system("tabix -p bed temp_linkedPair.txt.gz")
        linkedPair='temp_linkedPair.txt.gz'
    else:
        linkedPair=args.linkedPair
    print "  linkedPair file is tabixed."

    # The interaction file is always re-sorted and indexed into temp files.
    print "\nTabixing the interaction file..."
    os.system("sort -k1,1 -k2,2n "+args.interaction+" > temp_interaction.txt")
    os.system("bgzip temp_interaction.txt")
    os.system("tabix -p bed temp_interaction.txt.gz")
    print "  interaction file is tabixed."

    # start column number for second regions
    # s1 for interaction file and s2 for linkedPair file
    (s1,s2)=args.start    

    # Resolve the plotting region: explicit coordinates (-r) or a gene
    # name (-n) looked up in the gene bed file, padded by 10% on each side.
    print "\nGet region information."
    if args.r:
        Region = read_region(args.r)
    elif args.name:
        os.system('grep "%s" %s > temp2.txt'%(args.name,args.genebed))
        g = open("temp2.txt").read().split('\t')
        if len(g)<2:
            print >> sys.stderr, "Error: the gene name is not found in database"
            sys.exit(0)
        s = int(g[1])
        e = int(g[2])
        Region = Bed([g[0],s-(e-s)/10,e+(e-s)/10,"region",".","."])
    else:
        print >> sys.stderr, "Error: Need to specify the region by '-r' or specify the gene name by '-n'"
        sys.exit(0)

    
    print "\n Start plot heatmaps on region: "+Region.str_region()
    fig = plt.figure(figsize=(8,6))
    ax = plt.subplot(111,frameon=False,yticks=[])
    start = Region.start
    end = Region.stop
    ax.set_xlim(start,end)

    #set x ticks withour offset
    locs=ax.get_xticks()
    ax.set_xticklabels(map(lambda x: "%i"%x, locs),fontsize=6)


    print "\nStart draw gene track"
    gene_dbi=DBI.init(args.genebed,"bed")
    print "  genebed indexed!"
    print "  Plot gene track"
    gene_top=Genetrack(Region,gene_dbi,ax,0.08)
    
    h = 1.5*step/(end-start) # unit height for triangles or polycons in heatmap

    

    # Bin linked-pair endpoints at 'step' resolution and count pairs per
    # (i, j) bin; i is always the smaller coordinate.
    print "\nQuery linkedPairs within specified region"
    os.system("tabix "+linkedPair+" %s:%i-%i > temp2.txt"%(Region.chr,Region.start,Region.stop))
    Count = {}
    for b in read_interaction("temp2.txt",s2):
        col='k'
        if args.Slim and SingleFragment(b[0],b[1],pair_dist): continue
        if Region.overlap(b[0],0) and Region.overlap(b[1],0): 
            # Use the 3' end of each mate (strand-dependent) as its point.
            if b[0].strand=='-':
                i = b[0].start
            else:
                i = b[0].stop
            if b[1].strand=='-':
                j = b[1].start
            else:
                j = b[1].stop       
            i = (i/step+1) * step  # approximate to the nearest central point
            j = (j/step+1) * step
            if i > j:
                temp=j
                j=i
                i=temp
            if (i,j) not in Count:
                Count[(i,j)] = 1
            else:
                Count[(i,j)] +=1
    
    print Count

    # One patch per occupied bin, colored by log(count+1).
    patches = []
    colors = []
    for i in range(start,end+1):
        if i%step!=0: continue
        for j in range(i,end+1):
            if j%step!=0 or (i,j) not in Count: continue
            patches.append(PatchGen(i,j,h,step,gene_top+0.01))
            colors.append(np.log(Count[(i,j)]+1))

    p = PatchCollection(patches, cmap=matplotlib.cm.Reds, alpha=0.7, edgecolor='k',linewidths=0.1)
    p.set_array(np.array(colors))
    ax.add_collection(p)

    ax.set_ylim(0,((end-start)/step+2)*h+gene_top+0.01)
    plt.colorbar(p)

    # Without --SI, save the heatmap only and clean up temp files.
    if not args.SI:
        plt.savefig(args.output)
        plt.show()
        os.system("rm temp_interaction.txt.gz*")
        if not os.path.isfile(args.linkedPair+".tbi"):
            os.system("rm temp_linkedPair.txt.gz*")
        os.system("rm temp2.txt")
        sys.exit(0)

    # Overlay called interactions as paired polygons in rotating colors.
    print "\nQuery interactions"
    os.system("tabix temp_interaction.txt.gz %s:%i-%i > temp2.txt"%(Region.chr,Region.start,Region.stop))
    print "\nList of interactions plotted: "
    k=1
    cmap=cm.get_cmap('Paired', 10)
    cmap=cmap(range(10))
    bottom = gene_top+0.01
    for b in read_interaction("temp2.txt",s1):
        if b[0].overlap(b[1],0): continue
        if Region.overlap(b[1],0):
            k+=1
            # Order the two anchors so region 1 is the upstream one.
            if b[1].stop > b[0].stop:
                start1 = b[0].start
                end1 = b[0].stop
                start2 = b[1].start
                end2 = b[1].stop
            else:
                start1 = b[1].start
                end1 = b[1].stop
                start2 = b[0].start
                end2 = b[0].stop
            P1=Polygon([[start1,bottom],[end1,bottom],[(end1+end2)*0.5,(end2-end1)*h/step+bottom],[(start1+end2)*0.5,(end2-start1)*h/step+bottom]],"True",facecolor='none',edgecolor=cmap[k%10],alpha=0.4,lw=0.5)
            P2=Polygon([[start2,bottom],[end2,bottom],[(start1+end2)*0.5,(end2-start1)*h/step+bottom],[(start1+start2)*0.5,(start2-start1)*h/step+bottom]],"True",facecolor='none',edgecolor=cmap[k%10],alpha=0.4,lw=0.5)
            ax.add_patch(P1)
            ax.add_patch(P2)
            print "  "+b[0].str_region()+" <-> "+b[1].str_region()


    plt.savefig(args.output)
    plt.show()

    # remove temp file
    os.system("rm temp_interaction.txt.gz*")
    if not os.path.isfile(args.linkedPair+".tbi"):
        os.system("rm temp_linkedPair.txt.gz*")
    os.system("rm temp2.txt")
def Main():
    """Find statistically strong interactions between clusters of paired
    genomic fragments.

    Pipeline:
      1. Read tab-separated linked-pair records; wrap the two halves of each
         pair in ``annotated_bed`` objects sharing the same id.
      2. Cluster fragments per chromosome in parallel (Parallel Python).
      3. Re-pair the halves by id and count links between cluster pairs.
      4. Score each cluster pair with a hypergeometric tail test; annotate
         and print the pairs passing the p-value cutoff.
    """
    t1 = time()  # wall-clock start; total cost reported at the end

    # read by helper functions defined elsewhere in this file
    global min_interaction, p_value
    args = ParseArg()
    inp = open(args.input, 'r')
    min_clusterS = args.min_clusterS
    min_interaction = args.min_interaction
    p_value = args.p_value
    output = open(args.output, 'w')
    ncpus = args.parallel

    #store genomic location of part1 and part2
    part = []

    # number of accepted pairs; doubles as the shared id linking p1 and p2
    k = 0

    print >> sys.stderr, "# Inputing data..."

    chr_list = []  # chromosomes seen; one clustering job per chromosome
    for line in inp.read().split('\n'):
        if line == '': continue
        line = line.strip().split('\t')
        # columns 0-7 describe part 1, columns 9.. describe part 2;
        # column 8 is skipped (presumably a separator/score field -- TODO confirm)
        p1 = annotated_bed(line[0:8], id=k, part=1)
        p2 = annotated_bed(line[9:], id=k, part=2)
        # drop pairs that look like one contiguous fragment (self-ligation)
        if SingleFragment(p1, p2): continue
        k += 1
        part.append(p1)
        part.append(p2)
        if p1.chr not in chr_list: chr_list.append(p1.chr)
        if p2.chr not in chr_list: chr_list.append(p2.chr)
        if k % 20000 == 0:
            print >> sys.stderr, "  Reading %d pairs of segments\r" % (k),
    print >> sys.stderr, "Get total %d pairs." % (k)

    # sort in genomic order, easy for clustering
    # (two stable sorts == order by chr, then start within each chr)
    part = sorted(part, key=attrgetter('start'))
    part = sorted(part, key=attrgetter('chr'))

    # for parallel computing
    print >> sys.stderr, "# Generating clusters for two parts..."
    # tuple of all parallel python servers to connect with
    ppservers = ()
    job_server = pp.Server(ncpus, ppservers=ppservers)
    jobs = []
    for chro in chr_list:
        # submit one cluster_regions job per chromosome
        part_temp = filter(lambda p: p.chr == chro, part)
        if len(part_temp) > 0:
            jobs.append(
                job_server.submit(cluster_regions, (part_temp, min_clusterS),
                                  (annotated_bed, ), (
                                      "UnionFind",
                                      "copy",
                                  )))

    cluster_pool = {}  # cluster name -> cluster object
    part = []  # rebuilt from job results, now carrying cluster assignments
    for job in jobs:
        try:
            part = part + job()[1]
            cluster_pool.update(job()[0])
        except:
            # NOTE(review): bare except silently drops a whole chromosome on
            # any job failure; job()[2] presumably names it -- TODO confirm
            print >> sys.stderr, "Wrong in %s, part1" % (job()[2])
            continue

    print >> sys.stderr, "   cluster number is %d             " % (
        len(cluster_pool))

    # sort back to pair two parts
    # (stable sorts restore id order with part1 immediately before part2)
    part = sorted(part, key=attrgetter('part'))
    part = sorted(part, key=attrgetter('id'))

    print >> sys.stderr, "size of part", len(part)

    c_interaction = {}  # "clusterA--clusterB" -> number of supporting pairs
    i = 0
    while i < len(part):
        # consecutive entries share an id after the sort above
        P1 = part[i]
        P2 = part[i + 1]
        assert P1.id == P2.id
        i += 2
        print >> sys.stderr, "%d\r" % (i),
        # intra-cluster pairs are not interactions
        if P1.cluster == P2.cluster: continue
        # canonical key order so A--B and B--A count together
        if P1.cluster < P2.cluster:
            inter = P1.cluster + "--" + P2.cluster
        else:
            inter = P2.cluster + "--" + P1.cluster
        if c_interaction.has_key(inter):
            c_interaction[inter] += 1
        else:
            c_interaction[inter] = 1

    # annotation file
    print >> sys.stderr, "# Indexing annotation files"
    dbi_all = DBI.init(args.annotation, "bed")
    dbi_detail = DBI.init(args.db_detail, "bed")
    # NOTE(review): hard-coded absolute path; breaks outside this machine
    dbi_repeat = DBI.init(
        "/home/yu68/bharat-interaction/new_lincRNA_data/mouse.repeat.txt",
        "bed")

    print >> sys.stderr, "# finding strong interactions from clusters..."
    k = 0  # record for strong interactions
    n = 0  # interactions examined so far (progress reporting)

    # annotation file

    for interaction in c_interaction:
        n = n + 1
        count = c_interaction[interaction]
        if count < min_interaction: continue
        i = interaction.split("--")[0]
        j = interaction.split("--")[1]
        try:  # we select clusters with size no less than 5, so some interactions cannot be found in clusters
            count1 = cluster_pool[i].cluster
            count2 = cluster_pool[j].cluster
        except:
            continue
        # upper-tail hypergeometric: chance of >= count links given the two
        # cluster sizes out of len(part)/2 total pairs
        real_p = 1 - hypergeom.cdf(count, len(part) / 2, count1, count2)
        if real_p <= p_value:
            k = k + 1
            cluster_pool[i].Annotate(dbi_all, dbi_detail, dbi_repeat)
            cluster_pool[j].Annotate(dbi_all, dbi_detail, dbi_repeat)
            try:
                log_p = math.log(real_p)
            except:
                log_p = -float("Inf")  # real_p == 0 -> log domain error
            print >> output, str(cluster_pool[i]) + '\t' + str(
                cluster_pool[j]) + '\t%d\t%.4f' % (count, log_p)
        if n % 1000 == 0:
            print >> sys.stderr, "  Progress ( %d / %d )\r" % (
                n, len(c_interaction)),

    print >> sys.stderr, "# Find %d strong interactions. Cost time: %.2f s" % (
        k, time() - t1)
Esempio n. 50
0
def Main():
    """Plot a triangular heatmap of linked-pair counts over a genomic region,
    optionally overlaying called interactions as paired polygons.

    Steps: tabix-index the linkedPair/interaction files if needed, resolve the
    target region (from ``-r`` or a gene name), draw the gene track, bin linked
    pairs into a 2-D count heatmap, then (unless ``args.SI``) add interaction
    outlines. Uses shell commands (sort/bgzip/tabix) and temp files throughout.
    """
    args = ParseArg()
    pair_dist = args.pair_dist
    step = args.step  # bin size (bp) for the heatmap grid

    print "\nChecking if linkedPair file is tabixed..."
    if not os.path.isfile(args.linkedPair):
        print "LinkedPair file is not exist, please check!!"
        sys.exit(0)
    if not os.path.isfile(args.linkedPair + ".tbi"):
        # no index present: sort/compress/index into temp files
        print "  tabix-ing..."
        os.system("sort -k1,1 -k2,2n " + args.linkedPair +
                  " > temp_linkedPair.txt")
        os.system("bgzip temp_linkedPair.txt")
        os.system("tabix -p bed temp_linkedPair.txt.gz")
        linkedPair = 'temp_linkedPair.txt.gz'
    else:
        linkedPair = args.linkedPair
    print "  linkedPair file is tabixed."

    print "\nTabixing the interaction file..."
    os.system("sort -k1,1 -k2,2n " + args.interaction +
              " > temp_interaction.txt")
    os.system("bgzip temp_interaction.txt")
    os.system("tabix -p bed temp_interaction.txt.gz")
    print "  interaction file is tabixed."

    # start column number for second regions
    # s1 for interaction file and s2 for linkedPair file
    (s1, s2) = args.start

    print "\nGet region information."
    if args.r:
        Region = read_region(args.r)
    elif args.name:
        # look the gene up in the gene bed file and pad the region by 10%
        # on each side
        os.system('grep "%s" %s > temp2.txt' % (args.name, args.genebed))
        g = open("temp2.txt").read().split('\t')
        if len(g) < 2:
            print >> sys.stderr, "Error: the gene name is not found in database"
            sys.exit(0)
        s = int(g[1])
        e = int(g[2])
        Region = Bed(
            [g[0], s - (e - s) / 10, e + (e - s) / 10, "region", ".", "."])
    else:
        print >> sys.stderr, "Error: Need to specify the region by '-r' or specify the gene name by '-n'"
        sys.exit(0)

    print "\n Start plot heatmaps on region: " + Region.str_region()
    fig = plt.figure(figsize=(8, 6))
    ax = plt.subplot(111, frameon=False, yticks=[])
    start = Region.start
    end = Region.stop
    ax.set_xlim(start, end)

    #set x ticks withour offset
    locs = ax.get_xticks()
    ax.set_xticklabels(map(lambda x: "%i" % x, locs), fontsize=6)

    print "\nStart draw gene track"
    gene_dbi = DBI.init(args.genebed, "bed")
    print "  genebed indexed!"
    print "  Plot gene track"
    gene_top = Genetrack(Region, gene_dbi, ax, 0.08)

    h = 1.5 * step / (end - start
                      )  # unit height for triangles or polycons in heatmap

    print "\nQuery linkedPairs within specified region"
    os.system("tabix " + linkedPair + " %s:%i-%i > temp2.txt" %
              (Region.chr, Region.start, Region.stop))
    Count = {}  # (bin_i, bin_j) with i <= j -> number of linked pairs
    for b in read_interaction("temp2.txt", s2):
        col = 'k'
        # optionally skip pairs that look like one contiguous fragment
        if args.Slim and SingleFragment(b[0], b[1], pair_dist): continue
        if Region.overlap(b[0], 0) and Region.overlap(b[1], 0):
            # use the 3' end of each mate as its representative coordinate
            if b[0].strand == '-':
                i = b[0].start
            else:
                i = b[0].stop
            if b[1].strand == '-':
                j = b[1].start
            else:
                j = b[1].stop
            i = (i / step +
                 1) * step  # approximate to the nearest central point
            j = (j / step + 1) * step
            # keep the key ordered so (i, j) and (j, i) share a bin
            if i > j:
                temp = j
                j = i
                i = temp
            if (i, j) not in Count:
                Count[(i, j)] = 1
            else:
                Count[(i, j)] += 1

    print Count

    # one diamond patch per non-empty bin, colored by log(count + 1)
    patches = []
    colors = []
    for i in range(start, end + 1):
        if i % step != 0: continue
        for j in range(i, end + 1):
            if j % step != 0 or (i, j) not in Count: continue
            patches.append(PatchGen(i, j, h, step, gene_top + 0.01))
            colors.append(np.log(Count[(i, j)] + 1))

    p = PatchCollection(patches,
                        cmap=matplotlib.cm.Reds,
                        alpha=0.7,
                        edgecolor='k',
                        linewidths=0.1)
    p.set_array(np.array(colors))
    ax.add_collection(p)

    ax.set_ylim(0, ((end - start) / step + 2) * h + gene_top + 0.01)
    plt.colorbar(p)

    # args.SI: also show interactions; otherwise save, clean up and exit here
    if not args.SI:
        plt.savefig(args.output)
        plt.show()
        os.system("rm temp_interaction.txt.gz*")
        if not os.path.isfile(args.linkedPair + ".tbi"):
            os.system("rm temp_linkedPair.txt.gz*")
        os.system("rm temp2.txt")
        sys.exit(0)

    print "\nQuery interactions"
    os.system("tabix temp_interaction.txt.gz %s:%i-%i > temp2.txt" %
              (Region.chr, Region.start, Region.stop))
    print "\nList of interactions plotted: "
    k = 1  # interaction counter; drives the 10-color cycling below
    cmap = cm.get_cmap('Paired', 10)
    cmap = cmap(range(10))
    bottom = gene_top + 0.01
    for b in read_interaction("temp2.txt", s1):
        if b[0].overlap(b[1], 0): continue
        if Region.overlap(b[1], 0):
            k += 1
            # order the two anchors left-to-right by their stop coordinate
            if b[1].stop > b[0].stop:
                start1 = b[0].start
                end1 = b[0].stop
                start2 = b[1].start
                end2 = b[1].stop
            else:
                start1 = b[1].start
                end1 = b[1].stop
                start2 = b[0].start
                end2 = b[0].stop
            # two polygons outlining the heatmap cells this interaction spans
            P1 = Polygon([[start1, bottom], [end1, bottom],
                          [(end1 + end2) * 0.5,
                           (end2 - end1) * h / step + bottom],
                          [(start1 + end2) * 0.5,
                           (end2 - start1) * h / step + bottom]],
                         "True",
                         facecolor='none',
                         edgecolor=cmap[k % 10],
                         alpha=0.4,
                         lw=0.5)
            P2 = Polygon([[start2, bottom], [end2, bottom],
                          [(start1 + end2) * 0.5,
                           (end2 - start1) * h / step + bottom],
                          [(start1 + start2) * 0.5,
                           (start2 - start1) * h / step + bottom]],
                         "True",
                         facecolor='none',
                         edgecolor=cmap[k % 10],
                         alpha=0.4,
                         lw=0.5)
            ax.add_patch(P1)
            ax.add_patch(P2)
            print "  " + b[0].str_region() + " <-> " + b[1].str_region()

    plt.savefig(args.output)
    plt.show()

    # remove temp file
    os.system("rm temp_interaction.txt.gz*")
    if not os.path.isfile(args.linkedPair + ".tbi"):
        os.system("rm temp_linkedPair.txt.gz*")
    os.system("rm temp2.txt")