def main():
    """Combine two BigWig files position by position and write a wig file.

    Options are read from the command line with OptionParser. Every
    chromosome listed in the chromosome size file is walked in chunks of
    ``--chunk`` bases. For each chunk the signal arrays of both BigWig files
    are fetched; chunks where both arrays are entirely NaN (per ``all_nan``)
    are skipped, remaining NaNs are handled by ``replace_nan`` and the two
    arrays are combined by the ``twoList`` function named by ``--action``.
    Every non-zero result is written as "position<TAB>value", where the
    position of the first value of a chunk is the chunk start plus one.

    Exit behaviour:
        * status 0 after printing the help text when an input file, the
          output file or the chromosome size file is not given;
        * ``parser.error`` (usage message, non-zero status) when ``--action``
          is missing or is not an attribute of ``twoList``.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--bwfile1", action="store", type="string", dest="BigWig_File1", help="BigWig files")
    parser.add_option("-j", "--bwfile2", action="store", type="string", dest="BigWig_File2", help="BigWig files")
    parser.add_option("-a", "--action", action="store", type="string", dest="action", help='After pairwise align two bigwig files, perform the follow actions (Only select one keyword):"Add" = add signals. "Average" = average signals. "Division"= divide bigwig2 from bigwig1. Add 1 to both bigwig. "Max" = pick the signal that is larger. "Min" = pick the signal that is smaller. "Product" = multiply signals. "Subtract" = subtract signals in 2nd bigwig file from the corresponiding ones in the 1st bigwig file. "geometricMean" = take the geometric mean of signals.')
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig", help="Output wig file")
    parser.add_option("-s", "--chromSize", action="store", type="string", dest="chromSize", help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=100000, help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
    (options, args) = parser.parse_args()

    if not (options.BigWig_File1 and options.BigWig_File2 and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    # Resolve the combining function once, before the output file is
    # truncated: a missing or misspelled --action would otherwise surface as
    # a TypeError/AttributeError traceback in the middle of the run.
    call_back = getattr(twoList, options.action, None) if options.action else None
    if call_back is None:
        parser.error("--action is required and must be one of the keywords listed in the help (got %r)" % (options.action,))

    bw1 = BigWigFile(file=open(options.BigWig_File1))
    bw2 = BigWigFile(file=open(options.BigWig_File2))
    chrom_sizes = load_chromsize(options.chromSize)

    # The with-block guarantees the wig file is flushed and closed, also when
    # processing stops with an exception. write() is used instead of the
    # print statement so the code is valid on Python 2 and Python 3.
    with open(options.output_wig, 'w') as OUT:
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            sys.stderr.write("Processing " + chr_name + " ...\n")
            OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                coord = interval[1]
                bw_signal1 = bw1.get_as_array(chr_name, interval[1], interval[2])
                bw_signal2 = bw2.get_as_array(chr_name, interval[1], interval[2])
                if all_nan(bw_signal1) and all_nan(bw_signal2):
                    continue
                bw_signal1 = replace_nan(bw_signal1)
                bw_signal2 = replace_nan(bw_signal2)
                for v in call_back(bw_signal1, bw_signal2):
                    coord += 1
                    if v != 0:
                        OUT.write("%d\t%.2f\n" % (coord, v))
def main():
    """Combine two BigWig files (read with pyBigWig) into one wig file.

    Chromosome names and sizes are taken from the headers of both BigWig
    files (sizes from the second file win when a name occurs in both). Each
    chromosome is walked in chunks of ``--chunk`` bases. A chunk is skipped
    when ``stats`` reports no value in either file. Otherwise the per-base
    values of both files are fetched, NaNs are replaced by
    ``numpy.nan_to_num`` and the arrays are combined by the ``twoList``
    function named by ``--action``. Every non-zero result is written as
    "position<TAB>value", the first value of a chunk getting the chunk start
    plus one as position.

    Exit behaviour:
        * status 0 after printing the help text when an input or output file
          is not given;
        * ``parser.error`` (usage message, non-zero status) when ``--action``
          is missing or is not an attribute of ``twoList``.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--bwfile1", action="store", type="string", dest="BigWig_File1", help="One BigWig file.")
    parser.add_option("-j", "--bwfile2", action="store", type="string", dest="BigWig_File2", help="Another BigWig file. Both BigWig files should use the same reference genome.")
    parser.add_option("-a", "--action", action="store", type="string", dest="action", help='After pairwise align two bigwig files, perform the follow actions (Only select one keyword):"Add" = add signals. "Average" = average signals. "Division"= divide bigwig2 from bigwig1. Add 1 to both bigwig. "Max" = pick the signal that is larger. "Min" = pick the signal that is smaller. "Product" = multiply signals. "Subtract" = subtract signals in 2nd bigwig file from the corresponiding ones in the 1st bigwig file. "geometricMean" = take the geometric mean of signals.')
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig", help="Output wig file")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=100000, help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
    (options, args) = parser.parse_args()

    if not (options.BigWig_File1 and options.BigWig_File2 and options.output_wig):
        parser.print_help()
        sys.exit(0)

    # Resolve the combining function once and before the output file is
    # truncated; a missing/unknown --action used to crash inside the loop.
    call_back = getattr(twoList, options.action, None) if options.action else None
    if call_back is None:
        parser.error("--action is required and must be one of the keywords listed in the help (got %r)" % (options.action,))

    bw1 = pyBigWig.open(options.BigWig_File1)
    bw2 = pyBigWig.open(options.BigWig_File2)

    print("Get chromosome sizes from BigWig header ...", file=sys.stderr)
    # Union of both headers; entries of the second file override the first.
    chrom_sizes = dict(bw1.chroms())
    chrom_sizes.update(bw2.chroms())

    with open(options.output_wig, 'w') as OUT:
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            print("Processing " + chr_name + " ...", file=sys.stderr)
            OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                start, end = interval[1], interval[2]
                # NOTE(review): chrom_sizes is the union of both headers, so
                # this queries each file for chromosomes it may not contain;
                # confirm how pyBigWig's stats() reacts to that.
                if (bw1.stats(chr_name, start, end)[0] is None) and (bw2.stats(chr_name, start, end)[0] is None):
                    continue
                coord = start
                # An empty array marks "no values available" and is what the
                # length check below tests for. numpy.array() without an
                # argument raises TypeError, so the fallback must pass [].
                try:
                    bw_signal1 = bw1.values(chr_name, start, end)
                except Exception:
                    bw_signal1 = numpy.array([])
                try:
                    bw_signal2 = bw2.values(chr_name, start, end)
                except Exception:
                    bw_signal2 = numpy.array([])
                if bw_signal1 is None and bw_signal2 is None:
                    continue
                if numpy.isnan(numpy.nansum(bw_signal1)) and numpy.isnan(numpy.nansum(bw_signal2)):
                    continue
                if len(bw_signal1) == 0 and len(bw_signal2) == 0:
                    continue
                bw_signal1 = numpy.nan_to_num(bw_signal1)
                bw_signal2 = numpy.nan_to_num(bw_signal2)

                for v in call_back(bw_signal1, bw_signal2):
                    coord += 1
                    if v != 0:
                        print("%d\t%.2f" % (coord, v), file=OUT)
def main():
    """Write strand-specific coverage of a BAM file as two wig files.

    Options are read from the command line with OptionParser. For every
    chromosome of the chromosome size file that ``fetchAlignments`` can
    serve, the chromosome is cut into bins of ``--bin`` bases; the alignments
    of each bin are handed to ``build_wig`` together with ``--extension`` and
    the two returned position -> value mappings are written to
    "<prefix>_Forward.wig" and "<prefix>_Reverse.wig" as
    "position<TAB>value" lines for positions in (bin start, bin end].

    Exit behaviour:
        * status 0 after printing the help text when a required option is
          missing;
        * status 1 when the BAM file, the chromosome size file or the
          "<bam>.bai" index does not exist (previously these error paths
          reported success with status 0).
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file", help="Input file in BAM format. BAM file must be sorted and indexed using samTools. HowTo: http://genome.ucsc.edu/goldenPath/help/bam.html")
    parser.add_option("-r", "--chromSize", action="store", type="string", dest="chromSize", help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix", help="Prefix of output wig files(s). \"Prefix_Forward.wig\" and \"Prefix_Reverse.wig\" will be generated")
    parser.add_option("-b", "--bin", action="store", type="int", dest="bin", default=100000, help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
    parser.add_option("-e", "--extension", action="store", type="int", dest="extension", default=None, help="Extended coverage from 5' end of read. default=%default (full read coverage will be used)")
    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.input_file and options.chromSize):
        parser.print_help()
        sys.exit(0)

    # write() is used instead of the print statement so the code is valid on
    # Python 2 and Python 3; the trailing newline print added is kept.
    for path in (options.input_file, options.chromSize):
        if not os.path.exists(path):
            sys.stderr.write('\n\n' + path + " does NOT exists" + '\n\n')
            sys.exit(1)
    if not os.path.exists(options.input_file + '.bai'):
        # The separating space before "does" was missing in the message.
        sys.stderr.write("index file " + options.input_file + '.bai' + " does not exists\n")
        sys.exit(1)

    def write_positions(out, wig, start, end):
        # Emit the covered positions of (start, end] in ascending order.
        # Walking the mapping's keys avoids probing every base of the bin.
        # Assumes the keys are integer positions, as the "%d" format implies.
        for pos in sorted(wig):
            if start < pos <= end:
                out.write("%d\t%d\n" % (pos, wig[pos]))

    chrom_sizes = load_chromsize(options.chromSize)
    samfile = SAM.ParseBAM(options.input_file)

    # Both wig files are closed (and flushed) even if processing fails.
    with open(options.output_prefix + "_Forward.wig", 'w') as FWOUT, \
            open(options.output_prefix + "_Reverse.wig", 'w') as RWOUT:
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            # Probe the chromosome first; chromosomes the BAM cannot serve
            # are reported and skipped.
            try:
                samfile.fetchAlignments(chr_name, 0, chr_size)
            except Exception:
                sys.stderr.write("No alignments for " + chr_name + '. skipped\n')
                continue
            sys.stderr.write("Processing " + chr_name + " ...\n")
            FWOUT.write('variableStep chrom=' + chr_name + '\n')
            RWOUT.write('variableStep chrom=' + chr_name + '\n')
            # cut chrom into bins, interval such as ('chr1', 235000000, 236000000)
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.bin):
                alignedReads = samfile.fetchAlignments(interval[0], interval[1], interval[2])
                (Fwig, Rwig) = build_wig(alignedReads, options.extension)
                write_positions(FWOUT, Fwig, interval[1], interval[2])
                write_positions(RWOUT, Rwig, interval[1], interval[2])
def main():
    """Combine two BigWig files into one wig file, filtering low signals.

    Options are read from the command line with OptionParser. Every
    chromosome of the chromosome size file is walked in chunks of
    ``--chunk`` bases. The signal arrays of both BigWig files are fetched per
    chunk (an empty array stands in when fetching raises), chunks without
    usable data in both files are skipped, NaNs are replaced by
    ``numpy.nan_to_num`` and the arrays are combined by the ``twoList``
    function named by ``--action``. Results that are at least
    ``--min_signal`` are written as "position<TAB>value", the first value of
    a chunk getting the chunk start plus one as position.

    Exit behaviour:
        * status 0 after printing the help text when an input file, the
          output file or the chromosome size file is not given;
        * ``parser.error`` (usage message, non-zero status) when ``--action``
          is missing or is not an attribute of ``twoList``.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--bwfile1", action="store", type="string", dest="BigWig_File1", help="One BigWig file")
    parser.add_option("-j", "--bwfile2", action="store", type="string", dest="BigWig_File2", help="Another BigWig file")
    parser.add_option("-a", "--action", action="store", type="string", dest="action", help='After pairwise align two bigwig files, perform the follow actions (Only select one keyword):"Add" = add signals. "Average" = average signals. "Division"= divide bigwig2 from bigwig1. Add 1 to both bigwig. "Max" = pick the signal that is larger. "Min" = pick the signal that is smaller. "Product" = multiply signals. "Subtract" = subtract signals in 2nd bigwig file from the corresponiding ones in the 1st bigwig file. "geometricMean" = take the geometric mean of signals.')
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig", help="Output wig file")
    parser.add_option("-s", "--chromSize", action="store", type="string", dest="chromSize", help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=100000, help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
    parser.add_option("-m", "--min_signal", action="store", type="float", dest="min_score", default=0.0, help="To redude the size of output wigfile, genomic positions with signal value smaller than (<) this threshold will be filtered out. default=%default")
    (options, args) = parser.parse_args()

    if not (options.BigWig_File1 and options.BigWig_File2 and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    # Resolve the combining function once and before the output file is
    # truncated; a missing/unknown --action used to crash inside the loop.
    call_back = getattr(twoList, options.action, None) if options.action else None
    if call_back is None:
        parser.error("--action is required and must be one of the keywords listed in the help (got %r)" % (options.action,))

    bw1 = BigWigFile(file=open(options.BigWig_File1))
    bw2 = BigWigFile(file=open(options.BigWig_File2))
    chrom_sizes = load_chromsize(options.chromSize)

    # The with-block guarantees the wig file is flushed and closed. write()
    # replaces the print statement so the code is valid on Python 2 and 3.
    with open(options.output_wig, 'w') as OUT:
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            sys.stderr.write("Processing " + chr_name + " ...\n")
            OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                coord = interval[1]
                # An empty array marks "no values available" and is what the
                # length check below tests for. numpy.array() without an
                # argument raises TypeError, so the fallback must pass [].
                try:
                    bw_signal1 = bw1.get_as_array(chr_name, interval[1], interval[2])
                except Exception:
                    bw_signal1 = numpy.array([])
                try:
                    bw_signal2 = bw2.get_as_array(chr_name, interval[1], interval[2])
                except Exception:
                    bw_signal2 = numpy.array([])
                if bw_signal1 is None and bw_signal2 is None:
                    continue
                if numpy.isnan(numpy.nansum(bw_signal1)) and numpy.isnan(numpy.nansum(bw_signal2)):
                    continue
                if len(bw_signal1) == 0 and len(bw_signal2) == 0:
                    continue
                bw_signal1 = numpy.nan_to_num(bw_signal1)
                bw_signal2 = numpy.nan_to_num(bw_signal2)

                for v in call_back(bw_signal1, bw_signal2):
                    coord += 1
                    if v >= options.min_score:
                        OUT.write("%d\t%.2f\n" % (coord, v))
def main():
    """Scale a BigWig file to a fixed total signal and write wig or bedGraph.

    Steps:
        1. Parse the command line with OptionParser.
        2. Sum the signal ("wigsum"), either over the merged exons of the
           ``--refgene`` BED file or, without it, over every chromosome of
           the chromosome size file in chunks of ``--chunk`` bases.
           Chromosomes for which a one-base probe of the BigWig file fails
           are skipped.
        3. Compute ``weight = --wigsum / wigsum`` and write every non-zero
           scaled value: as variableStep wig lines ("position<TAB>value") for
           ``--format wig``, or as bedGraph lines in which runs of adjacent
           positions carrying the same value are merged, for ``--format bgr``.

    Exit behaviour:
        * status 0 after printing the help text when a required option is
          missing;
        * status 1 with a message on stderr for an unknown output format or
          when the computed wigsum is 0.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--bwfile", action="store", type="string", dest="BigWig_File", help="Input BigWig file. [required]")
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig", help="Output wig file. [required]")
    parser.add_option("-s", "--chromSize", action="store", type="string", dest="chromSize", help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]")
    parser.add_option("-t", "--wigsum", action="store", type="int", dest="total_wigsum", default=100000000, help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default [optional]")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="refgene_bed", help="Reference gene model in bed format. [optional]")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=500000, help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
    parser.add_option("-f", "--format", action="store", type="string", dest="out_format", default="bgr", help="Output format. either \"wig\" or \"bgr\". \"bgr\" save disk space but make program slower. default=%default")
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    # Reject an unknown format before any expensive work is done and before
    # the output file is truncated (same message and exit status as before).
    out_format = options.out_format.upper()
    if out_format not in ("WIG", "BGR"):
        sys.stderr.write("unknown output format\n")
        sys.exit(1)

    def has_chrom(bw, chrom):
        # Probe one base; any failure means the chromosome cannot be read.
        try:
            bw.get_as_array(chrom, 0, 1).size
        except Exception:
            return False
        return True

    bw = BigWigFile(file=open(options.BigWig_File))
    chrom_sizes = load_chromsize(options.chromSize)
    WIG_SUM = 0.0

    # write() replaces the print statement throughout so the code is valid
    # on Python 2 and Python 3; the newline print appended is kept.
    if options.refgene_bed:
        sys.stderr.write("Extract exons from " + options.refgene_bed + "\n")
        obj = BED.ParseBED(options.refgene_bed)
        exons = obj.getExon()
        sys.stderr.write("Merge overlapping exons ...\n")
        exons = BED.unionBed3(exons)
        sys.stderr.write("Calculate wigsum covered by " + options.refgene_bed + ' only\n')
        for chrom, st, end in exons:
            if not has_chrom(bw, chrom):
                continue
            bw_signal = bw.get_as_array(chrom, st, end)
            # nan will be ignored. but if all items are 'nan', the result
            # may be 'nan' rather than 0 (depends on the numpy version)
            tmp = numpy.nansum(bw_signal)
            if numpy.isnan(tmp):
                continue
            WIG_SUM += tmp
        sys.stderr.write("Total wigsum is %.2f\n\n" % WIG_SUM)
    else:
        sys.stderr.write("Calculate wigsum from " + options.BigWig_File + "\n")
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            if not has_chrom(bw, chr_name):
                sys.stderr.write("Skip " + chr_name + "!\n")
                continue
            sys.stderr.write("Processing " + chr_name + " ...\n")
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                bw_signal = bw.get_as_array(interval[0], interval[1], interval[2])
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp):
                    continue
                WIG_SUM += tmp
        sys.stderr.write("\nTotal wigsum is %.2f\n\n" % WIG_SUM)

    # Explicit test instead of catching ZeroDivisionError: once numpy values
    # have been added WIG_SUM is a numpy float, for which a division by zero
    # yields inf instead of raising. The old handler also never printed its
    # message and called the misspelled "eys.exit".
    if WIG_SUM == 0:
        sys.stderr.write("Error, WIG_SUM cannot be 0\n")
        sys.exit(1)
    weight = options.total_wigsum / WIG_SUM

    sys.stderr.write("Normalizing bigwig file ...\n")
    with open(options.output_wig, 'w') as OUT:
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            if not has_chrom(bw, chr_name):
                sys.stderr.write("Skip " + chr_name + "!\n")
                continue

            sys.stderr.write("Writing " + chr_name + " ...\n")
            if out_format == "WIG":
                OUT.write('variableStep chrom=' + chr_name + '\n')

            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                coord = interval[1]
                bw_signal = bw.get_as_array(chr_name, interval[1], interval[2])
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp):
                    continue
                bw_signal = numpy.nan_to_num(bw_signal) * weight

                if out_format == "WIG":
                    for v in bw_signal:
                        coord += 1
                        if v != 0:
                            OUT.write("%d\t%.2f\n" % (coord, v))
                    continue

                # bedGraph: collect the positions of every distinct value ...
                v2p = collections.defaultdict(list)  # value -> positions
                for v in bw_signal:
                    coord += 1
                    if v != 0:
                        v2p[v].append(coord)

                # ... and split them into runs of consecutive positions.
                # Within a run "index - position" is constant, which is what
                # groupby keys on. range2p maps start -> [length, value].
                range2p = {}
                for value, positions in v2p.items():
                    for _, grouped in groupby(enumerate(positions), lambda pair: pair[0] - pair[1]):
                        run = [pos for _, pos in grouped]
                        range2p[run[0] - 1] = [len(run), value]

                for start in sorted(range2p):
                    length, value = range2p[start]
                    OUT.write(chr_name + '\t' + str(start) + '\t' + str(start + length) + '\t' + str(value) + '\n')
def main():
    """Scale a BigWig file to a fixed total signal and write a wig file.

    Steps:
        1. Parse the command line with OptionParser.
        2. Sum the signal ("wigsum"), either over the merged exons of the
           ``--refgene`` BED file or, without it, over every chromosome of
           the chromosome size file in chunks of ``--chunk`` bases.
           Chromosomes for which a one-base probe of the BigWig file fails
           are skipped.
        3. Compute ``weight = --wigsum / wigsum`` and write every non-zero
           value multiplied by the weight as a variableStep wig line
           ("position<TAB>value" with four decimals).

    Exit behaviour:
        * status 0 after printing the help text when a required option is
          missing;
        * status 1 with a message on stderr when the computed wigsum is 0.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--bwfile", action="store", type="string", dest="BigWig_File", help="Input BigWig file. [required]")
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig", help="Output wig file. [required]")
    parser.add_option("-s", "--chromSize", action="store", type="string", dest="chromSize", help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]")
    parser.add_option("-t", "--wigsum", action="store", type="int", dest="total_wigsum", default=100000000, help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default [optional]")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="refgene_bed", help="Reference gene model in bed format. [optional]")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=100000, help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    def has_chrom(bw, chrom):
        # Probe one base; any failure means the chromosome cannot be read.
        try:
            bw.get_as_array(chrom, 0, 1).size
        except Exception:
            return False
        return True

    bw = BigWigFile(file=open(options.BigWig_File))
    chrom_sizes = load_chromsize(options.chromSize)
    WIG_SUM = 0.0

    # write() replaces the print statement throughout so the code is valid
    # on Python 2 and Python 3; the newline print appended is kept.
    if options.refgene_bed:
        sys.stderr.write("Extract exons from " + options.refgene_bed + "\n")
        obj = BED.ParseBED(options.refgene_bed)
        exons = obj.getExon()
        sys.stderr.write("Merge overlapping exons ...\n")
        exons = BED.unionBed3(exons)
        sys.stderr.write("Calculate wigsum covered by " + options.refgene_bed + ' only\n')
        for chrom, st, end in exons:
            if not has_chrom(bw, chrom):
                continue
            bw_signal = bw.get_as_array(chrom, st, end)
            # nan will be ignored. but if all items are 'nan', the result
            # may be 'nan' rather than 0 (depends on the numpy version)
            tmp = numpy.nansum(bw_signal)
            if numpy.isnan(tmp):
                continue
            WIG_SUM += tmp
        sys.stderr.write("Total wigsum is %.2f\n\n" % WIG_SUM)
    else:
        sys.stderr.write("Calculate wigsum from " + options.BigWig_File + "\n")
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            if not has_chrom(bw, chr_name):
                sys.stderr.write("Skip " + chr_name + "!\n")
                continue
            sys.stderr.write("Processing " + chr_name + " ...\n")
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                bw_signal = bw.get_as_array(interval[0], interval[1], interval[2])
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp):
                    continue
                WIG_SUM += tmp
        sys.stderr.write("\nTotal wigsum is %.2f\n\n" % WIG_SUM)

    # Explicit test instead of catching ZeroDivisionError: once numpy values
    # have been added WIG_SUM is a numpy float, for which a division by zero
    # yields inf instead of raising. The old handler also never printed its
    # message and called the misspelled "eys.exit".
    if WIG_SUM == 0:
        sys.stderr.write("Error, WIG_SUM cannot be 0\n")
        sys.exit(1)
    weight = options.total_wigsum / WIG_SUM

    sys.stderr.write("Normalizing bigwig file, output wiggle file\n")
    # The with-block guarantees the wig file is flushed and closed.
    with open(options.output_wig, 'w') as OUT:
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            if not has_chrom(bw, chr_name):
                sys.stderr.write("Skip " + chr_name + "!\n")
                continue

            sys.stderr.write("Writing " + chr_name + " ...\n")
            OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                coord = interval[1]
                bw_signal = bw.get_as_array(chr_name, interval[1], interval[2])
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp):
                    continue
                bw_signal = numpy.nan_to_num(bw_signal)
                for v in bw_signal:
                    coord += 1
                    if v != 0:
                        OUT.write("%d\t%.4f\n" % (coord, v * weight))
def main():
    """Scale a BigWig file to a fixed total signal and write wig or bedGraph.

    Steps:
        1. Parse the command line with OptionParser.
        2. Sum the signal ("wigsum"), either over the merged exons of the
           ``--refgene`` BED file or, without it, over every chromosome of
           the chromosome size file in chunks of ``--chunk`` bases.
           Chromosomes for which a one-base probe of the BigWig file fails
           are skipped.
        3. Compute ``weight = --wigsum / wigsum`` and write every non-zero
           scaled value: as variableStep wig lines ("position<TAB>value") for
           ``--format wig``, or as bedGraph lines in which runs of adjacent
           positions carrying the same value are merged, for ``--format bgr``.

    Exit behaviour:
        * status 0 after printing the help text when a required option is
          missing;
        * status 1 with a message on stderr for an unknown output format or
          when the computed wigsum is 0.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--bwfile", action="store", type="string", dest="BigWig_File", help="Input BigWig file. [required]")
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig", help="Output wig file. [required]")
    parser.add_option("-s", "--chromSize", action="store", type="string", dest="chromSize", help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]")
    parser.add_option("-t", "--wigsum", action="store", type="int", dest="total_wigsum", default=100000000, help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default [optional]")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="refgene_bed", help="Reference gene model in bed format. [optional]")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=500000, help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
    parser.add_option("-f", "--format", action="store", type="string", dest="out_format", default="bgr", help="Output format. either \"wig\" or \"bgr\". \"bgr\" save disk space but make program slower. default=%default")
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    # Reject an unknown format before any expensive work is done and before
    # the output file is truncated (same message and exit status as before).
    out_format = options.out_format.upper()
    if out_format not in ("WIG", "BGR"):
        sys.stderr.write("unknown output format\n")
        sys.exit(1)

    def has_chrom(bw, chrom):
        # Probe one base; any failure means the chromosome cannot be read.
        try:
            bw.get_as_array(chrom, 0, 1).size
        except Exception:
            return False
        return True

    def write_bedgraph(out, chrom, first_pos, values):
        # Merge runs of adjacent positions that carry the same non-zero
        # value into bedGraph records. first_pos is the position of
        # values[0]; record starts are written as "position - 1".
        value_to_positions = collections.defaultdict(list)
        for offset, value in enumerate(values):
            if value != 0:
                value_to_positions[value].append(first_pos + offset)
        # Within a run of consecutive positions "index - position" is
        # constant, which is what groupby keys on.
        runs = {}  # start -> [length, value]
        for value, positions in value_to_positions.items():
            for _, grouped in groupby(enumerate(positions), lambda pair: pair[0] - pair[1]):
                run = [pos for _, pos in grouped]
                runs[run[0] - 1] = [len(run), value]
        for start in sorted(runs):
            length, value = runs[start]
            out.write(chrom + '\t' + str(start) + '\t' + str(start + length) + '\t' + str(value) + '\n')

    bw = BigWigFile(file=open(options.BigWig_File))
    chrom_sizes = load_chromsize(options.chromSize)
    WIG_SUM = 0.0

    # write() replaces the print statement throughout so the code is valid
    # on Python 2 and Python 3; the newline print appended is kept.
    if options.refgene_bed:
        sys.stderr.write("Extract exons from " + options.refgene_bed + "\n")
        obj = BED.ParseBED(options.refgene_bed)
        exons = obj.getExon()
        sys.stderr.write("Merge overlapping exons ...\n")
        exons = BED.unionBed3(exons)
        sys.stderr.write("Calculate wigsum covered by " + options.refgene_bed + ' only\n')
        for chrom, st, end in exons:
            if not has_chrom(bw, chrom):
                continue
            bw_signal = bw.get_as_array(chrom, st, end)
            # nan will be ignored. but if all items are 'nan', the result
            # may be 'nan' rather than 0 (depends on the numpy version)
            tmp = numpy.nansum(bw_signal)
            if numpy.isnan(tmp):
                continue
            WIG_SUM += tmp
        sys.stderr.write("Total wigsum is %.2f\n\n" % WIG_SUM)
    else:
        sys.stderr.write("Calculate wigsum from " + options.BigWig_File + "\n")
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            if not has_chrom(bw, chr_name):
                sys.stderr.write("Skip " + chr_name + "!\n")
                continue
            sys.stderr.write("Processing " + chr_name + " ...\n")
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                bw_signal = bw.get_as_array(interval[0], interval[1], interval[2])
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp):
                    continue
                WIG_SUM += tmp
        sys.stderr.write("\nTotal wigsum is %.2f\n\n" % WIG_SUM)

    # Explicit test instead of catching ZeroDivisionError: once numpy values
    # have been added WIG_SUM is a numpy float, for which a division by zero
    # yields inf instead of raising. The old handler also never printed its
    # message and called the misspelled "eys.exit".
    if WIG_SUM == 0:
        sys.stderr.write("Error, WIG_SUM cannot be 0\n")
        sys.exit(1)
    weight = options.total_wigsum / WIG_SUM

    sys.stderr.write("Normalizing bigwig file ...\n")
    # The with-block guarantees the output file is flushed and closed.
    with open(options.output_wig, 'w') as OUT:
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            if not has_chrom(bw, chr_name):
                sys.stderr.write("Skip " + chr_name + "!\n")
                continue

            sys.stderr.write("Writing " + chr_name + " ...\n")
            if out_format == "WIG":
                OUT.write('variableStep chrom=' + chr_name + '\n')

            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                bw_signal = bw.get_as_array(chr_name, interval[1], interval[2])
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp):
                    continue
                bw_signal = numpy.nan_to_num(bw_signal) * weight

                if out_format == "WIG":
                    coord = interval[1]
                    for v in bw_signal:
                        coord += 1
                        if v != 0:
                            OUT.write("%d\t%.2f\n" % (coord, v))
                else:
                    write_bedgraph(OUT, chr_name, interval[1] + 1, bw_signal)
def main():
    """Scale a BigWig file (read with pyBigWig) and write wig or bedGraph.

    Steps:
        1. Parse the command line with OptionParser and open the input with
           ``pyBigWig.open``; chromosome sizes come from the file header.
        2. Sum the signal ("wigsum"), either over the merged exons of the
           ``--refgene`` BED file or, without it, over every chromosome in
           chunks of ``--chunk`` bases. Regions for which ``stats`` reports
           no value are skipped.
        3. Compute ``weight = --wigsum / wigsum`` and write every non-zero
           scaled value: as variableStep wig lines ("position<TAB>value") for
           ``--format wig``, or as bedGraph lines in which runs of adjacent
           positions carrying the same value are merged, for ``--format bgr``.

    Exit behaviour:
        * status 0 after printing the help text when a required option is
          missing;
        * status 1 with a message on stderr when the input is not a bigWig
          file (previously status 0), for an unknown output format, or when
          the computed wigsum is 0.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--bwfile", action="store", type="string", dest="BigWig_File", help="Input BigWig file. [required]")
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig", help="Output wig file. [required]")
    parser.add_option("-t", "--wigsum", action="store", type="int", dest="total_wigsum", default=100000000, help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default [optional]")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="refgene_bed", help="Reference gene model in bed format. [optional]")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=500000, help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
    parser.add_option("-f", "--format", action="store", type="string", dest="out_format", default="bgr", help="Output format. either \"wig\" or \"bgr\". \"bgr\" save disk space but make program slower. default=%default")
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig):
        parser.print_help()
        sys.exit(0)

    # Reject an unknown format before any expensive work is done and before
    # the output file is truncated (same message and exit status as before).
    out_format = options.out_format.upper()
    if out_format not in ("WIG", "BGR"):
        print("unknown output format", file=sys.stderr)
        sys.exit(1)

    bw = pyBigWig.open(options.BigWig_File)
    if not bw.isBigWig():
        print("%s is not a bigwig file!" % options.BigWig_File, file=sys.stderr)
        sys.exit(1)

    print("Get chromosome sizes from BigWig header ...", file=sys.stderr)
    chrom_sizes = dict(bw.chroms())

    WIG_SUM = 0.0
    if options.refgene_bed:
        print("Extract exons from " + options.refgene_bed, file=sys.stderr)
        obj = BED.ParseBED(options.refgene_bed)
        exons = obj.getExon()
        print("Merge overlapping exons ...", file=sys.stderr)
        exons = BED.unionBed3(exons)
        print("Calculate wigsum covered by " + options.refgene_bed + ' only', file=sys.stderr)
        for chrom, st, end in exons:
            # Exons on chromosomes that are not in the BigWig header cannot
            # contribute any signal; skip them instead of querying the file.
            if chrom not in chrom_sizes:
                continue
            if bw.stats(chrom, st, end)[0] is None:
                continue
            bw_signal = bw.values(chrom, st, end)
            # nan will be ignored. but if all items are 'nan', the result
            # may be 'nan' rather than 0 (depends on the numpy version)
            tmp = numpy.nansum(bw_signal)
            if numpy.isnan(tmp):
                continue
            WIG_SUM += tmp
        print("Total wigsum is %.2f\n" % WIG_SUM, file=sys.stderr)
    else:
        print("Calculate wigsum from " + options.BigWig_File, file=sys.stderr)
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            if bw.stats(chr_name, 0, chr_size)[0] is None:
                print("Skip " + chr_name + "!", file=sys.stderr)
                continue
            print("Processing " + chr_name + " ...", file=sys.stderr)
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                if bw.stats(interval[0], interval[1], interval[2])[0] is None:
                    continue
                bw_signal = bw.values(interval[0], interval[1], interval[2])
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp):
                    continue
                WIG_SUM += tmp
        print("\nTotal wigsum is %.2f\n" % WIG_SUM, file=sys.stderr)

    # Explicit test instead of catching ZeroDivisionError: once numpy values
    # have been added WIG_SUM is a numpy float, for which a division by zero
    # yields inf instead of raising. The old handler also held its message
    # in a bare string expression, so it was never shown.
    if WIG_SUM == 0:
        print("Error, WIG_SUM cannot be 0", file=sys.stderr)
        sys.exit(1)
    weight = options.total_wigsum / WIG_SUM

    print("Normalizing bigwig file ...", file=sys.stderr)
    # The with-block guarantees the output file is flushed and closed.
    with open(options.output_wig, 'w') as OUT:
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            if bw.stats(chr_name, 0, chr_size)[0] is None:
                print("Skip " + chr_name + "!", file=sys.stderr)
                continue

            print("Writing " + chr_name + " ...", file=sys.stderr)
            if out_format == "WIG":
                OUT.write('variableStep chrom=' + chr_name + '\n')

            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                coord = interval[1]
                bw_signal = bw.values(chr_name, interval[1], interval[2])
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp):
                    continue
                bw_signal = numpy.nan_to_num(bw_signal) * weight

                if out_format == "WIG":
                    for v in bw_signal:
                        coord += 1
                        if v != 0:
                            print("%d\t%.2f" % (coord, v), file=OUT)
                    continue

                # bedGraph: collect the positions of every distinct value ...
                v2p = collections.defaultdict(list)  # value -> positions
                for v in bw_signal:
                    coord += 1
                    if v != 0:
                        v2p[v].append(coord)

                # ... and split them into runs of consecutive positions.
                # Within a run "index - position" is constant, which is what
                # groupby keys on. range2p maps start -> [length, value].
                range2p = {}
                for value, positions in v2p.items():
                    for _, grouped in groupby(enumerate(positions), lambda i_x: i_x[0] - i_x[1]):
                        run = [pos for _, pos in grouped]
                        range2p[run[0] - 1] = [len(run), value]

                for start in sorted(range2p):
                    length, value = range2p[start]
                    print(chr_name + '\t' + str(start) + '\t' + str(start + length) + '\t' + str(value), file=OUT)
def _log_with_time(message):
    """Write *message* to stderr, prefixed with the current local time."""
    sys.stderr.write("@" + strftime("%Y-%m-%d %H:%M:%S") + ": " + message + "\n")


def _call_strand_peaks(bw_obj, bw_path, strand, chrom_sizes, options, pv_cutoff, OUT):
    """Call single-nucleotide peaks for one strand and append them to *OUT*.

    Each chromosome in *chrom_sizes* is processed independently: every
    position with a non-NaN signal becomes a candidate peak, candidates are
    scored with ``cal_poisson_pvalue``, merged with ``merge_peaks`` and the
    survivors are written as tab-separated lines of
    ``chrom, start, end, area, rounded score, strand, height``.

    Parameters:
        bw_obj: bigwig reader exposing ``get_as_array(chrom, start, end)``.
        bw_path: path of the bigwig file, used only in progress messages.
        strand: ``"+"`` or ``"-"``; stored in the peak key and the output.
        chrom_sizes: mapping of chromosome name to chromosome length.
        options: parsed command-line options (chunk_size, extention_size,
            window_size, bg_root_num and fuzzy_size are read).
        pv_cutoff: minimum score a peak needs in order to be reported.
        OUT: writable text file object receiving the peak table.
    """
    _log_with_time("Processing " + bw_path + " ...")
    for chr_name, chr_size in chrom_sizes.items():
        peak_area = {}    # "chrom\tpos\tstrand" -> signal area around pos
        peak_height = {}  # "chrom\tpos\tstrand" -> raw signal at pos
        tree = IntervalTree()

        _log_with_time("Processing " + chr_name + " ...")
        progress = 0
        for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size,
                                       stepSize=options.chunk_size):
            chunk_signal = bw_obj.get_as_array(interval[0], interval[1], interval[2])
            # No array is returned when the chromosome is absent from the
            # bigwig file; such chunks simply carry no candidate peaks.
            if chunk_signal is not None:
                for offset, val in enumerate(chunk_signal):
                    if numpy.isnan(val):
                        continue
                    # interval[1] is the 0-based chunk start, so coord is
                    # the 1-based genomic position of this value.
                    coord = interval[1] + offset + 1
                    key = chr_name + "\t" + str(coord) + "\t" + strand
                    area_value = sum_bwfile(chr_name, coord, options.extention_size,
                                            bw_obj, chrom_sizes)
                    peak_area[key] = area_value
                    peak_height[key] = val
                    # Every candidate goes into the tree. The original
                    # if/else created the tree on the first hit and inserted
                    # only on later hits, silently dropping the first one.
                    tree.insert_interval(Interval(coord - 1, coord, value=area_value))

            finish_part = int(interval[2] * 100 / chr_size)
            if finish_part > progress:
                sys.stderr.write(" %d%% finished\r" % finish_part)
                progress = finish_part

        if not peak_area:
            continue  # nothing to score on this chromosome

        _log_with_time("Calculating pvalues for " + bw_path + " ...")
        peak_pvalue = {}
        for key, area_value in peak_area.items():
            coord = int(key.split("\t")[1])
            peak_pvalue[key] = cal_poisson_pvalue(int(area_value), coord - 1, coord, tree,
                                                  options.window_size, options.bg_root_num)

        merged = merge_peaks(peak_height, fuzziness=options.fuzzy_size)
        for key, height in merged.items():
            pvalue = peak_pvalue[key]
            if pvalue < pv_cutoff:
                continue
            (chrom, end, strand_field) = key.split("\t")
            end = int(end)
            start = end - 1
            OUT.write("\t".join([chrom, str(start), str(end), str(peak_area[key]),
                                 str(round(pvalue)), strand_field, str(height)]) + "\n")


def main():
    """Command-line entry point: call strand-specific single-nucleotide peaks.

    Reads a forward and a reverse bigwig file plus a chromosome size table
    and writes the peaks that pass the p-value cutoff to
    ``<out-prefix>.single_nt_peak.xls`` (forward strand first, then reverse).

    Exits with status 0 after printing the help text when a required option
    is missing, and with status 1 when an input file does not exist.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-b", "--forward", action="store", type="string", dest="forward_bw",
                      help="BigWig file for forward reads (extend 1 nt from 5' end of read)")
    parser.add_option("-d", "--reverse", action="store", type="string", dest="reverse_bw",
                      help="BigWig file for reverse reads (extend 1 nt from 5' end of read)")
    parser.add_option("-s", "--chromSize", action="store", type="string", dest="chromSize",
                      help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix",
                      help="Prefix of output files")
    parser.add_option("-z", "--fuzziness", action="store", type="int", dest="fuzzy_size", default=10,
                      help="Peaks within fuzzy window will be merged. default=%default (bp)")
    parser.add_option("-w", "--bgw", action="store", type="int", dest="window_size", default=200,
                      help="Background window size used to determine background signal level (lambda in Poisson model). default=%default (bp)")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=100000,
                      help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
    parser.add_option("-p", "--pvalue", action="store", type="float", dest="pvalue_cutoff", default=0.1,
                      help="Pvalue cutoff for peak detection. default=%default")
    parser.add_option("-r", "--bg-root-num", action="store", type="float", dest="bg_root_num", default=100,
                      help="Background peak root number. default=%default")
    parser.add_option("-e", "--extention", action="store", type="int", dest="extention_size", default=5,
                      help="Window size used to calculate peak area. Larger number will signficantly reduce speed, and make peak calling more meaningless. default=%default")
    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.chromSize and options.forward_bw and options.reverse_bw):
        parser.print_help()
        sys.exit(0)
    for path in (options.chromSize, options.forward_bw, options.reverse_bw):
        if not os.path.exists(path):
            sys.stderr.write('\n\n' + path + " does NOT exists" + '\n\n')
            # A missing input is an error, so report failure to the caller.
            sys.exit(1)
    if options.pvalue_cutoff <= 0:
        # log10 is undefined here; fail with a usage error, not a traceback.
        parser.error("--pvalue must be greater than 0")

    chrom_sizes = load_chromsize(options.chromSize)
    # Cutoff on the -10*log10(p) scale; cal_poisson_pvalue is presumably
    # returning scores on the same scale (see the comparison in the helper).
    pv_cutoff = -10 * math.log10(options.pvalue_cutoff)

    # bigwig is a binary format, hence "rb"; the with-blocks make sure all
    # three files are closed even if peak calling is interrupted.
    with open(options.output_prefix + ".single_nt_peak.xls", 'w') as OUT:
        with open(options.forward_bw, "rb") as fw_handle:
            with open(options.reverse_bw, "rb") as rv_handle:
                fw_bw_obj = BigWigFile(file=fw_handle)
                rv_bw_obj = BigWigFile(file=rv_handle)

                signal.signal(signal.SIGINT, signal_handler)
                sys.stderr.write("%s\n" % logo)

                # All chromosomes of the size file are processed; the former
                # hard-coded "chrY only" filter was a debugging leftover.
                _call_strand_peaks(fw_bw_obj, options.forward_bw, "+",
                                   chrom_sizes, options, pv_cutoff, OUT)
                _call_strand_peaks(rv_bw_obj, options.reverse_bw, "-",
                                   chrom_sizes, options, pv_cutoff, OUT)
def _bigwig_has_chrom(bw, chrom):
    """Return True when *bw* yields a signal array for *chrom*.

    Probes the first base of the chromosome. Any failure of the probe is
    treated as "chromosome not available", so callers skip it.
    """
    try:
        return bw.get_as_array(chrom, 0, 1) is not None
    except Exception:
        return False


def main():
    """Command-line entry point: rescale a bigwig file to a fixed total wigsum.

    The total signal ("wigsum") is computed either over the merged exons of
    the optional reference gene model or over every chromosome listed in the
    chromosome size file. Every non-zero signal value is then multiplied by
    ``total_wigsum / wigsum`` and written to the output file in variableStep
    wiggle format.

    Exits with status 0 after printing the help text when a required option
    is missing, and with status 1 when the measured wigsum is 0.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i", "--bwfile", action="store", type="string", dest="BigWig_File",
        help="Input BigWig file. [required]"
    )
    parser.add_option(
        "-o", "--output", action="store", type="string", dest="output_wig",
        help="Output wig file. [required]"
    )
    parser.add_option(
        "-s", "--chromSize", action="store", type="string", dest="chromSize",
        help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]",
    )
    parser.add_option(
        "-t", "--wigsum", action="store", type="int", dest="total_wigsum", default=100000000,
        help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default [optional]",
    )
    parser.add_option(
        "-r", "--refgene", action="store", type="string", dest="refgene_bed",
        help="Reference gene model in bed format. [optional]",
    )
    parser.add_option(
        "-c", "--chunk", action="store", type="int", dest="chunk_size", default=100000,
        help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]",
    )
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    # bigwig is a binary format, hence "rb"; the with-blocks close both files
    # on every exit path, including the sys.exit() below.
    with open(options.BigWig_File, "rb") as bw_handle:
        with open(options.output_wig, "w") as OUT:
            bw = BigWigFile(file=bw_handle)
            chrom_sizes = load_chromsize(options.chromSize)

            WIG_SUM = 0.0
            if options.refgene_bed:
                sys.stderr.write("Extract exons from " + options.refgene_bed + "\n")
                exons = BED.ParseBED(options.refgene_bed).getExon()
                sys.stderr.write("Merge overlapping exons ...\n")
                exons = BED.unionBed3(exons)
                sys.stderr.write("Calculate wigsum covered by " + options.refgene_bed + " only\n")
                chrom_available = {}  # probe each chromosome once, not once per exon
                for chrom, st, end in exons:
                    if chrom not in chrom_available:
                        chrom_available[chrom] = _bigwig_has_chrom(bw, chrom)
                    if not chrom_available[chrom]:
                        continue
                    bw_signal = bw.get_as_array(chrom, st, end)
                    if bw_signal is None:
                        continue
                    # NaNs are ignored by nansum, but an all-NaN input can
                    # still sum to NaN (depending on the numpy version).
                    tmp = numpy.nansum(bw_signal)
                    if numpy.isnan(tmp):
                        continue
                    WIG_SUM += tmp
                sys.stderr.write("Total wigsum is %.2f\n\n" % WIG_SUM)
            else:
                sys.stderr.write("Calculate wigsum from " + options.BigWig_File + "\n")
                for chr_name, chr_size in chrom_sizes.items():
                    if not _bigwig_has_chrom(bw, chr_name):
                        sys.stderr.write("Skip " + chr_name + "!\n")
                        continue
                    sys.stderr.write("Processing " + chr_name + " ...\n")
                    for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size,
                                                   stepSize=options.chunk_size):
                        bw_signal = bw.get_as_array(interval[0], interval[1], interval[2])
                        if bw_signal is None:
                            continue
                        tmp = numpy.nansum(bw_signal)
                        if numpy.isnan(tmp):
                            continue
                        WIG_SUM += tmp
                sys.stderr.write("\nTotal wigsum is %.2f\n\n" % WIG_SUM)

            # Explicit test instead of catching ZeroDivisionError: a numpy
            # zero divides to inf without raising, which would go unnoticed.
            if WIG_SUM == 0:
                sys.stderr.write("Error, WIG_SUM cannot be 0\n")
                sys.exit(1)
            weight = options.total_wigsum / WIG_SUM

            # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
            sys.stderr.write("Normalizing bigwig file, output wiggle file\n")
            for chr_name, chr_size in chrom_sizes.items():
                if not _bigwig_has_chrom(bw, chr_name):
                    sys.stderr.write("Skip " + chr_name + "!\n")
                    continue
                sys.stderr.write("Writing " + chr_name + " ...\n")
                OUT.write("variableStep chrom=" + chr_name + "\n")
                for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size,
                                               stepSize=options.chunk_size):
                    chunk_start = interval[1]
                    bw_signal = bw.get_as_array(chr_name, chunk_start, interval[2])
                    if bw_signal is None:
                        continue
                    tmp = numpy.nansum(bw_signal)
                    if numpy.isnan(tmp):
                        continue
                    bw_signal = numpy.nan_to_num(bw_signal)
                    # Visit only the non-zero positions; wiggle coordinates
                    # are 1-based, hence the +1 on the 0-based offset.
                    for offset in numpy.flatnonzero(bw_signal):
                        OUT.write("%d\t%.4f\n" % (chunk_start + offset + 1,
                                                  bw_signal[offset] * weight))