Example #1
0
def main():
    """Combine two BigWig files position by position and write a wiggle file.

    Command-line options give the two input BigWig files, the name of the
    combining function (looked up as an attribute of ``twoList``), the output
    path, the chromosome size file and the chunk size used to tile each
    chromosome. Each chunk is read from both files, NaNs are replaced via
    ``replace_nan`` and the combined values are written in variableStep
    format; positions whose combined value equals 0 are omitted.

    Exits with status 0 after printing help when a required option is
    missing, and through ``parser.error`` when ``--action`` does not name a
    callable of ``twoList``.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)

    parser.add_option("-i", "--bwfile1", action="store", type="string", dest="BigWig_File1",
                      help="BigWig files")
    parser.add_option("-j", "--bwfile2", action="store", type="string", dest="BigWig_File2",
                      help="BigWig files")
    parser.add_option("-a", "--action", action="store", type="string", dest="action",
                      help='After pairwise align two bigwig files, perform the follow actions (Only select one keyword):"Add" = add signals. "Average" = average signals. "Division"= divide bigwig2 from bigwig1. Add 1 to both bigwig. "Max" = pick the signal that is larger. "Min" = pick the signal that is smaller. "Product" = multiply signals. "Subtract" = subtract signals in 2nd bigwig file from the corresponiding ones in the 1st bigwig file. "geometricMean" = take the geometric mean of signals.')
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig",
                      help="Output wig file")
    parser.add_option("-s", "--chromSize", action="store", type="string", dest="chromSize",
                      help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=100000,
                      help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
    (options, args) = parser.parse_args()

    # --action is dereferenced on twoList below, so it is as mandatory as the file options.
    if not (options.BigWig_File1 and options.BigWig_File2 and options.output_wig
            and options.chromSize and options.action):
        parser.print_help()
        sys.exit(0)

    # Resolve the combining function once, before any file is created or truncated.
    call_back = getattr(twoList, options.action, None)
    if not callable(call_back):
        parser.error("unknown action: %s" % options.action)

    chrom_sizes = load_chromsize(options.chromSize)

    # BigWig is a binary format; the context managers guarantee every handle is closed.
    with open(options.BigWig_File1, 'rb') as bw_handle1, \
            open(options.BigWig_File2, 'rb') as bw_handle2, \
            open(options.output_wig, 'w') as OUT:
        bw1 = BigWigFile(file=bw_handle1)
        bw2 = BigWigFile(file=bw_handle2)
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            sys.stderr.write("Processing " + chr_name + " ...\n")
            OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size,
                                           stepSize=options.chunk_size):
                start, end = interval[1], interval[2]
                bw_signal1 = bw1.get_as_array(chr_name, start, end)
                bw_signal2 = bw2.get_as_array(chr_name, start, end)
                if all_nan(bw_signal1) and all_nan(bw_signal2):
                    continue
                bw_signal1 = replace_nan(bw_signal1)
                bw_signal2 = replace_nan(bw_signal2)

                # The first value of a chunk belongs to coordinate start + 1.
                for coord, v in enumerate(call_back(bw_signal1, bw_signal2), start + 1):
                    if v != 0:
                        OUT.write("%d\t%.2f\n" % (coord, v))
Example #2
0
def main():
    """Combine two BigWig files position by position and write a wiggle file.

    Chromosome names and sizes are taken from the headers of both BigWig
    files (sizes from the second file win for names present in both). Each
    chromosome is tiled into chunks of ``--chunk`` bases; for every chunk the
    values of both files are combined with the ``twoList`` function named by
    ``--action`` and written in variableStep format, omitting positions whose
    combined value equals 0.

    A chunk that cannot be read from one of the files (for example because
    the chromosome is only present in the other file) is treated as an
    all-zero signal for that file; a chunk readable from neither is skipped.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)

    parser.add_option("-i", "--bwfile1", action="store", type="string", dest="BigWig_File1",
                      help="One BigWig file.")
    parser.add_option("-j", "--bwfile2", action="store", type="string", dest="BigWig_File2",
                      help="Another BigWig file. Both BigWig files should use the same reference genome.")
    parser.add_option("-a", "--action", action="store", type="string", dest="action",
                      help='After pairwise align two bigwig files, perform the follow actions (Only select one keyword):"Add" = add signals. "Average" = average signals. "Division"= divide bigwig2 from bigwig1. Add 1 to both bigwig. "Max" = pick the signal that is larger. "Min" = pick the signal that is smaller. "Product" = multiply signals. "Subtract" = subtract signals in 2nd bigwig file from the corresponiding ones in the 1st bigwig file. "geometricMean" = take the geometric mean of signals.')
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig",
                      help="Output wig file")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=100000,
                      help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
    (options, args) = parser.parse_args()

    # --action is dereferenced on twoList below, so it is as mandatory as the file options.
    if not (options.BigWig_File1 and options.BigWig_File2 and options.output_wig
            and options.action):
        parser.print_help()
        sys.exit(0)

    # Resolve the combining function once, before the output file is created.
    call_back = getattr(twoList, options.action, None)
    if not callable(call_back):
        parser.error("unknown action: %s" % options.action)

    bw1 = pyBigWig.open(options.BigWig_File1)
    bw2 = pyBigWig.open(options.BigWig_File2)

    print("Get chromosome sizes from BigWig header ...", file=sys.stderr)
    chrom_sizes = dict(bw1.chroms())
    chrom_sizes.update(bw2.chroms())

    def has_coverage(bw, chrom, start, end):
        """Return True when `bw` reports a statistic for the interval."""
        try:
            return bw.stats(chrom, start, end)[0] is not None
        except Exception:
            # The chromosome list is the union of both headers, so the
            # interval may not exist in this particular file.
            return False

    def read_values(bw, chrom, start, end):
        """Return the per-base values of the interval, or None if unreadable."""
        try:
            return bw.values(chrom, start, end)
        except Exception:
            return None

    with open(options.output_wig, 'w') as OUT:
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            print("Processing " + chr_name + " ...", file=sys.stderr)
            OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size,
                                           stepSize=options.chunk_size):
                start, end = interval[1], interval[2]
                if not (has_coverage(bw1, chr_name, start, end)
                        or has_coverage(bw2, chr_name, start, end)):
                    continue

                bw_signal1 = read_values(bw1, chr_name, start, end)
                bw_signal2 = read_values(bw2, chr_name, start, end)
                if bw_signal1 is None and bw_signal2 is None:
                    continue
                # A file without this interval contributes no signal; keep both
                # arrays the same length so they can be combined element-wise.
                if bw_signal1 is None:
                    bw_signal1 = numpy.zeros(end - start)
                if bw_signal2 is None:
                    bw_signal2 = numpy.zeros(end - start)

                if numpy.isnan(numpy.nansum(bw_signal1)) and numpy.isnan(numpy.nansum(bw_signal2)):
                    continue
                if len(bw_signal1) == 0 and len(bw_signal2) == 0:
                    continue
                bw_signal1 = numpy.nan_to_num(bw_signal1)
                bw_signal2 = numpy.nan_to_num(bw_signal2)

                # The first value of a chunk belongs to coordinate start + 1.
                for coord, v in enumerate(call_back(bw_signal1, bw_signal2), start + 1):
                    if v != 0:
                        print("%d\t%.2f" % (coord, v), file=OUT)

    bw1.close()
    bw2.close()
Example #3
0
def main():
    """Convert a sorted, indexed BAM file into strand-specific wiggle files.

    Writes ``<prefix>_Forward.wig`` and ``<prefix>_Reverse.wig``. Every
    chromosome listed in the chromosome size file is tiled into bins of
    ``--bin`` bases; the alignments of each bin are turned into per-position
    counts by ``build_wig`` and written in variableStep format.

    Exits with status 0 after printing help or an error message when a
    required option, an input file or the ``.bai`` index is missing.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file",
                      help="Input file in BAM format. BAM file must be sorted and indexed using samTools. HowTo: http://genome.ucsc.edu/goldenPath/help/bam.html")
    parser.add_option("-r", "--chromSize", action="store", type="string", dest="chromSize",
                      help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix",
                      help="Prefix of output wig files(s). \"Prefix_Forward.wig\" and \"Prefix_Reverse.wig\" will be generated")
    parser.add_option("-b", "--bin", action="store", type="int", dest="bin", default=100000,
                      help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
    parser.add_option("-e", "--extension", action="store", type="int", dest="extension", default=None,
                      help="Extended coverage from 5' end of read. default=%default (full read coverage will be used)")

    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.input_file and options.chromSize):
        parser.print_help()
        sys.exit(0)
    for path in (options.input_file, options.chromSize):
        if not os.path.exists(path):
            sys.stderr.write('\n\n' + path + " does NOT exists" + '\n\n')
            sys.exit(0)
    index_path = options.input_file + '.bai'
    if not os.path.exists(index_path):
        # A space was missing between the file name and the rest of the message.
        sys.stderr.write("index file " + index_path + " does not exists\n")
        sys.exit(0)

    chrom_sizes = load_chromsize(options.chromSize)
    samfile = SAM.ParseBAM(options.input_file)

    # Context managers make sure both wiggle files are flushed and closed.
    with open(options.output_prefix + "_Forward.wig", 'w') as FWOUT, \
            open(options.output_prefix + "_Reverse.wig", 'w') as RWOUT:
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            # Probe the whole chromosome first; a failure means there is nothing to report.
            try:
                samfile.fetchAlignments(chr_name, 0, chr_size)
            except Exception:
                sys.stderr.write("No alignments for " + chr_name + '. skipped\n')
                continue
            sys.stderr.write("Processing " + chr_name + " ...\n")
            FWOUT.write('variableStep chrom=' + chr_name + '\n')
            RWOUT.write('variableStep chrom=' + chr_name + '\n')
            # Cut the chromosome into bins, interval such as ('chr1', 235000000, 236000000).
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size,
                                           stepSize=options.bin):
                alignedReads = samfile.fetchAlignments(interval[0], interval[1], interval[2])
                (Fwig, Rwig) = build_wig(alignedReads, options.extension)

                for wig, out in ((Fwig, FWOUT), (Rwig, RWOUT)):
                    if len(wig) == 0:
                        continue
                    # Only positions inside the current bin are written, in ascending order.
                    for i in xrange(interval[1] + 1, interval[2] + 1):
                        if i in wig:
                            out.write("%d\t%d\n" % (i, wig[i]))
Example #4
0
def main():
    """Combine two BigWig files position by position and write a wiggle file.

    Each chromosome of the chromosome size file is tiled into chunks of
    ``--chunk`` bases. For every chunk the values of both BigWig files are
    combined with the ``twoList`` function named by ``--action`` and written
    in variableStep format; only positions whose combined value is at least
    ``--min_signal`` are written.

    A chunk that cannot be read from one of the files is treated as an
    all-zero signal for that file; a chunk readable from neither is skipped.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)

    parser.add_option("-i", "--bwfile1", action="store", type="string", dest="BigWig_File1",
                      help="One BigWig file")
    parser.add_option("-j", "--bwfile2", action="store", type="string", dest="BigWig_File2",
                      help="Another BigWig file")
    parser.add_option("-a", "--action", action="store", type="string", dest="action",
                      help='After pairwise align two bigwig files, perform the follow actions (Only select one keyword):"Add" = add signals. "Average" = average signals. "Division"= divide bigwig2 from bigwig1. Add 1 to both bigwig. "Max" = pick the signal that is larger. "Min" = pick the signal that is smaller. "Product" = multiply signals. "Subtract" = subtract signals in 2nd bigwig file from the corresponiding ones in the 1st bigwig file. "geometricMean" = take the geometric mean of signals.')
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig",
                      help="Output wig file")
    parser.add_option("-s", "--chromSize", action="store", type="string", dest="chromSize",
                      help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=100000,
                      help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
    parser.add_option("-m", "--min_signal", action="store", type="float", dest="min_score", default=0.0,
                      help="To redude the size of output wigfile, genomic positions with signal value smaller than (<) this threshold will be filtered out. default=%default")
    (options, args) = parser.parse_args()

    # --action is dereferenced on twoList below, so it is as mandatory as the file options.
    if not (options.BigWig_File1 and options.BigWig_File2 and options.output_wig
            and options.chromSize and options.action):
        parser.print_help()
        sys.exit(0)

    # Resolve the combining function once, before any file is created or truncated.
    call_back = getattr(twoList, options.action, None)
    if not callable(call_back):
        parser.error("unknown action: %s" % options.action)

    chrom_sizes = load_chromsize(options.chromSize)

    def read_signal(bw, chrom, start, end):
        """Return the signal array of the interval, or None if it cannot be read."""
        try:
            return bw.get_as_array(chrom, start, end)
        except Exception:
            return None

    # BigWig is a binary format; the context managers guarantee every handle is closed.
    with open(options.BigWig_File1, 'rb') as bw_handle1, \
            open(options.BigWig_File2, 'rb') as bw_handle2, \
            open(options.output_wig, 'w') as OUT:
        bw1 = BigWigFile(file=bw_handle1)
        bw2 = BigWigFile(file=bw_handle2)
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            sys.stderr.write("Processing " + chr_name + " ...\n")
            OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size,
                                           stepSize=options.chunk_size):
                start, end = interval[1], interval[2]
                bw_signal1 = read_signal(bw1, chr_name, start, end)
                bw_signal2 = read_signal(bw2, chr_name, start, end)
                if bw_signal1 is None and bw_signal2 is None:
                    continue
                # A file without this interval contributes no signal; keep both
                # arrays the same length so they can be combined element-wise.
                if bw_signal1 is None:
                    bw_signal1 = numpy.zeros(end - start)
                if bw_signal2 is None:
                    bw_signal2 = numpy.zeros(end - start)

                if numpy.isnan(numpy.nansum(bw_signal1)) and numpy.isnan(numpy.nansum(bw_signal2)):
                    continue
                if len(bw_signal1) == 0 and len(bw_signal2) == 0:
                    continue
                bw_signal1 = numpy.nan_to_num(bw_signal1)
                bw_signal2 = numpy.nan_to_num(bw_signal2)

                # The first value of a chunk belongs to coordinate start + 1.
                for coord, v in enumerate(call_back(bw_signal1, bw_signal2), start + 1):
                    if v >= options.min_score:
                        OUT.write("%d\t%.2f\n" % (coord, v))
def main():
    """Normalize a BigWig file to a fixed total signal ("wigsum").

    The total signal is summed either over the merged exons of ``--refgene``
    or, without that option, over every chromosome of the chromosome size
    file that can be read from the BigWig. Every value is then multiplied by
    ``total_wigsum / measured_wigsum`` and written as variableStep wiggle
    (``--format wig``) or bedGraph (``--format bgr``). Zero values are not
    written.

    Exits with status 1 when the output format is unknown or the measured
    wigsum is 0, and with status 0 after printing help when a required
    option is missing.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)

    parser.add_option("-i", "--bwfile", action="store", type="string", dest="BigWig_File",
                      help="Input BigWig file. [required]")
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig",
                      help="Output wig file. [required]")
    parser.add_option("-s", "--chromSize", action="store", type="string", dest="chromSize",
                      help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]")
    parser.add_option("-t", "--wigsum", action="store", type="int", dest="total_wigsum", default=100000000,
                      help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default  [optional]")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="refgene_bed",
                      help="Reference gene model in bed format. [optional]")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=500000,
                      help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
    parser.add_option("-f", "--format", action="store", type="string", dest="out_format", default="bgr",
                      help="Output format. either \"wig\" or \"bgr\". \"bgr\" save disk space but make program slower. default=%default")
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    # Reject a bad format before the expensive wigsum pass and before the
    # output file is created.
    out_format = options.out_format.upper()
    if out_format not in ("WIG", "BGR"):
        sys.stderr.write("unknown output format\n")
        sys.exit(1)

    bw_handle = open(options.BigWig_File, 'rb')  # closed by the `with` below
    bw = BigWigFile(file=bw_handle)
    chrom_sizes = load_chromsize(options.chromSize)

    def readable(chrom):
        """Probe a single base to see whether `chrom` can be read from the BigWig."""
        try:
            bw.get_as_array(chrom, 0, 1).size
        except Exception:
            return False
        return True

    WIG_SUM = 0.0
    if options.refgene_bed:
        sys.stderr.write("Extract exons from " + options.refgene_bed + "\n")
        exons = BED.ParseBED(options.refgene_bed).getExon()
        sys.stderr.write("Merge overlapping exons ...\n")
        exons = BED.unionBed3(exons)
        sys.stderr.write("Calculate wigsum covered by " + options.refgene_bed + ' only\n')
        for chrom, st, end in exons:
            if not readable(chrom):
                continue
            # nansum ignores NaN, but may itself be NaN when every item is NaN.
            tmp = numpy.nansum(bw.get_as_array(chrom, st, end))
            if numpy.isnan(tmp):
                continue
            WIG_SUM += tmp
        sys.stderr.write("Total wigsum is %.2f\n\n" % WIG_SUM)
    else:
        sys.stderr.write("Calculate wigsum from " + options.BigWig_File + "\n")
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            if not readable(chr_name):
                sys.stderr.write("Skip " + chr_name + "!\n")
                continue
            sys.stderr.write("Processing " + chr_name + " ...\n")
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size,
                                           stepSize=options.chunk_size):
                tmp = numpy.nansum(bw.get_as_array(interval[0], interval[1], interval[2]))
                if numpy.isnan(tmp):
                    continue
                WIG_SUM += tmp
        sys.stderr.write("\nTotal wigsum is %.2f\n\n" % WIG_SUM)

    # Test explicitly: dividing by a numpy zero yields inf instead of raising,
    # so relying on an exception would let a useless weight through.
    if WIG_SUM == 0:
        sys.stderr.write("Error, WIG_SUM cannot be 0\n")
        sys.exit(1)
    weight = options.total_wigsum / WIG_SUM

    sys.stderr.write("Normalizing bigwig file ...\n")
    with bw_handle, open(options.output_wig, 'w') as OUT:
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            if not readable(chr_name):
                sys.stderr.write("Skip " + chr_name + "!\n")
                continue

            sys.stderr.write("Writing " + chr_name + " ...\n")
            if out_format == "WIG":
                OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size,
                                           stepSize=options.chunk_size):
                bw_signal = bw.get_as_array(chr_name, interval[1], interval[2])
                if numpy.isnan(numpy.nansum(bw_signal)):
                    continue
                bw_signal = numpy.nan_to_num(bw_signal) * weight

                if out_format == "WIG":
                    # The first value of a chunk belongs to coordinate start + 1.
                    for coord, v in enumerate(bw_signal, interval[1] + 1):
                        if v != 0:
                            OUT.write("%d\t%.2f\n" % (coord, v))
                else:
                    # bedGraph: one line per run of adjacent equal, non-zero
                    # values, with a 0-based start and an exclusive end.
                    start = interval[1]
                    for value, run in groupby(bw_signal):
                        run_len = sum(1 for _ in run)
                        if value != 0:
                            OUT.write(chr_name + '\t' + str(start) + '\t' +
                                      str(start + run_len) + '\t' + str(value) + '\n')
                        start += run_len
Example #6
0
def main():
    """Normalize a BigWig file to a fixed total signal and write a wiggle file.

    The total signal ("wigsum") is summed either over the merged exons of
    ``--refgene`` or, without that option, over every chromosome of the
    chromosome size file that can be read from the BigWig. Every non-zero
    value is then multiplied by ``total_wigsum / measured_wigsum`` and
    written in variableStep format.

    Exits with status 1 when the measured wigsum is 0, and with status 0
    after printing help when a required option is missing.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)

    parser.add_option("-i", "--bwfile", action="store", type="string", dest="BigWig_File",
                      help="Input BigWig file. [required]")
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig",
                      help="Output wig file. [required]")
    parser.add_option("-s", "--chromSize", action="store", type="string", dest="chromSize",
                      help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]")
    parser.add_option("-t", "--wigsum", action="store", type="int", dest="total_wigsum", default=100000000,
                      help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default  [optional]")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="refgene_bed",
                      help="Reference gene model in bed format. [optional]")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=100000,
                      help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    bw_handle = open(options.BigWig_File, 'rb')  # closed by the `with` below
    bw = BigWigFile(file=bw_handle)
    chrom_sizes = load_chromsize(options.chromSize)

    def readable(chrom):
        """Probe a single base to see whether `chrom` can be read from the BigWig."""
        try:
            bw.get_as_array(chrom, 0, 1).size
        except Exception:
            return False
        return True

    WIG_SUM = 0.0
    if options.refgene_bed:
        sys.stderr.write("Extract exons from " + options.refgene_bed + "\n")
        exons = BED.ParseBED(options.refgene_bed).getExon()
        sys.stderr.write("Merge overlapping exons ...\n")
        exons = BED.unionBed3(exons)
        sys.stderr.write("Calculate wigsum covered by " + options.refgene_bed + ' only\n')
        for chrom, st, end in exons:
            if not readable(chrom):
                continue
            # nansum ignores NaN, but may itself be NaN when every item is NaN.
            tmp = numpy.nansum(bw.get_as_array(chrom, st, end))
            if numpy.isnan(tmp):
                continue
            WIG_SUM += tmp
        sys.stderr.write("Total wigsum is %.2f\n\n" % WIG_SUM)
    else:
        sys.stderr.write("Calculate wigsum from " + options.BigWig_File + "\n")
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            if not readable(chr_name):
                sys.stderr.write("Skip " + chr_name + "!\n")
                continue
            sys.stderr.write("Processing " + chr_name + " ...\n")
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size,
                                           stepSize=options.chunk_size):
                tmp = numpy.nansum(bw.get_as_array(interval[0], interval[1], interval[2]))
                if numpy.isnan(tmp):
                    continue
                WIG_SUM += tmp
        sys.stderr.write("\nTotal wigsum is %.2f\n\n" % WIG_SUM)

    # Test explicitly: dividing by a numpy zero yields inf instead of raising,
    # so relying on an exception would let a useless weight through.
    if WIG_SUM == 0:
        sys.stderr.write("Error, WIG_SUM cannot be 0\n")
        sys.exit(1)
    weight = options.total_wigsum / WIG_SUM

    sys.stderr.write("Normalizing bigwig file, output wiggle file\n")
    with bw_handle, open(options.output_wig, 'w') as OUT:
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            if not readable(chr_name):
                sys.stderr.write("Skip " + chr_name + "!\n")
                continue

            sys.stderr.write("Writing " + chr_name + " ...\n")
            OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size,
                                           stepSize=options.chunk_size):
                bw_signal = bw.get_as_array(chr_name, interval[1], interval[2])
                if numpy.isnan(numpy.nansum(bw_signal)):
                    continue
                bw_signal = numpy.nan_to_num(bw_signal)
                # The first value of a chunk belongs to coordinate start + 1;
                # the zero test is made on the raw value, before scaling.
                for coord, v in enumerate(bw_signal, interval[1] + 1):
                    if v != 0:
                        OUT.write("%d\t%.4f\n" % (coord, v * weight))
def main():
    """Normalize a BigWig file to a fixed total signal ("wigsum").

    The total signal is summed either over the merged exons of ``--refgene``
    or, without that option, over every chromosome of the chromosome size
    file that can be read from the BigWig. Every value is then multiplied by
    ``total_wigsum / measured_wigsum`` and written as variableStep wiggle
    (``--format wig``) or bedGraph (``--format bgr``). Zero values are not
    written.

    Exits with status 1 when the output format is unknown or the measured
    wigsum is 0, and with status 0 after printing help when a required
    option is missing.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)

    parser.add_option("-i", "--bwfile", action="store", type="string", dest="BigWig_File",
                      help="Input BigWig file. [required]")
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig",
                      help="Output wig file. [required]")
    parser.add_option("-s", "--chromSize", action="store", type="string", dest="chromSize",
                      help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]")
    parser.add_option("-t", "--wigsum", action="store", type="int", dest="total_wigsum", default=100000000,
                      help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default  [optional]")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="refgene_bed",
                      help="Reference gene model in bed format. [optional]")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=500000,
                      help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
    parser.add_option("-f", "--format", action="store", type="string", dest="out_format", default="bgr",
                      help="Output format. either \"wig\" or \"bgr\". \"bgr\" save disk space but make program slower. default=%default")
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    # Reject a bad format before the expensive wigsum pass and before the
    # output file is created.
    out_format = options.out_format.upper()
    if out_format not in ("WIG", "BGR"):
        sys.stderr.write("unknown output format\n")
        sys.exit(1)

    bw_handle = open(options.BigWig_File, 'rb')  # closed by the `with` below
    bw = BigWigFile(file=bw_handle)
    chrom_sizes = load_chromsize(options.chromSize)

    def readable(chrom):
        """Probe a single base to see whether `chrom` can be read from the BigWig."""
        try:
            bw.get_as_array(chrom, 0, 1).size
        except Exception:
            return False
        return True

    WIG_SUM = 0.0
    if options.refgene_bed:
        sys.stderr.write("Extract exons from " + options.refgene_bed + "\n")
        exons = BED.ParseBED(options.refgene_bed).getExon()
        sys.stderr.write("Merge overlapping exons ...\n")
        exons = BED.unionBed3(exons)
        sys.stderr.write("Calculate wigsum covered by " + options.refgene_bed + ' only\n')
        for chrom, st, end in exons:
            if not readable(chrom):
                continue
            # nansum ignores NaN, but may itself be NaN when every item is NaN.
            tmp = numpy.nansum(bw.get_as_array(chrom, st, end))
            if numpy.isnan(tmp):
                continue
            WIG_SUM += tmp
        sys.stderr.write("Total wigsum is %.2f\n\n" % WIG_SUM)
    else:
        sys.stderr.write("Calculate wigsum from " + options.BigWig_File + "\n")
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            if not readable(chr_name):
                sys.stderr.write("Skip " + chr_name + "!\n")
                continue
            sys.stderr.write("Processing " + chr_name + " ...\n")
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size,
                                           stepSize=options.chunk_size):
                tmp = numpy.nansum(bw.get_as_array(interval[0], interval[1], interval[2]))
                if numpy.isnan(tmp):
                    continue
                WIG_SUM += tmp
        sys.stderr.write("\nTotal wigsum is %.2f\n\n" % WIG_SUM)

    # Test explicitly: dividing by a numpy zero yields inf instead of raising,
    # so relying on an exception would let a useless weight through.
    if WIG_SUM == 0:
        sys.stderr.write("Error, WIG_SUM cannot be 0\n")
        sys.exit(1)
    weight = options.total_wigsum / WIG_SUM

    sys.stderr.write("Normalizing bigwig file ...\n")
    with bw_handle, open(options.output_wig, 'w') as OUT:
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            if not readable(chr_name):
                sys.stderr.write("Skip " + chr_name + "!\n")
                continue

            sys.stderr.write("Writing " + chr_name + " ...\n")
            if out_format == "WIG":
                OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size,
                                           stepSize=options.chunk_size):
                bw_signal = bw.get_as_array(chr_name, interval[1], interval[2])
                if numpy.isnan(numpy.nansum(bw_signal)):
                    continue
                bw_signal = numpy.nan_to_num(bw_signal) * weight

                if out_format == "WIG":
                    # The first value of a chunk belongs to coordinate start + 1.
                    for coord, v in enumerate(bw_signal, interval[1] + 1):
                        if v != 0:
                            OUT.write("%d\t%.2f\n" % (coord, v))
                else:
                    # bedGraph: one line per run of adjacent equal, non-zero
                    # values, with a 0-based start and an exclusive end.
                    start = interval[1]
                    for value, run in groupby(bw_signal):
                        run_len = sum(1 for _ in run)
                        if value != 0:
                            OUT.write(chr_name + '\t' + str(start) + '\t' +
                                      str(start + run_len) + '\t' + str(value) + '\n')
                        start += run_len
Example #8
0
def main():
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)

    parser.add_option("-i",
                      "--bwfile",
                      action="store",
                      type="string",
                      dest="BigWig_File",
                      help="Input BigWig file. [required]")
    parser.add_option("-o",
                      "--output",
                      action="store",
                      type="string",
                      dest="output_wig",
                      help="Output wig file. [required]")
    parser.add_option(
        "-t",
        "--wigsum",
        action="store",
        type="int",
        dest="total_wigsum",
        default=100000000,
        help=
        "Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default  [optional]"
    )
    parser.add_option("-r",
                      "--refgene",
                      action="store",
                      type="string",
                      dest="refgene_bed",
                      help="Reference gene model in bed format. [optional]")
    parser.add_option(
        "-c",
        "--chunk",
        action="store",
        type="int",
        dest="chunk_size",
        default=500000,
        help=
        "Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]"
    )
    parser.add_option(
        "-f",
        "--format",
        action="store",
        type="string",
        dest="out_format",
        default="bgr",
        help=
        "Output format. either \"wig\" or \"bgr\". \"bgr\" save disk space but make program slower. default=%default"
    )
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig):
        parser.print_help()
        sys.exit(0)

    OUT = open(options.output_wig, 'w')
    bw = pyBigWig.open(options.BigWig_File)

    if bw.isBigWig():
        pass
    else:
        print("%s is not a bigwig file!" % options.BigWig_File,
              file=sys.stderr)
        sys.exit(0)

    print("Get chromosome sizes from BigWig header ...", file=sys.stderr)
    chrom_sizes = {}
    for chr, size in bw.chroms().items():
        chrom_sizes[chr] = size

    exons = []
    WIG_SUM = 0.0
    if (options.refgene_bed):
        print("Extract exons from " + options.refgene_bed, file=sys.stderr)
        obj = BED.ParseBED(options.refgene_bed)
        exons = obj.getExon()
        print("Merge overlapping exons ...", file=sys.stderr)
        exons = BED.unionBed3(exons)
        print("Calculate wigsum covered by " + options.refgene_bed + ' only',
              file=sys.stderr)
        for chrom, st, end in exons:
            if bw.stats(chrom, st, end)[0] is None:
                continue
            bw_signal = bw.values(chrom, st, end)
            tmp = numpy.nansum(
                bw_signal
            )  #nan will be ignored. but if all items are 'nan', the result summay is 'nan' NOT 0
            if numpy.isnan(tmp): continue
            WIG_SUM += tmp
        print("Total wigsum is %.2f\n" % WIG_SUM, file=sys.stderr)
    else:
        print("Calculate wigsum from " + options.BigWig_File, file=sys.stderr)
        for chr_name, chr_size in list(
                chrom_sizes.items()):  #iterate each chrom
            if bw.stats(chr_name, 0, chr_size)[0] is None:
                print("Skip " + chr_name + "!", file=sys.stderr)
                continue

            print("Processing " + chr_name + " ...", file=sys.stderr)
            for interval in BED.tillingBed(chrName=chr_name,
                                           chrSize=chr_size,
                                           stepSize=options.chunk_size):
                if bw.stats(interval[0], interval[1], interval[2])[0] is None:
                    continue
                bw_signal = bw.values(interval[0], interval[1], interval[2])
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp): continue
                WIG_SUM += tmp
        print("\nTotal wigsum is %.2f\n" % WIG_SUM, file=sys.stderr)

    try:
        weight = options.total_wigsum / WIG_SUM
    except:
        "Error, WIG_SUM cannot be 0"
        sys.exit(1)

    #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    print("Normalizing bigwig file ...", file=sys.stderr)
    for chr_name, chr_size in list(chrom_sizes.items()):  #iterate each chrom

        if bw.stats(chr_name, 0, chr_size)[0] is None:
            print("Skip " + chr_name + "!", file=sys.stderr)
            continue

        if options.out_format.upper() == "WIG":
            print("Writing " + chr_name + " ...", file=sys.stderr)
            OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name,
                                           chrSize=chr_size,
                                           stepSize=options.chunk_size):
                coord = interval[1]
                bw_signal = bw.values(chr_name, interval[1], interval[2])
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp): continue
                bw_signal = numpy.nan_to_num(bw_signal) * weight
                for v in bw_signal:
                    coord += 1
                    if v != 0: print("%d\t%.2f" % (coord, v), file=OUT)
        elif options.out_format.upper() == "BGR":
            print("Writing " + chr_name + " ...", file=sys.stderr)
            #OUT.write('variableStep chrom='+chr_name+'\n')
            for interval in BED.tillingBed(chrName=chr_name,
                                           chrSize=chr_size,
                                           stepSize=options.chunk_size):
                v2p = collections.defaultdict(list)  #value to position
                range2p = {
                }  #coorindate range to value, bedgraph. #[start]=[len,value]
                coord = interval[1]
                bw_signal = bw.values(chr_name, interval[1], interval[2])
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp): continue
                bw_signal = numpy.nan_to_num(bw_signal) * weight
                for v in bw_signal:
                    coord += 1
                    if v != 0: v2p[v].append(coord)
                for v in v2p:
                    for k, g in groupby(enumerate(v2p[v]),
                                        lambda i_x: i_x[0] - i_x[1]):
                        for l in [list(map(itemgetter(1), g))]:
                            range2p[l[0] - 1] = [len(l), v]
                for i in sorted(range2p):
                    print(chr_name + '\t' + str(i) + '\t' +
                          str(i + range2p[i][0]) + '\t' + str(range2p[i][1]),
                          file=OUT)
        else:
            print("unknown output format", file=sys.stderr)
            sys.exit(1)
Example #9
0
def main():
    """Call single-nucleotide peaks from forward and reverse BigWig files.

    Command-line entry point.  For each strand, every chromosome listed in
    the chromosome-size file is walked in chunks of ``--chunk`` bases.  Each
    position with a non-NaN signal gets a height (the signal itself) and an
    area (``sum_bwfile`` over ``--extention`` bases), is scored with
    ``cal_poisson_pvalue`` and nearby positions are merged by ``merge_peaks``.
    Surviving peaks are written to ``<out-prefix>.single_nt_peak.xls`` as
    tab-separated lines: chrom, start, end, area, rounded score, strand,
    height.  Peaks whose score is below ``-10*log10(--pvalue)`` are dropped.

    Exits with status 0 after printing help when a required option is
    missing, and with status 1 when an input file does not exist.
    """
    usage="%prog [options]"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-b","--forward",action="store",type="string",dest="forward_bw",help="BigWig file for forward reads (extend 1 nt from 5' end of read)")
    parser.add_option("-d","--reverse",action="store",type="string",dest="reverse_bw",help="BigWig file for reverse reads (extend 1 nt from 5' end of read)")
    parser.add_option("-s","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
    parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output files")
    parser.add_option("-z","--fuzziness",action="store",type="int",dest="fuzzy_size",default=10,help="Peaks within fuzzy window will be merged. default=%default (bp)")
    parser.add_option("-w","--bgw",action="store",type="int",dest="window_size",default=200,help="Background window size used to determine background signal level (lambda in Poisson model). default=%default (bp)")
    parser.add_option("-c","--chunk",action="store",type="int",dest="chunk_size",default=100000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
    parser.add_option("-p","--pvalue",action="store",type="float",dest="pvalue_cutoff",default=0.1,help="Pvalue cutoff for peak detection. default=%default")
    parser.add_option("-r","--bg-root-num",action="store",type="float",dest="bg_root_num",default=100,help="Background peak root number. default=%default")
    parser.add_option("-e","--extention",action="store",type="int",dest="extention_size",default=5,help="Window size used to calculate peak area. Larger number will signficantly reduce speed, and make peak calling more meaningless.  default=%default")

    (options,args)=parser.parse_args()

    if not (options.output_prefix and options.chromSize and options.forward_bw and options.reverse_bw):
        parser.print_help()
        sys.exit(0)
    for path in (options.chromSize,options.forward_bw,options.reverse_bw):
        if not os.path.exists(path):
            print >>sys.stderr, '\n\n' + path + " does NOT exists" + '\n'
            # A missing input is an error, so report a non-zero status.
            sys.exit(1)

    chrom_sizes = load_chromsize(options.chromSize)
    OUT = open(options.output_prefix + ".single_nt_peak.xls",'w')
    fw_bw_obj = BigWigFile( file = open(options.forward_bw))
    rv_bw_obj = BigWigFile( file = open(options.reverse_bw))
    rv_peak_roots = {}
    rv_peak_height = {}
    rv_ranges={}
    rv_peak_pvalue={}
    # Scores are compared against the cutoff on a -10*log10 scale.
    pv_cutoff = -10*math.log10(options.pvalue_cutoff)
    signal.signal(signal.SIGINT, signal_handler)

    print >>sys.stderr, logo

    #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    #calculate peak height and peak area for forward bigwig
    print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Processing " + options.forward_bw + '  ...'
    for chr_name, chr_size in chrom_sizes.items():      #iterate each chrom
        # Forward-strand state is rebuilt for every chromosome and flushed to
        # OUT before moving on to the next one.
        fw_peak_roots = {}  #key is "chrom\tposition\t+", value is area
        fw_peak_height = {} #same key, value is the raw signal at that position
        fw_ranges={}
        fw_peak_pvalue={}
        print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Processing " + chr_name + " ..."
        progress = 0
        coord = 0
        #for each chunk
        for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size):    #cut chrom into bins, interval such as ('chr1', 235000000, 236000000)
            chunk_values = fw_bw_obj.get_as_array(interval[0],interval[1],interval[2])
            # NOTE(review): presumably get_as_array returns None when the
            # chromosome is absent from the BigWig - confirm against bx-python.
            if chunk_values is None:
                continue
            for val in chunk_values:
                coord += 1  #coord is 1-based on genome
                if numpy.isnan(val):continue
                key = chr_name + "\t" + str(coord) + "\t+"
                area_value = sum_bwfile(chr_name, coord, options.extention_size, fw_bw_obj,chrom_sizes)
                fw_peak_roots[key] = area_value
                fw_peak_height[key] = val
                if chr_name not in fw_ranges:
                    fw_ranges[chr_name] = IntervalTree()
                # Insert every position, including the one that created the tree.
                fw_ranges[chr_name].insert_interval( Interval( coord-1, coord, value=area_value) )
            finish_part = int(interval[2]*100/chr_size)
            if finish_part > progress:
                print >>sys.stderr, " %d%% finished\r" % (finish_part),
                progress = finish_part

        # Nothing to score or write for a chromosome without forward signal.
        if not fw_peak_roots:
            continue

        print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Calculating pvalues for " + options.forward_bw + '  ...'
        for k in fw_peak_roots:
            chrom = k.split("\t")[0]
            coord = int(k.split("\t")[1])
            fw_peak_pvalue[k] = cal_poisson_pvalue(int(fw_peak_roots[k]), coord-1, coord, fw_ranges[chrom],options.window_size,options.bg_root_num)

        fw_peak_filtered = merge_peaks(fw_peak_height,fuzziness=options.fuzzy_size)
        for k,v in fw_peak_filtered.items():
            (chrom,end,strand) = k.split('\t')
            end = int(end)
            start = end -1
            height = str(v)
            area = str(fw_peak_roots[k])
            pvalue = fw_peak_pvalue[k]
            if pvalue < pv_cutoff:continue
            print >>OUT, '\t'.join([chrom, str(start), str(end), area,str(round(pvalue)),strand,height])

    #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    #calculate peak height and peak area for reverse bigwig
    # Unlike the forward strand, reverse-strand state accumulates across all
    # chromosomes and is scored/merged/written once after the loop.
    print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Processing " + options.reverse_bw + '  ...'
    for chr_name, chr_size in chrom_sizes.items():      #iterate each chrom
        print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Processing " + chr_name + " ..."
        progress = 0
        coord = 0
        #for each chunk
        for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size):    #cut chrom into bins, interval such as ('chr1', 235000000, 236000000)
            chunk_values = rv_bw_obj.get_as_array(interval[0],interval[1],interval[2])
            # NOTE(review): presumably None means the chromosome is not in the
            # BigWig - confirm against bx-python.
            if chunk_values is None:
                continue
            for val in chunk_values:
                coord += 1  #coord is 1-based on genome
                if numpy.isnan(val):continue
                key = chr_name + "\t" + str(coord) + "\t-"
                area_value = sum_bwfile(chr_name, coord, options.extention_size, rv_bw_obj,chrom_sizes)
                rv_peak_roots[key] = area_value
                rv_peak_height[key] = val
                if chr_name not in rv_ranges:
                    rv_ranges[chr_name] = IntervalTree()
                # Insert every position, including the one that created the tree.
                rv_ranges[chr_name].insert_interval( Interval( coord-1, coord, value = area_value) )
            finish_part = int(interval[2]*100/chr_size)
            if finish_part > progress:
                print >>sys.stderr, " %d%% finished\r" % (finish_part),
                progress = finish_part

    print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Calculating pvalues for " + options.reverse_bw + '  ... '
    for k in rv_peak_roots:
        chrom = k.split("\t")[0]
        coord = int(k.split("\t")[1])
        rv_peak_pvalue[k] = cal_poisson_pvalue(int(rv_peak_roots[k]),coord-1,coord, rv_ranges[chrom],options.window_size,options.bg_root_num)

    rv_peak_filtered = merge_peaks(rv_peak_height,fuzziness=options.fuzzy_size)
    for k,v in rv_peak_filtered.items():
        (chrom,end,strand) = k.split('\t')
        end = int(end)
        start = end -1
        height = str(v)
        area = str(rv_peak_roots[k])
        pvalue = rv_peak_pvalue[k]
        if pvalue < pv_cutoff:continue

        print >>OUT, '\t'.join([chrom, str(start), str(end), area, str(round(pvalue)),strand,height])

    # Flush and release the output file explicitly.
    OUT.close()
Example #10
0
def main():
    """Scale a BigWig file to a fixed total signal and write it as wiggle.

    Command-line entry point.  The total signal ("wigsum") is summed either
    over the merged exons of ``--refgene`` (when given) or over every
    chromosome in the ``--chromSize`` file, chunk by chunk.  Each non-zero
    value is then multiplied by ``--wigsum / total`` and written to the
    ``--output`` file in variableStep wiggle format with 1-based positions.

    Exits with status 0 after printing help when a required option is
    missing, and with status 1 when the total signal is zero.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)

    parser.add_option(
        "-i", "--bwfile", action="store", type="string", dest="BigWig_File", help="Input BigWig file. [required]"
    )
    parser.add_option(
        "-o", "--output", action="store", type="string", dest="output_wig", help="Output wig file. [required]"
    )
    parser.add_option(
        "-s",
        "--chromSize",
        action="store",
        type="string",
        dest="chromSize",
        help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]",
    )
    parser.add_option(
        "-t",
        "--wigsum",
        action="store",
        type="int",
        dest="total_wigsum",
        default=100000000,
        help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default  [optional]",
    )
    parser.add_option(
        "-r",
        "--refgene",
        action="store",
        type="string",
        dest="refgene_bed",
        help="Reference gene model in bed format. [optional]",
    )
    parser.add_option(
        "-c",
        "--chunk",
        action="store",
        type="int",
        dest="chunk_size",
        default=100000,
        help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]",
    )
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    OUT = open(options.output_wig, "w")
    bw = BigWigFile(file=open(options.BigWig_File))
    chrom_sizes = load_chromsize(options.chromSize)
    WIG_SUM = 0.0
    if options.refgene_bed:
        print >>sys.stderr, "Extract exons from " + options.refgene_bed
        obj = BED.ParseBED(options.refgene_bed)
        exons = obj.getExon()
        print >>sys.stderr, "Merge overlapping exons ..."
        exons = BED.unionBed3(exons)
        print >>sys.stderr, "Calculate wigsum covered by " + options.refgene_bed + " only"
        for chrom, st, end in exons:
            # Probe one base first: a failure here means the chromosome cannot
            # be read from the BigWig, so the exon is skipped.
            try:
                bw.get_as_array(chrom, 0, 1).size
            except Exception:
                continue

            bw_signal = bw.get_as_array(chrom, st, end)
            tmp = numpy.nansum(
                bw_signal
            )  # nan will be ignored. but if all items are 'nan', the result summay is 'nan' NOT 0
            if numpy.isnan(tmp):
                continue
            WIG_SUM += tmp
        print >>sys.stderr, "Total wigsum is %.2f\n" % WIG_SUM
    else:
        print >>sys.stderr, "Calculate wigsum from " + options.BigWig_File
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom

            # Same one-base probe as above; unreadable chromosomes are skipped.
            try:
                bw.get_as_array(chr_name, 0, 1).size
            except Exception:
                print >>sys.stderr, "Skip " + chr_name + "!"
                continue

            print >>sys.stderr, "Processing " + chr_name + " ..."
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                bw_signal = bw.get_as_array(interval[0], interval[1], interval[2])
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp):
                    continue
                WIG_SUM += tmp
        print >>sys.stderr, "\nTotal wigsum is %.2f\n" % WIG_SUM

    # Check for zero explicitly instead of relying on ZeroDivisionError: once
    # NumPy sums have been added, WIG_SUM may be a NumPy float, and dividing by
    # a NumPy zero yields inf (with a warning) rather than raising.
    if WIG_SUM == 0:
        print >>sys.stderr, "Error, WIG_SUM cannot be 0"
        OUT.close()
        sys.exit(1)
    weight = options.total_wigsum / WIG_SUM

    # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    print >>sys.stderr, "Normalizing bigwig file, output wiggle file"
    for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom

        try:
            bw.get_as_array(chr_name, 0, 1).size
        except Exception:
            print >>sys.stderr, "Skip " + chr_name + "!"
            continue

        print >>sys.stderr, "Writing " + chr_name + " ..."
        OUT.write("variableStep chrom=" + chr_name + "\n")
        for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
            # coord is advanced before each write, so written positions are
            # interval start + 1, +2, ... (1-based).
            coord = interval[1]
            bw_signal = bw.get_as_array(chr_name, interval[1], interval[2])
            tmp = numpy.nansum(bw_signal)
            if numpy.isnan(tmp):
                continue
            bw_signal = numpy.nan_to_num(bw_signal)
            for v in bw_signal:
                coord += 1
                if v != 0:
                    print >> OUT, "%d\t%.4f" % (coord, v * weight)

    # Flush and release the output file explicitly.
    OUT.close()