Example #1
0
def union_exons(refbed):
    '''
    Merge every exon described in the refbed file into its union and
    return the bitsets that build_bitsets creates from the merged list.
    '''
    from qcmodule import BED
    gene_model = BED.ParseBED(refbed)
    merged = BED.unionBed3(gene_model.getExon())
    return build_bitsets(merged)
Example #2
0
def main():
	'''
	Command-line entry point: combine two BigWig files element by element.

	Every chromosome listed in the chromosome size file is cut into chunks of
	--chunk bases with BED.tillingBed. For each chunk the signals of both
	files are fetched, handed to the twoList function named by --action, and
	every non-zero result is written to the output file below a
	"variableStep" header line.

	Prints the help text and exits when a required option (including
	--action) is missing; leaves through parser.error() when --action does
	not name a callable of twoList.
	'''
	usage="%prog [options]"
	parser = OptionParser(usage,version="%prog " + __version__)
	
	parser.add_option("-i","--bwfile1",action="store",type="string",dest="BigWig_File1",help="BigWig files")
	parser.add_option("-j","--bwfile2",action="store",type="string",dest="BigWig_File2",help="BigWig files")
	parser.add_option("-a","--action",action="store",type="string",dest="action",help='After pairwise align two bigwig files, perform the follow actions (Only select one keyword):"Add" = add signals. "Average" = average signals. "Division"= divide bigwig2 from bigwig1. Add 1 to both bigwig. "Max" = pick the signal that is larger. "Min" = pick the signal that is smaller. "Product" = multiply signals. "Subtract" = subtract signals in 2nd bigwig file from the corresponiding ones in the 1st bigwig file. "geometricMean" = take the geometric mean of signals.')
	parser.add_option("-o","--output",action="store",type="string",dest="output_wig",help="Output wig file")
	parser.add_option("-s","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
	parser.add_option("-c","--chunk",action="store",type="int",dest="chunk_size",default=100000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
	(options,args)=parser.parse_args()
	
	# --action is required as well: without it no operation can be looked up.
	if not (options.BigWig_File1 and options.BigWig_File2 and options.output_wig and options.chromSize and options.action):
		parser.print_help()
		sys.exit(0)

	# Resolve the operation once, before the output file is created, so that a
	# misspelled keyword is reported cleanly instead of failing in mid-run.
	call_back = getattr(twoList,options.action,None)
	if not callable(call_back):
		parser.error("unknown action: " + options.action)

	# BigWig is a binary format, so the inputs are opened in binary mode.
	bw1 = BigWigFile( file=open(options.BigWig_File1,'rb') )
	bw2 = BigWigFile( file=open(options.BigWig_File2,'rb') )
	chrom_sizes = load_chromsize(options.chromSize)

	# The with-block guarantees the wig output is flushed and closed.
	with open(options.output_wig,'w') as OUT:
		for chr_name, chr_size in chrom_sizes.items():		#iterate each chrom
			print >>sys.stderr, "Processing " + chr_name + " ..."
			OUT.write('variableStep chrom='+chr_name+'\n')
			for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size):
				coord = interval[1]
				bw_signal1 = bw1.get_as_array(chr_name,interval[1],interval[2])
				bw_signal2 = bw2.get_as_array(chr_name,interval[1],interval[2])
				# nothing to combine when neither file has signal in this chunk
				if all_nan(bw_signal1) and all_nan(bw_signal2):
					continue
				bw_signal1 = replace_nan( bw_signal1 )
				bw_signal2 = replace_nan( bw_signal2 )

				for v in call_back(bw_signal1,bw_signal2):
					coord +=1	# coord is advanced before use, so the first value lands on interval start + 1
					if v != 0: print >>OUT, "%d\t%.2f" % (coord,v)
Example #3
0
def union_exons(refbed):
	'''
	Collapse all exons of the gene model in refbed into their union, pass
	the merged intervals to build_bitsets and return its result.
	'''
	from qcmodule import BED
	exon_list = BED.ParseBED(refbed).getExon()
	return build_bitsets(BED.unionBed3(exon_list))
Example #4
0
def main():
	'''
	Command-line entry point: combine two BigWig files element by element.

	Chromosome names and sizes are read from the headers of both files (the
	second file wins when both define the same name). Each chromosome is cut
	into chunks of --chunk bases with BED.tillingBed; for every chunk the
	signals of both files are fetched, handed to the twoList function named
	by --action, and every non-zero result is written to the output file
	below a "variableStep" header line.

	Prints the help text and exits when a required option (including
	--action) is missing; leaves through parser.error() when --action does
	not name a callable of twoList.
	'''
	usage="%prog [options]"
	parser = OptionParser(usage,version="%prog " + __version__)
	
	parser.add_option("-i","--bwfile1",action="store",type="string",dest="BigWig_File1",help="One BigWig file.")
	parser.add_option("-j","--bwfile2",action="store",type="string",dest="BigWig_File2",help="Another BigWig file. Both BigWig files should use the same reference genome.")
	parser.add_option("-a","--action",action="store",type="string",dest="action",help='After pairwise align two bigwig files, perform the follow actions (Only select one keyword):"Add" = add signals. "Average" = average signals. "Division"= divide bigwig2 from bigwig1. Add 1 to both bigwig. "Max" = pick the signal that is larger. "Min" = pick the signal that is smaller. "Product" = multiply signals. "Subtract" = subtract signals in 2nd bigwig file from the corresponiding ones in the 1st bigwig file. "geometricMean" = take the geometric mean of signals.')
	parser.add_option("-o","--output",action="store",type="string",dest="output_wig",help="Output wig file")
	parser.add_option("-c","--chunk",action="store",type="int",dest="chunk_size",default=100000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
	(options,args)=parser.parse_args()
	
	# --action is required as well: without it no operation can be looked up.
	if not (options.BigWig_File1 and options.BigWig_File2 and options.output_wig and options.action):
		parser.print_help()
		sys.exit(0)

	# Resolve the operation once, before the output file is created, so that a
	# misspelled keyword is reported cleanly instead of failing in mid-run.
	call_back = getattr(twoList,options.action,None)
	if not callable(call_back):
		parser.error("unknown action: " + options.action)

	def has_signal(bw,chrom,start,end):
		'''Return True when bw.stats() reports a value for the interval.'''
		# A failing query (e.g. the chromosome or range is not available in
		# this file) is treated as "no signal" rather than aborting the run.
		try:
			return bw.stats(chrom,start,end)[0] is not None
		except Exception:
			return False

	def fetch_signal(bw,chrom,start,end):
		'''Return the per-base values of the interval, or an empty array.'''
		try:
			signal = bw.values(chrom,start,end)
		except Exception:
			signal = None
		# numpy.array() needs an argument; an explicit empty list gives the
		# empty placeholder the length checks below rely on.
		return numpy.array([]) if signal is None else signal

	bw1   = pyBigWig.open(options.BigWig_File1)
	bw2   = pyBigWig.open(options.BigWig_File2)
	OUT=open(options.output_wig,'w')
	try:
		print("Get chromosome sizes from BigWig header ...", file=sys.stderr)
		chrom_sizes = dict(bw1.chroms())
		chrom_sizes.update(bw2.chroms())

		for chr_name, chr_size in chrom_sizes.items():		#iterate each chrom
			print("Processing " + chr_name + " ...", file=sys.stderr)
			OUT.write('variableStep chrom='+chr_name+'\n')
			for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size):
				start, end = interval[1], interval[2]
				# skip chunks for which neither file reports any signal
				if not (has_signal(bw1,chr_name,start,end) or has_signal(bw2,chr_name,start,end)):
					continue
				bw_signal1 = fetch_signal(bw1,chr_name,start,end)
				bw_signal2 = fetch_signal(bw2,chr_name,start,end)
				if len(bw_signal1) == 0 and len(bw_signal2) == 0:
					continue
				if numpy.isnan(numpy.nansum(bw_signal1)) and numpy.isnan(numpy.nansum(bw_signal2)):
					continue
				bw_signal1 = numpy.nan_to_num( bw_signal1 )
				bw_signal2 = numpy.nan_to_num( bw_signal2 )

				coord = start
				for v in call_back(bw_signal1,bw_signal2):
					coord +=1	# coord is advanced before use, so the first value lands on start + 1
					if v != 0 : print("%d\t%.2f" % (coord,v), file=OUT)
	finally:
		# release the output and both BigWig handles even when a chunk fails
		OUT.close()
		bw1.close()
		bw2.close()
Example #5
0
def main():
	'''
	Command-line entry point: write strand-specific coverage of a BAM file
	as two wig files, "<prefix>_Forward.wig" and "<prefix>_Reverse.wig".

	Every chromosome of the chromosome size file is cut into bins of --bin
	bases with BED.tillingBed. The alignments of each bin are passed to
	build_wig, and the positions it returns that fall inside the bin are
	written below a "variableStep" header line.

	Prints the help text and exits when a required option is missing, and
	prints a message and exits when the input, the chromosome size file or
	the ".bai" index file does not exist.
	'''
	usage="%prog [options]"
	parser = OptionParser(usage,version="%prog " + __version__)
	parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Input file in BAM format. BAM file must be sorted and indexed using samTools. HowTo: http://genome.ucsc.edu/goldenPath/help/bam.html")
	parser.add_option("-r","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
	parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output wig files(s). \"Prefix_Forward.wig\" and \"Prefix_Reverse.wig\" will be generated")
	parser.add_option("-b","--bin",action="store",type="int",dest="bin",default=100000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
	parser.add_option("-e","--extension",action="store",type="int",dest="extension",default=None,help="Extended coverage from 5' end of read. default=%default (full read coverage will be used)")

	(options,args)=parser.parse_args()

	if not (options.output_prefix and options.input_file and options.chromSize):
		parser.print_help()
		sys.exit(0)
	for required_path in (options.input_file,options.chromSize):
		if not os.path.exists(required_path):
			print >>sys.stderr, '\n\n' + required_path + " does NOT exists" + '\n'
			sys.exit(0)
	if not os.path.exists(options.input_file + '.bai'):
		# leading space added: the message used to run into the file name
		print >>sys.stderr, "index file " + options.input_file + '.bai' + " does not exists"
		sys.exit(0)


	chrom_sizes = load_chromsize(options.chromSize)
	samfile = SAM.ParseBAM(options.input_file)

	# The with-block guarantees both wig files are flushed and closed.
	with open(options.output_prefix + "_Forward.wig",'w') as FWOUT, open(options.output_prefix + "_Reverse.wig",'w') as RWOUT:
		for chr_name, chr_size in chrom_sizes.items():		#iterate each chrom
			# Probe the whole chromosome first; a failing fetch is reported
			# as "no alignments" and the chromosome is skipped.
			try:
				samfile.fetchAlignments(chr_name,0,chr_size)
			except Exception:
				print >>sys.stderr, "No alignments for " + chr_name + '. skipped'
				continue
			print >>sys.stderr, "Processing " + chr_name + " ..."
			FWOUT.write('variableStep chrom='+chr_name+'\n')
			RWOUT.write('variableStep chrom='+chr_name+'\n')
			for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.bin):	#cut chrom into bins, interval such as ('chr1', 235000000, 236000000)
				alignedReads = samfile.fetchAlignments(interval[0],interval[1],interval[2])
				(Fwig,Rwig) = build_wig(alignedReads,options.extension)

				# forward strand first, then reverse, exactly one file each
				for wig, handle in ((Fwig,FWOUT),(Rwig,RWOUT)):
					if len(wig) == 0:
						continue
					# only positions interval start + 1 .. interval end are reported for this bin
					for i in xrange(interval[1]+1,interval[2]+1):
						if i in wig:
							handle.write("%d\t%d\n" % (i, wig[i]))
Example #6
0
def main():
	'''
	Command-line entry point: combine two BigWig files element by element.

	Every chromosome listed in the chromosome size file is cut into chunks of
	--chunk bases with BED.tillingBed. For each chunk the signals of both
	files are fetched, handed to the twoList function named by --action, and
	every result that is not smaller than --min_signal is written to the
	output file below a "variableStep" header line.

	Prints the help text and exits when a required option (including
	--action) is missing; leaves through parser.error() when --action does
	not name a callable of twoList.
	'''
	usage="%prog [options]"
	parser = OptionParser(usage,version="%prog " + __version__)
	
	parser.add_option("-i","--bwfile1",action="store",type="string",dest="BigWig_File1",help="One BigWig file")
	parser.add_option("-j","--bwfile2",action="store",type="string",dest="BigWig_File2",help="Another BigWig file")
	parser.add_option("-a","--action",action="store",type="string",dest="action",help='After pairwise align two bigwig files, perform the follow actions (Only select one keyword):"Add" = add signals. "Average" = average signals. "Division"= divide bigwig2 from bigwig1. Add 1 to both bigwig. "Max" = pick the signal that is larger. "Min" = pick the signal that is smaller. "Product" = multiply signals. "Subtract" = subtract signals in 2nd bigwig file from the corresponiding ones in the 1st bigwig file. "geometricMean" = take the geometric mean of signals.')
	parser.add_option("-o","--output",action="store",type="string",dest="output_wig",help="Output wig file")
	parser.add_option("-s","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
	parser.add_option("-c","--chunk",action="store",type="int",dest="chunk_size",default=100000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
	parser.add_option("-m","--min_signal",action="store",type="float",dest="min_score",default=0.0,help="To redude the size of output wigfile, genomic positions with signal value smaller than (<) this threshold will be filtered out. default=%default")
	(options,args)=parser.parse_args()
	
	# --action is required as well: without it no operation can be looked up.
	if not (options.BigWig_File1 and options.BigWig_File2 and options.output_wig and options.chromSize and options.action):
		parser.print_help()
		sys.exit(0)

	# Resolve the operation once, before the output file is created, so that a
	# misspelled keyword is reported cleanly instead of failing in mid-run.
	call_back = getattr(twoList,options.action,None)
	if not callable(call_back):
		parser.error("unknown action: " + options.action)

	def fetch_signal(bw,chrom,start,end):
		'''Return the signal array of the interval, or an empty array.'''
		try:
			signal = bw.get_as_array(chrom,start,end)
		except Exception:
			signal = None
		# numpy.array() needs an argument; an explicit empty list gives the
		# empty placeholder the length check below relies on. A None result
		# is mapped to the same placeholder so later numpy calls never see None.
		if signal is None:
			return numpy.array([])
		return signal

	# BigWig is a binary format, so the inputs are opened in binary mode.
	bw1 = BigWigFile( file=open(options.BigWig_File1,'rb') )
	bw2 = BigWigFile( file=open(options.BigWig_File2,'rb') )
	chrom_sizes = load_chromsize(options.chromSize)

	# The with-block guarantees the wig output is flushed and closed.
	with open(options.output_wig,'w') as OUT:
		for chr_name, chr_size in chrom_sizes.items():		#iterate each chrom
			print >>sys.stderr, "Processing " + chr_name + " ..."
			OUT.write('variableStep chrom='+chr_name+'\n')
			for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size):
				coord = interval[1]
				bw_signal1 = fetch_signal(bw1,chr_name,interval[1],interval[2])
				bw_signal2 = fetch_signal(bw2,chr_name,interval[1],interval[2])
				# nothing to combine when neither file delivered data
				if len(bw_signal1) == 0 and len(bw_signal2) == 0:
					continue
				if numpy.isnan(numpy.nansum(bw_signal1)) and numpy.isnan(numpy.nansum(bw_signal2)):
					continue
				bw_signal1 = numpy.nan_to_num( bw_signal1 )
				bw_signal2 = numpy.nan_to_num( bw_signal2 )

				for v in call_back(bw_signal1,bw_signal2):
					coord +=1	# coord is advanced before use, so the first value lands on interval start + 1
					if v >= options.min_score: print >>OUT, "%d\t%.2f" % (coord,v)
def process_gene_model(gene_model):
	'''
	Split the gene model (BED file) into CDS exons, 5'/3' UTRs, introns and
	the 1kb/5kb/10kb regions returned by getIntergenic for both directions.

	Every category is merged with BED.unionBed3 and then reduced with
	BED.subtractBed3 so that it no longer overlaps the categories subtracted
	from it. Returns a 20-item tuple: the build_bitsets results for
	(cds exon, intron, 5'UTR, 3'UTR, up 1/5/10kb, down 1/5/10kb) followed by
	the cal_size results in the same order.
	'''
	print >>sys.stderr, "processing " + gene_model + ' ...',
	obj = BED.ParseBED(gene_model)
	utr_3 = obj.getUTR(utr=3)
	utr_5 = obj.getUTR(utr=5)
	cds_exon = obj.getCDSExon()
	intron = obj.getIntron()

	#merge overlapping records within each category
	intron = BED.unionBed3(intron)
	cds_exon = BED.unionBed3(cds_exon)
	utr_5 = BED.unionBed3(utr_5)
	utr_3 = BED.unionBed3(utr_3)

	#CDS exons are removed from the UTRs; CDS exons and UTRs from the introns
	utr_5 = BED.subtractBed3(utr_5,cds_exon)
	utr_3 = BED.subtractBed3(utr_3,cds_exon)
	for genic in (cds_exon,utr_5,utr_3):
		intron = BED.subtractBed3(intron,genic)

	#flanking regions, keyed by (direction, size)
	flank_sizes = (1000,5000,10000)
	directions = ("up","down")
	flanks = {}
	for size in flank_sizes:
		for direction in directions:
			flanks[direction,size] = obj.getIntergenic(direction=direction,size=size)

	#merge intergenic region: all "up" sizes first, then all "down" sizes
	flank_order = [(direction,size) for direction in directions for size in flank_sizes]
	for key in flank_order:
		flanks[key] = BED.unionBed3(flanks[key])

	#purify intergenic region: drop everything already assigned to a gene part
	for size in flank_sizes:
		for direction in directions:
			for genic in (cds_exon,utr_5,utr_3,intron):
				flanks[direction,size] = BED.subtractBed3(flanks[direction,size],genic)

	#build the bitsets of every category
	cds_exon_ranges = build_bitsets(cds_exon)
	utr_5_ranges = build_bitsets(utr_5)
	utr_3_ranges = build_bitsets(utr_3)
	intron_ranges = build_bitsets(intron)
	flank_ranges = [build_bitsets(flanks[key]) for key in flank_order]

	#size of every category
	exon_size = cal_size(cds_exon)
	intron_size = cal_size(intron)
	utr3_size = cal_size(utr_3)
	utr5_size = cal_size(utr_5)
	flank_bp = [cal_size(flanks[key]) for key in flank_order]

	print >>sys.stderr, "Done"
	gene_ranges = (cds_exon_ranges,intron_ranges,utr_5_ranges,utr_3_ranges)
	gene_bp = (exon_size,intron_size,utr5_size,utr3_size)
	return gene_ranges + tuple(flank_ranges) + gene_bp + tuple(flank_bp)
Example #8
0
def process_gene_model(gene_model):
	'''
	Parse the gene model (BED file) and derive non-overlapping interval sets
	for CDS exons, introns, 5' and 3' UTRs and the 1kb, 5kb and 10kb regions
	that getIntergenic returns for the "up" and "down" directions.

	The result is one flat 20-item tuple. Items 1-10 are build_bitsets
	results in the order cds exon, intron, 5'UTR, 3'UTR, up 1kb/5kb/10kb,
	down 1kb/5kb/10kb; items 11-20 are the cal_size results in that order.
	'''
	print >>sys.stderr, "processing " + gene_model + ' ...',
	obj = BED.ParseBED(gene_model)
	utr_3 = obj.getUTR(utr=3)
	utr_5 = obj.getUTR(utr=5)
	cds_exon = obj.getCDSExon()
	intron = obj.getIntron()

	#collapse overlapping records of each gene part
	intron = BED.unionBed3(intron)
	cds_exon = BED.unionBed3(cds_exon)
	utr_5 = BED.unionBed3(utr_5)
	utr_3 = BED.unionBed3(utr_3)

	#UTRs lose their CDS overlap, introns lose CDS and UTR overlap
	utr_5 = BED.subtractBed3(utr_5,cds_exon)
	utr_3 = BED.subtractBed3(utr_3,cds_exon)
	intron = BED.subtractBed3(BED.subtractBed3(BED.subtractBed3(intron,cds_exon),utr_5),utr_3)

	#flanking regions per width, one dictionary per direction
	widths = [1000,5000,10000]
	upstream = {}
	downstream = {}
	for width in widths:
		upstream[width] = obj.getIntergenic(direction="up",size=width)
		downstream[width] = obj.getIntergenic(direction="down",size=width)

	#merge integenic region
	for side in (upstream,downstream):
		for width in widths:
			side[width] = BED.unionBed3(side[width])

	#purify intergenic region
	for width in widths:
		for side in (upstream,downstream):
			remaining = side[width]
			for occupied in (cds_exon,utr_5,utr_3,intron):
				remaining = BED.subtractBed3(remaining,occupied)
			side[width] = remaining

	#bitsets: gene parts first, then upstream and downstream flanks
	result = []
	cds_exon_ranges = build_bitsets(cds_exon)
	utr_5_ranges = build_bitsets(utr_5)
	utr_3_ranges = build_bitsets(utr_3)
	intron_ranges = build_bitsets(intron)
	result.extend([cds_exon_ranges,intron_ranges,utr_5_ranges,utr_3_ranges])
	for side in (upstream,downstream):
		for width in widths:
			result.append(build_bitsets(side[width]))

	#sizes, reported in the same category order as the bitsets
	exon_size = cal_size(cds_exon)
	intron_size = cal_size(intron)
	utr3_size = cal_size(utr_3)
	utr5_size = cal_size(utr_5)
	result.extend([exon_size,intron_size,utr5_size,utr3_size])
	for side in (upstream,downstream):
		for width in widths:
			result.append(cal_size(side[width]))

	print >>sys.stderr, "Done"
	return tuple(result)
def _bigwig_has_chrom(bw, chrom):
    """Return True when a one-base probe of chrom in bw succeeds."""
    # Any failure of the probe is taken to mean that the chromosome cannot
    # be read from the BigWig file.
    try:
        bw.get_as_array(chrom, 0, 1).size
    except Exception:
        return False
    return True


def _calc_wigsum(bw, chrom_sizes, options):
    """Sum the signal of bw over the merged exons of --refgene, or over all
    chromosomes of chrom_sizes when no gene model was given."""
    wigsum = 0.0
    if options.refgene_bed:
        print >> sys.stderr, "Extract exons from " + options.refgene_bed
        obj = BED.ParseBED(options.refgene_bed)
        exons = obj.getExon()
        print >> sys.stderr, "Merge overlapping exons ..."
        exons = BED.unionBed3(exons)
        print >> sys.stderr, "Calculate wigsum covered by " + options.refgene_bed + ' only'
        for chrom, st, end in exons:
            if not _bigwig_has_chrom(bw, chrom):
                continue
            # nansum ignores nan; the isnan guard covers an all-nan region
            # for which the sum may itself be reported as nan
            tmp = numpy.nansum(bw.get_as_array(chrom, st, end))
            if numpy.isnan(tmp):
                continue
            wigsum += tmp
        print >> sys.stderr, "Total wigsum is %.2f\n" % wigsum
    else:
        print >> sys.stderr, "Calculate wigsum from " + options.BigWig_File
        for chr_name, chr_size in chrom_sizes.items():  #iterate each chrom
            if not _bigwig_has_chrom(bw, chr_name):
                print >> sys.stderr, "Skip " + chr_name + "!"
                continue

            print >> sys.stderr, "Processing " + chr_name + " ..."
            for interval in BED.tillingBed(chrName=chr_name,
                                           chrSize=chr_size,
                                           stepSize=options.chunk_size):
                tmp = numpy.nansum(
                    bw.get_as_array(interval[0], interval[1], interval[2]))
                if numpy.isnan(tmp):
                    continue
                wigsum += tmp
        print >> sys.stderr, "\nTotal wigsum is %.2f\n" % wigsum
    return wigsum


def _write_normalized(bw, chrom_sizes, weight, out_format, chunk_size, OUT):
    """Write the signal of every readable chromosome, multiplied by weight,
    to OUT as variableStep wig ("WIG") or as bedGraph lines ("BGR")."""
    for chr_name, chr_size in chrom_sizes.items():  #iterate each chrom
        if not _bigwig_has_chrom(bw, chr_name):
            print >> sys.stderr, "Skip " + chr_name + "!"
            continue

        print >> sys.stderr, "Writing " + chr_name + " ..."
        if out_format == "WIG":
            OUT.write('variableStep chrom=' + chr_name + '\n')
        for interval in BED.tillingBed(chrName=chr_name,
                                       chrSize=chr_size,
                                       stepSize=chunk_size):
            bw_signal = bw.get_as_array(chr_name, interval[1], interval[2])
            if numpy.isnan(numpy.nansum(bw_signal)):
                continue
            bw_signal = numpy.nan_to_num(bw_signal) * weight
            if out_format == "WIG":
                coord = interval[1]
                for v in bw_signal:
                    coord += 1  # advanced before use: first value is at start + 1
                    if v != 0: print >> OUT, "%d\t%.2f" % (coord, v)
            else:
                # bedGraph: every run of consecutive equal non-zero values
                # becomes one "chrom start end value" line
                start = interval[1]
                for v, run in groupby(bw_signal):
                    run_length = len(list(run))
                    if v != 0:
                        print >> OUT, chr_name + '\t' + str(start) + '\t' + str(
                            start + run_length) + '\t' + str(v)
                    start += run_length


def main():
    """Command-line entry point: rescale a BigWig file to a fixed wigsum.

    The total signal ("wigsum") is computed either over the merged exons of
    the --refgene gene model or over every chromosome of the chromosome size
    file. Every value is then multiplied by --wigsum / total and written to
    the output file in the format chosen with --format ("wig" or "bgr").

    Prints the help text and exits when a required option is missing; exits
    with status 1 when --format is unknown or when the total signal is 0.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)

    parser.add_option("-i",
                      "--bwfile",
                      action="store",
                      type="string",
                      dest="BigWig_File",
                      help="Input BigWig file. [required]")
    parser.add_option("-o",
                      "--output",
                      action="store",
                      type="string",
                      dest="output_wig",
                      help="Output wig file. [required]")
    parser.add_option(
        "-s",
        "--chromSize",
        action="store",
        type="string",
        dest="chromSize",
        help=
        "Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]"
    )
    parser.add_option(
        "-t",
        "--wigsum",
        action="store",
        type="int",
        dest="total_wigsum",
        default=100000000,
        help=
        "Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default  [optional]"
    )
    parser.add_option("-r",
                      "--refgene",
                      action="store",
                      type="string",
                      dest="refgene_bed",
                      help="Reference gene model in bed format. [optional]")
    parser.add_option(
        "-c",
        "--chunk",
        action="store",
        type="int",
        dest="chunk_size",
        default=500000,
        help=
        "Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]"
    )
    parser.add_option(
        "-f",
        "--format",
        action="store",
        type="string",
        dest="out_format",
        default="bgr",
        help=
        "Output format. either \"wig\" or \"bgr\". \"bgr\" save disk space but make program slower. default=%default"
    )
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    # Reject an unknown format before the output file is created and before
    # the (potentially long) wigsum pass is started.
    out_format = options.out_format.upper()
    if out_format not in ("WIG", "BGR"):
        print >> sys.stderr, "unknown output format"
        sys.exit(1)

    # The with-block guarantees the output is flushed and closed, also when
    # the run ends through sys.exit().
    with open(options.output_wig, 'w') as OUT:
        # BigWig is a binary format, so the input is opened in binary mode.
        bw = BigWigFile(file=open(options.BigWig_File, 'rb'))
        chrom_sizes = load_chromsize(options.chromSize)

        WIG_SUM = _calc_wigsum(bw, chrom_sizes, options)
        # Explicit test instead of catching ZeroDivisionError: a numpy float
        # zero would not raise and would silently give an infinite weight.
        if WIG_SUM == 0:
            print >> sys.stderr, "Error, WIG_SUM cannot be 0"
            sys.exit(1)
        weight = options.total_wigsum / WIG_SUM

        print >> sys.stderr, "Normalizing bigwig file ..."
        _write_normalized(bw, chrom_sizes, weight, out_format,
                          options.chunk_size, OUT)
def main():
	'''
	Command-line entry point: split an alignment file by a gene list.

	Each record of the input is written to exactly one of three BAM files:
	"<prefix>.junk.bam" for qc-failed or unmapped reads, "<prefix>.in.bam"
	when the read start (or, for reads whose mate is mapped, the mate start)
	hits an exon of the gene list, and "<prefix>.ex.bam" otherwise. The
	number of records per file is printed to standard output at the end.

	Prints the help text and exits when a required option (including the
	output prefix) is missing, and prints a message and exits when the gene
	list or the input file does not exist.
	'''
	usage="%prog [options]" + '\n' + __doc__ + "\n"
	parser = OptionParser(usage,version="%prog " + __version__)
	parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM or SAM format. BAM file should be sorted and indexed.")
	parser.add_option("-r","--genelist",action="store",type="string",dest="gene_list",help="Gene list in bed foramt. All reads hits to exon regions (defined by this gene list) will be saved into one BAM file, the remaining reads will saved into another BAM file.")
	parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output BAM files. \"prefix.in.bam\" file contains reads mapped to the gene list specified by \"-r\", \"prefix.ex.bam\" contains reads that cannot mapped to gene list. \"prefix.junk.bam\" contains qcfailed or unmapped reads.")
	(options,args)=parser.parse_args()

	# The output prefix is required too: all three output names are built from it.
	if not (options.input_file and options.gene_list and options.output_prefix):
		parser.print_help()
		sys.exit(0)
	for required_path in (options.gene_list,options.input_file):
		if not os.path.exists(required_path):
			print >>sys.stderr, '\n\n' + required_path + " does NOT exists" + '\n'
			sys.exit(0)

	#build bitset for gene list
	print >>sys.stderr, 'reading ' + options.gene_list + ' ... ',
	obj = BED.ParseBED(options.gene_list)
	exons = obj.getExon()
	exon_ranges = build_bitsets(exons)
	print >>sys.stderr, 'Done'

	samfile = pysam.Samfile(options.input_file,'rb')
	out1 = pysam.Samfile(options.output_prefix + '.in.bam','wb',template=samfile)	#bam file containing reads hit to exon region
	out2 = pysam.Samfile(options.output_prefix + '.ex.bam','wb',template=samfile)	#bam file containing reads not hit to exon region
	out3 = pysam.Samfile(options.output_prefix + '.junk.bam','wb',template=samfile)	#bam file containing qcfailed or unmapped reads

	total_alignment = 0
	in_alignment = 0
	ex_alignment = 0
	bad_alignment = 0
	print >>sys.stderr, "spliting " + options.input_file + " ...",
	try:
		for aligned_read in samfile:
			total_alignment += 1

			if aligned_read.is_qcfail or aligned_read.is_unmapped:
				bad_alignment +=1
				out3.write(aligned_read)
				continue

			# chromosome names are compared in upper case
			chrom = samfile.getrname(aligned_read.tid).upper()
			read_start = aligned_read.pos
			mate_start = aligned_read.mpos

			hits_exon = False
			if chrom in exon_ranges:
				ranges = exon_ranges[chrom]
				hits_exon = len(ranges.find(read_start, read_start +1)) >= 1
				# the mate position only counts when the mate is mapped
				if not hits_exon and not aligned_read.mate_is_unmapped:
					hits_exon = len(ranges.find(mate_start, mate_start +1)) >= 1

			if hits_exon:
				out1.write(aligned_read)
				in_alignment += 1
			else:
				out2.write(aligned_read)
				ex_alignment += 1
		print >>sys.stderr, "Done"
	finally:
		# close every handle so the written BAM files are complete
		for handle in (out1,out2,out3,samfile):
			handle.close()

	print >>sys.stdout, "%-55s%d" % ("Total records:",total_alignment)
	print >>sys.stdout, "%-55s%d" % (options.output_prefix + '.in.bam (Reads consumed by input gene list):',in_alignment)
	print >>sys.stdout, "%-55s%d" % (options.output_prefix + '.ex.bam (Reads not consumed by input gene list):',ex_alignment)
	print >>sys.stdout, "%-55s%d" % (options.output_prefix + '.junk.bam (qcfailed, unmapped reads):',bad_alignment)
Example #11
0
def main():
    """Rescale a BigWig file to a target total signal and write a wiggle file.

    Parses the command line, sums the signal ("wigsum") either over the merged
    exons of the optional reference gene model or over every chromosome listed
    in the chromosome size file, derives the weight ``total_wigsum / wigsum``
    and writes every non-zero position, multiplied by that weight, in
    variableStep wiggle format.

    Exits with status 0 after printing the help text when a required option
    is missing, and with status 1 when the measured wigsum is 0.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--bwfile", action="store", type="string",
                      dest="BigWig_File",
                      help="Input BigWig file. [required]")
    parser.add_option("-o", "--output", action="store", type="string",
                      dest="output_wig",
                      help="Output wig file. [required]")
    parser.add_option("-s", "--chromSize", action="store", type="string",
                      dest="chromSize",
                      help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]")
    parser.add_option("-t", "--wigsum", action="store", type="int",
                      dest="total_wigsum", default=100000000,
                      help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default  [optional]")
    parser.add_option("-r", "--refgene", action="store", type="string",
                      dest="refgene_bed",
                      help="Reference gene model in bed format. [optional]")
    parser.add_option("-c", "--chunk", action="store", type="int",
                      dest="chunk_size", default=100000,
                      help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    OUT = open(options.output_wig, 'w')
    bw = BigWigFile(file=open(options.BigWig_File))
    chrom_sizes = load_chromsize(options.chromSize)
    WIG_SUM = 0.0
    if options.refgene_bed:
        print >> sys.stderr, "Extract exons from " + options.refgene_bed
        obj = BED.ParseBED(options.refgene_bed)
        exons = obj.getExon()
        print >> sys.stderr, "Merge overlapping exons ..."
        exons = BED.unionBed3(exons)
        print >> sys.stderr, "Calculate wigsum covered by " + options.refgene_bed + ' only'
        for chrom, st, end in exons:
            # Probe one base first: if the BigWig cannot serve this
            # chromosome the probe fails and the exon is silently ignored.
            try:
                bw.get_as_array(chrom, 0, 1).size
            except Exception:
                continue

            bw_signal = bw.get_as_array(chrom, st, end)
            # nansum ignores nan items, but the result is still checked
            # because an all-nan region may sum to nan rather than 0.
            tmp = numpy.nansum(bw_signal)
            if numpy.isnan(tmp):
                continue
            WIG_SUM += tmp
        print >> sys.stderr, "Total wigsum is %.2f\n" % WIG_SUM
    else:
        print >> sys.stderr, "Calculate wigsum from " + options.BigWig_File
        for chr_name, chr_size in chrom_sizes.items():
            try:
                bw.get_as_array(chr_name, 0, 1).size
            except Exception:
                print >> sys.stderr, "Skip " + chr_name + "!"
                continue

            print >> sys.stderr, "Processing " + chr_name + " ..."
            for interval in BED.tillingBed(chrName=chr_name,
                                           chrSize=chr_size,
                                           stepSize=options.chunk_size):
                bw_signal = bw.get_as_array(interval[0], interval[1],
                                            interval[2])
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp):
                    continue
                WIG_SUM += tmp
        print >> sys.stderr, "\nTotal wigsum is %.2f\n" % WIG_SUM

    # Test for zero explicitly: once a numpy value has been added, dividing
    # by a zero WIG_SUM yields inf instead of raising ZeroDivisionError, so
    # an exception handler around the division cannot be relied upon.
    if WIG_SUM == 0:
        print >> sys.stderr, "Error, WIG_SUM cannot be 0"
        OUT.close()
        sys.exit(1)
    weight = options.total_wigsum / WIG_SUM

    #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    print >> sys.stderr, "Normalizing bigwig file, output wiggle file"
    for chr_name, chr_size in chrom_sizes.items():
        try:
            bw.get_as_array(chr_name, 0, 1).size
        except Exception:
            print >> sys.stderr, "Skip " + chr_name + "!"
            continue

        print >> sys.stderr, "Writing " + chr_name + " ..."
        OUT.write('variableStep chrom=' + chr_name + '\n')
        for interval in BED.tillingBed(chrName=chr_name,
                                       chrSize=chr_size,
                                       stepSize=options.chunk_size):
            # coord is incremented before it is written, so the first
            # position emitted for a chunk is interval[1] + 1.
            coord = interval[1]
            bw_signal = bw.get_as_array(chr_name, interval[1], interval[2])
            tmp = numpy.nansum(bw_signal)
            if numpy.isnan(tmp):
                continue
            bw_signal = numpy.nan_to_num(bw_signal)
            for v in bw_signal:
                coord += 1
                if v != 0:
                    print >> OUT, "%d\t%.4f" % (coord, v * weight)
    OUT.close()
def main():
	'''
	Rescale a BigWig file to a target total signal and write it as wiggle or bedGraph.

	Parses the command line, sums the signal ("wigsum") either over the merged
	exons of the optional reference gene model or over every chromosome listed
	in the chromosome size file, derives the weight total_wigsum/wigsum and
	writes the rescaled non-zero signal in the format selected with -f.

	Exits with status 0 after printing the help text when a required option is
	missing, and with status 1 when the wigsum is 0 or the output format is
	not recognised.
	'''
	usage="%prog [options]"
	parser = OptionParser(usage,version="%prog " + __version__)

	parser.add_option("-i","--bwfile",action="store",type="string",dest="BigWig_File",help="Input BigWig file. [required]")
	parser.add_option("-o","--output",action="store",type="string",dest="output_wig",help="Output wig file. [required]")
	parser.add_option("-s","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]")
	parser.add_option("-t","--wigsum",action="store",type="int",dest="total_wigsum",default=100000000,help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default  [optional]")
	parser.add_option("-r","--refgene",action="store",type="string",dest="refgene_bed",help="Reference gene model in bed format. [optional]")
	parser.add_option("-c","--chunk",action="store",type="int",dest="chunk_size",default=500000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
	parser.add_option("-f","--format",action="store",type="string",dest="out_format",default="bgr",help="Output format. either \"wig\" or \"bgr\". \"bgr\" save disk space but make program slower. default=%default")
	(options,args)=parser.parse_args()

	if not (options.BigWig_File and options.output_wig and options.chromSize):
		parser.print_help()
		sys.exit(0)

	OUT=open(options.output_wig,'w')
	bw = BigWigFile( file=open(options.BigWig_File) )
	chrom_sizes = load_chromsize(options.chromSize)
	WIG_SUM=0.0
	if options.refgene_bed:
		print >>sys.stderr, "Extract exons from " + options.refgene_bed
		obj = BED.ParseBED(options.refgene_bed)
		exons = obj.getExon()
		print >>sys.stderr, "Merge overlapping exons ..."
		exons = BED.unionBed3(exons)
		print >>sys.stderr, "Calculate wigsum covered by " + options.refgene_bed + ' only'
		for chrom,st,end in exons:
			# Probe one base first: if the BigWig cannot serve this
			# chromosome the probe fails and the exon is silently ignored.
			try:
				bw.get_as_array(chrom,0,1).size
			except Exception:
				continue

			bw_signal = bw.get_as_array(chrom,st,end)
			# nansum ignores nan items, but the result is still checked
			# because an all-nan region may sum to nan rather than 0.
			tmp = numpy.nansum(bw_signal)
			if numpy.isnan(tmp):
				continue
			WIG_SUM += tmp
		print >>sys.stderr, "Total wigsum is %.2f\n" % WIG_SUM
	else:
		print >>sys.stderr, "Calculate wigsum from " + options.BigWig_File
		for chr_name, chr_size in chrom_sizes.items():
			try:
				bw.get_as_array(chr_name,0,1).size
			except Exception:
				print >>sys.stderr, "Skip " + chr_name + "!"
				continue

			print >>sys.stderr, "Processing " + chr_name + " ..."
			for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size):
				bw_signal = bw.get_as_array(interval[0],interval[1],interval[2])
				tmp = numpy.nansum(bw_signal)
				if numpy.isnan(tmp):
					continue
				WIG_SUM += tmp
		print >>sys.stderr, "\nTotal wigsum is %.2f\n" % WIG_SUM

	# Test for zero explicitly: once a numpy value has been added, dividing
	# by a zero WIG_SUM yields inf instead of raising ZeroDivisionError, so
	# an exception handler around the division cannot be relied upon.
	if WIG_SUM == 0:
		print >>sys.stderr, "Error, WIG_SUM cannot be 0"
		OUT.close()
		sys.exit(1)
	weight = options.total_wigsum/WIG_SUM

	#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	print >>sys.stderr, "Normalizing bigwig file ..."
	out_format = options.out_format.upper()
	for chr_name, chr_size in chrom_sizes.items():
		try:
			bw.get_as_array(chr_name,0,1).size
		except Exception:
			print >>sys.stderr, "Skip " + chr_name + "!"
			continue

		if out_format == "WIG":
			print >>sys.stderr, "Writing " + chr_name + " ..."
			OUT.write('variableStep chrom='+chr_name+'\n')
			for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size):
				# coord is incremented before it is written, so the first
				# position emitted for a chunk is interval[1] + 1.
				coord = interval[1]
				bw_signal = bw.get_as_array(chr_name,interval[1],interval[2])
				tmp = numpy.nansum(bw_signal)
				if numpy.isnan(tmp):
					continue
				bw_signal = numpy.nan_to_num(bw_signal) * weight
				for v in bw_signal:
					coord +=1
					if v != 0:
						print >>OUT, "%d\t%.2f" % (coord,v)
		elif out_format == "BGR":
			print >>sys.stderr, "Writing " + chr_name + " ..."
			for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size):
				v2p = collections.defaultdict(list)	#signal value -> positions carrying that value
				range2p = {}	#run start (first position - 1) -> [run length, value]
				coord = interval[1]
				bw_signal = bw.get_as_array(chr_name,interval[1],interval[2])
				tmp = numpy.nansum(bw_signal)
				if numpy.isnan(tmp):
					continue
				bw_signal = numpy.nan_to_num(bw_signal) * weight
				for v in bw_signal:
					coord +=1
					if v != 0:
						v2p[v].append(coord)
				for v in v2p:
					# Positions in the same consecutive run share a constant
					# (index - position) difference, so groupby on that
					# difference splits the list into runs.
					for k,g in groupby(enumerate(v2p[v]), lambda ix: ix[0]-ix[1]):
						run = [pos for (idx,pos) in g]
						range2p[run[0]-1] = [len(run),v]
				for start in sorted(range2p):
					print >>OUT, chr_name + '\t' + str(start) +'\t' + str(start + range2p[start][0]) + '\t' + str(range2p[start][1])
		else:
			print >>sys.stderr, "unknown output format"
			sys.exit(1)
	OUT.close()
Example #13
0
def main():
    """Rescale a BigWig file to a target total signal; write wiggle or bedGraph.

    Parses the command line, opens the BigWig with pyBigWig, takes the
    chromosome sizes from the BigWig header and sums the signal ("wigsum")
    either over the merged exons of the optional reference gene model or over
    every chromosome. The weight ``total_wigsum / wigsum`` is then applied to
    the signal and the non-zero result is written in the format chosen with
    ``-f``.

    Exits with status 0 after printing the help text when a required option
    is missing or when the input is not a BigWig file, and with status 1 when
    the wigsum is 0 or the output format is not recognised.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--bwfile", action="store", type="string",
                      dest="BigWig_File",
                      help="Input BigWig file. [required]")
    parser.add_option("-o", "--output", action="store", type="string",
                      dest="output_wig",
                      help="Output wig file. [required]")
    parser.add_option("-t", "--wigsum", action="store", type="int",
                      dest="total_wigsum", default=100000000,
                      help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default  [optional]")
    parser.add_option("-r", "--refgene", action="store", type="string",
                      dest="refgene_bed",
                      help="Reference gene model in bed format. [optional]")
    parser.add_option("-c", "--chunk", action="store", type="int",
                      dest="chunk_size", default=500000,
                      help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
    parser.add_option("-f", "--format", action="store", type="string",
                      dest="out_format", default="bgr",
                      help="Output format. either \"wig\" or \"bgr\". \"bgr\" save disk space but make program slower. default=%default")
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig):
        parser.print_help()
        sys.exit(0)

    OUT = open(options.output_wig, 'w')
    bw = pyBigWig.open(options.BigWig_File)

    if not bw.isBigWig():
        print("%s is not a bigwig file!" % options.BigWig_File,
              file=sys.stderr)
        sys.exit(0)

    print("Get chromosome sizes from BigWig header ...", file=sys.stderr)
    chrom_sizes = dict(bw.chroms())

    WIG_SUM = 0.0
    if options.refgene_bed:
        print("Extract exons from " + options.refgene_bed, file=sys.stderr)
        obj = BED.ParseBED(options.refgene_bed)
        exons = obj.getExon()
        print("Merge overlapping exons ...", file=sys.stderr)
        exons = BED.unionBed3(exons)
        print("Calculate wigsum covered by " + options.refgene_bed + ' only',
              file=sys.stderr)
        for chrom, st, end in exons:
            # stats() reporting None means there is nothing to add here.
            if bw.stats(chrom, st, end)[0] is None:
                continue
            bw_signal = bw.values(chrom, st, end)
            # nansum ignores nan items, but the result is still checked
            # because an all-nan region may sum to nan rather than 0.
            tmp = numpy.nansum(bw_signal)
            if numpy.isnan(tmp):
                continue
            WIG_SUM += tmp
        print("Total wigsum is %.2f\n" % WIG_SUM, file=sys.stderr)
    else:
        print("Calculate wigsum from " + options.BigWig_File, file=sys.stderr)
        for chr_name, chr_size in chrom_sizes.items():
            if bw.stats(chr_name, 0, chr_size)[0] is None:
                print("Skip " + chr_name + "!", file=sys.stderr)
                continue

            print("Processing " + chr_name + " ...", file=sys.stderr)
            for interval in BED.tillingBed(chrName=chr_name,
                                           chrSize=chr_size,
                                           stepSize=options.chunk_size):
                if bw.stats(interval[0], interval[1], interval[2])[0] is None:
                    continue
                bw_signal = bw.values(interval[0], interval[1], interval[2])
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp):
                    continue
                WIG_SUM += tmp
        print("\nTotal wigsum is %.2f\n" % WIG_SUM, file=sys.stderr)

    # Test for zero explicitly: once a numpy value has been added, dividing
    # by a zero WIG_SUM yields inf instead of raising ZeroDivisionError, so
    # an exception handler around the division cannot be relied upon.
    if WIG_SUM == 0:
        print("Error, WIG_SUM cannot be 0", file=sys.stderr)
        OUT.close()
        sys.exit(1)
    weight = options.total_wigsum / WIG_SUM

    #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    print("Normalizing bigwig file ...", file=sys.stderr)
    out_format = options.out_format.upper()
    for chr_name, chr_size in chrom_sizes.items():
        if bw.stats(chr_name, 0, chr_size)[0] is None:
            print("Skip " + chr_name + "!", file=sys.stderr)
            continue

        if out_format == "WIG":
            print("Writing " + chr_name + " ...", file=sys.stderr)
            OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name,
                                           chrSize=chr_size,
                                           stepSize=options.chunk_size):
                # coord is incremented before it is written, so the first
                # position emitted for a chunk is interval[1] + 1.
                coord = interval[1]
                bw_signal = bw.values(chr_name, interval[1], interval[2])
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp):
                    continue
                bw_signal = numpy.nan_to_num(bw_signal) * weight
                for v in bw_signal:
                    coord += 1
                    if v != 0:
                        print("%d\t%.2f" % (coord, v), file=OUT)
        elif out_format == "BGR":
            print("Writing " + chr_name + " ...", file=sys.stderr)
            for interval in BED.tillingBed(chrName=chr_name,
                                           chrSize=chr_size,
                                           stepSize=options.chunk_size):
                # signal value -> positions carrying that value
                v2p = collections.defaultdict(list)
                # run start (first position - 1) -> [run length, value]
                range2p = {}
                coord = interval[1]
                bw_signal = bw.values(chr_name, interval[1], interval[2])
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp):
                    continue
                bw_signal = numpy.nan_to_num(bw_signal) * weight
                for v in bw_signal:
                    coord += 1
                    if v != 0:
                        v2p[v].append(coord)
                for v in v2p:
                    # Positions in the same consecutive run share a constant
                    # (index - position) difference, so groupby on that
                    # difference splits the list into runs.
                    for k, g in groupby(enumerate(v2p[v]),
                                        lambda i_x: i_x[0] - i_x[1]):
                        run = [pos for _, pos in g]
                        range2p[run[0] - 1] = [len(run), v]
                for start in sorted(range2p):
                    print(chr_name + '\t' + str(start) + '\t' +
                          str(start + range2p[start][0]) + '\t' +
                          str(range2p[start][1]),
                          file=OUT)
        else:
            print("unknown output format", file=sys.stderr)
            sys.exit(1)
    OUT.close()
    bw.close()
Example #14
0
def main():
	'''
	Call single-nucleotide peaks from a forward and a reverse BigWig file.

	For every non-nan position the peak height (the BigWig value) and the peak
	area (sum_bwfile over the extension window) are recorded, a score is
	obtained from cal_poisson_pvalue against the surrounding positions, nearby
	peaks are merged with merge_peaks, and peaks whose score is not below the
	cutoff derived from -p are written to <prefix>.single_nt_peak.xls.

	Exits with status 0 after printing the help text when a required option is
	missing, or after reporting an input file that does not exist.
	'''
	usage="%prog [options]"
	parser = OptionParser(usage,version="%prog " + __version__)
	parser.add_option("-b","--forward",action="store",type="string",dest="forward_bw",help="BigWig file for forward reads (extend 1 nt from 5' end of read)")
	parser.add_option("-d","--reverse",action="store",type="string",dest="reverse_bw",help="BigWig file for reverse reads (extend 1 nt from 5' end of read)")
	parser.add_option("-s","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
	parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output files")
	parser.add_option("-z","--fuzziness",action="store",type="int",dest="fuzzy_size",default=10,help="Peaks within fuzzy window will be merged. default=%default (bp)")
	parser.add_option("-w","--bgw",action="store",type="int",dest="window_size",default=200,help="Background window size used to determine background signal level (lambda in Poisson model). default=%default (bp)")
	parser.add_option("-c","--chunk",action="store",type="int",dest="chunk_size",default=100000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
	parser.add_option("-p","--pvalue",action="store",type="float",dest="pvalue_cutoff",default=0.1,help="Pvalue cutoff for peak detection. default=%default")
	parser.add_option("-r","--bg-root-num",action="store",type="float",dest="bg_root_num",default=100,help="Background peak root number. default=%default")
	parser.add_option("-e","--extention",action="store",type="int",dest="extention_size",default=5,help="Window size used to calculate peak area. Larger number will signficantly reduce speed, and make peak calling more meaningless.  default=%default")

	(options,args)=parser.parse_args()

	if not (options.output_prefix and options.chromSize and options.forward_bw and options.reverse_bw):
		parser.print_help()
		sys.exit(0)
	for path in (options.chromSize,options.forward_bw,options.reverse_bw):
		if not os.path.exists(path):
			print >>sys.stderr, '\n\n' + path + " does NOT exists" + '\n'
			sys.exit(0)

	chrom_sizes = load_chromsize(options.chromSize)
	OUT = open(options.output_prefix + ".single_nt_peak.xls",'w')
	fw_bw_obj = BigWigFile( file = open(options.forward_bw))
	rv_bw_obj = BigWigFile( file = open(options.reverse_bw))
	rv_peak_roots = {}
	rv_peak_height = {}
	rv_ranges={}
	rv_peak_pvalue={}
	# The cutoff is put on the -10*log10 scale; peaks scoring below it are dropped.
	pv_cutoff = -10*math.log10(options.pvalue_cutoff)
	signal.signal(signal.SIGINT, signal_handler)


	print >>sys.stderr, logo

	#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	#calculate peak height and peak area for forward bigwig
	print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Processing " + options.forward_bw + '  ...'
	for chr_name, chr_size in chrom_sizes.items():		#iterate each chrom
		fw_peak_roots = {}	#key is "chrom\tposition\tstrand", value is the peak area
		fw_peak_height = {}	#same key, value is the BigWig value at that position
		fw_ranges={}
		fw_peak_pvalue={}
		# NOTE(review): only chrY is processed. This looks like a leftover
		# debugging filter - confirm before relying on genome-wide output.
		if chr_name != 'chrY':
			continue
		print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Processing " + chr_name + " ..."
		progress = 0
		coord = 0
		#for each chunk
		for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size):	#cut chrom into bins, interval such as ('chr1', 235000000, 236000000)
			for indx,val in enumerate(fw_bw_obj.get_as_array(interval[0],interval[1],interval[2])):
				coord += 1	#coord is 1-based on genome
				if numpy.isnan(val):continue
				area_value = sum_bwfile(chr_name, coord, options.extention_size, fw_bw_obj,chrom_sizes)
				key = chr_name + "\t" + str(coord) + "\t+"
				fw_peak_roots[key] = area_value
				fw_peak_height[key] = val
				# Create the tree on first use and ALWAYS insert, so the first
				# covered position of a chromosome is not lost.
				if chr_name not in fw_ranges:
					fw_ranges[chr_name] = IntervalTree()
				fw_ranges[chr_name].insert_interval( Interval( coord-1, coord, value=area_value) )
			finish_part = int(interval[2]*100/chr_size)
			if finish_part > progress:
				print >>sys.stderr, " %d%% finished\r" % (finish_part),
				progress = finish_part


		print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Calculating pvalues for " + options.forward_bw + '  ...'
		for k in fw_peak_roots:
			chrom = k.split("\t")[0]
			coord = int(k.split("\t")[1])
			fw_peak_pvalue[k] = cal_poisson_pvalue(int(fw_peak_roots[k]), coord-1, coord, fw_ranges[chrom],options.window_size,options.bg_root_num)


		fw_peak_filtered = merge_peaks(fw_peak_height,fuzziness=options.fuzzy_size)
		for k,v in fw_peak_filtered.items():
			(chrom,end,strand) = k.split('\t')
			end = int(end)
			start = end -1
			height = str(v)
			area = str(fw_peak_roots[k])
			pvalue = fw_peak_pvalue[k]
			if pvalue < pv_cutoff:continue
			print >>OUT, '\t'.join([chrom, str(start), str(end), area,str(round(pvalue)),strand,height])

	#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	#calculate peak height and peak area for reverse bigwig
	print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Processing " + options.reverse_bw + '  ...'
	for chr_name, chr_size in chrom_sizes.items():		#iterate each chrom
		# NOTE(review): same chrY-only filter as in the forward pass.
		if chr_name != 'chrY':
			continue
		print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Processing " + chr_name + " ..."
		progress = 0
		coord = 0
		#for each chunk
		for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size):	#cut chrom into bins, interval such as ('chr1', 235000000, 236000000)

			for indx,val in enumerate(rv_bw_obj.get_as_array(interval[0],interval[1],interval[2])):
				coord += 1	#coord is 1-based on genome
				if numpy.isnan(val):continue
				area_value = sum_bwfile(chr_name, coord, options.extention_size, rv_bw_obj,chrom_sizes)
				key = chr_name + "\t" + str(coord) + "\t-"
				rv_peak_roots[key] = area_value
				rv_peak_height[key] = val
				# Create the tree on first use and ALWAYS insert, so the first
				# covered position of a chromosome is not lost.
				if chr_name not in rv_ranges:
					rv_ranges[chr_name] = IntervalTree()
				rv_ranges[chr_name].insert_interval( Interval( coord-1, coord, value = area_value) )
			finish_part = int(interval[2]*100/chr_size)
			if finish_part > progress:
				print >>sys.stderr, " %d%% finished\r" % (finish_part),
				progress = finish_part


	print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Calculating pvalues for " + options.reverse_bw + '  ... '
	for k in rv_peak_roots:
		chrom = k.split("\t")[0]
		coord = int(k.split("\t")[1])
		rv_peak_pvalue[k] = cal_poisson_pvalue(int(rv_peak_roots[k]),coord-1,coord, rv_ranges[chrom],options.window_size,options.bg_root_num)


	rv_peak_filtered = merge_peaks(rv_peak_height,fuzziness=options.fuzzy_size)
	for k,v in rv_peak_filtered.items():
		(chrom,end,strand) = k.split('\t')
		end = int(end)
		start = end -1
		height = str(v)
		area = str(rv_peak_roots[k])
		pvalue = rv_peak_pvalue[k]
		if pvalue < pv_cutoff:continue

		print >>OUT, '\t'.join([chrom, str(start), str(end), area, str(round(pvalue)),strand,height])
	OUT.close()
Example #15
0
def main():
	'''
	Report how aligned read fragments distribute over gene-model features.

	Builds non-overlapping feature sets from the reference gene model (CDS
	exons, 5'/3' UTRs, introns, and 1/5/10 kb up- and downstream intergenic
	windows), reads a SAM file (or standard input when the input is "-"),
	indexes the midpoint of every aligned block of each uniquely mapped read,
	and prints per feature class the total bases, the read count returned by
	base_read and the reads per kilobase to stderr.

	Exits with status 0 after printing the help text when a required option is
	missing, or after reporting an input file that does not exist.
	'''
	usage="%prog [options]" + '\n' + __doc__ + "\n"
	parser = OptionParser(usage,version="%prog " + __version__)
	parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Input file in SAM format. Use \"-\" represents standard input [required]")
	parser.add_option("-r","--refgene",action="store",type="string",dest="ref_gene_model",help="Reference gene model in bed format. [required]")
	(options,args)=parser.parse_args()

	if not (options.input_file and options.ref_gene_model):
		parser.print_help()
		sys.exit(0)
	if not os.path.exists(options.ref_gene_model):
		print >>sys.stderr, '\n\n' + options.ref_gene_model + " does NOT exists" + '\n'
		sys.exit(0)
	if os.path.exists(options.input_file):
		file_obj=open(options.input_file)
	elif options.input_file == '-':
		file_obj=sys.stdin
	else:
		print >>sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n'
		sys.exit(0)

	print >>sys.stderr, "processing " + options.ref_gene_model + ' ...',
	obj = BED.ParseBED(options.ref_gene_model)
	utr_3 = obj.getUTR(utr=3)
	utr_5 = obj.getUTR(utr=5)
	cds_exon = obj.getCDSExon()
	intron = obj.getIntron()

	intron = BED.unionBed3(intron)
	cds_exon=BED.unionBed3(cds_exon)
	utr_5 = BED.unionBed3(utr_5)
	utr_3 = BED.unionBed3(utr_3)

	# Make the genic classes mutually exclusive: CDS takes precedence over
	# UTRs, and both take precedence over introns.
	utr_5 = BED.subtractBed3(utr_5,cds_exon)
	utr_3 = BED.subtractBed3(utr_3,cds_exon)
	for genic in (cds_exon,utr_5,utr_3):
		intron = BED.subtractBed3(intron,genic)

	# Intergenic windows, keyed by (direction, size in bp).
	window_keys = []
	intergenic = {}
	for size in (1000,5000,10000):
		for direction in ("up","down"):
			window_keys.append((direction,size))
			intergenic[(direction,size)] = obj.getIntergenic(direction=direction,size=size)

	#merge each intergenic window set, then remove anything overlapping a genic feature
	for key in window_keys:
		merged = BED.unionBed3(intergenic[key])
		for genic in (cds_exon,utr_5,utr_3,intron):
			merged = BED.subtractBed3(merged,genic)
		intergenic[key] = merged

	print >>sys.stderr, "Done"

	ranges={}
	totalReads=0
	spliceReads=0
	cUR=0
	multiMapReads=0

	print >>sys.stderr, "reading SAM file",
	for line in file_obj:
		if line.startswith("@"):continue
		fields=line.rstrip('\n ').split()
		flagCode=int(fields[1])
		if (flagCode & 0x0004) != 0: continue		#skip unmap reads
		totalReads +=1
		if not SAM.ParseSAM._uniqueHit_pat.search(line):		#skip multiple mapped reads
			multiMapReads +=1
			continue

		chrom = fields[2].upper()
		chromStart = int(fields[3])-1
		comb=[int(i) for i in SAM.ParseSAM._splicedHit_pat.findall(fields[5])]	#"9M4721N63M3157N8M" return ['9', '4721', '63', '3157', '8']
		cUR += (len(comb) +1)//2
		if(len(comb)>1):
			spliceReads += 1

		# Even indexes of comb are aligned block sizes; everything before an
		# index is the offset of that block from the alignment start.
		for i in range(0,len(comb),2):
			st = chromStart + sum(comb[:i])
			size = comb[i]
			mid = int(st) + (size//2)
			# Create the index on first use and ALWAYS add the midpoint, so
			# the first fragment seen on a chromosome is counted too.
			if chrom not in ranges:
				ranges[chrom] = Intersecter()
			ranges[chrom].add_interval( Interval( mid, mid ) )
	if file_obj is not sys.stdin:
		file_obj.close()
	print >>sys.stderr, "Done"
	print >>sys.stderr, "Total Reads: " + str(totalReads)
	print >>sys.stderr, "Multiple Hits: " + str(multiMapReads)
	print >>sys.stderr, "Unique Hits: " + str(totalReads-multiMapReads)
	print >>sys.stderr, "Spliced Hits: " + str(spliceReads)
	print >>sys.stderr, "Total fragments: " + str(cUR)



	print >>sys.stderr, "\nAssignning reads ...",
	# (report label, feature set); base_read returns (total bases, read count).
	counted = [
		("Intronic region:", intron),
		("CDS Exons:", cds_exon),
		("5'UTR Exons:", utr_5),
		("3'UTR Exons:", utr_3),
		("TSS up 1kb:", intergenic[("up",1000)]),
		("TSS up 5kb:", intergenic[("up",5000)]),
		("TSS up 10kb:", intergenic[("up",10000)]),
		("TES down 1kb:", intergenic[("down",1000)]),
		("TES down 5kb:", intergenic[("down",5000)]),
		("TES down 10kb:", intergenic[("down",10000)]),
	]
	stats = {}
	for label,regions in counted:
		stats[label] = base_read(regions,ranges)

	print >>sys.stderr, "Done"

	report_order = ["CDS Exons:", "5'UTR Exons:", "3'UTR Exons:", "Intronic region:",
		"TSS up 1kb:", "TSS up 5kb:", "TSS up 10kb:",
		"TES down 1kb:", "TES down 5kb:", "TES down 10kb:"]

	print >>sys.stderr, "========================================================="
	print >>sys.stderr, "Group\tTotal_bases\tReads_count\tReads/Kb"
	for label in report_order:
		(bases,reads) = stats[label]
		# A feature class with no bases reports 0.00 instead of raising
		# ZeroDivisionError.
		if bases == 0:
			per_kb = 0.0
		else:
			per_kb = reads*1000.0/bases
		print >>sys.stderr, label + "\t%d\t%d\t%5.2f" % (bases,reads,per_kb)
	print >>sys.stderr, "========================================================="
Example #16
0
def _bigwig_has_chrom(bw, chrom):
    """Return True when a one-base probe of `chrom` in `bw` succeeds.

    The probe asks for the first base and touches `.size` on the result, so
    any failure of the query or of the attribute access counts as "absent".
    """
    try:
        bw.get_as_array(chrom, 0, 1).size
    except Exception:
        return False
    return True


def _compute_wigsum(bw, options, chrom_sizes):
    """Return the total signal used as the normalization denominator.

    When `options.refgene_bed` is set, only the union of the exons of that
    gene model is summed; otherwise every chromosome listed in `chrom_sizes`
    is summed in chunks of `options.chunk_size`. Progress goes to stderr.
    """
    wig_sum = 0.0
    if options.refgene_bed:
        sys.stderr.write("Extract exons from " + options.refgene_bed + "\n")
        exons = BED.ParseBED(options.refgene_bed).getExon()
        sys.stderr.write("Merge overlapping exons ...\n")
        exons = BED.unionBed3(exons)
        sys.stderr.write("Calculate wigsum covered by " + options.refgene_bed + " only\n")
        # Probe each chromosome once instead of once per exon.
        chrom_present = {}
        for chrom, st, end in exons:
            if chrom not in chrom_present:
                chrom_present[chrom] = _bigwig_has_chrom(bw, chrom)
            if not chrom_present[chrom]:
                continue
            # NaNs are ignored by nansum; the isnan guard keeps the original
            # protection against an all-NaN region summing to NaN.
            region_sum = numpy.nansum(bw.get_as_array(chrom, st, end))
            if numpy.isnan(region_sum):
                continue
            wig_sum += region_sum
        sys.stderr.write("Total wigsum is %.2f\n\n" % wig_sum)
    else:
        sys.stderr.write("Calculate wigsum from " + options.BigWig_File + "\n")
        for chr_name, chr_size in chrom_sizes.items():
            if not _bigwig_has_chrom(bw, chr_name):
                sys.stderr.write("Skip " + chr_name + "!\n")
                continue

            sys.stderr.write("Processing " + chr_name + " ...\n")
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                chunk_sum = numpy.nansum(bw.get_as_array(interval[0], interval[1], interval[2]))
                if numpy.isnan(chunk_sum):
                    continue
                wig_sum += chunk_sum
        sys.stderr.write("\nTotal wigsum is %.2f\n\n" % wig_sum)
    return wig_sum


def _write_normalized_wig(bw, out, chrom_sizes, chunk_size, weight):
    """Write every non-zero base of `bw`, scaled by `weight`, as variableStep wiggle."""
    for chr_name, chr_size in chrom_sizes.items():
        if not _bigwig_has_chrom(bw, chr_name):
            sys.stderr.write("Skip " + chr_name + "!\n")
            continue

        sys.stderr.write("Writing " + chr_name + " ...\n")
        out.write("variableStep chrom=" + chr_name + "\n")
        for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=chunk_size):
            chunk_start = interval[1]
            bw_signal = bw.get_as_array(chr_name, chunk_start, interval[2])
            if numpy.isnan(numpy.nansum(bw_signal)):
                continue
            bw_signal = numpy.nan_to_num(bw_signal)
            # Written positions are chunk start + 1-based offset into the chunk.
            for offset, value in enumerate(bw_signal, 1):
                if value != 0:
                    out.write("%d\t%.4f\n" % (chunk_start + offset, value * weight))


def main():
    """Normalize a BigWig file to a fixed total wigsum and write a wiggle file.

    Command-line options: -i input BigWig, -o output wig, -s chromosome size
    file (all three required), -t target wigsum, -r optional gene model in BED
    format (restricts the wigsum calculation to its exons), -c chunk size.

    Every signal value is multiplied by ``total_wigsum / observed_wigsum``.
    Exits with status 0 after printing help when a required option is
    missing, and with status 1 when the observed wigsum is 0.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)

    parser.add_option(
        "-i", "--bwfile", action="store", type="string", dest="BigWig_File", help="Input BigWig file. [required]"
    )
    parser.add_option(
        "-o", "--output", action="store", type="string", dest="output_wig", help="Output wig file. [required]"
    )
    parser.add_option(
        "-s",
        "--chromSize",
        action="store",
        type="string",
        dest="chromSize",
        help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]",
    )
    parser.add_option(
        "-t",
        "--wigsum",
        action="store",
        type="int",
        dest="total_wigsum",
        default=100000000,
        help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default  [optional]",
    )
    parser.add_option(
        "-r",
        "--refgene",
        action="store",
        type="string",
        dest="refgene_bed",
        help="Reference gene model in bed format. [optional]",
    )
    parser.add_option(
        "-c",
        "--chunk",
        action="store",
        type="int",
        dest="chunk_size",
        default=100000,
        help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]",
    )
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    # BigWig is a binary format, so read it in binary mode; both handles are
    # closed on every exit path, including sys.exit().
    with open(options.output_wig, "w") as OUT, open(options.BigWig_File, "rb") as bw_handle:
        bw = BigWigFile(file=bw_handle)
        chrom_sizes = load_chromsize(options.chromSize)

        WIG_SUM = _compute_wigsum(bw, options, chrom_sizes)
        # Check explicitly: dividing by a numpy zero yields inf with a warning
        # rather than raising, so an exception handler would not catch it.
        if WIG_SUM == 0:
            sys.stderr.write("Error, WIG_SUM cannot be 0\n")
            sys.exit(1)
        weight = options.total_wigsum / WIG_SUM

        sys.stderr.write("Normalizing bigwig file, output wiggle file\n")
        _write_normalized_wig(bw, OUT, chrom_sizes, options.chunk_size, weight)