Beispiel #1
0
def main():
    """Combine two BigWig files position by position and write a wig file.

    Command-line options select the two input BigWig files (-i/-j), the
    operation to apply (-a, looked up by name on ``twoList``), the output wig
    path (-o), the chromosome-size file (-s) and the chunk size (-c).

    Every chromosome listed in the size file is processed chunk by chunk.
    Chunks for which ``all_nan`` is true for both inputs are skipped; the
    remaining signals are passed through ``replace_nan`` and combined with the
    selected operation.  Only non-zero results are written, as
    "position<TAB>value" lines under a ``variableStep`` header, where the
    first position of a chunk is the chunk start + 1.

    Prints the help text and exits with status 0 when a required option is
    missing; exits through ``parser.error`` when --action does not name an
    attribute of ``twoList``.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)

    parser.add_option("-i","--bwfile1",action="store",type="string",dest="BigWig_File1",help="BigWig files")
    parser.add_option("-j","--bwfile2",action="store",type="string",dest="BigWig_File2",help="BigWig files")
    parser.add_option("-a","--action",action="store",type="string",dest="action",help='After pairwise align two bigwig files, perform the follow actions (Only select one keyword):"Add" = add signals. "Average" = average signals. "Division"= divide bigwig2 from bigwig1. Add 1 to both bigwig. "Max" = pick the signal that is larger. "Min" = pick the signal that is smaller. "Product" = multiply signals. "Subtract" = subtract signals in 2nd bigwig file from the corresponiding ones in the 1st bigwig file. "geometricMean" = take the geometric mean of signals.')
    parser.add_option("-o","--output",action="store",type="string",dest="output_wig",help="Output wig file")
    parser.add_option("-s","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
    parser.add_option("-c","--chunk",action="store",type="int",dest="chunk_size",default=100000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
    (options,args)=parser.parse_args()

    if not (options.BigWig_File1 and options.BigWig_File2 and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    # Resolve the operation once, before any output is produced, so a missing
    # or misspelled --action is reported as a usage error instead of failing
    # inside the chunk loop after the output file has been partly written.
    if not options.action or not hasattr(twoList, options.action):
        parser.error("--action must name an operation of twoList, got %r" % (options.action,))
    call_back = getattr(twoList, options.action)

    handles = []
    try:
        OUT = open(options.output_wig, 'w')
        handles.append(OUT)
        # bigWig is a binary format, so the inputs are opened in binary mode.
        bw_handle1 = open(options.BigWig_File1, 'rb')
        handles.append(bw_handle1)
        bw_handle2 = open(options.BigWig_File2, 'rb')
        handles.append(bw_handle2)
        bw1 = BigWigFile(file=bw_handle1)
        bw2 = BigWigFile(file=bw_handle2)

        chrom_sizes = load_chromsize(options.chromSize)
        for chr_name, chr_size in chrom_sizes.items():
            print >>sys.stderr, "Processing " + chr_name + " ..."
            OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                coord = interval[1]
                bw_signal1 = bw1.get_as_array(chr_name, interval[1], interval[2])
                bw_signal2 = bw2.get_as_array(chr_name, interval[1], interval[2])
                if all_nan(bw_signal1) and all_nan(bw_signal2):
                    continue
                bw_signal1 = replace_nan(bw_signal1)
                bw_signal2 = replace_nan(bw_signal2)

                for v in call_back(bw_signal1, bw_signal2):
                    # coord is advanced before printing, so the first base of
                    # the chunk is reported at position chunk start + 1.
                    coord += 1
                    if v != 0: print >>OUT, "%d\t%.2f" % (coord, v)
    finally:
        # Close everything that was opened, even when processing fails, so
        # the wig output is flushed to disk.
        for handle in handles:
            handle.close()
def Main():
    """Compare the per-gene mean signal of two BigWig tracks.

    Reads the options through ``ParseArg``.  For every line of
    ``args.geneList`` (whitespace separated: gene, chrom, start, end) the mean
    of each track over the region, shifted by +50, is written to
    ``<args.output>.list`` and collected.  Regions for which either track
    returns no array are skipped.  The collected pairs are drawn as a scatter
    plot with two dashed red threshold lines and saved as
    ``<args.output>_dot.eps``.
    """
    global args
    args = ParseArg()
    bw1 = BigWigFile(open(args.percentile1))
    bw2 = BigWigFile(open(args.percentile2))
    gout = WriteToFile(args.output + ".list")
    perc_array1 = []
    perc_array2 = []
    for line in ReadFromFile(args.geneList):
        row = line.strip().split()
        gene = row[0]
        chrom = row[1]
        start = int(row[2])
        end = int(row[3])
        array1 = bw1.get_as_array(chrom, start, end)
        array2 = bw2.get_as_array(chrom, start, end)
        if array1 is not None and array2 is not None:
            perc1 = np.mean(array1) + 50
            perc2 = np.mean(array2) + 50
            print >> gout, '%s\t%s\t%d\t%d\t%f\t%f' % (gene, chrom, start, end,
                                                       perc1, perc2)
            perc_array1.append(perc1)
            perc_array2.append(perc2)

    # scatter plot
    sns.set()
    plt.scatter(perc_array1,
                perc_array2,
                marker=',',
                color='black',
                s=1,
                alpha=0.1)
    # Use the Axes the scatter was drawn on.  A bare plt.axes() may create a
    # brand-new Axes on current matplotlib, which would detach the styling
    # below from the plotted data.
    ax = plt.gca()
    ax.set_aspect('equal')
    plt.xlabel(args.x, fontsize=20)
    plt.ylabel(args.y, fontsize=20)
    plt.ylim(0, 100)
    plt.xlim(0, 100)
    plt.tick_params(axis='both', which='major', labelsize=20, width=2)
    ticks = [0, 20, 40, 60, 80, 100]
    ax.set_yticks(ticks)
    ax.set_xticks(ticks)
    # Two lines showing the threshold used to call changed domains.
    x1, y1 = [0, 89.5], [10.5, 100]
    x2, y2 = [10.5, 100], [0, 89.5]
    for side in ('left', 'bottom', 'right', 'top'):
        ax.spines[side].set_linewidth(2)
    plt.subplots_adjust(bottom=.2, left=.2)
    plt.plot(x1, y1, linewidth=0.5, linestyle='--', color='red')
    plt.plot(x2, y2, linewidth=0.5, linestyle='--', color='red')
    plt.savefig(args.output + '_dot.eps', format='eps')
    plt.close()

    logging("DONE!!!")
Beispiel #3
0
class BigWigWrapper(object):
    """Index-style access to a bx-python BigWig file.

    ``wrapper[iv]`` returns whatever ``BigWigFile.get_as_array`` returns for
    the region described by ``iv.chrom``, ``iv.start`` and ``iv.end``.
    """
    def __init__(self, filepath):
        # The handle is handed to BigWigFile and stays open with the wrapper.
        handle = open(filepath)
        self.bw = BigWigFile(handle)

    def __getitem__(self, iv):
        region = (iv.chrom, iv.start, iv.end)
        return self.bw.get_as_array(*region)
def findInsertions(bwFile, bedData, interval, x):
    """Return the BigWig signal around one BED record.

    :param bwFile: path of the BigWig file to read.
    :param bedData: indexable collection of BED records; record ``x`` is used
        (column 0 = chromosome, 1 = start, 2 = end).
    :param interval: ``'start'`` to centre the window on the record start,
        ``'end'`` to centre it on the record end, anything else to cover the
        whole record.
    :param x: index of the record in ``bedData``.
    :return: the value of ``BigWigFile.get_as_array`` for the window extended
        by ``options.l`` to the left and ``options.r`` to the right (which may
        be ``None``), or an all-NaN array of the window length when the query
        raises ``OverflowError``.

    The previous implementation also computed a mean of the signal but then
    overwrote it with the raw signal before returning; that dead computation
    has been removed, the return value is unchanged.
    """
    record = bedData[x]
    if interval == 'start':
        left_anchor = right_anchor = int(record[1])
    elif interval == 'end':
        left_anchor = right_anchor = int(record[2])
    else:
        left_anchor = int(record[1])
        right_anchor = int(record[2])
    sL = left_anchor - options.l
    sR = right_anchor + options.r

    # get signal data; the with-block closes the handle on every exit path
    with open(bwFile, "rb") as f:
        bigwig_class = BigWigFile(f)
        try:
            signal = bigwig_class.get_as_array(record[0], sL, sR)
        except OverflowError:
            signal = np.array([np.nan] * (sR - sL))

    return signal
Beispiel #5
0
    def summarize(self,
                  interval,
                  bins=None,
                  method='summarize',
                  function='mean'):
        """Return the signal of this BigWig file over `interval`.

        Parameters
        ----------
        interval : object with ``chrom``, ``start`` and ``stop`` attributes.
        bins : int or None
            Number of bins.  When None the per-base signal is returned.
        method : {'summarize', 'get_as_array', 'ucsc_summarize'}
            'get_as_array' (or ``bins=None``) returns the per-base array with
            NaN replaced by 0.  'ucsc_summarize' delegates to
            ``self.ucsc_summarize``.  Anything else uses
            ``BigWigFile.summarize``.
        function : str
            Statistic for the binned methods.  With the default method 'sum',
            'mean', 'min', 'max' and 'std' are handled; any other value
            returns the raw summary object unchanged.

        Returns
        -------
        An array of zeros when the file has no data for the interval,
        otherwise the requested values.

        Raises
        ------
        ValueError
            If `method` is 'ucsc_summarize' and `function` is not supported.
        """
        # Dividing 0 by 0 for bins without data is expected here.  np.errstate
        # silences the "invalid" warning only for the duration of the call and
        # restores the caller's settings on every exit path, including the
        # early return and the raise below.
        with np.errstate(invalid='ignore'):
            if (bins is None) or (method == 'get_as_array'):
                with open(self.fn, 'rb') as handle:
                    s = BigWigFile(handle).get_as_array(
                        interval.chrom,
                        interval.start,
                        interval.stop,
                    )
                if s is None:
                    return np.zeros((interval.stop - interval.start, ))
                s[np.isnan(s)] = 0
                return s

            if method == 'ucsc_summarize':
                if function in ['mean', 'min', 'max', 'std', 'coverage']:
                    return self.ucsc_summarize(interval, bins,
                                               function=function)
                raise ValueError(
                    'function "%s" not supported by UCSC\'s bigWigSummary'
                    % function)

            with open(self.fn, 'rb') as handle:
                s = BigWigFile(handle).summarize(interval.chrom,
                                                 interval.start,
                                                 interval.stop, bins)
            if s is None:
                return np.zeros((bins, ))

            if function == 'sum':
                return s.sum_data
            if function == 'mean':
                values = s.sum_data / s.valid_count
                values[np.isnan(values)] = 0
                return values
            if function == 'min':
                values = s.min_val
                values[np.isinf(values)] = 0
                return values
            if function == 'max':
                values = s.max_val
                values[np.isinf(values)] = 0
                return values
            if function == 'std':
                # NOTE(review): this is the mean of squares, not a standard
                # deviation -- kept as in the original, confirm the intent.
                values = (s.sum_squares / s.valid_count)
                values[np.isnan(values)] = 0
                return values
            return s
Beispiel #6
0
def getChromatinDataSeries(bigwigFile, libraryTable, sgInfoTable, tssTable, colname = '', naValue = 0):
    """Return the mean BigWig signal over each sgRNA as a pandas Series.

    For every row of ``sgInfoTable`` the chromosome is looked up in
    ``tssTable['chromosome']`` under the key
    ``(gene_name, ','.join(transcript_list))``.  The signal is averaged
    (ignoring NaN) between the PAM coordinate and the PAM coordinate shifted
    by the sgRNA length, forward on '+' and backward otherwise.  Rows whose
    key is not in the chromosome table, and regions without data, get NaN.

    The result is indexed by ``libraryTable.index``, named ``colname``, and
    has NaN replaced by ``naValue``.
    NOTE(review): this assumes sgInfoTable and libraryTable have the same
    number of rows in the same order -- confirm against the callers.
    """
    chromDict = tssTable['chromosome'].to_dict()

    chromatinScores = []
    # The with-block closes the bigwig handle once all rows are scored.
    with open(bigwigFile, 'rb') as bigwigHandle:
        bwindex = BigWigFile(bigwigHandle)
        for name, sgInfo in sgInfoTable.iterrows():
            geneTup = (sgInfo['gene_name'], ','.join(sgInfo['transcript_list']))

            if geneTup not in chromDict:  # negative controls
                chromatinScores.append(np.nan)
                continue

            pam = sgInfo['pam coordinate']
            if sgInfo['strand'] == '+':
                sgRange = pam + sgInfo['length']
            else:
                sgRange = pam - sgInfo['length']

            chrom = chromDict[geneTup]

            chromatinArray = bwindex.get_as_array(chrom, min(pam, sgRange), max(pam, sgRange))
            if chromatinArray is not None and len(chromatinArray) > 0:
                chromatinScores.append(np.nanmean(chromatinArray))
            else:  # often chrY when using K562 data..
                chromatinScores.append(np.nan)

    chromatinSeries = pd.Series(chromatinScores, index=libraryTable.index, name = colname)

    return chromatinSeries.fillna(naValue)
Beispiel #7
0
def get_mean_phastcons(bedtool, phastcons_location):
    """
    Get mean phastcons scores for all intervals in a bed tool.

    bedtool - bedtool to extract data from
    phastcons_location - location of phastcons (bigWig) file

    Returns a float array with one entry per interval, in iteration order.
    Intervals for which the file returns no data (``None`` or an empty
    array) get a score of 0.
    """
    # Start from zeros so that intervals without data are already scored 0.
    data = np.zeros(len(bedtool))

    # The with-block closes the bigwig handle once all intervals are read.
    with open(phastcons_location, 'rb') as f:
        bw = BigWigFile(file=f)
        for i, bedline in enumerate(bedtool):
            conservation_values = bw.get_as_array(bedline.chrom, bedline.start, bedline.stop)
            # get_as_array may return None, which has no len(); the check
            # below keeps that case from raising TypeError.
            if conservation_values is not None and len(conservation_values) > 0:
                data[i] = np.mean(conservation_values)

    return data
Beispiel #8
0
def get_mean_phastcons(bedtool, phastcons_location, sample_size = 1000):
    """
    Mean phastcons score of a random sample of intervals from a bed tool.

    bedtool - bedtool to extract data from
    phastcons_location - location of phastcons file
    sample_size - upper bound on the number of intervals sampled

    Returns a list of means; intervals whose lookup raises TypeError
    (e.g. no array returned) are left out.
    """
    with open(phastcons_location) as handle:
        bigwig = BigWigFile(handle)
        means = []
        n_sampled = min(len(bedtool), sample_size)
        for region in bedtool.random_subset(n_sampled):
            scores = bigwig.get_as_array(region.chrom, region.start, region.stop)
            try:
                # Empty arrays count as a score of 0.
                means.append(np.mean(scores) if len(scores) > 0 else 0)
            except TypeError:
                continue
    return means
Beispiel #9
0
def get_mean_phastcons(bedtool, phastcons_location, sample_size=1000):
    """
    Sample intervals from a bed tool and average their phastcons scores.

    bedtool - bedtool to extract data from
    phastcons_location - location of phastcons file
    sample_size - maximum number of intervals to sample

    Returns one mean per sampled interval (0 for an empty array); an
    interval is dropped when handling it raises TypeError.
    """
    with open(phastcons_location) as phastcons_handle:
        reader = BigWigFile(phastcons_handle)
        collected = []
        subset = bedtool.random_subset(min(len(bedtool), sample_size))
        for entry in subset:
            values = reader.get_as_array(entry.chrom, entry.start, entry.stop)
            try:
                has_values = len(values) > 0
                score = np.mean(values) if has_values else 0
                collected.append(score)
            except TypeError:
                pass
    return collected
Beispiel #10
0
def Main():
    """Average a BigWig track in fixed windows and write bed, wig and bigWig.

    Reads the options through ``ParseArg``.  For every chromosome of the
    genome file (in ``SortGenome`` order) the per-base signal is read, NaN is
    replaced by 0 and the signal is averaged in windows of ``args.window``
    bases (the last window may be shorter).  With ``args.smooth`` the window
    means are passed through ``Smooth``.  The values are written to
    ``<output>/<name>.bed`` and ``<output>/<name>.wig``; the wig file is then
    converted with the external ``wigToBigWig`` program.

    Chromosomes for which the BigWig file returns no array are logged and
    skipped instead of raising a TypeError.
    """
    global args
    args = ParseArg()
    # The with-block closes the bigwig handle once all chromosomes are read.
    with open(args.bigwig, 'rb') as bw_handle:
        bw = BigWigFile(bw_handle)
        CheckFolderExist(args.output)
        fout = WriteToFile(args.output + '/' + args.name + '.bed')
        wout = WriteToFile(args.output + '/' + args.name + '.wig')
        genome = LoadGenome(args.genome)
        if args.smooth:
            logging("Options: turn on smooth mode")
        for chrom in SortGenome(genome):
            chrom_size = genome[chrom]
            logging("Process: %s\t%d" % (chrom, chrom_size))
            array = bw.get_as_array(chrom, 0, chrom_size)
            if array is None:
                logging("Skip: %s\tno signal in bigwig" % chrom)
                continue
            array[np.isnan(array)] = 0

            # One mean per window; slicing clamps the last window to the end
            # of the array.
            n_windows = int(math.ceil(len(array) / float(args.window)))
            agg_array = np.array([
                np.mean(array[idx * args.window:(idx + 1) * args.window])
                for idx in range(n_windows)
            ])
            if args.smooth:
                smooth_array = Smooth(agg_array)
            else:
                smooth_array = agg_array

            print >> wout, "variableStep chrom=%s span=%d" % (chrom, args.window)
            last = len(smooth_array) - 1
            for nn, value in enumerate(smooth_array):
                if nn == 0:
                    print >> fout, "%s\t0\t%d\t%.6f" % (chrom,
                                                        (nn + 1) * args.window,
                                                        float(value))
                    print >> wout, "%d\t%.6f" % (nn + 1, value)
                elif nn == last:
                    # The last window ends at the chromosome end, so the wig
                    # gets a new header carrying the shorter span.
                    print >> fout, "%s\t%d\t%d\t%.6f" % (chrom, nn * args.window,
                                                         chrom_size, float(value))
                    print >> wout, "variableStep chrom=%s span=%d" % (
                        chrom, chrom_size - ((nn) * args.window))
                    print >> wout, "%d\t%.6f" % (nn * args.window + 1,
                                                 float(value))
                else:
                    print >> fout, "%s\t%d\t%d\t%.6f" % (chrom, nn * args.window,
                                                         (nn + 1) * args.window,
                                                         float(value))
                    print >> wout, "%d\t%.6f" % (nn * args.window + 1,
                                                 float(value))
    # Flush before the external converter reads the wig file.
    fout.flush()
    wout.flush()
    wig2bw = "wigToBigWig -clip %s %s %s" % (args.output + '/' + args.name +
                                             '.wig', args.genome, args.output +
                                             '/' + args.name + '.bw')
    os.system(wig2bw)
    logging("Finish: TSA_smooth DONE!!!")
Beispiel #11
0
def _get_signal_or_empty(bw, chrom, start, end):
    """Return the signal of `bw` over chrom:start-end, or an empty array if unavailable."""
    try:
        signal = bw.get_as_array(chrom, start, end)
    except Exception:
        signal = None
    if signal is None:
        return numpy.array([])
    return signal


def main():
    """Combine two BigWig files position by position and write a wig file.

    Command-line options select the two input BigWig files (-i/-j), the
    operation to apply (-a, looked up by name on ``twoList``), the output wig
    path (-o), the chromosome-size file (-s), the chunk size (-c) and the
    minimum value to report (-m).

    Every chromosome listed in the size file is processed chunk by chunk.
    A chunk is skipped when neither file has data for it or when both signals
    are entirely NaN.  When only one file has data, the other side is treated
    as missing (NaN) for the whole chunk.  NaN is then replaced by 0 and the
    signals are combined; results >= the -m threshold are written as
    "position<TAB>value" lines under a ``variableStep`` header, where the
    first position of a chunk is the chunk start + 1.

    Prints the help text and exits with status 0 when a required option is
    missing; exits through ``parser.error`` when --action does not name an
    attribute of ``twoList``.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)

    parser.add_option("-i","--bwfile1",action="store",type="string",dest="BigWig_File1",help="One BigWig file")
    parser.add_option("-j","--bwfile2",action="store",type="string",dest="BigWig_File2",help="Another BigWig file")
    parser.add_option("-a","--action",action="store",type="string",dest="action",help='After pairwise align two bigwig files, perform the follow actions (Only select one keyword):"Add" = add signals. "Average" = average signals. "Division"= divide bigwig2 from bigwig1. Add 1 to both bigwig. "Max" = pick the signal that is larger. "Min" = pick the signal that is smaller. "Product" = multiply signals. "Subtract" = subtract signals in 2nd bigwig file from the corresponiding ones in the 1st bigwig file. "geometricMean" = take the geometric mean of signals.')
    parser.add_option("-o","--output",action="store",type="string",dest="output_wig",help="Output wig file")
    parser.add_option("-s","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
    parser.add_option("-c","--chunk",action="store",type="int",dest="chunk_size",default=100000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
    parser.add_option("-m","--min_signal",action="store",type="float",dest="min_score",default=0.0,help="To redude the size of output wigfile, genomic positions with signal value smaller than (<) this threshold will be filtered out. default=%default")
    (options,args)=parser.parse_args()

    if not (options.BigWig_File1 and options.BigWig_File2 and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    # Resolve the operation once, before any output is produced, so a missing
    # or misspelled --action is reported as a usage error instead of failing
    # inside the chunk loop.
    if not options.action or not hasattr(twoList, options.action):
        parser.error("--action must name an operation of twoList, got %r" % (options.action,))
    call_back = getattr(twoList, options.action)

    handles = []
    try:
        OUT = open(options.output_wig, 'w')
        handles.append(OUT)
        # bigWig is a binary format, so the inputs are opened in binary mode.
        bw_handle1 = open(options.BigWig_File1, 'rb')
        handles.append(bw_handle1)
        bw_handle2 = open(options.BigWig_File2, 'rb')
        handles.append(bw_handle2)
        bw1 = BigWigFile(file=bw_handle1)
        bw2 = BigWigFile(file=bw_handle2)

        chrom_sizes = load_chromsize(options.chromSize)
        for chr_name, chr_size in chrom_sizes.items():
            print >>sys.stderr, "Processing " + chr_name + " ..."
            OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                coord = interval[1]
                bw_signal1 = _get_signal_or_empty(bw1, chr_name, interval[1], interval[2])
                bw_signal2 = _get_signal_or_empty(bw2, chr_name, interval[1], interval[2])
                if len(bw_signal1) == 0 and len(bw_signal2) == 0:
                    continue
                # Only one file has data here: mark the other side as missing
                # so both arrays have the same length for the operation.
                if len(bw_signal1) == 0:
                    bw_signal1 = numpy.array([numpy.nan] * len(bw_signal2))
                elif len(bw_signal2) == 0:
                    bw_signal2 = numpy.array([numpy.nan] * len(bw_signal1))
                # nansum() returns 0 rather than NaN for all-NaN input on
                # current NumPy, so the "no data at all" case is tested
                # directly on the arrays.
                if numpy.isnan(bw_signal1).all() and numpy.isnan(bw_signal2).all():
                    continue
                bw_signal1 = numpy.nan_to_num(bw_signal1)
                bw_signal2 = numpy.nan_to_num(bw_signal2)

                for v in call_back(bw_signal1, bw_signal2):
                    # coord is advanced before printing, so the first base of
                    # the chunk is reported at position chunk start + 1.
                    coord += 1
                    if v >= options.min_score: print >>OUT, "%d\t%.2f" % (coord, v)
    finally:
        # Close everything that was opened, even when processing fails, so
        # the wig output is flushed to disk.
        for handle in handles:
            handle.close()
Beispiel #12
0
    def summarize(self, interval, bins=None, method='summarize',
                  function='mean'):
        """Return the signal of this BigWig file over `interval`.

        Parameters
        ----------
        interval : object with ``chrom``, ``start`` and ``stop`` attributes.
        bins : int or None
            Number of bins.  When None the per-base signal is returned.
        method : {'summarize', 'get_as_array', 'ucsc_summarize'}
            'get_as_array' (or ``bins=None``) returns the per-base array with
            NaN replaced by 0.  'ucsc_summarize' delegates to
            ``self.ucsc_summarize``.  Anything else uses
            ``BigWigFile.summarize``.
        function : str
            Statistic for the binned methods.  With the default method 'sum',
            'mean', 'min', 'max' and 'std' are handled; any other value
            returns the raw summary object unchanged.

        Returns
        -------
        An array of zeros when the file has no data for the interval,
        otherwise the requested values.

        Raises
        ------
        ValueError
            If `method` is 'ucsc_summarize' and `function` is not supported.
        """
        # Dividing 0 by 0 for bins without data is expected here.  np.errstate
        # silences the "invalid" warning only for the duration of the call and
        # restores the caller's settings on every exit path, including the
        # early return and the raise below.
        with np.errstate(invalid='ignore'):
            if (bins is None) or (method == 'get_as_array'):
                with open(self.fn, 'rb') as handle:
                    s = BigWigFile(handle).get_as_array(
                        interval.chrom,
                        interval.start,
                        interval.stop,)
                if s is None:
                    return np.zeros((interval.stop - interval.start,))
                s[np.isnan(s)] = 0
                return s

            if method == 'ucsc_summarize':
                if function in ['mean', 'min', 'max', 'std', 'coverage']:
                    return self.ucsc_summarize(interval, bins,
                                               function=function)
                raise ValueError(
                    'function "%s" not supported by UCSC\'s bigWigSummary'
                    % function)

            with open(self.fn, 'rb') as handle:
                s = BigWigFile(handle).summarize(
                    interval.chrom,
                    interval.start,
                    interval.stop, bins)
            if s is None:
                return np.zeros((bins,))

            if function == 'sum':
                return s.sum_data
            if function == 'mean':
                values = s.sum_data / s.valid_count
                values[np.isnan(values)] = 0
                return values
            if function == 'min':
                values = s.min_val
                values[np.isinf(values)] = 0
                return values
            if function == 'max':
                values = s.max_val
                values[np.isinf(values)] = 0
                return values
            if function == 'std':
                # NOTE(review): this is the mean of squares, not a standard
                # deviation -- kept as in the original, confirm the intent.
                values = (s.sum_squares / s.valid_count)
                values[np.isnan(values)] = 0
                return values
            return s
Beispiel #13
0
class BigWigWrapper(object):

    """Thin wrapper that lets a bx-python BigWig file be indexed by interval.

    ``wrapper[iv]`` forwards ``iv.chrom``, ``iv.start`` and ``iv.end`` to
    ``BigWigFile.get_as_array`` and returns its result.
    """

    def __init__(self, filepath):
        # BigWigFile reads from this handle, which lives as long as the wrapper.
        source = open(filepath)
        self.bw = BigWigFile(source)

    def __getitem__(self, iv):
        chrom, start, end = iv.chrom, iv.start, iv.end
        return self.bw.get_as_array(chrom, start, end)
Beispiel #14
0
def wig_reader(infile, chrom_sizes=None, informat='wiggle', bin_size=2000):
    '''Iterate over the signal of a wiggle or bigwig file as intervals.

    infile: for 'wiggle', whatever bx.wiggle.IntervalReader accepts; for
        'bigwig', the path of the bigwig file.
    chrom_sizes: dict of chrom_name -> size; only needed (and then required)
        when informat is 'bigwig'.
    informat: either 'wiggle' or 'bigwig' (case-insensitive).
    bin_size: size of the chunks the chromosomes are cut into when reading a
        bigwig file.

    Yields (chrom, start, end, score) tuples.  For bigwig input, NaN is
    treated as 0, chunks without data or whose signal sums to 0 are skipped,
    and consecutive bases with the same score are merged into one interval
    within each chunk.

    Raises Exception for an unknown informat (when iteration starts).
    '''
    fmt = informat.upper()
    if fmt == 'WIGGLE':
        for chrom, start, end, strand, score in bx.wiggle.IntervalReader(
                infile):
            yield (chrom, start, end, score)

    elif fmt == 'BIGWIG':
        # bigWig is a binary format; the with-block closes the handle when
        # the generator finishes or is closed.
        with open(infile, 'rb') as bw_handle:
            bw_obj = BigWigFile(file=bw_handle)
            for chr_name, chr_size in list(chrom_sizes.items()):
                for chrom, st, end in BED.tillingBed(chrName=chr_name,
                                                     chrSize=chr_size,
                                                     stepSize=bin_size):
                    sig_list = bw_obj.get_as_array(chrom, st, end)
                    if sig_list is None:
                        continue
                    sig_list = numpy.nan_to_num(sig_list)
                    if numpy.sum(sig_list) == 0:
                        continue
                    low_bound = st
                    point_num = 1
                    score = sig_list[0]
                    for value in sig_list[1:]:
                        if value == score:
                            point_num += 1
                        else:
                            yield (chrom, low_bound, low_bound + point_num,
                                   score)
                            score = value
                            low_bound = low_bound + point_num
                            point_num = 1
                    # Emit the run that is still open at the end of the
                    # chunk; without this the last run of every chunk (and a
                    # chunk with one constant value) was silently dropped.
                    yield (chrom, low_bound, low_bound + point_num, score)
    else:
        raise Exception("Unknown format. Must be 'wiggle' or 'bigwig'")
Beispiel #15
0
def bigwig_to_wav(args):
    """Write one wav file per BED region from the signal of a BigWig file.

    Reads from ``args``: ``bigwig_file``, ``bed_file`` (tab separated, first
    three columns chrom/start/end), ``smooth`` ('none', 'boxcar' or
    'gaussian'), ``window_size``, ``zoom``, ``output_dir``, ``sample_rate``
    and ``n_channels``.

    For every region the signal (NaN replaced by 0) is rescaled with
    ``scipy.ndimage.zoom``, optionally smoothed with the selected filter and
    normalised, and passed to ``modulate`` which writes
    ``<output_dir>/<chrom>:<start>-<end>.wav``.  Blank BED lines and regions
    for which the BigWig file returns no data are skipped.

    Raises ValueError for an unknown ``args.smooth`` value.
    """
    import numpy as np
    from bx.bbi.bigwig_file import BigWigFile
    from scipy.signal import convolve
    from scipy.stats import norm
    from scipy.ndimage import zoom

    smooth_filter = None
    scale_factors = None
    if args.smooth == 'boxcar':
        smooth_filter = np.ones(args.window_size, dtype=np.float32)
    elif args.smooth == 'gaussian':
        smooth_filter = norm.pdf(
            np.linspace(-3, 3, args.window_size * 3,
                        endpoint=True)).astype(np.float32)
    elif args.smooth != 'none':
        raise ValueError('unknown smooth method: %r' % (args.smooth,))
    if smooth_filter is not None:
        # Response of the filter to a constant signal of its own length,
        # used to normalise the edges of the smoothed signal.
        scale_factors = convolve(np.ones(smooth_filter.shape[0]),
                                 smooth_filter,
                                 mode='same')

    logger.info('read input BigWigfile: ' + args.bigwig_file)
    logger.info('read input BED file: ' + args.bed_file)
    # The with-block closes both inputs once every region is processed.
    with open(args.bigwig_file, 'rb') as f_bigwig, \
            open(args.bed_file, 'r') as f_bed:
        bigwig = BigWigFile(f_bigwig)

        if not os.path.exists(args.output_dir):
            logger.info('create output directory: ' + args.output_dir)
            os.makedirs(args.output_dir)

        for line in f_bed:
            if not line.strip():
                continue
            c = line.strip().split('\t')
            chrom = c[0]
            start = int(c[1])
            end = int(c[2])
            signal = bigwig.get_as_array(chrom, start, end)
            if signal is None:
                logger.info('skip region without signal: %s:%d-%d'
                            % (chrom, start, end))
                continue
            x = np.nan_to_num(signal)
            # zoom the signals
            x = zoom(x, args.zoom)
            if smooth_filter is not None:
                # smooth the raw signal with a moving window
                x = convolve(x, smooth_filter, mode='same')
                # scale the signal; floor division keeps the slice bounds
                # integers on Python 3 and matches Python 2's int "/".
                filter_length = smooth_filter.shape[0]
                head = filter_length // 2
                tail = -filter_length // 2
                x[:head] /= scale_factors[:head]
                x[tail:] /= scale_factors[tail:]
                if x.shape[0] > filter_length:
                    x[head:tail] /= np.sum(smooth_filter)

            wav_file = os.path.join(args.output_dir,
                                    '%s:%d-%d.wav' % (chrom, start, end))
            logger.info('create wav file: ' + wav_file)
            modulate(x,
                     wav_file,
                     sample_rate=args.sample_rate,
                     n_channels=args.n_channels)
Beispiel #16
0
class Phylop(object):
    """Mean PhyloP scores read from a bigWig file.

    The bigWig handle stays open until :meth:`close` is called.  The object
    can also be used as a context manager, which closes the handle on exit.
    """

    def __init__(self, bw_fname):
        """
        :param bw_fname: Phylop 100way bigwig file name.
        """
        # bigWig is a binary format, so the file is opened in binary mode.
        self.bw_handle = open(os.path.expanduser(bw_fname), 'rb')
        self.bw = BigWigFile(self.bw_handle)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def get(self, chrom, start, end, flanking=0):
        """
        :param chrom: chr1, chr2, etc.
        :param start: 0-based.
        :param end: 1-based.
        :param flanking: length of flanking sequence on each side.
        :return: mean of the non-NaN scores in the window, or NaN when the
            file has no scores for it.
        """
        values = self.bw.get_as_array(chrom, start - flanking, end + flanking)
        # get_as_array may return None; an all-NaN window would only make
        # nanmean emit a warning before returning NaN anyway.
        if values is None or len(values) == 0 or np.isnan(values).all():
            return np.nan
        return np.nanmean(values)

    def calculate(self, fname, out_fname):
        """
        :param fname: SNP BED.
        :param out_fname: output file.

        Writes a header line followed by one line per non-blank input line:
        the input columns plus the mean score of the region itself and of
        the region with 3 and 7 flanking bases.
        """
        with open(fname) as bed_f, open(out_fname, 'w') as out_f:
            out_f.write('\t'.join(self._build_header()) + '\n')
            for line in bed_f:
                if not line.strip():
                    continue
                cols = line.rstrip().split('\t')
                chrom, start, end = cols[:3]
                start = int(start)
                end = int(end)
                scores = [
                    self.get(chrom, start, end),
                    self.get(chrom, start, end, 3),
                    self.get(chrom, start, end, 7)
                ]
                out_f.write('\t'.join(map(str, cols + scores)) + '\n')

    def close(self):
        """Close the underlying bigWig file handle."""
        self.bw_handle.close()

    def _build_header(self):
        """Return the output column names: 14 input columns plus 3 scores."""
        header = [
            '#chrom_snp', 'start_snp', 'end_snp', 'ref', 'alt', 'feature',
            'gene_id', 'chrom', 'start', 'end', 'name', 'score', 'strand',
            'distance'
        ]
        header += ['phylop1', 'phylop3', 'phylop7']
        return header
def findInsertions(bwFile, bedData, x):
    """Return the BigWig signal over one BED record, oriented by strand.

    :param bwFile: path of the BigWig file; replaced by
        ``options.b + options.tn5 + "." + chrom + ".Scores.bw"`` when
        ``options.tn5`` is set.
    :param bedData: indexable collection of BED records; record ``x`` is used
        (column 0 = chromosome, 1 = start, 2 = end, optional column 3 read as
        the strand).
    :param x: index of the record in ``bedData``.
    :return: the value of ``BigWigFile.get_as_array`` for the record extended
        by ``options.l`` to the left and ``options.r`` to the right, reversed
        when column 3 is "-".  An all-NaN array of the window length is used
        when the query raises ``OverflowError``; ``None`` is returned as is
        when the file has no data for the window.
    """
    record = bedData[x]
    if options.tn5 is not None:
        bwFile = options.b + options.tn5 + "." + record[0] + ".Scores.bw"

    sL = int(record[1]) - options.l
    sR = int(record[2]) + options.r

    # get signal data; the with-block closes the handle on every exit path
    with open(bwFile, "rb") as f:
        bw = BigWigFile(f)
        try:
            signal = bw.get_as_array(record[0], sL, sR)
        except OverflowError:
            signal = np.array([np.nan] * (sR - sL))

    # Records without a fourth column carry no strand information.
    try:
        strand = record[3]
    except IndexError:
        strand = None

    # A missing signal (None) cannot be reversed; return it unchanged
    # instead of raising TypeError for minus-strand records.
    if signal is not None and strand == "-":
        signal = signal[::-1]
    return signal
Beispiel #18
0
def bwSmooth(c):
    """Smooth the BigWig signal of one chunk and return it as binned rows.

    :param c: sequence whose first three items are chromosome, start and end
        of the chunk.
    :return: array built with ``np.c_`` from chromosome, bin start, bin end
        and smoothed value for the bins that have a positive value, end
        before ``chrLen`` and lie within the first ``chunkSize // step`` bins.

    Reads the module-level names ``options.a`` (BigWig path), ``wSmooth``
    (convolution kernel), ``step``, ``chunkSize``, ``padLen`` and ``chrLen``.
    Regions without data, and NaN values, are treated as 0.
    """
    chrN = c[0]
    sPos = int(c[1])
    ePos = int(c[2])

    # open bigwig and get data; the with-block closes the handle even when
    # the query raises
    with open(options.a, 'rb') as t:
        signal = BigWigFile(t).get_as_array(chrN, sPos, ePos)

    # smooth data
    if signal is None:
        signal = np.zeros(ePos - sPos)
    signal[np.isnan(signal)] = 0
    convM = np.convolve(signal, wSmooth, 'same')

    # one row per bin of `step` bases
    sList = np.arange(sPos, ePos, step)
    eList = sList + step
    chrList = np.array([chrN] * len(sList))
    # Floor division keeps the indices integers on Python 3 and matches
    # Python 2's int "/".
    meanSig = convM[range(step // 2, chunkSize + padLen, step)]

    # keep bins with signal that end inside the chromosome, and drop the
    # bins that belong to the padding after the chunk
    idx = (meanSig > 0) & (eList < chrLen)
    idx[chunkSize // step:] = False
    pData = np.c_[chrList[idx], np.array(sList[idx], dtype=str), eList[idx], meanSig[idx]]
    return pData
def profile_bwfile(inbed, bwfile):
    '''Retrieve signal from a bigwig file for each entry in the input bed file.

    inbed: path of a bed file; lines starting with '#', 'track' or 'browser'
        and blank lines are ignored.  The first three whitespace-separated
        fields must be chrom, start and end.
    bwfile: path of the bigwig file.

    Prints one line per bed entry to stdout: chrom, start, end and the
    comma-separated per-base signal, separated by tabs.  Entries for which
    the bigwig file returns no data are printed with an empty signal column.
    Lines that cannot be parsed are reported on stderr and skipped.
    '''
    bw_handle = open(bwfile, 'rb')
    try:
        bw = BigWigFile(file=bw_handle)
        bed_handle = open(inbed)
        try:
            for line in bed_handle:
                if line.startswith(('#', 'track', 'browser')):
                    continue
                if not line.strip():
                    continue
                fields = line.rstrip('\r\n').split()
                try:
                    chrom = fields[0]
                    start = int(fields[1])
                    end = int(fields[2])
                except (IndexError, ValueError):
                    # `line` still carries its own newline.
                    sys.stderr.write("Must be  chrom [space] start [space] end: " + line)
                    continue
                signal = bw.get_as_array(chrom, start, end)
                # get_as_array may return None, which cannot be iterated.
                if signal is None:
                    bw_signal = []
                else:
                    bw_signal = list(signal)
                sys.stdout.write(chrom + '\t' + str(start) + '\t' + str(end) + '\t' + ','.join(str(i) for i in bw_signal) + '\n')
        finally:
            bed_handle.close()
    finally:
        bw_handle.close()
def coverageGeneBody_bigwig(bigFile, refbed, outfile, gtype="png"):
    '''Accumulate bigwig signal along gene bodies, from 5' to 3'.

    bigFile: path of the bigwig file.
    refbed: path of a 12-column bed file with the gene models; the function
        prints a message and exits with status 0 when it is None.
    outfile: prefix of the two output files,
        <outfile>.geneBodyCoverage_plot.r and <outfile>.geneBodyCoverage.txt.
    gtype: name of the R graphics device used in the generated R script.

    For every gene the exon bases are collected, ordered 5'->3' according to
    the strand and reduced to sample points by mystat.percentile_list; the
    signal at each point is added to a per-point total.  The totals are
    written to the text file and to an R script that plots them divided by
    the number of genes read.
    '''
    if refbed is None:
        print >> sys.stderr, "You must specify a bed file representing gene model\n"
        exit(0)
    # NOTE(review): neither output file is closed explicitly in this function.
    OUT1 = open(outfile + ".geneBodyCoverage_plot.r", 'w')
    OUT2 = open(outfile + ".geneBodyCoverage.txt", 'w')

    bw = BigWigFile(file=open(bigFile))
    print >> sys.stderr, "calculating coverage over gene body ..."
    # sample-point index -> summed signal over all genes
    coverage = collections.defaultdict(int)
    flag = 0
    gene_count = 0
    for line in open(refbed, 'r'):
        try:
            if line.startswith(('#', 'track', 'browser')): continue
            # Counted before any of the skip conditions below, so skipped
            # genes are included in the divisor written to the R script.
            gene_count += 1
            # Parse fields from gene table
            fields = line.split()
            chrom = fields[0]
            tx_start = int(fields[1])
            tx_end = int(fields[2])
            geneName = fields[3]
            strand = fields[5]

            # Column 12 holds exon starts relative to tx_start, column 11 the
            # exon lengths.  NOTE(review): exon_starts is consumed twice
            # below, which relies on map() returning a list (Python 2).
            exon_starts = map(int, fields[11].rstrip(',\n').split(','))
            exon_starts = map((lambda x: x + tx_start), exon_starts)
            exon_ends = map(int, fields[10].rstrip(',\n').split(','))
            exon_ends = map((lambda x, y: x + y), exon_starts, exon_ends)
        except:
            print >> sys.stderr, "[NOTE:input bed must be 12-column] skipped this line: " + line,
            continue
        gene_all_base = []
        percentile_base = []
        mRNA_len = 0
        flag = 0
        for st, end in zip(exon_starts, exon_ends):
            gene_all_base.extend(range(st + 1, end +
                                       1))  # 1-based positions of the exon bases
            mRNA_len = len(gene_all_base)
            # NOTE(review): this test runs after every exon, so a gene is
            # dropped as soon as its first exon alone is shorter than 100
            # bases, whatever its total length -- confirm this is intended.
            if mRNA_len < 100:
                flag = 1
                break
        if flag == 1: continue
        if strand == '-':
            gene_all_base.sort(reverse=True)  # deal with gene on minus strand
        else:
            gene_all_base.sort(reverse=False)
        percentile_base = mystat.percentile_list(
            gene_all_base)  # presumably 101 points per gene -- see "x=0:100" below

        for i in range(0, len(percentile_base)):
            # Query the single base ending at the sample point.
            sig = bw.get_as_array(chrom, percentile_base[i] - 1,
                                  percentile_base[i])
            if sig is None: continue
            coverage[i] += np.nan_to_num(sig[0])
        print >> sys.stderr, "  %d genes finished\r" % gene_count,

    x_coord = []
    y_coord = []
    print >> OUT2, "percentile\tcount"
    # NOTE(review): the rows follow the dict's iteration order; confirm they
    # come out in ascending index order on the interpreter in use.
    for i in coverage:
        x_coord.append(str(i))
        y_coord.append(str(coverage[i]))
        print >> OUT2, str(i) + '\t' + str(coverage[i])

    # R script: open the device, define x and y, plot the per-gene average.
    print >> OUT1, "%s(\'%s\')" % (gtype,
                                   outfile + ".geneBodyCoverage." + gtype)
    print >> OUT1, "x=0:100"
    print >> OUT1, "y=c(" + ','.join(y_coord) + ')'
    print >> OUT1, "plot(x,y/%s,xlab=\"percentile of gene body (5'->3')\",ylab='average wigsum',type='s')" % gene_count
    print >> OUT1, "dev.off()"
def main():
    """Rescale a BigWig track so that its total signal equals ``--wigsum``.

    The total signal ("wigsum") is measured either over the merged exons of
    ``--refgene`` or, when no gene model is given, over every chromosome of
    the ``--chromSize`` file that the BigWig can serve.  Every per-base value
    is then multiplied by ``total_wigsum / wigsum`` and written to
    ``--output`` as variableStep wiggle ("wig") or bedGraph ("bgr").

    Exits with status 0 after printing the help text when a required option
    is missing, and with status 1 for an unknown ``--format`` or when the
    measured wigsum is 0 (no scaling factor can be derived).
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)

    parser.add_option("-i", "--bwfile", action="store", type="string",
                      dest="BigWig_File",
                      help="Input BigWig file. [required]")
    parser.add_option("-o", "--output", action="store", type="string",
                      dest="output_wig",
                      help="Output wig file. [required]")
    parser.add_option(
        "-s", "--chromSize", action="store", type="string", dest="chromSize",
        help=
        "Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]"
    )
    parser.add_option(
        "-t", "--wigsum", action="store", type="int", dest="total_wigsum",
        default=100000000,
        help=
        "Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default  [optional]"
    )
    parser.add_option("-r", "--refgene", action="store", type="string",
                      dest="refgene_bed",
                      help="Reference gene model in bed format. [optional]")
    parser.add_option(
        "-c", "--chunk", action="store", type="int", dest="chunk_size",
        default=500000,
        help=
        "Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]"
    )
    parser.add_option(
        "-f", "--format", action="store", type="string", dest="out_format",
        default="bgr",
        help=
        "Output format. either \"wig\" or \"bgr\". \"bgr\" save disk space but make program slower. default=%default"
    )
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    # Reject a bad --format before the output file is truncated and before
    # the (expensive) wigsum pass, instead of in the middle of writing.
    out_format = options.out_format.upper()
    if out_format not in ("WIG", "BGR"):
        print >> sys.stderr, "unknown output format"
        sys.exit(1)

    OUT = open(options.output_wig, 'w')
    try:
        # BigWig is a binary format; 'rb' is a no-op on POSIX and avoids
        # newline translation elsewhere.
        bw = BigWigFile(file=open(options.BigWig_File, 'rb'))
        chrom_sizes = load_chromsize(options.chromSize)
        probed = {}  # chromosome name -> whether the BigWig can serve it

        def has_chrom(chrom):
            # Probe a single base once per chromosome.  A None result or an
            # error from the reader both mean "skip this chromosome".
            if chrom not in probed:
                try:
                    probed[chrom] = bw.get_as_array(chrom, 0, 1) is not None
                except Exception:
                    probed[chrom] = False
            return probed[chrom]

        WIG_SUM = 0.0
        if options.refgene_bed:
            print >> sys.stderr, "Extract exons from " + options.refgene_bed
            obj = BED.ParseBED(options.refgene_bed)
            exons = obj.getExon()
            print >> sys.stderr, "Merge overlapping exons ..."
            exons = BED.unionBed3(exons)
            print >> sys.stderr, "Calculate wigsum covered by " + options.refgene_bed + ' only'
            for chrom, st, end in exons:
                if not has_chrom(chrom):
                    continue
                tmp = numpy.nansum(bw.get_as_array(chrom, st, end))
                # Some numpy versions return nan (not 0) for an all-nan input.
                if numpy.isnan(tmp):
                    continue
                WIG_SUM += tmp
            print >> sys.stderr, "Total wigsum is %.2f\n" % WIG_SUM
        else:
            print >> sys.stderr, "Calculate wigsum from " + options.BigWig_File
            for chr_name, chr_size in chrom_sizes.items():
                if not has_chrom(chr_name):
                    print >> sys.stderr, "Skip " + chr_name + "!"
                    continue
                print >> sys.stderr, "Processing " + chr_name + " ..."
                for interval in BED.tillingBed(chrName=chr_name,
                                               chrSize=chr_size,
                                               stepSize=options.chunk_size):
                    tmp = numpy.nansum(
                        bw.get_as_array(interval[0], interval[1], interval[2]))
                    if numpy.isnan(tmp):
                        continue
                    WIG_SUM += tmp
            print >> sys.stderr, "\nTotal wigsum is %.2f\n" % WIG_SUM

        # Explicit test: dividing by a numpy zero yields inf instead of
        # raising, so a try/except around the division is not reliable.
        if WIG_SUM == 0:
            print >> sys.stderr, "Error, WIG_SUM cannot be 0"
            sys.exit(1)
        weight = options.total_wigsum / WIG_SUM

        print >> sys.stderr, "Normalizing bigwig file ..."
        for chr_name, chr_size in chrom_sizes.items():
            if not has_chrom(chr_name):
                print >> sys.stderr, "Skip " + chr_name + "!"
                continue

            print >> sys.stderr, "Writing " + chr_name + " ..."
            if out_format == "WIG":
                OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name,
                                           chrSize=chr_size,
                                           stepSize=options.chunk_size):
                bw_signal = bw.get_as_array(chr_name, interval[1], interval[2])
                if numpy.isnan(numpy.nansum(bw_signal)):
                    continue
                bw_signal = numpy.nan_to_num(bw_signal) * weight

                if out_format == "WIG":
                    coord = interval[1]  # wiggle positions are 1-based
                    for v in bw_signal:
                        coord += 1
                        if v != 0:
                            print >> OUT, "%d\t%.2f" % (coord, v)
                else:
                    # bedGraph: one line per maximal run of equal, non-zero
                    # values, in 0-based half-open coordinates.
                    start = interval[1]
                    for value, run in groupby(bw_signal):
                        run_len = sum(1 for _ in run)
                        if value != 0:
                            print >> OUT, chr_name + '\t' + str(
                                start) + '\t' + str(
                                    start + run_len) + '\t' + str(value)
                        start += run_len
    finally:
        OUT.close()
inFile.close();

# Load the chromosome sizes: tab-separated "name<TAB>size", '#' lines ignored.
chromSizesFile = MYUTILS.smartGZOpen(args.chrsFile,'r');
chromSizes = {};
for line in chromSizesFile:
	line = line.rstrip();
	# Lines read from a file keep their newline, so test after stripping;
	# otherwise a blank line would fail on data[1] below.
	if line == "" or line[0]=="#": continue
	data=line.split("\t");
	chromSizes[data[0]]=int(data[1]);
chromSizesFile.close();

curBW = BigWigFile(open(args.inBW))

wigPath = "%s.wig.gz"%(args.outFPre);
bwPath = "%s.bw"%(args.outFPre);

# Dump every translated chromosome as a fixedStep wiggle track under its new name.
outStream = MYUTILS.smartGZOpen(wigPath,"w");
outStream.write("track type=wiggle_0\n")

for chr in transChrs:
	values = curBW.get_as_array( chr, 0, chromSizes[oldToNew[chr]] )
	#print(chr);
	if values is not None:
		sys.stderr.write("Adding %s -> %s\n"%(chr, oldToNew[chr]));
		outStream.write("fixedStep chrom=%s start=1 step=1\n"%(oldToNew[chr]))
		outStream.write("\n".join(map(str,values)));
		outStream.write("\n");

# The stream must be flushed and closed before wigToBigWig reads the file,
# otherwise the converter can see a truncated gzip stream.
outStream.close();

# No pipes are attached, so the tool's own messages go straight to the
# terminal; success is judged by its exit status.
toBW = subprocess.Popen(["wigToBigWig",wigPath,args.chrsFile,bwPath])
toBW.communicate()
if toBW.returncode != 0:
	sys.stderr.write("wigToBigWig: exited with status %d; keeping %s\n"%(toBW.returncode, wigPath));
elif os.path.isfile(bwPath): # if no errors, delete the original
	os.remove(wigPath)
Beispiel #23
0
def main():
    """Rescale a BigWig track to a fixed total signal and write it as wiggle.

    The total signal ("wigsum") is measured either over the merged exons of
    ``--refgene`` or, when no gene model is given, over every chromosome of
    the ``--chromSize`` file that the BigWig can serve.  Every non-zero
    per-base value is then multiplied by ``total_wigsum / wigsum`` and
    written to ``--output`` in variableStep wiggle format.

    Exits with status 0 after printing the help text when a required option
    is missing, and with status 1 when the measured wigsum is 0.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)

    parser.add_option("-i", "--bwfile", action="store", type="string",
                      dest="BigWig_File",
                      help="Input BigWig file. [required]")
    parser.add_option("-o", "--output", action="store", type="string",
                      dest="output_wig",
                      help="Output wig file. [required]")
    parser.add_option(
        "-s", "--chromSize", action="store", type="string", dest="chromSize",
        help=
        "Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]"
    )
    parser.add_option(
        "-t", "--wigsum", action="store", type="int", dest="total_wigsum",
        default=100000000,
        help=
        "Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default  [optional]"
    )
    parser.add_option("-r", "--refgene", action="store", type="string",
                      dest="refgene_bed",
                      help="Reference gene model in bed format. [optional]")
    parser.add_option(
        "-c", "--chunk", action="store", type="int", dest="chunk_size",
        default=100000,
        help=
        "Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]"
    )
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    OUT = open(options.output_wig, 'w')
    try:
        # BigWig is a binary format; 'rb' is a no-op on POSIX and avoids
        # newline translation elsewhere.
        bw = BigWigFile(file=open(options.BigWig_File, 'rb'))
        chrom_sizes = load_chromsize(options.chromSize)
        probed = {}  # chromosome name -> whether the BigWig can serve it

        def has_chrom(chrom):
            # Probe a single base once per chromosome.  A None result or an
            # error from the reader both mean "skip this chromosome".
            if chrom not in probed:
                try:
                    probed[chrom] = bw.get_as_array(chrom, 0, 1) is not None
                except Exception:
                    probed[chrom] = False
            return probed[chrom]

        WIG_SUM = 0.0
        if options.refgene_bed:
            print >> sys.stderr, "Extract exons from " + options.refgene_bed
            obj = BED.ParseBED(options.refgene_bed)
            exons = obj.getExon()
            print >> sys.stderr, "Merge overlapping exons ..."
            exons = BED.unionBed3(exons)
            print >> sys.stderr, "Calculate wigsum covered by " + options.refgene_bed + ' only'
            for chrom, st, end in exons:
                if not has_chrom(chrom):
                    continue
                tmp = numpy.nansum(bw.get_as_array(chrom, st, end))
                # Some numpy versions return nan (not 0) for an all-nan input.
                if numpy.isnan(tmp):
                    continue
                WIG_SUM += tmp
            print >> sys.stderr, "Total wigsum is %.2f\n" % WIG_SUM
        else:
            print >> sys.stderr, "Calculate wigsum from " + options.BigWig_File
            for chr_name, chr_size in chrom_sizes.items():
                if not has_chrom(chr_name):
                    print >> sys.stderr, "Skip " + chr_name + "!"
                    continue
                print >> sys.stderr, "Processing " + chr_name + " ..."
                for interval in BED.tillingBed(chrName=chr_name,
                                               chrSize=chr_size,
                                               stepSize=options.chunk_size):
                    tmp = numpy.nansum(
                        bw.get_as_array(interval[0], interval[1], interval[2]))
                    if numpy.isnan(tmp):
                        continue
                    WIG_SUM += tmp
            print >> sys.stderr, "\nTotal wigsum is %.2f\n" % WIG_SUM

        # Explicit test: dividing by a numpy zero yields inf instead of
        # raising, so a try/except around the division is not reliable.
        if WIG_SUM == 0:
            print >> sys.stderr, "Error, WIG_SUM cannot be 0"
            sys.exit(1)
        weight = options.total_wigsum / WIG_SUM

        print >> sys.stderr, "Normalizing bigwig file, output wiggle file"
        for chr_name, chr_size in chrom_sizes.items():
            if not has_chrom(chr_name):
                print >> sys.stderr, "Skip " + chr_name + "!"
                continue

            print >> sys.stderr, "Writing " + chr_name + " ..."
            OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name,
                                           chrSize=chr_size,
                                           stepSize=options.chunk_size):
                bw_signal = bw.get_as_array(chr_name, interval[1], interval[2])
                if numpy.isnan(numpy.nansum(bw_signal)):
                    continue
                coord = interval[1]  # wiggle positions are 1-based
                for v in numpy.nan_to_num(bw_signal):
                    coord += 1
                    if v != 0:
                        print >> OUT, "%d\t%.4f" % (coord, v * weight)
    finally:
        OUT.close()
Beispiel #24
0
# Sliding-window smoothing of a BigWig signal, processed chunk by chunk.
# NOTE(review): this excerpt is truncated after "# save out"; options, gSizes,
# bw, chunkSize and padLen are defined outside the visible part.
wSize = options.w
# Box kernel of wSize ones.  It is not divided by wSize, so the convolution
# below yields window sums -- NOTE(review): confirm whether the missing tail
# normalises them, given the name meanSig.
wSmooth = np.ones(wSize)
step = options.s

#### SCRIPT #####
# split genome into chunks
for i in range(0,len(gSizes)):
    # break chrs into pieces
    chrN = gSizes[i][0]
    chrLen = int(gSizes[i][1])  # not used in the visible part of the loop
    # chunk start coordinates: 1, 1+chunkSize, ... below the chromosome size
    sVals = np.arange(1,int(gSizes[i][1]),chunkSize)
    
    # read in bigWig
    for j in range(0,len(sVals)):
        # get data, pass if not available
        # each request spans chunkSize+padLen bases from the chunk start
        signal = bw.get_as_array(chrN,sVals[j],sVals[j]+chunkSize+padLen)
        # any failure of .any() (e.g. signal is None) skips this chunk
        try: signal.any()
        except: continue
        
        # smooth data
        print chrN, sVals[j]
        # missing values count as zero signal
        signal[np.isnan(signal)] = 0
        convM = np.convolve(signal,wSmooth,'same')
            
        # save data
        # bin starts every `step` bases across the padded chunk; ends are start+step
        sList = np.arange(sVals[j],sVals[j]+chunkSize+padLen,step)
        eList = sList+step
        chrList = np.array([chrN]*len(sList))
        # smoothed value sampled at the middle of each bin (offset step/2)
        meanSig = convM[range(step/2,chunkSize+padLen,step)]
        
        # save out
def main():
	"""Profile the forward/reverse strand distance around called peaks.

	For every '+' peak of the peak file whose 5th column is at least 30, the
	forward signal in a window of +/- ``--window`` bp around the peak is
	compared (euclidean distance) with the reverse signal in the same window
	shifted by 0..``--shift-size`` bp.  Each peak's distance profile is
	divided by its own maximum, and the per-offset average over all peaks
	with a non-constant profile is printed to stdout as "offset<TAB>value",
	ordered by offset.

	Exits with status 0 after printing the help text when a required option
	is missing or the window size is smaller than 1.
	"""
	usage="%prog [options]"
	parser = OptionParser(usage,version="%prog " + __version__)
	parser.add_option("-p","--peak-file",action="store",type="string",dest="peak_file",help="Peak file generated by ChEAP_PeakCalling")
	parser.add_option("-f","--forward",action="store",type="string",dest="forward_peak",help="BigWig file of forward peak (first 5nt)")
	parser.add_option("-r","--reverse",action="store",type="string",dest="reverse_peak",help="BigWig file of reverse peak (first 5nt)")
	parser.add_option("-c","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
	parser.add_option("-w","--window",action="store",type="int",dest="window_size",default=5,help="Window size (on genome) to calculate cross strand distance. default=%default")
	parser.add_option("-s","--shift-size",action="store",type="int",dest="max_distance",default=100,help="Maximum shift size. default=%default")

	(options,args)=parser.parse_args()

	if not (options.peak_file and options.forward_peak and options.reverse_peak and options.chromSize):
		parser.print_help()
		sys.exit(0)
	if options.window_size <1:
		# The message now matches the check (1 itself is accepted).
		print >>sys.stderr, "window size must be an integer >= 1"
		parser.print_help()
		sys.exit(0)
	fwd = BigWigFile( file=open(options.forward_peak) )
	rev = BigWigFile( file=open(options.reverse_peak) )
	chrom_sizes = load_chromsize(options.chromSize)

	# (chrom, peak position) -> distances for offsets 0..max_distance.
	# A tuple key cannot collide the way the concatenated string did
	# ("chr1" + "23" and "chr12" + "3" are both "chr123").
	shiftSize=collections.defaultdict(list)
	with open(options.peak_file,'r') as peaks:
		for line in peaks:
			if line.startswith('#'):
				continue
			fields=line.rstrip().split()
			if not fields:
				continue
			if fields[3] == '-':
				continue
			if int(fields[4]) <30:
				continue
			chrom = fields[0]
			if chrom not in chrom_sizes:
				print >>sys.stderr, "Skip peak on " + chrom + ": chromosome is not in the size file"
				continue
			peak_pos = int(fields[2])
			# Clamp the window to the chromosome.
			peak_start = max(peak_pos - options.window_size, 0)
			peak_end = min(peak_pos + options.window_size, chrom_sizes[chrom])

			fwd_signal = replace_nan( fwd.get_as_array(chrom,peak_start,peak_end) )
			for offset in range(0,options.max_distance+1):
				rev_signal = replace_nan( rev.get_as_array(chrom,peak_start + offset, peak_end + offset) )
				shiftSize[(chrom, peak_pos)].append(twoList.euclidean_distance(fwd_signal,rev_signal))

	count=0
	avg_eud=collections.defaultdict(int)	#offset -> sum of normalised distances
	for distances in shiftSize.values():
		# A constant profile carries no shift information (and could have a
		# zero maximum), so it is left out of the average.
		if len(set(distances))==1:
			continue
		count +=1
		norm_factor = float(max(distances))
		for indx, val in enumerate(distances):
			avg_eud[indx] += val/norm_factor
	# count is non-zero whenever avg_eud has entries.
	for offset in sorted(avg_eud):
		sys.stdout.write(str(offset) + '\t' + str(avg_eud[offset]/count) + '\n')
Beispiel #26
0
def _leaf_to_root_branches(tree, spe):
    """Return the node names on the path from leaf *spe* up to the root (inclusive)."""
    node = tree.get_leaves_by_name(spe)[0]
    names = []
    while not node.is_root():
        names.append(node.name)
        node = node.up
    names.append(node.name)  # add the root
    return names


def _sample_to_count(poslist, total_spe):
    """Warn and randomly keep *total_spe* positions (original order) when the counts disagree."""
    if total_spe == len(poslist):
        return poslist
    warning(
        "The program think the number of TFBS (eg. %s) in target species is not realistic. Sample %d TFBS from total %d TFBS"
        % (poslist[0], total_spe, len(poslist)))
    # random.sample raises ValueError when total_spe exceeds len(poslist).
    kept = sorted(random.sample(range(len(poslist)), total_spe))
    return [poslist[i] for i in kept]


def _ages_by_score_rank(scores, counts, branches):
    """Give each site the branch matching its score rank (lowest score -> first branch).

    Branches are taken in leaf-to-root order, each repeated ``counts[branch]``
    times.  Returns None when those slots do not cover every site.
    """
    slots = []
    for name in branches:
        if name in counts:
            slots.extend([name] * counts[name])
    if len(slots) != len(scores):
        return None
    order = np.argsort(np.array(scores))
    ranks = np.empty(len(order), dtype=int)
    ranks[order] = np.arange(len(order))  # invert the sort permutation
    return [slots[rank] for rank in ranks]


def retrieve_boo(boo_list, spe, out_filename, mode, phyloP_filename,
                 maf_folder, tree):
    """Retrieve the age (branch of origin) of each binding site; main species is *spe*.

    Writes one "chrom<TAB>start<TAB>stop<TAB>branch" line per site to
    *out_filename*.  *mode* selects how sites are matched to branches:

    * ``"simple"``  -- branches are emitted in the iteration order of the
      table's ``'count'`` mapping, each repeated by its count;
    * ``"phyloP"``  -- sites are ranked by their summed phyloP signal read
      from *phyloP_filename* (a BigWig);
    * ``"maf"``     -- sites are ranked by ``MafFile.score`` computed from
      *maf_folder*.

    Each entry of *boo_list* is read through its ``'pos'`` (list of
    "chrom:start-stop" strings) and ``'count'`` (branch name -> count) keys.
    The process exits with status 1 on an unknown mode, a missing input for
    the chosen mode, or when sites and branches cannot be matched one to one.
    """
    out = WriteToFile(out_filename)
    phyloP_bw = None
    maf_block = None
    spe_list = None
    branches = None
    if mode == "phyloP":
        if phyloP_filename is None:
            error(
                "outorder method is phyloP, a bigwig phyloP file must be provided"
            )
            exit(1)
        phyloP_bw = BigWigFile(open(phyloP_filename))
        branches = _leaf_to_root_branches(tree, spe)
    elif mode == "maf":
        if maf_folder is None:
            error("outrder method is maf, maf file folder must be provided")
            exit(1)
        maf_block = MafFile(spe, maf_folder)
        branches = _leaf_to_root_branches(tree, spe)
        spe_list = [leaf.name for leaf in tree]
    elif mode != "simple":
        # "simple" needs no extra input; it used to be rejected here although
        # the per-table loop below implements it.
        print >> sys.stderr, "Unknown mode: %s" % (mode)
        exit(1)

    for boo_table in boo_list:
        poslist = boo_table['pos']
        counts = boo_table['count']
        if len(counts) == 0:
            warning(
                "The program think the number of TFBS (eg. %s) in target species is not realistic. Skip it."
                % (poslist[0]))
            continue

        if mode == "simple":
            agelist = []
            for branch in counts.keys():
                agelist.extend([branch] * counts[branch])
        else:
            # in some cases the number of tfbs will not equal to leaf number
            poslist = _sample_to_count(poslist, sum(counts.values()))
            scores = []
            for region in poslist:
                region_pos = region.replace(':', '-').split('-')
                chrom = region_pos[0]
                start = int(region_pos[1])
                stop = int(region_pos[2])
                if mode == "phyloP":
                    array = phyloP_bw.get_as_array(chrom, start, stop)
                    if array is None:
                        # No data at all for this chromosome: score it like a
                        # region whose positions are all missing.
                        scores.append(0.0)
                        continue
                    array[np.isnan(array)] = 0.0
                    scores.append(sum(array))
                else:
                    scores.append(
                        maf_block.score(chrom, start, stop, spe, spe_list))
            agelist = _ages_by_score_rank(scores, counts, branches)

        # Explicit check instead of assert, which is stripped under -O.
        if agelist is None or len(poslist) != len(agelist):
            error("motif count is not equal to boo count")
            exit(1)
        for pos, age in zip(poslist, agelist):
            chrom, start, stop = pos.replace(':', '-').split('-')
            print >> out, chrom + '\t' + start + '\t' + stop + '\t' + age
Beispiel #27
0
def main():
	"""Call single-nucleotide peaks from forward and reverse BigWig tracks.

	For each strand every position with a signal is recorded with its height
	(the signal value) and its area (``sum_bwfile`` over ``--extention`` bp),
	scored with ``cal_poisson_pvalue`` against the surrounding positions,
	merged with ``merge_peaks`` (``--fuzziness``) and, when its score reaches
	``-10*log10(--pvalue)``, written to ``<prefix>.single_nt_peak.xls`` as
	chrom, start, end, area, rounded score, strand, height.

	Exits with status 0 after printing help when a required option is
	missing, or when an input file does not exist.
	"""
	usage="%prog [options]"
	parser = OptionParser(usage,version="%prog " + __version__)
	parser.add_option("-b","--forward",action="store",type="string",dest="forward_bw",help="BigWig file for forward reads (extend 1 nt from 5' end of read)")
	parser.add_option("-d","--reverse",action="store",type="string",dest="reverse_bw",help="BigWig file for reverse reads (extend 1 nt from 5' end of read)")
	parser.add_option("-s","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
	parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output files")
	parser.add_option("-z","--fuzziness",action="store",type="int",dest="fuzzy_size",default=10,help="Peaks within fuzzy window will be merged. default=%default (bp)")
	parser.add_option("-w","--bgw",action="store",type="int",dest="window_size",default=200,help="Background window size used to determine background signal level (lambda in Poisson model). default=%default (bp)")
	parser.add_option("-c","--chunk",action="store",type="int",dest="chunk_size",default=100000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
	parser.add_option("-p","--pvalue",action="store",type="float",dest="pvalue_cutoff",default=0.1,help="Pvalue cutoff for peak detection. default=%default")
	parser.add_option("-r","--bg-root-num",action="store",type="float",dest="bg_root_num",default=100,help="Background peak root number. default=%default")
	parser.add_option("-e","--extention",action="store",type="int",dest="extention_size",default=5,help="Window size used to calculate peak area. Larger number will signficantly reduce speed, and make peak calling more meaningless.  default=%default")

	(options,args)=parser.parse_args()

	if not (options.output_prefix and options.chromSize and options.forward_bw and options.reverse_bw):
		parser.print_help()
		sys.exit(0)
	for path in (options.chromSize,options.forward_bw,options.reverse_bw):
		if not os.path.exists(path):
			print >>sys.stderr, '\n\n' + path + " does NOT exists" + '\n'
			sys.exit(0)

	chrom_sizes = load_chromsize(options.chromSize)
	OUT = open(options.output_prefix + ".single_nt_peak.xls",'w')
	fw_bw_obj = BigWigFile( file = open(options.forward_bw))
	rv_bw_obj = BigWigFile( file = open(options.reverse_bw))
	pv_cutoff = -10*math.log10(options.pvalue_cutoff)
	signal.signal(signal.SIGINT, signal_handler)

	def scan_chrom(bw_obj, chr_name, chr_size, strand, peak_roots, peak_height, ranges):
		# Record area, height and an interval for every position of one
		# chromosome that has a signal; the dicts are filled in place.
		progress = 0
		coord = 0
		for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size):	#cut chrom into bins, interval such as ('chr1', 235000000, 236000000)
			values = bw_obj.get_as_array(interval[0],interval[1],interval[2])
			if values is None:
				# nothing can be read for this chromosome
				return
			for val in values:
				coord += 1	#coord is 1-based on genome
				if numpy.isnan(val):continue
				area_value = sum_bwfile(chr_name, coord, options.extention_size, bw_obj,chrom_sizes)
				key = chr_name + "\t" + str(coord) + "\t" + strand	#key is chrom + position + strand
				peak_roots[key] = area_value
				peak_height[key] = val
				# Create the tree on first use and ALWAYS insert; the first
				# position of a chromosome used to be left out of the tree.
				if chr_name not in ranges:
					ranges[chr_name] = IntervalTree()
				ranges[chr_name].insert_interval( Interval( coord-1, coord, value=area_value) )
			finish_part = int(interval[2]*100/chr_size)
			if finish_part > progress:
				print >>sys.stderr, " %d%% finished\r" % (finish_part),
				progress = finish_part

	def score_peaks(peak_roots, ranges):
		# Score every recorded position against the intervals of its chromosome.
		scores = {}
		for k in peak_roots:
			chrom = k.split("\t")[0]
			coord = int(k.split("\t")[1])
			scores[k] = cal_poisson_pvalue(int(peak_roots[k]), coord-1, coord, ranges[chrom],options.window_size,options.bg_root_num)
		return scores

	def write_peaks(peak_height, peak_roots, peak_pvalue):
		# Merge nearby peaks and write those whose score reaches the cutoff.
		for k,v in merge_peaks(peak_height,fuzziness=options.fuzzy_size).items():
			(chrom,end,strand) = k.split('\t')
			end = int(end)
			pvalue = peak_pvalue[k]
			if pvalue < pv_cutoff:continue
			print >>OUT, '\t'.join([chrom, str(end-1), str(end), str(peak_roots[k]),str(round(pvalue)),strand,str(v)])

	try:
		print >>sys.stderr, logo

		#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
		#calculate peak height and peak area for forward bigwig (scored and written per chromosome)
		print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Processing " + options.forward_bw + '  ...'
		for chr_name, chr_size in chrom_sizes.items():		#iterate each chrom
			# NOTE(review): only chrY is processed.  This looks like a
			# debugging leftover -- confirm before removing.
			if chr_name != 'chrY':
				continue
			fw_peak_roots = {}	#key is "chrom<TAB>pos<TAB>strand", value is area
			fw_peak_height = {}
			fw_ranges={}
			print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Processing " + chr_name + " ..."
			scan_chrom(fw_bw_obj, chr_name, chr_size, "+", fw_peak_roots, fw_peak_height, fw_ranges)

			print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Calculating pvalues for " + options.forward_bw + '  ...'
			fw_peak_pvalue = score_peaks(fw_peak_roots, fw_ranges)
			write_peaks(fw_peak_height, fw_peak_roots, fw_peak_pvalue)

		#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
		#calculate peak height and peak area for reverse bigwig (collected over all chromosomes first)
		rv_peak_roots = {}
		rv_peak_height = {}
		rv_ranges={}
		print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Processing " + options.reverse_bw + '  ...'
		for chr_name, chr_size in chrom_sizes.items():		#iterate each chrom
			# NOTE(review): same chrY-only filter as in the forward pass.
			if chr_name != 'chrY':
				continue
			print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Processing " + chr_name + " ..."
			scan_chrom(rv_bw_obj, chr_name, chr_size, "-", rv_peak_roots, rv_peak_height, rv_ranges)

		print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Calculating pvalues for " + options.reverse_bw + '  ... '
		rv_peak_pvalue = score_peaks(rv_peak_roots, rv_ranges)
		write_peaks(rv_peak_height, rv_peak_roots, rv_peak_pvalue)
	finally:
		OUT.close()

#make a matrix of the data: one row per kept genome position, one column per track
# NOTE(review): this excerpt is truncated; in the visible part curTot is never
# advanced and allData is never filled -- presumably that happens in the
# missing tail of the loop.
allData = np.empty([totalLength,len(IDs)]);
if args.eliminateMissing>0:
	keepThese = np.ones([totalLength]).astype(bool); # only those for which data was observed in all tracks

for i in range(0,len(IDs)):
	#input GB tracks
	curBW = BigWigFile(open(files[i]))
	curTot = 0;
	if args.verbose>1: sys.stderr.write("Inputting data for %s.\n"%(IDs[i]));
	# Iterate over a snapshot: chrOrder is modified below, and removing from
	# the list being iterated would silently skip the next chromosome.
	for chr in list(chrOrder):
		if args.verbose>1: sys.stderr.write("  Inputting data for %s.\n"%(chr));
		if args.verbose>2: sys.stderr.write("    Getting data from BW.\n");
		values = curBW.get_as_array( chr, 0, chromSizes[chr] )
		if values is None:
			# This track lacks the chromosome: drop its rows for every track.
			sys.stderr.write("%s is missing %s... skipping it for all\n"%(IDs[i],chr));
			chrOrder.remove(chr)
			nUsed = np.sum(useThese[chr]);	# rows this chromosome occupies
			dropRows = slice(curTot, curTot+nUsed);
			allData = np.delete(allData, dropRows,0);
			if args.eliminateMissing>0:
				keepThese = np.delete(keepThese, dropRows,0);
			totalLength = totalLength - nUsed;
			del useThese[chr]
			del chromSizes[chr]
			continue
		if args.verbose>2: sys.stderr.write("    Checking for missing data.\n");
		if args.eliminateMissing>0:
			# Clear the keep flag of every selected position that is NaN in this track.
			keepThese[np.add(curTot,np.nonzero(np.isnan( values[useThese[chr]])))] = False;
		#print(np.add(curTot,np.nonzero(np.isnan( values[useThese[chr]]))))
"""
Create a site profile vector showing the average signal accumulated from a
bigwig file around the center of each interval from a BED file.

Output is the average signal value at that relative position across the 
intervals.

usage: %prog bigwig_file.bw padding < bed_file.bed 
"""

import sys
from numpy import *

from bx.intervals.io import GenomicIntervalReader
from bx.bbi.bigwig_file import BigWigFile

# bigWig is a binary format, so open the file in binary mode.
bw = BigWigFile( open( sys.argv[1], "rb" ) )
padding = int( sys.argv[2] )
# Running sum of signal and count of non-NaN observations per relative position.
totals = zeros( padding*2, dtype=float64 )
valid = zeros( padding*2, dtype=int32 )

for interval in GenomicIntervalReader( sys.stdin ):
    # Integer midpoint: floor() would return a float, and the coordinates
    # handed to get_as_array must be integers.
    center = ( interval.start + interval.end ) // 2
    # NOTE(review): center - padding can be negative for intervals close to the
    # chromosome start; how get_as_array handles that is not visible here.
    values = bw.get_as_array( interval.chrom, center - padding, center + padding )
    if values is None:
        # Chromosome is not present in the bigWig file; nothing to accumulate.
        continue
    # Determine which positions had data and mask the rest for totalling
    invalid = isnan( values )
    values[ invalid ] = 0
    totals += values
    valid += ( ~ invalid )

# Positions never observed have valid == 0 and therefore come out as nan.
savetxt( sys.stdout, totals/valid )
Beispiel #30
0
def main(args):
    """Open the bigWig file named by ``args.bigWigFile`` and query one interval.

    NOTE(review): ``chrom``, ``st`` and ``end`` are not defined inside this
    function; unless they exist as module-level names the query raises
    NameError. The array returned by the query is discarded.
    """
    handle = open(args.bigWigFile)
    bw_file = BigWigFile(handle)
    bw_file.get_as_array(chrom, st, end)
Beispiel #31
0
def coverageGeneBody_bigwig(bigFile,refbed,outfile,gtype="png"):
	'''Accumulate bigwig signal along gene bodies, from 5' to 3'.

	For every gene of the 12-column BED file `refbed`, the exonic bases are
	collected, ordered 5'->3' according to the strand, reduced to percentile
	positions by mystat.percentile_list and the bigwig signal at each of these
	positions is added to a per-percentile total.

	bigFile: path of the bigwig file.
	refbed: path of the gene model (BED12). If None, a message is printed and
		the program exits.
	outfile: prefix of the two files that are written:
		<outfile>.geneBodyCoverage.txt     percentile / summed signal table
		<outfile>.geneBodyCoverage_plot.r  R script plotting the profile
	gtype: name of the R graphics device and extension of the plot file.

	Genes whose total exonic length is below 100 bases and lines that cannot
	be parsed as BED12 are skipped; both still count in the gene total used
	to scale the plot.'''
	if refbed is None:
		print >>sys.stderr,"You must specify a bed file representing gene model\n"
		exit(0)
	OUT1 = open(outfile + ".geneBodyCoverage_plot.r",'w')
	OUT2 = open(outfile + ".geneBodyCoverage.txt",'w')
	bw_handle = None
	try:
		# bigWig is a binary format, so open the file in binary mode.
		bw_handle = open(bigFile,'rb')
		bw = BigWigFile( file = bw_handle )
		print >>sys.stderr, "calculating coverage over gene body ..."
		coverage=collections.defaultdict(int)
		gene_count = 0
		with open(refbed,'r') as gene_models:
			for line in gene_models:
				if line.startswith(('#','track','browser')):continue
				gene_count += 1
				try:
					# Parse fields from gene table
					fields = line.split()
					chrom     = fields[0]
					tx_start  = int( fields[1] )
					int( fields[2] )	# tx_end and the name column are unused but must be present/valid
					fields[3]
					strand    = fields[5]

					# BED12: column 12 holds block starts relative to tx_start,
					# column 11 holds the block sizes.
					block_starts = [int(x) for x in fields[11].rstrip( ',\n' ).split( ',' )]
					block_sizes = [int(x) for x in fields[10].rstrip( ',\n' ).split( ',' )]
					if len(block_starts) != len(block_sizes):
						raise ValueError("block starts and block sizes differ in length")
				except (IndexError, ValueError):
					print >>sys.stderr,"[NOTE:input bed must be 12-column] skipped this line: " + line,
					continue
				exon_starts = [x + tx_start for x in block_starts]
				exon_ends = [st + size for st, size in zip(exon_starts, block_sizes)]

				gene_all_base=[]
				for st,end in zip(exon_starts,exon_ends):
					gene_all_base.extend(range(st+1,end+1))		#1-based coordinates on genome
				# Judge the length on the whole transcript. Testing inside the
				# exon loop rejected every gene whose first exon is shorter
				# than 100 bases, however long the full mRNA is.
				if len(gene_all_base) < 100: continue
				# Order the bases 5'->3' (descending for the minus strand).
				gene_all_base.sort(reverse=(strand == '-'))
				percentile_base = mystat.percentile_list (gene_all_base)	#get 101 points from each gene's coordinates

				for i, pos in enumerate(percentile_base):
					sig = bw.get_as_array(chrom,pos-1,pos)
					if sig is None:continue		# chromosome absent from the bigwig
					coverage[i] += np.nan_to_num(sig[0])
				print >>sys.stderr, "  %d genes finished\r" % gene_count,

		y_coord=[]
		print >>OUT2, "percentile\tcount"
		# Sorted so that the table and the R vector are in percentile order
		# regardless of dictionary ordering.
		for i in sorted(coverage):
			y_coord.append(str(coverage[i]))
			print >>OUT2, str(i) + '\t' + str(coverage[i])

		print >>OUT1, "%s(\'%s\')" % (gtype, outfile + ".geneBodyCoverage." + gtype)
		print >>OUT1, "x=0:100"
		print >>OUT1, "y=c(" + ','.join(y_coord) + ')'
		print >>OUT1, "plot(x,y/%s,xlab=\"percentile of gene body (5'->3')\",ylab='average wigsum',type='s')" % gene_count
		print >>OUT1, "dev.off()"
	finally:
		# Release every handle even when processing fails part-way.
		if bw_handle is not None:
			bw_handle.close()
		OUT1.close()
		OUT2.close()
Beispiel #32
0
def main():
    """Scale a bigWig file to a fixed total signal and write it as a wiggle file.

    The total signal ("wigsum") is summed either over the merged exons of the
    reference gene model (-r) or, chunk by chunk, over every chromosome listed
    in the chromosome size file. Each value is then multiplied by
    ``total_wigsum / wigsum`` and the non-zero values are written in
    variableStep format. Chromosomes that cannot be queried in the bigWig file
    are skipped. Exits with status 1 when the measured wigsum is 0.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)

    parser.add_option(
        "-i", "--bwfile", action="store", type="string", dest="BigWig_File", help="Input BigWig file. [required]"
    )
    parser.add_option(
        "-o", "--output", action="store", type="string", dest="output_wig", help="Output wig file. [required]"
    )
    parser.add_option(
        "-s",
        "--chromSize",
        action="store",
        type="string",
        dest="chromSize",
        help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]",
    )
    parser.add_option(
        "-t",
        "--wigsum",
        action="store",
        type="int",
        dest="total_wigsum",
        default=100000000,
        help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default  [optional]",
    )
    parser.add_option(
        "-r",
        "--refgene",
        action="store",
        type="string",
        dest="refgene_bed",
        help="Reference gene model in bed format. [optional]",
    )
    parser.add_option(
        "-c",
        "--chunk",
        action="store",
        type="int",
        dest="chunk_size",
        default=100000,
        help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]",
    )
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    OUT = open(options.output_wig, "w")
    try:
        # bigWig is a binary format, so open the file in binary mode.
        bw = BigWigFile(file=open(options.BigWig_File, "rb"))
        chrom_sizes = load_chromsize(options.chromSize)
        WIG_SUM = 0.0
        if options.refgene_bed:
            print >>sys.stderr, "Extract exons from " + options.refgene_bed
            obj = BED.ParseBED(options.refgene_bed)
            exons = obj.getExon()
            print >>sys.stderr, "Merge overlapping exons ..."
            exons = BED.unionBed3(exons)
            print >>sys.stderr, "Calculate wigsum covered by " + options.refgene_bed + " only"
            for chrom, st, end in exons:
                # Probe one base: a failing query means the chromosome is not
                # usable in this bigWig file, so the exon is skipped.
                try:
                    bw.get_as_array(chrom, 0, 1).size
                except Exception:
                    continue

                bw_signal = bw.get_as_array(chrom, st, end)
                # nan will be ignored. but if all items are 'nan', the result summay is 'nan' NOT 0
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp):
                    continue
                WIG_SUM += tmp
            print >>sys.stderr, "Total wigsum is %.2f\n" % WIG_SUM
        else:
            print >>sys.stderr, "Calculate wigsum from " + options.BigWig_File
            for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
                try:
                    bw.get_as_array(chr_name, 0, 1).size
                except Exception:
                    print >>sys.stderr, "Skip " + chr_name + "!"
                    continue

                print >>sys.stderr, "Processing " + chr_name + " ..."
                for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                    bw_signal = bw.get_as_array(interval[0], interval[1], interval[2])
                    tmp = numpy.nansum(bw_signal)
                    if numpy.isnan(tmp):
                        continue
                    WIG_SUM += tmp
            print >>sys.stderr, "\nTotal wigsum is %.2f\n" % WIG_SUM

        # Without any signal there is no scaling factor. Report it and stop;
        # the previous handler never printed its message and crashed on a
        # misspelled "sys".
        if WIG_SUM == 0:
            print >>sys.stderr, "Error, WIG_SUM cannot be 0"
            sys.exit(1)
        weight = options.total_wigsum / WIG_SUM

        # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
        print >>sys.stderr, "Normalizing bigwig file, output wiggle file"
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            try:
                bw.get_as_array(chr_name, 0, 1).size
            except Exception:
                print >>sys.stderr, "Skip " + chr_name + "!"
                continue

            print >>sys.stderr, "Writing " + chr_name + " ..."
            OUT.write("variableStep chrom=" + chr_name + "\n")
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                bw_signal = bw.get_as_array(chr_name, interval[1], interval[2])
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp):
                    continue
                bw_signal = numpy.nan_to_num(bw_signal)
                # Wiggle coordinates are 1-based, hence the +1 on the chunk start.
                for coord, v in enumerate(bw_signal, interval[1] + 1):
                    if v != 0:
                        print >> OUT, "%d\t%.4f" % (coord, v * weight)
    finally:
        # Flush and release the output file on every exit path.
        OUT.close()
Beispiel #33
0
class BigWig(object):
    """Wrapper around a bigWig file that also exposes its chromosome sizes.

    Chromosome names and sizes are parsed directly from the file and stored
    in ``self.sizes``; the data queries (``get_as_array``, ``get``, ``query``)
    are forwarded to ``BigWigFile``.
    """

    def __init__(self, filename):
        self.filename = filename
        self.determine_sizes()
        # bigWig is a binary format, so open the file in binary mode.
        self.bwf = BigWigFile(open(filename, "rb"))

    def determine_sizes(self):
        """Fill ``self.sizes`` (chromosome name -> size) from the file.

        Also stores the fixed header fields and the chromosome tree header
        fields as attributes.

        Raises IOError when the file does not start with the bigWig magic
        number and ValueError when the chromosome tree magic number is wrong.

        NOTE(review): only the first node of the chromosome tree is read and
        its items are unpacked as leaf items whatever ``is_leaf`` says; a
        tree with more than one node would not be read completely.
        """
        self.sizes = {}
        # The handle is only needed while parsing, so close it afterwards.
        with open(self.filename, "rb") as fh:
            # read magic number to guess endianness; bytes literals keep the
            # comparison meaningful when the file yields bytes objects
            magic = fh.read(4)
            if magic == b'&\xfc\x8f\x88':
                endianness = '<'
            elif magic == b'\x88\x8f\xfc&':
                endianness = '>'
            else:
                raise IOError("The file is not in bigwig format")

            # read the header
            info = struct.unpack(endianness + 'HHQQQHHQQIQ', fh.read(60))
            self.version = info[0]
            self.zoom_levels = info[1]
            self.chromosome_tree_offset = info[2]
            self.full_data_offset = info[3]
            self.full_index_offset = info[4]
            self.field_count = info[5]
            self.defined_field_count = info[6]
            self.auto_SQL_offset = info[7]
            self.total_summary_offset = info[8]
            self.uncompress_buf_size = info[9]

            # go to the chromosome tree
            fh.seek(self.chromosome_tree_offset)
            # read magic again
            magic = fh.read(4)
            if magic == b'\x91\x8c\xcax':
                endianness = '<'
            elif magic == b'x\xca\x8c\x91':
                endianness = '>'
            else:
                raise ValueError("Wrong magic for this bigwig data file")

            info2 = struct.unpack(endianness + 'IIIQQ', fh.read(28))
            self.block_size = info2[0]
            self.key_size = info2[1]
            self.val_size = info2[2]
            self.item_count = info2[3]

            info3 = struct.unpack(endianness + 'BBH', fh.read(4))
            self.is_leaf = info3[0]
            self.count = info3[2]

            # Each item: NUL-padded name, chromosome id, chromosome size.
            item_format = endianness + str(self.key_size) + 'sII'
            item_size = self.key_size + 2 * 4
            for n in range(self.count):
                key, chrom_id, chrom_size = struct.unpack(
                    item_format, fh.read(item_size))

                key = key.replace(b'\x00', b'')
                if not isinstance(key, str):
                    # bytes and str are distinct types here: store text keys
                    key = key.decode('latin-1')
                self.sizes[key] = chrom_size

    def get_as_array(self, chrom, start, end):
        """Forward to ``BigWigFile.get_as_array``."""
        return self.bwf.get_as_array(chrom, start, end)

    def get(self, chrom, start, end):
        """Forward to ``BigWigFile.get``."""
        return self.bwf.get(chrom, start, end)

    def query(self, chrom, start, end, number):
        """Forward to ``BigWigFile.query``."""
        return self.bwf.query(chrom, start, end, number)
	if args.inFile2 is not None:
		inFile2 = BedReader(open(args.inFile2))
else:	
	raise Exception("Unrecognized format!");



for locus in scanThese:
	if args.verbose>0:
		print("Scanning %s"%(locus[GENOMEDATA.NAME]))
	stF = max(locus[GENOMEDATA.ST] - padding,0);
	enF = locus[GENOMEDATA.EN] + padding + inclusive;
	try:
		if (locus[GENOMEDATA.STR]=="-" and args.inFile2 is not None):
			if args.format=="BIGWIG" or args.format=="BW" or args.format=="BIGBED" or args.format=="BB":
				values = inFile2.get_as_array( locus[GENOMEDATA.CHR], stF, enF )
		else:
			if args.format=="BIGWIG" or args.format=="BW" or args.format=="BIGBED" or args.format=="BB":
				values = inFile1.get_as_array( locus[GENOMEDATA.CHR], stF, enF )
	except OverflowError as e:
		sys.stderr.write("OverflowError at '%s'; st=%d, en=%d\n"%(locus[GENOMEDATA.NAME],locus[GENOMEDATA.ST],locus[GENOMEDATA.EN]));
		raise(e);
	if values is None and args.correctChr>0:
		#try again adding chr or taking it away
		if locus[GENOMEDATA.CHR][:3]=="chr":
			 locus[GENOMEDATA.CHR] =  locus[GENOMEDATA.CHR][3:]
		else:
			 locus[GENOMEDATA.CHR]="chr"+ locus[GENOMEDATA.CHR];
		try:
			if (locus[GENOMEDATA.STR]=="-" and args.inFile2 is not None):
				if args.format=="BIGWIG" or args.format=="BW" or args.format=="BIGBED" or args.format=="BB":
# bigWig is a binary format, so open the files in binary mode.
curBW1 = BigWigFile(open(args.inBW1, "rb"))
curBW2 = BigWigFile(open(args.inBW2, "rb"))


outStream = MYUTILS.smartGZOpen("%s.wig.gz"%(args.outFPre),"w");
outStream.write("track type=wiggle_0\n")

for chr in chromSizes.keys():
	last = 0;
	final = chromSizes[chr];
	# A fixedStep header is needed before the first chunk written for a
	# chromosome and again after any chunk that was skipped; otherwise the
	# values following a gap would be assigned to the wrong positions.
	needHeader = True;
	sys.stderr.write("Outputting data for %s:\n"%(chr));
	while last!=final: # this breaks it up into chunks so that I'm not piping entire (human) chromosomes at once
		# Compute the chunk end before reporting it; it used to be read
		# before its first assignment.
		curLast = np.min([last+args.chunks,final]);
		if args.verbose>0: sys.stderr.write("  Section %i - %i:\n"%(last,curLast));
		values1 = curBW1.get_as_array( chr, last, curLast )
		values2 = curBW2.get_as_array( chr, last, curLast )
		if values1 is not None and values2 is not None:
			values = applyFunction(values1,values2,args.function);
			if needHeader:
				# wiggle coordinates are 1-based, chunk offsets are 0-based
				outStream.write("fixedStep chrom=%s start=%i step=1\n"%(chr,last+1))
				needHeader = False;
			outStream.write("\n".join(map(str,values)));
			outStream.write("\n");
			outStream.flush();
		else:
			needHeader = True;
		last=curLast;


outStream.close();

# Convert the gzipped wiggle file to bigWig with the external wigToBigWig tool.
toBW = subprocess.Popen(["wigToBigWig","%s.wig.gz"%(args.outFPre),args.chrsFile,"%s.bw"%(args.outFPre)])
Beispiel #36
0
 or left_or_right == 'r' and strand == '-'):
 # 5' site
 if (chrom, coordinate) in annotated_5p:
     fivep_splice_site_counts = (
             annotated_fivep_splice_site_counts
         )
     line_counts = annotated_line_counts
 else:
     fivep_splice_site_counts = (
             unannotated_fivep_splice_site_counts
         )
     line_counts = unannotated_line_counts
 if strand == '+':
     bwvals = bw.get_as_array(
                 chrom,
                 coordinate - args.extension,
                 coordinate + args.extension
             )
     if bwvals is None:
         continue
     for i, j in enumerate(
                     xrange(-args.extension, args.extension)
                 ):
         if not math.isnan(bwvals[i]):
             fivep_splice_site_counts[j] += bwvals[i]
             line_counts[j] += 1
 elif strand == '-':
     bwvals = bw.get_as_array(
                 chrom,
                 coordinate - (args.extension - 1),
                 coordinate + (args.extension + 1)
# bigWig is a binary format, so open the file in binary mode.
curBW = BigWigFile(open(args.inBW, "rb"))

outStream = MYUTILS.smartGZOpen("%s.wig.gz"%(args.outFPre),"w");
outStream.write("track type=wiggle_0\n")


for chr in chromSizes.keys():
	last = 0;
	final = chromSizes[chr];
	# A fixedStep header is needed before the first chunk written for a
	# chromosome and again after any chunk that was skipped; otherwise the
	# values following a gap would be assigned to the wrong positions.
	needHeader = True;
	sys.stderr.write("Outputting data for %s:\n"%(chr));
	while last!=final: # this breaks it up into chunks so that I'm not piping entire (human) chromosomes at once
		# Compute the chunk end before reporting it; it used to be read
		# before its first assignment.
		curLast = np.min([last+args.chunks,final]);
		if args.verbose>0: sys.stderr.write("  Section %i - %i:\n"%(last,curLast));
		# Read extra flanking data on both sides, clipped to the chromosome.
		curEnd = np.min([curLast+additionalFlankSize, final]);
		curSt = np.max([last-additionalFlankSize,0]);
		values = curBW.get_as_array( chr, curSt, curEnd )
		if values is not None:
			for f in allFunctions:
				values = applyFunction(values,f);
			values = values[(last - curSt):(curLast-last + (last-curSt))];# set them only to the middle part of this data so that the additionalFlankSize regions are not output.
			if needHeader:
				# wiggle coordinates are 1-based, chunk offsets are 0-based
				outStream.write("fixedStep chrom=%s start=%i step=1\n"%(chr,last+1))
				needHeader = False;
			outStream.write("\n".join(map(str,values)));
			outStream.write("\n");
			outStream.flush();
		else:
			needHeader = True;
		last=curLast;

outStream.close();
Beispiel #38
0
    def summarize(self,
                  interval,
                  bins=None,
                  method='summarize',
                  function='mean',
                  zero_inf=True,
                  zero_nan=True):
        """
        Return the signal of the bigWig file `self.fn` over `interval`.

        Parameters
        ----------

        interval : object
            Object with chrom (str), start (int) and stop (int) attributes.

        bins : int or None
            Number of bins; if None, bins will be the length of the interval
            and the per-base values are returned whatever `method` is.

        method : summarize | ucsc_summarize | get_as_array
            "summarize" and "get_as_array" use bx-python; "ucsc_summarize" uses
            bigWigSummarize. See other notes in docstring for
            metaseq.array_helpers._local_coverage. If None, defaults to
            "summarize".

        function : sum | mean | min | max | std | coverage
            Determines the nature of the summarized values. Ignored if
            `method="get_as_array"`; "sum" is only valid if method is
            "summarize" and "coverage" only if method is "ucsc_summarize".
            NOTE(review): with method "summarize", "std" returns
            sum_squares / valid_count, which is the mean of the squared
            values rather than a standard deviation -- confirm intent.

        zero_inf, zero_nan : bool
            If `zero_inf` is True, set any inf or -inf to zero before
            returning. If `zero_nan` is True, set any nan values to zero before
            returning. Not applied to the result of "ucsc_summarize".

        Returns
        -------
        NumPy array of values; zeros when the bx-python query returns None.
        For "ucsc_summarize", whatever `self.ucsc_summarize` returns.

        Raises
        ------
        ValueError
            If `method` is unknown or `function` is not supported by the
            selected method.
        """

        if method is None:
            method = 'summarize'

        # We may be dividing by zero in some cases, which raises a warning in
        # NumPy based on the IEEE 754 standard (see
        # http://docs.scipy.org/doc/numpy/reference/generated/
        #       numpy.seterr.html)
        #
        # That's OK -- we're expecting that to happen sometimes. The context
        # manager silences it and restores the previous "invalid" setting on
        # every exit path, including the early return and the exceptions.
        with np.errstate(invalid='ignore'):
            if (bins is None) or (method == 'get_as_array'):
                with open(self.fn, 'rb') as fh:
                    s = BigWigFile(fh).get_as_array(
                        interval.chrom,
                        interval.start,
                        interval.stop,
                    )
                if s is None:
                    s = np.zeros((interval.stop - interval.start, ))
                else:
                    if zero_nan:
                        s[np.isnan(s)] = 0
                    if zero_inf:
                        s[np.isinf(s)] = 0

            elif method == 'ucsc_summarize':
                if function in ['mean', 'min', 'max', 'std', 'coverage']:
                    return self.ucsc_summarize(
                        interval, bins, function=function)
                raise ValueError(
                    'function "%s" not supported by UCSC\'s bigWigSummary'
                    % function)

            elif method == 'summarize':
                with open(self.fn, 'rb') as fh:
                    s = BigWigFile(fh).summarize(
                        interval.chrom, interval.start, interval.stop, bins)
                if s is None:
                    s = np.zeros((bins, ))
                else:
                    if function == 'sum':
                        s = s.sum_data
                    elif function == 'mean':
                        s = s.sum_data / s.valid_count
                        if zero_nan:
                            s[np.isnan(s)] = 0
                    elif function == 'min':
                        s = s.min_val
                        if zero_inf:
                            s[np.isinf(s)] = 0
                    elif function == 'max':
                        s = s.max_val
                        if zero_inf:
                            s[np.isinf(s)] = 0
                    elif function == 'std':
                        s = (s.sum_squares / s.valid_count)
                        if zero_nan:
                            s[np.isnan(s)] = 0
                    else:
                        raise ValueError(
                            'function "%s" not supported by bx-python'
                            % function)
            else:
                raise ValueError(
                    "method '%s' not in [summarize, ucsc_summarize, "
                    "get_as_array]" % method)

        return s
def main():
	'''Scale a bigWig file to a fixed total signal and write it as wig or bedGraph.

	The total signal ("wigsum") is summed either over the merged exons of the
	reference gene model (-r) or, chunk by chunk, over every chromosome listed
	in the chromosome size file. Every value is multiplied by
	total_wigsum / wigsum; non-zero values are written either as variableStep
	wiggle lines ("wig") or as bedGraph runs of equal value ("bgr").
	Chromosomes that cannot be queried in the bigWig file are skipped.
	Exits with status 1 when the measured wigsum is 0 or the output format is
	unknown.'''
	usage="%prog [options]"
	parser = OptionParser(usage,version="%prog " + __version__)
	
	parser.add_option("-i","--bwfile",action="store",type="string",dest="BigWig_File",help="Input BigWig file. [required]")
	parser.add_option("-o","--output",action="store",type="string",dest="output_wig",help="Output wig file. [required]")
	parser.add_option("-s","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]")
	parser.add_option("-t","--wigsum",action="store",type="int",dest="total_wigsum",default=100000000,help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default  [optional]")
	parser.add_option("-r","--refgene",action="store",type="string",dest="refgene_bed",help="Reference gene model in bed format. [optional]")	
	parser.add_option("-c","--chunk",action="store",type="int",dest="chunk_size",default=500000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
	parser.add_option("-f","--format",action="store",type="string",dest="out_format",default="bgr",help="Output format. either \"wig\" or \"bgr\". \"bgr\" save disk space but make program slower. default=%default")
	(options,args)=parser.parse_args()
	
	if not (options.BigWig_File and options.output_wig and options.chromSize):
		parser.print_help()
		sys.exit(0)

	OUT=open(options.output_wig,'w')
	# bigWig is a binary format, so open the file in binary mode.
	bw = BigWigFile( file=open(options.BigWig_File,'rb') )
	chrom_sizes = load_chromsize(options.chromSize)	
	out_format = options.out_format.upper()
	WIG_SUM=0.0
	if (options.refgene_bed):	
		print >>sys.stderr, "Extract exons from " + options.refgene_bed
		obj = BED.ParseBED(options.refgene_bed)
		exons = obj.getExon()
		print >>sys.stderr, "Merge overlapping exons ..."
		exons = BED.unionBed3(exons)
		print >>sys.stderr, "Calculate wigsum covered by " + options.refgene_bed + ' only'
		for chrom,st,end in exons:
			# Probe one base: a failing query means the chromosome is not
			# usable in this bigWig file, so the exon is skipped.
			try: bw.get_as_array(chrom,0,1).size
			except Exception:continue

			bw_signal = bw.get_as_array(chrom,st,end)
			tmp = numpy.nansum(bw_signal)			#nan will be ignored. but if all items are 'nan', the result summay is 'nan' NOT 0
			if numpy.isnan(tmp):continue	
			WIG_SUM += tmp
		print >>sys.stderr, "Total wigsum is %.2f\n" % WIG_SUM
	else:
		print >>sys.stderr, "Calculate wigsum from " + options.BigWig_File
		for chr_name, chr_size in chrom_sizes.items():		#iterate each chrom
			try: bw.get_as_array(chr_name,0,1).size
			except Exception:
				print >>sys.stderr, "Skip " + chr_name + "!"
				continue

			print >>sys.stderr, "Processing " + chr_name + " ..."	
			for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size):
				bw_signal = bw.get_as_array(interval[0],interval[1],interval[2])
				tmp = numpy.nansum(bw_signal)
				if numpy.isnan(tmp):continue
				WIG_SUM += tmp
		print >>sys.stderr, "\nTotal wigsum is %.2f\n" % WIG_SUM
	
	# Without any signal there is no scaling factor. Report it and stop; the
	# previous handler never printed its message and crashed on a misspelled
	# "sys".
	if WIG_SUM == 0:
		print >>sys.stderr, "Error, WIG_SUM cannot be 0"
		sys.exit(1)
	weight = options.total_wigsum/WIG_SUM

	#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	print >>sys.stderr, "Normalizing bigwig file ..."
	for chr_name, chr_size in chrom_sizes.items():          #iterate each chrom
		try: bw.get_as_array(chr_name,0,1).size
		except Exception:
			print >>sys.stderr, "Skip " + chr_name + "!"
			continue
		
		if out_format == "WIG":
			print >>sys.stderr, "Writing " + chr_name + " ..."
			OUT.write('variableStep chrom='+chr_name+'\n')
			for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size):
				coord = interval[1]
				bw_signal = bw.get_as_array(chr_name,interval[1],interval[2])
				tmp = numpy.nansum(bw_signal)
				if numpy.isnan(tmp):continue
				bw_signal = numpy.nan_to_num(bw_signal) * weight
				for v in bw_signal:
					coord +=1		# wiggle coordinates are 1-based
					if v != 0: print >>OUT, "%d\t%.2f" % (coord,v)
		elif out_format == "BGR":
			print >>sys.stderr, "Writing " + chr_name + " ..."
			for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size):
				v2p = collections.defaultdict(list)     #value to position
				range2p={}      #coorindate range to value, bedgraph. #[start]=[len,value]
				coord = interval[1]
				bw_signal = bw.get_as_array(chr_name,interval[1],interval[2])
				tmp = numpy.nansum(bw_signal)
				if numpy.isnan(tmp):continue
				bw_signal = numpy.nan_to_num(bw_signal) * weight
				for v in bw_signal:
					coord +=1
					if v != 0: v2p[v].append(coord)
				for v in v2p:
					# index - position is constant within a run of consecutive
					# positions, so grouping on it splits the list into runs.
					for k,g in groupby(enumerate(v2p[v]), lambda pair: pair[0]-pair[1]):
						run = [pos for _, pos in g]
						# bedGraph starts are 0-based, positions here are 1-based
						range2p[run[0]-1] = [len(run),v]
				for i in sorted(range2p):
					print >>OUT, chr_name + '\t' + str(i) +'\t' + str(i + range2p[i][0]) + '\t' + str(range2p[i][1])
		else:
			print >>sys.stderr, "unknown output format"
			sys.exit(1)