def getFragmentLength_worker(chrom, start, end, bamFile):
    """
    Queries the reads at the given region for the distance between
    reads and the read length

    At most the first 50 kb of the region are inspected. Proper pairs
    (first mate only) are used; if there are none, all reads fetched from
    the region are used instead.

    :param chrom: chromosome name
    :param start: int region start
    :param end: int region end
    :param bamFile: bamfile name
    :return: np.array, where first column is fragment length, the
             second is for read length. The array has shape (0, 2) when
             no read was found.
    :raises NameError: if ``chrom`` is not a reference of the bam file
    """
    bam = bamHandler.openBam(bamFile)
    try:
        # The bound is an int on purpose: ``start + 5e4`` is a float and
        # would turn ``end`` into a float genomic coordinate.
        end = min(end, start + 50000)

        if chrom not in bam.references:
            raise NameError("chromosome {} not found in bam file".format(chrom))

        reads = np.array([(abs(r.template_length), r.query_length)
                          for r in bam.fetch(chrom, start, end)
                          if r.is_proper_pair and r.is_read1])
        if not len(reads):
            # if the previous operation produces an empty list
            # it could be that the data is not paired, then
            # we try without filtering
            reads = np.array([(abs(r.template_length), r.query_length)
                              for r in bam.fetch(chrom, start, end)])
    finally:
        # all values were copied out of the reads, the file can be released
        bam.close()

    if not len(reads):
        # keep the two-column layout even when nothing was found
        reads = np.array([]).reshape(0, 2)

    return reads
def peFragmentSize(bamFile, bamFileIndex=None, return_lengths=False,
                   numberOfProcessors=None, verbose=False):
    """
    Summarises the fragment lengths sampled from a bam file.

    The chromosomes of the bam file are split into chunks that are handed
    to ``getFragmentLength_wrapper`` through ``mapReduce.mapReduce``. The
    arrays returned for the chunks are concatenated and summarised.

    :param bamFile: bamfile name
    :param bamFileIndex: passed on to ``bamHandler.openBam``
    :param return_lengths: bool, if True the sampled values are added to
                           the result under the key 'lengths'
    :param numberOfProcessors: passed on to ``mapReduce.mapReduce``.
                               May be None.
    :param verbose: passed on to ``mapReduce.mapReduce``
    :return: dict with the keys 'sample_size', 'min', 'qtile25', 'mean',
             'median', 'qtile75', 'max' and 'std' (plus 'lengths' if
             requested), or None if no value was sampled
    """
    bamHandle = bamHandler.openBam(bamFile, bamFileIndex)
    try:
        # copy what is needed so that the handle can be released before the
        # workers open the file on their own
        bamFileName = bamHandle.filename
        chromLengths = list(bamHandle.lengths)
        chromSizes = list(zip(bamHandle.references, chromLengths))
    finally:
        bamHandle.close()

    # numberOfProcessors defaults to None. Comparing None with an int only
    # works on Python 2 (where None sorts first), so it is replaced by 1,
    # which gives the same maximum.
    divisor = max(numberOfProcessors or 1, len(chromLengths))
    # a chunk length of zero is never useful, use at least one base
    chunkSize = max(1, int(float(sum(chromLengths)) * 0.3 / divisor))

    imap_res = mapReduce.mapReduce((bamFileName, ),
                                   getFragmentLength_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   numberOfProcessors=numberOfProcessors,
                                   verbose=verbose)

    # np.concatenate raises ValueError when there is nothing to concatenate
    fl = np.concatenate(imap_res) if len(imap_res) else np.array([])

    if not len(fl):
        return None

    fragLength = {'sample_size': len(fl),
                  'min': fl.min(),
                  'qtile25': np.percentile(fl, 25),
                  'mean': np.mean(fl),
                  'median': np.median(fl),
                  'qtile75': np.percentile(fl, 75),
                  'max': fl.max(),
                  'std': np.std(fl)}
    if return_lengths:
        fragLength['lengths'] = fl

    return fragLength
def getFragmentLength_worker(chrom, start, end, bamFile):
    """
    Collects the absolute template length (``tlen``) of the properly
    paired first mates found in the given region.

    At most the first 50 kb of the region are inspected.

    :param chrom: chromosome name
    :param start: int region start
    :param end: int region end
    :param bamFile: bamfile name
    :return: one dimensional np.array with one value per selected read
             (empty if there is none)
    :raises NameError: if ``chrom`` is not a reference of the bam file
    """
    bam = bamHandler.openBam(bamFile)
    try:
        # The bound is an int on purpose: ``start + 5e4`` is a float and
        # would turn ``end`` into a float genomic coordinate.
        end = min(end, start + 50000)

        if chrom not in bam.references:
            raise NameError("chromosome {} not found in bam file".format(chrom))

        return np.array([abs(r.tlen)
                         for r in bam.fetch(chrom, start, end)
                         if r.is_proper_pair and r.is_read1])
    finally:
        # the lengths were copied out of the reads, the file can be released
        bam.close()
def getRead(self, readType):
    """
    Prepare arguments for test.

    Opens ``self.bamFile_PE`` and returns the first read fetched from a
    fixed one-base window on 'chr2'. The window is selected by
    ``readType``: 'paired-reverse', 'single-forward' or 'single-reverse'.
    Any other value selects the window used for a forward paired read.
    """
    bam = bamHandler.openBam(self.bamFile_PE)

    windows = (
        ('paired-reverse', (5000081, 5000082)),
        ('single-forward', (5001491, 5001492)),
        ('single-reverse', (5001700, 5001701)),
    )
    for kind, span in windows:
        if readType == kind:
            break
    else:
        # by default a forward paired read is returned
        span = (5000027, 5000028)

    fetched = list(bam.fetch('chr2', span[0], span[1]))
    return fetched[0]
def _inferred_read_length(read):
    """Return the read length inferred from the CIGAR, or the stored
    sequence length when ``infer_query_length()`` returns None."""
    length = read.infer_query_length()
    if length is None:
        length = read.query_length
    return length


def getFragmentLength_worker(chrom, start, end, bamFile, distanceBetweenBins):
    """
    Queries the reads at the given region for the distance between
    reads and the read length

    Proper pairs (first mate only) are used; if there are none, all reads
    fetched from the region are used instead. The read length is obtained
    in the same way in both cases.

    Parameters
    ----------
    chrom : str
        chromosome name
    start : int
        region start
    end : int
        region end
    bamFile : str
        BAM file name
    distanceBetweenBins : int
        the number of bases at the end of each bin to ignore

    Returns
    -------
    np.array
        an np.array, where first column is fragment length, the
        second is for read length. The shape is (0, 2) when no read
        was found.

    Raises
    ------
    NameError
        if ``chrom`` is not a reference of the BAM file
    """
    bam = bamHandler.openBam(bamFile)
    try:
        # ignore the last ``distanceBetweenBins`` bases but always keep at
        # least one base to query
        end = max(start + 1, end - distanceBetweenBins)

        if chrom not in bam.references:
            raise NameError("chromosome {} not found in bam file".format(chrom))

        reads = np.array([(abs(r.template_length), _inferred_read_length(r))
                          for r in bam.fetch(chrom, start, end)
                          if r.is_proper_pair and r.is_read1])
        if not len(reads):
            # if the previous operation produces an empty list
            # it could be that the data is not paired, then
            # we try without filtering
            reads = np.array([(abs(r.template_length), _inferred_read_length(r))
                              for r in bam.fetch(chrom, start, end)])
    finally:
        # all values were copied out of the reads, the file can be released
        bam.close()

    if not len(reads):
        # keep the two-column layout even when nothing was found
        reads = np.array([]).reshape(0, 2)

    return reads
def countReadsInRegions_worker(chrom, start, end, bamFilesList,
                               stepSize, binLength, defaultFragmentLength,
                               skipZeros=False,
                               extendPairedEnds=True,
                               minMappingQuality=None,
                               ignoreDuplicates=False,
                               samFlag=None,
                               bedRegions=None
                               ):
    """ counts the reads in each bam file at each 'stepSize' position
    within the interval start, end for a 'binLength' window.
    Because the idea is to get counts for window positions at
    different positions for sampling the bins are equally spaced
    between each other and are not one directly next *after* the other.

    If a list of bedRegions is given, then the number of reads
    that overlaps with each region is counted.

    ``extendPairedEnds``, ``minMappingQuality``, ``ignoreDuplicates`` and
    ``samFlag`` are passed on to ``getCoverageOfRegion``.

    The result is a numpy array with one row per bin (or bed region) and
    one column per bam file. A bin is left out when the sum of its counts
    is NaN or, if ``skipZeros`` is set, when that sum is zero.

    >>> test = Tester()

    The transpose is used to get better looking numbers. the first line
    corresponds to the number of reads per bin in the first bamfile.

    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 50, 25, 0))
    array([[ 0.,  0.,  1.,  1.],
           [ 0.,  1.,  1.,  2.]])

    When skipZeros is set to true, those cases in which *all* of the
    bamfiles have zero counts for a certain bin are ignored

    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 50, 25, 0, skipZeros=True))
    array([[ 0.,  1.,  1.],
           [ 1.,  1.,  2.]])

    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 200, 200, 0))
    array([[ 2.],
           [ 4.]])

    Test min mapping quality

    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 50, 25, 0, minMappingQuality=40))
    array([[ 0.,  0.,  0.,  1.],
           [ 0.,  0.,  0.,  1.]])

    Test ignore duplicates

    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 50, 25, 0, ignoreDuplicates=True))
    array([[ 0.,  0.,  1.,  1.],
           [ 0.,  1.,  1.,  1.]])

    Test bed regions:

    >>> bedRegions = [(test.chrom, 10, 20), (test.chrom, 150, 160)]
    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 0, 200, 0, bedRegions=bedRegions))
    array([[ 0.,  1.],
           [ 0.,  2.]])
    """

    if start > end:
        raise NameError("start %d bigger that end %d" % (start, end))

    # flat list of counts; reshaped to (rows, number of bam files) at the end
    subNum_reads_per_bin = []

    rows = 0
    startTime = time.time()
    # NOTE: extendPairedEnds is used as given by the caller. It used to be
    # reset to True at this point, which silently ignored the argument.
    zerosToNans = False

    bamHandlers = [bamHandler.openBam(bam) for bam in bamFilesList]

    regionsToConsider = []
    if bedRegions:
        # each bed region is one bin as long as the region itself
        for chrom, start, end in bedRegions:
            regionsToConsider.append((chrom, start, end, end - start))
    else:
        for i in xrange(start, end, stepSize):
            if i + binLength > end:
                # the bin would stick out of the interval
                break
            regionsToConsider.append((chrom, i, i + binLength, binLength))

    for chrom, start, end, binLength in regionsToConsider:
        avgReadsArray = []
        for bam in bamHandlers:
            avgReadsArray.append(
                getCoverageOfRegion(bam,
                                    chrom, start, end,
                                    binLength,
                                    defaultFragmentLength,
                                    extendPairedEnds,
                                    zerosToNans,
                                    minMappingQuality=minMappingQuality,
                                    ignoreDuplicates=ignoreDuplicates,
                                    samFlag=samFlag
                                    )[0])

        total = sum(avgReadsArray)
        # skip if any of the bam files returns a NaN
        if np.isnan(total):
            continue

        if skipZeros and total == 0:
            continue

        subNum_reads_per_bin.extend(avgReadsArray)
        rows += 1

    if debug:
        elapsed = time.time() - startTime
        # a chunk can finish within the resolution of the clock
        rate = rows / elapsed if elapsed > 0 else float('inf')
        print("%s countReadsInRegions_worker: processing %d "
              "(%.1f per sec) @ %s:%s-%s" %
              (multiprocessing.current_process().name,
               rows, rate, chrom, start, end))

    return np.array(subNum_reads_per_bin).reshape(rows, len(bamFilesList))
def countReadsInRegions_worker(chrom, start, end, bamFilesList,
                               stepSize, binLength, defaultFragmentLength,
                               skipZeros=False):
    """ Counts, for every bam file, the reads found in windows of length
    'binLength' that start every 'stepSize' bases inside the interval
    start, end.

    The windows are meant to sample the interval at different positions,
    which is why they do not have to be consecutive.

    The result is a numpy array with one row per window and one column
    per bam file.

    >>> test = Tester()

    The transpose is used to get better looking numbers. the first line
    corresponds to the number of reads per bin in the first bamfile

    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200, [test.bamFile1, test.bamFile2], 50, 25, 0))
    array([[ 0.,  0.,  1.,  1.],
           [ 0.,  1.,  1.,  2.]])

    When skipZeros is set to true, those cases in which *all* of the
    bamfiles have zero counts for a certain bin are ignored

    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200, [test.bamFile1, test.bamFile2], 50, 25, 0, skipZeros=True))
    array([[ 0.,  1.,  1.],
           [ 1.,  1.,  2.]])

    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200, [test.bamFile1, test.bamFile2], 200, 200, 0))
    array([[ 2.],
           [ 4.]])
    """
    if start > end:
        raise NameError("start %d bigger that end %d" % (start, end))

    began = time.time()
    extend_pairs = True
    zeros_as_nans = False
    handles = [bamHandler.openBam(name) for name in bamFilesList]

    # counts of the windows that are kept, one window after the other
    counts = []
    kept = 0
    for left in xrange(start, end, stepSize):
        right = left + binLength
        if right > end:
            # the window would reach beyond the interval
            break

        per_file = [getCoverageOfRegion(handle, chrom, left, right,
                                        binLength, defaultFragmentLength,
                                        extend_pairs, zeros_as_nans)[0]
                    for handle in handles]

        total = sum(per_file)
        # windows with a NaN, or with no reads at all when skipZeros is
        # set, are dropped
        if np.isnan(total) or (skipZeros and total == 0):
            continue

        counts.extend(per_file)
        kept += 1

    if debug:
        finished = time.time()
        print("%s countReadsInRegions_worker: processing %d (%.1f per sec) @ %s:%s-%s" %
              (multiprocessing.current_process().name,
               kept, kept / (finished - began), chrom, start, end))

    return np.array(counts).reshape(kept, len(bamFilesList))
def countReadsInRegions_worker(chrom, start, end, bamFilesList,
                               stepSize, binLength, defaultFragmentLength,
                               skipZeros=False,
                               extendPairedEnds=True,
                               minMappingQuality=None,
                               ignoreDuplicates=False,
                               samFlag=None,
                               bedRegions=None
                               ):
    """Counts the reads in each bam file at each 'stepSize' position
    within the interval (start, end) for a window or bin of size binLength.

    The stepSize controls the distance between bins. For example,
    a step size of 20 and a bin size of 20 will create bins next to
    each other. If the step size is smaller than the bin size the
    bins will overlap.

    If a list of bedRegions is given, then the number of reads
    that overlaps with each region is counted.

    Parameters
    ----------
    chrom : str
        Chrom name
    start : int
        start coordinate
    end : int
        end coordinate
    bamFilesList : list
        List of name of indexed bam files.
    stepSize : int
        the positions for which the coverage is computed are defined as
        follows: ``range(start, end, stepSize)``. Thus, a stepSize of 1,
        will compute the coverage at each base pair. If the stepSize is
        equal to the binLength then the coverage is computed for
        consecutive bins. If stepSize is smaller than the binLength, then
        the bins will overlap.
    binLength : int
        length of the window/bin
    defaultFragmentLength : int
        see :meth:`deepTools.countReadsPerBin.getFragmentFromRead` method
    skipZeros : bool
        if set, bins for which the sum of the counts is zero are left out
    extendPairedEnds : bool
        passed on to ``getCoverageOfRegion``
    minMappingQuality : int
        passed on to ``getCoverageOfRegion``
    ignoreDuplicates : bool
        passed on to ``getCoverageOfRegion``
    samFlag : int
        passed on to ``getCoverageOfRegion``
    bedRegions : list
        list of (chrom, start, end) tuples. If given, these regions are
        used instead of the bins defined by stepSize and binLength.

    Returns
    -------
    numpy array
        The result is a numpy array that has as rows each bin
        and as columns each bam file.

    Raises
    ------
    NameError
        if ``start`` is bigger than ``end``

    Examples
    --------
    Initialize some useful values

    >>> test = Tester()

    The transpose is used to get better looking numbers. The first line
    corresponds to the number of reads per bin in the first bamfile.

    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 50, 25, 0))
    array([[ 0.,  0.,  1.,  1.],
           [ 0.,  1.,  1.,  2.]])

    When skipZeros is set to true, those cases in which *all* of the
    bamfiles have zero counts for a certain bin are ignored.

    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 50, 25, 0, skipZeros=True))
    array([[ 0.,  1.,  1.],
           [ 1.,  1.,  2.]])
    """

    if start > end:
        raise NameError("start %d bigger that end %d" % (start, end))

    # flat list of counts; reshaped to (rows, number of bam files) at the end
    subNum_reads_per_bin = []

    rows = 0
    startTime = time.time()
    # NOTE: extendPairedEnds is used as given by the caller. It used to be
    # reset to True at this point, which silently ignored the argument.
    zerosToNans = False

    bamHandlers = [bamHandler.openBam(bam) for bam in bamFilesList]

    regionsToConsider = []
    if bedRegions:
        # each bed region is one bin as long as the region itself
        for chrom, start, end in bedRegions:
            regionsToConsider.append((chrom, start, end, end - start))
    else:
        for i in xrange(start, end, stepSize):
            if i + binLength > end:
                # the bin would stick out of the interval
                break
            regionsToConsider.append((chrom, i, i + binLength, binLength))

    for chrom, start, end, binLength in regionsToConsider:
        avgReadsArray = []
        for bam in bamHandlers:
            avgReadsArray.append(
                getCoverageOfRegion(bam,
                                    chrom, start, end,
                                    binLength,
                                    defaultFragmentLength,
                                    extendPairedEnds,
                                    zerosToNans,
                                    minMappingQuality=minMappingQuality,
                                    ignoreDuplicates=ignoreDuplicates,
                                    samFlag=samFlag
                                    )[0])

        total = sum(avgReadsArray)
        # skip if any of the bam files returns a NaN
        if np.isnan(total):
            continue

        if skipZeros and total == 0:
            continue

        subNum_reads_per_bin.extend(avgReadsArray)
        rows += 1

    if debug:
        elapsed = time.time() - startTime
        # a chunk can finish within the resolution of the clock
        rate = rows / elapsed if elapsed > 0 else float('inf')
        print("%s countReadsInRegions_worker: processing %d "
              "(%.1f per sec) @ %s:%s-%s" %
              (multiprocessing.current_process().name,
               rows, rate, chrom, start, end))

    return np.array(subNum_reads_per_bin).reshape(rows, len(bamFilesList))
def run(self):
    """Computes the read counts for the bam files of the object.

    The chromosomes common to all bam files (minus ``self.chrsToSkip``)
    are split into chunks that are processed by
    ``countReadsInRegions_wrapper`` through ``mapReduce.mapReduce``.
    If ``self.stepSize`` is not set it is derived from
    ``self.numberOfSamples`` and stored in the object. If ``self.region``
    is set, ``self.binLength`` is appended to it.

    If ``self.out_file_for_raw_data`` is set, the temporary files returned
    by the workers are appended to it and removed. The output file is not
    closed here.

    Returns
    -------
    numpy array
        Concatenation (along the first axis) of the first element of each
        chunk result.

    Raises
    ------
    ValueError
        if the step size is bigger than the mean chromosome length.
    SystemExit
        if no coverage values could be computed.
    """
    # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
    # workers for analysis. If too short, too much time is spend loading the files
    # if too long, some processors end up free.
    # the following values are empirical
    bamFilesHandlers = [bamHandler.openBam(x) for x in self.bamFilesList]
    try:
        chromSizes, non_common = deeptools.utilities.getCommonChrNames(bamFilesHandlers, verbose=self.verbose)

        # skip chromosome in the list. This is usually for the
        # X chromosome which may have either one copy in a male sample
        # or a mixture of male/female and is unreliable.
        # Also the skip may contain heterochromatic regions and
        # mitochondrial DNA
        if self.chrsToSkip:
            chromSizes = [x for x in chromSizes if x[0] not in self.chrsToSkip]

        chrNames, chrLengths = zip(*chromSizes)

        genomeSize = sum(chrLengths)
        if self.stepSize is None:
            if self.region is None:
                self.stepSize = max(int(float(genomeSize) / self.numberOfSamples), 1)
            else:
                # compute the step size, based on the number of samples
                # and the length of the region studied
                (chrom, start, end) = mapReduce.getUserRegion(chromSizes, self.region)[:3]
                self.stepSize = max(int(float(end - start) / self.numberOfSamples), 1)

        # number of samples is better if large
        if np.mean(chrLengths) < self.stepSize:
            min_num_of_samples = int(genomeSize / np.mean(chrLengths))
            raise ValueError("numberOfSamples has to be bigger than {} ".format(min_num_of_samples))

        max_mapped = max([x.mapped for x in bamFilesHandlers])

        reads_per_bp = float(max_mapped) / genomeSize
        # chunkSize = int(100 / ( reads_per_bp  * len(bamFilesList)) )

        chunkSize = int(self.stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers)))
    finally:
        # the handles are only needed for the statistics above; release
        # them also when an error is raised
        for bam_h in bamFilesHandlers:
            bam_h.close()

    if self.verbose:
        print("step size is {}".format(self.stepSize))

    if self.region:
        # in case a region is used, append the tilesize
        self.region += ":{}".format(self.binLength)

    # use map reduce to call countReadsInRegions_wrapper
    imap_res = mapReduce.mapReduce([],
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   self_=self,
                                   genomeChunkLength=chunkSize,
                                   bedFile=self.bedFile,
                                   region=self.region,
                                   numberOfProcessors=self.numberOfProcessors)

    if self.out_file_for_raw_data:
        if len(non_common):
            sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                             "the chromosomes that were not common between the bam files\n")

        # concatenate intermediary bedgraph files
        for _values, tempFileName in imap_res:
            if tempFileName:
                # concatenate all intermediate tempfiles into one
                with open(tempFileName, 'r') as tempFile:
                    shutil.copyfileobj(tempFile, self.out_file_for_raw_data)
                os.remove(tempFileName)
        # NOTE: self.out_file_for_raw_data is not closed here

    try:
        num_reads_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
        return num_reads_per_bin

    except ValueError:
        if self.bedFile:
            sys.exit('\nNo coverage values could be computed.\n\n'
                     'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                     'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            sys.exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                     'contain mapped reads.')
def count_reads_in_region(self, chrom, start, end, bed_regions_list=None):
    """Counts the reads in each bam file at each 'stepSize' position
    within the interval (start, end) for a window or bin of size binLength.

    The stepSize controls the distance between bins. For example,
    a step size of 20 and a bin size of 20 will create bins next to
    each other. If the step size is smaller than the bin size the
    bins will overlap.

    If a list of bedRegions is given, then the number of reads
    that overlaps with each region is counted.

    Parameters
    ----------
    chrom : str
        Chrom name
    start : int
        start coordinate
    end : int
        end coordinate
    bed_regions_list: list
        List of tuples of the form (chrom, start, end)
        corresponding to bed regions to be processed.
        If None (the default) the bins are defined by ``self.stepSize``
        and ``self.binLength``.

    Returns
    -------
    tuple
        (numpy array, str). The numpy array has as rows each bin
        and as columns each bam file. The string is the name of the
        temporary bed file to which the counts were written if
        ``self.save_data`` is set, and '' otherwise.

    Raises
    ------
    NameError
        if ``start`` is bigger than ``end``
    ValueError
        if ``self.stepSize`` is None

    Examples
    --------
    Initialize some useful values

    >>> test = Tester()
    >>> c = CountReadsPerBin([test.bamFile1, test.bamFile2], 25, 0, stepSize=50)

    Each row corresponds to a bin and each column to a bam file.

    >>> _array, __ = c.count_reads_in_region(test.chrom, 0, 200)
    >>> _array
    array([[ 0.,  0.],
           [ 0.,  1.],
           [ 1.,  1.],
           [ 1.,  2.]])

    """

    if start > end:
        raise NameError("start %d bigger that end %d" % (start, end))

    if self.stepSize is None:
        raise ValueError("stepSize is not set!")
    # flat list of counts; reshaped to (rows, number of bam files) at the end
    subnum_reads_per_bin = []

    rows = 0
    start_time = time.time()

    bam_handlers = [bamHandler.openBam(bam) for bam in self.bamFilesList]

    regionsToConsider = []
    if bed_regions_list is not None:
        # each bed region is one bin as long as the region itself
        for chrom, start, end in bed_regions_list:
            regionsToConsider.append((chrom, start, end, end - start))
    else:
        for i in xrange(start, end, self.stepSize):
            if i + self.binLength > end:
                # the bin would stick out of the interval
                break
            regionsToConsider.append((chrom, i, i + self.binLength, self.binLength))

    if self.save_data:
        _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
        _file_name = _file.name
    else:
        _file = None
        _file_name = ''

    try:
        for chrom, start, end, region_length in regionsToConsider:
            coverage_array = []
            for bam in bam_handlers:
                coverage_array.append(
                    self.get_coverage_of_region(bam, chrom, start, end, region_length)[0])

            subnum_reads_per_bin.extend(coverage_array)
            rows += 1

            if _file is not None:
                # one bed line per region: chrom, start, end and the counts
                _file.write("\t".join(map(str, [chrom, start, end])) + "\t")
                _file.write("\t".join(["{}".format(x) for x in coverage_array]) + "\n")
    finally:
        # close the temporary file also when a region fails
        if _file is not None:
            _file.close()

    if self.verbose:
        elapsed = time.time() - start_time
        # a chunk can finish within the resolution of the clock
        rate = rows / elapsed if elapsed > 0 else float('inf')
        print("%s countReadsInRegions_worker: processing %d "
              "(%.1f per sec) @ %s:%s-%s" %
              (multiprocessing.current_process().name,
               rows, rate, chrom, start, end))

    return np.array(subnum_reads_per_bin).reshape(rows, len(self.bamFilesList)), _file_name
def count_reads_in_region(self, chrom, start, end, bed_regions_list=None):
    """Counts the reads in each bam file at each 'stepSize' position
    within the interval (start, end) for a window or bin of size binLength.

    The stepSize controls the distance between bins. For example,
    a step size of 20 and a bin size of 20 will create bins next to
    each other. If the step size is smaller than the bin size the
    bins will overlap.

    If a list of bedRegions is given, then the number of reads
    that overlaps with each region is counted.

    Bins and bed regions for which ``mapReduce.blOverlap`` reports an
    overlap with ``self.blackList`` are left out.

    Parameters
    ----------
    chrom : str
        Chrom name
    start : int
        start coordinate
    end : int
        end coordinate
    bed_regions_list: list
        List of tuples of the form (chrom, start, end)
        corresponding to bed regions to be processed.
        If None (the default) the bins are defined by ``self.stepSize``
        and ``self.binLength``.

    Returns
    -------
    tuple
        (numpy array, str). The numpy array has as rows each bin
        and as columns each bam file. The string is the name of the
        temporary bed file to which the counts were written if
        ``self.save_data`` is set, and '' otherwise.

    Raises
    ------
    NameError
        if ``start`` is bigger than ``end``
    ValueError
        if ``self.stepSize`` is None

    Examples
    --------
    Initialize some useful values

    >>> test = Tester()
    >>> c = CountReadsPerBin([test.bamFile1, test.bamFile2], 25, 0, stepSize=50)

    Each row corresponds to a bin and each column to a bam file.

    >>> _array, __ = c.count_reads_in_region(test.chrom, 0, 200)
    >>> _array
    array([[ 0.,  0.],
           [ 0.,  1.],
           [ 1.,  1.],
           [ 1.,  2.]])

    """

    if start > end:
        raise NameError("start %d bigger that end %d" % (start, end))

    if self.stepSize is None:
        raise ValueError("stepSize is not set!")
    # flat list of counts; reshaped to (rows, number of bam files) at the end
    subnum_reads_per_bin = []

    rows = 0
    start_time = time.time()

    bam_handlers = [bamHandler.openBam(bam) for bam in self.bamFilesList]

    regionsToConsider = []
    if bed_regions_list is not None:
        # each bed region is one bin as long as the region itself
        for chrom, start, end in bed_regions_list:
            if mapReduce.blOverlap(self.blackList, chrom, [start, end]):
                continue
            regionsToConsider.append((chrom, start, end, end - start))
    else:
        for i in xrange(start, end, self.stepSize):
            if i + self.binLength > end:
                # the bin would stick out of the interval
                break
            if mapReduce.blOverlap(self.blackList, chrom, [i, i + self.binLength]):
                continue
            regionsToConsider.append((chrom, i, i + self.binLength, self.binLength))

    if self.save_data:
        _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
        _file_name = _file.name
    else:
        _file = None
        _file_name = ''

    try:
        for chrom, start, end, region_length in regionsToConsider:
            coverage_array = []
            for bam in bam_handlers:
                coverage_array.append(
                    self.get_coverage_of_region(bam, chrom, start, end, region_length)[0])

            subnum_reads_per_bin.extend(coverage_array)
            rows += 1

            if _file is not None:
                # one bed line per region: chrom, start, end and the counts
                _file.write("\t".join(map(str, [chrom, start, end])) + "\t")
                _file.write("\t".join(["{}".format(x) for x in coverage_array]) + "\n")
    finally:
        # close the temporary file also when a region fails
        if _file is not None:
            _file.close()

    if self.verbose:
        elapsed = time.time() - start_time
        # a chunk can finish within the resolution of the clock
        rate = rows / elapsed if elapsed > 0 else float('inf')
        print("%s countReadsInRegions_worker: processing %d "
              "(%.1f per sec) @ %s:%s-%s" %
              (multiprocessing.current_process().name,
               rows, rate, chrom, start, end))

    return np.array(subnum_reads_per_bin).reshape(rows, len(self.bamFilesList)), _file_name
def _summarize_lengths(lengths):
    """Return the dictionary of descriptive statistics for an array of lengths."""
    return {
        'sample_size': len(lengths),
        'min': lengths.min(),
        'qtile25': np.percentile(lengths, 25),
        'mean': np.mean(lengths),
        'median': np.median(lengths),
        'qtile75': np.percentile(lengths, 75),
        'max': lengths.max(),
        'std': np.std(lengths)
    }


def get_read_and_fragment_length(bamFile, return_lengths=False, blackListFileName=None,
                                 binSize=50000, distanceBetweenBins=1000000,
                                 numberOfProcessors=None, verbose=False):
    """
    Estimates the fragment length and read length through sampling

    The genome is sampled in chunks of ``binSize + distanceBetweenBins``
    bases. While fewer than 1000 values were collected and
    ``distanceBetweenBins`` is bigger than 1, the distance is halved and
    the sampling is repeated. The sampling is done at least once, also
    when ``distanceBetweenBins`` is 0.

    Parameters
    ----------
    bamFile : str
        BAM file name
    return_lengths : bool
        if True, the sampled values are added to the dictionaries under
        the key 'lengths'
    blackListFileName : str
        passed on to ``mapReduce.mapReduce``
    binSize : int
    distanceBetweenBins : int
    numberOfProcessors : int
        passed on to ``mapReduce.mapReduce``
    verbose : bool
        passed on to ``mapReduce.mapReduce``

    Returns
    -------
    tuple
        tuple of two dictionaries, one for the fragment length and the
        other for the read length. The dictionaries summarise the mean,
        median etc. values. The fragment length dictionary is None if the
        mean fragment length is not bigger than zero. Both are None if
        nothing was sampled.
    """

    bam_handle = bamHandler.openBam(bamFile)
    try:
        # copy what is needed so that the handle can be released before the
        # workers open the file on their own. ``list`` keeps the sizes
        # reusable for every sampling round.
        bam_filename = bam_handle.filename
        chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))
    finally:
        bam_handle.close()

    fl = []
    while True:
        stepsize = binSize + distanceBetweenBins
        imap_res = mapReduce.mapReduce(
            (bam_filename, distanceBetweenBins),
            getFragmentLength_wrapper,
            chrom_sizes,
            genomeChunkLength=stepsize,
            blackListFileName=blackListFileName,
            numberOfProcessors=numberOfProcessors,
            verbose=verbose)

        # np.concatenate raises ValueError when there is nothing to concatenate
        fl = np.concatenate(imap_res) if len(imap_res) else []

        if len(fl) >= 1000 or distanceBetweenBins <= 1:
            break
        # not enough values: sample more densely. Floor division keeps the
        # distance (and thus the chunk length) an integer.
        distanceBetweenBins //= 2

    if not len(fl):
        return None, None

    # first column: fragment length, second column: read length
    fragment_length = fl[:, 0]
    read_length = fl[:, 1]

    if fragment_length.mean() > 0:
        fragment_len_dict = _summarize_lengths(fragment_length)
        if return_lengths:
            fragment_len_dict['lengths'] = fragment_length
    else:
        fragment_len_dict = None

    read_len_dict = _summarize_lengths(read_length)
    if return_lengths:
        read_len_dict['lengths'] = read_length

    return fragment_len_dict, read_len_dict
def get_read_and_fragment_length(bamFile, return_lengths=False, blackListFileName=None,
                                 binSize=50000, distanceBetweenBins=1000000,
                                 numberOfProcessors=None, verbose=False):
    """
    Samples a BAM file to estimate its fragment length and read length.

    Parameters
    ----------
    bamFile : str
        BAM file name
    return_lengths : bool
        whether the sampled values are added under the key 'lengths'
    blackListFileName : str
        handed over to ``mapReduce.mapReduce``
    binSize : int
    distanceBetweenBins : int
        halved before each sampling round as long as fewer than 1000
        values were collected
    numberOfProcessors : int
    verbose : bool

    Returns
    -------
    tuple
        two dictionaries, the first for the fragment length and the
        second for the read length, holding summary values such as the
        mean and the median. Both are None if nothing was sampled; the
        first one is None if the mean fragment length is not positive.
    """

    def describe(values):
        # summary statistics of one column of sampled values
        stats = {}
        stats['sample_size'] = len(values)
        stats['min'] = values.min()
        stats['qtile25'] = np.percentile(values, 25)
        stats['mean'] = np.mean(values)
        stats['median'] = np.median(values)
        stats['qtile75'] = np.percentile(values, 75)
        stats['max'] = values.max()
        stats['std'] = np.std(values)
        return stats

    handle = bamHandler.openBam(bamFile)
    sizes = zip(handle.references, handle.lengths)

    # doubled because the distance is halved before the first round
    gap = distanceBetweenBins * 2
    sampled = []
    while len(sampled) < 1000 and gap > 1:
        gap /= 2
        chunk_length = binSize + gap
        per_chunk = mapReduce.mapReduce((handle.filename, gap),
                                        getFragmentLength_wrapper,
                                        sizes,
                                        genomeChunkLength=chunk_length,
                                        blackListFileName=blackListFileName,
                                        numberOfProcessors=numberOfProcessors,
                                        verbose=verbose)
        sampled = np.concatenate(per_chunk)

    if not len(sampled):
        return None, None

    fragments = sampled[:, 0]
    reads = sampled[:, 1]

    fragment_summary = None
    if fragment_length_is_positive(fragments) if False else fragments.mean() > 0:
        fragment_summary = describe(fragments)
        if return_lengths:
            fragment_summary['lengths'] = fragments

    read_summary = describe(reads)
    if return_lengths:
        read_summary['lengths'] = reads

    return fragment_summary, read_summary
def run(self):
    """Computes the read counts for the bam files of the object.

    The chromosomes common to all bam files (minus ``self.chrsToSkip``)
    are split into chunks that are processed by
    ``countReadsInRegions_wrapper`` through ``mapReduce.mapReduce``.
    If ``self.stepSize`` is not set it is derived from
    ``self.numberOfSamples`` and stored in the object. If ``self.region``
    is set, ``self.binLength`` is appended to it.

    If ``self.out_file_for_raw_data`` is set, the temporary files returned
    by the workers are appended to it and removed, and the output file is
    closed.

    Returns
    -------
    numpy array
        Concatenation (along the first axis) of the first element of each
        chunk result.

    Raises
    ------
    ValueError
        if the step size is bigger than the mean chromosome length.
    SystemExit
        if no coverage values could be computed.
    """
    # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
    # workers for analysis. If too short, too much time is spend loading the files
    # if too long, some processors end up free.
    # the following values are empirical
    bamFilesHandlers = [bamHandler.openBam(x) for x in self.bamFilesList]
    try:
        chromSizes, non_common = deeptools.utilities.getCommonChrNames(bamFilesHandlers, verbose=self.verbose)

        # skip chromosome in the list. This is usually for the
        # X chromosome which may have either one copy in a male sample
        # or a mixture of male/female and is unreliable.
        # Also the skip may contain heterochromatic regions and
        # mitochondrial DNA
        if self.chrsToSkip:
            chromSizes = [x for x in chromSizes if x[0] not in self.chrsToSkip]

        chrNames, chrLengths = zip(*chromSizes)

        genomeSize = sum(chrLengths)
        if self.stepSize is None:
            if self.region is None:
                self.stepSize = max(int(float(genomeSize) / self.numberOfSamples), 1)
            else:
                # compute the step size, based on the number of samples
                # and the length of the region studied
                (chrom, start, end) = mapReduce.getUserRegion(chromSizes, self.region)[:3]
                self.stepSize = max(int(float(end - start) / self.numberOfSamples), 1)

        # number of samples is better if large
        if np.mean(chrLengths) < self.stepSize:
            min_num_of_samples = int(genomeSize / np.mean(chrLengths))
            raise ValueError("numberOfSamples has to be bigger than {} ".format(min_num_of_samples))

        max_mapped = max([x.mapped for x in bamFilesHandlers])

        reads_per_bp = float(max_mapped) / genomeSize
        # chunkSize = int(100 / ( reads_per_bp  * len(bamFilesList)) )

        chunkSize = int(self.stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers)))
    finally:
        # the handles are only needed for the statistics above; release
        # them also when an error is raised
        for bam_h in bamFilesHandlers:
            bam_h.close()

    if self.verbose:
        print("step size is {}".format(self.stepSize))

    if self.region:
        # in case a region is used, append the tilesize
        self.region += ":{}".format(self.binLength)

    # use map reduce to call countReadsInRegions_wrapper
    imap_res = mapReduce.mapReduce([],
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   self_=self,
                                   genomeChunkLength=chunkSize,
                                   bedFile=self.bedFile,
                                   blackListFileName=self.blackListFileName,
                                   region=self.region,
                                   numberOfProcessors=self.numberOfProcessors)

    if self.out_file_for_raw_data:
        if len(non_common):
            sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                             "the chromosomes that were not common between the bam files\n")

        # concatenate intermediary bedgraph files
        for _values, tempFileName in imap_res:
            if tempFileName:
                # concatenate all intermediate tempfiles into one
                with open(tempFileName, 'r') as tempFile:
                    shutil.copyfileobj(tempFile, self.out_file_for_raw_data)
                os.remove(tempFileName)

        self.out_file_for_raw_data.close()

    try:
        num_reads_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
        return num_reads_per_bin

    except ValueError:
        if self.bedFile:
            sys.exit('\nNo coverage values could be computed.\n\n'
                     'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                     'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            sys.exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                     'contain mapped reads.')
def getNumReadsPerBin(bamFilesList, binLength, numberOfSamples,
                      defaultFragmentLength, numberOfProcessors=1,
                      skipZeros=True, verbose=False, region=None,
                      bedFile=None, extendPairedEnds=True,
                      minMappingQuality=None,
                      ignoreDuplicates=False,
                      chrsToSkip=[],
                      stepSize=None,
                      samFlag=None):
    r"""
    This function collects read counts (coverage) from several bam files and returns
    a numpy array with the results. This function does not explicitly do the
    coverage computation, instead divides the work into smaller chunks that are
    sent to individual processors.

    Parameters
    ----------
    bamFilesList : list
        List containing the names of indexed bam files. E.g. ['file1.bam', 'file2.bam']

    binLength : int
        Length of the window/bin. This value is overruled by ``bedFile`` if present.

    numberOfSamples : int
        Total number of samples. The genome is divided into ``numberOfSamples``, each
        with a window/bin length equal to ``binLength``. This value is overruled
        by ``stepSize`` in case such value is present and by ``bedFile`` in which
        case the number of samples and bins are defined in the bed file

    defaultFragmentLength : int
        fragment length to extend reads that are not paired. Paired reads are extended to
        the fragment length defined by the mate distance. For Illumina reads, usual values
        are around 300. This value can be determined using the peak caller MACS2 or can be
        approximated by the fragment lengths computed when preparing the library for sequencing.

    numberOfProcessors : int
        Number of processors to use. Default is 1

    skipZeros : bool
        Default is True. This option decides if regions having zero coverage in all bam files
        should be skipped or kept.

    verbose : bool
        Output messages. Default: False

    region : str
        Region to limit the computation in the form chrom:start:end.

    bedFile : str
        Name of a bed file containing the regions for which to compute the coverage. This option
        overrules ``binLength``, ``numberOfSamples`` and ``stepSize``.

    extendPairedEnds : bool
        Whether coverage should be computed for the extended read length (i.e. the region covered
        by the two mates or the regions expected to be covered by single-reads). Default: true

    minMappingQuality : int
        Reads of a mapping quality less than the given value are not considered. Default: None

    ignoreDuplicates : bool
        Whether read duplicates (same start, end position. If paired-end, same start-end for mates) are
        to be excluded. Default: false

    chrsToSkip : list
        List with names of chromosomes that do not want to be included in the coverage computation.
        This is useful to remove unwanted chromosomes (e.g. 'random' or 'Het').

    stepSize : int
        the positions for which the coverage is computed are defined as follows:
        ``range(start, end, stepSize)``. Thus, a stepSize of 1, will compute
        the coverage at each base pair. If the stepSize is equal to the
        binLength then the coverage is computed for consecutive bins. If
        stepSize is smaller than the binLength, then the bins will overlap.

    samFlag : int
        If given, only reads having such flag are considered. For example, to get only
        reads that are the first mates a samFlag of 64 could be used. Similarly, the
        samFlag can be used to select only reads mapping on the forward (or reverse) strand
        or to get only properly paired reads.

    Returns
    -------
    numpy array

        Each row correspond to each bin/bed region and each column correspond to each of
        the bamFiles. If ``skipZeros`` is used, then the result may have less rows
        than expected

    Raises
    ------
    SystemExit
        if no coverage values could be computed.

    Examples
    --------

    The test data contains reads for 200 bp.

    >>> test = Tester()

    The transpose function is used to get a nicer looking output.
    The first line corresponds to the number of reads per bin in bam file 1

    >>> np.transpose(getNumReadsPerBin([test.bamFile1, test.bamFile2],
    ... 50, 4, 0, skipZeros=True))
    array([[ 0.,  1.,  1.],
           [ 1.,  1.,  2.]])
    """
    # sys.exit is used to stop the program; the ``exit`` builtin is added by
    # the site module for interactive use and is not always available
    import sys

    # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
    # workers for analysis. If too short, too much time is spend loading the files
    # if too long, some processors end up free.
    # the following values are empirical

    bamFilesHandlers = [bamHandler.openBam(x) for x in bamFilesList]
    chromSizes = getCommonChrNames(bamFilesHandlers, verbose=verbose)

    # skip chromosome in the list. This is usually for the
    # X chromosome which may have either one copy in a male sample
    # or a mixture of male/female and is unreliable.
    # Also the skip may contain heterochromatic regions and
    # mitochondrial DNA
    if chrsToSkip:
        chromSizes = [x for x in chromSizes if x[0] not in chrsToSkip]

    chrNames, chrLengths = zip(*chromSizes)

    genomeSize = sum(chrLengths)
    max_mapped = max([x.mapped for x in bamFilesHandlers])

    reads_per_bp = float(max_mapped) / genomeSize
    # chunkSize = int(100 / ( reads_per_bp  * len(bamFilesList)) )

    if stepSize is None:
        stepSize = max(int(float(genomeSize) / numberOfSamples), 1)

    chunkSize = int(stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers)))
    # the handles were only needed for the statistics above
    for bam_h in bamFilesHandlers:
        bam_h.close()

    if verbose:
        print("step size is {}".format(stepSize))

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(binLength)

    imap_res = mapReduce.mapReduce((bamFilesList, stepSize, binLength,
                                    defaultFragmentLength, skipZeros,
                                    extendPairedEnds, minMappingQuality,
                                    ignoreDuplicates, samFlag),
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   bedFile=bedFile,
                                   region=region,
                                   numberOfProcessors=numberOfProcessors)

    try:
        num_reads_per_bin = np.concatenate(imap_res, axis=0)
    except ValueError:
        if bedFile:
            sys.exit('\nNo coverage values could be computed.\n\n'
                     'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                     'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            sys.exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                     'contain mapped reads.')

    return num_reads_per_bin
def getNumReadsPerBin(bamFilesList, binLength, numberOfSamples,
                      defaultFragmentLength, numberOfProcessors=1,
                      skipZeros=True, verbose=False, region=None,
                      bedFile=None, extendPairedEnds=True,
                      minMappingQuality=None, ignoreDuplicates=False,
                      chrsToSkip=[], stepSize=None):
    r"""
    Visit a number of sites and return a matrix containing read counts.

    Each row corresponds to one sampled site and each column to one of
    the BAM files.

    If ``chrsToSkip`` is given, those chromosomes are left out of the
    computation. This is usually used for the X chromosome, whose counts
    are lower than those of the autosomes unless a female sample is used.
    For most applications this is irrelevant, but in other cases, like
    when estimating the best scaling factor, it is important.

    Parameters
    ----------
    bamFilesList : list
        BAM file names; each one is opened with ``bamHandler.openBam``.
    binLength : int
        Length of the window in which reads are counted. Forwarded to
        the workers and, when ``region`` is given, appended to it.
    numberOfSamples : int
        Only used when ``stepSize`` is None: the step size is then the
        genome size divided by this number (at least 1).
    defaultFragmentLength : int
        Forwarded unchanged to the workers.
    numberOfProcessors : int
        Forwarded to ``mapReduce.mapReduce``.
    skipZeros : bool
        Forwarded unchanged to the workers.
    verbose : bool
        If true, the step size is printed. Also forwarded to
        ``getCommonChrNames``.
    region : str
        Forwarded to ``mapReduce.mapReduce`` after ``:binLength`` has
        been appended to it.
    bedFile
        Forwarded to ``mapReduce.mapReduce``.
    extendPairedEnds : bool
        Forwarded unchanged to the workers.
    minMappingQuality : int
        Forwarded unchanged to the workers.
    ignoreDuplicates : bool
        Forwarded unchanged to the workers.
    chrsToSkip : list
        Names of chromosomes that are removed before counting. The
        default list is never modified.
    stepSize : int
        Distance between sampled positions. Derived from
        ``numberOfSamples`` when None.

    Returns
    -------
    numpy array
        The per-chunk worker results concatenated along the first axis.

    Raises
    ------
    ValueError
        If no chromosome is left to process, or (raised by
        ``np.concatenate``) if the workers returned nothing.

    Examples
    --------
    The test data contains reads for 200 bp

    >>> test = Tester()

    The transpose function is used to get a nicer looking output.
    The first line corresponds to the number of reads per bin in bam file 1

    >>> np.transpose(getNumReadsPerBin([test.bamFile1, test.bamFile2],
    ... 50, 4, 0, skipZeros=True))
    array([[ 0.,  1.,  1.],
           [ 1.,  1.,  2.]])
    """
    # Try to determine an optimal fraction of the genome (chunkSize) that
    # is sent to workers for analysis. If too short, too much time is
    # spent loading the files; if too long, some processors end up free.
    # The following values are empirical.
    bamFilesHandlers = [bamHandler.openBam(x) for x in bamFilesList]
    try:
        chromSizes = getCommonChrNames(bamFilesHandlers, verbose=verbose)

        # Skip the chromosomes in the list. This is usually for the
        # X chromosome which may have either one copy in a male sample
        # or a mixture of male/female and is unreliable.
        # The skip list may also contain heterochromatic regions and
        # mitochondrial DNA.
        if len(chrsToSkip):
            chromSizes = [x for x in chromSizes if x[0] not in chrsToSkip]

        if not chromSizes:
            # fail with a readable message instead of the tuple-unpacking
            # error that zip(*[]) would otherwise trigger
            raise ValueError("no chromosomes available for counting (none "
                             "in common between the bam files, or all of "
                             "them listed in chrsToSkip)")

        chrNames, chrLengths = zip(*chromSizes)
        genomeSize = sum(chrLengths)
        max_mapped = max([x.mapped for x in bamFilesHandlers])
    finally:
        # the handles are only needed for the header information above;
        # close them even if one of the steps failed
        for bam_h in bamFilesHandlers:
            bam_h.close()

    reads_per_bp = float(max_mapped) / genomeSize

    if stepSize is None:
        stepSize = max(int(float(genomeSize) / numberOfSamples), 1)

    if reads_per_bp > 0:
        chunkSize = int(stepSize * 1e3 /
                        (reads_per_bp * len(bamFilesHandlers)))
    else:
        # No mapped reads in any file: the formula above would divide by
        # zero. The chunk size grows as the read density drops, so the
        # whole genome length is used as the limit.
        chunkSize = genomeSize

    if verbose:
        print("step size is {}".format(stepSize))

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(binLength)

    imap_res = mapReduce.mapReduce((bamFilesList, stepSize, binLength,
                                    defaultFragmentLength, skipZeros,
                                    extendPairedEnds, minMappingQuality,
                                    ignoreDuplicates),
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   bedFile=bedFile,
                                   region=region,
                                   numberOfProcessors=numberOfProcessors)

    num_reads_per_bin = np.concatenate(imap_res, axis=0)
    return num_reads_per_bin
def getNumReadsPerBin(bamFilesList, binLength, numberOfSamples,
                      defaultFragmentLength, numberOfProcessors=1,
                      skipZeros=True, verbose=False, region=None,
                      bedFile=None, extendPairedEnds=True,
                      minMappingQuality=None, ignoreDuplicates=False,
                      chrsToSkip=[], stepSize=None, samFlag=None):
    r"""
    Visit a number of sites and return a matrix containing read counts.

    Each row corresponds to one sampled site and each column to one of
    the BAM files.

    If ``chrsToSkip`` is given, those chromosomes are left out of the
    computation. This is usually used for the X chromosome, whose counts
    are lower than those of the autosomes unless a female sample is used.
    For most applications this is irrelevant, but in other cases, like
    when estimating the best scaling factor, it is important.

    Parameters
    ----------
    bamFilesList : list
        BAM file names; each one is opened with ``bamHandler.openBam``.
    binLength : int
        Length of the window in which reads are counted. Forwarded to
        the workers and, when ``region`` is given, appended to it.
    numberOfSamples : int
        Only used when ``stepSize`` is None: the step size is then the
        genome size divided by this number (at least 1).
    defaultFragmentLength : int
        Forwarded unchanged to the workers.
    numberOfProcessors : int
        Forwarded to ``mapReduce.mapReduce``.
    skipZeros : bool
        Forwarded unchanged to the workers.
    verbose : bool
        If true, the step size is printed. Also forwarded to
        ``getCommonChrNames``.
    region : str
        Forwarded to ``mapReduce.mapReduce`` after ``:binLength`` has
        been appended to it.
    bedFile
        Forwarded to ``mapReduce.mapReduce``. Also selects which message
        is shown when no coverage could be computed.
    extendPairedEnds : bool
        Forwarded unchanged to the workers.
    minMappingQuality : int
        Forwarded unchanged to the workers.
    ignoreDuplicates : bool
        Forwarded unchanged to the workers.
    chrsToSkip : list
        Names of chromosomes that are removed before counting. The
        default list is never modified.
    stepSize : int
        Distance between sampled positions. Derived from
        ``numberOfSamples`` when None.
    samFlag : int
        Forwarded unchanged to the workers.

    Returns
    -------
    numpy array
        The per-chunk worker results concatenated along the first axis.

    Raises
    ------
    ValueError
        If no chromosome is left to process.
    SystemExit
        Via ``exit`` when ``np.concatenate`` cannot combine the worker
        results.

    Examples
    --------
    The test data contains reads for 200 bp

    >>> test = Tester()

    The transpose function is used to get a nicer looking output.
    The first line corresponds to the number of reads per bin in bam file 1

    >>> np.transpose(getNumReadsPerBin([test.bamFile1, test.bamFile2],
    ... 50, 4, 0, skipZeros=True))
    array([[ 0.,  1.,  1.],
           [ 1.,  1.,  2.]])
    """
    # Try to determine an optimal fraction of the genome (chunkSize) that
    # is sent to workers for analysis. If too short, too much time is
    # spent loading the files; if too long, some processors end up free.
    # The following values are empirical.
    bamFilesHandlers = [bamHandler.openBam(x) for x in bamFilesList]
    try:
        chromSizes = getCommonChrNames(bamFilesHandlers, verbose=verbose)

        # Skip the chromosomes in the list. This is usually for the
        # X chromosome which may have either one copy in a male sample
        # or a mixture of male/female and is unreliable.
        # The skip list may also contain heterochromatic regions and
        # mitochondrial DNA.
        if len(chrsToSkip):
            chromSizes = [x for x in chromSizes if x[0] not in chrsToSkip]

        if not chromSizes:
            # fail with a readable message instead of the tuple-unpacking
            # error that zip(*[]) would otherwise trigger
            raise ValueError("no chromosomes available for counting (none "
                             "in common between the bam files, or all of "
                             "them listed in chrsToSkip)")

        chrNames, chrLengths = zip(*chromSizes)
        genomeSize = sum(chrLengths)
        max_mapped = max([x.mapped for x in bamFilesHandlers])
    finally:
        # the handles are only needed for the header information above;
        # close them even if one of the steps failed
        for bam_h in bamFilesHandlers:
            bam_h.close()

    reads_per_bp = float(max_mapped) / genomeSize

    if stepSize is None:
        stepSize = max(int(float(genomeSize) / numberOfSamples), 1)

    if reads_per_bp > 0:
        chunkSize = int(stepSize * 1e3 /
                        (reads_per_bp * len(bamFilesHandlers)))
    else:
        # No mapped reads in any file: the formula above would divide by
        # zero before the "no coverage" message below could be reached.
        # The chunk size grows as the read density drops, so the whole
        # genome length is used as the limit.
        chunkSize = genomeSize

    if verbose:
        print("step size is {}".format(stepSize))

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(binLength)

    imap_res = mapReduce.mapReduce((bamFilesList, stepSize, binLength,
                                    defaultFragmentLength, skipZeros,
                                    extendPairedEnds, minMappingQuality,
                                    ignoreDuplicates, samFlag),
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   bedFile=bedFile,
                                   region=region,
                                   numberOfProcessors=numberOfProcessors)

    try:
        num_reads_per_bin = np.concatenate(imap_res, axis=0)
    except ValueError:
        if bedFile:
            exit('\nNo coverage values could be computed.\n\n'
                 'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                 'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                 'contain mapped reads.')

    return num_reads_per_bin
def openBam(bamFile, bamIndex=None):
    """
    Open a BAM file by delegating to ``bamHandler.openBam``.

    :param bamFile: bamfile name
    :param bamIndex: bamfile index name, passed through unchanged
    :return: whatever ``bamHandler.openBam`` returns for these arguments
    """
    bam_handle = bamHandler.openBam(bamFile, bamIndex)
    return bam_handle
def countReadsInRegions_worker(chrom, start, end, bamFilesList,
                               stepSize, binLength, defaultFragmentLength,
                               skipZeros=False, extendPairedEnds=True,
                               minMappingQuality=None, ignoreDuplicates=False,
                               bedRegions=None):
    """
    Count the reads of each bam file at each 'stepSize' position within
    the interval start, end for a 'binLength' window.

    Because the idea is to get counts for window positions at different
    positions for sampling, the bins are equally spaced between each
    other and are not one directly next *after* the other.

    If a list of bedRegions is given, then the number of reads that
    overlap with each region is counted instead, and ``start``, ``end``,
    ``stepSize`` and ``binLength`` are not used to define the bins.

    :param chrom: chromosome name
    :param start: int region start
    :param end: int region end
    :param bamFilesList: list of bamfile names
    :param stepSize: int distance between the start of consecutive bins
    :param binLength: int length of each bin
    :param defaultFragmentLength: passed on to getCoverageOfRegion
    :param skipZeros: bool, skip bins for which the counts of all
                      bam files add up to zero
    :param extendPairedEnds: bool, passed on to getCoverageOfRegion
    :param minMappingQuality: passed on to getCoverageOfRegion
    :param ignoreDuplicates: passed on to getCoverageOfRegion
    :param bedRegions: list of (chrom, start, end) tuples
    :return: np.array with one row per retained bin/region and one
             column per bam file
    :raises NameError: if start is bigger than end

    >>> test = Tester()

    The transpose is used to get better looking numbers. The first line
    corresponds to the number of reads per bin in the first bamfile.

    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 50, 25, 0))
    array([[ 0.,  0.,  1.,  1.],
           [ 0.,  1.,  1.,  2.]])

    When skipZeros is set to true, those cases in which *all* of the
    bamfiles have zero counts for a certain bin are ignored

    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 50, 25, 0, skipZeros=True))
    array([[ 0.,  1.,  1.],
           [ 1.,  1.,  2.]])

    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 200, 200, 0))
    array([[ 2.],
           [ 4.]])

    Test min mapping quality

    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 50, 25, 0, minMappingQuality=40))
    array([[ 0.,  0.,  0.,  1.],
           [ 0.,  0.,  0.,  1.]])

    Test ignore duplicates

    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 50, 25, 0, ignoreDuplicates=True))
    array([[ 0.,  0.,  1.,  1.],
           [ 0.,  1.,  1.,  1.]])

    Test bed regions:

    >>> bedRegions = [(test.chrom, 10, 20), (test.chrom, 150, 160)]
    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 0, 200, 0, bedRegions=bedRegions))
    array([[ 0.,  1.],
           [ 0.,  2.]])
    """
    if start > end:
        raise NameError("start %d bigger that end %d" % (start, end))

    startTime = time.time()
    zerosToNans = False

    # each entry is (chrom, start, end, length of the region)
    if bedRegions:
        regionsToConsider = [
            (bed_chrom, bed_start, bed_end, bed_end - bed_start)
            for bed_chrom, bed_start, bed_end in bedRegions]
    else:
        regionsToConsider = []
        for bin_start in xrange(start, end, stepSize):
            # only full-length bins that fit inside the interval are used
            if bin_start + binLength > end:
                break
            regionsToConsider.append(
                (chrom, bin_start, bin_start + binLength, binLength))

    # flat list of counts; reshaped to (rows, number of bam files) on return
    subNum_reads_per_bin = []
    rows = 0
    # the debug message reports the last region visited, or the interval
    # itself when there was no region to visit
    last_chrom, last_start, last_end = chrom, start, end

    bamHandlers = [bamHandler.openBam(bam) for bam in bamFilesList]
    try:
        for reg_chrom, reg_start, reg_end, reg_length in regionsToConsider:
            last_chrom, last_start, last_end = reg_chrom, reg_start, reg_end
            avgReadsArray = [
                getCoverageOfRegion(bam, reg_chrom, reg_start, reg_end,
                                    reg_length, defaultFragmentLength,
                                    extendPairedEnds, zerosToNans,
                                    minMappingQuality=minMappingQuality,
                                    ignoreDuplicates=ignoreDuplicates)[0]
                for bam in bamHandlers]
            total = sum(avgReadsArray)

            # skip if any of the bam files returns a NaN
            if np.isnan(total):
                continue

            if skipZeros and total == 0:
                continue

            subNum_reads_per_bin.extend(avgReadsArray)
            rows += 1
    finally:
        # release the file handles, also when a region fails
        for bam in bamHandlers:
            bam.close()

    if debug:
        endTime = time.time()
        print("%s countReadsInRegions_worker: processing %d "
              "(%.1f per sec) @ %s:%s-%s" %
              (multiprocessing.current_process().name,
               rows, rows / (endTime - startTime),
               last_chrom, last_start, last_end))

    return np.array(subNum_reads_per_bin).reshape(rows, len(bamFilesList))
def get_read_and_fragment_length(bamFile, bamFileIndex=None,
                                 return_lengths=False,
                                 numberOfProcessors=None, verbose=False):
    """
    Estimates the fragment length and read length through sampling

    :param bamFile: bamfile name
    :param bamFileIndex: bamfile index name
    :param return_lengths: bool, if true the sampled values are added to
                           each returned dictionary under the key
                           'lengths'
    :param numberOfProcessors: passed on to mapReduce
    :param verbose: passed on to mapReduce
    :return: tuple of two dictionaries, one for the fragment length and
             the other for the read length. The dictionaries summarise
             the mean, median etc. values. Both are None when nothing
             was sampled; the fragment length dictionary is also None
             when the mean fragment length is not positive.
    """
    def _summarise(values):
        # summary statistics shared by both returned dictionaries
        return {'sample_size': len(values),
                'min': values.min(),
                'qtile25': np.percentile(values, 25),
                'mean': np.mean(values),
                'median': np.median(values),
                'qtile75': np.percentile(values, 75),
                'max': values.max(),
                'std': np.std(values)}

    bam_handle = bamHandler.openBam(bamFile, bamFileIndex)
    chrom_sizes = zip(bam_handle.references, bam_handle.lengths)

    # `numberOfProcessors or 1` keeps max() from comparing None with an int
    chunk_size = int(float(sum(bam_handle.lengths)) * 0.3 /
                     max(numberOfProcessors or 1, len(bam_handle.lengths)))
    # avoid small chunk sizes to split the computation
    chunk_size = max(chunk_size, 100000)

    imap_res = mapReduce.mapReduce((bam_handle.filename, ),
                                   getFragmentLength_wrapper,
                                   chrom_sizes,
                                   genomeChunkLength=chunk_size,
                                   numberOfProcessors=numberOfProcessors,
                                   verbose=verbose)

    fl = np.concatenate(imap_res)

    if not len(fl):
        return None, None

    # first column: fragment lengths, second column: read lengths
    fragment_length = fl[:, 0]
    read_length = fl[:, 1]

    if fragment_length.mean() > 0:
        fragment_len_dict = _summarise(fragment_length)
        if return_lengths:
            fragment_len_dict['lengths'] = fragment_length
    else:
        # no usable fragment lengths: there is no dictionary to attach
        # 'lengths' to, so None is returned as it is
        fragment_len_dict = None

    read_len_dict = _summarise(read_length)
    if return_lengths:
        read_len_dict['lengths'] = read_length

    return fragment_len_dict, read_len_dict
def get_read_and_fragment_length(bamFile, return_lengths=False,
                                 numberOfProcessors=None, verbose=False):
    """
    Estimates the fragment length and read length through sampling

    Parameters
    ----------
    bamFile : str
        BAM file name
    return_lengths : bool
        If true, the sampled values are added to each returned
        dictionary under the key 'lengths'.
    numberOfProcessors : int
        Passed on to ``mapReduce.mapReduce``.
    verbose : bool
        Passed on to ``mapReduce.mapReduce``.

    Returns
    -------
    tuple
        Two dictionaries, one for the fragment length and the other for
        the read length. The dictionaries summarise the mean, median
        etc. values. Both are None when nothing was sampled; the
        fragment length dictionary is also None when the mean fragment
        length is not positive.
    """
    def _summarise(values):
        # summary statistics shared by both returned dictionaries
        return {'sample_size': len(values),
                'min': values.min(),
                'qtile25': np.percentile(values, 25),
                'mean': np.mean(values),
                'median': np.median(values),
                'qtile75': np.percentile(values, 75),
                'max': values.max(),
                'std': np.std(values)}

    bam_handle = bamHandler.openBam(bamFile)
    chrom_sizes = zip(bam_handle.references, bam_handle.lengths)

    # `numberOfProcessors or 1` keeps max() from comparing None with an int
    chunk_size = int(float(sum(bam_handle.lengths)) * 0.3 /
                     max(numberOfProcessors or 1, len(bam_handle.lengths)))
    # avoid small chunk sizes to split the computation
    chunk_size = max(chunk_size, 100000)

    imap_res = mapReduce.mapReduce((bam_handle.filename, ),
                                   getFragmentLength_wrapper,
                                   chrom_sizes,
                                   genomeChunkLength=chunk_size,
                                   numberOfProcessors=numberOfProcessors,
                                   verbose=verbose)

    fl = np.concatenate(imap_res)

    if not len(fl):
        return None, None

    # first column: fragment lengths, second column: read lengths
    fragment_length = fl[:, 0]
    read_length = fl[:, 1]

    if fragment_length.mean() > 0:
        fragment_len_dict = _summarise(fragment_length)
        if return_lengths:
            fragment_len_dict['lengths'] = fragment_length
    else:
        # no usable fragment lengths: there is no dictionary to attach
        # 'lengths' to, so None is returned as it is
        fragment_len_dict = None

    read_len_dict = _summarise(read_length)
    if return_lengths:
        read_len_dict['lengths'] = read_length

    return fragment_len_dict, read_len_dict