# Shared imports assumed by the snippets below. These functions are excerpts
# from deepTools modules, so the exact import paths may vary by version;
# names such as getFragmentLength_wrapper, countReadsInRegions_wrapper,
# getGenomeChunkLength, getChromSizes, bedGraphToBigWig, scaleCoverage,
# Tester, cfg and debug come from the surrounding modules and are not
# defined here.
import os
import shutil
import sys

import numpy as np
import pyBigWig

import deeptools.utilities
from deeptools import bamHandler, mapReduce


def peFragmentSize(bamFile, bamFileIndex=None, return_lengths=False,
                   numberOfProcessors=None, verbose=False):

    bamHandle = bamHandler.openBam(bamFile, bamFileIndex)
    chromSizes = zip(bamHandle.references, bamHandle.lengths)
    chunkSize = int(float(sum(bamHandle.lengths)) * 0.3 /
                    max(numberOfProcessors, len(bamHandle.lengths)))
    imap_res = mapReduce.mapReduce((bamHandle.filename, ),
                                   getFragmentLength_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   numberOfProcessors=numberOfProcessors,
                                   verbose=verbose)

    fl = np.concatenate(imap_res)
    if len(fl):
        fragLength = {'sample_size': len(fl),
                      'min': fl.min(),
                      'qtile25': np.percentile(fl, 25),
                      'mean': np.mean(fl),
                      'median': np.median(fl),
                      'qtile75': np.percentile(fl, 75),
                      'max': fl.max(),
                      'std': np.std(fl)}
        if return_lengths:
            fragLength['lengths'] = fl
    else:
        fragLength = None
    return fragLength

def getScorePerBin(bigwigFilesList, binLength, numberOfProcessors=1,
                   skipZeros=True, verbose=False, region=None,
                   bedFile=None, chrsToSkip=[]):
    """
    This function returns a matrix containing scores (median) for the
    coverage of fragments within a region. Each row corresponds to a
    sampled region. Likewise, each column corresponds to a bigwig file.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()

    >>> np.transpose(getScorePerBin([test.bwFile1, test.bwFile2], 50, 5,))
    array([[ 1.,  1.,  2.,  2.],
           [ 1.,  1.,  1.,  3.]])
    """
    # Try to determine an optimal fraction of the genome (chunkSize)
    # that is sent to workers for analysis. If too short, too much time
    # is spent loading the files; if too long, some processors end up idle.
    # The following values are empirical.

    # get the list of common chromosome names and sizes
    chromSizes = getChromSizes(bigwigFilesList)

    # skip chromosomes in the list. This is usually for the
    # X chromosome, which may have either one copy in a male sample
    # or a mixture of male/female samples and is therefore unreliable.
    # The skip list may also contain heterochromatic regions and
    # mitochondrial DNA.
    if len(chrsToSkip):
        chromSizes = [x for x in chromSizes if x[0] not in chrsToSkip]

    chrNames, chrLengths = zip(*chromSizes)
    genomeSize = sum(chrLengths)
    max_mapped = max(map(lambda x: getNumberOfFragmentsPerRegionFromBigWig(x, chromSizes),
                         bigwigFilesList))
    reads_per_bp = float(max_mapped) / genomeSize
    stepSize = binLength  # for consecutive bins
    chunkSize = int(stepSize * 1e3 / (reads_per_bp * len(bigwigFilesList)))

    if verbose:
        print "step size is {}".format(stepSize)

    if region:
        # in case a region is used, append the tile size
        region += ":{}".format(binLength)

    # mapReduce( (staticArgs), func, chromSize, etc. )
    imap_res = mapReduce.mapReduce((bigwigFilesList, stepSize, binLength, skipZeros),
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   bedFile=bedFile,
                                   region=region,
                                   numberOfProcessors=numberOfProcessors)

    score_per_bin = np.concatenate(imap_res, axis=0)
    return score_per_bin

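# A minimal worked sketch (hypothetical numbers, not from the source) of the
# chunkSize heuristic above: with an average of 0.5 reads per base pair, two
# bigwig files and consecutive 50 bp bins, each worker task receives roughly
# 50 kb of genome.
reads_per_bp = 0.5   # assumed average coverage
n_files = 2          # number of bigwig files
step_size = 50       # binLength, for consecutive bins
chunk_size = int(step_size * 1e3 / (reads_per_bp * n_files))
assert chunk_size == 50000
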
def writeBedGraph(bamOrBwFileList, outputFileName, fragmentLength,
                  func, funcArgs, tileSize=25, region=None,
                  numberOfProcessors=None, format="bedgraph",
                  extendPairedEnds=True, missingDataAsZero=False,
                  smoothLength=0, fixed_step=False):
    r"""
    Given a list of bam files, a function and its arguments,
    this method writes a bedgraph (or bigwig) file for a partition of the
    genome into tiles of a given size and a value for each tile that
    corresponds to the given function and that is related to the coverage
    underlying the tile.
    """
    bamHandlers = [bamHandler.openBam(indexedFile)
                   for indexedFile, fileFormat in bamOrBwFileList
                   if fileFormat == 'bam']
    if len(bamHandlers):
        genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
        # check if all bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize, __ = getCommonChrNames(bamHandlers, verbose=False)
    else:
        genomeChunkLength = int(10e6)
        bigwigs = [fileName for fileName, fileFormat in bamOrBwFileList
                   if fileFormat == 'bigwig']
        cCommon = []
        chromNamesAndSize = {}
        for bw in bigwigs:
            bwh = pyBigWig.open(bw)
            for chromName, size in bwh.chroms().items():
                if chromName in chromNamesAndSize:
                    cCommon.append(chromName)
                    if chromNamesAndSize[chromName] != size:
                        print "\nWARNING\n" \
                            "Chromosome {} lengths reported in the " \
                            "bigwig files differ.\n{} for {}\n" \
                            "{} for {}.\n\nThe smallest " \
                            "length will be used".format(
                                chromName, chromNamesAndSize[chromName],
                                bigwigs[0], size, bw)
                        chromNamesAndSize[chromName] = min(
                            chromNamesAndSize[chromName], size)
                else:
                    chromNamesAndSize[chromName] = size
            bwh.close()

        # get the list of common chromosome names and sizes
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.iteritems()
                             if k in cCommon]

    if region:
        # in case a region is used, append the tile size
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce((tileSize, fragmentLength, bamOrBwFileList,
                               func, funcArgs, extendPairedEnds, smoothLength,
                               missingDataAsZero, fixed_step),
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files
    outFile = open(outputFileName + ".bg", 'wb')
    for tempFileName in res:
        if tempFileName:
            # concatenate all intermediate tempfiles into one bedgraph file
            shutil.copyfileobj(open(tempFileName, 'rb'), outFile)
            os.remove(tempFileName)

    bedGraphFile = outFile.name
    outFile.close()
    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print "output file: %s" % (outputFileName)
    else:
        bedGraphToBigWig(chromNamesAndSize, bedGraphFile, outputFileName, True)
        if debug:
            print "output file: %s" % (outputFileName)
        os.remove(bedGraphFile)

def get_read_and_fragment_length(bamFile, bamFileIndex=None,
                                 return_lengths=False,
                                 numberOfProcessors=None, verbose=False):
    """
    Estimates the fragment length and read length through sampling

    :param bamFile: bam file name
    :param bamFileIndex: bam file index name
    :param return_lengths: bool, whether to also return the raw length arrays
    :param numberOfProcessors:
    :param verbose:
    :return: tuple of two dictionaries, one for the fragment length and the
        other for the read length. The dictionaries summarise the mean,
        median etc. values
    """

    bam_handle = bamHandler.openBam(bamFile, bamFileIndex)
    chrom_sizes = zip(bam_handle.references, bam_handle.lengths)

    chunk_size = int(float(sum(bam_handle.lengths)) * 0.3 /
                     max(numberOfProcessors, len(bam_handle.lengths)))
    # avoid overly small chunk sizes when splitting the computation
    chunk_size = max(chunk_size, 100000)
    imap_res = mapReduce.mapReduce((bam_handle.filename, ),
                                   getFragmentLength_wrapper,
                                   chrom_sizes,
                                   genomeChunkLength=chunk_size,
                                   numberOfProcessors=numberOfProcessors,
                                   verbose=verbose)

    fl = np.concatenate(imap_res)
    if len(fl):
        fragment_length = fl[:, 0]
        read_length = fl[:, 1]
        if fragment_length.mean() > 0:
            fragment_len_dict = {'sample_size': len(fragment_length),
                                 'min': fragment_length.min(),
                                 'qtile25': np.percentile(fragment_length, 25),
                                 'mean': np.mean(fragment_length),
                                 'median': np.median(fragment_length),
                                 'qtile75': np.percentile(fragment_length, 75),
                                 'max': fragment_length.max(),
                                 'std': np.std(fragment_length)}
        else:
            fragment_len_dict = None
        # guard against fragment_len_dict being None (e.g. single-end data)
        if return_lengths and fragment_len_dict is not None:
            fragment_len_dict['lengths'] = fragment_length

        read_len_dict = {'sample_size': len(read_length),
                         'min': read_length.min(),
                         'qtile25': np.percentile(read_length, 25),
                         'mean': np.mean(read_length),
                         'median': np.median(read_length),
                         'qtile75': np.percentile(read_length, 75),
                         'max': read_length.max(),
                         'std': np.std(read_length)}
        if return_lengths:
            read_len_dict['lengths'] = read_length
    else:
        fragment_len_dict = None
        read_len_dict = None

    return fragment_len_dict, read_len_dict

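# Hypothetical usage sketch ('example.bam' and the processor count are
# placeholders, not from the source). The fragment-length dictionary is None
# for single-end data, so both results are checked before use.
frag_len_d, read_len_d = get_read_and_fragment_length('example.bam',
                                                      numberOfProcessors=4)
if read_len_d is not None:
    print "median read length: {}".format(read_len_d['median'])
if frag_len_d is not None:
    print "median fragment length: {}".format(frag_len_d['median'])
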
def run(self):
    # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
    # workers for analysis. If too short, too much time is spent loading the files;
    # if too long, some processors end up idle.
    # The following values are empirical.
    bamFilesHandlers = [bamHandler.openBam(x) for x in self.bamFilesList]
    chromSizes, non_common = deeptools.utilities.getCommonChrNames(bamFilesHandlers,
                                                                   verbose=self.verbose)

    # skip chromosomes in the list. This is usually for the
    # X chromosome, which may have either one copy in a male sample
    # or a mixture of male/female samples and is therefore unreliable.
    # The skip list may also contain heterochromatic regions and
    # mitochondrial DNA.
    if len(self.chrsToSkip):
        chromSizes = [x for x in chromSizes if x[0] not in self.chrsToSkip]

    chrNames, chrLengths = zip(*chromSizes)
    genomeSize = sum(chrLengths)
    if self.stepSize is None:
        if self.region is None:
            self.stepSize = max(int(float(genomeSize) / self.numberOfSamples), 1)
        else:
            # compute the step size based on the number of samples
            # and the length of the region studied
            (chrom, start, end) = mapReduce.getUserRegion(chromSizes, self.region)[:3]
            self.stepSize = max(int(float(end - start) / self.numberOfSamples), 1)

    # a large number of samples gives a better estimate
    if np.mean(chrLengths) < self.stepSize:
        min_num_of_samples = int(genomeSize / np.mean(chrLengths))
        raise ValueError("numberOfSamples has to be bigger than {}".format(min_num_of_samples))

    max_mapped = max([x.mapped for x in bamFilesHandlers])

    reads_per_bp = float(max_mapped) / genomeSize
    # chunkSize = int(100 / (reads_per_bp * len(bamFilesList)))
    chunkSize = int(self.stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers)))
    [bam_h.close() for bam_h in bamFilesHandlers]

    if self.verbose:
        print "step size is {}".format(self.stepSize)

    if self.region:
        # in case a region is used, append the tile size
        self.region += ":{}".format(self.binLength)

    # use map reduce to call countReadsInRegions_wrapper
    imap_res = mapReduce.mapReduce([],
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   self_=self,
                                   genomeChunkLength=chunkSize,
                                   bedFile=self.bedFile,
                                   region=self.region,
                                   numberOfProcessors=self.numberOfProcessors)

    if self.out_file_for_raw_data:
        if len(non_common):
            sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                             "the chromosomes that were not common between the bam files\n")

        # concatenate intermediary bedgraph files
        for _values, tempFileName in imap_res:
            if tempFileName:
                # concatenate all intermediate tempfiles into one
                shutil.copyfileobj(open(tempFileName, 'r'), self.out_file_for_raw_data)
                os.remove(tempFileName)

        # self.out_file_for_raw_data.close()

    try:
        num_reads_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
        return num_reads_per_bin
    except ValueError:
        if self.bedFile:
            sys.exit('\nNo coverage values could be computed.\n\n'
                     'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                     'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            sys.exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                     'contain mapped reads.')

def run(self):
    # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
    # workers for analysis. If too short, too much time is spent loading the files;
    # if too long, some processors end up idle.
    # The following values are empirical.
    bamFilesHandlers = [bamHandler.openBam(x) for x in self.bamFilesList]
    chromSizes, non_common = deeptools.utilities.getCommonChrNames(bamFilesHandlers,
                                                                   verbose=self.verbose)

    # skip chromosomes in the list. This is usually for the
    # X chromosome, which may have either one copy in a male sample
    # or a mixture of male/female samples and is therefore unreliable.
    # The skip list may also contain heterochromatic regions and
    # mitochondrial DNA.
    if len(self.chrsToSkip):
        chromSizes = [x for x in chromSizes if x[0] not in self.chrsToSkip]

    chrNames, chrLengths = zip(*chromSizes)
    genomeSize = sum(chrLengths)
    if self.stepSize is None:
        if self.region is None:
            self.stepSize = max(int(float(genomeSize) / self.numberOfSamples), 1)
        else:
            # compute the step size based on the number of samples
            # and the length of the region studied
            (chrom, start, end) = mapReduce.getUserRegion(chromSizes, self.region)[:3]
            self.stepSize = max(int(float(end - start) / self.numberOfSamples), 1)

    # a large number of samples gives a better estimate
    if np.mean(chrLengths) < self.stepSize:
        min_num_of_samples = int(genomeSize / np.mean(chrLengths))
        raise ValueError("numberOfSamples has to be bigger than {}".format(min_num_of_samples))

    max_mapped = max([x.mapped for x in bamFilesHandlers])

    reads_per_bp = float(max_mapped) / genomeSize
    # chunkSize = int(100 / (reads_per_bp * len(bamFilesList)))
    chunkSize = int(self.stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers)))
    [bam_h.close() for bam_h in bamFilesHandlers]

    if self.verbose:
        print "step size is {}".format(self.stepSize)

    if self.region:
        # in case a region is used, append the tile size
        self.region += ":{}".format(self.binLength)

    # use map reduce to call countReadsInRegions_wrapper
    imap_res = mapReduce.mapReduce([],
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   self_=self,
                                   genomeChunkLength=chunkSize,
                                   bedFile=self.bedFile,
                                   blackListFileName=self.blackListFileName,
                                   region=self.region,
                                   numberOfProcessors=self.numberOfProcessors)

    if self.out_file_for_raw_data:
        if len(non_common):
            sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                             "the chromosomes that were not common between the bam files\n")

        # concatenate intermediary bedgraph files
        for _values, tempFileName in imap_res:
            if tempFileName:
                # concatenate all intermediate tempfiles into one
                shutil.copyfileobj(open(tempFileName, 'r'), self.out_file_for_raw_data)
                os.remove(tempFileName)

        self.out_file_for_raw_data.close()

    try:
        num_reads_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
        return num_reads_per_bin
    except ValueError:
        if self.bedFile:
            sys.exit('\nNo coverage values could be computed.\n\n'
                     'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                     'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            sys.exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                     'contain mapped reads.')

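# Worked example (hypothetical human-sized genome, not from the source) of
# the step-size rule above: sampling one million positions from a 3.1 Gb
# genome places the sampled bins about 3.1 kb apart.
genome_size = 3100000000
number_of_samples = 1000000
sampled_step = max(int(float(genome_size) / number_of_samples), 1)
assert sampled_step == 3100
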
def get_read_and_fragment_length(bamFile, return_lengths=False,
                                 blackListFileName=None,
                                 binSize=50000, distanceBetweenBins=1000000,
                                 numberOfProcessors=None, verbose=False):
    """
    Estimates the fragment length and read length through sampling

    Parameters
    ----------
    bamFile : str
        BAM file name
    return_lengths : bool
    numberOfProcessors : int
    verbose : bool
    binSize : int
    distanceBetweenBins : int

    Returns
    -------
    tuple
        Tuple of two dictionaries, one for the fragment length and the other
        for the read length. The dictionaries summarise the mean, median etc.
        values.
    """
    bam_handle = bamHandler.openBam(bamFile)
    chrom_sizes = zip(bam_handle.references, bam_handle.lengths)

    # the loop below halves distanceBetweenBins before each round,
    # so double it up front to start at the requested spacing
    distanceBetweenBins *= 2
    fl = []
    while len(fl) < 1000 and distanceBetweenBins > 1:
        distanceBetweenBins /= 2
        stepsize = binSize + distanceBetweenBins
        imap_res = mapReduce.mapReduce((bam_handle.filename, distanceBetweenBins),
                                       getFragmentLength_wrapper,
                                       chrom_sizes,
                                       genomeChunkLength=stepsize,
                                       blackListFileName=blackListFileName,
                                       numberOfProcessors=numberOfProcessors,
                                       verbose=verbose)

        fl = np.concatenate(imap_res)

    if len(fl):
        fragment_length = fl[:, 0]
        read_length = fl[:, 1]
        if fragment_length.mean() > 0:
            fragment_len_dict = {'sample_size': len(fragment_length),
                                 'min': fragment_length.min(),
                                 'qtile25': np.percentile(fragment_length, 25),
                                 'mean': np.mean(fragment_length),
                                 'median': np.median(fragment_length),
                                 'qtile75': np.percentile(fragment_length, 75),
                                 'max': fragment_length.max(),
                                 'std': np.std(fragment_length)}
        else:
            fragment_len_dict = None

        if return_lengths and fragment_len_dict is not None:
            fragment_len_dict['lengths'] = fragment_length

        read_len_dict = {'sample_size': len(read_length),
                         'min': read_length.min(),
                         'qtile25': np.percentile(read_length, 25),
                         'mean': np.mean(read_length),
                         'median': np.median(read_length),
                         'qtile75': np.percentile(read_length, 75),
                         'max': read_length.max(),
                         'std': np.std(read_length)}
        if return_lengths:
            read_len_dict['lengths'] = read_length
    else:
        fragment_len_dict = None
        read_len_dict = None

    return fragment_len_dict, read_len_dict

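# A minimal sketch (hypothetical round count, not from the source) of the
# sampling back-off above: the spacing between sampled bins starts at 1 Mb,
# and every round that yields fewer than 1000 fragments halves it, so the
# genome is sampled ever more densely until enough fragments are seen.
distance = 1000000 * 2
spacings = []
for _round in range(4):  # pretend four rounds were needed
    distance //= 2
    spacings.append(distance)
assert spacings == [1000000, 500000, 250000, 125000]
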
def getNumReadsPerBin(bamFilesList, binLength, numberOfSamples,
                      defaultFragmentLength, numberOfProcessors=1,
                      skipZeros=True, verbose=False, region=None,
                      bedFile=None, extendPairedEnds=True,
                      minMappingQuality=None, ignoreDuplicates=False,
                      chrsToSkip=[], stepSize=None, samFlag=None):
    r"""
    This function visits a number of sites and returns a matrix containing
    read counts. Each row corresponds to one sampled site and each column
    to one of the bam files.

    If chrsToSkip is given, counts from those chromosomes are filtered out.
    Unless a female sample is used, counts on such chromosomes (e.g. the X
    chromosome) are lower than on autosomes. For most applications this is
    irrelevant, but for other cases, like estimating the best scaling
    factor, it is important.

    The test data contains reads for 200 bp
    >>> test = Tester()

    The transpose function is used to get a nicer looking output.
    The first line corresponds to the number of reads per bin in bam file 1

    >>> np.transpose(getNumReadsPerBin([test.bamFile1, test.bamFile2],
    ... 50, 4, 0, skipZeros=True))
    array([[ 0.,  1.,  1.],
           [ 1.,  1.,  2.]])

    >>> aa = np.transpose(getNumReadsPerBin([test.bamFile1, test.bamFile2],
    ... 50, 4, 0, skipZeros=True))
    >>> np.savez('/tmp/aa', aa)
    """
    # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
    # workers for analysis. If too short, too much time is spent loading the files;
    # if too long, some processors end up idle.
    # The following values are empirical.
    bamFilesHandlers = [bamHandler.openBam(x) for x in bamFilesList]
    chromSizes = getCommonChrNames(bamFilesHandlers, verbose=verbose)

    # skip chromosomes in the list. This is usually for the
    # X chromosome, which may have either one copy in a male sample
    # or a mixture of male/female samples and is therefore unreliable.
    # The skip list may also contain heterochromatic regions and
    # mitochondrial DNA.
    if len(chrsToSkip):
        chromSizes = [x for x in chromSizes if x[0] not in chrsToSkip]

    chrNames, chrLengths = zip(*chromSizes)
    genomeSize = sum(chrLengths)
    max_mapped = max([x.mapped for x in bamFilesHandlers])
    reads_per_bp = float(max_mapped) / genomeSize
    # chunkSize = int(100 / (reads_per_bp * len(bamFilesList)))
    if stepSize is None:
        stepSize = max(int(float(genomeSize) / numberOfSamples), 1)

    chunkSize = int(stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers)))
    [bam_h.close() for bam_h in bamFilesHandlers]

    if verbose:
        print "step size is {}".format(stepSize)

    if region:
        # in case a region is used, append the tile size
        region += ":{}".format(binLength)

    imap_res = mapReduce.mapReduce((bamFilesList, stepSize, binLength,
                                    defaultFragmentLength, skipZeros,
                                    extendPairedEnds, minMappingQuality,
                                    ignoreDuplicates, samFlag),
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   bedFile=bedFile,
                                   region=region,
                                   numberOfProcessors=numberOfProcessors)

    try:
        num_reads_per_bin = np.concatenate(imap_res, axis=0)
    except ValueError:
        if bedFile:
            exit('\nNo coverage values could be computed.\n\n'
                 'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                 'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                 'contain mapped reads.')

    return num_reads_per_bin

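# Hypothetical call illustrating samFlag above (the bam file names and the
# sampling parameters are placeholders, not from the source): SAM flag 64
# (0x40) marks the first mate of a pair, so only first mates contribute
# to the counts here.
first_mate_counts = getNumReadsPerBin(['sample1.bam', 'sample2.bam'],
                                      50, 10000, 300,
                                      numberOfProcessors=4, samFlag=64)
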
def getNumReadsPerBin(bamFilesList, binLength, numberOfSamples,
                      defaultFragmentLength, numberOfProcessors=1,
                      skipZeros=True, verbose=False, region=None,
                      bedFile=None, extendPairedEnds=True,
                      minMappingQuality=None, ignoreDuplicates=False,
                      chrsToSkip=[], stepSize=None):
    r"""
    This function visits a number of sites and returns a matrix containing
    read counts. Each row corresponds to one sampled site and each column
    to one of the bam files.

    If chrsToSkip is given, counts from those chromosomes are filtered out.
    Unless a female sample is used, counts on such chromosomes (e.g. the X
    chromosome) are lower than on autosomes. For most applications this is
    irrelevant, but for other cases, like estimating the best scaling
    factor, it is important.

    The test data contains reads for 200 bp
    >>> test = Tester()

    The transpose function is used to get a nicer looking output.
    The first line corresponds to the number of reads per bin in bam file 1

    >>> np.transpose(getNumReadsPerBin([test.bamFile1, test.bamFile2],
    ... 50, 4, 0, skipZeros=True))
    array([[ 0.,  1.,  1.],
           [ 1.,  1.,  2.]])

    >>> aa = np.transpose(getNumReadsPerBin([test.bamFile1, test.bamFile2],
    ... 50, 4, 0, skipZeros=True))
    >>> np.savez('/tmp/aa', aa)
    """
    # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
    # workers for analysis. If too short, too much time is spent loading the files;
    # if too long, some processors end up idle.
    # The following values are empirical.
    bamFilesHandlers = [bamHandler.openBam(x) for x in bamFilesList]
    chromSizes = getCommonChrNames(bamFilesHandlers, verbose=verbose)

    # skip chromosomes in the list. This is usually for the
    # X chromosome, which may have either one copy in a male sample
    # or a mixture of male/female samples and is therefore unreliable.
    # The skip list may also contain heterochromatic regions and
    # mitochondrial DNA.
    if len(chrsToSkip):
        chromSizes = [x for x in chromSizes if x[0] not in chrsToSkip]

    chrNames, chrLengths = zip(*chromSizes)
    genomeSize = sum(chrLengths)
    max_mapped = max([x.mapped for x in bamFilesHandlers])
    reads_per_bp = float(max_mapped) / genomeSize
    # chunkSize = int(100 / (reads_per_bp * len(bamFilesList)))
    if stepSize is None:
        stepSize = max(int(float(genomeSize) / numberOfSamples), 1)

    chunkSize = int(stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers)))
    [bam_h.close() for bam_h in bamFilesHandlers]

    if verbose:
        print "step size is {}".format(stepSize)

    if region:
        # in case a region is used, append the tile size
        region += ":{}".format(binLength)

    imap_res = mapReduce.mapReduce((bamFilesList, stepSize, binLength,
                                    defaultFragmentLength, skipZeros,
                                    extendPairedEnds, minMappingQuality,
                                    ignoreDuplicates),
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   bedFile=bedFile,
                                   region=region,
                                   numberOfProcessors=numberOfProcessors)

    num_reads_per_bin = np.concatenate(imap_res, axis=0)
    return num_reads_per_bin

def writeBedGraph(bamOrBwFileList, outputFileName, fragmentLength,
                  func, funcArgs, tileSize=25, region=None,
                  numberOfProcessors=None, format="bedgraph",
                  extendPairedEnds=True, zerosToNans=True,
                  smoothLength=0, fixed_step=False):
    r"""
    Given a list of bam files, a function and its arguments,
    this method writes a bedgraph (or bigwig) file for a partition of the
    genome into tiles of a given size and a value for each tile that
    corresponds to the given function and that is related to the coverage
    underlying the tile.

    >>> test = Tester()
    >>> outFile = tempfile.NamedTemporaryFile()
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> writeBedGraph([(test.bamFile1, 'bam')], outFile.name,
    ... 0, scaleCoverage, funcArgs, region='3R:0:200')
    >>> open(outFile.name, 'r').readlines()
    ['3R\t100\t200\t1.0\n']
    >>> outFile.close()
    """
    bigwig_info = cfg.config.get('external_tools', 'bigwig_info')
    bamHandlers = [openBam(indexedFile)
                   for indexedFile, fileFormat in bamOrBwFileList
                   if fileFormat == 'bam']
    if len(bamHandlers):
        genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
        # check if all bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize = getCommonChrNames(bamHandlers, verbose=False)
    else:
        genomeChunkLength = int(10e6)
        bigwigs = [fileName for fileName, fileFormat in bamOrBwFileList
                   if fileFormat == 'bigwig']
        cCommon = []
        chromNamesAndSize = {}
        for bw in bigwigs:
            inBlock = False
            for line in os.popen(
                    "{} -chroms {}".format(bigwig_info, bw)).readlines():
                if line[0:10] == "chromCount":
                    inBlock = True
                    continue
                if line[0:5] == "bases":
                    break
                if inBlock:
                    chromName, _, size = line.strip().split(" ")
                    size = int(size)
                    if chromName in chromNamesAndSize:
                        cCommon.append(chromName)
                        if chromNamesAndSize[chromName] != size:
                            print "\nWARNING\n" \
                                "Chromosome {} lengths reported in the " \
                                "bigwig files differ.\n{} for {}\n" \
                                "{} for {}.\n\nThe smallest " \
                                "length will be used".format(
                                    chromName, chromNamesAndSize[chromName],
                                    bigwigs[0], size, bw)
                            chromNamesAndSize[chromName] = min(
                                chromNamesAndSize[chromName], size)
                    else:
                        chromNamesAndSize[chromName] = size

        # get the list of common chromosome names and sizes
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.iteritems()
                             if k in cCommon]

    if region:
        # in case a region is used, append the tile size
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce((tileSize, fragmentLength, bamOrBwFileList,
                               func, funcArgs, extendPairedEnds, smoothLength,
                               zerosToNans, fixed_step),
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files
    outFile = open(outputFileName + ".bg", 'wb')
    for tempFileName in res:
        if tempFileName:
            # concatenate all intermediate tempfiles into one bedgraph file
            shutil.copyfileobj(open(tempFileName, 'rb'), outFile)
            os.remove(tempFileName)

    bedGraphFile = outFile.name
    outFile.close()
    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print "output file: %s" % (outputFileName)
    else:
        bedGraphToBigWig(chromNamesAndSize, bedGraphFile, outputFileName, False)
        if debug:
            print "output file: %s" % (outputFileName)
        os.remove(bedGraphFile)

def writeBedGraph(bamFilesList, outputFileName, fragmentLength,
                  func, funcArgs, tileSize=25, region=None,
                  numberOfProcessors=None, format="bedgraph",
                  extendPairedEnds=True, zerosToNans=True, smoothLength=0,
                  minMappingQuality=None, ignoreDuplicates=False,
                  fragmentFromRead_func=None):
    r"""
    Given a list of bam files, a function and its arguments,
    this method writes a bedgraph (or bigwig) file for a partition of the
    genome into tiles of a given size and a value for each tile that
    corresponds to the given function and that is related to the coverage
    underlying the tile.

    >>> test = Tester()
    >>> outFile = tempfile.NamedTemporaryFile()
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> writeBedGraph([test.bamFile1], outFile.name,
    ... 0, scaleCoverage, funcArgs, region='3R:0:200')
    >>> open(outFile.name, 'r').readlines()
    ['3R\t100\t200\t1.0\n']
    >>> outFile.close()
    """
    bamHandlers = [openBam(x) for x in bamFilesList]
    genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
    # check if all bam files correspond to the same species
    # by comparing the chromosome names:
    chromNamesAndSize = getCommonChrNames(bamHandlers, verbose=False)

    if region:
        # in case a region is used, append the tile size
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce((tileSize, fragmentLength, bamFilesList,
                               func, funcArgs, extendPairedEnds, smoothLength,
                               zerosToNans, minMappingQuality,
                               ignoreDuplicates,
                               fragmentFromRead_func),
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files
    outFile = open(outputFileName + ".bg", 'wb')
    for tempFileName in res:
        if tempFileName:
            # concatenate all intermediate tempfiles into one bedgraph file
            shutil.copyfileobj(open(tempFileName, 'rb'), outFile)
            os.remove(tempFileName)

    bedGraphFile = outFile.name
    outFile.close()
    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print "output file: %s" % (outputFileName)
    else:
        bedGraphToBigWig(chromNamesAndSize, bedGraphFile, outputFileName, False)
        if debug:
            print "output file: %s" % (outputFileName)
        os.remove(bedGraphFile)

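# Hedged sketch of the func/funcArgs contract implied by the doctests above:
# a tile-level transform that receives the coverage computed for a tile plus
# the funcArgs dictionary and returns the value written to the bedgraph.
# This mirrors how scaleCoverage with funcArgs={'scaleFactor': 1.0} appears
# to be used, but it is an assumption, not the actual deepTools
# implementation.
def scaleCoverageSketch(tileCoverage, args):
    # multiply the tile coverage by a user-supplied scale factor
    return args['scaleFactor'] * tileCoverage
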
def getNumReadsPerBin(bamFilesList, binLength, numberOfSamples,
                      defaultFragmentLength, numberOfProcessors=1,
                      skipZeros=True, verbose=False, region=None,
                      bedFile=None, extendPairedEnds=True,
                      minMappingQuality=None, ignoreDuplicates=False,
                      chrsToSkip=[], stepSize=None, samFlag=None):
    r"""
    This function collects read counts (coverage) from several bam files
    and returns a numpy array with the results. This function does not
    explicitly do the coverage computation; instead it divides the work
    into smaller chunks that are sent to individual processors.

    Parameters
    ----------
    bamFilesList : list
        List containing the names of indexed bam files. E.g.
        ['file1.bam', 'file2.bam']

    binLength : int
        Length of the window/bin. This value is overruled by ``bedFile``
        if present.

    numberOfSamples : int
        Total number of samples. The genome is divided into
        ``numberOfSamples``, each with a window/bin length equal to
        ``binLength``. This value is overruled by ``stepSize`` in case
        such a value is present and by ``bedFile``, in which case the
        number of samples and bins are defined in the bed file.

    defaultFragmentLength : int
        fragment length to extend reads that are not paired. Paired reads
        are extended to the fragment length defined by the mate distance.
        For Illumina reads, usual values are around 300. This value can
        be determined using the peak caller MACS2 or can be approximated
        by the fragment lengths computed when preparing the library for
        sequencing.

    numberOfProcessors : int
        Number of processors to use. Default is 1.

    skipZeros : bool
        Default is True. This option decides if regions having zero
        coverage in all bam files should be skipped or kept.

    verbose : bool
        Output messages. Default: False

    region : str
        Region to limit the computation in the form chrom:start:end.

    bedFile : str
        Name of a bed file containing the regions for which to compute
        the coverage. This option overrules ``binLength``,
        ``numberOfSamples`` and ``stepSize``.

    extendPairedEnds : bool
        Whether coverage should be computed for the extended read length
        (i.e. the region covered by the two mates or the regions expected
        to be covered by single reads). Default: true

    minMappingQuality : int
        Reads with a mapping quality less than the given value are not
        considered. Default: None

    ignoreDuplicates : bool
        Whether read duplicates (same start and end position; if
        paired-end, same start-end for mates) are to be excluded.
        Default: false

    chrsToSkip : list
        List with names of chromosomes that should not be included in the
        coverage computation. This is useful to remove unwanted
        chromosomes (e.g. 'random' or 'Het').

    stepSize : int
        the positions for which the coverage is computed are defined as
        follows: ``range(start, end, stepSize)``. Thus, a stepSize of 1
        will compute the coverage at each base pair. If the stepSize is
        equal to the binLength, then the coverage is computed for
        consecutive bins. If stepSize is smaller than the binLength, then
        the bins will overlap.

    samFlag : int
        If given, only reads having such a flag are considered. For
        example, to get only reads that are the first mates, a samFlag
        of 64 could be used. Similarly, the samFlag can be used to select
        only reads mapping on the forward (or reverse) strand or to get
        only properly paired reads.

    Returns
    -------
    numpy array
        Each row corresponds to a bin/bed region and each column to one
        of the bam files. If ``skipZeros`` is used, then the result may
        have fewer rows than expected.

    Examples
    --------
    The test data contains reads for 200 bp.

    >>> test = Tester()

    The transpose function is used to get a nicer looking output.
    The first line corresponds to the number of reads per bin in bam
    file 1.

    >>> np.transpose(getNumReadsPerBin([test.bamFile1, test.bamFile2],
    ... 50, 4, 0, skipZeros=True))
    array([[ 0.,  1.,  1.],
           [ 1.,  1.,  2.]])
    """
    # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
    # workers for analysis. If too short, too much time is spent loading the files;
    # if too long, some processors end up idle.
    # The following values are empirical.
    bamFilesHandlers = [bamHandler.openBam(x) for x in bamFilesList]
    chromSizes = getCommonChrNames(bamFilesHandlers, verbose=verbose)

    # skip chromosomes in the list. This is usually for the
    # X chromosome, which may have either one copy in a male sample
    # or a mixture of male/female samples and is therefore unreliable.
    # The skip list may also contain heterochromatic regions and
    # mitochondrial DNA.
    if len(chrsToSkip):
        chromSizes = [x for x in chromSizes if x[0] not in chrsToSkip]

    chrNames, chrLengths = zip(*chromSizes)
    genomeSize = sum(chrLengths)
    max_mapped = max([x.mapped for x in bamFilesHandlers])
    reads_per_bp = float(max_mapped) / genomeSize
    # chunkSize = int(100 / (reads_per_bp * len(bamFilesList)))
    if stepSize is None:
        stepSize = max(int(float(genomeSize) / numberOfSamples), 1)

    chunkSize = int(stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers)))
    [bam_h.close() for bam_h in bamFilesHandlers]

    if verbose:
        print "step size is {}".format(stepSize)

    if region:
        # in case a region is used, append the tile size
        region += ":{}".format(binLength)

    imap_res = mapReduce.mapReduce((bamFilesList, stepSize, binLength,
                                    defaultFragmentLength, skipZeros,
                                    extendPairedEnds, minMappingQuality,
                                    ignoreDuplicates, samFlag),
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   bedFile=bedFile,
                                   region=region,
                                   numberOfProcessors=numberOfProcessors)

    try:
        num_reads_per_bin = np.concatenate(imap_res, axis=0)
    except ValueError:
        if bedFile:
            exit('\nNo coverage values could be computed.\n\n'
                 'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                 'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                 'contain mapped reads.')

    return num_reads_per_bin

def writeBedGraph(bamOrBwFileList, outputFileName, fragmentLength,
                  func, funcArgs, tileSize=25, region=None,
                  numberOfProcessors=None, format="bedgraph",
                  extendPairedEnds=True, missingDataAsZero=False,
                  smoothLength=0, fixed_step=False):
    r"""
    Given a list of bam files, a function and its arguments,
    this method writes a bedgraph (or bigwig) file for a partition of the
    genome into tiles of a given size and a value for each tile that
    corresponds to the given function and that is related to the coverage
    underlying the tile.

    >>> test = Tester()
    >>> outFile = tempfile.NamedTemporaryFile()
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> writeBedGraph([(test.bamFile1, 'bam')], outFile.name,
    ... 0, scaleCoverage, funcArgs, region='3R:0:200')
    >>> open(outFile.name, 'r').readlines()
    ['3R\t100\t200\t1.0\n']
    >>> outFile.close()
    """
    bigwig_info = cfg.config.get('external_tools', 'bigwig_info')
    bamHandlers = [openBam(indexedFile)
                   for indexedFile, fileFormat in bamOrBwFileList
                   if fileFormat == 'bam']
    if len(bamHandlers):
        genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
        # check if all bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize = getCommonChrNames(bamHandlers, verbose=False)
    else:
        genomeChunkLength = int(10e6)
        bigwigs = [fileName for fileName, fileFormat in bamOrBwFileList
                   if fileFormat == 'bigwig']
        cCommon = []
        chromNamesAndSize = {}
        for bw in bigwigs:
            inBlock = False
            for line in os.popen(
                    "{} -chroms {}".format(bigwig_info, bw)).readlines():
                if line[0:10] == "chromCount":
                    inBlock = True
                    continue
                if line[0:5] == "bases":
                    break
                if inBlock:
                    chromName, _, size = line.strip().split(" ")
                    size = int(size)
                    if chromName in chromNamesAndSize:
                        cCommon.append(chromName)
                        if chromNamesAndSize[chromName] != size:
                            print "\nWARNING\n" \
                                "Chromosome {} lengths reported in the " \
                                "bigwig files differ.\n{} for {}\n" \
                                "{} for {}.\n\nThe smallest " \
                                "length will be used".format(
                                    chromName, chromNamesAndSize[chromName],
                                    bigwigs[0], size, bw)
                            chromNamesAndSize[chromName] = min(
                                chromNamesAndSize[chromName], size)
                    else:
                        chromNamesAndSize[chromName] = size

        # get the list of common chromosome names and sizes
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.iteritems()
                             if k in cCommon]

    if region:
        # in case a region is used, append the tile size
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce((tileSize, fragmentLength, bamOrBwFileList,
                               func, funcArgs, extendPairedEnds, smoothLength,
                               missingDataAsZero, fixed_step),
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files
    outFile = open(outputFileName + ".bg", 'wb')
    for tempFileName in res:
        if tempFileName:
            # concatenate all intermediate tempfiles into one bedgraph file
            shutil.copyfileobj(open(tempFileName, 'rb'), outFile)
            os.remove(tempFileName)

    bedGraphFile = outFile.name
    outFile.close()
    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print "output file: %s" % (outputFileName)
    else:
        bedGraphToBigWig(chromNamesAndSize, bedGraphFile, outputFileName, False)
        if debug:
            print "output file: %s" % (outputFileName)
        os.remove(bedGraphFile)

def get_read_and_fragment_length(bamFile, return_lengths=False,
                                 numberOfProcessors=None, verbose=False):
    """
    Estimates the fragment length and read length through sampling

    Parameters
    ----------
    bamFile : str
        BAM file name
    return_lengths : bool
    numberOfProcessors : int
    verbose : bool

    Returns
    -------
    tuple
        Tuple of two dictionaries, one for the fragment length and the other
        for the read length. The dictionaries summarise the mean, median etc.
        values.
    """
    bam_handle = bamHandler.openBam(bamFile)
    chrom_sizes = zip(bam_handle.references, bam_handle.lengths)

    chunk_size = int(float(sum(bam_handle.lengths)) * 0.3 /
                     max(numberOfProcessors, len(bam_handle.lengths)))
    # avoid overly small chunk sizes when splitting the computation
    chunk_size = max(chunk_size, 100000)
    imap_res = mapReduce.mapReduce((bam_handle.filename, ),
                                   getFragmentLength_wrapper,
                                   chrom_sizes,
                                   genomeChunkLength=chunk_size,
                                   numberOfProcessors=numberOfProcessors,
                                   verbose=verbose)

    fl = np.concatenate(imap_res)
    if len(fl):
        fragment_length = fl[:, 0]
        read_length = fl[:, 1]
        if fragment_length.mean() > 0:
            fragment_len_dict = {'sample_size': len(fragment_length),
                                 'min': fragment_length.min(),
                                 'qtile25': np.percentile(fragment_length, 25),
                                 'mean': np.mean(fragment_length),
                                 'median': np.median(fragment_length),
                                 'qtile75': np.percentile(fragment_length, 75),
                                 'max': fragment_length.max(),
                                 'std': np.std(fragment_length)}
        else:
            fragment_len_dict = None
        # guard against fragment_len_dict being None (e.g. single-end data)
        if return_lengths and fragment_len_dict is not None:
            fragment_len_dict['lengths'] = fragment_length

        read_len_dict = {'sample_size': len(read_length),
                         'min': read_length.min(),
                         'qtile25': np.percentile(read_length, 25),
                         'mean': np.mean(read_length),
                         'median': np.median(read_length),
                         'qtile75': np.percentile(read_length, 75),
                         'max': read_length.max(),
                         'std': np.std(read_length)}
        if return_lengths:
            read_len_dict['lengths'] = read_length
    else:
        fragment_len_dict = None
        read_len_dict = None

    return fragment_len_dict, read_len_dict