def tabulateGCcontent(fragmentLength, chrNameBitToBam, stepSize, chromSizes,
                      numberOfProcessors=None, verbose=False, region=None):
    r"""
    Tabulate GC content over the genome in parallel and merge the results.

    The chromosomes are cut into chunks that ``mapReduce.mapReduce`` hands
    to ``tabulateGCcontent_wrapper``.  Every chunk yields a pair
    ``(subN_gc, subF_gc)``; the pairs are summed element-wise and turned
    into a per-bin ratio.

    Parameters
    ----------
    fragmentLength, stepSize, verbose
        Forwarded unchanged to the worker.
    chrNameBitToBam : dict
        Mapping whose inverse (value -> key) is forwarded to the worker.
        Only chromosomes that appear among its values are processed.
    chromSizes : iterable of (name, size)
    numberOfProcessors, region
        Forwarded to ``mapReduce.mapReduce``.

    Returns
    -------
    numpy.ndarray
        One row per bin with the columns ``F_gc``, ``N_gc`` and
        ``F_gc / N_gc * sum(N_gc) / sum(F_gc)`` (1 where either tally is 0).

    Exits via ``sys.exit`` when no fragments were tallied at all.

    >>> test = Tester()
    >>> arg = test.testTabulateGCcontent()
    >>> res = tabulateGCcontent(*arg)
    >>> res
    array([[  0.        ,  18.        ,   1.        ],
           [  3.        ,  63.        ,   0.45815996],
           [  7.        , 159.        ,   0.42358185],
           [ 25.        , 192.        ,   1.25278115],
           [ 28.        , 215.        ,   1.25301422],
           [ 16.        , 214.        ,   0.71935396],
           [ 12.        ,  95.        ,   1.21532959],
           [  9.        ,  24.        ,   3.60800971],
           [  3.        ,  11.        ,   2.62400706],
           [  0.        ,   0.        ,   1.        ],
           [  0.        ,   0.        ,   1.        ]])
    """
    import sys  # local import so the block does not depend on file-level imports

    global global_vars

    chrNameBamToBit = {bam_name: bit_name
                       for bit_name, bam_name in chrNameBitToBam.items()}
    chunkSize = int(min(2e6, 4e5 / global_vars['reads_per_bp']))
    chromSizes = [(name, size) for name, size in chromSizes
                  if name in chrNameBamToBit]

    imap_res = mapReduce.mapReduce((stepSize, fragmentLength, chrNameBamToBit, verbose),
                                   tabulateGCcontent_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   numberOfProcessors=numberOfProcessors,
                                   region=region)

    # The first chunk seeds the accumulators, later chunks are added in place.
    F_gc = N_gc = None
    for subN_gc, subF_gc in imap_res:
        if F_gc is None:
            F_gc, N_gc = subF_gc, subN_gc
        else:
            F_gc += subF_gc
            N_gc += subN_gc

    # Without any fragment the scaling factor is undefined.
    if F_gc is None or sum(F_gc) == 0:
        sys.exit("No fragments included in the sampling! Consider decreasing "
                 "(or maybe increasing) the --sampleSize parameter")

    # True division: flooring the factor would bias every ratio downwards.
    scaling = float(sum(N_gc)) / float(sum(F_gc))

    R_gc = np.array([float(f) / n * scaling if n and f > 0 else 1
                     for f, n in zip(F_gc, N_gc)])

    data = np.transpose(np.vstack((F_gc, N_gc, R_gc)))
    return data
def fraction_kept(args):
    """
    Estimate the fraction of alignments that survive the read filters.

    Two numbers are accumulated from the sampled chunks:

     (A) the total number of alignments sampled
     (B) the number of alignments ignored because of any of
         --samFlagInclude, --samFlagExclude, --minMappingQuality,
         --ignoreDuplicates, --minFragmentLength, --maxFragmentLength

    Black list regions are handed to ``mapReduce`` and are therefore
    already accounted for.

    The sampling target is every alignment when the file holds at most
    100,000 of them, otherwise the smaller of 100,000 and 1% of the mapped
    alignments.  The genome is processed in chunks whose length starts at
    1 Mb and is halved (never below 50 kb) until enough alignments were
    seen.  What part of each chunk is inspected is decided by
    ``getFractionKept_wrapper``, which is not visible here.

    Parameters
    ----------
    args : argparse.Namespace-like
        Must provide ``bam``, ``ignoreForNormalization``,
        ``blackListFileName``, ``numberOfProcessors`` and ``verbose``.

    Returns
    -------
    float
        ``1 - filtered / total``; 1.0 when nothing was sampled.
    """
    filtered = 0
    total = 0
    distanceBetweenBins = 2000000
    bam_handle = bamHandler.openBam(args.bam)
    try:
        bam_mapped = utilities.bam_total_reads(bam_handle, args.ignoreForNormalization)
        num_needed_to_sample = max(bam_mapped if bam_mapped <= 100000 else 0,
                                   min(100000, 0.01 * bam_mapped))

        if args.ignoreForNormalization:
            chrom_sizes = [(chrom_name, bam_handle.lengths[idx])
                           for idx, chrom_name in enumerate(bam_handle.references)
                           if chrom_name not in args.ignoreForNormalization]
        else:
            chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

        while total < num_needed_to_sample and distanceBetweenBins > 50000:
            # Halve the chunk length on every pass.  Floor division keeps it
            # an integer; "/" would hand a float chunk length to mapReduce.
            distanceBetweenBins = max(distanceBetweenBins // 2, 50000)

            res = mapReduce.mapReduce((bam_handle.filename, args),
                                      getFractionKept_wrapper,
                                      chrom_sizes,
                                      genomeChunkLength=distanceBetweenBins,
                                      blackListFileName=args.blackListFileName,
                                      numberOfProcessors=args.numberOfProcessors,
                                      verbose=args.verbose)

            if len(res):
                filtered, total = np.sum(res, axis=0)
    finally:
        # The handle is only needed for the header and the file name.
        bam_handle.close()

    if total == 0:
        # This should never happen
        total = 1

    return 1.0 - float(filtered) / float(total)
def getScorePerBin(bigwigFilesList, binLength,
                   numberOfProcessors=1, skipZeros=True,
                   verbose=False, region=None,
                   bedFile=None,
                   stepSize=None,
                   chrsToSkip=[]):
    """
    Return a matrix of per-bin scores, one column per bigwig file.

    Each row corresponds to a sampled region and each column to one of the
    files in ``bigwigFilesList``.  The score itself is computed by
    ``countReadsInRegions_wrapper`` (the original documentation describes
    it as the median coverage of the region).

    Parameters
    ----------
    bigwigFilesList : list of str
    binLength : int
        Length of every bin.
    numberOfProcessors : int
    skipZeros : bool
        Forwarded to the worker.
    verbose : bool
        Print the step size that is used.
    region : str, optional
        When given, ``":<binLength>"`` is appended before it is passed on.
    bedFile : optional
        Forwarded to ``mapReduce.mapReduce``.
    stepSize : int, optional
        Distance between bin starts; defaults to ``binLength``
        (consecutive bins).
    chrsToSkip : list of str
        Chromosome names to leave out.  The list is only read here.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()
    >>> np.transpose(getScorePerBin([test.bwFile1, test.bwFile2], 50, 5,))
    array([[ 1.,  1.,  2.,  2.],
           [ 1.,  1.,  1.,  3.]])
    """
    # get list of common chromosome names and sizes
    chromSizes = getChromSizes(bigwigFilesList)

    # skip chromosome in the list. This is usually for the
    # X chromosome which may have either one copy in a male sample
    # or a mixture of male/female and is unreliable.
    # Also the skip may contain heterochromatic regions and
    # mitochondrial DNA
    if len(chrsToSkip):
        chromSizes = [x for x in chromSizes if x[0] not in chrsToSkip]

    if stepSize is None:
        stepSize = binLength  # for consecutive bins

    # Chunk length sent to each worker.  The value is empirical: too short
    # and most time is spent loading the files, too long and some
    # processors end up idle.
    chunkSize = int(stepSize * 500 / len(bigwigFilesList))

    if verbose:
        # Call syntax is valid on Python 2 and Python 3 alike.
        print("step size is {}".format(stepSize))

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(binLength)

    # mapReduce( (staticArgs), func, chromSize, etc. )
    imap_res = mapReduce.mapReduce((bigwigFilesList, stepSize, binLength, skipZeros),
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   bedFile=bedFile,
                                   region=region,
                                   numberOfProcessors=numberOfProcessors)

    score_per_bin = np.concatenate(imap_res, axis=0)
    return score_per_bin
def tabulateGCcontent(fragmentLength, chrNameBitToBam, stepSize, chromSizes,
                      numberOfProcessors=None, verbose=False, region=None):
    r"""
    Subdivide the genome into chunks, tabulate the GC content of each chunk
    in parallel and integrate the results.

    ``mapReduce.mapReduce`` runs ``tabulateGCcontent_wrapper`` on every
    chunk; each call yields a pair ``(subN_gc, subF_gc)`` and the pairs are
    summed element-wise.

    Parameters
    ----------
    fragmentLength, stepSize, verbose
        Forwarded unchanged to the worker.
    chrNameBitToBam : dict
        Mapping whose inverse (value -> key) is forwarded to the worker.
        Chromosomes that are not among its values are skipped.
    chromSizes : iterable of (name, size)
    numberOfProcessors, region
        Forwarded to ``mapReduce.mapReduce``.

    Returns
    -------
    numpy.ndarray
        One row per bin with the columns ``F_gc``, ``N_gc`` and
        ``F_gc / N_gc * sum(N_gc) / sum(F_gc)`` (1 where either tally is 0).

    Exits via ``sys.exit`` when no fragments were tallied, including the
    case in which no chunk result was produced at all.

    >>> test = Tester()
    >>> arg = test.testTabulateGCcontent()
    >>> res = tabulateGCcontent(*arg)
    >>> res
    array([[  0.        ,  18.        ,   1.        ],
           [  3.        ,  63.        ,   0.45815996],
           [  7.        , 159.        ,   0.42358185],
           [ 25.        , 192.        ,   1.25278115],
           [ 28.        , 215.        ,   1.25301422],
           [ 16.        , 214.        ,   0.71935396],
           [ 12.        ,  95.        ,   1.21532959],
           [  9.        ,  24.        ,   3.60800971],
           [  3.        ,  11.        ,   2.62400706],
           [  0.        ,   0.        ,   1.        ],
           [  0.        ,   0.        ,   1.        ]])
    """
    global global_vars

    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()])
    chunkSize = int(min(2e6, 4e5 / global_vars['reads_per_bp']))
    # Direct dictionary membership; no list of keys is rebuilt per chromosome.
    chromSizes = [(k, v) for k, v in chromSizes if k in chrNameBamToBit]

    imap_res = mapReduce.mapReduce((stepSize, fragmentLength, chrNameBamToBit, verbose),
                                   tabulateGCcontent_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   numberOfProcessors=numberOfProcessors,
                                   region=region)

    # Explicit "not yet seeded" state instead of catching NameError.
    F_gc = None
    N_gc = None
    for subN_gc, subF_gc in imap_res:
        if F_gc is None:
            F_gc = subF_gc
            N_gc = subN_gc
        else:
            F_gc += subF_gc
            N_gc += subN_gc

    # An empty result set used to surface as an UnboundLocalError here.
    if F_gc is None or sum(F_gc) == 0:
        sys.exit("No fragments included in the sampling! Consider decreasing (or maybe increasing) the --sampleSize parameter")

    scaling = float(sum(N_gc)) / float(sum(F_gc))

    R_gc = np.array([float(F_gc[x]) / N_gc[x] * scaling
                     if N_gc[x] and F_gc[x] > 0 else 1
                     for x in range(len(F_gc))])

    data = np.transpose(np.vstack((F_gc, N_gc, R_gc)))
    return data
def tabulateGCcontent(fragmentLength, chrNameBitToBam, stepSize, chromSizes,
                      numberOfProcessors=None, verbose=False, region=None):
    r"""
    Subdivide the genome into chunks that are analysed in parallel, then
    collect and integrate the per-chunk GC tallies.

    Works on Python 2 and Python 3: only ``dict.items`` and ``zip`` are
    used for iteration and the scaling factor is computed with explicit
    float division.

    Parameters
    ----------
    fragmentLength, stepSize, verbose
        Forwarded unchanged to ``tabulateGCcontent_wrapper``.
    chrNameBitToBam : dict
        Mapping whose inverse (value -> key) is forwarded to the worker.
        Chromosomes that are not among its values are skipped.
    chromSizes : iterable of (name, size)
    numberOfProcessors, region
        Forwarded to ``mapReduce.mapReduce``.

    Returns
    -------
    numpy.ndarray
        One row per bin with the columns ``F_gc``, ``N_gc`` and
        ``F_gc / N_gc * sum(N_gc) / sum(F_gc)`` (1 where either tally is 0).

    Exits via ``sys.exit`` when no fragments were tallied at all.

    >>> test = Tester()
    >>> arg = test.testTabulateGCcontent()
    >>> res = tabulateGCcontent(*arg)
    >>> res
    array([[  0.        ,  18.        ,   1.        ],
           [  3.        ,  63.        ,   0.45815996],
           [  7.        , 159.        ,   0.42358185],
           [ 25.        , 192.        ,   1.25278115],
           [ 28.        , 215.        ,   1.25301422],
           [ 16.        , 214.        ,   0.71935396],
           [ 12.        ,  95.        ,   1.21532959],
           [  9.        ,  24.        ,   3.60800971],
           [  3.        ,  11.        ,   2.62400706],
           [  0.        ,   0.        ,   1.        ],
           [  0.        ,   0.        ,   1.        ]])
    """
    import sys  # local import so the block does not depend on file-level imports

    global global_vars

    # .items() exists on both Python 2 and 3; .iteritems() only on 2.
    chrNameBamToBit = dict((v, k) for k, v in chrNameBitToBam.items())
    chunkSize = int(min(2e6, 4e5 / global_vars['reads_per_bp']))
    chromSizes = [(k, v) for k, v in chromSizes if k in chrNameBamToBit]

    imap_res = mapReduce.mapReduce((stepSize, fragmentLength, chrNameBamToBit, verbose),
                                   tabulateGCcontent_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   numberOfProcessors=numberOfProcessors,
                                   region=region)

    F_gc = N_gc = None
    for subN_gc, subF_gc in imap_res:
        if F_gc is None:
            # first chunk seeds the accumulators
            F_gc, N_gc = subF_gc, subN_gc
        else:
            F_gc += subF_gc
            N_gc += subN_gc

    total_fragments = sum(F_gc) if F_gc is not None else 0
    if total_fragments == 0:
        sys.exit("No fragments included in the sampling! Consider decreasing "
                 "(or maybe increasing) the --sampleSize parameter")

    # Explicit floats: "/" on two integers floors the factor on Python 2.
    scaling = float(sum(N_gc)) / float(total_fragments)

    R_gc = np.array([float(f) / n * scaling if n and f > 0 else 1
                     for f, n in zip(F_gc, N_gc)])

    data = np.transpose(np.vstack((F_gc, N_gc, R_gc)))
    return data
def getMappingStats(bam, nThreads):
    """
    Count mapped and unmapped reads per reference by scanning the file.

    This is used for CRAM files, since idxstats() and .mapped/.unmapped are
    meaningless.  This requires pysam > 0.13.0.

    Parameters
    ----------
    bam : open alignment file
        Must expose ``references``, ``lengths``, ``filename`` and ``count``.
    nThreads : int
        Number of processors handed to ``mapReduce``.

    Returns
    -------
    tuple
        ``(mapped, unmapped, stats)`` where ``stats`` maps every reference
        name to ``[mapped, unmapped]`` and ``unmapped`` additionally
        includes the reads counted for the ``"*"`` contig.
    """
    contigs = list(zip(bam.references, bam.lengths))
    per_chunk = mapReduce([bam.filename, False], countReadsInInterval, contigs,
                          numberOfProcessors=nThreads)

    mapped = sum(chunk[0] for chunk in per_chunk)
    unmapped = sum(chunk[1] for chunk in per_chunk)

    stats = dict((name, [0, 0]) for name, _length in contigs)
    for chunk in per_chunk:
        tally = stats[chunk[2]]
        tally[0] += chunk[0]
        tally[1] += chunk[1]

    # We need to count the number of unmapped reads as well
    unmapped += bam.count("*")

    return mapped, unmapped, stats
def countReadsPerGC(regionSize, chrNameBitToBam, stepSize, chromSizes,
                    numberOfProcessors=None, verbose=False, region=None):
    r"""
    For regions of size ``regionSize``, compute the GC of each region and
    the number of reads that overlap it.

    The work is distributed with ``mapReduce.mapReduce``, which runs
    ``countReadsPerGC_wrapper`` on every genome chunk; the per-chunk lists
    are joined in the order they are returned.

    >>> test = Tester()
    >>> arg = test.testCountReadsPerGC()
    >>> reads_per_gc = countReadsPerGC(*arg)
    >>> reads_per_gc[0:5,:]
    array([[ 132.        ,    0.44      ],
           [ 132.        ,    0.44      ],
           [ 133.        ,    0.44      ],
           [ 134.        ,    0.43666667],
           [ 134.        ,    0.44      ]])
    """
    global global_vars

    # invert the mapping: value -> key
    bam_to_bit = {}
    for bit_name, bam_name in chrNameBitToBam.items():
        bam_to_bit[bam_name] = bit_name

    chunk_length = int(min(2e6, 4e5 / global_vars['reads_per_bp']))

    static_args = (stepSize, regionSize, bam_to_bit, verbose)
    chunk_results = mapReduce.mapReduce(static_args,
                                        countReadsPerGC_wrapper,
                                        chromSizes,
                                        genomeChunkLength=chunk_length,
                                        numberOfProcessors=numberOfProcessors,
                                        region=region)

    merged = []
    for chunk in chunk_results:
        merged.extend(chunk)

    return np.asarray(merged)
def countReadsPerGC(regionSize, chrNameBitToBam, stepSize, chromSizes,
                    numberOfProcessors=None, verbose=False, region=None):
    r"""
    Computes for a region of size regionSize, the GC of the region
    and the number of reads that overlap it.

    The genome is split into chunks that ``mapReduce.mapReduce`` passes to
    ``countReadsPerGC_wrapper``; the per-chunk lists are concatenated and
    returned as one array.

    Parameters
    ----------
    regionSize, stepSize, verbose
        Forwarded unchanged to the worker.
    chrNameBitToBam : dict
        Mapping whose inverse (value -> key) is forwarded to the worker.
    chromSizes : iterable of (name, size)
    numberOfProcessors, region
        Forwarded to ``mapReduce.mapReduce``.

    Returns
    -------
    numpy.ndarray

    >>> test = Tester()
    >>> arg = test.testCountReadsPerGC()
    >>> reads_per_gc = countReadsPerGC(*arg)
    >>> reads_per_gc[0:5,:]
    array([[ 132.        ,    0.44      ],
           [ 132.        ,    0.44      ],
           [ 133.        ,    0.44      ],
           [ 134.        ,    0.43666667],
           [ 134.        ,    0.44      ]])
    """
    global global_vars

    # .items() is available on Python 2 and 3; .iteritems() raises
    # AttributeError on Python 3.
    chrNameBamToBit = dict((v, k) for k, v in chrNameBitToBam.items())
    chunkSize = int(min(2e6, 4e5 / global_vars['reads_per_bp']))

    imap_res = mapReduce.mapReduce((stepSize, regionSize, chrNameBamToBit, verbose),
                                   countReadsPerGC_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   numberOfProcessors=numberOfProcessors,
                                   region=region)

    reads_per_gc = []
    for sub_reads_per_gc in imap_res:
        reads_per_gc += sub_reads_per_gc

    reads_per_gc = np.asarray(reads_per_gc)
    return reads_per_gc
def run(self, func_to_call, func_args, out_file_name, blackListFileName=None,
        format="bedgraph", smoothLength=0):
    r"""
    Given a list of bamfiles, a function and a function arguments,
    this method writes a bedgraph file (or bigwig) file
    for a partition of the genome into tiles of given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    Parameters
    ----------
    func_to_call : str
        function name to be called to convert the list of coverages computed
        for each bam file at each position into a single value. An example
        is a function that takes the ratio between the coverage of two
        bam files.
    func_args : dict
        dict of arguments to pass to `func`. E.g. {'scaleFactor':1.0}
    out_file_name : str
        name of the file to save the resulting data.
    blackListFileName : str, optional
        Forwarded to ``mapReduce.mapReduce``.
    format : str
        ``'bedgraph'`` concatenates the temporary files into
        ``out_file_name``; any other value converts them with
        ``bedGraphToBigWig``.
    smoothLength : int
        Distance in bp for smoothing the coverage per tile.
    """
    self.__dict__["smoothLength"] = smoothLength

    # Mapping statistics are only collected for files that have none yet.
    getStats = len(self.mappedList) < len(self.bamFilesList)
    bam_handles = []
    for x in self.bamFilesList:
        if getStats:
            bam, mapped, unmapped, stats = bamHandler.openBam(x, returnStats=True,
                                                              nThreads=self.numberOfProcessors)
            self.mappedList.append(mapped)
            self.statsList.append(stats)
        else:
            bam = bamHandler.openBam(x)
        bam_handles.append(bam)

    genome_chunk_length = getGenomeChunkLength(bam_handles, self.binLength, self.mappedList)

    # check if both bam files correspond to the same species
    # by comparing the chromosome names:
    chrom_names_and_size, non_common = getCommonChrNames(bam_handles, verbose=False)

    if self.region:
        # in case a region is used, append the tilesize
        # NOTE(review): this mutates self.region, so a second call to run()
        # appends the suffix again -- confirm run() is only called once.
        self.region += ":{}".format(self.binLength)

    # Report the settings in use (the two potentially long lists excepted).
    for x in list(self.__dict__.keys()):
        if x in ["mappedList", "statsList"]:
            continue
        sys.stderr.write("{}: {}\n".format(x, self.__getattribute__(x)))

    res = mapReduce.mapReduce([func_to_call, func_args],
                              writeBedGraph_wrapper,
                              chrom_names_and_size,
                              self_=self,
                              genomeChunkLength=genome_chunk_length,
                              region=self.region,
                              blackListFileName=blackListFileName,
                              numberOfProcessors=self.numberOfProcessors)

    # Determine the sorted order of the temp files
    chrom_order = {chrom[0]: i for i, chrom in enumerate(chrom_names_and_size)}
    res = [[chrom_order[x[0]], x[1], x[2], x[3]] for x in res]
    res.sort()

    if format == 'bedgraph':
        # Context managers close both handles even if a copy fails midway.
        with open(out_file_name, 'wb') as out_file:
            for r in res:
                if r[3]:
                    with open(r[3], 'rb') as tmp_file:
                        shutil.copyfileobj(tmp_file, out_file)
                    os.remove(r[3])
    else:
        bedGraphToBigWig(chrom_names_and_size, [x[3] for x in res], out_file_name)
def main(args=None):
    """
    Command line entry point: estimate how many reads each filter removes.

    The BAM files are opened to obtain the mapped/unmapped counts, the
    genome is sampled through ``mapReduce`` with ``getFiltered_worker`` and
    one tab-separated row per BAM file is written to ``--outFile`` (or to
    standard output).  Each sampled fraction is scaled by the number of
    mapped reads and capped at that number.

    Returns 0 on completion.
    """
    args = parseArguments().parse_args(args)

    if not args.sampleLabels and args.smartLabels:
        args.sampleLabels = smartLabels(args.bamfiles)

    if args.sampleLabels and len(args.sampleLabels) != len(args.bamfiles):
        sys.stderr.write("\nError: --sampleLabels specified but it doesn't match the number of BAM files!\n")
        sys.exit(1)

    of = sys.stdout if args.outFile is None else open(args.outFile, "w")

    opened = [bamHandler.openBam(fname, returnStats=True, nThreads=args.numberOfProcessors)
              for fname in args.bamfiles]
    mapped = [entry[1] for entry in opened]
    unmappedList = [entry[2] for entry in opened]
    bhs = [entry[0] for entry in opened]

    # Get the reads in blacklisted regions
    if args.blackListFileName:
        blacklisted = [utilities.bam_blacklisted_reads(bh, None, args.blackListFileName,
                                                       args.numberOfProcessors)
                       for bh in bhs]
    else:
        blacklisted = [0] * len(bhs)

    # Get the total and mapped reads
    total = [n_mapped + n_unmapped for n_mapped, n_unmapped in zip(mapped, unmappedList)]

    chrom_sizes = list(zip(bhs[0].references, bhs[0].lengths))
    for bh in bhs:
        bh.close()

    # Get the remaining metrics
    res = mapReduce([args],
                    getFiltered_worker,
                    chrom_sizes,
                    genomeChunkLength=args.binSize + args.distanceBetweenBins,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    # One tally list per worker column, in worker order:
    # 0 sampled total, 1 filtered, 2 MAPQ, 3 samFlagInclude, 4 samFlagExclude,
    # 5 internal duplicates, 6 marked duplicates, 7 singletons, 8 RNA strand
    n_columns = 9
    n_samples = len(args.bamfiles)
    tallies = [[0] * n_samples for _ in range(n_columns)]
    for chunk in res:
        for idx, r in enumerate(chunk):
            for col in range(n_columns):
                tallies[col][idx] += r[col]

    # Print some output
    of.write("Sample\tTotal Reads\tMapped Reads\tAlignments in blacklisted regions"
             "\tEstimated mapped reads filtered\tBelow MAPQ\tMissing Flags"
             "\tExcluded Flags\tInternally-determined Duplicates\tMarked Duplicates"
             "\tSingletons\tWrong strand\n")

    labels = args.sampleLabels if args.sampleLabels else args.bamfiles
    for idx in range(n_samples):
        of.write(labels[idx])
        of.write("\t{}\t{}\t{}".format(total[idx], mapped[idx], blacklisted[idx]))

        sampled = tallies[0][idx]
        for col in range(1, n_columns):
            metric = 0.0
            if sampled > 0:
                metric = float(tallies[col][idx]) / float(sampled) * mapped[idx]
                if col == 1:
                    # the overall estimate also counts the blacklisted alignments
                    metric = blacklisted[idx] + metric
            of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        of.write("\n")

    if args.outFile is not None:
        of.close()

    return 0
def run(self, allArgs=None):
    """
    Count reads (or scores) per bin for all input files and return the
    values as one array.

    The input files are opened to find their common chromosomes, a step
    size and a chunk length are derived, and ``mapReduce.mapReduce`` runs
    ``countReadsInRegions_wrapper`` over the genome.  When
    ``self.out_file_for_raw_data`` is set, the temporary files produced by
    the workers are concatenated into it and removed.

    Side effects: ``self.stepSize`` may be set and ``self.region`` gets
    ``":<binLength>"`` appended.

    Raises ``ValueError`` when the step size exceeds the mean chromosome
    length (and no BED file is used); exits via ``sys.exit`` when no
    coverage values could be computed.
    """
    # Open every input; anything bamHandler cannot open is tried as bigWig.
    # NOTE(review): the bare except is kept on purpose -- openBam may signal
    # a non-BAM file through sys.exit; confirm before narrowing it.
    handles = []
    for fname in self.bamFilesList:
        try:
            handle = bamHandler.openBam(fname)
        except:
            handle = pyBigWig.open(fname)
        handles.append(handle)

    chromSizes, non_common = deeptools.utilities.getCommonChrNames(handles, verbose=self.verbose)

    # skip chromosome in the list. This is usually for the
    # X chromosome which may have either one copy in a male sample
    # or a mixture of male/female and is unreliable.
    # Also the skip may contain heterochromatic regions and
    # mitochondrial DNA
    if len(self.chrsToSkip):
        chromSizes = [x for x in chromSizes if x[0] not in self.chrsToSkip]

    chrNames, chrLengths = list(zip(*chromSizes))
    genomeSize = sum(chrLengths)

    if self.stepSize is None:
        if self.region is None:
            span = genomeSize
        else:
            # compute the step size, based on the number of samples
            # and the length of the region studied
            (chrom, start, end) = mapReduce.getUserRegion(chromSizes, self.region)[:3]
            span = end - start
        self.stepSize = max(int(float(span) / self.numberOfSamples), 1)

    # number of samples is better if large
    if np.mean(chrLengths) < self.stepSize and self.bedFile is None:
        min_num_of_samples = int(genomeSize / np.mean(chrLengths))
        raise ValueError("numberOfSamples has to be bigger than {} ".format(min_num_of_samples))

    mapped_counts = []
    for handle in handles:
        try:
            mapped_counts.append(handle.mapped)
        except:
            # bigWig, use a fixed value
            mapped_counts.append(0)
    max_mapped = max(mapped_counts)

    # Try to determine an optimal fraction of the genome (chunkSize) that is
    # sent to workers for analysis. If too short, too much time is spent
    # loading the files; if too long, some processors end up free.
    # The following values are empirical.
    if max_mapped == 0:
        # bigWig input: chunk is a multiple of binLength and every bin is used
        chunkSize = 10000 * self.binLength
        self.stepSize = self.binLength
    else:
        reads_per_bp = float(max_mapped) / genomeSize
        chunkSize = int(self.stepSize * 1e3 / (reads_per_bp * len(handles)))

    for handle in handles:
        handle.close()

    # Ensure that chunkSize is always at least self.stepSize
    chunkSize = max(chunkSize, self.stepSize)

    if self.verbose:
        print("step size is {}".format(self.stepSize))

    if self.region:
        # in case a region is used, append the tilesize
        self.region += ":{}".format(self.binLength)

    # Handle GTF options
    transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(allArgs)

    # use map reduce to call countReadsInRegions_wrapper
    imap_res = mapReduce.mapReduce([],
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   self_=self,
                                   genomeChunkLength=chunkSize,
                                   bedFile=self.bedFile,
                                   blackListFileName=self.blackListFileName,
                                   region=self.region,
                                   numberOfProcessors=self.numberOfProcessors,
                                   transcriptID=transcriptID,
                                   exonID=exonID,
                                   keepExons=keepExons,
                                   transcript_id_designator=transcript_id_designator)

    if self.out_file_for_raw_data:
        if len(non_common):
            sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                             "the chromosomes that were not common between the bigwig files\n")

        # concatenate intermediary bedgraph files
        raw_out = open(self.out_file_for_raw_data, "w")
        for _values, tempFileName in imap_res:
            if not tempFileName:
                continue
            # append this intermediate file, then delete it
            piece = open(tempFileName, 'r')
            shutil.copyfileobj(piece, raw_out)
            piece.close()
            os.remove(tempFileName)
        raw_out.close()

    try:
        return np.concatenate([x[0] for x in imap_res], axis=0)
    except ValueError:
        if self.bedFile:
            sys.exit('\nNo coverage values could be computed.\n\n'
                     'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                     'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            sys.exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                     'contain mapped reads.')
def get_read_and_fragment_length(bamFile, return_lengths=False, blackListFileName=None,
                                 binSize=50000, distanceBetweenBins=1000000,
                                 numberOfProcessors=None, verbose=False):
    """
    Estimates the fragment length and read length through sampling

    The genome is sampled with ``getFragmentLength_wrapper``.  Starting
    from twice ``distanceBetweenBins``, the distance is halved and the
    sampling repeated until at least 1000 rows were collected or the
    distance is no longer above 1.

    Parameters
    ----------
    bamFile : str
        BAM file name
    return_lengths : bool
        Also store the raw values under the key ``'lengths'``.
    blackListFileName : str, optional
    binSize : int
    distanceBetweenBins : int
        0 is allowed and samples once with chunks of ``binSize``.
    numberOfProcessors : int
    verbose : bool

    Returns
    -------
    tuple
        ``(fragment_len_dict, read_len_dict)``; the dictionaries summarise
        the mean, median etc. values.  Both are ``None`` when nothing was
        sampled; the first is ``None`` when the mean fragment length is
        not positive.
    """
    def _summarise(lengths):
        """Return the summary statistics of one column of sampled lengths."""
        median = np.median(lengths)
        summary = {'sample_size': len(lengths),
                   'min': lengths.min(),
                   'qtile25': np.percentile(lengths, 25),
                   'mean': np.mean(lengths),
                   'median': median,
                   'qtile75': np.percentile(lengths, 75),
                   'max': lengths.max(),
                   'std': np.std(lengths),
                   'mad': np.median(np.abs(lengths - median))}
        for q in (10, 20, 30, 40, 60, 70, 80, 90, 99):
            summary['qtile{}'.format(q)] = np.percentile(lengths, q)
        return summary

    bam_handle = bamHandler.openBam(bamFile)
    try:
        chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

        distanceBetweenBins *= 2
        fl = []

        # Fix issue #522, allow distanceBetweenBins == 0
        if distanceBetweenBins == 0:
            imap_res = mapReduce.mapReduce((bam_handle.filename, distanceBetweenBins),
                                           getFragmentLength_wrapper,
                                           chrom_sizes,
                                           genomeChunkLength=binSize,
                                           blackListFileName=blackListFileName,
                                           numberOfProcessors=numberOfProcessors,
                                           verbose=verbose)
            fl = np.concatenate(imap_res)

        # Try to ensure we have at least 1000 regions from which to compute
        # statistics, halving the intra-bin distance as needed
        while len(fl) < 1000 and distanceBetweenBins > 1:
            distanceBetweenBins /= 2
            stepsize = binSize + distanceBetweenBins
            imap_res = mapReduce.mapReduce((bam_handle.filename, distanceBetweenBins),
                                           getFragmentLength_wrapper,
                                           chrom_sizes,
                                           genomeChunkLength=stepsize,
                                           blackListFileName=blackListFileName,
                                           numberOfProcessors=numberOfProcessors,
                                           verbose=verbose)
            fl = np.concatenate(imap_res)
    finally:
        # Only the header and the file name were needed from this handle.
        bam_handle.close()

    if not len(fl):
        return None, None

    # column 0 holds fragment lengths, column 1 read lengths
    fragment_length = fl[:, 0]
    read_length = fl[:, 1]

    fragment_len_dict = None
    if fragment_length.mean() > 0:
        fragment_len_dict = _summarise(fragment_length)
        if return_lengths:
            fragment_len_dict['lengths'] = fragment_length

    read_len_dict = _summarise(read_length)
    if return_lengths:
        read_len_dict['lengths'] = read_length

    return fragment_len_dict, read_len_dict
def main(args=None):
    """
    Command line entry point: filter the alignments of one BAM file.

    The file is filtered chunk-wise by ``filterWorker`` (through
    ``mapReduce``) into temporary files, which are then merged into
    ``--outFile``.  With ``--filteredOutReads`` the rejected alignments are
    merged into that second file; with ``--filterMetrics`` a short summary
    is written.  BAM output is merged with ``samtools cat`` and the
    temporary files are removed; with ``--BED`` the temporary files are
    handed to ``convertBED`` instead.

    Returns 0 on completion.
    """
    args = parseArguments().parse_args(args)

    if args.shift:
        if len(args.shift) not in [2, 4]:
            sys.exit("The --shift option can accept either 2 or 4 values only.")
        if len(args.shift) == 2:
            # mirror the two given values for the opposite strand
            args.shift.extend([-args.shift[1], -args.shift[0]])
    elif args.ATACshift:
        args.shift = [4, -5, 5, -4]

    bam, mapped, unmapped, stats = openBam(args.bam, returnStats=True,
                                           nThreads=args.numberOfProcessors)
    total = mapped + unmapped
    chrom_sizes = list(zip(bam.references, bam.lengths))
    chromDict = dict(chrom_sizes)

    def _merge(destination, tmpFiles):
        """Merge the per-chunk temporary files into ``destination``."""
        if not args.BED:
            arguments = ["-o", destination]
            arguments.extend(tmpFiles)  # [..., *someList] isn't available in python 2.7
            pysam.samtools.cat(*arguments)
            for tmpFile in tmpFiles:
                os.unlink(tmpFile)
        else:
            convertBED(destination, tmpFiles, chromDict)

    # Filter, writing the results to a bunch of temporary files
    res = mapReduce([args, chromDict],
                    filterWorker,
                    chrom_sizes,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    res = sorted(res)  # The temp files are now in order for concatenation
    nFiltered = sum([x[3] for x in res])
    totalSeen = sum([x[2] for x in res])  # The * contig isn't queried

    _merge(args.outFile, [x[4] for x in res])

    if args.filteredOutReads:
        # The BED branch used to write to args.outFile, overwriting the main
        # output, and passed an extra argument that the first convertBED
        # call does not use.
        # NOTE(review): confirm convertBED takes exactly three arguments.
        _merge(args.filteredOutReads, [x[5] for x in res])

    if args.filterMetrics:
        sampleName = args.bam
        if args.label:
            sampleName = args.label
        if args.smartLabels:
            sampleName = smartLabels([args.bam])[0]

        with open(args.filterMetrics, "w") as of:
            of.write("#bamFilterReads --filterMetrics\n")
            of.write("#File\tReads Remaining\tTotal Initial Reads\n")
            of.write("{}\t{}\t{}\n".format(sampleName, totalSeen - nFiltered, total))

    return 0
def get_read_and_fragment_length(bamFile, return_lengths=False, blackListFileName=None,
                                 binSize=50000, distanceBetweenBins=1000000,
                                 numberOfProcessors=None, verbose=False):
    """
    Estimates the fragment length and read length through sampling

    Sampling is done by ``getFragmentLength_wrapper`` through
    ``mapReduce``.  The distance between bins starts at twice the given
    value and is halved before every sampling round; rounds stop once at
    least 1000 rows were collected or the distance is no longer above 1.

    Parameters
    ----------
    bamFile : str
        BAM file name
    return_lengths : bool
        Add the raw values to each dictionary under ``'lengths'``.
    blackListFileName : str, optional
    binSize : int
    distanceBetweenBins : int
        May be 0, in which case a single round with chunks of ``binSize``
        is run.
    numberOfProcessors : int
    verbose : bool

    Returns
    -------
    d : tuple
        tuple of two dictionaries, one for the fragment length and the other
        for the read length. The dictionaries summarise the mean, median
        etc. values.  ``(None, None)`` is returned when nothing was
        sampled, and the fragment dictionary is ``None`` when the mean
        fragment length is not positive.
    """
    deciles = (10, 20, 30, 40, 60, 70, 80, 90, 99)

    def _length_stats(values, keep_raw):
        """Build the statistics dictionary for one array of lengths."""
        middle = np.median(values)
        stats = {
            'sample_size': len(values),
            'min': values.min(),
            'qtile25': np.percentile(values, 25),
            'mean': np.mean(values),
            'median': middle,
            'qtile75': np.percentile(values, 75),
            'max': values.max(),
            'std': np.std(values),
            'mad': np.median(np.abs(values - middle)),
        }
        stats.update(('qtile{}'.format(q), np.percentile(values, q)) for q in deciles)
        if keep_raw:
            stats['lengths'] = values
        return stats

    def _sample(handle, chrom_sizes, distance, chunk_length):
        """Run one sampling round and return the stacked worker results."""
        imap_res = mapReduce.mapReduce(
            (handle.filename, distance),
            getFragmentLength_wrapper,
            chrom_sizes,
            genomeChunkLength=chunk_length,
            blackListFileName=blackListFileName,
            numberOfProcessors=numberOfProcessors,
            verbose=verbose)
        return np.concatenate(imap_res)

    bam_handle = bamHandler.openBam(bamFile)
    try:
        chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

        distanceBetweenBins *= 2
        fl = []

        # Fix issue #522, allow distanceBetweenBins == 0
        if distanceBetweenBins == 0:
            fl = _sample(bam_handle, chrom_sizes, distanceBetweenBins, binSize)

        # Try to ensure we have at least 1000 regions from which to compute
        # statistics, halving the intra-bin distance as needed
        while len(fl) < 1000 and distanceBetweenBins > 1:
            distanceBetweenBins /= 2
            stepsize = binSize + distanceBetweenBins
            fl = _sample(bam_handle, chrom_sizes, distanceBetweenBins, stepsize)
    finally:
        # Close the handle on every path; it was previously left open.
        bam_handle.close()

    if len(fl):
        fragment_length = fl[:, 0]
        read_length = fl[:, 1]
        if fragment_length.mean() > 0:
            fragment_len_dict = _length_stats(fragment_length, return_lengths)
        else:
            fragment_len_dict = None
        read_len_dict = _length_stats(read_length, return_lengths)
    else:
        fragment_len_dict = None
        read_len_dict = None

    return fragment_len_dict, read_len_dict
def run(self, allArgs=None):
    """
    Count the reads per region for all files in ``self.bamFilesList``.

    Every file is opened with ``bamHandler.openBam``; if that raises an
    ordinary exception the file is opened with ``pyBigWig.open`` instead.
    The per-chunk work is done by ``countReadsInRegions_wrapper`` through
    ``mapReduce.mapReduce``. If ``self.out_file_for_raw_data`` is set, the
    temporary files reported by the workers are concatenated into that file
    and then removed.

    Parameters
    ----------
    allArgs : object
        Passed to ``deeptools.utilities.gtfOptions`` to obtain the GTF
        related options that are forwarded to ``mapReduce.mapReduce``.

    Returns
    -------
    The concatenation (along axis 0) of the first element of every result
    returned by ``mapReduce.mapReduce``. If the concatenation raises a
    ValueError the program exits with an explanatory message.
    """
    bamFilesHandles = []
    for x in self.bamFilesList:
        try:
            y = bamHandler.openBam(x)
        except SystemExit:
            sys.exit(sys.exc_info()[1])
        except Exception:
            # Could not be opened as BAM; try it as a bigWig file. Only
            # ordinary errors trigger the fallback so that e.g. a
            # KeyboardInterrupt is not swallowed.
            y = pyBigWig.open(x)
        bamFilesHandles.append(y)

    chromsizes, non_common = deeptools.utilities.getCommonChrNames(bamFilesHandles, verbose=self.verbose)

    # skip chromosome in the list. This is usually for the
    # X chromosome which may have either one copy in a male sample
    # or a mixture of male/female and is unreliable.
    # Also the skip may contain heterochromatic regions and
    # mitochondrial DNA
    if len(self.chrsToSkip):
        chromsizes = [x for x in chromsizes if x[0] not in self.chrsToSkip]

    chrNames, chrLengths = list(zip(*chromsizes))

    genomeSize = sum(chrLengths)

    if self.bedFile is None:
        chunkSize = self.get_chunk_length(bamFilesHandles, genomeSize, chromsizes, chrLengths)
    else:
        chunkSize = None

    for bam_h in bamFilesHandles:
        bam_h.close()

    if self.verbose:
        print("step size is {}".format(self.stepSize))

    if self.region:
        # in case a region is used, append the tilesize
        self.region += ":{}".format(self.binLength)

    # Handle GTF options
    transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(allArgs)

    # use map reduce to call countReadsInRegions_wrapper
    imap_res = mapReduce.mapReduce([],
                                   countReadsInRegions_wrapper,
                                   chromsizes,
                                   self_=self,
                                   genomeChunkLength=chunkSize,
                                   bedFile=self.bedFile,
                                   blackListFileName=self.blackListFileName,
                                   region=self.region,
                                   numberOfProcessors=self.numberOfProcessors,
                                   transcriptID=transcriptID,
                                   exonID=exonID,
                                   keepExons=keepExons,
                                   transcript_id_designator=transcript_id_designator)

    if self.out_file_for_raw_data:
        if len(non_common):
            sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                             "the chromosomes that were not common between the bigwig files\n")

        # concatenate intermediary bedgraph files
        with open(self.out_file_for_raw_data, "w") as ofile:
            for _values, tempFileName in imap_res:
                if tempFileName:
                    # concatenate all intermediate tempfiles into one
                    with open(tempFileName, 'r') as _foo:
                        shutil.copyfileobj(_foo, ofile)
                    os.remove(tempFileName)

    try:
        num_reads_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
        return num_reads_per_bin
    except ValueError:
        if self.bedFile:
            sys.exit('\nNo coverage values could be computed.\n\n'
                     'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                     'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            sys.exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                     'contain mapped reads.')
def fraction_kept(args, stats):
    """
    Estimate the fraction of alignments that survive the filtering options.

    Two numbers are accumulated while sampling:

    (A) The total number of alignments sampled
    (B) The number of those alignments ignored due to any of:
        --samFlagInclude
        --samFlagExclude
        --minMappingQuality
        --ignoreDuplicates
        --minFragmentLength
        --maxFragmentLength

    The return value is ``1.0 - B / A``. Black list regions are already
    accounted for.

    If none of the mapping quality, SAM flag or fragment length options is
    set (and ``args.filterRNAstrand``, when present, is neither "forward"
    nor "reverse") nothing can be filtered and 1.0 is returned without
    opening the BAM file.

    Otherwise the genome is sampled through ``mapReduce.mapReduce`` at
    successive offsets until enough alignments were seen: all mapped reads
    if there are fewer than 1,000,000 (or if ``args.exactScaling`` is set),
    else the larger of 1,000,000 and 10% of the mapped reads.
    """
    # Do we even need to proceed?
    option_names = ("minMappingQuality", "samFlagInclude", "samFlagExclude",
                    "minFragmentLength", "maxFragmentLength")
    if not any(getattr(args, name) for name in option_names):
        if not hasattr(args, "filterRNAstrand"):
            return 1.0
        if args.filterRNAstrand not in ["forward", "reverse"]:
            return 1.0

    bam_handle = bamHandler.openBam(args.bam)
    bam_mapped = utilities.bam_total_reads(bam_handle, args.ignoreForNormalization, stats)

    if bam_mapped < 1000000:
        num_needed_to_sample = bam_mapped
    else:
        num_needed_to_sample = max(0.1 * bam_mapped, 1000000)
    if args.exactScaling:
        num_needed_to_sample = bam_mapped

    # When everything has to be sampled, use a much smaller chunk length
    if num_needed_to_sample == bam_mapped:
        distanceBetweenBins = 55000
    else:
        distanceBetweenBins = 2000000

    if args.ignoreForNormalization:
        chrom_sizes = [(name, length)
                       for name, length in zip(bam_handle.references, bam_handle.lengths)
                       if name not in args.ignoreForNormalization]
    else:
        chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

    filtered = 0
    total = 0
    offset = 0
    last_offset = np.ceil(distanceBetweenBins / 50000)
    # Iterate over bins at various non-overlapping offsets until we have enough data
    while total < num_needed_to_sample and offset < last_offset:
        res = mapReduce.mapReduce((bam_handle.filename, args, offset),
                                  getFractionKept_wrapper,
                                  chrom_sizes,
                                  genomeChunkLength=distanceBetweenBins,
                                  blackListFileName=args.blackListFileName,
                                  numberOfProcessors=args.numberOfProcessors,
                                  verbose=args.verbose)

        if len(res):
            chunk_filtered, chunk_total = np.sum(res, axis=0)
            filtered += chunk_filtered
            total += chunk_total
        offset += 1

    if total == 0:
        # This should never happen
        total = 1

    return 1.0 - float(filtered) / float(total)
def main(args=None):
    """
    Write a tab separated table with estimated read filtering metrics.

    For every BAM file in ``args.bamfiles`` one row is written containing the
    total and mapped read counts, the alignments in blacklisted regions and
    eight estimates derived from the counts sampled by ``getFiltered_worker``
    (each sampled count is scaled by mapped / sampled total and capped at the
    number of mapped reads).

    The table goes to ``args.outFile`` or, if that is None, to stdout.

    Parameters
    ----------
    args : list
        Command line arguments handed to the argument parser.

    Returns
    -------
    int
        Always 0.
    """
    args = parseArguments().parse_args(args)

    if not args.sampleLabels and args.smartLabels:
        args.sampleLabels = smartLabels(args.bamfiles)

    if args.sampleLabels and len(args.sampleLabels) != len(args.bamfiles):
        sys.stderr.write("\nError: --sampleLabels specified but it doesn't match the number of BAM files!\n")
        sys.exit(1)

    if args.outFile is None:
        of = sys.stdout
    else:
        of = open(args.outFile, "w")

    try:
        bhs = [bamHandler.openBam(x, returnStats=True, nThreads=args.numberOfProcessors) for x in args.bamfiles]
        mapped = [x[1] for x in bhs]
        unmappedList = [x[2] for x in bhs]
        bhs = [x[0] for x in bhs]

        # Get the reads in blacklisted regions
        if args.blackListFileName:
            blacklisted = [utilities.bam_blacklisted_reads(bh, None, args.blackListFileName, args.numberOfProcessors)
                           for bh in bhs]
        else:
            blacklisted = [0] * len(bhs)

        # Get the total and mapped reads
        total = [x + y for x, y in zip(mapped, unmappedList)]

        chrom_sizes = list(zip(bhs[0].references, bhs[0].lengths))
        for x in bhs:
            x.close()

        # Get the remaining metrics
        res = mapReduce([args],
                        getFiltered_worker,
                        chrom_sizes,
                        genomeChunkLength=args.binSize + args.distanceBetweenBins,
                        blackListFileName=args.blackListFileName,
                        numberOfProcessors=args.numberOfProcessors,
                        verbose=args.verbose)

        # One row of counters per BAM file. Column 0 holds the number of
        # sampled alignments; columns 1-8 hold, in output order: filtered,
        # below MAPQ, missing flags, excluded flags, internally determined
        # duplicates, marked duplicates, singletons, wrong strand.
        nCounters = 9
        counts = [[0] * nCounters for _ in args.bamfiles]
        for x in res:
            for idx, r in enumerate(x):
                for col in range(nCounters):
                    counts[idx][col] += r[col]

        # Print some output
        of.write("Sample\tTotal Reads\tMapped Reads\tAlignments in blacklisted regions\tEstimated mapped reads filtered\tBelow MAPQ\tMissing Flags\tExcluded Flags\tInternally-determined Duplicates\tMarked Duplicates\tSingletons\tWrong strand\n")
        for idx, bamfile in enumerate(args.bamfiles):
            if args.sampleLabels:
                of.write(args.sampleLabels[idx])
            else:
                of.write(bamfile)
            of.write("\t{}\t{}\t{}".format(total[idx], mapped[idx], blacklisted[idx]))

            sampled = counts[idx][0]
            for col in range(1, nCounters):
                metric = 0.0
                if sampled > 0:
                    metric = float(counts[idx][col]) / float(sampled) * mapped[idx]
                    if col == 1:
                        # Only the overall estimate includes the blacklisted alignments
                        metric = blacklisted[idx] + metric
                of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
            of.write("\n")
    finally:
        # Do not leave the output file open if anything above fails
        if args.outFile is not None:
            of.close()

    return 0
def writeBedGraph(bamFilesList, outputFileName, fragmentLength,
                  func, funcArgs, tileSize=25, region=None,
                  numberOfProcessors=None, format="bedgraph",
                  extendPairedEnds=True, zerosToNans=True, smoothLength=0,
                  minMappingQuality=None, ignoreDuplicates=False,
                  fragmentFromRead_func=None, centerRead=False):
    r"""
    Given a list of bamfiles, a function and a function arguments,
    this method writes a bedgraph file (or bigwig) file
    for a partition of the genome into tiles of given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    The tiles are processed by ``writeBedGraph_wrapper`` through
    ``mapReduce.mapReduce``. The temporary files it reports are
    concatenated into ``outputFileName + ".bg"`` and removed. That file is
    renamed to ``outputFileName`` if ``format`` is 'bedgraph'; otherwise
    it is converted with ``bedGraphToBigWig`` and deleted.

    >>> test = Tester()
    >>> import tempfile
    >>> outFile = tempfile.NamedTemporaryFile()
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> writeBedGraph( [test.bamFile1], outFile.name,
    ... 0, scaleCoverage, funcArgs, region='3R:0:200')
    >>> open(outFile.name, 'r').readlines()
    ['3R\t100\t200\t1.0\n']
    >>> outFile.close()

    """
    bamHandlers = [openBam(x) for x in bamFilesList]
    genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
    # check if both bam files correspond to the same species
    # by comparing the chromosome names:
    chromNamesAndSize = getCommonChrNames(bamHandlers, verbose=False)

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce((tileSize, fragmentLength, bamFilesList,
                               func, funcArgs, extendPairedEnds, smoothLength,
                               zerosToNans, minMappingQuality,
                               ignoreDuplicates,
                               fragmentFromRead_func, centerRead),
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files
    bedGraphFile = outputFileName + ".bg"
    with open(bedGraphFile, 'wb') as outFile:
        for tempFileName in res:
            if tempFileName:
                # concatenate all intermediate tempfiles into one
                # bedgraph file; the handle is closed before the
                # temporary file is removed
                with open(tempFileName, 'rb') as tempFile:
                    shutil.copyfileobj(tempFile, outFile)
                os.remove(tempFileName)

    # print(...) with a single argument behaves the same on Python 2 and 3
    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print("output file: %s" % (outputFileName))
    else:
        bedGraphToBigWig(chromNamesAndSize, bedGraphFile, outputFileName, False)
        if debug:
            print("output file: %s" % (outputFileName))
        os.remove(bedGraphFile)
def writeBedGraph(bamOrBwFileList, outputFileName, fragmentLength,
                  func, funcArgs, tileSize=25, region=None,
                  blackListFileName=None, numberOfProcessors=None,
                  format="bedgraph", extendPairedEnds=True,
                  missingDataAsZero=False, smoothLength=0, fixed_step=False):
    r"""
    Given a list of bamfiles, a function and a function arguments,
    this method writes a bedgraph file (or bigwig) file
    for a partition of the genome into tiles of given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    ``bamOrBwFileList`` is a list of (file name, file format) pairs where the
    format is 'bam' or 'bigwig'. If at least one BAM file is given, the
    chromosome names and the chunk length are derived from the BAM files.
    Otherwise the chromosomes present in every bigWig file are used, each
    with the smallest length reported for it.

    The temporary files reported by ``writeBedGraph_wrapper`` are
    concatenated into ``outputFileName + ".bg"`` and removed. That file is
    renamed to ``outputFileName`` if ``format`` is 'bedgraph'; otherwise
    it is converted with ``bedGraphToBigWig`` and deleted.
    """
    bamHandlers = [bamHandler.openBam(indexedFile)
                   for indexedFile, fileFormat in bamOrBwFileList
                   if fileFormat == 'bam']

    if len(bamHandlers):
        genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize, __ = getCommonChrNames(bamHandlers, verbose=False)
    else:
        genomeChunkLength = int(10e6)
        bigwigs = [fileName for fileName, fileFormat in bamOrBwFileList
                   if fileFormat == 'bigwig']
        # number of bigwig files in which each chromosome was found
        timesSeen = {}
        chromNamesAndSize = {}
        for bw in bigwigs:
            bwh = pyBigWig.open(bw)
            for chromName, size in list(bwh.chroms().items()):
                if chromName in chromNamesAndSize:
                    timesSeen[chromName] += 1
                    if chromNamesAndSize[chromName] != size:
                        print("\nWARNING\n"
                              "Chromosome {} length reported in the "
                              "bigwig files differ.\n{} for {}\n"
                              "{} for {}.\n\nThe smallest "
                              "length will be used".format(
                                  chromName, chromNamesAndSize[chromName],
                                  bigwigs[0], size, bw))
                        chromNamesAndSize[chromName] = min(
                            chromNamesAndSize[chromName], size)
                else:
                    timesSeen[chromName] = 1
                    chromNamesAndSize[chromName] = size
            bwh.close()

        # get the list of common chromosome names and sizes: a chromosome is
        # common only if it was found in every bigwig file (this also keeps
        # all chromosomes when a single file is given)
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.items()
                             if timesSeen[k] == len(bigwigs)]

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce((tileSize, fragmentLength, bamOrBwFileList,
                               func, funcArgs, extendPairedEnds, smoothLength,
                               missingDataAsZero, fixed_step),
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              blackListFileName=blackListFileName,
                              numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files
    bedGraphFile = outputFileName + ".bg"
    with open(bedGraphFile, 'wb') as outFile:
        for tempFileName in res:
            if tempFileName:
                # concatenate all intermediate tempfiles into one
                # bedgraph file
                with open(tempFileName, 'rb') as _foo:
                    shutil.copyfileobj(_foo, outFile)
                os.remove(tempFileName)

    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print("output file: %s" % (outputFileName))
    else:
        bedGraphToBigWig(chromNamesAndSize, bedGraphFile, outputFileName, True)
        if debug:
            print("output file: %s" % (outputFileName))
        os.remove(bedGraphFile)
def main(args=None):
    """
    Filter the alignments of ``args.bam`` and write the result.

    The filtering is done per chunk by ``filterWorker`` (through
    ``mapReduce``), which writes temporary files. The temporary files with
    the kept alignments are merged into ``args.outFile``. If
    ``args.filteredOutReads`` is set, the temporary files with the
    filtered out alignments are merged into that file. Merging uses
    ``pysam.samtools.cat`` or, if ``args.BED`` is set, ``convertBED``.

    If ``args.filterMetrics`` is set, a small table with the number of
    remaining reads and the initial number of reads is written to it.

    Parameters
    ----------
    args : list
        Command line arguments handed to the argument parser.

    Returns
    -------
    int
        Always 0.
    """
    args = parseArguments().parse_args(args)
    if args.shift:
        if len(args.shift) not in [2, 4]:
            sys.exit("The --shift option can accept either 2 or 4 values only.")
        if len(args.shift) == 2:
            # The last two values mirror the first two
            args.shift.extend([-args.shift[1], -args.shift[0]])
    elif args.ATACshift:
        args.shift = [4, -5, 5, -4]

    bam, mapped, unmapped, stats = openBam(args.bam, returnStats=True, nThreads=args.numberOfProcessors)
    total = mapped + unmapped
    chrom_sizes = list(zip(bam.references, bam.lengths))
    chromDict = dict(zip(bam.references, bam.lengths))

    def _merge(destination, tmpFiles):
        # Combine the per-chunk temporary files into `destination`.
        if args.BED:
            convertBED(destination, tmpFiles, chromDict)
            return
        arguments = ["-o", destination]
        arguments.extend(tmpFiles)  # [..., *someList] isn't available in python 2.7
        pysam.samtools.cat(*arguments)
        for tmpFile in tmpFiles:
            os.unlink(tmpFile)

    # Filter, writing the results to a bunch of temporary files
    res = mapReduce([args, chromDict],
                    filterWorker,
                    chrom_sizes,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    res = sorted(res)  # The temp files are now in order for concatenation
    nFiltered = sum(x[3] for x in res)
    totalSeen = sum(x[2] for x in res)  # The * contig isn't queried

    _merge(args.outFile, [x[4] for x in res])

    if args.filteredOutReads:
        # The filtered out reads go to their own file. Writing them to
        # args.outFile would overwrite the kept reads written above.
        _merge(args.filteredOutReads, [x[5] for x in res])

    if args.filterMetrics:
        sampleName = args.bam
        if args.label:
            sampleName = args.label
        if args.smartLabels:
            sampleName = smartLabels([args.bam])[0]

        with open(args.filterMetrics, "w") as of:
            of.write("#bamFilterReads --filterMetrics\n")
            of.write("#File\tReads Remaining\tTotal Initial Reads\n")
            of.write("{}\t{}\t{}\n".format(sampleName, totalSeen - nFiltered, total))

    return 0
def writeBedGraph(bamFilesList, outputFileName, fragmentLength,
                  func, funcArgs, tileSize=25, region=None,
                  numberOfProcessors=None, format="bedgraph",
                  extendPairedEnds=True, zerosToNans=True, smoothLength=0,
                  minMappingQuality=None, ignoreDuplicates=False,
                  fragmentFromRead_func=None, centerRead=False, samFlag=None):
    r"""
    Given a list of bamfiles, a function and a function arguments,
    this method writes a bedgraph file (or bigwig) file
    for a partition of the genome into tiles of given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    The tiles are processed by ``writeBedGraph_wrapper`` through
    ``mapReduce.mapReduce``. The temporary files it reports are
    concatenated into ``outputFileName + ".bg"`` and removed. That file is
    renamed to ``outputFileName`` if ``format`` is 'bedgraph'; otherwise
    it is converted with ``bedGraphToBigWig`` and deleted.

    >>> test = Tester()
    >>> import tempfile
    >>> outFile = tempfile.NamedTemporaryFile()
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> writeBedGraph( [test.bamFile1], outFile.name,
    ... 0, scaleCoverage, funcArgs, region='3R:0:200')
    >>> open(outFile.name, 'r').readlines()
    ['3R\t100\t200\t1.0\n']
    >>> outFile.close()

    """
    bamHandlers = [openBam(x) for x in bamFilesList]
    genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
    # check if both bam files correspond to the same species
    # by comparing the chromosome names:
    chromNamesAndSize = getCommonChrNames(bamHandlers, verbose=False)

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce((tileSize, fragmentLength, bamFilesList,
                               func, funcArgs, extendPairedEnds, smoothLength,
                               zerosToNans, minMappingQuality,
                               ignoreDuplicates,
                               fragmentFromRead_func, centerRead, samFlag),
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files
    bedGraphFile = outputFileName + ".bg"
    with open(bedGraphFile, 'wb') as outFile:
        for tempFileName in res:
            if tempFileName:
                # concatenate all intermediate tempfiles into one
                # bedgraph file; the handle is closed before the
                # temporary file is removed
                with open(tempFileName, 'rb') as tempFile:
                    shutil.copyfileobj(tempFile, outFile)
                os.remove(tempFileName)

    # print(...) with a single argument behaves the same on Python 2 and 3
    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print("output file: %s" % (outputFileName))
    else:
        bedGraphToBigWig(
            chromNamesAndSize, bedGraphFile, outputFileName, sort=True)
        if debug:
            print("output file: %s" % (outputFileName))
        os.remove(bedGraphFile)
def run(self, func_to_call, func_args, out_file_name, blackListFileName=None, format="bedgraph", smoothLength=0):
    r"""
    Given a list of bamfiles, a function and a function arguments,
    this method writes a bedgraph file (or bigwig) file
    for a partition of the genome into tiles of given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    Parameters
    ----------
    func_to_call : str
        function name to be called to convert the list of coverages computed
        for each bam file at each position into a single value. An example
        is a function that takes the ratio between the coverage of two
        bam files.
    func_args : dict
        dict of arguments to pass to `func`. E.g. {'scaleFactor':1.0}
    out_file_name : str
        name of the file to save the resulting data.
    blackListFileName : str
        Forwarded unchanged to ``mapReduce.mapReduce``.
    format : str
        'bedgraph' keeps the concatenated bedgraph file under
        ``out_file_name``; any other value converts it with
        ``bedGraphToBigWig``.
    smoothLength : int
        Distance in bp for smoothing the coverage per tile.
    """
    self.__dict__["smoothLength"] = smoothLength
    bam_handlers = [bamHandler.openBam(x) for x in self.bamFilesList]
    genome_chunk_length = getGenomeChunkLength(bam_handlers, self.binLength)
    # check if both bam files correspond to the same species
    # by comparing the chromosome names:
    chrom_names_and_size, non_common = getCommonChrNames(bam_handlers, verbose=False)

    if self.region:
        # in case a region is used, append the tilesize
        self.region += ":{}".format(self.binLength)

    # report the current value of every instance attribute on stderr
    for x in self.__dict__.keys():
        sys.stderr.write("{}: {}\n".format(x, self.__getattribute__(x)))

    res = mapReduce.mapReduce([func_to_call, func_args],
                              writeBedGraph_wrapper,
                              chrom_names_and_size,
                              self_=self,
                              genomeChunkLength=genome_chunk_length,
                              region=self.region,
                              blackListFileName=blackListFileName,
                              numberOfProcessors=self.numberOfProcessors)

    # concatenate intermediary bedgraph files
    bedgraph_file = out_file_name + ".bg"
    with open(bedgraph_file, 'wb') as out_file:
        for tempfilename in res:
            if tempfilename:
                # concatenate all intermediate tempfiles into one
                # bedgraph file; the handle is closed before the
                # temporary file is removed
                with open(tempfilename, 'rb') as temp_file:
                    shutil.copyfileobj(temp_file, out_file)
                os.remove(tempfilename)

    # print(...) with a single argument behaves the same on Python 2 and 3
    if format == 'bedgraph':
        os.rename(bedgraph_file, out_file_name)
        if self.verbose:
            print("output file: {}".format(out_file_name))
    else:
        bedGraphToBigWig(
            chrom_names_and_size, bedgraph_file, out_file_name, True)
        if self.verbose:
            print("output file: {}".format(out_file_name))
        os.remove(bedgraph_file)
def get_read_and_fragment_length(bamFile, return_lengths=False, blackListFileName=None,
                                 binSize=50000, distanceBetweenBins=1000000,
                                 numberOfProcessors=None, verbose=False):
    """
    Estimates the fragment length and read length through sampling

    Parameters
    ----------
    bamFile : str
        BAM file name
    return_lengths : bool
        If true, the sampled lengths are added to each returned dictionary
        under the key ``'lengths'``.
    blackListFileName : str
        Forwarded unchanged to ``mapReduce.mapReduce``.
    binSize : int
        Added to the current distance between bins to obtain the genome
        chunk length handed to ``mapReduce.mapReduce``.
    distanceBetweenBins : int
        Initial distance between sampled bins. It is halved on every
        iteration until at least 1000 rows have been sampled or the
        distance is no longer greater than 1. With a value of 0 a single
        sampling pass with a chunk length of ``binSize`` is done.
    numberOfProcessors : int
    verbose : bool

    Returns
    -------
    (fragment_len_dict, read_len_dict) : tuple
        Two dictionaries with summary statistics (sample size, min, max,
        mean, median, std and the 25%/75% percentiles), the first for the
        fragment length and the second for the read length. Both are None
        if nothing was sampled; the first is None if the mean of the
        sampled fragment lengths is not positive.
    """
    def _sample(distance, chunk_length):
        # Run the fragment length workers and stack their results.
        return np.concatenate(mapReduce.mapReduce(
            (bam_filename, distance),
            getFragmentLength_wrapper,
            chrom_sizes,
            genomeChunkLength=chunk_length,
            blackListFileName=blackListFileName,
            numberOfProcessors=numberOfProcessors,
            verbose=verbose))

    def _summarize(lengths):
        # Summary statistics for one column of sampled lengths.
        summary = {'sample_size': len(lengths),
                   'min': lengths.min(),
                   'qtile25': np.percentile(lengths, 25),
                   'mean': np.mean(lengths),
                   'median': np.median(lengths),
                   'qtile75': np.percentile(lengths, 75),
                   'max': lengths.max(),
                   'std': np.std(lengths)}
        if return_lengths:
            summary['lengths'] = lengths
        return summary

    # Only the file name and the chromosome sizes are needed from the handle,
    # so release it before the sampling starts.
    bam_handle = bamHandler.openBam(bamFile)
    try:
        bam_filename = bam_handle.filename
        chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))
    finally:
        bam_handle.close()

    # The loop below halves the distance before its first use
    distanceBetweenBins *= 2
    fl = []

    # The loop below never runs for a distance of 0, so sample once here
    # instead of silently returning (None, None)
    if distanceBetweenBins == 0:
        fl = _sample(distanceBetweenBins, binSize)

    # Try to get at least 1000 rows, halving the distance as needed
    while len(fl) < 1000 and distanceBetweenBins > 1:
        distanceBetweenBins /= 2
        fl = _sample(distanceBetweenBins, binSize + distanceBetweenBins)

    if not len(fl):
        return None, None

    fragment_length = fl[:, 0]
    read_length = fl[:, 1]

    fragment_len_dict = None
    if fragment_length.mean() > 0:
        fragment_len_dict = _summarize(fragment_length)
    read_len_dict = _summarize(read_length)

    return fragment_len_dict, read_len_dict
def getScorePerBin(bigWigFiles, binLength,
                   numberOfProcessors=1,
                   verbose=False, region=None,
                   bedFile=None,
                   blackListFileName=None,
                   stepSize=None,
                   chrsToSkip=[],
                   out_file_for_raw_data=None,
                   allArgs=None):
    """
    This function returns a matrix containing scores (median) for the coverage
    of fragments within a region. Each row corresponds to a sampled region.
    Likewise, each column corresponds to a bigwig file.

    If ``out_file_for_raw_data`` (a file name) is given, the temporary files
    reported by the workers are concatenated into it and removed.
    ``allArgs`` is passed to ``deeptools.utilities.gtfOptions`` to obtain the
    GTF related options forwarded to ``mapReduce.mapReduce``.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()

    >>> np.transpose(getScorePerBin([test.bwFile1, test.bwFile2], 50, 3))
    array([[ 1.,  1.,  2.,  2.],
           [ 1.,  1.,  1.,  3.]])

    """
    # Try to determine an optimal fraction of the genome (chunkSize)
    # that is sent to workers for analysis. If too short, too much time
    # is spent loading the files
    # if too long, some processors end up free.
    # the following is a heuristic

    # get list of common chromosome names and sizes
    chrom_sizes, non_common = getChromSizes(bigWigFiles)
    # skip chromosome in the list. This is usually for the
    # X chromosome which may have either one copy in a male sample
    # or a mixture of male/female and is unreliable.
    # Also the skip may contain heterochromatic regions and
    # mitochondrial DNA
    if chrsToSkip and len(chrsToSkip):
        chrom_sizes = [x for x in chrom_sizes if x[0] not in chrsToSkip]

    chrnames, chrlengths = list(zip(*chrom_sizes))
    if stepSize is None:
        stepSize = binLength  # for adjacent bins

    # set chunksize based on number of processors used; floor division
    # keeps the chunk size an integer on Python 3 as well
    chunkSize = max(sum(chrlengths) // numberOfProcessors, int(1e6))
    # make chunkSize multiple of binLength
    chunkSize -= chunkSize % binLength
    if verbose:
        print("step size is {}".format(stepSize))

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(binLength)

    # the workers only write temporary files if the raw data is requested
    save_file = bool(out_file_for_raw_data)

    # Handle GTF options
    transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(allArgs)

    # mapReduce( (staticArgs), func, chromSize, etc. )
    imap_res = mapReduce.mapReduce((bigWigFiles, stepSize, binLength, save_file),
                                   countReadsInRegions_wrapper,
                                   chrom_sizes,
                                   genomeChunkLength=chunkSize,
                                   bedFile=bedFile,
                                   blackListFileName=blackListFileName,
                                   region=region,
                                   numberOfProcessors=numberOfProcessors,
                                   transcriptID=transcriptID,
                                   exonID=exonID,
                                   keepExons=keepExons,
                                   transcript_id_designator=transcript_id_designator)

    if out_file_for_raw_data:
        if len(non_common):
            sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                             "the chromosomes that were not common between the bigwig files\n")

        # concatenate intermediary bedgraph files
        with open(out_file_for_raw_data, "w") as ofile:
            for _values, tempFileName in imap_res:
                if tempFileName:
                    # concatenate all intermediate tempfiles into one
                    with open(tempFileName, 'r') as f:
                        shutil.copyfileobj(f, ofile)
                    os.remove(tempFileName)

    # the matrix scores are in the first element of each of the entries in imap_res
    score_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
    return score_per_bin
def getScorePerBin(bigWigFiles, binLength,
                   numberOfProcessors=1,
                   verbose=False, region=None,
                   bedFile=None,
                   blackListFileName=None,
                   stepSize=None,
                   chrsToSkip=[],
                   out_file_for_raw_data=None):
    """
    This function returns a matrix containing scores (median) for the coverage
    of fragments within a region. Each row corresponds to a sampled region.
    Likewise, each column corresponds to a bigwig file.

    If ``out_file_for_raw_data`` (an open, writable file object) is given,
    the temporary files reported by the workers are copied into it and
    removed, and the file object is closed afterwards.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()

    >>> np.transpose(getScorePerBin([test.bwFile1, test.bwFile2], 50, 3))
    array([[ 1.,  1.,  2.,  2.],
           [ 1.,  1.,  1.,  3.]])

    """
    # Try to determine an optimal fraction of the genome (chunkSize)
    # that is sent to workers for analysis. If too short, too much time
    # is spent loading the files
    # if too long, some processors end up free.
    # the following is a heuristic

    # get list of common chromosome names and sizes
    chrom_sizes, non_common = getChromSizes(bigWigFiles)
    # skip chromosome in the list. This is usually for the
    # X chromosome which may have either one copy in a male sample
    # or a mixture of male/female and is unreliable.
    # Also the skip may contain heterochromatic regions and
    # mitochondrial DNA
    if chrsToSkip and len(chrsToSkip):
        chrom_sizes = [x for x in chrom_sizes if x[0] not in chrsToSkip]

    chrnames, chrlengths = list(zip(*chrom_sizes))
    if stepSize is None:
        stepSize = binLength  # for adjacent bins

    # set chunksize based on number of processors used; floor division gives
    # the same integer result on Python 2 and Python 3
    chunkSize = max(sum(chrlengths) // numberOfProcessors, int(1e6))
    # make chunkSize multiple of binLength
    chunkSize -= chunkSize % binLength
    if verbose:
        # print(...) with a single argument behaves the same on Python 2 and 3
        print("step size is {}".format(stepSize))

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(binLength)

    # the workers only write temporary files if the raw data is requested
    save_file = bool(out_file_for_raw_data)

    # mapReduce( (staticArgs), func, chromSize, etc. )
    imap_res = mapReduce.mapReduce((bigWigFiles, stepSize, binLength, save_file),
                                   countReadsInRegions_wrapper,
                                   chrom_sizes,
                                   genomeChunkLength=chunkSize,
                                   bedFile=bedFile,
                                   blackListFileName=blackListFileName,
                                   region=region,
                                   numberOfProcessors=numberOfProcessors)

    if out_file_for_raw_data:
        if len(non_common):
            sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                             "the chromosomes that were not common between the bigwig files\n")

        # concatenate intermediary bedgraph files
        for _values, tempFileName in imap_res:
            if tempFileName:
                # concatenate all intermediate tempfiles into one; the handle
                # is closed before the temporary file is removed
                with open(tempFileName, 'r') as tempFile:
                    shutil.copyfileobj(tempFile, out_file_for_raw_data)
                os.remove(tempFileName)

        out_file_for_raw_data.close()

    # the matrix scores are in the first element of each of the entries in imap_res
    score_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
    return score_per_bin
def main(args=None):
    """
    Count, per BAM file, the reads overlapping each feature type of the
    regions in ``args.BED`` and plot and/or tabulate the result.

    The regions are loaded into the module level ``gtf`` object. The counts
    are produced per chunk by ``getEnrichment_worker`` (through
    ``mapReduce``) and summed per file and feature. A plot is made if
    ``args.plotFile`` is set; a tab separated table with the percentage and
    the raw counts is written if ``args.outRawCounts`` is set.

    Parameters
    ----------
    args : list
        Command line arguments handed to the argument parser.
    """
    global gtf

    args = parse_arguments().parse_args(args)

    if not args.outRawCounts and not args.plotFile:
        sys.exit("Error: You need to specify at least one of --plotFile or --outRawCounts!\n")

    if args.labels is None:
        args.labels = args.bamfiles
    if args.smartLabels:
        args.labels = smartLabels(args.bamfiles)
    if len(args.labels) != len(args.bamfiles):
        sys.exit("Error: The number of labels ({0}) does not match the number of BAM files ({1})!".format(len(args.labels), len(args.bamfiles)))

    # Ensure that if we're given an attributeKey that it's not empty.
    # (An empty string is falsy, so it has to be compared directly.)
    if args.attributeKey == "":
        args.attributeKey = None

    if not args.regionLabels and args.smartLabels:
        args.regionLabels = smartLabels(args.BED)
    gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels, attributeKey=args.attributeKey)

    # Get fragment size and chromosome dict
    fhs = [openBam(x) for x in args.bamfiles]
    chromSize, non_common_chr = getCommonChrNames(fhs, verbose=args.verbose)
    for fh in fhs:
        fh.close()

    frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bamfiles[0],
                                                                return_lengths=False,
                                                                blackListFileName=args.blackListFileName,
                                                                numberOfProcessors=args.numberOfProcessors,
                                                                verbose=args.verbose)
    if args.extendReads:
        if args.extendReads is True:
            # try to guess fragment length if the bam file contains paired end reads
            if frag_len_dict:
                defaultFragmentLength = frag_len_dict['median']
            else:
                sys.exit("*ERROR*: library is not paired-end. Please provide an extension length.")
            if args.verbose:
                print("Fragment length based on paired en data "
                      "estimated to be {0}".format(frag_len_dict['median']))
        elif args.extendReads < read_len_dict['median']:
            sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
                             "Reads will not be extended.\n".format(int(read_len_dict['median'])))
            defaultFragmentLength = 'read length'
        elif args.extendReads > 2000:
            sys.exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
        else:
            defaultFragmentLength = args.extendReads
    else:
        defaultFragmentLength = 'read length'

    # Get the chunkLength
    chunkLength = getChunkLength(args, chromSize)

    # Map reduce to get the counts/file/feature
    res = mapReduce([args, defaultFragmentLength],
                    getEnrichment_worker,
                    chromSize,
                    genomeChunkLength=chunkLength,
                    region=args.region,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    # The feature names are taken from the first result; start every file
    # with a zero count for each of them
    features = res[0][1]
    featureCounts = [dict.fromkeys(features, 0) for _ in args.bamfiles]

    # res is a list, with each element a list (length len(args.bamfiles)) of dicts
    totalCounts = [0] * len(args.bamfiles)
    for x in res:
        for i, y in enumerate(x[2]):
            totalCounts[i] += y
        for i, y in enumerate(x[0]):
            for k, v in y.items():
                featureCounts[i][k] += v

    # Make a plot
    if args.plotFile:
        plotEnrichment(args, featureCounts, totalCounts, features)

    # Raw counts
    if args.outRawCounts:
        with open(args.outRawCounts, "w") as of:
            of.write("file\tfeatureType\tpercent\tfeatureReadCount\ttotalReadCount\n")
            for i, x in enumerate(args.labels):
                for k, v in featureCounts[i].items():
                    of.write("{0}\t{1}\t{2:5.2f}\t{3}\t{4}\n".format(x, k, (100.0 * v) / totalCounts[i], v, totalCounts[i]))
def writeBedGraph(
        bamOrBwFileList, outputFileName, fragmentLength,
        func, funcArgs, tileSize=25, region=None, blackListFileName=None,
        numberOfProcessors=1, format="bedgraph", extendPairedEnds=True,
        missingDataAsZero=False, skipZeroOverZero=False, smoothLength=0,
        fixed_step=False, verbose=False):
    r"""
    Given a list of bam/bigwig files, a function and its arguments, write a
    bedgraph (or bigwig) file covering the genome in tiles of ``tileSize``,
    where the value of each tile is produced by the given function from the
    coverage underlying the tile.

    Parameters
    ----------
    bamOrBwFileList : list of (str, str)
        ``(file name, file format)`` pairs; the format is ``'bam'`` or
        ``'bigwig'``.
    outputFileName : str
        File that receives the final bedgraph or bigwig.
    fragmentLength, func, funcArgs, extendPairedEnds, smoothLength,
    skipZeroOverZero, missingDataAsZero, fixed_step
        Forwarded unchanged to ``writeBedGraph_wrapper`` through
        ``mapReduce``.
    tileSize : int
        Tile length; also appended to ``region`` when a region is given.
    region : str, optional
        Region restriction handed to ``mapReduce``.
    blackListFileName, numberOfProcessors, verbose
        Handed to ``mapReduce``.
    format : str
        ``'bedgraph'`` concatenates the temporary files into
        ``outputFileName``; any other value goes through
        ``bedGraphToBigWig``.
    """
    bamHandles = []
    mappedList = []
    for indexedFile, fileFormat in bamOrBwFileList:
        if fileFormat == 'bam':
            bam, mapped, _unmapped, _stats = bamHandler.openBam(
                indexedFile, returnStats=True, nThreads=numberOfProcessors)
            bamHandles.append(bam)
            mappedList.append(mapped)

    if bamHandles:
        genomeChunkLength = getGenomeChunkLength(bamHandles, tileSize, mappedList)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize, __ = getCommonChrNames(bamHandles, verbose=verbose)
    else:
        genomeChunkLength = int(10e6)
        # A set keeps the "seen in more than one file" test O(1) per name.
        cCommon = set()
        chromNamesAndSize = {}
        for fileName, fileFormat in bamOrBwFileList:
            if fileFormat != 'bigwig':
                continue
            fh = pyBigWig.open(fileName)
            try:
                for chromName, size in fh.chroms().items():
                    if chromName in chromNamesAndSize:
                        cCommon.add(chromName)
                        if chromNamesAndSize[chromName] != size:
                            print("\nWARNING\n"
                                  "Chromosome {} length reported in the "
                                  "input files differ.\n{} for {}\n"
                                  "{} for {}.\n\nThe smallest "
                                  "length will be used".format(
                                      chromName, chromNamesAndSize[chromName],
                                      bamOrBwFileList[0][0], size, fileName))
                            chromNamesAndSize[chromName] = min(
                                chromNamesAndSize[chromName], size)
                    else:
                        chromNamesAndSize[chromName] = size
            finally:
                # Release the bigWig handle even if reading it fails.
                fh.close()

        # get the list of common chromosome names and sizes
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.items()
                             if k in cCommon]

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce((tileSize, fragmentLength, bamOrBwFileList,
                               func, funcArgs, extendPairedEnds, smoothLength,
                               skipZeroOverZero, missingDataAsZero, fixed_step),
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              blackListFileName=blackListFileName,
                              numberOfProcessors=numberOfProcessors,
                              verbose=verbose)

    # Determine the sorted order of the temp files: the chromosome name is
    # replaced by its position in chromNamesAndSize so that a plain sort
    # orders the chunks by chromosome and then by the remaining fields.
    chrom_order = {chrom[0]: i for i, chrom in enumerate(chromNamesAndSize)}
    res = sorted([chrom_order[x[0]], x[1], x[2], x[3]] for x in res)

    if format == 'bedgraph':
        with open(outputFileName, 'wb') as of:
            for r in res:
                # r[3] is the temporary file written for this chunk.
                with open(r[3], 'rb') as tmp:
                    shutil.copyfileobj(tmp, of)
                os.remove(r[3])
    else:
        bedGraphToBigWig(chromNamesAndSize, [x[3] for x in res], outputFileName)
def run(self, allArgs=None):
    """Count reads for every input file and return the combined result.

    The input files are opened to determine the common chromosomes and the
    chunk length, the counting itself is delegated to ``mapReduce`` with
    ``countReadsInRegions_wrapper``, and the temporary files produced by
    the workers are concatenated into ``self.out_file_for_raw_data`` when
    that attribute is set.

    Parameters
    ----------
    allArgs : optional
        Passed to ``deeptools.utilities.gtfOptions`` to obtain the GTF
        related options.

    Returns
    -------
    numpy array
        The first element of every chunk result, concatenated along axis 0.

    Notes
    -----
    ``self.region`` is modified in place (the bin length is appended).
    The program exits through ``sys.exit`` when nothing can be
    concatenated.
    """
    bamFilesHandles = []
    for x in self.bamFilesList:
        try:
            y = bamHandler.openBam(x)
        except SystemExit:
            # An explicit exit from openBam must not fall through to the
            # bigWig fallback below.
            sys.exit(sys.exc_info()[1])
        except Exception:
            # Not readable as BAM: try bigWig. KeyboardInterrupt is no
            # longer swallowed here.
            y = pyBigWig.open(x)
        bamFilesHandles.append(y)

    chromsizes, non_common = deeptools.utilities.getCommonChrNames(
        bamFilesHandles, verbose=self.verbose)

    # skip chromosome in the list. This is usually for the
    # X chromosome which may have either one copy in a male sample
    # or a mixture of male/female and is unreliable.
    # Also the skip may contain heterochromatic regions and
    # mitochondrial DNA
    if len(self.chrsToSkip):
        chromsizes = [x for x in chromsizes if x[0] not in self.chrsToSkip]

    chrNames, chrLengths = list(zip(*chromsizes))

    genomeSize = sum(chrLengths)

    if self.bedFile is None:
        chunkSize = self.get_chunk_length(bamFilesHandles, genomeSize,
                                          chromsizes, chrLengths)
    else:
        chunkSize = None

    for bam_h in bamFilesHandles:
        bam_h.close()

    if self.verbose:
        print("step size is {}".format(self.stepSize))

    if self.region:
        # in case a region is used, append the tilesize
        self.region += ":{}".format(self.binLength)

    # Handle GTF options
    transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(
        allArgs)

    # use map reduce to call countReadsInRegions_wrapper
    imap_res = mapReduce.mapReduce(
        [],
        countReadsInRegions_wrapper,
        chromsizes,
        self_=self,
        genomeChunkLength=chunkSize,
        bedFile=self.bedFile,
        blackListFileName=self.blackListFileName,
        region=self.region,
        numberOfProcessors=self.numberOfProcessors,
        transcriptID=transcriptID,
        exonID=exonID,
        keepExons=keepExons,
        transcript_id_designator=transcript_id_designator)

    if self.out_file_for_raw_data:
        if len(non_common):
            sys.stderr.write(
                "*Warning*\nThe resulting bed file does not contain information for "
                "the chromosomes that were not common between the bigwig files\n"
            )

        # concatenate intermediary bedgraph files; the context managers
        # close both ends even when a copy fails part-way.
        with open(self.out_file_for_raw_data, "w") as ofile:
            for _values, tempFileName in imap_res:
                if tempFileName:
                    # concatenate all intermediate tempfiles into one
                    with open(tempFileName, 'r') as tmp:
                        shutil.copyfileobj(tmp, ofile)
                    os.remove(tempFileName)

    try:
        # np.concatenate raises ValueError when there is nothing to join.
        num_reads_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
        return num_reads_per_bin
    except ValueError:
        if self.bedFile:
            sys.exit(
                '\nNo coverage values could be computed.\n\n'
                'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            sys.exit(
                '\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                'contain mapped reads.')
def writeBedGraph(bamOrBwFileList, outputFileName, fragmentLength, func,
                  funcArgs, tileSize=25, region=None, blackListFileName=None,
                  numberOfProcessors=1, format="bedgraph",
                  extendPairedEnds=True, missingDataAsZero=False,
                  smoothLength=0, fixed_step=False, verbose=False):
    r"""
    Write a bedgraph (or bigwig) file for a set of bam/bigwig inputs.

    The genome is partitioned into tiles of ``tileSize`` and the value of
    each tile is computed by ``func`` (called with ``funcArgs``) inside the
    workers started by ``mapReduce``. The temporary file of every chunk is
    then merged into ``outputFileName``: plain concatenation for
    ``format == 'bedgraph'``, ``bedGraphToBigWig`` otherwise.
    """
    open_bams = []
    mapped_counts = []
    for path, kind in bamOrBwFileList:
        if kind != 'bam':
            continue
        handle, n_mapped, _n_unmapped, _stats = bamHandler.openBam(
            path, returnStats=True, nThreads=numberOfProcessors)
        open_bams.append(handle)
        mapped_counts.append(n_mapped)

    if len(open_bams) > 0:
        genomeChunkLength = getGenomeChunkLength(open_bams, tileSize,
                                                 mapped_counts)
        # The chromosome names of all bam files are compared to find the
        # ones they have in common.
        chromNamesAndSize, _ignored = getCommonChrNames(open_bams,
                                                        verbose=verbose)
    else:
        # No bam input: collect the chromosomes from the bigwig files.
        genomeChunkLength = int(10e6)
        shared_names = []
        size_by_chrom = {}
        for path, kind in bamOrBwFileList:
            if kind != 'bigwig':
                continue
            bw = pyBigWig.open(path)
            for chrom, length in bw.chroms().items():
                if chrom not in size_by_chrom:
                    size_by_chrom[chrom] = length
                    continue
                shared_names.append(chrom)
                known = size_by_chrom[chrom]
                if known != length:
                    print("\nWARNING\n"
                          "Chromosome {} length reported in the "
                          "input files differ.\n{} for {}\n"
                          "{} for {}.\n\nThe smallest "
                          "length will be used".format(
                              chrom, known, bamOrBwFileList[0][0],
                              length, path))
                    size_by_chrom[chrom] = min(known, length)
            bw.close()

        # Keep only the chromosomes seen in more than one file.
        chromNamesAndSize = [(name, length)
                             for name, length in size_by_chrom.items()
                             if name in shared_names]

    if region:
        # The tile size is appended to a user supplied region.
        region = region + ":{}".format(tileSize)

    worker_args = (tileSize, fragmentLength, bamOrBwFileList, func, funcArgs,
                   extendPairedEnds, smoothLength, missingDataAsZero,
                   fixed_step)
    res = mapReduce.mapReduce(worker_args,
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              blackListFileName=blackListFileName,
                              numberOfProcessors=numberOfProcessors,
                              verbose=verbose)

    # Replace each chromosome name by its rank in chromNamesAndSize so that
    # sorting puts the temporary files in output order.
    rank = {}
    for position, entry in enumerate(chromNamesAndSize):
        rank[entry[0]] = position
    ordered = [[rank[item[0]], item[1], item[2], item[3]] for item in res]
    ordered.sort()

    if format != 'bedgraph':
        bedGraphToBigWig(chromNamesAndSize, [item[3] for item in ordered],
                         outputFileName)
        return

    merged = open(outputFileName, 'wb')
    for item in ordered:
        temp_path = item[3]
        piece = open(temp_path, 'rb')
        shutil.copyfileobj(piece, merged)
        piece.close()
        os.remove(temp_path)
    merged.close()
def run(self, func_to_call, func_args, out_file_name, blackListFileName=None, format="bedgraph", smoothLength=0):
    r"""
    Write a bedgraph (or bigwig) file for the bam files of this object.

    The genome is split into tiles of ``self.binLength``; the workers
    started through ``mapReduce`` write one temporary bedgraph per chunk,
    and these are concatenated into ``out_file_name + ".bg"`` before being
    renamed or converted to the final output.

    Parameters
    ----------
    func_to_call : str
        function name to be called to convert the list of coverages
        computed for each bam file at each position into a single value.
    func_args : dict
        dict of arguments to pass to `func`. E.g. {'scaleFactor':1.0}
    out_file_name : str
        name of the file to save the resulting data.
    blackListFileName : str, optional
        handed to ``mapReduce``.
    format : str
        'bedgraph' keeps the concatenated file (renamed to
        ``out_file_name``); anything else converts it with
        ``bedGraphToBigWig`` and deletes the intermediate file.
    smoothLength : int
        Distance in bp for smoothing the coverage per tile; stored on the
        instance as ``smoothLength``.
    """
    self.__dict__["smoothLength"] = smoothLength

    handles = [bamHandler.openBam(bam_path) for bam_path in self.bamFilesList]
    chunk_length = getGenomeChunkLength(handles, self.binLength)
    # The chromosome names of the bam files are compared to find the ones
    # they share.
    chrom_sizes, non_common = getCommonChrNames(handles, verbose=False)

    if self.region:
        # The tile size is appended to a user supplied region.
        self.region += ":{}".format(self.binLength)

    # Dump every instance attribute to stderr.
    for attr_name in list(self.__dict__.keys()):
        sys.stderr.write("{}: {}\n".format(attr_name, self.__getattribute__(attr_name)))

    res = mapReduce.mapReduce([func_to_call, func_args],
                              writeBedGraph_wrapper,
                              chrom_sizes,
                              self_=self,
                              genomeChunkLength=chunk_length,
                              region=self.region,
                              blackListFileName=blackListFileName,
                              numberOfProcessors=self.numberOfProcessors)

    # Merge the per-chunk temporary files into one bedgraph file.
    merged = open(out_file_name + ".bg", 'wb')
    for temp_path in res:
        if not temp_path:
            continue
        piece = open(temp_path, 'rb')
        shutil.copyfileobj(piece, merged)
        piece.close()
        os.remove(temp_path)

    merged_name = merged.name
    merged.close()

    wants_bedgraph = format == 'bedgraph'
    if wants_bedgraph:
        os.rename(merged_name, out_file_name)
    else:
        bedGraphToBigWig(chrom_sizes, merged_name, out_file_name, True)

    if self.verbose:
        print("output file: {}".format(out_file_name))

    if not wants_bedgraph:
        # The intermediate bedgraph is only removed after the conversion.
        os.remove(merged_name)
def run(self, allArgs=None):
    """Count reads for every input file and return the combined result.

    Opens each entry of ``self.bamFilesList``, derives ``self.stepSize``
    (when it is None) and the chunk length sent to the workers, runs
    ``countReadsInRegions_wrapper`` through ``mapReduce`` and, when
    ``self.out_file_for_raw_data`` is set, concatenates the temporary files
    returned by the workers into that file.

    ``allArgs`` is passed to ``deeptools.utilities.gtfOptions``.

    Returns the first element of every chunk result concatenated along
    axis 0. Raises ``ValueError`` when the step size exceeds the mean
    chromosome length and no BED file is used; exits through ``sys.exit``
    when nothing can be concatenated. ``self.stepSize`` and
    ``self.region`` may be modified.
    """
    # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
    # workers for analysis. If too short, too much time is spend loading the files
    # if too long, some processors end up free.
    # the following values are empirical
    bamFilesHandlers = []
    for x in self.bamFilesList:
        try:
            y = bamHandler.openBam(x)
        except:
            # Any failure to open as BAM falls back to bigWig.
            # NOTE(review): the bare except also catches SystemExit and
            # KeyboardInterrupt - confirm whether openBam exits on non-BAM
            # input before narrowing it.
            y = pyBigWig.open(x)
        bamFilesHandlers.append(y)

    chromSizes, non_common = deeptools.utilities.getCommonChrNames(
        bamFilesHandlers, verbose=self.verbose)

    # skip chromosome in the list. This is usually for the
    # X chromosome which may have either one copy in a male sample
    # or a mixture of male/female and is unreliable.
    # Also the skip may contain heterochromatic regions and
    # mitochondrial DNA
    if len(self.chrsToSkip):
        chromSizes = [x for x in chromSizes if x[0] not in self.chrsToSkip]

    chrNames, chrLengths = list(zip(*chromSizes))

    genomeSize = sum(chrLengths)

    if self.stepSize is None:
        if self.region is None:
            # Spread numberOfSamples evenly over the genome (at least 1 bp).
            self.stepSize = max(
                int(float(genomeSize) / self.numberOfSamples), 1)
        else:
            # compute the step size, based on the number of samples
            # and the length of the region studied
            (chrom, start, end) = mapReduce.getUserRegion(chromSizes, self.region)[:3]
            self.stepSize = max(
                int(float(end - start) / self.numberOfSamples), 1)

    # number of samples is better if large
    if np.mean(chrLengths) < self.stepSize and self.bedFile is None:
        min_num_of_samples = int(genomeSize / np.mean(chrLengths))
        raise ValueError(
            "numberOfSamples has to be bigger than {} ".format(
                min_num_of_samples))

    max_mapped = []
    for x in bamFilesHandlers:
        try:
            max_mapped.append(x.mapped)
        except:
            # bigWig, use a fixed value
            # (presumably because bigWig handles have no `mapped`
            # attribute - TODO confirm)
            max_mapped.append(0)
    max_mapped = max(max_mapped)

    # If max_mapped is 0 (i.e., bigWig input), set chunkSize to a multiple of binLength and use every bin
    if max_mapped == 0:
        chunkSize = 10000 * self.binLength
        self.stepSize = self.binLength
    else:
        # Denser coverage or more files give a smaller chunk.
        reads_per_bp = float(max_mapped) / genomeSize
        chunkSize = int(self.stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers)))
    # The handles were only needed for the sizing above.
    [bam_h.close() for bam_h in bamFilesHandlers]

    # Ensure that chunkSize is always at least self.stepSize
    if chunkSize < self.stepSize:
        chunkSize = self.stepSize

    if self.verbose:
        print("step size is {}".format(self.stepSize))

    if self.region:
        # in case a region is used, append the tilesize
        self.region += ":{}".format(self.binLength)

    # Handle GTF options
    transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(
        allArgs)

    # use map reduce to call countReadsInRegions_wrapper
    imap_res = mapReduce.mapReduce(
        [],
        countReadsInRegions_wrapper,
        chromSizes,
        self_=self,
        genomeChunkLength=chunkSize,
        bedFile=self.bedFile,
        blackListFileName=self.blackListFileName,
        region=self.region,
        numberOfProcessors=self.numberOfProcessors,
        transcriptID=transcriptID,
        exonID=exonID,
        keepExons=keepExons,
        transcript_id_designator=transcript_id_designator)

    if self.out_file_for_raw_data:
        if len(non_common):
            sys.stderr.write(
                "*Warning*\nThe resulting bed file does not contain information for "
                "the chromosomes that were not common between the bigwig files\n"
            )

        # concatenate intermediary bedgraph files
        ofile = open(self.out_file_for_raw_data, "w")
        for _values, tempFileName in imap_res:
            if tempFileName:
                # concatenate all intermediate tempfiles into one
                _foo = open(tempFileName, 'r')
                shutil.copyfileobj(_foo, ofile)
                _foo.close()
                os.remove(tempFileName)

        ofile.close()

    try:
        # np.concatenate raises ValueError when the list is empty, which is
        # turned into a readable exit message below.
        num_reads_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
        return num_reads_per_bin
    except ValueError:
        if self.bedFile:
            sys.exit(
                '\nNo coverage values could be computed.\n\n'
                'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            sys.exit(
                '\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                'contain mapped reads.')
def main(args=None):
    """Count reads per feature type for each BAM file and report them.

    The per-chunk counts come from ``mapReduce`` calling
    ``getEnrichment_worker``; the summed counts are plotted
    (``--plotFile``) and/or written as a tab-separated table
    (``--outRawCounts``).

    Parameters
    ----------
    args : list of str, optional
        Command-line arguments, handed unchanged to ``parse_args``.

    Notes
    -----
    Terminates through ``sys.exit`` when neither output is requested, when
    the number of labels differs from the number of BAM files, or when the
    read-extension settings cannot be used.
    """
    args = parse_arguments().parse_args(args)

    if not args.outRawCounts and not args.plotFile:
        sys.exit(
            "Error: You need to specify at least one of --plotFile or --outRawCounts!\n"
        )

    if args.labels is None:
        args.labels = args.bamfiles
    if len(args.labels) != len(args.bamfiles):
        sys.exit(
            "Error: The number of labels ({0}) does not match the number of BAM files ({1})!"
            .format(len(args.labels), len(args.bamfiles)))

    # Get fragment size and chromosome dict
    fhs = [openBam(x) for x in args.bamfiles]
    try:
        chromSize, non_common_chr = getCommonChrNames(fhs, verbose=args.verbose)
    finally:
        # The handles are only needed for the chromosome comparison.
        for fh in fhs:
            fh.close()

    frag_len_dict, read_len_dict = get_read_and_fragment_length(
        args.bamfiles[0],
        return_lengths=False,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose)
    if args.extendReads:
        if args.extendReads is True:
            # try to guess fragment length if the bam file contains paired end reads
            if frag_len_dict:
                defaultFragmentLength = frag_len_dict['median']
            else:
                sys.exit(
                    "*ERROR*: library is not paired-end. Please provide an extension length."
                )
            if args.verbose:
                print("Fragment length based on paired en data "
                      "estimated to be {0}".format(frag_len_dict['median']))
        elif args.extendReads < read_len_dict['median']:
            sys.stderr.write(
                "*WARNING*: read extension is smaller than read length (read length = {}). "
                "Reads will not be extended.\n".format(
                    int(read_len_dict['median'])))
            defaultFragmentLength = 'read length'
        elif args.extendReads > 2000:
            sys.exit(
                "*ERROR*: read extension must be smaller that 2000. Value give: {} "
                .format(args.extendReads))
        else:
            defaultFragmentLength = args.extendReads
    else:
        defaultFragmentLength = 'read length'

    # Get the chunkLength
    chunkLength = getChunkLength(args, chromSize)

    # Map reduce to get the counts/file/feature
    res = mapReduce([args, defaultFragmentLength],
                    getEnrichment_worker,
                    chromSize,
                    genomeChunkLength=chunkLength,
                    region=args.region,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    # Every chunk result carries the feature list at index 1; the first one
    # is used to initialise one zeroed counter dict per BAM file.
    features = res[0][1]
    featureCounts = [dict.fromkeys(features, 0) for _ in args.bamfiles]

    # res is a list, with each element a list (length len(args.bamfiles)) of dicts
    totalCounts = [0] * len(args.bamfiles)
    for x in res:
        for i, y in enumerate(x[2]):
            totalCounts[i] += y
        for i, y in enumerate(x[0]):
            for k, v in y.items():
                featureCounts[i][k] += v

    # Make a plot
    if args.plotFile:
        plotEnrichment(args, featureCounts, totalCounts, features)

    # Raw counts
    if args.outRawCounts:
        # The context manager closes the table even if a row fails to
        # format (for instance a division by a zero total count).
        with open(args.outRawCounts, "w") as of:
            of.write("file\tfeatureType\tpercent\n")
            for i, x in enumerate(args.labels):
                for k, v in featureCounts[i].items():
                    of.write("{0}\t{1}\t{2:5.2f}\n".format(x, k, (100.0 * v) / totalCounts[i]))
def writeBedGraph(
        bamOrBwFileList, outputFileName, fragmentLength,
        func, funcArgs, tileSize=25, region=None, blackListFileName=None,
        numberOfProcessors=None, format="bedgraph", extendPairedEnds=True,
        missingDataAsZero=False, smoothLength=0, fixed_step=False,
        verbose=False):
    r"""
    Write a bedgraph (or bigwig) file for a set of bam/bigwig inputs.

    The genome is partitioned into tiles of ``tileSize`` and the value of
    each tile is computed by ``func`` (called with ``funcArgs``) inside the
    workers started by ``mapReduce``. The temporary files they return are
    concatenated into ``outputFileName + ".bg"``, which is then renamed
    (``format == 'bedgraph'``) or converted with ``bedGraphToBigWig`` and
    removed.
    """
    open_bams = []
    for path, kind in bamOrBwFileList:
        if kind == 'bam':
            open_bams.append(bamHandler.openBam(path))

    if len(open_bams) > 0:
        genomeChunkLength = getGenomeChunkLength(open_bams, tileSize)
        # The chromosome names of all bam files are compared to find the
        # ones they have in common.
        chromNamesAndSize, _ignored = getCommonChrNames(open_bams,
                                                        verbose=verbose)
    else:
        # No bam input: collect the chromosomes from the bigwig files.
        genomeChunkLength = int(10e6)
        shared_names = []
        size_by_chrom = {}
        for path, kind in bamOrBwFileList:
            if kind != 'bigwig':
                continue
            bw = pyBigWig.open(path)
            for chrom, length in bw.chroms().items():
                if chrom not in size_by_chrom:
                    size_by_chrom[chrom] = length
                    continue
                shared_names.append(chrom)
                known = size_by_chrom[chrom]
                if known != length:
                    print("\nWARNING\n"
                          "Chromosome {} length reported in the "
                          "input files differ.\n{} for {}\n"
                          "{} for {}.\n\nThe smallest "
                          "length will be used".format(
                              chrom, known, bamOrBwFileList[0][0],
                              length, path))
                    size_by_chrom[chrom] = min(known, length)
            bw.close()

        # Keep only the chromosomes seen in more than one file.
        chromNamesAndSize = [(name, length)
                             for name, length in size_by_chrom.items()
                             if name in shared_names]

    if region:
        # The tile size is appended to a user supplied region.
        region = region + ":{}".format(tileSize)

    worker_args = (tileSize, fragmentLength, bamOrBwFileList, func, funcArgs,
                   extendPairedEnds, smoothLength, missingDataAsZero,
                   fixed_step)
    res = mapReduce.mapReduce(worker_args,
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              blackListFileName=blackListFileName,
                              numberOfProcessors=numberOfProcessors,
                              verbose=verbose)

    # Merge the per-chunk temporary files into one bedgraph file.
    merged = open(outputFileName + ".bg", 'wb')
    for temp_path in res:
        if not temp_path:
            continue
        piece = open(temp_path, 'rb')
        shutil.copyfileobj(piece, merged)
        piece.close()
        os.remove(temp_path)

    merged_name = merged.name
    merged.close()

    wants_bedgraph = format == 'bedgraph'
    if wants_bedgraph:
        os.rename(merged_name, outputFileName)
    else:
        bedGraphToBigWig(
            chromNamesAndSize, merged_name, outputFileName, True)

    # NOTE(review): `debug` is not defined inside this function; presumably
    # a module-level flag - confirm it exists, otherwise this raises
    # NameError.
    if debug:
        print("output file: %s" % (outputFileName))

    if not wants_bedgraph:
        # The intermediate bedgraph is only removed after the conversion.
        os.remove(merged_name)