def getChunkLength(args, chromSize):
    """
    There's no point in parsing the GTF over and over again needlessly.
    Empirically, it seems that using ~4x the number of workers is ideal, since
    coverage is non-uniform. This is a heuristic way of approximating that.

    Note that if there are MANY small contigs and a few large ones (e.g., the
    max and median lengths are >10x different), then it's best to take a
    different tack.
    """
    if args.region:
        chromSize, region_start, region_end, genomeChunkLength = getUserRegion(chromSize, args.region)
        rv = np.ceil((region_end - region_start) / float(4 * args.numberOfProcessors)).astype(int)
        return max(1, rv)

    bl = None
    if args.blackListFileName:
        bl = GTF(args.blackListFileName)

    lengths = []
    for k, v in chromSize:
        regs = blSubtract(bl, k, [0, v])
        for reg in regs:
            lengths.append(reg[1] - reg[0])

    if len(lengths) >= 4 * args.numberOfProcessors:
        rv = np.median(lengths).astype(int)
        # In cases like dm6 or GRCh38, there are a LOT of really small
        # contigs, which will cause the median to be small and performance
        # to tank.
        if np.max(lengths) >= 10 * rv:
            rv = np.ceil(np.sum(lengths) / (4.0 * args.numberOfProcessors)).astype(int)
    else:
        rv = np.ceil(np.sum(lengths) / (4.0 * args.numberOfProcessors)).astype(int)

    return max(1, rv)
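# A minimal usage sketch for getChunkLength() (the Namespace fields and toy
# contig sizes are invented for illustration; this assumes blSubtract()
# returns the whole [0, length] interval when no blacklist is given).
# One huge contig plus many tiny ones makes max >> 10 * median, so the
# heuristic falls back to total_length / (4 * workers) rather than the
# (tiny) median contig length.
import argparse

toy_args = argparse.Namespace(region=None, blackListFileName=None,
                              numberOfProcessors=4)
toy_chromSize = [("chr1", 10000000)] + [("scaffold{}".format(i), 5000)
                                        for i in range(50)]
print(getChunkLength(toy_args, toy_chromSize))  # 640625, not 5000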
def get_chunk_length(self, bamFilesHandles, genomeSize, chromSizes, chrLengths):
    # Try to determine an optimal fraction of the genome (chunkSize) that is
    # sent to workers for analysis. If too short, too much time is spent
    # loading the files; if too long, some processors end up idle.
    # The following values are empirical.
    if self.stepSize is None:
        if self.region is None:
            self.stepSize = max(int(float(genomeSize) / self.numberOfSamples), 1)
        else:
            # compute the step size, based on the number of samples
            # and the length of the region studied
            (chrom, start, end) = mapReduce.getUserRegion(chromSizes, self.region)[:3]
            self.stepSize = max(int(float(end - start) / self.numberOfSamples), 1)

    # the step size should not exceed the mean chromosome length, which
    # requires a sufficiently large numberOfSamples
    if np.mean(chrLengths) < self.stepSize and self.bedFile is None:
        min_num_of_samples = int(genomeSize / np.mean(chrLengths))
        raise ValueError("numberOfSamples has to be bigger than {}".format(min_num_of_samples))

    max_mapped = 0
    if len(self.mappedList) > 0:
        max_mapped = max(self.mappedList)

    # If max_mapped is 0 (i.e., bigWig input), set chunkSize to a multiple of
    # binLength and use every bin
    if max_mapped == 0:
        chunkSize = 10000 * self.binLength
        self.stepSize = self.binLength
    else:
        reads_per_bp = float(max_mapped) / genomeSize
        chunkSize = int(self.stepSize * 1e3 / (reads_per_bp * len(bamFilesHandles)))

    # Ensure that chunkSize is always at least self.stepSize
    if chunkSize < self.stepSize:
        chunkSize = self.stepSize

    # Ensure that chunkSize is always at least self.binLength
    if self.binLength and chunkSize < self.binLength:
        chunkSize = self.binLength

    return chunkSize
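# A worked, self-contained sketch of the empirical formula above
# (_toy_chunk_size and all the numbers are invented for illustration, not
# part of the original module): the chunk grows when coverage is sparse,
# shrinks when it is deep, and is clamped to at least stepSize/binLength.
def _toy_chunk_size(genome_size, number_of_samples, max_mapped, n_files, bin_length):
    step_size = max(int(float(genome_size) / number_of_samples), 1)
    reads_per_bp = float(max_mapped) / genome_size
    chunk_size = int(step_size * 1e3 / (reads_per_bp * n_files))
    return max(chunk_size, step_size, bin_length)

# ~3 Gb genome, 1e6 sample points, 4e8 mapped reads across 2 BAM files:
print(_toy_chunk_size(3e9, 1e6, 4e8, 2, 50))  # 11250000 bp per chunk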
def main(args=None):
    args = process_args(args)
    global F_gc, N_gc, R_gc

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)

    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # compute the probability of finding more than one read (a redundant
    # read) at a certain position, based on the GC content of the read
    # fragment; the binomial distribution is used for that
    max_dup_gc = [binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x])
                  if F_gc[x] > 0 and N_gc[x] > 0 else 1
                  for x in range(len(F_gc))]

    global_vars['max_dup_gc'] = max_dup_gc

    tbit = twobit.TwoBitFile(global_vars['2bit'])
    bam = pysam.Samfile(global_vars['bam'])

    global_vars['genome_size'] = sum(tbit.sequence_sizes().values())
    global_vars['total_reads'] = bam.mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print("applying correction")
    # divide the genome into fragments containing about 4e5 reads.
    # This number of reads takes about 20 seconds
    # to process per core (48 cores, 256 Gb memory)
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of (chromosome name, length) tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print("genome partition size for multiprocessing: {}".format(chunkSize))
    print("using region {}".format(args.region))
    mp_args = []
    bedGraphStep = args.binSize
    chrNameBitToBam = tbitToBamChrName(list(tbit.sequence_sizes().keys()),
                                       bam.references)
    chrNameBamToBit = {v: k for k, v in chrNameBitToBam.items()}
    print(chrNameBitToBam, chrNameBamToBit)
    c = 1
    for chrom, size in chromSizes:
        start = 0 if regionStart == 0 else regionStart
        for i in range(start, size, chunkSize):
            try:
                chrNameBamToBit[chrom]
            except KeyError:
                print("no sequence information for "
                      "chromosome {} in 2bit file".format(chrom))
                print("Reads in this chromosome will be skipped")
                continue
            length = min(size, i + chunkSize)
            mp_args.append((chrom, chrNameBamToBit[chrom], i, length,
                            bedGraphStep))
            c += 1

    pool = multiprocessing.Pool(args.numberOfProcessors)

    if args.correctedFile.name.endswith('bam'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            print("using {} processors for {} tasks".format(
                args.numberOfProcessors, len(mp_args)))
            res = pool.map_async(writeCorrectedSam_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrectedSam_wrapper, mp_args))

        if len(res) == 1:
            command = "cp {} {}".format(res[0], args.correctedFile.name)
            run_shell_command(command)
        else:
            print("concatenating (sorted) intermediate BAMs")
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb", template=header)
            header.close()
            for f in res:
                f = pysam.Samfile(f)
                for e in f.fetch(until_eof=True):
                    of.write(e)
                f.close()
            of.close()

        print("indexing BAM")
        pysam.index(args.correctedFile.name)

        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith('bg') or \
            args.correctedFile.name.endswith('bw'):
        _temp_bg_file_name = utilities.getTempFileName(suffix='_all.bg')
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            res = pool.map_async(writeCorrected_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrected_wrapper, mp_args))

        _temp_bg_file = open(_temp_bg_file_name, 'wb')
        for tempFileName in res:
            if tempFileName:
                # concatenate all intermediate tempfiles into one bedGraph file
                shutil.copyfileobj(open(tempFileName, 'rb'), _temp_bg_file)
                os.remove(tempFileName)
        _temp_bg_file.close()
        args.correctedFile.close()

        if args.correctedFile.name.endswith('bg'):
            shutil.move(_temp_bg_file_name, args.correctedFile.name)
        else:
            chromSizes = [(k, v) for k, v in tbit.sequence_sizes().items()]
            writeBedGraph.bedGraphToBigWig(chromSizes, _temp_bg_file_name,
                                           args.correctedFile.name)
            os.remove(_temp_bg_file_name)
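# A minimal, self-contained sketch of the partitioning loop above (toy
# numbers; _toy_partition is illustrative, not part of the original module):
# each chromosome is walked in chunkSize steps and every (chrom, start, end)
# piece becomes one multiprocessing task.
def _toy_partition(chrom_sizes, chunk_size):
    tasks = []
    for chrom, size in chrom_sizes:
        for start in range(0, size, chunk_size):
            tasks.append((chrom, start, min(size, start + chunk_size)))
    return tasks

print(_toy_partition([("chr1", 2500), ("chr2", 900)], 1000))
# [('chr1', 0, 1000), ('chr1', 1000, 2000), ('chr1', 2000, 2500),
#  ('chr2', 0, 900)]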
def main(args=None):
    args = process_args(args)
    global F_gc, N_gc, R_gc

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)

    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # compute the probability of finding more than one read (a redundant
    # read) at a certain position, based on the GC content of the read
    # fragment; the binomial distribution is used for that
    max_dup_gc = [binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x])
                  if F_gc[x] > 0 and N_gc[x] > 0 else 1
                  for x in range(len(F_gc))]

    global_vars['max_dup_gc'] = max_dup_gc

    tbit = py2bit.open(global_vars['2bit'])
    bam, mapped, unmapped, stats = openBam(args.bamfile, returnStats=True,
                                           nThreads=args.numberOfProcessors)

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print("applying correction")
    # divide the genome into fragments containing about 4e5 reads.
    # This number of reads takes about 20 seconds
    # to process per core (48 cores, 256 Gb memory)
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of (chromosome name, length) tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print("genome partition size for multiprocessing: {}".format(chunkSize))
    print("using region {}".format(args.region))
    mp_args = []
    bedGraphStep = args.binSize
    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()),
                                       bam.references)
    chrNameBamToBit = {v: k for k, v in chrNameBitToBam.items()}
    print(chrNameBitToBam, chrNameBamToBit)
    c = 1
    for chrom, size in chromSizes:
        start = 0 if regionStart == 0 else regionStart
        for i in range(start, size, chunkSize):
            try:
                chrNameBamToBit[chrom]
            except KeyError:
                print("no sequence information for "
                      "chromosome {} in 2bit file".format(chrom))
                print("Reads in this chromosome will be skipped")
                continue
            length = min(size, i + chunkSize)
            mp_args.append((chrom, chrNameBamToBit[chrom], i, length,
                            bedGraphStep))
            c += 1

    pool = multiprocessing.Pool(args.numberOfProcessors)

    if args.correctedFile.name.endswith('bam'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            print("using {} processors for {} tasks".format(
                args.numberOfProcessors, len(mp_args)))
            res = pool.map_async(writeCorrectedSam_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrectedSam_wrapper, mp_args))

        if len(res) == 1:
            command = "cp {} {}".format(res[0], args.correctedFile.name)
            run_shell_command(command)
        else:
            print("concatenating (sorted) intermediate BAMs")
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb", template=header)
            header.close()
            for f in res:
                f = pysam.Samfile(f)
                for e in f.fetch(until_eof=True):
                    of.write(e)
                f.close()
            of.close()

        print("indexing BAM")
        pysam.index(args.correctedFile.name)

        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith('bg') or \
            args.correctedFile.name.endswith('bw'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            res = pool.map_async(writeCorrected_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrected_wrapper, mp_args))

        oname = args.correctedFile.name
        args.correctedFile.close()
        if oname.endswith('bg'):
            # concatenate the intermediate bedGraph files into the output
            f = open(oname, 'wb')
            for tempFileName in res:
                if tempFileName:
                    shutil.copyfileobj(open(tempFileName, 'rb'), f)
                    os.remove(tempFileName)
            f.close()
        else:
            chromSizes = [(k, v) for k, v in tbit.chroms().items()]
            writeBedGraph.bedGraphToBigWig(chromSizes, res, oname)
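# A self-contained sketch of the max_dup_gc computation used in both
# versions of main() above: for each GC bin, binom.isf(1e-7, n, p) returns
# the smallest duplicate count whose tail probability under Binomial(n, p)
# drops below 1e-7, i.e. a per-position cap on identical reads. The
# F_gc/N_gc values below are invented for illustration.
import numpy as np
from scipy.stats import binom

F_gc_demo = np.array([0, 150, 4000, 90000])  # fragments observed per GC bin
N_gc_demo = np.array([0., 1e4, 2e5, 3e6])    # genomic positions per GC bin
max_dup_demo = [binom.isf(1e-7, F_gc_demo[x], 1.0 / N_gc_demo[x])
                if F_gc_demo[x] > 0 and N_gc_demo[x] > 0 else 1
                for x in range(len(F_gc_demo))]
print(max_dup_demo)  # small caps per bin; empty GC bins fall back to 1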
def run(self, allArgs=None):
    # Try to determine an optimal fraction of the genome (chunkSize) that is
    # sent to workers for analysis. If too short, too much time is spent
    # loading the files; if too long, some processors end up idle.
    # The following values are empirical.
    bamFilesHandlers = []
    for x in self.bamFilesList:
        try:
            y = bamHandler.openBam(x)
        except:
            y = pyBigWig.open(x)
        bamFilesHandlers.append(y)
    chromSizes, non_common = deeptools.utilities.getCommonChrNames(bamFilesHandlers,
                                                                   verbose=self.verbose)

    # skip chromosomes in the list. This is usually done for the
    # X chromosome, which may have either one copy in a male sample
    # or a mixture of male/female copies and is therefore unreliable.
    # The skip list may also contain heterochromatic regions and
    # mitochondrial DNA
    if len(self.chrsToSkip):
        chromSizes = [x for x in chromSizes if x[0] not in self.chrsToSkip]

    chrNames, chrLengths = list(zip(*chromSizes))

    genomeSize = sum(chrLengths)
    if self.stepSize is None:
        if self.region is None:
            self.stepSize = max(int(float(genomeSize) / self.numberOfSamples), 1)
        else:
            # compute the step size, based on the number of samples
            # and the length of the region studied
            (chrom, start, end) = mapReduce.getUserRegion(chromSizes, self.region)[:3]
            self.stepSize = max(int(float(end - start) / self.numberOfSamples), 1)

    # the step size should not exceed the mean chromosome length, which
    # requires a sufficiently large numberOfSamples
    if np.mean(chrLengths) < self.stepSize and self.bedFile is None:
        min_num_of_samples = int(genomeSize / np.mean(chrLengths))
        raise ValueError("numberOfSamples has to be bigger than {}".format(min_num_of_samples))

    max_mapped = []
    for x in bamFilesHandlers:
        try:
            max_mapped.append(x.mapped)
        except:
            # bigWig input, use a fixed value
            max_mapped.append(0)
    max_mapped = max(max_mapped)

    # If max_mapped is 0 (i.e., bigWig input), set chunkSize to a multiple of
    # binLength and use every bin
    if max_mapped == 0:
        chunkSize = 10000 * self.binLength
        self.stepSize = self.binLength
    else:
        reads_per_bp = float(max_mapped) / genomeSize
        chunkSize = int(self.stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers)))
    for bam_h in bamFilesHandlers:
        bam_h.close()

    # Ensure that chunkSize is always at least self.stepSize
    if chunkSize < self.stepSize:
        chunkSize = self.stepSize

    if self.verbose:
        print("step size is {}".format(self.stepSize))

    if self.region:
        # in case a region is used, append the tile size
        self.region += ":{}".format(self.binLength)

    # Handle GTF options
    transcriptID, exonID, transcript_id_designator, keepExons = \
        deeptools.utilities.gtfOptions(allArgs)

    # use map reduce to call countReadsInRegions_wrapper
    imap_res = mapReduce.mapReduce([],
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   self_=self,
                                   genomeChunkLength=chunkSize,
                                   bedFile=self.bedFile,
                                   blackListFileName=self.blackListFileName,
                                   region=self.region,
                                   numberOfProcessors=self.numberOfProcessors,
                                   transcriptID=transcriptID,
                                   exonID=exonID,
                                   keepExons=keepExons,
                                   transcript_id_designator=transcript_id_designator)

    if self.out_file_for_raw_data:
        if len(non_common):
            sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                             "the chromosomes that were not common between the bigwig files\n")

        # concatenate intermediate bedGraph files
        ofile = open(self.out_file_for_raw_data, "w")
        for _values, tempFileName in imap_res:
            if tempFileName:
                # concatenate all intermediate tempfiles into one
                _foo = open(tempFileName, 'r')
                shutil.copyfileobj(_foo, ofile)
                _foo.close()
                os.remove(tempFileName)

        ofile.close()

    try:
        num_reads_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
        return num_reads_per_bin
    except ValueError:
        if self.bedFile:
            sys.exit('\nNo coverage values could be computed.\n\n'
                     'Please check that the chromosome names in the BED file are found in the BAM files.\n\n'
                     'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            sys.exit('\nNo coverage values could be computed.\n\n'
                     'Check that all BAM files are valid and contain mapped reads.')
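# A minimal sketch of the reduce step in run() above (toy numbers, two
# samples; this assumes each worker returns a bins-by-samples count matrix,
# as the `x[0] for x in imap_res` concatenation implies): np.concatenate
# stacks the per-chunk matrices row-wise into num_reads_per_bin.
import numpy as np

chunk_a = np.array([[10., 12.], [0., 3.]])  # 2 bins from one genome chunk
chunk_b = np.array([[7., 7.]])              # 1 bin from another chunk
num_reads_per_bin = np.concatenate([chunk_a, chunk_b], axis=0)
print(num_reads_per_bin.shape)  # (3, 2): 3 bins x 2 samples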