def bam_blacklisted_reads(bam_handle, chroms_to_ignore, blackListFileName=None, numberOfProcessors=1):
    blacklisted = 0
    if blackListFileName is None:
        return blacklisted

    # Get the chromosome lengths
    chromLens = {}
    lines = pysam.idxstats(bam_handle.filename)
    lines = toString(lines)
    if type(lines) is str:
        lines = lines.strip().split('\n')
    for line in lines:
        chrom, _len, nmapped, _nunmapped = line.split('\t')
        chromLens[chrom] = int(_len)

    bl = GTF(blackListFileName)
    regions = []
    for chrom in bl.chroms:
        if (not chroms_to_ignore or chrom not in chroms_to_ignore) and chrom in chromLens:
            for reg in bl.findOverlaps(chrom, 0, chromLens[chrom]):
                regions.append([bam_handle.filename, chrom, reg[0], reg[1]])

    if len(regions) > 0:
        import multiprocessing
        if len(regions) > 1 and numberOfProcessors > 1:
            pool = multiprocessing.Pool(numberOfProcessors)
            res = pool.map_async(bam_blacklisted_worker, regions).get(9999999)
        else:
            res = [bam_blacklisted_worker(x) for x in regions]
        for val in res:
            blacklisted += val

    return blacklisted
def bam_blacklisted_reads(bam_handle, chroms_to_ignore, blackListFileName=None, numberOfProcessors=1):
    blacklisted = 0
    if blackListFileName is None:
        return blacklisted

    # Get the chromosome lengths
    chromLens = {x: y for x, y in zip(bam_handle.references, bam_handle.lengths)}

    bl = GTF(blackListFileName)
    hasOverlaps, minOverlap = bl.hasOverlaps(returnDistance=True)
    if hasOverlaps:
        sys.exit("Your blacklist file(s) has (have) regions that overlap. Proceeding with such a file would result in deepTools incorrectly calculating scaling factors. As such, you MUST fix this issue before being able to proceed.\n")
    if minOverlap < 1000:
        sys.stderr.write("WARNING: The minimum distance between intervals in your blacklist is {}. It makes little biological sense to include small regions between two blacklisted regions. Instead, these should likely be blacklisted as well.\n".format(minOverlap))

    regions = []
    for chrom in bl.chroms:
        if (not chroms_to_ignore or chrom not in chroms_to_ignore) and chrom in chromLens:
            for reg in bl.findOverlaps(chrom, 0, chromLens[chrom]):
                regions.append([bam_handle.filename, chrom, reg[0], reg[1]])

    if len(regions) > 0:
        import multiprocessing
        if len(regions) > 1 and numberOfProcessors > 1:
            pool = multiprocessing.Pool(numberOfProcessors)
            res = pool.map_async(bam_blacklisted_worker, regions).get(9999999)
        else:
            res = [bam_blacklisted_worker(x) for x in regions]
        for val in res:
            blacklisted += val

    return blacklisted
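# A minimal usage sketch for bam_blacklisted_reads() (not part of the
# original module): the BAM and BED file names are hypothetical, and pysam,
# GTF, and bam_blacklisted_worker are assumed to be importable in this scope.
def _example_count_blacklisted():
    import pysam
    bam = pysam.AlignmentFile("example.bam")  # hypothetical, must be indexed
    # Count reads falling in blacklisted regions, ignoring chrM, on 4 cores
    n = bam_blacklisted_reads(bam, ["chrM"], blackListFileName="blacklist.bed",
                              numberOfProcessors=4)
    print("reads overlapping the blacklist:", n)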
def getPositionsToSample(chrom, start, end, stepSize):
    """
    Check if the region submitted to the worker overlaps with the region that
    deserves extra sampling effort. If that is the case, the positions-to-sample
    array is enlarged to include each position of the extra-effort region,
    sampled at the same stepSize along the interval.

    If a filter-out tree is given, those regions are removed from the
    positions to sample.
    """
    positions_to_sample = np.arange(start, end, stepSize)

    if global_vars['filter_out']:
        filter_out_tree = GTF(global_vars['filter_out'])
    else:
        filter_out_tree = None

    if global_vars['extra_sampling_file']:
        extra_tree = GTF(global_vars['extra_sampling_file'])
    else:
        extra_tree = None

    if extra_tree:
        orig_len = len(positions_to_sample)
        try:
            extra_match = extra_tree.findOverlaps(chrom, start, end)
        except KeyError:
            extra_match = []

        if len(extra_match) > 0:
            for intval in extra_match:
                positions_to_sample = np.append(positions_to_sample,
                                                list(range(intval[0], intval[1], stepSize)))
        # remove duplicates
        positions_to_sample = np.unique(np.sort(positions_to_sample))
        if debug:
            print("sampling increased to {} from {}".format(
                len(positions_to_sample), orig_len))

    # skip regions that are filtered out
    if filter_out_tree:
        try:
            out_match = filter_out_tree.findOverlaps(chrom, start, end)
        except KeyError:
            out_match = []

        if len(out_match) > 0:
            for intval in out_match:
                positions_to_sample = \
                    positions_to_sample[(positions_to_sample < intval[0]) |
                                        (positions_to_sample >= intval[1])]
    return positions_to_sample
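# A self-contained illustration (numpy only, toy numbers) of the filtering
# step above: positions falling inside a filtered-out interval [lo, hi) are
# dropped with a boolean mask, exactly as in getPositionsToSample().
def _example_filter_positions():
    import numpy as np
    positions = np.arange(0, 200, 25)  # 0, 25, ..., 175
    lo, hi = 50, 120                   # toy filter-out interval
    kept = positions[(positions < lo) | (positions >= hi)]
    return kept                        # array([  0,  25, 125, 150, 175])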
def getChunkLength(args, chromSize):
    """
    There's no point in parsing the GTF over and over again needlessly.
    Empirically, it seems that splitting the genome into ~4x the number of
    workers chunks is ideal, since coverage is non-uniform. This is a
    heuristic way of approximating that.

    Note that if there are MANY small contigs and a few large ones (e.g.,
    the max and median lengths are >10x different), then it's best to take
    a different tack.
    """
    if args.region:
        chromSize, region_start, region_end, genomeChunkLength = getUserRegion(chromSize, args.region)
        rv = np.ceil((region_end - region_start) / float(4 * args.numberOfProcessors)).astype(int)
        return max(1, rv)

    bl = None
    if args.blackListFileName:
        bl = GTF(args.blackListFileName)

    lengths = []
    for k, v in chromSize:
        regs = blSubtract(bl, k, [0, v])
        for reg in regs:
            lengths.append(reg[1] - reg[0])

    if len(lengths) >= 4 * args.numberOfProcessors:
        rv = np.median(lengths).astype(int)
        # In cases like dm6 or GRCh38, there are a LOT of really small contigs,
        # which will cause the median to be small and performance to tank
        if np.max(lengths) >= 10 * rv:
            rv = np.ceil(np.sum(lengths) / (4.0 * args.numberOfProcessors)).astype(int)
    else:
        rv = np.ceil(np.sum(lengths) / (4.0 * args.numberOfProcessors)).astype(int)

    return max(1, rv)
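# Hedged sketch of the chunk-length heuristic with toy numbers (no region or
# blacklist; argparse.Namespace stands in for parsed arguments). This assumes
# blSubtract() returns the chunk unchanged when no blacklist tree is given,
# which is how it is used above.
def _example_chunk_length():
    from argparse import Namespace
    args = Namespace(region=None, blackListFileName=None, numberOfProcessors=4)
    chromSize = [("chr1", 1000), ("chr2", 800)]  # toy contig lengths
    # 2 contigs < 4 * 4 workers, so the total length is spread over 4x the
    # worker count: ceil(1800 / 16) == 113
    return getChunkLength(args, chromSize)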
def makeRegions(BED, args):
    """
    Given a list of BED/GTF files, make a list of regions.
    These are vaguely extended as appropriate. For simplicity, the maximum of
    --beforeRegionStartLength and --afterRegionStartLength is tacked on to
    each end and transcripts are used for GTF files.
    """
    itree = GTF(BED, transcriptID=args.transcriptID, transcript_id_designator=args.transcript_id_designator)
    o = []
    extend = 0
    # The before/after stuff is specific to computeMatrix
    if "beforeRegionStartLength" in args:
        extend = max(args.beforeRegionStartLength, args.afterRegionStartLength)
    for chrom in itree.chroms:
        regs = itree.findOverlaps(chrom, 0, 4294967295)  # bigWig files use 32 bit coordinates
        for reg in regs:
            o.append([chrom, max(0, reg[0] - extend), reg[1] + extend])
    del itree
    return o
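# Hedged usage sketch for makeRegions() (the BED file name and argument
# values are hypothetical): a Namespace mimics the computeMatrix-style
# arguments the function inspects.
def _example_make_regions():
    from argparse import Namespace
    args = Namespace(transcriptID="transcript",
                     transcript_id_designator="transcript_id",
                     beforeRegionStartLength=500,
                     afterRegionStartLength=1500)
    # extend = max(500, 1500), so each returned entry is
    # [chrom, start - 1500 (floored at 0), end + 1500]
    return makeRegions(["regions.bed"], args)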
def main():
    parser = argparse.ArgumentParser(add_help=True, description="Bourbon finds contiguous regions without repeats (low peat content) of a minimum size and without genes within some distance. Output is written to the terminal. Note that this program currently ignores the ends of chromosomes.")
    parser.add_argument("rmsk", help="Repeat masker file")
    parser.add_argument("gtf", help="GTF file")
    parser.add_argument("tbit", help="2bit file")
    parser.add_argument("--minimumProof", type=int, default=15000, help="Minimum size of a repeat-free region (default %(default)s)")
    parser.add_argument("--wobble", type=int, default=5000, help="Ensure no genes are within this distance of a region of interest (default %(default)s)")
    parser.add_argument("--legalBAC", type=float, default=0.01, help="Maximum N content (default %(default)s)")
    args = parser.parse_args()

    # Produce a header
    print("Chromosome\tStart\tEnd")

    genes = GTF(args.gtf)
    rmsk = open(args.rmsk)
    tb = py2bit.open(args.tbit)

    lastChrom = None
    lastEnd = 0
    for line in rmsk:
        if line.startswith("#"):
            continue
        cols = line.strip().split()
        chrom = cols[5]
        start = int(cols[6]) - 1
        end = int(cols[7])
        if chrom == lastChrom:
            if start - lastEnd >= args.minimumProof:
                ROIstart = lastEnd
                ROIend = start
                blocks = splitByGenes(chrom, ROIstart, ROIend, genes, args.wobble)
                for block in blocks:
                    if block[1] - block[0] < args.minimumProof:
                        continue
                    if not highN(chrom, block[0], block[1], tb, args.legalBAC):
                        print("{}\t{}\t{}".format(chrom, block[0], block[1]))
        lastChrom = chrom
        lastEnd = end
    rmsk.close()
    tb.close()
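# Hypothetical invocation of the script above (script and file names are
# made up); the positional arguments and flags match the argparse
# definitions in main(), and the tab-separated output can be redirected:
#
#   python bourbon.py hg38_rmsk.txt genes.gtf hg38.2bit \
#       --minimumProof 20000 --wobble 5000 --legalBAC 0.01 > regions.bed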
def mapReduce(staticArgs, func,
              chromSize,
              genomeChunkLength=None,
              region=None,
              bedFile=None,
              blackListFileName=None,
              numberOfProcessors=4,
              verbose=False,
              includeLabels=False,
              keepExons=False,
              transcriptID="transcriptID",
              exonID="exonID",
              transcript_id_designator="transcript_id",
              self_=None):
    """
    Split the genome into parts that are sent to workers using a defined
    number of processors. Results are collected and returned.

    For each genomic region the given 'func' is called using
    the following parameters:

     chrom, start, end, staticArgs

    The *staticArgs* are static, *picklable* variables that need
    to be sent to workers.

    The genome chunk length corresponds to a fraction of the genome, in bp,
    that is sent to each of the workers for processing. Depending on the
    type of process, larger or shorter regions may be preferred.

    :param chromSize: A list of tuples containing the chromosome
                      name and its length
    :param region: The format is chr:start:end:tileSize (see function
                   getUserRegion)
    :param staticArgs: tuple of arguments that are sent to the given 'func'
    :param func: function to call. The function is called using the
                 following parameters (chrom, start, end, staticArgs)
    :param bedFile: If a bed file is given, the args to the func to be
                    called are extended to include a list of bed defined
                    regions.
    :param blackListFileName: A list of regions to exclude from all
                              computations. Note that this has
                              genomeChunkLength resolution...
    :param self_: In case mapreduce should make a call to an object
                  the self variable has to be passed.
    :param includeLabels: Pass group and transcript labels into the calling
                          function. These are added to the static args
                          (groupLabel and transcriptName).

    If "includeLabels" is true, a tuple of (results, labels) is returned
    """
    if not genomeChunkLength:
        genomeChunkLength = 1e5
    genomeChunkLength = int(genomeChunkLength)

    if verbose:
        print("genome partition size for multiprocessing: {0}".format(
            genomeChunkLength))

    region_start = 0
    region_end = None

    # if a region is set, the task should only cover the given genomic position
    if region:
        chromSize, region_start, region_end, genomeChunkLength = getUserRegion(chromSize, region)
        if verbose:
            print("chrom size: {0}, region start: {1}, region end: {2}, "
                  "genome chunk length sent to each processor: {3}".format(chromSize, region_start, region_end, genomeChunkLength))

    if bedFile:
        defaultGroup = None
        if len(bedFile) == 1:
            defaultGroup = "genes"
        bed_interval_tree = GTF(bedFile, defaultGroup=defaultGroup, transcriptID=transcriptID, exonID=exonID, transcript_id_designator=transcript_id_designator, keepExons=keepExons)

    if blackListFileName:
        blackList = GTF(blackListFileName)

    TASKS = []
    # iterate over all chromosomes
    for chrom, size in chromSize:
        # the start is zero unless a specific region is defined
        start = 0 if region_start == 0 else region_start
        for startPos in range(start, size, genomeChunkLength):
            endPos = min(size, startPos + genomeChunkLength)

            # Reject a chunk if it overlaps the blacklist
            if blackListFileName:
                regions = blSubtract(blackList, chrom, [startPos, endPos])
            else:
                regions = [[startPos, endPos]]

            for reg in regions:
                if self_ is not None:
                    argsList = [self_]
                else:
                    argsList = []

                argsList.extend([chrom, reg[0], reg[1]])
                # add to the argument list the static args received by the function
                argsList.extend(staticArgs)

                # if a bed file is given, append to the TASK list
                # a list of bed regions that overlap with the
                # current genomeChunk.
                if bedFile:
                    # This effectively creates batches of intervals, which is
                    # generally more performant given the overhead of
                    # initializing additional workers.
                    # TODO, there's no point in including the chromosome
                    if includeLabels:
                        bed_regions_list = [[chrom, x[4], x[2], x[3], x[5], x[6]] for x in bed_interval_tree.findOverlaps(chrom, reg[0], reg[1], trimOverlap=True, numericGroups=True, includeStrand=True)]
                    else:
                        bed_regions_list = [[chrom, x[4], x[5], x[6]] for x in bed_interval_tree.findOverlaps(chrom, reg[0], reg[1], trimOverlap=True, includeStrand=True)]

                    if len(bed_regions_list) == 0:
                        continue

                    # add to the argument list the bed regions to use
                    argsList.append(bed_regions_list)

                TASKS.append(tuple(argsList))

    if len(TASKS) > 1 and numberOfProcessors > 1:
        if verbose:
            print(("using {} processors for {} "
                   "tasks".format(numberOfProcessors, len(TASKS))))
        random.shuffle(TASKS)
        pool = multiprocessing.Pool(numberOfProcessors)
        res = pool.map_async(func, TASKS).get(9999999)
    else:
        res = list(map(func, TASKS))

    if includeLabels:
        if bedFile:
            return res, bed_interval_tree.labels
        else:
            return res, None

    return res
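# A minimal, hedged sketch of calling mapReduce() with a toy worker: each
# task tuple is (chrom, start, end) plus the static args, so the worker just
# unpacks its chunk. The chromosome size is made up, and the nested worker is
# fine here because a single processor keeps everything in-process.
def _example_map_reduce():
    def chunk_span(task):
        chrom, start, end = task
        return end - start

    # One chromosome of 1 Mb split into 250 kb chunks -> [250000] * 4
    return mapReduce([], chunk_span, [("chr1", 1000000)],
                     genomeChunkLength=250000, numberOfProcessors=1)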
def get_coverage_of_region(self, bamHandle, chrom, regions,
                           fragmentFromRead_func=None):
    """
    Returns a numpy array that corresponds to the number of reads
    that overlap with each tile.

    >>> test = Tester()
    >>> import pysam
    >>> c = CountReadsPerBin([], stepSize=1, extendReads=300)

    For this case the reads are length 36. The number of overlapping
    read fragments is 4 and 5 for the positions tested.

    >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
    ... [(5000833, 5000834), (5000834, 5000835)])
    array([4., 5.])

    In the following example a paired read is extended to the fragment
    length, which is 100. The first mate starts at 5000000 and the second
    at 5000064. Each mate is extended to the fragment length *independently*.
    At positions 5000090-5000100 one fragment of length 100 overlaps, and
    after position 5000101 there should be zero reads.

    >>> c.zerosToNans = True
    >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
    ... [(5000090, 5000100), (5000100, 5000110)])
    array([ 1., nan])

    In the following case the read length is 50. Reads are not extended.

    >>> c.extendReads = False
    >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)])
    array([1., 2., 2.])

    """
    if not fragmentFromRead_func:
        fragmentFromRead_func = self.get_fragment_from_read
    nbins = len(regions)
    if len(regions[0]) == 3:
        nbins = 0
        for reg in regions:
            nbins += (reg[1] - reg[0]) // reg[2]
    coverages = np.zeros(nbins, dtype='float64')

    if self.defaultFragmentLength == 'read length':
        extension = 0
    else:
        extension = self.maxPairedFragmentLength

    blackList = None
    if self.blackListFileName is not None:
        blackList = GTF(self.blackListFileName)

    vector_start = 0
    for idx, reg in enumerate(regions):
        if len(reg) == 3:
            tileSize = int(reg[2])
            nRegBins = (reg[1] - reg[0]) // tileSize
        else:
            nRegBins = 1
            tileSize = int(reg[1] - reg[0])

        # Blacklisted regions have a coverage of 0
        if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
            continue
        regStart = int(max(0, reg[0] - extension))
        regEnd = reg[1] + int(extension)

        # If alignments are extended and there's a blacklist, ensure that no
        # reads originating in a blacklisted region are fetched
        if blackList and reg[0] > 0 and extension > 0:
            o = blackList.findOverlaps(chrom, regStart, reg[0])
            if o is not None and len(o) > 0:
                regStart = o[-1][1]
            o = blackList.findOverlaps(chrom, reg[1], regEnd)
            if o is not None and len(o) > 0:
                regEnd = o[0][0]

        start_time = time.time()
        # caching seems faster. TODO: profile the function
        c = 0
        if chrom not in bamHandle.references:
            raise NameError("chromosome {} not found in bam file".format(chrom))

        prev_pos = set()
        lpos = None  # of previous processed read pair

        for read in bamHandle.fetch(chrom, regStart, regEnd):
            if read.is_unmapped:
                continue
            if self.minMappingQuality and read.mapq < self.minMappingQuality:
                continue

            # filter reads based on SAM flag
            if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
                continue
            if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
                continue

            # Fragment lengths
            tLen = deeptools.utilities.getTLen(read)
            if self.minFragmentLength > 0 and tLen < self.minFragmentLength:
                continue
            if self.maxFragmentLength > 0 and tLen > self.maxFragmentLength:
                continue

            # get rid of duplicate reads that have same position on each of the
            # pairs
            if self.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment
                # bounds, otherwise the start positions
                if tLen >= 0:
                    s = read.pos
                    e = s + tLen
                else:
                    s = read.pnext
                    e = s - tLen
                if read.reference_id != read.next_reference_id:
                    e = read.pnext
                if lpos is not None and lpos == read.reference_start \
                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                    continue
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add((s, e, read.next_reference_id, read.is_reverse))

            # since reads can be split (e.g. RNA-seq reads) each part of the
            # read that maps is called a position block.
            try:
                position_blocks = fragmentFromRead_func(read)
            except TypeError:
                # the get_fragment_from_read functions returns None in some cases.
                # Those cases are to be skipped, hence the continue line.
                continue

            last_eIdx = None
            for fragmentStart, fragmentEnd in position_blocks:
                if fragmentEnd is None or fragmentStart is None:
                    continue
                fragmentLength = fragmentEnd - fragmentStart
                if fragmentLength == 0:
                    continue
                # skip reads that are not in the region being evaluated.
                if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
                    continue

                if fragmentStart < reg[0]:
                    fragmentStart = reg[0]
                if fragmentEnd > reg[0] + len(coverages) * tileSize:
                    fragmentEnd = reg[0] + len(coverages) * tileSize

                sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0)
                eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins)
                if last_eIdx is not None:
                    sIdx = max(last_eIdx, sIdx)

                if sIdx >= eIdx:
                    continue
                sIdx = int(sIdx)
                eIdx = int(eIdx)
                coverages[sIdx:eIdx] += 1
                last_eIdx = eIdx

            c += 1

        if self.verbose:
            endTime = time.time()
            print("%s, processing %s (%.1f per sec) reads @ %s:%s-%s" % (
                multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1]))

        vector_start += nRegBins

    # change zeros to NAN
    if self.zerosToNans:
        coverages[coverages == 0] = np.nan

    return coverages
def count_reads_in_region(self, chrom, start, end, bed_regions_list=None):
    """Counts the reads in each bam file at each 'stepSize' position
    within the interval (start, end) for a window or bin of size binLength.

    The stepSize controls the distance between bins. For example,
    a step size of 20 and a bin size of 20 will create bins next to
    each other. If the step size is smaller than the bin size the
    bins will overlap.

    If a list of bedRegions is given, then the number of reads
    that overlap with each region is counted.

    Parameters
    ----------
    chrom : str
        Chrom name
    start : int
        start coordinate
    end : int
        end coordinate
    bed_regions_list: list
        List of lists of tuples of the form (start, end)
        corresponding to bed regions to be processed.
        If no bed file was passed to the object constructor
        then this list is empty.

    Returns
    -------
    numpy array
        The result is a numpy array that has as rows each bin
        and as columns each bam file.

    Examples
    --------
    Initialize some useful values

    >>> test = Tester()
    >>> c = CountReadsPerBin([test.bamFile1, test.bamFile2], 25, 0, stepSize=50)

    The transpose is used to get better looking numbers. The first line
    corresponds to the number of reads per bin in the first bamfile.

    >>> _array, __ = c.count_reads_in_region(test.chrom, 0, 200)
    >>> _array
    array([[0., 0.],
           [0., 1.],
           [1., 1.],
           [1., 2.]])

    """
    if start > end:
        raise NameError("start %d bigger than end %d" % (start, end))

    if self.stepSize is None and bed_regions_list is None:
        raise ValueError("stepSize is not set!")
    # array to keep the read counts for the regions
    subnum_reads_per_bin = []

    start_time = time.time()

    bam_handles = []
    for fname in self.bamFilesList:
        try:
            bam_handles.append(bamHandler.openBam(fname))
        except SystemExit:
            sys.exit(sys.exc_info()[1])
        except:
            bam_handles.append(pyBigWig.open(fname))

    blackList = None
    if self.blackListFileName is not None:
        blackList = GTF(self.blackListFileName)

    # A list of lists of tuples
    transcriptsToConsider = []
    if bed_regions_list is not None:
        transcriptsToConsider = [x[1] for x in bed_regions_list]
    else:
        if self.stepSize == self.binLength:
            transcriptsToConsider.append([(start, end, self.binLength)])
        else:
            for i in range(start, end, self.stepSize):
                if i + self.binLength > end:
                    break
                if blackList is not None and blackList.findOverlaps(chrom, i, i + self.binLength):
                    continue
                transcriptsToConsider.append([(i, i + self.binLength)])

    if self.save_data:
        _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
        _file_name = _file.name
    else:
        _file_name = ''

    for bam in bam_handles:
        for trans in transcriptsToConsider:
            tcov = self.get_coverage_of_region(bam, chrom, trans)
            if bed_regions_list is not None:
                subnum_reads_per_bin.append(np.sum(tcov))
            else:
                subnum_reads_per_bin.extend(tcov)

    subnum_reads_per_bin = np.concatenate([subnum_reads_per_bin]).reshape(-1, len(self.bamFilesList), order='F')

    if self.save_data:
        idx = 0
        for i, trans in enumerate(transcriptsToConsider):
            if len(trans[0]) != 3:
                starts = ",".join([str(x[0]) for x in trans])
                ends = ",".join([str(x[1]) for x in trans])
                _file.write("\t".join([chrom, starts, ends]) + "\t")
                _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[i, :]]) + "\n")
            else:
                for exon in trans:
                    for startPos in range(exon[0], exon[1], exon[2]):
                        if idx >= subnum_reads_per_bin.shape[0]:
                            # At the end of chromosomes (or due to blacklisted
                            # regions), there are bins smaller than the bin size.
                            # Counts there are added to the bin before them, but
                            # range() will still try to include them.
                            break
                        _file.write("{0}\t{1}\t{2}\t".format(chrom, startPos, startPos + exon[2]))
                        _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[idx, :]]) + "\n")
                        idx += 1
        _file.close()

    if self.verbose:
        endTime = time.time()
        rows = subnum_reads_per_bin.shape[0]
        print("%s countReadsInRegions_worker: processing %d "
              "(%.1f per sec) @ %s:%s-%s" %
              (multiprocessing.current_process().name, rows, rows / (endTime - start_time), chrom, start, end))

    return subnum_reads_per_bin, _file_name
def get_coverage_of_region(self, bamHandle, chrom, regions,
                           fragmentFromRead_func=None):
    """
    Returns a numpy array that corresponds to the number of reads
    that overlap with each tile.

    >>> test = Tester()
    >>> import pysam
    >>> c = SumCoveragePerBin([], stepSize=1, extendReads=300)

    For this case the reads are length 36. The number of overlapping
    read fragments is 4 and 5 for the positions tested. Note that reads
    are NOT extended, due to there being a 0 length input list of BAM files!

    >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
    ... [(5000833, 5000834), (5000834, 5000835)])
    array([4., 5.])

    In the following case the read length is 50. Reads are not extended.

    >>> c.extendReads = False
    >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)])
    array([2., 4., 4.])

    """
    if not fragmentFromRead_func:
        fragmentFromRead_func = self.get_fragment_from_read
    nbins = len(regions)
    if len(regions[0]) == 3:
        nbins = 0
        for reg in regions:
            nbins += (reg[1] - reg[0]) // reg[2]
    coverages = np.zeros(nbins, dtype='float64')

    if self.defaultFragmentLength == 'read length':
        extension = 0
    else:
        extension = self.maxPairedFragmentLength

    blackList = None
    if self.blackListFileName is not None:
        blackList = GTF(self.blackListFileName)

    vector_start = 0
    for idx, reg in enumerate(regions):
        if len(reg) == 3:
            tileSize = int(reg[2])
            nRegBins = (reg[1] - reg[0]) // tileSize
        else:
            nRegBins = 1
            tileSize = int(reg[1] - reg[0])

        # Blacklisted regions have a coverage of 0
        if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
            continue
        regStart = int(max(0, reg[0] - extension))
        regEnd = reg[1] + int(extension)

        # If alignments are extended and there's a blacklist, ensure that no
        # reads originating in a blacklisted region are fetched
        if blackList and reg[0] > 0 and extension > 0:
            o = blackList.findOverlaps(chrom, regStart, reg[0])
            if o is not None and len(o) > 0:
                regStart = o[-1][1]
            o = blackList.findOverlaps(chrom, reg[1], regEnd)
            if o is not None and len(o) > 0:
                regEnd = o[0][0]

        start_time = time.time()
        # caching seems faster. TODO: profile the function
        c = 0
        try:
            # BAM input
            if chrom not in bamHandle.references:
                raise NameError("chromosome {} not found in bam file".format(chrom))
        except AttributeError:
            # bigWig input, as used by plotFingerprint
            if bamHandle.chroms(chrom):
                _ = np.array(bamHandle.stats(chrom, regStart, regEnd, type="mean", nBins=nRegBins), dtype=np.float64)
                _[np.isnan(_)] = 0.0
                _ = _ * tileSize
                coverages += _
                continue
            else:
                raise NameError("chromosome {} not found in bigWig file with chroms {}".format(chrom, bamHandle.chroms()))

        prev_pos = set()
        lpos = None  # of previous processed read pair

        for read in bamHandle.fetch(chrom, regStart, regEnd):
            if read.is_unmapped:
                continue
            if self.minMappingQuality and read.mapq < self.minMappingQuality:
                continue

            # filter reads based on SAM flag
            if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
                continue
            if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
                continue

            # Fragment lengths
            tLen = getTLen(read)
            if self.minFragmentLength > 0 and tLen < self.minFragmentLength:
                continue
            if self.maxFragmentLength > 0 and tLen > self.maxFragmentLength:
                continue

            # get rid of duplicate reads that have same position on each of the
            # pairs
            if self.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment
                # bounds, otherwise the start positions
                if tLen >= 0:
                    s = read.pos
                    e = s + tLen
                else:
                    s = read.pnext
                    e = s - tLen
                if read.reference_id != read.next_reference_id:
                    e = read.pnext
                if lpos is not None and lpos == read.reference_start \
                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                    continue
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add((s, e, read.next_reference_id, read.is_reverse))

            # since reads can be split (e.g. RNA-seq reads) each part of the
            # read that maps is called a position block.
            try:
                position_blocks = fragmentFromRead_func(read)
            except TypeError:
                # the get_fragment_from_read functions returns None in some cases.
                # Those cases are to be skipped, hence the continue line.
                continue

            last_eIdx = None
            for fragmentStart, fragmentEnd in position_blocks:
                if fragmentEnd is None or fragmentStart is None:
                    continue
                fragmentLength = fragmentEnd - fragmentStart
                if fragmentLength == 0:
                    continue
                # skip reads that are not in the region being evaluated.
                if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
                    continue

                if fragmentStart < reg[0]:
                    fragmentStart = reg[0]
                if fragmentEnd > reg[0] + len(coverages) * tileSize:
                    fragmentEnd = reg[0] + len(coverages) * tileSize

                sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0)
                eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins)
                if eIdx >= len(coverages):
                    eIdx = len(coverages) - 1
                if last_eIdx is not None:
                    sIdx = max(last_eIdx, sIdx)

                if sIdx >= eIdx:
                    continue

                # First bin
                if fragmentEnd < reg[0] + (sIdx + 1) * tileSize:
                    _ = fragmentEnd - fragmentStart
                else:
                    _ = reg[0] + (sIdx + 1) * tileSize - fragmentStart
                if _ > tileSize:
                    _ = tileSize
                coverages[sIdx] += _

                # Middle bins are completely covered
                _ = sIdx + 1
                while _ < eIdx:
                    coverages[_] += tileSize
                    _ += 1

                # Last bin
                while eIdx - sIdx >= nRegBins:
                    eIdx -= 1
                if eIdx > sIdx:
                    _ = fragmentEnd - (reg[0] + eIdx * tileSize)
                    if _ > tileSize:
                        _ = tileSize
                    elif _ < 0:
                        _ = 0
                    coverages[eIdx] += _

                last_eIdx = eIdx

            c += 1

        if self.verbose:
            endTime = time.time()
            print("%s, processing %s (%.1f per sec) reads @ %s:%s-%s" % (
                multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1]))

        vector_start += nRegBins

    # change zeros to NAN
    if self.zerosToNans:
        coverages[coverages == 0] = np.nan

    return coverages
def get_coverage_of_region(self, bamHandle, chrom, regions,
                           fragmentFromRead_func=None):
    """
    Returns a numpy array that corresponds to the number of reads
    that overlap with each tile.

    >>> test = Tester()
    >>> import pysam
    >>> c = CountReadsPerBin([], stepSize=1, extendReads=300)

    For this case the reads are length 36. The number of overlapping
    read fragments is 4 and 5 for the positions tested.

    >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
    ... [(5000833, 5000834), (5000834, 5000835)])
    array([ 4., 5.])

    In the following example a paired read is extended to the fragment
    length, which is 100. The first mate starts at 5000000 and the second
    at 5000064. Each mate is extended to the fragment length *independently*.
    At positions 5000090-5000100 one fragment of length 100 overlaps, and
    after position 5000101 there should be zero reads.

    >>> c.zerosToNans = True
    >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
    ... [(5000090, 5000100), (5000100, 5000110)])
    array([ 1., nan])

    In the following case the read length is 50. Reads are not extended.

    >>> c.extendReads = False
    >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)])
    array([ 1., 2., 2.])

    """
    if not fragmentFromRead_func:
        fragmentFromRead_func = self.get_fragment_from_read
    nbins = len(regions)
    if len(regions[0]) == 3:
        nbins = 0
        for reg in regions:
            nbins += (reg[1] - reg[0]) // reg[2]
    coverages = np.zeros(nbins, dtype='float64')

    if self.defaultFragmentLength == 'read length':
        extension = 0
    else:
        extension = self.maxPairedFragmentLength

    blackList = None
    if self.blackListFileName is not None:
        blackList = GTF(self.blackListFileName)

    vector_start = 0
    for idx, reg in enumerate(regions):
        if len(reg) == 3:
            tileSize = int(reg[2])
            nRegBins = (reg[1] - reg[0]) // tileSize
        else:
            nRegBins = 1
            tileSize = int(reg[1] - reg[0])

        # Blacklisted regions have a coverage of 0
        if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
            continue
        regStart = int(max(0, reg[0] - extension))
        regEnd = reg[1] + int(extension)

        # If alignments are extended and there's a blacklist, ensure that no
        # reads originating in a blacklisted region are fetched
        if blackList and reg[0] > 0 and extension > 0:
            o = blackList.findOverlaps(chrom, regStart, reg[0])
            if o is not None and len(o) > 0:
                regStart = o[-1][1]
            o = blackList.findOverlaps(chrom, reg[1], regEnd)
            if o is not None and len(o) > 0:
                regEnd = o[0][0]

        start_time = time.time()
        # caching seems faster. TODO: profile the function
        c = 0
        if chrom in bamHandle.references:
            reads = [r for r in bamHandle.fetch(chrom, regStart, regEnd)
                     if r.flag & 4 == 0]
        else:
            raise NameError("chromosome {} not found in bam file".format(chrom))

        prev_start_pos = None  # to store the start positions
        # of previous processed read pair

        for read in reads:
            if self.minMappingQuality and read.mapq < self.minMappingQuality:
                continue

            # filter reads based on SAM flag
            if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
                continue
            if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
                continue

            # Fragment lengths
            tLen = deeptools.utilities.getTLen(read)
            if self.minFragmentLength > 0 and tLen < self.minFragmentLength:
                continue
            if self.maxFragmentLength > 0 and tLen > self.maxFragmentLength:
                continue

            # get rid of duplicate reads that have same position on each of the
            # pairs
            if self.ignoreDuplicates and prev_start_pos \
                    and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
                continue

            # since reads can be split (e.g. RNA-seq reads) each part of the
            # read that maps is called a position block.
            try:
                position_blocks = fragmentFromRead_func(read)
            except TypeError:
                # the get_fragment_from_read functions returns None in some cases.
                # Those cases are to be skipped, hence the continue line.
                continue

            last_eIdx = None
            for fragmentStart, fragmentEnd in position_blocks:
                if fragmentEnd is None or fragmentStart is None:
                    continue
                fragmentLength = fragmentEnd - fragmentStart
                if fragmentLength == 0:
                    continue
                # skip reads that are not in the region being evaluated.
                if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
                    continue

                if fragmentStart < reg[0]:
                    fragmentStart = reg[0]
                if fragmentEnd > reg[0] + len(coverages) * tileSize:
                    fragmentEnd = reg[0] + len(coverages) * tileSize

                sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0)
                eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins)
                if last_eIdx is not None:
                    sIdx = max(last_eIdx, sIdx)

                if sIdx >= eIdx:
                    continue
                coverages[sIdx:eIdx] += 1
                last_eIdx = eIdx

            prev_start_pos = (read.reference_start, read.pnext, read.is_reverse)
            c += 1

        if self.verbose:
            endTime = time.time()
            print("%s, processing %s (%.1f per sec) reads @ %s:%s-%s" % (
                multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1]))

        vector_start += nRegBins

    # change zeros to NAN
    if self.zerosToNans:
        coverages[coverages == 0] = np.nan

    return coverages
def count_reads_in_region(self, chrom, start, end, bed_regions_list=None):
    """Counts the reads in each bam file at each 'stepSize' position
    within the interval (start, end) for a window or bin of size binLength.

    The stepSize controls the distance between bins. For example,
    a step size of 20 and a bin size of 20 will create bins next to
    each other. If the step size is smaller than the bin size the
    bins will overlap.

    If a list of bedRegions is given, then the number of reads
    that overlap with each region is counted.

    Parameters
    ----------
    chrom : str
        Chrom name
    start : int
        start coordinate
    end : int
        end coordinate
    bed_regions_list: list
        List of lists of tuples of the form (start, end)
        corresponding to bed regions to be processed.
        If no bed file was passed to the object constructor
        then this list is empty.

    Returns
    -------
    numpy array
        The result is a numpy array that has as rows each bin
        and as columns each bam file.

    Examples
    --------
    Initialize some useful values

    >>> test = Tester()
    >>> c = CountReadsPerBin([test.bamFile1, test.bamFile2], 25, 0, stepSize=50)

    The transpose is used to get better looking numbers. The first line
    corresponds to the number of reads per bin in the first bamfile.

    >>> _array, __ = c.count_reads_in_region(test.chrom, 0, 200)
    >>> _array
    array([[ 0., 0.],
           [ 0., 1.],
           [ 1., 1.],
           [ 1., 2.]])

    """
    if start > end:
        raise NameError("start %d bigger than end %d" % (start, end))

    if self.stepSize is None:
        raise ValueError("stepSize is not set!")
    # array to keep the read counts for the regions
    subnum_reads_per_bin = []

    start_time = time.time()

    bam_handlers = []
    for fname in self.bamFilesList:
        try:
            bam_handlers.append(bamHandler.openBam(fname))
        except:
            bam_handlers.append(pyBigWig.open(fname))

    blackList = None
    if self.blackListFileName is not None:
        blackList = GTF(self.blackListFileName)

    # A list of lists of tuples
    transcriptsToConsider = []
    if bed_regions_list is not None:
        transcriptsToConsider = [x[1] for x in bed_regions_list]
    else:
        if self.stepSize == self.binLength:
            transcriptsToConsider.append([(start, end, self.binLength)])
        else:
            for i in range(start, end, self.stepSize):
                if i + self.binLength > end:
                    break
                if blackList is not None and blackList.findOverlaps(chrom, i, i + self.binLength):
                    continue
                transcriptsToConsider.append([(i, i + self.binLength)])

    if self.save_data:
        _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
        _file_name = _file.name
    else:
        _file_name = ''

    for bam in bam_handlers:
        for trans in transcriptsToConsider:
            tcov = self.get_coverage_of_region(bam, chrom, trans)
            if bed_regions_list is not None:
                subnum_reads_per_bin.append(np.sum(tcov))
            else:
                subnum_reads_per_bin.extend(tcov)

    subnum_reads_per_bin = np.concatenate([subnum_reads_per_bin]).reshape(-1, len(self.bamFilesList), order='F')

    if self.save_data:
        idx = 0
        for i, trans in enumerate(transcriptsToConsider):
            if len(trans[0]) != 3:
                starts = ",".join([str(x[0]) for x in trans])
                ends = ",".join([str(x[1]) for x in trans])
                _file.write("\t".join([chrom, starts, ends]) + "\t")
                _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[i, :]]) + "\n")
            else:
                for exon in trans:
                    for startPos in range(exon[0], exon[1], exon[2]):
                        if idx >= subnum_reads_per_bin.shape[0]:
                            # At the end of chromosomes (or due to blacklisted
                            # regions), there are bins smaller than the bin size.
                            # Counts there are added to the bin before them, but
                            # range() will still try to include them.
                            break
                        _file.write("{0}\t{1}\t{2}\t".format(chrom, startPos, startPos + exon[2]))
                        _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[idx, :]]) + "\n")
                        idx += 1
        _file.close()

    if self.verbose:
        endTime = time.time()
        rows = subnum_reads_per_bin.shape[0]
        print("%s countReadsInRegions_worker: processing %d "
              "(%.1f per sec) @ %s:%s-%s" %
              (multiprocessing.current_process().name, rows, rows / (endTime - start_time), chrom, start, end))

    return subnum_reads_per_bin, _file_name
def count_reads_in_region_with_intron(self, chrom, start, end, bed_regions_list=None):
    """
    Rewrite of deeptools.CountReadsPerBin.count_reads_in_region

    Args:
        chrom (str): Chrom
        start (int): Start position
        end (int): End position
        bed_regions_list (list): List of bed regions

    Returns:
        tuple: subnum_reads_per_bin, file_name
    """
    if start > end:
        raise NameError("start %d bigger than end %d" % (start, end))

    if self.stepSize is None:
        raise ValueError("stepSize is not set!")
    # array to keep the read counts for the regions
    subnum_reads_per_bin = []

    start_time = time.time()

    bam_handlers = []
    for fname in self.bamFilesList:
        try:
            bam_handlers.append(bamHandler.openBam(fname))
        except:
            bam_handlers.append(pyBigWig.open(fname))

    blackList = None
    if self.blackListFileName is not None:
        blackList = GTF(self.blackListFileName)

    # A list of lists of tuples
    transcriptsToConsider = []
    if bed_regions_list is not None:
        transcriptsToConsider = [x[1] for x in bed_regions_list]
    else:
        if self.stepSize == self.binLength:
            transcriptsToConsider.append([(start, end, self.binLength)])
        else:
            for i in range(start, end, self.stepSize):
                if i + self.binLength > end:
                    break
                if blackList is not None and blackList.findOverlaps(chrom, i, i + self.binLength):
                    continue
                transcriptsToConsider.append([(i, i + self.binLength)])

    if self.save_data:
        _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
        _file_name = _file.name
    else:
        _file_name = ''

    for bam in bam_handlers:
        for trans in transcriptsToConsider:
            tcov = self.get_coverage_of_region_with_intron(bam, chrom, trans)
            if bed_regions_list is not None:
                subnum_reads_per_bin.append(np.sum(tcov))
            else:
                subnum_reads_per_bin.extend(tcov)

    subnum_reads_per_bin = np.concatenate([subnum_reads_per_bin]).reshape(-1, len(self.bamFilesList), order='F')

    if self.save_data:
        idx = 0
        for i, trans in enumerate(transcriptsToConsider):
            if len(trans[0]) != 3:
                starts = ",".join([str(x[0]) for x in trans])
                ends = ",".join([str(x[1]) for x in trans])
                _file.write("\t".join([chrom, starts, ends]) + "\t")
                _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[i, :]]) + "\n")
            else:
                for exon in trans:
                    for startPos in range(exon[0], exon[1], exon[2]):
                        if idx >= subnum_reads_per_bin.shape[0]:
                            # At the end of chromosomes (or due to blacklisted
                            # regions), there are bins smaller than the bin size.
                            # Counts there are added to the bin before them, but
                            # range() will still try to include them.
                            break
                        _file.write("{0}\t{1}\t{2}\t".format(chrom, startPos, startPos + exon[2]))
                        _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[idx, :]]) + "\n")
                        idx += 1
        _file.close()

    if self.verbose:
        endTime = time.time()
        rows = subnum_reads_per_bin.shape[0]
        print("%s countReadsInRegions_worker: processing %d "
              "(%.1f per sec) @ %s:%s-%s" %
              (multiprocessing.current_process().name, rows, rows / (endTime - start_time), chrom, start, end))

    return subnum_reads_per_bin, _file_name
def get_coverage_of_region_with_intron(self, bamHandle, chrom, regions, fragmentFromRead_func=None):
    """
    Rewrite of deeptools.CountReadsPerBin.get_coverage_of_region

    Args:
        bamHandle (AlignmentFile): Bam object
        chrom (str): Chrom
        regions (list): List of blocks
        fragmentFromRead_func (function): Function to get fragment from read

    Returns:
        float: coverages
    """
    if not fragmentFromRead_func:
        fragmentFromRead_func = self.get_fragment_from_read
    nbins = len(regions)
    if len(regions[0]) == 3:
        nbins = 0
        for reg in regions:
            nbins += (reg[1] - reg[0]) // reg[2]
    coverages = np.zeros(nbins, dtype='float64')

    if self.defaultFragmentLength == 'read length':
        extension = 0
    else:
        extension = self.maxPairedFragmentLength

    blackList = None
    if self.blackListFileName is not None:
        blackList = GTF(self.blackListFileName)

    vector_start = 0
    for idx, reg in enumerate(regions):
        if len(reg) == 3:
            tileSize = int(reg[2])
            nRegBins = (reg[1] - reg[0]) // tileSize
        else:
            nRegBins = 1
            tileSize = int(reg[1] - reg[0])

        # Blacklisted regions have a coverage of 0
        if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
            continue
        regStart = int(max(0, reg[0] - extension))
        regEnd = reg[1] + int(extension)

        # If alignments are extended and there's a blacklist, ensure that no
        # reads originating in a blacklisted region are fetched
        if blackList and reg[0] > 0 and extension > 0:
            o = blackList.findOverlaps(chrom, regStart, reg[0])
            if o is not None and len(o) > 0:
                regStart = o[-1][1]
            o = blackList.findOverlaps(chrom, reg[1], regEnd)
            if o is not None and len(o) > 0:
                regEnd = o[0][0]

        start_time = time.time()
        # caching seems faster.
        c = 0
        if chrom in bamHandle.references:
            reads = [r for r in bamHandle.fetch(chrom, regStart, regEnd)
                     if r.flag & 4 == 0]
        else:
            raise NameError("chromosome {} not found in bam file".format(chrom))

        prev_start_pos = None  # to store the start positions
        # of previous processed read pair

        for read in reads:
            if self.minMappingQuality and read.mapq < self.minMappingQuality:
                continue

            # filter reads based on SAM flag
            if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
                continue
            if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
                continue

            # Fragment lengths
            tLen = deeptools.utilities.getTLen(read)
            if self.minFragmentLength > 0 and tLen < self.minFragmentLength:
                continue
            if self.maxFragmentLength > 0 and tLen > self.maxFragmentLength:
                continue

            # get rid of duplicate reads that have same position on each of the
            # pairs
            if self.ignoreDuplicates and prev_start_pos \
                    and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
                continue

            # since reads can be split (e.g. RNA-seq reads) each part of the
            # read that maps is called a position block.
            try:
                position_blocks = fragmentFromRead_func(read)
            except TypeError:
                # the get_fragment_from_read functions returns None in some cases.
                # Those cases are to be skipped, hence the continue line.
                continue

            # Rewrite !!! Instead of per-block counting, cover the full read
            # span (including introns) from reference_start to reference_end.
            sIdx = vector_start + max((read.reference_start - reg[0]) // tileSize, 0)
            eIdx = vector_start + min(np.ceil(float(read.reference_end - reg[0]) / tileSize).astype('int'), nRegBins)
            coverages[sIdx:eIdx] += 1

            prev_start_pos = (read.reference_start, read.pnext, read.is_reverse)
            c += 1

        if self.verbose:
            endTime = time.time()
            print("%s, processing %s (%.1f per sec) reads @ %s:%s-%s" % (
                multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1]))

        vector_start += nRegBins

    # change zeros to NAN
    if self.zerosToNans:
        coverages[coverages == 0] = np.nan

    return coverages
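# Hedged wiring sketch for the two *_with_intron rewrites above. This assumes
# they are module-level functions (as presented here) and that deeptools is
# installed; the subclass name is hypothetical. Assigning the functions as
# class attributes turns them into bound methods of the subclass.
import deeptools.countReadsPerBin


class IntronCountReadsPerBin(deeptools.countReadsPerBin.CountReadsPerBin):
    # wire in the intron-aware rewrites defined above
    count_reads_in_region = count_reads_in_region_with_intron
    get_coverage_of_region_with_intron = get_coverage_of_region_with_intron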
def get_coverage_of_region(self, bamHandle, chrom, regions,
                           fragmentFromRead_func=None):
    """
    Returns a numpy array that corresponds to the number of reads
    that overlap with each tile.

    >>> test = Tester()
    >>> import pysam
    >>> c = SumCoveragePerBin([], stepSize=1, extendReads=300)

    For this case the reads are length 36. The number of overlapping
    read fragments is 4 and 5 for the positions tested. Note that reads
    are NOT extended, due to there being a 0 length input list of BAM files!

    >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
    ... [(5000833, 5000834), (5000834, 5000835)])
    array([ 4., 5.])

    In the following case the read length is 50. Reads are not extended.

    >>> c.extendReads = False
    >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)])
    array([ 2., 4., 4.])

    """
    if not fragmentFromRead_func:
        fragmentFromRead_func = self.get_fragment_from_read
    nbins = len(regions)
    if len(regions[0]) == 3:
        nbins = 0
        for reg in regions:
            nbins += (reg[1] - reg[0]) // reg[2]
    coverages = np.zeros(nbins, dtype='float64')

    if self.defaultFragmentLength == 'read length':
        extension = 0
    else:
        extension = self.maxPairedFragmentLength

    blackList = None
    if self.blackListFileName is not None:
        blackList = GTF(self.blackListFileName)

    vector_start = 0
    for idx, reg in enumerate(regions):
        if len(reg) == 3:
            tileSize = int(reg[2])
            nRegBins = (reg[1] - reg[0]) // tileSize
        else:
            nRegBins = 1
            tileSize = int(reg[1] - reg[0])

        # Blacklisted regions have a coverage of 0
        if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
            continue
        regStart = int(max(0, reg[0] - extension))
        regEnd = reg[1] + int(extension)

        # If alignments are extended and there's a blacklist, ensure that no
        # reads originating in a blacklisted region are fetched
        if blackList and reg[0] > 0 and extension > 0:
            o = blackList.findOverlaps(chrom, regStart, reg[0])
            if o is not None and len(o) > 0:
                regStart = o[-1][1]
            o = blackList.findOverlaps(chrom, reg[1], regEnd)
            if o is not None and len(o) > 0:
                regEnd = o[0][0]

        start_time = time.time()
        # caching seems faster. TODO: profile the function
        c = 0
        try:
            # BAM input
            if chrom in bamHandle.references:
                reads = [r for r in bamHandle.fetch(chrom, regStart, regEnd)
                         if r.flag & 4 == 0]
            else:
                raise NameError("chromosome {} not found in bam file".format(chrom))
        except AttributeError:
            # bigWig input, as used by plotFingerprint
            if bamHandle.chroms(chrom):
                _ = np.array(bamHandle.stats(chrom, regStart, regEnd, type="mean", nBins=nRegBins), dtype=np.float64)
                _[np.isnan(_)] = 0.0
                _ = _ * tileSize
                coverages += _
                continue
            else:
                raise NameError("chromosome {} not found in bigWig file with chroms {}".format(chrom, bamHandle.chroms()))

        prev_start_pos = None  # to store the start positions
        # of previous processed read pair

        for read in reads:
            if self.minMappingQuality and read.mapq < self.minMappingQuality:
                continue

            # filter reads based on SAM flag
            if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
                continue
            if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
                continue

            # Fragment lengths
            if self.minFragmentLength > 0 and abs(read.template_length) < self.minFragmentLength:
                continue
            if self.maxFragmentLength > 0 and abs(read.template_length) > self.maxFragmentLength:
                continue

            # get rid of duplicate reads that have same position on each of the
            # pairs
            if self.ignoreDuplicates and prev_start_pos \
                    and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
                continue

            # since reads can be split (e.g. RNA-seq reads) each part of the
            # read that maps is called a position block.
            try:
                position_blocks = fragmentFromRead_func(read)
            except TypeError:
                # the get_fragment_from_read functions returns None in some cases.
                # Those cases are to be skipped, hence the continue line.
                continue

            last_eIdx = None
            for fragmentStart, fragmentEnd in position_blocks:
                if fragmentEnd is None or fragmentStart is None:
                    continue
                fragmentLength = fragmentEnd - fragmentStart
                if fragmentLength == 0:
                    continue
                # skip reads that are not in the region being evaluated.
                if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
                    continue

                if fragmentStart < reg[0]:
                    fragmentStart = reg[0]
                if fragmentEnd > reg[0] + len(coverages) * tileSize:
                    fragmentEnd = reg[0] + len(coverages) * tileSize

                sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0)
                eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins)
                if eIdx >= len(coverages):
                    eIdx = len(coverages) - 1
                if last_eIdx is not None:
                    sIdx = max(last_eIdx, sIdx)

                if sIdx >= eIdx:
                    continue

                # First bin
                if fragmentEnd < reg[0] + (sIdx + 1) * tileSize:
                    _ = fragmentEnd - fragmentStart
                else:
                    _ = reg[0] + (sIdx + 1) * tileSize - fragmentStart
                if _ > tileSize:
                    _ = tileSize
                coverages[sIdx] += _

                # Middle bins are completely covered
                _ = sIdx + 1
                while _ < eIdx:
                    coverages[_] += tileSize
                    _ += 1

                # Last bin
                while eIdx - sIdx >= nRegBins:
                    eIdx -= 1
                if eIdx > sIdx:
                    _ = fragmentEnd - (reg[0] + eIdx * tileSize)
                    if _ > tileSize:
                        _ = tileSize
                    elif _ < 0:
                        _ = 0
                    coverages[eIdx] += _

                last_eIdx = eIdx

            prev_start_pos = (read.reference_start, read.pnext, read.is_reverse)
            c += 1

        if self.verbose:
            endTime = time.time()
            print("%s, processing %s (%.1f per sec) reads @ %s:%s-%s" % (
                multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1]))

        vector_start += nRegBins

    # change zeros to NAN
    if self.zerosToNans:
        coverages[coverages == 0] = np.nan

    return coverages