Exemple #1
0
def bam_blacklisted_reads(bam_handle, chroms_to_ignore, blackListFileName=None, numberOfProcessors=1):
    """
    Sum the results of ``bam_blacklisted_worker`` over every blacklisted
    region of a BAM file (presumably the number of reads in those regions).

    :param bam_handle: BAM handle; only its ``filename`` attribute is used
                       (for ``pysam.idxstats`` and for the worker arguments).
    :param chroms_to_ignore: collection of chromosome names to skip. A falsy
                             value means no chromosome is skipped.
    :param blackListFileName: blacklist file(s) handed to ``GTF``. If None,
                              0 is returned immediately.
    :param numberOfProcessors: a process pool is used only when this is > 1
                               and there is more than one region.
    :return: the summed worker results (0 if there is no blacklist).
    """
    blacklisted = 0
    if blackListFileName is None:
        return blacklisted

    # Get the chromosome lengths from the tab-separated idxstats output
    chromLens = {}
    lines = pysam.idxstats(bam_handle.filename)
    lines = toString(lines)
    if isinstance(lines, str):
        lines = lines.strip().split('\n')
    for line in lines:
        chrom, _len, _nmapped, _nunmapped = line.split('\t')
        chromLens[chrom] = int(_len)

    bl = GTF(blackListFileName)
    regions = []
    for chrom in bl.chroms:
        if (not chroms_to_ignore or chrom not in chroms_to_ignore) and chrom in chromLens:
            for reg in bl.findOverlaps(chrom, 0, chromLens[chrom]):
                regions.append([bam_handle.filename, chrom, reg[0], reg[1]])

    if len(regions) > 0:
        import multiprocessing
        if len(regions) > 1 and numberOfProcessors > 1:
            pool = multiprocessing.Pool(numberOfProcessors)
            try:
                res = pool.map_async(bam_blacklisted_worker, regions).get(9999999)
                pool.close()
            except BaseException:
                # Do not wait for outstanding work if something went wrong
                pool.terminate()
                raise
            finally:
                # Reap the worker processes instead of leaking them
                pool.join()
        else:
            res = [bam_blacklisted_worker(x) for x in regions]
        for val in res:
            blacklisted += val

    return blacklisted
def bam_blacklisted_reads(bam_handle, chroms_to_ignore, blackListFileName=None, numberOfProcessors=1):
    """
    Sum the results of ``bam_blacklisted_worker`` over every blacklisted
    region of a BAM file (presumably the number of reads in those regions).

    :param bam_handle: BAM handle; only its ``filename`` attribute is used
                       (for ``pysam.idxstats`` and for the worker arguments).
    :param chroms_to_ignore: collection of chromosome names to skip. A falsy
                             value means no chromosome is skipped.
    :param blackListFileName: blacklist file(s) handed to ``GTF``. If None,
                              0 is returned immediately.
    :param numberOfProcessors: a process pool is used only when this is > 1
                               and there is more than one region.
    :return: the summed worker results (0 if there is no blacklist).
    """
    blacklisted = 0
    if blackListFileName is None:
        return blacklisted

    # Get the chromosome lengths from the tab-separated idxstats output
    chromLens = {}
    lines = pysam.idxstats(bam_handle.filename)
    lines = toString(lines)
    if isinstance(lines, str):
        lines = lines.strip().split('\n')
    for line in lines:
        chrom, _len, _nmapped, _nunmapped = line.split('\t')
        chromLens[chrom] = int(_len)

    bl = GTF(blackListFileName)
    regions = []
    for chrom in bl.chroms:
        if (not chroms_to_ignore or chrom not in chroms_to_ignore) and chrom in chromLens:
            for reg in bl.findOverlaps(chrom, 0, chromLens[chrom]):
                regions.append([bam_handle.filename, chrom, reg[0], reg[1]])

    if len(regions) > 0:
        import multiprocessing
        if len(regions) > 1 and numberOfProcessors > 1:
            pool = multiprocessing.Pool(numberOfProcessors)
            try:
                res = pool.map_async(bam_blacklisted_worker, regions).get(9999999)
                pool.close()
            except BaseException:
                # Do not wait for outstanding work if something went wrong
                pool.terminate()
                raise
            finally:
                # Reap the worker processes instead of leaking them
                pool.join()
        else:
            res = [bam_blacklisted_worker(x) for x in regions]
        for val in res:
            blacklisted += val

    return blacklisted
Exemple #3
0
def bam_blacklisted_reads(bam_handle, chroms_to_ignore, blackListFileName=None, numberOfProcessors=1):
    """
    Sum the results of ``bam_blacklisted_worker`` over every blacklisted
    region of a BAM file (presumably the number of reads in those regions).

    The program exits if the blacklist contains overlapping regions, and a
    warning is written to stderr if two blacklisted intervals are less than
    1000 apart.

    :param bam_handle: BAM handle; its ``references``, ``lengths`` and
                       ``filename`` attributes are used.
    :param chroms_to_ignore: collection of chromosome names to skip. A falsy
                             value means no chromosome is skipped.
    :param blackListFileName: blacklist file(s) handed to ``GTF``. If None,
                              0 is returned immediately.
    :param numberOfProcessors: a process pool is used only when this is > 1
                               and there is more than one region.
    :return: the summed worker results (0 if there is no blacklist).
    """
    blacklisted = 0
    if blackListFileName is None:
        return blacklisted

    # Get the chromosome lengths
    chromLens = dict(zip(bam_handle.references, bam_handle.lengths))

    bl = GTF(blackListFileName)
    hasOverlaps, minOverlap = bl.hasOverlaps(returnDistance=True)
    if hasOverlaps:
        sys.exit("Your blacklist file(s) has (have) regions that overlap. Proceeding with such a file would result in deepTools incorrectly calculating scaling factors. As such, you MUST fix this issue before being able to proceed.\n")
    if minOverlap < 1000:
        sys.stderr.write("WARNING: The minimum distance between intervals in your blacklist is {}. It makes little biological sense to include small regions between two blacklisted regions. Instead, these should likely be blacklisted as well.\n".format(minOverlap))

    regions = []
    for chrom in bl.chroms:
        if (not chroms_to_ignore or chrom not in chroms_to_ignore) and chrom in chromLens:
            for reg in bl.findOverlaps(chrom, 0, chromLens[chrom]):
                regions.append([bam_handle.filename, chrom, reg[0], reg[1]])

    if len(regions) > 0:
        import multiprocessing
        if len(regions) > 1 and numberOfProcessors > 1:
            pool = multiprocessing.Pool(numberOfProcessors)
            try:
                res = pool.map_async(bam_blacklisted_worker, regions).get(9999999)
                pool.close()
            except BaseException:
                # Do not wait for outstanding work if something went wrong
                pool.terminate()
                raise
            finally:
                # Reap the worker processes instead of leaking them
                pool.join()
        else:
            res = [bam_blacklisted_worker(x) for x in regions]
        for val in res:
            blacklisted += val

    return blacklisted
def getPositionsToSample(chrom, start, end, stepSize):
    """
    Return the positions to sample within ``[start, end)`` on ``chrom``.

    The base grid is every ``stepSize``-th position of the interval. If
    ``global_vars['extra_sampling_file']`` is set, each of its intervals
    overlapping the region contributes its own positions (same step size),
    after which the positions are sorted and de-duplicated. If
    ``global_vars['filter_out']`` is set, positions falling inside any of its
    overlapping intervals are removed.
    """
    positions_to_sample = np.arange(start, end, stepSize)

    filter_out_tree = None
    if global_vars['filter_out']:
        filter_out_tree = GTF(global_vars['filter_out'])

    extra_tree = None
    if global_vars['extra_sampling_file']:
        extra_tree = GTF(global_vars['extra_sampling_file'])

    if extra_tree:
        n_before = len(positions_to_sample)
        try:
            extra_hits = extra_tree.findOverlaps(chrom, start, end)
        except KeyError:
            # chromosome unknown to the extra-sampling file
            extra_hits = []

        if len(extra_hits) > 0:
            for hit in extra_hits:
                additional = list(range(hit[0], hit[1], stepSize))
                positions_to_sample = np.append(positions_to_sample, additional)
        # remove duplicates
        ordered = np.sort(positions_to_sample)
        positions_to_sample = np.unique(ordered)
        if debug:
            message = "sampling increased to {} from {}"
            print(message.format(len(positions_to_sample), n_before))

    # skip regions that are filtered out
    if filter_out_tree:
        try:
            filtered_hits = filter_out_tree.findOverlaps(chrom, start, end)
        except KeyError:
            # chromosome unknown to the filter file
            filtered_hits = []

        if len(filtered_hits) > 0:
            for hit in filtered_hits:
                inside = (positions_to_sample >= hit[0]) & (positions_to_sample < hit[1])
                positions_to_sample = positions_to_sample[~inside]
    return positions_to_sample
def getChunkLength(args, chromSize):
    """
    There's no point in parsing the GTF time over and over again needlessly.
    Empirically, it seems that adding ~4x the number of workers is ideal, since
    coverage is non-uniform. This is a heuristic way of approximating that.

    Note that if there are MANY small contigs and a few large ones (e.g., the
    max and median lengths are >10x different), then it's best to take a
    different tack.

    :param args: object providing ``region``, ``numberOfProcessors`` and
                 ``blackListFileName``.
    :param chromSize: iterable of (chromosome name, length) pairs.
    :return: the chunk length to use, never smaller than 1.
    """

    if args.region:
        _, region_start, region_end, _ = getUserRegion(chromSize, args.region)
        # Length of the region is end - start; the reversed subtraction is
        # negative and would always collapse the chunk length to 1.
        rv = np.ceil((region_end - region_start) / float(4 * args.numberOfProcessors)).astype(int)
        return max(1, rv)

    bl = None
    if args.blackListFileName:
        bl = GTF(args.blackListFileName)

    # Lengths of what remains of each chromosome after blSubtract
    lengths = []
    for k, v in chromSize:
        regs = blSubtract(bl, k, [0, v])
        for reg in regs:
            lengths.append(reg[1] - reg[0])

    if len(lengths) >= 4 * args.numberOfProcessors:
        rv = np.median(lengths).astype(int)
        # In cases like dm6 or GRCh38, there are a LOT of really small contigs, which will cause the median to be small and performance to tank
        if np.max(lengths) >= 10 * rv:
            rv = np.ceil(np.sum(lengths) / (4.0 * args.numberOfProcessors)).astype(int)
    else:
        rv = np.ceil(np.sum(lengths) / (4.0 * args.numberOfProcessors)).astype(int)

    return max(1, rv)
Exemple #6
0
def makeRegions(BED, args):
    """
    Build a flat list of ``[chrom, start, end]`` regions from BED/GTF files.

    Every interval found in the files is widened on both sides by the larger
    of ``args.beforeRegionsStartLength`` and ``args.afterRegionsStartLength``
    when ``args`` carries those options (no widening otherwise); the start is
    clamped at 0. For GTF files the transcripts are used.
    """
    itree = GTF(BED, transcriptID=args.transcriptID, transcript_id_designator=args.transcript_id_designator)

    # The before/after stuff is specific to computeMatrix
    if "beforeRegionsStartLength" in args:
        extend = max(args.beforeRegionsStartLength, args.afterRegionsStartLength)
    else:
        extend = 0

    o = []
    for chrom in itree.chroms:
        # bigWig files use 32 bit coordinates
        o.extend([chrom, max(0, reg[0] - extend), reg[1] + extend]
                 for reg in itree.findOverlaps(chrom, 0, 4294967295))
    del itree
    return o
Exemple #7
0
def bam_blacklisted_reads(bam_handle,
                          chroms_to_ignore,
                          blackListFileName=None,
                          numberOfProcessors=1):
    """
    Sum the results of ``bam_blacklisted_worker`` over every blacklisted
    region of a BAM file (presumably the number of reads in those regions).

    The program exits if the blacklist contains overlapping regions, and a
    warning is written to stderr if two blacklisted intervals are less than
    1000 apart.

    :param bam_handle: BAM handle; its ``references``, ``lengths`` and
                       ``filename`` attributes are used.
    :param chroms_to_ignore: collection of chromosome names to skip. A falsy
                             value means no chromosome is skipped.
    :param blackListFileName: blacklist file(s) handed to ``GTF``. If None,
                              0 is returned immediately.
    :param numberOfProcessors: a process pool is used only when this is > 1
                               and there is more than one region.
    :return: the summed worker results (0 if there is no blacklist).
    """
    blacklisted = 0
    if blackListFileName is None:
        return blacklisted

    # Get the chromosome lengths
    chromLens = dict(zip(bam_handle.references, bam_handle.lengths))

    bl = GTF(blackListFileName)
    hasOverlaps, minOverlap = bl.hasOverlaps(returnDistance=True)
    if hasOverlaps:
        sys.exit(
            "Your blacklist file(s) has (have) regions that overlap. Proceeding with such a file would result in deepTools incorrectly calculating scaling factors. As such, you MUST fix this issue before being able to proceed.\n"
        )
    if minOverlap < 1000:
        sys.stderr.write(
            "WARNING: The minimum distance between intervals in your blacklist is {}. It makes little biological sense to include small regions between two blacklisted regions. Instead, these should likely be blacklisted as well.\n"
            .format(minOverlap))

    regions = []
    for chrom in bl.chroms:
        if (not chroms_to_ignore
                or chrom not in chroms_to_ignore) and chrom in chromLens:
            for reg in bl.findOverlaps(chrom, 0, chromLens[chrom]):
                regions.append([bam_handle.filename, chrom, reg[0], reg[1]])

    if len(regions) > 0:
        import multiprocessing
        if len(regions) > 1 and numberOfProcessors > 1:
            pool = multiprocessing.Pool(numberOfProcessors)
            try:
                res = pool.map_async(bam_blacklisted_worker,
                                     regions).get(9999999)
                pool.close()
            except BaseException:
                # Do not wait for outstanding work if something went wrong
                pool.terminate()
                raise
            finally:
                # Reap the worker processes instead of leaking them
                pool.join()
        else:
            res = [bam_blacklisted_worker(x) for x in regions]
        for val in res:
            blacklisted += val

    return blacklisted
Exemple #8
0
def makeRegions(BED, args):
    """
    Build a flat list of ``[chrom, start, end]`` regions from BED/GTF files.

    Every interval found in the files is widened on both sides by the larger
    of ``args.beforeRegionStartLength`` and ``args.afterRegionStartLength``
    when ``args`` carries those options (no widening otherwise); the start is
    clamped at 0. For GTF files the transcripts are used.
    """
    itree = GTF(BED,
                transcriptID=args.transcriptID,
                transcript_id_designator=args.transcript_id_designator)

    # The before/after stuff is specific to computeMatrix
    if "beforeRegionStartLength" in args:
        extend = max(args.beforeRegionStartLength, args.afterRegionStartLength)
    else:
        extend = 0

    o = []
    for chrom in itree.chroms:
        # bigWig files use 32 bit coordinates
        o.extend([chrom, max(0, reg[0] - extend), reg[1] + extend]
                 for reg in itree.findOverlaps(chrom, 0, 4294967295))
    del itree
    return o
Exemple #9
0
def getPositionsToSample(chrom, start, end, stepSize):
    """
    Return the positions to sample within ``[start, end)`` on ``chrom``.

    The base grid is every ``stepSize``-th position of the interval. If
    ``global_vars['extra_sampling_file']`` is set, each of its intervals
    overlapping the region contributes its own positions (same step size),
    after which the positions are sorted and de-duplicated. If
    ``global_vars['filter_out']`` is set, positions falling inside any of its
    overlapping intervals are removed.
    """
    positions_to_sample = np.arange(start, end, stepSize)

    filter_out_tree = None
    if global_vars['filter_out']:
        filter_out_tree = GTF(global_vars['filter_out'])

    extra_tree = None
    if global_vars['extra_sampling_file']:
        extra_tree = GTF(global_vars['extra_sampling_file'])

    if extra_tree:
        n_before = len(positions_to_sample)
        try:
            extra_hits = extra_tree.findOverlaps(chrom, start, end)
        except KeyError:
            # chromosome unknown to the extra-sampling file
            extra_hits = []

        if len(extra_hits) > 0:
            for hit in extra_hits:
                additional = list(range(hit[0], hit[1], stepSize))
                positions_to_sample = np.append(positions_to_sample, additional)
        # remove duplicates
        ordered = np.sort(positions_to_sample)
        positions_to_sample = np.unique(ordered)
        if debug:
            message = "sampling increased to {} from {}"
            print(message.format(len(positions_to_sample), n_before))

    # skip regions that are filtered out
    if filter_out_tree:
        try:
            filtered_hits = filter_out_tree.findOverlaps(chrom, start, end)
        except KeyError:
            # chromosome unknown to the filter file
            filtered_hits = []

        if len(filtered_hits) > 0:
            for hit in filtered_hits:
                inside = (positions_to_sample >= hit[0]) & (positions_to_sample < hit[1])
                positions_to_sample = positions_to_sample[~inside]
    return positions_to_sample
Exemple #10
0
def main():
    """
    Command-line entry point: print repeat-free, gene-free regions.

    Reads the repeat masker file line by line. Whenever two consecutive
    repeats on the same chromosome are at least ``--minimumProof`` apart, the
    gap between them is split by ``splitByGenes`` (using ``--wobble``) and
    every resulting block that is still at least ``--minimumProof`` long and
    not rejected by ``highN`` (using ``--legalBAC``) is printed as a
    tab-separated ``chromosome, start, end`` line.
    """
    parser = argparse.ArgumentParser(add_help=True, description="Bourbon finds contiguous regions without repeats (low peat content) of a minimum size and without genes within some distance. Output is written to the terminal. Note that this program currently ignores the ends of chromosomes.")
    parser.add_argument("rmsk", help="Repeat masker file")
    parser.add_argument("gtf", help="GTF file")
    parser.add_argument("tbit", help="2bit file")
    parser.add_argument("--minimumProof", type=int, default=15000, help="Minimum size of a repeat-free region (default %(default)s)")
    parser.add_argument("--wobble", type=int, default=5000, help="Ensure no genes are within this distance of a region of interest (default %(default)s)")
    parser.add_argument("--legalBAC", type=float, default=0.01, help="Maximum N content (default %(default)s)")
    args = parser.parse_args()

    # Produce a header
    print("Chromosome\tStart\tEnd")

    genes = GTF(args.gtf)
    # The context manager / finally make sure both input files are closed
    # even if parsing fails part-way through.
    with open(args.rmsk) as rmsk:
        tb = py2bit.open(args.tbit)
        try:
            lastChrom = None
            lastEnd = 0
            for line in rmsk:
                if line.startswith("#"):
                    continue
                cols = line.strip().split()
                chrom = cols[5]
                start = int(cols[6]) - 1  # convert the start to 0-based
                end = int(cols[7])
                # Only gaps between two repeats on the same chromosome count
                if chrom == lastChrom and start - lastEnd >= args.minimumProof:
                    for block in splitByGenes(chrom, lastEnd, start, genes, args.wobble):
                        if block[1] - block[0] < args.minimumProof:
                            continue
                        if not highN(chrom, block[0], block[1], tb, args.legalBAC):
                            print("{}\t{}\t{}".format(chrom, block[0], block[1]))
                lastChrom = chrom
                lastEnd = end
        finally:
            tb.close()
Exemple #11
0
def mapReduce(staticArgs,
              func,
              chromSize,
              genomeChunkLength=None,
              region=None,
              bedFile=None,
              blackListFileName=None,
              numberOfProcessors=4,
              verbose=False,
              includeLabels=False,
              keepExons=False,
              transcriptID="transcriptID",
              exonID="exonID",
              transcript_id_designator="transcript_id",
              self_=None):
    """
    Split the genome into parts that are sent to workers using a defined
    number of processors. Results are collected and returned.

    For each genomic region the given 'func' is called with a single tuple
    holding the following values:

     [self_,] chrom, start, end, *staticArgs[, bed_regions_list]

    The *staticArgs* are static, *picklable* variables that need to be sent
    to workers.

    The genome chunk length corresponds to a fraction of the genome, in bp,
    that is sent to each of the workers for processing (default 1e5).

    Depending on the type of process, larger or shorter regions may be
    preferred.

    :param chromSize: A list of tuples containing the chromosome
                      name and its length
    :param region: The format is chr:start:end:tileSize (see function
                   getUserRegion)
    :param staticArgs: tuple of arguments that are sent to the given 'func'

    :param func: function to call. The function is called with one tuple
                 argument, as described above.
    :param bedFile: If a bed file is given, the args to the func to be
                    called are extended to include a list of bed
                    defined regions. Chunks without any bed region are
                    not sent to 'func'.
    :param blackListFileName: A list of regions to exclude from all computations.
                              Note that this has genomeChunkLength resolution...
    :param numberOfProcessors: A process pool is used only if this is > 1
                               and there is more than one task. In that
                               case the tasks are shuffled first.
    :param self_: In case mapreduce should make a call to an object
                  the self variable has to be passed.
    :param includeLabels: Pass group and transcript labels into the calling
                          function. These are added to the static args
                          (groupLabel and transcriptName).

    If "includeLabels" is true, a tuple of (results, labels) is returned,
    where labels is None when no bedFile was given.
    """

    if not genomeChunkLength:
        genomeChunkLength = 1e5
    genomeChunkLength = int(genomeChunkLength)

    if verbose:
        print("genome partition size for multiprocessing: {0}".format(
            genomeChunkLength))

    region_start = 0
    region_end = None

    # if a region is set, that means that the task should only cover
    # the given genomic position

    if region:
        chromSize, region_start, region_end, genomeChunkLength = getUserRegion(
            chromSize, region)
        if verbose:
            print("chrom size: {0}, region start: {1}, region end: {2}, "
                  "genome chunk length sent to each procesor: {3}".format(
                      chromSize, region_start, region_end, genomeChunkLength))

    if bedFile:
        defaultGroup = None
        if len(bedFile) == 1:
            defaultGroup = "genes"
        bed_interval_tree = GTF(
            bedFile,
            defaultGroup=defaultGroup,
            transcriptID=transcriptID,
            exonID=exonID,
            transcript_id_designator=transcript_id_designator,
            keepExons=keepExons)

    if blackListFileName:
        blackList = GTF(blackListFileName)

    TASKS = []
    # iterate over all chromosomes
    for chrom, size in chromSize:
        # the start is zero unless a specific region is defined
        for startPos in range(region_start, size, genomeChunkLength):
            endPos = min(size, startPos + genomeChunkLength)

            # Remove the blacklisted parts of the chunk, if any
            if blackListFileName:
                regions = blSubtract(blackList, chrom, [startPos, endPos])
            else:
                regions = [[startPos, endPos]]

            for reg in regions:
                argsList = [self_] if self_ is not None else []
                argsList.extend([chrom, reg[0], reg[1]])
                # add to the argument list the static list received by the function
                argsList.extend(staticArgs)

                # if a bed file is given, append to the TASK list,
                # a list of bed regions that overlap with the
                # current genomeChunk.
                if bedFile:
                    # This effectively creates batches of intervals, which is
                    # generally more performant due to the added overhead of
                    # initializing additional workers.

                    # TODO, there's no point in including the chromosome
                    if includeLabels:
                        bed_regions_list = [
                            [chrom, x[4], x[2], x[3], x[5], x[6]]
                            for x in bed_interval_tree.findOverlaps(
                                chrom,
                                reg[0],
                                reg[1],
                                trimOverlap=True,
                                numericGroups=True,
                                includeStrand=True)]
                    else:
                        bed_regions_list = [
                            [chrom, x[4], x[5], x[6]]
                            for x in bed_interval_tree.findOverlaps(
                                chrom,
                                reg[0],
                                reg[1],
                                trimOverlap=True,
                                includeStrand=True)]

                    if len(bed_regions_list) == 0:
                        continue
                    # add to argument list, the position of the bed regions to use
                    argsList.append(bed_regions_list)

                TASKS.append(tuple(argsList))

    if len(TASKS) > 1 and numberOfProcessors > 1:
        if verbose:
            print(("using {} processors for {} "
                   "number of tasks".format(numberOfProcessors, len(TASKS))))
        random.shuffle(TASKS)
        pool = multiprocessing.Pool(numberOfProcessors)
        try:
            res = pool.map_async(func, TASKS).get(9999999)
            pool.close()
        except BaseException:
            # Do not wait for outstanding work if something went wrong
            pool.terminate()
            raise
        finally:
            # Reap the worker processes instead of leaking them
            pool.join()
    else:
        res = list(map(func, TASKS))

    if includeLabels:
        if bedFile:
            return res, bed_interval_tree.labels
        else:
            return res, None
    return res
Exemple #12
0
    def get_coverage_of_region(self,
                               bamHandle,
                               chrom,
                               regions,
                               fragmentFromRead_func=None):
        """
        Returns a numpy array that corresponds to the number of reads
        that overlap with each tile.

        ``regions`` is a list of ``(start, end)`` tuples (one bin each) or of
        ``(start, end, tileSize)`` tuples (``(end - start) // tileSize`` bins
        each). Bins of regions that overlap the blacklist are left at 0.
        Raises NameError if ``chrom`` is not in ``bamHandle.references``.

        >>> test = Tester()
        >>> import pysam
        >>> c = CountReadsPerBin([], stepSize=1, extendReads=300)

        For this case the reads are length 36. The number of overlapping
        read fragments is 4 and 5 for the positions tested.

        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
        ... [(5000833, 5000834), (5000834, 5000835)])
        array([4., 5.])

        In the following example a paired read is extended to the fragment length which is 100
        The first mate starts at 5000000 and the second at 5000064. Each mate is
        extended to the fragment length *independently*
        At position 500090-500100 one fragment  of length 100 overlap, and after position 5000101
        there should be zero reads.

        >>> c.zerosToNans = True
        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
        ... [(5000090, 5000100), (5000100, 5000110)])
        array([ 1., nan])

        In the following  case the reads length is 50. Reads are not extended.

        >>> c.extendReads=False
        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)])
        array([1., 2., 2.])


        """
        if not fragmentFromRead_func:
            fragmentFromRead_func = self.get_fragment_from_read
        nbins = len(regions)
        if len(regions[0]) == 3:
            nbins = 0
            for reg in regions:
                nbins += (reg[1] - reg[0]) // reg[2]
        coverages = np.zeros(nbins, dtype='float64')

        if self.defaultFragmentLength == 'read length':
            extension = 0
        else:
            extension = self.maxPairedFragmentLength

        blackList = None
        if self.blackListFileName is not None:
            blackList = GTF(self.blackListFileName)

        # Index in `coverages` of the first bin of the current region
        vector_start = 0
        for reg in regions:
            if len(reg) == 3:
                tileSize = int(reg[2])
                nRegBins = (reg[1] - reg[0]) // tileSize
            else:
                nRegBins = 1
                tileSize = int(reg[1] - reg[0])

            # Blacklisted regions have a coverage of 0
            if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
                # Still step over this region's bins, otherwise the counts of
                # the following regions would be stored in the wrong bins.
                vector_start += nRegBins
                continue
            regStart = int(max(0, reg[0] - extension))
            regEnd = reg[1] + int(extension)

            # If alignments are extended and there's a blacklist, ensure that no
            # reads originating in a blacklist are fetched
            if blackList and reg[0] > 0 and extension > 0:
                o = blackList.findOverlaps(chrom, regStart, reg[0])
                if o is not None and len(o) > 0:
                    regStart = o[-1][1]
                o = blackList.findOverlaps(chrom, reg[1], regEnd)
                if o is not None and len(o) > 0:
                    regEnd = o[0][0]

            start_time = time.time()
            # caching seems faster. TODO: profile the function
            c = 0
            if chrom not in bamHandle.references:
                raise NameError(
                    "chromosome {} not found in bam file".format(chrom))

            # Fragment keys already seen at the current read start position
            # (only used when duplicates are ignored)
            prev_pos = set()
            lpos = None
            for read in bamHandle.fetch(chrom, regStart, regEnd):
                if read.is_unmapped:
                    continue
                if self.minMappingQuality and read.mapq < self.minMappingQuality:
                    continue

                # filter reads based on SAM flag
                if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
                    continue
                if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
                    continue

                # Fragment lengths
                tLen = deeptools.utilities.getTLen(read)
                if self.minFragmentLength > 0 and tLen < self.minFragmentLength:
                    continue
                if self.maxFragmentLength > 0 and tLen > self.maxFragmentLength:
                    continue

                # get rid of duplicate reads that have same position on each of the
                # pairs
                if self.ignoreDuplicates:
                    # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                    if tLen >= 0:
                        s = read.pos
                        e = s + tLen
                    else:
                        s = read.pnext
                        e = s - tLen
                    if read.reference_id != read.next_reference_id:
                        e = read.pnext
                    if lpos is not None and lpos == read.reference_start \
                            and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                        continue
                    if lpos != read.reference_start:
                        prev_pos.clear()
                    lpos = read.reference_start
                    prev_pos.add(
                        (s, e, read.next_reference_id, read.is_reverse))

                # since reads can be split (e.g. RNA-seq reads) each part of the
                # read that maps is called a position block.
                try:
                    position_blocks = fragmentFromRead_func(read)
                except TypeError:
                    # the get_fragment_from_read functions returns None in some cases.
                    # Those cases are to be skipped, hence the continue line.
                    continue

                # End index of the previous block of this read, so that no
                # bin is incremented twice by the same read
                last_eIdx = None
                for fragmentStart, fragmentEnd in position_blocks:
                    if fragmentEnd is None or fragmentStart is None:
                        continue
                    fragmentLength = fragmentEnd - fragmentStart
                    if fragmentLength == 0:
                        continue
                    # skip reads that are not in the region being
                    # evaluated.
                    if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
                        continue

                    if fragmentStart < reg[0]:
                        fragmentStart = reg[0]
                    if fragmentEnd > reg[0] + len(coverages) * tileSize:
                        fragmentEnd = reg[0] + len(coverages) * tileSize

                    sIdx = vector_start + max(
                        (fragmentStart - reg[0]) // tileSize, 0)
                    eIdx = vector_start + min(
                        np.ceil(float(fragmentEnd - reg[0]) /
                                tileSize).astype('int'), nRegBins)
                    if last_eIdx is not None:
                        sIdx = max(last_eIdx, sIdx)
                        if sIdx >= eIdx:
                            continue
                    sIdx = int(sIdx)
                    eIdx = int(eIdx)
                    coverages[sIdx:eIdx] += 1
                    last_eIdx = eIdx

                c += 1

            if self.verbose:
                endTime = time.time()
                print("%s,  processing %s (%.1f per sec) reads @ %s:%s-%s" %
                      (multiprocessing.current_process().name, c, c /
                       (endTime - start_time), chrom, reg[0], reg[1]))

            vector_start += nRegBins

        # change zeros to NAN
        if self.zerosToNans:
            coverages[coverages == 0] = np.nan

        return coverages
Exemple #13
0
    def count_reads_in_region(self, chrom, start, end, bed_regions_list=None):
        """Counts the reads in each bam file at each 'stepSize' position
        within the interval (start, end) for a window or bin of size binLength.

        The stepSize controls the distance between bins. For example,
        a step size of 20 and a bin size of 20 will create bins next to
        each other. If the step size is smaller than the bin size the
        bins will overlap.

        If a list of bedRegions is given, then the number of reads
        that overlaps with each region is counted.

        Parameters
        ----------
        chrom : str
            Chrom name
        start : int
            start coordinate
        end : int
            end coordinate
        bed_regions_list: list
            List of list of tuples of the form (start, end)
            corresponding to bed regions to be processed.
            If not bed file was passed to the object constructor
            then this list is empty.

        Returns
        -------
        numpy array
            The result is a numpy array that as rows each bin
            and as columns each bam file.


        Examples
        --------
        Initialize some useful values

        >>> test = Tester()
        >>> c = CountReadsPerBin([test.bamFile1, test.bamFile2], 25, 0, stepSize=50)

        The transpose is used to get better looking numbers. The first line
        corresponds to the number of reads per bin in the first bamfile.

        >>> _array, __ = c.count_reads_in_region(test.chrom, 0, 200)
        >>> _array
        array([[0., 0.],
               [0., 1.],
               [1., 1.],
               [1., 2.]])

        """

        if start > end:
            raise NameError("start %d bigger that end %d" % (start, end))

        if self.stepSize is None and bed_regions_list is None:
            raise ValueError("stepSize is not set!")
        # array to keep the read counts for the regions
        subnum_reads_per_bin = []

        start_time = time.time()

        bam_handles = []
        for fname in self.bamFilesList:
            try:
                bam_handles.append(bamHandler.openBam(fname))
            except SystemExit:
                sys.exit(sys.exc_info()[1])
            except:
                bam_handles.append(pyBigWig.open(fname))

        blackList = None
        if self.blackListFileName is not None:
            blackList = GTF(self.blackListFileName)

        # A list of lists of tuples
        transcriptsToConsider = []
        if bed_regions_list is not None:
            transcriptsToConsider = [x[1] for x in bed_regions_list]
        else:
            if self.stepSize == self.binLength:
                transcriptsToConsider.append([(start, end, self.binLength)])
            else:
                for i in range(start, end, self.stepSize):
                    if i + self.binLength > end:
                        break
                    if blackList is not None and blackList.findOverlaps(
                            chrom, i, i + self.binLength):
                        continue
                    transcriptsToConsider.append([(i, i + self.binLength)])

        if self.save_data:
            _file = open(deeptools.utilities.getTempFileName(suffix='.bed'),
                         'w+t')
            _file_name = _file.name
        else:
            _file_name = ''

        for bam in bam_handles:
            for trans in transcriptsToConsider:
                tcov = self.get_coverage_of_region(bam, chrom, trans)
                if bed_regions_list is not None:
                    subnum_reads_per_bin.append(np.sum(tcov))
                else:
                    subnum_reads_per_bin.extend(tcov)

        subnum_reads_per_bin = np.concatenate([subnum_reads_per_bin]).reshape(
            -1, len(self.bamFilesList), order='F')

        if self.save_data:
            idx = 0
            for i, trans in enumerate(transcriptsToConsider):
                if len(trans[0]) != 3:
                    starts = ",".join([str(x[0]) for x in trans])
                    ends = ",".join([str(x[1]) for x in trans])
                    _file.write("\t".join([chrom, starts, ends]) + "\t")
                    _file.write("\t".join(
                        ["{}".format(x)
                         for x in subnum_reads_per_bin[i, :]]) + "\n")
                else:
                    for exon in trans:
                        for startPos in range(exon[0], exon[1], exon[2]):
                            if idx >= subnum_reads_per_bin.shape[0]:
                                # At the end of chromosomes (or due to blacklisted regions), there are bins smaller than the bin size
                                # Counts there are added to the bin before them, but range() will still try to include them.
                                break
                            _file.write("{0}\t{1}\t{2}\t".format(
                                chrom, startPos, startPos + exon[2]))
                            _file.write("\t".join([
                                "{}".format(x)
                                for x in subnum_reads_per_bin[idx, :]
                            ]) + "\n")
                            idx += 1
            _file.close()

        if self.verbose:
            endTime = time.time()
            rows = subnum_reads_per_bin.shape[0]
            print("%s countReadsInRegions_worker: processing %d "
                  "(%.1f per sec) @ %s:%s-%s" %
                  (multiprocessing.current_process().name, rows, rows /
                   (endTime - start_time), chrom, start, end))

        return subnum_reads_per_bin, _file_name
    def get_coverage_of_region(self, bamHandle, chrom, regions,
                               fragmentFromRead_func=None):
        """
        Returns a numpy array with the coverage accumulated in each tile
        (bin) described by ``regions``.

        For BAM input every fragment adds (roughly) the number of its bases
        that fall inside a tile, capped at the tile size, rather than a flat
        count of one. For bigWig input the mean signal of each tile
        multiplied by the tile size is used instead.

        Parameters
        ----------
        bamHandle : object
            Either a BAM handle exposing ``references`` and ``fetch`` (e.g.
            ``pysam.AlignmentFile``) or a bigWig handle exposing ``chroms``
            and ``stats``.
        chrom : str
            Chromosome name.
        regions : list of tuples
            Each entry is either ``(start, end)``, treated as a single tile,
            or ``(start, end, tileSize)``, which is split into
            ``(end - start) // tileSize`` tiles.
        fragmentFromRead_func : callable, optional
            Function mapping a read to an iterable of ``(start, end)``
            blocks. Defaults to ``self.get_fragment_from_read``.

        Returns
        -------
        numpy array
            One float64 value per tile. If ``self.zerosToNans`` is set, tiles
            whose value is 0 are reported as NaN.

        Raises
        ------
        NameError
            If ``chrom`` is not present in the BAM or bigWig file.

        Examples
        --------
        >>> test = Tester()
        >>> import pysam
        >>> c = SumCoveragePerBin([], stepSize=1, extendReads=300)

        For this case the reads are length 36. The number of overlapping
        read fragments is 4 and 5 for the positions tested. Note that reads are
        NOT extended, due to there being a 0 length input list of BAM files!

        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
        ... [(5000833, 5000834), (5000834, 5000835)])
        array([4., 5.])

        In the following  case the reads length is 50. Reads are not extended.

        >>> c.extendReads=False
        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)])
        array([2., 4., 4.])


        """
        if not fragmentFromRead_func:
            fragmentFromRead_func = self.get_fragment_from_read
        nbins = len(regions)
        if len(regions[0]) == 3:
            # (start, end, tileSize) entries expand into several bins each
            nbins = 0
            for reg in regions:
                nbins += (reg[1] - reg[0]) // reg[2]
        coverages = np.zeros(nbins, dtype='float64')

        # How far beyond a region reads are fetched so that extended
        # fragments reaching into the region are not missed
        if self.defaultFragmentLength == 'read length':
            extension = 0
        else:
            extension = self.maxPairedFragmentLength

        blackList = None
        if self.blackListFileName is not None:
            blackList = GTF(self.blackListFileName)

        # Index in `coverages` of the first bin of the current region
        vector_start = 0
        for reg in regions:
            if len(reg) == 3:
                tileSize = int(reg[2])
                nRegBins = (reg[1] - reg[0]) // tileSize
            else:
                nRegBins = 1
                tileSize = int(reg[1] - reg[0])

            # Blacklisted regions have a coverage of 0
            # NOTE: `continue` also skips the `vector_start` update at the
            # bottom of the loop.
            if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
                continue
            regStart = int(max(0, reg[0] - extension))
            regEnd = reg[1] + int(extension)

            # If alignments are extended and there's a blacklist, ensure that no
            # reads originating in a blacklist are fetched
            if blackList and reg[0] > 0 and extension > 0:
                o = blackList.findOverlaps(chrom, regStart, reg[0])
                if o is not None and len(o) > 0:
                    regStart = o[-1][1]
                o = blackList.findOverlaps(chrom, reg[1], regEnd)
                if o is not None and len(o) > 0:
                    regEnd = o[0][0]

            start_time = time.time()
            # caching seems faster. TODO: profile the function
            c = 0
            if hasattr(bamHandle, 'references'):
                # BAM input. The check is done outside of any try/except so
                # that the NameError below reaches the caller instead of
                # being swallowed and replaced by an unrelated error.
                if chrom not in bamHandle.references:
                    raise NameError("chromosome {} not found in bam file".format(chrom))
            else:
                # bigWig input, as used by plotFingerprint
                if bamHandle.chroms(chrom):
                    # np.float64 rather than the alias np.float, which newer
                    # NumPy releases no longer provide.
                    _ = np.array(bamHandle.stats(chrom, regStart, regEnd, type="mean", nBins=nRegBins), dtype=np.float64)
                    _[np.isnan(_)] = 0.0
                    _ = _ * tileSize
                    # NOTE(review): the values are added to the whole vector
                    # (not at vector_start) and vector_start is not advanced;
                    # this lines up when `regions` holds a single entry -
                    # confirm the multi-entry case with callers.
                    coverages += _
                    continue
                else:
                    raise NameError("chromosome {} not found in bigWig file with chroms {}".format(chrom, bamHandle.chroms()))

            # Duplicate tracking: `lpos` is the reference start of the last
            # kept read and `prev_pos` the fragment signatures seen there.
            prev_pos = set()
            lpos = None
            # of previous processed read pair
            for read in bamHandle.fetch(chrom, regStart, regEnd):
                if read.is_unmapped:
                    continue
                if self.minMappingQuality and read.mapq < self.minMappingQuality:
                    continue

                # filter reads based on SAM flag
                if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
                    continue
                if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
                    continue

                # Fragment lengths
                tLen = getTLen(read)
                if self.minFragmentLength > 0 and tLen < self.minFragmentLength:
                    continue
                if self.maxFragmentLength > 0 and tLen > self.maxFragmentLength:
                    continue

                # get rid of duplicate reads that have same position on each of the
                # pairs
                if self.ignoreDuplicates:
                    # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                    if tLen >= 0:
                        s = read.pos
                        e = s + tLen
                    else:
                        s = read.pnext
                        e = s - tLen
                    if read.reference_id != read.next_reference_id:
                        e = read.pnext
                    if lpos is not None and lpos == read.reference_start \
                            and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                        continue
                    if lpos != read.reference_start:
                        prev_pos.clear()
                    lpos = read.reference_start
                    prev_pos.add((s, e, read.next_reference_id, read.is_reverse))

                # since reads can be split (e.g. RNA-seq reads) each part of the
                # read that maps is called a position block.
                try:
                    position_blocks = fragmentFromRead_func(read)
                except TypeError:
                    # the get_fragment_from_read functions returns None in some cases.
                    # Those cases are to be skipped, hence the continue line.
                    continue

                # Last bin index touched by the previous block of this read,
                # used to avoid crediting the same bin twice per read
                last_eIdx = None
                for fragmentStart, fragmentEnd in position_blocks:
                    if fragmentEnd is None or fragmentStart is None:
                        continue
                    fragmentLength = fragmentEnd - fragmentStart
                    if fragmentLength == 0:
                        continue
                    # skip reads that are not in the region being
                    # evaluated.
                    if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
                        continue

                    # Clip the block to the span covered by the bins
                    if fragmentStart < reg[0]:
                        fragmentStart = reg[0]
                    if fragmentEnd > reg[0] + len(coverages) * tileSize:
                        fragmentEnd = reg[0] + len(coverages) * tileSize

                    sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0)
                    eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins)
                    if eIdx >= len(coverages):
                        eIdx = len(coverages) - 1
                    if last_eIdx is not None:
                        sIdx = max(last_eIdx, sIdx)
                        if sIdx >= eIdx:
                            continue

                    # First bin
                    if fragmentEnd < reg[0] + (sIdx + 1) * tileSize:
                        _ = fragmentEnd - fragmentStart
                    else:
                        _ = reg[0] + (sIdx + 1) * tileSize - fragmentStart
                    if _ > tileSize:
                        _ = tileSize
                    coverages[sIdx] += _
                    # Bins strictly between the first and the last one are
                    # fully covered
                    _ = sIdx + 1
                    while _ < eIdx:
                        coverages[_] += tileSize
                        _ += 1
                    while eIdx - sIdx >= nRegBins:
                        eIdx -= 1
                    # Last bin: only the part of the block reaching into it
                    if eIdx > sIdx:
                        _ = fragmentEnd - (reg[0] + eIdx * tileSize)
                        if _ > tileSize:
                            _ = tileSize
                        elif _ < 0:
                            _ = 0
                        coverages[eIdx] += _
                    last_eIdx = eIdx

                c += 1

            if self.verbose:
                endTime = time.time()
                print("%s,  processing %s (%.1f per sec) reads @ %s:%s-%s" % (
                    multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1]))

            vector_start += nRegBins

        # change zeros to NAN
        if self.zerosToNans:
            coverages[coverages == 0] = np.nan

        return coverages
    def get_coverage_of_region(self, bamHandle, chrom, regions,
                               fragmentFromRead_func=None):
        """
        Count, for every tile (bin) described by ``regions``, the reads
        whose fragment blocks overlap it and return the counts as a numpy
        array.

        ``regions`` is a list of tuples. A ``(start, end)`` entry is one
        tile; a ``(start, end, tileSize)`` entry is split into
        ``(end - start) // tileSize`` tiles. ``fragmentFromRead_func`` maps a
        read to an iterable of ``(start, end)`` blocks and defaults to
        ``self.get_fragment_from_read``. A ``NameError`` is raised when
        ``chrom`` is not among ``bamHandle.references``. When
        ``self.zerosToNans`` is set, tiles with a count of 0 become NaN.

        >>> test = Tester()
        >>> import pysam
        >>> c = CountReadsPerBin([], stepSize=1, extendReads=300)

        For this case the reads are length 36. The number of overlapping
        read fragments is 4 and 5 for the positions tested.

        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
        ... [(5000833, 5000834), (5000834, 5000835)])
        array([ 4.,  5.])

        In the following example a paired read is extended to the fragment length which is 100
        The first mate starts at 5000000 and the second at 5000064. Each mate is
        extended to the fragment length *independently*
        At position 5000090-5000100 one fragment of length 100 overlaps, and after position 5000101
        there should be zero reads.

        >>> c.zerosToNans = True
        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
        ... [(5000090, 5000100), (5000100, 5000110)])
        array([  1.,  nan])

        In the following  case the reads length is 50. Reads are not extended.

        >>> c.extendReads=False
        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)])
        array([ 1.,  2.,  2.])


        """
        read_to_blocks = fragmentFromRead_func or self.get_fragment_from_read

        # Total number of tiles over all regions
        if len(regions[0]) == 3:
            total_bins = sum((r[1] - r[0]) // r[2] for r in regions)
        else:
            total_bins = len(regions)
        counts = np.zeros(total_bins, dtype='float64')

        # Margin added around a region when fetching reads
        extension = (0 if self.defaultFragmentLength == 'read length'
                     else self.maxPairedFragmentLength)

        blacklist = (GTF(self.blackListFileName)
                     if self.blackListFileName is not None else None)

        # Position in `counts` of the first tile of the current region
        offset = 0
        for region in regions:
            region_start, region_end = region[0], region[1]
            if len(region) == 3:
                tile = int(region[2])
                bins_here = (region_end - region_start) // tile
            else:
                bins_here = 1
                tile = int(region_end - region_start)

            # A region touching the blacklist keeps a count of 0; note that
            # `offset` is not advanced in that case.
            if blacklist and blacklist.findOverlaps(chrom, region_start, region_end):
                continue

            fetch_start = int(max(0, region_start - extension))
            fetch_end = region_end + int(extension)

            # With extended alignments, shrink the fetch window so that it
            # does not reach into blacklisted intervals on either side.
            if blacklist and region_start > 0 and extension > 0:
                upstream = blacklist.findOverlaps(chrom, fetch_start, region_start)
                if upstream is not None and len(upstream) > 0:
                    fetch_start = upstream[-1][1]
                downstream = blacklist.findOverlaps(chrom, region_end, fetch_end)
                if downstream is not None and len(downstream) > 0:
                    fetch_end = downstream[0][0]

            tic = time.time()
            n_reads = 0
            if chrom not in bamHandle.references:
                raise NameError("chromosome {} not found in bam file".format(chrom))
            # Materialise the mapped reads (flag bit 4 marks unmapped ones)
            mapped = [aln for aln in bamHandle.fetch(chrom, fetch_start, fetch_end)
                      if not aln.flag & 4]

            # Signature of the last counted read, used to drop duplicates
            previous_key = None
            for read in mapped:
                if self.minMappingQuality and read.mapq < self.minMappingQuality:
                    continue

                # SAM flag based filtering
                if self.samFlag_include and (read.flag & self.samFlag_include) != self.samFlag_include:
                    continue
                if self.samFlag_exclude and (read.flag & self.samFlag_exclude) != 0:
                    continue

                # Fragment length based filtering
                template_len = deeptools.utilities.getTLen(read)
                if self.minFragmentLength > 0 and template_len < self.minFragmentLength:
                    continue
                if self.maxFragmentLength > 0 and template_len > self.maxFragmentLength:
                    continue

                # Skip a read repeating the start, mate start and strand of
                # the previously counted one.
                if self.ignoreDuplicates and previous_key \
                        and previous_key == (read.reference_start, read.pnext, read.is_reverse):
                    continue

                # A read may map as several blocks (e.g. spliced RNA-seq
                # reads); reads for which the helper fails with TypeError
                # are skipped.
                try:
                    blocks = read_to_blocks(read)
                except TypeError:
                    continue

                # End index of the tiles already credited to this read
                covered_upto = None
                for block_start, block_end in blocks:
                    if block_start is None or block_end is None:
                        continue
                    if block_end - block_start == 0:
                        continue
                    # Block entirely outside of the region
                    if block_end <= region_start or block_start >= region_end:
                        continue

                    # Clip the block to the span covered by the tiles
                    block_start = max(block_start, region_start)
                    block_end = min(block_end, region_start + len(counts) * tile)

                    first = offset + max((block_start - region_start) // tile, 0)
                    last = offset + min(
                        np.ceil(float(block_end - region_start) / tile).astype('int'),
                        bins_here)
                    if covered_upto is not None:
                        # Never count the same tile twice for one read
                        first = max(covered_upto, first)
                        if first >= last:
                            continue
                    counts[first:last] += 1
                    covered_upto = last

                previous_key = (read.reference_start, read.pnext, read.is_reverse)
                n_reads += 1

            if self.verbose:
                toc = time.time()
                print("%s,  processing %s (%.1f per sec) reads @ %s:%s-%s" % (
                    multiprocessing.current_process().name, n_reads,
                    n_reads / (toc - tic), chrom, region[0], region[1]))

            offset += bins_here

        # Report empty tiles as NaN when requested
        if self.zerosToNans:
            counts[counts == 0] = np.nan

        return counts
    def count_reads_in_region(self, chrom, start, end, bed_regions_list=None):
        """Counts the reads in each bam file at each 'stepSize' position
        within the interval (start, end) for a window or bin of size binLength.

        The stepSize controls the distance between bins. For example,
        a step size of 20 and a bin size of 20 will create bins next to
        each other. If the step size is smaller than the bin size the
        bins will overlap.

        If a list of bedRegions is given, then the number of reads
        that overlaps with each region is counted.

        Parameters
        ----------
        chrom : str
            Chrom name
        start : int
            start coordinate
        end : int
            end coordinate
        bed_regions_list: list
            Regions to be processed instead of bins. For each element ``x``
            of the list, ``x[1]`` is used as the list of tuples of the
            form (start, end) that make up one region. If ``None``
            (the default), bins are generated from ``start``, ``end``,
            ``self.stepSize`` and ``self.binLength``.

        Returns
        -------
        tuple
            ``(counts, file_name)``. ``counts`` is a numpy array that has
            as rows each bin (or bed region) and as columns each bam file.
            ``file_name`` is the path of the BED-like file the counts were
            written to when ``self.save_data`` is set, otherwise ``''``.

        Raises
        ------
        NameError
            If ``start`` is larger than ``end``.
        ValueError
            If ``self.stepSize`` is ``None``.

        Examples
        --------
        Initialize some useful values

        >>> test = Tester()
        >>> c = CountReadsPerBin([test.bamFile1, test.bamFile2], 25, 0, stepSize=50)

        The transpose is used to get better looking numbers. The first line
        corresponds to the number of reads per bin in the first bamfile.

        >>> _array, __ = c.count_reads_in_region(test.chrom, 0, 200)
        >>> _array
        array([[ 0.,  0.],
               [ 0.,  1.],
               [ 1.,  1.],
               [ 1.,  2.]])

        """

        if start > end:
            raise NameError("start %d bigger that end %d" % (start, end))

        if self.stepSize is None:
            raise ValueError("stepSize is not set!")
        # array to keep the read counts for the regions
        subnum_reads_per_bin = []

        start_time = time.time()

        bam_handlers = []
        for fname in self.bamFilesList:
            try:
                bam_handlers.append(bamHandler.openBam(fname))
            except:
                # Anything that cannot be opened as BAM is tried as bigWig.
                # NOTE(review): the bare except is kept on purpose; openBam
                # may report failure through SystemExit - confirm before
                # narrowing it.
                bam_handlers.append(pyBigWig.open(fname))

        blackList = None
        if self.blackListFileName is not None:
            blackList = GTF(self.blackListFileName)

        # A list of lists of tuples
        transcriptsToConsider = []
        if bed_regions_list is not None:
            transcriptsToConsider = [x[1] for x in bed_regions_list]
        else:
            if self.stepSize == self.binLength:
                # Consecutive bins: hand over the whole interval as a single
                # (start, end, binLength) entry and let the coverage routine
                # tile it.
                transcriptsToConsider.append([(start, end, self.binLength)])
            else:
                for i in range(start, end, self.stepSize):
                    if i + self.binLength > end:
                        break
                    if blackList is not None and blackList.findOverlaps(chrom, i, i + self.binLength):
                        continue
                    transcriptsToConsider.append([(i, i + self.binLength)])

        # Only the name is reserved here. The file itself is opened further
        # down, once the counts exist, so that no open handle is left behind
        # if counting fails.
        if self.save_data:
            _file_name = deeptools.utilities.getTempFileName(suffix='.bed')
        else:
            _file_name = ''

        for bam in bam_handlers:
            for trans in transcriptsToConsider:
                tcov = self.get_coverage_of_region(bam, chrom, trans)
                if bed_regions_list is not None:
                    # one value per bed region
                    subnum_reads_per_bin.append(np.sum(tcov))
                else:
                    subnum_reads_per_bin.extend(tcov)

        # Values were collected file after file; the column-major reshape
        # turns them into one column per file.
        subnum_reads_per_bin = np.concatenate([subnum_reads_per_bin]).reshape(-1, len(self.bamFilesList), order='F')

        if self.save_data:
            # The with-block guarantees the file is closed even if writing
            # raises.
            with open(_file_name, 'w+t') as _file:
                idx = 0
                for i, trans in enumerate(transcriptsToConsider):
                    if len(trans[0]) != 3:
                        starts = ",".join([str(x[0]) for x in trans])
                        ends = ",".join([str(x[1]) for x in trans])
                        _file.write("\t".join([chrom, starts, ends]) + "\t")
                        _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[i, :]]) + "\n")
                    else:
                        for exon in trans:
                            for startPos in range(exon[0], exon[1], exon[2]):
                                if idx >= subnum_reads_per_bin.shape[0]:
                                    # At the end of chromosomes (or due to blacklisted regions), there are bins smaller than the bin size
                                    # Counts there are added to the bin before them, but range() will still try to include them.
                                    break
                                _file.write("{0}\t{1}\t{2}\t".format(chrom, startPos, startPos + exon[2]))
                                _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[idx, :]]) + "\n")
                                idx += 1

        if self.verbose:
            endTime = time.time()
            rows = subnum_reads_per_bin.shape[0]
            print("%s countReadsInRegions_worker: processing %d "
                  "(%.1f per sec) @ %s:%s-%s" %
                  (multiprocessing.current_process().name,
                   rows, rows / (endTime - start_time), chrom, start, end))

        return subnum_reads_per_bin, _file_name
Exemple #17
0
    def count_reads_in_region_with_intron(self,
                                          chrom,
                                          start,
                                          end,
                                          bed_regions_list=None):
        """
        Rewrite deeptools.CountReadsPerBin.count_reads_in_region

        Same procedure as the method it rewrites, except that coverage is
        obtained from ``self.get_coverage_of_region_with_intron``.

        Args:
            chrom (str): Chrom
            start (int): Start position
            end (int): End position
            bed_regions_list (list): List of bed region. For each element
                ``x``, ``x[1]`` is used as the list of (start, end) tuples
                of one region. If None, bins are generated from ``start``,
                ``end``, ``self.stepSize`` and ``self.binLength``.

        Returns:
            tuple: subnum_reads_per_bin, file_name. The first item is a
            numpy array with one row per bin (or bed region) and one column
            per input file; the second is the path of the file the counts
            were written to when ``self.save_data`` is set, otherwise ''.

        Raises:
            NameError: If ``start`` is larger than ``end``.
            ValueError: If ``self.stepSize`` is None.
        """

        if start > end:
            raise NameError("start %d bigger that end %d" % (start, end))

        if self.stepSize is None:
            raise ValueError("stepSize is not set!")
        # array to keep the read counts for the regions
        subnum_reads_per_bin = []

        start_time = time.time()

        bam_handlers = []
        for fname in self.bamFilesList:
            try:
                bam_handlers.append(bamHandler.openBam(fname))
            except:
                # Anything that cannot be opened as BAM is tried as bigWig.
                # NOTE(review): the bare except is kept on purpose; openBam
                # may report failure through SystemExit - confirm before
                # narrowing it.
                bam_handlers.append(pyBigWig.open(fname))

        blackList = None
        if self.blackListFileName is not None:
            blackList = GTF(self.blackListFileName)

        # A list of lists of tuples
        transcriptsToConsider = []
        if bed_regions_list is not None:
            transcriptsToConsider = [x[1] for x in bed_regions_list]
        else:
            if self.stepSize == self.binLength:
                # Consecutive bins: pass the whole interval as a single
                # (start, end, binLength) entry to be tiled downstream.
                transcriptsToConsider.append([(start, end, self.binLength)])
            else:
                for i in range(start, end, self.stepSize):
                    if i + self.binLength > end:
                        break
                    if blackList is not None and blackList.findOverlaps(
                            chrom, i, i + self.binLength):
                        continue
                    transcriptsToConsider.append([(i, i + self.binLength)])

        # Only the name is reserved here. The file itself is opened further
        # down, once the counts exist, so that no open handle is left behind
        # if counting fails.
        if self.save_data:
            _file_name = deeptools.utilities.getTempFileName(suffix='.bed')
        else:
            _file_name = ''

        for bam in bam_handlers:
            for trans in transcriptsToConsider:
                tcov = self.get_coverage_of_region_with_intron(
                    bam, chrom, trans)
                if bed_regions_list is not None:
                    # one value per bed region
                    subnum_reads_per_bin.append(np.sum(tcov))
                else:
                    subnum_reads_per_bin.extend(tcov)

        # Values were collected file after file; the column-major reshape
        # turns them into one column per file.
        subnum_reads_per_bin = np.concatenate([subnum_reads_per_bin]).reshape(
            -1, len(self.bamFilesList), order='F')

        if self.save_data:
            # The with-block guarantees the file is closed even if writing
            # raises.
            with open(_file_name, 'w+t') as _file:
                idx = 0
                for i, trans in enumerate(transcriptsToConsider):
                    if len(trans[0]) != 3:
                        starts = ",".join([str(x[0]) for x in trans])
                        ends = ",".join([str(x[1]) for x in trans])
                        _file.write("\t".join([chrom, starts, ends]) + "\t")
                        _file.write("\t".join(
                            ["{}".format(x)
                             for x in subnum_reads_per_bin[i, :]]) + "\n")
                    else:
                        for exon in trans:
                            for startPos in range(exon[0], exon[1], exon[2]):
                                if idx >= subnum_reads_per_bin.shape[0]:
                                    # At the end of chromosomes (or due to blacklisted regions), there are bins smaller than the bin size
                                    # Counts there are added to the bin before them, but range() will still try to include them.
                                    break
                                _file.write("{0}\t{1}\t{2}\t".format(
                                    chrom, startPos, startPos + exon[2]))
                                _file.write("\t".join([
                                    "{}".format(x)
                                    for x in subnum_reads_per_bin[idx, :]
                                ]) + "\n")
                                idx += 1

        if self.verbose:
            endTime = time.time()
            rows = subnum_reads_per_bin.shape[0]
            print("%s countReadsInRegions_worker: processing %d "
                  "(%.1f per sec) @ %s:%s-%s" %
                  (multiprocessing.current_process().name, rows, rows /
                   (endTime - start_time), chrom, start, end))

        return subnum_reads_per_bin, _file_name
Exemple #18
0
    def get_coverage_of_region_with_intron(self,
                                           bamHandle,
                                           chrom,
                                           regions,
                                           fragmentFromRead_func=None):
        """
        Rewrite deeptools.CountReadsPerBin.get_coverage_of_region

        Every read that passes the filters adds 1 to each bin between its
        ``reference_start`` and ``reference_end``; the individual blocks of
        the read are not looked at, so gaps inside the alignment are
        counted as covered.

        Args:
            bamHandle (AlignmentFile): Bam object
            chrom (str): Chrom
            regions (list): List of block. Each block is either
                (start, end), treated as one bin, or (start, end, tileSize),
                split into (end - start) // tileSize bins.
            fragmentFromRead_func (function): Function to get fragment from
                read. Defaults to ``self.get_fragment_from_read``. Only used
                to skip reads for which it raises TypeError.

        Returns:
            numpy.ndarray: coverages, one float64 value per bin. Bins with a
            value of 0 are set to NaN when ``self.zerosToNans`` is set.

        Raises:
            NameError: If ``chrom`` is not in ``bamHandle.references``.
        """
        if not fragmentFromRead_func:
            fragmentFromRead_func = self.get_fragment_from_read
        nbins = len(regions)
        if len(regions[0]) == 3:
            # (start, end, tileSize) entries expand into several bins each
            nbins = 0
            for reg in regions:
                nbins += (reg[1] - reg[0]) // reg[2]
        coverages = np.zeros(nbins, dtype='float64')

        # Margin added around a region when fetching reads
        if self.defaultFragmentLength == 'read length':
            extension = 0
        else:
            extension = self.maxPairedFragmentLength

        blackList = None
        if self.blackListFileName is not None:
            blackList = GTF(self.blackListFileName)

        # Index in `coverages` of the first bin of the current region
        vector_start = 0
        for reg in regions:
            if len(reg) == 3:
                tileSize = int(reg[2])
                nRegBins = (reg[1] - reg[0]) // tileSize
            else:
                nRegBins = 1
                tileSize = int(reg[1] - reg[0])

            # Blacklisted regions have a coverage of 0
            if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
                continue
            regStart = int(max(0, reg[0] - extension))
            regEnd = reg[1] + int(extension)

            # If alignments are extended and there's a blacklist, ensure that no
            # reads originating in a blacklist are fetched
            if blackList and reg[0] > 0 and extension > 0:
                o = blackList.findOverlaps(chrom, regStart, reg[0])
                if o is not None and len(o) > 0:
                    regStart = o[-1][1]
                o = blackList.findOverlaps(chrom, reg[1], regEnd)
                if o is not None and len(o) > 0:
                    regEnd = o[0][0]

            start_time = time.time()
            # caching seems faster.
            c = 0
            if chrom in bamHandle.references:
                # keep mapped reads only (flag bit 4 marks unmapped reads)
                reads = [
                    r for r in bamHandle.fetch(chrom, regStart, regEnd)
                    if r.flag & 4 == 0
                ]
            else:
                raise NameError(
                    "chromosome {} not found in bam file".format(chrom))

            prev_start_pos = None  # to store the start positions
            # of previous processed read pair
            for read in reads:
                if self.minMappingQuality and read.mapq < self.minMappingQuality:
                    continue

                # filter reads based on SAM flag
                if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
                    continue
                if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
                    continue

                # Fragment lengths
                tLen = deeptools.utilities.getTLen(read)
                if self.minFragmentLength > 0 and tLen < self.minFragmentLength:
                    continue
                if self.maxFragmentLength > 0 and tLen > self.maxFragmentLength:
                    continue

                # get rid of duplicate reads that have same position on each of the
                # pairs
                if self.ignoreDuplicates and prev_start_pos \
                        and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
                    continue

                # The blocks returned by the helper are not used below; the
                # call is kept so that reads for which it raises TypeError
                # are still skipped.
                try:
                    fragmentFromRead_func(read)
                except TypeError:
                    # the get_fragment_from_read functions returns None in some cases.
                    # Those cases are to be skipped, hence the continue line.
                    continue

                # Rewrite !!!
                sIdx = vector_start + max(
                    (read.reference_start - reg[0]) // tileSize, 0)
                eIdx = vector_start + min(
                    np.ceil(float(read.reference_end - reg[0]) /
                            tileSize).astype('int'), nRegBins)
                # Reads are fetched from the extended window, so a read may
                # end before reg[0]. eIdx then drops below sIdx and can even
                # become negative, in which case the slice would wrap around
                # and increment unrelated bins. Only count real overlaps.
                if eIdx > sIdx:
                    coverages[sIdx:eIdx] += 1

                prev_start_pos = (read.reference_start, read.pnext,
                                  read.is_reverse)
                c += 1

            if self.verbose:
                endTime = time.time()
                print("%s,  processing %s (%.1f per sec) reads @ %s:%s-%s" %
                      (multiprocessing.current_process().name, c, c /
                       (endTime - start_time), chrom, reg[0], reg[1]))

            vector_start += nRegBins

        # change zeros to NAN
        if self.zerosToNans:
            coverages[coverages == 0] = np.nan

        return coverages
Exemple #19
0
    def get_coverage_of_region(self,
                               bamHandle,
                               chrom,
                               regions,
                               fragmentFromRead_func=None):
        """
        Returns a numpy array that corresponds to the number of reads
        that overlap with each tile.

        >>> test = Tester()
        >>> import pysam
        >>> c = SumCoveragePerBin([], stepSize=1, extendReads=300)

        For this case the reads are length 36. The number of overlapping
        read fragments is 4 and 5 for the positions tested. Note that reads are
        NOT extended, due to there being a 0 length input list of BAM files!

        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
        ... [(5000833, 5000834), (5000834, 5000835)])
        array([ 4.,  5.])

        In the following  case the reads length is 50. Reads are not extended.

        >>> c.extendReads=False
        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)])
        array([ 2.,  4.,  4.])


        """
        if not fragmentFromRead_func:
            fragmentFromRead_func = self.get_fragment_from_read
        nbins = len(regions)
        if len(regions[0]) == 3:
            nbins = 0
            for reg in regions:
                nbins += (reg[1] - reg[0]) // reg[2]
        coverages = np.zeros(nbins, dtype='float64')

        if self.defaultFragmentLength == 'read length':
            extension = 0
        else:
            extension = self.maxPairedFragmentLength

        blackList = None
        if self.blackListFileName is not None:
            blackList = GTF(self.blackListFileName)

        vector_start = 0
        for idx, reg in enumerate(regions):
            if len(reg) == 3:
                tileSize = int(reg[2])
                nRegBins = (reg[1] - reg[0]) // tileSize
            else:
                nRegBins = 1
                tileSize = int(reg[1] - reg[0])

            # Blacklisted regions have a coverage of 0
            if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
                continue
            regStart = int(max(0, reg[0] - extension))
            regEnd = reg[1] + int(extension)

            # If alignments are extended and there's a blacklist, ensure that no
            # reads originating in a blacklist are fetched
            if blackList and reg[0] > 0 and extension > 0:
                o = blackList.findOverlaps(chrom, regStart, reg[0])
                if o is not None and len(o) > 0:
                    regStart = o[-1][1]
                o = blackList.findOverlaps(chrom, reg[1], regEnd)
                if o is not None and len(o) > 0:
                    regEnd = o[0][0]

            start_time = time.time()
            # caching seems faster. TODO: profile the function
            c = 0
            try:
                # BAM input
                if chrom in bamHandle.references:
                    reads = [
                        r for r in bamHandle.fetch(chrom, regStart, regEnd)
                        if r.flag & 4 == 0
                    ]
                else:
                    raise NameError(
                        "chromosome {} not found in bam file".format(chrom))
            except:
                # bigWig input, as used by plotFingerprint
                if bamHandle.chroms(chrom):
                    _ = np.array(bamHandle.stats(chrom,
                                                 regStart,
                                                 regEnd,
                                                 type="mean",
                                                 nBins=nRegBins),
                                 dtype=np.float)
                    _[np.isnan(_)] = 0.0
                    _ = _ * tileSize
                    coverages += _
                    continue
                else:
                    raise NameError(
                        "chromosome {} not found in bigWig file with chroms {}"
                        .format(chrom, bamHandle.chroms()))

            prev_start_pos = None  # to store the start positions
            # of previous processed read pair
            for read in reads:
                if self.minMappingQuality and read.mapq < self.minMappingQuality:
                    continue

                # filter reads based on SAM flag
                if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
                    continue
                if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
                    continue

                # Fragment lengths
                if self.minFragmentLength > 0 and abs(
                        read.template_length) < self.minFragmentLength:
                    continue
                if self.maxFragmentLength > 0 and abs(
                        read.template_length) > self.maxFragmentLength:
                    continue

                # get rid of duplicate reads that have same position on each of the
                # pairs
                if self.ignoreDuplicates and prev_start_pos \
                        and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
                    continue

                # since reads can be split (e.g. RNA-seq reads) each part of the
                # read that maps is called a position block.
                try:
                    position_blocks = fragmentFromRead_func(read)
                except TypeError:
                    # the get_fragment_from_read functions returns None in some cases.
                    # Those cases are to be skipped, hence the continue line.
                    continue

                last_eIdx = None
                for fragmentStart, fragmentEnd in position_blocks:
                    if fragmentEnd is None or fragmentStart is None:
                        continue
                    fragmentLength = fragmentEnd - fragmentStart
                    if fragmentLength == 0:
                        continue
                    # skip reads that are not in the region being
                    # evaluated.
                    if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
                        continue

                    if fragmentStart < reg[0]:
                        fragmentStart = reg[0]
                    if fragmentEnd > reg[0] + len(coverages) * tileSize:
                        fragmentEnd = reg[0] + len(coverages) * tileSize

                    sIdx = vector_start + max(
                        (fragmentStart - reg[0]) // tileSize, 0)
                    eIdx = vector_start + min(
                        np.ceil(float(fragmentEnd - reg[0]) /
                                tileSize).astype('int'), nRegBins)
                    if eIdx >= len(coverages):
                        eIdx = len(coverages) - 1
                    if last_eIdx is not None:
                        sIdx = max(last_eIdx, sIdx)
                        if sIdx >= eIdx:
                            continue

                    # First bin
                    if fragmentEnd < reg[0] + (sIdx + 1) * tileSize:
                        _ = fragmentEnd - fragmentStart
                    else:
                        _ = reg[0] + (sIdx + 1) * tileSize - fragmentStart
                    if _ > tileSize:
                        _ = tileSize
                    coverages[sIdx] += _
                    _ = sIdx + 1
                    while _ < eIdx:
                        coverages[_] += tileSize
                        _ += 1
                    while eIdx - sIdx >= nRegBins:
                        eIdx -= 1
                    if eIdx > sIdx:
                        _ = fragmentEnd - (reg[0] + eIdx * tileSize)
                        if _ > tileSize:
                            _ = tileSize
                        elif _ < 0:
                            _ = 0
                        coverages[eIdx] += _
                    last_eIdx = eIdx

                prev_start_pos = (read.reference_start, read.pnext,
                                  read.is_reverse)
                c += 1

            if self.verbose:
                endTime = time.time()
                print("%s,  processing %s (%.1f per sec) reads @ %s:%s-%s" %
                      (multiprocessing.current_process().name, c, c /
                       (endTime - start_time), chrom, reg[0], reg[1]))

            vector_start += nRegBins

        # change zeros to NAN
        if self.zerosToNans:
            coverages[coverages == 0] = np.nan

        return coverages
Exemple #20
0
def mapReduce(staticArgs, func, chromSize,
              genomeChunkLength=None,
              region=None,
              bedFile=None,
              blackListFileName=None,
              numberOfProcessors=4,
              verbose=False,
              includeLabels=False,
              keepExons=False,
              transcriptID="transcriptID",
              exonID="exonID",
              transcript_id_designator="transcript_id",
              self_=None):
    """
    Split the genome into parts that are sent to workers using a defined
    number of processors. Results are collected and returned.

    For each genomic region the given 'func' is called with a single tuple
    holding the following values:

     [self_,] chrom, start, end, *staticArgs[, bed_regions_list]

    The *staticArgs* are static, *picklable* variables that need to be sent
    to workers.

    The genome chunk length corresponds to a fraction of the genome, in bp,
    that is sent to each of the workers for processing.

    Depending on the type of process, larger or shorter regions may be
    preferred.

    :param staticArgs: tuple of arguments that are sent to the given 'func'
    :param func: function to call. It receives one task tuple as described
                 above.
    :param chromSize: A list of tuples containing the chromosome
                      name and its length
    :param genomeChunkLength: Chunk size in bp; 1e5 is used if not given.
    :param region: The format is chr:start:end:tileSize (see function
                   getUserRegion)
    :param bedFile: If a bed file is given, the args to the func to be
                    called are extended to include a list of bed
                    defined regions. Chunks without overlapping bed
                    regions produce no task.
    :param blackListFileName: A list of regions to exclude from all computations.
                              Note that this has genomeChunkLength resolution...
    :param numberOfProcessors: Size of the worker pool. A pool is only used
                               if this is > 1 and there is more than one task.
    :param verbose: Print partitioning information.
    :param includeLabels: Pass group and transcript labels into the calling
                          function. These are added to the static args
                          (groupLabel and transcriptName).
    :param keepExons: Passed through to GTF when reading ``bedFile``.
    :param transcriptID: Passed through to GTF when reading ``bedFile``.
    :param exonID: Passed through to GTF when reading ``bedFile``.
    :param transcript_id_designator: Passed through to GTF when reading
                                     ``bedFile``.
    :param self_: In case mapreduce should make a call to an object
                  the self variable has to be passed.

    :return: List with one result per task. When a worker pool is used the
             tasks are shuffled first, so the results are in random order;
             otherwise they follow genomic order. If "includeLabels" is
             true, a tuple of (results, labels) is returned, where labels is
             None without a bed file.
    """

    if not genomeChunkLength:
        genomeChunkLength = 1e5
    genomeChunkLength = int(genomeChunkLength)

    if verbose:
        print("genome partition size for multiprocessing: {0}".format(
            genomeChunkLength))

    region_start = 0
    region_end = None

    # if a region is set, that means that the task should only cover
    # the given genomic position

    if region:
        chromSize, region_start, region_end, genomeChunkLength = getUserRegion(chromSize, region)
        if verbose:
            print("chrom size: {0}, region start: {1}, region end: {2}, "
                  "genome chunk length sent to each procesor: {3}".format(chromSize, region_start, region_end, genomeChunkLength))

    if bedFile:
        defaultGroup = None
        if len(bedFile) == 1:
            defaultGroup = "genes"
        bed_interval_tree = GTF(bedFile, defaultGroup=defaultGroup, transcriptID=transcriptID, exonID=exonID, transcript_id_designator=transcript_id_designator, keepExons=keepExons)

    if blackListFileName:
        blackList = GTF(blackListFileName)

    # every task starts with the bound object, if one was handed in
    taskPrefix = [self_] if self_ is not None else []

    TASKS = []
    # iterate over all chromosomes
    for chrom, size in chromSize:
        # chunks start at zero unless a specific region is defined
        for startPos in range(region_start, size, genomeChunkLength):
            endPos = min(size, startPos + genomeChunkLength)

            # Remove blacklisted stretches from the chunk
            if blackListFileName:
                regions = blSubtract(blackList, chrom, [startPos, endPos])
            else:
                regions = [[startPos, endPos]]

            for reg in regions:
                argsList = list(taskPrefix)
                argsList.extend([chrom, reg[0], reg[1]])
                # add to argument list the static list received by the function
                argsList.extend(staticArgs)

                # if a bed file is given, append to the TASK list,
                # a list of bed regions that overlap with the
                # current genomeChunk.
                if bedFile:
                    # This effectively creates batches of intervals, which is
                    # generally more performant due to the added overhead of
                    # initializing additional workers.

                    # TODO, there's no point in including the chromosome
                    if includeLabels:
                        bed_regions_list = [[chrom, x[4], x[2], x[3], x[5], x[6]] for x in bed_interval_tree.findOverlaps(chrom, reg[0], reg[1], trimOverlap=True, numericGroups=True, includeStrand=True)]
                    else:
                        bed_regions_list = [[chrom, x[4], x[5], x[6]] for x in bed_interval_tree.findOverlaps(chrom, reg[0], reg[1], trimOverlap=True, includeStrand=True)]

                    if len(bed_regions_list) == 0:
                        continue
                    # add to argument list, the position of the bed regions to use
                    argsList.append(bed_regions_list)

                TASKS.append(tuple(argsList))

    if len(TASKS) > 1 and numberOfProcessors > 1:
        if verbose:
            print(("using {} processors for {} "
                   "number of tasks".format(numberOfProcessors,
                                            len(TASKS))))
        random.shuffle(TASKS)
        pool = multiprocessing.Pool(numberOfProcessors)
        try:
            # map_async().get(timeout) rather than map() keeps the parent
            # process interruptible while it waits for the workers
            res = pool.map_async(func, TASKS).get(9999999)
            pool.close()
        except BaseException:
            # includes KeyboardInterrupt: stop the workers right away
            pool.terminate()
            raise
        finally:
            # reap the worker processes so that they are not leaked
            pool.join()
    else:
        res = list(map(func, TASKS))

    if includeLabels:
        if bedFile:
            return res, bed_interval_tree.labels
        else:
            return res, None
    return res