Esempio n. 1
0
def getEnrichment_worker(arglist):
    """
    This is the worker function of plotEnrichment.

    In short, given a region, iterate over all reads **starting** in it.
    Filter/extend them as requested and check each for an overlap with
    findOverlaps. For each overlap, increment the counter for that feature.

    Parameters
    ----------
    arglist : tuple
        ``(chrom, start, end, args, defaultFragmentLength)``: the region to
        process, the parsed options object and the fragment length that is
        forwarded to ``getBAMBlocks``.

    Returns
    -------
    tuple
        ``(olist, gtf.features, total)``. ``olist`` holds one dict per BAM
        file mapping each feature to the number of reads overlapping it;
        ``total`` holds, per BAM file, the number of reads that passed every
        filter.
    """
    chrom, start, end, args, defaultFragmentLength = arglist
    if args.verbose:
        sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))

    gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels)
    olist = []
    total = [0] * len(args.bamfiles)
    for idx, f in enumerate(args.bamfiles):
        odict = {x: 0 for x in gtf.features}
        fh = openBam(f)
        try:
            # Resolve the chromosome name against *this* file only. The
            # requested name is left untouched so that every file is munged
            # from the same input rather than from the previous file's result.
            chromUse = mungeChromosome(chrom, fh.references)

            # NOTE(review): a falsy result is assumed to mean "chromosome not
            # present in this file" -- confirm against mungeChromosome. In
            # that case nothing is fetched and all counts stay at zero.
            reads = fh.fetch(chromUse, start, end) if chromUse else ()

            prev_start_pos = None  # signature of the last read that passed the filters
            for read in reads:
                # Filter
                if read.pos < start:
                    # Ensure that a given alignment is processed only once
                    continue
                if read.flag & 4:
                    # Unmapped
                    continue
                if args.minMappingQuality and read.mapq < args.minMappingQuality:
                    continue
                if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                    continue
                if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                    continue
                tLen = getTLen(read)
                if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                    continue
                if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                    continue
                # A read counts as a duplicate only when it matches the
                # immediately preceding accepted read in start, mate start
                # and strand.
                signature = (read.reference_start, read.pnext, read.is_reverse)
                if args.ignoreDuplicates and prev_start_pos \
                        and prev_start_pos == signature:
                    continue
                prev_start_pos = signature
                total[idx] += 1

                # Get blocks, possibly extending
                features = gtf.findOverlaps(chromUse, getBAMBlocks(read, defaultFragmentLength, args.centerReads, args.Offset))

                if features is not None and len(features) > 0:
                    for x in features:
                        odict[x] += 1
        finally:
            # Release the file handle even if a read triggers an exception
            fh.close()
        olist.append(odict)
    return olist, gtf.features, total
Esempio n. 2
0
def getEnrichment_worker(arglist):
    """
    This is the worker function of plotEnrichment.

    In short, given a region, iterate over all reads **starting** in it.
    Filter/extend them as requested and check each for an overlap with
    findOverlaps. For each overlap, increment the counter for that feature.

    Parameters
    ----------
    arglist : tuple
        ``(chrom, start, end, args, defaultFragmentLength)``: the region to
        process, the parsed options object and the fragment length that is
        forwarded to ``getBAMBlocks``.

    Returns
    -------
    tuple
        ``(olist, gtf.features)``. ``olist`` holds one dict per BAM file
        mapping each feature to the number of reads overlapping it.
    """
    chrom, start, end, args, defaultFragmentLength = arglist
    if args.verbose:
        sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))

    gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels)
    olist = []
    for f in args.bamfiles:
        odict = {x: 0 for x in gtf.features}
        fh = openBam(f)
        try:
            # Resolve the chromosome name against *this* file only. The
            # requested name is left untouched so that every file is munged
            # from the same input rather than from the previous file's result.
            chromUse = mungeChromosome(chrom, fh.references)

            # NOTE(review): a falsy result is assumed to mean "chromosome not
            # present in this file" -- confirm against mungeChromosome. In
            # that case nothing is fetched and all counts stay at zero.
            reads = fh.fetch(chromUse, start, end) if chromUse else ()

            prev_start_pos = None  # signature of the last read that passed the filters
            for read in reads:
                # Filter
                if read.pos < start:
                    # Ensure that a given alignment is processed only once
                    continue
                if read.flag & 4:
                    # Unmapped
                    continue
                if args.minMappingQuality and read.mapq < args.minMappingQuality:
                    continue
                if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                    continue
                if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                    continue
                if args.minFragmentLength > 0 and abs(read.template_length) < args.minFragmentLength:
                    continue
                if args.maxFragmentLength > 0 and abs(read.template_length) > args.maxFragmentLength:
                    continue
                # A read counts as a duplicate only when it matches the
                # immediately preceding accepted read in start, mate start
                # and strand.
                signature = (read.reference_start, read.pnext, read.is_reverse)
                if args.ignoreDuplicates and prev_start_pos \
                        and prev_start_pos == signature:
                    continue
                prev_start_pos = signature

                # Get blocks, possibly extending
                features = gtf.findOverlaps(chromUse, getBAMBlocks(read, defaultFragmentLength, args.centerReads))

                if features is not None and len(features) > 0:
                    for x in features:
                        odict[x] += 1
        finally:
            # Release the file handle even if a read triggers an exception
            fh.close()
        olist.append(odict)
    return olist, gtf.features
Esempio n. 3
0
    def preload(self, regions, tmpDir=None):
        """
        Given a sample and a set of regions, write a bigWig file containing the underlying signal.

        This function returns the file name, which needs to be deleted by the calling function at some point.

        This sends queries one chromosome at a time, due to memory limits on deepBlue

        Parameters
        ----------
        regions
            Regions of interest; passed through ``mergeRegions`` before use.
        tmpDir : str, optional
            Directory in which the temporary bigWig file is created.

        Returns
        -------
        str
            Path of the bigWig file that was written.

        Raises
        ------
        RuntimeError
            If any server call returns a status other than "okay", or if the
            server hands back an empty query/intersection ID.
        """
        startTime = datetime.datetime.now()
        regions2 = mergeRegions(regions)

        # Make a temporary file; only its name is needed, pyBigWig reopens it
        f = tempfile.NamedTemporaryFile(delete=False, dir=tmpDir)
        fname = f.name
        f.close()

        # Start with the bigWig file
        bw = pyBigWig.open(fname, "w")
        bw.addHeader(self.chromsTuple, maxZooms=0)  # This won't work in IGV!

        # Make a string out of everything in a reasonable order
        for k, _ in self.chromsTuple:
            # Munge chromosome names as appropriate
            chrom = mungeChromosome(k, regions2.keys())
            if not chrom:
                continue
            if chrom not in regions2:
                continue
            # The regions are sent under the server-side name (k), but looked
            # up locally under the munged name (chrom)
            regionsStr = "\n".join([
                "{}\t{}\t{}".format(k, reg[0], reg[1])
                for reg in regions2[chrom]
            ])
            regionsStr += "\n"

            # Send the regions
            (status,
             regionsID) = self.server.input_regions(self.genome, regionsStr,
                                                    self.userKey)
            if status != "okay":
                # On failure the second return value carries the error text,
                # so the sample name goes first, as in the other messages
                raise RuntimeError(
                    "Received the following error while sending regions for '{}': {}"
                    .format(self.sample, regionsID))

            # Get the experiment information
            (status,
             queryID) = self.server.select_experiments(self.sample, k, None,
                                                       None, self.userKey)
            if status != "okay":
                raise RuntimeError(
                    "Received the following error while running select_experiments on file '{}': {}"
                    .format(self.sample, queryID))
            if not queryID:
                raise RuntimeError(
                    "Somehow, we received None as a query ID (file '{}')".
                    format(self.sample))

            # Intersect
            (status,
             intersectID) = self.server.intersection(queryID, regionsID,
                                                     self.userKey)
            if status != "okay":
                raise RuntimeError(
                    "Received the following error while running intersection on file '{}': {}"
                    .format(self.sample, intersectID))
            if not intersectID:
                raise RuntimeError(
                    "Somehow, we received None as an intersect ID (file '{}')".
                    format(self.sample))

            # Query the regions
            (status, reqID) = self.server.get_regions(intersectID,
                                                      "START,END,VALUE",
                                                      self.userKey)
            if status != "okay":
                raise RuntimeError(
                    "Received the following error while fetching regions in file '{}': {}"
                    .format(self.sample, reqID))

            # Wait for the server to process the data
            # NOTE(review): a "failed" state only ends the polling; it is
            # presumably reported by get_request_data below -- confirm.
            (status, info) = self.server.info(reqID, self.userKey)
            request_status = info[0]["state"]
            while request_status != "done" and request_status != "failed":
                time.sleep(0.1)
                (status, info) = self.server.info(reqID, self.userKey)
                request_status = info[0]["state"]

            # Get the actual data
            (status, resp) = self.server.get_request_data(reqID, self.userKey)
            if status != "okay":
                raise RuntimeError(
                    "Received the following error while fetching data in file '{}': {}"
                    .format(self.sample, resp))

            # One tab-separated "start, end, value" record per line
            for intervals in resp.split("\n"):
                interval = intervals.split("\t")
                if interval[0] == '':
                    # Blank line (e.g. trailing newline)
                    continue
                bw.addEntries([k], [int(interval[0])],
                              ends=[int(interval[1])],
                              values=[float(interval[2])])
        bw.close()
        sys.stderr.write("{} done (took {})\n".format(
            self.sample,
            datetime.datetime.now() - startTime))
        sys.stderr.flush()

        return fname
Esempio n. 4
0
    def preload(self, regions, tmpDir=None):
        """
        Given a sample and a set of regions, write a bigWig file containing the underlying signal.

        This function returns the file name, which needs to be deleted by the calling function at some point.

        This sends queries one chromosome at a time, due to memory limits on deepBlue

        Parameters
        ----------
        regions
            Regions of interest; passed through ``mergeRegions`` before use.
        tmpDir : str, optional
            Directory in which the temporary bigWig file is created.

        Returns
        -------
        str
            Path of the bigWig file that was written.

        Raises
        ------
        RuntimeError
            If any server call returns a status other than "okay", or if the
            server hands back an empty query/intersection ID.
        """
        startTime = datetime.datetime.now()
        regions2 = mergeRegions(regions)

        # Make a temporary file; only its name is needed, pyBigWig reopens it
        f = tempfile.NamedTemporaryFile(delete=False, dir=tmpDir)
        fname = f.name
        f.close()

        # Start with the bigWig file
        bw = pyBigWig.open(fname, "w")
        bw.addHeader(self.chromsTuple, maxZooms=0)  # This won't work in IGV!

        # Make a string out of everything in a reasonable order
        for k, _ in self.chromsTuple:
            # Munge chromosome names as appropriate
            chrom = mungeChromosome(k, regions2.keys())
            if not chrom:
                continue
            if chrom not in regions2:
                continue
            # The regions are sent under the server-side name (k), but looked
            # up locally under the munged name (chrom)
            regionsStr = "\n".join(["{}\t{}\t{}".format(k, reg[0], reg[1]) for reg in regions2[chrom]])
            regionsStr += "\n"

            # Send the regions
            (status, regionsID) = self.server.input_regions(self.genome, regionsStr, self.userKey)
            if status != "okay":
                # On failure the second return value carries the error text,
                # so the sample name goes first, as in the other messages
                raise RuntimeError("Received the following error while sending regions for '{}': {}".format(self.sample, regionsID))

            # Get the experiment information
            (status, queryID) = self.server.select_experiments(self.sample, k, None, None, self.userKey)
            if status != "okay":
                raise RuntimeError("Received the following error while running select_experiments on file '{}': {}".format(self.sample, queryID))
            if not queryID:
                raise RuntimeError("Somehow, we received None as a query ID (file '{}')".format(self.sample))

            # Intersect
            (status, intersectID) = self.server.intersection(queryID, regionsID, self.userKey)
            if status != "okay":
                raise RuntimeError("Received the following error while running intersection on file '{}': {}".format(self.sample, intersectID))
            if not intersectID:
                raise RuntimeError("Somehow, we received None as an intersect ID (file '{}')".format(self.sample))

            # Query the regions
            (status, reqID) = self.server.get_regions(intersectID, "START,END,VALUE", self.userKey)
            if status != "okay":
                raise RuntimeError("Received the following error while fetching regions in file '{}': {}".format(self.sample, reqID))

            # Wait for the server to process the data
            # NOTE(review): a "failed" state only ends the polling; it is
            # presumably reported by get_request_data below -- confirm.
            (status, info) = self.server.info(reqID, self.userKey)
            request_status = info[0]["state"]
            while request_status != "done" and request_status != "failed":
                time.sleep(0.1)
                (status, info) = self.server.info(reqID, self.userKey)
                request_status = info[0]["state"]

            # Get the actual data
            (status, resp) = self.server.get_request_data(reqID, self.userKey)
            if status != "okay":
                raise RuntimeError("Received the following error while fetching data in file '{}': {}".format(self.sample, resp))

            # One tab-separated "start, end, value" record per line
            for intervals in resp.split("\n"):
                interval = intervals.split("\t")
                if interval[0] == '':
                    # Blank line (e.g. trailing newline)
                    continue
                bw.addEntries([k], [int(interval[0])], ends=[int(interval[1])], values=[float(interval[2])])
        bw.close()
        sys.stderr.write("{} done (took {})\n".format(self.sample, datetime.datetime.now() - startTime))
        sys.stderr.flush()

        return fname
Esempio n. 5
0
def getFiltered_worker(arglist):
    """
    Tally, for each BAM file, how many reads starting in one region would be
    removed by each of the configured filters.

    ``arglist`` is ``(chrom, start, end, args)``. The return value is a list
    with one tuple per BAM file:
    ``(total, nFiltered, minMapq, samFlagInclude, samFlagExclude,
    internalDupes, externalDupes, singletons, filterRNAstrand)``.
    A read failing several filters is counted once in ``nFiltered`` but once
    in every per-filter counter it trips.
    """
    chrom, start, end, args = arglist

    # Fix the bounds
    span = end - start
    if span > args.binSize and span > args.distanceBetweenBins:
        end -= args.distanceBetweenBins
    if end <= start:
        end = start + 1

    results = []
    for bamPath in args.bamfiles:
        bam = bamHandler.openBam(bamPath)
        contig = utilities.mungeChromosome(chrom, bam.references)

        # State for duplicate detection among reads sharing a start position
        seenFragments = set()
        lastStart = None

        nMapq = nInclude = nExclude = 0
        nInternalDup = nExternalDup = 0
        nSingleton = nStrand = 0
        nRejected = 0
        nSeen = 0  # This is only used to estimate the percentage affected

        for read in bam.fetch(contig, start, end):
            # Reads starting before the window were handled elsewhere (never
            # double count); unmapped reads were counted already.
            if read.pos < start or read.flag & 4:
                continue

            rejected = False

            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                rejected = True
                nMapq += 1

            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                rejected = True
                nInclude += 1

            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                rejected = True
                nExclude += 1

            if args.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment
                # bounds, otherwise the start positions
                tlen = read.tlen
                if tlen >= 0:
                    fragStart = read.pos
                    fragEnd = fragStart + tlen
                else:
                    fragStart = read.pnext
                    fragEnd = fragStart - tlen
                if read.reference_id != read.next_reference_id:
                    fragEnd = read.pnext

                key = (fragStart, fragEnd, read.next_reference_id, read.is_reverse)
                readStart = read.reference_start
                sameStart = lastStart is not None and lastStart == readStart
                if sameStart and key in seenFragments:
                    rejected = True
                    nInternalDup += 1
                if lastStart != readStart:
                    seenFragments.clear()
                lastStart = readStart
                seenFragments.add(key)

            if read.is_duplicate:
                rejected = True
                nExternalDup += 1

            if read.is_paired and read.mate_is_unmapped:
                rejected = True
                nSingleton += 1

            # filterRNAstrand
            strand = args.filterRNAstrand
            if strand:
                flag = read.flag
                wanted = True  # any other setting filters nothing
                if read.is_paired:
                    if strand == 'forward':
                        wanted = flag & 144 == 128 or flag & 96 == 64
                    elif strand == 'reverse':
                        wanted = flag & 144 == 144 or flag & 96 == 96
                else:
                    if strand == 'forward':
                        wanted = flag & 16 == 16
                    elif strand == 'reverse':
                        wanted = flag & 16 == 0
                if not wanted:
                    rejected = True
                    nStrand += 1

            nSeen += 1
            if rejected:
                nRejected += 1
        bam.close()

        # Append a tuple to the output
        results.append((nSeen, nRejected, nMapq, nInclude, nExclude,
                        nInternalDup, nExternalDup, nSingleton, nStrand))
    return results
Esempio n. 6
0
def getEnrichment_worker(arglist):
    """
    This is the worker function of plotEnrichment.

    In short, given a region, iterate over all reads **starting** in it.
    Filter/extend them as requested and check each for an overlap with
    findOverlaps. For each overlap, increment the counter for that feature.

    ``gtf`` is not created here; it is looked up from the enclosing scope
    (presumably a module-level annotation object set up before the workers
    are dispatched -- verify against the caller).

    Parameters
    ----------
    arglist : tuple
        ``(chrom, start, end, args, defaultFragmentLength)``: the region to
        process, the parsed options object and the fragment length that is
        forwarded to ``getBAMBlocks``.

    Returns
    -------
    tuple
        ``(olist, gtf.features, total)``. ``olist`` holds one dict per BAM
        file mapping each feature to the number of reads overlapping it;
        ``total`` holds, per BAM file, the number of reads that passed every
        filter.
    """
    chrom, start, end, args, defaultFragmentLength = arglist
    if args.verbose:
        sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))

    olist = []
    total = [0] * len(args.bamfiles)
    for idx, f in enumerate(args.bamfiles):
        odict = {x: 0 for x in gtf.features}
        fh = openBam(f)
        try:
            # Resolve the chromosome name against *this* file only. The
            # requested name is left untouched so that every file is munged
            # from the same input rather than from the previous file's result.
            chromUse = mungeChromosome(chrom, fh.references)

            # NOTE(review): a falsy result is assumed to mean "chromosome not
            # present in this file" -- confirm against mungeChromosome. In
            # that case nothing is fetched and all counts stay at zero.
            reads = fh.fetch(chromUse, start, end) if chromUse else ()

            # State for duplicate detection among reads sharing a start position
            lpos = None
            prev_pos = set()
            for read in reads:
                # Filter
                if read.pos < start:
                    # Ensure that a given alignment is processed only once
                    continue
                if read.flag & 4:
                    # Unmapped
                    continue
                if args.minMappingQuality and read.mapq < args.minMappingQuality:
                    continue
                if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                    continue
                if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                    continue
                tLen = getTLen(read)
                if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                    continue
                if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                    continue
                if args.ignoreDuplicates:
                    # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                    if tLen >= 0:
                        s = read.pos
                        e = s + tLen
                    else:
                        s = read.pnext
                        e = s - tLen
                    if read.reference_id != read.next_reference_id:
                        e = read.pnext
                    fragment = (s, e, read.next_reference_id, read.is_reverse)
                    if lpos is not None and lpos == read.reference_start \
                            and fragment in prev_pos:
                        continue
                    if lpos != read.reference_start:
                        # New start position: earlier fragments can no longer match
                        prev_pos.clear()
                    lpos = read.reference_start
                    prev_pos.add(fragment)
                total[idx] += 1

                # Get blocks, possibly extending
                features = gtf.findOverlaps(
                    chromUse,
                    getBAMBlocks(read, defaultFragmentLength, args.centerReads,
                                 args.Offset))

                if features is not None and len(features) > 0:
                    for x in features:
                        odict[x] += 1
        finally:
            # Release the file handle even if a read triggers an exception
            fh.close()
        olist.append(odict)
    return olist, gtf.features, total
def getFiltered_worker(arglist):
    """
    Tally, for each BAM file, how many reads starting in one region would be
    removed by each of the configured filters.

    ``arglist`` is ``(chrom, start, end, args)``. The return value is a list
    with one tuple per BAM file:
    ``(total, nFiltered, minMapq, samFlagInclude, samFlagExclude,
    internalDupes, externalDupes, singletons, filterRNAstrand)``.
    A read failing several filters is counted once in ``nFiltered`` but once
    in every per-filter counter it trips.
    """
    chrom, start, end, args = arglist

    # Fix the bounds
    span = end - start
    if span > args.binSize and span > args.distanceBetweenBins:
        end -= args.distanceBetweenBins
    if end <= start:
        end = start + 1

    results = []
    for bamPath in args.bamfiles:
        bam = bamHandler.openBam(bamPath)
        contig = utilities.mungeChromosome(chrom, bam.references)

        # State for duplicate detection among reads sharing a start position
        seenFragments = set()
        lastStart = None

        nMapq = nInclude = nExclude = 0
        nInternalDup = nExternalDup = 0
        nSingleton = nStrand = 0
        nRejected = 0
        nSeen = 0  # This is only used to estimate the percentage affected

        for read in bam.fetch(contig, start, end):
            # Reads starting before the window were handled elsewhere (never
            # double count); unmapped reads were counted already.
            if read.pos < start or read.flag & 4:
                continue

            rejected = False

            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                rejected = True
                nMapq += 1

            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                rejected = True
                nInclude += 1

            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                rejected = True
                nExclude += 1

            if args.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment
                # bounds, otherwise the start positions
                tlen = read.tlen
                if tlen >= 0:
                    fragStart = read.pos
                    fragEnd = fragStart + tlen
                else:
                    fragStart = read.pnext
                    fragEnd = fragStart - tlen
                if read.reference_id != read.next_reference_id:
                    fragEnd = read.pnext

                key = (fragStart, fragEnd, read.next_reference_id, read.is_reverse)
                readStart = read.reference_start
                sameStart = lastStart is not None and lastStart == readStart
                if sameStart and key in seenFragments:
                    rejected = True
                    nInternalDup += 1
                if lastStart != readStart:
                    seenFragments.clear()
                lastStart = readStart
                seenFragments.add(key)

            if read.is_duplicate:
                rejected = True
                nExternalDup += 1

            if read.is_paired and read.mate_is_unmapped:
                rejected = True
                nSingleton += 1

            # filterRNAstrand
            strand = args.filterRNAstrand
            if strand:
                flag = read.flag
                wanted = True  # any other setting filters nothing
                if read.is_paired:
                    if strand == 'forward':
                        wanted = flag & 144 == 128 or flag & 96 == 64
                    elif strand == 'reverse':
                        wanted = flag & 144 == 144 or flag & 96 == 96
                else:
                    if strand == 'forward':
                        wanted = flag & 16 == 16
                    elif strand == 'reverse':
                        wanted = flag & 16 == 0
                if not wanted:
                    rejected = True
                    nStrand += 1

            nSeen += 1
            if rejected:
                nRejected += 1
        bam.close()

        # Append a tuple to the output
        results.append((nSeen, nRejected, nMapq, nInclude, nExclude,
                        nInternalDup, nExternalDup, nSingleton, nStrand))
    return results
Esempio n. 8
0
def getEnrichment_worker(arglist):
    """
    This is the worker function of plotEnrichment.

    In short, given a region, iterate over all reads **starting** in it.
    Filter/extend them as requested and check each for an overlap with
    findOverlaps. For each overlap, increment the counter for that feature.

    ``gtf`` is not created here; it is looked up from the enclosing scope
    (presumably a module-level annotation object set up before the workers
    are dispatched -- verify against the caller).

    Parameters
    ----------
    arglist : tuple
        ``(chrom, start, end, args, defaultFragmentLength)``: the region to
        process, the parsed options object and the fragment length that is
        forwarded to ``getBAMBlocks``.

    Returns
    -------
    tuple
        ``(olist, gtf.features, total)``. ``olist`` holds one dict per BAM
        file mapping each feature to the number of reads overlapping it;
        ``total`` holds, per BAM file, the number of reads that passed every
        filter.
    """
    chrom, start, end, args, defaultFragmentLength = arglist
    if args.verbose:
        sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))

    olist = []
    total = [0] * len(args.bamfiles)
    for idx, f in enumerate(args.bamfiles):
        odict = {x: 0 for x in gtf.features}
        fh = openBam(f)
        try:
            # Resolve the chromosome name against *this* file only. The
            # requested name is left untouched so that every file is munged
            # from the same input rather than from the previous file's result.
            chromUse = mungeChromosome(chrom, fh.references)

            # NOTE(review): a falsy result is assumed to mean "chromosome not
            # present in this file" -- confirm against mungeChromosome. In
            # that case nothing is fetched and all counts stay at zero.
            reads = fh.fetch(chromUse, start, end) if chromUse else ()

            # State for duplicate detection among reads sharing a start position
            lpos = None
            prev_pos = set()
            for read in reads:
                # Filter
                if read.pos < start:
                    # Ensure that a given alignment is processed only once
                    continue
                if read.flag & 4:
                    # Unmapped
                    continue
                if args.minMappingQuality and read.mapq < args.minMappingQuality:
                    continue
                if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                    continue
                if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                    continue
                tLen = getTLen(read)
                if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                    continue
                if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                    continue
                if args.ignoreDuplicates:
                    # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                    if tLen >= 0:
                        s = read.pos
                        e = s + tLen
                    else:
                        s = read.pnext
                        e = s - tLen
                    if read.reference_id != read.next_reference_id:
                        e = read.pnext
                    fragment = (s, e, read.next_reference_id, read.is_reverse)
                    if lpos is not None and lpos == read.reference_start \
                            and fragment in prev_pos:
                        continue
                    if lpos != read.reference_start:
                        # New start position: earlier fragments can no longer match
                        prev_pos.clear()
                    lpos = read.reference_start
                    prev_pos.add(fragment)
                total[idx] += 1

                # Get blocks, possibly extending
                features = gtf.findOverlaps(chromUse, getBAMBlocks(read, defaultFragmentLength, args.centerReads, args.Offset))

                if features is not None and len(features) > 0:
                    for x in features:
                        odict[x] += 1
        finally:
            # Release the file handle even if a read triggers an exception
            fh.close()
        olist.append(odict)
    return olist, gtf.features, total