Exemple #1
0
def consensusForAlignments(refWindow, refSequence, alns, arrowConfig):
    """
    Call consensus on this interval---without subdividing the interval
    further.

    Clipping is expected to have been done by the caller already.

    A POA consensus built from the reads that span the window is used as
    the initial guess.  Every read in `alns` is then lifted into the
    coordinates of that POA consensus, added to a
    `cc.MultiMolecularIntegrator`, and the template is refined with
    `refineConsensus`.

    Parameters:
      refWindow   -- 3-tuple; the 2nd and 3rd items are used as the
                     reference start and end of the interval
      refSequence -- reference sequence for the window; it is aligned
                     against the POA consensus to lift read coordinates
      alns        -- alignments to use (must be iterable more than once)
      arrowConfig -- configuration object; this function reads
                     minPoaCoverage, maxPoaCoverage, computeConfidence,
                     noEvidenceConsensus and calls extractMappedRead,
                     which here yields (mappedRead, snr) pairs

    Returns an ArrowConsensus.  A "no call" consensus (built by
    ArrowConsensus.noCallConsensus) is returned when the POA cannot be
    generated, when refinement does not converge, or when refinement or
    confidence computation raises.

    Raises AssertionError if fewer than arrowConfig.minPoaCoverage reads
    span the window.
    """
    _, refStart, refEnd = refWindow

    # Compute the POA consensus, which is our initial guess, and
    # should typically be > 99.5% accurate
    fwdSequences = [a.read(orientation="genomic", aligned=False)
                    for a in alns
                    if a.spansReferenceRange(refStart, refEnd)]
    assert len(fwdSequences) >= arrowConfig.minPoaCoverage

    try:
        p = cc.PoaConsensus.FindConsensus(fwdSequences[:arrowConfig.maxPoaCoverage])
    except Exception:
        # Best effort: any POA failure degrades to a no-call, but
        # KeyboardInterrupt/SystemExit are allowed to propagate.
        logging.info("%s: POA could not be generated" % (refWindow,))
        return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                              refWindow, refSequence)
    poaCss = p.Sequence
    ga = cc.Align(refSequence, poaCss)

    # Extract reads into ConsensusCore2-compatible objects, and map them into the
    # coordinates relative to the POA consensus
    mappedReads = [arrowConfig.extractMappedRead(aln, refStart) for aln in alns]
    queryPositions = cc.TargetToQueryPositions(ga)
    mappedReads = [(lifted(queryPositions, mr), snr) for (mr, snr) in mappedReads]

    # Load the mapped reads into the mutation scorer
    ai = cc.MultiMolecularIntegrator(poaCss, cc.IntegratorConfig())
    for (mr, snr) in mappedReads:
        # TODO (dalexander, lhepler): check for success to compute coverage accurately
        ai.AddRead(mr, snr)

    # Iterate until convergence
    try:
        _, converged = refineConsensus(ai, arrowConfig)
        if not converged:
            # Explicit check rather than `assert`: asserts are stripped
            # under `python -O`, which would let a non-converged
            # template be reported as the consensus.
            logging.info("%s: Arrow did not converge to MLE" % (refWindow,))
            return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                                  refWindow, refSequence)
        arrowCss = str(ai)
        if arrowConfig.computeConfidence:
            confidence = consensusConfidence(ai)
        else:
            confidence = np.zeros(shape=len(arrowCss), dtype=int)
        return ArrowConsensus(refWindow,
                              arrowCss,
                              confidence,
                              ai)
    except Exception:
        # Log the full traceback and fall back to a no-call.
        tb = ''.join(format_exception(*sys.exc_info()))
        logging.info("%s: %s" % (refWindow, tb))
        return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                              refWindow, refSequence)
Exemple #2
0
    def onChunk(self, workChunk):
        """
        Produce the consensus and variants for a single work chunk.

        Chunks flagged as having no coverage get a "no call" consensus
        and an empty variant list.  Otherwise the consensus is called on
        an enlarged window (the chunk window grown by
        `options.referenceChunkOverlap`), aligned back to the reference,
        and clipped to the bounds of the original window; variants are
        kept only if they start inside the original window.

        Returns a tuple (referenceWindow, (consensus, variants)).
        """
        window = workChunk.window
        refId, refStart, refEnd = window
        windowRefSeq = reference.sequenceInWindow(window)

        # Quick cutout for the no-coverage case
        if not workChunk.hasCoverage:
            emptyCss = ArrowConsensus.noCallConsensus(
                self.arrowConfig.noEvidenceConsensus, window, windowRefSeq)
            return (window, (emptyCss, []))

        # General case: call consensus on the enlarged window, then map
        # back to the reference and clip the consensus at the implied
        # bounds.  This seems to be more reliable than cutting the
        # consensus bluntly.
        enlargedWindow = reference.enlargedReferenceWindow(
            window, options.referenceChunkOverlap)
        _, eStart, eEnd = enlargedWindow
        contigSeq = reference.byName[refId].sequence
        enlargedRefSeq = contigSeq[eStart:eEnd]

        # Consensus and variants for the enlarged window
        rawCss, rawVariants = consensusAndVariantsForWindow(
            self._inAlnFile, enlargedWindow, contigSeq,
            options.coverage, self.arrowConfig)

        # Locate the original window's bounds within the consensus
        alignment = cc.Align(enlargedRefSeq, rawCss.sequence)
        liftedPositions = cc.TargetToQueryPositions(alignment)
        clipStart = liftedPositions[refStart - eStart]
        clipEnd = liftedPositions[refEnd - eStart]

        clippedSeq = rawCss.sequence[clipStart:clipEnd]
        clippedQv = rawCss.confidence[clipStart:clipEnd]
        keptVariants = [v for v in rawVariants
                        if refStart <= v.refStart < refEnd]

        clippedCss = Consensus(window, clippedSeq, clippedQv)
        return (window, (clippedCss, keptVariants))
Exemple #3
0
def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
                                  depthLimit, arrowConfig):
    """
    High-level routine for calling the consensus for a
    window of the genome given a cmp.h5.

    Identifies the coverage contours of the window in order to
    identify subintervals where a good consensus can be called.
    Creates the desired "no evidence consensus" where there is
    inadequate coverage.

    Parameters:
      alnFile         -- alignment file handed to `U.readsInWindow`
      refWindow       -- (id, start, end) reference window
      referenceContig -- contig sequence, sliced here by interval
                         start/end coordinates
      depthLimit      -- depth limit used when fetching the reads for
                         each interval
      arrowConfig     -- configuration object; this function reads
                         minMapQV, minPoaCoverage and noEvidenceConsensus

    Also reads the module-level `options` (fancyChunking, readStratum,
    barcode, aligner, minCoverage, minConfidence, annotateGFF,
    dumpEvidence).

    Returns (css, variants): the joined consensus over all intervals of
    the window and the list of filtered variants.
    """
    winId, winStart, winEnd = refWindow
    logging.info("Arrow operating on %s" %
                 reference.windowToString(refWindow))

    if options.fancyChunking:
        # 1) identify the intervals with adequate coverage for arrow
        #    consensus; restrict to intervals of length > 10
        alnHits = U.readsInWindow(alnFile, refWindow,
                                  depthLimit=20000,
                                  minMapQV=arrowConfig.minMapQV,
                                  strategy="longest",
                                  stratum=options.readStratum,
                                  barcode=options.barcode)
        # `np.int` was only an alias of the builtin `int` and has been
        # removed from NumPy (1.24+); the builtin is the exact equivalent.
        starts = np.fromiter((hit.tStart for hit in alnHits), int)
        ends   = np.fromiter((hit.tEnd   for hit in alnHits), int)
        intervals = kSpannedIntervals(refWindow, arrowConfig.minPoaCoverage,
                                      starts, ends, minLength=10)
        coverageGaps = holes(refWindow, intervals)
        allIntervals = sorted(intervals + coverageGaps)
        if len(allIntervals) > 1:
            logging.info("Usable coverage in %s: %r" %
                         (reference.windowToString(refWindow), intervals))

    else:
        allIntervals = [ (winStart, winEnd) ]

    # 2) pull out the reads we will use for each interval
    # 3) call consensusForAlignments on the interval
    subConsensi = []
    variants = []

    for interval in allIntervals:
        intStart, intEnd = interval
        # Reference sequence for this interval; used both as the
        # consensus reference and for variant calling.
        intRefSeq = referenceContig[intStart:intEnd]
        subWin = subWindow(refWindow, interval)

        alns = U.readsInWindow(alnFile, subWin,
                               depthLimit=depthLimit,
                               minMapQV=arrowConfig.minMapQV,
                               strategy="longest",
                               stratum=options.readStratum,
                               barcode=options.barcode)
        clippedAlns_ = [ aln.clippedTo(*interval) for aln in alns ]
        clippedAlns = U.filterAlns(subWin, clippedAlns_, arrowConfig)

        numSpanning = sum(1 for a in clippedAlns
                          if a.spansReferenceRange(*interval))

        if numSpanning >= arrowConfig.minPoaCoverage:

            logging.debug("%s: Reads being used: %s" %
                          (reference.windowToString(subWin),
                           " ".join([str(hit.readName) for hit in alns])))

            css = U.consensusForAlignments(subWin,
                                           intRefSeq,
                                           clippedAlns,
                                           arrowConfig)

            siteCoverage = U.coverageInWindow(subWin, alns)

            variants_ = U.variantsFromConsensus(subWin, intRefSeq,
                                                css.sequence, css.confidence, siteCoverage,
                                                options.aligner,
                                                ai=None)

            filteredVars = filterVariants(options.minCoverage,
                                          options.minConfidence,
                                          variants_)
            # Annotate?
            if options.annotateGFF:
                annotateVariants(filteredVars, clippedAlns)

            variants += filteredVars

            # Dump?
            shouldDumpEvidence = \
                ((options.dumpEvidence == "all") or
                 ((options.dumpEvidence == "variants") and (len(variants) > 0)))
            if shouldDumpEvidence:
                # Evidence dumping is not implemented for Arrow; the
                # intended call is kept below for reference.
                logging.info("Arrow does not yet support --dumpEvidence")
#                 dumpEvidence(options.evidenceDirectory,
#                              subWin, intRefSeq,
#                              clippedAlns, css)
        else:
            css = ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                                 subWin, intRefSeq)
        subConsensi.append(css)

    # 4) glue the subwindow consensus objects together to form the
    #    full window consensus
    css = join(subConsensi)

    # 5) Return
    return css, variants
Exemple #4
0
def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
                                  depthLimit, arrowConfig):
    """
    High-level routine for calling the consensus for a
    window of the genome given a BAM file.

    Identifies the coverage contours of the window in order to
    identify subintervals where a good consensus can be called.
    Creates the desired "no evidence consensus" where there is
    inadequate coverage.

    Parameters:
      alnFile         -- alignment file handed to `U.readsInWindow`
      refWindow       -- (id, start, end) reference window
      referenceContig -- contig sequence, sliced here by interval
                         start/end coordinates
      depthLimit      -- depth limit used when fetching the reads for
                         each interval
      arrowConfig     -- configuration object; this function reads
                         minMapQV, minPoaCoverage, polishDiploid and
                         noEvidenceConsensus

    Also reads the module-level `options` (fancyChunking, readStratum,
    barcode, reportEffectiveCoverage, aligner, annotateGFF).

    Returns (css, variants): the joined consensus over all intervals of
    the window and the accumulated list of variants.
    """
    winId, winStart, winEnd = refWindow
    logging.info("Arrow operating on %s" % reference.windowToString(refWindow))

    if options.fancyChunking:
        # 1) identify the intervals with adequate coverage for arrow
        #    consensus; restrict to intervals of length > 10
        alnHits = U.readsInWindow(alnFile,
                                  refWindow,
                                  depthLimit=20000,
                                  minMapQV=arrowConfig.minMapQV,
                                  strategy="long-and-strand-balanced",
                                  stratum=options.readStratum,
                                  barcode=options.barcode)
        # `np.int` was only an alias of the builtin `int` and has been
        # removed from NumPy (1.24+); the builtin is the exact equivalent.
        starts = np.fromiter((hit.tStart for hit in alnHits), int)
        ends = np.fromiter((hit.tEnd for hit in alnHits), int)
        intervals = kSpannedIntervals(refWindow,
                                      arrowConfig.minPoaCoverage,
                                      starts,
                                      ends,
                                      minLength=10)
        coverageGaps = holes(refWindow, intervals)
        allIntervals = sorted(intervals + coverageGaps)
        if len(allIntervals) > 1:
            logging.info("Usable coverage in %s: %r" %
                         (reference.windowToString(refWindow), intervals))

    else:
        allIntervals = [(winStart, winEnd)]

    # 2) pull out the reads we will use for each interval
    # 3) call consensusForAlignments on the interval
    subConsensi = []
    variants = []

    for interval in allIntervals:
        intStart, intEnd = interval
        # Reference sequence for this interval; used both as the
        # consensus reference and for variant calling.
        intRefSeq = referenceContig[intStart:intEnd]
        subWin = subWindow(refWindow, interval)

        alns = U.readsInWindow(alnFile,
                               subWin,
                               depthLimit=depthLimit,
                               minMapQV=arrowConfig.minMapQV,
                               strategy="long-and-strand-balanced",
                               stratum=options.readStratum,
                               barcode=options.barcode)
        clippedAlns_ = [aln.clippedTo(*interval) for aln in alns]
        clippedAlns = U.filterAlns(subWin, clippedAlns_, arrowConfig)

        numSpanning = sum(1 for a in clippedAlns
                          if a.spansReferenceRange(*interval))

        if numSpanning >= arrowConfig.minPoaCoverage:

            logging.debug("%s: Reads being used: %s" %
                          (reference.windowToString(subWin), " ".join(
                              [str(hit.readName) for hit in alns])))

            # `alnsUsed` is filled in by consensusForAlignments with the
            # alignments that actually contributed to the consensus.
            alnsUsed = [] if options.reportEffectiveCoverage else None
            css = U.consensusForAlignments(subWin,
                                           intRefSeq,
                                           clippedAlns,
                                           arrowConfig,
                                           alnsUsed=alnsUsed)

            # Tabulate the coverage implied by these alignments, as
            # well as the post-filtering ("effective") coverage
            siteCoverage = U.coverageInWindow(subWin, alns)
            effectiveSiteCoverage = U.coverageInWindow(
                subWin, alnsUsed) if options.reportEffectiveCoverage else None

            variants_, newPureCss = U.variantsFromConsensus(
                subWin,
                intRefSeq,
                css.sequence,
                css.confidence,
                siteCoverage,
                effectiveSiteCoverage,
                options.aligner,
                ai=None,
                diploid=arrowConfig.polishDiploid)

            # Annotate?
            if options.annotateGFF:
                annotateVariants(variants_, clippedAlns)

            variants += variants_

            # The nascent consensus sequence might contain ambiguous bases, these
            # need to be removed as software in the wild cannot deal with such
            # characters and we only use IUPAC for *internal* bookkeeping.
            if arrowConfig.polishDiploid:
                css.sequence = newPureCss
        else:
            css = ArrowConsensus.noCallConsensus(
                arrowConfig.noEvidenceConsensus, subWin, intRefSeq)
        subConsensi.append(css)

    # 4) glue the subwindow consensus objects together to form the
    #    full window consensus
    css = join(subConsensi)

    # 5) Return
    return css, variants
Exemple #5
0
def consensusForAlignments(refWindow, refSequence, alns, arrowConfig, draft=None, polish=True,
                           alnsUsed=None):
    """
    Call consensus on this interval---without subdividing the interval
    further.

    Returns an ArrowConsensus object.  A "no call" consensus (built by
    ArrowConsensus.noCallConsensus) is returned when the POA cannot be
    generated, when fewer than arrowConfig.minPoaCoverage reads were
    accepted by the integrator, or when polishing does not converge.

    Requires that clipping has already been done.

    If `draft` is provided, it will serve as the starting
    point for polishing.  If not, the POA will be used to generate a
    draft starting point.

    If `polish` is False, the arrow polishing procedure will not be
    used, and the draft consensus will be returned (with an all-zero
    confidence vector).

    `alnsUsed` is an output parameter; if not None, it should be an
    empty list on entry; on return from this function, the list will
    contain the alns objects that were actually used to compute the
    consensus (those not filtered out).

    Raises AssertionError if `alnsUsed` is a non-empty list, or if no
    draft is given and fewer than arrowConfig.minPoaCoverage reads span
    the window.
    """
    _, refStart, refEnd = refWindow

    if alnsUsed is not None:
        assert alnsUsed == []

    if draft is None:
        # Compute the POA consensus, which is our initial guess, and
        # should typically be > 99.5% accurate
        fwdSequences = [ a.read(orientation="genomic", aligned=False)
                         for a in alns
                         if a.spansReferenceRange(refStart, refEnd) ]
        assert len(fwdSequences) >= arrowConfig.minPoaCoverage

        try:
            p = cc.PoaConsensus.FindConsensus(fwdSequences[:arrowConfig.maxPoaCoverage])
        except Exception:
            # Best effort: any POA failure degrades to a no-call, but
            # KeyboardInterrupt/SystemExit are allowed to propagate.
            logging.info("%s: POA could not be generated" % (refWindow,))
            return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                                  refWindow, refSequence)
        draft = p.Sequence

    ga = cc.Align(refSequence, draft)

    # Extract reads into ConsensusCore2-compatible objects, and map them into the
    # coordinates relative to the draft consensus
    mappedReads = [ arrowConfig.extractMappedRead(aln, refStart) for aln in alns ]
    queryPositions = cc.TargetToQueryPositions(ga)
    mappedReads = [ lifted(queryPositions, mr) for mr in mappedReads ]

    # Load the mapped reads into the mutation scorer, counting the
    # reads the integrator accepts.
    ai = cc.MultiMolecularIntegrator(draft, cc.IntegratorConfig(arrowConfig.minZScore))
    coverage = 0
    for i, mr in enumerate(mappedReads):
        # Skip reads whose lifted template span or sequence is shorter
        # than two bases.
        if (mr.TemplateEnd <= mr.TemplateStart or
            mr.TemplateEnd - mr.TemplateStart < 2 or
            mr.Length() < 2):
            continue
        if not sufficientlyAccurate(mr, draft, arrowConfig.minAccuracy):
            # Show the draft segment in the read's orientation so the
            # debug message compares like with like.
            tpl = draft[mr.TemplateStart:mr.TemplateEnd]
            if mr.Strand == cc.StrandType_FORWARD:
                pass
            elif mr.Strand == cc.StrandType_REVERSE:
                tpl = reverseComplement(tpl)
            else:
                tpl = "INACTIVE/UNMAPPED"
            logging.debug("%s: skipping read '%s' due to insufficient accuracy, (poa, read): ('%s', '%s')" % (refWindow, mr.Name, tpl, mr.Seq))
            continue
        if ai.AddRead(mr) == cc.State_VALID:
            coverage += 1
            if alnsUsed is not None:
                # mappedReads is index-aligned with alns
                alnsUsed.append(alns[i])

    if coverage < arrowConfig.minPoaCoverage:
        logging.info("%s: Inadequate coverage to call consensus" % (refWindow,))
        return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                              refWindow, refSequence)

    if not polish:
        confidence = np.zeros(len(draft), dtype=int)
        return ArrowConsensus(refWindow, draft, confidence, ai)

    # Iterate until convergence
    _, converged = refineConsensus(ai, arrowConfig)
    if converged:
        arrowCss = str(ai)
        if arrowConfig.computeConfidence:
            confidence = consensusConfidence(ai)
        else:
            confidence = np.zeros(shape=len(arrowCss), dtype=int)
        return ArrowConsensus(refWindow,
                              arrowCss,
                              confidence,
                              ai)
    else:
        logging.info("%s: Arrow did not converge to MLE" % (refWindow,))
        return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                              refWindow, refSequence)
def consensusForAlignments(refWindow, refSequence, alns, arrowConfig):
    """
    Call consensus on this interval---without subdividing the interval
    further.

    Clipping is expected to have been done by the caller already.

    A POA consensus built from the reads that span the window is used as
    the initial guess.  Reads are lifted into POA coordinates, filtered
    by length and by `sufficientlyAccurate`, added to a
    `cc.MultiMolecularIntegrator`, and the template is then refined with
    `refineConsensus`.

    Parameters:
      refWindow   -- 3-tuple; the 2nd and 3rd items are used as the
                     reference start and end of the interval
      refSequence -- reference sequence for the window
      alns        -- alignments to use (must be iterable more than once)
      arrowConfig -- configuration object; this function reads
                     minPoaCoverage, maxPoaCoverage, minZScore,
                     minAccuracy, computeConfidence, noEvidenceConsensus
                     and calls extractMappedRead

    Returns an ArrowConsensus.  A "no call" consensus (built by
    ArrowConsensus.noCallConsensus) is returned when the POA cannot be
    generated, when fewer than arrowConfig.minPoaCoverage reads were
    accepted by the integrator, or when refinement does not converge.

    Raises AssertionError if fewer than arrowConfig.minPoaCoverage reads
    span the window.
    """
    _, refStart, refEnd = refWindow

    # Compute the POA consensus, which is our initial guess, and
    # should typically be > 99.5% accurate
    fwdSequences = [ a.read(orientation="genomic", aligned=False)
                     for a in alns
                     if a.spansReferenceRange(refStart, refEnd) ]
    assert len(fwdSequences) >= arrowConfig.minPoaCoverage

    try:
        p = cc.PoaConsensus.FindConsensus(fwdSequences[:arrowConfig.maxPoaCoverage])
    except Exception:
        # Best effort: any POA failure degrades to a no-call, but
        # KeyboardInterrupt/SystemExit are allowed to propagate.
        logging.info("%s: POA could not be generated" % (refWindow,))
        return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                              refWindow, refSequence)
    poaCss = p.Sequence
    ga = cc.Align(refSequence, poaCss)

    # Extract reads into ConsensusCore2-compatible objects, and map them into the
    # coordinates relative to the POA consensus
    mappedReads = [ arrowConfig.extractMappedRead(aln, refStart) for aln in alns ]
    queryPositions = cc.TargetToQueryPositions(ga)
    mappedReads = [ lifted(queryPositions, mr) for mr in mappedReads ]

    # Load the mapped reads into the mutation scorer, counting the
    # reads the integrator accepts.
    ai = cc.MultiMolecularIntegrator(poaCss, cc.IntegratorConfig(arrowConfig.minZScore))
    coverage = 0
    for mr in mappedReads:
        # Skip reads whose lifted template span or sequence is shorter
        # than two bases.
        if (mr.TemplateEnd <= mr.TemplateStart or
            mr.TemplateEnd - mr.TemplateStart < 2 or
            mr.Length() < 2):
            continue
        if not sufficientlyAccurate(mr, poaCss, arrowConfig.minAccuracy):
            # Show the POA segment in the read's orientation so the
            # debug message compares like with like.
            tpl = poaCss[mr.TemplateStart:mr.TemplateEnd]
            if mr.Strand == cc.StrandType_FORWARD:
                pass
            elif mr.Strand == cc.StrandType_REVERSE:
                tpl = reverseComplement(tpl)
            else:
                tpl = "INACTIVE/UNMAPPED"
            logging.debug("%s: skipping read '%s' due to insufficient accuracy, (poa, read): ('%s', '%s')" % (refWindow, mr.Name, tpl, mr.Seq))
            continue
        if ai.AddRead(mr) == cc.State_VALID:
            coverage += 1

    if coverage < arrowConfig.minPoaCoverage:
        logging.info("%s: Inadequate coverage to call consensus" % (refWindow,))
        return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                              refWindow, refSequence)

    # Iterate until convergence
    _, converged = refineConsensus(ai, arrowConfig)

    if converged:
        arrowCss = str(ai)
        if arrowConfig.computeConfidence:
            confidence = consensusConfidence(ai)
        else:
            confidence = np.zeros(shape=len(arrowCss), dtype=int)
        return ArrowConsensus(refWindow,
                              arrowCss,
                              confidence,
                              ai)
    else:
        logging.info("%s: Arrow did not converge to MLE" % (refWindow,))
        return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                              refWindow, refSequence)
Exemple #7
0
def consensusForAlignments(refWindow, refSequence, alns, arrowConfig):
    """
    Call consensus on this interval---without subdividing the interval
    further.

    Clipping is expected to have been done by the caller already.

    A POA consensus built from the reads that span the window is used as
    the initial guess.  Reads are lifted into POA coordinates, added to a
    `cc.MultiMolecularIntegrator` together with their SNR, and the
    template is then refined with `refineConsensus`.

    Parameters:
      refWindow   -- 3-tuple; the 2nd and 3rd items are used as the
                     reference start and end of the interval
      refSequence -- reference sequence for the window
      alns        -- alignments to use (must be iterable more than once)
      arrowConfig -- configuration object; this function reads
                     minPoaCoverage, maxPoaCoverage, minZScore,
                     computeConfidence, noEvidenceConsensus and calls
                     extractMappedRead, which here yields
                     (mappedRead, snr) pairs

    Returns an ArrowConsensus.  A "no call" consensus (built by
    ArrowConsensus.noCallConsensus) is returned when the POA cannot be
    generated, when too few reads were added successfully, when
    refinement does not converge, or when refinement or confidence
    computation raises.

    Raises AssertionError if fewer than arrowConfig.minPoaCoverage reads
    span the window.
    """
    _, refStart, refEnd = refWindow

    def noCall():
        # Shared fallback for every failure path below.
        return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                              refWindow, refSequence)

    # Compute the POA consensus, which is our initial guess, and
    # should typically be > 99.5% accurate
    fwdSequences = [
        a.read(orientation="genomic", aligned=False) for a in alns
        if a.spansReferenceRange(refStart, refEnd)
    ]
    assert len(fwdSequences) >= arrowConfig.minPoaCoverage

    try:
        p = cc.PoaConsensus.FindConsensus(
            fwdSequences[:arrowConfig.maxPoaCoverage])
    except Exception:
        # Best effort: any POA failure degrades to a no-call, but
        # KeyboardInterrupt/SystemExit are allowed to propagate.
        logging.info("%s: POA could not be generated" % (refWindow, ))
        return noCall()
    poaCss = p.Sequence
    ga = cc.Align(refSequence, poaCss)

    # Extract reads into ConsensusCore2-compatible objects, and map them into the
    # coordinates relative to the POA consensus
    mappedReads = [
        arrowConfig.extractMappedRead(aln, refStart) for aln in alns
    ]
    queryPositions = cc.TargetToQueryPositions(ga)
    mappedReads = [(lifted(queryPositions, mr), snr)
                   for (mr, snr) in mappedReads]

    # Load the mapped reads into the mutation scorer, counting the
    # reads that were added successfully.
    ai = cc.MultiMolecularIntegrator(
        poaCss, cc.IntegratorConfig(arrowConfig.minZScore))
    coverage = 0
    for (mr, snr) in mappedReads:
        # Skip reads whose lifted template span or sequence is shorter
        # than two bases.
        if (mr.TemplateEnd <= mr.TemplateStart
                or mr.TemplateEnd - mr.TemplateStart < 2 or mr.Length() < 2):
            continue
        if ai.AddRead(mr, snr) == cc.AddReadResult_SUCCESS:
            coverage += 1

    # TODO(lhepler, dalexander): propagate coverage around somehow

    # Explicit checks rather than `assert`: asserts are stripped under
    # `python -O`, which would let an under-covered or non-converged
    # template be reported as the consensus.
    if coverage < arrowConfig.minPoaCoverage:
        logging.info("%s: Insufficient coverage (%d) to call consensus (%d)"
                     % (refWindow, coverage, arrowConfig.minPoaCoverage))
        return noCall()

    # Iterate until convergence
    try:
        _, converged = refineConsensus(ai, arrowConfig)
        if not converged:
            logging.info("%s: Arrow did not converge to MLE" % (refWindow, ))
            return noCall()
        arrowCss = str(ai)
        if arrowConfig.computeConfidence:
            confidence = consensusConfidence(ai)
        else:
            confidence = np.zeros(shape=len(arrowCss), dtype=int)
        return ArrowConsensus(refWindow, arrowCss, confidence, ai)
    except Exception:
        # Log the full traceback and fall back to a no-call.
        tb = ''.join(format_exception(*sys.exc_info()))
        logging.info("%s: %s" % (refWindow, tb))
        return noCall()
Exemple #8
0
def consensusForAlignments(refWindow,
                           refSequence,
                           alns,
                           arrowConfig,
                           draft=None,
                           polish=True,
                           alnsUsed=None):
    """
    Compute the consensus for one interval, treating it as a single unit.

    The alignments must already be clipped to the interval.

    `draft`, when given, is the starting template for polishing;
    otherwise a POA consensus of the reads spanning the window is used.

    With `polish` set to False no polishing is performed and the draft
    itself is returned with an all-zero confidence vector.

    `alnsUsed` is an output parameter: pass None to ignore it, or an
    empty list that will be filled with the entries of `alns` that were
    actually added to the integrator.

    When arrowConfig.polishDiploid is set, a further diploid round of
    refinement is run after the first round has converged.

    Returns an ArrowConsensus; a "no call" consensus is returned when the
    POA cannot be generated, when too few reads are usable, or when the
    first polishing round does not converge.
    """
    _, refStart, refEnd = refWindow

    def noCall():
        # Fallback result shared by all failure paths.
        return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                              refWindow, refSequence)

    def currentConsensus():
        # Snapshot of the integrator's template plus its confidence
        # (zeros when confidence computation is disabled).
        css = str(ai)
        if arrowConfig.computeConfidence:
            qv = consensusConfidence(ai)
        else:
            qv = np.zeros(shape=len(css), dtype=int)
        return css, qv

    assert alnsUsed is None or alnsUsed == []

    if draft is None:
        # The POA consensus is the initial guess; it should typically be
        # > 99.5% accurate.
        fwdSequences = []
        for a in alns:
            if a.spansReferenceRange(refStart, refEnd):
                fwdSequences.append(
                    a.read(orientation="genomic", aligned=False))
        assert len(fwdSequences) >= arrowConfig.minPoaCoverage

        try:
            p = poaConsensus(fwdSequences, arrowConfig)
        except Exception:
            logging.info("%s: POA could not be generated" % (refWindow, ))
            return noCall()
        draft = p.Sequence

    ga = cc.Align(refSequence, draft)

    # Convert alignments to ConsensusCore2-compatible reads, then lift
    # them into draft coordinates.
    extracted = [
        arrowConfig.extractMappedRead(aln, refStart) for aln in alns
    ]
    queryPositions = cc.TargetToQueryPositions(ga)
    mappedReads = [lifted(queryPositions, mr) for mr in extracted]

    # Feed the usable reads to the integrator.
    ai = cc.Integrator(draft, cc.IntegratorConfig(arrowConfig.minZScore))
    coverage = 0
    for idx, mr in enumerate(mappedReads):
        tooShort = (mr.TemplateEnd <= mr.TemplateStart
                    or mr.TemplateEnd - mr.TemplateStart < 2
                    or mr.Length() < 2)
        if tooShort:
            continue

        if not sufficientlyAccurate(mr, draft, arrowConfig.minAccuracy):
            tpl = draft[mr.TemplateStart:mr.TemplateEnd]
            if mr.Strand == cc.StrandType_FORWARD:
                pass
            elif mr.Strand == cc.StrandType_REVERSE:
                tpl = reverseComplement(tpl)
            else:
                tpl = "INACTIVE/UNMAPPED"
            logging.debug(
                "%s: skipping read '%s' due to insufficient accuracy, (poa, read): ('%s', '%s')"
                % (refWindow, mr.Name, tpl, mr.Seq))
            continue

        if ai.AddRead(mr) != cc.State_VALID:
            continue
        coverage += 1
        if alnsUsed is not None:
            alnsUsed.append(alns[idx])

    if coverage < arrowConfig.minPoaCoverage:
        logging.info("%s: Inadequate coverage to call consensus" %
                     (refWindow, ))
        return noCall()

    if not polish:
        return ArrowConsensus(refWindow, draft,
                              np.zeros(len(draft), dtype=int), ai)

    # First (haploid) round of refinement
    _, converged = refineConsensus(ai, arrowConfig, polishDiploid=False)
    if not converged:
        logging.info("%s: Arrow did not converge to MLE" % (refWindow, ))
        return noCall()
    arrowCss, confidence = currentConsensus()

    if arrowConfig.polishDiploid:
        # Additional rounds of diploid polishing; on failure the result
        # of the first round is kept.
        _, converged = refineConsensus(ai, arrowConfig, polishDiploid=True)
        if converged:
            arrowCss, confidence = currentConsensus()
        else:
            logging.info(
                "%s: Arrow (diploid) did not converge to optimal solution" %
                (refWindow, ))

    return ArrowConsensus(refWindow, arrowCss, confidence, ai)
Exemple #9
0
def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
                                  depthLimit, arrowConfig):
    """
    High-level routine for calling the consensus and variants for a
    window of the genome given an alignment file.

    With ``options.fancyChunking`` enabled the window is partitioned
    into the intervals reported by ``kSpannedIntervals`` (adequate
    coverage for an arrow consensus) plus the ``holes`` between them;
    otherwise the whole window is handled as one interval.  Intervals
    with at least ``arrowConfig.minPoaCoverage`` spanning reads after
    clipping and filtering get a real consensus; every other interval
    gets the configured "no evidence consensus".

    Parameters:
      alnFile         -- alignment source handed to ``U.readsInWindow``
      refWindow       -- ``(id, start, end)`` triple describing the window
      referenceContig -- reference sequence, sliced with interval coordinates
      depthLimit      -- forwarded as ``depthLimit`` for the per-interval
                         read fetch
      arrowConfig     -- supplies ``minMapQV``, ``minPoaCoverage`` and
                         ``noEvidenceConsensus``

    Returns:
      ``(css, variants)`` where ``css`` is ``join()`` applied to the
      per-interval consensus objects and ``variants`` is the list of
      filtered variants accumulated over all intervals.
    """
    _, winStart, winEnd = refWindow
    logging.info("Arrow operating on %s",
                 reference.windowToString(refWindow))

    if options.fancyChunking:
        # 1) identify the intervals with adequate coverage for arrow
        #    consensus (kSpannedIntervals is called with minLength=10)
        alnHits = U.readsInWindow(alnFile,
                                  refWindow,
                                  depthLimit=20000,
                                  minMapQV=arrowConfig.minMapQV,
                                  strategy="long-and-strand-balanced",
                                  stratum=options.readStratum,
                                  barcode=options.barcode)
        # `np.int` was only an alias of the builtin `int` and has been
        # removed from NumPy (1.24); the builtin is the drop-in equivalent.
        starts = np.fromiter((hit.tStart for hit in alnHits), int)
        ends = np.fromiter((hit.tEnd for hit in alnHits), int)
        intervals = kSpannedIntervals(refWindow,
                                      arrowConfig.minPoaCoverage,
                                      starts,
                                      ends,
                                      minLength=10)
        coverageGaps = holes(refWindow, intervals)
        allIntervals = sorted(intervals + coverageGaps)
        if len(allIntervals) > 1:
            logging.info("Usable coverage in %s: %r",
                         reference.windowToString(refWindow), intervals)
    else:
        allIntervals = [(winStart, winEnd)]

    # 2) pull out the reads we will use for each interval
    # 3) call consensusForAlignments on the interval
    subConsensi = []
    variants = []

    for interval in allIntervals:
        intStart, intEnd = interval
        intRefSeq = referenceContig[intStart:intEnd]
        subWin = subWindow(refWindow, interval)

        alns = U.readsInWindow(alnFile,
                               subWin,
                               depthLimit=depthLimit,
                               minMapQV=arrowConfig.minMapQV,
                               strategy="long-and-strand-balanced",
                               stratum=options.readStratum,
                               barcode=options.barcode)
        clippedAlns = U.filterAlns(
            subWin, [aln.clippedTo(*interval) for aln in alns], arrowConfig)

        numSpanning = sum(1 for a in clippedAlns
                          if a.spansReferenceRange(*interval))
        if numSpanning < arrowConfig.minPoaCoverage:
            # Not enough spanning reads: emit the no-call consensus.
            subConsensi.append(ArrowConsensus.noCallConsensus(
                arrowConfig.noEvidenceConsensus, subWin, intRefSeq))
            continue

        logging.debug("%s: Reads being used: %s",
                      reference.windowToString(subWin),
                      " ".join([str(hit.readName) for hit in alns]))

        # consensusForAlignments receives this list via `alnsUsed`; it is
        # only requested when effective coverage is to be reported.
        alnsUsed = [] if options.reportEffectiveCoverage else None
        css = U.consensusForAlignments(subWin,
                                       intRefSeq,
                                       clippedAlns,
                                       arrowConfig,
                                       alnsUsed=alnsUsed)

        # Tabulate the coverage implied by these alignments, as
        # well as the post-filtering ("effective") coverage
        siteCoverage = U.coverageInWindow(subWin, alns)
        effectiveSiteCoverage = U.coverageInWindow(
            subWin, alnsUsed) if options.reportEffectiveCoverage else None

        variants_ = U.variantsFromConsensus(subWin,
                                            intRefSeq,
                                            css.sequence,
                                            css.confidence,
                                            siteCoverage,
                                            effectiveSiteCoverage,
                                            options.aligner,
                                            ai=None)

        filteredVars = filterVariants(options.minCoverage,
                                      options.minConfidence, variants_)
        # Annotate?
        if options.annotateGFF:
            annotateVariants(filteredVars, clippedAlns)

        variants += filteredVars

        # Dump?  ("and" binds tighter than "or"; parenthesised explicitly.)
        # NOTE(review): the "variants" mode tests the list accumulated over
        # all intervals so far, not just this interval -- confirm intended.
        maybeDumpEvidence = (
            (options.dumpEvidence == "all") or
            (options.dumpEvidence == "outliers") or
            ((options.dumpEvidence == "variants") and (len(variants) > 0)))
        if maybeDumpEvidence:
            refId, refStart, refEnd = subWin
            refName = reference.idToName(refId)
            windowDirectory = os.path.join(options.evidenceDirectory,
                                           refName,
                                           "%d-%d" % (refStart, refEnd))
            ev = ArrowEvidence.fromConsensus(css)
            # In "outliers" mode only save when a delta exceeds 20.
            # Mathematically I don't think we should be seeing
            # deltas > 6 in magnitude, but let's just restrict
            # attention to truly bonkers outliers.
            if options.dumpEvidence != "outliers" or np.max(ev.delta) > 20:
                ev.save(windowDirectory)

        subConsensi.append(css)

    # 4) glue the subwindow consensus objects together to form the
    #    full window consensus
    # 5) Return
    return join(subConsensi), variants
Exemple #10
0
def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
                                  depthLimit, arrowConfig):
    """
    High-level routine for calling the consensus for a
    window of the genome given an alignment file.

    Identifies the coverage contours of the window in order to
    identify subintervals where a good consensus can be called
    (only when ``options.fancyChunking`` is set; otherwise the whole
    window is one interval).  Creates the desired "no evidence
    consensus" where there is inadequate coverage.

    Parameters:
      alnFile         -- alignment source passed to ``U.readsInWindow``
      refWindow       -- ``(id, start, end)`` triple
      referenceContig -- reference sequence; sliced as ``[start:end]``
      depthLimit      -- ``depthLimit`` used when fetching reads for
                         each interval
      arrowConfig     -- provides ``minMapQV``, ``minPoaCoverage`` and
                         ``noEvidenceConsensus``

    Returns:
      A 2-tuple ``(css, variants)``: the joined per-interval consensus
      and the list of variants that passed ``filterVariants``.
    """
    _, winStart, winEnd = refWindow
    logging.info("Arrow operating on %s",
                 reference.windowToString(refWindow))

    if options.fancyChunking:
        # 1) identify the intervals with adequate coverage for arrow
        #    consensus (minLength=10 is passed to kSpannedIntervals)
        alnHits = U.readsInWindow(alnFile, refWindow,
                                  depthLimit=20000,
                                  minMapQV=arrowConfig.minMapQV,
                                  strategy="long-and-strand-balanced",
                                  stratum=options.readStratum,
                                  barcode=options.barcode)
        # The `np.int` alias no longer exists in current NumPy releases;
        # builtin `int` is what it always resolved to.
        starts = np.fromiter((hit.tStart for hit in alnHits), int)
        ends = np.fromiter((hit.tEnd for hit in alnHits), int)
        intervals = kSpannedIntervals(refWindow, arrowConfig.minPoaCoverage,
                                      starts, ends, minLength=10)
        coverageGaps = holes(refWindow, intervals)
        allIntervals = sorted(intervals + coverageGaps)
        if len(allIntervals) > 1:
            logging.info("Usable coverage in %s: %r",
                         reference.windowToString(refWindow), intervals)
    else:
        allIntervals = [(winStart, winEnd)]

    # 2) pull out the reads we will use for each interval
    # 3) call consensusForAlignments on the interval
    subConsensi = []
    variants = []

    for interval in allIntervals:
        intStart, intEnd = interval
        # One slice serves both the consensus call and variant calling.
        intRefSeq = referenceContig[intStart:intEnd]
        subWin = subWindow(refWindow, interval)

        alns = U.readsInWindow(alnFile, subWin,
                               depthLimit=depthLimit,
                               minMapQV=arrowConfig.minMapQV,
                               strategy="long-and-strand-balanced",
                               stratum=options.readStratum,
                               barcode=options.barcode)
        clippedAlns_ = [aln.clippedTo(*interval) for aln in alns]
        clippedAlns = U.filterAlns(subWin, clippedAlns_, arrowConfig)

        spanningCount = sum(1 for a in clippedAlns
                            if a.spansReferenceRange(*interval))

        if spanningCount >= arrowConfig.minPoaCoverage:
            logging.debug("%s: Reads being used: %s",
                          reference.windowToString(subWin),
                          " ".join([str(hit.readName) for hit in alns]))

            # Passed to consensusForAlignments as `alnsUsed`; None when
            # effective coverage is not being reported.
            alnsUsed = [] if options.reportEffectiveCoverage else None
            css = U.consensusForAlignments(subWin,
                                           intRefSeq,
                                           clippedAlns,
                                           arrowConfig,
                                           alnsUsed=alnsUsed)

            # Tabulate the coverage implied by these alignments, as
            # well as the post-filtering ("effective") coverage
            siteCoverage = U.coverageInWindow(subWin, alns)
            if options.reportEffectiveCoverage:
                effectiveSiteCoverage = U.coverageInWindow(subWin, alnsUsed)
            else:
                effectiveSiteCoverage = None

            variants_ = U.variantsFromConsensus(subWin, intRefSeq,
                                                css.sequence, css.confidence,
                                                siteCoverage,
                                                effectiveSiteCoverage,
                                                options.aligner,
                                                ai=None)

            filteredVars = filterVariants(options.minCoverage,
                                          options.minConfidence,
                                          variants_)
            # Annotate?
            if options.annotateGFF:
                annotateVariants(filteredVars, clippedAlns)

            variants += filteredVars

            # Dump?  Grouping made explicit: "variants" mode additionally
            # requires a non-empty variant list.
            # NOTE(review): `variants` is the running list for the whole
            # window, not this interval's calls -- confirm this is intended.
            maybeDumpEvidence = (
                (options.dumpEvidence == "all") or
                (options.dumpEvidence == "outliers") or
                ((options.dumpEvidence == "variants") and
                 (len(variants) > 0)))
            if maybeDumpEvidence:
                refId, refStart, refEnd = subWin
                refName = reference.idToName(refId)
                windowDirectory = os.path.join(
                    options.evidenceDirectory,
                    refName,
                    "%d-%d" % (refStart, refEnd))
                ev = ArrowEvidence.fromConsensus(css)
                if options.dumpEvidence != "outliers":
                    ev.save(windowDirectory)
                elif np.max(ev.delta) > 20:
                    # Mathematically I don't think we should be seeing
                    # deltas > 6 in magnitude, but let's just restrict
                    # attention to truly bonkers outliers.
                    ev.save(windowDirectory)
        else:
            css = ArrowConsensus.noCallConsensus(
                arrowConfig.noEvidenceConsensus, subWin, intRefSeq)

        subConsensi.append(css)

    # 4) glue the subwindow consensus objects together to form the
    #    full window consensus
    css = join(subConsensi)

    # 5) Return
    return css, variants
def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
                                  depthLimit, arrowConfig):
    """
    High-level routine for calling the consensus for a
    window of the genome given a BAM file.

    Identifies the coverage contours of the window in order to
    identify subintervals where a good consensus can be called
    (when ``options.fancyChunking`` is set; otherwise the window is a
    single interval).  Creates the desired "no evidence consensus"
    where there is inadequate coverage.

    Parameters:
      alnFile         -- alignment source passed to ``U.readsInWindow``
      refWindow       -- ``(id, start, end)`` triple
      referenceContig -- reference sequence; sliced as ``[start:end]``
      depthLimit      -- ``depthLimit`` used for the per-interval read fetch
      arrowConfig     -- provides ``minMapQV``, ``minPoaCoverage``,
                         ``noEvidenceConsensus`` and ``polishDiploid``

    Returns:
      A 2-tuple ``(css, variants)``: the joined per-interval consensus
      and the list of variants collected from every called interval.
    """
    _, winStart, winEnd = refWindow
    logging.info("Arrow operating on %s",
                 reference.windowToString(refWindow))

    if options.fancyChunking:
        # 1) identify the intervals with adequate coverage for arrow
        #    consensus (minLength=10 is passed to kSpannedIntervals)
        alnHits = U.readsInWindow(alnFile, refWindow,
                                  depthLimit=20000,
                                  minMapQV=arrowConfig.minMapQV,
                                  strategy="long-and-strand-balanced",
                                  stratum=options.readStratum,
                                  barcode=options.barcode)
        # `np.int` (a plain alias of builtin `int`) was removed from
        # NumPy, so the builtin is used directly.
        starts = np.fromiter((hit.tStart for hit in alnHits), int)
        ends = np.fromiter((hit.tEnd for hit in alnHits), int)
        intervals = kSpannedIntervals(refWindow, arrowConfig.minPoaCoverage,
                                      starts, ends, minLength=10)
        coverageGaps = holes(refWindow, intervals)
        allIntervals = sorted(intervals + coverageGaps)
        if len(allIntervals) > 1:
            logging.info("Usable coverage in %s: %r",
                         reference.windowToString(refWindow), intervals)
    else:
        allIntervals = [(winStart, winEnd)]

    # 2) pull out the reads we will use for each interval
    # 3) call consensusForAlignments on the interval
    subConsensi = []
    variants = []

    for interval in allIntervals:
        intStart, intEnd = interval
        # Single reference slice shared by consensus and variant calling.
        intRefSeq = referenceContig[intStart:intEnd]
        subWin = subWindow(refWindow, interval)

        alns = U.readsInWindow(alnFile, subWin,
                               depthLimit=depthLimit,
                               minMapQV=arrowConfig.minMapQV,
                               strategy="long-and-strand-balanced",
                               stratum=options.readStratum,
                               barcode=options.barcode)
        clippedAlns = U.filterAlns(
            subWin, [aln.clippedTo(*interval) for aln in alns], arrowConfig)

        numSpanning = sum(1 for a in clippedAlns
                          if a.spansReferenceRange(*interval))
        if numSpanning < arrowConfig.minPoaCoverage:
            # Too few spanning reads: fall back to the no-call consensus.
            subConsensi.append(ArrowConsensus.noCallConsensus(
                arrowConfig.noEvidenceConsensus, subWin, intRefSeq))
            continue

        logging.debug("%s: Reads being used: %s",
                      reference.windowToString(subWin),
                      " ".join([str(hit.readName) for hit in alns]))

        # Passed to consensusForAlignments as `alnsUsed`; None when
        # effective coverage is not being reported.
        alnsUsed = [] if options.reportEffectiveCoverage else None
        css = U.consensusForAlignments(subWin,
                                       intRefSeq,
                                       clippedAlns,
                                       arrowConfig,
                                       alnsUsed=alnsUsed)

        # Tabulate the coverage implied by these alignments, as
        # well as the post-filtering ("effective") coverage
        siteCoverage = U.coverageInWindow(subWin, alns)
        if options.reportEffectiveCoverage:
            effectiveSiteCoverage = U.coverageInWindow(subWin, alnsUsed)
        else:
            effectiveSiteCoverage = None

        variants_, newPureCss = U.variantsFromConsensus(
            subWin, intRefSeq, css.sequence, css.confidence,
            siteCoverage, effectiveSiteCoverage,
            options.aligner, ai=None,
            diploid=arrowConfig.polishDiploid)

        # Annotate?
        if options.annotateGFF:
            annotateVariants(variants_, clippedAlns)

        variants += variants_

        # The nascent consensus sequence might contain ambiguous bases, these
        # need to be removed as software in the wild cannot deal with such
        # characters and we only use IUPAC for *internal* bookkeeping.
        if arrowConfig.polishDiploid:
            css.sequence = newPureCss

        subConsensi.append(css)

    # 4) glue the subwindow consensus objects together to form the
    #    full window consensus
    css = join(subConsensi)

    # 5) Return
    return css, variants