Ejemplo n.º 1
0
def consensusForAlignments(refWindow, refSequence, alns, quiverConfig):
    """
    Call consensus on this interval---without subdividing the interval
    further.

    Testable!

    Clipping has already been done!

    Parameters:
      refWindow    -- 3-tuple; its last two elements are used as the
                      start and end of the interval.
      refSequence  -- reference sequence for the interval; it is aligned
                      against the POA consensus and passed to the
                      no-call fallback.
      alns         -- alignments for the interval.  Only those spanning
                      the whole interval feed the POA; all of them are
                      handed to the mutation scorer.
      quiverConfig -- configuration object; this function reads
                      minPoaCoverage, maxPoaCoverage, noEvidenceConsensus,
                      ccQuiverConfigTbl, refineDinucleotideRepeats and
                      computeConfidence, and calls extractMappedRead.

    Returns a QuiverConsensus.  When the POA cannot be built or the
    refinement does not converge, the result of
    QuiverConsensus.noCallConsensus is returned instead.

    Raises AssertionError if fewer than quiverConfig.minPoaCoverage
    reads span the interval.
    """
    _, refStart, refEnd = refWindow

    # Compute the POA consensus, which is our initial guess, and
    # should typically be > 99.5% accurate
    fwdSequences = [
        a.read(orientation="genomic", aligned=False) for a in alns
        if a.spansReferenceRange(refStart, refEnd)
    ]
    assert len(fwdSequences) >= quiverConfig.minPoaCoverage

    try:
        p = cc.PoaConsensus.FindConsensus(
            fwdSequences[:quiverConfig.maxPoaCoverage])
    except Exception:
        # Deliberate best-effort: any POA failure degrades to a no-call.
        # Catching Exception (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        logging.info("%s: POA could not be generated" % (refWindow, ))
        return QuiverConsensus.noCallConsensus(
            quiverConfig.noEvidenceConsensus, refWindow, refSequence)
    poaCss = p.Sequence()
    ga = cc.Align(refSequence, poaCss)
    # NOTE(review): numPoaVariants is not used below; kept as-is in case
    # the Errors() call matters to the aligner object -- confirm.
    numPoaVariants = ga.Errors()

    # Extract reads into ConsensusCore-compatible objects, and map them into the
    # coordinates relative to the POA consensus
    queryPositions = cc.TargetToQueryPositions(ga)
    mappedReads = [
        lifted(queryPositions, quiverConfig.extractMappedRead(aln, refStart))
        for aln in alns
    ]

    # Load the mapped reads into the mutation scorer, and iterate
    # until convergence.
    configTbl = quiverConfig.ccQuiverConfigTbl
    mms = cc.SparseSseQvMultiReadMutationScorer(configTbl, poaCss)
    for mr in mappedReads:
        mms.AddRead(mr)

    # Iterate until convergence
    _, quiverConverged = refineConsensus(mms, quiverConfig)
    if not quiverConverged:
        logging.info("%s: Quiver did not converge to MLE" % (refWindow, ))
        return QuiverConsensus.noCallConsensus(
            quiverConfig.noEvidenceConsensus, refWindow, refSequence)

    if quiverConfig.refineDinucleotideRepeats:
        refineDinucleotideRepeats(mms)
    quiverCss = mms.Template()
    if quiverConfig.computeConfidence:
        confidence = consensusConfidence(mms)
    else:
        # Confidence computation disabled: report zeros, one per
        # consensus position.
        confidence = np.zeros(shape=len(quiverCss), dtype=int)
    return QuiverConsensus(refWindow, quiverCss, confidence, mms)
Ejemplo n.º 2
0
def consensusForAlignments(refWindow, refSequence, alns, quiverConfig):
    """
    Call consensus on this interval---without subdividing the interval
    further.

    Testable!

    Clipping has already been done!

    Parameters:
      refWindow    -- 3-tuple; its last two elements are used as the
                      start and end of the interval.
      refSequence  -- reference sequence for the interval; it is aligned
                      against the POA consensus and passed to the
                      no-call fallback.
      alns         -- alignments for the interval.  Only those spanning
                      the whole interval feed the POA; all of them are
                      handed to the mutation scorer.
      quiverConfig -- configuration object; this function reads
                      minPoaCoverage, maxPoaCoverage, noEvidenceConsensus,
                      ccQuiverConfigTbl, refineDinucleotideRepeats and
                      computeConfidence, and calls extractMappedRead.

    Returns a QuiverConsensus.  When the POA cannot be built or the
    refinement does not converge, the result of
    QuiverConsensus.noCallConsensus is returned instead.

    Raises AssertionError if fewer than quiverConfig.minPoaCoverage
    reads span the interval.
    """
    _, refStart, refEnd = refWindow

    # Compute the POA consensus, which is our initial guess, and
    # should typically be > 99.5% accurate
    fwdSequences = [a.read(orientation="genomic", aligned=False)
                    for a in alns
                    if a.spansReferenceRange(refStart, refEnd)]
    assert len(fwdSequences) >= quiverConfig.minPoaCoverage

    try:
        p = cc.PoaConsensus.FindConsensus(fwdSequences[:quiverConfig.maxPoaCoverage])
    except Exception:
        # Deliberate best-effort: any POA failure degrades to a no-call.
        # Catching Exception (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        logging.info("%s: POA could not be generated" % (refWindow,))
        return QuiverConsensus.noCallConsensus(quiverConfig.noEvidenceConsensus,
                                               refWindow, refSequence)
    # In this variant the consensus sequence is read as an attribute,
    # not called as a method.
    poaCss = p.Sequence
    ga = cc.Align(refSequence, poaCss)
    # NOTE(review): numPoaVariants is not used below; kept as-is in case
    # the Errors() call matters to the aligner object -- confirm.
    numPoaVariants = ga.Errors()

    # Extract reads into ConsensusCore-compatible objects, and map them into the
    # coordinates relative to the POA consensus
    queryPositions = cc.TargetToQueryPositions(ga)
    mappedReads = [lifted(queryPositions,
                          quiverConfig.extractMappedRead(aln, refStart))
                   for aln in alns]

    # Load the mapped reads into the mutation scorer, and iterate
    # until convergence.
    configTbl = quiverConfig.ccQuiverConfigTbl
    mms = cc.SparseSseQvMultiReadMutationScorer(configTbl, poaCss)
    for mr in mappedReads:
        mms.AddRead(mr)

    # Iterate until convergence
    _, quiverConverged = refineConsensus(mms, quiverConfig)
    if not quiverConverged:
        logging.info("%s: Quiver did not converge to MLE" % (refWindow,))
        return QuiverConsensus.noCallConsensus(quiverConfig.noEvidenceConsensus,
                                               refWindow, refSequence)

    if quiverConfig.refineDinucleotideRepeats:
        refineDinucleotideRepeats(mms)
    quiverCss = mms.Template()
    if quiverConfig.computeConfidence:
        confidence = consensusConfidence(mms)
    else:
        # Confidence computation disabled: report zeros, one per
        # consensus position.
        confidence = np.zeros(shape=len(quiverCss), dtype=int)
    return QuiverConsensus(refWindow,
                           quiverCss,
                           confidence,
                           mms)
Ejemplo n.º 3
0
    def onChunk(self, workChunk):
        """
        Produce the consensus and variants for one work chunk.

        Reads workChunk.window (unpacked as refId, refStart, refEnd) and
        workChunk.hasCoverage.  Returns a pair
        (window, (consensus, variants)).  When the chunk has no
        coverage, the consensus is a no-call consensus and the variant
        list is empty.
        """
        window = workChunk.window
        refId, refStart, refEnd = window

        windowRefSeq = reference.sequenceInWindow(window)

        # Nothing to call when the chunk has no coverage.
        if not workChunk.hasCoverage:
            emptyCss = QuiverConsensus.noCallConsensus(
                self.quiverConfig.noEvidenceConsensus, window, windowRefSeq)
            return (window, (emptyCss, []))

        # Consensus is called on a window enlarged by the configured
        # overlap, then mapped back to the reference and clipped at the
        # implied bounds.  This seems to be more reliable than cutting
        # the consensus bluntly.
        enlarged = reference.enlargedReferenceWindow(
            window, options.referenceChunkOverlap)
        _, eStart, eEnd = enlarged

        contigSeq = reference.byName[refId].sequence
        enlargedRefSeq = contigSeq[eStart:eEnd]

        # Consensus and variants over the enlarged window.
        enlargedCss, enlargedVariants = consensusAndVariantsForWindow(
            self._inAlnFile, enlarged, contigSeq,
            options.coverage, self.quiverConfig)

        # Translate the requested window's bounds into consensus
        # coordinates via the reference-to-consensus alignment.
        alignment = cc.Align(enlargedRefSeq, enlargedCss.sequence)
        refToCss = cc.TargetToQueryPositions(alignment)
        lo = refToCss[refStart - eStart]
        hi = refToCss[refEnd - eStart]

        clippedSeq = enlargedCss.sequence[lo:hi]
        clippedQv = enlargedCss.confidence[lo:hi]

        # Keep only variants that start inside the requested window.
        keptVariants = []
        for v in enlargedVariants:
            if refStart <= v.refStart < refEnd:
                keptVariants.append(v)

        result = Consensus(window, clippedSeq, clippedQv)
        return (window, (result, keptVariants))
Ejemplo n.º 4
0
    def onChunk(self, workChunk):
        """
        Call consensus and variants for a single work chunk.

        The chunk supplies its reference window (refId, refStart, refEnd)
        and a hasCoverage flag.  The return value is the tuple
        (window, (consensus, variants)); a chunk without coverage yields
        a no-call consensus together with an empty variant list.
        """
        chunkWindow = workChunk.window
        refId, refStart, refEnd = chunkWindow

        chunkRefSeq = reference.sequenceInWindow(chunkWindow)

        if not workChunk.hasCoverage:
            # Fast path: no reads, so emit the configured no-evidence
            # consensus for the window.
            return (chunkWindow,
                    (QuiverConsensus.noCallConsensus(
                        self.quiverConfig.noEvidenceConsensus,
                        chunkWindow, chunkRefSeq),
                     []))

        # Work on an enlarged window, then map back to the reference and
        # clip the consensus at the implied bounds.  This seems to be
        # more reliable than cutting the consensus bluntly.
        bigWindow = reference.enlargedReferenceWindow(
            chunkWindow, options.referenceChunkOverlap)
        _, eStart, eEnd = bigWindow

        wholeContig = reference.byName[refId].sequence
        bigRefSeq = wholeContig[eStart:eEnd]

        # Consensus for the enlarged window.
        bigCss, bigVariants = consensusAndVariantsForWindow(
            self._inAlnFile, bigWindow, wholeContig,
            options.coverage, self.quiverConfig)

        # Restrict the consensus and variants to the chunk's own window.
        positions = cc.TargetToQueryPositions(
            cc.Align(bigRefSeq, bigCss.sequence))
        cssSpan = slice(positions[refStart - eStart],
                        positions[refEnd - eStart])

        trimmedSeq = bigCss.sequence[cssSpan]
        trimmedQv = bigCss.confidence[cssSpan]
        inWindow = list(v for v in bigVariants
                        if refStart <= v.refStart < refEnd)

        return (chunkWindow,
                (Consensus(chunkWindow, trimmedSeq, trimmedQv), inWindow))
Ejemplo n.º 5
0
def consensusAndVariantsForWindow(cmpH5, refWindow, referenceContig,
                                  depthLimit, quiverConfig):
    """
    High-level routine for calling the consensus for a
    window of the genome given an alignment file.

    Identifies the coverage contours of the window in order to
    identify subintervals where a good consensus can be called.
    Creates the desired "no evidence consensus" where there is
    inadequate coverage.

    Parameters:
      cmpH5           -- alignment file handle passed to U.readsInWindow.
      refWindow       -- 3-tuple (winId, winStart, winEnd).
      referenceContig -- reference sequence, sliced by interval bounds.
      depthLimit      -- depth limit used when fetching the reads for
                         each interval.
      quiverConfig    -- configuration object; minMapQV, minPoaCoverage
                         and noEvidenceConsensus are read here.

    Returns (css, variants): the joined per-interval consensus objects
    and the list of filtered variants accumulated over all intervals.
    """
    winId, winStart, winEnd = refWindow
    logging.info("Quiver operating on %s" %
                 reference.windowToString(refWindow))

    if options.fancyChunking:
        # 1) identify the intervals with adequate coverage for quiver
        #    consensus; restrict to intervals of length > 10
        alnHits = U.readsInWindow(cmpH5, refWindow,
                                  depthLimit=20000,
                                  minMapQV=quiverConfig.minMapQV,
                                  strategy="long-and-strand-balanced",
                                  stratum=options.readStratum,
                                  barcode=options.barcode)
        # The builtin int replaces the np.int alias, which newer NumPy
        # releases no longer provide.
        starts = np.fromiter((hit.tStart for hit in alnHits), int)
        ends = np.fromiter((hit.tEnd for hit in alnHits), int)
        intervals = kSpannedIntervals(refWindow, quiverConfig.minPoaCoverage,
                                      starts, ends, minLength=10)
        coverageGaps = holes(refWindow, intervals)
        allIntervals = sorted(intervals + coverageGaps)
        if len(allIntervals) > 1:
            logging.info("Usable coverage in %s: %r" %
                         (reference.windowToString(refWindow), intervals))

    else:
        allIntervals = [(winStart, winEnd)]

    # 2) pull out the reads we will use for each interval
    # 3) call consensusForAlignments on the interval
    subConsensi = []
    variants = []

    for interval in allIntervals:
        intStart, intEnd = interval
        # One slice serves both the consensus call and the variant call.
        intRefSeq = referenceContig[intStart:intEnd]
        subWin = subWindow(refWindow, interval)

        alns = U.readsInWindow(cmpH5, subWin,
                               depthLimit=depthLimit,
                               minMapQV=quiverConfig.minMapQV,
                               strategy="long-and-strand-balanced",
                               stratum=options.readStratum,
                               barcode=options.barcode)
        clippedAlns_ = [aln.clippedTo(*interval) for aln in alns]
        clippedAlns = U.filterAlns(subWin, clippedAlns_, quiverConfig)

        numSpanning = sum(1 for a in clippedAlns
                          if a.spansReferenceRange(*interval))

        if numSpanning >= quiverConfig.minPoaCoverage:

            logging.debug("%s: Reads being used: %s" %
                          (reference.windowToString(subWin),
                           " ".join([str(hit.readName) for hit in alns])))

            css = U.consensusForAlignments(subWin,
                                           intRefSeq,
                                           clippedAlns,
                                           quiverConfig)

            siteCoverage = U.coverageInWindow(subWin, alns)

            if options.diploid:
                variants_ = diploid.variantsFromConsensus(subWin, intRefSeq,
                                                          css.sequence, css.confidence, siteCoverage,
                                                          options.aligner,
                                                          css.mms)
            else:
                variants_ = U.variantsFromConsensus(subWin, intRefSeq,
                                                    css.sequence, css.confidence, siteCoverage,
                                                    options.aligner,
                                                    mms=None)

            filteredVars = filterVariants(options.minCoverage,
                                          options.minConfidence,
                                          variants_)
            # Annotate?
            if options.annotateGFF:
                annotateVariants(filteredVars, clippedAlns)

            variants += filteredVars

            # Dump?  The explicit parentheses spell out the original
            # and-before-or precedence.
            # NOTE(review): the "variants" test looks at the list
            # accumulated over all intervals so far, not just this
            # interval's filteredVars -- confirm this is intended.
            shouldDumpEvidence = (
                (options.dumpEvidence == "all") or
                ((options.dumpEvidence == "variants") and (len(variants) > 0)))
            if shouldDumpEvidence:
                dumpEvidence(options.evidenceDirectory,
                             subWin, intRefSeq,
                             clippedAlns, css)
        else:
            # Too few spanning reads: emit the no-evidence consensus.
            css = QuiverConsensus.noCallConsensus(quiverConfig.noEvidenceConsensus,
                                                  subWin, intRefSeq)
        subConsensi.append(css)

    # 4) glue the subwindow consensus objects together to form the
    #    full window consensus
    css = join(subConsensi)

    # 5) Return
    return css, variants
Ejemplo n.º 6
0
def consensusAndVariantsForWindow(cmpH5, refWindow, referenceContig,
                                  depthLimit, quiverConfig):
    """
    High-level routine for calling the consensus for a
    window of the genome given an alignment file.

    Identifies the coverage contours of the window in order to
    identify subintervals where a good consensus can be called.
    Creates the desired "no evidence consensus" where there is
    inadequate coverage.

    Parameters:
      cmpH5           -- alignment file handle passed to U.readsInWindow.
      refWindow       -- 3-tuple (winId, winStart, winEnd).
      referenceContig -- reference sequence, sliced by interval bounds.
      depthLimit      -- depth limit used when fetching the reads for
                         each interval.
      quiverConfig    -- configuration object; minMapQV, minPoaCoverage
                         and noEvidenceConsensus are read here.

    Returns (css, variants): the joined per-interval consensus objects
    and the list of filtered variants accumulated over all intervals.
    """
    winId, winStart, winEnd = refWindow
    logging.info("Quiver operating on %s" %
                 reference.windowToString(refWindow))

    if options.fancyChunking:
        # 1) identify the intervals with adequate coverage for quiver
        #    consensus; restrict to intervals of length > 10
        alnHits = U.readsInWindow(cmpH5, refWindow,
                                  depthLimit=20000,
                                  minMapQV=quiverConfig.minMapQV,
                                  strategy="long-and-strand-balanced",
                                  stratum=options.readStratum,
                                  barcode=options.barcode)
        # The builtin int replaces the np.int alias, which newer NumPy
        # releases no longer provide.
        starts = np.fromiter((hit.tStart for hit in alnHits), int)
        ends = np.fromiter((hit.tEnd for hit in alnHits), int)
        intervals = kSpannedIntervals(refWindow, quiverConfig.minPoaCoverage,
                                      starts, ends, minLength=10)
        coverageGaps = holes(refWindow, intervals)
        allIntervals = sorted(intervals + coverageGaps)
        if len(allIntervals) > 1:
            logging.info("Usable coverage in %s: %r" %
                         (reference.windowToString(refWindow), intervals))

    else:
        allIntervals = [(winStart, winEnd)]

    # 2) pull out the reads we will use for each interval
    # 3) call consensusForAlignments on the interval
    subConsensi = []
    variants = []

    for interval in allIntervals:
        intStart, intEnd = interval
        # One slice serves both the consensus call and the variant call.
        intRefSeq = referenceContig[intStart:intEnd]
        subWin = subWindow(refWindow, interval)

        alns = U.readsInWindow(cmpH5, subWin,
                               depthLimit=depthLimit,
                               minMapQV=quiverConfig.minMapQV,
                               strategy="long-and-strand-balanced",
                               stratum=options.readStratum,
                               barcode=options.barcode)
        clippedAlns_ = [aln.clippedTo(*interval) for aln in alns]
        clippedAlns = U.filterAlns(subWin, clippedAlns_, quiverConfig)

        numSpanning = sum(1 for a in clippedAlns
                          if a.spansReferenceRange(*interval))

        if numSpanning >= quiverConfig.minPoaCoverage:

            logging.debug("%s: Reads being used: %s" %
                          (reference.windowToString(subWin),
                           " ".join([str(hit.readName) for hit in alns])))

            css = U.consensusForAlignments(subWin,
                                           intRefSeq,
                                           clippedAlns,
                                           quiverConfig)

            siteCoverage = U.coverageInWindow(subWin, alns)

            if options.diploid:
                variants_ = diploid.variantsFromConsensus(subWin, intRefSeq,
                                                          css.sequence, css.confidence, siteCoverage,
                                                          options.aligner,
                                                          css.mms)
            else:
                variants_ = U.variantsFromConsensus(subWin, intRefSeq,
                                                    css.sequence, css.confidence, siteCoverage,
                                                    options.aligner,
                                                    mms=None)

            filteredVars = filterVariants(options.minCoverage,
                                          options.minConfidence,
                                          variants_)
            # Annotate?
            if options.annotateGFF:
                annotateVariants(filteredVars, clippedAlns)

            variants += filteredVars

            # Dump?  The explicit parentheses spell out the original
            # and-before-or precedence.
            # NOTE(review): the "variants" test looks at the list
            # accumulated over all intervals so far, not just this
            # interval's filteredVars -- confirm this is intended.
            shouldDumpEvidence = (
                (options.dumpEvidence == "all") or
                ((options.dumpEvidence == "variants") and (len(variants) > 0)))
            if shouldDumpEvidence:
                dumpEvidence(options.evidenceDirectory,
                             subWin, intRefSeq,
                             clippedAlns, css)
        else:
            # Too few spanning reads: emit the no-evidence consensus.
            css = QuiverConsensus.noCallConsensus(quiverConfig.noEvidenceConsensus,
                                                  subWin, intRefSeq)
        subConsensi.append(css)

    # 4) glue the subwindow consensus objects together to form the
    #    full window consensus
    css = join(subConsensi)

    # 5) Return
    return css, variants