コード例 #1
0
ファイル: utils.py プロジェクト: hainm/GenomicConsensus
def consensusForAlignments(refWindow, refSequence, alns, arrowConfig):
    """
    Call consensus on this interval---without subdividing the interval
    further.

    Testable!

    Clipping has already been done!

    A POA consensus of the reads spanning the window serves as the draft;
    the reads are then loaded into a ``cc.MultiMolecularIntegrator`` and
    the draft is refined with ``refineConsensus``.

    Parameters:
        refWindow: 3-element sequence; elements 1 and 2 are used as the
            reference start and end of the window.
        refSequence: reference sequence for the window; it is aligned
            against the POA draft and handed to the no-call consensus.
        alns: alignments overlapping the window; each must provide
            ``spansReferenceRange`` and ``read``.
        arrowConfig: configuration object; this function reads
            ``minPoaCoverage``, ``maxPoaCoverage``, ``minZScore``,
            ``computeConfidence`` and ``noEvidenceConsensus`` and calls
            ``extractMappedRead``.

    Returns:
        An ``ArrowConsensus`` for the window.  The result of
        ``ArrowConsensus.noCallConsensus`` is returned instead when the
        POA cannot be generated, when fewer than ``minPoaCoverage`` reads
        are accepted by the integrator, when refinement does not
        converge, or when refinement raises an ``Exception``.

    Raises:
        AssertionError: if fewer than ``minPoaCoverage`` reads span the
            window (a caller precondition; not checked under ``-O``).
    """
    _, refStart, refEnd = refWindow

    def noCall():
        # Single place that builds the "no evidence" fallback result.
        return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                              refWindow, refSequence)

    # Compute the POA consensus, which is our initial guess, and
    # should typically be > 99.5% accurate
    fwdSequences = [
        a.read(orientation="genomic", aligned=False) for a in alns
        if a.spansReferenceRange(refStart, refEnd)
    ]
    assert len(fwdSequences) >= arrowConfig.minPoaCoverage

    try:
        p = cc.PoaConsensus.FindConsensus(
            fwdSequences[:arrowConfig.maxPoaCoverage])
    except Exception:
        # Only ordinary errors are treated as "no POA"; KeyboardInterrupt
        # and SystemExit must still propagate.
        logging.info("%s: POA could not be generated", refWindow)
        return noCall()
    poaCss = p.Sequence
    ga = cc.Align(refSequence, poaCss)

    # Extract reads into ConsensusCore2-compatible objects, and map them into the
    # coordinates relative to the POA consensus
    mappedReads = [
        arrowConfig.extractMappedRead(aln, refStart) for aln in alns
    ]
    queryPositions = cc.TargetToQueryPositions(ga)
    mappedReads = [(lifted(queryPositions, mr), snr)
                   for (mr, snr) in mappedReads]

    # Load the mapped reads into the mutation scorer, and iterate
    # until convergence.
    ai = cc.MultiMolecularIntegrator(
        poaCss, cc.IntegratorConfig(arrowConfig.minZScore))
    coverage = 0
    for (mr, snr) in mappedReads:
        # Skip reads whose lifted template span or length is below 2.
        if (mr.TemplateEnd <= mr.TemplateStart
                or mr.TemplateEnd - mr.TemplateStart < 2 or mr.Length() < 2):
            continue
        if ai.AddRead(mr, snr) == cc.AddReadResult_SUCCESS:
            coverage += 1

    # TODO(lhepler, dalexander): propagate coverage around somehow

    # Explicit check rather than an assert, so the coverage gate is still
    # enforced when Python runs with assertions disabled (-O).
    if coverage < arrowConfig.minPoaCoverage:
        logging.info("%s: Insufficient coverage (%d) to call consensus (%d)",
                     refWindow, coverage, arrowConfig.minPoaCoverage)
        return noCall()

    # Iterate until convergence.  Any ordinary failure while polishing or
    # building the result degrades to a no-call, with the traceback logged.
    try:
        _, converged = refineConsensus(ai, arrowConfig)
        if converged:
            arrowCss = str(ai)
            if arrowConfig.computeConfidence:
                confidence = consensusConfidence(ai)
            else:
                confidence = np.zeros(shape=len(arrowCss), dtype=int)
            return ArrowConsensus(refWindow, arrowCss, confidence, ai)
    except Exception:
        traceback = ''.join(format_exception(*sys.exc_info()))
        logging.info("%s: %s", refWindow, traceback)
        return noCall()

    logging.info("%s: Arrow did not converge to MLE", refWindow)
    return noCall()
コード例 #2
0
ファイル: utils.py プロジェクト: xxz19900/GenomicConsensus
def consensusForAlignments(refWindow,
                           refSequence,
                           alns,
                           arrowConfig,
                           draft=None,
                           polish=True,
                           alnsUsed=None):
    """
    Compute the consensus for a single reference window, treating the
    window as one unit (it is never subdivided here).

    The alignments must already be clipped to the window.

    Arguments:
      - `refWindow`: 3-element sequence; elements 1 and 2 are the
        reference start and end.
      - `refSequence`: reference sequence of the window.
      - `alns`: the alignments to call consensus from.
      - `arrowConfig`: configuration object supplying thresholds and
        the read extraction routine.
      - `draft`: optional starting sequence for polishing.  When it is
        None, a POA consensus of the reads spanning the window is
        generated and used as the draft.
      - `polish`: when False, the draft is returned as the consensus
        (with all-zero confidence) and no polishing is performed.
      - `alnsUsed`: optional output list.  It must be empty on entry;
        on return it holds the alignments that were actually added to
        the integrator.

    Returns an ArrowConsensus object.  A no-call consensus is returned
    when the POA cannot be generated, when too few reads are usable, or
    when the (haploid) polishing does not converge.
    """
    _, refStart, refEnd = refWindow

    recordUsed = alnsUsed is not None
    if recordUsed:
        assert alnsUsed == []

    def noCall():
        # Fallback result shared by every failure path below.
        return ArrowConsensus.noCallConsensus(
            arrowConfig.noEvidenceConsensus, refWindow, refSequence)

    if draft is None:
        # No draft supplied: build one from the POA of the reads that
        # span the whole window.
        spanning = []
        for a in alns:
            if a.spansReferenceRange(refStart, refEnd):
                spanning.append(a.read(orientation="genomic", aligned=False))
        assert len(spanning) >= arrowConfig.minPoaCoverage

        try:
            poa = poaConsensus(spanning, arrowConfig)
        except Exception:
            logging.info("%s: POA could not be generated" % (refWindow, ))
            return noCall()
        draft = poa.Sequence

    refToDraft = cc.Align(refSequence, draft)

    # Turn the alignments into mapped-read objects, then lift their
    # coordinates from the reference onto the draft.
    extracted = [
        arrowConfig.extractMappedRead(aln, refStart) for aln in alns
    ]
    draftPositions = cc.TargetToQueryPositions(refToDraft)
    liftedReads = [lifted(draftPositions, read) for read in extracted]

    # Feed the usable reads to the integrator, counting those accepted.
    integrator = cc.Integrator(draft,
                               cc.IntegratorConfig(arrowConfig.minZScore))

    def currentConsensus():
        # Snapshot the integrator's template and its confidence values.
        css = str(integrator)
        if arrowConfig.computeConfidence:
            conf = consensusConfidence(integrator)
        else:
            conf = np.zeros(shape=len(css), dtype=int)
        return css, conf

    coverage = 0
    for aln, read in zip(alns, liftedReads):
        templateSpan = read.TemplateEnd - read.TemplateStart
        if templateSpan < 2 or read.Length() < 2:
            continue

        if not sufficientlyAccurate(read, draft, arrowConfig.minAccuracy):
            tpl = draft[read.TemplateStart:read.TemplateEnd]
            strand = read.Strand
            if strand == cc.StrandType_REVERSE:
                tpl = reverseComplement(tpl)
            elif strand != cc.StrandType_FORWARD:
                tpl = "INACTIVE/UNMAPPED"
            logging.debug(
                "%s: skipping read '%s' due to insufficient accuracy, (poa, read): ('%s', '%s')"
                % (refWindow, read.Name, tpl, read.Seq))
            continue

        if integrator.AddRead(read) != cc.State_VALID:
            continue
        coverage += 1
        if recordUsed:
            alnsUsed.append(aln)

    if coverage < arrowConfig.minPoaCoverage:
        logging.info("%s: Inadequate coverage to call consensus" %
                     (refWindow, ))
        return noCall()

    if not polish:
        return ArrowConsensus(refWindow, draft,
                              np.zeros(len(draft), dtype=int), integrator)

    # Haploid polishing: iterate until convergence.
    _, converged = refineConsensus(integrator, arrowConfig,
                                   polishDiploid=False)
    if not converged:
        logging.info("%s: Arrow did not converge to MLE" % (refWindow, ))
        return noCall()
    arrowCss, confidence = currentConsensus()

    if arrowConfig.polishDiploid:
        # Additional rounds of diploid polishing; on failure the haploid
        # consensus and confidence computed above are kept.
        _, converged = refineConsensus(integrator, arrowConfig,
                                       polishDiploid=True)
        if converged:
            arrowCss, confidence = currentConsensus()
        else:
            logging.info(
                "%s: Arrow (diploid) did not converge to optimal solution" %
                (refWindow, ))

    return ArrowConsensus(refWindow, arrowCss, confidence, integrator)