def consensusForAlignments(refWindow, refSequence, alns, arrowConfig):
    """
    Call consensus on this interval---without subdividing the interval
    further.

    Testable!

    Clipping has already been done!

    Parameters:
      refWindow   -- (refId, refStart, refEnd) tuple delimiting the interval.
      refSequence -- reference sequence string for the window.
      alns        -- alignments overlapping the window.
      arrowConfig -- configuration object (minPoaCoverage, maxPoaCoverage,
                     minZScore, computeConfidence, noEvidenceConsensus,
                     extractMappedRead, ...).

    Returns an ArrowConsensus for the window, or a no-call consensus if the
    POA cannot be built or Arrow fails to converge.
    """
    _, refStart, refEnd = refWindow

    # Compute the POA consensus, which is our initial guess, and
    # should typically be > 99.5% accurate
    fwdSequences = [ a.read(orientation="genomic", aligned=False)
                     for a in alns
                     if a.spansReferenceRange(refStart, refEnd) ]
    assert len(fwdSequences) >= arrowConfig.minPoaCoverage

    try:
        # Cap the number of reads fed to the POA for speed.
        p = cc.PoaConsensus.FindConsensus(
            fwdSequences[:arrowConfig.maxPoaCoverage])
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate.
        logging.info("%s: POA could not be generated" % (refWindow,))
        return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                              refWindow, refSequence)
    poaCss = p.Sequence
    ga = cc.Align(refSequence, poaCss)

    # Extract reads into ConsensusCore2-compatible objects, and map them into the
    # coordinates relative to the POA consensus
    mappedReads = [ arrowConfig.extractMappedRead(aln, refStart)
                    for aln in alns ]
    queryPositions = cc.TargetToQueryPositions(ga)
    mappedReads = [ (lifted(queryPositions, mr), snr)
                    for (mr, snr) in mappedReads ]

    # Load the mapped reads into the mutation scorer, and iterate
    # until convergence.
    ai = cc.MultiMolecularIntegrator(poaCss,
                                     cc.IntegratorConfig(arrowConfig.minZScore))
    coverage = 0
    for (mr, snr) in mappedReads:
        # Skip degenerate mappings: empty/inverted template spans or
        # reads too short to score.
        if (mr.TemplateEnd <= mr.TemplateStart or
            mr.TemplateEnd - mr.TemplateStart < 2 or
            mr.Length() < 2):
            continue
        if ai.AddRead(mr, snr) == cc.AddReadResult_SUCCESS:
            coverage += 1

    # TODO(lhepler, dalexander): propagate coverage around somehow

    # Iterate until convergence
    try:
        assert coverage >= arrowConfig.minPoaCoverage, \
            "Insufficient coverage (%d) to call consensus (%d)" \
            % (coverage, arrowConfig.minPoaCoverage)
        _, converged = refineConsensus(ai, arrowConfig)
        assert converged, "Arrow did not converge to MLE"
        arrowCss = str(ai)
        if arrowConfig.computeConfidence:
            confidence = consensusConfidence(ai)
        else:
            confidence = np.zeros(shape=len(arrowCss), dtype=int)
        return ArrowConsensus(refWindow,
                              arrowCss,
                              confidence,
                              ai)
    except Exception:
        # The asserts above raise AssertionError on low coverage or
        # non-convergence; log the full traceback and fall back to no-call.
        # (Named `tbText` to avoid shadowing the `traceback` module name.)
        tbText = ''.join(format_exception(*sys.exc_info()))
        logging.info("%s: %s" % (refWindow, tbText))
        return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                              refWindow, refSequence)
def _cssAndConfidence(ai, arrowConfig):
    """
    Extract (consensus string, per-base confidence array) from an integrator
    after a refinement pass.  Confidence is all-zero unless
    arrowConfig.computeConfidence is set.
    """
    css = str(ai)
    if arrowConfig.computeConfidence:
        confidence = consensusConfidence(ai)
    else:
        confidence = np.zeros(shape=len(css), dtype=int)
    return css, confidence


# NOTE(review): this shadows the earlier definition of the same name in this
# file (last `def` wins at import time) — confirm the earlier variant is dead.
def consensusForAlignments(refWindow, refSequence, alns, arrowConfig,
                           draft=None, polish=True, alnsUsed=None):
    """
    Call consensus on this interval---without subdividing the interval
    further.

    Returns an ArrowConsensus object.

    Requires that clipping has already been done.

    If `draft` is provided, it will serve as the starting
    point for polishing.  If not, the POA will be used to generate a
    draft starting point.

    If `polish` is False, the arrow polishing procedure will not be
    used, and the draft consensus will be returned.

    `alnsUsed` is an output parameter; if not None, it should be an
    empty list on entry; on return from this function, the list will
    contain the alns objects that were actually used to compute the
    consensus (those not filtered out).
    """
    _, refStart, refEnd = refWindow
    if alnsUsed is not None:
        assert alnsUsed == []

    if draft is None:
        # Compute the POA consensus, which is our initial guess, and
        # should typically be > 99.5% accurate
        fwdSequences = [ a.read(orientation="genomic", aligned=False)
                         for a in alns
                         if a.spansReferenceRange(refStart, refEnd) ]
        assert len(fwdSequences) >= arrowConfig.minPoaCoverage

        try:
            p = poaConsensus(fwdSequences, arrowConfig)
        except Exception:
            logging.info("%s: POA could not be generated" % (refWindow,))
            return ArrowConsensus.noCallConsensus(
                arrowConfig.noEvidenceConsensus, refWindow, refSequence)
        draft = p.Sequence

    ga = cc.Align(refSequence, draft)

    # Extract reads into ConsensusCore2-compatible objects, and map them into the
    # coordinates relative to the POA consensus
    mappedReads = [ arrowConfig.extractMappedRead(aln, refStart)
                    for aln in alns ]
    queryPositions = cc.TargetToQueryPositions(ga)
    mappedReads = [ lifted(queryPositions, mr) for mr in mappedReads ]

    # Load the mapped reads into the mutation scorer, and iterate
    # until convergence.
    ai = cc.Integrator(draft, cc.IntegratorConfig(arrowConfig.minZScore))
    coverage = 0
    for i, mr in enumerate(mappedReads):
        # Skip degenerate mappings: empty/inverted template spans or
        # reads too short to score.
        if (mr.TemplateEnd <= mr.TemplateStart or
            mr.TemplateEnd - mr.TemplateStart < 2 or
            mr.Length() < 2):
            continue
        if not sufficientlyAccurate(mr, draft, arrowConfig.minAccuracy):
            # Reconstruct the read-local template slice purely for the
            # debug message below.
            tpl = draft[mr.TemplateStart:mr.TemplateEnd]
            if mr.Strand == cc.StrandType_FORWARD:
                pass
            elif mr.Strand == cc.StrandType_REVERSE:
                tpl = reverseComplement(tpl)
            else:
                tpl = "INACTIVE/UNMAPPED"
            logging.debug("%s: skipping read '%s' due to insufficient accuracy, (poa, read): ('%s', '%s')" % (refWindow, mr.Name, tpl, mr.Seq))
            continue
        if ai.AddRead(mr) == cc.State_VALID:
            coverage += 1
            if alnsUsed is not None:
                alnsUsed.append(alns[i])

    if coverage < arrowConfig.minPoaCoverage:
        logging.info("%s: Inadequate coverage to call consensus" % (refWindow,))
        return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                              refWindow, refSequence)

    if not polish:
        # Caller asked for the unpolished draft; confidence is undefined.
        confidence = np.zeros(len(draft), dtype=int)
        return ArrowConsensus(refWindow, draft, confidence, ai)

    # Iterate until convergence
    _, converged = refineConsensus(ai, arrowConfig, polishDiploid=False)
    if not converged:
        logging.info("%s: Arrow did not converge to MLE" % (refWindow,))
        return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                              refWindow, refSequence)
    arrowCss, confidence = _cssAndConfidence(ai, arrowConfig)

    if arrowConfig.polishDiploid:
        # additional rounds of diploid polishing
        _, converged = refineConsensus(ai, arrowConfig, polishDiploid=True)
        if converged:
            arrowCss, confidence = _cssAndConfidence(ai, arrowConfig)
        else:
            # Keep the haploid result computed above; diploid polish is
            # best-effort.
            logging.info("%s: Arrow (diploid) did not converge to optimal solution" % (refWindow,))

    return ArrowConsensus(refWindow, arrowCss, confidence, ai)