def consensusForAlignments(refWindow, refSequence, alns, quiverConfig):
    """
    Call consensus on this interval---without subdividing the interval
    further.

    Testable!

    Clipping has already been done!

    Arguments:
      refWindow    -- 3-element window; the second and third elements are
                      used as the reference start and end of the interval.
      refSequence  -- reference sequence for the window.  It is aligned
                      against the POA consensus and is also handed to the
                      no-call fallback.
      alns         -- alignments for the window.  Only those spanning
                      [refStart, refEnd) seed the POA; all of them are
                      loaded into the mutation scorer.
      quiverConfig -- configuration object; this function reads
                      minPoaCoverage, maxPoaCoverage, noEvidenceConsensus,
                      extractMappedRead, ccQuiverConfigTbl,
                      refineDinucleotideRepeats and computeConfidence.

    Returns a QuiverConsensus built from the refined template when
    refinement converges; otherwise (or when the POA cannot be built) the
    result of QuiverConsensus.noCallConsensus for this window.

    Raises AssertionError if fewer than quiverConfig.minPoaCoverage reads
    span the window (callers are expected to have checked this).
    """
    _, refStart, refEnd = refWindow

    # Compute the POA consensus, which is our initial guess, and
    # should typically be > 99.5% accurate
    fwdSequences = [a.read(orientation="genomic", aligned=False)
                    for a in alns
                    if a.spansReferenceRange(refStart, refEnd)]
    assert len(fwdSequences) >= quiverConfig.minPoaCoverage

    try:
        p = cc.PoaConsensus.FindConsensus(
            fwdSequences[:quiverConfig.maxPoaCoverage])
    except Exception:
        # Deliberately best-effort: a failed POA yields a no-call.  Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate instead of being turned into a no-call.
        logging.info("%s: POA could not be generated", refWindow)
        return QuiverConsensus.noCallConsensus(
            quiverConfig.noEvidenceConsensus, refWindow, refSequence)

    # Fetch the POA sequence once; it is both aligned against the reference
    # and used as the starting template for the mutation scorer.
    poaCss = p.Sequence()
    ga = cc.Align(refSequence, poaCss)

    # Extract reads into ConsensusCore-compatible objects, and map them into
    # the coordinates relative to the POA consensus
    mappedReads = [quiverConfig.extractMappedRead(aln, refStart)
                   for aln in alns]
    queryPositions = cc.TargetToQueryPositions(ga)
    mappedReads = [lifted(queryPositions, mr) for mr in mappedReads]

    # Load the mapped reads into the mutation scorer, and iterate
    # until convergence.
    configTbl = quiverConfig.ccQuiverConfigTbl
    mms = cc.SparseSseQvMultiReadMutationScorer(configTbl, poaCss)
    for mr in mappedReads:
        mms.AddRead(mr)

    # Iterate until convergence
    _, quiverConverged = refineConsensus(mms, quiverConfig)
    if not quiverConverged:
        logging.info("%s: Quiver did not converge to MLE", refWindow)
        return QuiverConsensus.noCallConsensus(
            quiverConfig.noEvidenceConsensus, refWindow, refSequence)

    if quiverConfig.refineDinucleotideRepeats:
        refineDinucleotideRepeats(mms)
    quiverCss = mms.Template()

    if quiverConfig.computeConfidence:
        confidence = consensusConfidence(mms)
    else:
        # Confidence not requested: report zeros, one per consensus base.
        confidence = np.zeros(shape=len(quiverCss), dtype=int)

    return QuiverConsensus(refWindow, quiverCss, confidence, mms)
def consensusForAlignments(refWindow, refSequence, alns, quiverConfig):
    """
    Call consensus on this interval---without subdividing the interval
    further.

    Testable!

    Clipping has already been done!

    Arguments:
      refWindow    -- 3-element window; the second and third elements are
                      used as the reference start and end of the interval.
      refSequence  -- reference sequence for the window.  It is aligned
                      against the POA consensus and is also handed to the
                      no-call fallback.
      alns         -- alignments for the window.  Only those spanning
                      [refStart, refEnd) seed the POA; all of them are
                      loaded into the mutation scorer.
      quiverConfig -- configuration object; this function reads
                      minPoaCoverage, maxPoaCoverage, noEvidenceConsensus,
                      extractMappedRead, ccQuiverConfigTbl,
                      refineDinucleotideRepeats and computeConfidence.

    Returns a QuiverConsensus built from the refined template when
    refinement converges; otherwise (or when the POA cannot be built) the
    result of QuiverConsensus.noCallConsensus for this window.

    Raises AssertionError if fewer than quiverConfig.minPoaCoverage reads
    span the window (callers are expected to have checked this).
    """
    _, refStart, refEnd = refWindow

    # Compute the POA consensus, which is our initial guess, and
    # should typically be > 99.5% accurate
    fwdSequences = [a.read(orientation="genomic", aligned=False)
                    for a in alns
                    if a.spansReferenceRange(refStart, refEnd)]
    assert len(fwdSequences) >= quiverConfig.minPoaCoverage

    try:
        p = cc.PoaConsensus.FindConsensus(
            fwdSequences[:quiverConfig.maxPoaCoverage])
    except Exception:
        # Deliberately best-effort: a failed POA yields a no-call.  Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate instead of being turned into a no-call.
        logging.info("%s: POA could not be generated", refWindow)
        return QuiverConsensus.noCallConsensus(
            quiverConfig.noEvidenceConsensus, refWindow, refSequence)

    # Here the POA result exposes its sequence as an attribute (not a call).
    # It is both aligned against the reference and used as the starting
    # template for the mutation scorer.
    poaCss = p.Sequence
    ga = cc.Align(refSequence, poaCss)

    # Extract reads into ConsensusCore-compatible objects, and map them into
    # the coordinates relative to the POA consensus
    mappedReads = [quiverConfig.extractMappedRead(aln, refStart)
                   for aln in alns]
    queryPositions = cc.TargetToQueryPositions(ga)
    mappedReads = [lifted(queryPositions, mr) for mr in mappedReads]

    # Load the mapped reads into the mutation scorer, and iterate
    # until convergence.
    configTbl = quiverConfig.ccQuiverConfigTbl
    mms = cc.SparseSseQvMultiReadMutationScorer(configTbl, poaCss)
    for mr in mappedReads:
        mms.AddRead(mr)

    # Iterate until convergence
    _, quiverConverged = refineConsensus(mms, quiverConfig)
    if not quiverConverged:
        logging.info("%s: Quiver did not converge to MLE", refWindow)
        return QuiverConsensus.noCallConsensus(
            quiverConfig.noEvidenceConsensus, refWindow, refSequence)

    if quiverConfig.refineDinucleotideRepeats:
        refineDinucleotideRepeats(mms)
    quiverCss = mms.Template()

    if quiverConfig.computeConfidence:
        confidence = consensusConfidence(mms)
    else:
        # Confidence not requested: report zeros, one per consensus base.
        confidence = np.zeros(shape=len(quiverCss), dtype=int)

    return QuiverConsensus(refWindow, quiverCss, confidence, mms)
def onChunk(self, workChunk):
    """
    Produce the consensus and variants for a single work chunk.

    Reads workChunk.window and workChunk.hasCoverage.  Returns a pair
    (referenceWindow, (consensus, variants)), where `variants` is an empty
    list when the chunk has no coverage.
    """
    window = workChunk.window
    refId, refStart, refEnd = window
    windowRefSeq = reference.sequenceInWindow(window)

    # Quick cutout for no-coverage case
    if not workChunk.hasCoverage:
        emptyCss = QuiverConsensus.noCallConsensus(
            self.quiverConfig.noEvidenceConsensus, window, windowRefSeq)
        return (window, (emptyCss, []))

    # General case.
    #
    # We call consensus on the enlarged window and then map back
    # to the reference and clip the consensus at the implied
    # bounds.  This seems to be more reliable than cutting the
    # consensus bluntly.
    enlargedWindow = reference.enlargedReferenceWindow(
        window, options.referenceChunkOverlap)
    _, enlargedStart, enlargedEnd = enlargedWindow
    contigSeq = reference.byName[refId].sequence
    enlargedRefSeq = contigSeq[enlargedStart:enlargedEnd]

    # Get the consensus for the enlarged window.
    enlargedCss, enlargedVariants = consensusAndVariantsForWindow(
        self._inAlnFile, enlargedWindow, contigSeq,
        options.coverage, self.quiverConfig)

    # Restrict the consensus and variants to the reference window: align the
    # enlarged reference to the enlarged consensus and look up where the
    # original window's bounds land in consensus coordinates.
    alignment = cc.Align(enlargedRefSeq, enlargedCss.sequence)
    refToCss = cc.TargetToQueryPositions(alignment)
    startOffset = refStart - enlargedStart
    endOffset = refEnd - enlargedStart
    cssStart = refToCss[startOffset]
    cssEnd = refToCss[endOffset]

    clippedSequence = enlargedCss.sequence[cssStart:cssEnd]
    clippedQv = enlargedCss.confidence[cssStart:cssEnd]
    keptVariants = [variant for variant in enlargedVariants
                    if refStart <= variant.refStart < refEnd]

    clippedConsensus = Consensus(window, clippedSequence, clippedQv)
    return (window, (clippedConsensus, keptVariants))
def consensusAndVariantsForWindow(cmpH5, refWindow, referenceContig,
                                  depthLimit, quiverConfig):
    """
    High-level routine for calling the consensus for a window of the
    genome given an alignment file.

    Identifies the coverage contours of the window in order to
    identify subintervals where a good consensus can be called.
    Creates the desired "no evidence consensus" where there is
    inadequate coverage.

    Arguments:
      cmpH5           -- alignment file handle, passed to U.readsInWindow.
      refWindow       -- 3-element window; the second and third elements
                         are used as the window start and end.
      referenceContig -- reference sequence of the contig, sliced with the
                         interval coordinates.
      depthLimit      -- depth limit used when fetching the reads for each
                         interval.
      quiverConfig    -- configuration object; minMapQV, minPoaCoverage and
                         noEvidenceConsensus are read here and the object is
                         passed on to the helpers.

    Returns (css, variants): the joined consensus for the whole window and
    the list of filtered variants accumulated over all of its intervals.
    """
    _, winStart, winEnd = refWindow
    logging.info("Quiver operating on %s",
                 reference.windowToString(refWindow))

    if options.fancyChunking:
        # 1) identify the intervals with adequate coverage for quiver
        #    consensus; restrict to intervals of length > 10
        alnHits = U.readsInWindow(cmpH5, refWindow,
                                  depthLimit=20000,
                                  minMapQV=quiverConfig.minMapQV,
                                  strategy="long-and-strand-balanced",
                                  stratum=options.readStratum,
                                  barcode=options.barcode)
        # Builtin int is what the removed np.int alias referred to.
        starts = np.fromiter((hit.tStart for hit in alnHits), int)
        ends = np.fromiter((hit.tEnd for hit in alnHits), int)
        intervals = kSpannedIntervals(refWindow,
                                      quiverConfig.minPoaCoverage,
                                      starts, ends, minLength=10)
        coverageGaps = holes(refWindow, intervals)
        allIntervals = sorted(intervals + coverageGaps)
        if len(allIntervals) > 1:
            logging.info("Usable coverage in %s: %r",
                         reference.windowToString(refWindow), intervals)
    else:
        allIntervals = [(winStart, winEnd)]

    # 2) pull out the reads we will use for each interval
    # 3) call consensusForAlignments on the interval
    subConsensi = []
    variants = []

    for interval in allIntervals:
        intStart, intEnd = interval
        # Reference for this interval; used for consensus calling, variant
        # calling, evidence dumping and the no-call fallback alike.
        intRefSeq = referenceContig[intStart:intEnd]
        subWin = subWindow(refWindow, interval)

        alns = U.readsInWindow(cmpH5, subWin,
                               depthLimit=depthLimit,
                               minMapQV=quiverConfig.minMapQV,
                               strategy="long-and-strand-balanced",
                               stratum=options.readStratum,
                               barcode=options.barcode)
        clippedAlns_ = [aln.clippedTo(*interval) for aln in alns]
        clippedAlns = U.filterAlns(subWin, clippedAlns_, quiverConfig)

        numSpanning = sum(1 for a in clippedAlns
                          if a.spansReferenceRange(*interval))

        if numSpanning >= quiverConfig.minPoaCoverage:
            logging.debug("%s: Reads being used: %s",
                          reference.windowToString(subWin),
                          " ".join([str(hit.readName) for hit in alns]))

            css = U.consensusForAlignments(subWin, intRefSeq,
                                           clippedAlns, quiverConfig)

            # Coverage is computed from the unclipped, unfiltered reads.
            siteCoverage = U.coverageInWindow(subWin, alns)

            if options.diploid:
                variants_ = diploid.variantsFromConsensus(
                    subWin, intRefSeq,
                    css.sequence, css.confidence, siteCoverage,
                    options.aligner, css.mms)
            else:
                variants_ = U.variantsFromConsensus(
                    subWin, intRefSeq,
                    css.sequence, css.confidence, siteCoverage,
                    options.aligner, mms=None)

            filteredVars = filterVariants(options.minCoverage,
                                          options.minConfidence,
                                          variants_)
            # Annotate?
            if options.annotateGFF:
                annotateVariants(filteredVars, clippedAlns)

            variants += filteredVars

            # Dump?
            # NOTE(review): the "variants" mode tests the list accumulated
            # over all intervals so far, not just this interval's variants;
            # kept as in the original -- confirm this is intended.
            shouldDumpEvidence = (
                options.dumpEvidence == "all" or
                (options.dumpEvidence == "variants" and len(variants) > 0))
            if shouldDumpEvidence:
                dumpEvidence(options.evidenceDirectory,
                             subWin, intRefSeq,
                             clippedAlns, css)
        else:
            css = QuiverConsensus.noCallConsensus(
                quiverConfig.noEvidenceConsensus, subWin, intRefSeq)

        subConsensi.append(css)

    # 4) glue the subwindow consensus objects together to form the
    #    full window consensus
    css = join(subConsensi)

    # 5) Return
    return css, variants