def mainCmpH5(options):
    """Display the reads for every reference window requested in *options*.

    Opens ``options.inputCmpH5`` as an ``AlignmentSet`` (attaching
    ``options.fofn`` when one is given), optionally loads the reference
    table, and then, for each window in ``options.referenceWindows``:

    1. resizes the window with ``makeDisplayWindow`` using the contig
       length and ``options.width``;
    2. picks the alignments, either the explicit ``options.rowNumbers`` or
       whatever ``readsInWindow`` selects;
    3. prints the window as a GFF-style string;
    4. hands the alignments to ``formatIndividualAlignments`` (when
       ``options.oneAtATime`` is set) or to ``formatWindow``;
    5. prints a blank separator line.

    Returns None; all output goes to stdout.
    """
    alnReader = AlignmentSet(options.inputCmpH5,
                             referenceFastaFname=options.referenceFilename)
    if options.fofn is not None:
        alnReader.attach(options.fofn)

    # The reference table is optional; formatWindow receives None without it.
    if options.referenceFilename:
        referenceTable = loadReferences(options.referenceFilename, alnReader)
    else:
        referenceTable = None

    for refWindow in options.referenceWindows:
        # Look the contig up once; both its name and its length are needed.
        refInfo = alnReader.referenceInfo(refWindow.refId)
        refName = refInfo.FullName
        refWindow = makeDisplayWindow(refInfo.Length, options.width, refWindow)

        # Identity test: an explicit (possibly empty or array-like) row
        # selection must not be confused with "no selection given".
        if options.rowNumbers is not None:
            alns = alnReader[options.rowNumbers]
        else:
            alns = readsInWindow(alnReader, refWindow, options.depth,
                                 minMapQV=options.minMapQV,
                                 strategy=options.sorting)

        print(windowToGffString(Window(refName, refWindow.start, refWindow.end)))

        if options.oneAtATime:
            formatIndividualAlignments(alnReader, refWindow, alns)
        else:
            formatWindow(alnReader, refWindow, alns,
                         referenceTable, options.aligned, options.color,
                         options.realign, options.consensus)
        # Blank line after each window.
        print("")
def mainGff(options):
    """Show the reads around every variant listed in a GFF file.

    The alignment file and reference file names are taken from
    ``extractCmpH5AndReferenceFromGff``; ``options.inputCmpH5`` and
    ``options.referenceFilename`` override them when set.  For each GFF
    record a one-line summary is printed, then the reads in a window
    extending 10 positions past the record on both sides are passed to
    ``formatWindow``, followed by a blank line.

    Returns None; all output goes to stdout.
    """
    gffReader = GffReader(options.inputGff)
    gffCmpH5Fname, gffReferenceFname = extractCmpH5AndReferenceFromGff(gffReader)

    # Allow overriding
    alignmentPath = options.inputCmpH5 or gffCmpH5Fname
    referencePath = options.referenceFilename or gffReferenceFname
    assert alignmentPath
    assert referencePath

    cmpH5 = CmpH5Reader(alignmentPath)
    referenceTable = loadReferences(referencePath, cmpH5)

    for record in gffReader:
        # "-" stands in for a missing reference or variant sequence.
        summary = "(%s > %s)" % (record.get("reference", "-"),
                                 record.get("variantSeq", "-"))
        confidence = record.confidence
        print("%s %s %s %s %s %s" % (record.type, record.seqid, record.start,
                                      record.end, summary, confidence))

        contigId = cmpH5.referenceInfo(record.seqid).ID
        window = Window(contigId, record.start - 10, record.end + 10)
        rows = readsInWindow(cmpH5, window, options.depth,
                             minMapQV=options.minMapQV,
                             strategy=options.sorting)
        # Insertions are displayed unaligned; every other type aligned.
        showAligned = record.type != "insertion"
        formatWindow(cmpH5, window, rows, referenceTable,
                     aligned=showAligned,
                     consensus=options.consensus)
        print("")
def consensus(alnReader, refWindow, referenceTable, alns):
    """Build a ``Consensus`` object covering exactly *refWindow*.

    The window is first enlarged via ``enlargedReferenceWindow``.  The
    enlarged window is split into intervals spanned by at least ``K`` of
    *alns* plus the holes between them.  Each covered interval gets a
    consensus from ``q.utils.consensusForAlignments`` when at least ``K``
    filtered reads remain; every other interval gets
    ``Consensus.nAsConsensus``.  The pieces are joined, aligned back to the
    enlarged reference sequence, and clipped to *refWindow*.

    ``overlap``, ``K``, ``quiverConfig``, ``w``, ``q``, ``cc`` and ``join``
    are not parameters; they are resolved from the enclosing scope.

    Returns the clipped ``Consensus`` (sequence and confidence).
    """
    # identify the enlarged interval (margin given by `overlap`)
    contigName = alnReader.referenceInfo(refWindow.refId).FullName
    contigSeq = referenceTable[contigName].sequence
    eWindow = enlargedReferenceWindow(refWindow, len(contigSeq), overlap)
    eRefSeq = contigSeq[eWindow.start:eWindow.end]
    eStart = eWindow.start

    # find K-spanned intervals in the enlarged interval
    starts = [a.tStart for a in alns]
    ends = [a.tEnd for a in alns]
    coveredIntervals = w.kSpannedIntervals(eWindow, K, starts, ends)
    holes = w.holes(eWindow, coveredIntervals)

    # call css for each interval, in reference order
    pieces = []
    for interval in sorted(coveredIntervals + holes):
        subWin = subWindow(eWindow, interval)
        lo, hi = interval
        intervalRefSeq = eRefSeq[lo - eStart:hi - eStart]

        # Default piece; replaced only if enough good reads are found.
        piece = Consensus.nAsConsensus(subWin, intervalRefSeq)
        if interval in coveredIntervals:
            windowAlns = readsInWindow(alnReader, subWin,
                                       depthLimit=100,
                                       minMapQV=quiverConfig.minMapQV,
                                       strategy="longest")
            clipped = [aln.clippedTo(*interval) for aln in windowAlns]
            usable = q.utils.filterAlns(subWin, clipped, quiverConfig)
            if len(usable) >= K:
                piece = q.utils.consensusForAlignments(subWin,
                                                       intervalRefSeq,
                                                       usable,
                                                       quiverConfig)
        pieces.append(piece)

    # join subconsensus objects
    joined = join(pieces)

    # align css back to refWindow, and clip
    alignment = cc.Align(eRefSeq, joined.sequence)
    refToCss = cc.TargetToQueryPositions(alignment)
    cssStart = refToCss[refWindow.start - eStart]
    cssEnd = refToCss[refWindow.end - eStart]

    return Consensus(refWindow,
                     joined.sequence[cssStart:cssEnd],
                     joined.confidence[cssStart:cssEnd])
def mainGff(options):
    """Show the reads around every variant listed in a GFF file.

    The alignment and reference file names come from
    ``extractCmpH5AndReferenceFromGff``; ``options.inputCmpH5`` and
    ``options.referenceFilename`` override them when set.  Both files must
    exist.  For each GFF record:

    * a one-line summary is printed;
    * a display window is built from the record extended by 10 positions
      on each side and resized by ``makeDisplayWindow``;
    * the alignments are the rows named in the record's ``rows`` attribute
      when present, otherwise those chosen by ``readsInWindow``;
    * ``formatWindow`` renders them;
    * when ``options.pulseRecognizer`` is set, a ``variant-<i>.csv`` file
      is dumped and a link to it is formatted;
    * a blank line is printed.

    Returns None.
    """
    gffReader = GffReader(options.inputGff)
    gffAlnsFname, gffReferenceFname = extractCmpH5AndReferenceFromGff(gffReader)

    # Allow overriding
    alignmentPath = options.inputCmpH5 or gffAlnsFname
    referencePath = options.referenceFilename or gffReferenceFname
    assert os.path.isfile(alignmentPath)
    assert os.path.isfile(referencePath)

    alnReader = AlignmentSet(alignmentPath, referenceFastaFname=referencePath)
    if options.fofn is not None:
        alnReader.attach(options.fofn)
    referenceTable = loadReferences(referencePath, alnReader)

    for index, record in enumerate(gffReader):
        # "-" stands in for a missing reference or variant sequence.
        summary = "(%s > %s)" % (record.get("reference", "-"),
                                 record.get("variantSeq", "-"))
        confidence = record.confidence
        print("%s %s %s %s %s %s" % (record.type, record.seqid, record.start,
                                      record.end, summary, confidence))

        contigLength = alnReader.referenceInfo(record.seqid).Length
        rawWindow = Window(record.seqid, record.start - 10, record.end + 10)
        window = makeDisplayWindow(contigLength, options.width, rawWindow)

        if "rows" in record.attributes:
            # The record carries an explicit comma-separated row list.
            rowIndices = [int(field) for field in record.rows.split(",")]
            alns = alnReader[rowIndices]
        else:
            alns = readsInWindow(alnReader, window, options.depth,
                                 minMapQV=options.minMapQV,
                                 strategy=options.sorting)

        # Insertions are displayed unaligned; every other type aligned.
        showAligned = record.type != "insertion"
        formatWindow(alnReader, window, alns, referenceTable,
                     aligned=showAligned,
                     consensus=options.consensus,
                     useColor=options.color,
                     doRealign=options.realign)

        if options.pulseRecognizer:
            # CSV output for pulse recognizer
            print("")
            csvPath = "variant-" + str(index) + ".csv"
            dumpVariantCsv(csvPath, alnReader, alns, record)
            formatVariantCsvLink(csvPath)
        print("")
def mainCmpH5(options):
    """Display the reads in the single window ``options.referenceWindow``.

    Opens ``options.inputCmpH5`` with ``CmpH5Reader``, replaces the window's
    reference identifier with the ``ID`` reported by the reader's
    ``referenceInfo``, and selects rows: the explicit
    ``options.rowNumbers`` when given, otherwise whatever ``readsInWindow``
    picks.  The reference table is loaded only when
    ``options.referenceFilename`` is set.  The result is rendered with
    ``formatWindow`` and followed by a blank line.

    Returns None; all output goes to stdout.
    """
    cmpH5 = CmpH5Reader(options.inputCmpH5)

    # Normalise whatever identifier the user supplied to the reader's ID.
    refId = cmpH5.referenceInfo(options.referenceWindow.refId).ID
    refWindow = options.referenceWindow._replace(refId=refId)

    # Identity test: an explicit (possibly empty or array-like) row
    # selection must not be confused with "no selection given".
    if options.rowNumbers is not None:
        rowNumbers = options.rowNumbers
    else:
        rowNumbers = readsInWindow(cmpH5, refWindow, options.depth,
                                   minMapQV=options.minMapQV,
                                   strategy=options.sorting)

    # The reference table is optional; formatWindow receives None without it.
    if options.referenceFilename:
        referenceTable = loadReferences(options.referenceFilename, cmpH5)
    else:
        referenceTable = None

    formatWindow(cmpH5, refWindow, rowNumbers,
                 referenceTable, options.aligned, options.color,
                 options.consensus)
    print("")
def run_real_quiver(cmpH5, quiverConfig, interval, depthLimit, refSeq, refWindow, seedConsensus):
    """Compute a consensus for *interval* and print its sequence.

    Reads are selected from *cmpH5* in the sub-window of *refWindow* given
    by *interval*, restricted to those spanning the whole interval, clipped
    to it, limited to an aligned length of at most 120, and finally passed
    through ``filterAlns``.  The consensus is produced by
    ``consensusForAlignmentsDisregardPOA``.

    NOTE(review): *seedConsensus* is accepted but never used; the final
    argument to ``consensusForAlignmentsDisregardPOA`` is always the
    literal ``"A" * 100`` -- confirm this is intended.

    Returns None; the consensus sequence is printed to stdout.
    """
    lo, hi = interval
    subWin = subWindow(refWindow, interval)
    intervalRefSeq = refSeq[lo:hi]

    candidateRows = readsInWindow(cmpH5, subWin,
                                  depthLimit=depthLimit,
                                  minMapQV=quiverConfig.minMapQV,
                                  strategy="longest",
                                  stratum=None,
                                  barcode=None)

    # Keep only the reads that cover the interval end to end.
    fullSpanRows = []
    for row in candidateRows:
        if cmpH5[row].spansReferenceRange(lo, hi):
            fullSpanRows.append(row)

    clipped = [aln.clippedTo(*interval) for aln in cmpH5[fullSpanRows]]
    withinLength = [aln for aln in clipped if aln.alignedLength <= 120]
    usable = filterAlns(subWin, withinLength, quiverConfig)

    result = consensusForAlignmentsDisregardPOA(subWin, intervalRefSeq,
                                                usable, quiverConfig,
                                                "A" * 100)
    print(str(result.sequence))
def getReads(cmpH5, reference, interval, paddedTemplateWidth, depthLimit, real_quiver=False):
    """Collect reads over *interval* plus a POA template and per-base QVs.

    Only the first id yielded by ``reference.enumerateIds()`` is used.
    Reads are taken from *cmpH5* within *interval* on that reference (at
    most *depthLimit*, mapping QV >= 10, "longest" strategy), restricted to
    reads spanning the whole interval, clipped to it, limited to an aligned
    length of at most ``paddedTemplateWidth - 7``, and passed through
    ``filterAlns`` with a threshold of 0.1.  A POA consensus of at most the
    first 11 reads serves as the template.

    NOTE(review): the reason for the ``- 7`` margin is not visible here.

    Returns a 4-tuple whose shape depends on *real_quiver*:

    * ``real_quiver`` true:  ``(template, templateLength, fwdSequences,
      qvInfo)`` with the template and reads as plain sequences;
    * otherwise: ``(tmplSeq, templateLength, readSeqs, qvInfo)`` where
      ``tmplSeq`` is a zero-padded ``uint8`` vector of length
      *paddedTemplateWidth* and ``readSeqs`` is a zero-padded ``uint8``
      array of shape ``(paddedTemplateWidth, nReads)`` holding character
      codes.

    ``qvInfo`` is a ``uint8`` array of shape
    ``(8, paddedTemplateWidth, nReads)``.  Planes 0-4 hold InsertionQV,
    MergeQV, DeletionQV, DeletionTag and SubstitutionQV; planes 5-7 stay
    zero.
    """
    minMapQV = 10
    maxPoaCoverage = 11
    readStumpinessThreshold = 0.1

    # Only the first reference entry is considered.
    refId = list(reference.enumerateIds())[0]
    refWindow = (refId, 0, reference.byId[refId].length)

    intStart, intEnd = interval
    subWin = subWindow(refWindow, interval)

    rows = readsInWindow(cmpH5, subWin,
                         depthLimit=depthLimit,
                         minMapQV=minMapQV,
                         strategy="longest",
                         stratum=None,
                         barcode=None)
    spanningRows = [row for row in rows
                    if cmpH5[row].spansReferenceRange(intStart, intEnd)]
    clipped = [aln.clippedTo(*interval) for aln in cmpH5[spanningRows]]
    shortEnough = [aln for aln in clipped
                   if aln.alignedLength <= paddedTemplateWidth - 7]
    clippedAlns = filterAlns(subWin, shortEnough, readStumpinessThreshold)
    numReads = len(clippedAlns)

    # Compute the POA consensus, which is our initial guess, and
    # should typically be > 99.5% accurate
    fwdSequences = [aln.read(orientation="genomic", aligned=False)
                    for aln in clippedAlns]
    poa = cc.PoaConsensus.FindConsensus(fwdSequences[:maxPoaCoverage])
    template = poa.Sequence()

    # Template as zero-padded character codes.
    tmplOrds = [ord(base) for base in template]
    tmplLength = len(tmplOrds)
    tmplSeq = np.zeros(paddedTemplateWidth, dtype=np.uint8)
    tmplSeq[:tmplLength] = tmplOrds

    # read pos y, read x
    readSeqs = np.zeros((paddedTemplateWidth, numReads), dtype=np.uint8)
    for col, sequence in enumerate(fwdSequences):
        readOrds = [ord(base) for base in sequence]
        readSeqs[:len(readOrds), col] = readOrds

    # metric z, read pos y, read x
    featureNames = ("InsertionQV", "MergeQV", "DeletionQV",
                    "DeletionTag", "SubstitutionQV")
    qvInfo = np.zeros((8, paddedTemplateWidth, numReads), dtype=np.uint8)
    for col, aln in enumerate(clippedAlns):
        readLength = aln.readLength
        for plane, featureName in enumerate(featureNames):
            feature = getattr(aln, featureName)
            qvInfo[plane, :readLength, col] = feature(orientation="genomic",
                                                      aligned=False)

    if real_quiver:
        return template, tmplLength, fwdSequences, qvInfo
    return tmplSeq, tmplLength, readSeqs, qvInfo
def getReads(cmpH5, reference, interval, paddedTemplateWidth, depthLimit, real_quiver=False):
    """Gather reads covering *interval*, a POA template, and QV planes.

    The reference used is the first id produced by
    ``reference.enumerateIds()``.  Candidate reads come from
    ``readsInWindow`` (limit *depthLimit*, mapping QV >= 10, "longest"
    strategy).  They are kept only if they span the whole interval, then
    clipped to it, dropped if their aligned length exceeds
    ``paddedTemplateWidth - 7``, and filtered by ``filterAlns`` with a
    threshold of 0.1.  The template is the POA consensus of up to the first
    11 surviving reads.

    NOTE(review): the origin of the ``- 7`` margin is not shown in this
    function.

    Returns:
        ``(template, n, fwdSequences, qvInfo)`` when *real_quiver* is true,
        else ``(tmplSeq, n, readSeqs, qvInfo)``.  ``n`` is the template
        length.  ``tmplSeq`` (length *paddedTemplateWidth*) and ``readSeqs``
        (shape ``(paddedTemplateWidth, nReads)``) are zero-padded ``uint8``
        arrays of character codes.  ``qvInfo`` is a ``uint8`` array of shape
        ``(8, paddedTemplateWidth, nReads)`` whose planes 0-4 are
        InsertionQV, MergeQV, DeletionQV, DeletionTag and SubstitutionQV;
        planes 5-7 are left at zero.
    """
    minMapQV = 10
    maxPoaCoverage = 11
    readStumpinessThreshold = 0.1

    def _charCodes(sequence):
        # Character codes of a sequence, as a plain list.
        return [ord(ch) for ch in sequence]

    # Only the first reference entry is considered.
    refId = list(reference.enumerateIds())[0]
    refWindow = (refId, 0, reference.byId[refId].length)
    subWin = subWindow(refWindow, interval)
    intStart, intEnd = interval

    rows = readsInWindow(cmpH5, subWin,
                         depthLimit=depthLimit,
                         minMapQV=minMapQV,
                         strategy="longest",
                         stratum=None,
                         barcode=None)
    spanningRows = [row for row in rows
                    if cmpH5[row].spansReferenceRange(intStart, intEnd)]

    maxAlignedLength = paddedTemplateWidth - 7
    candidates = []
    for aln in cmpH5[spanningRows]:
        clippedAln = aln.clippedTo(*interval)
        candidates.append(clippedAln)
    candidates = [aln for aln in candidates
                  if aln.alignedLength <= maxAlignedLength]
    clippedAlns = filterAlns(subWin, candidates, readStumpinessThreshold)

    # Compute the POA consensus, which is our initial guess, and
    # should typically be > 99.5% accurate
    featureArgs = dict(orientation="genomic", aligned=False)
    fwdSequences = [aln.read(**featureArgs) for aln in clippedAlns]
    template = cc.PoaConsensus.FindConsensus(
        fwdSequences[:maxPoaCoverage]).Sequence()

    tmplOrds = _charCodes(template)
    tmplSeq = np.zeros(paddedTemplateWidth, dtype=np.uint8)
    tmplSeq[:len(tmplOrds)] = tmplOrds

    # Axes: read position (rows) by read index (columns).
    readSeqs = np.zeros((paddedTemplateWidth, len(clippedAlns)), dtype=np.uint8)
    # Axes: metric plane, read position, read index.
    qvInfo = np.zeros((8, paddedTemplateWidth, len(clippedAlns)), dtype=np.uint8)

    for col, sequence in enumerate(fwdSequences):
        codes = _charCodes(sequence)
        readSeqs[:len(codes), col] = codes

    for col, aln in enumerate(clippedAlns):
        span = slice(None, aln.readLength)
        qvInfo[0, span, col] = aln.InsertionQV(**featureArgs)
        qvInfo[1, span, col] = aln.MergeQV(**featureArgs)
        qvInfo[2, span, col] = aln.DeletionQV(**featureArgs)
        qvInfo[3, span, col] = aln.DeletionTag(**featureArgs)
        qvInfo[4, span, col] = aln.SubstitutionQV(**featureArgs)

    if real_quiver:
        return template, len(tmplOrds), fwdSequences, qvInfo
    return tmplSeq, len(tmplOrds), readSeqs, qvInfo