Ejemplo n.º 1
0
def mainCmpH5(options):
    """Print a text rendering of the alignments in each requested window.

    Opens ``options.inputCmpH5`` as an ``AlignmentSet`` (attaching
    ``options.fofn`` when it is given), optionally loads the reference
    sequences, and then, for every window in ``options.referenceWindows``:

    * resizes the window with ``makeDisplayWindow`` using ``options.width``,
    * selects alignments, either the explicit ``options.rowNumbers`` or the
      result of ``readsInWindow`` for the window,
    * prints the window as a GFF-style string, followed by either the
      individual alignments (``options.oneAtATime``) or the stacked window
      view, and a trailing blank line.

    Returns None; all output goes to stdout.
    """
    alnReader = AlignmentSet(options.inputCmpH5,
                             referenceFastaFname=options.referenceFilename)
    if options.fofn is not None:
        alnReader.attach(options.fofn)

    # The reference table is optional; formatWindow receives None without it.
    if options.referenceFilename:
        referenceTable = loadReferences(options.referenceFilename, alnReader)
    else:
        referenceTable = None

    for refWindow in options.referenceWindows:
        # One lookup serves both the display name and the contig length.
        refInfo = alnReader.referenceInfo(refWindow.refId)
        refName = refInfo.FullName
        refWindow = makeDisplayWindow(refInfo.Length, options.width, refWindow)

        # Identity test: "!= None" misbehaves for array-like row selections.
        if options.rowNumbers is not None:
            alns = alnReader[options.rowNumbers]
        else:
            alns = readsInWindow(alnReader, refWindow, options.depth,
                                 minMapQV=options.minMapQV,
                                 strategy=options.sorting)

        # Single-argument parenthesised print behaves the same on Python 2
        # and Python 3.
        print(windowToGffString(Window(refName, refWindow.start, refWindow.end)))

        if options.oneAtATime:
            formatIndividualAlignments(alnReader, refWindow, alns)
        else:
            formatWindow(alnReader, refWindow, alns,
                         referenceTable, options.aligned, options.color,
                         options.realign, options.consensus)
        # Blank separator line between windows.
        print("")
Ejemplo n.º 2
0
def mainGff(options):
    """Print every variant in a GFF file together with its supporting reads.

    The cmp.h5 and reference filenames are taken from the GFF (via
    ``extractCmpH5AndReferenceFromGff``) unless overridden by
    ``options.inputCmpH5`` / ``options.referenceFilename``.  For each GFF
    record a one-line summary is printed, followed by the alignment window
    covering the variant plus 10 bases on either side, and a blank line.

    Raises AssertionError when no cmp.h5 or reference filename is available.
    Returns None; all output goes to stdout.
    """
    reader = GffReader(options.inputGff)
    cmpH5Fname, referenceFname = extractCmpH5AndReferenceFromGff(reader)
    # Allow overriding
    cmpH5Fname = options.inputCmpH5 or cmpH5Fname
    referenceFname = options.referenceFilename or referenceFname

    assert cmpH5Fname, \
        "No cmp.h5 filename: not supplied in options and not found in the GFF"
    assert referenceFname, \
        "No reference filename: not supplied in options and not found in the GFF"

    cmpH5 = CmpH5Reader(cmpH5Fname)
    referenceTable = loadReferences(referenceFname, cmpH5)

    for gffRecord in reader:
        # Missing attributes are shown as "-".
        referenceSeq = gffRecord.get("reference", "-")
        variantSeq = gffRecord.get("variantSeq", "-")
        variantConfidence = gffRecord.confidence
        variantSummary = "(%s > %s)" % (referenceSeq, variantSeq)
        # One pre-formatted string keeps the output identical on Python 2
        # and Python 3 (space-separated fields, newline-terminated).
        print("%s %s %s %s %s %s" % (gffRecord.type, gffRecord.seqid,
                                     gffRecord.start, gffRecord.end,
                                     variantSummary, variantConfidence))
        refId = cmpH5.referenceInfo(gffRecord.seqid).ID
        # NOTE(review): start - 10 is not clamped, so it is negative for a
        # variant within 10 bases of the contig start -- confirm that
        # readsInWindow/formatWindow tolerate that.
        refWindow = Window(refId,
                           gffRecord.start - 10,
                           gffRecord.end + 10)
        rowNumbers = readsInWindow(cmpH5, refWindow, options.depth,
                                   minMapQV=options.minMapQV,
                                   strategy=options.sorting)
        # Insertions are displayed unaligned; every other type aligned.
        formatWindow(cmpH5, refWindow, rowNumbers, referenceTable,
                     aligned=(gffRecord.type != "insertion"),
                     consensus=options.consensus)
        # Blank separator line between variants.
        print("")
Ejemplo n.º 3
0
def consensus(alnReader, refWindow, referenceTable, alns):
    """Build a ``Consensus`` object covering exactly ``refWindow``.

    The window is first enlarged (``enlargedReferenceWindow`` with the
    module-level ``overlap``), then split into intervals spanned by at least
    ``K`` of the supplied ``alns`` plus the remaining holes.  Each interval
    gets a sub-consensus: the ``Consensus.nAsConsensus`` placeholder by
    default, replaced by ``q.utils.consensusForAlignments`` when the interval
    is covered and at least ``K`` alignments survive filtering.  The joined
    consensus is aligned back to the enlarged reference and clipped to
    ``refWindow``.

    Relies on the names ``overlap``, ``K`` and ``quiverConfig`` being defined
    outside this function.
    """
    # Reference sequence for the enlarged window.
    refName = alnReader.referenceInfo(refWindow.refId).FullName
    contigSeq = referenceTable[refName].sequence
    eWindow = enlargedReferenceWindow(refWindow, len(contigSeq), overlap)
    enlargedRefSeq = contigSeq[eWindow.start:eWindow.end]

    # Partition the enlarged window into K-spanned intervals and holes.
    alnStarts = [aln.tStart for aln in alns]
    alnEnds = [aln.tEnd for aln in alns]
    spanned = w.kSpannedIntervals(eWindow, K, alnStarts, alnEnds)
    gaps = w.holes(eWindow, spanned)

    pieces = []
    for interval in sorted(spanned + gaps):
        pieceWindow = subWindow(eWindow, interval)
        lo, hi = interval
        pieceRefSeq = enlargedRefSeq[lo - eWindow.start:hi - eWindow.start]

        # Placeholder consensus; kept unless a real one can be computed.
        piece = Consensus.nAsConsensus(pieceWindow, pieceRefSeq)
        if interval not in spanned:
            pieces.append(piece)
            continue

        # Re-fetch reads for this interval rather than reusing the caller's.
        windowAlns = readsInWindow(alnReader, pieceWindow,
                                   depthLimit=100,
                                   minMapQV=quiverConfig.minMapQV,
                                   strategy="longest")
        clipped = [aln.clippedTo(*interval) for aln in windowAlns]
        usable = q.utils.filterAlns(pieceWindow, clipped, quiverConfig)
        if len(usable) >= K:
            piece = q.utils.consensusForAlignments(pieceWindow,
                                                   pieceRefSeq,
                                                   usable,
                                                   quiverConfig)
        pieces.append(piece)

    # Stitch the per-interval results into a single consensus.
    joined = join(pieces)

    # Map the refWindow boundaries onto consensus coordinates, then clip.
    alignment = cc.Align(enlargedRefSeq, joined.sequence)
    queryPositions = cc.TargetToQueryPositions(alignment)
    clipStart = queryPositions[refWindow.start - eWindow.start]
    clipEnd = queryPositions[refWindow.end - eWindow.start]

    return Consensus(refWindow,
                     joined.sequence[clipStart:clipEnd],
                     joined.confidence[clipStart:clipEnd])
Ejemplo n.º 4
0
def mainGff(options):
    """Print every variant in a GFF file together with its supporting reads.

    The alignment and reference filenames are taken from the GFF (via
    ``extractCmpH5AndReferenceFromGff``) unless overridden by
    ``options.inputCmpH5`` / ``options.referenceFilename``.  For each GFF
    record a one-line summary is printed, followed by the display window
    around the variant (10 bases of padding on each side, passed through
    ``makeDisplayWindow``).  Alignments come from the record's ``rows``
    attribute when present, otherwise from ``readsInWindow``.  With
    ``options.pulseRecognizer`` set, a ``variant-<i>.csv`` file is also
    written per record via ``dumpVariantCsv``.

    Raises AssertionError when either input file does not exist.
    Returns None; all output goes to stdout (plus the optional CSV files).
    """
    reader = GffReader(options.inputGff)
    alnsFname, referenceFname = extractCmpH5AndReferenceFromGff(reader)
    # Allow overriding
    alnsFname = options.inputCmpH5 or alnsFname
    referenceFname = options.referenceFilename or referenceFname

    assert os.path.isfile(alnsFname), \
        "Alignment file not found: %r" % (alnsFname,)
    assert os.path.isfile(referenceFname), \
        "Reference file not found: %r" % (referenceFname,)

    alnReader = AlignmentSet(alnsFname, referenceFastaFname=referenceFname)

    if options.fofn is not None:
        alnReader.attach(options.fofn)

    referenceTable = loadReferences(referenceFname, alnReader)

    for i, gffRecord in enumerate(reader):
        # Missing attributes are shown as "-".
        referenceSeq = gffRecord.get("reference", "-")
        variantSeq = gffRecord.get("variantSeq", "-")
        variantConfidence = gffRecord.confidence
        variantSummary = "(%s > %s)" % (referenceSeq, variantSeq)
        # One pre-formatted string keeps the output identical on Python 2
        # and Python 3 (space-separated fields, newline-terminated).
        print("%s %s %s %s %s %s" % (gffRecord.type, gffRecord.seqid,
                                     gffRecord.start, gffRecord.end,
                                     variantSummary, variantConfidence))
        refId = gffRecord.seqid
        refLength = alnReader.referenceInfo(gffRecord.seqid).Length
        refWindow = makeDisplayWindow(refLength, options.width,
                                      Window(refId,
                                             gffRecord.start - 10,
                                             gffRecord.end + 10))
        if "rows" in gffRecord.attributes:
            # A real list, not map(): map() is a lazy iterator on Python 3
            # and would not work as an index.
            rowNumbers = [int(row) for row in gffRecord.rows.split(",")]
            alns = alnReader[rowNumbers]
        else:
            alns = readsInWindow(alnReader, refWindow, options.depth,
                                 minMapQV=options.minMapQV,
                                 strategy=options.sorting)
        # Insertions are displayed unaligned; every other type aligned.
        formatWindow(alnReader, refWindow, alns, referenceTable,
                     aligned=(gffRecord.type != "insertion"),
                     consensus=options.consensus,
                     useColor=options.color,
                     doRealign=options.realign)

        if options.pulseRecognizer:
            # CSV output for pulse recognizer
            print("")
            csvFname = "variant-" + str(i) + ".csv"
            dumpVariantCsv(csvFname, alnReader, alns, gffRecord)
            formatVariantCsvLink(csvFname)

        # Blank separator line between variants.
        print("")
0
def mainCmpH5(options):
    """Print a text rendering of the alignments in one reference window.

    Opens ``options.inputCmpH5``, rewrites ``options.referenceWindow`` so
    that its ``refId`` is the ``ID`` reported by the reader's
    ``referenceInfo``, picks the rows to show (explicit
    ``options.rowNumbers`` or the result of ``readsInWindow``), optionally
    loads the reference sequences, and hands everything to ``formatWindow``.
    A blank line is printed afterwards.

    Returns None; all output goes to stdout.
    """
    cmpH5 = CmpH5Reader(options.inputCmpH5)
    refId = cmpH5.referenceInfo(options.referenceWindow.refId).ID
    refWindow = options.referenceWindow._replace(refId=refId)

    # Identity test: "!= None" misbehaves for array-like row selections.
    if options.rowNumbers is not None:
        rowNumbers = options.rowNumbers
    else:
        rowNumbers = readsInWindow(cmpH5, refWindow, options.depth,
                                   minMapQV=options.minMapQV,
                                   strategy=options.sorting)

    # The reference table is optional; formatWindow receives None without it.
    if options.referenceFilename:
        referenceTable = loadReferences(options.referenceFilename, cmpH5)
    else:
        referenceTable = None

    formatWindow(cmpH5, refWindow, rowNumbers,
                 referenceTable, options.aligned, options.color,
                 options.consensus)
    # Trailing blank line; this form works on Python 2 and Python 3.
    print("")
Ejemplo n.º 6
0
def run_real_quiver(cmpH5, quiverConfig, interval, depthLimit, refSeq, refWindow, seedConsensus):
    """Compute a consensus for ``interval`` and print its sequence.

    Reads overlapping the interval are fetched with ``readsInWindow``
    ("longest" strategy, at most ``depthLimit`` reads), restricted to those
    spanning the whole interval, clipped to it, limited to clipped
    alignments of at most 120 aligned positions, and filtered with
    ``filterAlns``.  The survivors are passed to
    ``consensusForAlignmentsDisregardPOA`` together with a fixed string of
    100 "A" characters, and the resulting sequence is printed.

    ``seedConsensus`` is accepted but not used.  Returns None.
    """
    start, end = interval
    window = subWindow(refWindow, interval)
    intervalRefSeq = refSeq[start:end]

    candidateRows = readsInWindow(cmpH5, window,
                                  depthLimit=depthLimit,
                                  minMapQV=quiverConfig.minMapQV,
                                  strategy="longest",
                                  stratum=None,
                                  barcode=None)

    # Keep only rows whose alignment covers the interval end to end.
    fullySpanning = []
    for row in candidateRows:
        if cmpH5[row].spansReferenceRange(start, end):
            fullySpanning.append(row)

    # Clip every spanning alignment to the interval ...
    clipped = []
    for aln in cmpH5[fullySpanning]:
        clipped.append(aln.clippedTo(*interval))

    # ... then drop the ones that are too long after clipping.
    shortEnough = []
    for aln in clipped:
        if aln.alignedLength <= 120:
            shortEnough.append(aln)

    usableAlns = filterAlns(window, shortEnough, quiverConfig)

    result = consensusForAlignmentsDisregardPOA(window, intervalRefSeq,
                                                usableAlns, quiverConfig,
                                                "A" * 100)
    print(str(result.sequence))
Ejemplo n.º 7
0
def getReads(cmpH5,
             reference,
             interval,
             paddedTemplateWidth,
             depthLimit,
             real_quiver=False):
    """Collect the reads spanning ``interval`` and pack them into arrays.

    Reads are fetched with ``readsInWindow`` ("longest" strategy, map QV of
    at least 10, at most ``depthLimit`` reads), restricted to alignments
    spanning the whole interval, clipped to it, limited to clipped
    alignments no longer than ``paddedTemplateWidth - 7`` and passed through
    ``filterAlns``.  A POA consensus of the first 11 surviving reads is used
    as the template.

    Returns a 4-tuple whose form depends on ``real_quiver``:

    * ``real_quiver=True``:  ``(template, templateLength, fwdSequences,
      qvInfo)`` with the template and reads as returned by the POA call and
      the alignments' ``read`` method.
    * ``real_quiver=False``: ``(tmplSeq, templateLength, readSeqs, qvInfo)``
      where ``tmplSeq`` is a zero-padded ``uint8`` array of length
      ``paddedTemplateWidth`` holding character codes and ``readSeqs`` is a
      zero-padded ``uint8`` array of shape ``(paddedTemplateWidth, nReads)``.

    ``qvInfo`` is a ``uint8`` array of shape ``(8, paddedTemplateWidth,
    nReads)``; planes 0-4 hold InsertionQV, MergeQV, DeletionQV, DeletionTag
    and SubstitutionQV, planes 5-7 are left at zero.
    """
    # Fixed settings for read selection and the POA seed.
    minMapQV = 10
    maxPoaCoverage = 11
    readStumpinessThreshold = 0.1

    # Only the first contig reported by the reference is considered.
    refId = list(reference.enumerateIds())[0]
    refWindow = (refId, 0, reference.byId[refId].length)

    intStart, intEnd = interval
    subWin = subWindow(refWindow, interval)

    rows = readsInWindow(cmpH5,
                         subWin,
                         depthLimit=depthLimit,
                         minMapQV=minMapQV,
                         strategy="longest",
                         stratum=None,
                         barcode=None)

    spanningRows = [
        row for row in rows
        if cmpH5[row].spansReferenceRange(intStart, intEnd)
    ]

    alns = cmpH5[spanningRows]
    clippedToInterval = [aln.clippedTo(*interval) for aln in alns]
    # NOTE(review): the margin of 7 is unexplained in this code -- presumably
    # headroom so reads fit in the padded arrays; confirm.
    shortEnough = [
        aln for aln in clippedToInterval
        if aln.alignedLength <= paddedTemplateWidth - 7
    ]
    clippedAlns = filterAlns(subWin, shortEnough, readStumpinessThreshold)

    # Compute the POA consensus, which is our initial guess, and
    # should typically be > 99.5% accurate
    fwdSequences = [
        a.read(orientation="genomic", aligned=False) for a in clippedAlns
    ]

    p = cc.PoaConsensus.FindConsensus(fwdSequences[:maxPoaCoverage])
    template = p.Sequence()

    # Template as character codes, zero-padded to the fixed width.  A list
    # (rather than map()) so that len() also works on Python 3.
    tmplSeq = np.zeros((paddedTemplateWidth), dtype=np.uint8)
    tmplOrds = [ord(base) for base in template]
    tmplSeq[:len(tmplOrds)] = tmplOrds

    # Axis 0: position within the read, axis 1: read index.
    readSeqs = np.zeros((paddedTemplateWidth, len(clippedAlns)),
                        dtype=np.uint8)
    for i, seq in enumerate(fwdSequences):
        readSeqs[:len(seq), i] = [ord(base) for base in seq]

    # Axis 0: metric, axis 1: position within the read, axis 2: read index.
    qvInfo = np.zeros((8, paddedTemplateWidth, len(clippedAlns)),
                      dtype=np.uint8)
    # Plane order is part of the output layout; do not reorder.
    qvFeatures = ("InsertionQV", "MergeQV", "DeletionQV", "DeletionTag",
                  "SubstitutionQV")
    for i, aln in enumerate(clippedAlns):
        for plane, feature in enumerate(qvFeatures):
            qvInfo[plane, :aln.readLength, i] = getattr(aln, feature)(
                orientation="genomic", aligned=False)

    if real_quiver:
        return template, len(tmplOrds), fwdSequences, qvInfo
    else:
        return tmplSeq, len(tmplOrds), readSeqs, qvInfo
Ejemplo n.º 8
0
def getReads(cmpH5, reference, interval, paddedTemplateWidth, depthLimit, real_quiver=False):
    """Collect the reads spanning ``interval`` and pack them into arrays.

    Reads are fetched with ``readsInWindow`` ("longest" strategy, map QV of
    at least 10, at most ``depthLimit`` reads), restricted to alignments
    spanning the whole interval, clipped to it, limited to clipped
    alignments no longer than ``paddedTemplateWidth - 7`` and passed through
    ``filterAlns``.  A POA consensus of the first 11 surviving reads is used
    as the template.

    Returns a 4-tuple whose form depends on ``real_quiver``:

    * ``real_quiver=True``:  ``(template, templateLength, fwdSequences,
      qvInfo)`` with the template and reads as returned by the POA call and
      the alignments' ``read`` method.
    * ``real_quiver=False``: ``(tmplSeq, templateLength, readSeqs, qvInfo)``
      where ``tmplSeq`` is a zero-padded ``uint8`` array of length
      ``paddedTemplateWidth`` holding character codes and ``readSeqs`` is a
      zero-padded ``uint8`` array of shape ``(paddedTemplateWidth, nReads)``.

    ``qvInfo`` is a ``uint8`` array of shape ``(8, paddedTemplateWidth,
    nReads)``; planes 0-4 hold InsertionQV, MergeQV, DeletionQV, DeletionTag
    and SubstitutionQV, planes 5-7 are left at zero.
    """
    # Fixed settings for read selection and the POA seed.
    minMapQV = 10
    maxPoaCoverage = 11
    readStumpinessThreshold = 0.1

    # Only the first contig reported by the reference is considered.
    refId = list(reference.enumerateIds())[0]
    refWindow = (refId, 0, reference.byId[refId].length)

    intStart, intEnd = interval
    subWin = subWindow(refWindow, interval)

    rows = readsInWindow(cmpH5, subWin,
                         depthLimit=depthLimit,
                         minMapQV=minMapQV,
                         strategy="longest",
                         stratum=None,
                         barcode=None)

    spanningRows = [row for row in rows
                    if cmpH5[row].spansReferenceRange(intStart, intEnd)]

    alns = cmpH5[spanningRows]
    clippedToInterval = [aln.clippedTo(*interval) for aln in alns]
    # NOTE(review): the margin of 7 is unexplained in this code -- presumably
    # headroom so reads fit in the padded arrays; confirm.
    shortEnough = [aln for aln in clippedToInterval
                   if aln.alignedLength <= paddedTemplateWidth - 7]
    clippedAlns = filterAlns(subWin, shortEnough, readStumpinessThreshold)

    # Compute the POA consensus, which is our initial guess, and
    # should typically be > 99.5% accurate
    fwdSequences = [a.read(orientation="genomic", aligned=False)
                    for a in clippedAlns]

    p = cc.PoaConsensus.FindConsensus(fwdSequences[:maxPoaCoverage])
    template = p.Sequence()

    # Template as character codes, zero-padded to the fixed width.  A list
    # (rather than map()) so that len() also works on Python 3.
    tmplSeq = np.zeros((paddedTemplateWidth), dtype=np.uint8)
    tmplOrds = [ord(base) for base in template]
    tmplSeq[:len(tmplOrds)] = tmplOrds

    # Axis 0: position within the read, axis 1: read index.
    readSeqs = np.zeros((paddedTemplateWidth, len(clippedAlns)), dtype=np.uint8)
    for i, seq in enumerate(fwdSequences):
        readSeqs[:len(seq), i] = [ord(base) for base in seq]

    # Axis 0: metric, axis 1: position within the read, axis 2: read index.
    qvInfo = np.zeros((8, paddedTemplateWidth, len(clippedAlns)), dtype=np.uint8)
    # Plane order is part of the output layout; do not reorder.
    qvFeatures = ("InsertionQV", "MergeQV", "DeletionQV", "DeletionTag",
                  "SubstitutionQV")
    for i, aln in enumerate(clippedAlns):
        for plane, feature in enumerate(qvFeatures):
            qvInfo[plane, :aln.readLength, i] = getattr(aln, feature)(
                orientation="genomic", aligned=False)

    if real_quiver:
        return template, len(tmplOrds), fwdSequences, qvInfo
    else:
        return tmplSeq, len(tmplOrds), readSeqs, qvInfo