Ejemplo n.º 1
0
def test_diploid_variantsFromAlignment():
    """
    Exercise variantsFromAlignment on the reference "GATTACA" placed at
    window (0, 10, 17), comparing the variants returned for a series of
    consensus strings against the expected Variant lists.
    """
    window = (0, 10, 17)
    reference = "GATTACA"

    # Each entry pairs the expected variant list with the consensus
    # sequence that should produce it.
    cases = [
        # Identical sequences: nothing to call
        ([], "GATTACA"),

        # Single-base substitution
        ([Variant(0, 13, 14, "T", "G")], "GATGACA"),

        # Two adjacent substitutions are reported as one variant
        ([Variant(0, 12, 14, "TT", "GG")], "GAGGACA"),

        # An "N" in the consensus is not called and splits the
        # substitutions on either side of it
        ([Variant(0, 12, 13, "T", "G"),
          Variant(0, 14, 15, "A", "G")], "GAGNGCA"),

        # Single-base deletion
        ([Variant(0, 15, 16, "C", "")], "GATTAA"),

        # Single-base insertion
        ([Variant(0, 12, 12, "", "T")], "GATTTACA"),

        # Heterozygous base ("W") yields a two-allele variant
        ([Variant(0, 13, 14, "T", "A", "T")], "GATWACA"),

        # Adjacent heterozygous bases are reported one base at a time
        ([Variant(0, 12, 13, "T", "A", "T"),
          Variant(0, 13, 14, "T", "A", "T")], "GAWWACA"),
    ]

    for expected, consensus in cases:
        EQ(expected, variantsFromAlignment(window, reference, consensus))
Ejemplo n.º 2
0
def variantsFromAlignment(refWindow,
                          refSeq,
                          cssSeq,
                          cssQV=None,
                          refCoverage=None):
    """
    Extract the variants implied by a pairwise alignment of cssSeq to
    the refSeq reference.  If cssQV and refCoverage are provided, they
    are used to decorate the variants with `confidence` and `coverage`
    attributes respectively.

    Arguments:
      - refWindow: (refId, refStart, refEnd) tuple; refEnd - refStart
        must equal len(refSeq)
      - refSeq: reference sequence covering the window
      - cssSeq: consensus sequence to align against refSeq
      - cssQV: QV array, same length as cssSeq (optional)
      - refCoverage: coverage array, same length as the reference
        window (optional)

    cssQV and refCoverage must be supplied together or not at all.

    Returns:
      A list of Variant objects, in alignment order.

    This is trickier than in the haploid case.  We have to break out
    diploid variants as single bases, in order to avoid implying
    phase.
    """
    refId, refStart, refEnd = refWindow

    aln = cc.AlignAffineIupac(refSeq, cssSeq)
    alnTarget = aln.Target()
    alnQuery = aln.Query()

    assert (cssQV is None) == (refCoverage is None)  # Both or none
    assert len(refSeq) == refEnd - refStart
    assert cssQV is None or len(cssSeq) == len(cssQV)
    assert refCoverage is None or len(refSeq) == len(refCoverage)

    # Columns where either side is "N" are relabelled "N" so that no
    # variant is called there.
    transcript = [
        X if (Q != "N" and T != "N") else "N"
        for (X, T, Q) in zip(aln.Transcript(), alnTarget, alnQuery)
    ]

    def findPrev(s, pos):
        # Nearest non-gap character strictly before `pos`, or "N" when
        # there is none.
        i = pos - 1
        while i >= 0:
            if s[i] != '-':
                return s[i]
            i -= 1
        return "N"

    def makeVariant(start, end, startRefPos, endRefPos):
        # Build the Variant for alignment columns [start, end), which
        # span reference positions [startRefPos, endRefPos).
        ref = alnTarget[start:end].replace("-", "")
        read = alnQuery[start:end].replace("-", "")
        refPrev = findPrev(alnTarget, start)
        cssPrev = findPrev(alnQuery, start)
        if isHeterozygote(read):
            allele1, allele2 = unpackIUPAC(read)
            return Variant(refId,
                           startRefPos,
                           endRefPos,
                           ref,
                           allele1,
                           allele2,
                           refPrev=refPrev,
                           readPrev=cssPrev)
        return Variant(refId,
                       startRefPos,
                       endRefPos,
                       ref,
                       read,
                       refPrev=refPrev,
                       readPrev=cssPrev)

    variants = []
    runStart = -1
    runStartRefPos = None
    runX = None
    refPos = refStart
    for pos, (X, T, Q) in enumerate(zip(transcript, alnTarget, alnQuery)):
        # A new run starts whenever the transcript symbol changes, and
        # at every heterozygous base so those are emitted one by one.
        if X != runX or isHeterozygote(Q):
            if runStart >= 0 and runX not in "MN":
                variants.append(
                    makeVariant(runStart, pos, runStartRefPos, refPos))
            runStart = pos
            runStartRefPos = refPos
            runX = X
        if T != "-":
            refPos += 1

    # Flush the final run: a variant that extends to the last alignment
    # column has no following transition to trigger its emission.
    if runStart >= 0 and runX not in "MN":
        variants.append(
            makeVariant(runStart, len(transcript), runStartRefPos, refPos))

    # This might be better handled within the loop above, just keeping
    # track of Qpos, Tpos
    if cssQV is not None:
        cssPosition = cc.TargetToQueryPositions(aln)
        for v in variants:
            # HACK ALERT: we are not really handling the confidence or
            # coverage for variants at last position of the window
            # correctly here.  Offsets are clamped so that a variant
            # starting at the very end of the window still maps to the
            # last valid array element.
            offset = v.refStart - refStart
            refPos_ = min(offset, len(refCoverage) - 1)
            cssIdx = min(offset, len(cssPosition) - 1)
            cssPos_ = min(cssPosition[cssIdx], len(cssQV) - 1)

            v.coverage = refCoverage[refPos_]
            v.confidence = cssQV[cssPos_]

    return variants