def test_diploid_variantsFromAlignment():
    """
    Check the variants reported by variantsFromAlignment for a fixed
    reference window against a series of consensus sequences.
    """
    refWin = (0, 10, 17)
    refSeq = "GATTACA"

    # (consensus sequence, expected variants), checked in this order
    cases = [
        # Identical to the reference: nothing to report
        ("GATTACA", []),
        # Single substitution
        ("GATGACA", [Variant(0, 13, 14, "T", "G")]),
        # Two adjacent substitutions come out as one variant
        ("GAGGACA", [Variant(0, 12, 14, "TT", "GG")]),
        # An "N" in the consensus is not reported and splits its neighbours
        ("GAGNGCA", [Variant(0, 12, 13, "T", "G"),
                     Variant(0, 14, 15, "A", "G")]),
        # Deletion
        ("GATTAA", [Variant(0, 15, 16, "C", "")]),
        # Insertion
        ("GATTTACA", [Variant(0, 12, 12, "", "T")]),
        # Heterozygous site (IUPAC "W") carries two alleles
        ("GATWACA", [Variant(0, 13, 14, "T", "A", "T")]),
        # Adjacent heterozygous sites are reported one base at a time
        ("GAWWACA", [Variant(0, 12, 13, "T", "A", "T"),
                     Variant(0, 13, 14, "T", "A", "T")]),
    ]

    for cssSeq, expected in cases:
        EQ(expected, variantsFromAlignment(refWin, refSeq, cssSeq))
def variantsFromAlignment(refWindow, refSeq, cssSeq,
                          cssQV=None, refCoverage=None):
    """
    Extract the variants implied by a pairwise alignment of cssSeq to
    refSeq reference.  If cssQV, refCoverage are provided, they will
    be used to decorate the variants with those attributes.

    Arguments:
      - refWindow: (refId, refStart, refEnd) tuple; refEnd - refStart
        must equal len(refSeq)
      - refSeq: reference sequence for the window
      - cssSeq: consensus sequence to align against refSeq
      - cssQV: QV array, same length as css
      - refCoverage: coverage array, same length as reference window

    cssQV and refCoverage must be supplied together or not at all.

    Returns: list of Variant objects, in alignment order.

    This is trickier than in the haploid case.  We have to break out
    diploid variants as single bases, in order to avoid implying
    phase.
    """
    refId, refStart, refEnd = refWindow

    aln = cc.AlignAffineIupac(refSeq, cssSeq)
    alnTarget = aln.Target()
    alnQuery = aln.Query()

    assert (cssQV is None) == (refCoverage is None)  # Both or none
    assert len(refSeq) == refEnd - refStart
    assert cssQV is None or len(cssSeq) == len(cssQV)
    assert refCoverage is None or len(refSeq) == len(refCoverage)

    # Any alignment column touching an "N" (on either side) is relabelled
    # "N" so that it is never packaged as a variant below.
    transcript = [X if (Q != "N" and T != "N") else "N"
                  for (X, T, Q) in zip(aln.Transcript(),
                                       alnTarget,
                                       alnQuery)]

    def findPrev(s, pos):
        # Last non-gap character of s strictly before pos, or "N" if none.
        # `range` (rather than `xrange`) keeps this working on Python 2 and 3.
        for i in range(pos - 1, -1, -1):
            if s[i] != '-':
                return s[i]
        return "N"

    def packageRun(start, end, startRefPos, endRefPos):
        # Build the Variant for alignment columns [start, end), which cover
        # reference positions [startRefPos, endRefPos).
        ref = alnTarget[start:end].replace("-", "")
        read = alnQuery[start:end].replace("-", "")
        refPrev = findPrev(alnTarget, start)
        cssPrev = findPrev(alnQuery, start)
        if isHeterozygote(read):
            allele1, allele2 = unpackIUPAC(read)
            return Variant(refId, startRefPos, endRefPos, ref,
                           allele1, allele2,
                           refPrev=refPrev, readPrev=cssPrev)
        return Variant(refId, startRefPos, endRefPos, ref, read,
                       refPrev=refPrev, readPrev=cssPrev)

    variants = []
    runStart = -1
    runStartRefPos = None
    runX = None
    refPos = refStart
    for pos, (X, T, Q) in enumerate(zip(transcript,
                                        alnTarget,
                                        alnQuery)):
        # A heterozygous base always opens a new run, so diploid calls are
        # emitted one base at a time.
        if X != runX or isHeterozygote(Q):
            if runStart >= 0 and runX not in "MN":
                # Package up the run and dump a variant
                variants.append(
                    packageRun(runStart, pos, runStartRefPos, refPos))
            runStart = pos
            runStartRefPos = refPos
            runX = X
        if T != "-":
            refPos += 1

    # A run that extends to the end of the alignment is never followed by a
    # transcript change, so it has to be flushed explicitly; otherwise a
    # variant at the very end of the window would be silently dropped.
    if runStart >= 0 and runX not in "MN":
        variants.append(
            packageRun(runStart, len(transcript), runStartRefPos, refPos))

    # This might be better handled within the loop above, just keeping
    # track of Qpos, Tpos
    if cssQV is not None:
        # refCoverage is also non-None here (both-or-none, asserted above).
        cssPosition = cc.TargetToQueryPositions(aln)
        for v in variants:
            # HACK ALERT: we are not really handling the confidence or
            # coverage for variants at last position of the window
            # correctly here.  The indices are clamped to the last valid
            # element of each array.
            refOffset = v.refStart - refStart
            refPos_ = min(refOffset, len(refCoverage) - 1)
            cssPos_ = min(cssPosition[refOffset], len(cssQV) - 1)
            v.coverage = refCoverage[refPos_]
            v.confidence = cssQV[cssPos_]

    return variants