def testMatchTwoAfter(self):
    """
    A soft-clipped base lying two positions after a matched base (whose
    reference offset is 10) must be given offset 12.
    """
    # (query offset, reference offset) pairs: one match, then two
    # soft-clipped bases that have no reference offset of their own.
    pairs = ((0, 10), (1, None), (2, None))
    operations = (CMATCH, CSOFT_CLIP, CSOFT_CLIP)

    # Ask about the last pair (index 2).
    self.assertEqual(12, softClippedOffset(2, pairs, operations))
def testMatchTwoBefore(self):
    """
    A soft-clipped base lying two positions before a matched base (whose
    reference offset is 10) must be given offset 8.
    """
    # Two leading soft-clipped bases followed by a single match.
    pairs = ((0, None), (1, None), (2, 10))
    operations = (CSOFT_CLIP, CSOFT_CLIP, CMATCH)

    # Ask about the first pair (index 0).
    self.assertEqual(8, softClippedOffset(0, pairs, operations))
def testMatchOneAfter(self):
    """
    A soft-clipped base lying immediately after a matched base (whose
    reference offset is 10) must be given offset 11.
    """
    # One match followed by one soft-clipped base.
    pairs = ((0, 10), (1, None))
    operations = (CMATCH, CSOFT_CLIP)

    # Ask about the soft-clipped pair (index 1).
    self.assertEqual(11, softClippedOffset(1, pairs, operations))
def testMatchTwoAfterThenHardClips(self):
    """
    A soft-clipped base lying two positions after a matched base (whose
    reference offset is 10) must be given offset 12, even when the
    operations end with hard clips.
    """
    # One match, two soft-clipped bases, then two hard-clipped entries.
    pairs = ((0, 10), (1, None), (2, None), (3, None), (4, None))
    operations = (CMATCH, CSOFT_CLIP, CSOFT_CLIP, CHARD_CLIP, CHARD_CLIP)

    # Ask about the second soft-clipped pair (index 2).
    self.assertEqual(12, softClippedOffset(2, pairs, operations))
def addPairsInfo(pairs, cigarOperations, query, qualities, referenceLength,
                 includeSoftClipped, correspondences, deletions, insertions):
    """
    Record what each aligned (query offset, reference offset) pair of a
    read says about the reference: matches, deletions, insertions and
    (optionally) soft-clipped bases.

    Nothing is returned; C{correspondences}, C{deletions} and C{insertions}
    are updated in place.

    @param pairs: A sequence of 2-C{tuple}s of query offset, reference
        offset. Either (but not both) member of each tuple may be C{None}
        to indicate an indel or a clipped base.
    @param cigarOperations: A sequence of CIGAR operations, one for each
        element of C{pairs}.
    @param query: A C{str} query DNA sequence, indexed by query offset.
    @param qualities: A sequence of quality scores, indexed by query offset.
    @param referenceLength: Not used by this function.
    @param includeSoftClipped: If true, bases whose operation is
        C{CSOFT_CLIP} are also added to C{correspondences}, at the offset
        computed by C{softClippedOffset}. Otherwise they are ignored.
    @param correspondences: A mapping keyed by reference offset. Its values
        must have an C{append(base, quality)} method, and the mapping must
        supply a value for a key that has not been seen before (e.g., a
        C{defaultdict}), because entries are never created here.
    @param deletions: A mapping from C{int} reference offset to the number
        of times that offset was deleted. Counts are incremented with
        C{+= 1}, so unseen keys must default to a number (e.g., a
        C{defaultdict(int)}).
    @param insertions: A mapping from offset to C{Insertion} instance. A
        new C{Insertion} is stored here the first time an offset is seen.
    @raise AssertionError: If C{pairs} and C{cigarOperations} differ in
        length, if any pair is C{(None, None)}, or if a CIGAR operation is
        not compatible with the shape of its pair.
    """
    assert len(pairs) == len(cigarOperations)
    assert not any(pair == (None, None) for pair in pairs)

    # True while we are in the middle of a run of consecutive insertion
    # operations; any other kind of operation ends the run.
    insertionOpen = False

    for index, (pair, operation) in enumerate(zip(pairs, cigarOperations)):
        qOffset, rOffset = pair

        if qOffset is None:
            # The reference has a base here that the query lacks: a
            # deletion (relative to the reference).
            assert operation == CDEL
            assert rOffset is not None
            deletions[rOffset] += 1
            insertionOpen = False
            continue

        if rOffset is not None:
            # Both offsets are present: the query base is aligned to a
            # reference site.
            assert operation in CONSUMES_REFERENCE
            insertionOpen = False
            base = query[qOffset]
            quality = qualities[qOffset]
            correspondences[rOffset].append(base, quality)
            continue

        # From here on there is a query base with no reference offset:
        # either an insertion or a soft-clipped base.
        base = query[qOffset]
        quality = qualities[qOffset]

        if operation == CINS:
            # The query has an insertion (relative to the reference).
            # insertionOffset supplies the offset under which the inserted
            # bases are stored, plus a flag that is passed on (as either
            # the offset or None) when a new run of insertions starts.
            lookedBack, iOffset = insertionOffset(
                index, pairs, cigarOperations)

            if not insertionOpen:
                # First base of a new run of inserted bases.
                insertionOpen = True
                if iOffset not in insertions:
                    insertions[iOffset] = Insertion(iOffset)
                insertions[iOffset].start(iOffset if lookedBack else None)

            insertions[iOffset].append(base, quality)
            continue

        # The only other way for a query base to lack a reference offset
        # is for it to be soft-clipped.
        assert operation == CSOFT_CLIP
        insertionOpen = False
        if includeSoftClipped:
            correspondences[
                softClippedOffset(index, pairs, cigarOperations)
            ].append(base, quality)