def test_insert(self):
    """Check compare('abc', ...) against variants with one or two extra 'x' characters."""
    # (second argument, expected result) -- evaluated in order, stopping at
    # the first failing assertion.
    cases = (
        ('xabc', 1),
        ('axbc', 1),
        ('abxc', 1),
        ('abcx', 1),
        ('xxabc', 2),
        ('axxbc', 2),
        ('abxxc', 2),
        ('abcxx', 2),
        ('xabcx', 2),
    )
    for candidate, expected in cases:
        self.assertEqual(compare('abc', candidate), expected)
def test_delete(self):
    """Check compare() on pairs where one string is the other with characters removed."""
    # (first argument, second argument, expected result)
    cases = (
        ('abc', 'ab', 1),
        ('abc', 'ac', 1),
        ('abc', 'bc', 1),
        ('a', 'abc', 2),
        ('b', 'abc', 2),
        ('c', 'abc', 2),
    )
    for left, right, expected in cases:
        self.assertEqual(compare(left, right), expected)
def test_replace(self):
    """Check compare('abc', ...) against variants with one or two characters swapped for 'x'."""
    # (second argument, expected result)
    cases = (
        ('xbc', 1),
        ('axc', 1),
        ('abx', 1),
        ('xxc', 2),
        ('axx', 2),
        ('xbx', 2),
    )
    for candidate, expected in cases:
        self.assertEqual(compare('abc', candidate), expected)
def compress(self):
    """Merge every entry of ``self.tbmergedList`` into nearby templates.

    For each entry, ``compare`` is run against every template in
    ``self.templateNestedList[ampID1]`` (``ampID1`` comes from
    ``extractAmpID(entry[1][2])``).  Templates for which ``compare`` does
    not return -1 are merge candidates.  Candidates at distance 1 take
    priority; the entry's read count (``entry[1][0]``) is split evenly
    among the chosen candidates and added to both ``template[1][0]`` and
    ``template[1][1]``.

    Entries with no candidate are appended to ``self.leftoverList`` when
    their read count reaches ``self.j3x_readDeletorThreshold``; otherwise
    they are appended to ``self.discardedList`` and the discard counters
    are bumped.

    Returns:
        ``self.templateFlatList + self.leftoverList``.
    """
    for entry in tqdm(self.tbmergedList):
        readCount = entry[1][0]
        # Only the first ID is used below; the two-target unpacking is kept
        # so the shape expected from extractAmpID stays the same.
        ampKey, _otherAmp = extractAmpID(entry[1][2])
        self.numMergeAttempts += readCount

        nearTemplates = []  # candidates at distance exactly 1
        farTemplates = []   # every other candidate compare() accepted
        for template in self.templateNestedList[ampKey]:
            dist = compare(entry[0], template[0])
            if dist == -1:
                # -1 means the template is not a merge candidate.
                continue
            # NOTE(review): any accepted distance other than 1 -- including
            # 0 -- lands in the "distance 2" bucket; confirm that is intended.
            bucket = nearTemplates if dist == 1 else farTemplates
            bucket.append(template)

        candidateTotal = len(nearTemplates) + len(farTemplates)

        if candidateTotal == 0:
            # Nothing to merge with: keep entries with enough reads,
            # discard the rest.
            if readCount >= self.j3x_readDeletorThreshold:
                self.leftoverList.append(entry)
            else:
                self.discardedList.append(entry)
                self.discardCountList[ampKey] += 1
                self.failedMergeAndDiscarded += 1
            continue

        self.mergedCount += readCount
        if candidateTotal > 1:
            # More than one possible target, so the merge is ambiguous.
            self.mergedUnsureCount += readCount

        # Distance-1 candidates win over the rest; reads are shared equally
        # among the equally-close winners.
        chosen = nearTemplates if nearTemplates else farTemplates
        share = readCount / len(chosen)
        if chosen is nearTemplates:
            self.mergedD1Count += readCount
        else:
            self.mergedD2Count += readCount
        for template in chosen:
            template[1][0] += share  # total read count
            template[1][1] += share  # read count contributed by merges

    # Templates plus the unmerged-but-kept entries form the result.
    return self.templateFlatList + self.leftoverList
def test_damerau_levenshtein(self):
    """Check compare(s1, s2, True) for every ordered pair in PATTERNS against MATRIX_DL."""
    for row, left in enumerate(PATTERNS):
        for col, right in enumerate(PATTERNS):
            # MATRIX_DL[row][col] holds the expected value for (left, right).
            actual = compare(left, right, True)
            expected = MATRIX_DL[row][col]
            self.assertEqual(actual, expected)
def test_beyond(self):
    """compare('abc', 'def') is expected to return -1."""
    outcome = compare('abc', 'def')
    self.assertEqual(outcome, -1)
def test_emptystr(self):
    """Check compare() when one or both arguments are the empty string."""
    # (first argument, second argument, expected result)
    cases = (
        ('', '', 0),
        ('', 'a', 1),
        ('', 'ab', 2),
        ('', 'abc', -1),
        ('abc', '', -1),
    )
    for left, right, expected in cases:
        self.assertEqual(compare(left, right), expected)
def test_transpose(self):
    """Check compare('abc', ..., True) on reordered and shortened variants.

    The third argument is passed as True throughout (presumably enabling
    transposition handling -- confirm against compare's definition).
    """
    # (second argument, expected result)
    cases = (
        ('bac', 1),
        ('acb', 1),
        ('cba', 2),
        ('ba', 2),
        ('ca', 2),
    )
    for candidate, expected in cases:
        self.assertEqual(compare('abc', candidate, True), expected)
def test_equal(self):
    """compare() on two identical strings is expected to return 0."""
    outcome = compare('abc', 'abc')
    self.assertEqual(outcome, 0)
def test_insert_delete(self):
    """Check pairs of equal-length strings for which compare() is expected to return 2."""
    # (first argument, second argument); every pair expects the same result.
    pairs = (
        ('abcde', 'eabcd'),
        ('abcde', 'acdeb'),
        ('abcde', 'abdec'),
        ('ababa', 'babab'),
    )
    for left, right in pairs:
        self.assertEqual(compare(left, right), 2)