def find_near_matches_dropin(subsequence, sequence, *args, **kwargs):
    """Drop-in replacement for find_near_matches() backed by a temp file.

    Writes ``sequence`` to a temporary file and delegates to
    find_near_matches_in_file(), so the generic find_near_matches test-suite
    can exercise the file-based search.  Word-list and BioPython ``Seq``
    inputs cannot be written to a file, so those tests are skipped.
    """
    if isinstance(sequence, (tuple, list)):
        self.skipTest(
            'skipping word-list tests with find_near_matches_in_file')
    try:
        from Bio.Seq import Seq
    except ImportError:
        # BioPython not installed; nothing to skip.
        pass
    else:
        if isinstance(sequence, Seq):
            self.skipTest(
                'skipping BioPython Seq tests with find_near_matches_in_file'
            )
    # BUGFIX: tempfile.mktemp() is deprecated and insecure -- another process
    # can grab the returned name before the file is created.  mkstemp()
    # creates the file atomically; close the raw fd and re-open below with
    # the mode/encoding this test needs.
    fd, tempfilepath = tempfile.mkstemp()
    os.close(fd)
    if isinstance(sequence, text_type):
        # Text input: write/read via UTF-8 so unicode round-trips.
        f = io.open(tempfilepath, 'w+', encoding='utf-8')
    else:
        f = open(tempfilepath, 'w+b')
    try:
        f.write(sequence)
        f.seek(0)
        return find_near_matches_in_file(subsequence, f, *args, **kwargs)
    finally:
        # Always release the handle and remove the temp file, even if the
        # search raises.
        f.close()
        os.remove(tempfilepath)
def test_unicode_encodings(self):
    """Searching a text-mode file should work across common encodings."""
    pattern = u('PATTERN')
    text = u('---PATERN---')
    expected = [Match(3, 9, 1, u('PATERN'))]
    for encoding in ['ascii', 'latin-1', 'latin1', 'utf-8', 'utf-16']:
        with self.subTest(encoding=encoding):
            # Write the haystack encoded with the codec under test.
            with tempfile.NamedTemporaryFile(mode='wb', delete=False) as outfile:
                filename = outfile.name
                outfile.write(text.encode(encoding))
            self.addCleanup(os.remove, filename)
            # Re-open as text with the same codec and search for the
            # pattern, allowing a single edit.
            with io.open(filename, 'r', encoding=encoding) as infile:
                results = find_near_matches_in_file(pattern, infile, max_l_dist=1)
            self.assertEqual(results, expected)
def test_unicode_encodings(self):
    """Searching a text-mode file should work across common encodings."""
    pattern = u('PATTERN')
    text = u('---PATERN---')
    expected = [Match(3, 9, 1)]
    for encoding in ['ascii', 'latin-1', 'latin1', 'utf-8', 'utf-16']:
        with self.subTest(encoding=encoding):
            # Write the haystack encoded with the codec under test.
            with tempfile.NamedTemporaryFile(mode='wb', delete=False) as outfile:
                filename = outfile.name
                outfile.write(text.encode(encoding))
            self.addCleanup(os.remove, filename)
            # Re-open as text with the same codec and search for the
            # pattern, allowing a single edit.
            with io.open(filename, 'r', encoding=encoding) as infile:
                results = find_near_matches_in_file(pattern, infile, max_l_dist=1)
            self.assertEqual(results, expected)
def find_near_matches_dropin(subsequence, sequence, *args, **kwargs):
    """Drop-in replacement for find_near_matches() backed by a temp file.

    Writes ``sequence`` to a temporary file and delegates to
    find_near_matches_in_file(), so the generic find_near_matches test-suite
    can exercise the file-based search.  Word-list and BioPython ``Seq``
    inputs cannot be written to a file, so those tests are skipped.
    """
    if isinstance(sequence, (tuple, list)):
        self.skipTest('skipping word-list tests with find_near_matches_in_file')
    try:
        from Bio.Seq import Seq
    except ImportError:
        # BioPython not installed; nothing to skip.
        pass
    else:
        if isinstance(sequence, Seq):
            self.skipTest('skipping BioPython Seq tests with find_near_matches_in_file')
    # BUGFIX: tempfile.mktemp() is deprecated and insecure -- another process
    # can grab the returned name before the file is created.  mkstemp()
    # creates the file atomically; close the raw fd and re-open below with
    # the mode/encoding this test needs.
    fd, tempfilepath = tempfile.mkstemp()
    os.close(fd)
    if isinstance(sequence, text_type):
        # Text input: write/read via UTF-8 so unicode round-trips.
        f = io.open(tempfilepath, 'w+', encoding='utf-8')
    else:
        f = open(tempfilepath, 'w+b')
    try:
        f.write(sequence)
        f.seek(0)
        return find_near_matches_in_file(subsequence, f, *args, **kwargs)
    finally:
        # Always release the handle and remove the temp file, even if the
        # search raises.
        f.close()
        os.remove(tempfilepath)
def test_subsequence_split_between_chunks(self):
    """Matches straddling a chunk boundary must still be found.

    A zero-filled haystack is written to a file with the match placed at
    various offsets (``delta``) around the chunk boundary, for several chunk
    sizes.  The search is run in binary mode, default text mode, and
    explicit-encoding text mode, at the full chunk size and at half of it.
    """
    with tempfile.NamedTemporaryFile(mode='wb', delete=False) as f:
        filename = f.name
    self.addCleanup(os.remove, filename)
    for needle, haystack_match, max_l_dist, expected_matches in [
        # BUGFIX: the last row's Match previously carried b('PATERN') as its
        # matched text although its haystack is b('PATTERN') at distance 0;
        # the placeholder is overwritten below, but keep the table honest.
        (b('PATTERN'), b('PATERN'), 0, []),
        (b('PATTERN'), b('PATERN'), 1, [Match(0, 6, 1, b('PATERN'))]),
        (b('PATTERN'), b('PATERN'), 2, [Match(0, 6, 1, b('PATERN'))]),
        (b('PATTERN'), b('PATTERN'), 0, [Match(0, 7, 0, b('PATTERN'))]),
    ]:
        for chunk_size, delta in product(
                [100, 2**10, 2**12, 2**18, 2**20],
                sorted({-len(needle), -len(needle) + 1, -4, -2, -1, 0, 1})):
            if len(needle) // (max_l_dist + 1) < 3:
                # no ngrams search, so skip long searches which will be slow
                if chunk_size > 2**10:
                    continue
            with self.subTest(
                needle=needle,
                haystack_match=haystack_match,
                max_l_dist=max_l_dist,
                chunk_size=chunk_size,
                delta=delta,
            ):
                # Zero-filled haystack with the match placed so it straddles
                # the chunk boundary at offset chunk_size + delta.
                haystack = bytearray(chunk_size + 100)
                haystack[chunk_size + delta:chunk_size + delta + len(haystack_match)] = haystack_match
                with open(filename, 'wb') as f:
                    f.write(haystack)

                def _expected(matched):
                    # Shift the table's expected matches to the actual offset
                    # and set the matched text for the current file mode.
                    return [
                        attr.evolve(
                            match,
                            start=match.start + chunk_size + delta,
                            end=match.end + chunk_size + delta,
                            matched=matched)
                        for match in expected_matches
                    ]

                # In text mode the matched text is the decoded match itself.
                # BUGFIX: this previously decoded the ENTIRE zero-padded
                # haystack (haystack.decode), not the matched bytes.
                text_matched = (haystack_match if PY2
                                else haystack_match.decode('utf-8'))

                # Binary mode, at full and half chunk size.
                with open(filename, 'rb') as f:
                    self.assertEqual(
                        find_near_matches_in_file(
                            needle, f,
                            max_l_dist=max_l_dist, _chunk_size=chunk_size),
                        _expected(haystack_match))
                    f.seek(0)
                    self.assertEqual(
                        find_near_matches_in_file(
                            needle, f,
                            max_l_dist=max_l_dist,
                            _chunk_size=chunk_size // 2),
                        _expected(haystack_match))

                # Default text mode (bytes needle on PY2, str on PY3).
                with open(filename, 'r') as f:
                    _needle = needle if PY2 else needle.decode('utf-8')
                    self.assertEqual(
                        find_near_matches_in_file(
                            _needle, f,
                            max_l_dist=max_l_dist, _chunk_size=chunk_size),
                        _expected(text_matched))

                # Explicit-encoding text mode, at full and half chunk size.
                with io.open(filename, 'r', encoding='ascii') as f:
                    self.assertEqual(
                        find_near_matches_in_file(
                            needle.decode('ascii'), f,
                            max_l_dist=max_l_dist, _chunk_size=chunk_size),
                        _expected(text_matched))
                    f.seek(0)
                    self.assertEqual(
                        find_near_matches_in_file(
                            needle.decode('ascii'), f,
                            max_l_dist=max_l_dist,
                            _chunk_size=chunk_size // 2),
                        _expected(text_matched))
def test_file_unicode(f):
    # Search the text-mode file for the unicode needle; exactly one
    # single-substitution match is expected.
    results = find_near_matches_in_file(u(needle), f, max_l_dist=1)
    self.assertEqual(results, [Match(3, 9, 1, u('PATERN'))])
def test_file_bytes(f):
    # Search the binary-mode file for the bytes needle; exactly one
    # single-substitution match is expected.
    results = find_near_matches_in_file(b(needle), f, max_l_dist=1)
    self.assertEqual(results, [Match(3, 9, 1, b('PATERN'))])
def test_subsequence_split_between_chunks(self):
    """Matches straddling a chunk boundary must still be found.

    A zero-filled haystack is written to a file with the match placed at
    various offsets (``delta``) around the chunk boundary, for several
    chunk sizes.  The search is run in binary mode, default text mode,
    and explicit-encoding text mode, at the full chunk size and at half
    of it.
    """
    # Create the temp file once; each iteration rewrites its contents.
    with tempfile.NamedTemporaryFile(mode='wb', delete=False) as f:
        filename = f.name
    self.addCleanup(os.remove, filename)
    # Table: (needle, bytes placed in the haystack, max allowed distance,
    # expected matches relative to the start of the placed bytes).
    for needle, haystack_match, max_l_dist, expected_matches in [
        (b('PATTERN'), b('PATERN'), 0, []),
        (b('PATTERN'), b('PATERN'), 1, [Match(0, 6, 1)]),
        (b('PATTERN'), b('PATERN'), 2, [Match(0, 6, 1)]),
        (b('PATTERN'), b('PATTERN'), 0, [Match(0, 7, 0)]),
    ]:
        # delta sweeps the match across the boundary: from entirely inside
        # the first chunk to entirely inside the second.
        for chunk_size, delta in product(
            [100, 2**10, 2**12, 2**18, 2**20],
            sorted({-len(needle), -len(needle) + 1, -4, -2, -1, 0, 1})
        ):
            if len(needle) // (max_l_dist + 1) < 3:
                # no ngrams search, so skip long searches which will be slow
                if chunk_size > 2**10:
                    continue
            with self.subTest(
                needle=needle,
                haystack_match=haystack_match,
                max_l_dist=max_l_dist,
                chunk_size=chunk_size,
                delta=delta,
            ):
                # Zero-filled haystack with the match placed so it
                # straddles the chunk boundary at offset chunk_size + delta.
                haystack = bytearray(chunk_size + 100)
                haystack[chunk_size + delta:chunk_size + delta + len(haystack_match)] = haystack_match
                with open(filename, 'wb') as f:
                    f.write(haystack)
                # Binary mode, at the full chunk size and at half of it.
                with open(filename, 'rb') as f:
                    self.assertEqual(
                        find_near_matches_in_file(needle, f, max_l_dist=max_l_dist, _chunk_size=chunk_size),
                        [attr.evolve(match, start=match.start + chunk_size + delta, end=match.end + chunk_size + delta) for match in expected_matches]
                    )
                    f.seek(0)
                    self.assertEqual(
                        find_near_matches_in_file(needle, f, max_l_dist=max_l_dist, _chunk_size=chunk_size // 2),
                        [attr.evolve(match, start=match.start + chunk_size + delta, end=match.end + chunk_size + delta) for match in expected_matches]
                    )
                # Default text mode: needle stays bytes on PY2, decoded on PY3.
                with open(filename, 'r') as f:
                    _needle = needle if PY2 else needle.decode('utf-8')
                    self.assertEqual(
                        find_near_matches_in_file(_needle, f, max_l_dist=max_l_dist, _chunk_size=chunk_size),
                        [attr.evolve(match, start=match.start + chunk_size + delta, end=match.end + chunk_size + delta) for match in expected_matches]
                    )
                # Explicit-encoding text mode, again at both chunk sizes.
                with io.open(filename, 'r',
                             encoding='ascii') as f:
                    self.assertEqual(
                        find_near_matches_in_file(needle.decode('ascii'), f, max_l_dist=max_l_dist, _chunk_size=chunk_size),
                        [attr.evolve(match, start=match.start + chunk_size + delta, end=match.end + chunk_size + delta) for match in expected_matches]
                    )
                    f.seek(0)
                    self.assertEqual(
                        find_near_matches_in_file(needle.decode('ascii'), f, max_l_dist=max_l_dist, _chunk_size=chunk_size // 2),
                        [attr.evolve(match, start=match.start + chunk_size + delta, end=match.end + chunk_size + delta) for match in expected_matches]
                    )
def test_file_unicode(f):
    # One match with a single edit is expected for the unicode needle.
    found = find_near_matches_in_file(u(needle), f, max_l_dist=1)
    self.assertEqual(found, [Match(3, 9, 1)])