def test_multiple_fields(self):
    from corpus import InMemoryDocument, InMemoryCorpus
    from suffixarray import SuffixArray
    corpus = InMemoryCorpus()
    corpus.add_document(InMemoryDocument(0, {"field1": "a b c", "field2": "b c d"}))
    corpus.add_document(InMemoryDocument(1, {"field1": "x", "field2": "y"}))
    corpus.add_document(InMemoryDocument(2, {"field1": "y", "field2": "z"}))
    engine0 = SuffixArray(corpus, ["field1", "field2"], self._normalizer, self._tokenizer)
    engine1 = SuffixArray(corpus, ["field1"], self._normalizer, self._tokenizer)
    engine2 = SuffixArray(corpus, ["field2"], self._normalizer, self._tokenizer)
    self._process_query_and_verify_winner(engine0, "b c", [0], 2)
    self._process_query_and_verify_winner(engine0, "y", [1, 2], 1)
    self._process_query_and_verify_winner(engine1, "x", [1], 1)
    self._process_query_and_verify_winner(engine1, "y", [2], 1)
    self._process_query_and_verify_winner(engine1, "z", [], None)
    self._process_query_and_verify_winner(engine2, "z", [2], 1)
def populateSuffixArray(keys):
    """
    Takes a list of keys and initializes and populates a suffixarray.SuffixArray object.
    """
    suffix_array = SuffixArray()
    for key in keys:
        suffix_array.insert(key)
    return suffix_array
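# Hedged usage example for populateSuffixArray above. The keys are made-up strings, and the
# only assumptions are the ones the helper itself makes: a no-argument SuffixArray()
# constructor and an insert() method.
def populate_suffix_array_example():
    keys = ["banana", "bandana", "bread"]  # Hypothetical input; any list of strings works.
    return populateSuffixArray(keys)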
def test_memory_usage(self):
    import tracemalloc
    import inspect
    from corpus import InMemoryDocument, InMemoryCorpus
    from suffixarray import SuffixArray
    corpus = InMemoryCorpus()
    corpus.add_document(InMemoryDocument(0, {"a": "o o\n\n\no\n\no", "b": "o o\no \no"}))
    corpus.add_document(InMemoryDocument(1, {"a": "ba", "b": "b bab"}))
    corpus.add_document(InMemoryDocument(2, {"a": "o o O o", "b": "o o"}))
    corpus.add_document(InMemoryDocument(3, {"a": "oO" * 10000, "b": "o"}))
    corpus.add_document(InMemoryDocument(4, {"a": "cbab o obab O ", "b": "o o " * 10000}))
    tracemalloc.start()
    snapshot1 = tracemalloc.take_snapshot()
    engine = SuffixArray(corpus, ["a", "b"], self._normalizer, self._tokenizer)
    snapshot2 = tracemalloc.take_snapshot()
    tracemalloc.stop()
    for statistic in snapshot2.compare_to(snapshot1, "filename"):
        if statistic.traceback[0].filename == inspect.getfile(SuffixArray):
            self.assertLessEqual(statistic.size_diff, 2000000, "Memory usage seems excessive.")
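# A hedged alternative for attributing allocations to the SuffixArray module, using only
# standard-library tracemalloc calls (Filter, Snapshot.filter_traces, Snapshot.statistics):
# take a single snapshot after construction and keep just the allocations whose innermost
# frame lives in the SuffixArray source file. The helper name and signature are illustrative.
def measure_suffix_array_memory(corpus, fields, normalizer, tokenizer):
    import tracemalloc
    import inspect
    from suffixarray import SuffixArray
    tracemalloc.start()
    engine = SuffixArray(corpus, fields, normalizer, tokenizer)
    snapshot = tracemalloc.take_snapshot()
    tracemalloc.stop()
    # Keep only allocations whose most recent frame originates in the SuffixArray module.
    snapshot = snapshot.filter_traces([tracemalloc.Filter(True, inspect.getfile(SuffixArray))])
    allocated = sum(statistic.size for statistic in snapshot.statistics("filename"))
    return engine, allocated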
def test_cran_corpus(self):
    import os.path
    from corpus import InMemoryCorpus
    from suffixarray import SuffixArray
    corpus = InMemoryCorpus(os.path.join(data_path, 'cran.xml'))
    engine = SuffixArray(corpus, ["body"], self._normalizer, self._tokenizer)
    self._process_query_and_verify_winner(engine, "visc", [328], 11)
    self._process_query_and_verify_winner(engine, "Of A", [946], 10)
    self._process_query_and_verify_winner(engine, "", [], None)
    self._process_query_and_verify_winner(engine, "approximate solution", [159, 1374], 3)
def assignment_b_suffixarray_1():
    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Prepare for some suffix array lookups.
    print("LOADING...")
    corpus = InMemoryCorpus("data/cran.xml")
    print("INDEXING...")
    engine = SuffixArray(corpus, ["body"], normalizer, tokenizer)
    results = []
    hit_count = 5

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Define the actual test queries.
    test1 = ("visc", 11, [328])                       # Look for {'viscous', 'viscosity', ...}.
    test2 = ("Of A", 10, [946])                       # Test robustness for case and whitespace.
    test3 = ("", 0, [])                               # Safety feature: Match nothing instead of everything.
    test4 = ("approximate solution", 3, [1374, 159])  # Multiple winners.

    # Test that the simple occurrence ranking works. Be robust towards how ties are resolved.
    for (query, winner_score, winner_document_ids) in [test1, test2, test3, test4]:
        print("SEARCHING for '" + query + "'...")
        results.clear()
        engine.evaluate(query, {"debug": False, "hit_count": hit_count}, match_collector)
        if winner_document_ids:
            assert results[0]["score"] == winner_score
            assert results[0]["document"].document_id in winner_document_ids
            assert len(results) <= hit_count
        else:
            assert len(results) == 0
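# The SuffixArray internals are left open by the assignment; this is a minimal, unoptimized
# sketch of one way to support the prefix phrase lookups exercised above, not the course's
# reference implementation. Each document's text (in the real engine, the concatenation of its
# searchable fields) is normalized into one buffer, every token start is recorded as a
# (document_id, offset) pair, the pairs are sorted by the suffix they point into, and a query
# is answered by binary-searching for the block of suffixes that begin with the normalized
# query. Scores are plain occurrence counts, as in the tests above. The class name and the
# plain-callable normalize/tokenize parameters are illustrative assumptions.
from bisect import bisect_left, bisect_right  # bisect with key= needs Python 3.10+.
from collections import Counter


class SuffixArraySketch:

    def __init__(self, documents, normalize, tokenize):
        # documents: {document_id: text}. normalize/tokenize are plain callables here,
        # unlike the Normalizer/Tokenizer objects the real engine receives.
        self._normalize = normalize
        self._tokenize = tokenize
        self._buffers = {}   # document_id -> normalized buffer
        self._suffixes = []  # (document_id, offset) pairs into the buffers
        for document_id, text in documents.items():
            tokens = [normalize(token) for token in tokenize(text)]
            self._buffers[document_id] = " ".join(tokens)
            offset = 0
            for token in tokens:
                self._suffixes.append((document_id, offset))
                offset += len(token) + 1
        # Sort the (document_id, offset) pairs by the suffix text they point at.
        self._suffixes.sort(key=lambda pair: self._buffers[pair[0]][pair[1]:])

    def evaluate(self, query, hit_count, callback):
        needle = " ".join(self._normalize(token) for token in self._tokenize(query))
        if not needle:
            return  # Empty queries match nothing, mirroring test3 above.

        # Compare the needle against each suffix truncated to the needle's length, so the
        # matching block is exactly the suffixes that start with the needle.
        def prefix_of(pair):
            return self._buffers[pair[0]][pair[1]:pair[1] + len(needle)]

        lo = bisect_left(self._suffixes, needle, key=prefix_of)
        hi = bisect_right(self._suffixes, needle, key=prefix_of)
        counts = Counter(document_id for (document_id, _) in self._suffixes[lo:hi])
        for document_id, score in counts.most_common(hit_count):
            callback({"score": score, "document_id": document_id})


# Example: both documents contain "b c" once, so both are reported with score 1.
#   engine = SuffixArraySketch({0: "a b c", 1: "b c d"}, str.lower, str.split)
#   engine.evaluate("B c", 5, print)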
def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from suffixarray import SuffixArray
    print("Building suffix array from Cranfield corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus(os.path.join(data_path, 'cran.xml'))
    engine = SuffixArray(corpus, ["body"], normalizer, tokenizer)
    options = {"debug": False, "hit_count": 5}
    print("Enter a prefix phrase query and find matching documents.")
    print(f"Lookup options are {options}.")
    print("Returned scores are occurrence counts.")

    def evaluator(query):
        matches = []
        engine.evaluate(query, options, lambda m: matches.append(m))
        return matches

    simple_repl("query", evaluator)
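# simple_repl is called above but not defined or imported in this snippet; presumably it is a
# small read-eval-print helper provided elsewhere in the repo. The stand-in below is purely
# hypothetical and only mirrors the call shape used above: a prompt label plus a callable that
# maps a query string to a printable result.
def simple_repl_sketch(label, evaluator):
    import pprint
    while True:
        try:
            query = input(f"Enter a {label} (or press CTRL-D to quit): ")
        except EOFError:
            print()
            break
        pprint.pprint(evaluator(query))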
def assignment_b_suffixarray_2():

    # For testing.
    class TestNormalizer(Normalizer):
        _table = str.maketrans({'Ø': 'O'})

        def canonicalize(self, buffer: str) -> str:
            return buffer

        def normalize(self, token: str) -> str:
            return token.upper().translate(self._table)

    # For testing.
    class TestDocument(Document):
        def __init__(self, document_id: int, a: str, b: str):
            self._document_id = document_id
            self._a = a
            self._b = b

        def get_document_id(self) -> int:
            return self._document_id

        def get_field(self, field_name: str, default: str) -> str:
            if field_name == "a":
                return self._a
            if field_name == "b":
                return self._b
            return default

    # For testing.
    class TestCorpus(Corpus):
        def __init__(self):
            self._docs = []
            self._docs.append(TestDocument(len(self._docs), "ø o\n\n\nø\n\no", "ø o\nø \no"))
            self._docs.append(TestDocument(len(self._docs), "ba", "b bab"))
            self._docs.append(TestDocument(len(self._docs), "ø o Ø o", "ø o"))
            self._docs.append(TestDocument(len(self._docs), "øO" * 10000, "o"))
            self._docs.append(TestDocument(len(self._docs), "cbab o øbab Ø ", "ø o " * 10000))

        def __iter__(self):
            return iter(self._docs)

        def size(self) -> int:
            return len(self._docs)

        def get_document(self, document_id: int) -> Document:
            return self._docs[document_id]

    # Run the tests!
    for fields in [("b",), ("a", "b")]:

        # Create the suffix array over the given set of fields. Measure memory usage. If memory usage is
        # excessive, most likely the implementation is copying strings or doing other silly stuff instead
        # of working with buffer indices. The naive reference implementation is not in any way optimized,
        # and uses about 1.5 MB of memory on this corpus.
        tracemalloc.start()
        snapshot1 = tracemalloc.take_snapshot()
        engine = SuffixArray(TestCorpus(), fields, TestNormalizer(), BrainDeadTokenizer())
        snapshot2 = tracemalloc.take_snapshot()
        for statistic in snapshot2.compare_to(snapshot1, "filename"):
            if statistic.traceback[0].filename == inspect.getfile(SuffixArray):
                assert statistic.size_diff < 2000000, f"Memory usage is {statistic.size_diff}"
        tracemalloc.stop()

        results = []

        def process(m):
            results.append((m['document'].document_id, m['score']))

        expected_results = {
            ('b',): (
                ('bab', [(1, 1)]),
                ('ø o', [(4, 19999), (0, 3), (2, 1)]),
                ('o O', [(4, 19999), (0, 3), (2, 1)]),
                ('oooooo', []),
                ('o o o o', [(4, 19997), (0, 1)]),
            ),
            ('a', 'b'): (
                ('bab', [(1, 1)]),
                ('ø o', [(4, 20000), (0, 6), (2, 4)]),
                ('o O', [(4, 20000), (0, 6), (2, 4)]),
                ('oøØOøO', [(3, 1)]),
                ('o o o o', [(4, 19997), (0, 2), (2, 1)]),
            )
        }
        for query, expected in expected_results[fields]:
            results.clear()
            engine.evaluate(query, {'hit_count': 10}, process)
            assert results == expected
def assignment_b():
    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Prepare for some suffix array lookups.
    print("LOADING...")
    corpus = InMemoryCorpus("data/cran.xml")
    print("INDEXING...")
    engine = SuffixArray(corpus, ["body"], normalizer, tokenizer)
    results = []
    hit_count = 5

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Define the actual test queries.
    test1 = ("visc", 11, [328])                       # Look for {'viscous', 'viscosity', ...}.
    test2 = ("Of A", 10, [946])                       # Test robustness for case and whitespace.
    test3 = ("", 0, [])                               # Safety feature: Match nothing instead of everything.
    test4 = ("approximate solution", 3, [1374, 159])  # Multiple winners.

    # Test that the simple occurrence ranking works. Be robust towards how ties are resolved.
    for (query, winner_score, winner_document_ids) in [test1, test2, test3, test4]:
        print("SEARCHING for '" + query + "'...")
        results.clear()
        engine.evaluate(query, {"debug": False, "hit_count": hit_count}, match_collector)
        if winner_document_ids:
            assert results[0]["score"] == winner_score
            assert results[0]["document"].document_id in winner_document_ids
            assert len(results) <= hit_count
        else:
            assert len(results) == 0

    # Simple test of using a trie-encoded dictionary for efficiently locating substrings in a buffer.
    trie = Trie()
    for s in ["romerike", "apple computer", "norsk", "norsk ørret", "sverige", "ørret", "banan"]:
        trie.add(s, tokenizer)
    finder = StringFinder(trie, tokenizer)
    buffer = "det var en gang en norsk ørret fra romerike som likte abba fra sverige"
    print("SCANNING...")
    results.clear()
    finder.scan(buffer, lambda m: results.append(m))
    print("Buffer \"" + buffer + "\" contains", results)
    assert [m["match"] for m in results] == ["norsk", "norsk ørret", "ørret", "romerike", "sverige"]

    # Find all MeSH terms that occur verbatim in some selected Cranfield documents! Since MeSH
    # documents are medical terms and the Cranfield documents have technical content, the
    # overlap probably isn't that big.
    print("LOADING...")
    mesh = InMemoryCorpus("data/mesh.txt")
    cranfield = InMemoryCorpus("data/cran.xml")
    print("BUILDING...")
    trie = Trie()
    for d in mesh:
        trie.add(d["body"] or "", tokenizer)
    finder = StringFinder(trie, tokenizer)
    print("SCANNING...")
    for (document_id, expected_matches) in [(0, ["wing", "wing"]),
                                            (3, ["solutions", "skin", "friction"]),
                                            (1254, ["electrons", "ions"])]:
        document = cranfield[document_id]
        buffer = document["body"] or ""
        results.clear()
        finder.scan(buffer, lambda m: results.append(m))
        print("Cranfield document", document, "contains MeSH terms", results)
        assert [m["match"] for m in results] == expected_matches
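# Trie and StringFinder are imported from elsewhere in the repo; the sketch below only
# illustrates the underlying technique and is not the repo's implementation. The dictionary
# phrases are stored in a token-level trie, and scanning a buffer means trying to walk the
# trie from every token position, emitting a match whenever a final node is reached. The
# class/function names, the str.split tokenizer stand-in, and the dict payload passed to the
# callback are illustrative assumptions.
class TrieSketch:

    def __init__(self):
        self._children = {}
        self._final = False

    def add(self, phrase, tokenize):
        node = self
        for token in tokenize(phrase):
            node = node._children.setdefault(token, TrieSketch())
        node._final = True


def scan_sketch(trie, buffer, tokenize, callback):
    tokens = list(tokenize(buffer))
    for start in range(len(tokens)):
        node = trie
        matched = []
        for token in tokens[start:]:
            node = node._children.get(token)
            if node is None:
                break
            matched.append(token)
            if node._final:
                callback({"match": " ".join(matched)})


# Example, mirroring part of the scan above:
#   trie = TrieSketch()
#   for phrase in ["norsk", "norsk ørret", "ørret"]:
#       trie.add(phrase, str.split)
#   scan_sketch(trie, "en norsk ørret", str.split, print)
# emits {'match': 'norsk'}, {'match': 'norsk ørret'}, {'match': 'ørret'}.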