def test_memory_usage(self):
    import tracemalloc
    import inspect
    from corpus import InMemoryDocument, InMemoryCorpus
    from suffixarray import SuffixArray
    corpus = InMemoryCorpus()
    corpus.add_document(InMemoryDocument(0, {"a": "o o\n\n\no\n\no", "b": "o o\no \no"}))
    corpus.add_document(InMemoryDocument(1, {"a": "ba", "b": "b bab"}))
    corpus.add_document(InMemoryDocument(2, {"a": "o o O o", "b": "o o"}))
    corpus.add_document(InMemoryDocument(3, {"a": "oO" * 10000, "b": "o"}))
    corpus.add_document(InMemoryDocument(4, {"a": "cbab o obab O ", "b": "o o " * 10000}))
    tracemalloc.start()
    snapshot1 = tracemalloc.take_snapshot()
    engine = SuffixArray(corpus, ["a", "b"], self._normalizer, self._tokenizer)
    snapshot2 = tracemalloc.take_snapshot()
    tracemalloc.stop()
    for statistic in snapshot2.compare_to(snapshot1, "filename"):
        if statistic.traceback[0].filename == inspect.getfile(SuffixArray):
            self.assertLessEqual(statistic.size_diff, 2000000, "Memory usage seems excessive.")
def test_multiple_fields(self):
    from corpus import InMemoryDocument, InMemoryCorpus
    from suffixarray import SuffixArray
    corpus = InMemoryCorpus()
    corpus.add_document(InMemoryDocument(0, {"field1": "a b c", "field2": "b c d"}))
    corpus.add_document(InMemoryDocument(1, {"field1": "x", "field2": "y"}))
    corpus.add_document(InMemoryDocument(2, {"field1": "y", "field2": "z"}))
    engine0 = SuffixArray(corpus, ["field1", "field2"], self._normalizer, self._tokenizer)
    engine1 = SuffixArray(corpus, ["field1"], self._normalizer, self._tokenizer)
    engine2 = SuffixArray(corpus, ["field2"], self._normalizer, self._tokenizer)
    self._process_query_and_verify_winner(engine0, "b c", [0], 2)
    self._process_query_and_verify_winner(engine0, "y", [1, 2], 1)
    self._process_query_and_verify_winner(engine1, "x", [1], 1)
    self._process_query_and_verify_winner(engine1, "y", [2], 1)
    self._process_query_and_verify_winner(engine1, "z", [], None)
    self._process_query_and_verify_winner(engine2, "z", [2], 1)
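# The _process_query_and_verify_winner helper invoked above belongs to the test class
# and is not shown in this section. A hypothetical minimal version, assuming the
# suffix array's evaluate(query, options, callback) interface used elsewhere in these
# tests (the option names and match keys below are assumptions, not verified API):
def _process_query_and_verify_winner_sketch(self, engine, query, winners, score):
    # Collect the matches, then verify that the top match is one of the expected
    # winners with the expected score, or that nothing matched if winners is empty.
    matches = []
    engine.evaluate(query, {"hit_count": 5}, lambda m: matches.append(m))
    if winners:
        self.assertIn(matches[0]["document"].document_id, winners)
        self.assertEqual(matches[0]["score"], score)
    else:
        self.assertEqual(len(matches), 0)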
def assignment_a_inverted_index_1():
    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Dump postings for a dummy two-document corpus.
    print("INDEXING...")
    corpus = InMemoryCorpus()
    corpus.add_document(InMemoryDocument(0, {"body": "this is a Test"}))
    corpus.add_document(InMemoryDocument(1, {"body": "test TEST prØve"}))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    for (term, expected) in zip(index.get_terms("PRøvE wtf tesT"),
                                [[(1, 1)], [], [(0, 1), (1, 2)]]):
        print(term)
        assert term in ["prøve", "wtf", "test"]
        postings = list(index[term])
        for posting in postings:
            print(posting)
        assert len(postings) == len(expected)
        assert [(p.document_id, p.term_frequency) for p in postings] == expected
    print(index)

    # Document counts should be correct.
    assert index.get_document_frequency("wtf") == 0
    assert index.get_document_frequency("test") == 2
    assert index.get_document_frequency("prøve") == 1
def assignment_a():
    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Dump postings for a dummy two-document corpus.
    print("INDEXING...")
    corpus = InMemoryCorpus()
    corpus.add_document(InMemoryDocument(0, {"body": "this is a Test"}))
    corpus.add_document(InMemoryDocument(1, {"body": "test TEST prØve"}))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    for (term, expected) in zip(index.get_terms("PRøvE wtf tesT"),
                                [[(1, 1)], [], [(0, 1), (1, 2)]]):
        print(term)
        assert term in ["prøve", "wtf", "test"]
        postings = list(index.get_postings_iterator(term))
        for posting in postings:
            print(posting)
        assert len(postings) == len(expected)
        assert [(p.document_id, p.term_frequency) for p in postings] == expected
    print(index)

    # Again, for a slightly bigger corpus.
    print("LOADING...")
    corpus = InMemoryCorpus("data/mesh.txt")
    print("INDEXING...")
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    for (term, expected_length) in [("hydrogen", 8), ("hydrocephalus", 2)]:
        print(term)
        for posting in index.get_postings_iterator(term):
            print(posting)
        assert len(list(index.get_postings_iterator(term))) == expected_length

    # Test that we merge posting lists correctly. Note implicit test for case- and whitespace robustness.
    print("MERGING...")
    merger = PostingsMerger()
    and_query = ("HIV pROtein", "AND", [11316, 11319, 11320, 11321])
    or_query = ("water Toxic", "OR",
                [3078, 8138, 8635, 9379, 14472, 18572, 23234, 23985] +
                list(range(25265, 25282)))
    for (query, operator, expected_document_ids) in [and_query, or_query]:
        print(re.sub(r"\W+", " " + operator + " ", query))
        terms = list(index.get_terms(query))
        assert len(terms) == 2
        postings = [index.get_postings_iterator(term) for term in terms]
        merged = {"AND": merger.intersection, "OR": merger.union}[operator](postings[0], postings[1])
        documents = [corpus.get_document(posting.document_id) for posting in merged]
        print(*documents, sep="\n")
        assert len(documents) == len(expected_document_ids)
        assert [d.get_document_id() for d in documents] == expected_document_ids
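# The AND/OR merging exercised above operates on posting lists that are sorted by
# document identifier. For reference, the linear-time AND-merge can be sketched as
# below. This is a minimal illustration, not the repo's PostingsMerger: it assumes
# the two posting lists arrive as iterators sorted by document_id, with each posting
# exposing a document_id attribute, as in the tests above. The OR-merge is analogous,
# but also emits from whichever side advances alone.
def intersection_sketch(p1, p2):
    # Advance whichever iterator lags behind; emit a posting only when both
    # iterators sit on the same document identifier.
    current1 = next(p1, None)
    current2 = next(p2, None)
    while current1 is not None and current2 is not None:
        if current1.document_id == current2.document_id:
            yield current1
            current1 = next(p1, None)
            current2 = next(p2, None)
        elif current1.document_id < current2.document_id:
            current1 = next(p1, None)
        else:
            current2 = next(p2, None)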
def assignment_e_naivebayes_2():
    # Use these throughout below. These are really language-specific functions, so it's a huge
    # simplification to use these for a language identifier.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    # Callback for receiving results. Received scores are log-probabilities.
    def match_collector(match: dict):
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Replicate Example 13.1 on pages 241 and 242 in the textbook.
    china = InMemoryCorpus()
    china.add_document(InMemoryDocument(0, {"body": "Chinese Beijing Chinese"}))
    china.add_document(InMemoryDocument(1, {"body": "Chinese Chinese Shanghai"}))
    china.add_document(InMemoryDocument(2, {"body": "Chinese Macao"}))
    not_china = InMemoryCorpus()
    not_china.add_document(InMemoryDocument(0, {"body": "Tokyo Japan Chinese"}))
    training_set = {"china": china, "not china": not_china}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer, tokenizer)
    buffer = "Chinese Chinese Chinese Tokyo Japan"
    print(buffer)
    results.clear()
    classifier.classify(buffer, match_collector)
    assert len(results) == 2
    assert results[0]["category"] == "china"
    assert results[1]["category"] == "not china"
    assert math.isclose(math.exp(results[0]["score"]), 0.0003, abs_tol=0.00001)
    assert math.isclose(math.exp(results[1]["score"]), 0.0001, abs_tol=0.00005)
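# As a sanity check on the probabilities asserted above, Example 13.1 can be worked
# out by hand: multinomial Naive Bayes with add-one smoothing, priors 3/4 and 1/4,
# and a vocabulary of 6 terms. A minimal sketch of that arithmetic (the function name
# is illustrative, not part of the repo):
def example_13_1_by_hand():
    import math
    # P(Chinese | china) = (5 + 1) / (8 + 6) = 6/14; P(Tokyo | china) = P(Japan | china) = (0 + 1) / (8 + 6).
    p_china = (3 / 4) * (6 / 14) ** 3 * (1 / 14) * (1 / 14)
    # P(Chinese | not china) = (1 + 1) / (3 + 6) = 2/9; likewise for Tokyo and Japan.
    p_not_china = (1 / 4) * (2 / 9) ** 3 * (2 / 9) * (2 / 9)
    assert math.isclose(p_china, 0.0003, abs_tol=0.00001)
    assert math.isclose(p_not_china, 0.0001, abs_tol=0.00005)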
def test_access_documents(self):
    from corpus import InMemoryDocument, InMemoryCorpus
    corpus = InMemoryCorpus()
    corpus.add_document(InMemoryDocument(0, {"body": "this is a Test"}))
    corpus.add_document(InMemoryDocument(1, {"title": "prØve", "body": "en to tre"}))
    self.assertEqual(corpus.size(), 2)
    self.assertListEqual([d.document_id for d in corpus], [0, 1])
    self.assertListEqual([corpus[i].document_id for i in range(0, corpus.size())], [0, 1])
    self.assertListEqual([corpus.get_document(i).document_id for i in range(0, corpus.size())], [0, 1])
def setUp(self):
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryDocument, InMemoryCorpus
    from invertedindex import InMemoryInvertedIndex
    from ranking import BetterRanker
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus()
    corpus.add_document(InMemoryDocument(0, {"title": "the foo", "static_quality_score": 0.9}))
    corpus.add_document(InMemoryDocument(1, {"title": "the foo", "static_quality_score": 0.2}))
    corpus.add_document(InMemoryDocument(2, {"title": "the foo foo", "static_quality_score": 0.2}))
    corpus.add_document(InMemoryDocument(3, {"title": "the bar"}))
    corpus.add_document(InMemoryDocument(4, {"title": "the bar bar"}))
    corpus.add_document(InMemoryDocument(5, {"title": "the baz"}))
    corpus.add_document(InMemoryDocument(6, {"title": "the baz"}))
    corpus.add_document(InMemoryDocument(7, {"title": "the baz baz"}))
    index = InMemoryInvertedIndex(corpus, ["title"], normalizer, tokenizer)
    self._ranker = BetterRanker(corpus, index)
def test_access_postings(self):
    from corpus import InMemoryDocument, InMemoryCorpus
    from invertedindex import InMemoryInvertedIndex
    corpus = InMemoryCorpus()
    corpus.add_document(InMemoryDocument(0, {"body": "this is a Test"}))
    corpus.add_document(InMemoryDocument(1, {"body": "test TEST prØve"}))
    index = InMemoryInvertedIndex(corpus, ["body"], self._normalizer, self._tokenizer)
    self.assertListEqual(list(index.get_terms("PRøvE wtf tesT")), ["prøve", "wtf", "test"])
    self.assertListEqual([(p.document_id, p.term_frequency) for p in index["prøve"]], [(1, 1)])
    self.assertListEqual([(p.document_id, p.term_frequency) for p in index.get_postings_iterator("wtf")], [])
    self.assertListEqual([(p.document_id, p.term_frequency) for p in index["test"]], [(0, 1), (1, 2)])
    self.assertEqual(index.get_document_frequency("wtf"), 0)
    self.assertEqual(index.get_document_frequency("prøve"), 1)
    self.assertEqual(index.get_document_frequency("test"), 2)
def test_china_example_from_textbook(self):
    import math
    from corpus import InMemoryDocument, InMemoryCorpus
    from naivebayesclassifier import NaiveBayesClassifier
    china = InMemoryCorpus()
    china.add_document(InMemoryDocument(0, {"body": "Chinese Beijing Chinese"}))
    china.add_document(InMemoryDocument(1, {"body": "Chinese Chinese Shanghai"}))
    china.add_document(InMemoryDocument(2, {"body": "Chinese Macao"}))
    not_china = InMemoryCorpus()
    not_china.add_document(InMemoryDocument(0, {"body": "Tokyo Japan Chinese"}))
    training_set = {"china": china, "not china": not_china}
    classifier = NaiveBayesClassifier(training_set, ["body"], self._normalizer, self._tokenizer)
    results = []
    classifier.classify("Chinese Chinese Chinese Tokyo Japan", lambda m: results.append(m))
    self.assertEqual(len(results), 2)
    self.assertEqual(results[0]["category"], "china")
    self.assertAlmostEqual(math.exp(results[0]["score"]), 0.0003, 4)
    self.assertEqual(results[1]["category"], "not china")
    self.assertAlmostEqual(math.exp(results[1]["score"]), 0.0001, 4)
def test_multiple_fields(self):
    from corpus import InMemoryDocument, InMemoryCorpus
    from invertedindex import InMemoryInvertedIndex
    document = InMemoryDocument(0, {
        'felt1': 'Dette er en test. Test, sa jeg. TEST!',
        'felt2': 'test er det',
        'felt3': 'test TEsT',
    })
    corpus = InMemoryCorpus()
    corpus.add_document(document)
    index = InMemoryInvertedIndex(corpus, ['felt1', 'felt3'], self._normalizer, self._tokenizer)
    posting = next(index.get_postings_iterator('test'))
    self.assertEqual(posting.document_id, 0)
    self.assertEqual(posting.term_frequency, 5)
def assignment_a_inverted_index_3():
    # Tests that multiple fields are handled correctly.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    doc = InMemoryDocument(document_id=0, fields={
        'felt 1': 'Dette er en test. Test, sa jeg. TEST!',
        'felt 2': 'test er det',
        'felt 3': 'test TEsT',
    })
    corpus = InMemoryCorpus()
    corpus.add_document(doc)
    index = InMemoryInvertedIndex(corpus, ['felt 1', 'felt 3'], normalizer, tokenizer)
    p = next(index.get_postings_iterator('test'))
    print(f"term-freq: {p.term_frequency} (correct is 5)")
    assert p.document_id == 0
    assert p.term_frequency == 5
def test_synthetic_corpus(self):
    from itertools import product, combinations_with_replacement
    from corpus import InMemoryDocument, InMemoryCorpus
    from invertedindex import InMemoryInvertedIndex
    from searchengine import SimpleSearchEngine
    corpus = InMemoryCorpus()
    words = ("".join(term) for term in product("bcd", "aei", "jkl"))
    texts = (" ".join(word) for word in combinations_with_replacement(words, 3))
    for text in texts:
        corpus.add_document(InMemoryDocument(corpus.size(), {"a": text}))
    engine = SimpleSearchEngine(corpus, InMemoryInvertedIndex(corpus, ["a"], self._normalizer, self._tokenizer))
    epsilon = 0.0001
    self._process_query_verify_matches("baj BAJ baj", engine,
                                       {"match_threshold": 1.0, "hit_count": 27},
                                       (27, 9.0, [0]))
    self._process_query_verify_matches("baj caj", engine,
                                       {"match_threshold": 1.0, "hit_count": 100},
                                       (27, None, None))
    self._process_query_verify_matches("baj caj daj", engine,
                                       {"match_threshold": 2 / 3 + epsilon, "hit_count": 100},
                                       (79, None, None))
    self._process_query_verify_matches("baj caj", engine,
                                       {"match_threshold": 2 / 3 + epsilon, "hit_count": 100},
                                       (100, 3.0, [0, 9, 207, 2514]))
    self._process_query_verify_matches("baj cek dil", engine,
                                       {"match_threshold": 1.0, "hit_count": 10},
                                       (1, 3.0, [286]))
    self._process_query_verify_matches("baj cek dil", engine,
                                       {"match_threshold": 1.0, "hit_count": 10},
                                       (1, None, None))
    self._process_query_verify_matches("baj cek dil", engine,
                                       {"match_threshold": 2 / 3 + epsilon, "hit_count": 80},
                                       (79, 3.0, [13, 26, 273, 286, 377, 3107, 3198]))
    self._process_query_verify_matches("baj xxx yyy", engine,
                                       {"match_threshold": 2 / 3 + epsilon, "hit_count": 100},
                                       (0, None, None))
    self._process_query_verify_matches("baj xxx yyy", engine,
                                       {"match_threshold": 2 / 3 - epsilon, "hit_count": 100},
                                       (100, None, None))
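# A quick aside on the synthetic corpus built above: product("bcd", "aei", "jkl")
# yields 3 * 3 * 3 = 27 distinct words, and picking 3 of them with repetition,
# order-insensitively, gives C(27 + 3 - 1, 3) = C(29, 3) = 3654 documents. That is
# why document identifiers as high as 3198 appear in the expected results. A minimal
# check of that count (illustrative only, not part of the test suite):
def synthetic_corpus_size_check():
    from itertools import product, combinations_with_replacement
    from math import comb
    words = ["".join(term) for term in product("bcd", "aei", "jkl")]
    assert len(words) == 27
    assert sum(1 for _ in combinations_with_replacement(words, 3)) == comb(29, 3) == 3654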
def assignment_c_simplesearchengine_2():
    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    ranker = BrainDeadRanker()

    # Used for comparing floating-point numbers.
    epsilon = 0.0001

    # Create a dummy test corpus.
    corpus = InMemoryCorpus()
    words = (''.join(term) for term in product("bcd", "aei", "jkl"))
    texts = (' '.join(word) for word in combinations_with_replacement(words, 3))
    for text in texts:
        corpus.add_document(InMemoryDocument(corpus.size(), {'a': text}))

    # What we're testing.
    engine = SimpleSearchEngine(corpus, InMemoryInvertedIndex(corpus, ["a"], normalizer, tokenizer))

    # Where the callback will collect the matches.
    results = []

    # Callback that collects matches.
    def collect(m):
        results.append((m['score'], m['document'].document_id))

    # Executes a query.
    def search(q, t, n):
        results.clear()
        engine.evaluate(q, {'match_threshold': t, 'hit_count': n}, ranker, collect)

    # Sorts the collected matches.
    def sort_results():
        results.sort(key=lambda e: e[1])
        results.sort(key=lambda e: e[0], reverse=True)

    # Test predicate.
    def check_at(i, expected):
        assert results[i] == expected

    # Test predicate.
    def check_range(indices, score, document_ids):
        for i, d in zip(indices, document_ids):
            check_at(i, (score, d))

    # Test predicate.
    def check_hits(n):
        assert len(results) == n

    # Run tests!
    search('baj BAJ baj', 1.0, 27)
    check_hits(27)
    check_at(0, (9.0, 0))
    sort_results()
    check_range(range(1, 27), 6.0, range(1, 27))
    search('baj caj', 1.0, 100)
    check_hits(27)
    search('baj caj daj', 2 / 3 + epsilon, 100)
    check_hits(79)
    search('baj caj', 2 / 3 + epsilon, 100)
    check_hits(100)
    sort_results()
    check_at(0, (3.0, 0))
    check_range(range(4, 12), 2.0, range(1, 9))
    check_range(range(12, 29), 2.0, range(10, 27))
    check_at(29, (2.0, 35))
    check_at(78, (2.0, 2531))
    search('baj cek dil', 1.0, 10)
    check_hits(1)
    check_at(0, (3.0, 286))
    search('baj cek dil', 2 / 3 + epsilon, 80)
    check_hits(79)
    sort_results()
    check_at(0, (3.0, 13))
    check_at(1, (3.0, 26))
    check_at(2, (3.0, 273))
    search('baj xxx yyy', 2 / 3 + epsilon, 100)
    check_hits(0)
    search('baj xxx yyy', 2 / 3 - epsilon, 100)
    check_hits(100)
def assignment_e():
    # Use these throughout below. These are really language-specific functions, so it's a huge
    # simplification to use these for a language identifier.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    # Callback for receiving results. Received scores are log-probabilities.
    def match_collector(match: dict):
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Use this as the training set for our language identifier.
    print("LOADING...")
    training_set = {language: InMemoryCorpus("data/" + language + ".txt")
                    for language in ["en", "no", "da", "de"]}

    # Assess probabilities from the training set.
    print("TRAINING...")
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer, tokenizer)

    # Classify some previously unseen text fragments.
    print("CLASSIFYING...")
    for (buffer, language) in [
            ("Mon tro om det riktige språket identifiseres? Dette er norsk bokmål, forøvrig.", "no"),
            ("I don't believe that the number of tokens exceeds a billion.", "en"),
            ("De danske drenge drikker snaps!", "da"),
            ("Der Kriminalpolizei! Haben sie angst?", "de")]:
        print(buffer)
        results.clear()
        classifier.classify(buffer, match_collector)
        assert results[0]["category"] == language

    # For demonstration purposes, replicate Example 13.1 on pages 241 and 242 in the textbook.
    china = InMemoryCorpus()
    china.add_document(InMemoryDocument(0, {"body": "Chinese Beijing Chinese"}))
    china.add_document(InMemoryDocument(1, {"body": "Chinese Chinese Shanghai"}))
    china.add_document(InMemoryDocument(2, {"body": "Chinese Macao"}))
    not_china = InMemoryCorpus()
    not_china.add_document(InMemoryDocument(0, {"body": "Tokyo Japan Chinese"}))
    training_set = {"china": china, "not china": not_china}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer, tokenizer)
    buffer = "Chinese Chinese Chinese Tokyo Japan"
    print(buffer)
    results.clear()
    classifier.classify(buffer, match_collector)
    assert len(results) == 2
    assert results[0]["category"] == "china"
    assert results[1]["category"] == "not china"
    assert math.isclose(math.exp(results[0]["score"]), 0.0003, abs_tol=0.00001)
    assert math.isclose(math.exp(results[1]["score"]), 0.0001, abs_tol=0.00005)
def test_simple_search_engine():
    from itertools import product, combinations_with_replacement
    from tokenization import BrainDeadTokenizer
    from normalization import BrainDeadNormalizer
    from corpus import InMemoryCorpus, InMemoryDocument
    from invertedindex import InMemoryInvertedIndex
    from searchengine import SimpleSearchEngine
    from ranking import BrainDeadRanker

    Ɛ = 0.0001
    corpus = InMemoryCorpus()
    for txt in (' '.join(w) for w in combinations_with_replacement(
            list(''.join(t) for t in product('bcd', 'aei', 'jkl')), 3)):
        corpus.add_document(InMemoryDocument(corpus.size(), {'a': txt}))
    engine = SimpleSearchEngine(
        corpus, InMemoryInvertedIndex(corpus, ('a',), BrainDeadNormalizer(), BrainDeadTokenizer()))
    results = []

    def search(q, r, n):
        results.clear()

        def match(m):
            results.append((m['score'], m['document'].document_id))

        print('searching "' + q + '" at threshold', r, '…')
        engine.evaluate(q, {'recall_threshold': r, 'hit_count': n}, BrainDeadRanker(), match)

    def sort_results():
        results.sort(key=lambda e: e[1])
        results.sort(key=lambda e: e[0], reverse=True)

    def check_at(i, expected):
        if results[i] != expected:
            print('FAILED, expected', expected, 'at result', i, ', was', results[i])

    def check_range(indices, score, docrange):
        for i, d in zip(indices, docrange):
            check_at(i, (score, d))

    def check_hits(n):
        if len(results) != n:
            print('FAILED, expected', n, 'results, got', len(results))

    search('baj BAJ baj', 1.0, 27)
    check_hits(27)
    check_at(0, (9.0, 0))
    sort_results()
    check_range(range(1, 27), 6.0, range(1, 27))
    search('baj CAj', 1.0, 100)
    check_hits(27)
    search('baj caj daj', 2 / 3 + Ɛ, 100)
    check_hits(79)
    search('baj caj', 2 / 3 + Ɛ, 100)
    check_hits(100)
    sort_results()
    check_at(0, (3.0, 0))
    check_range(range(4, 12), 2.0, range(1, 9))
    check_range(range(12, 29), 2.0, range(10, 27))
    check_at(29, (2.0, 35))
    check_at(78, (2.0, 2531))
    search('baj cek dil', 1.0, 10)
    check_hits(1)
    check_at(0, (3.0, 286))
    search('baj cek dil', 2 / 3 + Ɛ, 80)
    check_hits(79)
    sort_results()
    check_at(0, (3.0, 13))
    check_at(1, (3.0, 26))
    check_at(2, (3.0, 273))
    search('baj xxx yyy', 2 / 3 + Ɛ, 100)
    check_hits(0)
    search('baj xxx yyy', 2 / 3 - Ɛ, 100)
    check_hits(100)
def get_document(self, document_id: int) -> Document:
    return InMemoryDocument(document_id, {'body': inverted_index.get_doc_body(document_id)})