class TestBrainDeadRanker(unittest.TestCase):

    def setUp(self):
        from ranking import BrainDeadRanker
        self._ranker = BrainDeadRanker()

    def test_term_frequency(self):
        from invertedindex import Posting
        self._ranker.reset(21)
        self._ranker.update("foo", 2, Posting(21, 4))
        self._ranker.update("bar", 1, Posting(21, 3))
        self.assertEqual(self._ranker.evaluate(), 11)
        self._ranker.reset(42)
        self._ranker.update("foo", 1, Posting(42, 1))
        self._ranker.update("baz", 2, Posting(42, 2))
        self.assertEqual(self._ranker.evaluate(), 5)
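# A minimal sketch consistent with the test above, assuming the semantics it
# implies: the score is the sum over query terms of the term's multiplicity in
# the query times its frequency in the document (2 * 4 + 1 * 3 == 11 for
# document 21). The names below are illustrative, not the course's actual code.
from dataclasses import dataclass


@dataclass
class SketchPosting:
    document_id: int
    term_frequency: int


class SketchRanker:

    def __init__(self):
        self._score = 0.0

    def reset(self, document_id: int) -> None:
        # Begin scoring a new document.
        self._score = 0.0

    def update(self, term: str, multiplicity: int, posting: SketchPosting) -> None:
        # Accumulate multiplicity-weighted term frequency.
        self._score += multiplicity * posting.term_frequency

    def evaluate(self) -> float:
        return self._score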
def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import ShingleGenerator
    from corpus import InMemoryCorpus
    from invertedindex import InMemoryInvertedIndex
    from ranking import BrainDeadRanker
    from searchengine import SimpleSearchEngine
    print("Indexing MeSH corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = ShingleGenerator(3)
    corpus = InMemoryCorpus(os.path.join(data_path, 'mesh.txt'))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    ranker = BrainDeadRanker()
    engine = SimpleSearchEngine(corpus, index)
    options = {"debug": False, "hit_count": 5, "match_threshold": 0.5}
    print("Enter a query and find matching documents.")
    print(f"Lookup options are {options}.")
    print(f"Tokenizer is {tokenizer.__class__.__name__}.")
    print(f"Ranker is {ranker.__class__.__name__}.")

    def evaluator(query):
        matches = []
        engine.evaluate(query, options, ranker, lambda m: matches.append(m))
        return matches

    simple_repl("query", evaluator)
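# simple_repl() is referenced above but not defined in this file. A plausible
# sketch, assuming it simply loops over user input until interrupted and
# pretty-prints whatever the supplied evaluator returns (the real helper may
# well differ):
def sketch_simple_repl(prompt, evaluator):
    from pprint import pprint
    while True:
        try:
            query = input(f"{prompt}> ").strip()
        except (EOFError, KeyboardInterrupt):
            print()
            break
        if query:
            pprint(evaluator(query))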
def search(q, r, n):
    # Searches for query q at recall threshold r, keeping at most n hits.
    # Assumes that results and engine are defined in an enclosing scope.
    results.clear()

    def match(m):
        results.append((m['score'], m['document'].document_id))

    print('searching "' + q + '" at threshold', r, '…')
    engine.evaluate(q, {'recall_threshold': r, 'hit_count': n}, BrainDeadRanker(), match)
def _process_two_term_query_verify_matches(self, query, engine, options, expected):
    from ranking import BrainDeadRanker
    ranker = BrainDeadRanker()
    matches = []
    hits, winners = expected
    engine.evaluate(query, options, ranker,
                    lambda m: matches.append((m["score"], m["document"].document_id)))
    self.assertEqual(len(matches), hits)
    for (score, winner) in matches[:len(winners)]:
        self.assertEqual(score, 2.0)
        self.assertIn(winner, winners)
    for (score, contender) in matches[len(winners):]:
        self.assertEqual(score, 1.0)
def _process_query_verify_matches(self, query, engine, options, expected):
    from itertools import takewhile
    from ranking import BrainDeadRanker
    ranker = BrainDeadRanker()
    matches = []
    hits, score, winners = expected
    engine.evaluate(query, options, ranker,
                    lambda m: matches.append((m["score"], m["document"].document_id)))
    self.assertEqual(len(matches), hits)
    if matches:
        for i in range(1, hits):
            self.assertGreaterEqual(matches[i - 1][0], matches[i][0])
        if score:
            self.assertEqual(matches[0][0], score)
        if winners:
            top = takewhile(lambda m: m[0] == matches[0][0], matches)
            self.assertListEqual(winners, sorted([m[1] for m in top]))
def assignment_d_betterranker():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []
    hit_count = 10

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Load and index some English news sentences. Look at the output and compare the two rankers!
    # The naive ranker assigns equal weight to all words (including stopwords), whereas the improved
    # ranker does not. The test below for the improved ranker (with document #24 being the winner)
    # assumes a straightforward implementation of a TF-IDF ranking scheme as described in the
    # textbook.
    print("LOADING...")
    corpus = InMemoryCorpus("data/en.txt")
    print("INDEXING...")
    inverted_index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    simple_ranker = BrainDeadRanker()
    better_ranker = BetterRanker(corpus, inverted_index)
    engine = SimpleSearchEngine(corpus, inverted_index)
    for query in ["the terrorism attack and obama"]:
        options = {"match_threshold": 0.1, "hit_count": hit_count, "debug": False}
        for ranker in [simple_ranker, better_ranker]:
            print("SEARCHING for '" + query + "' using " + ranker.__class__.__name__ + "...")
            results.clear()
            engine.evaluate(query, options, ranker, match_collector)
            winner_document_ids = {simple_ranker: [9221, 7263], better_ranker: [24]}[ranker]
            assert 0 < len(results) <= hit_count
            assert results[0]["document"].document_id in winner_document_ids
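# A hedged sketch of the kind of textbook-style TF-IDF weighting the comment
# above alludes to. Everything below is an assumption for illustration (the
# class name, the exact dampening, and posting.term_frequency are not taken
# from the course's BetterRanker): log-dampened term frequency is multiplied by
# the inverse document frequency, so common stopwords like 'the' and 'and'
# contribute little to the score.
import math


class SketchTfIdfRanker:

    def __init__(self, corpus, inverted_index):
        self._corpus = corpus
        self._index = inverted_index
        self._score = 0.0

    def reset(self, document_id: int) -> None:
        self._score = 0.0

    def update(self, term: str, multiplicity: int, posting) -> None:
        # Log-dampened term frequency times inverse document frequency.
        tf = 1.0 + math.log(posting.term_frequency)
        df = self._index.get_document_frequency(term)
        idf = math.log(self._corpus.size() / df) if df > 0 else 0.0
        self._score += multiplicity * tf * idf

    def evaluate(self) -> float:
        return self._score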
def assignment_c_simplesearchengine_1():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Load and index MeSH terms.
    print("LOADING...")
    corpus = InMemoryCorpus("../data/mesh.txt")
    print("INDEXING...")
    inverted_index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)

    # Do ranked retrieval, using a simple ranker.
    engine = SimpleSearchEngine(corpus, inverted_index)
    simple_ranker = BrainDeadRanker()
    results = []

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    query = "polluTION Water"
    for match_threshold in [0.1, 1.0]:
        print(f"SEARCHING for '{query}' with match threshold {match_threshold}...")
        results.clear()
        options = {"match_threshold": match_threshold, "hit_count": 10, "debug": False}
        engine.evaluate(query, options, simple_ranker, match_collector)
        assert len(results) == {0.1: 10, 1.0: 3}[match_threshold]
        for (score, document_id) in [(match["score"], match["document"].document_id)
                                     for match in results[:3]]:
            assert score == 2.0  # Both 'pollution' and 'water'.
            assert document_id in [25274, 25275, 25276]
        for score in [match["score"] for match in results[3:]]:
            assert score == 1.0  # Only 'pollution' or 'water', but not both.
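# The match threshold above is interpreted as the fraction n/m of the m query
# terms that a document must contain to count as a match: threshold 1.0 demands
# both 'pollution' and 'water' (3 hits), while 0.1 accepts either term alone.
# A minimal sketch of that arithmetic, assuming floor semantics with at least
# one term always required (the engine's exact rounding may differ):
def sketch_minimum_required_terms(match_threshold: float, query_term_count: int) -> int:
    return max(1, min(query_term_count, int(match_threshold * query_term_count)))


assert sketch_minimum_required_terms(1.0, 2) == 2  # Both terms must match.
assert sketch_minimum_required_terms(0.1, 2) == 1  # One term suffices.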
def assignment_d_shinglegenerator_2():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = ShingleGenerator(3)
    ranker = BrainDeadRanker()
    results = []
    hit_count = 10

    # Load MeSH terms.
    print("LOADING...")
    corpus = InMemoryCorpus("data/mesh.txt")

    # Do ranked retrieval, using n-grams (shingles) and a simple ranker. This allows for fuzzy retrieval.
    print("INDEXING...")
    inverted_index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    engine = SimpleSearchEngine(corpus, inverted_index)

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Test with some misspelled queries. Be robust to arbitrary resolution of ties.
    for (query, winner_score, winner_document_ids) in [
            ("orGAnik kEMmistry", 8.0, [16981, 16980, 4411, 4410, 4408]),
            ("synndrome", 7.0, [1275])]:
        print("SEARCHING for '" + query + "'...")
        results.clear()
        options = {"match_threshold": 0.1, "hit_count": hit_count, "debug": False}
        engine.evaluate(query, options, ranker, match_collector)
        assert 0 < len(results) <= hit_count
        assert results[0]["score"] == winner_score
        assert results[0]["document"].document_id in winner_document_ids
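# The fuzzy matching above works because ShingleGenerator(3) tokenizes text
# into overlapping character 3-grams, so 'synndrome' and 'syndrome' still share
# most of their shingles. A standalone sketch of that idea (the helper below is
# hypothetical and does not mirror the course tokenizer's interface):
def sketch_shingles(text: str, width: int = 3):
    if len(text) <= width:
        yield text
    else:
        for i in range(len(text) - width + 1):
            yield text[i:i + width]


assert list(sketch_shingles("banana")) == ["ban", "ana", "nan", "ana"]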
def assignment_c_simplesearchengine_3():

    # All accesses to posting lists are logged here.
    accesses = []

    # For testing.
    class AccessLoggedIterator(Iterator[Posting]):

        def __init__(self, term: str, wrapped: Iterator[Posting]):
            self._term = term
            self._wrapped = wrapped

        def __next__(self):
            posting = next(self._wrapped)
            accesses.append((self._term, posting.document_id))
            return posting

    # For testing.
    class AccessLoggedInvertedIndex(InvertedIndex):

        def __init__(self, wrapped: InvertedIndex):
            self._wrapped = wrapped

        def get_terms(self, buffer: str) -> Iterator[str]:
            return self._wrapped.get_terms(buffer)

        def get_postings_iterator(self, term: str) -> Iterator[Posting]:
            return AccessLoggedIterator(term, self._wrapped.get_postings_iterator(term))

        def get_document_frequency(self, term: str) -> int:
            return self._wrapped.get_document_frequency(term)

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Load and index MeSH terms.
    corpus = InMemoryCorpus("data/mesh.txt")
    inverted_index = AccessLoggedInvertedIndex(
        InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer))

    # Do ranked retrieval, using a simple ranker.
    engine = SimpleSearchEngine(corpus, inverted_index)
    simple_ranker = BrainDeadRanker()
    query = "Water polluTION"
    options = {"match_threshold": 0.5, "hit_count": 1, "debug": False}
    engine.evaluate(query, options, simple_ranker, lambda m: m)

    # Expected posting list traversal ordering if the implementation chooses to
    # evaluate this as "water pollution".
    ordering1 = [('water', 3078), ('pollution', 788), ('pollution', 789),
                 ('pollution', 790), ('pollution', 8079), ('water', 8635),
                 ('pollution', 23837), ('water', 9379), ('water', 23234),
                 ('water', 25265), ('pollution', 25274), ('water', 25266),
                 ('water', 25267), ('water', 25268), ('water', 25269),
                 ('water', 25270), ('water', 25271), ('water', 25272),
                 ('water', 25273), ('water', 25274), ('water', 25275),
                 ('pollution', 25275), ('water', 25276), ('pollution', 25276),
                 ('water', 25277), ('water', 25278), ('water', 25279),
                 ('water', 25280), ('water', 25281)]

    # Expected posting list traversal ordering if the implementation chooses to
    # evaluate this as "pollution water".
    ordering2 = [('pollution', 788), ('water', 3078), ('pollution', 789),
                 ('pollution', 790), ('pollution', 8079), ('water', 8635),
                 ('pollution', 23837), ('water', 9379), ('water', 23234),
                 ('water', 25265), ('pollution', 25274), ('water', 25266),
                 ('water', 25267), ('water', 25268), ('water', 25269),
                 ('water', 25270), ('water', 25271), ('water', 25272),
                 ('water', 25273), ('water', 25274), ('pollution', 25275),
                 ('water', 25275), ('pollution', 25276), ('water', 25276),
                 ('water', 25277), ('water', 25278), ('water', 25279),
                 ('water', 25280), ('water', 25281)]

    # Check that the posting lists have been accessed in a way that's consistent
    # with document-at-a-time traversal. Be somewhat robust to implementation
    # details. This is a fairly strict test, and advanced (but valid)
    # implementations that for some reason do lookaheads or whatever might fail.
    assert accesses == ordering1 or accesses == ordering2
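# For context, a minimal sketch of the document-at-a-time pattern that the
# logged orderings above are checking for: all posting lists advance in
# lockstep by document id, and a document is emitted once enough of the lists
# agree on it. Names and the callback signature are illustrative; this is not
# SimpleSearchEngine itself.
import heapq


def sketch_daat(posting_lists, required, callback):
    # posting_lists: list of (term, iterator of postings ordered by document id).
    terms = [term for (term, _) in posting_lists]
    iterators = [postings for (_, postings) in posting_lists]
    frontier = []  # Min-heap of (document_id, list_index, posting).
    for index, postings in enumerate(iterators):
        posting = next(postings, None)
        if posting is not None:
            heapq.heappush(frontier, (posting.document_id, index, posting))
    while frontier:
        document_id = frontier[0][0]
        batch = []
        # Pop every list currently positioned at the smallest document id.
        while frontier and frontier[0][0] == document_id:
            batch.append(heapq.heappop(frontier))
        if len(batch) >= required:
            callback(document_id, [(terms[i], p) for (_, i, p) in batch])
        # Advance only the lists we just consumed from.
        for (_, index, _) in batch:
            posting = next(iterators[index], None)
            if posting is not None:
                heapq.heappush(frontier, (posting.document_id, index, posting))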
def assignment_c_simplesearchengine_2():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    ranker = BrainDeadRanker()

    # Used for comparing floating point numbers.
    epsilon = 0.0001

    # Create a dummy test corpus.
    corpus = InMemoryCorpus()
    words = (''.join(term) for term in product("bcd", "aei", "jkl"))
    texts = (' '.join(word) for word in combinations_with_replacement(words, 3))
    for text in texts:
        corpus.add_document(InMemoryDocument(corpus.size(), {'a': text}))

    # What we're testing.
    engine = SimpleSearchEngine(
        corpus, InMemoryInvertedIndex(corpus, ["a"], normalizer, tokenizer))

    # Where the callback will collect the matches.
    results = []

    # Callback that collects matches.
    def collect(m):
        results.append((m['score'], m['document'].document_id))

    # Executes a query.
    def search(q, t, n):
        results.clear()
        engine.evaluate(q, {'match_threshold': t, 'hit_count': n}, ranker, collect)

    # Sorts the collected matches by descending score, breaking ties by
    # ascending document id (the second sort is stable).
    def sort_results():
        results.sort(key=lambda e: e[1])
        results.sort(key=lambda e: e[0], reverse=True)

    # Test predicate.
    def check_at(i, expected):
        assert results[i] == expected

    # Test predicate.
    def check_range(indices, score, document_ids):
        for i, d in zip(indices, document_ids):
            check_at(i, (score, d))

    # Test predicate.
    def check_hits(n):
        assert len(results) == n

    # Run tests!
    search('baj BAJ baj', 1.0, 27)
    check_hits(27)
    check_at(0, (9.0, 0))
    sort_results()
    check_range(range(1, 27), 6.0, range(1, 27))
    search('baj caj', 1.0, 100)
    check_hits(27)
    search('baj caj daj', 2 / 3 + epsilon, 100)
    check_hits(79)
    search('baj caj', 2 / 3 + epsilon, 100)
    check_hits(100)
    sort_results()
    check_at(0, (3.0, 0))
    check_range(range(4, 12), 2.0, range(1, 9))
    check_range(range(12, 29), 2.0, range(10, 27))
    check_at(29, (2.0, 35))
    check_at(78, (2.0, 2531))
    search('baj cek dil', 1.0, 10)
    check_hits(1)
    check_at(0, (3.0, 286))
    search('baj cek dil', 2 / 3 + epsilon, 80)
    check_hits(79)
    sort_results()
    check_at(0, (3.0, 13))
    check_at(1, (3.0, 26))
    check_at(2, (3.0, 273))
    search('baj xxx yyy', 2 / 3 + epsilon, 100)
    check_hits(0)
    search('baj xxx yyy', 2 / 3 - epsilon, 100)
    check_hits(100)
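# Sanity arithmetic for the synthetic corpus above: product("bcd", "aei", "jkl")
# yields 3 * 3 * 3 = 27 distinct words, and combinations_with_replacement of 27
# words taken 3 at a time gives C(27 + 3 - 1, 3) = C(29, 3) = 3654 documents.
from itertools import combinations_with_replacement, product
from math import comb

assert len(list(product("bcd", "aei", "jkl"))) == 27
assert comb(29, 3) == 3654
assert sum(1 for _ in combinations_with_replacement(range(27), 3)) == 3654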
def test_document_at_a_time_traversal_mesh_corpus(self):
    from typing import Iterator, List, Tuple
    import os.path
    from invertedindex import Posting, InvertedIndex, InMemoryInvertedIndex
    from corpus import InMemoryCorpus
    from searchengine import SimpleSearchEngine
    from ranking import BrainDeadRanker

    class AccessLoggedIterator(Iterator[Posting]):

        def __init__(self, term: str, history: List[Tuple[str, int]], wrapped: Iterator[Posting]):
            self._term = term
            self._history = history
            self._wrapped = wrapped

        def __next__(self):
            posting = next(self._wrapped)
            self._history.append((self._term, posting.document_id))
            return posting

    class AccessLoggedInvertedIndex(InvertedIndex):

        def __init__(self, wrapped: InvertedIndex):
            self._wrapped = wrapped
            self._history = []

        def get_terms(self, buffer: str) -> Iterator[str]:
            return self._wrapped.get_terms(buffer)

        def get_postings_iterator(self, term: str) -> Iterator[Posting]:
            return AccessLoggedIterator(term, self._history,
                                        self._wrapped.get_postings_iterator(term))

        def get_document_frequency(self, term: str) -> int:
            return self._wrapped.get_document_frequency(term)

        def get_history(self) -> List[Tuple[str, int]]:
            return self._history

    corpus = InMemoryCorpus(os.path.join(data_path, 'mesh.txt'))
    index = AccessLoggedInvertedIndex(
        InMemoryInvertedIndex(corpus, ["body"], self._normalizer, self._tokenizer))
    engine = SimpleSearchEngine(corpus, index)
    ranker = BrainDeadRanker()
    query = "Water polluTION"
    options = {"match_threshold": 0.5, "hit_count": 1, "debug": False}
    engine.evaluate(query, options, ranker, lambda m: None)
    history = index.get_history()

    # Document-at-a-time ordering if evaluated as "water pollution".
    ordering1 = [('water', 3078), ('pollution', 788), ('pollution', 789),
                 ('pollution', 790), ('pollution', 8079), ('water', 8635),
                 ('pollution', 23837), ('water', 9379), ('water', 23234),
                 ('water', 25265), ('pollution', 25274), ('water', 25266),
                 ('water', 25267), ('water', 25268), ('water', 25269),
                 ('water', 25270), ('water', 25271), ('water', 25272),
                 ('water', 25273), ('water', 25274), ('water', 25275),
                 ('pollution', 25275), ('water', 25276), ('pollution', 25276),
                 ('water', 25277), ('water', 25278), ('water', 25279),
                 ('water', 25280), ('water', 25281)]

    # Document-at-a-time ordering if evaluated as "pollution water".
    ordering2 = [('pollution', 788), ('water', 3078), ('pollution', 789),
                 ('pollution', 790), ('pollution', 8079), ('water', 8635),
                 ('pollution', 23837), ('water', 9379), ('water', 23234),
                 ('water', 25265), ('pollution', 25274), ('water', 25266),
                 ('water', 25267), ('water', 25268), ('water', 25269),
                 ('water', 25270), ('water', 25271), ('water', 25272),
                 ('water', 25273), ('water', 25274), ('pollution', 25275),
                 ('water', 25275), ('pollution', 25276), ('water', 25276),
                 ('water', 25277), ('water', 25278), ('water', 25279),
                 ('water', 25280), ('water', 25281)]

    # Strict check. Advanced implementations might fail.
    self.assertTrue(history == ordering1 or history == ordering2)
def assignment_d():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    simple_tokenizer = BrainDeadTokenizer()

    # Load MeSH terms.
    print("LOADING...")
    corpus = InMemoryCorpus("data/mesh.txt")

    # Do ranked retrieval, using n-grams and a simple ranker. This allows for fuzzy retrieval.
    print("INDEXING...")
    shingle_generator = ShingleGenerator(3)
    shingle_inverted_index = InMemoryInvertedIndex(corpus, ["body"], normalizer, shingle_generator)
    shingle_engine = SimpleSearchEngine(corpus, shingle_inverted_index)
    simple_ranker = BrainDeadRanker()
    results = []
    hit_count = 10

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Test with some misspelled queries. Be robust to arbitrary resolution of ties.
    for (query, winner_score, winner_document_ids) in [
            ("orGAnik kEMmistry", 8.0, [16981, 16980, 4411, 4410, 4408]),
            ("synndrome", 7.0, [1275])]:
        print("SEARCHING for '" + query + "'...")
        results.clear()
        options = {"match_threshold": 0.1, "hit_count": hit_count, "debug": False}
        shingle_engine.evaluate(query, options, simple_ranker, match_collector)
        assert 0 < len(results) <= hit_count
        assert results[0]["score"] == winner_score
        assert results[0]["document"].document_id in winner_document_ids

    # Load and index some English news sentences. Look at the output and compare the two rankers!
    # The naive ranker assigns equal weight to all words (including stopwords), whereas the improved
    # ranker does not. The test below for the improved ranker (with document #24 being the winner)
    # assumes a straightforward implementation of a TF-IDF ranking scheme as described in the
    # textbook.
    print("LOADING...")
    corpus = InMemoryCorpus("data/en.txt")
    print("INDEXING...")
    inverted_index = InMemoryInvertedIndex(corpus, ["body"], normalizer, simple_tokenizer)
    better_ranker = BetterRanker(corpus, inverted_index)
    engine = SimpleSearchEngine(corpus, inverted_index)
    for query in ["the terrorism attack and obama"]:
        options = {"match_threshold": 0.1, "hit_count": hit_count, "debug": False}
        for ranker in [simple_ranker, better_ranker]:
            print("SEARCHING for '" + query + "' using " + ranker.__class__.__name__ + "...")
            results.clear()
            engine.evaluate(query, options, ranker, match_collector)
            winner_document_ids = {simple_ranker: [9221, 7263], better_ranker: [24]}[ranker]
            assert 0 < len(results) <= hit_count
            assert results[0]["document"].document_id in winner_document_ids
def test_simple_search_daat():
    from typing import Iterator
    from ranking import BrainDeadRanker
    from searchengine import SimpleSearchEngine
    from invertedindex import InvertedIndex, Posting
    from corpus import Corpus, Document, InMemoryDocument

    inverted_index = None

    class DaatPostingList:

        def __init__(self, docid_list):
            self.docid_list = docid_list
            self.idx = 0

        @property
        def current_docid(self):
            if self.idx >= len(self.docid_list):
                return 1000  # Past-the-end sentinel.
            return self.docid_list[self.idx]

        def __iter__(self):
            return self

        def __next__(self):
            if self.idx + 1 >= len(self.docid_list):
                raise StopIteration
            self.idx += 1
            docid = self.docid_list[self.idx]
            if 106 <= docid <= 108:
                # Probe how far ahead or behind the other posting lists are.
                # A large spread means the traversal is not document-at-a-time.
                maxdiff = 0
                for pl in (e for e in inverted_index.active_lists if e is not self):
                    maxdiff = max(maxdiff, abs(pl.current_docid - docid))
                if maxdiff > 20:
                    inverted_index.is_daat = False
            return Posting(docid, 1)

    class DaatCorpus(Corpus):

        def __iter__(self):
            return (self.get_document(i) for i in range(0, self.size()))

        def size(self) -> int:
            return 300

        def get_document(self, document_id: int) -> Document:
            return InMemoryDocument(
                document_id, {'body': inverted_index.get_doc_body(document_id)})

    class DaatInvertedIndex(InvertedIndex):

        def __init__(self):
            self.is_daat = True
            self.active_lists = []

        _data = {
            'a': (1, 2, 7, 10, 11, 12, 20, 21, 22, 41, 42, 44, 45, 50,
                  100, 103, 106, 109, 112, 201, 203, 204, 205, 206, 207),
            'b': (3, 4, 8, 16, 17, 18, 20, 22, 31, 41, 43,
                  101, 104, 107, 110, 113, 202, 204),
            'c': (5, 6, 9, 13, 14, 15, 20, 21, 31, 41,
                  102, 105, 108, 111, 114, 203, 204, 208),
        }

        @classmethod
        def get_doc_body(cls, document_id):
            return ' '.join(k for k, a in cls._data.items() if document_id in a)

        def get_terms(self, buffer: str) -> Iterator[str]:
            return iter(buffer.split(' '))

        def get_postings_iterator(self, term: str) -> Iterator[Posting]:
            if term in self._data:
                iterator = DaatPostingList(self._data[term])
                self.active_lists.append(iterator)
                return iterator
            return iter(tuple())

        def get_document_frequency(self, term: str) -> int:
            return len(self._data.get(term, []))

    inverted_index = DaatInvertedIndex()
    engine = SimpleSearchEngine(DaatCorpus(), inverted_index)
    results = []

    def match(m):
        results.append((m['score'], m['document'].document_id))

    engine.evaluate('a b c', {'recall_threshold': 1.0, 'hit_count': 100},
                    BrainDeadRanker(), match)
    results.sort()
    if results != [(3.0, 20), (3.0, 41), (3.0, 204)]:
        print("FAILURE: search failed.")
    if not inverted_index.is_daat:
        print("WARNING: not performing proper Document-At-A-Time search!")