def test_access_documents(self):
    """Check that documents added to an in-memory corpus can be read back.

    Two documents are added, after which their identifiers are retrieved in
    three ways: by iterating over the corpus, by indexing into it, and by
    calling ``get_document``. Each way must yield the identifiers ``[0, 1]``.
    """
    from corpus import InMemoryDocument, InMemoryCorpus

    corpus = InMemoryCorpus()
    entries = (
        (0, {"body": "this is a Test"}),
        (1, {"title": "prØve", "body": "en to tre"}),
    )
    for document_id, fields in entries:
        corpus.add_document(InMemoryDocument(document_id, fields))

    self.assertEqual(corpus.size(), 2)

    # Access through the iterator protocol.
    ids_by_iteration = [document.document_id for document in corpus]
    self.assertListEqual(ids_by_iteration, [0, 1])

    # Access through subscripting.
    ids_by_index = [
        corpus[position].document_id for position in range(corpus.size())
    ]
    self.assertListEqual(ids_by_index, [0, 1])

    # Access through the explicit getter.
    ids_by_getter = [
        corpus.get_document(position).document_id
        for position in range(corpus.size())
    ]
    self.assertListEqual(ids_by_getter, [0, 1])
def test_synthetic_corpus(self):
    """Query a search engine built over a generated corpus and verify the hits.

    The corpus vocabulary is the 27 three-letter words formed by
    ``product("bcd", "aei", "jkl")``. Every size-3 combination (with
    replacement) of those words becomes one document with a single field
    ``"a"``, which gives 3654 documents. Each query is then handed to
    ``self._process_query_verify_matches`` together with its evaluation
    options and an expectation tuple.
    """
    from itertools import product, combinations_with_replacement
    from corpus import InMemoryDocument, InMemoryCorpus
    from invertedindex import InMemoryInvertedIndex
    from searchengine import SimpleSearchEngine

    # Build the synthetic corpus; document ids follow insertion order.
    corpus = InMemoryCorpus()
    vocabulary = ("".join(letters) for letters in product("bcd", "aei", "jkl"))
    for triple in combinations_with_replacement(vocabulary, 3):
        corpus.add_document(
            InMemoryDocument(corpus.size(), {"a": " ".join(triple)})
        )

    index = InMemoryInvertedIndex(
        corpus, ["a"], self._normalizer, self._tokenizer
    )
    engine = SimpleSearchEngine(corpus, index)

    # Nudges the threshold to just above or just below two thirds.
    epsilon = 0.0001

    # (query, options, expectation) triples, evaluated in this order.
    # NOTE(review): the expectation tuple is interpreted by
    # _process_query_verify_matches, which is not visible here; it looks like
    # (hit count, top score, document ids) -- confirm against that helper.
    cases = [
        ("baj BAJ baj",
         {"match_threshold": 1.0, "hit_count": 27},
         (27, 9.0, [0])),
        ("baj caj",
         {"match_threshold": 1.0, "hit_count": 100},
         (27, None, None)),
        ("baj caj daj",
         {"match_threshold": 2/3 + epsilon, "hit_count": 100},
         (79, None, None)),
        ("baj caj",
         {"match_threshold": 2/3 + epsilon, "hit_count": 100},
         (100, 3.0, [0, 9, 207, 2514])),
        ("baj cek dil",
         {"match_threshold": 1.0, "hit_count": 10},
         (1, 3.0, [286])),
        ("baj cek dil",
         {"match_threshold": 1.0, "hit_count": 10},
         (1, None, None)),
        ("baj cek dil",
         {"match_threshold": 2/3 + epsilon, "hit_count": 80},
         (79, 3.0, [13, 26, 273, 286, 377, 3107, 3198])),
        ("baj xxx yyy",
         {"match_threshold": 2/3 + epsilon, "hit_count": 100},
         (0, None, None)),
        ("baj xxx yyy",
         {"match_threshold": 2/3 - epsilon, "hit_count": 100},
         (100, None, None)),
    ]
    for query, options, expectation in cases:
        self._process_query_verify_matches(query, engine, options, expectation)
def assignment_c_simplesearchengine_2():
    """Exercise SimpleSearchEngine on a generated corpus and assert on the hits.

    The corpus holds one document per size-3 combination (with replacement)
    of the 27 words produced by ``product("bcd", "aei", "jkl")``, stored in
    field ``'a'``. Each query is evaluated with a given ``match_threshold``
    and ``hit_count``; the reported matches are collected as
    ``(score, document_id)`` pairs and compared against expected values.

    Raises:
        AssertionError: if any expectation about the collected hits fails.
    """
    # Shared by the index and by every query below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    ranker = BrainDeadRanker()

    # Nudges the threshold to just above or just below two thirds.
    epsilon = 0.0001

    # Build the synthetic corpus; document ids follow insertion order.
    corpus = InMemoryCorpus()
    vocabulary = (''.join(letters) for letters in product("bcd", "aei", "jkl"))
    for triple in combinations_with_replacement(vocabulary, 3):
        corpus.add_document(
            InMemoryDocument(corpus.size(), {'a': ' '.join(triple)}))

    # The engine under test.
    index = InMemoryInvertedIndex(corpus, ["a"], normalizer, tokenizer)
    engine = SimpleSearchEngine(corpus, index)

    # Matches reported for the most recent query, as (score, document_id).
    hits = []

    def on_match(match):
        """Record one reported match."""
        hits.append((match['score'], match['document'].document_id))

    def run_query(query, threshold, limit):
        """Discard the previous hits and evaluate ``query``."""
        hits.clear()
        options = {'match_threshold': threshold, 'hit_count': limit}
        engine.evaluate(query, options, ranker, on_match)

    def order_hits():
        """Order hits by descending score, ties by ascending document id."""
        by_document_id = sorted(hits, key=lambda hit: hit[1])
        # sorted() is stable, also with reverse=True, so the id order
        # survives among hits that share a score.
        hits[:] = sorted(by_document_id, key=lambda hit: hit[0], reverse=True)

    def expect_at(position, expected):
        """Assert that the hit at ``position`` equals ``expected``."""
        assert hits[position] == expected

    def expect_run(positions, score, document_ids):
        """Assert a run of hits that share ``score``."""
        for position, document_id in zip(positions, document_ids):
            assert hits[position] == (score, document_id)

    def expect_count(count):
        """Assert the number of collected hits."""
        assert len(hits) == count

    # Repeated query terms; the first hit is checked before sorting.
    run_query('baj BAJ baj', 1.0, 27)
    expect_count(27)
    expect_at(0, (9.0, 0))
    order_hits()
    expect_run(range(1, 27), 6.0, range(1, 27))

    run_query('baj caj', 1.0, 100)
    expect_count(27)

    run_query('baj caj daj', 2 / 3 + epsilon, 100)
    expect_count(79)

    run_query('baj caj', 2 / 3 + epsilon, 100)
    expect_count(100)
    order_hits()
    expect_at(0, (3.0, 0))
    expect_run(range(4, 12), 2.0, range(1, 9))
    expect_run(range(12, 29), 2.0, range(10, 27))
    expect_at(29, (2.0, 35))
    expect_at(78, (2.0, 2531))

    run_query('baj cek dil', 1.0, 10)
    expect_count(1)
    expect_at(0, (3.0, 286))

    run_query('baj cek dil', 2 / 3 + epsilon, 80)
    expect_count(79)
    order_hits()
    expect_at(0, (3.0, 13))
    expect_at(1, (3.0, 26))
    expect_at(2, (3.0, 273))

    # Only one of three terms can match: 1/3 recall is below the first
    # threshold and above the second.
    run_query('baj xxx yyy', 2 / 3 + epsilon, 100)
    expect_count(0)
    run_query('baj xxx yyy', 2 / 3 - epsilon, 100)
    expect_count(100)
def test_simple_search_engine():
    """Run SimpleSearchEngine queries on a generated corpus and verify the hits.

    The corpus holds one document per size-3 combination (with replacement)
    of the 27 words produced by ``product('bcd', 'aei', 'jkl')``, stored in
    field ``'a'``. Each query's matches are collected as
    ``(score, document_id)`` pairs and compared with expected values.

    Every mismatch is printed as it is found, so one run reports all
    problems. The mismatches are also recorded, and the function raises at
    the end if there were any, so a test runner sees the failure instead of
    a silent pass.

    Raises:
        AssertionError: if one or more checks did not hold.
    """
    from itertools import product, combinations_with_replacement
    from tokenization import BrainDeadTokenizer
    from normalization import BrainDeadNormalizer
    from corpus import InMemoryCorpus, InMemoryDocument
    from invertedindex import InMemoryInvertedIndex
    from searchengine import SimpleSearchEngine
    from ranking import BrainDeadRanker

    # Nudges the threshold to just above or just below two thirds.
    epsilon = 0.0001

    # Build the synthetic corpus; document ids follow insertion order.
    corpus = InMemoryCorpus()
    words = [''.join(letters) for letters in product('bcd', 'aei', 'jkl')]
    for triple in combinations_with_replacement(words, 3):
        corpus.add_document(
            InMemoryDocument(corpus.size(), {'a': ' '.join(triple)}))

    engine = SimpleSearchEngine(
        corpus,
        InMemoryInvertedIndex(corpus, ('a', ), BrainDeadNormalizer(),
                              BrainDeadTokenizer()))

    # Matches for the most recent query, as (score, document_id) pairs.
    results = []
    # Human-readable descriptions of every check that did not hold.
    failures = []

    def match(m):
        """Record one reported match."""
        results.append((m['score'], m['document'].document_id))

    def search(q, r, n):
        """Discard the previous matches and evaluate query ``q``."""
        results.clear()
        print('searching "' + q + '" at threshold', r, '…')
        # NOTE(review): other tests in this file pass this option as
        # 'match_threshold'; confirm which key this SimpleSearchEngine reads.
        engine.evaluate(q, {
            'recall_threshold': r,
            'hit_count': n
        }, BrainDeadRanker(), match)

    def sort_results():
        """Order matches by descending score, ties by ascending document id."""
        results.sort(key=lambda e: e[1])
        # list.sort is stable, also with reverse=True, so the id order
        # survives among matches that share a score.
        results.sort(key=lambda e: e[0], reverse=True)

    def check_at(i, expected):
        """Verify that the match at index ``i`` equals ``expected``."""
        if i >= len(results):
            # Report a missing result like any other mismatch instead of
            # aborting the remaining checks with an IndexError.
            print('FAILED, EXPECTED ', expected, ' RESULT', i, ' is missing')
            failures.append('expected {!r} at result {}, but only {} results'
                            .format(expected, i, len(results)))
        elif results[i] != expected:
            print('FAILED, EXPECTED ', expected, ' RESULT', i, ' was',
                  results[i])
            failures.append('expected {!r} at result {}, got {!r}'.format(
                expected, i, results[i]))

    def check_range(indices, score, docrange):
        """Verify a run of matches that share ``score``."""
        for i, d in zip(indices, docrange):
            check_at(i, (score, d))

    def check_hits(n):
        """Verify the number of collected matches."""
        if len(results) != n:
            print('FAILED, expected', n, 'results, got', len(results))
            failures.append('expected {} results, got {}'.format(
                n, len(results)))

    # Repeated query terms; the first match is checked before sorting.
    search('baj BAJ baj', 1.0, 27)
    check_hits(27)
    check_at(0, (9.0, 0))
    sort_results()
    check_range(range(1, 27), 6.0, range(1, 27))

    search('baj CAj', 1.0, 100)
    check_hits(27)

    search('baj caj daj', 2 / 3 + epsilon, 100)
    check_hits(79)

    # Two-term query with a threshold just above two thirds.
    search('baj caj', 2 / 3 + epsilon, 100)
    check_hits(100)
    sort_results()
    check_at(0, (3.0, 0))
    check_range(range(4, 12), 2.0, range(1, 9))
    check_range(range(12, 29), 2.0, range(10, 27))
    check_at(29, (2.0, 35))
    check_at(78, (2.0, 2531))

    search('baj cek dil', 1.0, 10)
    check_hits(1)
    check_at(0, (3.0, 286))

    search('baj cek dil', 2 / 3 + epsilon, 80)
    check_hits(79)
    sort_results()
    check_at(0, (3.0, 13))
    check_at(1, (3.0, 26))
    check_at(2, (3.0, 273))

    search('baj xxx yyy', 2 / 3 + epsilon, 100)
    check_hits(0)
    search('baj xxx yyy', 2 / 3 - epsilon, 100)
    check_hits(100)

    # Printing alone never fails a test run, so surface the recorded
    # mismatches as a real failure.
    if failures:
        raise AssertionError('{} check(s) failed:\n{}'.format(
            len(failures), '\n'.join(failures)))