def test_multiple_fields(self):
    """Verify that a suffix array only matches content from the fields it was built over."""
    corpus = in3120.InMemoryCorpus()
    contents = [
        {"field1": "a b c", "field2": "b c d"},
        {"field1": "x", "field2": "y"},
        {"field1": "y", "field2": "z"},
    ]
    for document_id, fields in enumerate(contents):
        corpus.add_document(in3120.InMemoryDocument(document_id, fields))
    # One engine over both fields, and one per individual field.
    both = in3120.SuffixArray(corpus, ["field1", "field2"], self.__normalizer, self.__tokenizer)
    only1 = in3120.SuffixArray(corpus, ["field1"], self.__normalizer, self.__tokenizer)
    only2 = in3120.SuffixArray(corpus, ["field2"], self.__normalizer, self.__tokenizer)
    self.__process_query_and_verify_winner(both, "b c", [0], 2)
    self.__process_query_and_verify_winner(both, "y", [1, 2], 1)
    self.__process_query_and_verify_winner(only1, "x", [1], 1)
    self.__process_query_and_verify_winner(only1, "y", [2], 1)
    self.__process_query_and_verify_winner(only1, "z", [], None)
    self.__process_query_and_verify_winner(only2, "z", [2], 1)
def setUp(self):
    """Build a small corpus and inverted index, and create the ranker under test."""
    normalizer = in3120.BrainDeadNormalizer()
    tokenizer = in3120.BrainDeadTokenizer()
    corpus = in3120.InMemoryCorpus()
    # Document ids follow list order; some documents carry a static quality score.
    documents = [
        {"title": "the foo", "static_quality_score": 0.9},
        {"title": "the foo", "static_quality_score": 0.2},
        {"title": "the foo foo", "static_quality_score": 0.2},
        {"title": "the bar"},
        {"title": "the bar bar"},
        {"title": "the baz"},
        {"title": "the baz"},
        {"title": "the baz baz"},
    ]
    for document_id, fields in enumerate(documents):
        corpus.add_document(in3120.InMemoryDocument(document_id, fields))
    index = in3120.InMemoryInvertedIndex(corpus, ["title"], normalizer, tokenizer)
    self.__ranker = in3120.BetterRanker(corpus, index)
def test_access_documents(self):
    """Exercise the different ways of iterating over and indexing into a corpus."""
    corpus = in3120.InMemoryCorpus()
    corpus.add_document(in3120.InMemoryDocument(0, {"body": "this is a Test"}))
    corpus.add_document(in3120.InMemoryDocument(1, {"title": "prØve", "body": "en to tre"}))
    self.assertEqual(corpus.size(), 2)
    expected = [0, 1]
    # Direct iteration, subscripting, and explicit getter should all agree.
    self.assertListEqual([document.document_id for document in corpus], expected)
    self.assertListEqual([corpus[i].document_id for i in range(corpus.size())], expected)
    self.assertListEqual([corpus.get_document(i).document_id for i in range(corpus.size())], expected)
def test_uses_yield(self):
    """Check that suffix-array evaluation produces its results lazily via a generator."""
    import types
    corpus = in3120.InMemoryCorpus()
    corpus.add_document(in3120.InMemoryDocument(0, {"a": "the foo bar"}))
    engine = in3120.SuffixArray(corpus, ["a"], self.__normalizer, self.__tokenizer)
    results = engine.evaluate("foo", {})
    self.assertIsInstance(results, types.GeneratorType, "Are you using yield?")
def test_uses_yield(self):
    """Check that classification produces its results lazily via a generator."""
    import types
    corpus = in3120.InMemoryCorpus()
    corpus.add_document(in3120.InMemoryDocument(0, {"a": "the foo bar"}))
    # Both categories share the same tiny corpus; content is irrelevant here.
    training_set = {category: corpus for category in ["x", "y"]}
    classifier = in3120.NaiveBayesClassifier(training_set, ["a"], self.__normalizer, self.__tokenizer)
    results = classifier.classify("urg foo the gog")
    self.assertIsInstance(results, types.GeneratorType, "Are you using yield?")
def test_uses_yield(self):
    """Check that the search engine produces its results lazily via a generator."""
    import types
    corpus = in3120.InMemoryCorpus()
    corpus.add_document(in3120.InMemoryDocument(0, {"a": "foo bar"}))
    index = in3120.InMemoryInvertedIndex(corpus, ["a"], self.__normalizer, self.__tokenizer)
    engine = in3120.SimpleSearchEngine(corpus, index)
    ranker = in3120.BrainDeadRanker()
    results = engine.evaluate("foo", {}, ranker)
    self.assertIsInstance(results, types.GeneratorType, "Are you using yield?")
def test_access_postings(self):
    """Verify term lookups, posting lists, and document frequencies in the index."""
    corpus = in3120.InMemoryCorpus()
    corpus.add_document(in3120.InMemoryDocument(0, {"body": "this is a Test"}))
    corpus.add_document(in3120.InMemoryDocument(1, {"body": "test TEST prØve"}))
    index = in3120.InMemoryInvertedIndex(corpus, ["body"], self.__normalizer, self.__tokenizer)
    # Terms are normalized (lowercased etc.) but not reordered or deduplicated.
    self.assertListEqual(list(index.get_terms("PRøvE wtf tesT")), ["prøve", "wtf", "test"])

    def pairs(postings):
        # Flatten a posting stream into comparable (document_id, term_frequency) tuples.
        return [(posting.document_id, posting.term_frequency) for posting in postings]

    self.assertListEqual(pairs(index["prøve"]), [(1, 1)])
    self.assertListEqual(pairs(index.get_postings_iterator("wtf")), [])
    self.assertListEqual(pairs(index["test"]), [(0, 1), (1, 2)])
    for term, frequency in [("wtf", 0), ("prøve", 1), ("test", 2)]:
        self.assertEqual(index.get_document_frequency(term), frequency)
def test_memory_usage(self):
    """Build a suffix array over fields with large values and assert that the
    allocations attributed to the suffix-array implementation stay bounded."""
    import tracemalloc
    import inspect
    corpus = in3120.InMemoryCorpus()
    documents = [
        {"a": "o o\n\n\no\n\no", "b": "o o\no \no"},
        {"a": "ba", "b": "b bab"},
        {"a": "o o O o", "b": "o o"},
        {"a": "oO" * 10000, "b": "o"},
        {"a": "cbab o obab O ", "b": "o o " * 10000},
    ]
    for document_id, fields in enumerate(documents):
        corpus.add_document(in3120.InMemoryDocument(document_id, fields))
    tracemalloc.start()
    before = tracemalloc.take_snapshot()
    engine = in3120.SuffixArray(corpus, ["a", "b"], self.__normalizer, self.__tokenizer)
    self.assertIsNotNone(engine)
    after = tracemalloc.take_snapshot()
    tracemalloc.stop()
    # Only consider allocations made from the suffix-array source file itself.
    implementation = inspect.getfile(in3120.SuffixArray)
    for statistic in after.compare_to(before, "filename"):
        if statistic.traceback[0].filename == implementation:
            self.assertLessEqual(statistic.size_diff, 2000000, "Memory usage seems excessive.")
def test_china_example_from_textbook(self):
    """Replicate the China/not-China Naive Bayes worked example from the textbook."""
    import math
    china = in3120.InMemoryCorpus()
    bodies = ["Chinese Beijing Chinese", "Chinese Chinese Shanghai", "Chinese Macao"]
    for document_id, body in enumerate(bodies):
        china.add_document(in3120.InMemoryDocument(document_id, {"body": body}))
    not_china = in3120.InMemoryCorpus()
    not_china.add_document(in3120.InMemoryDocument(0, {"body": "Tokyo Japan Chinese"}))
    training_set = {"china": china, "not china": not_china}
    classifier = in3120.NaiveBayesClassifier(training_set, ["body"], self.__normalizer, self.__tokenizer)
    results = list(classifier.classify("Chinese Chinese Chinese Tokyo Japan"))
    self.assertEqual(len(results), 2)
    # Scores are log-probabilities; exponentiate to compare with the book's numbers.
    self.assertEqual(results[0]["category"], "china")
    self.assertAlmostEqual(math.exp(results[0]["score"]), 0.0003, 4)
    self.assertEqual(results[1]["category"], "not china")
    self.assertAlmostEqual(math.exp(results[1]["score"]), 0.0001, 4)
def test_synthetic_corpus(self):
    """Run a battery of threshold/hit-count queries over a synthetically generated corpus."""
    from itertools import product, combinations_with_replacement
    corpus = in3120.InMemoryCorpus()
    # 27 three-letter words (consonant, vowel, consonant), combined three at a time.
    words = ("".join(letters) for letters in product("bcd", "aei", "jkl"))
    for triple in combinations_with_replacement(words, 3):
        corpus.add_document(in3120.InMemoryDocument(corpus.size(), {"a": " ".join(triple)}))
    index = in3120.InMemoryInvertedIndex(corpus, ["a"], self.__normalizer, self.__tokenizer)
    engine = in3120.SimpleSearchEngine(corpus, index)
    epsilon = 0.0001
    # Each case: (query, match threshold, hit count, expected (matches, score, winners)).
    cases = [
        ("baj BAJ baj", 1.0, 27, (27, 9.0, [0])),
        ("baj caj", 1.0, 100, (27, None, None)),
        ("baj caj daj", 2 / 3 + epsilon, 100, (79, None, None)),
        ("baj caj", 2 / 3 + epsilon, 100, (100, 3.0, [0, 9, 207, 2514])),
        ("baj cek dil", 1.0, 10, (1, 3.0, [286])),
        ("baj cek dil", 1.0, 10, (1, None, None)),
        ("baj cek dil", 2 / 3 + epsilon, 80, (79, 3.0, [13, 26, 273, 286, 377, 3107, 3198])),
        ("baj xxx yyy", 2 / 3 + epsilon, 100, (0, None, None)),
        ("baj xxx yyy", 2 / 3 - epsilon, 100, (100, None, None)),
    ]
    for query, threshold, hits, expected in cases:
        options = {"match_threshold": threshold, "hit_count": hits}
        self.__process_query_verify_matches(query, engine, options, expected)
def test_multiple_fields(self):
    """Check that term frequencies are aggregated across the indexed fields only."""
    fields = {
        'felt1': 'Dette er en test. Test, sa jeg. TEST!',
        'felt2': 'test er det',
        'felt3': 'test TEsT',
    }
    corpus = in3120.InMemoryCorpus()
    corpus.add_document(in3120.InMemoryDocument(0, fields))
    # Only felt1 and felt3 are indexed, so the 'test' occurrence in felt2 is ignored.
    index = in3120.InMemoryInvertedIndex(corpus, ['felt1', 'felt3'], self.__normalizer, self.__tokenizer)
    posting = next(index.get_postings_iterator('test'))
    self.assertEqual(posting.document_id, 0)
    self.assertEqual(posting.term_frequency, 5)