Code example #1
 def test_multiple_fields(self):
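     # A SuffixArray built over both fields matches text in either field, while one
     # built over a single field only matches within that field.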
     corpus = in3120.InMemoryCorpus()
     corpus.add_document(
         in3120.InMemoryDocument(0, {
             "field1": "a b c",
             "field2": "b c d"
         }))
     corpus.add_document(
         in3120.InMemoryDocument(1, {
             "field1": "x",
             "field2": "y"
         }))
     corpus.add_document(
         in3120.InMemoryDocument(2, {
             "field1": "y",
             "field2": "z"
         }))
     engine0 = in3120.SuffixArray(corpus, ["field1", "field2"],
                                  self.__normalizer, self.__tokenizer)
     engine1 = in3120.SuffixArray(corpus, ["field1"], self.__normalizer,
                                  self.__tokenizer)
     engine2 = in3120.SuffixArray(corpus, ["field2"], self.__normalizer,
                                  self.__tokenizer)
     self.__process_query_and_verify_winner(engine0, "b c", [0], 2)
     self.__process_query_and_verify_winner(engine0, "y", [1, 2], 1)
     self.__process_query_and_verify_winner(engine1, "x", [1], 1)
     self.__process_query_and_verify_winner(engine1, "y", [2], 1)
     self.__process_query_and_verify_winner(engine1, "z", [], None)
     self.__process_query_and_verify_winner(engine2, "z", [2], 1)
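The private helper __process_query_and_verify_winner is defined elsewhere in the test class and is not shown here. A minimal sketch of what it might look like, assuming evaluate() yields match dictionaries with "score" and "document" keys and returns the best match first (both assumptions):

 def __process_query_and_verify_winner(self, engine, query, winners, score):
     # Hypothetical sketch: evaluate the query and check that the best match is one of
     # the expected winner documents and, if a score is given, has that score.
     matches = list(engine.evaluate(query, {}))
     if not winners:
         self.assertEqual(len(matches), 0)
         return
     self.assertIn(matches[0]["document"].document_id, winners)
     if score is not None:
         self.assertEqual(matches[0]["score"], score)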
Code example #2
 def setUp(self):
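     # Build a small corpus where some documents carry a "static_quality_score" field,
     # and wire up a BetterRanker over an inverted index on the "title" field.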
     normalizer = in3120.BrainDeadNormalizer()
     tokenizer = in3120.BrainDeadTokenizer()
     corpus = in3120.InMemoryCorpus()
     corpus.add_document(
         in3120.InMemoryDocument(0, {
             "title": "the foo",
             "static_quality_score": 0.9
         }))
     corpus.add_document(
         in3120.InMemoryDocument(1, {
             "title": "the foo",
             "static_quality_score": 0.2
         }))
     corpus.add_document(
         in3120.InMemoryDocument(2, {
             "title": "the foo foo",
             "static_quality_score": 0.2
         }))
     corpus.add_document(in3120.InMemoryDocument(3, {"title": "the bar"}))
     corpus.add_document(
         in3120.InMemoryDocument(4, {"title": "the bar bar"}))
     corpus.add_document(in3120.InMemoryDocument(5, {"title": "the baz"}))
     corpus.add_document(in3120.InMemoryDocument(6, {"title": "the baz"}))
     corpus.add_document(
         in3120.InMemoryDocument(7, {"title": "the baz baz"}))
     index = in3120.InMemoryInvertedIndex(corpus, ["title"], normalizer,
                                          tokenizer)
     self.__ranker = in3120.BetterRanker(corpus, index)
Code example #3
 def test_access_documents(self):
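     # Documents can be accessed by iterating the corpus, by indexing it, or via get_document().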
     corpus = in3120.InMemoryCorpus()
     corpus.add_document(
         in3120.InMemoryDocument(0, {"body": "this is a Test"}))
     corpus.add_document(
         in3120.InMemoryDocument(1, {
             "title": "prØve",
             "body": "en to tre"
         }))
     self.assertEqual(corpus.size(), 2)
     self.assertListEqual([d.document_id for d in corpus], [0, 1])
     self.assertListEqual(
         [corpus[i].document_id for i in range(0, corpus.size())], [0, 1])
     self.assertListEqual([
         corpus.get_document(i).document_id
         for i in range(0, corpus.size())
     ], [0, 1])
Code example #4
 def test_uses_yield(self):
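     # evaluate() is expected to be a generator, i.e., implemented with yield.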
     import types
     corpus = in3120.InMemoryCorpus()
     corpus.add_document(in3120.InMemoryDocument(0, {"a": "the foo bar"}))
     engine = in3120.SuffixArray(corpus, ["a"], self.__normalizer,
                                 self.__tokenizer)
     matches = engine.evaluate("foo", {})
     self.assertIsInstance(matches, types.GeneratorType,
                           "Are you using yield?")
Code example #5
 def test_uses_yield(self):
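     # classify() is expected to be a generator, i.e., implemented with yield.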
     import types
     corpus = in3120.InMemoryCorpus()
     corpus.add_document(in3120.InMemoryDocument(0, {"a": "the foo bar"}))
     training_set = {c: corpus for c in ["x", "y"]}
     classifier = in3120.NaiveBayesClassifier(training_set, ["a"],
                                              self.__normalizer,
                                              self.__tokenizer)
     matches = classifier.classify("urg foo the gog")
     self.assertIsInstance(matches, types.GeneratorType,
                           "Are you using yield?")
Code example #6
 def test_uses_yield(self):
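     # evaluate() is expected to be a generator, i.e., implemented with yield.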
     import types
     corpus = in3120.InMemoryCorpus()
     corpus.add_document(in3120.InMemoryDocument(0, {"a": "foo bar"}))
     index = in3120.InMemoryInvertedIndex(corpus, ["a"], self.__normalizer,
                                          self.__tokenizer)
     engine = in3120.SimpleSearchEngine(corpus, index)
     ranker = in3120.BrainDeadRanker()
     matches = engine.evaluate("foo", {}, ranker)
     self.assertIsInstance(matches, types.GeneratorType,
                           "Are you using yield?")
Code example #7
 def test_access_postings(self):
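     # Query terms are normalized before lookup, and each posting carries the
     # document identifier and the term frequency within that document.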
     corpus = in3120.InMemoryCorpus()
     corpus.add_document(
         in3120.InMemoryDocument(0, {"body": "this is a Test"}))
     corpus.add_document(
         in3120.InMemoryDocument(1, {"body": "test TEST prØve"}))
     index = in3120.InMemoryInvertedIndex(corpus, ["body"],
                                          self.__normalizer,
                                          self.__tokenizer)
     self.assertListEqual(list(index.get_terms("PRøvE wtf tesT")),
                          ["prøve", "wtf", "test"])
     self.assertListEqual([(p.document_id, p.term_frequency)
                           for p in index["prøve"]], [(1, 1)])
     self.assertListEqual([(p.document_id, p.term_frequency)
                           for p in index.get_postings_iterator("wtf")], [])
     self.assertListEqual([(p.document_id, p.term_frequency)
                           for p in index["test"]], [(0, 1), (1, 2)])
     self.assertEqual(index.get_document_frequency("wtf"), 0)
     self.assertEqual(index.get_document_frequency("prøve"), 1)
     self.assertEqual(index.get_document_frequency("test"), 2)
Code example #8
 def test_memory_usage(self):
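     # Building the suffix array, including over documents with long repeated fields,
     # should not allocate an excessive amount of memory.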
     import tracemalloc
     import inspect
     corpus = in3120.InMemoryCorpus()
     corpus.add_document(
         in3120.InMemoryDocument(0, {
             "a": "o  o\n\n\no\n\no",
             "b": "o o\no   \no"
         }))
     corpus.add_document(
         in3120.InMemoryDocument(1, {
             "a": "ba",
             "b": "b bab"
         }))
     corpus.add_document(
         in3120.InMemoryDocument(2, {
             "a": "o  o O o",
             "b": "o o"
         }))
     corpus.add_document(
         in3120.InMemoryDocument(3, {
             "a": "oO" * 10000,
             "b": "o"
         }))
     corpus.add_document(
         in3120.InMemoryDocument(4, {
             "a": "cbab o obab O ",
             "b": "o o " * 10000
         }))
     tracemalloc.start()
     snapshot1 = tracemalloc.take_snapshot()
     engine = in3120.SuffixArray(corpus, ["a", "b"], self.__normalizer,
                                 self.__tokenizer)
     self.assertIsNotNone(engine)
     snapshot2 = tracemalloc.take_snapshot()
     tracemalloc.stop()
     for statistic in snapshot2.compare_to(snapshot1, "filename"):
         if statistic.traceback[0].filename == inspect.getfile(
                 in3120.SuffixArray):
             self.assertLessEqual(statistic.size_diff, 2000000,
                                  "Memory usage seems excessive.")
Code example #9
 def test_china_example_from_textbook(self):
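     # The China/Japan example from the textbook: the test document should be ranked
     # as "china" ahead of "not china", with the expected (log-space) scores.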
     import math
     china = in3120.InMemoryCorpus()
     china.add_document(
         in3120.InMemoryDocument(0, {"body": "Chinese Beijing Chinese"}))
     china.add_document(
         in3120.InMemoryDocument(1, {"body": "Chinese Chinese Shanghai"}))
     china.add_document(
         in3120.InMemoryDocument(2, {"body": "Chinese Macao"}))
     not_china = in3120.InMemoryCorpus()
     not_china.add_document(
         in3120.InMemoryDocument(0, {"body": "Tokyo Japan Chinese"}))
     training_set = {"china": china, "not china": not_china}
     classifier = in3120.NaiveBayesClassifier(training_set, ["body"],
                                              self.__normalizer,
                                              self.__tokenizer)
     results = list(
         classifier.classify("Chinese Chinese Chinese Tokyo Japan"))
     self.assertEqual(len(results), 2)
     self.assertEqual(results[0]["category"], "china")
     self.assertAlmostEqual(math.exp(results[0]["score"]), 0.0003, 4)
     self.assertEqual(results[1]["category"], "not china")
     self.assertAlmostEqual(math.exp(results[1]["score"]), 0.0001, 4)
Code example #10
 def test_synthetic_corpus(self):
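     # Build a synthetic corpus of three-word documents and check hit counts, scores,
     # and matching document identifiers for queries at various match thresholds.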
     from itertools import product, combinations_with_replacement
     corpus = in3120.InMemoryCorpus()
     words = ("".join(term) for term in product("bcd", "aei", "jkl"))
     texts = (" ".join(word)
              for word in combinations_with_replacement(words, 3))
     for text in texts:
         corpus.add_document(
             in3120.InMemoryDocument(corpus.size(), {"a": text}))
     index = in3120.InMemoryInvertedIndex(corpus, ["a"], self.__normalizer,
                                          self.__tokenizer)
     engine = in3120.SimpleSearchEngine(corpus, index)
     epsilon = 0.0001
     self.__process_query_verify_matches("baj BAJ    baj", engine, {
         "match_threshold": 1.0,
         "hit_count": 27
     }, (27, 9.0, [0]))
     self.__process_query_verify_matches("baj caj", engine, {
         "match_threshold": 1.0,
         "hit_count": 100
     }, (27, None, None))
     self.__process_query_verify_matches("baj caj daj", engine, {
         "match_threshold": 2 / 3 + epsilon,
         "hit_count": 100
     }, (79, None, None))
     self.__process_query_verify_matches("baj caj", engine, {
         "match_threshold": 2 / 3 + epsilon,
         "hit_count": 100
     }, (100, 3.0, [0, 9, 207, 2514]))
     self.__process_query_verify_matches("baj cek dil", engine, {
         "match_threshold": 1.0,
         "hit_count": 10
     }, (1, 3.0, [286]))
     self.__process_query_verify_matches("baj cek dil", engine, {
         "match_threshold": 1.0,
         "hit_count": 10
     }, (1, None, None))
     self.__process_query_verify_matches("baj cek dil", engine, {
         "match_threshold": 2 / 3 + epsilon,
         "hit_count": 80
     }, (79, 3.0, [13, 26, 273, 286, 377, 3107, 3198]))
     self.__process_query_verify_matches("baj xxx yyy", engine, {
         "match_threshold": 2 / 3 + epsilon,
         "hit_count": 100
     }, (0, None, None))
     self.__process_query_verify_matches("baj xxx yyy", engine, {
         "match_threshold": 2 / 3 - epsilon,
         "hit_count": 100
     }, (100, None, None))
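The helper __process_query_verify_matches is likewise defined elsewhere in the test class. A rough sketch under the assumption that the expected tuple is (number of matches, score of the listed winner documents, a sample of expected document identifiers), that None means "don't check this part", and that matches are dictionaries with "score" and "document" keys:

 def __process_query_verify_matches(self, query, engine, options, expected):
     # Hypothetical sketch: run the query through the engine with a trivial ranker and
     # compare the results against the expected (count, score, winner ids) triple.
     expected_hits, expected_score, expected_winners = expected
     matches = list(engine.evaluate(query, options, in3120.BrainDeadRanker()))
     self.assertEqual(len(matches), expected_hits)
     if expected_winners is not None:
         matched_ids = {m["document"].document_id for m in matches}
         self.assertTrue(set(expected_winners).issubset(matched_ids))
     if expected_score is not None:
         for m in matches:
             if m["document"].document_id in (expected_winners or []):
                 self.assertEqual(m["score"], expected_score)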
Code example #11
 def test_multiple_fields(self):
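     # Only the fields given to the inverted index ('felt1' and 'felt3') count towards
     # the term frequency: "test" occurs 3 + 2 = 5 times across those two fields.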
     document = in3120.InMemoryDocument(
         0, {
             'felt1': 'Dette er en test. Test, sa jeg. TEST!',
             'felt2': 'test er det',
             'felt3': 'test TEsT',
         })
     corpus = in3120.InMemoryCorpus()
     corpus.add_document(document)
     index = in3120.InMemoryInvertedIndex(corpus, ['felt1', 'felt3'],
                                          self.__normalizer,
                                          self.__tokenizer)
     posting = next(index.get_postings_iterator('test'))
     self.assertEqual(posting.document_id, 0)
     self.assertEqual(posting.term_frequency, 5)