Beispiel #1
0
 def test_memory_usage(self):
     """Check that building a SuffixArray does not allocate excessively.

     Builds a small corpus that includes two large fields, takes tracemalloc
     snapshots before and after constructing the suffix array, and asserts
     that allocations attributed to the file defining SuffixArray grew by at
     most 2,000,000 bytes.
     """
     import tracemalloc
     import inspect
     from corpus import InMemoryDocument, InMemoryCorpus
     from suffixarray import SuffixArray
     corpus = InMemoryCorpus()
     corpus.add_document(
         InMemoryDocument(0, {
             "a": "o  o\n\n\no\n\no",
             "b": "o o\no   \no"
         }))
     corpus.add_document(InMemoryDocument(1, {"a": "ba", "b": "b bab"}))
     corpus.add_document(InMemoryDocument(2, {"a": "o  o O o", "b": "o o"}))
     corpus.add_document(InMemoryDocument(3, {"a": "oO" * 10000, "b": "o"}))
     corpus.add_document(
         InMemoryDocument(4, {
             "a": "cbab o obab O ",
             "b": "o o " * 10000
         }))
     tracemalloc.start()
     try:
         snapshot1 = tracemalloc.take_snapshot()
         # The binding is deliberate: it keeps the suffix array alive so its
         # allocations are still present when the second snapshot is taken.
         engine = SuffixArray(corpus, ["a", "b"], self._normalizer,
                              self._tokenizer)
         snapshot2 = tracemalloc.take_snapshot()
     finally:
         # Always stop tracing, even if construction raises, so that tracing
         # does not stay enabled for whatever runs afterwards.
         tracemalloc.stop()
     # Loop-invariant: resolve the implementation's source file once.
     suffixarray_file = inspect.getfile(SuffixArray)
     for statistic in snapshot2.compare_to(snapshot1, "filename"):
         if statistic.traceback[0].filename == suffixarray_file:
             self.assertLessEqual(statistic.size_diff, 2000000,
                                  "Memory usage seems excessive.")
Beispiel #2
0
 def test_multiple_fields(self):
     """Query suffix arrays built over both fields and over each field alone.

     Every case is passed to the _process_query_and_verify_winner helper.
     """
     from corpus import InMemoryDocument, InMemoryCorpus
     from suffixarray import SuffixArray
     field_values = [
         {"field1": "a b c", "field2": "b c d"},
         {"field1": "x", "field2": "y"},
         {"field1": "y", "field2": "z"},
     ]
     corpus = InMemoryCorpus()
     for document_id, fields in enumerate(field_values):
         corpus.add_document(InMemoryDocument(document_id, fields))
     # One engine over both fields, then one per individual field.
     both_fields, first_only, second_only = (
         SuffixArray(corpus, names, self._normalizer, self._tokenizer)
         for names in (["field1", "field2"], ["field1"], ["field2"]))
     cases = [
         (both_fields, "b c", [0], 2),
         (both_fields, "y", [1, 2], 1),
         (first_only, "x", [1], 1),
         (first_only, "y", [2], 1),
         (first_only, "z", [], None),
         (second_only, "z", [2], 1),
     ]
     for engine, query, winners, expected in cases:
         self._process_query_and_verify_winner(engine, query, winners,
                                               expected)
Beispiel #3
0
def assignment_a_inverted_index_1():
    """Index a two-document corpus and check postings and document frequencies."""

    # Shared text-processing helpers.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Print and verify the postings of a tiny two-document corpus.
    print("INDEXING...")
    corpus = InMemoryCorpus()
    for document_id, body in enumerate(["this is a Test", "test TEST prØve"]):
        corpus.add_document(InMemoryDocument(document_id, {"body": body}))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    known_terms = ["prøve", "wtf", "test"]
    expected_postings = [[(1, 1)], [], [(0, 1), (1, 2)]]
    query_terms = index.get_terms("PRøvE wtf tesT")
    for term, expected in zip(query_terms, expected_postings):
        print(term)
        assert term in known_terms
        postings = list(index[term])
        for posting in postings:
            print(posting)
        assert len(postings) == len(expected)
        observed = [(p.document_id, p.term_frequency) for p in postings]
        assert observed == expected
    print(index)

    # Document frequencies must agree with the corpus contents.
    for term, frequency in (("wtf", 0), ("test", 2), ("prøve", 1)):
        assert index.get_document_frequency(term) == frequency
Beispiel #4
0
def assignment_a():
    """Run the inverted-index and postings-merger checks for assignment A.

    Indexes a two-document dummy corpus and the corpus loaded from
    "data/mesh.txt", prints the postings it finds, and then checks AND/OR
    merging of two posting lists against hard-coded document IDs.

    Raises:
        AssertionError: If a term, posting list, posting count or merged
            result differs from the expected values.
    """

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Dump postings for a dummy two-document corpus.
    print("INDEXING...")
    corpus = InMemoryCorpus()
    corpus.add_document(InMemoryDocument(0, {"body": "this is a Test"}))
    corpus.add_document(InMemoryDocument(1, {"body": "test TEST prØve"}))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    for term, expected in zip(index.get_terms("PRøvE wtf tesT"),
                              [[(1, 1)], [], [(0, 1), (1, 2)]]):
        print(term)
        assert term in ["prøve", "wtf", "test"]
        postings = list(index.get_postings_iterator(term))
        for posting in postings:
            print(posting)
        assert len(postings) == len(expected)
        assert [(p.document_id, p.term_frequency)
                for p in postings] == expected
    print(index)

    # Again, for a slightly bigger corpus.
    print("LOADING...")
    corpus = InMemoryCorpus("data/mesh.txt")
    print("INDEXING...")
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    for term, expected_length in [("hydrogen", 8), ("hydrocephalus", 2)]:
        print(term)
        # Materialize once so the postings that are printed are the same
        # ones that are counted.
        postings = list(index.get_postings_iterator(term))
        for posting in postings:
            print(posting)
        assert len(postings) == expected_length

    # Test that we merge posting lists correctly. Note implicit test for
    # case- and whitespace robustness.
    print("MERGING...")
    merger = PostingsMerger()
    merge_operations = {"AND": merger.intersection, "OR": merger.union}
    and_query = ("HIV  pROtein", "AND", [11316, 11319, 11320, 11321])
    or_query = ("water Toxic", "OR",
                [3078, 8138, 8635, 9379, 14472, 18572, 23234, 23985] +
                list(range(25265, 25282)))
    for query, operator, expected_document_ids in [and_query, or_query]:
        # Raw string: "\W" is not a valid str escape sequence and triggers a
        # warning on recent Python versions when written without the r prefix.
        print(re.sub(r"\W+", " " + operator + " ", query))
        terms = list(index.get_terms(query))
        assert len(terms) == 2
        first, second = (index.get_postings_iterator(term) for term in terms)
        merged = merge_operations[operator](first, second)
        documents = [
            corpus.get_document(posting.document_id) for posting in merged
        ]
        print(*documents, sep="\n")
        assert len(documents) == len(expected_document_ids)
        assert [d.get_document_id()
                for d in documents] == expected_document_ids
Beispiel #5
0
def assignment_e_naivebayes_2():
    """Replicate textbook Example 13.1 with the naive Bayes classifier."""

    # These helpers are really language-specific, so relying on them here is
    # a big simplification.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    # Result callback; the scores are exponentiated below, i.e. treated as
    # log-probabilities.
    def match_collector(match: dict):
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Builds a corpus whose documents are numbered from zero.
    def build_corpus(bodies):
        built = InMemoryCorpus()
        for document_id, body in enumerate(bodies):
            built.add_document(InMemoryDocument(document_id, {"body": body}))
        return built

    # Example 13.1, pages 241 and 242 in the textbook.
    training_set = {
        "china": build_corpus([
            "Chinese Beijing Chinese",
            "Chinese Chinese Shanghai",
            "Chinese Macao",
        ]),
        "not china": build_corpus(["Tokyo Japan Chinese"]),
    }
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)
    buffer = "Chinese Chinese Chinese Tokyo Japan"
    print(buffer)
    results.clear()
    classifier.classify(buffer, match_collector)
    assert len(results) == 2
    winner, runner_up = results
    assert winner["category"] == "china"
    assert runner_up["category"] == "not china"
    assert math.isclose(math.exp(winner["score"]), 0.0003, abs_tol=0.00001)
    assert math.isclose(math.exp(runner_up["score"]), 0.0001, abs_tol=0.00005)
 def test_access_documents(self):
     """Check corpus size and access by iteration, indexing and get_document."""
     from corpus import InMemoryDocument, InMemoryCorpus
     corpus = InMemoryCorpus()
     first = InMemoryDocument(0, {"body": "this is a Test"})
     corpus.add_document(first)
     second = InMemoryDocument(1, {"title": "prØve", "body": "en to tre"})
     corpus.add_document(second)
     self.assertEqual(corpus.size(), 2)
     expected_ids = [0, 1]
     # Documents reached by iterating the corpus.
     iterated = [document.document_id for document in corpus]
     self.assertListEqual(iterated, expected_ids)
     # Documents reached through the subscript operator.
     subscripted = [corpus[position].document_id
                    for position in range(0, corpus.size())]
     self.assertListEqual(subscripted, expected_ids)
     # Documents reached through the explicit accessor.
     fetched = [corpus.get_document(position).document_id
                for position in range(0, corpus.size())]
     self.assertListEqual(fetched, expected_ids)
 def setUp(self):
     """Build an eight-document corpus indexed on "title" and create the ranker.

     The resulting BetterRanker is stored in self._ranker.
     """
     from normalization import BrainDeadNormalizer
     from tokenization import BrainDeadTokenizer
     from corpus import InMemoryDocument, InMemoryCorpus
     from invertedindex import InMemoryInvertedIndex
     from ranking import BetterRanker
     text_normalizer = BrainDeadNormalizer()
     text_tokenizer = BrainDeadTokenizer()
     # Only the first three documents carry a static quality score.
     all_fields = [
         {"title": "the foo", "static_quality_score": 0.9},
         {"title": "the foo", "static_quality_score": 0.2},
         {"title": "the foo foo", "static_quality_score": 0.2},
         {"title": "the bar"},
         {"title": "the bar bar"},
         {"title": "the baz"},
         {"title": "the baz"},
         {"title": "the baz baz"},
     ]
     corpus = InMemoryCorpus()
     for document_id, fields in enumerate(all_fields):
         corpus.add_document(InMemoryDocument(document_id, fields))
     title_index = InMemoryInvertedIndex(corpus, ["title"], text_normalizer,
                                         text_tokenizer)
     self._ranker = BetterRanker(corpus, title_index)
 def test_access_postings(self):
     """Check term extraction, posting lists and document frequencies."""
     from corpus import InMemoryDocument, InMemoryCorpus
     from invertedindex import InMemoryInvertedIndex
     corpus = InMemoryCorpus()
     for document_id, body in enumerate(["this is a Test", "test TEST prØve"]):
         corpus.add_document(InMemoryDocument(document_id, {"body": body}))
     index = InMemoryInvertedIndex(corpus, ["body"], self._normalizer,
                                   self._tokenizer)

     # Reduce postings to comparable (document_id, term_frequency) pairs.
     def as_pairs(postings):
         return [(posting.document_id, posting.term_frequency)
                 for posting in postings]

     terms = list(index.get_terms("PRøvE wtf tesT"))
     self.assertListEqual(terms, ["prøve", "wtf", "test"])
     self.assertListEqual(as_pairs(index["prøve"]), [(1, 1)])
     self.assertListEqual(as_pairs(index.get_postings_iterator("wtf")), [])
     self.assertListEqual(as_pairs(index["test"]), [(0, 1), (1, 2)])
     for term, frequency in (("wtf", 0), ("prøve", 1), ("test", 2)):
         self.assertEqual(index.get_document_frequency(term), frequency)
Beispiel #9
0
 def test_china_example_from_textbook(self):
     """Classify the textbook "china" example and check categories and scores."""
     import math
     from corpus import InMemoryDocument, InMemoryCorpus
     from naivebayesclassifier import NaiveBayesClassifier

     # Builds a corpus whose documents are numbered from zero.
     def corpus_of(bodies):
         built = InMemoryCorpus()
         for document_id, body in enumerate(bodies):
             built.add_document(InMemoryDocument(document_id, {"body": body}))
         return built

     training_set = {
         "china": corpus_of([
             "Chinese Beijing Chinese",
             "Chinese Chinese Shanghai",
             "Chinese Macao",
         ]),
         "not china": corpus_of(["Tokyo Japan Chinese"]),
     }
     classifier = NaiveBayesClassifier(training_set, ["body"],
                                       self._normalizer, self._tokenizer)
     results = []
     classifier.classify("Chinese Chinese Chinese Tokyo Japan",
                         results.append)
     self.assertEqual(len(results), 2)
     # Scores are exponentiated before being compared to the probabilities.
     expectations = [("china", 0.0003), ("not china", 0.0001)]
     for result, (category, probability) in zip(results, expectations):
         self.assertEqual(result["category"], category)
         self.assertAlmostEqual(math.exp(result["score"]), probability, 4)
 def test_multiple_fields(self):
     """Check the term frequency of 'test' when indexing 'felt1' and 'felt3'.

     The document also has a 'felt2' field that is not passed to the index;
     the expected term frequency is 5.
     """
     from corpus import InMemoryDocument, InMemoryCorpus
     from invertedindex import InMemoryInvertedIndex
     fields = {
         'felt1': 'Dette er en test. Test, sa jeg. TEST!',
         'felt2': 'test er det',
         'felt3': 'test TEsT',
     }
     single_document = InMemoryDocument(0, fields)
     corpus = InMemoryCorpus()
     corpus.add_document(single_document)
     indexed_fields = ['felt1', 'felt3']
     index = InMemoryInvertedIndex(corpus, indexed_fields,
                                   self._normalizer, self._tokenizer)
     postings = index.get_postings_iterator('test')
     first_posting = next(postings)
     self.assertEqual(first_posting.document_id, 0)
     self.assertEqual(first_posting.term_frequency, 5)
Beispiel #11
0
def assignment_a_inverted_index_3():
    """Check the term frequency of 'test' when indexing two of three fields.

    The index is built over 'felt 1' and 'felt 3' only; the expected term
    frequency for the single document is 5.
    """
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    field_texts = {
        'felt 1': 'Dette er en test. Test, sa jeg. TEST!',
        'felt 2': 'test er det',
        'felt 3': 'test TEsT',
    }
    corpus = InMemoryCorpus()
    corpus.add_document(InMemoryDocument(document_id=0, fields=field_texts))

    indexed_fields = ['felt 1', 'felt 3']
    index = InMemoryInvertedIndex(corpus, indexed_fields, normalizer, tokenizer)
    postings = index.get_postings_iterator('test')
    p = next(postings)
    print(f"term-freq: {p.term_frequency} (correct is 5)")
    assert p.document_id == 0
    assert p.term_frequency == 5
 def test_synthetic_corpus(self):
     """Run a table of queries against a generated corpus of three-word texts.

     Every case is passed to the _process_query_verify_matches helper together
     with its "match_threshold" and "hit_count" options.
     """
     from itertools import product, combinations_with_replacement
     from corpus import InMemoryDocument, InMemoryCorpus
     from invertedindex import InMemoryInvertedIndex
     from searchengine import SimpleSearchEngine
     corpus = InMemoryCorpus()
     # Every consonant-vowel-consonant word, then every multiset of three words.
     vocabulary = ("".join(letters) for letters in product("bcd", "aei", "jkl"))
     for triple in combinations_with_replacement(vocabulary, 3):
         corpus.add_document(
             InMemoryDocument(corpus.size(), {"a": " ".join(triple)}))
     index = InMemoryInvertedIndex(corpus, ["a"], self._normalizer,
                                   self._tokenizer)
     engine = SimpleSearchEngine(corpus, index)
     epsilon = 0.0001
     just_above = 2/3 + epsilon
     just_below = 2/3 - epsilon
     # (query, match_threshold, hit_count, expected)
     cases = [
         ("baj BAJ    baj", 1.0, 27, (27, 9.0, [0])),
         ("baj caj", 1.0, 100, (27, None, None)),
         ("baj caj daj", just_above, 100, (79, None, None)),
         ("baj caj", just_above, 100, (100, 3.0, [0, 9, 207, 2514])),
         ("baj cek dil", 1.0, 10, (1, 3.0, [286])),
         ("baj cek dil", 1.0, 10, (1, None, None)),
         ("baj cek dil", just_above, 80,
          (79, 3.0, [13, 26, 273, 286, 377, 3107, 3198])),
         ("baj xxx yyy", just_above, 100, (0, None, None)),
         ("baj xxx yyy", just_below, 100, (100, None, None)),
     ]
     for query, threshold, hit_count, expected in cases:
         options = {"match_threshold": threshold, "hit_count": hit_count}
         self._process_query_verify_matches(query, engine, options, expected)
Beispiel #13
0
def assignment_c_simplesearchengine_2():
    """Query a SimpleSearchEngine over a generated corpus and assert on the hits."""

    # Shared helpers for the whole scenario.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    ranker = BrainDeadRanker()

    # Nudges thresholds just above or below 2/3.
    epsilon = 0.0001

    # Generated corpus: every multiset of three consonant-vowel-consonant words.
    corpus = InMemoryCorpus()
    vocabulary = (''.join(letters) for letters in product("bcd", "aei", "jkl"))
    for triple in combinations_with_replacement(vocabulary, 3):
        corpus.add_document(
            InMemoryDocument(corpus.size(), {'a': ' '.join(triple)}))

    # The engine under test.
    index = InMemoryInvertedIndex(corpus, ["a"], normalizer, tokenizer)
    engine = SimpleSearchEngine(corpus, index)

    # (score, document_id) pairs collected for the most recent query.
    matches = []

    # Callback handed to the engine.
    def remember(match):
        matches.append((match['score'], match['document'].document_id))

    # Runs one query, replacing any previously collected matches.
    def run_query(query, threshold, hit_count):
        matches.clear()
        options = {'match_threshold': threshold, 'hit_count': hit_count}
        engine.evaluate(query, options, ranker, remember)

    # Orders matches by descending score, ties broken by ascending document ID.
    def order_matches():
        matches.sort(key=lambda match: (-match[0], match[1]))

    # Asserts the match at one position.
    def expect_at(position, expected):
        assert matches[position] == expected

    # Asserts a run of positions sharing a single score.
    def expect_run(positions, score, document_ids):
        for position, document_id in zip(positions, document_ids):
            expect_at(position, (score, document_id))

    # Asserts the number of collected matches.
    def expect_count(count):
        assert len(matches) == count

    # The scenario itself.
    run_query('baj BAJ    baj', 1.0, 27)
    expect_count(27)
    expect_at(0, (9.0, 0))
    order_matches()
    expect_run(range(1, 27), 6.0, range(1, 27))
    run_query('baj caj', 1.0, 100)
    expect_count(27)
    run_query('baj caj daj', 2 / 3 + epsilon, 100)
    expect_count(79)
    run_query('baj caj', 2 / 3 + epsilon, 100)
    expect_count(100)
    order_matches()
    expect_at(0, (3.0, 0))
    expect_run(range(4, 12), 2.0, range(1, 9))
    expect_run(range(12, 29), 2.0, range(10, 27))
    expect_at(29, (2.0, 35))
    expect_at(78, (2.0, 2531))
    run_query('baj cek dil', 1.0, 10)
    expect_count(1)
    expect_at(0, (3.0, 286))
    run_query('baj cek dil', 2 / 3 + epsilon, 80)
    expect_count(79)
    order_matches()
    expect_at(0, (3.0, 13))
    expect_at(1, (3.0, 26))
    expect_at(2, (3.0, 273))
    run_query('baj xxx yyy', 2 / 3 + epsilon, 100)
    expect_count(0)
    run_query('baj xxx yyy', 2 / 3 - epsilon, 100)
    expect_count(100)
Beispiel #14
0
def assignment_e():
    """Train a naive Bayes language identifier, classify samples, then replay Example 13.1."""

    # These helpers are really language-specific, so relying on them for a
    # language identifier is a big simplification.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    # Result callback; the scores are exponentiated further down, i.e. treated
    # as log-probabilities.
    def match_collector(match: dict):
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Builds a corpus whose documents are numbered from zero.
    def build_corpus(bodies):
        built = InMemoryCorpus()
        for document_id, body in enumerate(bodies):
            built.add_document(InMemoryDocument(document_id, {"body": body}))
        return built

    # One training corpus per language, loaded from data/<language>.txt.
    print("LOADING...")
    training_set = {}
    for language in ["en", "no", "da", "de"]:
        training_set[language] = InMemoryCorpus(f"data/{language}.txt")

    # Estimate probabilities from the training set.
    print("TRAINING...")
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)

    # Classify fragments together with the language each should come out as.
    print("CLASSIFYING...")
    samples = [
        ("Mon tro om det riktige språket identifiseres? Dette er norsk bokmål, forøvrig.",
         "no"),
        ("I don't believe that the number of tokens exceeds a billion.", "en"),
        ("De danske drenge drikker snaps!", "da"),
        ("Der Kriminalpolizei! Haben sie angst?", "de"),
    ]
    for buffer, language in samples:
        print(buffer)
        results.clear()
        classifier.classify(buffer, match_collector)
        top_match = results[0]
        assert top_match["category"] == language

    # For demonstration purposes, replicate Example 13.1 on pages 241 and 242
    # in the textbook.
    training_set = {
        "china": build_corpus([
            "Chinese Beijing Chinese",
            "Chinese Chinese Shanghai",
            "Chinese Macao",
        ]),
        "not china": build_corpus(["Tokyo Japan Chinese"]),
    }
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)
    buffer = "Chinese Chinese Chinese Tokyo Japan"
    print(buffer)
    results.clear()
    classifier.classify(buffer, match_collector)
    assert len(results) == 2
    winner, runner_up = results
    assert winner["category"] == "china"
    assert runner_up["category"] == "not china"
    assert math.isclose(math.exp(winner["score"]), 0.0003, abs_tol=0.00001)
    assert math.isclose(math.exp(runner_up["score"]), 0.0001, abs_tol=0.00005)
Beispiel #15
0
def test_simple_search_engine():
    """Query a SimpleSearchEngine over a generated corpus and verify the hits.

    Each mismatch is printed as soon as it is found and also recorded, so a
    single run reports every failed check; the test then fails at the end if
    anything was recorded.

    Raises:
        AssertionError: If at least one check failed.
    """
    from itertools import product, combinations_with_replacement
    from tokenization import BrainDeadTokenizer
    from normalization import BrainDeadNormalizer
    from corpus import InMemoryCorpus, InMemoryDocument
    from invertedindex import InMemoryInvertedIndex
    from searchengine import SimpleSearchEngine
    from ranking import BrainDeadRanker

    epsilon = 0.0001
    corpus = InMemoryCorpus()

    # Every multiset of three consonant-vowel-consonant words is one document.
    words = [''.join(letters) for letters in product('bcd', 'aei', 'jkl')]
    for triple in combinations_with_replacement(words, 3):
        corpus.add_document(
            InMemoryDocument(corpus.size(), {'a': ' '.join(triple)}))

    engine = SimpleSearchEngine(
        corpus,
        InMemoryInvertedIndex(corpus, ('a', ), BrainDeadNormalizer(),
                              BrainDeadTokenizer()))

    # (score, document_id) pairs collected for the most recent query.
    results = []
    # Messages for every failed check; asserted empty at the end.
    failures = []

    def fail(*parts):
        # Keep the immediate console output, but also remember the failure so
        # the test cannot pass silently.
        print(*parts)
        failures.append(' '.join(str(part) for part in parts))

    def search(q, r, n):
        results.clear()

        def match(m):
            results.append((m['score'], m['document'].document_id))

        print('searching "' + q + '" at threshold', r, '…')
        # NOTE(review): other callers in this file pass 'match_threshold';
        # confirm which option key SimpleSearchEngine actually reads.
        engine.evaluate(q, {
            'recall_threshold': r,
            'hit_count': n
        }, BrainDeadRanker(), match)

    def sort_results():
        # Descending score, ties broken by ascending document ID.
        results.sort(key=lambda e: e[1])
        results.sort(key=lambda e: e[0], reverse=True)

    def check_at(i, expected):
        # Too few results is reported as a failure instead of an IndexError.
        if i >= len(results):
            fail('FAILED, EXPECTED ', expected, ' RESULT', i, ' is missing')
        elif results[i] != expected:
            fail('FAILED, EXPECTED ', expected, ' RESULT', i, ' was',
                 results[i])

    def check_range(indices, score, docrange):
        for i, d in zip(indices, docrange):
            check_at(i, (score, d))

    def check_hits(n):
        if len(results) != n:
            fail('FAILED, expected', n, 'results, got', len(results))

    search('baj BAJ    baj', 1.0, 27)
    check_hits(27)
    check_at(0, (9.0, 0))
    sort_results()
    check_range(range(1, 27), 6.0, range(1, 27))
    search('baj CAj', 1.0, 100)
    check_hits(27)
    search('baj caj daj', 2 / 3 + epsilon, 100)
    check_hits(79)
    search('baj caj', 2 / 3 + epsilon, 100)
    check_hits(100)
    sort_results()
    check_at(0, (3.0, 0))
    check_range(range(4, 12), 2.0, range(1, 9))
    check_range(range(12, 29), 2.0, range(10, 27))
    check_at(29, (2.0, 35))
    check_at(78, (2.0, 2531))
    search('baj cek dil', 1.0, 10)
    check_hits(1)
    check_at(0, (3.0, 286))
    search('baj cek dil', 2 / 3 + epsilon, 80)
    check_hits(79)
    sort_results()
    check_at(0, (3.0, 13))
    check_at(1, (3.0, 26))
    check_at(2, (3.0, 273))
    search('baj xxx yyy', 2 / 3 + epsilon, 100)
    check_hits(0)
    search('baj xxx yyy', 2 / 3 - epsilon, 100)
    check_hits(100)

    # Previously mismatches were only printed, so the test could never fail.
    assert not failures, '\n'.join(failures)
Beispiel #16
0
 def get_document(self, document_id: int) -> Document:
     """Return a new InMemoryDocument whose 'body' is looked up by document ID.

     The body text comes from inverted_index.get_doc_body(document_id);
     inverted_index is resolved from an enclosing scope, not from self.
     """
     body_text = inverted_index.get_doc_body(document_id)
     fields = {'body': body_text}
     return InMemoryDocument(document_id, fields)