Code Example #1
def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from ahocorasick import Trie, StringFinder
    print("Building trie from MeSH corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
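    # Load the MeSH corpus and add each normalized document body to the trie dictionary.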
    corpus = InMemoryCorpus(os.path.join(data_path, 'mesh.txt'))
    dictionary = Trie()
    for document in corpus:
        dictionary.add(
            normalizer.normalize(normalizer.canonicalize(document["body"])),
            tokenizer)
    engine = StringFinder(dictionary, tokenizer)
    print("Enter some text and locate words and phrases that are MeSH terms.")

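    # Scan the normalized input text and collect every dictionary match via the callback.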
    def evaluator(text):
        matches = []
        engine.scan(normalizer.normalize(normalizer.canonicalize(text)),
                    lambda m: matches.append(m))
        return matches

    simple_repl("text", evaluator)
Code Example #2
class TestBrainDeadNormalizer(unittest.TestCase):
    def setUp(self):
        from normalization import BrainDeadNormalizer
        self._normalizer = BrainDeadNormalizer()

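    # Canonicalization is expected to leave the text untouched for this simple normalizer.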
    def test_canonicalize(self):
        self.assertEqual(self._normalizer.canonicalize("Dette ER en\nprØve!"),
                         "Dette ER en\nprØve!")

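    # Normalization is expected to lowercase the text, Norwegian characters included.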
    def test_normalize(self):
        self.assertEqual(
            self._normalizer.normalize("grÅFustaSJEOpphengsForKOBling"),
            "gråfustasjeopphengsforkobling")
Code Example #3
File: assignments.py Project: havarf/s-kemotor
def assignment_a():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Dump postings for a dummy two-document corpus.
    print("INDEXING...")
    corpus = InMemoryCorpus()
    corpus.add_document(InMemoryDocument(0, {"body": "this is a Test"}))
    corpus.add_document(InMemoryDocument(1, {"body": "test TEST prØve"}))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    for (term, expected) in zip(index.get_terms("PRøvE wtf tesT"),
                                [[(1, 1)], [], [(0, 1), (1, 2)]]):
        print(term)
        assert term in ["prøve", "wtf", "test"]
        postings = list(index.get_postings_iterator(term))
        for posting in postings:
            print(posting)
        assert len(postings) == len(expected)
        assert [(p.document_id, p.term_frequency)
                for p in postings] == expected
    print(index)

    # Again, for a slightly bigger corpus.
    print("LOADING...")
    corpus = InMemoryCorpus("data/mesh.txt")
    print("INDEXING...")
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    for (term, expected_length) in [("hydrogen", 8), ("hydrocephalus", 2)]:
        print(term)
        for posting in index.get_postings_iterator(term):
            print(posting)
        assert len(list(index.get_postings_iterator(term))) == expected_length

    # Test that we merge posting lists correctly. Note the implicit test of robustness to case and whitespace.
    print("MERGING...")
    merger = PostingsMerger()
    and_query = ("HIV  pROtein", "AND", [11316, 11319, 11320, 11321])
    or_query = ("water Toxic", "OR",
                [3078, 8138, 8635, 9379, 14472, 18572, 23234, 23985] +
                [i for i in range(25265, 25282)])
    for (query, operator, expected_document_ids) in [and_query, or_query]:
        print(re.sub(r"\W+", " " + operator + " ", query))
        terms = list(index.get_terms(query))
        assert len(terms) == 2
        postings = [
            index.get_postings_iterator(terms[i]) for i in range(len(terms))
        ]
        merged = {
            "AND": merger.intersection,
            "OR": merger.union
        }[operator](postings[0], postings[1])
        documents = [
            corpus.get_document(posting.document_id) for posting in merged
        ]
        print(*documents, sep="\n")
        assert len(documents) == len(expected_document_ids)
        assert [d.get_document_id()
                for d in documents] == expected_document_ids
Code Example #4
def setUp(self):
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryDocument, InMemoryCorpus
    from invertedindex import InMemoryInvertedIndex
    from ranking import BetterRanker
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus()
    corpus.add_document(
        InMemoryDocument(0, {
            "title": "the foo",
            "static_quality_score": 0.9
        }))
    corpus.add_document(
        InMemoryDocument(1, {
            "title": "the foo",
            "static_quality_score": 0.2
        }))
    corpus.add_document(
        InMemoryDocument(2, {
            "title": "the foo foo",
            "static_quality_score": 0.2
        }))
    corpus.add_document(InMemoryDocument(3, {"title": "the bar"}))
    corpus.add_document(InMemoryDocument(4, {"title": "the bar bar"}))
    corpus.add_document(InMemoryDocument(5, {"title": "the baz"}))
    corpus.add_document(InMemoryDocument(6, {"title": "the baz"}))
    corpus.add_document(InMemoryDocument(7, {"title": "the baz baz"}))
    index = InMemoryInvertedIndex(corpus, ["title"], normalizer, tokenizer)
    self._ranker = BetterRanker(corpus, index)
Code Example #5
def assignment_a_postingsmerger_1():

    # A small but real corpus.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus("./data/mesh.txt")
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)

    # Test that we merge posting lists correctly. Note the implicit test of robustness to case and whitespace.
    print("MERGING...")
    merger = PostingsMerger()
    and_query = ("HIV  pROtein", "AND", [11316, 11319, 11320, 11321])
    or_query = ("water Toxic", "OR",
                [3078, 8138, 8635, 9379, 14472, 18572, 23234, 23985] +
                [i for i in range(25265, 25282)])
    for (query, operator, expected_document_ids) in [and_query, or_query]:
        print(re.sub(r"\W+", " " + operator + " ", query))
        terms = list(index.get_terms(query))
        assert len(terms) == 2
        postings = [index[terms[i]] for i in range(len(terms))]
        merged = {
            "AND": merger.intersection,
            "OR": merger.union
        }[operator](postings[0], postings[1])
        documents = [corpus[posting.document_id] for posting in merged]
        print(*documents, sep="\n")
        assert len(documents) == len(expected_document_ids)
        assert [d.document_id for d in documents] == expected_document_ids
Code Example #6
File: assignments.py Project: havarf/s-kemotor
def assignment_e_naivebayes_2():

    # Use these throughout below. These are really language-specific functions, so it's a huge
    # simplification to use these for a language identifier.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    # Callback for receiving results. Received scores are log-probabilities.
    def match_collector(match: dict):
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Replicate Example 13.1 on pages 241 and 242 in the textbook.
    china = InMemoryCorpus()
    china.add_document(InMemoryDocument(0,
                                        {"body": "Chinese Beijing Chinese"}))
    china.add_document(
        InMemoryDocument(1, {"body": "Chinese Chinese Shanghai"}))
    china.add_document(InMemoryDocument(2, {"body": "Chinese Macao"}))
    not_china = InMemoryCorpus()
    not_china.add_document(InMemoryDocument(0,
                                            {"body": "Tokyo Japan Chinese"}))
    training_set = {"china": china, "not china": not_china}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)
    buffer = "Chinese Chinese Chinese Tokyo Japan"
    print(buffer)
    results.clear()
    classifier.classify(buffer, match_collector)
    assert len(results) == 2
    assert results[0]["category"] == "china"
    assert results[1]["category"] == "not china"
    assert math.isclose(math.exp(results[0]["score"]), 0.0003, abs_tol=0.00001)
    assert math.isclose(math.exp(results[1]["score"]), 0.0001, abs_tol=0.00005)
Code Example #7
File: repl1.py Project: 181221/IN4120-SOEK
def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import ShingleGenerator
    from corpus import InMemoryCorpus
    from invertedindex import InMemoryInvertedIndex
    from ranking import BrainDeadRanker
    from searchengine import SimpleSearchEngine
    print("Indexing MeSH corpus...")
    normalizer = BrainDeadNormalizer()
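    # Shingle (n-gram) tokenization of width 3; this is what enables fuzzy matching of misspelled queries.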
    tokenizer = ShingleGenerator(3)
    corpus = InMemoryCorpus(os.path.join(data_path, 'mesh.txt'))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    ranker = BrainDeadRanker()
    engine = SimpleSearchEngine(corpus, index)
    options = {"debug": False, "hit_count": 5, "match_threshold": 0.5}
    print("Enter a query and find matching documents.")
    print(f"Lookup options are {options}.")
    print(f"Tokenizer is {tokenizer.__class__.__name__}.")
    print(f"Ranker is {ranker.__class__.__name__}.")

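    # Collect matches via the callback and return them to the REPL.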
    def evaluator(query):
        matches = []
        engine.evaluate(query, options, ranker, lambda m: matches.append(m))
        return matches

    simple_repl("query", evaluator)
Code Example #8
File: assignments.py Project: 181221/IN4120-SOEK
def assignment_a_inverted_index_1():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Dump postings for a dummy two-document corpus.
    print("INDEXING...")
    corpus = InMemoryCorpus()
    corpus.add_document(InMemoryDocument(0, {"body": "this is a Test"}))
    corpus.add_document(InMemoryDocument(1, {"body": "test TEST prØve"}))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    for (term, expected) in zip(index.get_terms("PRøvE wtf tesT"), [[(1, 1)], [], [(0, 1), (1, 2)]]):
        print(term)
        assert term in ["prøve", "wtf", "test"]
        postings = list(index[term])
        for posting in postings:
            print(posting)
        assert len(postings) == len(expected)
        assert [(p.document_id, p.term_frequency) for p in postings] == expected
    print(index)

    # Document counts should be correct.
    assert index.get_document_frequency("wtf") == 0
    assert index.get_document_frequency("test") == 2
    assert index.get_document_frequency("prøve") == 1
Code Example #9
def assignment_a_inverted_index_2():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Dump postings for a slightly bigger corpus.
    print("LOADING...")
    corpus = InMemoryCorpus("./data/mesh.txt")
    print("INDEXING...")
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    for (term, expected_length) in [("hydrogen", 8), ("hydrocephalus", 2)]:
        print(term)
        for posting in index[term]:
            print(posting)
        assert len(list(index[term])) == expected_length
Code Example #10
def assignment_c_simplesearchengine_1():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Load and index MeSH terms.
    print("LOADING...")
    corpus = InMemoryCorpus("../data/mesh.txt")
    print("INDEXING...")
    inverted_index = InMemoryInvertedIndex(corpus, ["body"], normalizer,
                                           tokenizer)

    # Do ranked retrieval, using a simple ranker.
    engine = SimpleSearchEngine(corpus, inverted_index)
    simple_ranker = BrainDeadRanker()
    results = []

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    query = "polluTION Water"
    for match_threshold in [0.1, 1.0]:
        print(f"SEARCHING for '{query}' with match threshold {match_threshold}...")
        results.clear()
        options = {
            "match_threshold": match_threshold,
            "hit_count": 10,
            "debug": False
        }
        engine.evaluate(query, options, simple_ranker, match_collector)
        assert len(results) == {0.1: 10, 1.0: 3}[match_threshold]
        for (score, document_id) in [(match["score"],
                                      match["document"].document_id)
                                     for match in results[:3]]:
            assert score == 2.0  # Both 'pollution' and 'water'.
            assert document_id in [25274, 25275, 25276]
        for score in [match["score"] for match in results[3:]]:
            assert score == 1.0  # Only 'pollution' or 'water', but not both.
Code Example #11
File: assignments.py Project: havarf/s-kemotor
def assignment_d_betterranker():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []
    hit_count = 10

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Load and index some English news sentences. Look at the output and compare the two rankers!
    # The naive ranker assigns equal weight to all words (including stopwords), whereas the improved
    # ranker does not. The test below for the improved ranker (with document #24 being the winner)
    # assumes a straightforward implementation of a TF-IDF ranking scheme as described in the
    # textbook.
    print("LOADING...")
    corpus = InMemoryCorpus("data/en.txt")
    print("INDEXING...")
    inverted_index = InMemoryInvertedIndex(corpus, ["body"], normalizer,
                                           tokenizer)
    simple_ranker = BrainDeadRanker()
    better_ranker = BetterRanker(corpus, inverted_index)
    engine = SimpleSearchEngine(corpus, inverted_index)
    for query in ["the terrorism attack and obama"]:
        options = {
            "match_threshold": 0.1,
            "hit_count": hit_count,
            "debug": False
        }
        for ranker in [simple_ranker, better_ranker]:
            print("SEARCHING for '" + query + "' using " +
                  ranker.__class__.__name__ + "...")
            results.clear()
            engine.evaluate(query, options, ranker, match_collector)
            winner_document_ids = {
                simple_ranker: [9221, 7263],
                better_ranker: [24]
            }[ranker]
            assert 0 < len(results) <= hit_count
            assert results[0]["document"].document_id in winner_document_ids
Code Example #12
File: assignments.py Project: 181221/IN4120-SOEK
def assignment_a_inverted_index_3():
    # Test that multiple fields are handled correctly.

    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    doc = InMemoryDocument(document_id=0, fields={
        'felt 1': 'Dette er en test. Test, sa jeg. TEST!',
        'felt 2': 'test er det',
        'felt 3': 'test TEsT',
    })
    corpus = InMemoryCorpus()
    corpus.add_document(doc)

    index = InMemoryInvertedIndex(corpus, ['felt 1', 'felt 3'], normalizer, tokenizer)
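    # Only 'felt 1' and 'felt 3' are indexed, so 'test' occurs 3 + 2 = 5 times in document 0.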
    p = next(index.get_postings_iterator('test'))
    print(f"term-freq: {p.term_frequency} (correct is 5)")
    assert p.document_id == 0
    assert p.term_frequency == 5
Code Example #13
File: repl.py Project: 181221/IN4120-SOEK
def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from naivebayesclassifier import NaiveBayesClassifier
    print("Initializing naive Bayes classifier from news corpora...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    languages = ["en", "no", "da", "de"]
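    # One training corpus per language; the dictionary keys double as the classifier's category names.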
    training_set = {language: InMemoryCorpus(os.path.join(data_path, f"{language}.txt"))
                    for language in languages}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer, tokenizer)
    print(f"Enter some text and classify it into {languages}.")
    print(f"Returned scores are log-probabilities.")

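    # Collect classification results via the callback and return them to the REPL.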
    def evaluator(text):
        results = []
        classifier.classify(text, lambda m: results.append(m))
        return results
    simple_repl("text", evaluator)
Code Example #14
File: assignments.py Project: havarf/s-kemotor
def assignment_b_suffixarray_1():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Prepare for some suffix array lookups.
    print("LOADING...")
    corpus = InMemoryCorpus("data/cran.xml")
    print("INDEXING...")
    engine = SuffixArray(corpus, ["body"], normalizer, tokenizer)
    results = []
    hit_count = 5

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Define the actual test queries.
    test1 = ("visc", 11, [328])  # Look for {'viscous', 'viscosity', ...}.
    test2 = ("Of  A", 10, [946])  # Test robustness for case and whitespace.
    test3 = ("", 0, [])  # Safety feature: Match nothing instead of everything.
    test4 = ("approximate solution", 3, [1374, 159])  # Multiple winners.

    # Test that the simple occurrence ranking works. Be robust towards how ties are resolved.
    for (query, winner_score,
         winner_document_ids) in [test1, test2, test3, test4]:
        print("SEARCHING for '" + query + "'...")
        results.clear()
        engine.evaluate(query, {
            "debug": False,
            "hit_count": hit_count
        }, match_collector)
        if winner_document_ids:
            assert results[0]["score"] == winner_score
            assert results[0]["document"].document_id in winner_document_ids
            assert len(results) <= hit_count
        else:
            assert len(results) == 0
Code Example #15
File: assignments.py Project: havarf/s-kemotor
def assignment_d_shinglegenerator_2():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = ShingleGenerator(3)
    ranker = BrainDeadRanker()
    results = []
    hit_count = 10

    # Load MeSH terms.
    print("LOADING...")
    corpus = InMemoryCorpus("data/mesh.txt")

    # Do ranked retrieval, using n-grams (shingles) and a simple ranker. This allows for fuzzy retrieval.
    print("INDEXING...")
    inverted_index = InMemoryInvertedIndex(corpus, ["body"], normalizer,
                                           tokenizer)
    engine = SimpleSearchEngine(corpus, inverted_index)

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Test with some misspelled queries. Be robust to arbitrary tie-breaking.
    for (query, winner_score, winner_document_ids) in [
        ("orGAnik kEMmistry", 8.0, [16981, 16980, 4411, 4410, 4408]),
        ("synndrome", 7.0, [1275])
    ]:
        print("SEARCHING for '" + query + "'...")
        results.clear()
        options = {
            "match_threshold": 0.1,
            "hit_count": hit_count,
            "debug": False
        }
        engine.evaluate(query, options, ranker, match_collector)
        assert 0 < len(results) <= hit_count
        assert results[0]["score"] == winner_score
        assert results[0]["document"].document_id in winner_document_ids
Code Example #16
def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from suffixarray import SuffixArray
    print("Building suffix array from Cranfield corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
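    # Load the Cranfield corpus and build a suffix array over its "body" field.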
    corpus = InMemoryCorpus(os.path.join(data_path, 'cran.xml'))
    engine = SuffixArray(corpus, ["body"], normalizer, tokenizer)
    options = {"debug": False, "hit_count": 5}
    print("Enter a prefix phrase query and find matching documents.")
    print(f"Lookup options are {options}.")
    print("Returned scores are occurrence counts.")

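    # Collect matches via the callback; scores are occurrence counts.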
    def evaluator(query):
        matches = []
        engine.evaluate(query, options, lambda m: matches.append(m))
        return matches

    simple_repl("query", evaluator)
Code Example #17
File: repl.py Project: 181221/IN4120-SOEK
def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from invertedindex import InMemoryInvertedIndex

    print("Building inverted index from Cranfield corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus(os.path.join(data_path, 'cran.xml'))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    print("Enter one or more index terms and inspect their posting lists.")

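    # Map each normalized query term to its posting list for inspection.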
    def evaluator(terms):
        terms = index.get_terms(terms)
        return {
            term: list(index.get_postings_iterator(term))
            for term in terms
        }

    simple_repl("terms", evaluator)
Code Example #18
File: assignments.py Project: havarf/s-kemotor
def assignment_e_naivebayes_1():

    # Use these throughout below. These are really language-specific functions, so it's a huge
    # simplification to use these for a language identifier.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    # Callback for receiving results. Received scores are log-probabilities.
    def match_collector(match: dict):
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Use this as the training set for our language identifier.
    print("LOADING...")
    training_set = {
        language: InMemoryCorpus("data/" + language + ".txt")
        for language in ["en", "no", "da", "de"]
    }

    # Assess probabilities from the training set.
    print("TRAINING...")
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)

    # Classify some previously unseen text fragments.
    print("CLASSIFYING...")
    for (buffer, language) in [
        ("Mon tro om det riktige språket identifiseres? Dette er norsk bokmål, forøvrig.",
         "no"),
        ("I don't believe that the number of tokens exceeds a billion.", "en"),
        ("De danske drenge drikker snaps!", "da"),
        ("Der Kriminalpolizei! Haben sie angst?", "de")
    ]:
        print(buffer)
        results.clear()
        classifier.classify(buffer, match_collector)
        assert results[0]["category"] == language
Code Example #19
File: assignments.py Project: havarf/s-kemotor
def assignment_c_simplesearchengine_3():

    # All accesses to posting lists are logged here.
    accesses = []

    # For testing.
    class AccessLoggedIterator(Iterator[Posting]):
        def __init__(self, term: str, wrapped: Iterator[Posting]):
            self._term = term
            self._wrapped = wrapped

        def __next__(self):
            posting = next(self._wrapped)
            accesses.append((self._term, posting.document_id))
            return posting

    # For testing.
    class AccessLoggedInvertedIndex(InvertedIndex):
        def __init__(self, wrapped: InvertedIndex):
            self._wrapped = wrapped

        def get_terms(self, buffer: str) -> Iterator[str]:
            return self._wrapped.get_terms(buffer)

        def get_postings_iterator(self, term: str) -> Iterator[Posting]:
            return AccessLoggedIterator(
                term, self._wrapped.get_postings_iterator(term))

        def get_document_frequency(self, term: str) -> int:
            return self._wrapped.get_document_frequency(term)

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Load and index MeSH terms.
    corpus = InMemoryCorpus("data/mesh.txt")
    inverted_index = AccessLoggedInvertedIndex(
        InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer))

    # Do ranked retrieval, using a simple ranker.
    engine = SimpleSearchEngine(corpus, inverted_index)
    simple_ranker = BrainDeadRanker()
    query = "Water  polluTION"
    options = {"match_threshold": 0.5, "hit_count": 1, "debug": False}
    engine.evaluate(query, options, simple_ranker, lambda m: m)

    # Expected posting list traversal ordering if the implementation chooses to evaluate this as "water pollution".
    ordering1 = [('water', 3078), ('pollution', 788), ('pollution', 789),
                 ('pollution', 790), ('pollution', 8079), ('water', 8635),
                 ('pollution', 23837), ('water', 9379), ('water', 23234),
                 ('water', 25265), ('pollution', 25274), ('water', 25266),
                 ('water', 25267), ('water', 25268), ('water', 25269),
                 ('water', 25270), ('water', 25271), ('water', 25272),
                 ('water', 25273), ('water', 25274), ('water', 25275),
                 ('pollution', 25275), ('water', 25276), ('pollution', 25276),
                 ('water', 25277), ('water', 25278), ('water', 25279),
                 ('water', 25280), ('water', 25281)]

    # Expected posting list traversal ordering if the implementation chooses to evaluate this as "pollution water".
    ordering2 = [('pollution', 788), ('water', 3078), ('pollution', 789),
                 ('pollution', 790), ('pollution', 8079), ('water', 8635),
                 ('pollution', 23837), ('water', 9379), ('water', 23234),
                 ('water', 25265), ('pollution', 25274), ('water', 25266),
                 ('water', 25267), ('water', 25268), ('water', 25269),
                 ('water', 25270), ('water', 25271), ('water', 25272),
                 ('water', 25273), ('water', 25274), ('pollution', 25275),
                 ('water', 25275), ('pollution', 25276), ('water', 25276),
                 ('water', 25277), ('water', 25278), ('water', 25279),
                 ('water', 25280), ('water', 25281)]

    # Check that the posting lists have been accessed in a way that's consistent with document-at-a-time traversal.
    # Be somewhat robust to implementation details. This is a fairly strict test, and advanced (but valid)
    # implementations that happen to do lookaheads or similar might fail.
    assert accesses == ordering1 or accesses == ordering2
Code Example #20
File: assignments.py Project: havarf/s-kemotor
def assignment_c_simplesearchengine_2():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    ranker = BrainDeadRanker()

    # Used to nudge the match threshold just above or below exact fractions like 2/3.
    epsilon = 0.0001

    # Create a dummy test corpus.
    corpus = InMemoryCorpus()
    words = (''.join(term) for term in product("bcd", "aei", "jkl"))
    texts = (' '.join(word)
             for word in combinations_with_replacement(words, 3))
    for text in texts:
        corpus.add_document(InMemoryDocument(corpus.size(), {'a': text}))

    # What we're testing.
    engine = SimpleSearchEngine(
        corpus, InMemoryInvertedIndex(corpus, ["a"], normalizer, tokenizer))

    # Where the callback will collect the matches.
    results = []

    # Callback that collects matches.
    def collect(m):
        results.append((m['score'], m['document'].document_id))

    # Executes a query.
    def search(q, t, n):
        results.clear()
        engine.evaluate(q, {
            'match_threshold': t,
            'hit_count': n
        }, ranker, collect)

    # Sorts the collected matches.
    def sort_results():
        results.sort(key=lambda e: e[1])
        results.sort(key=lambda e: e[0], reverse=True)

    # Test predicate.
    def check_at(i, expected):
        assert results[i] == expected

    # Test predicate.
    def check_range(indices, score, document_ids):
        for i, d in zip(indices, document_ids):
            check_at(i, (score, d))

    # Test predicate.
    def check_hits(n):
        assert len(results) == n

    # Run tests!
    search('baj BAJ    baj', 1.0, 27)
    check_hits(27)
    check_at(0, (9.0, 0))
    sort_results()
    check_range(range(1, 27), 6.0, range(1, 27))
    search('baj caj', 1.0, 100)
    check_hits(27)
    search('baj caj daj', 2 / 3 + epsilon, 100)
    check_hits(79)
    search('baj caj', 2 / 3 + epsilon, 100)
    check_hits(100)
    sort_results()
    check_at(0, (3.0, 0))
    check_range(range(4, 12), 2.0, range(1, 9))
    check_range(range(12, 29), 2.0, range(10, 27))
    check_at(29, (2.0, 35))
    check_at(78, (2.0, 2531))
    search('baj cek dil', 1.0, 10)
    check_hits(1)
    check_at(0, (3.0, 286))
    search('baj cek dil', 2 / 3 + epsilon, 80)
    check_hits(79)
    sort_results()
    check_at(0, (3.0, 13))
    check_at(1, (3.0, 26))
    check_at(2, (3.0, 273))
    search('baj xxx yyy', 2 / 3 + epsilon, 100)
    check_hits(0)
    search('baj xxx yyy', 2 / 3 - epsilon, 100)
    check_hits(100)
Code Example #21
def setUp(self):
    from normalization import BrainDeadNormalizer
    self._normalizer = BrainDeadNormalizer()
Code Example #22
File: testc2.py Project: havarf/s-kemotor
def test_simple_search_engine():
    from itertools import product, combinations_with_replacement
    from tokenization import BrainDeadTokenizer
    from normalization import BrainDeadNormalizer
    from corpus import InMemoryCorpus, InMemoryDocument
    from invertedindex import InMemoryInvertedIndex
    from searchengine import SimpleSearchEngine
    from ranking import BrainDeadRanker

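    # Tolerance used to push the threshold just above or below 2/3.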
    Ɛ = 0.0001
    corpus = InMemoryCorpus()

    for txt in (' '.join(w) for w in combinations_with_replacement(
            list(''.join(t) for t in product(
                'bcd',
                'aei',
                'jkl',
            )), 3)):
        corpus.add_document(InMemoryDocument(corpus.size(), {'a': txt}))

    engine = SimpleSearchEngine(
        corpus,
        InMemoryInvertedIndex(corpus, ('a', ), BrainDeadNormalizer(),
                              BrainDeadTokenizer()))

    results = []

    def search(q, r, n):
        results.clear()

        def match(m):
            results.append((m['score'], m['document'].document_id))

        print('searching "' + q + '" at threshold', r, '…')
        engine.evaluate(q, {
            'recall_threshold': r,
            'hit_count': n
        }, BrainDeadRanker(), match)

    def sort_results():
        results.sort(key=lambda e: e[1])
        results.sort(key=lambda e: e[0], reverse=True)

    def check_at(i, expected):
        if results[i] != expected:
            print('FAILED, EXPECTED ', expected, ' RESULT', i, ' was',
                  results[i])

    def check_range(indices, score, docrange):
        for i, d in zip(indices, docrange):
            check_at(i, (score, d))

    def check_hits(n):
        if len(results) != n:
            print('FAILED, expected', n, 'results, got', len(results))

    search('baj BAJ    baj', 1.0, 27)
    check_hits(27)
    check_at(0, (9.0, 0))
    sort_results()
    check_range(range(1, 27), 6.0, range(1, 27))
    search('baj CAj', 1.0, 100)
    check_hits(27)
    search('baj caj daj', 2 / 3 + Ɛ, 100)
    check_hits(79)
    search('baj caj', 2 / 3 + Ɛ, 100)  # here
    check_hits(100)
    sort_results()
    check_at(0, (3.0, 0))
    check_range(range(4, 12), 2.0, range(1, 9))
    check_range(range(12, 29), 2.0, range(10, 27))
    check_at(29, (2.0, 35))
    check_at(78, (2.0, 2531))
    search('baj cek dil', 1.0, 10)
    check_hits(1)
    check_at(0, (3.0, 286))
    search('baj cek dil', 2 / 3 + Ɛ, 80)
    check_hits(79)
    sort_results()
    check_at(0, (3.0, 13))
    check_at(1, (3.0, 26))
    check_at(2, (3.0, 273))
    search('baj xxx yyy', 2 / 3 + Ɛ, 100)
    check_hits(0)
    search('baj xxx yyy', 2 / 3 - Ɛ, 100)
    check_hits(100)
Code Example #23
def assignment_d():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    simple_tokenizer = BrainDeadTokenizer()

    # Load MeSH terms.
    print("LOADING...")
    corpus = InMemoryCorpus("data/mesh.txt")

    # Do ranked retrieval, using n-grams and a simple ranker. This allows for fuzzy retrieval.
    print("INDEXING...")
    shingle_generator = ShingleGenerator(3)
    shingle_inverted_index = InMemoryInvertedIndex(corpus, ["body"],
                                                   normalizer,
                                                   shingle_generator)
    shingle_engine = SimpleSearchEngine(corpus, shingle_inverted_index)
    simple_ranker = BrainDeadRanker()
    results = []
    hit_count = 10

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Test with some misspelled queries. Be robust to arbitrary tie-breaking.
    for (query, winner_score, winner_document_ids) in [
        ("orGAnik kEMmistry", 8.0, [16981, 16980, 4411, 4410, 4408]),
        ("synndrome", 7.0, [1275])
    ]:
        print("SEARCHING for '" + query + "'...")
        results.clear()
        options = {
            "match_threshold": 0.1,
            "hit_count": hit_count,
            "debug": False
        }
        shingle_engine.evaluate(query, options, simple_ranker, match_collector)
        assert 0 < len(results) <= hit_count
        assert results[0]["score"] == winner_score
        assert results[0]["document"].document_id in winner_document_ids

    # Load and index some English news sentences. Look at the output and compare the two rankers!
    # The naive ranker assigns equal weight to all words (including stopwords), whereas the improved
    # ranker does not. The test below for the improved ranker (with document #24 being the winner)
    # assumes a straightforward implementation of a TF-IDF ranking scheme as described in the
    # textbook.
    print("LOADING...")
    corpus = InMemoryCorpus("data/en.txt")
    print("INDEXING...")
    inverted_index = InMemoryInvertedIndex(corpus, ["body"], normalizer,
                                           simple_tokenizer)
    better_ranker = BetterRanker(corpus, inverted_index)
    engine = SimpleSearchEngine(corpus, inverted_index)
    for query in ["the terrorism attack and obama"]:
        options = {
            "match_threshold": 0.1,
            "hit_count": hit_count,
            "debug": False
        }
        for ranker in [simple_ranker, better_ranker]:
            print("SEARCHING for '" + query + "' using " +
                  ranker.__class__.__name__ + "...")
            results.clear()
            engine.evaluate(query, options, ranker, match_collector)
            winner_document_ids = {
                simple_ranker: [9221, 7263],
                better_ranker: [24]
            }[ranker]
            assert 0 < len(results) <= hit_count
            assert results[0]["document"].document_id in winner_document_ids
Code Example #24
def assignment_e():

    # Use these throughout below. These are really language-specific functions, so it's a huge
    # simplification to use these for a language identifier.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    # Callback for receiving results. Received scores are log-probabilities.
    def match_collector(match: dict):
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Use this as the training set for our language identifier.
    print("LOADING...")
    training_set = {
        language: InMemoryCorpus("data/" + language + ".txt")
        for language in ["en", "no", "da", "de"]
    }

    # Assess probabilities from the training set.
    print("TRAINING...")
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)

    # Classify some previously unseen text fragments.
    print("CLASSIFYING...")
    for (buffer, language) in [
        ("Mon tro om det riktige språket identifiseres? Dette er norsk bokmål, forøvrig.",
         "no"),
        ("I don't believe that the number of tokens exceeds a billion.", "en"),
        ("De danske drenge drikker snaps!", "da"),
        ("Der Kriminalpolizei! Haben sie angst?", "de")
    ]:
        print(buffer)
        results.clear()
        classifier.classify(buffer, match_collector)
        assert results[0]["category"] == language

    # For demonstration purposes, replicate Example 13.1 on pages 241 and 242 in the textbook.
    china = InMemoryCorpus()
    china.add_document(InMemoryDocument(0,
                                        {"body": "Chinese Beijing Chinese"}))
    china.add_document(
        InMemoryDocument(1, {"body": "Chinese Chinese Shanghai"}))
    china.add_document(InMemoryDocument(2, {"body": "Chinese Macao"}))
    not_china = InMemoryCorpus()
    not_china.add_document(InMemoryDocument(0,
                                            {"body": "Tokyo Japan Chinese"}))
    training_set = {"china": china, "not china": not_china}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)
    buffer = "Chinese Chinese Chinese Tokyo Japan"
    print(buffer)
    results.clear()
    classifier.classify(buffer, match_collector)
    assert len(results) == 2
    assert results[0]["category"] == "china"
    assert results[1]["category"] == "not china"
    assert math.isclose(math.exp(results[0]["score"]), 0.0003, abs_tol=0.00001)
    assert math.isclose(math.exp(results[1]["score"]), 0.0001, abs_tol=0.00005)
Code Example #25
File: test_naivebayes.py Project: 181221/IN4120-SOEK
def setUp(self):
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    self._normalizer = BrainDeadNormalizer()
    self._tokenizer = BrainDeadTokenizer()
Code Example #26
def assignment_b():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Prepare for some suffix array lookups.
    print("LOADING...")
    corpus = InMemoryCorpus("data/cran.xml")
    print("INDEXING...")
    engine = SuffixArray(corpus, ["body"], normalizer, tokenizer)
    results = []
    hit_count = 5

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Define the actual test queries.
    test1 = ("visc", 11, [328])  # Look for {'viscous', 'viscosity', ...}.
    test2 = ("Of  A", 10, [946])  # Test robustness for case and whitespace.
    test3 = ("", 0, [])  # Safety feature: Match nothing instead of everything.
    test4 = ("approximate solution", 3, [1374, 159])  # Multiple winners.

    # Test that the simple occurrence ranking works. Be robust towards how ties are resolved.
    for (query, winner_score,
         winner_document_ids) in [test1, test2, test3, test4]:
        print("SEARCHING for '" + query + "'...")
        results.clear()
        engine.evaluate(query, {
            "debug": False,
            "hit_count": hit_count
        }, match_collector)
        if winner_document_ids:
            assert results[0]["score"] == winner_score
            assert results[0]["document"].document_id in winner_document_ids
            assert len(results) <= hit_count
        else:
            assert len(results) == 0

    # Simple test of using a trie-encoded dictionary for efficiently locating substrings in a buffer.
    trie = Trie()
    for s in [
            "romerike", "apple computer", "norsk", "norsk ørret", "sverige",
            "ørret", "banan"
    ]:
        trie.add(s, tokenizer)
    finder = StringFinder(trie, tokenizer)
    buffer = "det var en gang en norsk  ørret fra romerike som likte abba fra sverige"
    print("SCANNING...")
    results.clear()
    finder.scan(buffer, lambda m: results.append(m))
    print("Buffer \"" + buffer + "\" contains", results)
    assert [m["match"] for m in results
            ] == ["norsk", "norsk ørret", "ørret", "romerike", "sverige"]

    # Find all MeSH terms that occur verbatim in some selected Cranfield documents! Since MeSH
    # documents are medical terms and the Cranfield documents have technical content, the
    # overlap probably isn't that big.
    print("LOADING...")
    mesh = InMemoryCorpus("data/mesh.txt")
    cranfield = InMemoryCorpus("data/cran.xml")
    print("BUILDING...")
    trie = Trie()
    for d in mesh:
        trie.add(d["body"] or "", tokenizer)
    finder = StringFinder(trie, tokenizer)
    print("SCANNING...")
    for (document_id,
         expected_matches) in [(0, ["wing", "wing"]),
                               (3, ["solutions", "skin", "friction"]),
                               (1254, ["electrons", "ions"])]:
        document = cranfield[document_id]
        buffer = document["body"] or ""
        results.clear()
        finder.scan(buffer, lambda m: results.append(m))
        print("Cranfield document", document, "contains MeSH terms", results)
        assert [m["match"] for m in results] == expected_matches