Example #1
class TestBrainDeadRanker(unittest.TestCase):
    def setUp(self):
        from ranking import BrainDeadRanker
        self._ranker = BrainDeadRanker()

    def test_term_frequency(self):
        from invertedindex import Posting
        self._ranker.reset(21)
        self._ranker.update("foo", 2, Posting(21, 4))
        self._ranker.update("bar", 1, Posting(21, 3))
        self.assertEqual(self._ranker.evaluate(), 11)
        self._ranker.reset(42)
        self._ranker.update("foo", 1, Posting(42, 1))
        self._ranker.update("baz", 2, Posting(42, 2))
        self.assertEqual(self._ranker.evaluate(), 5)
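The assertions pin down the scoring contract: evaluate() returns the sum of multiplicity * term_frequency accumulated since the last reset() (2*4 + 1*3 = 11, then 1*1 + 2*2 = 5). A minimal sketch of a ranker satisfying this test, assuming Posting exposes a term_frequency attribute:

class BrainDeadRankerSketch:
    # A minimal ranker consistent with the test above: the score of a
    # document is the sum of multiplicity * term frequency over all
    # update() calls since the last reset().

    def __init__(self):
        self._score = 0.0

    def reset(self, document_id):
        self._score = 0.0

    def update(self, term, multiplicity, posting):
        self._score += multiplicity * posting.term_frequency

    def evaluate(self):
        return self._score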
Example #2
def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import ShingleGenerator
    from corpus import InMemoryCorpus
    from invertedindex import InMemoryInvertedIndex
    from ranking import BrainDeadRanker
    from searchengine import SimpleSearchEngine
    print("Indexing MeSH corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = ShingleGenerator(3)
    corpus = InMemoryCorpus(os.path.join(data_path, 'mesh.txt'))  # data_path is assumed defined in the enclosing module.
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    ranker = BrainDeadRanker()
    engine = SimpleSearchEngine(corpus, index)
    options = {"debug": False, "hit_count": 5, "match_threshold": 0.5}
    print("Enter a query and find matching documents.")
    print(f"Lookup options are {options}.")
    print(f"Tokenizer is {tokenizer.__class__.__name__}.")
    print(f"Ranker is {ranker.__class__.__name__}.")

    def evaluator(query):
        matches = []
        engine.evaluate(query, options, ranker, lambda m: matches.append(m))
        return matches

    simple_repl("query", evaluator)
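simple_repl is not defined in this snippet; it presumably lives in the enclosing module alongside data_path. A hypothetical sketch of what such a helper might look like, assuming it loops over user input until an empty line and pretty-prints whatever the evaluator returns:

def simple_repl(prompt, evaluator):
    # Hypothetical helper: prompt for queries until an empty line or EOF,
    # and pretty-print the evaluator's result for each one.
    from pprint import pprint
    while True:
        try:
            query = input(prompt + "> ").strip()
        except EOFError:
            break
        if not query:
            break
        pprint(evaluator(query))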
Example #3
    # Fragment: results, engine, and BrainDeadRanker come from the enclosing scope.
    def search(q, r, n):
        results.clear()

        def match(m):
            results.append((m['score'], m['document'].document_id))

        print('searching "' + q + '" at threshold', r, '…')
        engine.evaluate(q, {
            'recall_threshold': r,
            'hit_count': n
        }, BrainDeadRanker(), match)
Example #4
    def _process_two_term_query_verify_matches(self, query, engine, options, expected):
        from ranking import BrainDeadRanker
        ranker = BrainDeadRanker()
        matches = []
        hits, winners = expected
        engine.evaluate(query, options, ranker, lambda m: matches.append((m["score"], m["document"].document_id)))
        self.assertEqual(len(matches), hits)
        for (score, winner) in matches[:len(winners)]:
            self.assertEqual(score, 2.0)
            self.assertIn(winner, winners)
        for (score, contender) in matches[len(winners):]:
            self.assertEqual(score, 1.0)
Example #5
    def _process_query_verify_matches(self, query, engine, options, expected):
        from itertools import takewhile
        from ranking import BrainDeadRanker
        ranker = BrainDeadRanker()
        matches = []
        hits, score, winners = expected
        engine.evaluate(query, options, ranker, lambda m: matches.append((m["score"], m["document"].document_id)))
        self.assertEqual(len(matches), hits)
        if matches:
            for i in range(1, hits):
                self.assertGreaterEqual(matches[i - 1][0], matches[i][0])
            if score:
                self.assertEqual(matches[0][0], score)
            if winners:
                top = takewhile(lambda m: m[0] == matches[0][0], matches)
                self.assertListEqual(winners, list(sorted([m[1] for m in top])))
Example #6
def assignment_d_betterranker():

    # Imports assumed from the course codebase (function-level, as in the other examples).
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from invertedindex import InMemoryInvertedIndex
    from ranking import BrainDeadRanker, BetterRanker
    from searchengine import SimpleSearchEngine

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []
    hit_count = 10

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Load and index some English news sentences. Look at the output and compare the two rankers!
    # The naive ranker assigns equal weight to all words (including stopwords), whereas the improved
    # ranker does not. The test below for the improved ranker (with document #24 being the winner)
    # assumes a straightforward implementation of a TF-IDF ranking scheme as described in the
    # textbook.
    print("LOADING...")
    corpus = InMemoryCorpus("data/en.txt")
    print("INDEXING...")
    inverted_index = InMemoryInvertedIndex(corpus, ["body"], normalizer,
                                           tokenizer)
    simple_ranker = BrainDeadRanker()
    better_ranker = BetterRanker(corpus, inverted_index)
    engine = SimpleSearchEngine(corpus, inverted_index)
    for query in ["the terrorism attack and obama"]:
        options = {
            "match_threshold": 0.1,
            "hit_count": hit_count,
            "debug": False
        }
        for ranker in [simple_ranker, better_ranker]:
            print("SEARCHING for '" + query + "' using " +
                  ranker.__class__.__name__ + "...")
            results.clear()
            engine.evaluate(query, options, ranker, match_collector)
            winner_document_ids = {
                simple_ranker: [9221, 7263],
                better_ranker: [24]
            }[ranker]
            assert 0 < len(results) <= hit_count
            assert results[0]["document"].document_id in winner_document_ids
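The comment above assumes BetterRanker follows a straightforward textbook TF-IDF scheme. A sketch of such a ranker under that assumption; the class name and details here are illustrative, not the actual implementation (corpus.size() and get_document_frequency() are the calls that the constructor arguments make available):

import math

class TfIdfRankerSketch:
    # Illustrative textbook TF-IDF: a matched term contributes
    # (1 + log tf) * log(N / df), so stopwords with df close to N
    # contribute almost nothing.

    def __init__(self, corpus, inverted_index):
        self._corpus = corpus
        self._index = inverted_index
        self._score = 0.0

    def reset(self, document_id):
        self._score = 0.0

    def update(self, term, multiplicity, posting):
        document_count = self._corpus.size()
        document_frequency = self._index.get_document_frequency(term)
        if posting.term_frequency > 0 and document_frequency > 0:
            tf_weight = 1.0 + math.log(posting.term_frequency)
            idf_weight = math.log(document_count / document_frequency)
            self._score += multiplicity * tf_weight * idf_weight

    def evaluate(self):
        return self._score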
Example #7
def assignment_c_simplesearchengine_1():

    # Imports assumed from the course codebase (function-level, as in the other examples).
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from invertedindex import InMemoryInvertedIndex
    from ranking import BrainDeadRanker
    from searchengine import SimpleSearchEngine

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Load and index MeSH terms.
    print("LOADING...")
    corpus = InMemoryCorpus("../data/mesh.txt")
    print("INDEXING...")
    inverted_index = InMemoryInvertedIndex(corpus, ["body"], normalizer,
                                           tokenizer)

    # Do ranked retrieval, using a simple ranker.
    engine = SimpleSearchEngine(corpus, inverted_index)
    simple_ranker = BrainDeadRanker()
    results = []

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    query = "polluTION Water"
    for match_threshold in [0.1, 1.0]:
        print(f"SEARCHING for '{query}' with match threshold {match_threshold}...")
        results.clear()
        options = {
            "match_threshold": match_threshold,
            "hit_count": 10,
            "debug": False
        }
        engine.evaluate(query, options, simple_ranker, match_collector)
        assert len(results) == {0.1: 10, 1.0: 3}[match_threshold]
        for (score, document_id) in [(match["score"],
                                      match["document"].document_id)
                                     for match in results[:3]]:
            assert score == 2.0  # Both 'pollution' and 'water'.
            assert document_id in [25274, 25275, 25276]
        for score in [match["score"] for match in results[3:]]:
            assert score == 1.0  # Only 'pollution' or 'water', but not both.
Example #8
def assignment_d_shinglegenerator_2():

    # Imports assumed from the course codebase (function-level, as in the other examples).
    from normalization import BrainDeadNormalizer
    from tokenization import ShingleGenerator
    from corpus import InMemoryCorpus
    from invertedindex import InMemoryInvertedIndex
    from ranking import BrainDeadRanker
    from searchengine import SimpleSearchEngine

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = ShingleGenerator(3)
    ranker = BrainDeadRanker()
    results = []
    hit_count = 10

    # Load MeSH terms.
    print("LOADING...")
    corpus = InMemoryCorpus("data/mesh.txt")

    # Do ranked retrieval, using n-grams (shingles) and a simple ranker. This allows for fuzzy retrieval.
    print("INDEXING...")
    inverted_index = InMemoryInvertedIndex(corpus, ["body"], normalizer,
                                           tokenizer)
    engine = SimpleSearchEngine(corpus, inverted_index)

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Test with some misspelled queries. Be robust to arbitrary tie-breaking.
    for (query, winner_score, winner_document_ids) in [
        ("orGAnik kEMmistry", 8.0, [16981, 16980, 4411, 4410, 4408]),
        ("synndrome", 7.0, [1275])
    ]:
        print("SEARCHING for '" + query + "'...")
        results.clear()
        options = {
            "match_threshold": 0.1,
            "hit_count": hit_count,
            "debug": False
        }
        engine.evaluate(query, options, ranker, match_collector)
        assert 0 < len(results) <= hit_count
        assert results[0]["score"] == winner_score
        assert results[0]["document"].document_id in winner_document_ids
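ShingleGenerator(3) tokenizes a buffer into overlapping character 3-grams, which is why the misspelled queries above still share most of their tokens with the correctly spelled index terms. A simplified sketch of the idea (the real tokenizer presumably also reports token spans for the indexing machinery):

def shingles(buffer, width=3):
    # Slide a window of `width` characters across the buffer; buffers
    # shorter than the window yield themselves as a single shingle.
    if len(buffer) <= width:
        yield buffer
        return
    for i in range(len(buffer) - width + 1):
        yield buffer[i:i + width]

# "kemmistry" and "chemistry" share four of their seven 3-grams, so a
# ranker that counts matching shingles still finds the right documents.
print(list(shingles("kemmistry")))
# ['kem', 'emm', 'mmi', 'mis', 'ist', 'str', 'try']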
Example #9
def assignment_c_simplesearchengine_3():

    # Imports assumed from the course codebase (function-level, as in the other examples).
    from typing import Iterator
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from invertedindex import Posting, InvertedIndex, InMemoryInvertedIndex
    from ranking import BrainDeadRanker
    from searchengine import SimpleSearchEngine

    # All accesses to posting lists are logged here.
    accesses = []

    # For testing.
    class AccessLoggedIterator(Iterator[Posting]):
        def __init__(self, term: str, wrapped: Iterator[Posting]):
            self._term = term
            self._wrapped = wrapped

        def __next__(self):
            posting = next(self._wrapped)
            accesses.append((self._term, posting.document_id))
            return posting

    # For testing.
    class AccessLoggedInvertedIndex(InvertedIndex):
        def __init__(self, wrapped: InvertedIndex):
            self._wrapped = wrapped

        def get_terms(self, buffer: str) -> Iterator[str]:
            return self._wrapped.get_terms(buffer)

        def get_postings_iterator(self, term: str) -> Iterator[Posting]:
            return AccessLoggedIterator(
                term, self._wrapped.get_postings_iterator(term))

        def get_document_frequency(self, term: str) -> int:
            return self._wrapped.get_document_frequency(term)

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Load and index MeSH terms.
    corpus = InMemoryCorpus("data/mesh.txt")
    inverted_index = AccessLoggedInvertedIndex(
        InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer))

    # Do ranked retrieval, using a simple ranker.
    engine = SimpleSearchEngine(corpus, inverted_index)
    simple_ranker = BrainDeadRanker()
    query = "Water  polluTION"
    options = {"match_threshold": 0.5, "hit_count": 1, "debug": False}
    engine.evaluate(query, options, simple_ranker, lambda m: m)

    # Expected posting list traversal ordering if the implementation chooses to evaluate this as "water pollution".
    ordering1 = [('water', 3078), ('pollution', 788), ('pollution', 789),
                 ('pollution', 790), ('pollution', 8079), ('water', 8635),
                 ('pollution', 23837), ('water', 9379), ('water', 23234),
                 ('water', 25265), ('pollution', 25274), ('water', 25266),
                 ('water', 25267), ('water', 25268), ('water', 25269),
                 ('water', 25270), ('water', 25271), ('water', 25272),
                 ('water', 25273), ('water', 25274), ('water', 25275),
                 ('pollution', 25275), ('water', 25276), ('pollution', 25276),
                 ('water', 25277), ('water', 25278), ('water', 25279),
                 ('water', 25280), ('water', 25281)]

    # Expected posting list traversal ordering if the implementation chooses to evaluate this as "pollution water".
    ordering2 = [('pollution', 788), ('water', 3078), ('pollution', 789),
                 ('pollution', 790), ('pollution', 8079), ('water', 8635),
                 ('pollution', 23837), ('water', 9379), ('water', 23234),
                 ('water', 25265), ('pollution', 25274), ('water', 25266),
                 ('water', 25267), ('water', 25268), ('water', 25269),
                 ('water', 25270), ('water', 25271), ('water', 25272),
                 ('water', 25273), ('water', 25274), ('pollution', 25275),
                 ('water', 25275), ('pollution', 25276), ('water', 25276),
                 ('water', 25277), ('water', 25278), ('water', 25279),
                 ('water', 25280), ('water', 25281)]

    # Check that the posting lists have been accessed in a way that's consistent with document-at-a-time traversal.
    # Be somewhat robust to implementation details. This is a fairly strict test, and advanced (but valid)
    # implementations that for some reason do lookaheads or whatever might fail.
    assert accesses == ordering1 or accesses == ordering2
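Both admissible orderings interleave the 'water' and 'pollution' cursors instead of draining one posting list before touching the other, which is the signature of document-at-a-time traversal. A bare-bones sketch of that merge for an AND of two posting iterators (the engine generalizes this to an M-of-N match via match_threshold):

def daat_and(postings_a, postings_b):
    # Advance both cursors in lockstep, front to back, yielding document
    # IDs present in both lists; each list is consumed at most once.
    a = next(postings_a, None)
    b = next(postings_b, None)
    while a is not None and b is not None:
        if a.document_id == b.document_id:
            yield a.document_id
            a = next(postings_a, None)
            b = next(postings_b, None)
        elif a.document_id < b.document_id:
            a = next(postings_a, None)
        else:
            b = next(postings_b, None)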
Example #10
def assignment_c_simplesearchengine_2():

    # Imports assumed from the course codebase (function-level, as in the other examples).
    from itertools import product, combinations_with_replacement
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus, InMemoryDocument
    from invertedindex import InMemoryInvertedIndex
    from ranking import BrainDeadRanker
    from searchengine import SimpleSearchEngine

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    ranker = BrainDeadRanker()

    # Used to nudge match thresholds just above or below a boundary.
    epsilon = 0.0001

    # Create a dummy test corpus.
    corpus = InMemoryCorpus()
    words = (''.join(term) for term in product("bcd", "aei", "jkl"))
    texts = (' '.join(triple)
             for triple in combinations_with_replacement(words, 3))
    for text in texts:
        corpus.add_document(InMemoryDocument(corpus.size(), {'a': text}))

    # What we're testing.
    engine = SimpleSearchEngine(
        corpus, InMemoryInvertedIndex(corpus, ["a"], normalizer, tokenizer))

    # Where the callback will collect the matches.
    results = []

    # Callback that collects matches.
    def collect(m):
        results.append((m['score'], m['document'].document_id))

    # Executes a query.
    def search(q, t, n):
        results.clear()
        engine.evaluate(q, {
            'match_threshold': t,
            'hit_count': n
        }, ranker, collect)

    # Sorts the collected matches.
    def sort_results():
        results.sort(key=lambda e: e[1])
        results.sort(key=lambda e: e[0], reverse=True)

    # Test predicate.
    def check_at(i, expected):
        assert results[i] == expected

    # Test predicate.
    def check_range(indices, score, document_ids):
        for i, d in zip(indices, document_ids):
            check_at(i, (score, d))

    # Test predicate.
    def check_hits(n):
        assert len(results) == n

    # Run tests!
    search('baj BAJ    baj', 1.0, 27)
    check_hits(27)
    check_at(0, (9.0, 0))
    sort_results()
    check_range(range(1, 27), 6.0, range(1, 27))
    search('baj caj', 1.0, 100)
    check_hits(27)
    search('baj caj daj', 2 / 3 + epsilon, 100)
    check_hits(79)
    search('baj caj', 2 / 3 + epsilon, 100)
    check_hits(100)
    sort_results()
    check_at(0, (3.0, 0))
    check_range(range(4, 12), 2.0, range(1, 9))
    check_range(range(12, 29), 2.0, range(10, 27))
    check_at(29, (2.0, 35))
    check_at(78, (2.0, 2531))
    search('baj cek dil', 1.0, 10)
    check_hits(1)
    check_at(0, (3.0, 286))
    search('baj cek dil', 2 / 3 + epsilon, 80)
    check_hits(79)
    sort_results()
    check_at(0, (3.0, 13))
    check_at(1, (3.0, 26))
    check_at(2, (3.0, 273))
    search('baj xxx yyy', 2 / 3 + epsilon, 100)
    check_hits(0)
    search('baj xxx yyy', 2 / 3 - epsilon, 100)
    check_hits(100)
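The hit counts above are consistent with the engine requiring a document to contain at least max(1, min(m, int(t * m))) of the m unique query terms at threshold t: with three terms, 1.0 demands all three, 2/3 + epsilon demands two, and 2/3 - epsilon demands just one. A worked check of that (assumed) formula against the searches in this test:

def required_terms(t, m):
    # Assumed reading of match_threshold: the fraction of unique query
    # terms a document must contain, floored and clamped to [1, m].
    return max(1, min(m, int(t * m)))

epsilon = 0.0001
assert required_terms(1.0, 3) == 3              # 'baj cek dil' -> 1 hit
assert required_terms(2 / 3 + epsilon, 3) == 2  # -> 79 hits
assert required_terms(2 / 3 - epsilon, 3) == 1  # 'baj xxx yyy' -> 100 hits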
Example #11
    def test_document_at_a_time_traversal_mesh_corpus(self):
        from typing import Iterator, List, Tuple
        import os.path
        from invertedindex import Posting, InvertedIndex, InMemoryInvertedIndex
        from corpus import InMemoryCorpus
        from searchengine import SimpleSearchEngine
        from ranking import BrainDeadRanker

        class AccessLoggedIterator(Iterator[Posting]):
            def __init__(self, term: str, history: List[Tuple[str, int]], wrapped: Iterator[Posting]):
                self._term = term
                self._history = history
                self._wrapped = wrapped

            def __next__(self):
                posting = next(self._wrapped)
                self._history.append((self._term, posting.document_id))
                return posting

        class AccessLoggedInvertedIndex(InvertedIndex):
            def __init__(self, wrapped: InvertedIndex):
                self._wrapped = wrapped
                self._history = []

            def get_terms(self, buffer: str) -> Iterator[str]:
                return self._wrapped.get_terms(buffer)

            def get_postings_iterator(self, term: str) -> Iterator[Posting]:
                return AccessLoggedIterator(term, self._history, self._wrapped.get_postings_iterator(term))

            def get_document_frequency(self, term: str) -> int:
                return self._wrapped.get_document_frequency(term)

            def get_history(self) -> List[Tuple[str, int]]:
                return self._history

        corpus = InMemoryCorpus(os.path.join(data_path, 'mesh.txt'))
        index = AccessLoggedInvertedIndex(InMemoryInvertedIndex(corpus, ["body"], self._normalizer, self._tokenizer))
        engine = SimpleSearchEngine(corpus, index)
        ranker = BrainDeadRanker()
        query = "Water  polluTION"
        options = {"match_threshold": 0.5, "hit_count": 1, "debug": False}
        engine.evaluate(query, options, ranker, lambda m: None)
        history = index.get_history()
        ordering1 = [('water', 3078),  # Document-at-a-time ordering if evaluated as "water pollution".
                     ('pollution', 788), ('pollution', 789), ('pollution', 790), ('pollution', 8079),
                     ('water', 8635),
                     ('pollution', 23837),
                     ('water', 9379), ('water', 23234), ('water', 25265),
                     ('pollution', 25274),
                     ('water', 25266), ('water', 25267), ('water', 25268), ('water', 25269), ('water', 25270),
                     ('water', 25271), ('water', 25272), ('water', 25273), ('water', 25274), ('water', 25275),
                     ('pollution', 25275),
                     ('water', 25276),
                     ('pollution', 25276),
                     ('water', 25277), ('water', 25278), ('water', 25279), ('water', 25280), ('water', 25281)]
        ordering2 = [('pollution', 788),  # Document-at-a-time ordering if evaluated as "pollution water".
                     ('water', 3078),
                     ('pollution', 789), ('pollution', 790), ('pollution', 8079),
                     ('water', 8635),
                     ('pollution', 23837),
                     ('water', 9379), ('water', 23234), ('water', 25265),
                     ('pollution', 25274),
                     ('water', 25266), ('water', 25267), ('water', 25268), ('water', 25269), ('water', 25270),
                     ('water', 25271), ('water', 25272), ('water', 25273), ('water', 25274),
                     ('pollution', 25275),
                     ('water', 25275),
                     ('pollution', 25276),
                     ('water', 25276), ('water', 25277), ('water', 25278), ('water', 25279), ('water', 25280),
                     ('water', 25281)]
        self.assertTrue(history == ordering1 or history == ordering2)  # Strict. Advanced implementations might fail.
Example #12
    def setUp(self):
        from ranking import BrainDeadRanker
        self._ranker = BrainDeadRanker()
Example #13
def assignment_d():

    # Imports assumed from the course codebase (function-level, as in the other examples).
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer, ShingleGenerator
    from corpus import InMemoryCorpus
    from invertedindex import InMemoryInvertedIndex
    from ranking import BrainDeadRanker, BetterRanker
    from searchengine import SimpleSearchEngine

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    simple_tokenizer = BrainDeadTokenizer()

    # Load MeSH terms.
    print("LOADING...")
    corpus = InMemoryCorpus("data/mesh.txt")

    # Do ranked retrieval, using n-grams and a simple ranker. This allows for fuzzy retrieval.
    print("INDEXING...")
    shingle_generator = ShingleGenerator(3)
    shingle_inverted_index = InMemoryInvertedIndex(corpus, ["body"],
                                                   normalizer,
                                                   shingle_generator)
    shingle_engine = SimpleSearchEngine(corpus, shingle_inverted_index)
    simple_ranker = BrainDeadRanker()
    results = []
    hit_count = 10

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Test with some misspelled queries. Be robust to arbitrary tie-breaking.
    for (query, winner_score, winner_document_ids) in [
        ("orGAnik kEMmistry", 8.0, [16981, 16980, 4411, 4410, 4408]),
        ("synndrome", 7.0, [1275])
    ]:
        print("SEARCHING for '" + query + "'...")
        results.clear()
        options = {
            "match_threshold": 0.1,
            "hit_count": hit_count,
            "debug": False
        }
        shingle_engine.evaluate(query, options, simple_ranker, match_collector)
        assert 0 < len(results) <= hit_count
        assert results[0]["score"] == winner_score
        assert results[0]["document"].document_id in winner_document_ids

    # Load and index some English news sentences. Look at the output and compare the two rankers!
    # The naive ranker assigns equal weight to all words (including stopwords), whereas the improved
    # ranker does not. The test below for the improved ranker (with document #24 being the winner)
    # assumes a straightforward implementation of a TF-IDF ranking scheme as described in the
    # textbook.
    print("LOADING...")
    corpus = InMemoryCorpus("data/en.txt")
    print("INDEXING...")
    inverted_index = InMemoryInvertedIndex(corpus, ["body"], normalizer,
                                           simple_tokenizer)
    better_ranker = BetterRanker(corpus, inverted_index)
    engine = SimpleSearchEngine(corpus, inverted_index)
    for query in ["the terrorism attack and obama"]:
        options = {
            "match_threshold": 0.1,
            "hit_count": hit_count,
            "debug": False
        }
        for ranker in [simple_ranker, better_ranker]:
            print("SEARCHING for '" + query + "' using " +
                  ranker.__class__.__name__ + "...")
            results.clear()
            engine.evaluate(query, options, ranker, match_collector)
            winner_document_ids = {
                simple_ranker: [9221, 7263],
                better_ranker: [24]
            }[ranker]
            assert 0 < len(results) <= hit_count
            assert results[0]["document"].document_id in winner_document_ids
Example #14
def test_simple_search_daat():
    from typing import Iterator
    from ranking import BrainDeadRanker
    from searchengine import SimpleSearchEngine
    from invertedindex import InvertedIndex, Posting
    from corpus import Corpus, Document, InMemoryDocument

    # Assigned further down; the nested classes below close over this name.
    inverted_index = None

    class DaatPostingList:
        def __init__(self, docid_list):
            self.docid_list = docid_list
            self.idx = -1  # -1 means iteration has not started yet

        @property
        def current_docid(self):
            # Sentinel for a list that is exhausted or not yet started, so
            # it never looks aligned with another list's live cursor.
            if not 0 <= self.idx < len(self.docid_list):
                return 1000
            return self.docid_list[self.idx]

        def __iter__(self):
            return self

        def __next__(self):
            # Advance first, then emit, so the first posting is not skipped.
            self.idx += 1
            if self.idx >= len(self.docid_list):
                raise StopIteration
            docid = self.docid_list[self.idx]
            if 106 <= docid <= 108:
                maxdiff = 0
                for pl in (e for e in inverted_index.active_lists
                           if e is not self):
                    maxdiff = max(maxdiff, abs(pl.current_docid - docid))
                if maxdiff > 20:
                    inverted_index.is_daat = False
            return Posting(docid, 1)

    class DaatCorpus(Corpus):
        def __iter__(self):
            return (self.get_document(i) for i in range(0, self.size()))

        def size(self) -> int:
            return 300

        def get_document(self, document_id: int) -> Document:
            return InMemoryDocument(
                document_id,
                {'body': inverted_index.get_doc_body(document_id)})

    class DaatInvertedIndex(InvertedIndex):
        def __init__(self):
            self.is_daat = True
            self.active_lists = []

        _data = {
            'a': (1, 2, 7, 10, 11, 12, 20, 21, 22, 41, 42, 44, 45, 50, 100,
                  103, 106, 109, 112, 201, 203, 204, 205, 206, 207),
            'b': (3, 4, 8, 16, 17, 18, 20, 22, 31, 41, 43, 101, 104, 107, 110,
                  113, 202, 204),
            'c': (5, 6, 9, 13, 14, 15, 20, 21, 31, 41, 102, 105, 108, 111, 114,
                  203, 204, 208),
        }

        @classmethod
        def get_doc_body(cls, document_id):
            return ' '.join(k for k, a in cls._data.items()
                            if document_id in a)

        def get_terms(self, buffer: str) -> Iterator[str]:
            return iter(buffer.split(' '))

        def get_postings_iterator(self, term: str) -> Iterator[Posting]:
            if term in self._data:
                iterator = DaatPostingList(self._data[term])
                self.active_lists.append(iterator)
                return iterator
            return iter(tuple())

        def get_document_frequency(self, term: str) -> int:
            return len(self._data.get(term, []))

    inverted_index = DaatInvertedIndex()
    engine = SimpleSearchEngine(DaatCorpus(), inverted_index)
    results = []

    def match(m):
        results.append((m['score'], m['document'].document_id))

    engine.evaluate('a b c', {
        'recall_threshold': 1.0,
        'hit_count': 100
    }, BrainDeadRanker(), match)
    results.sort()
    if results != [(3.0, 20), (3.0, 41), (3.0, 204)]:
        print("FAILURE: search failed.")
    if not inverted_index.is_daat:
        print("WARNING: not performing proper Document-At-A-Time search!")