Esempio n. 1
0
def assignment_a_postingsmerger_1():
    """Check AND/OR posting-list merging against the corpus in ./data/mesh.txt.

    Builds an in-memory inverted index over the "body" field, looks up the
    posting lists for one two-term AND query and one two-term OR query,
    merges them with ``PostingsMerger`` and asserts that the merged postings
    map to the expected document IDs, in the expected order.

    Raises:
        AssertionError: if a query does not produce exactly two terms, or if
            the merged result differs from the expected document IDs.
    """
    # A small but real corpus.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus("./data/mesh.txt")
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)

    # Test that we merge posting lists correctly. Note implicit test for case- and whitespace robustness.
    print("MERGING...")
    merger = PostingsMerger()

    # Operator name -> merge routine; built once instead of on every loop pass.
    merge_by_operator = {
        "AND": merger.intersection,
        "OR": merger.union,
    }

    and_query = ("HIV  pROtein", "AND", [11316, 11319, 11320, 11321])
    or_query = ("water Toxic", "OR",
                [3078, 8138, 8635, 9379, 14472, 18572, 23234, 23985] +
                list(range(25265, 25282)))

    for query, operator, expected_document_ids in [and_query, or_query]:
        # Show the query with every run of non-word characters replaced by the
        # operator. The pattern is a raw string so "\W" is not treated as an
        # (invalid) string escape sequence.
        print(re.sub(r"\W+", " " + operator + " ", query))

        terms = list(index.get_terms(query))
        assert len(terms) == 2

        postings = [index[term] for term in terms]
        merged = merge_by_operator[operator](postings[0], postings[1])
        documents = [corpus[posting.document_id] for posting in merged]
        print(*documents, sep="\n")

        assert len(documents) == len(expected_document_ids)
        assert [d.document_id for d in documents] == expected_document_ids
Esempio n. 2
0
def assignment_a():
    """Run the indexing and posting-list merging checks end to end.

    Three stages, each guarded by assertions:

    1. Index a two-document dummy corpus and check the
       (document_id, term_frequency) pairs of the postings returned for the
       terms of a three-word query.
    2. Index ``data/mesh.txt`` and check the posting-list length for two terms.
    3. Merge the posting lists of a two-term AND query and a two-term OR
       query and compare the resulting document IDs with the expected ones.

    Raises:
        AssertionError: if any of the checks fails.
    """
    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Dump postings for a dummy two-document corpus.
    print("INDEXING...")
    corpus = InMemoryCorpus()
    corpus.add_document(InMemoryDocument(0, {"body": "this is a Test"}))
    corpus.add_document(InMemoryDocument(1, {"body": "test TEST prØve"}))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    # Expected (document_id, term_frequency) pairs, matched positionally with
    # the terms yielded for the query string.
    for term, expected in zip(index.get_terms("PRøvE wtf tesT"),
                              [[(1, 1)], [], [(0, 1), (1, 2)]]):
        print(term)
        assert term in ["prøve", "wtf", "test"]
        postings = list(index.get_postings_iterator(term))
        for posting in postings:
            print(posting)
        assert len(postings) == len(expected)
        assert [(p.document_id, p.term_frequency)
                for p in postings] == expected
    print(index)

    # Again, for a slightly bigger corpus.
    print("LOADING...")
    corpus = InMemoryCorpus("data/mesh.txt")
    print("INDEXING...")
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    for term, expected_length in [("hydrogen", 8), ("hydrocephalus", 2)]:
        print(term)
        # Materialize the postings once so printing and counting share a
        # single pass over the iterator.
        postings = list(index.get_postings_iterator(term))
        for posting in postings:
            print(posting)
        assert len(postings) == expected_length

    # Test that we merge posting lists correctly. Note implicit test for case- and whitespace robustness.
    print("MERGING...")
    merger = PostingsMerger()

    # Operator name -> merge routine; built once instead of on every loop pass.
    merge_by_operator = {
        "AND": merger.intersection,
        "OR": merger.union,
    }

    and_query = ("HIV  pROtein", "AND", [11316, 11319, 11320, 11321])
    or_query = ("water Toxic", "OR",
                [3078, 8138, 8635, 9379, 14472, 18572, 23234, 23985] +
                list(range(25265, 25282)))

    for query, operator, expected_document_ids in [and_query, or_query]:
        # Show the query with every run of non-word characters replaced by the
        # operator. The pattern is a raw string so "\W" is not treated as an
        # (invalid) string escape sequence.
        print(re.sub(r"\W+", " " + operator + " ", query))

        terms = list(index.get_terms(query))
        assert len(terms) == 2

        postings = [index.get_postings_iterator(term) for term in terms]
        merged = merge_by_operator[operator](postings[0], postings[1])
        documents = [
            corpus.get_document(posting.document_id) for posting in merged
        ]
        print(*documents, sep="\n")

        assert len(documents) == len(expected_document_ids)
        assert [d.get_document_id()
                for d in documents] == expected_document_ids
Esempio n. 3
0
def assignment_a_postingsmerger_3():
    """Check that merging gives the same result regardless of argument order.

    Runs ``PostingsMerger.intersection`` and then ``PostingsMerger.union`` on
    two small posting lists, once in each argument order, prints the document
    IDs of both results and asserts that they have the expected length and
    are equal to each other.

    Raises:
        AssertionError: if a result has an unexpected length or the two
            argument orders disagree.
    """
    # Argument order shouldn't matter.
    merger = PostingsMerger()
    # NOTE(review): Posting arguments are presumably (document_id,
    # term_frequency) -- confirm against the Posting class.
    left = [Posting(1, 0), Posting(2, 0), Posting(3, 0)]
    right = [Posting(2, 0), Posting(3, 0), Posting(6, 0)]

    # Each entry: (name of the merge method, expected number of merged postings).
    for method_name, expected_size in (("intersection", 2), ("union", 4)):
        forward = [
            p.document_id
            for p in getattr(merger, method_name)(iter(left), iter(right))
        ]
        backward = [
            p.document_id
            for p in getattr(merger, method_name)(iter(right), iter(left))
        ]
        print(forward)
        print(backward)
        assert len(forward) == expected_size
        assert forward == backward
Esempio n. 4
0
def assignment_a_postingsmerger_2():
    """Exercise the merger when one or both posting lists are empty.

    Asserts that every intersection with an empty input is empty, that the
    union of two empty inputs is empty, and that the union of an empty input
    with a single posting yields exactly that posting's document ID.

    Raises:
        AssertionError: if any of the corner cases produces another result.
    """
    # Test some corner cases with empty lists.
    merger = PostingsMerger()
    lone = Posting(123, 4)

    # Intersection: an empty side must give an empty result.
    for first, second in (([], []), ([], [lone]), ([lone], [])):
        assert list(merger.intersection(iter(first), iter(second))) == []

    # Union of nothing with nothing is nothing.
    assert list(merger.union(iter([]), iter([]))) == []

    # Union with one empty side: compare by document ID only.
    for first, second in (([], [lone]), ([lone], [])):
        merged_ids = [
            p.document_id for p in merger.union(iter(first), iter(second))
        ]
        assert merged_ids == [lone.document_id]
Esempio n. 5
0
def assignment_a_postingsmerger_2():
    """Exercise the merger when one or both posting lists are empty.

    Asserts that every intersection with an empty input is empty, that the
    union of two empty inputs is empty, and that the union of an empty input
    with a single posting yields a list equal to ``[posting]``.

    Raises:
        AssertionError: if any of the corner cases produces another result.
    """
    # Test some corner cases with empty lists.
    merger = PostingsMerger()
    single = Posting(0, 0)

    def no_postings():
        # Fresh empty iterator for every call.
        return iter([])

    def one_posting():
        # Fresh iterator over the single posting for every call.
        return iter([single])

    # Intersection with an empty side is expected to be empty.
    assert list(merger.intersection(no_postings(), no_postings())) == []
    assert list(merger.intersection(no_postings(), one_posting())) == []
    assert list(merger.intersection(one_posting(), no_postings())) == []

    # Union: empty only when both sides are empty; otherwise the one posting.
    assert list(merger.union(no_postings(), no_postings())) == []
    assert list(merger.union(no_postings(), one_posting())) == [single]
    assert list(merger.union(one_posting(), no_postings())) == [single]