Example #1
 def test_multiple_fields(self):
     from corpus import InMemoryDocument, InMemoryCorpus
     from suffixarray import SuffixArray
     corpus = InMemoryCorpus()
     corpus.add_document(
         InMemoryDocument(0, {
             "field1": "a b c",
             "field2": "b c d"
         }))
     corpus.add_document(InMemoryDocument(1, {
         "field1": "x",
         "field2": "y"
     }))
     corpus.add_document(InMemoryDocument(2, {
         "field1": "y",
         "field2": "z"
     }))
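     # Index both fields together, and then each field separately.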
     engine0 = SuffixArray(corpus, ["field1", "field2"], self._normalizer,
                           self._tokenizer)
     engine1 = SuffixArray(corpus, ["field1"], self._normalizer,
                           self._tokenizer)
     engine2 = SuffixArray(corpus, ["field2"], self._normalizer,
                           self._tokenizer)
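     # Each call verifies the expected winning document IDs and the winner's score (an occurrence count).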
     self._process_query_and_verify_winner(engine0, "b c", [0], 2)
     self._process_query_and_verify_winner(engine0, "y", [1, 2], 1)
     self._process_query_and_verify_winner(engine1, "x", [1], 1)
     self._process_query_and_verify_winner(engine1, "y", [2], 1)
     self._process_query_and_verify_winner(engine1, "z", [], None)
     self._process_query_and_verify_winner(engine2, "z", [2], 1)
Example #2
def populateSuffixArray(keys):
    """
    Takes a list of keys and initializes and populates a 
    suffixarray.SuffixArray object
    """
    suffix_array = SuffixArray()
    for key in keys:
        suffix_array.insert(key)
    return suffix_array
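For illustration, a minimal usage sketch of the helper above, assuming only the no-argument SuffixArray() constructor and insert() method it relies on; the key list is hypothetical:

keys = ["banana", "bandana", "band"]  # Hypothetical keys, for illustration only.
suffix_array = populateSuffixArray(keys)  # Build a suffix array over the keys.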
Example #3
 def test_memory_usage(self):
     import tracemalloc
     import inspect
     from corpus import InMemoryDocument, InMemoryCorpus
     from suffixarray import SuffixArray
     corpus = InMemoryCorpus()
     corpus.add_document(
         InMemoryDocument(0, {
             "a": "o  o\n\n\no\n\no",
             "b": "o o\no   \no"
         }))
     corpus.add_document(InMemoryDocument(1, {"a": "ba", "b": "b bab"}))
     corpus.add_document(InMemoryDocument(2, {"a": "o  o O o", "b": "o o"}))
     corpus.add_document(InMemoryDocument(3, {"a": "oO" * 10000, "b": "o"}))
     corpus.add_document(
         InMemoryDocument(4, {
             "a": "cbab o obab O ",
             "b": "o o " * 10000
         }))
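     # Measure the memory allocated while the suffix array is built. Excessive usage typically
     # means the implementation copies strings instead of working with buffer indices.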
     tracemalloc.start()
     snapshot1 = tracemalloc.take_snapshot()
     engine = SuffixArray(corpus, ["a", "b"], self._normalizer,
                          self._tokenizer)
     snapshot2 = tracemalloc.take_snapshot()
     tracemalloc.stop()
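     # Only allocations attributed to the suffixarray module count towards the limit.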
     for statistic in snapshot2.compare_to(snapshot1, "filename"):
         if statistic.traceback[0].filename == inspect.getfile(SuffixArray):
             self.assertLessEqual(statistic.size_diff, 2000000,
                                  "Memory usage seems excessive.")
Example #4
 def test_cran_corpus(self):
     import os.path
     from corpus import InMemoryCorpus
     from suffixarray import SuffixArray
     corpus = InMemoryCorpus(os.path.join(data_path, 'cran.xml'))
     engine = SuffixArray(corpus, ["body"], self._normalizer,
                          self._tokenizer)
     self._process_query_and_verify_winner(engine, "visc", [328], 11)  # Matches {'viscous', 'viscosity', ...}.
     self._process_query_and_verify_winner(engine, "Of  A", [946], 10)  # Robustness to case and extra whitespace.
     self._process_query_and_verify_winner(engine, "", [], None)  # An empty query should match nothing.
     self._process_query_and_verify_winner(engine, "approximate solution",
                                           [159, 1374], 3)  # Multiple tied winners.
Example #5
def assignment_b_suffixarray_1():

    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from suffixarray import SuffixArray

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Prepare for some suffix array lookups.
    print("LOADING...")
    corpus = InMemoryCorpus("data/cran.xml")
    print("INDEXING...")
    engine = SuffixArray(corpus, ["body"], normalizer, tokenizer)
    results = []
    hit_count = 5

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Define the actual test queries.
    test1 = ("visc", 11, [328])  # Look for {'viscous', 'viscosity', ...}.
    test2 = ("Of  A", 10, [946])  # Test robustness for case and whitespace.
    test3 = ("", 0, [])  # Safety feature: Match nothing instead of everything.
    test4 = ("approximate solution", 3, [1374, 159])  # Multiple winners.

    # Test that the simple occurrence ranking works. Be robust towards how ties are resolved.
    for (query, winner_score,
         winner_document_ids) in [test1, test2, test3, test4]:
        print("SEARCHING for '" + query + "'...")
        results.clear()
        engine.evaluate(query, {
            "debug": False,
            "hit_count": hit_count
        }, match_collector)
        if winner_document_ids:
            assert results[0]["score"] == winner_score
            assert results[0]["document"].document_id in winner_document_ids
            assert len(results) <= hit_count
        else:
            assert len(results) == 0
Example #6
def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from suffixarray import SuffixArray
    print("Building suffix array from Cranfield corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus(os.path.join(data_path, 'cran.xml'))
    engine = SuffixArray(corpus, ["body"], normalizer, tokenizer)
    options = {"debug": False, "hit_count": 5}
    print("Enter a prefix phrase query and find matching documents.")
    print(f"Lookup options are {options}.")
    print("Returned scores are occurrence counts.")

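    # Collect and return all matches for a query entered at the prompt.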
    def evaluator(query):
        matches = []
        engine.evaluate(query, options, lambda m: matches.append(m))
        return matches

    simple_repl("query", evaluator)
Example #7
def assignment_b_suffixarray_2():

    import inspect
    import tracemalloc
    from corpus import Corpus, Document  # Base classes, assumed to live in the corpus module.
    from normalization import Normalizer  # Base class, assumed to live in the normalization module.
    from tokenization import BrainDeadTokenizer
    from suffixarray import SuffixArray

    # For testing.
    class TestNormalizer(Normalizer):

        _table = str.maketrans({'Ø': 'O'})

        def canonicalize(self, buffer: str) -> str:
            return buffer

        def normalize(self, token: str) -> str:
            return token.upper().translate(self._table)

    # For testing.
    class TestDocument(Document):

        def __init__(self, document_id: int, a: str, b: str):
            self._document_id = document_id
            self._a = a
            self._b = b

        def get_document_id(self) -> int:
            return self._document_id

        def get_field(self, field_name: str, default: str) -> str:
            if field_name == "a":
                return self._a
            if field_name == "b":
                return self._b
            return default

    # For testing.
    class TestCorpus(Corpus):
        def __init__(self):
            self._docs = []
            self._docs.append(TestDocument(len(self._docs), "ø  o\n\n\nø\n\no", "ø o\nø   \no"))
            self._docs.append(TestDocument(len(self._docs), "ba", "b bab"))
            self._docs.append(TestDocument(len(self._docs), "ø  o Ø o", "ø o"))
            self._docs.append(TestDocument(len(self._docs), "øO" * 10000, "o"))
            self._docs.append(TestDocument(len(self._docs), "cbab o øbab Ø ", "ø o " * 10000))

        def __iter__(self):
            return iter(self._docs)

        def size(self) -> int:
            return len(self._docs)

        def get_document(self, document_id: int) -> Document:
            return self._docs[document_id]

    # Run the tests!
    for fields in [("b",), ("a", "b")]:

        # Create the suffix array over the given set of fields. Measure memory usage. If memory usage is
        # excessive, most likely the implementation is copying strings or doing other silly stuff instead
        # of working with buffer indices. The naive reference implementation is not in any way optimized,
        # and uses about 1.5 MB of memory on this corpus.
        tracemalloc.start()
        snapshot1 = tracemalloc.take_snapshot()
        engine = SuffixArray(TestCorpus(), fields, TestNormalizer(), BrainDeadTokenizer())
        snapshot2 = tracemalloc.take_snapshot()
        for statistic in snapshot2.compare_to(snapshot1, "filename"):
            if statistic.traceback[0].filename == inspect.getfile(SuffixArray):
                assert statistic.size_diff < 2000000, f"Memory usage is {statistic.size_diff}"
        tracemalloc.stop()
        results = []

        def process(m):
            results.append((m['document'].document_id, m['score']))

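        # Expected (document_id, score) results per query, keyed by field combination and ordered by decreasing score.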
        expected_results = {
            ('b',): (
                ('bab', [(1, 1)]),
                ('ø o', [(4, 19999), (0, 3), (2, 1)]),
                ('o O', [(4, 19999), (0, 3), (2, 1)]),
                ('oooooo', []),
                ('o o o o', [(4, 19997), (0, 1)]),
            ),
            ('a', 'b'): (
                ('bab', [(1, 1)]),
                ('ø o', [(4, 20000), (0, 6), (2, 4)]),
                ('o O', [(4, 20000), (0, 6), (2, 4)]),
                ('oøØOøO', [(3, 1), ]),
                ('o o o o', [(4, 19997), (0, 2), (2, 1)]),
            )
        }

        for query, expected in expected_results[fields]:
            results.clear()
            engine.evaluate(query, {'hit_count': 10}, process)
            assert results == expected
Example #8
def assignment_b_suffixarray_2():

    from corpus import Corpus, Document  # Base classes, assumed to live in the corpus module.
    from normalization import Normalizer  # Base class, assumed to live in the normalization module.
    from tokenization import BrainDeadTokenizer
    from suffixarray import SuffixArray

    # For testing.
    class TestNormalizer(Normalizer):

        _table = str.maketrans({'Ø': 'O'})

        def canonicalize(self, buffer: str) -> str:
            return buffer

        def normalize(self, token: str) -> str:
            return token.upper().translate(self._table)

    # For testing.
    class TestDocument(Document):
        def __init__(self, document_id: int, a: str, b: str):
            self._document_id = document_id
            self._a = a
            self._b = b

        def get_document_id(self) -> int:
            return self._document_id

        def get_field(self, field_name: str, default: str) -> str:
            if field_name == "a":
                return self._a
            if field_name == "b":
                return self._b
            return default

    # For testing.
    class TestCorpus(Corpus):
        def __init__(self):
            self._docs = []
            self._docs.append(
                TestDocument(len(self._docs), "ø  o\n\n\nø\n\no",
                             "ø o\nø   \no"))
            self._docs.append(TestDocument(len(self._docs), "ba", "b bab"))
            self._docs.append(TestDocument(len(self._docs), "ø  o Ø o", "ø o"))
            self._docs.append(TestDocument(len(self._docs), "øO" * 10000, "o"))
            self._docs.append(
                TestDocument(len(self._docs), "cbab o øbab Ø ",
                             "ø o " * 10000))

        def __iter__(self):
            return iter(self._docs)

        def size(self) -> int:
            return len(self._docs)

        def get_document(self, document_id: int) -> Document:
            return self._docs[document_id]

    # Run the tests!
    for fields in [("b", ), ("a", "b")]:

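        # Build the engine over the given field combination and verify the expected (document, score) rankings.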
        engine = SuffixArray(TestCorpus(), fields, TestNormalizer(),
                             BrainDeadTokenizer())
        results = []

        def process(m):
            results.append((m['document'].document_id, m['score']))

        expected_results = {
            ('b', ): (
                ('bab', [(1, 1)]),
                ('ø o', [(4, 19999), (0, 3), (2, 1)]),
                ('o O', [(4, 19999), (0, 3), (2, 1)]),
                ('oooooo', []),
                ('o o o o', [(4, 19997), (0, 1)]),
            ),
            ('a', 'b'): (
                ('bab', [(1, 1)]),
                ('ø o', [(4, 20000), (0, 6), (2, 4)]),
                ('o O', [(4, 20000), (0, 6), (2, 4)]),
                ('oøØOøO', [
                    (3, 1),
                ]),
                ('o o o o', [(4, 19997), (0, 2), (2, 1)]),
            )
        }

        for query, expected in expected_results[fields]:
            results.clear()
            engine.evaluate(query, {'hit_count': 10}, process)
            assert results == expected
Example #9
def assignment_b():

    from corpus import InMemoryCorpus
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from suffixarray import SuffixArray
    from trie import Trie  # Module name assumed.
    from stringfinder import StringFinder  # Module name assumed.

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Prepare for some suffix array lookups.
    print("LOADING...")
    corpus = InMemoryCorpus("data/cran.xml")
    print("INDEXING...")
    engine = SuffixArray(corpus, ["body"], normalizer, tokenizer)
    results = []
    hit_count = 5

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Define the actual test queries.
    test1 = ("visc", 11, [328])  # Look for {'viscous', 'viscosity', ...}.
    test2 = ("Of  A", 10, [946])  # Test robustness for case and whitespace.
    test3 = ("", 0, [])  # Safety feature: Match nothing instead of everything.
    test4 = ("approximate solution", 3, [1374, 159])  # Multiple winners.

    # Test that the simple occurrence ranking works. Be robust towards how ties are resolved.
    for (query, winner_score,
         winner_document_ids) in [test1, test2, test3, test4]:
        print("SEARCHING for '" + query + "'...")
        results.clear()
        engine.evaluate(query, {
            "debug": False,
            "hit_count": hit_count
        }, match_collector)
        if winner_document_ids:
            assert results[0]["score"] == winner_score
            assert results[0]["document"].document_id in winner_document_ids
            assert len(results) <= hit_count
        else:
            assert len(results) == 0

    # Simple test of using a trie-encoded dictionary for efficiently locating substrings in a buffer.
    trie = Trie()
    for s in [
            "romerike", "apple computer", "norsk", "norsk ørret", "sverige",
            "ørret", "banan"
    ]:
        trie.add(s, tokenizer)
    finder = StringFinder(trie, tokenizer)
    buffer = "det var en gang en norsk  ørret fra romerike som likte abba fra sverige"
    print("SCANNING...")
    results.clear()
    finder.scan(buffer, lambda m: results.append(m))
    print("Buffer \"" + buffer + "\" contains", results)
    assert [m["match"] for m in results
            ] == ["norsk", "norsk ørret", "ørret", "romerike", "sverige"]

    # Find all MeSH terms that occur verbatim in some selected Cranfield documents! Since MeSH
    # documents are medical terms and the Cranfield documents have technical content, the
    # overlap probably isn't that big.
    print("LOADING...")
    mesh = InMemoryCorpus("data/mesh.txt")
    cranfield = InMemoryCorpus("data/cran.xml")
    print("BUILDING...")
    trie = Trie()
    for d in mesh:
        trie.add(d["body"] or "", tokenizer)
    finder = StringFinder(trie, tokenizer)
    print("SCANNING...")
    for (document_id,
         expected_matches) in [(0, ["wing", "wing"]),
                               (3, ["solutions", "skin", "friction"]),
                               (1254, ["electrons", "ions"])]:
        document = cranfield[document_id]
        buffer = document["body"] or ""
        results.clear()
        finder.scan(buffer, lambda m: results.append(m))
        print("Cranfield document", document, "contains MeSH terms", results)
        assert [m["match"] for m in results] == expected_matches