Esempio n. 1
0
def main():
    """Run an interactive finder for MeSH terms in user-entered text.

    Builds a trie from the bodies of the documents in ``mesh.txt`` and wraps
    it in a ``StringFinder``. Every piece of text handed to the evaluator is
    canonicalized and normalized the same way as the dictionary entries were,
    then scanned; the evaluator returns whatever matches the scan reported.

    NOTE(review): ``data_path`` and ``simple_repl`` are not defined in this
    function; they are presumably provided at module level -- confirm.
    """
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from ahocorasick import Trie, StringFinder

    print("Building trie from MeSH corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    def clean(raw):
        # Canonicalize first, then normalize. The same pipeline is applied to
        # dictionary entries and to queries so that the two are comparable.
        return normalizer.normalize(normalizer.canonicalize(raw))

    mesh_file = os.path.join(data_path, 'mesh.txt')
    documents = InMemoryCorpus(mesh_file)
    trie = Trie()
    for entry in documents:
        trie.add(clean(entry["body"]), tokenizer)
    finder = StringFinder(trie, tokenizer)
    print("Enter some text and locate words and phrases that are MeSH terms.")

    def evaluator(text):
        # The scan reports each match through a callback; collect them all.
        hits = []
        finder.scan(clean(text), hits.append)
        return hits

    simple_repl("text", evaluator)
Esempio n. 2
0
class TestBrainDeadNormalizer(unittest.TestCase):
    """Checks for ``BrainDeadNormalizer.canonicalize`` and ``.normalize``."""

    def setUp(self):
        # A fresh normalizer is created before every test method.
        from normalization import BrainDeadNormalizer
        self._normalizer = BrainDeadNormalizer()

    def test_canonicalize(self):
        # Canonicalization is expected to hand the text back unchanged,
        # including its casing, the embedded newline and the non-ASCII letter.
        text = "Dette ER en\nprØve!"
        canonical = self._normalizer.canonicalize(text)
        self.assertEqual(canonical, text)

    def test_normalize(self):
        # Normalization is expected to lowercase everything, non-ASCII
        # letters included.
        mixed_case = "grÅFustaSJEOpphengsForKOBling"
        expected = "gråfustasjeopphengsforkobling"
        normalized = self._normalizer.normalize(mixed_case)
        self.assertEqual(normalized, expected)