def test_uses_yield(self):
    """Check that StringFinder.scan hands back a generator object.

    A one-entry trie is enough here: only the type of the value returned
    by scan() is inspected, and the result is never iterated.
    """
    from types import GeneratorType

    dictionary = in3120.Trie()
    dictionary.add(["foo"], self.__tokenizer)
    scanner = in3120.StringFinder(dictionary, self.__tokenizer)
    self.assertIsInstance(
        scanner.scan("the foo bar"),
        GeneratorType,
        "Are you using yield?",
    )
def repl_b_2():
    """Start an interactive loop that scans typed text for MeSH terms.

    A trie is built from the "body" field of every document in mesh.txt,
    and a StringFinder over that trie is handed to simple_repl as a
    callback. Dictionary entries and user input go through the same
    canonicalize-then-normalize pipeline before being used.
    """
    print("Building trie from MeSH corpus...")
    normalizer = in3120.BrainDeadNormalizer()
    tokenizer = in3120.BrainDeadTokenizer()

    def _clean(text):
        # Shared by dictionary entries and queries so both are compared
        # in the same form.
        return normalizer.normalize(normalizer.canonicalize(text))

    mesh_documents = in3120.InMemoryCorpus(data_path("mesh.txt"))
    mesh_terms = in3120.Trie()
    # Generator expression: bodies are cleaned lazily as the trie consumes them.
    mesh_terms.add((_clean(doc["body"]) for doc in mesh_documents), tokenizer)
    finder = in3120.StringFinder(mesh_terms, tokenizer)
    print("Enter some text and locate words and phrases that are MeSH terms.")
    # list() drains the scan result so the REPL receives a concrete list.
    simple_repl("text", lambda t: list(finder.scan(_clean(t))))
def test_access_nodes(self):
    """Walk a small trie node by node and check finality along the way.

    Covers: the root is not final, consuming an unknown string yields
    None, a proper prefix ("ab") is not final, a stored string ("abb")
    is final, and stepwise consumption compares equal to consuming the
    whole string at once.
    """
    splitter = in3120.BrainDeadTokenizer()
    trie = in3120.Trie()
    trie.add(["abba", "ørret", "abb", "abbab", "abbor"], splitter)

    # Root-level checks.
    self.assertFalse(trie.is_final())
    self.assertIsNone(trie.consume("snegle"))

    # "ab" is only a prefix of stored strings, not a stored string itself.
    prefix_node = trie.consume("ab")
    self.assertFalse(prefix_node.is_final())

    # One more "b" lands on the stored string "abb".
    word_node = prefix_node.consume("b")
    self.assertTrue(word_node.is_final())
    self.assertEqual(word_node, trie.consume("abb"))
def test_mesh_terms_in_cran_corpus(self):
    """Scan selected Cranfield documents for MeSH dictionary entries.

    The dictionary is built from the "body" field of every document in
    ../data/mesh.txt, with a falsy body replaced by the empty string.
    Three documents from ../data/cran.xml are then scanned, and each
    body is passed with its expected match list to the class's private
    __scan_buffer_verify_matches helper (defined outside this method).
    """
    mesh_corpus = in3120.InMemoryCorpus("../data/mesh.txt")
    cran_corpus = in3120.InMemoryCorpus("../data/cran.xml")

    dictionary = in3120.Trie()
    dictionary.add((doc["body"] or "" for doc in mesh_corpus), self.__tokenizer)
    scanner = in3120.StringFinder(dictionary, self.__tokenizer)

    # (index into the Cranfield corpus, expected matches), checked in order.
    expectations = (
        (0, ["wing", "wing"]),
        (3, ["solutions", "skin", "friction"]),
        (1254, ["electrons", "ions"]),
    )
    for index, expected in expectations:
        self.__scan_buffer_verify_matches(scanner, cran_corpus[index]["body"], expected)
def test_scan(self):
    """Scan several buffers against a small mixed dictionary.

    The dictionary holds single-word and multi-word entries, some of
    which overlap ("norsk" / "norsk ørret" / "ørret", "a" / "a b").
    Each buffer is passed with its expected match list to the class's
    private __scan_buffer_verify_matches helper (defined outside this
    method).
    """
    entries = [
        "romerike",
        "apple computer",
        "norsk",
        "norsk ørret",
        "sverige",
        "ørret",
        "banan",
        "a",
        "a b",
    ]
    trie = in3120.Trie()
    trie.add(entries, self.__tokenizer)
    scanner = in3120.StringFinder(trie, self.__tokenizer)

    # (buffer to scan, expected matches), checked in this order.
    cases = (
        (
            "en norsk ørret fra romerike likte abba fra sverige",
            ["norsk", "norsk ørret", "ørret", "romerike", "sverige"],
        ),
        ("the apple is red", []),
        ("", []),
        (
            "apple computer banan foo sverige ben reddik fy fasan",
            ["apple computer", "banan", "sverige"],
        ),
        ("a a b", ["a", "a", "a b"]),
    )
    for buffer, expected in cases:
        self.__scan_buffer_verify_matches(scanner, buffer, expected)