def decode(self, term):
    """Look up ``term`` through ``self.inflections.to_lemma``.

    ``term`` is split on single spaces, wrapped in a ``Term`` and handed to
    ``self.inflections.to_lemma``. If that attempt raises ``KeyError``, the
    lookup is repeated once with ``term`` lower-cased first. A ``KeyError``
    raised by the second attempt propagates to the caller.
    """
    try:
        return self.inflections.to_lemma(Term(term.split(" ")))
    except KeyError:
        # Lookup with the original casing failed; retry with lower-cased words.
        lowered_words = term.lower().split(" ")
        return self.inflections.to_lemma(Term(lowered_words))
def test_equivalence1(self):
    """A node given two identically-built children keeps both entries.

    Both children are constructed with the same arguments but stored under
    different keys; the parent must report two children, each of which has
    no children of its own.
    """
    first = Node({}, True, Term(["child"]))
    second = Node({}, True, Term(["child"]))
    parent = Node({"child_a": first, "child_b": second})

    self.assertEqual(len(parent.children), 2)
    for key in ("child_a", "child_b"):
        self.assertEqual(len(parent.children[key].children), 0)
def test_extract_terms_overlap(self):
    """Check extraction when two-word terms overlap in the text.

    "once there", "there was" and "was a" are all in the trie; the expected
    result contains only "once there" and "was a".
    """
    text = "once there was a little man"
    leading_pair = Term(["once", "there"])
    overlapping_pair = Term(["there", "was"])
    trailing_pair = Term(["was", "a"])
    trie = build_trie({leading_pair, overlapping_pair, trailing_pair})

    found = extract_terms(split_words(text), trie)

    self.assertEqual(found, {leading_pair, trailing_pair})
def test_term(self):
    """Check ``Term`` equality, hashing and ``lower()``.

    Terms that differ only in letter case must compare unequal and hash
    differently, while lower-casing the mixed-case term must make it equal
    (and hash-equal) to the all-lowercase one.
    """
    plain = Term(["fox", "dog"])
    mixed_case = Term(["Fox", "dOG"])

    # A term equals itself and hashes consistently.
    self.assertEqual(plain, plain)
    self.assertEqual(hash(plain), hash(plain))

    # Letter case is significant for both equality and hashing.
    self.assertNotEqual(plain, mixed_case)
    self.assertNotEqual(hash(plain), hash(mixed_case))

    # lower() removes the case difference.
    self.assertEqual(mixed_case.lower(), plain)
    self.assertEqual(hash(mixed_case.lower()), hash(plain))
def test_extract_terms_end(self):
    """Check that terms ending on the last word of the text are extracted.

    Exercised once with the single-word term "man" and once with the
    two-word term "little man", each in its own trie.
    """
    text = "once there was a little man"
    single_word = Term(["man"])
    single_word_trie = build_trie([single_word])
    two_words = Term(["little", "man"])
    two_words_trie = build_trie([two_words])

    for term, trie in ((single_word, single_word_trie), (two_words, two_words_trie)):
        self.assertEqual(extract_terms(split_words(text), trie), {term})
def test_node(self):
    """Check ``Node`` equality for nodes built with only a children mapping.

    A node equals another built from the same mapping, and differs when
    either the key or the child stored under that key differs.
    """
    child_with_term = Node({}, True, Term(["child"]))
    child_without_term = Node({}, False)
    parent = Node({"child": child_with_term})

    same_mapping = Node({"child": child_with_term})
    other_key = Node({"other": child_with_term})
    other_child = Node({"child": child_without_term})

    self.assertEqual(parent, same_mapping)
    self.assertNotEqual(parent, other_key)
    self.assertNotEqual(parent, other_child)
def test_node_nonterminal(self):
    """Check ``Node`` equality when ``False`` is passed as the second argument.

    Mirrors the comparisons made for nodes built from a mapping alone: equal
    for the same mapping, unequal for a different key or a different child.
    """
    child_with_term = Node({}, True, Term(["root"]))
    child_without_term = Node({}, False)
    parent = Node({"child": child_with_term}, False)

    same_mapping = Node({"child": child_with_term}, False)
    other_key = Node({"other": child_with_term}, False)
    other_child = Node({"child": child_without_term}, False)

    self.assertEqual(parent, same_mapping)
    self.assertNotEqual(parent, other_key)
    self.assertNotEqual(parent, other_child)
def test_extract_terms(self):
    """Check extraction of "little" and "little man" from several texts.

    Each case pairs an input text with the exact set of terms expected from
    ``extract_terms`` when both terms are present in the trie.
    """
    little = Term(["little"])
    little_man = Term(["little", "man"])
    trie = build_trie({little, little_man})

    cases = (
        ("once there was a little man, a little wooden man", {little, little_man}),
        ("once there was a man, a little wooden man", {little}),
        ("once there was a little man, a wooden man", {little_man}),
        ("little <= man", {little}),
        ("little > man", {little_man}),
    )
    for text, expected in cases:
        self.assertEqual(extract_terms(split_words(text), trie), expected)
def test_node_empty(self):
    """Check ``Node`` equality for nodes that have no children.

    Two childless nodes built with the same arguments are equal; a childless
    node built with ``False`` and no term is not equal to them.
    """
    root = Node({}, True, Term(["root"]))
    same_arguments = Node({}, True, Term(["root"]))
    different_arguments = Node({}, False)

    self.assertEqual(root, same_arguments)
    self.assertNotEqual(root, different_arguments)
def test_term_content_text_one_sentence(self):
    """Parse a terms-plus-content stream whose content is a single sentence.

    The same stream is parsed with the third ``parse_input`` argument set to
    1, 2 and 3; the parsed terms and the co-occurrences of "appl", "goat"
    and "explor" are expected to be identical for every value.
    """
    terms_part = "Apple. Goat .\nexplore. \ncrater Sphere"
    content_part = "Apples something explores and goats."
    stream = [terms_part + TermsContentText.TERMS_CONTENT_SEPARATOR + content_part]

    appl = Term(["appl"])
    goat = Term(["goat"])
    explor = Term(["explor"])
    crater_sphere = Term(["crater", "sphere"])
    sentence = "Apples something explores and goats".split()

    # (term, expected co-occurrence mapping), checked in this order.
    expectations = (
        (appl, {goat: [sentence], explor: [sentence]}),
        (goat, {appl: [sentence], explor: [sentence]}),
        (explor, {appl: [sentence], goat: [sentence]}),
    )

    for window in range(1, 4):
        label = "window %d" % window
        parse = parse_input(stream, TERMS_CONTENT_TEXT, window)
        self.assertEqual(parse.terms, {appl, goat, explor, crater_sphere}, label)
        for term, expected in expectations:
            self.assertEqual(parse.cooccurrences[term], expected, label)
def test_term_content_text(self):
    """Parse a terms-plus-content stream whose content spans several sentences.

    Verifies the parsed term set and, for each of the four terms, the exact
    mapping from co-occurring term to the list of word lists expected in
    ``parse.cooccurrences``.
    """
    terms_part = "Apple. Goat .\nexplore. \ncrater Sphere"
    content_part = (
        "Apples something explores and goats.\n"
        "xyz querty apples crater. Explore Crater sphere\n"
        "termy . explores apples"
    )
    stream = [terms_part + TermsContentText.TERMS_CONTENT_SEPARATOR + content_part]

    parse = parse_input(stream, TERMS_CONTENT_TEXT)

    appl = Term(["appl"])
    goat = Term(["goat"])
    explor = Term(["explor"])
    crater_sphere = Term(["crater", "sphere"])
    self.assertEqual(parse.terms, {appl, goat, explor, crater_sphere})

    first_sentence = "Apples something explores and goats".split()
    last_sentence = "explores apples".split()
    crater_sentence = "Explore Crater sphere termy".split()

    # (term, expected co-occurrence mapping), checked in this order.
    expectations = (
        (appl, {
            goat: [first_sentence],
            explor: [first_sentence, last_sentence],
        }),
        (goat, {
            appl: [first_sentence],
            explor: [first_sentence],
        }),
        (explor, {
            appl: [first_sentence, last_sentence],
            goat: [first_sentence],
            crater_sphere: [crater_sentence],
        }),
        (crater_sphere, {
            explor: [crater_sentence],
        }),
    )
    for term, expected in expectations:
        self.assertEqual(parse.cooccurrences[term], expected)
def test_wikipedia_articles_list(self):
    """Parse the "Paleozoic" and "Gravity" articles and spot-check the result.

    Checks that a handful of expected terms are present, that more than 200
    terms were found, that selected co-occurrences exist with matching
    lengths in both directions, and that two dominant inflections come back
    as expected.
    """
    # This test will rely on network connectivity to first locally save the wikipedia articles.
    article_titles = ["Paleozoic", "Gravity"]
    parse = parse_input(article_titles, WIKIPEDIA_ARTICLES_LIST)

    phanerozo = Term(["phanerozo"])
    permian = Term(["permian"])
    gravit = Term(["gravit"])
    mass = Term(["mass"])

    for expected_term in (phanerozo, permian, gravit, mass):
        self.assertIn(expected_term, parse.terms)
    self.assertGreater(len(parse.terms), 200)

    # Each pair must co-occur at least once, with equal counts both ways.
    for left, right in ((phanerozo, permian), (gravit, mass)):
        self.assertGreaterEqual(len(parse.cooccurrences[left][right]), 1)
        self.assertEqual(
            len(parse.cooccurrences[right][left]),
            len(parse.cooccurrences[left][right]),
        )

    self.assertEqual(
        parse.inflections.to_dominant_inflection(phanerozo),
        Term(["phanerozoic"]),
    )
    self.assertEqual(
        parse.inflections.to_dominant_inflection(gravit),
        Term(["gravitational"]),
    )
def test_glossary_csv(self):
    """Parse a four-row glossary stream and verify terms and co-occurrences.

    Each row is a two-element list of strings. The parsed term set must hold
    the four expected terms; "appl" and "explor" have the co-occurrences
    listed below, while "goat" and "crater sphere" have none.
    """
    rows = [
        ["Apple", "Goat explores. xyz querty apples crater . Explore apples"],
        ["Goat", "nadda."],
        ["explore", "Crater sphere\ntermy."],
        ["crater Sphere", "nadda."],
    ]
    parse = parse_input(rows, GLOSSARY_CSV)

    appl = Term(["appl"])
    goat = Term(["goat"])
    explor = Term(["explor"])
    crater_sphere = Term(["crater", "sphere"])
    self.assertEqual(parse.terms, {appl, goat, explor, crater_sphere})

    expected_for_appl = {
        goat: ["Goat explores".split()],
        explor: ["Goat explores".split(), "Explore apples".split()],
    }
    expected_for_explor = {
        crater_sphere: ["Crater sphere termy".split()],
    }

    self.assertEqual(parse.cooccurrences[appl], expected_for_appl)
    self.assertEqual(parse.cooccurrences[goat], {})
    self.assertEqual(parse.cooccurrences[explor], expected_for_explor)
    self.assertEqual(parse.cooccurrences[crater_sphere], {})