Ejemplo n.º 1
0
 def decode(self, term):
     """Return the lemma for a space-separated term string.

     The term is first looked up exactly as written. If that lookup raises
     ``KeyError``, it is retried once with the text lower-cased.

     Args:
         term: Term text whose words are separated by single spaces.

     Returns:
         Whatever ``self.inflections.to_lemma`` returns for the inflection
         built from ``term``.

     Raises:
         KeyError: If the lower-cased retry raises it as well.
     """
     try:
         return self.inflections.to_lemma(Term(term.split(" ")))
     except KeyError:
         # The exact-case form failed; retry with the lower-cased text.
         return self.inflections.to_lemma(Term(term.lower().split(" ")))
Ejemplo n.º 2
0
 def test_equivalence1(self):
     """Two identically built leaves stored under different keys are both kept."""
     first_leaf = Node({}, True, Term(["child"]))
     second_leaf = Node({}, True, Term(["child"]))
     parent = Node({"child_a": first_leaf, "child_b": second_leaf})

     self.assertEqual(len(parent.children), 2)
     # Each stored leaf must itself have no children.
     for key in ("child_a", "child_b"):
         self.assertEqual(len(parent.children[key].children), 0)
Ejemplo n.º 3
0
 def test_extract_terms_overlap(self):
     """Check the extraction result when candidate bigrams overlap in the text."""
     text = "once there was a little man"
     leading = Term(["once", "there"])
     middle = Term(["there", "was"])
     trailing = Term(["was", "a"])
     trie = build_trie({leading, middle, trailing})

     found = extract_terms(split_words(text), trie)

     # The middle bigram shares a word with each neighbour and is not expected.
     self.assertEqual(found, {leading, trailing})
Ejemplo n.º 4
0
    def test_term(self):
        """Terms compare and hash case-sensitively; ``lower()`` removes the difference."""
        lowercase = Term(["fox", "dog"])
        mixed_case = Term(["Fox", "dOG"])

        # A term equals itself and hashes consistently.
        self.assertEqual(lowercase, lowercase)
        self.assertEqual(hash(lowercase), hash(lowercase))

        # Different casing gives a different term and a different hash.
        self.assertNotEqual(lowercase, mixed_case)
        self.assertNotEqual(hash(lowercase), hash(mixed_case))

        # After lower-casing, the mixed-case term matches the lowercase one.
        self.assertEqual(mixed_case.lower(), lowercase)
        self.assertEqual(hash(mixed_case.lower()), hash(lowercase))
Ejemplo n.º 5
0
 def test_extract_terms_end(self):
     """Terms that end on the last word of the text are extracted."""
     text = "once there was a little man"
     unigram = Term(["man"])
     unigram_trie = build_trie([unigram])
     bigram = Term(["little", "man"])
     bigram_trie = build_trie([bigram])

     # Each trie holds one term; exactly that term must be found.
     for trie, term in ((unigram_trie, unigram), (bigram_trie, bigram)):
         self.assertEqual(extract_terms(split_words(text), trie), {term})
Ejemplo n.º 6
0
 def test_node(self):
     """Node equality depends on both the child keys and the child nodes."""
     leaf = Node({}, True, Term(["child"]))
     bare_leaf = Node({}, False)
     subject = Node({"child": leaf})

     self.assertEqual(subject, Node({"child": leaf}))
     # A different key, or a different node under the same key, breaks equality.
     for changed_children in ({"other": leaf}, {"child": bare_leaf}):
         self.assertNotEqual(subject, Node(changed_children))
Ejemplo n.º 7
0
 def test_node_nonterminal(self):
     """Nodes built with ``False`` compare by their child keys and child nodes."""
     leaf = Node({}, True, Term(["root"]))
     bare_leaf = Node({}, False)
     subject = Node({"child": leaf}, False)

     self.assertEqual(subject, Node({"child": leaf}, False))
     # A different key, or a different node under the same key, breaks equality.
     for changed_children in ({"other": leaf}, {"child": bare_leaf}):
         self.assertNotEqual(subject, Node(changed_children, False))
Ejemplo n.º 8
0
 def test_extract_terms(self):
     """Extract "little" and "little man" from several sample texts."""
     short_term = Term(["little"])
     long_term = Term(["little", "man"])
     trie = build_trie({short_term, long_term})

     # (text, terms expected to be extracted), checked in this order.
     cases = [
         ("once there was a little man, a little wooden man",
          {short_term, long_term}),
         ("once there was a man, a little wooden man", {short_term}),
         ("once there was a little man, a wooden man", {long_term}),
         ("little <= man", {short_term}),
         ("little > man", {long_term}),
     ]
     for text, expected in cases:
         self.assertEqual(extract_terms(split_words(text), trie), expected)
Ejemplo n.º 9
0
 def test_node_empty(self):
     """A childless node equals an identically built one but not ``Node({}, False)``."""
     root = Node({}, True, Term(["root"]))
     twin = Node({}, True, Term(["root"]))

     self.assertEqual(root, twin)
     self.assertNotEqual(root, Node({}, False))
Ejemplo n.º 10
0
    def test_term_content_text_one_sentence(self):
        """Parse a terms/content stream with one content sentence for windows 1 to 3."""
        stream = [
            "Apple. Goat .\nexplore. \ncrater Sphere"
            + TermsContentText.TERMS_CONTENT_SEPARATOR
            + "Apples something explores and goats."
        ]
        apple = Term(["appl"])
        goat = Term(["goat"])
        explore = Term(["explor"])
        crater_sphere = Term(["crater", "sphere"])
        # The only content sentence, as the word list expected in every entry.
        sentence = "Apples something explores and goats".split()
        trio = (apple, goat, explore)

        for window in range(1, 4):
            label = "window %d" % window
            parse = parse_input(stream, TERMS_CONTENT_TEXT, window)
            self.assertEqual(
                parse.terms, {apple, goat, explore, crater_sphere}, label)
            # Each of the three terms must co-occur with the other two.
            for term in trio:
                expected = {
                    other: [sentence] for other in trio if other is not term
                }
                self.assertEqual(parse.cooccurrences[term], expected, label)
Ejemplo n.º 11
0
 def test_term_content_text(self):
     """Parse a terms/content text stream and check terms and co-occurrences."""
     stream = [
         "Apple. Goat .\nexplore. \ncrater Sphere"
         + TermsContentText.TERMS_CONTENT_SEPARATOR
         + "Apples something explores and goats.\nxyz querty apples crater. Explore Crater sphere\ntermy . explores apples"
     ]
     parse = parse_input(stream, TERMS_CONTENT_TEXT)

     apple = Term(["appl"])
     goat = Term(["goat"])
     explore = Term(["explor"])
     crater_sphere = Term(["crater", "sphere"])
     # Word lists of the sentences that appear in the expected co-occurrences.
     all_three = "Apples something explores and goats".split()
     explore_apple = "explores apples".split()
     explore_crater = "Explore Crater sphere termy".split()

     self.assertEqual(parse.terms, {apple, goat, explore, crater_sphere})

     expected_by_term = [
         (apple, {goat: [all_three], explore: [all_three, explore_apple]}),
         (goat, {apple: [all_three], explore: [all_three]}),
         (explore, {
             apple: [all_three, explore_apple],
             goat: [all_three],
             crater_sphere: [explore_crater],
         }),
         (crater_sphere, {explore: [explore_crater]}),
     ]
     for term, expected in expected_by_term:
         self.assertEqual(parse.cooccurrences[term], expected)
Ejemplo n.º 12
0
    def test_wikipedia_articles_list(self):
        """Parse two Wikipedia articles and spot-check terms, co-occurrences, inflections.

        This test relies on network connectivity to first save the Wikipedia
        articles locally.
        """
        articles = ["Paleozoic", "Gravity"]
        parse = parse_input(articles, WIKIPEDIA_ARTICLES_LIST)

        phanerozoic = Term(["phanerozo"])
        permian = Term(["permian"])
        gravity = Term(["gravit"])
        mass = Term(["mass"])

        for stem in (phanerozoic, permian, gravity, mass):
            self.assertIn(stem, parse.terms)

        self.assertGreater(len(parse.terms), 200)

        # Each pair must co-occur at least once, with equal counts both ways.
        for left, right in ((phanerozoic, permian), (gravity, mass)):
            self.assertGreaterEqual(len(parse.cooccurrences[left][right]), 1)
            self.assertEqual(
                len(parse.cooccurrences[right][left]),
                len(parse.cooccurrences[left][right]),
            )

        self.assertEqual(
            parse.inflections.to_dominant_inflection(phanerozoic),
            Term(["phanerozoic"]),
        )
        self.assertEqual(
            parse.inflections.to_dominant_inflection(gravity),
            Term(["gravitational"]),
        )
Ejemplo n.º 13
0
 def test_glossary_csv(self):
     """Parse two-column glossary rows and check terms and co-occurrences."""
     rows = [
         ["Apple", "Goat explores. xyz querty apples crater . Explore apples"],
         ["Goat", "nadda."],
         ["explore", "Crater sphere\ntermy."],
         ["crater Sphere", "nadda."],
     ]
     parse = parse_input(rows, GLOSSARY_CSV)

     apple = Term(["appl"])
     goat = Term(["goat"])
     explore = Term(["explor"])
     crater_sphere = Term(["crater", "sphere"])

     self.assertEqual(parse.terms, {apple, goat, explore, crater_sphere})

     # Expected co-occurrences per term, checked in the order listed.
     expected_by_term = [
         (apple, {
             goat: ["Goat explores".split()],
             explore: ["Goat explores".split(), "Explore apples".split()],
         }),
         (goat, {}),
         (explore, {crater_sphere: ["Crater sphere termy".split()]}),
         (crater_sphere, {}),
     ]
     for term, expected in expected_by_term:
         self.assertEqual(parse.cooccurrences[term], expected)