Beispiel #1
0
 def test_parse(self):
     # Checks the slash-formatted output of it.parser.parse() for one fixed
     # sentence, then measures how often it.parse() agrees with the tags in
     # the "tagged-it-wacky.txt" corpus and requires more than 92% agreement.
     import io  # Local import: io.open() decodes UTF-8 on Python 2 and 3 alike.

     # Assert parsed output with Penn Treebank II tags (slash-formatted).
     # "il gatto nero" is a noun phrase, "sulla stuoia" is a prepositional noun phrase.
     v = it.parser.parse(u"Il gatto nero seduto sulla stuoia.")
     self.assertEqual(v,
         u"Il/DT/B-NP/O gatto/NN/I-NP/O nero/JJ/I-NP/O "
         u"seduto/VB/B-VP/O "
         u"sulla/IN/B-PP/B-PNP stuoia/NN/B-NP/I-PNP ././O/O"
     )
     # Assert the accuracy of the Italian tagger.
     # i counts tokens whose tag is accepted, n counts all tokens.
     i, n = 0, 0
     path = os.path.join(PATH, "corpora", "tagged-it-wacky.txt")
     # The context manager closes the corpus even when parsing raises, and
     # the encoding argument yields text lines, so no .decode() is needed.
     with io.open(path, encoding="utf-8") as corpus:
         for sentence in corpus:
             sentence = sentence.strip()
             # Each corpus token is "word/tag".
             s1 = [w.split("/") for w in sentence.split(" ")]
             # Re-tag the bare words as a single pre-tokenized sentence.
             s2 = [[w for w, pos in s1]]
             s2 = it.parse(s2, tokenize=False)
             s2 = [w.split("/") for w in s2.split(" ")]
             for j, (word, t1) in enumerate(s1):
                 t2 = s2[j][1]
                 # WaCKy test set tags plural nouns as "NN", pattern.it as "NNS".
                 # Some punctuation marks are also tagged differently,
                 # but these are not necessarily errors.
                 if (t1 == t2
                         or (t1 == "NN" and t2.startswith("NN"))
                         or word in "\":;)-"):
                     i += 1
                 n += 1
     # Fail with a clear message instead of a ZeroDivisionError on an empty corpus.
     self.assertTrue(n > 0, "no tokens were read from the corpus")
     accuracy = float(i) / n
     self.assertTrue(accuracy > 0.92, "tagger accuracy too low: %.4f" % accuracy)
     print("pattern.it.parser.parse()")
Beispiel #2
0
 def test_parse(self):
     # Verifies it.parser.parse() against a known slash-formatted result and
     # then scores it.parse() on the "tagged-it-wacky.txt" corpus; the share
     # of accepted tags must exceed 0.92.
     import io  # Local import: gives a decoding open() on Python 2 as well as 3.

     # Assert parsed output with Penn Treebank II tags (slash-formatted).
     # "il gatto nero" is a noun phrase, "sulla stuoia" is a prepositional noun phrase.
     v = it.parser.parse(u"Il gatto nero seduto sulla stuoia.")
     expected = (
         u"Il/DT/B-NP/O gatto/NN/I-NP/O nero/JJ/I-NP/O "
         u"seduto/VB/B-VP/O "
         u"sulla/IN/B-PP/B-PNP stuoia/NN/B-NP/I-PNP ././O/O"
     )
     self.assertEqual(v, expected)
     # Assert the accuracy of the Italian tagger.
     # i = number of accepted tags, n = number of tokens compared.
     i, n = 0, 0
     corpus_path = os.path.join(PATH, "corpora", "tagged-it-wacky.txt")
     # Opening through a with-block guarantees the file handle is released;
     # io.open() decodes each line, replacing the Python 2-only str.decode().
     with io.open(corpus_path, encoding="utf-8") as f:
         for sentence in f:
             sentence = sentence.strip()
             # Corpus tokens are written as "word/tag".
             s1 = [w.split("/") for w in sentence.split(" ")]
             # Feed only the words back to the parser, already tokenized.
             s2 = [[w for w, pos in s1]]
             s2 = it.parse(s2, tokenize=False)
             s2 = [w.split("/") for w in s2.split(" ")]
             for j, (word, t1) in enumerate(s1):
                 t2 = s2[j][1]
                 # WaCKy test set tags plural nouns as "NN", pattern.it as "NNS".
                 # Some punctuation marks are also tagged differently,
                 # but these are not necessarily errors.
                 if t1 == t2 or (t1 == "NN" and t2 == "NNS") or word in "\":;)-":
                     i += 1
                 n += 1
     # An empty corpus should be reported as a failure, not a ZeroDivisionError.
     self.assertTrue(n > 0, "no tokens were read from the corpus")
     accuracy = float(i) / n
     self.assertTrue(accuracy > 0.92, "tagger accuracy too low: %.4f" % accuracy)
     # Parenthesized single-argument form prints identically on Python 2 and 3.
     print("pattern.it.parser.parse()")
def lemmatize_word(input_word):
    """Return element ``[0][0][4]`` of the split parse of *input_word*.

    The word is handed to ``parse`` without tokenization and with
    ``lemmata=True``; the result is split and the fifth field of the first
    item of the first group is returned (presumably the lemma).
    """
    # NOTE(review): pattern's parser documents the keywords as `tags` and
    # `chunks`; `tag`/`chunk` are passed through exactly as before -- confirm
    # they have the intended effect, since index 4 below depends on how many
    # fields each token carries.
    options = dict(tokenize=False, tag=False, chunk=False, lemmata=True)
    parsed = parse(input_word, **options)
    # Presumably sentences -> tokens -> tag fields; verify against parse().
    first_sentence = parsed.split()[0]
    first_token = first_sentence[0]
    return first_token[4]
def _getParse(word, language):
    """Parse *word* with the pattern module selected by *language*.

    The codes "es", "en", "it", "fr" and "de" select the matching module's
    ``parse``; any other value falls back to ``pattern.en``. The result of
    that ``parse`` call is returned unchanged.
    """
    import pattern.en as pattern_en  # @UnresolvedImport
    import pattern.es as pattern_es  # @UnresolvedImport
    import pattern.fr as pattern_fr  # @UnresolvedImport
    import pattern.de as pattern_de  # @UnresolvedImport
    import pattern.it as pattern_it  # @UnresolvedImport

    # Ordered (code, module) pairs, compared with == in the listed order.
    parsers_by_code = (
        ("es", pattern_es),
        ("en", pattern_en),
        ("it", pattern_it),
        ("fr", pattern_fr),
        ("de", pattern_de),
    )
    for code, module in parsers_by_code:
        if language == code:
            return module.parse(word)
    # Unknown language codes are parsed as English.
    return pattern_en.parse(word)