Beispiel #1
0
 def test_parse(self):
     """Check the Dutch parser's slash-formatted output and the tagger accuracy.

     Three checks are made:
     1) word/POS/chunk/PNP tags for a simple sentence,
     2) the lemma appended to every token when lemmata=True,
     3) more than 90% of the tags produced by nl.parse() agree with the
        tags in corpora/tagged-nl-twnc.txt after those have been passed
        through nl.wotan2penntreebank().
     """
     # Local import keeps the block self-contained; io.open() accepts an
     # encoding on both Python 2.6+ and Python 3.
     import io
     # Assert parsed output with Penn Treebank II tags (slash-formatted).
     # 1) "de zwarte kat" is a noun phrase, "op de mat" is a prepositional
     # noun phrase.
     v = nl.parser.parse("De zwarte kat zat op de mat.")
     self.assertEqual(v,
                      "De/DT/B-NP/O zwarte/JJ/I-NP/O kat/NN/I-NP/O " +
                      "zat/VBD/B-VP/O " +
                      "op/IN/B-PP/B-PNP de/DT/B-NP/I-PNP mat/NN/I-NP/I-PNP ././O/O"
                      )
     # 2) "jaagt" and "vogels" lemmata are "jagen" and "vogel".
     v = nl.parser.parse("De zwarte kat jaagt op vogels.", lemmata=True)
     self.assertEqual(v,
                      "De/DT/B-NP/O/de zwarte/JJ/I-NP/O/zwart kat/NN/I-NP/O/kat " +
                      "jaagt/VBZ/B-VP/O/jagen " +
                      "op/IN/B-PP/B-PNP/op vogels/NNS/B-NP/I-PNP/vogel ././O/O/."
                      )
     # Assert the accuracy of the Dutch tagger.
     i, n = 0, 0  # i = matching tags, n = tags compared
     corpus = os.path.join(PATH, "corpora", "tagged-nl-twnc.txt")
     # Decode while reading (instead of calling str.decode() per line, which
     # does not exist on Python 3) and close the file deterministically.
     with io.open(corpus, encoding="utf-8") as f:
         for sentence in f:
             sentence = sentence.strip()
             # Every space-separated token is split into a (word, tag) pair.
             s1 = [w.split("/") for w in sentence.split(" ")]
             s1 = [nl.wotan2penntreebank(w, tag) for w, tag in s1]
             # Re-tag the bare words; the sentence is passed pre-tokenized.
             s2 = [[w for w, pos in s1]]
             s2 = nl.parse(s2, tokenize=False)
             s2 = [w.split("/") for w in s2.split(" ")]
             # Index into s2 (rather than zip) so a length mismatch still
             # surfaces as an error instead of being silently truncated.
             for j, (w, pos) in enumerate(s1):
                 if pos == s2[j][1]:
                     i += 1
                 n += 1
     self.assertTrue(float(i) / n > 0.90)
     print("pattern.nl.parser.parse()")
Beispiel #2
0
 def test_parse(self):
     """Verify nl.parser.parse() output and the accuracy of the Dutch tagger.

     The first two assertions pin the exact slash-formatted string returned
     for two sentences (without and with lemmata). The final assertion
     requires that more than 90% of the tags from nl.parse() equal the
     corpus tags converted by nl.wotan2penntreebank().
     """
     # Local import keeps the block self-contained; io.open() takes an
     # explicit encoding on both Python 2.6+ and Python 3.
     import io
     # Assert parsed output with Penn Treebank II tags (slash-formatted).
     # 1) "de zwarte kat" is a noun phrase, "op de mat" is a prepositional
     # noun phrase.
     v = nl.parser.parse("De zwarte kat zat op de mat.")
     self.assertEqual(
         v, "De/DT/B-NP/O zwarte/JJ/I-NP/O kat/NN/I-NP/O " +
         "zat/VBD/B-VP/O " +
         "op/IN/B-PP/B-PNP de/DT/B-NP/I-PNP mat/NN/I-NP/I-PNP ././O/O")
     # 2) "jaagt" and "vogels" lemmata are "jagen" and "vogel".
     v = nl.parser.parse("De zwarte kat jaagt op vogels.", lemmata=True)
     self.assertEqual(
         v, "De/DT/B-NP/O/de zwarte/JJ/I-NP/O/zwart kat/NN/I-NP/O/kat " +
         "jaagt/VBZ/B-VP/O/jagen " +
         "op/IN/B-PP/B-PNP/op vogels/NNS/B-NP/I-PNP/vogel ././O/O/.")
     # Assert the accuracy of the Dutch tagger.
     i, n = 0, 0  # i = matching tags, n = tags compared
     corpus_path = os.path.join(PATH, "corpora", "tagged-nl-twnc.txt")
     # Read with an explicit encoding so the result does not depend on the
     # platform's default locale, and close the handle when done.
     # NOTE(review): UTF-8 is assumed for the corpus file - confirm.
     with io.open(corpus_path, encoding="utf-8") as corpus:
         for sentence in corpus:
             sentence = sentence.strip()
             # Every space-separated token is split into a (word, tag) pair.
             s1 = [w.split("/") for w in sentence.split(" ")]
             s1 = [nl.wotan2penntreebank(w, tag) for w, tag in s1]
             # Re-tag the bare words; the sentence is passed pre-tokenized.
             s2 = [[w for w, pos in s1]]
             s2 = nl.parse(s2, tokenize=False)
             s2 = [w.split("/") for w in s2.split(" ")]
             # Index into s2 (rather than zip) so a length mismatch still
             # surfaces as an error instead of being silently truncated.
             for j, (w, pos) in enumerate(s1):
                 if pos == s2[j][1]:
                     i += 1
                 n += 1
     self.assertTrue(float(i) / n > 0.90)
     print("pattern.nl.parser.parse()")
Beispiel #3
0
 def test_wotan2penntreebank(self):
     """Check that nl.wotan2penntreebank() yields the expected tag.

     Each case pairs the expected tag with the Wotan tag passed in; the
     function is called with an empty word and the second item of its
     result is compared against the expected tag.
     """
     # (expected tag, Wotan tag) pairs, checked in this order.
     cases = (
         ("NNP", "N(eigen,ev,neut)"),
         ("NNPS", "N(eigen,mv,neut)"),
         ("NN", "N(soort,ev,neut)"),
         ("NNS", "N(soort,mv,neut)"),
         ("VBZ", "V(refl,ott,3,ev)"),
         ("VBP", "V(intrans,ott,1_of_2_of_3,mv)"),
         ("VBD", "V(trans,ovt,1_of_2_of_3,mv)"),
         ("VBN", "V(trans,verl_dw,onverv)"),
         ("VBG", "V(intrans,teg_dw,onverv)"),
         ("VB", "V(intrans,inf)"),
         ("MD", "V(hulp_of_kopp,ott,3,ev)"),
         ("JJ", "Adj(attr,stell,onverv)"),
         ("JJR", "Adj(adv,vergr,onverv)"),
         ("JJS", "Adj(attr,overtr,verv_neut)"),
         ("RP", "Adv(deel_v)"),
         ("RB", "Adv(gew,geen_func,stell,onverv)"),
         ("DT", "Art(bep,zijd_of_mv,neut)"),
         ("CC", "Conj(neven)"),
         ("CD", "Num(hoofd,bep,zelfst,onverv)"),
         ("TO", "Prep(voor_inf)"),
         ("IN", "Prep(voor)"),
         ("PRP", "Pron(onbep,neut,attr)"),
         ("PRP$", "Pron(bez,2,ev,neut,attr)"),
         (",", "Punc(komma)"),
         ("(", "Punc(haak_open)"),
         (")", "Punc(haak_sluit)"),
         (".", "Punc(punt)"),
         ("UH", "Int"),
         ("SYM", "Misc(symbool)"),
     )
     for expected, wotan_tag in cases:
         converted = nl.wotan2penntreebank("", wotan_tag)
         self.assertEqual(converted[1], expected)
     print("pattern.nl.wotan2penntreebank()")
Beispiel #4
0
 def test_wotan2penntreebank(self):
     """Check that nl.wotan2penntreebank() yields the expected tag.

     The function is called with an empty word and each Wotan tag below;
     the second item of its result must equal the paired expected tag.
     """
     # Assert tag translation.
     for penntreebank, wotan in (
             ("NNP",  "N(eigen,ev,neut)"),
             ("NNPS", "N(eigen,mv,neut)"),
             ("NN",   "N(soort,ev,neut)"),
             ("NNS",  "N(soort,mv,neut)"),
             ("VBZ",  "V(refl,ott,3,ev)"),
             ("VBP",  "V(intrans,ott,1_of_2_of_3,mv)"),
             ("VBD",  "V(trans,ovt,1_of_2_of_3,mv)"),
             ("VBN",  "V(trans,verl_dw,onverv)"),
             ("VBG",  "V(intrans,teg_dw,onverv)"),
             ("VB",   "V(intrans,inf)"),
             ("MD",   "V(hulp_of_kopp,ott,3,ev)"),
             ("JJ",   "Adj(attr,stell,onverv)"),
             ("JJR",  "Adj(adv,vergr,onverv)"),
             ("JJS",  "Adj(attr,overtr,verv_neut)"),
             ("RP",   "Adv(deel_v)"),
             ("RB",   "Adv(gew,geen_func,stell,onverv)"),
             ("DT",   "Art(bep,zijd_of_mv,neut)"),
             ("CC",   "Conj(neven)"),
             ("CD",   "Num(hoofd,bep,zelfst,onverv)"),
             ("TO",   "Prep(voor_inf)"),
             ("IN",   "Prep(voor)"),
             ("PRP",  "Pron(onbep,neut,attr)"),
             ("PRP$", "Pron(bez,2,ev,neut,attr)"),
             (",",    "Punc(komma)"),
             ("(",    "Punc(haak_open)"),
             (")",    "Punc(haak_sluit)"),
             (".",    "Punc(punt)"),
             ("UH",   "Int"),
             ("SYM",  "Misc(symbool)")):
         self.assertEqual(nl.wotan2penntreebank("", wotan)[1], penntreebank)
     # Parenthesised single-argument form: prints the same text on Python 2
     # and is valid syntax on Python 3 (the bare print statement is not).
     print("pattern.nl.wotan2penntreebank()")