Example No. 1
0
 def test_regex_lemmatizer(self):
     """Test regex_lemmatizer()"""
     patterns = [("(.)ab(o|is|it|imus|itis|unt)$", r"\1o")]
     expected = [("amabimus", "amo")]
     # Normalize the input the same way the lemmatizer pipeline expects:
     # lowercase first, then apply the j/v orthographic normalization.
     normalized = replace_jv("amabimus".lower())
     word_tokens = LatinWordTokenizer().tokenize(normalized)
     self.assertEqual(RegexpLemmatizer(patterns).lemmatize(word_tokens), expected)
Example No. 2
0
 def test_regex_lemmatizer(self):
     """Test regex_lemmatizer()"""
     patterns = [('(.)ab(o|is|it|imus|itis|unt)$', r'\1o')]
     expected = [('amabimus', 'amo')]
     # Lowercase, then normalize j/v spelling before tokenizing.
     normalized = JVReplacer().replace('amabimus'.lower())
     tokens = WordTokenizer('latin').tokenize(normalized)
     self.assertEqual(RegexpLemmatizer(patterns).lemmatize(tokens), expected)
Example No. 3
0
 def test_regex_lemmatizer(self):
     """Test regex_lemmatizer()"""
     regex_subs = [('(.)ab(o|is|it|imus|itis|unt)$', r'\1o')]
     lemmatizer = RegexpLemmatizer(regex_subs)
     tokenizer = WordTokenizer('latin')
     replacer = JVReplacer()
     # The pipeline lowercases and j/v-normalizes the raw string first.
     prepared = replacer.replace('amabimus'.lower())
     result = lemmatizer.lemmatize(tokenizer.tokenize(prepared))
     self.assertEqual(result, [('amabimus', 'amo')])
Example No. 4
0
    def evaluate(self: object):
        """Run the backing lemmatizer's evaluation over ``self.test_sents``.

        Returns:
            Whatever ``self.lemmatizer.evaluate`` returns for the held-out
            test sentences.

        Raises:
            AssertionError: if this instance was created with verbose output
                enabled (``self.VERBOSE`` truthy).
        """
        # Evaluation is only supported for the non-verbose configuration.
        if not self.VERBOSE:
            return self.lemmatizer.evaluate(self.test_sents)
        raise AssertionError("evaluate() method only works when verbose: bool = False")

    def __repr__(self: object):
        return f'<BackoffLatinLemmatizer v0.2>'

if __name__ == '__main__':

    from pprint import pprint
    # Demo: build a backoff chain of lemmatizers. Each lemmatizer tries to
    # resolve a token and delegates to its `backoff` when it cannot; l1 is
    # the final fallback that tags every remaining token 'UNK'.
    l1 = DefaultLemmatizer('UNK', verbose=True)
    l2 = DictLemmatizer(lemmas={'arma': 'arma', 'uirum': 'uir'}, backoff=l1, verbose=True)
    l3 = UnigramLemmatizer(train=[[('cano', 'cano'), ('.', 'punc')],], backoff=l2, verbose=True)
    l4 = RegexpLemmatizer(regexps=[('(.)tat(is|i|em|e|es|um|ibus)$', r'\1tas'),], backoff=l3, verbose=True)
    lemmas = l4.lemmatize('arma uirum -que cano nobilitatis .'.split())
    pprint(lemmas)

    # Expected verbose output: (token, lemma, lemmatizer-that-resolved-it).
    # [('arma', 'arma', <UnigramLemmatizer: [[('res', 'res'), ...], ...]>),
    # ('uirum', 'uir', <UnigramLemmatizer: [[('res', 'res'), ...], ...]>),
    # ('-que', '-que', <DictLemmatizer: {'!': 'punc', ...}>),
    # ('cano', 'cano', <DictLemmatizer: {'-nam': 'nam', ...}>),
    # ('nobilitatis',
    # 'nobilitas',
    # <RegexpLemmatizer: [('(bil)(is|i|e...es|ium|ibus)$', '\\1is'), ...]>),
    # ('.', 'punc', <DictLemmatizer: {'!': 'punc', ...}>)]

    print('\n')

    # Same sentence through the pre-built, non-verbose backoff lemmatizer.
    bll = BackoffLatinLemmatizer(seed=5, verbose=False)
    # NOTE(review): `lemmas` is assigned but never used below — the file
    # appears truncated here; a trailing pprint(lemmas) likely follows.
    lemmas = bll.lemmatize('arma uirum -que cano nobilitatis .'.split())