Ejemplo n.º 1
0
 def test_regex_lemmatizer(self):
     """Check that RegexpLemmatizer maps 'amabimus' to 'amo' with one substitution rule."""
     rules = [("(.)ab(o|is|it|imus|itis|unt)$", r"\1o")]
     regex_lemmatizer = RegexpLemmatizer(rules)
     word_tokenizer = LatinWordTokenizer()
     # Lowercase the input and run it through replace_jv() before tokenizing.
     normalized = replace_jv("amabimus".lower())
     result = regex_lemmatizer.lemmatize(word_tokenizer.tokenize(normalized))
     expected = [("amabimus", "amo")]
     self.assertEqual(result, expected)
Ejemplo n.º 2
0
 def test_regex_lemmatizer(self):
     """Check that RegexpLemmatizer maps 'amabimus' to 'amo' with one substitution rule."""
     rules = [('(.)ab(o|is|it|imus|itis|unt)$', r'\1o')]
     regex_lemmatizer = RegexpLemmatizer(rules)
     replacer = JVReplacer()
     word_tokenizer = WordTokenizer('latin')
     # Lowercase the input and pass it through the JV replacer before tokenizing.
     normalized = replacer.replace('amabimus'.lower())
     result = regex_lemmatizer.lemmatize(word_tokenizer.tokenize(normalized))
     expected = [('amabimus', 'amo')]
     self.assertEqual(result, expected)
Ejemplo n.º 3
0
 def __init__(self: object, default: str = None, backoff: object = None):
     """
     Initialise the lemmatizer with two Roman-numeral patterns.

     One pattern covers upper-case numerals and one covers lower-case
     numerals; both character classes admit U alongside V. Each pattern
     is paired with the replacement 'NUM'.

     :type default: str
     :param default: Default replacement for lemma; 'NUM' in given pattern.
         Stored unchanged on the instance as ``self.default``.
     :type backoff: object
     :param backoff: Forwarded to ``RegexpLemmatizer.__init__``.
     """
     upper_numeral = r'(?=^[MDCLXVUI]+$)(?=^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|IU|V?I{0,3}|U?I{0,3})$)'
     lower_numeral = r'(?=^[mdclxvui]+$)(?=^m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|iu|v?i{0,3}|u?i{0,3})$)'
     numeral_patterns = [
         (upper_numeral, 'NUM'),
         (lower_numeral, 'NUM'),
     ]
     RegexpLemmatizer.__init__(self, numeral_patterns, backoff)
     # Keep a compiled copy of every pattern next to its replacement.
     compiled = []
     for expression, replacement in numeral_patterns:
         compiled.append((re.compile(expression), replacement))
     self._regexs = compiled
     self.default = default
Ejemplo n.º 4
0
 def test_regex_lemmatizer(self):
     """Lemmatize 'amabimus' with a single regex rule and expect the lemma 'amo'."""
     substitutions = [('(.)ab(o|is|it|imus|itis|unt)$', r'\1o')]
     regex_lemmatizer = RegexpLemmatizer(substitutions)
     replacer = JVReplacer()
     word_tokenizer = WordTokenizer('latin')
     # Normalise first: lowercase, then JV replacement, then tokenize.
     lowered = 'amabimus'.lower()
     word_tokens = word_tokenizer.tokenize(replacer.replace(lowered))
     self.assertEqual(
         regex_lemmatizer.lemmatize(word_tokens),
         [('amabimus', 'amo')],
     )
Ejemplo n.º 5
0
 def _define_lemmatizer(self: object):
     """Build the chain of backoff lemmatizers and expose its head as ``self.lemmatizer``.

     Each stage is stored on the instance as ``self.backoffN`` and is handed
     the previously built stage as its ``backoff``.
     """
     # Suggested backoff chain--should be tested for optimal order
     verbose = self.VERBOSE
     self.backoff0 = None

     # End of the chain: constructed without a backoff of its own.
     identity_stage = IdentityLemmatizer(verbose=verbose)
     self.backoff1 = identity_stage

     old_model_stage = DictLemmatizer(lemmas=self.GREEK_OLD_MODEL, source="Morpheus Lemmas", backoff=identity_stage, verbose=verbose)
     self.backoff2 = old_model_stage

     regex_stage = RegexpLemmatizer(self.greek_sub_patterns, source="CLTK Greek Regex Patterns", backoff=old_model_stage, verbose=verbose)
     self.backoff3 = regex_stage

     unigram_stage = UnigramLemmatizer(self.train_sents, source="CLTK Sentence Training Data", backoff=regex_stage, verbose=verbose)
     self.backoff4 = unigram_stage

     model_stage = DictLemmatizer(lemmas=self.GREEK_MODEL, source="Greek Model", backoff=unigram_stage, verbose=verbose)
     self.backoff5 = model_stage

     # The most recently built stage is the entry point of the chain.
     self.lemmatizer = model_stage
Ejemplo n.º 6
0
 def _define_lemmatizer(self: object):
     """Build the chain of backoff lemmatizers and expose its head as ``self.lemmatizer``.

     Each stage is stored on the instance as ``self.backoffN`` and is handed
     the previously built stage as its ``backoff``.
     """
     # Suggested backoff chain--should be tested for optimal order
     verbose = self.VERBOSE
     self.backoff0 = None
     # End of the chain: constructed without a backoff of its own.
     self.backoff1 = IdentityLemmatizer(verbose=verbose)
     self.backoff2 = DictLemmatizer(
         lemmas=self.LATIN_OLD_MODEL,
         source='Morpheus Lemmas',
         backoff=self.backoff1,
         verbose=verbose,
     )
     self.backoff3 = RegexpLemmatizer(
         self.latin_sub_patterns,
         source='CLTK Latin Regex Patterns',
         backoff=self.backoff2,
         verbose=verbose,
     )
     self.backoff4 = UnigramLemmatizer(
         self.train_sents,
         source='CLTK Sentence Training Data',
         backoff=self.backoff3,
         verbose=verbose,
     )
     self.backoff5 = DictLemmatizer(
         lemmas=self.LATIN_MODEL,
         source='Latin Model',
         backoff=self.backoff4,
         verbose=verbose,
     )
     # The most recently built stage is the entry point of the chain.
     self.lemmatizer = self.backoff5
Ejemplo n.º 7
0
    def evaluate(self: object):
        """Evaluate the configured lemmatizer against ``self.test_sents``.

        :return: whatever ``self.lemmatizer.evaluate(self.test_sents)`` returns
        :raises AssertionError: if ``self.VERBOSE`` is truthy
        """
        if not self.VERBOSE:
            return self.lemmatizer.evaluate(self.test_sents)
        raise AssertionError("evaluate() method only works when verbose: bool = False")

    def __repr__(self: object):
        """Return the fixed identifier string for this lemmatizer.

        :return: the literal ``'<BackoffLatinLemmatizer v0.2>'``
        """
        # Plain literal: the text has no placeholders, so an f-string is unnecessary.
        return '<BackoffLatinLemmatizer v0.2>'

if __name__ == '__main__':

    # Demo: build a four-stage backoff chain by hand and lemmatize a short token list.
    from pprint import pprint
    # l1 is the end of the chain (constructed with 'UNK' and no backoff).
    l1 = DefaultLemmatizer('UNK', verbose=True)
    # l2 looks tokens up in a two-entry dictionary and backs off to l1.
    l2 = DictLemmatizer(lemmas={'arma': 'arma', 'uirum': 'uir'}, backoff=l1, verbose=True)
    # l3 is given a single training sentence and backs off to l2.
    l3 = UnigramLemmatizer(train=[[('cano', 'cano'), ('.', 'punc')],], backoff=l2, verbose=True)
    # l4 is the head of the chain: one regex substitution rule, backing off to l3.
    l4 = RegexpLemmatizer(regexps=[('(.)tat(is|i|em|e|es|um|ibus)$', r'\1tas'),], backoff=l3, verbose=True)
    lemmas = l4.lemmatize('arma uirum -que cano nobilitatis .'.split())
    pprint(lemmas)

    # NOTE(review): the sample output below references lemmatizers built from
    # data that is not constructed in the l1-l4 chain above (e.g. a
    # UnigramLemmatizer trained on ('res', 'res')), so it appears to come from
    # a different configuration -- confirm before relying on it.
    # [('arma', 'arma', <UnigramLemmatizer: [[('res', 'res'), ...], ...]>),
    # ('uirum', 'uir', <UnigramLemmatizer: [[('res', 'res'), ...], ...]>),
    # ('-que', '-que', <DictLemmatizer: {'!': 'punc', ...}>),
    # ('cano', 'cano', <DictLemmatizer: {'-nam': 'nam', ...}>),
    # ('nobilitatis',
    # 'nobilitas',
    # <RegexpLemmatizer: [('(bil)(is|i|e...es|ium|ibus)$', '\\1is'), ...]>),
    # ('.', 'punc', <DictLemmatizer: {'!': 'punc', ...}>)]

    print('\n')

    # Instantiate the packaged backoff lemmatizer with a fixed seed and verbose output off.
    bll = BackoffLatinLemmatizer(seed=5, verbose=False)