Example #1
0
 def test_unigram_lemmatizer(self):
     """Test unigram_lemmatizer()

     Builds a UnigramLemmatizer from one training sentence of four
     (token, lemma) pairs, then asserts that lemmatizing the lowercased,
     JVReplacer-processed, tokenized sentence yields exactly those pairs.
     """
     expected = [
         ('ceterum', 'ceterus'),
         ('antequam', 'antequam'),
         ('destinata', 'destino'),
         ('componam', 'compono'),
     ]
     # Training data is a list of sentences; here a single sentence that
     # holds a copy of the expected pairs.
     lemmatizer = UnigramLemmatizer(train=[list(expected)])
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     # Lowercase first, then pass the text through the replacer.
     normalized = jv_replacer.replace(
         """Ceterum antequam destinata componam""".lower()
     )
     self.assertEqual(
         lemmatizer.lemmatize(tokenizer.tokenize(normalized)),
         expected,
     )
Example #2
0
 def test_unigram_lemmatizer(self):
     """Test unigram_lemmatizer()

     Builds a UnigramLemmatizer from one training sentence of four
     (token, lemma) pairs, then asserts that lemmatizing the lowercased,
     JVReplacer-processed, tokenized sentence yields exactly those pairs.
     """
     expected = [
         ('ceterum', 'ceterus'),
         ('antequam', 'antequam'),
         ('destinata', 'destino'),
         ('componam', 'compono'),
     ]
     # Training data is a list of sentences; here a single sentence that
     # holds a copy of the expected pairs.
     lemmatizer = UnigramLemmatizer(train=[list(expected)])
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     # Lowercase first, then pass the text through the replacer.
     normalized = jv_replacer.replace(
         """Ceterum antequam destinata componam""".lower()
     )
     self.assertEqual(
         lemmatizer.lemmatize(tokenizer.tokenize(normalized)),
         expected,
     )
Example #3
0
 def _define_lemmatizer(self: object):
     """Assemble the Greek lemmatizer chain and store it on the instance.

     Every stage is constructed with the previously built stage as its
     ``backoff`` argument and is kept as ``self.backoff1`` ..
     ``self.backoff5``. The last stage built (a DictLemmatizer over
     ``self.GREEK_MODEL``) is also stored as ``self.lemmatizer``.
     """
     # Suggested backoff chain--should be tested for optimal order
     self.backoff0 = None
     verbose = self.VERBOSE

     # ``chain`` always refers to the most recently built stage.
     chain = IdentityLemmatizer(verbose=verbose)
     self.backoff1 = chain

     chain = DictLemmatizer(
         lemmas=self.GREEK_OLD_MODEL,
         source="Morpheus Lemmas",
         backoff=chain,
         verbose=verbose,
     )
     self.backoff2 = chain

     chain = RegexpLemmatizer(
         self.greek_sub_patterns,
         source="CLTK Greek Regex Patterns",
         backoff=chain,
         verbose=verbose,
     )
     self.backoff3 = chain

     chain = UnigramLemmatizer(
         self.train_sents,
         source="CLTK Sentence Training Data",
         backoff=chain,
         verbose=verbose,
     )
     self.backoff4 = chain

     chain = DictLemmatizer(
         lemmas=self.GREEK_MODEL,
         source="Greek Model",
         backoff=chain,
         verbose=verbose,
     )
     self.backoff5 = chain

     self.lemmatizer = chain
Example #4
0
 def _define_lemmatizer(self: object):
     """Assemble the Latin lemmatizer chain and store it on the instance.

     Every stage is constructed with the previously built stage as its
     ``backoff`` argument and is kept as ``self.backoff1`` ..
     ``self.backoff5``. The last stage built (a DictLemmatizer over
     ``self.LATIN_MODEL``) is also stored as ``self.lemmatizer``.
     """
     # Suggested backoff chain--should be tested for optimal order
     self.backoff0 = None
     verbose = self.VERBOSE

     # ``chain`` always refers to the most recently built stage.
     chain = IdentityLemmatizer(verbose=verbose)
     self.backoff1 = chain

     chain = DictLemmatizer(
         lemmas=self.LATIN_OLD_MODEL,
         source='Morpheus Lemmas',
         backoff=chain,
         verbose=verbose,
     )
     self.backoff2 = chain

     chain = RegexpLemmatizer(
         self.latin_sub_patterns,
         source='CLTK Latin Regex Patterns',
         backoff=chain,
         verbose=verbose,
     )
     self.backoff3 = chain

     chain = UnigramLemmatizer(
         self.train_sents,
         source='CLTK Sentence Training Data',
         backoff=chain,
         verbose=verbose,
     )
     self.backoff4 = chain

     chain = DictLemmatizer(
         lemmas=self.LATIN_MODEL,
         source='Latin Model',
         backoff=chain,
         verbose=verbose,
     )
     self.backoff5 = chain

     self.lemmatizer = chain
Example #5
0
 def test_unigram_lemmatizer(self):
     """Test unigram_lemmatizer()

     Builds a UnigramLemmatizer from one training sentence of four
     (token, lemma) pairs, then asserts that lemmatizing the lowercased,
     ``replace_jv``-processed, tokenized sentence yields exactly those
     pairs.
     """
     expected = [
         ("ceterum", "ceterus"),
         ("antequam", "antequam"),
         ("destinata", "destino"),
         ("componam", "compono"),
     ]
     # Training data is a list of sentences; here a single sentence that
     # holds a copy of the expected pairs.
     lemmatizer = UnigramLemmatizer(train=[list(expected)])
     tokenizer = LatinWordTokenizer()
     # Lowercase first, then pass the text through replace_jv.
     normalized = replace_jv("""Ceterum antequam destinata componam""".lower())
     self.assertEqual(
         lemmatizer.lemmatize(tokenizer.tokenize(normalized)),
         expected,
     )
Example #6
0
        return lemmas

    def evaluate(self: object):
        """Evaluate the configured lemmatizer on ``self.test_sents``.

        Returns:
            Whatever ``self.lemmatizer.evaluate(self.test_sents)`` returns.

        Raises:
            AssertionError: if ``self.VERBOSE`` is truthy; evaluation is
                only supported with verbose output switched off.
        """
        if not self.VERBOSE:
            return self.lemmatizer.evaluate(self.test_sents)
        raise AssertionError("evaluate() method only works when verbose: bool = False")

    def __repr__(self: object):
        """Return the fixed identification string for this lemmatizer."""
        # Plain literal: the text has no placeholders, so no f-string is needed.
        return '<BackoffLatinLemmatizer v0.2>'

if __name__ == '__main__':

    from pprint import pprint

    # Demo: build a four-stage chain by hand. Each lemmatizer receives the
    # previously built one as its ``backoff``; the last one (l4) is queried.
    l1 = DefaultLemmatizer('UNK', verbose=True)
    l2 = DictLemmatizer(
        lemmas={'arma': 'arma', 'uirum': 'uir'},
        backoff=l1,
        verbose=True,
    )
    l3 = UnigramLemmatizer(
        train=[[('cano', 'cano'), ('.', 'punc')]],
        backoff=l2,
        verbose=True,
    )
    l4 = RegexpLemmatizer(
        regexps=[('(.)tat(is|i|em|e|es|um|ibus)$', r'\1tas')],
        backoff=l3,
        verbose=True,
    )
    # Same tokens as 'arma uirum -que cano nobilitatis .'.split()
    lemmas = l4.lemmatize(['arma', 'uirum', '-que', 'cano', 'nobilitatis', '.'])
    pprint(lemmas)

    # NOTE(review): the sample output below mentions training data and
    # dictionaries (e.g. ('res', 'res'), {'!': 'punc', ...}) that are not part
    # of l1-l4 above -- presumably pasted from a run with a different chain;
    # confirm before relying on it.
    # [('arma', 'arma', <UnigramLemmatizer: [[('res', 'res'), ...], ...]>),
    # ('uirum', 'uir', <UnigramLemmatizer: [[('res', 'res'), ...], ...]>),
    # ('-que', '-que', <DictLemmatizer: {'!': 'punc', ...}>),
    # ('cano', 'cano', <DictLemmatizer: {'-nam': 'nam', ...}>),
    # ('nobilitatis',
    # 'nobilitas',
    # <RegexpLemmatizer: [('(bil)(is|i|e...es|ium|ibus)$', '\\1is'), ...]>),
    # ('.', 'punc', <DictLemmatizer: {'!': 'punc', ...}>)]

    print('\n')