def test_unigram_lemmatizer(self):
    """Test unigram_lemmatizer()"""
    train = [[('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]]  # pylint: disable=line-too-long
    lemmatizer = UnigramLemmatizer(train=train)
    test_str = """Ceterum antequam destinata componam"""
    target = [('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]  # pylint: disable=line-too-long
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
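# The JVReplacer step normalizes Latin orthography (j -> i, v -> u) so that
# surface forms match the spellings the training data uses; the test string
# above happens to contain neither letter, but the call mirrors the required
# preprocessing. A minimal stand-in for the idea (an illustration, not
# CLTK's implementation):
def jv_normalize_sketch(text: str) -> str:
    """Map lowercase j -> i and v -> u before dictionary lookup."""
    return text.replace('j', 'i').replace('v', 'u')

assert jv_normalize_sketch('iulius vivat') == 'iulius uiuat'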
def _define_lemmatizer(self: object):
    # Suggested backoff chain--should be tested for optimal order
    self.backoff0 = None
    self.backoff1 = IdentityLemmatizer(verbose=self.VERBOSE)
    self.backoff2 = DictLemmatizer(
        lemmas=self.GREEK_OLD_MODEL,
        source="Morpheus Lemmas",
        backoff=self.backoff1,
        verbose=self.VERBOSE,
    )
    self.backoff3 = RegexpLemmatizer(
        self.greek_sub_patterns,
        source="CLTK Greek Regex Patterns",
        backoff=self.backoff2,
        verbose=self.VERBOSE,
    )
    self.backoff4 = UnigramLemmatizer(
        self.train_sents,
        source="CLTK Sentence Training Data",
        backoff=self.backoff3,
        verbose=self.VERBOSE,
    )
    self.backoff5 = DictLemmatizer(
        lemmas=self.GREEK_MODEL,
        source="Greek Model",
        backoff=self.backoff4,
        verbose=self.VERBOSE,
    )
    self.lemmatizer = self.backoff5
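# Hedged usage sketch for the Greek chain above. The wrapper class name and
# import path are assumptions (they have moved between CLTK releases), and
# the Morpheus/model data must already be downloaded:
from cltk.lemmatize.grc import GreekBackoffLemmatizer  # path varies by version

lemmatizer = GreekBackoffLemmatizer()
print(lemmatizer.lemmatize('κατέβην χθὲς εἰς Πειραιᾶ'.split()))
# Lookups start at backoff5 (the Greek Model dict) and fall through
# backoff4 -> backoff3 -> backoff2 on each miss; backoff1
# (IdentityLemmatizer) ends the chain by returning the token itself, so
# every token receives some lemma.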
def _define_lemmatizer(self: object):
    # Suggested backoff chain--should be tested for optimal order
    self.backoff0 = None
    self.backoff1 = IdentityLemmatizer(verbose=self.VERBOSE)
    self.backoff2 = DictLemmatizer(lemmas=self.LATIN_OLD_MODEL, source='Morpheus Lemmas', backoff=self.backoff1, verbose=self.VERBOSE)
    self.backoff3 = RegexpLemmatizer(self.latin_sub_patterns, source='CLTK Latin Regex Patterns', backoff=self.backoff2, verbose=self.VERBOSE)
    self.backoff4 = UnigramLemmatizer(self.train_sents, source='CLTK Sentence Training Data', backoff=self.backoff3, verbose=self.VERBOSE)
    self.backoff5 = DictLemmatizer(lemmas=self.LATIN_MODEL, source='Latin Model', backoff=self.backoff4, verbose=self.VERBOSE)
    self.lemmatizer = self.backoff5
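# Hedged usage sketch for the Latin chain; BackoffLatinLemmatizer is the
# class named in the __repr__ further below, but the import path is an
# assumption that differs across CLTK versions:
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer

lemmatizer = BackoffLatinLemmatizer()
print(lemmatizer.lemmatize(['arma', 'uirum', 'cano']))
# Each token is tried against backoff5 (the Latin Model dict) first, then
# falls through the unigram, regex, and Morpheus layers; IdentityLemmatizer
# guarantees a (token, token) answer when everything else misses.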
def test_unigram_lemmatizer(self):
    """Test unigram_lemmatizer()"""
    train = [
        [
            ("ceterum", "ceterus"),
            ("antequam", "antequam"),
            ("destinata", "destino"),
            ("componam", "compono"),
        ]
    ]  # pylint: disable=line-too-long
    lemmatizer = UnigramLemmatizer(train=train)
    test_str = """Ceterum antequam destinata componam"""
    target = [
        ("ceterum", "ceterus"),
        ("antequam", "antequam"),
        ("destinata", "destino"),
        ("componam", "compono"),
    ]  # pylint: disable=line-too-long
    tokenizer = LatinWordTokenizer()
    test_str = test_str.lower()
    test_str = replace_jv(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
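# Hedged illustration of the unigram model's limits: assuming CLTK's
# UnigramLemmatizer keeps NLTK UnigramTagger semantics, a token absent from
# the training data receives lemma None when no backoff is attached.
unseen_demo = UnigramLemmatizer(train=[[("ceterum", "ceterus")]])
print(unseen_demo.lemmatize(["ceterum", "ignotum"]))
# expected under that assumption: [('ceterum', 'ceterus'), ('ignotum', None)]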
    return lemmas

def evaluate(self: object):
    if self.VERBOSE:
        raise AssertionError('evaluate() only works when verbose=False')
    return self.lemmatizer.evaluate(self.test_sents)

def __repr__(self: object):
    return '<BackoffLatinLemmatizer v0.2>'


if __name__ == '__main__':
    from pprint import pprint
    l1 = DefaultLemmatizer('UNK', verbose=True)
    l2 = DictLemmatizer(lemmas={'arma': 'arma', 'uirum': 'uir'}, backoff=l1, verbose=True)
    l3 = UnigramLemmatizer(train=[[('cano', 'cano'), ('.', 'punc')]], backoff=l2, verbose=True)
    l4 = RegexpLemmatizer(regexps=[('(.)tat(is|i|em|e|es|um|ibus)$', r'\1tas')], backoff=l3, verbose=True)
    lemmas = l4.lemmatize('arma uirum -que cano nobilitatis .'.split())
    pprint(lemmas)
    # Sample verbose output (reprs abbreviated) -- the third element reports
    # which backoff stage resolved each token:
    # [('arma', 'arma', <DictLemmatizer: {'arma': 'arma', ...}>),
    #  ('uirum', 'uir', <DictLemmatizer: {'arma': 'arma', ...}>),
    #  ('-que', 'UNK', <DefaultLemmatizer: UNK>),
    #  ('cano', 'cano', <UnigramLemmatizer: [[('cano', 'cano'), ...]]>),
    #  ('nobilitatis',
    #   'nobilitas',
    #   <RegexpLemmatizer: [('(.)tat(is|i|em|e|es|um|ibus)$', '\\1tas'), ...]>),
    #  ('.', 'punc', <UnigramLemmatizer: [[('cano', 'cano'), ...]]>)]
    print('\n')
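# Hedged sketch of what evaluate() computes: NLTK-style per-token accuracy
# against gold (token, lemma) sentences, which is why it requires
# verbose=False (2-tuples, not 3-tuples). The helper below is illustrative
# only, not the CLTK/NLTK implementation.
def accuracy_sketch(lemmatizer, gold_sents):
    """Fraction of tokens whose predicted lemma matches the gold lemma."""
    gold = [pair for sent in gold_sents for pair in sent]
    predictions = lemmatizer.lemmatize([token for token, _ in gold])
    hits = sum(pred == lemma for (_, lemma), (_, pred) in zip(gold, predictions))
    return hits / len(gold)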