def test_model_lemmatizer(self):
    """Test model_lemmatizer()"""
    model = {'ceterum': 'ceterus', 'antequam': 'antequam',
             'destinata': 'destino', 'componam': 'compono'}
    lemmatizer = TrainLemmatizer(model=model)
    test_str = 'Ceterum antequam destinata componam'
    target = [('ceterum', 'ceterus'), ('antequam', 'antequam'),
              ('destinata', 'destino'), ('componam', 'compono')]
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
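# For reference, a minimal sketch of what the dictionary-backed lemmatizer does
# with a token missing from its model. This assumes, as with the NLTK
# SequentialBackoffTagger machinery these lemmatizers build on, that an
# unmatched token comes back with lemma None when no backoff is attached;
# the token 'roma' is a hypothetical example, not taken from the test above.
toy = TrainLemmatizer(model={'ceterum': 'ceterus'})
print(toy.lemmatize(['ceterum', 'roma']))
# expected: [('ceterum', 'ceterus'), ('roma', None)]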
# Assumed imports for this snippet (exact paths vary by CLTK version):
#   import csv, os
#   from cltk.tokenize.word import WordTokenizer
#   from cltk.utils.file_operations import open_pickle
numberOfFails = 0
numberOfSuccesses = 0
preprocessedTitle = ""  # a temp string where we store the ongoing preprocessing work on a title
successfulHits = []
word_tokenizer = WordTokenizer('latin')

# Build the standard dictionary/model.
# Courtesy of Patrick Burns.
rel_path = os.path.join('~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
path = os.path.expanduser(rel_path)
file = 'latin_lemmata_cltk.pickle'
old_model_path = os.path.join(path, file)
LATIN_OLD_MODEL = open_pickle(old_model_path)

# Make the standard lemmatizer as an instance of TrainLemmatizer.
# Courtesy of Patrick Burns.
# `default` is assumed to be a fallback lemmatizer defined earlier in the script.
lemmatizer = TrainLemmatizer(model=LATIN_OLD_MODEL, backoff=default)

# Import the custom-dictionary CSV as a Python dictionary.
customDictionary = {}
customDictionaryCurrentLength = 0
cwd = os.getcwd()  # `cwd` was not defined in the excerpt; assuming the current working directory
customDictionaryPath = os.path.join(cwd, 'customDictionary.csv')
with open(customDictionaryPath, 'r') as f:  # the with block closes the file when it exits
    reader = csv.DictReader(f)
    for row in reader:
        customDictionaryCurrentLength += 1
        if row['lemma'] == "":
            continue  # a token may be in the custom dictionary before its lemma has been provided
        customDictionary[row['token']] = row['lemma']

# Make the custom lemmatizer, backing off to the standard one.
lemmatizer2 = TrainLemmatizer(model=customDictionary, backoff=lemmatizer)
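# A quick usage sketch of the resulting two-tier chain (continues the snippet
# above; the input string is a hypothetical example). lemmatizer2 consults
# customDictionary first; any token it cannot resolve falls through to the
# standard CLTK model, and from there to the `default` backoff.
tokens = word_tokenizer.tokenize('arma uirumque cano')
print(lemmatizer2.lemmatize(tokens))  # a list of (token, lemma) pairs, as in the test above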
    'sed', 'si', 'sic', 'sive', 'sub', 'sui', 'sum', 'super', 'suus',
    'tam', 'tamen', 'trans', 'tu', 'tum', 'ubi', 'uel', 'uero', 'unus',
    'ut', 'qvi', 'qve', 'qvam', 'qvae'
]

# Walk over the files; because the texts are large, they had to be split
# into several groups (this pass takes files 5 through 8 of the directory).
a = []
for elem in os.listdir()[5:9]:
    with open(elem, 'r', encoding='UTF-8') as t:
        for line in t.readlines():
            for word in line.split():
                a.append(word)
print(a)
line = ' '.join(a)

# Import the Latin corpus; this downloads the models that LemmaReplacer needs.
corpus_importer = CorpusImporter('latin')
corpus_importer.import_corpus('latin_models_cltk')

# Lemmatize all the texts and drop the stop words.
sentence = line.lower()
lemmatizer = LemmaReplacer('latin')
l = lemmatizer.lemmatize(sentence)
line1 = ' '.join(l)
t = []
for i in line1.split():
    if i not in stop:
        t.append(i)
t = ' '.join(t)

# Split the words into pairs (bigrams).
bigrm = list(nltk.bigrams(t.split()))
print(bigrm)
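# The bigram list is typically fed into a frequency count next; a small sketch
# using NLTK's FreqDist, which is not part of the original script:
freq = nltk.FreqDist(bigrm)
print(freq.most_common(10))  # the ten most frequent (word, word) pairs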