Example #1
import unittest

# Standalone version of the CLTK test; import paths follow older CLTK (0.1.x)
# releases and may differ in newer ones.
from cltk.lemmatize.latin.backoff import TrainLemmatizer
from cltk.stem.latin.j_v import JVReplacer
from cltk.tokenize.word import WordTokenizer


class TestLemmatize(unittest.TestCase):

    def test_model_lemmatizer(self):
        """Test model_lemmatizer()"""
        model = {'ceterum': 'ceterus', 'antequam': 'antequam', 'destinata': 'destino', 'componam': 'compono'}  # pylint: disable=line-too-long
        lemmatizer = TrainLemmatizer(model=model)
        test_str = 'Ceterum antequam destinata componam'
        target = [('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]  # pylint: disable=line-too-long
        jv_replacer = JVReplacer()
        tokenizer = WordTokenizer('latin')
        # Normalize case and j/v spelling before tokenizing, then lemmatize the tokens.
        test_str = test_str.lower()
        test_str = jv_replacer.replace(test_str)
        tokens = tokenizer.tokenize(test_str)
        lemmas = lemmatizer.lemmatize(tokens)
        self.assertEqual(lemmas, target)
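# A minimal follow-up sketch (not part of the CLTK test above): lemmatize() returns
# (token, lemma) pairs, as the `target` list shows, so plain lemma strings can be
# pulled out with a simple comprehension.
pairs = [('ceterum', 'ceterus'), ('antequam', 'antequam'),
         ('destinata', 'destino'), ('componam', 'compono')]
lemmas_only = [lemma for _token, lemma in pairs]
print(lemmas_only)  # ['ceterus', 'antequam', 'destino', 'compono']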
Example #2
import csv
import os

# Import paths follow older CLTK (0.1.x) releases and may differ in newer ones.
from cltk.lemmatize.latin.backoff import DefaultLemmatizer, TrainLemmatizer
from cltk.tokenize.word import WordTokenizer
from cltk.utils.file_operations import open_pickle

numberOfFails = 0
numberOfSuccesses = 0
preprocessedTitle = ""  # a temp string where we store the ongoing preprocessing work on a title
successfulHits = []
word_tokenizer = WordTokenizer('latin')

# Build the standard dictionary/model (courtesy of Patrick Burns).
rel_path = os.path.join(
    '~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
path = os.path.expanduser(rel_path)
file = 'latin_lemmata_cltk.pickle'
old_model_path = os.path.join(path, file)
LATIN_OLD_MODEL = open_pickle(old_model_path)

# Make the standard lemmatizer as an instance of TrainLemmatizer (courtesy of Patrick Burns).
# The final backoff is assumed here to be a DefaultLemmatizer; the original snippet
# defines `default` elsewhere.
default = DefaultLemmatizer()
lemmatizer = TrainLemmatizer(model=LATIN_OLD_MODEL, backoff=default)

# Import the custom dictionary CSV as a Python dictionary.
cwd = os.getcwd()  # assumed; the original snippet defines `cwd` elsewhere
customDictionary = {}
customDictionaryCurrentLength = 0
customDictionaryPath = os.path.join(cwd, 'customDictionary.csv')
with open(customDictionaryPath, 'r') as f:  # the with block closes the file automatically
    reader = csv.DictReader(f)
    for row in reader:
        customDictionaryCurrentLength += 1
        if row['lemma'] == "":
            continue  # skip tokens that are in the custom dictionary but do not yet have a lemma
        customDictionary[row['token']] = row['lemma']

# Make the custom lemmatizer, falling back to the standard one for unknown tokens.
lemmatizer2 = TrainLemmatizer(model=customDictionary, backoff=lemmatizer)
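# A hedged usage sketch for the chained lemmatizer built above: tokens found in
# customDictionary get its lemma, anything else falls back to the TrainLemmatizer
# built from LATIN_OLD_MODEL. The actual output depends on both dictionaries.
sample_tokens = word_tokenizer.tokenize('ceterum antequam destinata componam')
for token, lemma in lemmatizer2.lemmatize(sample_tokens):
    print(token, lemma)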
Example #3
import os

import nltk
# Import paths follow older CLTK (0.1.x) releases and may differ in newer ones.
from cltk.corpus.utils.importer import CorpusImporter
from cltk.stem.lemma import LemmaReplacer

# Only the tail of the stop-word list is shown in the original snippet.
stop = [
    'sed', 'si', 'sic', 'sive', 'sub', 'sui', 'sum', 'super', 'suus', 'tam',
    'tamen', 'trans', 'tu', 'tum', 'ubi', 'uel', 'uero', 'unus', 'ut', 'qvi',
    'qve', 'qvam', 'qvae', 'qvam'
]
# Below we walk over the files; because the texts are large, they had to be split into several groups.
a = []
for elem in os.listdir()[5:9]:
    with open(elem, 'r', encoding='UTF-8') as t:
        for line in t.readlines():
            for word in line.split():
                a.append(word)
print(a)
line = ' '.join(a)
# Import the Latin corpus (this downloads the models used by LemmaReplacer below).
corpus_importer = CorpusImporter('latin')
corpus_importer.import_corpus('latin_models_cltk')
# Lemmatize all the texts and remove the stop words.
sentence = line.lower()
lemmatizer = LemmaReplacer('latin')
lemmas = lemmatizer.lemmatize(sentence)
line1 = ' '.join(lemmas)
t = []
for i in line1.split():
    if i in stop:
        continue
    else:
        t.append(i)
t = ' '.join(t)
# Split the words into pairs (bigrams).
bigrm = list(nltk.bigrams(t.split()))
print(bigrm)
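# A small follow-up sketch, continuing the code above: rank the bigrams by frequency
# with NLTK's FreqDist to see which word pairs occur most often.
freq = nltk.FreqDist(bigrm)
print(freq.most_common(10))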