from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
from nltk.tokenize.punkt import PunktLanguageVars


class LemmatizerLatin:
    def __init__(self, token=True):
        self.lemmatizer = BackoffLatinLemmatizer()
        self.token = token

    def preprocess(self, text):
        if self.token:
            # Input is already a list of tokens
            lemma = self.lemmatizer.lemmatize(text)
        else:
            # Input is a raw string: tokenize it first
            plv = PunktLanguageVars()
            unigrams = plv.word_tokenize(text)
            lemma = self.lemmatizer.lemmatize(unigrams)
        # Keep the original form for punctuation tokens, the lemma otherwise
        lemma = [t[0] if t[1] == "punc" else t[1] for t in lemma]
        return " ".join(lemma)
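A minimal usage sketch for the class above; the sample sentence and the two calls are illustrative, not part of the original snippet, and assume cltk's latin_models_cltk corpus has already been downloaded so BackoffLatinLemmatizer can load its data.

# Hypothetical usage of LemmatizerLatin defined above
preprocessor = LemmatizerLatin(token=False)  # raw-string mode: tokenize first
print(preprocessor.preprocess("Gallia est omnis divisa in partes tres"))

pretokenized = ["arma", "virumque", "cano"]
print(LemmatizerLatin(token=True).preprocess(pretokenized))  # pre-tokenized mode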
def test_backoff_latin_lemmatizer(self):
    """Test backoffLatinLemmatizer"""
    train = [[('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]]  # pylint: disable=line-too-long
    lemmatizer = BackoffLatinLemmatizer(train=train)
    test_str = """Ceterum antequam destinata componam"""
    target = [('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]  # pylint: disable=line-too-long
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
def test_backoff_latin_lemmatizer_verbose(self):
    """Test BackoffLatinLemmatizer with verbose=True"""
    train = [[('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]]  # pylint: disable=line-too-long
    lemmatizer = BackoffLatinLemmatizer(verbose=True)
    test_str = """Ceterum antequam destinata componam"""
    target = [('ceterum', 'ceterum', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('antequam', 'antequam', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('destinata', 'destino', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('componam', 'compono', '<DictLemmatizer: Morpheus Lemmas>')]  # pylint: disable=line-too-long
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
def latin_lemma_text(list_of_texts, stopwords=None):
    '''
    Create a list of continuous lemma texts for Latin with cltk (prerequisite).

    list_of_texts: raw text items stored in a list object
    stopwords: list of stopwords to be removed; defaults to None, in which case nothing is removed

    The Latin lemmatizer is cltk's BackoffLatinLemmatizer. Install cltk and download its
    Latin models before using the function.
    '''
    # Import packages and models from cltk and initialize tools
    import re
    from cltk.corpus.utils.importer import CorpusImporter
    from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer

    corpus_importer = CorpusImporter('latin')  # Initialize cltk's CorpusImporter
    corpus_importer.import_corpus('latin_models_cltk')  # Import the latin_models_cltk corpus
    lemmatizer = BackoffLatinLemmatizer()  # Initialize the Latin lemmatizer

    punctuation = r"[\"#$%&\'()*+,-/:;<=>@[\]^_`{|}~.?!«»]"  # Punctuation pattern
    a = []
    for i in range(len(list_of_texts)):
        text = str(list_of_texts[i])
        new_text = ''.join(["" if ord(c) < 32 or ord(c) > 126 else c
                            for c in text])  # Remove Greek (non-ASCII) characters
        text_no_punct = re.sub(punctuation, '', new_text)  # Remove punctuation
        text_one_white_space = re.sub(r"\s{2,}", ' ', text_no_punct)  # Leave only one white space between words
        text_no_trailing_space = text_one_white_space.strip()  # Remove leading/trailing white space
        text_lower = text_no_trailing_space.lower()  # Transform to all lower case
        text_split = text_lower.split(' ')  # Split into a list of tokens
        lemmas = lemmatizer.lemmatize(text_split)  # Lemmatize
        textunit = ''  # Empty string for the text unit
        for y in range(len(lemmas)):
            if stopwords is not None:
                if lemmas[y][1] not in stopwords:
                    textunit = textunit + str(lemmas[y][1] + ' ')
            else:
                textunit = textunit + str(lemmas[y][1] + ' ')
        textunit = textunit.strip()
        a.append(textunit)  # Add the "document" to the result list
    return a
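A hypothetical call to latin_lemma_text; the sample texts and stopword list below are illustrative and not taken from the original code.

raw_texts = [
    "Arma virumque cano, Troiae qui primus ab oris",
    "Gallia est omnis divisa in partes tres",
]
my_stopwords = ["et", "in", "ab", "qui", "sum"]

lemma_docs = latin_lemma_text(raw_texts, stopwords=my_stopwords)
for doc in lemma_docs:
    print(doc)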
def tokenize(line):
    # Strip markup tags, punctuation, dashes, and typographic quotes, then lower-case and split
    line = re.sub('<[^<]+>', '', line)
    line = line.translate(str.maketrans('', '', string.punctuation))
    line = line.replace('\n', '')
    line = line.replace('-', '')
    line = line.replace('—', '')
    line = line.replace('“', '')
    line = line.replace('”', '')
    line = line.lower()
    return line.split()


with open(args.document, 'r', encoding='utf8') as f:
    for line in f:
        if (L_OPEN in line and L_CLOSE in line) or LB_OPEN in line:
            tokens = tokenize(line)
            if tokens != []:
                lemmas = lemmatizer.lemmatize(tokens)
                lemmas = list(zip(*lemmas))[1]
                lemma_str = "<BOS>" + " " + ' '.join(lemmas) + " " + "<EOS>" + " "
                line_str = "<BOS>" + " " + line + " " + "<EOS>" + " "
                lemmas_result.append(lemma_str)
                tokens_result.append(line_str)

doc_start = len(os.listdir(S_DATA_PATH + 'lemmas/'))
print(doc_start)
for i in range(0, len(lemmas_result), C_SIZE):
    doc_name = str(doc_start + i // C_SIZE) + ".txt"
    with open(S_DATA_PATH + "lemmas/section" + doc_name, 'w') as out_file:
        out_file.writelines(lemmas_result[i:i + C_SIZE])
    with open(S_DATA_PATH + "tokens/section" + doc_name, 'w') as out_file:
        out_file.writelines(tokens_result[i:i + C_SIZE])
lemmatizer = BackoffLatinLemmatizer()

# Read the whole corpus into one string
ltr_str = ''
file = open('corpora/all.txt', 'r')
for line in file:
    ltr_str += str(line)
file.close()

# Strip punctuation and lower-case via numpy string routines, then tokenize
np_str = np.asarray(ltr_str)
for symbol in string.punctuation:
    np_str = np.char.replace(np_str, symbol, '')
np_str = np.char.lower(np_str)
tokens = np_str.tolist().split()

# Lemmatize and write the lemmas, space-separated, to a single file
lemmatized = lemmatizer.lemmatize(tokens)
with open('corpora/all_lemmatized.txt', 'w') as lemmata:
    for parsed in lemmatized:
        lemmata.write(parsed[1] + ' ')
# print('all lemmata written successfully :)')

# make a list of Metamorphoses filenames
her_filnames = []
for filename in filenames:
    if filename[5:8] == 'met':
        her_filnames.append(str(filename))

# add their text to corpora/met.txt
with open('corpora/met.txt', 'w') as outfile:
    for fname in her_filnames:
        with open('ovid/' + fname, 'r') as infile:
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer

crimefile = open("passio.txt", 'r')
lemmatized = open("lemmatized.txt", 'w')

# Collect the tokens from every line of the input file
tokens = []
for line in crimefile:
    tokens.extend(line.split())

# Lemmatize once and write one lemma per line
lemmatizer = BackoffLatinLemmatizer()
out = lemmatizer.lemmatize(tokens)
for word in out:
    string = word[1] + "\n"
    lemmatized.write(string)

crimefile.close()
lemmatized.close()
# Pre-process, tokenize, lemmatize text of thematic sections
docs = []
for i in range(len(s_df.Section_id.unique())):
    text = str(list(s_df.TextUnit[s_df.Section_id == i]))  # Load all text units from a thematic unit
    new_text = ''.join(["" if ord(c) < 32 or ord(c) > 126 else c
                        for c in text])  # Remove Greek (non-ASCII) characters
    text_no_punct = re.sub(punctuation, '', new_text)  # Remove punctuation
    text_one_white_space = re.sub(r"\s{2,}", ' ', text_no_punct)  # Leave only one white space between words
    text_no_trailing_space = text_one_white_space.strip()  # Remove leading/trailing white space
    text_lower = text_no_trailing_space.lower()  # Transform to all lower case
    text_split = text_lower.split(' ')  # Split into a list of tokens
    lemmas = lemmatizer.lemmatize(text_split)  # Lemmatize
    l = []  # Create empty list for lemmas
    for y in range(len(lemmas)):
        if lemmas[y][1] not in D_stoplist:
            l.append(lemmas[y][1])  # Keep the lemma unless it is a stopword
    l_string = ' '.join([str(word) for word in l])  # Join the stopword-free lemmas into a string
    docs.append(l_string)  # Add the "document" to a list
sID['doc'] = docs  # Insert the stopword-free lemma strings into a new column

# Pre-process, tokenize and lemmatize section titles
section_titles = []
for x in sID.Section_id:
    text = sID.loc[sID.Section_id == x, 'Section_title'].values[0]  # Load upper-case section titles
    new_text = ''.join(