def test_backoff_latin_lemmatizer(self):
    """Test backoffLatinLemmatizer"""
    train = [[('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]]  # pylint: disable=line-too-long
    lemmatizer = BackoffLatinLemmatizer(train=train)
    test_str = """Ceterum antequam destinata componam"""
    target = [('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]  # pylint: disable=line-too-long
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
def test_backoff_latin_lemmatizer_verbose(self):
    """Test backoffLatinLemmatizer"""
    train = [[('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]]  # pylint: disable=line-too-long
    lemmatizer = BackoffLatinLemmatizer(verbose=True)
    test_str = """Ceterum antequam destinata componam"""
    target = [('ceterum', 'ceterum', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('antequam', 'antequam', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('destinata', 'destino', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('componam', 'compono', '<DictLemmatizer: Morpheus Lemmas>')]  # pylint: disable=line-too-long
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
class LemmatizerLatin:
    def __init__(self, token=True):
        self.lemmatizer = BackoffLatinLemmatizer()
        self.token = token

    def preprocess(self, text):
        if self.token:
            lemma = self.lemmatizer.lemmatize(text)
        else:
            plv = PunktLanguageVars()
            unigrams = plv.word_tokenize(text)
            lemma = self.lemmatizer.lemmatize(unigrams)
        # Keep the original token for punctuation, otherwise keep the lemma
        lemma = [t[0] if t[1] == "punc" else t[1] for t in lemma]
        return " ".join(lemma)
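# A minimal usage sketch for the LemmatizerLatin wrapper above. This assumes the
# class and its dependencies (BackoffLatinLemmatizer, PunktLanguageVars) are
# already imported as in the snippet; the sample sentence and the printed
# output shown in the comment are illustrative, not guaranteed lemma values.
preprocessor = LemmatizerLatin(token=False)  # token=False: pass raw text, not a token list
result = preprocessor.preprocess("ceterum antequam destinata componam")
print(result)  # a space-joined string of lemmas, e.g. something like "ceterum antequam destino compono"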
def latin_lemma_text(list_of_texts, stopwords=None):
    '''
    Create a list of continuous lemma texts for Latin with cltk (prerequisite).

    list_of_texts: raw text items stored in a list object
    stopwords: list of stopwords to be removed; default is None, where nothing is removed

    The Latin lemmatizer is cltk's BackoffLatinLemmatizer. Install, import and load it before using the function.
    '''
    # Import packages and models from cltk and initialize tools
    from cltk.corpus.utils.importer import CorpusImporter
    from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
    corpus_importer = CorpusImporter('latin')            # Initialize cltk's CorpusImporter
    corpus_importer.import_corpus('latin_models_cltk')   # Import the latin_models_cltk corpus
    lemmatizer = BackoffLatinLemmatizer()                # Initialize Latin lemmatizer

    import re
    punctuation = r"[\"#$%&\'()*+,-/:;<=>@[\]^_`{|}~.?!«»]"  # Punctuation pattern

    a = []
    for i in range(len(list_of_texts)):
        text = str(list_of_texts[i])
        new_text = ''.join(["" if ord(ch) < 32 or ord(ch) > 126 else ch for ch in text])  # Remove non-ASCII (e.g. Greek) characters
        text_no_punct = re.sub(punctuation, '', new_text)             # Remove punctuation
        text_one_white_space = re.sub(r"\s{2,}", ' ', text_no_punct)  # Leave only one white space between words
        text_no_trailing_space = text_one_white_space.strip()         # Remove trailing white space
        text_lower = text_no_trailing_space.lower()                   # Transform to all lower case
        text_split = text_lower.split(' ')                            # Split to a list of tokens
        lemmas = lemmatizer.lemmatize(text_split)                     # Lemmatize

        textunit = ''  # Empty string for the lemmatized text unit
        for y in range(len(lemmas)):
            if stopwords is not None:
                if lemmas[y][1] not in stopwords:
                    textunit = textunit + str(lemmas[y][1] + ' ')
            else:
                textunit = textunit + str(lemmas[y][1] + ' ')
        textunit = textunit.strip()
        a.append(textunit)  # Add the "document" to the list
    return a
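# A small, hypothetical call to latin_lemma_text() above. The text items and the
# stoplist are invented for illustration; note that the function filters on the
# lemma (second element of each tuple), so stopwords should be given as lemmas.
raw_texts = [
    "Ceterum antequam destinata componam.",
    "Gallia est omnis divisa in partes tres.",
]
stoplist = ['sum', 'in']  # illustrative lemma stoplist

lemma_texts = latin_lemma_text(raw_texts, stopwords=stoplist)
for doc in lemma_texts:
    print(doc)  # each item is one continuous string of lemmas with stoplisted lemmas removed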
def stage4(tokens):
    # lemmatize() returns list of tuples
    results = BackoffLatinLemmatizer().lemmatize(tokens)
    lemmas = []  # unique lemmas
    for result in results:
        lemma = result[1]
        if lemma not in lemmas:
            lemmas.append(lemma)
    return lemmas
def process(text):
    tokens = WordTokenizer('latin').tokenize(text)
    # lemmatize() returns list of tuples
    results = BackoffLatinLemmatizer().lemmatize(tokens)
    lemmas = []  # unique lemmas
    for result in results:
        lemma = result[1]
        if lemma not in lemmas:
            lemmas.append(lemma)
    return lemmas
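# A hypothetical call to process() above, assuming WordTokenizer and
# BackoffLatinLemmatizer are imported; the phrase and the lemmas shown in the
# comment are only an example of the expected shape of the output.
unique_lemmas = process("Ceterum antequam destinata componam")
print(unique_lemmas)  # unique lemmas in first-seen order, e.g. ['ceterum', 'antequam', 'destino', 'compono']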
def tokenize(request):
    language = request['Content-Language']
    src_data = request['Payload']
    print(language)
    word_tokenizer = WordTokenizer(language)
    data = word_tokenizer.tokenize(src_data)
    clean_data = list(map(cltk_normalize, [w for w in data if w.isalpha()]))  # and not w in STOPS_LIST
    # lemma = LemmaReplacer(language).lemmatize(clean_data)
    lemma = None
    if language == 'greek':
        lemma = BackoffGreekLemmatizer().lemmatize(clean_data)
    elif language == 'latin':
        lemma = BackoffLatinLemmatizer().lemmatize(clean_data)
    result = []
    for i, elem in enumerate(lemma):
        w, l = elem
        result.append({'index': i + 1, 'word': w, 'lemma': l})
    return result
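# A sketch of how the tokenize() handler above might be invoked, assuming
# cltk_normalize and the Greek/Latin backoff lemmatizers are imported as in the
# snippet. The request dict is invented for illustration; 'Content-Language'
# selects which lemmatizer branch runs.
request = {
    'Content-Language': 'latin',
    'Payload': 'Ceterum antequam destinata componam',
}

for entry in tokenize(request):
    print(entry['index'], entry['word'], entry['lemma'])
# each entry is a dict like {'index': 1, 'word': 'Ceterum', 'lemma': ...}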
def __init__(self, token=True):
    self.lemmatizer = BackoffLatinLemmatizer()
    self.token = token
import argparse
import os, os.path
import re
import string

from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer

L_OPEN = '<l'
L_CLOSE = '</l>'
LB_OPEN = '<lb'
S_DATA_PATH = 'data/structured/'
C_SIZE = 150

parser = argparse.ArgumentParser(description='Parse unstructured documents.')
parser.add_argument('--document', required=True, help='Path to document to parse.')
args = parser.parse_args()

lemmatizer = BackoffLatinLemmatizer()
lemmas_result, tokens_result = [], []

def tokenize(line):
    # Strip TEI-style markup, punctuation, and typographic characters,
    # then lower-case and split into tokens
    line = re.sub('<note[^<]+note>', '', line)
    line = re.sub('<[^<]+>', '', line)
    line = line.translate(str.maketrans('', '', string.punctuation))
    line = line.replace('\n', '')
    line = line.replace('-', '')
    line = line.replace('—', '')
    line = line.replace('“', '')
    line = line.replace('”', '')
    line = line.lower()
    return line.split()

with open(args.document, 'r', encoding='utf8') as f:
import polyglot
from polyglot.downloader import downloader
from polyglot.text import Text
import statistics
import math
import openpyxl
from openpyxl import Workbook

from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
from cltk.corpus.utils.importer import CorpusImporter
from cltk.stem.latin.j_v import JVReplacer

corpus_importer = CorpusImporter('latin')
corpus_importer.import_corpus('latin_models_cltk')
lemmatizer = BackoffLatinLemmatizer()
j = JVReplacer()

def lemmatize(text):
    text = j.replace(text)
    tokens = [token for token in text.split()]
    lemmatized = lemmatizer.lemmatize(tokens)
    lemmatized_text = " ".join([token[1] for token in lemmatized])
    return lemmatized_text

def motets_ordered_by_difference(motets):
    motets.sort(key=lambda x: x.sentiment_difference())
    book = Workbook()
    sheet = book.active
    sheet['A1'] = "Title"
    sheet['B1'] = "Composer"
import os
import string

import numpy as np
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer

# first, build list of filenames
filenames = []
for filename in os.listdir('ovid'):
    filenames.append(str(filename))

# then, concatenate them into one text file
# https://stackoverflow.com/questions/13613336/python-concatenate-text-files
with open('corpora/all.txt', 'w') as outfile:
    for fname in filenames:
        with open('ovid/' + fname, 'r') as infile:
            for line in infile:
                outfile.write(line)

lemmatizer = BackoffLatinLemmatizer()

ltr_str = ''
file = open('corpora/all.txt', 'r')
for line in file:
    ltr_str += str(line)
file.close()

# Strip punctuation and lower-case via numpy string operations, then tokenize
np_str = np.asarray(ltr_str)
for symbol in string.punctuation:
    np_str = np.char.replace(np_str, symbol, '')
np_str = np.char.lower(np_str)
tokens = np_str.tolist().split()

lemmatized = lemmatizer.lemmatize(tokens)
# Import basic packages
import pandas as pd
import re
import numpy as np

# Import packages and models from cltk and initialize tools
from cltk.corpus.utils.importer import CorpusImporter
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
corpus_importer = CorpusImporter('latin')            # Initialize cltk's CorpusImporter
corpus_importer.import_corpus('latin_models_cltk')   # Import the latin_models_cltk corpus
lemmatizer = BackoffLatinLemmatizer()                # Initialize Latin lemmatizer
from cltk.stem.latin.j_v import JVReplacer

# Import and initialize TfidfVectorizer with custom stoplist
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

# Load dataframes
file_path_df = '/home/mribary/Dropbox/pyDigest/dump/Ddf_v106.csv'
file_path_s = '/home/mribary/Dropbox/pyDigest/dump/Ddf_sections_v001.csv'
file_path_sID = '/home/mribary/Dropbox/pyDigest/dump/Ddf_Section_IDs_v001.csv'
file_path_stoplist = '/home/mribary/Dropbox/pyDigest/dump/D_stoplist_001.txt'
df = pd.read_csv(file_path_df, index_col=0)     # text units (21055)
s = pd.read_csv(file_path_s, index_col=0)       # text units with section IDs (21055)
sID = pd.read_csv(file_path_sID, index_col=0)   # sections with section IDs (432)
D_stoplist = list(pd.read_csv(file_path_stoplist,
def test_backoff_latin_lemmatizer_evaluate(self):
    """Test backoffLatinLemmatizer evaluate method"""
    lemmatizer = BackoffLatinLemmatizer(verbose=False)
    accuracy = lemmatizer.evaluate()
    self.assertTrue(.85 <= accuracy <= 1)
path = os.path.expanduser(rel_path)

# Check for presence of latin_pos_lemmatized_sents
file = 'latin_pos_lemmatized_sents.pickle'
latin_pos_lemmatized_sents_path = os.path.join(path, file)
if os.path.isfile(latin_pos_lemmatized_sents_path):
    latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
else:
    latin_pos_lemmatized_sents = []
    print('The file %s is not available in cltk_data' % file)

# Set up CLTK tools
word_tokenizer = WordTokenizer('latin')
lemmatizer = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)

# Get tokens
tokens = clean_and_write.vocab_set('clean_texts/ov_met_6_clean.txt')
print(tokens)

# Get lemmas
lemmas = lemmatizer.lemmatize(tokens)
print(lemmas)

lemmata = []
for (x, y) in lemmas:
    lemmata.append(y)
print(sorted(set(lemmata)))
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer

crimefile = open("passio.txt", 'r')
lemmatized = open("lemmatized.txt", 'w')

# Collect tokens from every line of the input file
tokens = []
for line in crimefile:
    tokens += line.split()

lemmatizer = BackoffLatinLemmatizer()
out = lemmatizer.lemmatize(tokens)
for word in out:
    string = word[1] + "\n"
    lemmatized.write(string)

crimefile.close()
lemmatized.close()
def test_backoff_latin_lemmatizer_evaluate_verbose(self):
    """Test backoffLatinLemmatizer evaluate method"""
    lemmatizer = BackoffLatinLemmatizer(verbose=True)
    with self.assertRaises(AssertionError):
        accuracy = lemmatizer.evaluate()
# la_lemmatizer = LemmaReplacer('latin')

# Latin Lemmatizer (NEW with backoff)
# Set up training sentences
rel_path = os.path.join('/Users/christiancasey/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
path = os.path.expanduser(rel_path)

# Check for presence of latin_pos_lemmatized_sents
file = 'latin_pos_lemmatized_sents.pickle'
latin_pos_lemmatized_sents_path = os.path.join(path, file)
if os.path.isfile(latin_pos_lemmatized_sents_path):
    latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
else:
    latin_pos_lemmatized_sents = []
    print('The file %s is not available in cltk_data' % file)

la_lemmatizer = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)

# Greek Lemmatizer
grc_corpus_importer = CorpusImporter('greek')
grc_corpus_importer.import_corpus('greek_models_cltk')
grc_lemmatizer = LemmaReplacer('greek')

# Initialize lemmatizers once outside of the loop,
# then select based on language inside the loop -- get_words_from_file()
tagLat = POSTag('latin')
tagGrk = POSTag('greek')

def lemmatize(word_list, copy):
    for word in word_list:
        if copy:
def test_backoff_latin_lemmatizer_models_not_present(self):
    """Test whether models are present for BackoffLatinLemmatizer"""
    with patch.object(BackoffLatinLemmatizer, 'models_path', ''):
        with self.assertRaises(FileNotFoundError):
            lemmatizer = BackoffLatinLemmatizer()