def test_latin_translations(self):
    """Test the Synonyms class lookup_synonyms() method and the Lemmata class isolate() method."""
    # First build the lemma list as in test_latin_lemmata()
    lemmatizer = Lemmata(dictionary='lemmata', language='latin')
    test_str = 'Ceterum antequam destinata componam'
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lookup(tokens)
    # Now isolate the list of lemmas
    lemmas = lemmatizer.isolate(lemmas)
    translations = Synonyms(dictionary='translations', language='latin')
    translations = translations.lookup_synonyms(lemmas)
    target = [('destino', [('σκοπός', 1.0)]), ('compono', [('συντίθημι', 1.0)])]
    self.assertEqual(translations, target)
def test_latin_synonyms(self):
    """Test the Synonyms class lookup_synonyms() method and the Lemmata class isolate() method."""
    # First build the lemma list as in test_latin_lemmata()
    lemmatizer = Lemmata(dictionary='lemmata', language='latin')
    test_str = 'Ceterum antequam destinata componam'
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lookup(tokens)
    # Now isolate the list of lemmas
    lemmas = lemmatizer.isolate(lemmas)
    synonyms = Synonyms(dictionary='synonyms', language='latin')
    syns = synonyms.lookup_synonyms(lemmas)
    target = [('ceterus', [('ceteroqui', 0.5), ('perquiesco', 0.5)]),
              ('compono', [('struo', 0.5), ('condo', 0.5)])]
    self.assertEqual(syns, target)
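
# Illustrative sketch (not part of the test suite): the lemma-then-synonym pipeline
# exercised by the two tests above, wrapped as a standalone helper. The helper name
# and its 'dictionary' parameter are illustrative additions, not part of the tested
# API; the import paths follow the CLTK modules named in the FrequencyModel
# docstrings below, and the Synonyms import path is assumed to match Lemmata's.
from cltk.semantics.latin.lookup import Lemmata, Synonyms
from cltk.stem.latin.j_v import JVReplacer
from cltk.tokenize.word import WordTokenizer


def lookup_latin_synonyms_sketch(text, dictionary='synonyms'):
    '''Lowercase and normalize a Latin string, lemmatize it, and look up either
    Latin synonyms (dictionary='synonyms') or Greek translations
    (dictionary='translations') for the isolated lemmas.'''
    lemmatizer = Lemmata(dictionary='lemmata', language='latin')
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    normalized = jv_replacer.replace(text.lower())
    tokens = tokenizer.tokenize(normalized)
    lemmas = lemmatizer.isolate(lemmatizer.lookup(tokens))
    synonyms = Synonyms(dictionary=dictionary, language='latin')
    return synonyms.lookup_synonyms(lemmas)
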
class FrequencyModel:
    '''Generate an unsupervised count of lemma frequencies in the Tesserae Latin text corpus.'''

    def __init__(self):
        self.lemmatizer = Lemmata(dictionary='lemmata', language='latin')
        self.jv = JVReplacer()
        self.word_tokenizer = WordTokenizer('latin')
        self.count_dictionary = dict()
        self.punctuation_list = [
            '!', ';', ':', '?', '-', '–', '&', '*', '(', ')', '[', ']', ',', '"', '\''
        ]

    def read_files(self, filepath):
        '''Read the corpus and build the self.count_dictionary dictionary object by
        calling the countgram() method on individual tokens.

        Dependencies
        ------------
        TessFile class from tesserae.utils
        Lemmata class from cltk.semantics.latin.lookup
        JVReplacer class from cltk.stem.latin.j_v
        WordTokenizer class from cltk.tokenize.word

        Parameters
        ----------
        filepath: a file in .tess format

        Results
        -------
        Updates self.count_dictionary

        Returns
        -------
        none'''
        tessobj = TessFile(filepath)
        tokengenerator = iter(tessobj.read_tokens())
        stop = 0
        while stop != 1:
            try:
                rawtoken = next(tokengenerator)
                cleantoken_list = self.token_cleanup(rawtoken)
                token = cleantoken_list[0]
                self.countgram(token)
            except StopIteration:
                stop = 1

    def countgram(self, targettoken):
        '''Update the frequency model with a new token from the corpus.'''
        lemmas = self.lemmatizer.lookup([targettoken])
        lemmas = self.lemmatizer.isolate(lemmas)
        for lem in lemmas:
            try:
                test_presence = self.count_dictionary[lem]
            except KeyError:
                # first time this lemma has been seen in the corpus
                self.count_dictionary[lem] = 0
            self.count_dictionary[lem] += 1

    def lemmatize(self, target):
        '''Use the unsupervised count of lemma frequencies generated by read_files()
        to assign probabilities in the case of an ambiguous lemmatization.

        Parameters
        ----------
        target: a token to be lemmatized

        Returns
        -------
        a list of tuples of the form [(lemma, probability)]
        '''
        if target in self.punctuation_list:
            lemmalist = [('punc', 1)]
            return lemmalist
        if target == 'ne':
            lemmalist = [('ne', 1)]
            return lemmalist
        lemmalist = self.lemmatizer.lookup([target])
        lemmas = self.lemmatizer.isolate(lemmalist)
        if len(lemmas) > 1:
            all_lemmas_total = sum([self.count_dictionary[l] for l in lemmas])
            try:
                lemmalist = [(l, (self.count_dictionary[l] / all_lemmas_total)) for l in lemmas]
            except ZeroDivisionError:
                print([(self.count_dictionary[l], l) for l in lemmas])
            return lemmalist
        lemmalist = []
        lemmaobj = (lemmas[0], 1)
        lemmalist.append(lemmaobj)
        return lemmalist

    def token_cleanup(self, rawtoken):
        '''Standardize tokens by replacing j with i and v with u, and split them into
        multiple tokens as needed with the tokenize() method of the word_tokenizer class.

        Parameters
        ----------
        rawtoken: the token as drawn from the text

        Returns
        -------
        tokenlist: a list of possible word or punctuation tokens
        '''
        rawtoken = self.jv.replace(rawtoken)
        rawtoken = rawtoken.lower()
        tokenlist = self.word_tokenizer.tokenize(rawtoken)
        # Sometimes words are split into enclitics and punctuation.
        return tokenlist

    def save_pickle(self, filename):
        '''Save the self.count_dictionary object for later reuse.

        Dependencies
        ------------
        os and pickle packages

        Parameters
        ----------
        filename: name for the pickle file'''
        relativepath = join('~', 'cltk_data', 'latin', 'model', 'latin_models_cltk', 'frequency')
        path = expanduser(relativepath)
        pickle_file = join(path, filename)
        if not os.path.isdir(path):
            os.makedirs(path)
        pickle.dump(self.count_dictionary, open(pickle_file, "wb"))

    def load_pickle(self, filename):
        '''Load the self.count_dictionary object saved by save_pickle.
        Dependencies
        ------------
        os and pickle packages

        Parameters
        ----------
        filename: name of the pickle file'''
        relativepath = join('~', 'cltk_data', 'latin', 'model', 'latin_models_cltk', 'frequency')
        path = expanduser(relativepath)
        pickle_file = join(path, filename)
        pickle_file = open(pickle_file, 'rb')
        self.count_dictionary = pickle.load(pickle_file)

    def train_model(self):
        '''Open all the Tesserae files and call read_files() on each to build the frequency model.'''
        relativepath = join('~', 'cltk_data', 'latin', 'text', 'latin_text_tesserae_collection', 'la')
        path = expanduser(relativepath)
        onlyfiles = [f for f in listdir(path) if isfile(join(path, f))
                     and 'augustine' not in f and 'ambrose' not in f
                     and 'jerome' not in f and 'tertullian' not in f
                     and 'eugippius' not in f and 'hilary' not in f]
        onlyfiles = [join(path, f) for f in onlyfiles]
        for filename in onlyfiles:
            if '.tess' in filename:
                print(filename)
                self.read_files(filename)

    def test_count_dictionary(self, token_list, lemma_list):
        '''Test the ability of lemmatize() (which uses the self.count_dictionary
        dictionary) to predict the most likely lemmatization in ambiguous cases.
        Punctuation is automatically counted as correct, because the 'punc'
        lemmatization is used inconsistently in the test corpus.

        Dependencies
        ------------
        itemgetter class from the operator package

        Parameters
        ----------
        token_list: a list of tokens
        lemma_list: a list of corresponding 'correct' lemmatizations

        Results
        -------
        Prints four numbers: the number of correctly assigned lemmas in ambiguous
        cases; the number of ambiguous cases in total; the number of tokens
        analyzed; and a decimal between 0 and 1 representing the proportion of
        correct lemmatizations.

        Returns
        -------
        a list object containing all incorrect lemmatizations for analysis.
        Format: [(token, answer_given, correct_answer), (token...)]

        NOTE: Initial tests show roughly 91% accuracy, identification of
        punctuation included.
        '''
        trials = 0
        correct = 0
        errors = []
        for position in range(0, (len(token_list) - 1)):
            lemmalist = self.lemmatizer.lookup(token_list[position])
            lemmalist = lemmalist[1]
            lemma = max(lemmalist, key=itemgetter(1))
            if len(lemmalist) > 1:
                trials = trials + 1
                if lemma[0] == lemma_list[position] or lemma[0] == 'punc':
                    correct = correct + 1
                else:
                    errors.append((token_list[position], lemma[0], lemma_list[position]))
        print(correct)
        print(trials)
        print(len(lemma_list))
        rate = (len(lemma_list) - trials + correct) / len(lemma_list)
        print(rate)
        return errors
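

# Usage sketch (assumptions flagged): how FrequencyModel is meant to be driven end
# to end with the methods defined above. The pickle filename below is illustrative,
# training requires the Tesserae corpus to be installed under
# ~/cltk_data/latin/text/latin_text_tesserae_collection/la as read by train_model(),
# and a full training pass over the corpus is slow.
if __name__ == '__main__':
    model = FrequencyModel()
    model.train_model()  # count lemma frequencies across the whole corpus
    model.save_pickle('latin_lemma_frequencies.pickle')  # illustrative filename

    # In a later session the counts can be reloaded instead of retrained:
    # model = FrequencyModel()
    # model.load_pickle('latin_lemma_frequencies.pickle')

    # Resolve an ambiguous token to a list of (lemma, probability) tuples.
    print(model.lemmatize('amor'))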