def test_latin_lemmata(self):
    """Test Lemmata class lookup() method"""
    lemmatizer = Lemmata(dictionary='lemmata', language='latin')
    test_str = 'Ceterum antequam destinata componam'
    target = [('ceterum', [('ceterus', 1.0)]), ('antequam', [('antequam', 1.0)]), ('destinata', [('destinatus', 0.25), ('destinatum', 0.25), ('destinata', 0.25), ('destino', 0.25)]), ('componam', [('compono', 1.0)])]  # pylint: disable=line-too-long
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lookup(tokens)
    self.assertEqual(lemmas, target)
def test_latin_translations(self):
    """Test Synonym class lookup() function and Lemmata class isolate() method"""
    # first build the lemma list as in test_latin_lemmata()
    lemmatizer = Lemmata(dictionary='lemmata', language='latin')
    test_str = 'Ceterum antequam destinata componam'
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lookup(tokens)
    # now isolate the list of lemmas
    lemmas = lemmatizer.isolate(lemmas)
    translations = Synonyms(dictionary='translations', language='latin')
    translations = translations.lookup_synonyms(lemmas)
    target = [('destino', [('σκοπός', 1.0)]), ('compono', [('συντίθημι', 1.0)])]
    self.assertEqual(translations, target)
def test_latin_synonyms(self):
    """Test Synonym class lookup() function and Lemmata class isolate() method"""
    # first build the lemma list as in test_latin_lemmata()
    lemmatizer = Lemmata(dictionary='lemmata', language='latin')
    test_str = 'Ceterum antequam destinata componam'
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lookup(tokens)
    # now isolate the list of lemmas
    lemmas = lemmatizer.isolate(lemmas)
    synonyms = Synonyms(dictionary='synonyms', language='latin')
    syns = synonyms.lookup_synonyms(lemmas)
    target = [('ceterus', [('ceteroqui', 0.5), ('perquiesco', 0.5)]), ('compono', [('struo', 0.5), ('condo', 0.5)])]
    self.assertEqual(syns, target)
class LatinTokenizer(BaseTokenizer):
    def __init__(self, connection):
        super(LatinTokenizer, self).__init__(connection)

        # Set up patterns that will be reused
        self.jv_replacer = JVReplacer()
        self.lemmatizer = Lemmata('lemmata', 'lat')

        self.split_pattern = \
            '( / )|([\\s]+)|([^\\w' + self.diacriticals + ']+)'

    def normalize(self, raw, split=True):
        """Normalize a Latin word.

        Parameters
        ----------
        raw : str or list of str
            The string(s) to normalize.

        Returns
        -------
        normalized : str or list of str
            The normalized string(s).

        Notes
        -----
        This function should be applied to Latin words prior to generating
        other features (e.g., lemmata).
        """
        # Apply the global normalizer
        normalized, tags = super(LatinTokenizer, self).normalize(raw)

        # Replace j/v with i/u, respectively
        normalized = self.jv_replacer.replace(normalized)

        if split:
            normalized = re.split(self.split_pattern, normalized,
                                  flags=re.UNICODE)
            normalized = [
                t for t in normalized if t and re.search(r'[\w]+', t)
            ]

        return normalized, tags

    def featurize(self, tokens):
        """Lemmatize Latin tokens.

        Parameters
        ----------
        tokens : list of str
            The tokens to featurize.

        Returns
        -------
        features : dict
            The features for the tokens.

        Notes
        -----
        Input should be sanitized with `LatinTokenizer.normalize` prior to
        using this method.
        """
        if not isinstance(tokens, list):
            tokens = [tokens]
        lemmata = self.lemmatizer.lookup(tokens)
        # print("Latin lemmata:", lemmata)
        fixed_lemmata = []
        for lem in lemmata:
            lem_lemmata = [l[0] for l in lem[1]]
            fixed_lemmata.append(lem_lemmata)
        # print("fixed lemmata:", fixed_lemmata)
        grams = trigrammify(tokens)
        features = {'lemmata': fixed_lemmata, 'sound': grams}
        # print('features', features)
        # for i, l in enumerate(lemmata):
        #     features.append({'lemmata': [lem[0] for lem in l[1]]})
        return features
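# Hedged, self-contained sketch of the two Latin-specific steps that
# LatinTokenizer.normalize() and featurize() perform, without the
# BaseTokenizer/connection machinery (not shown here). The JVReplacer import
# path and the simple whitespace/punctuation split are assumptions standing in
# for the class's own split_pattern; the Lemmata lookup mirrors featurize()
# above. The tests earlier use language='latin', so that spelling is used here
# (the class itself passes 'lat').
import re

from cltk.semantics.latin.lookup import Lemmata
from cltk.stem.latin.j_v import JVReplacer

raw = 'Arma virumque cano, Troiae qui primus ab oris'
normalized = JVReplacer().replace(raw.lower())   # fold j -> i, v -> u
tokens = [t for t in re.split(r'\W+', normalized) if t]
lemmata = Lemmata('lemmata', 'latin').lookup(tokens)
fixed_lemmata = [[lem[0] for lem in entry[1]] for entry in lemmata]
print(fixed_lemmata)  # one list of candidate lemmata per token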
class GreekTokenizer(BaseTokenizer):
    def __init__(self, connection):
        super(GreekTokenizer, self).__init__(connection)

        # Set up patterns that will be reused
        self.vowels = 'αειηουωΑΕΙΗΟΥΩ'
        self.grave = '\u0300'
        self.acute = '\u0301'
        self.sigma = r'σ\b'  # raw string so \b is a word boundary, not a backspace
        self.sigma_alt = 'ς'

        # diacriticals should not be considered part of ``word_characters`` so
        # that extraneous diacritical marks unattended by a proper word
        # character to bind to do not appear as proper words during
        # tokenization of display tokens (see BaseTokenizer.tokenize);
        # also ignore the middle dot character, which is a punctuation mark
        self.word_regex = re.compile('[ΆΈ-ώ' + self.sigma_alt + ']+',
                                     flags=re.UNICODE)

        self.diacrit_sub1 = \
            r'[\s.,;?!]([' + self.diacriticals + ']+)([' + self.vowels + ']{2,})'
        self.diacrit_sub2 = \
            r'[\s.,;?!]([' + self.diacriticals + ']+)([' + self.vowels + ']{1})'

        self.split_pattern = ''.join([
            '( / )|([\\s]+)|([^\\w\\d', self.diacriticals, self.sigma_alt,
            r"])"
        ])

        self.lemmatizer = Lemmata('lemmata', 'greek')

    def normalize(self, raw, split=True):
        """Normalize a single Greek word.

        Parameters
        ----------
        raw : str or list of str
            The word to normalize.

        Returns
        -------
        normalized : str
            The normalized string.
        """
        # Perform the global normalization
        normalized, tags = super(GreekTokenizer, self).normalize(raw)

        # Convert grave accent to acute
        normalized = re.sub(self.grave, self.acute, normalized,
                            flags=re.UNICODE)

        # Remove diacriticals from vowels
        normalized = re.sub(self.diacrit_sub1, r' \2', normalized,
                            flags=re.UNICODE)
        normalized = re.sub(self.diacrit_sub2, r' \2\1', normalized,
                            flags=re.UNICODE)

        # Substitute sigmas
        normalized = re.sub(self.sigma, self.sigma_alt, normalized,
                            flags=re.UNICODE)

        # Remove digits and single-quotes from the normalized output
        normalized = re.sub(r"['\d]+", r' ', normalized, flags=re.UNICODE)

        # Split the output into a list of normalized tokens if requested
        if split:
            normalized = re.split(self.split_pattern, normalized,
                                  flags=re.UNICODE)
            normalized = [
                t for t in normalized if t and re.search(r'[\w]+', t)
            ]

        return normalized, tags

    def featurize(self, tokens):
        """Get the features for Greek tokens.

        Parameters
        ----------
        tokens : list of str
            The tokens to featurize.

        Returns
        -------
        features : dict
            The features for the tokens.

        Notes
        -----
        Input should be sanitized with `GreekTokenizer.normalize` prior to
        using this method.
        """
        lemmata = self.lemmatizer.lookup(tokens)
        fixed_lemmata = []
        for lem in lemmata:
            lem_lemmata = [l[0] for l in lem[1]]
            fixed_lemmata.append(lem_lemmata)
        features = {'lemmata': fixed_lemmata}
        return features
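# Standalone illustration (not part of the class) of the two Greek-specific
# normalization steps above: grave accents are rewritten as acute, and a
# word-final medial sigma becomes final sigma. It operates on NFD-decomposed
# text so the combining accents are separate codepoints, as in the class's
# normalization; the sample words are chosen only to exercise both rules.
import re
import unicodedata

grave, acute = '\u0300', '\u0301'
text = unicodedata.normalize('NFD', 'καὶ λόγοσ')
text = re.sub(grave, acute, text)    # καὶ -> καί
text = re.sub(r'σ\b', 'ς', text)     # λόγοσ -> λόγος (raw string keeps \b a word boundary)
print(unicodedata.normalize('NFC', text))  # 'καί λόγος'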
class LatinTokenizer(BaseTokenizer):
    def __init__(self, connection):
        super(LatinTokenizer, self).__init__(connection)

        # Set up patterns that will be reused
        self.jv_replacer = JVReplacer()
        self.lemmatizer = Lemmata('lemmata', 'latin')

        self.split_pattern = \
            '[<].+[>][\s]| / | \. \. \.|\.\~\.\~\.|[^\w' + self.diacriticals + ']'

    # def tokenize(self, raw, record=True, text=None):
    #     normalized = unicodedata.normalize('NFKD', raw).lower()
    #     normalized = self.jv_replacer.replace(normalized)
    #     normalized = re.split(self.split_pattern, normalized, flags=re.UNICODE)
    #     display = re.split(self.split_pattern, raw, flags=re.UNICODE)
    #     featurized = self.featurize(normalized)
    #
    #     tokens = []
    #     frequencies = collections.Counter(
    #         [n for i, n in enumerate(normalized) if
    #          re.search('[\w]+', normalized[i], flags=re.UNICODE)])
    #     frequency_list = []
    #
    #     try:
    #         text_id = text.path
    #     except AttributeError:
    #         text_id = None
    #
    #     base = len(self.tokens)
    #
    #     for i, d in enumerate(display):
    #         idx = i + base
    #         if re.search('[\w]', d, flags=re.UNICODE):
    #             n = normalized[i]
    #             f = featurized[i]
    #             t = Token(text=text_id, index=idx, display=d, form=n, **f)
    #         else:
    #             t = Token(text=text_id, index=idx, display=d)
    #         tokens.append(t)
    #
    #     # Update the internal record if necessary
    #     if record:
    #         self.tokens.extend([t for t in tokens])
    #         self.frequencies.update(frequencies)
    #         frequencies = self.frequencies
    #         if '' in self.frequencies:
    #             del self.frequencies['']
    #     # print(frequencies)
    #     print(self.frequencies)
    #
    #     # Prep the frequency objects
    #     for k, v in frequencies.items():
    #         f = Frequency(text=text_id, form=k, frequency=v)
    #         frequency_list.append(f)
    #
    #     return tokens, frequency_list

    def normalize(self, raw):
        """Normalize a Latin word.

        Parameters
        ----------
        raw : str or list of str
            The string(s) to normalize.

        Returns
        -------
        normalized : str or list of str
            The normalized string(s).

        Notes
        -----
        This function should be applied to Latin words prior to generating
        other features (e.g., lemmata).
        """
        # Apply the global normalizer
        normalized = super(LatinTokenizer, self).normalize(raw)

        # Replace j/v with i/u, respectively
        normalized = self.jv_replacer.replace(normalized)

        return normalized

    def featurize(self, tokens):
        """Lemmatize Latin tokens.

        Parameters
        ----------
        tokens : list of str
            The tokens to featurize.

        Returns
        -------
        features : list of dict
            The features for each token.

        Notes
        -----
        Input should be sanitized with `LatinTokenizer.normalize` prior to
        using this method.
        """
        if not isinstance(tokens, list):
            tokens = [tokens]
        lemmata = self.lemmatizer.lookup(tokens)
        features = []
        for i, l in enumerate(lemmata):
            features.append({'lemmata': [lem[0] for lem in l[1]]})
        return features
    ----------
    filepath: a file in .tess format
    '''
    tessobj = TessFile(filepath)
    tokengenerator = iter(tessobj.read_tokens())
    stop = 0
    while stop != 1:
        try:
            rawtoken = next(tokengenerator)
            cleantoken_list = token_cleanup(rawtoken)
            count_lemma(cleantoken_list[0])
        except StopIteration:
            stop = 1


lemmatizer = Lemmata(dictionary='lemmata', language='latin')


def count_lemma(targettoken):
    '''Update the unsupervised count of lemma frequencies with a new token
    from the corpus.

    param targettoken: the token in question
    global COUNT_LIBRARY: a dictionary whose keys are lemmas and whose values
    are incremented counts.
    '''
    global COUNT_LIBRARY
    lemmas = lemmatizer.lookup([targettoken])
    lemmas = lemmatizer.isolate(lemmas)
    for lemma in lemmas:
        if lemma not in COUNT_LIBRARY:
            COUNT_LIBRARY[lemma] = 0
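# Quick standalone check (assumption: the cltk lemmata dictionary is installed)
# of the two calls count_lemma() makes per token: lookup() returns
# (token, candidates) pairs and isolate() flattens them to bare lemma strings,
# which is what COUNT_LIBRARY is keyed on.
from cltk.semantics.latin.lookup import Lemmata

lemmatizer = Lemmata(dictionary='lemmata', language='latin')
pairs = lemmatizer.lookup(['destinata'])
print(pairs)                      # [('destinata', [('destinatus', 0.25), ...])] per the lookup test above
print(lemmatizer.isolate(pairs))  # the candidate lemmas alone, e.g. ['destinatus', 'destinatum', 'destinata', 'destino']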
from cltk.semantics.latin.lookup import Lemmata

_LEM_MAPPER = {
    'latin': Lemmata('lemmata', 'lat'),
    'greek': Lemmata('lemmata', 'grc')
}


def get_lemmatizer(language):
    return _LEM_MAPPER[language]
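# Minimal usage sketch (assumption: the cltk lemmata corpora are installed
# locally). get_lemmatizer() hands back the shared Lemmata instance for a
# language key, and lookup() returns (token, [(lemma, probability), ...])
# pairs as in the tests earlier in this listing.
latin_lemmatizer = get_lemmatizer('latin')
print(latin_lemmatizer.lookup(['componam']))
# expected shape: [('componam', [('compono', 1.0)])]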
class FrequencyModel:
    '''Generate an unsupervised count of lemma frequencies in the Tesserae
    Latin text corpus.'''

    def __init__(self):
        self.lemmatizer = Lemmata(dictionary='lemmata', language='latin')
        self.jv = JVReplacer()
        self.word_tokenizer = WordTokenizer('latin')
        self.count_dictionary = dict()
        self.punctuation_list = [
            '!', ';', ':', '?', '-', '–', '&', '*', '(', ')', '[', ']', ',',
            '"', '\''
        ]

    def read_files(self, filepath):
        '''Reads the corpus and builds the self.count_dictionary dictionary
        object by calling the countgram() method on individual tokens.

        Dependencies
        ------------
        TessFile class from tesserae.utils
        Lemmata class from cltk.semantics.latin.lookup
        JVReplacer class from cltk.stem.latin.j_v
        WordTokenizer class from cltk.tokenize.word

        Parameters
        ----------
        filepath: a file in .tess format

        Results
        -------
        Updates self.count_dictionary

        Returns
        -------
        none'''
        tessobj = TessFile(filepath)
        tokengenerator = iter(tessobj.read_tokens())
        stop = 0
        while stop != 1:
            try:
                rawtoken = next(tokengenerator)
                cleantoken_list = self.token_cleanup(rawtoken)
                token = cleantoken_list[0]
                self.countgram(token)
            except StopIteration:
                stop = 1

    def countgram(self, targettoken):
        '''Update the frequency model with a new token from the corpus.'''
        lemmas = self.lemmatizer.lookup([targettoken])
        lemmas = self.lemmatizer.isolate(lemmas)
        for lem in lemmas:
            try:
                test_presence = self.count_dictionary[lem]
            except KeyError:
                self.count_dictionary[lem] = 0
            self.count_dictionary[lem] += 1

    def lemmatize(self, target):
        '''Use the unsupervised count of lemma frequencies generated by
        read_files() to assign probabilities in the case of an ambiguous
        lemmatization.

        parameters
        ----------
        target: a token to be lemmatized

        results
        -------
        a list of tuples of the form [(lemma, probability)]
        '''
        if target in self.punctuation_list:
            lemmalist = [('punc', 1)]
            return lemmalist
        if target == 'ne':
            lemmalist = [('ne', 1)]
            return lemmalist
        lemmalist = self.lemmatizer.lookup([target])
        lemmas = self.lemmatizer.isolate(lemmalist)
        if len(lemmas) > 1:
            all_lemmas_total = sum([self.count_dictionary[l] for l in lemmas])
            try:
                lemmalist = [(l, (self.count_dictionary[l] / all_lemmas_total))
                             for l in lemmas]
            except ZeroDivisionError:
                print([(self.count_dictionary[l], l) for l in lemmas])
            return lemmalist
        lemmalist = []
        lemmaobj = (lemmas[0], 1)
        lemmalist.append(lemmaobj)
        return lemmalist

    def token_cleanup(self, rawtoken):
        '''Standardize tokens by replacing j with i and v with u, and split
        into multiple tokens as needed with the tokenize() method of the
        word_tokenizer class.

        parameters
        ----------
        rawtoken: the token as drawn from the text

        return
        ------
        tokenlist: a list of possible word or punctuation tokens
        '''
        rawtoken = self.jv.replace(rawtoken)
        rawtoken = rawtoken.lower()
        tokenlist = self.word_tokenizer.tokenize(rawtoken)
        # sometimes words are split into enclitics and punctuation.
        return tokenlist

    def save_pickle(self, filename):
        '''Saves the self.count_dictionary object for later reuse.

        dependencies
        ------------
        os package

        parameters
        ----------
        filename: name for the pickle file'''
        relativepath = join('~', 'cltk_data', 'latin', 'model',
                            'latin_models_cltk', 'frequency')
        path = expanduser(relativepath)
        pickle_file = join(path, filename)
        if not os.path.isdir(path):
            os.makedirs(path)
        pickle.dump(self.count_dictionary, open(pickle_file, "wb"))

    def load_pickle(self, filename):
        '''Load the self.count_dictionary object saved by save_pickle.

        dependencies
        ------------
        os package

        parameters
        ----------
        filename: name of the pickle file'''
        relativepath = join('~', 'cltk_data', 'latin', 'model',
                            'latin_models_cltk', 'frequency')
        path = expanduser(relativepath)
        pickle_file = join(path, filename)
        pickle_file = open(pickle_file, 'rb')
        self.count_dictionary = pickle.load(pickle_file)

    def train_model(self):
        '''Open all the Tesserae files and call read_files() on each to build
        the frequency model.'''
        relativepath = join('~', 'cltk_data', 'latin', 'text',
                            'latin_text_tesserae_collection', 'la')
        path = expanduser(relativepath)
        onlyfiles = [f for f in listdir(path) if isfile(join(path, f))
                     and 'augustine' not in f and 'ambrose' not in f
                     and 'jerome' not in f and 'tertullian' not in f
                     and 'eugippius' not in f and 'hilary' not in f]
        onlyfiles = [join(path, f) for f in onlyfiles]
        for filename in onlyfiles:
            if '.tess' in filename:
                print(filename)
                self.read_files(filename)

    def test_count_dictionary(self, token_list, lemma_list):
        '''Test the ability of lemmatize() (which uses the
        self.count_dictionary dictionary) to predict the most likely
        lemmatization in ambiguous cases. Punctuation is automatically counted
        as correct, because the 'punc' lemmatization usage is inconsistent in
        the test corpus.

        dependencies
        ------------
        itemgetter class from operator package

        parameters
        ----------
        token_list: a list of tokens
        lemma_list: a list of corresponding 'correct' lemmatizations

        results
        -------
        prints four numbers: the number of correctly assigned lemmas in
        ambiguous cases; the number of ambiguous cases in total; the number of
        tokens analyzed; and a decimal between 0 and 1 representing the
        proportion of correct lemmatizations.

        return
        ------
        a list object containing all incorrect lemmatizations for analysis.
        Format: [(token, answer_given, correct_answer), (token...)]

        NOTE: Initial tests show roughly 91% accuracy, identification of
        punctuation included.
        '''
        trials = 0
        correct = 0
        errors = []
        for position in range(0, (len(token_list) - 1)):
            lemmalist = self.lemmatizer.lookup(token_list[position])
            lemmalist = lemmalist[1]
            lemma = max(lemmalist, key=itemgetter(1))
            if len(lemmalist) > 1:
                trials = trials + 1
                if lemma[0] == lemma_list[position] or lemma[0] == 'punc':
                    correct = correct + 1
                else:
                    errors.append(
                        (token_list[position], lemma[0], lemma_list[position]))
        print(correct)
        print(trials)
        print(len(lemma_list))
        rate = (len(lemma_list) - trials + correct) / len(lemma_list)
        print(rate)
        return errors
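# Hedged usage sketch for FrequencyModel: train_model() walks the locally
# installed Tesserae corpus under ~/cltk_data/latin/text/..., so it assumes
# that corpus is present; 'latin_frequency.pickle' is a hypothetical file
# name, not one defined by the class.
if __name__ == '__main__':
    model = FrequencyModel()
    model.train_model()                   # or: model.load_pickle('latin_frequency.pickle')
    model.save_pickle('latin_frequency.pickle')
    print(model.lemmatize('destinata'))   # [(lemma, probability), ...] weighted by corpus counts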
class GreekTokenizer(BaseTokenizer):
    def __init__(self, connection):
        super(GreekTokenizer, self).__init__(connection)

        # Set up patterns that will be reused
        self.vowels = 'αειηουωΑΕΙΗΟΥΩ'
        self.grave = '\u0300'
        self.acute = '\u0301'
        self.sigma = r'σ\b'  # raw string so \b is a word boundary, not a backspace
        self.sigma_alt = 'ς'
        self.word_characters = 'Ά-ώ' + self.sigma_alt + self.diacriticals

        self.diacrit_sub1 = \
            '([\s])([' + self.diacriticals + ']+)([' + self.vowels + ']{2,})'
        self.diacrit_sub2 = \
            '([\s])([' + self.diacriticals + ']+)([' + self.vowels + ']{1})'

        self.split_pattern = \
            '[<].+[>][\s]| / |[^\w' + self.diacriticals + self.sigma_alt + '\']'

        self.lemmatizer = Lemmata('lemmata', 'greek')

    def normalize(self, raw):
        """Normalize a single Greek word.

        Parameters
        ----------
        raw : str or list of str
            The word to normalize.

        Returns
        -------
        normalized : str
            The normalized string.
        """
        # Perform the global normalization
        normalized = super(GreekTokenizer, self).normalize(raw)

        # Convert grave accent to acute
        normalized = re.sub(self.grave, self.acute, normalized,
                            flags=re.UNICODE)

        # Remove diacriticals from vowels
        normalized = re.sub(self.diacrit_sub1, r'\1\3', normalized,
                            flags=re.UNICODE)
        normalized = re.sub(self.diacrit_sub2, r'\1\3\2', normalized,
                            flags=re.UNICODE)

        # Substitute sigmas
        normalized = re.sub(self.sigma, self.sigma_alt, normalized,
                            flags=re.UNICODE)

        # Remove single-quotes and digits
        normalized = re.sub(r'\'', '', normalized, flags=re.UNICODE)
        normalized = re.sub(r'[\'0-9]+', '', normalized, flags=re.UNICODE)

        return normalized

    def featurize(self, tokens):
        """Get the features for Greek tokens.

        Parameters
        ----------
        tokens : list of str
            The tokens to featurize.

        Returns
        -------
        features : list of dict
            The features for each token.

        Notes
        -----
        Input should be sanitized with `GreekTokenizer.normalize` prior to
        using this method.
        """
        features = []
        lemmata = self.lemmatizer.lookup(tokens)
        for i, l in enumerate(lemmata):
            features.append({'lemmata': [lem[0] for lem in l[1]]})
        return features