def test_tag_ner_str_list_latin(self):
    """Test make_ner(), str, list."""
    text_str = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
    jv_replacer = JVReplacer()
    text_str_iu = jv_replacer.replace(text_str)
    tokens = ner.tag_ner("lat", input_text=text_str_iu, output_type=list)
    target = [
        ("ut",),
        ("Uenus", "Entity"),
        (",",),
        ("ut",),
        ("Sirius", "Entity"),
        (",",),
        ("ut",),
        ("Spica", "Entity"),
        (",",),
        ("ut",),
        ("aliae",),
        ("quae",),
        ("primae",),
        ("dicuntur",),
        ("esse",),
        ("mangitudinis",),
        (".",),
    ]
    self.assertEqual(tokens, target)
def get_tags(inputfile, outputfile):
    try:
        f = open(inputfile, 'r', encoding="utf-8")
        #f = codecs.open(inputfile, 'r', encoding='utf-8')
        try:
            x = f.read()
        except IOError as e:
            logfile.write("I/O error({0}): {1}\n".format(e.errno, e.strerror))
        except:  # handle other exceptions, such as attribute errors
            # sys.exc_info()[0] is a type object; wrap it in str() before
            # concatenating, otherwise this handler itself raises TypeError
            logfile.write("Unexpected error:\n" + str(sys.exc_info()[0]) + "\n")
        f.close()
        #print("x:", x)
        j = JVReplacer()
        x = x.lower()
        x = j.replace(x)
        ofile = open(outputfile, "w", encoding="utf-8")
        ofile.write(x)
        ofile.close()
        logfile.write("processing done\n")
    except IOError as e:
        logfile.write("I/O error({0}): {1}\n".format(e.errno, e.strerror))
    except:  # handle other exceptions, such as attribute errors
        logfile.write("Unexpected error: " + str(sys.exc_info()[0]) + "\n")
def stage1(text):
    '''
    Lowercases text, normalizes spelling by converting 'j' to 'i' and
    'v' to 'u', and removes punctuation.
    '''
    text = JVReplacer().replace(text.lower())
    words = re.split(r'\W', text)  # raw string avoids an invalid-escape warning
    return ' '.join([word for word in words if word != ''])
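# A minimal usage sketch for stage1() above, assuming the module-level
# imports the snippet relies on (re and cltk.stem.latin.j_v.JVReplacer)
# are in place. The expected output follows from JVReplacer's j -> i,
# v -> u mappings and is illustrative only.
print(stage1('Jam vero, Quirites!'))  # expected: 'iam uero quirites'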
def test_tag_ner_list_str_latin(self):
    """Test make_ner(), list, str."""
    text_list = ['ut', 'Venus', 'Sirius']
    jv_replacer = JVReplacer()
    text_list_iu = [jv_replacer.replace(x) for x in text_list]
    text = ner.tag_ner('latin', input_text=text_list_iu, output_type=str)
    target = ' ut Uenus/Entity Sirius/Entity'
    self.assertEqual(text, target)
def test_tag_ner_str_str_latin(self):
    """Test make_ner(), str, str."""
    jv_replacer = JVReplacer()
    text_str = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
    text_str_iu = jv_replacer.replace(text_str)
    text = ner.tag_ner("lat", input_text=text_str_iu, output_type=str)
    target = " ut Uenus/Entity, ut Sirius/Entity, ut Spica/Entity, ut aliae quae primae dicuntur esse mangitudinis."
    self.assertEqual(text, target)
def test_tag_ner_list_list_latin(self):
    """Test make_ner(), list, list."""
    text_list = ['ut', 'Venus', 'Sirius']
    jv_replacer = JVReplacer()
    text_list_iu = [jv_replacer.replace(x) for x in text_list]
    tokens = ner.tag_ner('latin', input_text=text_list_iu, output_type=list)
    target = [('ut',), ('Uenus', 'Entity'), ('Sirius', 'Entity')]
    self.assertEqual(tokens, target)
def test_tag_ner_str_list_latin(self):
    """Test make_ner(), str, list."""
    text_str = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
    jv_replacer = JVReplacer()
    text_str_iu = jv_replacer.replace(text_str)
    tokens = ner.tag_ner('latin', input_text=text_str_iu, output_type=list)
    target = [('ut',), ('Uenus', 'Entity'), (',',), ('ut',), ('Sirius', 'Entity'),
              (',',), ('ut',), ('Spica', 'Entity'), (',',), ('ut',), ('aliae',),
              ('quae',), ('primae',), ('dicuntur',), ('esse',), ('mangitudinis',), ('.',)]
    self.assertEqual(tokens, target)
def test_tag_ner_list_str_latin(self):
    """Test make_ner(), list, str."""
    text_list = ["ut", "Venus", "Sirius"]
    jv_replacer = JVReplacer()
    text_list_iu = [jv_replacer.replace(x) for x in text_list]
    text = ner.tag_ner("lat", input_text=text_list_iu, output_type=str)
    target = " ut Uenus/Entity Sirius/Entity"
    self.assertEqual(text, target)
def test_tag_ner_list_list_latin(self):
    """Test make_ner(), list, list."""
    text_list = ["ut", "Venus", "Sirius"]
    jv_replacer = JVReplacer()
    text_list_iu = [jv_replacer.replace(x) for x in text_list]
    tokens = ner.tag_ner("lat", input_text=text_list_iu, output_type=list)
    target = [("ut",), ("Uenus", "Entity"), ("Sirius", "Entity")]
    self.assertEqual(tokens, target)
def __init__(self, pathDF, language='english', dataType='pickle',
             dataIndex='multi', colname='text', maxValues=2500,
             pathMeta=False, pathType=False, showLogging=False,
             model_params=(4, 5, 300)):
    super(CorpusML, self).__init__(
        pathDF, dataType, dataIndex, colname, maxValues, pathMeta, pathType
    )
    if showLogging:
        logging.basicConfig(
            format='%(asctime)s : %(levelname)s : %(message)s',
            level=logging.INFO
        )
    self.model = gensim.models.Word2Vec(
        workers=model_params[0],
        min_count=model_params[1],
        size=model_params[2]
    )
    # self.model.random.seed(42)
    self.language = language
    if self.language == 'latin' or self.language == 'greek':
        from cltk.corpus.utils.importer import CorpusImporter
        corpus_importer = CorpusImporter(self.language)
        corpus_importer.import_corpus('{0}_models_cltk'.format(self.language))
        from cltk.stem.lemma import LemmaReplacer
        from cltk.tokenize.word import nltk_tokenize_words as tokenizer
        lemmatizer = LemmaReplacer(self.language)
        if self.language == 'latin':
            from cltk.stem.latin.j_v import JVReplacer
            from cltk.stop.latin.stops import STOPS_LIST as stopwords
            self.jvreplacer = JVReplacer()
        elif self.language == 'greek':
            from cltk.stop.greek.stops import STOPS_LIST as stopwords
    elif self.language in ('english', 'german'):
        # The original condition `self.language == 'english' or 'german'`
        # was always truthy, so the ValueError below could never fire;
        # a membership test is what was intended.
        import nltk
        nltk.download('stopwords')
        from nltk.stem import WordNetLemmatizer
        from nltk.tokenize import word_tokenize as tokenizer
        from nltk.corpus import stopwords
        stopwords = stopwords.words(self.language)
        lemmatizer = WordNetLemmatizer()
    else:
        raise ValueError('Could not find lemmatizer, tokenizer, '
                         'and stopwords for chosen language.')
    self.lemmatizer = lemmatizer
    self.tokenizer = tokenizer
    self.stopwords = stopwords
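# Hedged usage sketch for CorpusML: the path and column name are
# hypothetical placeholders; the defaults above suggest pathDF points at
# a pickled DataFrame whose 'text' column holds the documents.
corpus = CorpusML('data/corpus.pickle', language='latin', colname='text')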
def get_sims(word, language, lemmatized=False, threshold=0.70):
    """Get similar Word2Vec terms from vocabulary or trained model.

    TODO: Add option to install corpus if not available.
    """
    # Normalize incoming word string
    jv_replacer = JVReplacer()
    if language == 'latin':
        # Note that casefold() seemingly does not work with diacritic
        # Greek, likely because it expects single code points, not
        # diacritics. Look into global string normalization to code points
        # for all languages, especially Greek.
        word = jv_replacer.replace(word).casefold()
    model_dirs = {
        'greek': '~/cltk_data/greek/model/greek_word2vec_cltk',
        'latin': '~/cltk_data/latin/model/latin_word2vec_cltk'
    }
    assert language in model_dirs.keys(), \
        'Languages available with Word2Vec model: {}'.format(model_dirs.keys())
    if lemmatized:
        lemma_str = '_lemmed'
    else:
        lemma_str = ''
    model_name = '{0}_s100_w30_min5_sg{1}.model'.format(language, lemma_str)
    model_dir_abs = os.path.expanduser(model_dirs[language])
    model_path = os.path.join(model_dir_abs, model_name)
    w2v = Word2Vec()
    try:
        model = w2v.load(model_path)
    except FileNotFoundError as fnf_error:
        print(fnf_error)
        print("CLTK's Word2Vec models cannot be found. "
              "Please import '{}_word2vec_cltk'.".format(language))
        raise
    try:
        similars = model.most_similar(word)
    except KeyError as key_err:
        print(key_err)
        possible_matches = []
        for term in model.vocab:
            if term.startswith(word[:3]):
                possible_matches.append(term)
        print("You may be looking for one of these terms in the Word2Vec "
              "model: '{}'.".format(possible_matches))
        return None
    returned_sims = []
    for similar in similars:
        if similar[1] > threshold:
            returned_sims.append(similar[0])
    if not returned_sims:
        print("Matches found, but below the threshold of 'threshold={}'. "
              "Lower it to see these results.".format(threshold))
    return returned_sims
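# Hedged usage sketch for get_sims(): the query word is JV-normalized and
# casefolded internally, so mixed spellings like 'Verbum' are fine. This
# assumes the CLTK Latin Word2Vec model has already been imported into
# ~/cltk_data; the result below is illustrative, not an actual model output.
similar_terms = get_sims('verbum', 'latin', lemmatized=False, threshold=0.7)
print(similar_terms)  # list of vocabulary items with similarity above 0.7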
def __init__(self, connection):
    super(LatinTokenizer, self).__init__(connection)

    # Set up patterns that will be reused
    self.jv_replacer = JVReplacer()
    self.lemmatizer = Lemmata('lemmata', 'lat')
    self.split_pattern = \
        '( / )|([\\s]+)|([^\\w' + self.diacriticals + ']+)'
def __init__(self, connection):
    super(LatinTokenizer, self).__init__(connection)

    # Set up patterns that will be reused
    self.jv_replacer = JVReplacer()
    self.lemmatizer = Lemmata('lemmata', 'latin')
    self.split_pattern = \
        '[<].+[>][\\s]| / | \\. \\. \\.|\\.~\\.~\\.|[^\\w' + self.diacriticals + ']'
def test_tag_ner_str_str_latin(self):
    """Test make_ner(), str, str."""
    text_str = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
    jv_replacer = JVReplacer()  # the original instantiated JVReplacer twice; once is enough
    text_str_iu = jv_replacer.replace(text_str)
    text = ner.tag_ner('latin', input_text=text_str_iu, output_type=str)
    target = ' ut Uenus/Entity, ut Sirius/Entity, ut Spica/Entity, ut aliae quae primae dicuntur esse mangitudinis.'
    self.assertEqual(text, target)
def jv_replace(text):
    """Perform CLTK-based j/v replacement."""
    jv_replacer = JVReplacer()
    jv_normalized_text = jv_replacer.replace(text)
    # No lowercasing or truecasing is done so far. Lowercasing probably
    # won't be done, but truecasing needs a bag of words first.
    return jv_normalized_text
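# Hedged usage sketch for jv_replace(): CLTK's JVReplacer maps j -> i and
# v -> u in both lower and upper case, so case is preserved while the
# spelling is normalized.
print(jv_replace('Julius vivat'))  # expected: 'Iulius uiuat'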
def __init__(self):
    self.lemmatizer = Lemmata(dictionary='lemmata', language='latin')
    self.jv = JVReplacer()
    self.word_tokenizer = WordTokenizer('latin')
    self.count_dictionary = dict()
    self.punctuation_list = ['!', ';', ':', '?', '-', '–', '&', '*',
                             '(', ')', '[', ']', ',', '"', '\'']
def get_tags(inputfile, outputfile):
    tree = ET.ElementTree(file=inputfile)
    root = tree.getroot()
    j = JVReplacer()
    for w in root.iter('w'):
        if w.text is not None:
            w.text = w.text.lower()
            w.text = j.replace(w.text)
    tree.write(outputfile, xml_declaration=True, encoding="utf-8")
def latin_lem_replacement(input_words):
    replacer = JVReplacer()
    if isinstance(input_words, list):
        # Normalize each word in place
        for i in range(len(input_words)):
            input_words[i] = normalize_word(replacer.replace(input_words[i]))
    else:
        input_words = normalize_word(replacer.replace(input_words))
    return input_words
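# Hedged usage sketch for latin_lem_replacement(): normalize_word() is an
# external helper defined elsewhere in this codebase; here we only assume
# it returns its argument with Unicode normalization applied. Note that a
# list argument is modified in place as well as returned.
print(latin_lem_replacement('Vivat'))          # single string
print(latin_lem_replacement(['Jam', 'vero']))  # list, mutated in place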
def jv_transform(string_matrix: List[List[str]]) -> List[List[str]]:
    """
    :param string_matrix: a data matrix: a list wrapping a list of strings,
    with each sublist being a sentence.

    >>> jv_transform([['venio', 'jacet'], ['julius', 'caesar']])
    [['uenio', 'iacet'], ['iulius', 'caesar']]
    """
    jvreplacer = JVReplacer()
    return [[jvreplacer.replace(word) for word in sentence]
            for sentence in string_matrix]
def test_roman_numeral_lemmatizer(self):
    """Test roman_numeral_lemmatizer()"""
    lemmatizer = RomanNumeralLemmatizer()
    test_str = 'i ii iii iv v vi vii vii ix x xx xxx xl l lx c cc'
    target = [('i', 'NUM'), ('ii', 'NUM'), ('iii', 'NUM'), ('iu', 'NUM'),
              ('u', 'NUM'), ('ui', 'NUM'), ('uii', 'NUM'), ('uii', 'NUM'),
              ('ix', 'NUM'), ('x', 'NUM'), ('xx', 'NUM'), ('xxx', 'NUM'),
              ('xl', 'NUM'), ('l', 'NUM'), ('lx', 'NUM'), ('c', 'NUM'),
              ('cc', 'NUM')]  # pylint: disable=line-too-long
    jv_replacer = JVReplacer()
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = test_str.split()
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
def test_bigram_pos_lemmatizer(self):
    train = [[('dixissem', 'dico', 'v')],
             [('de', 'de', 'r'), ('te', 'tu', 'p'), ('autem', 'autem', 'c'),
              (',', 'punc', 'u'), ('catilina', 'catilina', 'n'), (',', 'punc', 'u'),
              ('cum', 'cum2', 'c'), ('quiescunt', 'quiesco', 'v'), (',', 'punc', 'u'),
              ('probant', 'probo', 'v'), (',', 'punc', 'u'), ('cum', 'cum2', 'c'),
              ('patiuntur', 'patior', 'v'), (',', 'punc', 'u'), ('decernunt', 'decerno', 'v'),
              (',', 'punc', 'u'), ('cum', 'cum2', 'c'), ('tacent', 'taceo', 'v'),
              (',', 'punc', 'u'), ('clamant', 'clamo', 'v'), (',', 'punc', 'u'),
              ('neque', 'neque', 'c'), ('hi', 'hic', 'p'), ('solum', 'solus', 'd'),
              ('quorum', 'qui', 'p'), ('tibi', 'tu', 'p'), ('auctoritas', 'auctoritas', 'n'),
              ('est', 'sum', 'v'), ('uidelicet', 'uidelicet', 'd'), ('cara', 'carus', 'a'),
              (',', 'punc', 'u'), ('uita', 'uita', 'n'), ('uilissima', 'uilis', 'a'),
              (',', 'punc', 'u'), ('sed', 'sed', 'c'), ('etiam', 'etiam', 'c'),
              ('illi', 'ille', 'p'), ('equites', 'eques', 'n'), ('romani', 'romanus', 'a'),
              (',', 'punc', 'u'), ('honestissimi', 'honestus', 'a'), ('atque', 'atque', 'c'),
              ('optimi', 'bonus', 'a'), ('uiri', 'uir', 'n'), (',', 'punc', 'u'),
              ('ceteri', 'ceterus', 'a'), ('-que', '-que', 'c'), ('fortissimi', 'fortis', 'a'),
              ('ciues', 'ciuis', 'n'), ('qui', 'qui', 'p'), ('circumstant', 'circumsto', 'v'),
              ('senatum', 'senatus', 'n'), (',', 'punc', 'u'), ('quorum', 'qui', 'p'),
              ('tu', 'tu', 'p'), ('et', 'et', 'c'), ('frequentiam', 'frequentia', 'n'),
              ('uidere', 'uideo', 'v'), ('et', 'et', 'c'), ('studia', 'studium', 'n'),
              ('perspicere', 'perspicio', 'v'), ('et', 'et', 'c'), ('uoces', 'uox', 'n'),
              ('paulo', 'paulus', 'd'), ('ante', 'ante', 'd'), ('exaudire', 'exaudio', 'v'),
              ('potuisti', 'possum', 'v'), ('.', 'punc', 'u')]]
    lemmatizer = BigramPOSLemmatizer(train=train, include=['cum'])
    test_str = """Quod cum esset intellectum et animadversum fecit animo libentissimo populus Romanus"""
    target = [('quod', None), ('cum', 'cum2'), ('esset', None), ('intellectum', None),
              ('et', None), ('animaduersum', None), ('fecit', None), ('animo', None),
              ('libentissimo', None), ('populus', None), ('romanus', None)]  # pylint: disable=line-too-long
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
def test_identity_lemmatizer(self):
    """Test identity_lemmatizer()"""
    lemmatizer = IdentityLemmatizer()
    test_str = 'Ceterum antequam destinata componam'
    target = [('ceterum', 'ceterum'), ('antequam', 'antequam'),
              ('destinata', 'destinata'), ('componam', 'componam')]  # pylint: disable=line-too-long
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
def test_latin_lemmata(self):
    """Test Lemmata class lookup() method"""
    lemmatizer = Lemmata(dictionary='lemmata', language='latin')
    test_str = 'Ceterum antequam destinata componam'
    target = [('ceterum', [('ceterus', 1.0)]),
              ('antequam', [('antequam', 1.0)]),
              ('destinata', [('destinatus', 0.25), ('destinatum', 0.25),
                             ('destinata', 0.25), ('destino', 0.25)]),
              ('componam', [('compono', 1.0)])]  # pylint: disable=line-too-long
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lookup(tokens)
    self.assertEqual(lemmas, target)
def test_backoff_latin_lemmatizer_verbose(self):
    """Test backoffLatinLemmatizer"""
    train = [[('ceterum', 'ceterus'), ('antequam', 'antequam'),
              ('destinata', 'destino'), ('componam', 'compono')]]  # pylint: disable=line-too-long
    lemmatizer = BackoffLatinLemmatizer(verbose=True)
    test_str = """Ceterum antequam destinata componam"""
    target = [('ceterum', 'ceterum', '<UnigramLemmatizer: CLTK Sentence Training Data>'),
              ('antequam', 'antequam', '<UnigramLemmatizer: CLTK Sentence Training Data>'),
              ('destinata', 'destino', '<UnigramLemmatizer: CLTK Sentence Training Data>'),
              ('componam', 'compono', '<DictLemmatizer: Morpheus Lemmas>')]  # pylint: disable=line-too-long
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
def test_regex_lemmatizer(self):
    """Test regex_lemmatizer()"""
    pattern = [(r'(\w*)abimus', 'o')]
    lemmatizer = RegexpLemmatizer(pattern)
    test_str = 'amabimus'
    target = [('amabimus', 'amo')]
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
def test_regex_lemmatizer(self):
    """Test regex_lemmatizer()"""
    sub = [('(.)ab(o|is|it|imus|itis|unt)$', r'\1o')]
    lemmatizer = RegexpLemmatizer(sub)
    test_str = 'amabimus'
    target = [('amabimus', 'amo')]
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
def test_model_lemmatizer(self):
    """Test model_lemmatizer()"""
    model = {'ceterum': 'ceterus', 'antequam': 'antequam',
             'destinata': 'destino', 'componam': 'compono'}  # pylint: disable=line-too-long
    lemmatizer = TrainLemmatizer(model=model)
    test_str = 'Ceterum antequam destinata componam'
    target = [('ceterum', 'ceterus'), ('antequam', 'antequam'),
              ('destinata', 'destino'), ('componam', 'compono')]  # pylint: disable=line-too-long
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
def main():
    # setup()
    # Use context managers so the file handles are closed promptly
    with open('./Gratian1.txt', 'r') as f:
        a = f.read()
    with open('./Gratian2.txt', 'r') as f:
        b = f.read()
    a_lemmas = process(JVReplacer().replace(a.lower()))
    b_lemmas = process(JVReplacer().replace(b.lower()))
    a_only = [lemma for lemma in a_lemmas if lemma not in b_lemmas]
    b_only = [lemma for lemma in b_lemmas if lemma not in a_lemmas]
    a_only.sort()
    b_only.sort()
    print(a_only)
    print(b_only)
def test_roman_numeral_lemmatizer_with_default(self):
    """Test roman_numeral_lemmatizer()"""
    rn_patterns = [(r'(?=^[MDCLXVUI]+$)(?=^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|IU|V?I{0,3}|U?I{0,3})$)', 'NUM'),
                   (r'(?=^[mdclxvui]+$)(?=^m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|iu|v?i{0,3}|u?i{0,3})$)', 'NUM')]
    lemmatizer = RomanNumeralLemmatizer(rn_patterns, default="RN")
    test_str = 'i ii'
    target = [('i', 'RN'), ('ii', 'RN')]  # pylint: disable=line-too-long
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
def test_unigram_lemmatizer(self):
    """Test unigram_lemmatizer()"""
    train = [[('ceterum', 'ceterus'), ('antequam', 'antequam'),
              ('destinata', 'destino'), ('componam', 'compono')]]  # pylint: disable=line-too-long
    lemmatizer = UnigramLemmatizer(train=train)
    test_str = """Ceterum antequam destinata componam"""
    target = [('ceterum', 'ceterus'), ('antequam', 'antequam'),
              ('destinata', 'destino'), ('componam', 'compono')]  # pylint: disable=line-too-long
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
def test_latin_pp_lemmatizer(self):
    """Test latin_pp_lemmatizer()"""
    pattern = [(r'(\w*)[a|ie]bimus\b', 1)]
    pps = {'amo': [1, 'am', 'amare', 'amau', 'amat']}
    lemmatizer = PPLemmatizer(pattern, pps=pps)
    test_str = 'amabimus'
    target = [('amabimus', 'amo')]
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
def normalize(self, text, stopwords):
    """Normalize the text."""
    jv = JVReplacer()
    punkt = RegexpTokenizer(r'\w+')
    data = []
    for citation in text:
        line = punkt.tokenize(jv.replace(citation.text.lower()))
        data = data + line
        self.refs = self.refs + [citation.ref] * len(line)
    if stopwords:
        return " ".join([lem for lem in data if lem not in STOPS_LIST_LATIN])
    else:
        return " ".join(data)
def tokenize(text, language="latin"):
    jv_replacer = JVReplacer()
    text = jv_replacer.replace(text.lower())
    t = WordTokenizer(language)
    lemmatizer = LemmaReplacer(language)  # renamed from `l`, easily misread as `1`
    text_word_tokens = t.tokenize(text)
    # Keep only words longer than three characters
    ## text_word_tokens = [token for token in text_word_tokens if token not in ['.', ',', ':', ';', '*']]
    text_word_tokens = [token for token in text_word_tokens if len(token) > 3]
    text_word_tokens = lemmatizer.lemmatize(text_word_tokens)
    return text_word_tokens
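# Hedged usage sketch for tokenize(): assumes the CLTK Latin models corpus
# (needed by LemmaReplacer) has already been imported. Short function words
# such as 'est' and 'in' are dropped by the length filter.
lemmas = tokenize('Gallia est omnis divisa in partes tres.')
print(lemmas)  # lemmatized tokens longer than three characters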
def createNERListFromCorpus(string):
    """
    Run the CLTK NER tagger over a corpus (as a string), performing
    j/v replacement in the process.
    """
    ner_list = []
    jv_replacer = JVReplacer()
    text_str_iu = jv_replacer.replace(string)
    corpus_ner = ner.tag_ner('latin', input_text=text_str_iu)
    for tup in corpus_ner:
        if len(tup) > 1:
            ner_list.append(tup[0])
    NER_unique_values = set(ner_list)
    print('These NER were found in the given corpus:')
    print(NER_unique_values)
    return ner_list
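# Hedged usage sketch for createNERListFromCorpus(): tag_ner() yields
# 1-tuples for plain tokens and ('token', 'Entity') 2-tuples for hits,
# which is why the function keeps only tuples of length > 1.
entities = createNERListFromCorpus('ut Venus, ut Sirius, ut Spica')
# expected to include 'Uenus', 'Sirius', and 'Spica' after j/v replacement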
def lemmatize(fname, tokenizer, lemmatizer):
    jv = JVReplacer()
    lemmacounts = {}
    formcounts = {}
    lemmaforms = {}
    with open(fname, "r") as f:
        i = 0
        hangingword = ""
        # t = time.time()
        for line in f:
            # Re-attach a word hyphenated across the previous line break
            line = hangingword.replace("-", "") + line.strip()
            if line and line[-1] == "-":
                splitline = line.split(" ")
                # The original did `" ".join(splitline[-1])`, which splices
                # spaces between the characters of the hanging word; keep
                # the last token intact instead.
                hangingword = splitline[-1]
                line = " ".join(splitline[0:-1])
            else:
                hangingword = ""
            noaccents = (remove_accents(line).replace("'", "").replace("/", "")
                         .replace("-", "").replace("!", "").replace("?", "")
                         .replace(".", ""))
            lemmatized = lemmatizer.lemmatize(
                tokenizer.tokenize(jv.replace(noaccents.lower())))
            for form, lemma in lemmatized:
                if lemma.lower() == "punc" or lemma.lower() == "period":
                    continue
                if form not in formcounts:
                    formcounts[form] = 0
                formcounts[form] += 1
                if lemma not in lemmacounts:
                    lemmacounts[lemma] = 0
                    lemmaforms[lemma] = set([])
                lemmacounts[lemma] += 1
                lemmaforms[lemma].add(form)
            i += 1
            if not i % 100:
                print(basename(fname), i)
                # print(basename(fname), i, time.time() - t)
                # t = time.time()
    return formcounts, lemmacounts, lemmaforms
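# Hedged usage sketch for lemmatize(): the tokenizer and lemmatizer are
# whatever objects expose .tokenize() and .lemmatize() returning
# (form, lemma) pairs, e.g. the CLTK WordTokenizer and a backoff
# lemmatizer as used in the surrounding snippets.
formcounts, lemmacounts, lemmaforms = lemmatize('corpus.txt', tokenizer, lemmatizer)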
def test_latin_synonyms(self):
    """Test Synonym class lookup() function and Lemmata class isolate() method"""
    # First build the lemma list as in test_latin_lemmata()
    lemmatizer = Lemmata(dictionary='lemmata', language='latin')
    test_str = 'Ceterum antequam destinata componam'
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lookup(tokens)
    # Now isolate the list of lemmas
    lemmas = lemmatizer.isolate(lemmas)
    synonyms = Synonyms(dictionary='synonyms', language='latin')
    syns = synonyms.lookup_synonyms(lemmas)
    target = [('ceterus', [('ceteroqui', 0.5), ('perquiesco', 0.5)]),
              ('compono', [('struo', 0.5), ('condo', 0.5)])]
    self.assertEqual(syns, target)
def test_latin_translations(self):
    """Test Synonym class lookup() function and Lemmata class isolate() method"""
    # First build the lemma list as in test_latin_lemmata()
    lemmatizer = Lemmata(dictionary='lemmata', language='latin')
    test_str = 'Ceterum antequam destinata componam'
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lookup(tokens)
    # Now isolate the list of lemmas
    lemmas = lemmatizer.isolate(lemmas)
    translations = Synonyms(dictionary='translations', language='latin')
    translations = translations.lookup_synonyms(lemmas)
    target = [('destino', [('σκοπός', 1.0)]), ('compono', [('συντίθημι', 1.0)])]
    self.assertEqual(translations, target)
def test_latin_i_u_transform(self):
    """Test converting ``j`` to ``i`` and ``v`` to ``u``."""
    jv_replacer = JVReplacer()
    trans = jv_replacer.replace('vem jam VEL JAM')
    self.assertEqual(trans, 'uem iam UEL IAM')
def gen_docs(corpus, lemmatize, rm_stops):
    """Open and process files from a corpus. Return a list of sentences for an author.
    Each sentence is itself a list of tokenized words.
    """
    assert corpus in ['phi5', 'tlg']
    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        word_tokenizer = WordTokenizer('latin')
        if rm_stops:
            stops = latin_stops
        else:
            stops = None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        word_tokenizer = WordTokenizer('greek')
        if rm_stops:
            stops = latin_stops  # NB: reuses the Latin stoplist; a Greek stoplist may be intended
        else:
            stops = None
    if lemmatize:
        lemmatizer = LemmaReplacer(language)
    sent_tokenizer = TokenizeSentence(language)
    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # light first-pass cleanup, before sentence tokenization (which relies on punctuation)
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)
        for sentence in sent_tokens:
            # a second cleanup at sentence-level, to rm all punctuation
            sentence = text_cleaner(sentence, rm_punctuation=True, rm_periods=True)
            # WordTokenizer instances are not callable; use .tokenize()
            sentence = word_tokenizer.tokenize(sentence)
            sentence = [s.lower() for s in sentence]
            sentence = [w for w in sentence if w]
            if language == 'latin':
                sentence = [w[1:] if w.startswith('-') else w for w in sentence]
            if stops:
                sentence = [w for w in sentence if w not in stops]
            sentence = [w for w in sentence if len(w) > 1]  # rm short words
            if lemmatize and sentence:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                yield sentence
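# Hedged usage sketch for gen_docs(): it is a generator, so iterate over it
# lazily. Assumes the PHI5 corpus has been imported and converted locally.
for sent in gen_docs('phi5', lemmatize=False, rm_stops=True):
    print(sent)  # one list of cleaned, JV-normalized tokens per sentence
    break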
# Import modules
# For XML
from xml.dom.minidom import parse, parseString
import codecs
# For CLTK
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.tag.pos import POSTag

# Initialize CLTK
lemmatizer = LemmaReplacer('latin')
tagger = POSTag('latin')
j = JVReplacer()

# Parse XML
xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/casanatensis.xml')
#xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/shorter_casanatensis.xml')
wordElementList = xmldoc.getElementsByTagName('w')
for w in wordElementList:
    form = w.attributes['ana'].value
    print(form)
    # Parse the inflected word
    try:
        lemmaList = lemmatizer.lemmatize(form.lower())
        lemma = lemmaList[0].replace('v', 'u')
        posList = tagger.tag_tnt(j.replace(form.lower()))
    # The original snippet breaks off here, mid-try; a minimal handler is
    # added only so the block is syntactically complete.
    except Exception as exc:
        print(exc)