def lemmanade(lines):
    count = 0
    lemons = []
    # initialize cltk tools
    #jvReplace = JVReplacer()
    wordTokenizer = WordTokenizer('latin')
    lemmatizer = LemmaReplacer('latin')
    for verse in lines:
        count = count + 1
        # lowercase
        #verse = jvReplace.replace(verse.lower())
        # tokenize the words
        chunkTok = wordTokenizer.tokenize(verse.lower())
        chunkTok = [whiteTok(tok) for tok in chunkTok if whiteTok(tok) is not None]
        # lemmatize the tokens
        lemmata = lemmatizer.lemmatize(chunkTok)
        # collect the lemmatized tokens for this verse
        lemons.append(lemmata)
    return lemons
def __init__(self):
    self.sent_tokenizer = SentenceTokenizer()
    self.word_tokenizer = WordTokenizer('greek')
    self.corpus_reader = get_corpus_reader(corpus_name='greek_text_perseus', language='greek')
    self.lemmatizer = LemmaReplacer('greek')
    self.tfidf_vectorizer = TfidfVectorizer(input="filename")
def declineEachWordInList(word_list, error_type_context, logger):
    """
    A declension function which might not be needed anymore.
    """
    error_type = error_type_context + "-declension"
    error_count = 0
    total_list_length = len(word_list)
    words_string = ' '.join(word_list)
    normalized_string = normalizeLatinWordsInNonstandardGlyphs(words_string)
    jv_replaced_string = jv_replace(normalized_string)
    word_list = jv_replaced_string.split()
    lemmatizer = LemmaReplacer('latin')
    try:
        word_list = lemmatizer.lemmatize(word_list)
    except:
        print("Lemmatization error with " + words_string)
        #logger.addToLogger(error_type, "WARNING", "lemmatization failed: " + words_string)
        error_count += 1
    decliner = CollatinusDecliner()
    declined_forms = []
    for word in word_list:
        try:
            declined = decliner.decline(word, flatten=True)
            declined_forms = declined_forms + declined
        except:
            logger.addToLogger(error_type, "WARNING", "Lemma couldn't be declined: " + word)
            error_count += 1
    print('[' + str(error_count) + ' / ' + str(total_list_length) + '] declension errors')
    return declined_forms
def declineWord(word, error_type_context, logger):
    """
    A declension function which might not be needed anymore.
    """
    error_type = error_type_context + "-declension"
    error_count = 0
    normalized_string = normalizeLatinWordsInNonstandardGlyphs(word)
    jv_replaced_string = jv_replace(normalized_string)
    word_list = jv_replaced_string.split()
    lemmatizer = LemmaReplacer('latin')
    try:
        word_list = lemmatizer.lemmatize(word_list)
    except:
        print("Lemmatization error with " + word)
        #logger.addToLogger(error_type, "WARNING", "lemmatization failed: " + word)
        error_count += 1
    decliner = CollatinusDecliner()
    declined_forms = []
    for word in word_list:
        try:
            declined = decliner.decline(word, flatten=True)
            declined_forms = declined_forms + declined
        except:
            logger.addToLogger(error_type, "WARNING", "Lemma couldn't be declined: " + word)
            error_count += 1
    return declined_forms
def main():
    input = open('./Gratian1.txt', 'r').read()
    input = re.sub('[' + string.punctuation + ']', '', input)
    input = input.lower()
    lemmatizer = LemmaReplacer('latin')
    lemmata = lemmatizer.lemmatize(input)
    dictionary_1r = {}
    for lemma in lemmata:
        if lemma in dictionary_1r:
            dictionary_1r[lemma] += 1
        else:
            dictionary_1r[lemma] = 1
    # lemmata = dictionary_1r.keys()
    # for lemma in lemmata:
    #     print("%2d\t%s" % (dictionary_1r[lemma], lemma))
    input = open('./Gratian2.txt', 'r').read()
    input = re.sub('[' + string.punctuation + ']', '', input)
    input = input.lower()
    lemmata = lemmatizer.lemmatize(input)
    dictionary_2r = {}
    for lemma in lemmata:
        if lemma in dictionary_2r:
            dictionary_2r[lemma] += 1
        else:
            dictionary_2r[lemma] = 1
    lemmata = dictionary_2r.keys()
    for lemma in lemmata:
        if lemma not in dictionary_1r:
            print("%2d\t%s" % (dictionary_2r[lemma], lemma))
def test_lemmatizer_inlist_outlemma_greek(self):
    """Test the Greek lemmatizer."""
    replacer = LemmaReplacer('greek')
    unlemmatized = ['τὴν', 'διάγνωσιν', 'ἔρχεσθαι']
    lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=False)
    target = ['τὴν/τὴν', 'διάγνωσιν/διάγνωσις', 'ἔρχεσθαι/ἔρχομαι']
    self.assertEqual(lemmatized, target)
def test_lemmatizer_instr_greek(self):
    """Test the Greek lemmatizer."""
    replacer = LemmaReplacer('greek')
    unlemmatized = 'τὴν διάγνωσιν ἔρχεσθαι'
    lemmatized = replacer.lemmatize(unlemmatized, return_raw=False, return_string=False)
    target = ['τὴν', 'διάγνωσις', 'ἔρχομαι']
    self.assertEqual(lemmatized, target)
def test_lemmatizer_instr_outlemma_outstring_greek(self):
    """Test the Greek lemmatizer."""
    replacer = LemmaReplacer('greek')
    unlemmatized = 'τὴν διάγνωσιν ἔρχεσθαι'
    lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=True)
    target = 'τὴν/τὴν διάγνωσιν/διάγνωσις ἔρχεσθαι/ἔρχομαι'
    self.assertEqual(lemmatized, target)
def test_lemmatizer_inlist_latin(self):
    """Test the Latin lemmatizer."""
    replacer = LemmaReplacer('latin')
    unlemmatized = ['hominum', 'divomque', 'voluptas']
    lemmatized = replacer.lemmatize(unlemmatized, return_raw=False, return_string=False)
    target = ['homo', 'divus', 'voluptas']
    self.assertEqual(lemmatized, target)
def test_lemmatizer_inlist_outlemma_outstring_latin(self):
    """Test the Latin lemmatizer."""
    replacer = LemmaReplacer('latin')
    unlemmatized = ['hominum', 'divomque', 'voluptas']
    lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=True)
    target = 'hominum/homo divomque/divus voluptas/voluptas'
    self.assertEqual(lemmatized, target)
def test_lemmatizer_instr_outlemma_latin(self):
    """Test the Latin lemmatizer."""
    replacer = LemmaReplacer('latin')
    unlemmatized = 'hominum divomque voluptas'
    lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=False)
    target = ['hominum/homo', 'divomque/divus', 'voluptas/voluptas']
    self.assertEqual(lemmatized, target)
def lemmatize():
    req_data = request.get_json()
    if req_data and req_data.get('input_text'):
        input_text = req_data['input_text']
        lemmatizer = LemmaReplacer('greek')
        return jsonify(lemmatizer.lemmatize(input_text))
    return jsonify({})
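# Illustrative request/response for the lemmatize() handler above. The payload
# key matches the handler code; the route path and exact response shape are
# assumptions, not part of the original source:
#
#   POST <route>   Content-Type: application/json
#   {"input_text": "τὴν διάγνωσιν ἔρχεσθαι"}
#
# With the default LemmaReplacer settings this should return a JSON array of
# lemmata, e.g. ["τὴν", "διάγνωσις", "ἔρχομαι"] (cf. the Greek lemmatizer
# tests above).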
def test_lemmatizer_inlist_outstring_greek(self):
    """Test the Greek lemmatizer."""
    replacer = LemmaReplacer('greek')
    unlemmatized = ['τὴν', 'διάγνωσιν', 'ἔρχεσθαι']
    lemmatized = replacer.lemmatize(unlemmatized, return_raw=False, return_string=True)
    target = 'τὴν διάγνωσις ἔρχομαι'
    self.assertEqual(lemmatized, target)
def test_lemmatizer_instr_outstring_latin(self):
    """Test the Latin lemmatizer."""
    replacer = LemmaReplacer('latin')
    unlemmatized = 'hominum divomque voluptas'
    lemmatized = replacer.lemmatize(unlemmatized, return_raw=False, return_string=True)
    target = 'homo divus voluptas'
    self.assertEqual(lemmatized, target)
def main():
    corpus_importer = CorpusImporter('latin')
    corpora_list = corpus_importer.list_corpora
    print(corpora_list)
    corpus_importer.import_corpus('latin_models_cltk')
    sentence = 'Aeneadum genetrix, hominum divomque voluptas, alma Venus, caeli subter labentia signa quae mare navigerum, quae terras frugiferentis concelebras, per te quoniam genus omne animantum concipitur visitque exortum lumina solis.'
    sentence = sentence.lower()
    lemmatizer = LemmaReplacer('latin')
    lemmatized_sentence = lemmatizer.lemmatize(sentence)
    print(lemmatized_sentence)
def lemmatizeList(list):
    tagger = POSTag('greek')
    lemmatizer = LemmaReplacer('greek')
    lemmWords = lemmatizer.lemmatize(list)
    # Remove stopwords and numbers and lowercase all words.
    lemmWords = [w.lower() for w in lemmWords if not w in STOPS_LIST]
    lemmWords = removeNumbers(lemmWords)
    return lemmWords
def lemmatizeWord(word):
    """
    CLTK-based lemmatization function for a single word.
    Since CLTK lemmatization always returns a list, this returns only the
    first element of that list. If you want the whole list, or want to
    lemmatize more than one word, use lemmatizeAllWordsFromList.

    This function has no error checking in the form of try/except or
    anything else. It is possible that lemmatization fails and the
    returned string is empty.
    """
    lemmatizer = LemmaReplacer('latin')
    result = lemmatizer.lemmatize(word)  # always returns a list
    return result[0]
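# Illustrative calls for lemmatizeWord (not part of the original module).
# Assumes the latin_models_cltk corpus has been imported, as done elsewhere
# in these snippets; expected lemmata follow the Latin lemmatizer tests above:
#
#   lemmatizeWord('hominum')   # -> 'homo'
#   lemmatizeWord('divomque')  # -> 'divus'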
def lemmatize(self, return_string=True, return_raw=False): """Transforms words into their lemmata. Gives a new version of the text in which every word is lemmatized. All verbs are transformed into the first person singular present active, all nouns are transformed into the singular masculine nominative, et.c. Returns: :obj:`self.__class__` New version of the text with tokens transformed to their lemmata Example: >>> text = LatinText('Gallia est omnis divisa in partes tres') >>> print(text.lemmatize()) gallia edo1 omne divido in pars tres """ # noqa from cltk.stem.lemma import LemmaReplacer return self.__class__( text=LemmaReplacer( self.options['language'] ).lemmatize( self.data.lower(), return_string=return_string, return_raw=return_raw ), options=self.options )
def tokenize(text, language="latin"):
    jv_replacer = JVReplacer()
    text = jv_replacer.replace(text.lower())
    t = WordTokenizer(language)
    l = LemmaReplacer(language)
    text_word_tokens = t.tokenize(text)
    # Keep only words longer than three characters
    ## text_word_tokens = [token for token in text_word_tokens if token not in ['.', ',', ':', ';', '*']]
    text_word_tokens = [token for token in text_word_tokens if len(token) > 3]
    text_word_tokens = l.lemmatize(text_word_tokens)
    return text_word_tokens
def lemmatize(self, return_string=True, return_raw=False):  # pragma: no cover
    return self.__class__(
        data=LemmaReplacer(self.language).lemmatize(
            self.data.lower(),
            return_string=return_string,
            return_raw=return_raw),
        metadata=self.metadata)
def __init__(
    self, pathDF, language='english', dataType='pickle', dataIndex='multi',
    colname='text', maxValues=2500, pathMeta=False, pathType=False,
    showLogging=False, model_params=(4, 5, 300)
):
    super(CorpusML, self).__init__(
        pathDF, dataType, dataIndex, colname, maxValues, pathMeta, pathType
    )
    if showLogging:
        logging.basicConfig(
            format='%(asctime)s : %(levelname)s : %(message)s',
            level=logging.INFO
        )
    self.model = gensim.models.Word2Vec(
        workers=model_params[0],
        min_count=model_params[1],
        size=model_params[2]
    )
    # self.model.random.seed(42)
    self.language = language
    if self.language == 'latin' or self.language == 'greek':
        from cltk.corpus.utils.importer import CorpusImporter
        corpus_importer = CorpusImporter(self.language)
        corpus_importer.import_corpus(
            '{0}_models_cltk'.format(self.language)
        )
        from cltk.stem.lemma import LemmaReplacer
        from cltk.tokenize.word import nltk_tokenize_words as tokenizer
        lemmatizer = LemmaReplacer(self.language)
        if self.language == 'latin':
            from cltk.stem.latin.j_v import JVReplacer
            from cltk.stop.latin.stops import STOPS_LIST as stopwords
            self.jvreplacer = JVReplacer()
        elif self.language == 'greek':
            from cltk.stop.greek.stops import STOPS_LIST as stopwords
    elif self.language in ('english', 'german'):
        import nltk
        nltk.download('stopwords')
        from nltk.stem import WordNetLemmatizer
        from nltk.tokenize import word_tokenize as tokenizer
        from nltk.corpus import stopwords
        stopwords = stopwords.words(self.language)
        lemmatizer = WordNetLemmatizer()
    else:
        raise ValueError(
            'Could not find lemmatizer, tokenizer, '
            'and stopwords for chosen language.')
    self.lemmatizer = lemmatizer
    self.tokenizer = tokenizer
    self.stopwords = stopwords
def preprocess(doc):
    assert (type(doc) == str)
    word_tokenizer = WordTokenizer('latin')
    doc_word_tokens = word_tokenizer.tokenize(doc)
    doc_word_tokens_no_punt = [
        token.lower() for token in doc_word_tokens
        if token not in ['.', ',', ':', ';']
    ]
    # lemmatization
    corpus_importer = CorpusImporter('latin')
    corpus_importer.import_corpus('latin_models_cltk')
    jv_replacer = JVReplacer()
    lemmatizer = LemmaReplacer('latin')
    lemmata = lemmatizer.lemmatize(" ".join(doc_word_tokens_no_punt))
    cleaned = remove_latin_library_items(" ".join(lemmata))
    return cleaned
def main():
    jv = JVReplacer()
    more = 0
    lemmatizer = LemmaReplacer('latin')
    word_counts = {}
    lines = open(sys.argv[1])
    for line in lines:
        words = line.split()
        for i in range(0, len(words)):
            words[i] = jv.replace(remove_punctuation(words[i]).lower())
        for word in words:
            #if word in stops_augmented:
            #    continue
            if "&" in word:
                continue
            if len(lemmatizer.lemmatize(word)) == 0:
                more += 1
                continue
            stem = lemmatizer.lemmatize(word)[0]
            if not stem in word_counts:
                word_counts[stem] = 1
            else:
                word_counts[stem] = word_counts[stem] + 1
    words_to_show = 400
    sorted_words = sorted(word_counts.items(), key=operator.itemgetter(1), reverse=True)
    top_words = [word[0] for word in sorted_words][0:words_to_show]
    word_freqs = [word[1] for word in sorted_words][0:words_to_show]
    for i in range(0, words_to_show):
        print(str(i) + " " + top_words[i] + " " + str(word_freqs[i]))
    count = 0
    for i in range(0, words_to_show):
        count += word_freqs[i]
    print(str(count))
    print(str(more))
    s = np.arange(0.0, words_to_show, 1)
    t = word_freqs
    plt.plot(s, t)
    plt.ylabel('# appearances in Elegiae')
    plt.xlabel('rank of word frequency')
    plt.show()
def lemmatizeList(self, lines):
    from cltk.corpus.utils.formatter import cltk_normalize
    tagger = POSTag('greek')
    lemmatizer = LemmaReplacer('greek')
    # Normalization can help with certain texts (recommended by the docs)
    lines = cltk_normalize(lines)
    # print(lines)
    # exit(0)
    lines = lemmatizer.lemmatize(lines)
    # Remove stopwords and numbers and lowercase all words.
    lines = [w.lower() for w in lines if not w in STOPS_LIST]
    # lemmWords = removeNumbers(lemmWords)
    return ' '.join(lines)
def lemmatizeLat(self, tokenized_words: list, return_raw: bool = False) -> ([str]) or ([str], [str]):
    """
    Lemmatizes a given list of words against the CLTK Perseus corpus.
    If the second parameter is set to True, additionally returns a list in
    which each lemma is preceded by its "source word" and a "/" separator.

    :param tokenized_words: List of word strings to be lemmatized.
    :param return_raw: Boolean; decides whether the return also contains the raw "source word".
    :return: First index position -> list of lemmata. Second index position (only if return_raw is True) -> list of lemma strings with the source word attached before each lemma.
    """
    lemmatizer = LemmaReplacer('latin')
    lemmata: [str] = lemmatizer.lemmatize(tokenized_words, False)
    if return_raw:
        lemmata_with_source: [str] = lemmatizer.lemmatize(tokenized_words, True)
        return lemmata, lemmata_with_source
    else:
        return lemmata
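# Illustrative calls for lemmatizeLat (the instance name `helper` is
# hypothetical); inputs and outputs follow the Latin lemmatizer tests above:
#
#   lemmata = helper.lemmatizeLat(['hominum', 'divomque', 'voluptas'])
#   # -> ['homo', 'divus', 'voluptas']
#
#   lemmata, with_source = helper.lemmatizeLat(['hominum'], return_raw=True)
#   # with_source -> ['hominum/homo']  (source word before the "/", lemma after)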
def get_lemma(input_words, language):
    lang = None
    if language == "Latin":
        lemmatizer = LemmaReplacer("latin")
        # Required for CLTK module
        input_words = latin_lem_replacement(input_words)
    if language == "Greek":
        lemmatizer = LemmaReplacer("greek")
    if type(input_words) == list:
        results = lemmatizer.lemmatize(input_words)
        return results
    else:
        input_words = normalize_word(input_words)
        results = lemmatizer.lemmatize(input_words)
        if len(results) > 0:
            return results[0]
        else:
            return input_words
def runTest(text):
    '''Test cltk tools for latin'''
    print('Test phrase:')
    print(' -> ' + text)
    print()
    # print('[1/3] Testing JVReplacer')
    # jv = JVReplacer()
    # text = jv.replace(text)
    # print(' -> ' + text)
    # print()
    print('[2/3] Testing WordTokenizer')
    tokenizer = WordTokenizer('latin')
    tok = tokenizer.tokenize(text)
    print(' -> ' + ', '.join(["'{}'".format(t) for t in tok]))
    print()
    print('[3/3] Testing LemmaReplacer')
    lemmatizer = LemmaReplacer('latin')
    lem = lemmatizer.lemmatize(tok)
    print(' -> ' + ', '.join(["'{}'".format(l) for l in lem]))
    print()
def get_docs(letters):
    docs = []
    count = 0
    for i, entry in enumerate(letters):
        letter, tag = entry
        NO_PUNCT_RE = re.compile(r'[?!\.\'\"<>():;,]')
        replacer = JVReplacer()
        letter = replacer.replace(letter)
        words = re.sub(NO_PUNCT_RE, '', letter).lower().split()
        for i, word in enumerate(words):
            if word.endswith('-'):
                words[i + 1] = '%s%s' % (word.strip('-'), words[i + 1])
        words = [w for w in words if not w.endswith('-')]
        words = [w for w in words if w not in STOPS_LIST]
        words = ' '.join(words)
        lemmatizer = LemmaReplacer('latin')
        words = lemmatizer.lemmatize(words)
        count += len(words)
        doc = TaggedDocument(words, [tag])
        docs.append(doc)
    return docs
def pre_process(letters):
    pre_processed = []
    for letter in letters:
        NO_PUNCT_RE = re.compile(r'[?!\.\'\"<>():;,]')
        replacer = JVReplacer()
        letter = replacer.replace(letter)
        words = re.sub(NO_PUNCT_RE, '', letter).lower().split()
        for i, word in enumerate(words):
            if word.endswith('-'):
                words[i + 1] = '%s%s' % (word.strip('-'), words[i + 1])
        words = [w for w in words if not w.endswith('-')]
        words = [w for w in words if w not in STOPS_LIST]
        words = ' '.join(words)
        lemmatizer = LemmaReplacer('latin')
        words = lemmatizer.lemmatize(words)
        # very common words that seemed to be confounding the topic model
        words = [w for w in words if w not in ['magnus', 'bonus', 'ago', 'valeo']]
        pre_processed.append(words)
    return pre_processed
def entities(self, lemmatize=False, unique=False): """Returns a list of entities recognized in the text. Uses cltk's built in named-entity recognition. Reorganizes cltk's raw output from list of tuples to list of strings. Every entity recognized is added to the list returned. Unless unique option is set, entities which appear multiple times will be returned multiple times in the list. Args: lemmatize (:obj:`bool`, optional) Set True to lemmatize text before searching for entities unique (:obj:`bool`, optional) Set True and no entity appears in the return list more than once Example: >>> text = LatinText('Gallia est omnis divisa in partes tres') >>> print(text.entities()) ['Gallia'] """ # noqa from cltk.stem.lemma import LemmaReplacer from cltk.tag import ner entity_list = [] # filtering non-entities for result in ner.tag_ner( self.options['language'], input_text=self.data, output_type=list ): # appending if item flagged as entity in tuple[1] try: if result[1] == 'Entity': entity_list.append(result[0]) # do nothing if 'Entity' not specified except: pass # removing duplicate entities if unique option specified if unique: entity_list = list(set(entity_list)) # lemmatizing entities if option has been specified if lemmatize: entity_list = LemmaReplacer(self.options['language']).lemmatize( entity_list, return_string=False, return_raw=False ) return entity_list
def entities(self, lemmatize=False, unique=False):
    entity_list = []
    # filtering non-entities
    for result in ner.tag_ner(self.language, input_text=self.data, output_type=list):
        # appending if item flagged as entity in tuple[1]
        try:
            if result[1] == 'Entity':
                entity_list.append(result[0])
        # do nothing if 'Entity' not specified
        except:
            pass
    # removing duplicate entities if unique option specified
    if unique:
        entity_list = list(set(entity_list))
    # lemmatizing entities if option has been specified
    if lemmatize:
        entity_list = LemmaReplacer(self.language).lemmatize(
            entity_list, return_string=False, return_raw=False)
    return entity_list
def lemmata(text):
    lemmatizer = LemmaReplacer('greek')
    return [
        word for word in set(lemmatizer.lemmatize(text.lower()))
        if not word in STOPS_LIST
    ]
def flatten_list(word_list):
    flat_list = []
    for word in word_list:
        flat_list.append(word.text)
    return flat_list


def remove_digits(some_string):
    return ''.join([i for i in some_string if not i.isdigit()])


la_corpus_importer = CorpusImporter('latin')
la_corpus_importer.import_corpus('latin_text_latin_library')
la_corpus_importer.import_corpus('latin_models_cltk')
la_lemmatizer = LemmaReplacer('latin')

grc_corpus_importer = CorpusImporter('greek')
grc_corpus_importer.import_corpus('greek_models_cltk')
grc_lemmatizer = LemmaReplacer('greek')


def lemmatize(word_list, copy):
    for word in word_list:
        if copy:
            word.lemmatization = word.text
            return
        if word.language in LATIN_CODES:
            word.lemmatization = \
                remove_digits(la_lemmatizer.lemmatize(word.text)[0])
        elif word.language in GREEK_CODES:
            # Greek branch reconstructed to mirror the Latin branch above;
            # the original snippet was truncated at this point.
            word.lemmatization = \
                remove_digits(grc_lemmatizer.lemmatize(word.text)[0])
#import codecs

# Import module for XML
from xml.dom.minidom import parse, parseString

# For CLTK
#from cltk.corpus.utils.importer import CorpusImporter
#corpus_importer = CorpusImporter('latin')
#corpus_importer.import_corpus('latin_models_cltk')
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.tag.pos import POSTag

lemmatizer = LemmaReplacer('latin')
tagger = POSTag('latin')
j = JVReplacer()

text = []
#text = ['Gallia', 'est', 'omnis', 'divisa', 'in', 'partes', 'tres']
with open('/home/ilbuonme/siti/paolo.monella/ursus/lemma/recycleBin/textForOrig-myCLTK.txt', 'r') as f:
    for x in f.readlines():
        for w in x.split(' '):
            text.append(w)

for t in text:
    if t:
        # Note: the tagger likes 'divisa', while the lemmatizer likes 'diuisa'
        lemmaList = lemmatizer.lemmatize(t.lower())
        posList = tagger.tag_tnt(j.replace(t.lower()))
# - report to DC list/wiki

# Import modules
# For XML
from xml.dom.minidom import parse, parseString
import codecs
# For CLTK
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.tag.pos import POSTag

# Initialize CLTK
lemmatizer = LemmaReplacer('latin')
tagger = POSTag('latin')
j = JVReplacer()

# Parse XML
xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/casanatensis.xml')
#xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/shorter_casanatensis.xml')
wordElementList = xmldoc.getElementsByTagName('w')
for w in wordElementList:
    form = w.attributes['ana'].value
    print(form)
    # Parse the inflected word
    try:
        lemmaList = lemmatizer.lemmatize(form.lower())
def gen_docs(corpus, lemmatize, rm_stops):
    """Open and process files from a corpus. Return a list of sentences for an author.
    Each sentence is itself a list of tokenized words.
    """
    assert corpus in ['phi5', 'tlg']
    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        word_tokenizer = WordTokenizer('latin')
        if rm_stops:
            stops = latin_stops
        else:
            stops = None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        word_tokenizer = WordTokenizer('greek')
        if rm_stops:
            stops = latin_stops
        else:
            stops = None
    if lemmatize:
        lemmatizer = LemmaReplacer(language)
    sent_tokenizer = TokenizeSentence(language)
    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # light first-pass cleanup, before sentence tokenization (which relies on punctuation)
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)
        # doc_sentences = []
        for sentence in sent_tokens:
            # a second cleanup at sentence level, to remove all punctuation
            sentence = text_cleaner(sentence, rm_punctuation=True, rm_periods=True)
            sentence = word_tokenizer.tokenize(sentence)
            sentence = [s.lower() for s in sentence]
            sentence = [w for w in sentence if w]
            if language == 'latin':
                sentence = [w[1:] if w.startswith('-') else w for w in sentence]
            if stops:
                sentence = [w for w in sentence if w not in stops]
            sentence = [w for w in sentence if len(w) > 1]  # rm short words
            if sentence:
                sentence = sentence
            if lemmatize:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                yield sentence
def getLemma(self):
    lemmatizer = LemmaReplacer('latin')
    return lemmatizer.lemmatize(self.text)