def process_document(self, doc):
    cleaned_sents = []
    for paragraph in doc['text'].values():
        if not isinstance(paragraph, str):
            # nested section: iterate over its sub-values below
            paragraph = paragraph.values()
        else:
            paragraph = self.sent_tokenizer.tokenize(paragraph)
        for sent in paragraph:
            if isinstance(sent, dict):
                for subsent in sent.values():
                    tokenized = self.sent_tokenizer.tokenize(subsent)
                    for token in tokenized:
                        cleaned = tlg_plaintext_cleanup(
                            token, rm_punctuation=True, rm_periods=True)
                        sentence = cltk_normalize(cleaned)
                        # keep only sentences longer than five words
                        if len(self.word_tokenizer.tokenize(sentence)) > 5:
                            cleaned_sents.append(sentence)
            else:
                tokenized = self.sent_tokenizer.tokenize(sent)
                for token in tokenized:
                    cleaned = tlg_plaintext_cleanup(
                        token, rm_punctuation=True, rm_periods=True)
                    sentence = cltk_normalize(cleaned)
                    if len(self.word_tokenizer.tokenize(sentence)) > 5:
                        cleaned_sents.append(sentence)
    return cleaned_sents
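# A minimal usage sketch for `process_document` above, assuming it is
# available as a module-level function. The wrapper class and tokenizer
# choices here are assumptions, not part of the original: the method only
# needs an object exposing `sent_tokenizer`/`word_tokenizer` and a
# Perseus-style dict of paragraphs under doc['text'] (pre-1.0 CLTK API,
# with the greek_models_cltk corpus installed).
from cltk.tokenize.sentence import TokenizeSentence
from cltk.tokenize.word import WordTokenizer

class DocProcessor:
    sent_tokenizer = TokenizeSentence('greek')
    word_tokenizer = WordTokenizer('greek')
    process_document = process_document  # reuse the function defined above

doc = {'text': {'1': 'ἄνδρα μοι ἔννεπε μοῦσα πολύτροπον ὃς μάλα πολλὰ '
                     'πλάγχθη ἐπεὶ Τροίης ἱερὸν πτολίεθρον ἔπερσεν.'}}
print(DocProcessor().process_document(doc))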
def test_cltk_normalize_compatible(self):
    """Test Normalizing Text with compatibility True"""
    s1 = "café"
    s2 = "cafe\u0301"
    normalized_text = cltk_normalize(s1, compatibility=True)
    target = normalize("NFKC", s2)
    self.assertEqual(normalized_text, target)
def test_cltk_normalize_noncompatible(self):
    """Test Normalizing Text with compatibility False"""
    s1 = 'café'
    s2 = 'cafe\u0301'
    normalized_text = cltk_normalize(s1, compatibility=False)
    target = normalize('NFC', s2)
    self.assertEqual(normalized_text, target)
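# The two tests above turn on Unicode equivalence: "café" with a
# precomposed é (U+00E9) and "cafe\u0301" with a combining acute accent
# look identical but compare unequal until both are normalized. A minimal
# stdlib-only sketch of the same point:
from unicodedata import normalize

s1 = "caf\u00e9"   # precomposed é
s2 = "cafe\u0301"  # e + combining acute accent
assert s1 != s2                                      # raw strings differ
assert normalize("NFC", s1) == normalize("NFC", s2)  # canonical forms agree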
def main():
    if len(sys.argv) < 2:
        print("Please supply an inflected word on the command line. "
              "Example: search_by_lemma.py κύνεσσιν\n")
        sys.exit()
    infl = sys.argv[1]
    lem = lemmatize(infl)[0]  # lemmatized form
    print("searching for " + lem + " <- " + infl)
    index = {}
    for work in ["iliad", "odyssey"]:
        for book in range(1, 24 + 1):  # books 1 through 24
            filename = 'texts/homer.' + work + '.part.' + str(book) + '.tess'
            reader = get_corpus_reader(corpus_name='greek_text_tesserae',
                                       language='greek')
            reader._fileids = [filename]
            sentences = list(reader.sents([filename]))
            sentences = [cltk_normalize(s) for s in sentences]
            count_sentences = 0
            for s in sentences:
                count_sentences += 1
                # Remove punctuation, which the lemmatizer treats as
                # independent words.
                no_punct = re.sub(r"[,;:\.']", '', s)
                words = re.split(r"\s+", no_punct)
                count_words = 0
                for word in lemmatize(no_punct):
                    count_words += 1
                    if lem == word:
                        i = count_words - 1
                        w = words[i]
                        # Three words of context on either side; slicing past
                        # the end of the list is safe in Python, so no upper
                        # bound is needed (min(i + 4, len(words) - 1) would
                        # wrongly drop the final word).
                        context = " ".join(words[max(i - 3, 0):i + 4])
                        # Tag the words in the sentence with parts of speech:
                        # https://github.com/cltk/tutorials/blob/master/8%20Part-of-speech%20tagging.ipynb
                        # For what the POS tags mean, see
                        # https://linguistics.stackexchange.com/questions/12803/what-do-the-labels-mean-in-this-latin-pos-tagging
                        pos_tagged = tagger.tag_tnt(no_punct)
                        describe = w
                        for t in pos_tagged:
                            if t[0] == w:
                                describe = t[0] + " " + pos_tag_to_description(t[1])
                                break
                        print(work + " " + str(book) + ", sentence " +
                              str(count_sentences) + ", word " +
                              str(count_words) + ": " + describe + " " + context)
                        if w in index:
                            index[w] += 1
                        else:
                            index[w] = 1
    for w in sorted(list(index.keys())):
        print(str(index[w]) + " " + w)
def iter_docs(docs_dir, rm_ascii=False):
    """Stream files in a dir (TLG, TEI, etc.) doc-by-doc."""
    file_names = os.listdir(docs_dir)
    for file_name in file_names:
        file_path = os.path.join(docs_dir, file_name)
        with open(file_path) as file_open:
            file_read = file_open.read()
        tokens = tokenize(file_read, rm_ascii=rm_ascii)
        tokens = [cltk_normalize(token) for token in tokens]
        # ignore very short docs
        # TODO: get file length distribution to better know what is short in TLG
        if len(tokens) < DOC_MIN:
            continue
        yield file_name, tokens
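# A possible driver for `iter_docs`, assuming the module-level `os` import
# used above, a hypothetical directory of plain-text files, and that
# `tokenize` and `DOC_MIN` are defined as in the config block further below:
docs_path = os.path.expanduser('~/cltk_data/user_data/plaintext')  # hypothetical
for file_name, tokens in iter_docs(docs_path, rm_ascii=True):
    print(file_name, len(tokens))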
def lemmatizeList(self, lines):
    from cltk.corpus.utils.formatter import cltk_normalize
    tagger = POSTag('greek')
    lemmatizer = LemmaReplacer('greek')
    # Normalization can help with certain texts (the CLTK docs recommend it).
    lines = cltk_normalize(lines)
    lines = lemmatizer.lemmatize(lines)
    # Remove stopwords and lowercase the remaining words.
    lines = [w.lower() for w in lines if w not in STOPS_LIST]
    return ' '.join(lines)
def predict_from_file(path, model, use_sequential_decoding, align, step_len):
    """Runs prediction using the model on the texts located in the file given in path."""
    max_seq_len = model.processor.max_seq_len - 2
    with open(path, "r") as fp:
        texts = fp.read().splitlines()
    # prepare texts
    texts = clean_texts(texts, CHARS_TO_REMOVE, CHARS_TO_REPLACE)
    texts = [cltk_normalize(replace_square_brackets(t)) for t in texts]
    texts = [t.replace(" ", "_") for t in texts]
    results = []
    # break up long texts into overlapping windows
    for t in texts:
        sequences = []
        if len(t) >= max_seq_len:
            if not (step_len and step_len < max_seq_len):
                step_len = round(max_seq_len / 2)
            for i in range(0, len(t), step_len):
                seq = t[i:i + max_seq_len]
                sequences.append(seq)
        else:
            sequences.append(t)
        sequences = convert_masking(sequences)
        dicts = sentences_to_dicts(sequences)
        if use_sequential_decoding:
            result = model.predict_sequentially(dicts=dicts)
        else:
            result = model.predict(dicts=dicts)
        results.append(result)
    # output results
    for result in results:
        nb_of_masks = 0  # needed for proper alignment
        for i, res in enumerate(result):
            predicted_text = res["predictions"]["text_with_preds"].replace("_", " ")
            masked_text = res["predictions"]["masked_text"].replace("_", " ")
            if align:
                if not step_len:
                    step_len = round(max_seq_len / 2)
                # An approximate alignment: shift each line by step_len plus
                # 2 * the number of masks in the overlapping portion of the
                # previous prediction (to account for the square brackets
                # added around each prediction).
                print(" " * (step_len * i + 2 * nb_of_masks) + predicted_text)
                nb_of_masks += len(re.findall(r"#+", masked_text[:step_len]))
            else:
                print(res["predictions"]["text_with_preds"].replace("_", " "))
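# The windowing logic above, isolated: split a string into windows of
# max_seq_len characters that overlap by half a window (by default), so
# every character is seen by the model in at least one context. A
# stdlib-only sketch with toy numbers; the function name is hypothetical:
def split_overlapping(t, max_seq_len, step_len=None):
    if not (step_len and step_len < max_seq_len):
        step_len = round(max_seq_len / 2)
    return [t[i:i + max_seq_len] for i in range(0, len(t), step_len)]

print(split_overlapping("abcdefghij", max_seq_len=4))
# -> ['abcd', 'cdef', 'efgh', 'ghij', 'ij']
# Note the trailing short window, matching the original loop's behavior.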
def normalize(self):
    """Fixes problems with differences in Greek accent encoding.

    Certain Greek accents have more than one possible encoding. Uses CLTK's
    built-in normalizer to correct the character encoding differences and
    ensure that accents are encoded the same way.

    Returns:
        :obj:`self.__class__` New instance with altered text

    Example:
        >>> text = AncientGreekText('ῖν», εἰς δὲ τὸν ἕτερον κ[α]ττίτ[ερον «εἰ λῶιον καὶ ἄμει]νόν ἐστι')
        >>> print(text.normalize())
        ῖν», εἰς δὲ τὸν ἕτερον κ[α]ττίτ[ερον «εἰ λῶιον καὶ ἄμει]νόν ἐστι
    """  # noqa
    from cltk.corpus.utils.formatter import cltk_normalize
    return self.__class__(
        text=cltk_normalize(str(self.data)),
        options=self.options
    )
def clean_tokens(tokens, chars_to_remove, chars_to_replace):
    """Cleans a list of tokens."""
    cleaned_tokens = []
    for t in tokens:
        if t:
            # skip tokens in which Latin (ASCII) characters appear
            if not re.search(r"\w+", t, re.ASCII):
                # skip tokens containing digits
                if not re.search(r"\d+", t):
                    # normalize
                    t = cltk_normalize(t)
                    t = t.strip("\t\r\n")
                    # remove unwanted chars
                    t = remove_unwanted_chars(t, chars_to_remove)
                    t = replace_chars(t, chars_to_replace)
                    # remove any inter-word hyphens or en-dashes
                    t = re.sub(r"([^\s])(-|–)", r"\1", t)
                    # collapse all whitespace runs to single spaces
                    t = re.sub(r"\s+", " ", t)
                    cleaned_tokens.append(t)
    return cleaned_tokens
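# The two guard regexes in `clean_tokens`, shown in isolation: with
# re.ASCII, \w matches only ASCII letters/digits/underscore, so the first
# search flags tokens containing any Latin characters, while the plain \d
# (no re.ASCII) also catches non-ASCII digits. Stdlib-only sketch:
import re

for token in ["λόγος", "λόγοςa", "βιβλίο2"]:
    has_latin = bool(re.search(r"\w+", token, re.ASCII))
    has_digit = bool(re.search(r"\d+", token))
    print(token, has_latin, has_digit)
# -> λόγος False False / λόγοςa True False / βιβλίο2 True True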
corpus_importer.import_corpus('greek_models_cltk')
corpus_importer2 = CorpusImporter('greek')
corpus_importer2.import_corpus('greek_text_perseus')

philippians_reader = get_corpus_reader(corpus_name="greek_text_perseus",
                                       language="greek")
philippians_reader._fileids = [
    'new-testament__letter-to-the-philippians__grc.json'
]

sentences = list(philippians_reader.sents())
sentence = cltk_normalize(sentences[0])

lemmatizer = LemmaReplacer('greek')
word_list = lemmatizer.lemmatize(sentence)

tagger = POSTag('greek')
parts_of_speech = tagger.tag_ngram_123_backoff(sentence)

# LemmaReplacer is not a great lemmatizer
standard_list = lemmatizer.lemmatize(list(philippians_reader.words()),
                                     return_raw=True)

lemmatizer2 = BackoffGreekLemmatizer()  # this one seems better
backoff_list = lemmatizer2.lemmatize(list(philippians_reader.words()))
def lemmatize(s):
    # returns a list of lemmas
    return lemmatizer.lemmatize(cltk_normalize(s))
#!/bin/python3
from cltk.stem.lemma import LemmaReplacer
from cltk.corpus.utils.formatter import cltk_normalize

lemmatizer = LemmaReplacer('greek')

text = """
μῆνιν ἄειδε θεὰ Πηληϊάδεω ̓Αχιλῆος
οὐλομένην, ἣ μυρί' ̓Αχαιοῖς ἄλγε' ἔθηκε,
πολλὰς δ' ἰφθίμους ψυχὰς ̓́Αϊδι προί̈αψεν
ἡρώων, αὐτοὺς δὲ ἑλώρια τεῦχε κύνεσσιν
οἰωνοῖσί τε πᾶσι, Διὸς δ' ἐτελείετο βουλή,
ἐξ οὗ δὴ τὰ πρῶτα διαστήτην ἐρίσαντε
Ατρεί̈δης τε ἄναξ ἀνδρῶν καὶ δῖος ̓Αχιλλεύς.
"""

print(lemmatizer.lemmatize("Μῆνιν ἄειδε, θεά"))
# The next line doesn't work without the normalization.
print(lemmatizer.lemmatize(cltk_normalize("μῆνιν ἄειδε θεὰ Πηληϊάδεω ̓Αχιλῆος")))
# configs for all notebooks
working_dir = os.path.expanduser('~/cltk_data/user_data/lda_1kgreek/')
PREPROCESS_DEACCENT = False
TOK_MIN = 3    # rm words shorter than
TOK_MAX = 20   # rm words longer than
DOC_MIN = 50   # drop docs shorter than
remove_ascii = True
no_below = 20
no_above = 0.1

STOPS_LIST_GRK = [simple_preprocess(stop, deacc=PREPROCESS_DEACCENT)[0]
                  for stop in STOPS_LIST_GRK
                  if len(simple_preprocess(stop, deacc=PREPROCESS_DEACCENT)) > 0]
# NB: this reassignment replaces the processed list built just above.
STOPS_LIST_GRK = ['τῆϲ', 'τοῖϲ', 'εἰϲ', 'πρὸϲ', 'τοὺϲ']
STOPS_LIST_GRK += ["τηϲ", "τοιϲ", "εϲτι", "προϲ", "ειϲ", "ταϲ", "ωϲ",
                   "τουϲ", "ξυν", 'ξὺν', 'πρε', 'ἀλλ']  # useful for after rm accents
STOPS_LIST = [cltk_normalize(stop) for stop in STOPS_LIST_GRK]

ascii_str = string.ascii_letters + string.punctuation + string.digits


def mk_working_dir(fp):
    """Make dir if not exists."""
    user_dir = os.path.expanduser(fp)
    try:
        os.makedirs(user_dir)
    except FileExistsError:
        pass


def tokenize(text, rm_ascii=False):
    """Tokenize and rm stopwords. The Gensim `simple_preprocess` will work fine
def normalize(self):
    return self.__class__(cltk_normalize(str(self.data)), self.metadata)