def process_document(self, doc):
    """Clean, normalize, and sentence-tokenize one document, keeping
    only sentences longer than five words."""
    cleaned_sents = []
    for paragraph in doc['text'].values():
        if not isinstance(paragraph, str):
            # Nested section: its values are sentences (or dicts of them).
            paragraph = paragraph.values()
        else:
            paragraph = self.sent_tokenizer.tokenize(paragraph)
        for sent in paragraph:
            # A sentence may itself be a dict of sub-sentences one level down.
            subsents = sent.values() if isinstance(sent, dict) else [sent]
            for subsent in subsents:
                for token in self.sent_tokenizer.tokenize(subsent):
                    cleaned = tlg_plaintext_cleanup(
                        token, rm_punctuation=True, rm_periods=True)
                    sentence = cltk_normalize(cleaned)
                    # Drop very short sentences (five words or fewer).
                    if len(self.word_tokenizer.tokenize(sentence)) > 5:
                        cleaned_sents.append(sentence)
    return cleaned_sents
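# The method above is easiest to follow on a concrete input. The sketch below
# replays its per-sentence pipeline standalone, assuming the pre-1.0 CLTK API
# (TokenizeSentence / WordTokenizer, where .tokenize() is available); the
# tokenizer wiring here is an assumption, not taken from the original class.
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup, cltk_normalize
from cltk.tokenize.sentence import TokenizeSentence
from cltk.tokenize.word import WordTokenizer

sent_tokenizer = TokenizeSentence('greek')
word_tokenizer = WordTokenizer('greek')

sample = 'Ἀθήναιος μὲν ὁ τῆς βίβλου πατήρ: ποιεῖται δὲ τὸν λόγον πρὸς Τιμοκράτην.'
for token in sent_tokenizer.tokenize(sample):
    cleaned = tlg_plaintext_cleanup(token, rm_punctuation=True, rm_periods=True)
    sentence = cltk_normalize(cleaned)
    if len(word_tokenizer.tokenize(sentence)) > 5:  # same >5-word filter
        print(sentence)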
def sentence_tokenize_corpus():
    """Fetches and tokenizes the corpus then writes it back out."""
    with open("texts.csv", 'r') as f:
        csv_reader = csv.reader(f, delimiter='\t')
        print("Sentence tokenizing...")
        for i, row in enumerate(csv_reader):
            tokenized_texts = tokenize_with_custom_punkt_tokenizer(row[0])
            rc = 0      # running word count for the current group
            group = ""  # sentences accumulated into a roughly 100-word group
            for num, text in enumerate(tokenized_texts):
                text = strip_accents_from_sentence(text)
                text = tlg_plaintext_cleanup(text, rm_punctuation=True,
                                             rm_periods=True)
                wc = len(word_tokenizer.tokenize(text))
                if wc > 5:
                    write_to_csv("sents.csv", [text], [row[1]], [row[2]])
                    rc += wc
                    group += " " + text  # separate sentences with a space
                if rc > 100:
                    write_to_csv("groups.csv", [group], [row[1]], [row[2]])
                    rc = 0
                    group = ""
            if group:
                # Flush whatever remains of the final, under-100-word group.
                write_to_csv("groups.csv", [group], [row[1]], [row[2]])

    with open("spurious_texts.csv", 'r') as f:
        csv_reader = csv.reader(f, delimiter='\t')
        print("Sentence tokenizing spurious texts...")
        for i, row in enumerate(csv_reader):
            tokenized_texts = tokenize_with_custom_punkt_tokenizer(row[0])
            rc = 0
            group = ""
            for num, text in enumerate(tokenized_texts):
                text = strip_accents_from_sentence(text)
                text = tlg_plaintext_cleanup(text, rm_punctuation=True,
                                             rm_periods=True)
                wc = len(word_tokenizer.tokenize(text))
                if wc > 5:
                    write_spurious_to_csv("sents_spurious.csv", [text], [row[1]])
                    rc += wc
                    group += " " + text
                if rc > 100:
                    write_spurious_to_csv("groups_spurious.csv", [group],
                                          [row[1]])
                    rc = 0
                    group = ""
            if group:
                write_spurious_to_csv("groups_spurious.csv", [group], [row[1]])
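# sentence_tokenize_corpus() leans on helpers defined elsewhere in the project
# (tokenize_with_custom_punkt_tokenizer, strip_accents_from_sentence,
# write_to_csv, write_spurious_to_csv). As an illustration only, here is a
# minimal sketch of write_to_csv under the assumption that it appends one
# tab-delimited (text, author, work) row per entry; the real helper's
# signature and column order may differ.
import csv

def write_to_csv(filename, texts, authors, works):
    """Append one tab-delimited (text, author, work) row per entry."""
    with open(filename, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='\t')
        for entry in zip(texts, authors, works):
            writer.writerow(entry)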
def tlgu_cleanup(self, rm_punctuation=True, rm_periods=False):
    """Fix TLG betacode texts using TLGU.

    Necessary to clean up TLG texts before processing, but can also be used
    to perform rudimentary cleaning operations on other Greek texts.

    Args:
        rm_punctuation (:obj:`bool`, optional) True to remove punctuation
            marks (except periods)
        rm_periods (:obj:`bool`, optional) True to remove periods

    Returns:
        :obj:`self.__class__` New instance with altered text

    Example:
        >>> text = AncientGreekText('ῖν», εἰς δὲ τὸν ἕτερον κ[α]ττίτ[ερον «εἰ λῶιον καὶ ἄμει]νόν ἐστι')
        >>> print(text.tlgu_cleanup())
        ῖν εἰς δὲ τὸν ἕτερον καττίτερον εἰ λῶιον καὶ ἄμεινόν ἐστι
    """  # noqa
    from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
    return self.__class__(
        text=tlg_plaintext_cleanup(
            self.data,
            rm_punctuation=rm_punctuation,
            rm_periods=rm_periods
        ),
        options=self.options
    )
def test_tlg_plaintext_cleanup_rm_periods(self):
    """Test post-TLGU cleanup of Greek TLG text, with periods removed."""
    dirty = """{ΑΘΗΝΑΙΟΥ ΝΑΥΚΡΑΤΙΤΟΥ ΔΕΙΠΝΟΣΟΦΙΣΤΩΝ} LATIN Ἀθήναιος (μὲν) ὁ τῆς 999 βίβλου πατήρ: ποιεῖται δὲ τὸν λόγον πρὸς Τιμοκράτην."""  # pylint: disable=line-too-long
    clean = tlg_plaintext_cleanup(dirty, rm_punctuation=True, rm_periods=True)
    target = ' Ἀθήναιος ὁ τῆς βίβλου πατήρ ποιεῖται δὲ τὸν λόγον πρὸς Τιμοκράτην'
    self.assertEqual(clean, target)
def test_tlg_plaintext_cleanup(self):
    """Test post-TLGU cleanup of Greek TLG text, with periods kept."""
    dirty = """{ΑΘΗΝΑΙΟΥ ΝΑΥΚΡΑΤΙΤΟΥ ΔΕΙΠΝΟΣΟΦΙΣΤΩΝ} LATIN Ἀθήναιος (μὲν) ὁ τῆς 999 βίβλου πατήρ: ποιεῖται δὲ τὸν λόγον πρὸς Τιμοκράτην."""  # pylint: disable=line-too-long
    clean = tlg_plaintext_cleanup(dirty, rm_punctuation=True, rm_periods=False)
    target = ' Ἀθήναιος ὁ τῆς βίβλου πατήρ ποιεῖται δὲ τὸν λόγον πρὸς Τιμοκράτην.'
    self.assertEqual(clean, target)
def tlgu_cleanup(self, rm_punctuation=True, rm_periods=False):
    """Run tlg_plaintext_cleanup on the text and return a new instance."""
    return self.__class__(
        data=tlg_plaintext_cleanup(
            self.data,
            rm_punctuation=rm_punctuation,
            rm_periods=rm_periods
        ),
        metadata=self.metadata
    )
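# Both tlgu_cleanup variants above delegate to CLTK's tlg_plaintext_cleanup and
# return a fresh instance, so calls can be chained without mutating the
# original object. A minimal standalone demonstration, assuming CLTK 0.1.x,
# where the function lives in cltk.corpus.utils.formatter:
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup

raw = '{ΑΘΗΝΑΙΟΥ} Ἀθήναιος (μὲν) ὁ τῆς 999 βίβλου πατήρ.'
# Strips {...} headings, (...) insertions, Latin characters, and digits;
# the final period survives because rm_periods=False.
print(tlg_plaintext_cleanup(raw, rm_punctuation=True, rm_periods=False))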