def process_document(self, doc):
    """Split a document into cleaned, normalized sentences longer than five words."""
    cleaned_sents = []
    for paragraph in doc['text'].values():
        if not isinstance(paragraph, str):
            # Nested mapping of sub-paragraphs: iterate over its values instead.
            paragraph = paragraph.values()
        else:
            paragraph = self.sent_tokenizer.tokenize(paragraph)
        for sent in paragraph:
            if isinstance(sent, dict):
                for subsent in sent.values():
                    tokenized = self.sent_tokenizer.tokenize(subsent)
                    for token in tokenized:
                        cleaned = tlg_plaintext_cleanup(
                            token, rm_punctuation=True, rm_periods=True)
                        sentence = cltk_normalize(cleaned)
                        if len(self.word_tokenizer.tokenize(sentence)) > 5:
                            cleaned_sents.append(sentence)
            else:
                tokenized = self.sent_tokenizer.tokenize(sent)
                for token in tokenized:
                    cleaned = tlg_plaintext_cleanup(
                        token, rm_punctuation=True, rm_periods=True)
                    sentence = cltk_normalize(cleaned)
                    if len(self.word_tokenizer.tokenize(sentence)) > 5:
                        cleaned_sents.append(sentence)
    return cleaned_sents
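The per-sentence work above is identical in both branches: clean with tlg_plaintext_cleanup, normalize, and keep the sentence only if it has more than five words. A minimal stand-alone sketch of that step, assuming cltk_normalize is importable from the same cltk.corpus.utils.formatter module as tlg_plaintext_cleanup (as in older CLTK releases) and using a plain whitespace split in place of the Greek word tokenizer:

from cltk.corpus.utils.formatter import cltk_normalize, tlg_plaintext_cleanup

def clean_sentence(sent, min_words=5):
    """Clean and normalize one raw sentence; return None if it is too short to keep."""
    cleaned = tlg_plaintext_cleanup(sent, rm_punctuation=True, rm_periods=True)
    sentence = cltk_normalize(cleaned)
    # A whitespace split stands in for the Greek word tokenizer used above.
    if len(sentence.split()) > min_words:
        return sentence
    return None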
Example 2
def sentence_tokenize_corpus():
    """Fetches and tokenizes the corpus then writes it back out."""
    with open("texts.csv", 'r') as f:
        csv_reader = csv.reader(f, delimiter='\t')
        print("Sentence tokenizing...")
        for i, row in enumerate(csv_reader):
            tokenized_texts = tokenize_with_custom_punkt_tokenizer(row[0])

            rc = 0
            group = ""
            for num, text in enumerate(tokenized_texts):
                text = strip_accents_from_sentence(text)
                text = tlg_plaintext_cleanup(text, rm_punctuation=True, rm_periods=True)
                wc = len(word_tokenizer.tokenize(text))
                if wc > 5:
                    write_to_csv("sents.csv", [text], [row[1]], [row[2]])

                rc += wc
                group += " " + text  # separate sentences with a space, as in the spurious-texts pass below
                if rc > 100:
                    write_to_csv("groups.csv", [group], [row[1]], [row[2]])
                    rc = 0
                    group = ""

            write_to_csv("groups.csv", [group], [row[1]], [row[2]])
    
    with open("spurious_texts.csv", 'r') as f:
        csv_reader = csv.reader(f, delimiter='\t')
        print("Sentence tokenizing...")
        for i, row in enumerate(csv_reader):
            tokenized_texts = tokenize_with_custom_punkt_tokenizer(row[0])

            rc = 0
            group = ""
            for num, text in enumerate(tokenized_texts):
                text = strip_accents_from_sentence(text)
                text = tlg_plaintext_cleanup(text, rm_punctuation=True, rm_periods=True)
                wc = len(word_tokenizer.tokenize(text))
                if wc > 5:
                    write_spurious_to_csv("sents_spurious.csv", [text], [row[1]])

                rc += wc
                group += " "
                group += text
                if rc > 100:
                    write_spurious_to_csv("groups_spurious.csv", [group], [row[1]])
                    rc = 0
                    group = ""

            write_spurious_to_csv("groups_spurious.csv", [group], [row[1]])
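Both passes above share the same grouping logic: a running word count accumulates sentences until it exceeds 100, at which point the accumulated group is flushed to CSV and reset. A sketch of that pattern in isolation, with the CSV writers left out; the 100-word threshold is the one used above:

def group_sentences(sentences, threshold=100):
    """Yield space-joined groups of sentences of roughly `threshold` words each."""
    running_count, group = 0, []
    for sent in sentences:
        running_count += len(sent.split())
        group.append(sent)
        if running_count > threshold:
            yield " ".join(group)
            running_count, group = 0, []
    if group:
        # Flush whatever remains, mirroring the final write call above.
        yield " ".join(group)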
Example 3
    def tlgu_cleanup(self, rm_punctuation=True, rm_periods=False):
        """Fix TLG betacode texts using TLGU.

        Necessary to clean up TLG texts before processing, but can also be used
        to perform rudimentary cleaning operations on other Greek texts.

        Args:
            rm_punctuation (:obj:`bool`, optional): True to remove punctuation marks (except periods)
            rm_periods (:obj:`bool`, optional): True to remove periods

        Returns:
            :obj:`self.__class__`: New instance with altered text

        Example:
            >>> text = AncientGreekText('ῖν», εἰς δὲ τὸν ἕτερον κ[α]ττίτ[ερον «εἰ λῶιον καὶ ἄμει]νόν ἐστι')
            >>> print(text.tlgu_cleanup())
            ῖν εἰς δὲ τὸν ἕτερον καττίτερον εἰ λῶιον καὶ ἄμεινόν ἐστι
        """ # noqa
        from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
        return self.__class__(
            text=tlg_plaintext_cleanup(
                self.data, rm_punctuation=rm_punctuation, rm_periods=rm_periods
            ),
            options=self.options
        )
Example 4
def test_tlg_plaintext_cleanup_rm_periods(self):
    """Test post-TLGU cleanup of Greek TLG text with periods removed."""
    dirty = """{ΑΘΗΝΑΙΟΥ ΝΑΥΚΡΑΤΙΤΟΥ ΔΕΙΠΝΟΣΟΦΙΣΤΩΝ} LATIN Ἀθήναιος (μὲν) ὁ τῆς 999 βίβλου πατήρ: ποιεῖται δὲ τὸν λόγον πρὸς Τιμοκράτην."""  # pylint: disable=line-too-long
    clean = tlg_plaintext_cleanup(dirty,
                                  rm_punctuation=True,
                                  rm_periods=True)
    target = ' Ἀθήναιος ὁ τῆς βίβλου πατήρ ποιεῖται δὲ τὸν λόγον πρὸς Τιμοκράτην'
    self.assertEqual(clean, target)
Example 5
def test_tlg_plaintext_cleanup(self):
    """Test post-TLGU cleanup of Greek TLG text with periods retained."""
    dirty = """{ΑΘΗΝΑΙΟΥ ΝΑΥΚΡΑΤΙΤΟΥ ΔΕΙΠΝΟΣΟΦΙΣΤΩΝ} LATIN Ἀθήναιος (μὲν) ὁ τῆς 999 βίβλου πατήρ: ποιεῖται δὲ τὸν λόγον πρὸς Τιμοκράτην."""  # pylint: disable=line-too-long
    clean = tlg_plaintext_cleanup(dirty, rm_punctuation=True, rm_periods=False)
    target = ' Ἀθήναιος ὁ τῆς βίβλου πατήρ ποιεῖται δὲ τὸν λόγον πρὸς Τιμοκράτην.'
    self.assertEqual(clean, target)
Example 6
def tlgu_cleanup(self, rm_punctuation=True, rm_periods=False):
    """Return a new instance of the same class with its text run through tlg_plaintext_cleanup."""
    return self.__class__(
        data=tlg_plaintext_cleanup(
            self.data, rm_punctuation=rm_punctuation, rm_periods=rm_periods),
        metadata=self.metadata)