Exemple #1
0
 def tokenize_to_words(self, text):
     """Split a text into its word tokens.

     The text is converted with ``unicode_utils.to_unicode`` and then handed
     to ``self.word_tokenizer.tokenize``; whatever that call returns is
     returned unchanged. No sentence splitting happens here, so sentence
     boundaries are preserved only implicitly (e.g. through punctuation).

     NOTE(review): this method does not lowercase anything itself -- any
     case normalization would have to come from the word tokenizer.

     :param text: The text to tokenize.
     :returns: The word tokenizer's output for the whole text.
     """
     unicode_text = unicode_utils.to_unicode(text)
     return self.word_tokenizer.tokenize(unicode_text)
Exemple #2
0
    def replace(self, text):
        """
        Applies every substitution in :py:attr:`compiled_patterns` to a text.

        Each entry of :py:attr:`compiled_patterns` is unpacked as a
        ``(pattern, replacement)`` pair and applied with :py:func:`re.sub`.
        The pairs are applied in iteration order, each one operating on the
        output of the previous substitution.

        :param text: The text that will be scanned for replacements. It is
            converted with ``unicode_utils.to_unicode`` first.
        :returns: The text after all substitutions have been applied.
        """
        result = unicode_utils.to_unicode(text)
        for regex, substitution in self.compiled_patterns:
            result = re.sub(regex, substitution, result)
        return result
Exemple #3
0
 def tokenize_text(self, text):
     """Tokenize a text into sentences, and each sentence into words.

     The sentence split (``self.sent_tokenizer.tokenize``) runs immediately.
     The per-sentence word tokenization (``self.word_tokenizer.tokenize``)
     is deferred until the result is iterated, because the return value is
     a generator.

     :param text: The text to tokenize; it is converted with
         ``unicode_utils.to_unicode`` first.
     :returns: A generator that yields, for each sentence, the word
         tokenizer's output for that sentence. It can be consumed only once.
     """
     unicode_text = unicode_utils.to_unicode(text)
     sentences = self.sent_tokenizer.tokenize(unicode_text)
     return (self.word_tokenizer.tokenize(sentence) for sentence in sentences)
Exemple #4
0
    def get_word_count(self, text):
        """Counts the words in a text.

        The text is tokenized with :py:meth:`tokenize_to_words` and the
        resulting tokens are counted one by one.

        *Note*: This is an expensive operation. If a tokenized version of the
        text is already available, count that instead of calling this method.

        :param text: The text whose words are counted; it is converted with
            ``unicode_utils.to_unicode`` before being tokenized.
        :returns: The number of tokens produced for the text.
        """
        unicode_text = unicode_utils.to_unicode(text)
        return sum(1 for _ in self.tokenize_to_words(unicode_text))
Exemple #5
0
 def tokenize_to_sentences(self, text):
     """Return the normalized words of each sentence in a text.

     The text is tokenized with :py:meth:`tokenize_text`. Every word of
     every sentence is then converted to lowercase and stripped of
     whitespace at the beginning and end. Both levels of the result are
     generators, so the normalization only happens as they are iterated.

     :param text: The text to tokenize; it is converted with
         ``unicode_utils.to_unicode`` first.
     :returns: A generator with one item per sentence. Each item is itself
         a generator over that sentence's lowercased, stripped words.
     """
     unicode_text = unicode_utils.to_unicode(text)
     sentences = self.tokenize_text(unicode_text)
     return (
         (token.lower().strip() for token in sentence)
         for sentence in sentences
     )