Example #1
 # Requires: from nltk.tokenize.treebank import TreebankWordTokenizer
 def tokenize(self, text: str):
     """
     :param text: text to be tokenized, first into sentences, then into words
     :type text: str
     :rtype: list
     """
     sents = self.sent_tokenizer.tokenize(text)
     tokenizer = TreebankWordTokenizer()
     # tokenize_sents yields one token list per sentence; flatten into one list.
     return [item for sublist in tokenizer.tokenize_sents(sents) for item in sublist]
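
The same pipeline can be tried outside the class; a minimal standalone sketch, assuming NLTK is installed and the punkt sentence model has been downloaded (the sample text and variable names are illustrative):

    # Standalone sketch: sentence-split, word-tokenize, then flatten.
    from nltk.tokenize import sent_tokenize
    from nltk.tokenize.treebank import TreebankWordTokenizer

    text = "Gallia est omnis divisa in partes tres. Quarum unam incolunt Belgae."
    sents = sent_tokenize(text)              # stands in for self.sent_tokenizer
    tokenizer = TreebankWordTokenizer()
    tokens = [item for sublist in tokenizer.tokenize_sents(sents) for item in sublist]
    print(tokens)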
Example #2
 # Requires: from nltk.tokenize.treebank import TreebankWordTokenizer
 def tokenize(self, text: str):
     """
     :param text: text to be tokenized, first into sentences, then into words
     :type text: str
     :param model: tokenizer object to be used  # TODO: should this be set in __init__?
     :type model: object
     :rtype: list
     """
     sents = self.sent_tokenizer.tokenize(text)
     tokenizer = TreebankWordTokenizer()
     # tokenize_sents yields one token list per sentence; flatten into one list.
     return [item for sublist in tokenizer.tokenize_sents(sents) for item in sublist]
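
For reference, NLTK's TokenizerI defines tokenize_sents as the per-sentence map of tokenize, so the flattening comprehension above is equivalent to word-tokenizing each sentence in turn; a quick sketch (the sample sentences are illustrative):

    from nltk.tokenize.treebank import TreebankWordTokenizer

    tokenizer = TreebankWordTokenizer()
    sents = ["Veni, vidi, vici.", "Alea iacta est."]
    # Equivalent to flattening tokenizer.tokenize_sents(sents).
    flat = [tok for sent in sents for tok in tokenizer.tokenize(sent)]
    print(flat)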
Example #3
    # Requires: from nltk.tokenize.treebank import TreebankWordTokenizer
    def tokenize(self, text: str, split_enclitics: list = ['ne', 'n', 'que', 've', 'ue', 'st'],
                                  split_words: list = []):
        """
        :param text: text to be tokenized, first into sentences, then into words
        :type text: str
        :param split_enclitics: enclitics to split off from their host words
        :type split_enclitics: list
        :param split_words: word patterns to replace in the text before tokenizing
        :type split_words: list
        :rtype: list
        """
        # Configured Latin replacements override the caller's split_words.
        if self._latin_replacements:
            split_words = self._latin_replacements

        if split_words:
            text = self._replace_patterns(text, split_words)
        sents = self.sent_tokenizer.tokenize(text)
        if split_enclitics:
            sents = self._split_enclitics(sents, split_enclitics)
        tokenizer = TreebankWordTokenizer()
        # tokenize_sents yields one token list per sentence; flatten into one list.
        return [item for sublist in tokenizer.tokenize_sents(sents) for item in sublist]
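
The _replace_patterns and _split_enclitics helpers are private methods not shown on this page. Below is a hypothetical sketch of what an enclitic splitter of this shape might do; the function name, regex, and leading-hyphen convention are assumptions for illustration, not the library's actual implementation:

    import re

    # Hypothetical enclitic splitter (not the real _split_enclitics):
    # detach a trailing enclitic such as 'que' from its host word in
    # each sentence, marking the detached piece with a leading hyphen.
    def split_enclitics(sents, enclitics=('que', 'ne', 've')):
        pattern = re.compile(r'\b(\w+)(' + '|'.join(enclitics) + r')\b')
        return [pattern.sub(r'\1 -\2', sent) for sent in sents]

    print(split_enclitics(["arma virumque cano"]))
    # ['arma virum -que cano']

A real implementation would also need to guard against false positives (e.g. 'bene' ends in 'ne' but is not an enclitic compound), typically with an exception list.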