Example #1
    def _tokenize(self, text):
        # Pre-clean
        text = text.strip()

        # Apply pre-processors
        for pp in self.pre_processor_funcs:
            log.debug("pre-processing: %s", pp)
            text = pp(text)

        if _len(text) <= self.GOOGLE_TTS_MAX_CHARS:
            return _clean_tokens([text])

        # Tokenize
        log.debug("tokenizing: %s", self.tokenizer_func)
        tokens = self.tokenizer_func(text)

        # Clean
        tokens = _clean_tokens(tokens)

        # Minimize
        min_tokens = []
        for t in tokens:
            min_tokens += _minimize(t, ' ', self.GOOGLE_TTS_MAX_CHARS)

        # Filter empty tokens, post-minimize
        min_tokens = [t for t in min_tokens if t]

        return min_tokens
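The splitting above leans on a `_minimize` helper that breaks a token on a delimiter so every piece stays under `GOOGLE_TTS_MAX_CHARS`. That helper is not shown on this page; the sketch below is only one plausible implementation consistent with the call `_minimize(t, ' ', self.GOOGLE_TTS_MAX_CHARS)`, not the library's actual code (which, for instance, may use a custom `_len` for length checks).

def _minimize(the_string, delim, max_size):
    # Sketch only: recursively split `the_string` into chunks no longer
    # than `max_size`, preferring to cut at the last `delim` before the limit.
    if the_string.startswith(delim):
        the_string = the_string[len(delim):]

    if len(the_string) > max_size:
        try:
            # Cut at the right-most delimiter that still fits the limit
            idx = the_string.rindex(delim, 0, max_size)
        except ValueError:
            # No delimiter found; hard-cut at the limit
            idx = max_size
        return [the_string[:idx]] + _minimize(the_string[idx:], delim, max_size)
    else:
        return [the_string]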
Example #2
def test_strip():
    _in = [" Bacon  ", "& ", "ipsum\r", "."]
    _out = ["Bacon", "&", "ipsum"]
    assert _clean_tokens(_in) == _out
Example #3
def test_only_space_and_punc():
    _in = [",(:)?", "\t    ", "\n"]
    _out = []
    assert _clean_tokens(_in) == _out
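Examples #2 and #3 pin down what `_clean_tokens` is expected to do: strip surrounding whitespace from each token and drop any token made up entirely of whitespace and/or punctuation. The sketch below is a minimal helper that satisfies both tests; the punctuation set `_PUNC` is a hypothetical choice (note it deliberately excludes '&', which the tests keep), not the library's actual definition.

# Sketch only: punctuation with no standalone meaning for TTS purposes.
# Deliberately excludes symbols like '&' that the tests expect to keep.
_PUNC = ".,?!:;()[]"

def _clean_tokens(tokens):
    # Strip each token; drop tokens consisting solely of
    # whitespace and/or the punctuation characters above.
    return [t.strip() for t in tokens
            if t and not all(c in _PUNC or c.isspace() for c in t)]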
Example #4
    def test_strip(self):
        _in = [" Bacon  ", "& ", "ipsum\r", "."]
        _out = ["Bacon", "&", "ipsum"]
        self.assertEqual(_clean_tokens(_in), _out)
Example #5
    def test_only_space_and_punc(self):
        _in = [",(:)?", "\t    ", "\n"]
        _out = []
        self.assertEqual(_clean_tokens(_in), _out)