def test_other_punctuation(self):
    # String of the unique 'other punctuations'
    other_punc_str = ''.join(
        set(symbols.ALL_PUNC) -
        set(symbols.TONE_MARKS) -
        set(symbols.PERIOD_COMMA))
    t = Tokenizer([other_punctuation])
    self.assertEqual(len(t.run(other_punc_str)) - 1, len(other_punc_str))
def __init__(
        self,
        text,
        tld='com',
        lang='en',
        slow=False,
        lang_check=True,
        pre_processor_funcs=[
            pre_processors.tone_marks,
            pre_processors.end_of_line,
            pre_processors.abbreviations,
            pre_processors.word_sub
        ],
        tokenizer_func=Tokenizer([
            tokenizer_cases.tone_marks,
            tokenizer_cases.period_comma,
            tokenizer_cases.colon,
            tokenizer_cases.other_punctuation
        ]).run
):
    # Debug
    for k, v in dict(locals()).items():
        if k == 'self':
            continue
        log.debug("%s: %s", k, v)

    # Text
    assert text, 'No text to speak'
    self.text = text

    # Translate URL top-level domain
    self.tld = tld

    # Language
    self.lang_check = lang_check
    self.lang = lang

    if self.lang_check:
        # Fallback lang in case it is deprecated
        self.lang = _fallback_deprecated_lang(lang)

        try:
            langs = tts_langs()
            if self.lang not in langs:
                raise ValueError("Language not supported: %s" % lang)
        except RuntimeError as e:
            log.debug(str(e), exc_info=True)
            log.warning(str(e))

    # Read speed
    if slow:
        self.speed = Speed.SLOW
    else:
        self.speed = Speed.NORMAL

    # Pre-processors and tokenizer
    self.pre_processor_funcs = pre_processor_funcs
    self.tokenizer_func = tokenizer_func
def __init__(
        self,
        text,
        lang='en',
        slow=False,
        lang_check=True,
        pre_processor_funcs=[
            pre_processors.tone_marks,
            pre_processors.end_of_line,
            pre_processors.abbreviations,
            pre_processors.word_sub
        ],
        tokenizer_func=Tokenizer([
            tokenizer_cases.tone_marks,
            tokenizer_cases.period_comma,
            tokenizer_cases.colon,
            tokenizer_cases.other_punctuation
        ]).run
):
    # Debug
    for k, v in locals().items():
        if k == 'self':
            continue
        log.debug("%s: %s", str(k), str(v))

    # Text
    assert text, 'No text to speak'
    self.text = text

    # Language
    if lang_check:
        try:
            langs = tts_langs()
            if lang.lower() not in langs:
                raise ValueError("Language not supported: %s" % lang)
        except RuntimeError as e:
            log.debug(str(e), exc_info=True)
            log.warning(str(e))

    self.lang_check = lang_check
    self.lang = lang.lower()

    # Read speed
    if slow:
        self.speed = Speed.SLOW
    else:
        self.speed = Speed.NORMAL

    # Pre-processors and tokenizer
    self.pre_processor_funcs = pre_processor_funcs
    self.tokenizer_func = tokenizer_func

    # Google Translate token
    self.token = gtts_token.Token()
def __init__(
        self,
        text,
        lang='en',
        slow=False,
        lang_check=True,
        pre_processor_funcs=[
            pre_processors.tone_marks,
            pre_processors.end_of_line,
            pre_processors.abbreviations,
            pre_processors.word_sub
        ],
        tokenizer_func=Tokenizer([
            tokenizer_cases.tone_marks,
            tokenizer_cases.period_comma,
            tokenizer_cases.colon,
            tokenizer_cases.other_punctuation
        ]).run
):
    # Debug
    for k, v in locals().items():
        if k == 'self':
            continue
        log.debug("%s: %s", k, v)

    # Text
    assert text, 'No text to speak'
    self.text = text

    # Language
    self.lang_check = lang_check
    self.lang = lang

    # Read speed
    if slow:
        self.speed = Speed.SLOW
    else:
        self.speed = Speed.NORMAL

    # Pre-processors and tokenizer
    self.pre_processor_funcs = pre_processor_funcs
    self.tokenizer_func = tokenizer_func

    # Google Translate token
    self.token = gtts_token.Token()
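# Usage sketch (an illustration, not part of the library source): constructing
# a gTTS object and writing the spoken audio to a file. save() performs the
# request to the Translate TTS endpoint; "hello.mp3" is a hypothetical path.
from gtts import gTTS

tts = gTTS(text="Hello world", lang='en', slow=False)
tts.save("hello.mp3")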
def test_tone_marks(self):
    t = Tokenizer([tone_marks])
    _in = "Lorem? Ipsum!"
    _out = ['Lorem?', 'Ipsum!']
    self.assertEqual(t.run(_in), _out)
def test_legacy_all_punctuation(self):
    t = Tokenizer([legacy_all_punctuation])
    self.assertEqual(
        len(t.run(symbols.ALL_PUNC)) - 1, len(symbols.ALL_PUNC))
def test_period_comma(self):
    t = Tokenizer([period_comma])
    _in = "Hello, it's 24.5 degrees in the U.K. today. $20,000,000."
    _out = ['Hello', "it's 24.5 degrees in the U.K. today", '$20,000,000.']
    self.assertEqual(t.run(_in), _out)
def test_colon(self):
    t = Tokenizer([colon])
    _in = "It's now 6:30 which means: morning missing:space"
    _out = ["It's now 6:30 which means", ' morning missing', 'space']
    self.assertEqual(t.run(_in), _out)
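# Usage sketch (an illustration, not part of the test suite): the cases
# exercised above can be combined into one Tokenizer, mirroring the default
# tokenizer_func in the constructors above; the input string is hypothetical.
t = Tokenizer([tone_marks, period_comma, colon, other_punctuation])
tokens = t.run("Really? It's 6:30, let's go!")
# Each case contributes a regex; run() splits the text on the combined pattern
# and returns the resulting list of tokens.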