def setUp(self):
    """Create the fixtures used by the tokenizer tests.

    Sets three attributes on the test case:

    * ``self.tokenizer`` - a ``TTSTokenizer`` built with ``Graphemes``
      characters and phonemes disabled.
    * ``self.ph`` - an ``ESpeak`` phonemizer created for the language code
      ``"tr"`` with the ``"espeak"`` backend.
    * ``self.tokenizer_ph`` - a ``TTSTokenizer`` built with ``IPAPhonemes``
      characters that phonemizes through ``self.ph``.
    """
    grapheme_set = Graphemes()
    self.tokenizer = TTSTokenizer(use_phonemes=False, characters=grapheme_set)

    self.ph = ESpeak("tr", backend="espeak")

    phoneme_set = IPAPhonemes()
    self.tokenizer_ph = TTSTokenizer(
        use_phonemes=True,
        characters=phoneme_set,
        phonemizer=self.ph,
    )
def test_not_found_characters(self):
    """Check that symbols missing from the vocabulary are recorded.

    A local phoneme tokenizer is built around an ``ESpeak("en-us")``
    phonemizer. After a round trip through ``text_to_ids`` and
    ``ids_to_text`` the test expects ``not_found_characters`` to hold exactly
    the one combining mark listed below, and the decoded text to equal the
    expected phoneme string.
    """
    self.ph = ESpeak("en-us")
    tokenizer_local = TTSTokenizer(use_phonemes=True, characters=IPAPhonemes(), phonemizer=self.ph)

    # Pre-condition: the tokenizer under test starts with nothing recorded.
    # This must look at `tokenizer_local`; checking the unrelated grapheme
    # fixture `self.tokenizer` would not validate the object exercised below.
    self.assertEqual(len(tokenizer_local.not_found_characters), 0)

    text = "Yolk of one egg beaten light"
    ids = tokenizer_local.text_to_ids(text)
    text_hat = tokenizer_local.ids_to_text(ids)

    self.assertEqual(tokenizer_local.not_found_characters, ["̩"])
    self.assertEqual(text_hat, "jˈoʊk ʌv wˈʌn ˈɛɡ bˈiːʔn lˈaɪt")
def _create_dataloader(self, batch_size, r, bgs, start_by_longest=False):
    """Build a ``TTSDataset`` over all loaded samples and a loader for it.

    Args:
        batch_size: Batch size given to the ``DataLoader``.
        r: Forwarded to the dataset as ``outputs_per_step``.
        bgs: Forwarded to the dataset as ``batch_group_size``.
        start_by_longest: Forwarded to the dataset unchanged.
            Defaults to False.

    Returns:
        tuple: ``(dataloader, dataset)``.
    """
    # load dataset
    train_items, eval_items = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2)
    # The train and eval splits are merged back into a single sample list.
    all_items = train_items + eval_items

    tokenizer, _ = TTSTokenizer.init_from_config(c)

    dataset_kwargs = dict(
        outputs_per_step=r,
        compute_linear_spec=True,
        return_wav=True,
        tokenizer=tokenizer,
        ap=self.ap,
        samples=all_items,
        batch_group_size=bgs,
        min_text_len=c.min_text_len,
        max_text_len=c.max_text_len,
        min_audio_len=c.min_audio_len,
        max_audio_len=c.max_audio_len,
        start_by_longest=start_by_longest,
    )
    dataset = TTSDataset(**dataset_kwargs)

    loader_kwargs = dict(
        batch_size=batch_size,
        shuffle=False,
        collate_fn=dataset.collate_fn,
        drop_last=True,
        num_workers=c.num_loader_workers,
    )
    dataloader = DataLoader(dataset, **loader_kwargs)

    return dataloader, dataset
def test_init_from_config(self):
    """Check that a tokenizer built from a config round-trips phonemized text.

    Two throw-away ``Coqpit`` dataclasses describe the character set and the
    tokenizer options. The tokenizer returned by
    ``TTSTokenizer.init_from_config`` must decode the ids of a sample
    sentence back to the phonemes produced by ``self.ph``, wrapped in the
    ``<BOS>`` / ``<EOS>`` markers.
    """
    # Local import: only this test needs `field`.
    from dataclasses import field

    @dataclass
    class Characters(Coqpit):
        characters_class: str = None
        characters: str = _phonemes
        punctuations: str = _punctuations
        pad: str = _pad
        eos: str = _eos
        bos: str = _bos
        blank: str = _blank
        is_unique: bool = True
        is_sorted: bool = True

    @dataclass
    class TokenizerConfig(Coqpit):
        enable_eos_bos_chars: bool = True
        use_phonemes: bool = True
        add_blank: bool = False
        # A dataclass instance is unhashable by default, and Python 3.11+
        # rejects unhashable field defaults with a ValueError. Building the
        # value through `default_factory` also gives every config its own
        # `Characters` object instead of one shared instance.
        # NOTE(review): the `str` annotation is kept from the original;
        # confirm whether Coqpit expects the nested config type here.
        characters: str = field(default_factory=Characters)
        phonemizer: str = "espeak"
        phoneme_language: str = "tr"
        text_cleaner: str = "phoneme_cleaners"

    tokenizer_ph, _ = TTSTokenizer.init_from_config(TokenizerConfig())
    tokenizer_ph.phonemizer.backend = "espeak"

    text = "Bu bir Örnek."
    text_ph = "<BOS>" + self.ph.phonemize(text, separator="") + "<EOS>"
    ids = tokenizer_ph.text_to_ids(text)
    test_hat = tokenizer_ph.ids_to_text(ids)
    self.assertEqual(text_ph, test_hat)
def init_from_config(config: Coqpit):
    """Initialize model from config.

    Builds the audio processor, tokenizer and speaker manager from ``config``
    and hands them to ``BaseTacotron``.

    Args:
        config (Coqpit): Model config.

    Returns:
        BaseTacotron: Model constructed with the config returned by the
        tokenizer initialisation.
    """
    from TTS.utils.audio import AudioProcessor

    ap = AudioProcessor.init_from_config(config)
    # `TTSTokenizer.init_from_config` returns a `(tokenizer, config)` pair.
    # It must be unpacked, otherwise the model receives a tuple as its
    # tokenizer. The returned config is the one handed to the model.
    tokenizer, new_config = TTSTokenizer.init_from_config(config)
    speaker_manager = SpeakerManager.init_from_config(config)
    return BaseTacotron(new_config, ap, tokenizer, speaker_manager)
def init_from_config(config: "ForwardTTSConfig", samples: Union[List[List], List[Dict]] = None):
    """Build a ``ForwardTTS`` model and its helper objects from a config.

    Args:
        config (ForwardTTSConfig): Model config.
        samples (Union[List[List], List[Dict]]): Training samples to parse
            speaker ids for training. Defaults to None.

    Returns:
        ForwardTTS: Model constructed with the config returned by the
        tokenizer initialisation.
    """
    from TTS.utils.audio import AudioProcessor

    audio_processor = AudioProcessor.init_from_config(config)
    text_tokenizer, tokenizer_config = TTSTokenizer.init_from_config(config)
    speakers = SpeakerManager.init_from_config(config, samples)

    model = ForwardTTS(tokenizer_config, audio_processor, text_tokenizer, speakers)
    return model
def init_from_config(config: "GlowTTSConfig", samples: Union[List[List], List[Dict]] = None, verbose=True):
    """Build a ``GlowTTS`` model and its helper objects from a config.

    Args:
        config (GlowTTSConfig): Model config.
        samples (Union[List[List], List[Dict]]): Training samples to parse
            speaker ids for training. Defaults to None.
        verbose (bool): Forwarded to the audio processor initialisation.
            Defaults to True.

    Returns:
        GlowTTS: Model constructed with the config returned by the tokenizer
        initialisation.
    """
    from TTS.utils.audio import AudioProcessor

    audio_processor = AudioProcessor.init_from_config(config, verbose)
    text_tokenizer, tokenizer_config = TTSTokenizer.init_from_config(config)
    speakers = SpeakerManager.init_from_config(config, samples)

    model = GlowTTS(tokenizer_config, audio_processor, text_tokenizer, speakers)
    return model
def setup_loader(ap, r, verbose=False):
    """Create a ``DataLoader`` over ``meta_data`` using the module-level config ``c``.

    Args:
        ap: Audio processor handed to the dataset.
        r: Forwarded to the dataset as ``outputs_per_step``.
        verbose: Forwarded to the dataset. Defaults to False.

    Returns:
        DataLoader: Loader that does not shuffle, keeps the last partial
        batch and uses ``c.batch_size`` / ``c.num_loader_workers``.
    """
    tokenizer, _ = TTSTokenizer.init_from_config(c)

    # Speaker information is only passed on when the config enables it.
    speaker_ids = speaker_manager.ids if c.use_speaker_embedding else None
    d_vectors = speaker_manager.embeddings if c.use_d_vector_file else None

    dataset = TTSDataset(
        outputs_per_step=r,
        compute_linear_spec=False,
        samples=meta_data,
        tokenizer=tokenizer,
        ap=ap,
        batch_group_size=0,
        min_text_len=c.min_text_len,
        max_text_len=c.max_text_len,
        min_audio_len=c.min_audio_len,
        max_audio_len=c.max_audio_len,
        phoneme_cache_path=c.phoneme_cache_path,
        precompute_num_workers=0,
        use_noise_augment=False,
        verbose=verbose,
        speaker_id_mapping=speaker_ids,
        d_vector_mapping=d_vectors,
    )

    wants_seq_cache = c.use_phonemes and c.compute_input_seq_cache
    if wants_seq_cache:
        # precompute phonemes to have a better estimate of sequence lengths.
        dataset.compute_input_seq(c.num_loader_workers)
    dataset.preprocess_samples()

    return DataLoader(
        dataset,
        batch_size=c.batch_size,
        shuffle=False,
        collate_fn=dataset.collate_fn,
        drop_last=False,
        sampler=None,
        num_workers=c.num_loader_workers,
        pin_memory=False,
    )
], sort_by_audio_len=True, max_seq_len=500000, output_path=output_path, datasets=[dataset_config], ) # INITIALIZE THE AUDIO PROCESSOR # Audio processor is used for feature extraction and audio I/O. # It mainly serves to the dataloader and the training loggers. ap = AudioProcessor.init_from_config(config) # INITIALIZE THE TOKENIZER # Tokenizer is used to convert text to sequences of token IDs. # If characters are not defined in the config, default characters are passed to the config tokenizer, config = TTSTokenizer.init_from_config(config) # LOAD DATA SAMPLES # Each sample is a list of ```[text, audio_file_path, speaker_name]``` # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples( dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size, ) # init model model = ForwardTTS(config, ap, tokenizer)
class TestTTSTokenizer(unittest.TestCase):
    """Round-trip tests for ``TTSTokenizer`` with grapheme and phoneme inputs."""

    def setUp(self):
        """Create a grapheme tokenizer, an ESpeak phonemizer and a phoneme tokenizer."""
        self.tokenizer = TTSTokenizer(use_phonemes=False, characters=Graphemes())
        self.ph = ESpeak("tr", backend="espeak")
        self.tokenizer_ph = TTSTokenizer(use_phonemes=True, characters=IPAPhonemes(), phonemizer=self.ph)

    def test_encode_decode_graphemes(self):
        """Encoding then decoding plain text returns it unchanged, one id per character."""
        text = "This is, a test."
        ids = self.tokenizer.encode(text)
        test_hat = self.tokenizer.decode(ids)
        self.assertEqual(text, test_hat)
        self.assertEqual(len(ids), len(text))

    def test_text_to_ids_phonemes(self):
        """Decoded ids must equal the phonemizer output for the same text."""
        # TODO: not sure how to extend to cover all the languages and phonemizer.
        text = "Bu bir Örnek."
        text_ph = self.ph.phonemize(text, separator="")
        ids = self.tokenizer_ph.text_to_ids(text)
        test_hat = self.tokenizer_ph.ids_to_text(ids)
        self.assertEqual(text_ph, test_hat)

    def test_text_to_ids_phonemes_with_eos_bos(self):
        """With ``use_eos_bos`` enabled the decoded text is wrapped in BOS/EOS symbols."""
        text = "Bu bir Örnek."
        self.tokenizer_ph.use_eos_bos = True
        text_ph = IPAPhonemes().bos + self.ph.phonemize(text, separator="") + IPAPhonemes().eos
        ids = self.tokenizer_ph.text_to_ids(text)
        test_hat = self.tokenizer_ph.ids_to_text(ids)
        self.assertEqual(text_ph, test_hat)

    def test_text_to_ids_phonemes_with_eos_bos_and_blank(self):
        """With BOS/EOS and blanks enabled the decoded text matches the expected interleaving."""
        text = "Bu bir Örnek."
        self.tokenizer_ph.use_eos_bos = True
        self.tokenizer_ph.add_blank = True
        text_ph = "<BOS><BLNK>b<BLNK>ʊ<BLNK> <BLNK>b<BLNK>ɪ<BLNK>r<BLNK> <BLNK>œ<BLNK>r<BLNK>n<BLNK>ˈ<BLNK>ɛ<BLNK>c<BLNK>.<BLNK><EOS>"
        ids = self.tokenizer_ph.text_to_ids(text)
        text_hat = self.tokenizer_ph.ids_to_text(ids)
        self.assertEqual(text_ph, text_hat)

    def test_print_logs(self):
        """``print_logs`` must run without raising for both tokenizers."""
        self.tokenizer.print_logs()
        self.tokenizer_ph.print_logs()

    def test_not_found_characters(self):
        """Symbols missing from the vocabulary are recorded in ``not_found_characters``."""
        self.ph = ESpeak("en-us")
        tokenizer_local = TTSTokenizer(use_phonemes=True, characters=IPAPhonemes(), phonemizer=self.ph)
        # Pre-condition on the tokenizer actually under test; asserting on the
        # unrelated grapheme fixture `self.tokenizer` would prove nothing here.
        self.assertEqual(len(tokenizer_local.not_found_characters), 0)
        text = "Yolk of one egg beaten light"
        ids = tokenizer_local.text_to_ids(text)
        text_hat = tokenizer_local.ids_to_text(ids)
        self.assertEqual(tokenizer_local.not_found_characters, ["̩"])
        self.assertEqual(text_hat, "jˈoʊk ʌv wˈʌn ˈɛɡ bˈiːʔn lˈaɪt")

    def test_init_from_config(self):
        """A tokenizer built via ``init_from_config`` round-trips phonemized text with BOS/EOS."""
        # Local import: only this test needs `field`.
        from dataclasses import field

        @dataclass
        class Characters(Coqpit):
            characters_class: str = None
            characters: str = _phonemes
            punctuations: str = _punctuations
            pad: str = _pad
            eos: str = _eos
            bos: str = _bos
            blank: str = _blank
            is_unique: bool = True
            is_sorted: bool = True

        @dataclass
        class TokenizerConfig(Coqpit):
            enable_eos_bos_chars: bool = True
            use_phonemes: bool = True
            add_blank: bool = False
            # A dataclass instance is unhashable by default, and Python 3.11+
            # rejects unhashable field defaults with a ValueError. Using
            # `default_factory` also avoids sharing one instance across configs.
            # NOTE(review): the `str` annotation is kept from the original;
            # confirm whether Coqpit expects the nested config type here.
            characters: str = field(default_factory=Characters)
            phonemizer: str = "espeak"
            phoneme_language: str = "tr"
            text_cleaner: str = "phoneme_cleaners"

        tokenizer_ph, _ = TTSTokenizer.init_from_config(TokenizerConfig())
        tokenizer_ph.phonemizer.backend = "espeak"
        text = "Bu bir Örnek."
        text_ph = "<BOS>" + self.ph.phonemize(text, separator="") + "<EOS>"
        ids = tokenizer_ph.text_to_ids(text)
        test_hat = tokenizer_ph.ids_to_text(ids)
        self.assertEqual(text_ph, test_hat)