Beispiel #1
0
    def setUp(self):
        """Create the tokenizers and the phonemizer used by the tests.

        Sets ``self.tokenizer`` (grapheme based), ``self.ph`` (eSpeak
        phonemizer for language code ``"tr"``) and ``self.tokenizer_ph``
        (phoneme based, driven by ``self.ph``).
        """
        grapheme_vocab = Graphemes()
        self.tokenizer = TTSTokenizer(use_phonemes=False, characters=grapheme_vocab)

        # The phonemizer is stored on the instance as well as handed to the
        # phoneme tokenizer below.
        self.ph = ESpeak("tr", backend="espeak")

        phoneme_vocab = IPAPhonemes()
        self.tokenizer_ph = TTSTokenizer(
            use_phonemes=True,
            characters=phoneme_vocab,
            phonemizer=self.ph,
        )
Beispiel #2
0
 def test_not_found_characters(self):
     """Check that symbols missing from the vocabulary are reported.

     A tokenizer with an ``"en-us"`` eSpeak phonemizer encodes and decodes
     an English sentence. Afterwards ``not_found_characters`` must contain
     exactly the syllabic mark and the decoded text must equal the expected
     phoneme string.
     """
     # Use a local phonemizer so the shared ``self.ph`` fixture is not replaced.
     phonemizer = ESpeak("en-us")
     tokenizer_local = TTSTokenizer(use_phonemes=True,
                                    characters=IPAPhonemes(),
                                    phonemizer=phonemizer)

     # Precondition: the tokenizer under test (not the unrelated grapheme
     # fixture) starts with nothing recorded as missing.
     self.assertEqual(len(tokenizer_local.not_found_characters), 0)

     text = "Yolk of one egg beaten light"
     ids = tokenizer_local.text_to_ids(text)
     text_hat = tokenizer_local.ids_to_text(ids)

     self.assertEqual(tokenizer_local.not_found_characters, ["̩"])
     self.assertEqual(text_hat, "jˈoʊk ʌv wˈʌn ˈɛɡ bˈiːʔn lˈaɪt")
Beispiel #3
0
    def _create_dataloader(self, batch_size, r, bgs, start_by_longest=False):
        """Build a ``TTSDataset`` over all loaded samples plus a ``DataLoader`` for it.

        Args:
            batch_size: Batch size given to the ``DataLoader``.
            r: Passed to the dataset as ``outputs_per_step``.
            bgs: Passed to the dataset as ``batch_group_size``.
            start_by_longest: Forwarded to the dataset unchanged.

        Returns:
            Tuple ``(dataloader, dataset)``.
        """
        # Load both splits and concatenate them into a single sample list.
        train_part, eval_part = load_tts_samples(
            dataset_config, eval_split=True, eval_split_size=0.2
        )
        all_samples = train_part + eval_part

        text_tokenizer, _ = TTSTokenizer.init_from_config(c)

        dataset_kwargs = dict(
            outputs_per_step=r,
            compute_linear_spec=True,
            return_wav=True,
            tokenizer=text_tokenizer,
            ap=self.ap,
            samples=all_samples,
            batch_group_size=bgs,
            min_text_len=c.min_text_len,
            max_text_len=c.max_text_len,
            min_audio_len=c.min_audio_len,
            max_audio_len=c.max_audio_len,
            start_by_longest=start_by_longest,
        )
        dataset = TTSDataset(**dataset_kwargs)

        # No shuffling and incomplete batches are dropped.
        loader_kwargs = dict(
            batch_size=batch_size,
            shuffle=False,
            collate_fn=dataset.collate_fn,
            drop_last=True,
            num_workers=c.num_loader_workers,
        )
        dataloader = DataLoader(dataset, **loader_kwargs)
        return dataloader, dataset
Beispiel #4
0
    def test_init_from_config(self):
        """Check that a tokenizer built from a config round-trips phonemized text.

        A minimal character config and tokenizer config are declared locally,
        a tokenizer is created through ``TTSTokenizer.init_from_config`` and
        the decoded ids are compared against the phonemizer output wrapped
        in ``<BOS>`` / ``<EOS>``.
        """
        # Function-scope import so this test does not depend on the module
        # already importing ``field``.
        from dataclasses import field

        @dataclass
        class Characters(Coqpit):
            characters_class: str = None
            characters: str = _phonemes
            punctuations: str = _punctuations
            pad: str = _pad
            eos: str = _eos
            bos: str = _bos
            blank: str = _blank
            is_unique: bool = True
            is_sorted: bool = True

        @dataclass
        class TokenizerConfig(Coqpit):
            enable_eos_bos_chars: bool = True
            use_phonemes: bool = True
            add_blank: bool = False
            # A dataclass instance is unhashable and Python 3.11+ refuses
            # unhashable field defaults, so a factory creates a fresh
            # ``Characters`` per config instead of sharing one instance.
            # NOTE(review): the ``str`` annotation is kept from the original
            # although the value is a ``Characters`` config - confirm whether
            # it can be tightened.
            characters: str = field(default_factory=Characters)
            phonemizer: str = "espeak"
            phoneme_language: str = "tr"
            text_cleaner: str = "phoneme_cleaners"

        tokenizer_ph, _ = TTSTokenizer.init_from_config(TokenizerConfig())
        tokenizer_ph.phonemizer.backend = "espeak"

        text = "Bu bir Örnek."
        text_ph = "<BOS>" + self.ph.phonemize(text, separator="") + "<EOS>"
        ids = tokenizer_ph.text_to_ids(text)
        text_hat = tokenizer_ph.ids_to_text(ids)
        self.assertEqual(text_ph, text_hat)
Beispiel #5
0
    def init_from_config(config: Coqpit):
        """Initialize model from config.

        Args:
            config (Coqpit): Model config.

        Returns:
            BaseTacotron: Model built with an audio processor, a tokenizer and
            a speaker manager that are all created from ``config``.
        """
        from TTS.utils.audio import AudioProcessor

        ap = AudioProcessor.init_from_config(config)
        # ``TTSTokenizer.init_from_config`` returns a ``(tokenizer, config)``
        # pair; unpack it so the model receives the tokenizer itself rather
        # than the tuple, and is built from the returned config.
        tokenizer, new_config = TTSTokenizer.init_from_config(config)
        speaker_manager = SpeakerManager.init_from_config(config)
        return BaseTacotron(new_config, ap, tokenizer, speaker_manager)
Beispiel #6
0
    def init_from_config(config: "ForwardTTSConfig",
                         samples: Union[List[List], List[Dict]] = None):
        """Build a ``ForwardTTS`` model and its helper objects from a config.

        Args:
            config (ForwardTTSConfig): Model config.
            samples (Union[List[List], List[Dict]]): Samples handed to
                ``SpeakerManager.init_from_config``. Defaults to None.

        Returns:
            ForwardTTS: Model constructed with the config returned by the
            tokenizer setup.
        """
        from TTS.utils.audio import AudioProcessor

        audio_processor = AudioProcessor.init_from_config(config)
        # The tokenizer setup also hands back a config; the model uses that one.
        text_tokenizer, tokenizer_config = TTSTokenizer.init_from_config(config)
        speakers = SpeakerManager.init_from_config(config, samples)

        model = ForwardTTS(tokenizer_config, audio_processor, text_tokenizer, speakers)
        return model
Beispiel #7
0
    def init_from_config(config: "GlowTTSConfig",
                         samples: Union[List[List], List[Dict]] = None,
                         verbose=True):
        """Build a ``GlowTTS`` model and its helper objects from a config.

        Args:
            config (GlowTTSConfig): Model config.
            samples (Union[List[List], List[Dict]]): Samples handed to
                ``SpeakerManager.init_from_config``. Defaults to None.
            verbose (bool): Forwarded to ``AudioProcessor.init_from_config``.
                Defaults to True.

        Returns:
            GlowTTS: Model constructed with the config returned by the
            tokenizer setup.
        """
        from TTS.utils.audio import AudioProcessor

        audio_processor = AudioProcessor.init_from_config(config, verbose)
        # The tokenizer setup also hands back a config; the model uses that one.
        text_tokenizer, tokenizer_config = TTSTokenizer.init_from_config(config)
        speakers = SpeakerManager.init_from_config(config, samples)

        model = GlowTTS(tokenizer_config, audio_processor, text_tokenizer, speakers)
        return model
def setup_loader(ap, r, verbose=False):
    """Create a non-shuffling ``DataLoader`` over the module-level ``meta_data``.

    Args:
        ap: Audio processor passed to the dataset.
        r: Passed to the dataset as ``outputs_per_step``.
        verbose: Forwarded to the dataset. Defaults to False.

    Returns:
        The ``DataLoader`` wrapping the prepared ``TTSDataset``.
    """
    text_tokenizer, _ = TTSTokenizer.init_from_config(c)

    dataset_kwargs = dict(
        outputs_per_step=r,
        compute_linear_spec=False,
        samples=meta_data,
        tokenizer=text_tokenizer,
        ap=ap,
        batch_group_size=0,
        min_text_len=c.min_text_len,
        max_text_len=c.max_text_len,
        min_audio_len=c.min_audio_len,
        max_audio_len=c.max_audio_len,
        phoneme_cache_path=c.phoneme_cache_path,
        precompute_num_workers=0,
        use_noise_augment=False,
        verbose=verbose,
    )

    # Speaker lookups are only handed over when the matching option is enabled.
    dataset_kwargs["speaker_id_mapping"] = None
    if c.use_speaker_embedding:
        dataset_kwargs["speaker_id_mapping"] = speaker_manager.ids

    dataset_kwargs["d_vector_mapping"] = None
    if c.use_d_vector_file:
        dataset_kwargs["d_vector_mapping"] = speaker_manager.embeddings

    dataset = TTSDataset(**dataset_kwargs)

    if c.use_phonemes and c.compute_input_seq_cache:
        # precompute phonemes to have a better estimate of sequence lengths.
        dataset.compute_input_seq(c.num_loader_workers)
    dataset.preprocess_samples()

    return DataLoader(
        dataset,
        batch_size=c.batch_size,
        shuffle=False,
        collate_fn=dataset.collate_fn,
        drop_last=False,
        sampler=None,
        num_workers=c.num_loader_workers,
        pin_memory=False,
    )
Beispiel #9
0
    ],
    sort_by_audio_len=True,
    max_seq_len=500000,
    output_path=output_path,
    datasets=[dataset_config],
)

# INITIALIZE THE AUDIO PROCESSOR
# The audio processor is used for feature extraction and audio I/O.
# It mainly serves the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)

# INITIALIZE THE TOKENIZER
# The tokenizer converts text to sequences of token IDs.
# If characters are not defined in the config, default characters are written
# to the config; `config` is therefore rebound to the returned config here.
tokenizer, config = TTSTokenizer.init_from_config(config)

# LOAD DATA SAMPLES
# Each sample is a list of ``[text, audio_file_path, speaker_name]``.
# You can define a custom sample loader that returns the list of samples,
# or define a custom formatter and pass it to `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# INITIALIZE THE MODEL
# The model is built from the config returned by the tokenizer setup, the
# audio processor and the tokenizer; no speaker manager is passed here.
model = ForwardTTS(config, ap, tokenizer)
Beispiel #10
0
class TestTTSTokenizer(unittest.TestCase):
    """Tests for ``TTSTokenizer`` in grapheme mode and in phoneme mode."""

    def setUp(self):
        """Create a grapheme tokenizer, an eSpeak phonemizer and a phoneme tokenizer."""
        self.tokenizer = TTSTokenizer(use_phonemes=False,
                                      characters=Graphemes())

        self.ph = ESpeak("tr", backend="espeak")
        self.tokenizer_ph = TTSTokenizer(use_phonemes=True,
                                         characters=IPAPhonemes(),
                                         phonemizer=self.ph)

    def test_encode_decode_graphemes(self):
        """Encoding then decoding plain text returns it unchanged, one id per character."""
        text = "This is, a test."
        ids = self.tokenizer.encode(text)
        text_hat = self.tokenizer.decode(ids)
        self.assertEqual(text, text_hat)
        self.assertEqual(len(ids), len(text))

    def test_text_to_ids_phonemes(self):
        """Decoded ids of phonemized text equal the phonemizer output."""
        # TODO: not sure how to extend to cover all the languages and phonemizers.
        text = "Bu bir Örnek."
        text_ph = self.ph.phonemize(text, separator="")
        ids = self.tokenizer_ph.text_to_ids(text)
        text_hat = self.tokenizer_ph.ids_to_text(ids)
        self.assertEqual(text_ph, text_hat)

    def test_text_to_ids_phonemes_with_eos_bos(self):
        """With ``use_eos_bos`` enabled the decoded text is wrapped in BOS/EOS symbols."""
        text = "Bu bir Örnek."
        self.tokenizer_ph.use_eos_bos = True
        text_ph = IPAPhonemes().bos + self.ph.phonemize(
            text, separator="") + IPAPhonemes().eos
        ids = self.tokenizer_ph.text_to_ids(text)
        text_hat = self.tokenizer_ph.ids_to_text(ids)
        self.assertEqual(text_ph, text_hat)

    def test_text_to_ids_phonemes_with_eos_bos_and_blank(self):
        """With BOS/EOS and blanks enabled the decoded text matches the expected literal."""
        text = "Bu bir Örnek."
        self.tokenizer_ph.use_eos_bos = True
        self.tokenizer_ph.add_blank = True
        text_ph = "<BOS><BLNK>b<BLNK>ʊ<BLNK> <BLNK>b<BLNK>ɪ<BLNK>r<BLNK> <BLNK>œ<BLNK>r<BLNK>n<BLNK>ˈ<BLNK>ɛ<BLNK>c<BLNK>.<BLNK><EOS>"
        ids = self.tokenizer_ph.text_to_ids(text)
        text_hat = self.tokenizer_ph.ids_to_text(ids)
        self.assertEqual(text_ph, text_hat)

    def test_print_logs(self):
        """``print_logs`` can be called on both tokenizers without raising."""
        self.tokenizer.print_logs()
        self.tokenizer_ph.print_logs()

    def test_not_found_characters(self):
        """Check that symbols missing from the vocabulary are reported.

        A tokenizer with an ``"en-us"`` eSpeak phonemizer encodes and decodes
        an English sentence. Afterwards ``not_found_characters`` must contain
        exactly the syllabic mark and the decoded text must equal the expected
        phoneme string.
        """
        # Use a local phonemizer so the shared ``self.ph`` fixture is not replaced.
        phonemizer = ESpeak("en-us")
        tokenizer_local = TTSTokenizer(use_phonemes=True,
                                       characters=IPAPhonemes(),
                                       phonemizer=phonemizer)

        # Precondition: the tokenizer under test (not the unrelated grapheme
        # fixture) starts with nothing recorded as missing.
        self.assertEqual(len(tokenizer_local.not_found_characters), 0)

        text = "Yolk of one egg beaten light"
        ids = tokenizer_local.text_to_ids(text)
        text_hat = tokenizer_local.ids_to_text(ids)

        self.assertEqual(tokenizer_local.not_found_characters, ["̩"])
        self.assertEqual(text_hat, "jˈoʊk ʌv wˈʌn ˈɛɡ bˈiːʔn lˈaɪt")

    def test_init_from_config(self):
        """Check that a tokenizer built from a config round-trips phonemized text.

        A minimal character config and tokenizer config are declared locally,
        a tokenizer is created through ``TTSTokenizer.init_from_config`` and
        the decoded ids are compared against the phonemizer output wrapped
        in ``<BOS>`` / ``<EOS>``.
        """
        # Function-scope import so this test does not depend on the module
        # already importing ``field``.
        from dataclasses import field

        @dataclass
        class Characters(Coqpit):
            characters_class: str = None
            characters: str = _phonemes
            punctuations: str = _punctuations
            pad: str = _pad
            eos: str = _eos
            bos: str = _bos
            blank: str = _blank
            is_unique: bool = True
            is_sorted: bool = True

        @dataclass
        class TokenizerConfig(Coqpit):
            enable_eos_bos_chars: bool = True
            use_phonemes: bool = True
            add_blank: bool = False
            # A dataclass instance is unhashable and Python 3.11+ refuses
            # unhashable field defaults, so a factory creates a fresh
            # ``Characters`` per config instead of sharing one instance.
            # NOTE(review): the ``str`` annotation is kept from the original
            # although the value is a ``Characters`` config - confirm whether
            # it can be tightened.
            characters: str = field(default_factory=Characters)
            phonemizer: str = "espeak"
            phoneme_language: str = "tr"
            text_cleaner: str = "phoneme_cleaners"

        tokenizer_ph, _ = TTSTokenizer.init_from_config(TokenizerConfig())
        tokenizer_ph.phonemizer.backend = "espeak"

        text = "Bu bir Örnek."
        text_ph = "<BOS>" + self.ph.phonemize(text, separator="") + "<EOS>"
        ids = tokenizer_ph.text_to_ids(text)
        text_hat = tokenizer_ph.ids_to_text(ids)
        self.assertEqual(text_ph, text_hat)