Example #1
    def _init_speaker_encoder(self, speaker_manager):
        """Initialize the SpeakerEncoder"""
        if self.encoder_checkpoint:
            if speaker_manager is None:
                speaker_manager = SpeakerManager(
                    encoder_model_path=self.encoder_checkpoint,
                    encoder_config_path=self.encoder_config)
            else:
                speaker_manager.init_speaker_encoder(self.encoder_checkpoint,
                                                     self.encoder_config)
        return speaker_manager
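For reference, a minimal sketch of the two construction paths the helper above covers; the import path is the usual Coqui TTS one, and the checkpoint/config paths are placeholders, not real files:

from TTS.tts.utils.speakers import SpeakerManager

# Path 1: no manager yet -> build one around the encoder checkpoint.
manager = SpeakerManager(encoder_model_path="encoder.pth",
                         encoder_config_path="encoder_config.json")

# Path 2: a manager already exists -> attach the encoder to it instead.
manager.init_speaker_encoder("encoder.pth", "encoder_config.json")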
Example #2
def test_run_all_models():
    """Check if all the models are downloadable and tts models run correctly."""
    print(" > Run synthesizer with all the models.")
    download_dir = get_user_data_dir("tts")
    output_path = os.path.join(get_tests_output_path(), "output.wav")
    manager = ModelManager(output_prefix=get_tests_output_path())
    model_names = manager.list_models()
    for model_name in model_names:
        print(f"\n > Run - {model_name}")
        model_path, _, _ = manager.download_model(model_name)
        if "tts_models" in model_name:
            local_download_dir = os.path.dirname(model_path)
            # download and run the model
            speaker_files = glob.glob(local_download_dir + "/speaker*")
            language_files = glob.glob(local_download_dir + "/language*")
            language_id = ""
            if len(speaker_files) > 0:
                # multi-speaker model
                if "speaker_ids" in speaker_files[0]:
                    speaker_manager = SpeakerManager(
                        speaker_id_file_path=speaker_files[0])
                elif "speakers" in speaker_files[0]:
                    speaker_manager = SpeakerManager(
                        d_vectors_file_path=speaker_files[0])

                # multi-lingual model - Assuming multi-lingual models are also multi-speaker
                if len(language_files) > 0 and "language_ids" in language_files[0]:
                    language_manager = LanguageManager(
                        language_ids_file_path=language_files[0])
                    language_id = language_manager.language_names[0]

                speaker_id = list(speaker_manager.ids.keys())[0]
                run_cli(
                    f"tts --model_name  {model_name} "
                    f'--text "This is an example." --out_path "{output_path}" --speaker_idx "{speaker_id}" --language_idx "{language_id}" '
                )
            else:
                # single-speaker model
                run_cli(
                    f"tts --model_name  {model_name} "
                    f'--text "This is an example." --out_path "{output_path}"')
            # remove downloaded models
            shutil.rmtree(download_dir)
        else:
            # only download the model
            manager.download_model(model_name)
        print(f" | > OK: {model_name}")

    folders = glob.glob(os.path.join(manager.output_prefix, "*"))
    assert len(folders) == len(model_names)
    shutil.rmtree(manager.output_prefix)
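A sketch that condenses the per-model branching above into one helper; it reuses the same file-name conventions and SpeakerManager/LanguageManager keyword arguments as the test:

import glob
import os

def pick_managers(local_dir):
    # Mirrors the test's conventions: "speaker_ids" -> ID file, "speakers" -> d-vectors.
    speaker_files = glob.glob(os.path.join(local_dir, "speaker*"))
    language_files = glob.glob(os.path.join(local_dir, "language*"))
    speaker_manager, language_manager = None, None
    if speaker_files:
        if "speaker_ids" in speaker_files[0]:
            speaker_manager = SpeakerManager(speaker_id_file_path=speaker_files[0])
        elif "speakers" in speaker_files[0]:
            speaker_manager = SpeakerManager(d_vectors_file_path=speaker_files[0])
    if language_files and "language_ids" in language_files[0]:
        language_manager = LanguageManager(language_ids_file_path=language_files[0])
    return speaker_manager, language_manager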
Example #3
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data, speaker_manager

    # Audio processor
    ap = AudioProcessor(**c.audio)

    # load data instances
    meta_data_train, meta_data_eval = load_tts_samples(
        c.datasets,
        eval_split=args.eval,
        eval_split_max_size=c.eval_split_max_size,
        eval_split_size=c.eval_split_size)

    # use eval and training partitions
    meta_data = meta_data_train + meta_data_eval

    # init speaker manager
    if c.use_speaker_embedding:
        speaker_manager = SpeakerManager(data_items=meta_data)
    elif c.use_d_vector_file:
        speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file)
    else:
        speaker_manager = None

    # setup model
    model = setup_model(c)

    # restore model
    model.load_checkpoint(c, args.checkpoint_path, eval=True)

    if use_cuda:
        model.cuda()

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)
    # set r
    r = 1 if c.model.lower() == "glow_tts" else model.decoder.r
    own_loader = setup_loader(ap, r, verbose=True)

    extract_spectrograms(
        own_loader,
        model,
        ap,
        args.output_path,
        quantized_wav=args.quantized,
        save_audio=args.save_audio,
        debug=args.debug,
        metada_name="metada.txt",
    )
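The speaker-manager selection in main() is a pattern that recurs across these examples; as a standalone sketch:

def make_speaker_manager(c, meta_data):
    # Learned speaker embeddings are derived from the data items;
    # external d-vectors are loaded from the file named in the config.
    if c.use_speaker_embedding:
        return SpeakerManager(data_items=meta_data)
    if c.use_d_vector_file:
        return SpeakerManager(d_vectors_file_path=c.d_vector_file)
    return None  # single-speaker model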
Example #4
    def _load_speakers(self, speaker_file: str) -> None:
        """Load the SpeakerManager to organize multi-speaker TTS. It loads the speakers meta-data and the speaker
        encoder if it is defined.

        Args:
            speaker_file (str): path to the speakers meta-data file.
        """
        print("Loading speakers ...")
        self.speaker_manager = SpeakerManager(
            encoder_model_path=self.encoder_checkpoint,
            encoder_config_path=self.encoder_config)
        self.speaker_manager.load_d_vectors_file(
            self.tts_config.get("d_vector_file", speaker_file))
        self.num_speakers = self.speaker_manager.num_speakers
        self.d_vector_dim = self.speaker_manager.d_vector_dim
Example #5
    def init_from_config(config: Coqpit):
        """Initialize model from config."""
        from TTS.utils.audio import AudioProcessor

        ap = AudioProcessor.init_from_config(config)
        tokenizer = TTSTokenizer.init_from_config(config)
        speaker_manager = SpeakerManager.init_from_config(config)
        return BaseTacotron(config, ap, tokenizer, speaker_manager)
Example #6
    def test_secl_forward(self):
        num_speakers = 10
        num_langs = 3
        batch_size = 2

        speaker_encoder_config = load_config(SPEAKER_ENCODER_CONFIG)
        speaker_encoder_config.model_params["use_torch_spec"] = True
        speaker_encoder = setup_encoder_model(speaker_encoder_config).to(
            device)
        speaker_manager = SpeakerManager()
        speaker_manager.encoder = speaker_encoder

        args = VitsArgs(
            language_ids_file=LANG_FILE,
            use_language_embedding=True,
            spec_segment_size=10,
            use_speaker_encoder_as_loss=True,
        )
        config = VitsConfig(num_speakers=num_speakers,
                            use_speaker_embedding=True,
                            model_args=args)
        config.audio.sample_rate = 16000

        input_dummy, input_lengths, _, spec, spec_lengths, waveform = self._create_inputs(
            config, batch_size=batch_size)
        speaker_ids = torch.randint(0, num_speakers,
                                    (batch_size, )).long().to(device)
        lang_ids = torch.randint(0, num_langs,
                                 (batch_size, )).long().to(device)

        model = Vits(config, speaker_manager=speaker_manager).to(device)
        output_dict = model.forward(
            input_dummy,
            input_lengths,
            spec,
            spec_lengths,
            waveform,
            aux_input={
                "speaker_ids": speaker_ids,
                "language_ids": lang_ids
            },
        )
        self._check_forward_outputs(config, output_dict,
                                    speaker_encoder_config)
Example #7
    def test_speaker_embedding():
        # load config
        config = load_config(encoder_config_path)
        config.audio.resample = True

        # create a dummy speaker encoder
        model = setup_speaker_encoder_model(config)
        save_checkpoint(model, None, None, get_tests_input_path(), 0)

        # load audio processor and speaker encoder
        ap = AudioProcessor(**config.audio)
        manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)

        # load a sample audio and compute embedding
        waveform = ap.load_wav(sample_wav_path)
        mel = ap.melspectrogram(waveform)
        d_vector = manager.compute_d_vector(mel)
        assert d_vector.shape[1] == 256

        # compute d_vector directly from an input file
        d_vector = manager.compute_d_vector_from_clip(sample_wav_path)
        d_vector2 = manager.compute_d_vector_from_clip(sample_wav_path)
        d_vector = torch.FloatTensor(d_vector)
        d_vector2 = torch.FloatTensor(d_vector2)
        assert d_vector.shape[0] == 256
        assert (d_vector - d_vector2).sum() == 0.0

        # compute d_vector from a list of wav files.
        d_vector3 = manager.compute_d_vector_from_clip([sample_wav_path, sample_wav_path2])
        d_vector3 = torch.FloatTensor(d_vector3)
        assert d_vector3.shape[0] == 256
        assert (d_vector - d_vector3).sum() != 0.0

        # remove dummy model
        os.remove(encoder_model_path)
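The assertions above compare embeddings by a summed difference; a cosine-similarity check is a scale-robust alternative (illustrative sketch, threshold chosen for demonstration):

import numpy as np

def cosine_similarity(a, b):
    a = np.asarray(a, dtype=np.float32).ravel()
    b = np.asarray(b, dtype=np.float32).ravel()
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))

# The same clip embedded twice should be near-identical.
assert cosine_similarity(d_vector, d_vector2) > 0.999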
Example #8
    def test_init_multispeaker(self):
        config = GlowTTSConfig(num_chars=32)
        model = GlowTTS(config)
        # speaker embedding with default speaker_embedding_dim
        config.use_speaker_embedding = True
        config.num_speakers = 5
        config.d_vector_dim = None
        model.init_multispeaker(config)
        self.assertEqual(model.c_in_channels, model.hidden_channels_enc)
        # use external speaker embeddings with speaker_embedding_dim = 301
        config = GlowTTSConfig(num_chars=32)
        config.use_d_vector_file = True
        config.d_vector_dim = 301
        model = GlowTTS(config)
        model.init_multispeaker(config)
        self.assertEqual(model.c_in_channels, 301)
        # use speaker embeddings from the provided speaker_manager
        config = GlowTTSConfig(num_chars=32)
        config.use_speaker_embedding = True
        config.speakers_file = os.path.join(get_tests_data_path(), "ljspeech",
                                            "speakers.json")
        speaker_manager = SpeakerManager.init_from_config(config)
        model = GlowTTS(config)
        model.speaker_manager = speaker_manager
        model.init_multispeaker(config)
        self.assertEqual(model.c_in_channels, model.hidden_channels_enc)
        self.assertEqual(model.num_speakers, speaker_manager.num_speakers)
        # use external speaker embeddings from the provided speaker_manager
        config = GlowTTSConfig(num_chars=32)
        config.use_d_vector_file = True
        config.d_vector_dim = 256
        config.d_vector_file = os.path.join(get_tests_data_path(),
                                            "dummy_speakers.json")
        speaker_manager = SpeakerManager.init_from_config(config)
        model = GlowTTS(config)
        model.speaker_manager = speaker_manager
        model.init_multispeaker(config)
        self.assertEqual(model.c_in_channels, speaker_manager.embedding_dim)
        self.assertEqual(model.num_speakers, speaker_manager.num_speakers)
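A compact restatement of the channel rule the test asserts (illustrative helper, not library code):

def expected_c_in_channels(config, model):
    # External d-vectors fix the conditioning width to d_vector_dim;
    # a learned speaker embedding keeps it at the encoder hidden size.
    if getattr(config, "use_d_vector_file", False):
        return config.d_vector_dim
    return model.hidden_channels_enc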
Example #9
    def init_from_config(config: "ForwardTTSConfig",
                         samples: Union[List[List], List[Dict]] = None):
        """Initiate model from config

        Args:
            config (ForwardTTSConfig): Model config.
            samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
                Defaults to None.
        """
        from TTS.utils.audio import AudioProcessor

        ap = AudioProcessor.init_from_config(config)
        tokenizer, new_config = TTSTokenizer.init_from_config(config)
        speaker_manager = SpeakerManager.init_from_config(config, samples)
        return ForwardTTS(new_config, ap, tokenizer, speaker_manager)
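A usage sketch for the factory above; `config`, `train_samples` and `eval_samples` are assumed to come from load_config/load_tts_samples as in the other examples here:

# Passing the samples lets SpeakerManager.init_from_config parse speaker ids
# directly from the training data.
model = ForwardTTS.init_from_config(config, samples=train_samples + eval_samples)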
Example #10
    def _init_speaker_manager(self):
        """Initialize the SpeakerManager"""
        # setup if multi-speaker settings are in the global model config
        speaker_manager = None
        speakers_file = get_from_config_or_model_args_with_default(
            self.tts_config, "speakers_file", None)
        if self._is_use_speaker_embedding():
            if self.tts_speakers_file:
                speaker_manager = SpeakerManager(
                    speaker_id_file_path=self.tts_speakers_file)
            elif speakers_file:
                speaker_manager = SpeakerManager(
                    speaker_id_file_path=speakers_file)

        if self._is_use_d_vector_file():
            d_vector_file = get_from_config_or_model_args_with_default(
                self.tts_config, "d_vector_file", None)
            if self.tts_speakers_file:
                speaker_manager = SpeakerManager(
                    d_vectors_file_path=self.tts_speakers_file)
            elif d_vector_file:
                speaker_manager = SpeakerManager(
                    d_vectors_file_path=d_vector_file)
        return speaker_manager
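The precedence the helper implements fits in two lines; a sketch with variable names mirroring the snippet:

# A speakers file passed to the Synthesizer wins over the one in the config.
chosen_ids_file = tts_speakers_file or speakers_file    # speaker-ID embeddings
chosen_dvec_file = tts_speakers_file or d_vector_file   # external d-vectors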
Example #11
    def init_from_config(config: "GlowTTSConfig",
                         samples: Union[List[List], List[Dict]] = None,
                         verbose=True):
        """Initiate model from config

        Args:
            config (VitsConfig): Model config.
            samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
                Defaults to None.
            verbose (bool): If True, print init messages. Defaults to True.
        """
        from TTS.utils.audio import AudioProcessor

        ap = AudioProcessor.init_from_config(config, verbose)
        tokenizer, new_config = TTSTokenizer.init_from_config(config)
        speaker_manager = SpeakerManager.init_from_config(config, samples)
        return GlowTTS(new_config, ap, tokenizer, speaker_manager)
Example #12
    def test_speakers_file_processing(self):
        manager = SpeakerManager(d_vectors_file_path=d_vectors_file_path)
        self.assertEqual(manager.num_speakers, 1)
        self.assertEqual(manager.embedding_dim, 256)
        manager = SpeakerManager(d_vectors_file_path=d_vectors_file_pth_path)
        self.assertEqual(manager.num_speakers, 1)
        self.assertEqual(manager.embedding_dim, 256)
        d_vector = manager.get_embedding_by_clip(manager.clip_ids[0])
        assert len(d_vector) == 256
        d_vectors = manager.get_embeddings_by_name(manager.speaker_names[0])
        assert len(d_vectors[0]) == 256
        d_vector1 = manager.get_mean_embedding(manager.speaker_names[0],
                                               num_samples=2,
                                               randomize=True)
        assert len(d_vector1) == 256
        d_vector2 = manager.get_mean_embedding(manager.speaker_names[0],
                                               num_samples=2,
                                               randomize=False)
        assert len(d_vector2) == 256
        assert np.sum(np.array(d_vector1) - np.array(d_vector2)) != 0
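What `num_samples` and `randomize` change in `get_mean_embedding`, as an illustrative numpy sketch (not the library implementation):

import numpy as np

def mean_embedding(embeddings, num_samples=None, randomize=False):
    # Average all of a speaker's clip embeddings, or a fixed/random subset.
    embs = np.asarray(embeddings, dtype=np.float32)
    if num_samples is not None:
        if randomize:
            idx = np.random.permutation(len(embs))[:num_samples]
        else:
            idx = np.arange(min(num_samples, len(embs)))
        embs = embs[idx]
    return embs.mean(axis=0)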
Example #13
    def test_speakers_file_processing():
        manager = SpeakerManager(d_vectors_file_path=d_vectors_file_path)
        print(manager.num_speakers)
        print(manager.d_vector_dim)
        print(manager.clip_ids)
        d_vector = manager.get_d_vector_by_clip(manager.clip_ids[0])
        assert len(d_vector) == 256
        d_vectors = manager.get_d_vectors_by_speaker(manager.speaker_names[0])
        assert len(d_vectors[0]) == 256
        d_vector1 = manager.get_mean_d_vector(manager.speaker_names[0], num_samples=2, randomize=True)
        assert len(d_vector1) == 256
        d_vector2 = manager.get_mean_d_vector(manager.speaker_names[0], num_samples=2, randomize=False)
        assert len(d_vector2) == 256
        assert np.sum(np.array(d_vector1) - np.array(d_vector2)) != 0
Example #14
    if isinstance(wav_file, list):
        speaker_name = wav_file[2]
        wav_file = wav_file[1]
    else:
        speaker_name = None

    mel_spec = ap.melspectrogram(ap.load_wav(wav_file, sr=ap.sample_rate)).T
    mel_spec = torch.FloatTensor(mel_spec[None, :, :])
    if args.use_cuda:
        mel_spec = mel_spec.cuda()
    embedd = model.compute_embedding(mel_spec)
    embedd = embedd.detach().cpu().numpy()

    # create speaker_mapping if target dataset is defined
    wav_file_name = os.path.basename(wav_file)
    speaker_mapping[wav_file_name] = {}
    speaker_mapping[wav_file_name]["name"] = speaker_name
    speaker_mapping[wav_file_name]["embedding"] = embedd.flatten().tolist()

if speaker_mapping:
    # save speaker_mapping if target dataset is defined
    if ".json" not in args.output_path:
        mapping_file_path = os.path.join(args.output_path, "speakers.json")
    else:
        mapping_file_path = args.output_path
    os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
    speaker_manager = SpeakerManager()
    # pylint: disable=W0212
    speaker_manager._save_json(mapping_file_path, speaker_mapping)
    print("Speaker embeddings saved at:", mapping_file_path)
Example #15
class Synthesizer(object):
    def __init__(
        self,
        tts_checkpoint: str,
        tts_config_path: str,
        tts_speakers_file: str = "",
        vocoder_checkpoint: str = "",
        vocoder_config: str = "",
        encoder_checkpoint: str = "",
        encoder_config: str = "",
        use_cuda: bool = False,
    ) -> None:
        """General 🐸 TTS interface for inference. It takes a tts and a vocoder
        model and synthesize speech from the provided text.

        The text is divided into a list of sentences using `pysbd` and synthesize
        speech on each sentence separately.

        If you have certain special characters in your text, you need to handle
        them before providing the text to Synthesizer.

        TODO: set the segmenter based on the source language

        Args:
            tts_checkpoint (str): path to the tts model file.
            tts_config_path (str): path to the tts config file.
            tts_speakers_file (str, optional): path to the speakers meta-data file for multi-speaker models. Defaults to `""`.
            vocoder_checkpoint (str, optional): path to the vocoder model file. Defaults to `""`.
            vocoder_config (str, optional): path to the vocoder config file. Defaults to `""`.
            encoder_checkpoint (str, optional): path to the speaker encoder model file. Defaults to `""`.
            encoder_config (str, optional): path to the speaker encoder config file. Defaults to `""`.
            use_cuda (bool, optional): enable/disable cuda. Defaults to False.
        """
        self.tts_checkpoint = tts_checkpoint
        self.tts_config_path = tts_config_path
        self.tts_speakers_file = tts_speakers_file
        self.vocoder_checkpoint = vocoder_checkpoint
        self.vocoder_config = vocoder_config
        self.encoder_checkpoint = encoder_checkpoint
        self.encoder_config = encoder_config
        self.use_cuda = use_cuda

        self.tts_model = None
        self.vocoder_model = None
        self.speaker_manager = None
        self.num_speakers = 0
        self.tts_speakers = {}
        self.d_vector_dim = 0
        self.seg = self._get_segmenter("en")

        if self.use_cuda:
            assert torch.cuda.is_available(), "CUDA is not available on this machine."
        self._load_tts(tts_checkpoint, tts_config_path, use_cuda)
        self.output_sample_rate = self.tts_config.audio["sample_rate"]
        if vocoder_checkpoint:
            self._load_vocoder(vocoder_checkpoint, vocoder_config, use_cuda)
            self.output_sample_rate = self.vocoder_config.audio["sample_rate"]

    @staticmethod
    def _get_segmenter(lang: str):
        """get the sentence segmenter for the given language.

        Args:
            lang (str): target language code.

        Returns:
            pysbd.Segmenter: sentence segmenter for the given language.
        """
        return pysbd.Segmenter(language=lang, clean=True)

    def _load_speakers(self, speaker_file: str) -> None:
        """Load the SpeakerManager to organize multi-speaker TTS. It loads the speakers meta-data and the speaker
        encoder if it is defined.

        Args:
            speaker_file (str): path to the speakers meta-data file.
        """
        print("Loading speakers ...")
        self.speaker_manager = SpeakerManager(
            encoder_model_path=self.encoder_checkpoint,
            encoder_config_path=self.encoder_config,
        )
        self.speaker_manager.load_d_vectors_file(
            self.tts_config.get("d_vector_file", speaker_file))
        self.num_speakers = self.speaker_manager.num_speakers
        self.d_vector_dim = self.speaker_manager.d_vector_dim

    def _set_tts_speaker_file(self):
        """Set the TTS speaker file used by a multi-speaker model."""
        # setup if multi-speaker settings are in the global model config
        if (hasattr(self.tts_config, "use_speaker_embedding")
                and self.tts_config.use_speaker_embedding is True):
            if self.tts_config.use_d_vector_file:
                self.tts_speakers_file = (self.tts_speakers_file
                                          or self.tts_config["d_vector_file"])
                self.tts_config["d_vector_file"] = self.tts_speakers_file
            else:
                self.tts_speakers_file = (self.tts_speakers_file
                                          or self.tts_config["speakers_file"])

        # setup if multi-speaker settings are in the model args config
        if (self.tts_speakers_file is None
                and hasattr(self.tts_config, "model_args") and hasattr(
                    self.tts_config.model_args, "use_speaker_embedding")
                and self.tts_config.model_args.use_speaker_embedding):
            _args = self.tts_config.model_args
            if _args.use_d_vector_file:
                self.tts_speakers_file = (self.tts_speakers_file
                                          or _args["d_vector_file"])
                _args["d_vector_file"] = self.tts_speakers_file
            else:
                self.tts_speakers_file = (self.tts_speakers_file
                                          or _args["speakers_file"])

    def _load_tts(self, tts_checkpoint: str, tts_config_path: str,
                  use_cuda: bool) -> None:
        """Load the TTS model.

        Args:
            tts_checkpoint (str): path to the model checkpoint.
            tts_config_path (str): path to the model config file.
            use_cuda (bool): enable/disable CUDA use.
        """
        # pylint: disable=global-statement

        self.tts_config = load_config(tts_config_path)

        # Patch stats_path
        stats_path = self.tts_config["audio"].get("stats_path", "")
        if stats_path and (not os.path.isfile(stats_path)):
            stats_path = os.path.join(os.path.dirname(tts_checkpoint),
                                      os.path.split(stats_path)[1])
            self.tts_config["audio"]["stats_path"] = stats_path

        # Patch speakers file
        speakers_file = self.tts_config.get("model_args",
                                            {}).get("speakers_file", "")
        if speakers_file and (not os.path.isfile(speakers_file)):
            speakers_file = os.path.join(os.path.dirname(tts_checkpoint),
                                         os.path.split(speakers_file)[1])
            self.tts_config["model_args"]["speakers_file"] = speakers_file

        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(verbose=False, **self.tts_config.audio)

        self.tts_model = setup_tts_model(config=self.tts_config)
        self.tts_model.load_checkpoint(self.tts_config,
                                       tts_checkpoint,
                                       eval=True)
        if use_cuda:
            self.tts_model.cuda()
        self._set_tts_speaker_file()

    def _load_vocoder(self, model_file: str, model_config: str,
                      use_cuda: bool) -> None:
        """Load the vocoder model.

        Args:
            model_file (str): path to the model checkpoint.
            model_config (str): path to the model config file.
            use_cuda (bool): enable/disable CUDA use.
        """
        self.vocoder_config = load_config(model_config)

        # Patch stats_path
        stats_path = self.vocoder_config["audio"].get("stats_path", "")
        if stats_path and (not os.path.isfile(stats_path)):
            stats_path = os.path.join(os.path.dirname(model_file),
                                      os.path.split(stats_path)[1])
            self.vocoder_config["audio"]["stats_path"] = stats_path

        self.vocoder_ap = AudioProcessor(verbose=False,
                                         **self.vocoder_config.audio)
        self.vocoder_model = setup_vocoder_model(self.vocoder_config)
        self.vocoder_model.load_checkpoint(self.vocoder_config,
                                           model_file,
                                           eval=True)
        if use_cuda:
            self.vocoder_model.cuda()

    def split_into_sentences(self, text) -> List[str]:
        """Split give text into sentences.

        Args:
            text (str): input text in string format.

        Returns:
            List[str]: list of sentences.
        """
        return self.seg.segment(text)

    def save_wav(self, wav: List[int], path: str) -> None:
        """Save the waveform as a file.

        Args:
            wav (List[int]): waveform as a list of values.
            path (str): output path to save the waveform.
        """
        wav = np.array(wav)
        self.ap.save_wav(wav, path, self.output_sample_rate)

    def tts(self,
            text: str,
            speaker_idx: str = "",
            speaker_wav=None,
            style_wav=None) -> List[int]:
        """🐸 TTS magic. Run all the models and generate speech.

        Args:
            text (str): input text.
            speaker_idx (str, optional): speaker id for multi-speaker models. Defaults to "".
            speaker_wav (optional): reference wav clip(s) used to compute a d-vector for voice cloning. Defaults to None.
            style_wav (optional): style waveform for GST. Defaults to None.

        Returns:
            List[int]: synthesized waveform as a list of sample values.
        """
        start_time = time.time()
        wavs = []
        sens = self.split_into_sentences(text)
        print(" > Text splitted to sentences.")
        print(sens)

        # handle multi-speaker
        speaker_embedding = None
        speaker_id = None
        if isinstance(speaker_idx, int):
            speaker_id = speaker_idx
        elif self.tts_speakers_file:
            if speaker_idx and isinstance(speaker_idx, str):
                if self.tts_config.use_d_vector_file:
                    # get the speaker embedding from the saved d_vectors.
                    speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(
                        speaker_idx)[0]
                else:
                    # get speaker idx from the speaker name
                    try:
                        speaker_id = self.tts_model.speaker_manager.speaker_ids[
                            speaker_idx]
                    except KeyError:
                        # Interpret as int
                        speaker_id = int(speaker_idx)

            elif not speaker_idx and not speaker_wav:
                raise ValueError(
                    " [!] Looks like you are using a multi-speaker model. "
                    "You need to define either a `speaker_idx` or a `speaker_wav` to use a multi-speaker model."
                )
            else:
                speaker_embedding = None
        else:
            if speaker_idx:
                raise ValueError(
                    f" [!] Missing speakers.json file path for selecting speaker {speaker_idx}. "
                    "Define a speakers.json path if it is a multi-speaker model, or remove the speaker_idx."
                )

        # compute a new d_vector from the given clip.
        if speaker_wav is not None:
            speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip(
                speaker_wav)

        use_gl = self.vocoder_model is None

        for sen in sens:
            # synthesize voice
            outputs = synthesis(
                model=self.tts_model,
                text=sen,
                CONFIG=self.tts_config,
                use_cuda=self.use_cuda,
                ap=self.ap,
                speaker_id=speaker_id,
                style_wav=style_wav,
                enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars,
                use_griffin_lim=use_gl,
                d_vector=speaker_embedding,
            )
            waveform = outputs["wav"]
            mel_postnet_spec = (
                outputs["outputs"]["model_outputs"][0].detach().cpu().numpy())
            if not use_gl:
                # denormalize tts output based on tts audio config
                mel_postnet_spec = self.ap.denormalize(mel_postnet_spec.T).T
                device_type = "cuda" if self.use_cuda else "cpu"
                # renormalize spectrogram based on vocoder config
                vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T)
                # compute scale factor for possible sample rate mismatch
                scale_factor = [
                    1,
                    self.vocoder_config["audio"]["sample_rate"] /
                    self.ap.sample_rate,
                ]
                if scale_factor[1] != 1:
                    print(" > interpolating tts model output.")
                    vocoder_input = interpolate_vocoder_input(
                        scale_factor, vocoder_input)
                else:
                    vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)  # pylint: disable=not-callable
                # run vocoder model
                # [1, T, C]
                waveform = self.vocoder_model.inference(
                    vocoder_input.to(device_type))
            if self.use_cuda and not use_gl:
                waveform = waveform.cpu()
            if not use_gl:
                waveform = waveform.numpy()
            waveform = waveform.squeeze()

            # trim silence
            waveform = trim_silence(waveform, self.ap)

            wavs += list(waveform)
            wavs += [0] * 10000

        # compute stats
        process_time = time.time() - start_time
        audio_time = len(wavs) / self.tts_config.audio["sample_rate"]
        print(f" > Processing time: {process_time}")
        print(f" > Real-time factor: {process_time / audio_time}")
        return wavs
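An end-to-end usage sketch for the class above; the checkpoint and config paths are placeholders, not shipped files:

synth = Synthesizer(
    tts_checkpoint="tts_model.pth",
    tts_config_path="tts_config.json",
    vocoder_checkpoint="vocoder_model.pth",
    vocoder_config="vocoder_config.json",
    use_cuda=False,
)
wav = synth.tts("This is an example.")
synth.save_wav(wav, "output.wav")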
Example #16
def main():
    """Run `tts` model training directly by a `config.json` file."""
    # init trainer args
    train_args = TrainingArgs()
    parser = train_args.init_argparse(arg_prefix="")

    # override trainer args from command-line args
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    # load config.json and register
    if args.config_path or args.continue_path:
        if args.config_path:
            # init from a file
            config = load_config(args.config_path)
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
        elif args.continue_path:
            # continue from a prev experiment
            config = load_config(
                os.path.join(args.continue_path, "config.json"))
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
    else:
        # init from console args
        from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

        config_base = BaseTrainingConfig()
        config_base.parse_known_args(config_overrides)
        config = register_config(config_base.model)()

    # load training samples
    train_samples, eval_samples = load_tts_samples(config.datasets,
                                                   eval_split=True)

    # setup audio processor
    ap = AudioProcessor(**config.audio)

    # init speaker manager
    if check_config_and_model_args(config, "use_speaker_embedding", True):
        speaker_manager = SpeakerManager(data_items=train_samples + eval_samples)
        if hasattr(config, "model_args"):
            config.model_args.num_speakers = speaker_manager.num_speakers
        else:
            config.num_speakers = speaker_manager.num_speakers
    elif check_config_and_model_args(config, "use_d_vector_file", True):
        if check_config_and_model_args(config, "use_speaker_encoder_as_loss",
                                       True):
            speaker_manager = SpeakerManager(
                d_vectors_file_path=config.model_args.d_vector_file,
                encoder_model_path=config.model_args.speaker_encoder_model_path,
                encoder_config_path=config.model_args.speaker_encoder_config_path,
                use_cuda=torch.cuda.is_available(),
            )
        else:
            speaker_manager = SpeakerManager(
                d_vectors_file_path=get_from_config_or_model_args(
                    config, "d_vector_file"))
        config.num_speakers = speaker_manager.num_speakers
        if hasattr(config, "model_args"):
            config.model_args.num_speakers = speaker_manager.num_speakers
    else:
        speaker_manager = None

    if check_config_and_model_args(config, "use_language_embedding", True):
        language_manager = LanguageManager(config=config)
        if hasattr(config, "model_args"):
            config.model_args.num_languages = language_manager.num_languages
        else:
            config.num_languages = language_manager.num_languages
    else:
        language_manager = None

    # init the model from config
    model = setup_model(config, speaker_manager, language_manager)

    # init the trainer and 🚀
    trainer = Trainer(
        train_args,
        config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        training_assets={"audio_processor": ap},
        parse_command_line_args=False,
    )
    trainer.fit()
Example #17
args = parser.parse_args()

c_dataset = load_config(args.config_dataset_path)

meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets,
                                                   eval_split=not args.no_eval)

if meta_data_eval is None:
    wav_files = meta_data_train
else:
    wav_files = meta_data_train + meta_data_eval

encoder_manager = SpeakerManager(
    encoder_model_path=args.model_path,
    encoder_config_path=args.config_path,
    d_vectors_file_path=args.old_file,
    use_cuda=args.use_cuda,
)

class_name_key = encoder_manager.encoder_config.class_name_key

# compute speaker embeddings
speaker_mapping = {}
for idx, wav_file in enumerate(tqdm(wav_files)):
    if isinstance(wav_file, dict):
        class_name = wav_file[class_name_key]
        wav_file = wav_file["audio_file"]
    else:
        class_name = None

    wav_file_name = os.path.basename(wav_file)
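The loop here (and in Example #21) must accept two sample layouts; folded into one helper as a sketch, where the default key is an assumption:

def unpack_sample(sample, class_name_key="speaker_name"):
    # Dict samples carry named fields; list samples are [text, audio, speaker].
    if isinstance(sample, dict):
        return sample["audio_file"], sample.get(class_name_key)
    return sample[1], sample[2]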
Example #18
# LOAD DATA SAMPLES
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# init speaker manager for multi-speaker training
# it maps speaker-id to speaker-name in the model and data-loader
speaker_manager = SpeakerManager()
speaker_manager.set_ids_from_data(train_samples + eval_samples,
                                  parse_key="speaker_name")
config.num_speakers = speaker_manager.num_speakers

# init model
model = GlowTTS(config, ap, tokenizer, speaker_manager=speaker_manager)

# INITIALIZE THE TRAINER
# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
# distributed training, etc.
trainer = Trainer(TrainerArgs(),
                  config,
                  output_path,
                  model=model,
                  train_samples=train_samples,
                  eval_samples=eval_samples)
trainer.fit()
Example #19
        help="Path to model config file.",
    )

    parser.add_argument(
        "config_dataset_path",
        type=str,
        help="Path to dataset config file.",
    )
    parser.add_argument("--use_cuda",
                        type=bool,
                        help="flag to set cuda.",
                        default=True)
    parser.add_argument("--eval",
                        type=bool,
                        help="compute eval.",
                        default=True)

    args = parser.parse_args()

    c_dataset = load_config(args.config_dataset_path)

    meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets,
                                                       eval_split=args.eval)
    items = meta_data_train + meta_data_eval

    enc_manager = SpeakerManager(encoder_model_path=args.model_path,
                                 encoder_config_path=args.config_path,
                                 use_cuda=args.use_cuda)

    compute_encoder_accuracy(items, enc_manager)
Example #20
    decoder_diff_spec_alpha=0.0,
    attention_norm="softmax",
    optimizer="Adam",
    lr_scheduler=None,
    lr=3e-5,
)

# init audio processor
ap = AudioProcessor(**config.audio.to_dict())

# load training samples
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

# init speaker manager for multi-speaker training
# it mainly handles speaker-id to speaker-name for the model and the data-loader
speaker_manager = SpeakerManager()
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)

# init model
model = Tacotron2(config, speaker_manager)

# init the trainer and 🚀
trainer = Trainer(
    TrainingArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
    training_assets={"audio_processor": ap},
)
Example #21
parser.add_argument("--use_cuda",
                    type=bool,
                    help="flag to set cuda.",
                    default=True)
parser.add_argument("--eval", type=bool, help="compute eval.", default=True)

args = parser.parse_args()

c_dataset = load_config(args.config_dataset_path)

meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets,
                                                   eval_split=args.eval)
wav_files = meta_data_train + meta_data_eval

speaker_manager = SpeakerManager(encoder_model_path=args.model_path,
                                 encoder_config_path=args.config_path,
                                 use_cuda=args.use_cuda)

# compute speaker embeddings
speaker_mapping = {}
for idx, wav_file in enumerate(tqdm(wav_files)):
    if isinstance(wav_file, list):
        speaker_name = wav_file[2]
        wav_file = wav_file[1]
    else:
        speaker_name = None

    # extract the embedding
    embedd = speaker_manager.compute_d_vector_from_clip(wav_file)

    # create speaker_mapping if target dataset is defined