Example #1
0
 def _init_language_manager(self):
     """Initialize the LanguageManager"""
     # setup if multi-lingual settings are in the global model config
     language_manager = None
     if check_config_and_model_args(self.tts_config,
                                    "use_language_embedding", True):
         if self.tts_languages_file:
             language_manager = LanguageManager(
                 language_ids_file_path=self.tts_languages_file)
         elif self.tts_config.get("language_ids_file", None):
             language_manager = LanguageManager(
                 language_ids_file_path=self.tts_config.language_ids_file)
         else:
             language_manager = LanguageManager(config=self.tts_config)
     return language_manager
Example #2
0
def test_run_all_models():
    """Check if all the models are downloadable and tts models run correctly."""
    print(" > Run synthesizer with all the models.")
    download_dir = get_user_data_dir("tts")
    output_path = os.path.join(get_tests_output_path(), "output.wav")
    manager = ModelManager(output_prefix=get_tests_output_path())
    model_names = manager.list_models()
    for model_name in model_names:
        print(f"\n > Run - {model_name}")
        model_path, _, _ = manager.download_model(model_name)
        if "tts_models" in model_name:
            local_download_dir = os.path.dirname(model_path)
            # download and run the model
            speaker_files = glob.glob(local_download_dir + "/speaker*")
            language_files = glob.glob(local_download_dir + "/language*")
            language_id = ""
            if len(speaker_files) > 0:
                # multi-speaker model
                if "speaker_ids" in speaker_files[0]:
                    speaker_manager = SpeakerManager(
                        speaker_id_file_path=speaker_files[0])
                elif "speakers" in speaker_files[0]:
                    speaker_manager = SpeakerManager(
                        d_vectors_file_path=speaker_files[0])

                # multi-lingual model - Assuming multi-lingual models are also multi-speaker
                if len(language_files
                       ) > 0 and "language_ids" in language_files[0]:
                    language_manager = LanguageManager(
                        language_ids_file_path=language_files[0])
                    language_id = language_manager.language_names[0]

                speaker_id = list(speaker_manager.ids.keys())[0]
                run_cli(
                    f"tts --model_name  {model_name} "
                    f'--text "This is an example." --out_path "{output_path}" --speaker_idx "{speaker_id}" --language_idx "{language_id}" '
                )
            else:
                # single-speaker model
                run_cli(
                    f"tts --model_name  {model_name} "
                    f'--text "This is an example." --out_path "{output_path}"')
            # remove downloaded models
            shutil.rmtree(download_dir)
        else:
            # only download the model
            manager.download_model(model_name)
        print(f" | > OK: {model_name}")

    folders = glob.glob(os.path.join(manager.output_prefix, "*"))
    assert len(folders) == len(model_names)
    shutil.rmtree(manager.output_prefix)
Example #3
0
    def init_multilingual(self, config: Coqpit):
        """Initialize multilingual modules of a model.

        Args:
            config (Coqpit): Model configuration.
        """
        if self.args.language_ids_file is not None:
            self.language_manager = LanguageManager(
                language_ids_file_path=config.language_ids_file)

        if self.args.use_language_embedding and self.language_manager:
            print(" > initialization of language-embedding layers.")
            self.num_languages = self.language_manager.num_languages
            self.embedded_language_dim = self.args.embedded_language_dim
            self.emb_l = nn.Embedding(self.num_languages,
                                      self.embedded_language_dim)
            torch.nn.init.xavier_uniform_(self.emb_l.weight)
        else:
            self.embedded_language_dim = 0
            self.emb_l = None
Example #4
0
def main():
    """Run `tts` model training directly by a `config.json` file."""
    # init trainer args
    train_args = TrainingArgs()
    parser = train_args.init_argparse(arg_prefix="")

    # override trainer args from comman-line args
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    # load config.json and register
    if args.config_path or args.continue_path:
        if args.config_path:
            # init from a file
            config = load_config(args.config_path)
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
        elif args.continue_path:
            # continue from a prev experiment
            config = load_config(
                os.path.join(args.continue_path, "config.json"))
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
        else:
            # init from console args
            from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

            config_base = BaseTrainingConfig()
            config_base.parse_known_args(config_overrides)
            config = register_config(config_base.model)()

    # load training samples
    train_samples, eval_samples = load_tts_samples(config.datasets,
                                                   eval_split=True)

    # setup audio processor
    ap = AudioProcessor(**config.audio)

    # init speaker manager
    if check_config_and_model_args(config, "use_speaker_embedding", True):
        speaker_manager = SpeakerManager(data_items=train_samples +
                                         eval_samples)
        if hasattr(config, "model_args"):
            config.model_args.num_speakers = speaker_manager.num_speakers
        else:
            config.num_speakers = speaker_manager.num_speakers
    elif check_config_and_model_args(config, "use_d_vector_file", True):
        if check_config_and_model_args(config, "use_speaker_encoder_as_loss",
                                       True):
            speaker_manager = SpeakerManager(
                d_vectors_file_path=config.model_args.d_vector_file,
                encoder_model_path=config.model_args.
                speaker_encoder_model_path,
                encoder_config_path=config.model_args.
                speaker_encoder_config_path,
                use_cuda=torch.cuda.is_available(),
            )
        else:
            speaker_manager = SpeakerManager(
                d_vectors_file_path=get_from_config_or_model_args(
                    config, "d_vector_file"))
        config.num_speakers = speaker_manager.num_speakers
        if hasattr(config, "model_args"):
            config.model_args.num_speakers = speaker_manager.num_speakers
    else:
        speaker_manager = None

    if check_config_and_model_args(config, "use_language_embedding", True):
        language_manager = LanguageManager(config=config)
        if hasattr(config, "model_args"):
            config.model_args.num_languages = language_manager.num_languages
        else:
            config.num_languages = language_manager.num_languages
    else:
        language_manager = None

    # init the model from config
    model = setup_model(config, speaker_manager, language_manager)

    # init the trainer and 🚀
    trainer = Trainer(
        train_args,
        config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        training_assets={"audio_processor": ap},
        parse_command_line_args=False,
    )
    trainer.fit()
Example #5
0
    ],
)

# init audio processor
ap = AudioProcessor(**config.audio.to_dict())

# load training samples
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

# init speaker manager for multi-speaker training
# it maps speaker-id to speaker-name in the model and data-loader
speaker_manager = SpeakerManager()
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
config.model_args.num_speakers = speaker_manager.num_speakers

language_manager = LanguageManager(config=config)
config.model_args.num_languages = language_manager.num_languages

# init model
model = Vits(config, speaker_manager, language_manager)

# init the trainer and 🚀
trainer = Trainer(
    TrainingArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
    training_assets={"audio_processor": ap},
)