def _init_language_manager(self):
    """Initialize the LanguageManager."""
    # set up only if multi-lingual settings are in the global model config
    language_manager = None
    if check_config_and_model_args(self.tts_config, "use_language_embedding", True):
        if self.tts_languages_file:
            language_manager = LanguageManager(language_ids_file_path=self.tts_languages_file)
        elif self.tts_config.get("language_ids_file", None):
            language_manager = LanguageManager(language_ids_file_path=self.tts_config.language_ids_file)
        else:
            language_manager = LanguageManager(config=self.tts_config)
    return language_manager
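# `check_config_and_model_args` resolves a flag that may live either at the top level
# of the config or nested under `model_args`. A minimal sketch of the assumed lookup
# semantics (illustrative only, not the library's exact implementation):
def _check_config_and_model_args_sketch(config, arg_name, value):
    # prefer the nested `model_args` namespace when the config defines one
    if hasattr(config, "model_args") and hasattr(config.model_args, arg_name):
        return getattr(config.model_args, arg_name) == value
    # otherwise fall back to a top-level attribute
    if hasattr(config, arg_name):
        return getattr(config, arg_name) == value
    return False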
def test_run_all_models():
    """Check if all the models are downloadable and tts models run correctly."""
    print(" > Run synthesizer with all the models.")
    download_dir = get_user_data_dir("tts")
    output_path = os.path.join(get_tests_output_path(), "output.wav")
    manager = ModelManager(output_prefix=get_tests_output_path())
    model_names = manager.list_models()
    for model_name in model_names:
        print(f"\n > Run - {model_name}")
        model_path, _, _ = manager.download_model(model_name)
        if "tts_models" in model_name:
            local_download_dir = os.path.dirname(model_path)
            # download and run the model
            speaker_files = glob.glob(local_download_dir + "/speaker*")
            language_files = glob.glob(local_download_dir + "/language*")
            language_id = ""
            if len(speaker_files) > 0:
                # multi-speaker model
                if "speaker_ids" in speaker_files[0]:
                    speaker_manager = SpeakerManager(speaker_id_file_path=speaker_files[0])
                elif "speakers" in speaker_files[0]:
                    speaker_manager = SpeakerManager(d_vectors_file_path=speaker_files[0])
                # multi-lingual model - assuming multi-lingual models are also multi-speaker
                if len(language_files) > 0 and "language_ids" in language_files[0]:
                    language_manager = LanguageManager(language_ids_file_path=language_files[0])
                    language_id = language_manager.language_names[0]
                speaker_id = list(speaker_manager.ids.keys())[0]
                run_cli(
                    f"tts --model_name {model_name} "
                    f'--text "This is an example." --out_path "{output_path}" --speaker_idx "{speaker_id}" --language_idx "{language_id}" '
                )
            else:
                # single-speaker model
                run_cli(
                    f"tts --model_name {model_name} "
                    f'--text "This is an example." --out_path "{output_path}"'
                )
            # remove downloaded models
            shutil.rmtree(download_dir)
        else:
            # only download the model
            manager.download_model(model_name)
        print(f" | > OK: {model_name}")

    folders = glob.glob(os.path.join(manager.output_prefix, "*"))
    assert len(folders) == len(model_names)
    shutil.rmtree(manager.output_prefix)
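# `run_cli` is assumed here to be the test suite's thin wrapper around `os.system`
# that fails the test on a non-zero exit status; a minimal equivalent sketch:
import os

def _run_cli_sketch(command):
    exit_status = os.system(command)
    assert exit_status == 0, f" [!] command `{command}` failed."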
def init_multilingual(self, config: Coqpit):
    """Initialize multilingual modules of a model.

    Args:
        config (Coqpit): Model configuration.
    """
    if self.args.language_ids_file is not None:
        self.language_manager = LanguageManager(language_ids_file_path=config.language_ids_file)

    if self.args.use_language_embedding and self.language_manager:
        print(" > initialization of language-embedding layers.")
        self.num_languages = self.language_manager.num_languages
        self.embedded_language_dim = self.args.embedded_language_dim
        self.emb_l = nn.Embedding(self.num_languages, self.embedded_language_dim)
        torch.nn.init.xavier_uniform_(self.emb_l.weight)
    else:
        self.embedded_language_dim = 0
        self.emb_l = None
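# A minimal sketch of how `emb_l` is typically consumed in the forward pass: language
# ids are embedded and the vector is broadcast along the time axis so it can be
# concatenated to the text-encoder features. Tensor names and shapes here are
# illustrative assumptions, not the model's exact code.
import torch
import torch.nn as nn

emb_l = nn.Embedding(4, 8)                   # e.g. 4 languages, 8-dim language embedding
torch.nn.init.xavier_uniform_(emb_l.weight)

lid = torch.tensor([2])                      # [B] one language id per utterance
lang_emb = emb_l(lid).unsqueeze(-1)          # [B, emb_dim, 1]
x = torch.randn(1, 192, 50)                  # [B, C, T] text-encoder features (assumed shape)
x = torch.cat([x, lang_emb.expand(-1, -1, x.size(2))], dim=1)  # [B, C + emb_dim, T]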
def main():
    """Run `tts` model training directly from a `config.json` file."""
    # init trainer args
    train_args = TrainingArgs()
    parser = train_args.init_argparse(arg_prefix="")

    # override trainer args from command-line args
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    # load config.json and register
    if args.config_path or args.continue_path:
        if args.config_path:
            # init from a file
            config = load_config(args.config_path)
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
        elif args.continue_path:
            # continue from a prev experiment
            config = load_config(os.path.join(args.continue_path, "config.json"))
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
    else:
        # init from console args
        from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

        config_base = BaseTrainingConfig()
        config_base.parse_known_args(config_overrides)
        config = register_config(config_base.model)()

    # load training samples
    train_samples, eval_samples = load_tts_samples(config.datasets, eval_split=True)

    # set up the audio processor
    ap = AudioProcessor(**config.audio)

    # init speaker manager
    if check_config_and_model_args(config, "use_speaker_embedding", True):
        speaker_manager = SpeakerManager(data_items=train_samples + eval_samples)
        if hasattr(config, "model_args"):
            config.model_args.num_speakers = speaker_manager.num_speakers
        else:
            config.num_speakers = speaker_manager.num_speakers
    elif check_config_and_model_args(config, "use_d_vector_file", True):
        if check_config_and_model_args(config, "use_speaker_encoder_as_loss", True):
            speaker_manager = SpeakerManager(
                d_vectors_file_path=config.model_args.d_vector_file,
                encoder_model_path=config.model_args.speaker_encoder_model_path,
                encoder_config_path=config.model_args.speaker_encoder_config_path,
                use_cuda=torch.cuda.is_available(),
            )
        else:
            speaker_manager = SpeakerManager(
                d_vectors_file_path=get_from_config_or_model_args(config, "d_vector_file")
            )
        config.num_speakers = speaker_manager.num_speakers
        if hasattr(config, "model_args"):
            config.model_args.num_speakers = speaker_manager.num_speakers
    else:
        speaker_manager = None

    if check_config_and_model_args(config, "use_language_embedding", True):
        language_manager = LanguageManager(config=config)
        if hasattr(config, "model_args"):
            config.model_args.num_languages = language_manager.num_languages
        else:
            config.num_languages = language_manager.num_languages
    else:
        language_manager = None

    # init the model from config
    model = setup_model(config, speaker_manager, language_manager)

    # init the trainer and 🚀
    trainer = Trainer(
        train_args,
        config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        training_assets={"audio_processor": ap},
        parse_command_line_args=False,
    )
    trainer.fit()
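# Assuming this `main()` lives in a launcher script (e.g. `TTS/bin/train_tts.py`),
# it would be guarded and invoked from the command line; paths below are placeholders:
#   python -m TTS.bin.train_tts --config_path config.json
#   python -m TTS.bin.train_tts --continue_path /path/to/previous/run
if __name__ == "__main__":
    main()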
    ],
)

# init audio processor
ap = AudioProcessor(**config.audio.to_dict())

# load training samples
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

# init speaker manager for multi-speaker training
# it maps speaker-id to speaker-name in the model and data-loader
speaker_manager = SpeakerManager()
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
config.model_args.num_speakers = speaker_manager.num_speakers

language_manager = LanguageManager(config=config)
config.model_args.num_languages = language_manager.num_languages

# init model
model = Vits(config, speaker_manager, language_manager)

# init the trainer and 🚀
trainer = Trainer(
    TrainingArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
    training_assets={"audio_processor": ap},
)
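# The head of this recipe (model config, `dataset_config`, `output_path`) is truncated
# above and left as-is. Mirroring `main()` earlier in this section, the recipe would
# presumably finish by launching training:
trainer.fit()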