def main(): """Run `tts` model training directly by a `config.json` file.""" # init trainer args train_args = TrainingArgs() parser = train_args.init_argparse(arg_prefix="") # override trainer args from comman-line args args, config_overrides = parser.parse_known_args() train_args.parse_args(args) # load config.json and register if args.config_path or args.continue_path: if args.config_path: # init from a file config = load_config(args.config_path) if len(config_overrides) > 0: config.parse_known_args(config_overrides, relaxed_parser=True) elif args.continue_path: # continue from a prev experiment config = load_config( os.path.join(args.continue_path, "config.json")) if len(config_overrides) > 0: config.parse_known_args(config_overrides, relaxed_parser=True) else: # init from console args from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel config_base = BaseTrainingConfig() config_base.parse_known_args(config_overrides) config = register_config(config_base.model)() # load training samples if "feature_path" in config and config.feature_path: # load pre-computed features print(f" > Loading features from: {config.feature_path}") eval_samples, train_samples = load_wav_feat_data( config.data_path, config.feature_path, config.eval_split_size) else: # load data raw wav files eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) # setup audio processor ap = AudioProcessor(**config.audio) # init the model from config model = setup_model(config) # init the trainer and 🚀 trainer = Trainer( train_args, config, config.output_path, model=model, train_samples=train_samples, eval_samples=eval_samples, training_assets={"audio_processor": ap}, parse_command_line_args=False, ) trainer.fit()
def main(): """ Call train.py as a new process and pass command arguments """ parser = TrainingArgs().init_argparse(arg_prefix="") parser.add_argument("--script", type=str, help="Target training script to distibute.") args, unargs = parser.parse_known_args() num_gpus = torch.cuda.device_count() group_id = time.strftime("%Y_%m_%d-%H%M%S") # set arguments for train.py folder_path = pathlib.Path(__file__).parent.absolute() if os.path.exists(os.path.join(folder_path, args.script)): command = [os.path.join(folder_path, args.script)] else: command = [args.script] command.append("--continue_path={}".format(args.continue_path)) command.append("--restore_path={}".format(args.restore_path)) command.append("--config_path={}".format(args.config_path)) command.append("--group_id=group_{}".format(group_id)) command.append("--use_ddp=true") command += unargs command.append("") # run processes processes = [] for i in range(num_gpus): my_env = os.environ.copy() my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i) command[-1] = "--rank={}".format(i) # prevent stdout for processes with rank != 0 stdout = None p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env) # pylint: disable=consider-using-with processes.append(p) print(command) for p in processes: p.wait()
optimizer="Adam", lr_scheduler=None, lr=3e-5, ) # init audio processor ap = AudioProcessor(**config.audio.to_dict()) # load training samples train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training # it mainly handles speaker-id to speaker-name for the model and the data-loader speaker_manager = SpeakerManager() speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) # init model model = Tacotron2(config, speaker_manager) # init the trainer and 🚀 trainer = Trainer( TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples, training_assets={"audio_processor": ap}, ) trainer.fit()
import os

from TTS.trainer import Trainer, TrainingArgs, init_training
from TTS.tts.configs import AlignTTSConfig, BaseDatasetConfig

output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(
    name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
)
config = AlignTTSConfig(
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    text_cleaner="english_cleaners",
    use_phonemes=False,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    print_step=25,
    print_eval=True,
    mixed_precision=False,
    output_path=output_path,
    datasets=[dataset_config],
)
args, config, output_path, _, c_logger, dashboard_logger = init_training(TrainingArgs(), config)
trainer = Trainer(args, config, output_path, c_logger, dashboard_logger)
trainer.fit()
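# Usage sketch: assuming this recipe is saved as train_align_tts.py (the file
# name is an assumption) with LJSpeech-1.1 extracted one level up, as the
# dataset `path` above expects, it can be launched on a single GPU with:
#   CUDA_VISIBLE_DEVICES=0 python train_align_tts.py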
import os

from TTS.trainer import Trainer, TrainingArgs, init_training
from TTS.vocoder.configs import UnivnetConfig

output_path = os.path.dirname(os.path.abspath(__file__))
config = UnivnetConfig(
    batch_size=64,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    seq_len=8192,
    pad_short=2000,
    use_noise_augment=True,
    eval_split_size=10,
    print_step=25,
    print_eval=False,
    mixed_precision=False,
    lr_gen=1e-4,
    lr_disc=1e-4,
    data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
    output_path=output_path,
)
args, config, output_path, _, c_logger, dashboard_logger = init_training(TrainingArgs(), config)
trainer = Trainer(args, config, output_path, c_logger, dashboard_logger)
trainer.fit()
def main(): """Run `tts` model training directly by a `config.json` file.""" # init trainer args train_args = TrainingArgs() parser = train_args.init_argparse(arg_prefix="") # override trainer args from comman-line args args, config_overrides = parser.parse_known_args() train_args.parse_args(args) # load config.json and register if args.config_path or args.continue_path: if args.config_path: # init from a file config = load_config(args.config_path) if len(config_overrides) > 0: config.parse_known_args(config_overrides, relaxed_parser=True) elif args.continue_path: # continue from a prev experiment config = load_config( os.path.join(args.continue_path, "config.json")) if len(config_overrides) > 0: config.parse_known_args(config_overrides, relaxed_parser=True) else: # init from console args from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel config_base = BaseTrainingConfig() config_base.parse_known_args(config_overrides) config = register_config(config_base.model)() # load training samples train_samples, eval_samples = load_tts_samples(config.datasets, eval_split=True) # setup audio processor ap = AudioProcessor(**config.audio) # init speaker manager if check_config_and_model_args(config, "use_speaker_embedding", True): speaker_manager = SpeakerManager(data_items=train_samples + eval_samples) if hasattr(config, "model_args"): config.model_args.num_speakers = speaker_manager.num_speakers else: config.num_speakers = speaker_manager.num_speakers elif check_config_and_model_args(config, "use_d_vector_file", True): if check_config_and_model_args(config, "use_speaker_encoder_as_loss", True): speaker_manager = SpeakerManager( d_vectors_file_path=config.model_args.d_vector_file, encoder_model_path=config.model_args. speaker_encoder_model_path, encoder_config_path=config.model_args. speaker_encoder_config_path, use_cuda=torch.cuda.is_available(), ) else: speaker_manager = SpeakerManager( d_vectors_file_path=get_from_config_or_model_args( config, "d_vector_file")) config.num_speakers = speaker_manager.num_speakers if hasattr(config, "model_args"): config.model_args.num_speakers = speaker_manager.num_speakers else: speaker_manager = None if check_config_and_model_args(config, "use_language_embedding", True): language_manager = LanguageManager(config=config) if hasattr(config, "model_args"): config.model_args.num_languages = language_manager.num_languages else: config.num_languages = language_manager.num_languages else: language_manager = None # init the model from config model = setup_model(config, speaker_manager, language_manager) # init the trainer and 🚀 trainer = Trainer( train_args, config, config.output_path, model=model, train_samples=train_samples, eval_samples=eval_samples, training_assets={"audio_processor": ap}, parse_command_line_args=False, ) trainer.fit()
    spec_gain=1.0,
    signal_norm=False,
    do_amp_to_db_linear=False,
)

config = VitsConfig(
    audio=audio_config,
    run_name="vits_ljspeech",
    batch_size=48,
    eval_batch_size=16,
    batch_group_size=5,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    text_cleaner="english_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    compute_input_seq_cache=True,
    print_step=25,
    print_eval=True,
    mixed_precision=True,
    max_seq_len=500000,
    output_path=output_path,
    datasets=[dataset_config],
)

args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config)
trainer = Trainer(args, config, output_path, c_logger, tb_logger, cudnn_benchmark=True)
trainer.fit()
from TTS.trainer import TrainingArgs


def init_arguments():
    """Build an argparse parser exposing the trainer's command-line flags."""
    train_args = TrainingArgs()
    parser = train_args.init_argparse(arg_prefix="")
    return parser
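# Minimal usage sketch for the helper above: build the parser and read the
# trainer flags (e.g. --config_path); the printed values are whatever the
# user passes on the command line.
if __name__ == "__main__":
    arg_parser = init_arguments()
    cli_args, _ = arg_parser.parse_known_args()
    print(cli_args)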