def _create_random_model(self):  # pylint: disable=global-statement
    config = load_config(os.path.join(get_tests_output_path(), "dummy_model_config.json"))
    model = setup_model(config)
    output_path = get_tests_output_path()
    # save a dummy checkpoint; the two `None`s stand in for optimizer and scaler state
    save_checkpoint(config, model, None, None, 10, 1, output_path)
def main():
    """Run `tts` model training directly from a `config.json` file."""
    # init trainer args
    train_args = TrainTTSArgs()
    parser = train_args.init_argparse(arg_prefix="")

    # override trainer args from command-line args
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    # load config.json and register
    if args.config_path or args.continue_path:
        if args.config_path:
            # init from a file
            config = load_config(args.config_path)
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
        elif args.continue_path:
            # continue from a previous experiment
            config = load_config(os.path.join(args.continue_path, "config.json"))
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
    else:
        # init from console args
        from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

        config_base = BaseTrainingConfig()
        config_base.parse_known_args(config_overrides)
        config = register_config(config_base.model)()

    # load training samples
    train_samples, eval_samples = load_tts_samples(
        config.datasets,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )

    # init the model from config
    model = setup_model(config, train_samples + eval_samples)

    # init the trainer and 🚀
    trainer = Trainer(
        train_args,
        model.config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        parse_command_line_args=False,
    )
    trainer.fit()
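# The function above is meant to be run as a script, e.g.
#   python TTS/bin/train_tts.py --config_path config.json
# or resumed with `--continue_path <experiment_dir>`. A minimal entry-point
# guard for that usage (a sketch; assumes the function lives in an executable
# module):
if __name__ == "__main__":
    main()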
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data, speaker_manager

    # Audio processor
    ap = AudioProcessor(**c.audio)

    # load data instances
    meta_data_train, meta_data_eval = load_tts_samples(
        c.datasets,
        eval_split=args.eval,
        eval_split_max_size=c.eval_split_max_size,
        eval_split_size=c.eval_split_size,
    )

    # use eval and training partitions
    meta_data = meta_data_train + meta_data_eval

    # init speaker manager
    if c.use_speaker_embedding:
        speaker_manager = SpeakerManager(data_items=meta_data)
    elif c.use_d_vector_file:
        speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file)
    else:
        speaker_manager = None

    # setup model
    model = setup_model(c)

    # restore model
    model.load_checkpoint(c, args.checkpoint_path, eval=True)

    if use_cuda:
        model.cuda()

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    # set r
    r = 1 if c.model.lower() == "glow_tts" else model.decoder.r
    own_loader = setup_loader(ap, r, verbose=True)
    extract_spectrograms(
        own_loader,
        model,
        ap,
        args.output_path,
        quantized_wav=args.quantized,
        save_audio=args.save_audio,
        debug=args.debug,
        metada_name="metada.txt",  # keyword spelling matches the extractor's signature
    )
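# `main` above reads the module-level globals `c` (the loaded config) and
# `use_cuda`, which the script sets up while parsing its CLI arguments. A
# minimal sketch of that setup; the flag names mirror the `args.*` attributes
# used above, but their types and defaults are assumptions. `load_config` is
# assumed to be imported at module level, as elsewhere in the file.
import argparse

import torch

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_path", type=str, required=True)
    parser.add_argument("--checkpoint_path", type=str, required=True)
    parser.add_argument("--output_path", type=str, required=True)
    parser.add_argument("--quantized", action="store_true")
    parser.add_argument("--save_audio", action="store_true")
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("--eval", action="store_true")
    args = parser.parse_args()

    c = load_config(args.config_path)
    use_cuda = torch.cuda.is_available()
    main(args)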
def test_Tacotron():
    # set paths
    config_path = os.path.join(get_tests_input_path(), "test_tacotron_config.json")
    checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth")
    output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")

    # load config
    c = load_config(config_path)

    # create model
    model = setup_model(c)

    # save model
    torch.save({"model": model.state_dict()}, checkpoint_path)

    # run test
    run_cli(
        f'CUDA_VISIBLE_DEVICES="" python TTS/bin/extract_tts_spectrograms.py '
        f'--config_path "{config_path}" --checkpoint_path "{checkpoint_path}" --output_path "{output_path}"'
    )
    run_cli(f'rm -rf "{output_path}" "{checkpoint_path}"')
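# `run_cli` is a small test utility; a minimal sketch of what it is assumed to
# do: run a shell command and fail the test on a non-zero exit status.
import os

def run_cli(command: str) -> None:
    exit_status = os.system(command)
    assert exit_status == 0, f" [!] command `{command}` failed."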
def main():
    """Run `tts` model training directly from a `config.json` file."""
    # init trainer args
    train_args = TrainingArgs()
    parser = train_args.init_argparse(arg_prefix="")

    # override trainer args from command-line args
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    # load config.json and register
    if args.config_path or args.continue_path:
        if args.config_path:
            # init from a file
            config = load_config(args.config_path)
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
        elif args.continue_path:
            # continue from a previous experiment
            config = load_config(os.path.join(args.continue_path, "config.json"))
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
    else:
        # init from console args
        from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

        config_base = BaseTrainingConfig()
        config_base.parse_known_args(config_overrides)
        config = register_config(config_base.model)()

    # load training samples
    train_samples, eval_samples = load_tts_samples(config.datasets, eval_split=True)

    # setup audio processor
    ap = AudioProcessor(**config.audio)

    # init speaker manager
    if check_config_and_model_args(config, "use_speaker_embedding", True):
        speaker_manager = SpeakerManager(data_items=train_samples + eval_samples)
        if hasattr(config, "model_args"):
            config.model_args.num_speakers = speaker_manager.num_speakers
        else:
            config.num_speakers = speaker_manager.num_speakers
    elif check_config_and_model_args(config, "use_d_vector_file", True):
        if check_config_and_model_args(config, "use_speaker_encoder_as_loss", True):
            speaker_manager = SpeakerManager(
                d_vectors_file_path=config.model_args.d_vector_file,
                encoder_model_path=config.model_args.speaker_encoder_model_path,
                encoder_config_path=config.model_args.speaker_encoder_config_path,
                use_cuda=torch.cuda.is_available(),
            )
        else:
            speaker_manager = SpeakerManager(
                d_vectors_file_path=get_from_config_or_model_args(config, "d_vector_file")
            )
        config.num_speakers = speaker_manager.num_speakers
        if hasattr(config, "model_args"):
            config.model_args.num_speakers = speaker_manager.num_speakers
    else:
        speaker_manager = None

    # init language manager
    if check_config_and_model_args(config, "use_language_embedding", True):
        language_manager = LanguageManager(config=config)
        if hasattr(config, "model_args"):
            config.model_args.num_languages = language_manager.num_languages
        else:
            config.num_languages = language_manager.num_languages
    else:
        language_manager = None

    # init the model from config
    model = setup_model(config, speaker_manager, language_manager)

    # init the trainer and 🚀
    trainer = Trainer(
        train_args,
        config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        training_assets={"audio_processor": ap},
        parse_command_line_args=False,
    )
    trainer.fit()
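# `check_config_and_model_args` and `get_from_config_or_model_args` resolve a
# field either on `config` itself or on the nested `config.model_args`,
# whichever defines it. A minimal sketch of that lookup (an assumption about
# the helpers' behavior, not the library's exact implementation):
def check_config_and_model_args(config, arg_name, value):
    # prefer the nested `model_args` when the model config defines one
    if hasattr(config, "model_args") and hasattr(config.model_args, arg_name):
        return getattr(config.model_args, arg_name) == value
    return getattr(config, arg_name, None) == value

def get_from_config_or_model_args(config, arg_name):
    if hasattr(config, "model_args") and hasattr(config.model_args, arg_name):
        return getattr(config.model_args, arg_name)
    return getattr(config, arg_name)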
    default=16,
    type=int,
    help="Batch size for the model. Use batch_size=1 if you have no CUDA.",
)
args = parser.parse_args()

C = load_config(args.config_path)
ap = AudioProcessor(**C.audio)

# if the vocabulary was passed, replace the default
if "characters" in C.keys():
    symbols, phonemes = make_symbols(**C.characters)

# load the model
num_chars = len(phonemes) if C.use_phonemes else len(symbols)
# TODO: handle multi-speaker
model = setup_model(C)
model, _ = load_checkpoint(model, args.model_path, args.use_cuda, True)

# data loader
preprocessor = importlib.import_module("TTS.tts.datasets.formatters")
preprocessor = getattr(preprocessor, args.dataset)
meta_data = preprocessor(args.data_path, args.dataset_metafile)
dataset = TTSDataset(
    model.decoder.r,
    C.text_cleaner,
    compute_linear_spec=False,
    ap=ap,
    meta_data=meta_data,
    characters=C.characters if "characters" in C.keys() else None,
    add_blank=C["add_blank"] if "add_blank" in C.keys() else False,
    use_phonemes=C.use_phonemes,
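# A hypothetical continuation after the TTSDataset construction: the dataset
# would typically be consumed through a standard PyTorch DataLoader using the
# dataset's own collate function. `args.batch_size`, the worker count, and the
# other loader settings here are assumptions, not the script's actual code.
from torch.utils.data import DataLoader

loader = DataLoader(
    dataset,
    batch_size=args.batch_size,
    collate_fn=dataset.collate_fn,
    num_workers=4,
    shuffle=False,
    drop_last=False,
)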
parser.add_argument("--config_path", type=str, help="Path to config file of torch model.")
parser.add_argument(
    "--output_path",
    type=str,
    help="path to output file including file name to save TF model.",
)
args = parser.parse_args()

# load model config
config_path = args.config_path
c = load_config(config_path)
num_speakers = 0

# init torch model
model = setup_model(c)
checkpoint = load_fsspec(args.torch_model_path, map_location=torch.device("cpu"))
state_dict = checkpoint["model"]
model.load_state_dict(state_dict)

# init tf model
num_chars = len(phonemes) if c.use_phonemes else len(symbols)
model_tf = Tacotron2(
    num_chars=num_chars,
    num_speakers=num_speakers,
    r=model.decoder.r,
    out_channels=c.audio["num_mels"],
    decoder_output_dim=c.audio["num_mels"],
    attn_type=c.attention_type,
    attn_win=c.windowing,