def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data_train
    global meta_data_eval

    ap = AudioProcessor(**c.audio)
    model = setup_speaker_encoder_model(c)

    optimizer = RAdam(model.parameters(), lr=c.lr)

    # pylint: disable=redefined-outer-name
    meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=False)

    data_loader, num_speakers = setup_loader(ap, is_val=False, verbose=True)

    if c.loss == "ge2e":
        criterion = GE2ELoss(loss_method="softmax")
    elif c.loss == "angleproto":
        criterion = AngleProtoLoss()
    elif c.loss == "softmaxproto":
        criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_speakers)
    else:
        raise Exception("%s is not a supported loss." % c.loss)

    if args.restore_path:
        checkpoint = load_fsspec(args.restore_path)
        try:
            model.load_state_dict(checkpoint["model"])

            if "criterion" in checkpoint:
                criterion.load_state_dict(checkpoint["criterion"])

        except (KeyError, RuntimeError):
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint["model"], c)
            model.load_state_dict(model_dict)
            del model_dict
        for group in optimizer.param_groups:
            group["lr"] = c.lr

        print(" > Model restored from step %d" % checkpoint["step"], flush=True)
        args.restore_step = checkpoint["step"]
    else:
        args.restore_step = 0

    if c.lr_decay:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if use_cuda:
        model = model.cuda()
        criterion.cuda()

    global_step = args.restore_step
    _, global_step = train(model, optimizer, scheduler, criterion, data_loader, global_step)
def _create_dataloader(self, batch_size, r, bgs, start_by_longest=False):
    # load dataset
    meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2)
    items = meta_data_train + meta_data_eval
    tokenizer, _ = TTSTokenizer.init_from_config(c)
    dataset = TTSDataset(
        outputs_per_step=r,
        compute_linear_spec=True,
        return_wav=True,
        tokenizer=tokenizer,
        ap=self.ap,
        samples=items,
        batch_group_size=bgs,
        min_text_len=c.min_text_len,
        max_text_len=c.max_text_len,
        min_audio_len=c.min_audio_len,
        max_audio_len=c.max_audio_len,
        start_by_longest=start_by_longest,
    )
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=dataset.collate_fn,
        drop_last=True,
        num_workers=c.num_loader_workers,
    )
    return dataloader, dataset
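# A minimal usage sketch for the helper above, as it might appear inside a test
# method of the same class. The test name and the argument values
# (batch_size=2, r=1, bgs=0) are illustrative assumptions, not values
# prescribed by the repo.
def test_loader_smoke(self):  # hypothetical test name
    dataloader, dataset = self._create_dataloader(2, 1, 0)
    batch = next(iter(dataloader))  # one batch built by TTSDataset.collate_fn
    assert len(dataset) > 0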
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data_train
    global meta_data_eval
    global train_classes

    ap = AudioProcessor(**c.audio)
    model = setup_encoder_model(c)
    optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model)

    # pylint: disable=redefined-outer-name
    meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)

    train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True)
    if c.run_eval:
        eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True)
    else:
        eval_data_loader = None

    num_classes = len(train_classes)
    criterion = model.get_criterion(c, num_classes)

    if c.loss == "softmaxproto" and c.model != "speaker_encoder":
        c.map_classid_to_classname = map_classid_to_classname
        copy_model_files(c, OUT_PATH)

    if args.restore_path:
        criterion, args.restore_step = model.load_checkpoint(
            c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion
        )
        print(" > Model restored from step %d" % args.restore_step, flush=True)
    else:
        args.restore_step = 0

    if c.lr_decay:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if use_cuda:
        model = model.cuda()
        criterion.cuda()

    global_step = args.restore_step
    _, global_step = train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step)
def main():
    # pylint: disable=W0601
    global c
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Find all the unique characters or phonemes in a dataset.\n\n"""
        """
    Example runs:

    python TTS/bin/find_unique_phonemes.py --config_path config.json
    """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
    args = parser.parse_args()

    c = load_config(args.config_path)

    # load all datasets
    train_items, eval_items = load_tts_samples(
        c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
    )
    items = train_items + eval_items
    print("Num items:", len(items))

    is_lang_def = all(item["language"] for item in items)

    if not c.phoneme_language or not is_lang_def:
        raise ValueError("Phoneme language must be defined in config.")

    phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
    phones = []
    for ph in phonemes:
        phones.extend(ph)
    phones = set(phones)
    lower_phones = filter(lambda c: c.islower(), phones)
    phones_force_lower = [c.lower() for c in phones]
    phones_force_lower = set(phones_force_lower)

    print(f" > Number of unique phonemes: {len(phones)}")
    print(f" > Unique phonemes: {''.join(sorted(phones))}")
    print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}")
    print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}")
def main():
    """Run `tts` model training directly by a `config.json` file."""
    # init trainer args
    train_args = TrainTTSArgs()
    parser = train_args.init_argparse(arg_prefix="")

    # override trainer args from command-line args
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    # load config.json and register
    if args.config_path or args.continue_path:
        if args.config_path:
            # init from a file
            config = load_config(args.config_path)
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
        elif args.continue_path:
            # continue from a prev experiment
            config = load_config(os.path.join(args.continue_path, "config.json"))
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
    else:
        # init from console args
        from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

        config_base = BaseTrainingConfig()
        config_base.parse_known_args(config_overrides)
        config = register_config(config_base.model)()

    # load training samples
    train_samples, eval_samples = load_tts_samples(
        config.datasets,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )

    # init the model from config
    model = setup_model(config, train_samples + eval_samples)

    # init the trainer and 🚀
    trainer = Trainer(
        train_args,
        model.config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        parse_command_line_args=False,
    )
    trainer.fit()
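# Typical invocations of the entry point above (assuming the script lives at
# TTS/bin/train_tts.py, as in the 🐸TTS repo layout):
#
#   python TTS/bin/train_tts.py --config_path config.json
#   python TTS/bin/train_tts.py --continue_path /path/to/previous/run/
#
# Any extra `--key value` pairs left over by `parse_known_args` are applied as
# config overrides through `config.parse_known_args(..., relaxed_parser=True)`.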
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data, speaker_manager

    # Audio processor
    ap = AudioProcessor(**c.audio)

    # load data instances
    meta_data_train, meta_data_eval = load_tts_samples(
        c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
    )

    # use eval and training partitions
    meta_data = meta_data_train + meta_data_eval

    # init speaker manager
    if c.use_speaker_embedding:
        speaker_manager = SpeakerManager(data_items=meta_data)
    elif c.use_d_vector_file:
        speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file)
    else:
        speaker_manager = None

    # setup model
    model = setup_model(c)

    # restore model
    model.load_checkpoint(c, args.checkpoint_path, eval=True)

    if use_cuda:
        model.cuda()

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    # set r
    r = 1 if c.model.lower() == "glow_tts" else model.decoder.r
    own_loader = setup_loader(ap, r, verbose=True)

    extract_spectrograms(
        own_loader,
        model,
        ap,
        args.output_path,
        quantized_wav=args.quantized,
        save_audio=args.save_audio,
        debug=args.debug,
        metada_name="metada.txt",
    )
def main():
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Find all the unique characters or phonemes in a dataset.\n\n"""
        """
    Example runs:

    python TTS/bin/find_unique_chars.py --config_path config.json
    """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
    args = parser.parse_args()

    c = load_config(args.config_path)

    # load all datasets
    train_items, eval_items = load_tts_samples(
        c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
    )

    items = train_items + eval_items

    texts = "".join(item["text"] for item in items)
    chars = set(texts)
    lower_chars = filter(lambda c: c.islower(), chars)
    chars_force_lower = [c.lower() for c in chars]
    chars_force_lower = set(chars_force_lower)

    print(f" > Number of unique characters: {len(chars)}")
    print(f" > Unique characters: {''.join(sorted(chars))}")
    print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
    print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
# It mainly serves the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)

# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# If characters are not defined in the config, default characters are passed to the config.
tokenizer, config = TTSTokenizer.init_from_config(config)

# LOAD DATA SAMPLES
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# init model
model = ForwardTTS(config, ap, tokenizer)

# INITIALIZE THE TRAINER
# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
# distributed training, etc.
trainer = Trainer(
    TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
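# As the comments above suggest, a custom formatter can be handed to
# `load_tts_samples`. Below is a minimal sketch of one, assuming a
# pipe-separated `audio_id|transcript` metadata layout and a `formatter`
# keyword on `load_tts_samples`; the function name, file layout, and speaker
# name are illustrative assumptions.
import os


def my_formatter(root_path, meta_file, **kwargs):  # hypothetical formatter
    """Parse `audio_id|transcript` lines into 🐸TTS sample dicts."""
    samples = []
    with open(os.path.join(root_path, meta_file), encoding="utf-8") as f:
        for line in f:
            audio_id, text = line.strip().split("|", maxsplit=1)
            samples.append(
                {
                    "text": text,
                    "audio_file": os.path.join(root_path, "wavs", audio_id + ".wav"),
                    "speaker_name": "my_speaker",  # single-speaker assumption
                    "root_path": root_path,
                }
            )
    return samples


# train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, formatter=my_formatter)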
    use_speaker_embedding=True,  # set this to enable multi-speaker training
    decoder_ssim_alpha=0.0,  # disable ssim losses that cause NaN for some runs.
    postnet_ssim_alpha=0.0,
    postnet_diff_spec_alpha=0.0,
    decoder_diff_spec_alpha=0.0,
    attention_norm="softmax",
    optimizer="Adam",
    lr_scheduler=None,
    lr=3e-5,
)

# init audio processor
ap = AudioProcessor(**config.audio.to_dict())

# load training samples
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

# init speaker manager for multi-speaker training
# it mainly handles speaker-id to speaker-name mapping for the model and the data-loader
speaker_manager = SpeakerManager()
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)

# init model
model = Tacotron2(config, speaker_manager)

# init the trainer and 🚀
trainer = Trainer(
    TrainingArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)
trainer.fit()
def main():
    """Run `tts` model training directly by a `config.json` file."""
    # init trainer args
    train_args = TrainingArgs()
    parser = train_args.init_argparse(arg_prefix="")

    # override trainer args from command-line args
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    # load config.json and register
    if args.config_path or args.continue_path:
        if args.config_path:
            # init from a file
            config = load_config(args.config_path)
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
        elif args.continue_path:
            # continue from a prev experiment
            config = load_config(os.path.join(args.continue_path, "config.json"))
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
    else:
        # init from console args
        from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

        config_base = BaseTrainingConfig()
        config_base.parse_known_args(config_overrides)
        config = register_config(config_base.model)()

    # load training samples
    train_samples, eval_samples = load_tts_samples(config.datasets, eval_split=True)

    # setup audio processor
    ap = AudioProcessor(**config.audio)

    # init speaker manager
    if check_config_and_model_args(config, "use_speaker_embedding", True):
        speaker_manager = SpeakerManager(data_items=train_samples + eval_samples)
        if hasattr(config, "model_args"):
            config.model_args.num_speakers = speaker_manager.num_speakers
        else:
            config.num_speakers = speaker_manager.num_speakers
    elif check_config_and_model_args(config, "use_d_vector_file", True):
        if check_config_and_model_args(config, "use_speaker_encoder_as_loss", True):
            speaker_manager = SpeakerManager(
                d_vectors_file_path=config.model_args.d_vector_file,
                encoder_model_path=config.model_args.speaker_encoder_model_path,
                encoder_config_path=config.model_args.speaker_encoder_config_path,
                use_cuda=torch.cuda.is_available(),
            )
        else:
            speaker_manager = SpeakerManager(
                d_vectors_file_path=get_from_config_or_model_args(config, "d_vector_file")
            )
        config.num_speakers = speaker_manager.num_speakers
        if hasattr(config, "model_args"):
            config.model_args.num_speakers = speaker_manager.num_speakers
    else:
        speaker_manager = None

    if check_config_and_model_args(config, "use_language_embedding", True):
        language_manager = LanguageManager(config=config)
        if hasattr(config, "model_args"):
            config.model_args.num_languages = language_manager.num_languages
        else:
            config.num_languages = language_manager.num_languages
    else:
        language_manager = None

    # init the model from config
    model = setup_model(config, speaker_manager, language_manager)

    # init the trainer and 🚀
    trainer = Trainer(
        train_args,
        config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        training_assets={"audio_processor": ap},
        parse_command_line_args=False,
    )
    trainer.fit()
help="Previous speakers.json file, only compute for new audios.", default=None) parser.add_argument("--use_cuda", type=bool, help="flag to set cuda. Default False", default=False) parser.add_argument("--no_eval", type=bool, help="Do not compute eval?. Default False", default=False) args = parser.parse_args() c_dataset = load_config(args.config_dataset_path) meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not args.no_eval) if meta_data_eval is None: wav_files = meta_data_train else: wav_files = meta_data_train + meta_data_eval encoder_manager = SpeakerManager( encoder_model_path=args.model_path, encoder_config_path=args.config_path, d_vectors_file_path=args.old_file, use_cuda=args.use_cuda, ) class_name_key = encoder_manager.encoder_config.class_name_key
meta_file_train="metadata.csv", meta_file_val="metadata.csv", path="tests/data/ljspeech", language="en", ) dataset_config_pt = BaseDatasetConfig( name="ljspeech", meta_file_train="metadata.csv", meta_file_val="metadata.csv", path="tests/data/ljspeech", language="pt-br", ) # Adding the EN samples twice to create a language unbalanced dataset train_samples, eval_samples = load_tts_samples( [dataset_config_en, dataset_config_en, dataset_config_pt], eval_split=True) # gerenate a speaker unbalanced dataset for i, sample in enumerate(train_samples): if i < 5: sample["speaker_name"] = "ljspeech-0" else: sample["speaker_name"] = "ljspeech-1" def is_balanced(lang_1, lang_2): return 0.85 < lang_1 / lang_2 < 1.2 class TestSamplers(unittest.TestCase): def test_language_random_sampler(self): # pylint: disable=no-self-use
def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogram features.")
    parser.add_argument("config_path", type=str, help="TTS config file path to define audio processing parameters.")
    parser.add_argument("out_path", type=str, help="save path (directory and filename).")
    parser.add_argument(
        "--data_path",
        type=str,
        required=False,
        help="folder including the target set of wavs overriding dataset config.",
    )
    args, overrides = parser.parse_known_args()

    CONFIG = load_config(args.config_path)
    CONFIG.parse_known_args(overrides, relaxed_parser=True)

    # load config
    CONFIG.audio.signal_norm = False  # do not apply earlier normalization
    CONFIG.audio.stats_path = None  # discard pre-defined stats

    # load audio processor
    ap = AudioProcessor(**CONFIG.audio.to_dict())

    # load the meta data of target dataset
    if args.data_path:
        dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
    else:
        dataset_items = load_tts_samples(CONFIG.datasets)[0]  # take only train data
    print(f" > There are {len(dataset_items)} files.")

    mel_sum = 0
    mel_square_sum = 0
    linear_sum = 0
    linear_square_sum = 0
    N = 0
    for item in tqdm(dataset_items):
        # compute features
        wav = ap.load_wav(item if isinstance(item, str) else item[1])
        linear = ap.spectrogram(wav)
        mel = ap.melspectrogram(wav)

        # compute stats
        N += mel.shape[1]
        mel_sum += mel.sum(1)
        linear_sum += linear.sum(1)
        mel_square_sum += (mel ** 2).sum(axis=1)
        linear_square_sum += (linear ** 2).sum(axis=1)

    mel_mean = mel_sum / N
    mel_scale = np.sqrt(mel_square_sum / N - mel_mean ** 2)
    linear_mean = linear_sum / N
    linear_scale = np.sqrt(linear_square_sum / N - linear_mean ** 2)

    output_file_path = args.out_path
    stats = {}
    stats["mel_mean"] = mel_mean
    stats["mel_std"] = mel_scale
    stats["linear_mean"] = linear_mean
    stats["linear_std"] = linear_scale

    print(f" > Avg mel spec mean: {mel_mean.mean()}")
    print(f" > Avg mel spec scale: {mel_scale.mean()}")
    print(f" > Avg linear spec mean: {linear_mean.mean()}")
    print(f" > Avg linear spec scale: {linear_scale.mean()}")

    # set default config values for mean-var scaling
    CONFIG.audio.stats_path = output_file_path
    CONFIG.audio.signal_norm = True
    # remove redundant values
    del CONFIG.audio.max_norm
    del CONFIG.audio.min_level_db
    del CONFIG.audio.symmetric_norm
    del CONFIG.audio.clip_norm
    stats["audio_config"] = CONFIG.audio.to_dict()
    np.save(output_file_path, stats, allow_pickle=True)
    print(f" > stats saved to {output_file_path}")
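# A small sketch of reading the stats file back; "scale_stats.npy" is an
# illustrative path. `np.save` stores the dict as a 0-d object array, hence
# the `.item()` call. Pointing `audio.stats_path` of a training config at this
# file enables mean-variance normalization with these statistics, as the
# script itself does via `CONFIG.audio.stats_path = output_file_path`.
import numpy as np

stats = np.load("scale_stats.npy", allow_pickle=True).item()
print(stats["mel_mean"].shape, stats["mel_std"].shape)  # one value per mel band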