def main(args):  # pylint: disable=redefined-outer-name
    """Set up and launch speaker-encoder training from the module config ``c``.

    Creates the audio processor, model, optimizer, data loader and loss,
    optionally restores a checkpoint from ``args.restore_path`` and then
    calls ``train``. ``args.restore_step`` is written as a side effect.

    Raises:
        Exception: if ``c.loss`` is not one of "ge2e", "angleproto" or
            "softmaxproto".
    """
    # pylint: disable=global-variable-undefined
    global meta_data_train
    global meta_data_eval

    ap = AudioProcessor(**c.audio)
    model = setup_model(c)
    optimizer = RAdam(model.parameters(), lr=c.lr)

    # pylint: disable=redefined-outer-name
    meta_data_train, meta_data_eval = load_meta_data(c.datasets, eval_split=False)
    data_loader, num_speakers = setup_loader(ap, is_val=False, verbose=True)

    # The branches are mutually exclusive, so their order does not matter.
    if c.loss == "softmaxproto":
        criterion = SoftmaxAngleProtoLoss(c.model["proj_dim"], num_speakers)
    elif c.loss == "angleproto":
        criterion = AngleProtoLoss()
    elif c.loss == "ge2e":
        criterion = GE2ELoss(loss_method="softmax")
    else:
        raise Exception("The %s not is a loss supported" % c.loss)

    if not args.restore_path:
        args.restore_step = 0
    else:
        checkpoint = torch.load(args.restore_path)
        try:
            model.load_state_dict(checkpoint["model"])
            if "criterion" in checkpoint:
                criterion.load_state_dict(checkpoint["criterion"])
        except (KeyError, RuntimeError):
            # Fall back to loading only what set_init_dict lets through.
            print(" > Partial model initialization.")
            partial_state = set_init_dict(model.state_dict(), checkpoint["model"], c)
            model.load_state_dict(partial_state)
            del partial_state
        # The learning rate always comes from the current config.
        for param_group in optimizer.param_groups:
            param_group["lr"] = c.lr
        restored_step = checkpoint["step"]
        print(" > Model restored from step %d" % restored_step, flush=True)
        args.restore_step = restored_step

    scheduler = None
    if c.lr_decay:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)

    print("\n > Model has {} parameters".format(count_parameters(model)), flush=True)

    if use_cuda:
        model = model.cuda()
        criterion.cuda()

    start_step = args.restore_step
    _, global_step = train(model, optimizer, scheduler, criterion, data_loader, start_step)
def main(args):  # pylint: disable=redefined-outer-name
    """Set up and launch speaker-encoder training from the module config ``c``.

    Builds the ``SpeakerEncoder`` model, optimizer and loss, optionally
    restores a checkpoint from ``args.restore_path``, loads the dataset
    metadata into module globals and calls ``train``.
    ``args.restore_step`` is written as a side effect.

    Raises:
        Exception: if ``c.loss`` is neither "ge2e" nor "angleproto".
    """
    # pylint: disable=global-variable-undefined
    global meta_data_train
    global meta_data_eval

    ap = AudioProcessor(**c.audio)
    model = SpeakerEncoder(
        input_dim=c.model["input_dim"],
        proj_dim=c.model["proj_dim"],
        lstm_dim=c.model["lstm_dim"],
        num_lstm_layers=c.model["num_lstm_layers"],
    )
    optimizer = RAdam(model.parameters(), lr=c.lr)

    if c.loss == "ge2e":
        criterion = GE2ELoss(loss_method="softmax")
    elif c.loss == "angleproto":
        criterion = AngleProtoLoss()
    else:
        raise Exception("The %s not is a loss supported" % c.loss)

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            # optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                # Deliberately jump to the partial-initialization path below.
                raise RuntimeError
            model.load_state_dict(checkpoint["model"])
        except (KeyError, RuntimeError):
            # RuntimeError must be caught too: it is raised above for
            # `reinit_layers`, and otherwise the fallback is never reached.
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            # Pass the model weights, not the whole checkpoint dict.
            model_dict = set_init_dict(model_dict, checkpoint["model"], c)
            model.load_state_dict(model_dict)
            del model_dict
        # The learning rate always comes from the current config.
        for group in optimizer.param_groups:
            group["lr"] = c.lr
        print(" > Model restored from step %d" % checkpoint["step"], flush=True)
        args.restore_step = checkpoint["step"]
    else:
        args.restore_step = 0

    if use_cuda:
        model = model.cuda()
        criterion.cuda()

    if c.lr_decay:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    # pylint: disable=redefined-outer-name
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    global_step = args.restore_step
    _, global_step = train(model, criterion, optimizer, scheduler, ap, global_step)
def main(args):  # pylint: disable=redefined-outer-name
    """Load a trained model and run ``extract_spectrograms`` over the data.

    Train and eval metadata are merged into the global ``meta_data``, the
    model weights are read from ``args.checkpoint_path`` and the results go
    to ``args.output_path``.
    """
    # pylint: disable=global-variable-undefined
    global meta_data, symbols, phonemes, model_characters, speaker_mapping

    # Audio processor
    ap = AudioProcessor(**c.audio)

    # a non-empty custom character set replaces the default symbol tables
    if "characters" in c.keys():
        if c["characters"]:
            symbols, phonemes = make_symbols(**c.characters)

    # set model characters
    if c.use_phonemes:
        model_characters = phonemes
    else:
        model_characters = symbols
    num_chars = len(model_characters)

    # load data instances; both partitions are used together
    train_items, eval_items = load_meta_data(c.datasets)
    meta_data = train_items + eval_items

    # parse speakers
    num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(c, args, train_items, None)

    # setup model and restore its weights
    model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim=speaker_embedding_dim)
    state = torch.load(args.checkpoint_path, map_location="cpu")
    model.load_state_dict(state["model"])

    if use_cuda:
        model.cuda()

    print("\n > Model has {} parameters".format(count_parameters(model)), flush=True)

    # set r
    if c.model.lower() == "glow_tts":
        r = 1
    else:
        r = model.decoder.r
    own_loader = setup_loader(ap, r, verbose=True)

    extract_spectrograms(
        own_loader,
        model,
        ap,
        args.output_path,
        quantized_wav=args.quantized,
        save_audio=args.save_audio,
        debug=args.debug,
        metada_name="metada.txt",
    )
def main(args):  # pylint: disable=redefined-outer-name
    """Set up and run the GlowTTS training loop from the module config ``c``.

    Builds the audio processor, model, optimizer and loss, optionally
    restores a checkpoint from ``args.restore_path``, then trains and
    evaluates for ``c.epochs`` epochs, calling ``save_best_model`` after
    each one. ``args.restore_step`` is written as a side effect.
    """
    # pylint: disable=global-variable-undefined
    global meta_data_train, meta_data_eval, symbols, phonemes, model_characters, speaker_mapping
    # Audio processor
    ap = AudioProcessor(**c.audio)
    if 'characters' in c.keys():
        symbols, phonemes = make_symbols(**c.characters)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])

    # set model characters
    model_characters = phonemes if c.use_phonemes else symbols
    num_chars = len(model_characters)

    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    # set the portion of the data used for training
    if 'train_portion' in c.keys():
        meta_data_train = meta_data_train[:int(len(meta_data_train) * c.train_portion)]
    if 'eval_portion' in c.keys():
        meta_data_eval = meta_data_eval[:int(len(meta_data_eval) * c.eval_portion)]

    # parse speakers
    num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(
        c, args, meta_data_train, OUT_PATH)

    # setup model
    model = setup_model(num_chars, num_speakers, c,
                        speaker_embedding_dim=speaker_embedding_dim)
    optimizer = RAdam(model.parameters(), lr=c.lr, weight_decay=0,
                      betas=(0.9, 0.98), eps=1e-9)
    criterion = GlowTTSLoss()

    if args.restore_path:
        print(f" > Restoring from {os.path.basename(args.restore_path)} ...")
        checkpoint = torch.load(args.restore_path, map_location='cpu')
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                # Deliberately jump to the partial-initialization path below.
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except Exception:  # pylint: disable=broad-except
            # Any restore failure still falls back to partial initialization,
            # but KeyboardInterrupt/SystemExit are no longer swallowed as
            # they were by the previous bare `except:`.
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint['model'], c)
            model.load_state_dict(model_dict)
            del model_dict

        for group in optimizer.param_groups:
            group['initial_lr'] = c.lr
        print(f" > Model restored from step {checkpoint['step']:d}", flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model.cuda()
        criterion.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = DDP_th(model, device_ids=[args.rank])

    if c.noam_schedule:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    # The stored best loss is only reused when resuming AND a best-model
    # file was given.
    if args.restore_step == 0 or not args.best_path:
        best_loss = float('inf')
        print(" > Starting with inf best loss.")
    else:
        print(" > Restoring best loss from "
              f"{os.path.basename(args.best_path)} ...")
        best_loss = torch.load(args.best_path, map_location='cpu')['model_loss']
        print(f" > Starting with loaded last best loss {best_loss}.")
    keep_all_best = c.get('keep_all_best', False)
    keep_after = c.get('keep_after', 10000)  # void if keep_all_best False

    # define dataloaders
    train_loader = setup_loader(ap, 1, is_val=False, verbose=True)
    eval_loader = setup_loader(ap, 1, is_val=True, verbose=True)

    global_step = args.restore_step
    model = data_depended_init(train_loader, model)
    for epoch in range(0, c.epochs):
        c_logger.print_epoch_start(epoch, c.epochs)
        train_avg_loss_dict, global_step = train(train_loader, model, criterion,
                                                 optimizer, scheduler, ap,
                                                 global_step, epoch)
        eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap,
                                      global_step, epoch)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        # The eval loss drives model selection only when evaluation is on.
        target_loss = train_avg_loss_dict['avg_loss']
        if c.run_eval:
            target_loss = eval_avg_loss_dict['avg_loss']
        best_loss = save_best_model(target_loss, best_loss, model, optimizer,
                                    global_step, epoch, c.r, OUT_PATH,
                                    model_characters,
                                    keep_all_best=keep_all_best,
                                    keep_after=keep_after)
def main(args):  # pylint: disable=redefined-outer-name
    """Set up and run the Tacotron training loop from the module config ``c``.

    Builds the model, the optimizer(s), the optional mixed-precision
    scalers and the loss, optionally restores a checkpoint from
    ``args.restore_path``, then trains and evaluates for ``c.epochs``
    epochs, calling ``save_best_model`` after each one.
    ``args.restore_step`` is written as a side effect.
    """
    # pylint: disable=global-variable-undefined
    global meta_data_train, meta_data_eval, speaker_mapping, symbols, phonemes, model_characters

    # Audio processor
    ap = AudioProcessor(**c.audio)

    # setup custom characters if set in config file.
    if "characters" in c.keys():
        symbols, phonemes = make_symbols(**c.characters)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id, c.distributed["backend"], c.distributed["url"])

    active_chars = phonemes if c.use_phonemes else symbols
    num_chars = len(active_chars)
    model_characters = active_chars

    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    # set the portion of the data used for training
    if "train_portion" in c.keys():
        train_keep = int(len(meta_data_train) * c.train_portion)
        meta_data_train = meta_data_train[:train_keep]
    if "eval_portion" in c.keys():
        eval_keep = int(len(meta_data_eval) * c.eval_portion)
        meta_data_eval = meta_data_eval[:eval_keep]

    # parse speakers
    num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(c, args, meta_data_train, OUT_PATH)

    model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim)

    # scalers for mixed precision training
    scaler = None
    if c.mixed_precision:
        scaler = torch.cuda.amp.GradScaler()
    scaler_st = None
    if c.mixed_precision and c.separate_stopnet:
        scaler_st = torch.cuda.amp.GradScaler()

    params = set_weight_decay(model, c.wd)
    optimizer = RAdam(params, lr=c.lr, weight_decay=0)
    # the stopnet gets its own optimizer only when it is trained separately
    optimizer_st = None
    if c.stopnet and c.separate_stopnet:
        optimizer_st = RAdam(model.decoder.stopnet.parameters(), lr=c.lr, weight_decay=0)

    # setup criterion
    criterion = TacotronLoss(c, stopnet_pos_weight=c.stopnet_pos_weight, ga_sigma=0.4)

    if not args.restore_path:
        args.restore_step = 0
    else:
        print(f" > Restoring from {os.path.basename(args.restore_path)}...")
        checkpoint = torch.load(args.restore_path, map_location="cpu")
        try:
            print(" > Restoring Model...")
            model.load_state_dict(checkpoint["model"])
            # optimizer restore
            print(" > Restoring Optimizer...")
            optimizer.load_state_dict(checkpoint["optimizer"])
            if "scaler" in checkpoint and c.mixed_precision:
                print(" > Restoring AMP Scaler...")
                scaler.load_state_dict(checkpoint["scaler"])
            if c.reinit_layers:
                raise RuntimeError
        except (KeyError, RuntimeError):
            print(" > Partial model initialization...")
            partial_state = set_init_dict(model.state_dict(), checkpoint["model"], c)
            model.load_state_dict(partial_state)
            del partial_state

        for param_group in optimizer.param_groups:
            param_group["lr"] = c.lr
        restored_step = checkpoint["step"]
        print(" > Model restored from step %d" % restored_step, flush=True)
        args.restore_step = restored_step

    if use_cuda:
        model.cuda()
        criterion.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    scheduler = None
    if c.noam_schedule:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)

    print("\n > Model has {} parameters".format(count_parameters(model)), flush=True)

    start_fresh = args.restore_step == 0 or not args.best_path
    if start_fresh:
        best_loss = float("inf")
        print(" > Starting with inf best loss.")
    else:
        print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...")
        best_checkpoint = torch.load(args.best_path, map_location="cpu")
        best_loss = best_checkpoint["model_loss"]
        print(f" > Starting with loaded last best loss {best_loss}.")
    keep_all_best = c.get("keep_all_best", False)
    keep_after = c.get("keep_after", 10000)  # void if keep_all_best False

    # define data loaders
    train_loader = setup_loader(ap, model.decoder.r, is_val=False, verbose=True)
    eval_loader = setup_loader(ap, model.decoder.r, is_val=True)

    global_step = args.restore_step
    for epoch in range(c.epochs):
        c_logger.print_epoch_start(epoch, c.epochs)

        # set gradual training
        if c.gradual_training is not None:
            r, c.batch_size = gradual_training_scheduler(global_step, c)
            c.r = r
            model.decoder.set_r(r)
            if c.bidirectional_decoder:
                model.decoder_backward.set_r(r)
            train_loader.dataset.outputs_per_step = r
            eval_loader.dataset.outputs_per_step = r
            # rebuild both loaders around the existing datasets
            train_loader = setup_loader(ap, model.decoder.r, is_val=False, dataset=train_loader.dataset)
            eval_loader = setup_loader(ap, model.decoder.r, is_val=True, dataset=eval_loader.dataset)
            print("\n > Number of output frames:", model.decoder.r)

        # train one epoch
        train_avg_loss_dict, global_step = train(
            train_loader,
            model,
            criterion,
            optimizer,
            optimizer_st,
            scheduler,
            ap,
            global_step,
            epoch,
            scaler,
            scaler_st,
        )

        # eval one epoch
        eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, global_step, epoch)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)

        # the eval loss drives model selection only when evaluation is on
        train_loss = train_avg_loss_dict["avg_postnet_loss"]
        target_loss = eval_avg_loss_dict["avg_postnet_loss"] if c.run_eval else train_loss

        scaler_state = scaler.state_dict() if c.mixed_precision else None
        best_loss = save_best_model(
            target_loss,
            best_loss,
            model,
            optimizer,
            global_step,
            epoch,
            c.r,
            OUT_PATH,
            model_characters,
            keep_all_best=keep_all_best,
            keep_after=keep_after,
            scaler=scaler_state,
        )
# NOTE(review): this fragment is cut off inside the CSV loop below; the code
# that consumes `wav_file` / `wav_files` is not visible here.

# Load the config and build the audio processor from its "audio" section.
c = load_config(args.config_path)
ap = AudioProcessor(**c["audio"])

data_path = args.data_path
split_ext = os.path.splitext(data_path)
sep = args.separator

if args.target_dataset != "":
    # if target dataset is defined
    # Let load_meta_data list the files for the named dataset.
    dataset_config = [
        BaseDatasetConfig(name=args.target_dataset, path=args.data_path, meta_file_train=None, meta_file_val=None),
    ]
    wav_files, _ = load_meta_data(dataset_config, eval_split=False)
else:
    # if target dataset is not defined
    if len(split_ext) > 0 and split_ext[1].lower() == ".csv":
        # Parse CSV
        print(f"CSV file: {data_path}")
        with open(data_path) as f:
            # wav files are looked up in a "wavs" folder next to the CSV file
            wav_path = os.path.join(os.path.dirname(data_path), "wavs")
            wav_files = []
            print(f"Separator is: {sep}")
            for line in f:
                components = line.split(sep)
                # only lines with exactly two separated fields are accepted
                if len(components) != 2:
                    print("Invalid line")
                    continue
                # the first field is the wav file name without its extension
                wav_file = os.path.join(wav_path, components[0] + ".wav")
def main(args):  # pylint: disable=redefined-outer-name
    """Set up and run the Tacotron training loop from the module config ``c``.

    Resolves the speaker mapping (fresh, restored, or from an external
    embedding file), builds the model, optimizer(s) and loss, optionally
    initializes apex AMP and restores a checkpoint from
    ``args.restore_path``, then trains and evaluates for ``c.epochs``
    epochs, calling ``save_best_model`` after each one.
    ``args.restore_step`` is written as a side effect.

    Raises:
        RuntimeError: if external speaker embeddings are enabled but no
            usable embedding file can be found.
        AssertionError: if a restored speaker mapping lacks a speaker that
            is present in the training data.
    """
    # pylint: disable=global-variable-undefined
    global meta_data_train, meta_data_eval, symbols, phonemes
    # Audio processor
    ap = AudioProcessor(**c.audio)
    if 'characters' in c.keys():
        symbols, phonemes = make_symbols(**c.characters)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)

    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    # set the portion of the data used for training
    if 'train_portion' in c.keys():
        meta_data_train = meta_data_train[:int(len(meta_data_train) * c.train_portion)]
    if 'eval_portion' in c.keys():
        meta_data_eval = meta_data_eval[:int(len(meta_data_eval) * c.eval_portion)]

    # parse speakers
    if c.use_speaker_embedding:
        speakers = get_speakers(meta_data_train)
        if args.restore_path:
            if c.use_external_speaker_embedding_file:
                # if restore checkpoint and use External Embedding file
                prev_out_path = os.path.dirname(args.restore_path)
                speaker_mapping = load_speaker_mapping(prev_out_path)
                if not speaker_mapping:
                    print(
                        "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file"
                    )
                    speaker_mapping = load_speaker_mapping(
                        c.external_speaker_embedding_file)
                    if not speaker_mapping:
                        raise RuntimeError(
                            "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file"
                        )
                # the embedding size is read from the first mapping entry
                speaker_embedding_dim = len(speaker_mapping[list(
                    speaker_mapping.keys())[0]]['embedding'])
            else:
                # if restore checkpoint and don't use External Embedding file
                prev_out_path = os.path.dirname(args.restore_path)
                speaker_mapping = load_speaker_mapping(prev_out_path)
                speaker_embedding_dim = None
                assert all([speaker in speaker_mapping for speaker in speakers]), "As of now you, you cannot " \
                                                                                  "introduce new speakers to " \
                                                                                  "a previously trained model."
        elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file:
            # if start new train using External Embedding file
            speaker_mapping = load_speaker_mapping(
                c.external_speaker_embedding_file)
            speaker_embedding_dim = len(speaker_mapping[list(
                speaker_mapping.keys())[0]]['embedding'])
        elif c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file:
            # if start new train using External Embedding file and don't pass external embedding file
            # Raising a bare string is itself a TypeError in Python 3, so a
            # real exception type is needed for this message to be shown.
            raise RuntimeError(
                "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder"
            )
        else:
            # if start new train and don't use External Embedding file
            speaker_mapping = {name: i for i, name in enumerate(speakers)}
            speaker_embedding_dim = None
        save_speaker_mapping(OUT_PATH, speaker_mapping)
        num_speakers = len(speaker_mapping)
        print("Training with {} speakers: {}".format(num_speakers,
                                                     ", ".join(speakers)))
    else:
        num_speakers = 0
        speaker_embedding_dim = None
        speaker_mapping = None

    model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim)

    params = set_weight_decay(model, c.wd)
    optimizer = RAdam(params, lr=c.lr, weight_decay=0)
    if c.stopnet and c.separate_stopnet:
        optimizer_st = RAdam(model.decoder.stopnet.parameters(),
                             lr=c.lr,
                             weight_decay=0)
    else:
        optimizer_st = None

    if c.apex_amp_level == "O1":
        # pylint: disable=import-outside-toplevel
        from apex import amp
        model.cuda()
        model, optimizer = amp.initialize(model, optimizer, opt_level=c.apex_amp_level)
    else:
        amp = None

    # setup criterion
    criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4)

    if args.restore_path:
        checkpoint = torch.load(args.restore_path, map_location='cpu')
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            # optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                # Deliberately jump to the partial-initialization path below.
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except (KeyError, RuntimeError):
            # RuntimeError must be caught too: it is raised above for
            # `reinit_layers`, and otherwise the fallback is never reached.
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint['model'], c)
            # torch.save(model_dict, os.path.join(OUT_PATH, 'state_dict.pt'))
            # print("State Dict saved for debug in: ", os.path.join(OUT_PATH, 'state_dict.pt'))
            model.load_state_dict(model_dict)
            del model_dict

        if amp and 'amp' in checkpoint:
            amp.load_state_dict(checkpoint['amp'])

        # The learning rate always comes from the current config.
        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model.cuda()
        criterion.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.noam_schedule:
        scheduler = NoamLR(optimizer,
                           warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    # No best loss is carried over from a previous run in this function
    # (the old `'best_loss' not in locals()` check was always true).
    best_loss = float('inf')

    global_step = args.restore_step
    for epoch in range(0, c.epochs):
        c_logger.print_epoch_start(epoch, c.epochs)
        # set gradual training
        if c.gradual_training is not None:
            r, c.batch_size = gradual_training_scheduler(global_step, c)
            c.r = r
            model.decoder.set_r(r)
            if c.bidirectional_decoder:
                model.decoder_backward.set_r(r)
            print("\n > Number of output frames:", model.decoder.r)
        train_avg_loss_dict, global_step = train(model, criterion, optimizer,
                                                 optimizer_st, scheduler, ap,
                                                 global_step, epoch, amp,
                                                 speaker_mapping)
        eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch,
                                      speaker_mapping)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        # The eval loss drives model selection only when evaluation is on.
        target_loss = train_avg_loss_dict['avg_postnet_loss']
        if c.run_eval:
            target_loss = eval_avg_loss_dict['avg_postnet_loss']
        best_loss = save_best_model(
            target_loss,
            best_loss,
            model,
            optimizer,
            global_step,
            epoch,
            c.r,
            OUT_PATH,
            amp_state_dict=amp.state_dict() if amp else None)
def main():
    """Compute per-bin mean/std of mel and linear spectrograms and save them.

    Reads ``--config_path`` and ``--out_path`` from the command line, runs
    every dataset item through the audio processor with normalization
    switched off, and stores the statistics together with the adjusted audio
    config via ``np.save``.
    """
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of spectrogtram features.")
    parser.add_argument(
        "--config_path",
        type=str,
        required=True,
        help="TTS config file path to define audio processin parameters.")
    parser.add_argument(
        "--out_path",
        default=None,
        type=str,
        help="directory to save the output file.")
    args = parser.parse_args()

    # load config
    CONFIG = load_config(args.config_path)
    CONFIG.audio['signal_norm'] = False  # do not apply earlier normalization
    CONFIG.audio['stats_path'] = None  # discard pre-defined stats

    # load audio processor
    ap = AudioProcessor(**CONFIG.audio)

    # load the meta data of target dataset
    if 'data_path' not in CONFIG.keys():
        dataset_items = load_meta_data(CONFIG.datasets)[0]  # take only train data
    else:
        wav_pattern = os.path.join(CONFIG.data_path, '**', '*.wav')
        dataset_items = glob.glob(wav_pattern, recursive=True)
    print(f" > There are {len(dataset_items)} files.")

    # running sums over all frames of all files
    mel_sum = 0
    mel_square_sum = 0
    linear_sum = 0
    linear_square_sum = 0
    N = 0
    for item in tqdm(dataset_items):
        # an item is either a wav path or a record holding the path at index 1
        if isinstance(item, str):
            wav_path = item
        else:
            wav_path = item[1]

        # compute features
        wav = ap.load_wav(wav_path)
        linear = ap.spectrogram(wav)
        mel = ap.melspectrogram(wav)

        # compute stats
        N += mel.shape[1]
        mel_sum += mel.sum(1)
        linear_sum += linear.sum(1)
        mel_square_sum += (mel**2).sum(axis=1)
        linear_square_sum += (linear**2).sum(axis=1)

    # std from the running sums: sqrt(E[x^2] - E[x]^2)
    mel_mean = mel_sum / N
    mel_scale = np.sqrt(mel_square_sum / N - mel_mean**2)
    linear_mean = linear_sum / N
    linear_scale = np.sqrt(linear_square_sum / N - linear_mean**2)

    output_file_path = args.out_path
    stats = {
        'mel_mean': mel_mean,
        'mel_std': mel_scale,
        'linear_mean': linear_mean,
        'linear_std': linear_scale,
    }

    print(f' > Avg mel spec mean: {mel_mean.mean()}')
    print(f' > Avg mel spec scale: {mel_scale.mean()}')
    print(f' > Avg linear spec mean: {linear_mean.mean()}')
    print(f' > Avg lienar spec scale: {linear_scale.mean()}')

    # set default config values for mean-var scaling
    CONFIG.audio['stats_path'] = output_file_path
    CONFIG.audio['signal_norm'] = True

    # remove redundant values
    for redundant_key in ('max_norm', 'min_level_db', 'symmetric_norm', 'clip_norm'):
        del CONFIG.audio[redundant_key]

    stats['audio_config'] = CONFIG.audio
    np.save(output_file_path, stats, allow_pickle=True)
    print(f' > stats saved to {output_file_path}')
def main():
    """Compute per-bin mean/std of mel and linear spectrograms and save them.

    Takes ``config_path`` and ``out_path`` as positional arguments and an
    optional ``--data_path``; unrecognized arguments are handed to the
    config as overrides. The statistics are stored together with the
    adjusted audio config via ``np.save``.
    """
    parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.")
    parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.")
    parser.add_argument("out_path", type=str, help="save path (directory and filename).")
    parser.add_argument(
        "--data_path",
        type=str,
        required=False,
        help="folder including the target set of wavs overriding dataset config.",
    )
    args, overrides = parser.parse_known_args()

    CONFIG = load_config(args.config_path)
    CONFIG.parse_known_args(overrides, relaxed_parser=True)

    # load config
    CONFIG.audio.signal_norm = False  # do not apply earlier normalization
    CONFIG.audio.stats_path = None  # discard pre-defined stats

    # load audio processor
    ap = AudioProcessor(**CONFIG.audio.to_dict())

    # load the meta data of target dataset
    if not args.data_path:
        dataset_items = load_meta_data(CONFIG.datasets)[0]  # take only train data
    else:
        wav_pattern = os.path.join(args.data_path, "**", "*.wav")
        dataset_items = glob.glob(wav_pattern, recursive=True)
    print(f" > There are {len(dataset_items)} files.")

    # running sums over all frames of all files
    mel_sum = 0
    mel_square_sum = 0
    linear_sum = 0
    linear_square_sum = 0
    N = 0
    for item in tqdm(dataset_items):
        # an item is either a wav path or a record holding the path at index 1
        if isinstance(item, str):
            wav_path = item
        else:
            wav_path = item[1]

        # compute features
        wav = ap.load_wav(wav_path)
        linear = ap.spectrogram(wav)
        mel = ap.melspectrogram(wav)

        # compute stats
        N += mel.shape[1]
        mel_sum += mel.sum(1)
        linear_sum += linear.sum(1)
        mel_square_sum += (mel ** 2).sum(axis=1)
        linear_square_sum += (linear ** 2).sum(axis=1)

    # std from the running sums: sqrt(E[x^2] - E[x]^2)
    mel_mean = mel_sum / N
    mel_scale = np.sqrt(mel_square_sum / N - mel_mean ** 2)
    linear_mean = linear_sum / N
    linear_scale = np.sqrt(linear_square_sum / N - linear_mean ** 2)

    output_file_path = args.out_path
    stats = {
        "mel_mean": mel_mean,
        "mel_std": mel_scale,
        "linear_mean": linear_mean,
        "linear_std": linear_scale,
    }

    print(f" > Avg mel spec mean: {mel_mean.mean()}")
    print(f" > Avg mel spec scale: {mel_scale.mean()}")
    print(f" > Avg linear spec mean: {linear_mean.mean()}")
    print(f" > Avg lienar spec scale: {linear_scale.mean()}")

    # set default config values for mean-var scaling
    CONFIG.audio.stats_path = output_file_path
    CONFIG.audio.signal_norm = True

    # remove redundant values
    for redundant_field in ("max_norm", "min_level_db", "symmetric_norm", "clip_norm"):
        delattr(CONFIG.audio, redundant_field)

    stats["audio_config"] = CONFIG.audio.to_dict()
    np.save(output_file_path, stats, allow_pickle=True)
    print(f" > stats saved to {output_file_path}")
def main(args):  # pylint: disable=redefined-outer-name
    """Set up and run the AlignTTS training loop from the module ``config``.

    Builds the audio processor, model, optimizer and loss, optionally
    restores a checkpoint from ``args.restore_path``, then trains and
    evaluates for ``config.epochs`` epochs. The training phase is
    recomputed from ``config.phase_start_steps`` before every epoch and
    ``save_best_model`` is called after every epoch.
    ``args.restore_step`` is written as a side effect.
    """
    # pylint: disable=global-variable-undefined
    global meta_data_train, meta_data_eval, symbols, phonemes, model_characters, speaker_mapping
    # Audio processor
    ap = AudioProcessor(**config.audio.to_dict())
    if config.has("characters") and config.characters:
        symbols, phonemes = make_symbols(**config.characters.to_dict())

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id, config.distributed["backend"], config.distributed["url"])

    # set model characters
    model_characters = phonemes if config.use_phonemes else symbols
    num_chars = len(model_characters)

    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(config.datasets, eval_split=True)

    # parse speakers
    num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(
        config, args, meta_data_train, OUT_PATH)

    # setup model
    model = setup_model(num_chars, num_speakers, config, speaker_embedding_dim=speaker_embedding_dim)
    optimizer = RAdam(model.parameters(), lr=config.lr, weight_decay=0, betas=(0.9, 0.98), eps=1e-9)
    criterion = AlignTTSLoss(config)

    if args.restore_path:
        print(f" > Restoring from {os.path.basename(args.restore_path)} ...")
        checkpoint = torch.load(args.restore_path, map_location="cpu")
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            optimizer.load_state_dict(checkpoint["optimizer"])
            if config.reinit_layers:
                # Deliberately jump to the partial-initialization path below.
                raise RuntimeError
            model.load_state_dict(checkpoint["model"])
        except Exception:  # pylint: disable=broad-except
            # Any restore failure still falls back to partial initialization,
            # but KeyboardInterrupt/SystemExit are no longer swallowed as
            # they were by the previous bare `except:`.
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint["model"], config)
            model.load_state_dict(model_dict)
            del model_dict

        for group in optimizer.param_groups:
            group["initial_lr"] = config.lr
        print(" > Model restored from step %d" % checkpoint["step"], flush=True)
        args.restore_step = checkpoint["step"]
    else:
        args.restore_step = 0

    if use_cuda:
        model.cuda()
        criterion.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = DDP_th(model, device_ids=[args.rank])

    if config.noam_schedule:
        scheduler = NoamLR(optimizer, warmup_steps=config.warmup_steps, last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    # The stored best loss is only reused when resuming AND a best-model
    # file was given.
    if args.restore_step == 0 or not args.best_path:
        best_loss = float("inf")
        print(" > Starting with inf best loss.")
    else:
        print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...")
        best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"]
        print(f" > Starting with loaded last best loss {best_loss}.")
    keep_all_best = config.keep_all_best
    keep_after = config.keep_after  # void if keep_all_best False

    # define dataloaders
    train_loader = setup_loader(ap, 1, is_val=False, verbose=True)
    eval_loader = setup_loader(ap, 1, is_val=True, verbose=True)

    global_step = args.restore_step

    def set_phase():
        """Set AlignTTS training phase"""
        if not isinstance(config.phase_start_steps, list):
            return None
        # `global_step` is read from the enclosing scope at call time, so
        # each call sees the value updated by the training loop below.
        vals = [i < global_step for i in config.phase_start_steps]
        if True not in vals:
            return 0
        # index of the last start step that global_step has already passed
        return len(vals) - vals[::-1].index(True) - 1

    for epoch in range(0, config.epochs):
        cur_phase = set_phase()
        print(f"\n > Current AlignTTS phase: {cur_phase}")
        c_logger.print_epoch_start(epoch, config.epochs)
        train_avg_loss_dict, global_step = train(train_loader, model, criterion, optimizer, scheduler, ap,
                                                 global_step, epoch, cur_phase)
        eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, global_step, epoch, cur_phase)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        # The eval loss drives model selection only when evaluation is on.
        target_loss = train_avg_loss_dict["avg_loss"]
        if config.run_eval:
            target_loss = eval_avg_loss_dict["avg_loss"]
        best_loss = save_best_model(
            target_loss,
            best_loss,
            model,
            optimizer,
            global_step,
            epoch,
            1,
            OUT_PATH,
            model_characters,
            keep_all_best=keep_all_best,
            keep_after=keep_after,
        )
def main(args):  # pylint: disable=redefined-outer-name
    """Set up and run the GlowTTS training loop from the module config ``c``.

    Resolves the speaker mapping, builds the model, optimizer and loss,
    optionally initializes apex AMP and restores a checkpoint from
    ``args.restore_path``, then trains and evaluates for ``c.epochs``
    epochs, calling ``save_best_model`` after each one.
    ``args.restore_step`` is written as a side effect.

    Raises:
        AssertionError: if a restored speaker mapping lacks a speaker that
            is present in the training data.
    """
    # pylint: disable=global-variable-undefined
    global meta_data_train, meta_data_eval, symbols, phonemes
    # Audio processor
    ap = AudioProcessor(**c.audio)
    if 'characters' in c.keys():
        symbols, phonemes = make_symbols(**c.characters)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)

    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    # set the portion of the data used for training
    if 'train_portion' in c.keys():
        meta_data_train = meta_data_train[:int(len(meta_data_train) * c.train_portion)]
    if 'eval_portion' in c.keys():
        meta_data_eval = meta_data_eval[:int(len(meta_data_eval) * c.eval_portion)]

    # parse speakers
    if c.use_speaker_embedding:
        speakers = get_speakers(meta_data_train)
        if args.restore_path:
            # reuse the mapping stored next to the restored checkpoint
            prev_out_path = os.path.dirname(args.restore_path)
            speaker_mapping = load_speaker_mapping(prev_out_path)
            assert all([speaker in speaker_mapping for speaker in speakers]), "As of now you, you cannot " \
                                                                              "introduce new speakers to " \
                                                                              "a previously trained model."
        else:
            speaker_mapping = {name: i for i, name in enumerate(speakers)}
        save_speaker_mapping(OUT_PATH, speaker_mapping)
        num_speakers = len(speaker_mapping)
        print("Training with {} speakers: {}".format(num_speakers,
                                                     ", ".join(speakers)))
    else:
        num_speakers = 0

    # setup model
    model = setup_model(num_chars, num_speakers, c)
    optimizer = RAdam(model.parameters(), lr=c.lr, weight_decay=0,
                      betas=(0.9, 0.98), eps=1e-9)
    criterion = GlowTTSLoss()

    if c.apex_amp_level:
        # pylint: disable=import-outside-toplevel
        from apex import amp
        from apex.parallel import DistributedDataParallel as DDP
        model.cuda()
        model, optimizer = amp.initialize(model, optimizer, opt_level=c.apex_amp_level)
    else:
        amp = None

    if args.restore_path:
        checkpoint = torch.load(args.restore_path, map_location='cpu')
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                # Deliberately jump to the partial-initialization path below.
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except Exception:  # pylint: disable=broad-except
            # Any restore failure still falls back to partial initialization,
            # but KeyboardInterrupt/SystemExit are no longer swallowed as
            # they were by the previous bare `except:`.
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint['model'], c)
            model.load_state_dict(model_dict)
            del model_dict

        if amp and 'amp' in checkpoint:
            amp.load_state_dict(checkpoint['amp'])

        for group in optimizer.param_groups:
            group['initial_lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model.cuda()
        criterion.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        # NOTE(review): `DDP` is bound only by the function-local apex import
        # above, so this line fails with an unbound local when
        # `c.apex_amp_level` is falsy - confirm the intended DDP class for
        # the non-apex multi-GPU case.
        model = DDP(model)

    if c.noam_schedule:
        scheduler = NoamLR(optimizer,
                           warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    # No best loss is carried over from a previous run in this function
    # (the old `'best_loss' not in locals()` check was always true).
    best_loss = float('inf')

    global_step = args.restore_step
    model = data_depended_init(model, ap)
    for epoch in range(0, c.epochs):
        c_logger.print_epoch_start(epoch, c.epochs)
        train_avg_loss_dict, global_step = train(model, criterion, optimizer,
                                                 scheduler, ap, global_step,
                                                 epoch, amp)
        eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        # The eval loss drives model selection only when evaluation is on.
        target_loss = train_avg_loss_dict['avg_loss']
        if c.run_eval:
            target_loss = eval_avg_loss_dict['avg_loss']
        best_loss = save_best_model(target_loss, best_loss, model, optimizer,
                                    global_step, epoch, c.r, OUT_PATH,
                                    amp_state_dict=amp.state_dict() if amp else None)
def main(args):  # pylint: disable=redefined-outer-name
    """Set up the Tacotron training run and execute the epoch loop.

    Builds the audio processor, model, optimizers, loss and (optionally)
    mixed-precision grad scalers from the module-level config ``c``,
    optionally restores a checkpoint, then trains for ``c.epochs`` epochs,
    calling ``save_best_model`` after every epoch.

    Args:
        args: parsed command-line namespace. Reads ``restore_path``, ``rank``
            and ``group_id``; writes ``restore_step`` (the checkpoint step,
            or 0 when no checkpoint is restored).

    Relies on module-level names defined elsewhere in the file: ``c``,
    ``num_gpus``, ``use_cuda``, ``OUT_PATH`` and ``c_logger``.
    """
    # pylint: disable=global-variable-undefined
    global meta_data_train, meta_data_eval, symbols, phonemes

    # Audio processor
    ap = AudioProcessor(**c.audio)
    if 'characters' in c.keys():
        # a custom character set in the config overrides the global symbols
        symbols, phonemes = make_symbols(**c.characters)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)

    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    # set the portion of the data used for training
    if 'train_portion' in c.keys():
        meta_data_train = meta_data_train[:int(
            len(meta_data_train) * c.train_portion)]
    if 'eval_portion' in c.keys():
        meta_data_eval = meta_data_eval[:int(
            len(meta_data_eval) * c.eval_portion)]

    # parse speakers
    num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(
        c, args, meta_data_train, OUT_PATH)

    model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim)

    # scalers for mixed precision training
    scaler = torch.cuda.amp.GradScaler() if c.mixed_precision else None
    scaler_st = torch.cuda.amp.GradScaler(
    ) if c.mixed_precision and c.separate_stopnet else None

    params = set_weight_decay(model, c.wd)
    optimizer = RAdam(params, lr=c.lr, weight_decay=0)
    if c.stopnet and c.separate_stopnet:
        # the stopnet gets its own optimizer when trained separately
        optimizer_st = RAdam(model.decoder.stopnet.parameters(),
                             lr=c.lr,
                             weight_decay=0)
    else:
        optimizer_st = None

    # setup criterion
    criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4)

    if args.restore_path:
        checkpoint = torch.load(args.restore_path, map_location='cpu')
        try:
            print(" > Restoring Model.")
            model.load_state_dict(checkpoint['model'])
            # optimizer restore
            print(" > Restoring Optimizer.")
            optimizer.load_state_dict(checkpoint['optimizer'])
            if "scaler" in checkpoint and c.mixed_precision:
                print(" > Restoring AMP Scaler...")
                scaler.load_state_dict(checkpoint["scaler"])
            if c.reinit_layers:
                # force the partial-initialization path below
                raise RuntimeError
        except (KeyError, RuntimeError):
            # RuntimeError must be caught as well: it is raised on purpose
            # above for ``c.reinit_layers`` and by ``load_state_dict`` when
            # the checkpoint does not match the model. Catching only
            # KeyError made both cases abort instead of falling back.
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint['model'], c)
            model.load_state_dict(model_dict)
            del model_dict

        # the configured learning rate wins over the one stored in the
        # restored optimizer state
        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model.cuda()
        criterion.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.noam_schedule:
        scheduler = NoamLR(optimizer,
                           warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    # ``best_loss`` is never bound earlier in this function, so the former
    # ``'best_loss' not in locals()`` check was always true.
    best_loss = float('inf')

    global_step = args.restore_step
    for epoch in range(0, c.epochs):
        c_logger.print_epoch_start(epoch, c.epochs)
        # set gradual training
        if c.gradual_training is not None:
            r, c.batch_size = gradual_training_scheduler(global_step, c)
            c.r = r
            model.decoder.set_r(r)
            if c.bidirectional_decoder:
                model.decoder_backward.set_r(r)
            print("\n > Number of output frames:", model.decoder.r)
        train_avg_loss_dict, global_step = train(model, criterion, optimizer,
                                                 optimizer_st, scheduler, ap,
                                                 global_step, epoch, scaler,
                                                 scaler_st, speaker_mapping)
        eval_avg_loss_dict = evaluate(model, criterion, ap, global_step,
                                      epoch, speaker_mapping)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        # model selection uses the eval loss when evaluation is enabled,
        # otherwise the training loss
        target_loss = train_avg_loss_dict['avg_postnet_loss']
        if c.run_eval:
            target_loss = eval_avg_loss_dict['avg_postnet_loss']
        best_loss = save_best_model(
            target_loss,
            best_loss,
            model,
            optimizer,
            global_step,
            epoch,
            c.r,
            OUT_PATH,
            scaler=scaler.state_dict() if c.mixed_precision else None)
def main(args):  # pylint: disable=redefined-outer-name
    """Set up the Tacotron training run and execute the epoch loop.

    Builds the audio processor, speaker mapping, model, optimizers and loss
    from the module-level config ``c``, optionally restores model weights
    from a checkpoint, then trains for ``c.epochs`` epochs, calling
    ``save_best_model`` after every epoch.

    Args:
        args: parsed command-line namespace. Reads ``restore_path``, ``rank``
            and ``group_id``; writes ``restore_step`` (the checkpoint step,
            or 0 when no checkpoint is restored).

    Raises:
        AssertionError: if speaker embeddings are enabled, a checkpoint is
            being restored, and the training data contains a speaker that is
            missing from the previously saved speaker mapping.

    Relies on module-level names defined elsewhere in the file: ``c``,
    ``num_gpus``, ``use_cuda``, ``OUT_PATH`` and ``c_logger``.
    """
    # pylint: disable=global-variable-undefined
    global meta_data_train, meta_data_eval, symbols, phonemes

    # Audio processor
    ap = AudioProcessor(**c.audio)
    if 'characters' in c.keys():
        # a custom character set in the config overrides the global symbols
        symbols, phonemes = make_symbols(**c.characters)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)

    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    # set the portion of the data used for training
    if 'train_portion' in c.keys():
        meta_data_train = meta_data_train[:int(len(meta_data_train) * c.train_portion)]
    if 'eval_portion' in c.keys():
        meta_data_eval = meta_data_eval[:int(len(meta_data_eval) * c.eval_portion)]

    # parse speakers
    if c.use_speaker_embedding:
        speakers = get_speakers(meta_data_train)
        if args.restore_path:
            # reuse the mapping stored next to the restored checkpoint
            prev_out_path = os.path.dirname(args.restore_path)
            speaker_mapping = load_speaker_mapping(prev_out_path)
            assert all(speaker in speaker_mapping for speaker in speakers), \
                "As of now you, you cannot " \
                "introduce new speakers to " \
                "a previously trained model."
        else:
            speaker_mapping = {name: i for i, name in enumerate(speakers)}
        save_speaker_mapping(OUT_PATH, speaker_mapping)
        num_speakers = len(speaker_mapping)
        print("Training with {} speakers: {}".format(num_speakers,
                                                     ", ".join(speakers)))
    else:
        num_speakers = 0

    model = setup_model(num_chars, num_speakers, c)

    params = set_weight_decay(model, c.wd)
    optimizer = RAdam(params, lr=c.lr, weight_decay=0)
    if c.stopnet and c.separate_stopnet:
        # the stopnet gets its own optimizer when trained separately
        optimizer_st = RAdam(model.decoder.stopnet.parameters(),
                             lr=c.lr,
                             weight_decay=0)
    else:
        optimizer_st = None

    # setup criterion
    criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4)

    if args.restore_path:
        checkpoint = torch.load(args.restore_path, map_location='cpu')
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            # optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                # force the partial-initialization path below
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except (KeyError, RuntimeError):
            # Narrowed from a bare ``except:`` so that KeyboardInterrupt,
            # SystemExit and unrelated errors are no longer swallowed. The
            # fallback still covers the deliberate RuntimeError above, a
            # missing 'model' key, and a state-dict mismatch.
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint['model'], c)
            model.load_state_dict(model_dict)
            del model_dict

        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model.cuda()
        criterion.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.noam_schedule:
        scheduler = NoamLR(optimizer,
                           warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    # ``best_loss`` is never bound earlier in this function, so the former
    # ``'best_loss' not in locals()`` check was always true.
    best_loss = float('inf')

    global_step = args.restore_step
    for epoch in range(0, c.epochs):
        c_logger.print_epoch_start(epoch, c.epochs)
        # set gradual training
        if c.gradual_training is not None:
            r, c.batch_size = gradual_training_scheduler(global_step, c)
            c.r = r
            model.decoder.set_r(r)
            if c.bidirectional_decoder:
                model.decoder_backward.set_r(r)
            print("\n > Number of output frames:", model.decoder.r)
        train_avg_loss_dict, global_step = train(model, criterion, optimizer,
                                                 optimizer_st, scheduler, ap,
                                                 global_step, epoch)
        eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        # model selection uses the eval loss when evaluation is enabled,
        # otherwise the training loss
        target_loss = train_avg_loss_dict['avg_postnet_loss']
        if c.run_eval:
            target_loss = eval_avg_loss_dict['avg_postnet_loss']
        best_loss = save_best_model(target_loss, best_loss, model, optimizer,
                                    global_step, epoch, c.r, OUT_PATH)