def evaluate(self, model: ForwardTacotron, val_set: DataLoader) -> Dict[str, float]:
    """Compute the mean validation losses of ``model`` over ``val_set``.

    The model is switched to eval mode and every batch is run under
    ``torch.no_grad()``. Each loss is summed over the batches and divided
    by ``len(val_set)``.

    Args:
        model: Model to evaluate; it is called with the whole batch dict and
            must return a dict with the keys 'mel', 'mel_post', 'dur',
            'pitch' and 'energy'.
        val_set: Loader yielding batch dicts with the keys 'mel', 'mel_len',
            'dur', 'pitch', 'energy' and 'x_len'.

    Returns:
        Dict with the keys 'mel_loss', 'dur_loss', 'pitch_loss' and
        'energy_loss', each a plain Python float.

    Raises:
        ZeroDivisionError: If ``val_set`` has length zero.
    """
    model.eval()
    m_val_loss = 0
    dur_val_loss = 0
    pitch_val_loss = 0
    energy_val_loss = 0
    device = next(model.parameters()).device
    for batch in val_set:
        batch = to_device(batch, device=device)
        with torch.no_grad():
            pred = model(batch)
            m1_loss = self.l1_loss(pred['mel'], batch['mel'], batch['mel_len'])
            m2_loss = self.l1_loss(pred['mel_post'], batch['mel'], batch['mel_len'])
            dur_loss = self.l1_loss(pred['dur'].unsqueeze(1),
                                    batch['dur'].unsqueeze(1),
                                    batch['x_len'])
            pitch_loss = self.l1_loss(pred['pitch'],
                                      batch['pitch'].unsqueeze(1),
                                      batch['x_len'])
            energy_loss = self.l1_loss(pred['energy'],
                                       batch['energy'].unsqueeze(1),
                                       batch['x_len'])
            # .item() keeps every accumulator a Python float, so the returned
            # values match the annotated Dict[str, float] instead of being
            # tensors for pitch and energy only.
            pitch_val_loss += pitch_loss.item()
            energy_val_loss += energy_loss.item()
            m_val_loss += m1_loss.item() + m2_loss.item()
            dur_val_loss += dur_loss.item()
    num_batches = len(val_set)
    return {
        'mel_loss': m_val_loss / num_batches,
        'dur_loss': dur_val_loss / num_batches,
        'pitch_loss': pitch_val_loss / num_batches,
        'energy_loss': energy_val_loss / num_batches
    }
def evaluate(self, model: ForwardTacotron, val_set: Dataset) -> Tuple[float, float, float]:
    """Compute mean mel, duration and pitch validation losses.

    The model is put in eval mode and each batch is run under
    ``torch.no_grad()``. Losses are summed over the batches and divided by
    ``len(val_set)``.

    Args:
        model: Model called as ``model(x, m, dur, mel_lens, pitch, puncts)``
            and returning ``(m1_hat, m2_hat, dur_hat, pitch_hat)``.
        val_set: Iterable of 8-tuples
            ``(x, m, ids, x_lens, mel_lens, dur, pitch, puncts)``; ``ids`` is
            not used here.

    Returns:
        ``(mel_loss, dur_loss, pitch_loss)`` as plain Python floats.

    Raises:
        ZeroDivisionError: If ``val_set`` has length zero.
    """
    model.eval()
    m_val_loss = 0
    dur_val_loss = 0
    pitch_val_loss = 0
    device = next(model.parameters()).device
    for x, m, ids, x_lens, mel_lens, dur, pitch, puncts in val_set:
        x, m, dur, x_lens, mel_lens, pitch, puncts = (
            x.to(device),
            m.to(device),
            dur.to(device),
            x_lens.to(device),
            mel_lens.to(device),
            pitch.to(device),
            puncts.to(device),
        )
        with torch.no_grad():
            m1_hat, m2_hat, dur_hat, pitch_hat = model(
                x, m, dur, mel_lens, pitch, puncts
            )
            m1_loss = self.l1_loss(m1_hat, m, mel_lens)
            m2_loss = self.l1_loss(m2_hat, m, mel_lens)
            dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1), x_lens)
            pitch_loss = self.l1_loss(pitch_hat, pitch.unsqueeze(1), x_lens)
            # .item() so the pitch total is a float like the other two,
            # as promised by the return annotation.
            pitch_val_loss += pitch_loss.item()
            m_val_loss += m1_loss.item() + m2_loss.item()
            dur_val_loss += dur_loss.item()
    num_batches = len(val_set)
    m_val_loss /= num_batches
    dur_val_loss /= num_batches
    pitch_val_loss /= num_batches
    return m_val_loss, dur_val_loss, pitch_val_loss
def generate_plots(self, model: ForwardTacotron, session: ForwardSession) -> None:
    """Log mel figures and reconstructed audio for the session's validation sample.

    Two passes are logged to ``self.writer`` at ``model.step``: a forward pass
    fed with the target mel and durations (tags ``Ground_Truth_Aligned/*``)
    and a ``model.generate`` pass on the first text sequence (tags
    ``Generated/*``). No pitch figures are produced by this variant.

    Args:
        model: Model to plot with; it is switched to eval mode.
        session: Provides ``val_sample`` as
            ``(x, m, ids, x_lens, mel_lens, dur)``.
    """
    model.eval()
    device = next(model.parameters()).device

    x, mel, ids, x_lens, mel_lens, dur = session.val_sample
    x = x.to(device)
    mel = mel.to(device)
    dur = dur.to(device)
    mel_lens = mel_lens.to(device)

    # Pass conditioned on the targets; keep only the first batch item,
    # cut to at most 600 entries along the second axis.
    lin_out, post_out, _ = model(x, mel, dur, mel_lens)
    lin_np = np_now(lin_out)[0, :600, :]
    post_np = np_now(post_out)[0, :600, :]
    target_np = np_now(mel)[0, :600, :]

    lin_fig = plot_mel(lin_np)
    post_fig = plot_mel(post_np)
    target_fig = plot_mel(target_np)

    for tag, fig in (('Ground_Truth_Aligned/target', target_fig),
                     ('Ground_Truth_Aligned/linear', lin_fig),
                     ('Ground_Truth_Aligned/postnet', post_fig)):
        self.writer.add_figure(tag, fig, model.step)

    post_wav = reconstruct_waveform(post_np)
    target_wav = reconstruct_waveform(target_np)
    for tag, wav in (('Ground_Truth_Aligned/target_wav', target_wav),
                     ('Ground_Truth_Aligned/postnet_wav', post_wav)):
        self.writer.add_audio(tag=tag, snd_tensor=wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)

    # Free-running pass on the unpadded first text sequence.
    first_seq = x[0, :x_lens[0]].tolist()
    lin_gen, post_gen, _ = model.generate(first_seq)
    lin_fig = plot_mel(lin_gen)
    post_fig = plot_mel(post_gen)

    for tag, fig in (('Generated/target', target_fig),
                     ('Generated/linear', lin_fig),
                     ('Generated/postnet', post_fig)):
        self.writer.add_figure(tag, fig, model.step)

    post_wav = reconstruct_waveform(post_gen)
    for tag, wav in (('Generated/target_wav', target_wav),
                     ('Generated/postnet_wav', post_wav)):
        self.writer.add_audio(tag=tag, snd_tensor=wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
def generate_plots(self, model: ForwardTacotron, session: TTSSession) -> None:
    """Log mel figures and reconstructed audio for the session's validation sample.

    Figures are drawn from the raw model outputs; the arrays are then passed
    through ``rescale_mel`` before ``reconstruct_waveform`` turns them into
    audio. Everything is written to ``self.writer`` at ``model.step`` under
    the ``Ground_Truth_Aligned/*`` and ``Generated/*`` tags.

    Args:
        model: Model to plot with; it is switched to eval mode.
        session: Provides ``val_sample`` as ``(x, m, ids, lens, dur)``.
    """
    model.eval()
    device = next(model.parameters()).device

    x, mel, ids, lens, dur = session.val_sample
    x = x.to(device)
    mel = mel.to(device)
    dur = dur.to(device)

    # Pass conditioned on the target mel and durations; first batch item only.
    lin_out, post_out, _ = model(x, mel, dur)
    lin_np = np_now(lin_out)[0, :600, :]
    post_np = np_now(post_out)[0, :600, :]
    target_np = np_now(mel)[0, :600, :]

    lin_fig = plot_mel(lin_np)
    post_fig = plot_mel(post_np)
    target_fig = plot_mel(target_np)

    for tag, fig in (('Ground_Truth_Aligned/target', target_fig),
                     ('Ground_Truth_Aligned/linear', lin_fig),
                     ('Ground_Truth_Aligned/postnet', post_fig)):
        self.writer.add_figure(tag, fig, model.step)

    # Rescale in the same order as before (linear, postnet, target).
    lin_np = rescale_mel(lin_np)
    post_np = rescale_mel(post_np)
    target_np = rescale_mel(target_np)

    post_wav = reconstruct_waveform(post_np)
    target_wav = reconstruct_waveform(target_np)
    for tag, wav in (('Ground_Truth_Aligned/target_wav', target_wav),
                     ('Ground_Truth_Aligned/postnet_wav', post_wav)):
        self.writer.add_audio(tag=tag, snd_tensor=wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)

    # Free-running pass on the first text sequence of the sample.
    lin_gen, post_gen, _ = model.generate(x[0].tolist())
    lin_gen = rescale_mel(lin_gen)
    post_gen = rescale_mel(post_gen)
    lin_fig = plot_mel(lin_gen)
    post_fig = plot_mel(post_gen)

    for tag, fig in (('Generated/target', target_fig),
                     ('Generated/linear', lin_fig),
                     ('Generated/postnet', post_fig)):
        self.writer.add_figure(tag, fig, model.step)

    post_wav = reconstruct_waveform(post_gen)
    for tag, wav in (('Generated/target_wav', target_wav),
                     ('Generated/postnet_wav', post_wav)):
        self.writer.add_audio(tag=tag, snd_tensor=wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
def evaluate(self, model: ForwardTacotron, val_set: Dataset) -> Tuple[float, float]:
    """Return the mean mel and duration validation losses.

    The model is put in eval mode and each batch is run under
    ``torch.no_grad()``. The mel loss is the sum of ``self.l1_loss`` for both
    mel outputs; the duration loss is ``F.l1_loss`` on the raw durations.

    Args:
        model: Model called as ``model(x, m, dur)`` and returning
            ``(m1_hat, m2_hat, dur_hat)``.
        val_set: Iterable of ``(x, m, ids, lens, dur)`` tuples.

    Returns:
        ``(mel_loss, dur_loss)``, each averaged over ``len(val_set)``.
    """
    model.eval()
    device = next(model.parameters()).device
    mel_total = 0
    dur_total = 0
    for x, m, ids, lens, dur in val_set:
        x = x.to(device)
        m = m.to(device)
        dur = dur.to(device)
        lens = lens.to(device)
        with torch.no_grad():
            m1_hat, m2_hat, dur_hat = model(x, m, dur)
            linear_loss = self.l1_loss(m1_hat, m, lens)
            postnet_loss = self.l1_loss(m2_hat, m, lens)
            duration_loss = F.l1_loss(dur_hat, dur)
            mel_total += linear_loss.item() + postnet_loss.item()
            dur_total += duration_loss.item()
    num_batches = len(val_set)
    return mel_total / num_batches, dur_total / num_batches
def get_forward_model(model_path, device='cuda'):
    """Build a ForwardTacotron from the ``hp`` hyperparameters and load weights.

    Args:
        model_path: Passed unchanged to ``model.load``.
        device: Device (string or ``torch.device``) the model is moved to
            before the weights are loaded. Defaults to ``'cuda'``, the value
            that used to be hard-coded, so existing callers are unaffected.

    Returns:
        The model after ``model.load(model_path)``.
    """
    device = torch.device(device)
    model = ForwardTacotron(embed_dims=hp.forward_embed_dims,
                            num_chars=len(symbols),
                            durpred_rnn_dims=hp.forward_durpred_rnn_dims,
                            durpred_conv_dims=hp.forward_durpred_conv_dims,
                            rnn_dim=hp.forward_rnn_dims,
                            postnet_k=hp.forward_postnet_K,
                            postnet_dims=hp.forward_postnet_dims,
                            prenet_k=hp.forward_prenet_K,
                            prenet_dims=hp.forward_prenet_dims,
                            highways=hp.forward_num_highways,
                            dropout=hp.forward_dropout,
                            n_mels=hp.num_mels).to(device)
    model.load(model_path)
    return model
def train(self, model: ForwardTacotron, optimizer: Optimizer) -> None:
    """Run every schedule entry the model has not yet completed.

    The schedule comes from ``self.train_cfg['schedule']`` via
    ``parse_schedule``; each entry is ``(lr, max_step, bs)``. Entries whose
    ``max_step`` the model has already reached are skipped. For the rest, the
    datasets are built and handed to ``self.train_session``.

    Args:
        model: Model to train.
        optimizer: Optimizer passed on to ``self.train_session``.
    """
    schedule = parse_schedule(self.train_cfg['schedule'])
    for index, (lr, max_step, bs) in enumerate(schedule, 1):
        if not model.get_step() < max_step:
            continue
        train_set, val_set = get_tts_datasets(
            path=self.paths.data,
            batch_size=bs,
            r=1,
            model_type='forward',
            max_mel_len=self.train_cfg['max_mel_len'],
            filter_attention=self.train_cfg['filter_attention'],
            filter_min_alignment=self.train_cfg['min_attention_alignment'],
            filter_min_sharpness=self.train_cfg['min_attention_sharpness'])
        session = TTSSession(index=index,
                             r=1,
                             lr=lr,
                             max_step=max_step,
                             bs=bs,
                             train_set=train_set,
                             val_set=val_set)
        self.train_session(model, optimizer, session)
def load_forward_taco(checkpoint_path: str) -> Tuple[ForwardTacotron, Dict[str, Any]]:
    """Restore a ForwardTacotron and its config from a checkpoint file.

    The checkpoint is loaded onto the CPU and must contain the keys
    'config' and 'model'.

    Args:
        checkpoint_path: Path handed to ``torch.load``.

    Returns:
        ``(model, config)`` where the model was built with
        ``ForwardTacotron.from_config(config)`` and had the stored state dict
        loaded into it.
    """
    print(f'Loading tts checkpoint {checkpoint_path}')
    cpu = torch.device('cpu')
    checkpoint = torch.load(checkpoint_path, map_location=cpu)
    config = checkpoint['config']
    model = ForwardTacotron.from_config(config)
    model.load_state_dict(checkpoint['model'])
    print(f'Loaded forward taco with step {model.get_step()}')
    return model, config
def train(self, model: ForwardTacotron, optimizer: Optimizer) -> None:
    """Run every entry of ``hp.forward_schedule`` the model has not yet completed.

    Each entry is ``(lr, max_step, bs)``. Entries whose ``max_step`` the model
    has already reached are skipped; for the others the datasets are built
    and a ``TTSSession`` is handed to ``self.train_session``.

    Args:
        model: Model to train.
        optimizer: Optimizer passed on to ``self.train_session``.
    """
    for index, (lr, max_step, bs) in enumerate(hp.forward_schedule, 1):
        if not model.get_step() < max_step:
            continue
        train_set, val_set = get_tts_datasets(path=self.paths.data,
                                              batch_size=bs,
                                              r=1,
                                              model_type='forward')
        session = TTSSession(index=index,
                             r=1,
                             lr=lr,
                             max_step=max_step,
                             bs=bs,
                             train_set=train_set,
                             val_set=val_set)
        self.train_session(model, optimizer, session)
def __init__(self, tts_path: str, voc_path: str, device='cuda'):
    """Load the TTS model, both vocoders and the text/audio helpers.

    Args:
        tts_path: ForwardTacotron checkpoint containing the keys 'config'
            and 'model'.
        voc_path: Checkpoint passed to ``WaveRNN.from_checkpoint``.
        device: Device used as ``map_location`` for the TTS checkpoint and
            as the target device of the MelGAN model.
    """
    self.device = torch.device(device)

    # TTS model: rebuilt from the config stored next to the weights.
    checkpoint = torch.load(tts_path, map_location=self.device)
    config = checkpoint['config']
    self.tts_model = ForwardTacotron.from_config(config)
    self.tts_model.load_state_dict(checkpoint['model'])

    # Vocoders: WaveRNN from a local checkpoint, MelGAN via torch.hub.
    self.wavernn = WaveRNN.from_checkpoint(voc_path)
    self.melgan = torch.hub.load('seungwonpark/melgan', 'melgan')
    self.melgan.to(device).eval()

    # Text and signal processing, configured like the TTS model.
    self.cleaner = Cleaner.from_config(config)
    self.tokenizer = Tokenizer()
    self.dsp = DSP.from_config(config)
def train(self, model_tts: ForwardTacotron, model_asr: Wav2Vec2ForCTC,
          optimizer_tts: Optimizer, optimizer_asr: Optimizer) -> None:
    """Run the joint TTS/ASR training sessions defined by ``hp.forward_schedule``.

    The pickled ASR train and test sets are loaded once and wrapped in a
    trainer via ``init_trainer``. For every schedule entry
    ``(lr, max_step, bs)`` that the TTS model has not yet completed, a
    ``ForwardSession`` and an ``ASRSession`` are created and passed to
    ``self.train_session``.

    Args:
        model_tts: TTS model; its step decides which entries are skipped.
        model_asr: ASR model trained alongside the TTS model.
        optimizer_tts: Optimizer for ``model_tts``.
        optimizer_asr: Optimizer for ``model_asr``.
    """
    print("Loading ASR training data...")
    asr_train_set = unpickle_binary('./data/speech-sme-asr/train_asr.pkl')
    asr_test_set = unpickle_binary('./data/speech-sme-asr/test_asr.pkl')
    asr_trainer = init_trainer(asr_train_set, asr_test_set)

    for index, (lr, max_step, bs) in enumerate(hp.forward_schedule, 1):
        if not model_tts.get_step() < max_step:
            continue

        path = self.paths.data
        tts_train_set, tts_val_set = get_tts_datasets(path=self.paths.data,
                                                      batch_size=bs,
                                                      r=1,
                                                      model_type='forward')
        asr_train_set = asr_trainer.get_train_dataloader()
        # Rebinds asr_test_set: a later schedule entry passes the loader
        # created here back into get_test_dataloader.
        asr_test_set = asr_trainer.get_test_dataloader(asr_test_set)
        asr_pr = Wav2Vec2Processor.from_pretrained(
            './asr_output/pretrained_processor')

        tts_session = ForwardSession(path,
                                     index=index,
                                     r=1,
                                     lr=lr,
                                     max_step=max_step,
                                     bs=bs,
                                     train_set=tts_train_set,
                                     val_set=tts_val_set)
        # The ASR session uses a fixed batch size of 4.
        asr_session = ASRSession(asr_pr,
                                 index=index,
                                 r=1,
                                 lr=lr,
                                 max_step=max_step,
                                 bs=4,
                                 train_set=asr_train_set,
                                 test_set=asr_test_set)
        self.train_session(model_tts, model_asr, optimizer_tts, tts_session,
                           asr_session, asr_trainer, optimizer_asr)
def synthesize(input_text: str, tts_model: ForwardTacotron, voc_model: torch.nn.Module, alpha=1.0,
               pitch_function: Callable[[torch.tensor], torch.tensor] = lambda x: x):
    """Turn ``input_text`` into a waveform with the given TTS model and vocoder.

    Args:
        input_text: Text to speak; it is stripped and cleaned first.
        tts_model: Model whose ``generate`` returns a 4-tuple of which the
            second element is the mel passed to the vocoder.
        voc_model: Either the string ``'griffinlim'``, a ``WaveRNN`` instance,
            or any other object exposing ``inference(mel)``.
        alpha: Forwarded to ``tts_model.generate``.
        pitch_function: Forwarded to ``tts_model.generate``.

    Returns:
        The waveform produced by the selected vocoder.
    """
    cleaned = clean_text(input_text.strip())
    sequence = text_to_sequence(cleaned)
    _, mel, _, _ = tts_model.generate(sequence, alpha=alpha,
                                      pitch_function=pitch_function)

    if voc_model == 'griffinlim':
        return reconstruct_waveform(mel, n_iter=32)

    if isinstance(voc_model, WaveRNN):
        batched = torch.tensor(mel).unsqueeze(0)
        return voc_model.generate(batched, '/tmp/sample.wav', True,
                                  hp.voc_target, hp.voc_overlap, hp.mu_law)

    # Any other vocoder is run on CUDA through its inference() method.
    batched = torch.tensor(mel).unsqueeze(0).cuda()
    with torch.no_grad():
        return voc_model.inference(batched).cpu().numpy()
'`batch_size` must be evenly divisible by n_gpus!') else: device = torch.device('cpu') print('Using device:', device) # Instantiate Forward TTS Model print('\nInitialising Forward TTS Model...\n') model = ForwardTacotron(embed_dims=hp.forward_embed_dims, num_chars=len(phonemes), durpred_rnn_dims=hp.forward_durpred_rnn_dims, durpred_conv_dims=hp.forward_durpred_conv_dims, durpred_dropout=hp.forward_durpred_dropout, pitch_rnn_dims=hp.forward_pitch_rnn_dims, pitch_conv_dims=hp.forward_pitch_conv_dims, pitch_dropout=hp.forward_pitch_dropout, pitch_emb_dims=hp.forward_pitch_emb_dims, pitch_proj_dropout=hp.forward_pitch_proj_dropout, rnn_dim=hp.forward_rnn_dims, postnet_k=hp.forward_postnet_K, postnet_dims=hp.forward_postnet_dims, prenet_k=hp.forward_prenet_K, prenet_dims=hp.forward_prenet_dims, highways=hp.forward_num_highways, dropout=hp.forward_dropout, n_mels=hp.num_mels).to(device) model_parameters = filter(lambda p: p.requires_grad, model.parameters()) params = sum([np.prod(p.size()) for p in model_parameters]) print(f'num params {params}') optimizer = optim.Adam(model.parameters()) restore_checkpoint('forward',
def train_session(self, model: ForwardTacotron, optimizer: Optimizer, session: TTSSession) -> None:
    """Train ``model`` on ``session.train_set`` up to ``session.max_step``.

    The number of epochs is derived from the remaining steps and the number
    of batches per epoch. Each batch contributes the loss
    ``m1 + m2 + 0.3 * dur + 0.1 * pitch``. Checkpoints and plots are written
    every ``hp.forward_checkpoint_every`` / ``hp.forward_plot_every`` steps.
    After every epoch the model is evaluated on ``session.val_set``, the
    latest checkpoint is saved and the running averages are reset.

    Args:
        model: Model called as ``model(x, m, dur, mel_lens, pitch, puncts)``.
        optimizer: Optimizer; its learning rate is set to ``session.lr``.
        session: Provides ``train_set``, ``val_set``, ``max_step``, ``bs``
            and ``lr``.
    """
    current_step = model.get_step()
    training_steps = session.max_step - current_step
    total_iters = len(session.train_set)
    epochs = training_steps // total_iters + 1

    simple_table([(f'Steps', str(training_steps // 1000) + 'k Steps'),
                  ('Batch Size', session.bs),
                  ('Learning Rate', session.lr)])

    for g in optimizer.param_groups:
        g['lr'] = session.lr

    m_loss_avg = Averager()
    dur_loss_avg = Averager()
    duration_avg = Averager()
    pitch_loss_avg = Averager()
    device = next(model.parameters()).device  # use same device as model parameters

    for e in range(1, epochs + 1):
        for i, (x, m, ids, x_lens, mel_lens, dur, pitch, puncts) in enumerate(
            session.train_set, 1
        ):
            start = time.time()
            model.train()
            x, m, dur, x_lens, mel_lens, pitch, puncts = (
                x.to(device),
                m.to(device),
                dur.to(device),
                x_lens.to(device),
                mel_lens.to(device),
                pitch.to(device),
                puncts.to(device),
            )

            m1_hat, m2_hat, dur_hat, pitch_hat = model(
                x, m, dur, mel_lens, pitch, puncts
            )

            m1_loss = self.l1_loss(m1_hat, m, mel_lens)
            m2_loss = self.l1_loss(m2_hat, m, mel_lens)
            dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1), x_lens)
            pitch_loss = self.l1_loss(pitch_hat, pitch.unsqueeze(1), x_lens)

            loss = m1_loss + m2_loss + 0.3 * dur_loss + 0.1 * pitch_loss
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), hp.tts_clip_grad_norm)
            optimizer.step()

            m_loss_avg.add(m1_loss.item() + m2_loss.item())
            dur_loss_avg.add(dur_loss.item())
            step = model.get_step()
            k = step // 1000

            duration_avg.add(time.time() - start)
            pitch_loss_avg.add(pitch_loss.item())

            speed = 1. / duration_avg.get()
            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                  f'| Dur Loss: {dur_loss_avg.get():#.4} | Pitch Loss: {pitch_loss_avg.get():#.4} ' \
                  f'| {speed:#.2} steps/s | Step: {k}k | '

            if step % hp.forward_checkpoint_every == 0:
                ckpt_name = f'forward_step{k}K'
                save_checkpoint('forward', self.paths, model, optimizer,
                                name=ckpt_name, is_silent=True)

            if step % hp.forward_plot_every == 0:
                self.generate_plots(model, session)

            self.writer.add_scalar('Mel_Loss/train', m1_loss + m2_loss, model.get_step())
            self.writer.add_scalar('Pitch_Loss/train', pitch_loss, model.get_step())
            self.writer.add_scalar('Duration_Loss/train', dur_loss, model.get_step())
            self.writer.add_scalar('Params/batch_size', session.bs, model.get_step())
            self.writer.add_scalar('Params/learning_rate', session.lr, model.get_step())

            stream(msg)

        # End of epoch: validate, save the latest weights, start fresh averages.
        m_val_loss, dur_val_loss, pitch_val_loss = self.evaluate(model, session.val_set)
        self.writer.add_scalar('Mel_Loss/val', m_val_loss, model.get_step())
        self.writer.add_scalar('Duration_Loss/val', dur_val_loss, model.get_step())
        self.writer.add_scalar('Pitch_Loss/val', pitch_val_loss, model.get_step())
        save_checkpoint('forward', self.paths, model, optimizer, is_silent=True)

        m_loss_avg.reset()
        # The duration-loss average was the only one not reset, so the
        # progress line mixed a per-epoch mel/pitch average with an
        # all-epochs duration average.
        dur_loss_avg.reset()
        duration_avg.reset()
        pitch_loss_avg.reset()
        print(' ')
mode=hp.voc_mode).to(device) voc_load_path = args.voc_weights if args.voc_weights else paths.voc_latest_weights voc_model.load(voc_load_path) print('\nInitialising Forward TTS Model...\n') tts_model = ForwardTacotron( embed_dims=hp.forward_embed_dims, num_chars=len(phonemes), durpred_rnn_dims=hp.forward_durpred_rnn_dims, durpred_conv_dims=hp.forward_durpred_conv_dims, durpred_dropout=hp.forward_durpred_dropout, pitch_rnn_dims=hp.forward_pitch_rnn_dims, pitch_conv_dims=hp.forward_pitch_conv_dims, pitch_dropout=hp.forward_pitch_dropout, pitch_emb_dims=hp.forward_pitch_emb_dims, pitch_proj_dropout=hp.forward_pitch_proj_dropout, rnn_dim=hp.forward_rnn_dims, postnet_k=hp.forward_postnet_K, postnet_dims=hp.forward_postnet_dims, prenet_k=hp.forward_prenet_K, prenet_dims=hp.forward_prenet_dims, highways=hp.forward_num_highways, dropout=hp.forward_dropout, n_mels=hp.num_mels).to(device) tts_load_path = tts_weights if tts_weights else paths.forward_latest_weights tts_model.load(tts_load_path) if input_text: text = clean_text(input_text.strip())
def generate_plots(self, model: ForwardTacotron, session: TTSSession) -> None:
    """Log mel, pitch and energy figures plus audio for the validation sample.

    Two passes are logged to ``self.writer`` at ``model.step``: a forward
    pass on the full batch dict (tags ``*/target``,
    ``*/ground_truth_aligned`` and ``Ground_Truth_Aligned/*``) and a
    ``model.generate`` pass on the unpadded first text sequence (tags
    ``*/generated`` and ``Generated/*``). Audio is produced with
    ``self.dsp.griffinlim``.

    Args:
        model: Model to plot with; it is switched to eval mode.
        session: Provides the batch dict ``val_sample`` with the keys 'x',
            'x_len', 'mel', 'pitch' and 'energy'.
    """
    model.eval()
    device = next(model.parameters()).device
    batch = session.val_sample
    batch = to_device(batch, device=device)

    pred = model(batch)
    # First batch item only, cut to at most 600 entries along the second axis.
    m1_hat = np_now(pred['mel'])[0, :600, :]
    m2_hat = np_now(pred['mel_post'])[0, :600, :]
    m_target = np_now(batch['mel'])[0, :600, :]

    m1_hat_fig = plot_mel(m1_hat)
    m2_hat_fig = plot_mel(m2_hat)
    m_target_fig = plot_mel(m_target)
    pitch_fig = plot_pitch(np_now(batch['pitch'][0]))
    pitch_gta_fig = plot_pitch(np_now(pred['pitch'].squeeze()[0]))
    # Energy curves are drawn with the pitch plotting helper as well.
    energy_fig = plot_pitch(np_now(batch['energy'][0]))
    energy_gta_fig = plot_pitch(np_now(pred['energy'].squeeze()[0]))

    self.writer.add_figure('Pitch/target', pitch_fig, model.step)
    self.writer.add_figure('Pitch/ground_truth_aligned', pitch_gta_fig, model.step)
    self.writer.add_figure('Energy/target', energy_fig, model.step)
    self.writer.add_figure('Energy/ground_truth_aligned', energy_gta_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/target', m_target_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig, model.step)

    m2_hat_wav = self.dsp.griffinlim(m2_hat)
    target_wav = self.dsp.griffinlim(m_target)

    self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav',
                          snd_tensor=target_wav,
                          global_step=model.step,
                          sample_rate=self.dsp.sample_rate)
    self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav',
                          snd_tensor=m2_hat_wav,
                          global_step=model.step,
                          sample_rate=self.dsp.sample_rate)

    gen = model.generate(batch['x'][0:1, :batch['x_len'][0]])

    m1_hat_fig = plot_mel(np_now(gen['mel']))
    m2_hat_fig = plot_mel(np_now(gen['mel_post']))

    pitch_gen_fig = plot_pitch(np_now(gen['pitch'].squeeze()))
    energy_gen_fig = plot_pitch(np_now(gen['energy'].squeeze()))

    self.writer.add_figure('Pitch/generated', pitch_gen_fig, model.step)
    self.writer.add_figure('Energy/generated', energy_gen_fig, model.step)
    self.writer.add_figure('Generated/target', m_target_fig, model.step)
    self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
    self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

    # Vocode the generated postnet mel. Previously the ground-truth-aligned
    # m2_hat was reused here, so 'Generated/postnet_wav' duplicated the
    # 'Ground_Truth_Aligned/postnet_wav' clip.
    # NOTE(review): assumes squeezing the single-item batch yields the same
    # 2-D layout as the sliced arrays above - confirm against griffinlim.
    m2_gen = np_now(gen['mel_post'].squeeze())
    m2_gen_wav = self.dsp.griffinlim(m2_gen)

    self.writer.add_audio(tag='Generated/target_wav',
                          snd_tensor=target_wav,
                          global_step=model.step,
                          sample_rate=self.dsp.sample_rate)
    self.writer.add_audio(tag='Generated/postnet_wav',
                          snd_tensor=m2_gen_wav,
                          global_step=model.step,
                          sample_rate=self.dsp.sample_rate)
def main():
    """Export a ForwardTacotron checkpoint as two ONNX graphs.

    The duration-prediction part is written to
    ``./onnx/forward_tacotron_duration_prediction.onnx`` and the mel
    regression part to ``./onnx/forward_tacotron_regression.onnx``. Both are
    traced on the CPU with a fixed example sentence.
    """
    parser = argparse.ArgumentParser(description='TTS Generator')
    parser.add_argument('--tts_weights',
                        type=str,
                        help='[string/path] Load in different FastSpeech weights')
    parser.add_argument('--hp_file',
                        metavar='FILE',
                        default='hparams.py',
                        help='The file to use for the hyperparameters')
    parser.add_argument('--alpha',
                        type=float,
                        default=1.,
                        help='Parameter for controlling length regulator for speedup '
                             'or slow-down of generated speech, e.g. alpha=2.0 is double-time')

    # Output directory for the exported graphs.
    if not os.path.exists('onnx'):
        os.mkdir('onnx')

    args = parser.parse_args()
    hp.configure(args.hp_file)

    input_text = "the forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."
    tts_weights = args.tts_weights
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    device = torch.device('cpu')
    print('Using device:', device)

    print('\nInitialising Forward TTS Model...\n')
    model_kwargs = dict(embed_dims=hp.forward_embed_dims,
                        num_chars=len(symbols),
                        durpred_rnn_dims=hp.forward_durpred_rnn_dims,
                        durpred_conv_dims=hp.forward_durpred_conv_dims,
                        rnn_dim=hp.forward_rnn_dims,
                        postnet_k=hp.forward_postnet_K,
                        postnet_dims=hp.forward_postnet_dims,
                        prenet_k=hp.forward_prenet_K,
                        prenet_dims=hp.forward_prenet_dims,
                        highways=hp.forward_num_highways,
                        dropout=hp.forward_dropout,
                        n_mels=hp.num_mels)
    tts_model = ForwardTacotron(**model_kwargs).to(device)

    tts_load_path = tts_weights or paths.forward_latest_weights
    tts_model.load(tts_load_path)

    # Wrappers exposing the two halves of the model as separate modules.
    encoder = DurationPredictor(tts_model)
    decoder = Tacotron(tts_model)

    for module in (tts_model, encoder, decoder):
        module.eval()

    opset_version = 10

    with torch.no_grad():
        token_ids = text_to_sequence(input_text.strip(), hp.tts_cleaner_names)
        input_seq = torch.as_tensor(token_ids, dtype=torch.long,
                                    device=device).unsqueeze(0)

        # First step: predict symbol durations.
        torch.onnx.export(encoder, input_seq,
                          "./onnx/forward_tacotron_duration_prediction.onnx",
                          opset_version=opset_version,
                          do_constant_folding=True,
                          input_names=["input_seq"],
                          output_names=["embeddings", "duration"])

        x, durations = encoder(input_seq)

        # Second step: expand symbols by their durations.
        x = encoder.lr(x, durations)

        # Third step: generate the mel.
        torch.onnx.export(decoder, x,
                          "./onnx/forward_tacotron_regression.onnx",
                          opset_version=opset_version,
                          do_constant_folding=True,
                          input_names=["data"],
                          output_names=["mel"])

    print('Done!')
res_out_dims=hp.voc_res_out_dims, res_blocks=hp.voc_res_blocks, hop_length=hp.hop_length, sample_rate=hp.sample_rate, mode=hp.voc_mode).to(device) voc_load_path = args.voc_weights if args.voc_weights else paths.voc_latest_weights voc_model.load(voc_load_path) print('\nInitialising Forward TTS Model...\n') tts_model = ForwardTacotron(embed_dims=hp.forward_embed_dims, num_chars=len(symbols), durpred_rnn_dims=hp.forward_durpred_rnn_dims, durpred_conv_dims=hp.forward_durpred_conv_dims, rnn_dim=hp.forward_rnn_dims, postnet_k=hp.forward_postnet_K, postnet_dims=hp.forward_postnet_dims, prenet_k=hp.forward_prenet_K, prenet_dims=hp.forward_prenet_dims, highways=hp.forward_num_highways, dropout=hp.forward_dropout, n_mels=hp.num_mels).to(device) tts_load_path = tts_weights if tts_weights else paths.forward_latest_weights tts_model.load(tts_load_path) if input_text: inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)] else: with open('sentences.txt') as f: inputs = [ text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f
def train_session(self, model: ForwardTacotron, optimizer: Optimizer, session: TTSSession) -> None:
    """Train ``model`` on ``session.train_set`` up to ``session.max_step``.

    The pitch and energy inputs are randomly zeroed per position according
    to ``train_cfg['pitch_zoneout']`` / ``train_cfg['energy_zoneout']``,
    while the losses are computed against untouched copies. The total loss
    is ``m1 + m2`` plus the duration, pitch and energy losses weighted by
    the matching ``train_cfg['*_loss_factor']`` entries. After every epoch
    the model is evaluated on ``session.val_set``, ``latest_model.pt`` is
    written and the running averages are reset.

    Args:
        model: Model called with the batch dict; returns a dict with the
            keys 'mel', 'mel_post', 'dur', 'pitch' and 'energy'.
        optimizer: Optimizer; its learning rate is set to ``session.lr``.
        session: Provides ``train_set``, ``val_set``, ``max_step``, ``bs``
            and ``lr``.
    """
    current_step = model.get_step()
    training_steps = session.max_step - current_step
    total_iters = len(session.train_set)
    epochs = training_steps // total_iters + 1

    simple_table([(f'Steps', str(training_steps // 1000) + 'k Steps'),
                  ('Batch Size', session.bs),
                  ('Learning Rate', session.lr)])

    for g in optimizer.param_groups:
        g['lr'] = session.lr

    m_loss_avg = Averager()
    dur_loss_avg = Averager()
    duration_avg = Averager()
    pitch_loss_avg = Averager()
    device = next(model.parameters()).device  # use same device as model parameters

    for e in range(1, epochs + 1):
        for i, batch in enumerate(session.train_set, 1):
            batch = to_device(batch, device=device)
            start = time.time()
            model.train()

            # A position is kept where the random draw exceeds the zoneout value.
            pitch_zoneout_mask = torch.rand(
                batch['x'].size()) > self.train_cfg['pitch_zoneout']
            energy_zoneout_mask = torch.rand(
                batch['x'].size()) > self.train_cfg['energy_zoneout']

            # Loss targets are copied before the inputs are masked.
            pitch_target = batch['pitch'].detach().clone()
            energy_target = batch['energy'].detach().clone()
            batch['pitch'] = batch['pitch'] * pitch_zoneout_mask.to(device).float()
            batch['energy'] = batch['energy'] * energy_zoneout_mask.to(device).float()

            pred = model(batch)

            m1_loss = self.l1_loss(pred['mel'], batch['mel'], batch['mel_len'])
            m2_loss = self.l1_loss(pred['mel_post'], batch['mel'], batch['mel_len'])
            dur_loss = self.l1_loss(pred['dur'].unsqueeze(1),
                                    batch['dur'].unsqueeze(1),
                                    batch['x_len'])
            pitch_loss = self.l1_loss(pred['pitch'],
                                      pitch_target.unsqueeze(1),
                                      batch['x_len'])
            energy_loss = self.l1_loss(pred['energy'],
                                       energy_target.unsqueeze(1),
                                       batch['x_len'])

            loss = m1_loss + m2_loss \
                   + self.train_cfg['dur_loss_factor'] * dur_loss \
                   + self.train_cfg['pitch_loss_factor'] * pitch_loss \
                   + self.train_cfg['energy_loss_factor'] * energy_loss

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           self.train_cfg['clip_grad_norm'])
            optimizer.step()

            m_loss_avg.add(m1_loss.item() + m2_loss.item())
            dur_loss_avg.add(dur_loss.item())
            step = model.get_step()
            k = step // 1000

            duration_avg.add(time.time() - start)
            pitch_loss_avg.add(pitch_loss.item())

            speed = 1. / duration_avg.get()
            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                  f'| Dur Loss: {dur_loss_avg.get():#.4} | Pitch Loss: {pitch_loss_avg.get():#.4} ' \
                  f'| {speed:#.2} steps/s | Step: {k}k | '

            if step % self.train_cfg['checkpoint_every'] == 0:
                save_checkpoint(model=model, optim=optimizer, config=self.config,
                                path=self.paths.forward_checkpoints / f'forward_step{k}k.pt')

            if step % self.train_cfg['plot_every'] == 0:
                self.generate_plots(model, session)

            self.writer.add_scalar('Mel_Loss/train', m1_loss + m2_loss, model.get_step())
            self.writer.add_scalar('Pitch_Loss/train', pitch_loss, model.get_step())
            self.writer.add_scalar('Energy_Loss/train', energy_loss, model.get_step())
            self.writer.add_scalar('Duration_Loss/train', dur_loss, model.get_step())
            self.writer.add_scalar('Params/batch_size', session.bs, model.get_step())
            self.writer.add_scalar('Params/learning_rate', session.lr, model.get_step())

            stream(msg)

        # End of epoch: validate, save the latest weights, start fresh averages.
        val_out = self.evaluate(model, session.val_set)
        self.writer.add_scalar('Mel_Loss/val', val_out['mel_loss'], model.get_step())
        self.writer.add_scalar('Duration_Loss/val', val_out['dur_loss'], model.get_step())
        self.writer.add_scalar('Pitch_Loss/val', val_out['pitch_loss'], model.get_step())
        self.writer.add_scalar('Energy_Loss/val', val_out['energy_loss'], model.get_step())
        save_checkpoint(model=model, optim=optimizer, config=self.config,
                        path=self.paths.forward_checkpoints / 'latest_model.pt')

        m_loss_avg.reset()
        # The duration-loss average was the only one not reset, so the
        # progress line mixed a per-epoch mel/pitch average with an
        # all-epochs duration average.
        dur_loss_avg.reset()
        duration_avg.reset()
        pitch_loss_avg.reset()
        print(' ')
config['git_hash'] = try_get_git_hash() dsp = DSP.from_config(config) paths = Paths(config['data_path'], config['voc_model_id'], config['tts_model_id']) assert len(os.listdir(paths.alg)) > 0, f'Could not find alignment files in {paths.alg}, please predict ' \ f'alignments first with python train_tacotron.py --force_align!' force_gta = args.force_gta device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') print('Using device:', device) # Instantiate Forward TTS Model print('\nInitialising Forward TTS Model...\n') model = ForwardTacotron.from_config(config).to(device) optimizer = optim.Adam(model.parameters()) restore_checkpoint(model=model, optim=optimizer, path=paths.forward_checkpoints / 'latest_model.pt', device=device) if force_gta: print('Creating Ground Truth Aligned Dataset...\n') train_set, val_set = get_tts_datasets(paths.data, 8, r=1, model_type='forward', filter_attention=False, max_mel_len=None) create_gta_features(model, train_set, val_set, paths.gta)
# train_session (dual TTS/ASR variant): runs one training session that combines
# - a supervised pass over tts_session.train_set (mel and duration L1 losses),
# - a supervised pass over asr_session.train_set (loss read from the model output),
# - evaluation of the TTS model via self.evaluate and of the ASR model via
#   asr_trainer.prediction_step / asr_trainer.compute_metrics (WER),
# - a call to self.dual_transform for the unsupervised losses,
# - saving of the TTS checkpoint, the ASR model/optimizer and a JSON log
#   (dt_trainer_state.json) under ./checkpoints/sme_speech_tts.asr_forward/.
#
# Parameters: the TTS and ASR models, their optimizers, the two session objects
# and the ASR trainer created by the caller. Returns None.
#
# NOTE(review): the original indentation of this block is lost, so the nesting
#   of the ASR loop, the evaluation and the final exit(11) cannot be read from
#   the text below - confirm against the original file before changing logic.
# NOTE(review): `tts_s_loss.backward()` and `optimizer_tts.step()` are commented
#   out, so the supervised TTS loss is computed and logged but not applied here;
#   likewise no backward/step is visible in the supervised ASR loop - confirm
#   that the updates are meant to happen only inside self.dual_transform.
# NOTE(review): the bare `except:` around get_last_checkpoint falls back to
#   asr_current_step = 0 on any error, not only on a missing checkpoint.
# NOTE(review): `eval_wer / step` and `asr_eval_loss / step` divide by the last
#   zero-based enumerate index, i.e. one less than the number of test batches
#   (and zero for a single batch) - verify.
# NOTE(review): the result of `logits_a.to('cpu')` is discarded.
# NOTE(review): the ASR checkpoint directory name 'checkpoint-27364' is fixed.
def train_session(self, model_tts: ForwardTacotron, model_asr: Wav2Vec2ForCTC, optimizer_tts: Optimizer, tts_session: ForwardSession, asr_session: ASRSession, asr_trainer, optimizer_asr) -> None: # print(tts_session.path) # exit() asr_trainer_state = {'logs': []} current_step = model_tts.get_step() tts_training_steps = tts_session.max_step - current_step try: _, asr_current_step = get_last_checkpoint( './checkpoints/sme_speech_tts.asr_forward/', 'model_at') asr_training_steps = tts_session.max_step - asr_current_step except: asr_current_step = 0 asr_training_steps = tts_training_steps total_iters = len(tts_session.train_set) epochs = tts_training_steps // total_iters + 1 simple_table([ ('TTS Steps', str(tts_training_steps // 1000) + 'k Steps'), ('ASR Steps', str(asr_training_steps // 1000) + 'k Steps'), ('Batch Size TTS', tts_session.bs), ('Learning Rate', tts_session.lr) ]) for g in optimizer_tts.param_groups: g['lr'] = tts_session.lr m_loss_avg = Averager() dur_loss_avg = Averager() duration_avg = Averager() device = next(model_tts.parameters() ).device # use same device as model parameters warnings.filterwarnings('ignore', category=UserWarning) for e in range(1, epochs + 1): #tts train loop for epoch for i, (x, m, ids, x_lens, mel_lens, dur) in enumerate(tts_session.train_set, 1): start = time.time() model_tts.train() x, m, dur, x_lens, mel_lens = x.to(device), m.to(device), dur.to(device),\ x_lens.to(device), mel_lens.to(device) m1_hat, m2_hat, dur_hat = model_tts(x, m, dur, mel_lens) m1_loss = self.l1_loss(m1_hat, m, mel_lens) m2_loss = self.l1_loss(m2_hat, m, mel_lens) dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1), x_lens) tts_s_loss = m1_loss + m2_loss + 0.1 * dur_loss optimizer_tts.zero_grad() # tts_s_loss.backward() torch.nn.utils.clip_grad_norm_(model_tts.parameters(), hp.tts_clip_grad_norm) # optimizer_tts.step() m_loss_avg.add(m1_loss.item() + m2_loss.item()) dur_loss_avg.add(dur_loss.item()) step = model_tts.get_step() k = step // 1000 
duration_avg.add(time.time() - start) # pitch_loss_avg.add(pitch_loss.item()) speed = 1. / duration_avg.get() msg_tts = f'| TTS MODEL (supervised training ): '\ f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \ f'| Dur Loss: {dur_loss_avg.get():#.4} ' \ f'| {speed:#.2} steps/s | Step: {k}k | ' if step % hp.forward_checkpoint_every == 0: ckpt_name = f'forward_step{k}K' save_checkpoint('forward', self.paths, model_tts, optimizer_tts, name=ckpt_name, is_silent=True) if step % hp.forward_plot_every == 0: self.generate_plots(model_tts, tts_session) self.writer.add_scalar('Mel_Loss/train', m1_loss + m2_loss, model_tts.get_step()) self.writer.add_scalar('Duration_Loss/train', dur_loss, model_tts.get_step()) self.writer.add_scalar('Params/batch_size', tts_session.bs, model_tts.get_step()) self.writer.add_scalar('Params/learning_rate', tts_session.lr, model_tts.get_step()) stream(msg_tts) # print(msg_tts) # print(torch.cuda.memory_allocated(device=device)) # model_tts = model_tts.to('cpu') for step, inputs in enumerate(asr_session.train_set): optimizer_asr.zero_grad() model_asr.to(device) for k, v in inputs.items(): if isinstance(v, torch.Tensor): inputs[k] = v.to(device) model_asr.train() outputs = model_asr(**inputs) asr_s_loss = outputs["loss"] if isinstance( outputs, dict) else outputs[0] # asr_s_loss = asr_s_loss.mean() msg_asr = f'| ASR MODEL (supervised training) : '\ f'| Epoch: {e}/{epochs} ({step}/{len(asr_session.train_set)}) | Loss ASR: {asr_s_loss:#.4} '\ f' ||||||||||||||||||||||' stream(msg_asr) # # model_asr.to('cuda') m_val_loss, dur_val_loss = self.evaluate(model_tts, tts_session.val_set) eval_tts_msg = f'| TTS MODEL (supervised eval ): '\ f'| Epoch: {e}/{epochs} | Val Loss: {m_val_loss:#.4} ' \ f'| Dur Val Loss: {dur_val_loss:#.4} ' \ stream(eval_tts_msg) tts_eval_loss = m_val_loss + dur_val_loss # print(eval_tts_msg) # ASR eval supervised print('\nEvaluating ASR model ...') # model_asr.to('cpu') asr_eval_loss = 0 eval_wer 
= 0 for step, inputs in enumerate(asr_session.test_set): asr_eval_loss_i, logits_a, labels_a = asr_trainer.prediction_step( model_asr, inputs, False) asr_eval_loss += asr_eval_loss_i logits_a.to('cpu') eval_wer_i = asr_trainer.compute_metrics( EvalPrediction(predictions=logits_a, label_ids=labels_a)) eval_wer += eval_wer_i['wer'] # print(eval_wer) eval_wer = eval_wer / step asr_eval_loss = asr_eval_loss / step msg_asr_eval = f'| ASR MODEL (supervised eval) : Epoch {e}/{epochs} | Loss ASR: {asr_eval_loss:#.4} | WER: {eval_wer} |||||||||||||||||||||||||||||||||||||||||||||||||||||' stream(msg_asr_eval) # dual transformation loop # tts_s_loss = 3 # asr_s_loss = 1 tts_u_loss, asr_u_loss = self.dual_transform( model_tts, model_asr, optimizer_tts, optimizer_asr, asr_session.test_set, m_loss_avg, dur_loss_avg, device, asr_current_step, e, epochs, duration_avg, total_iters, tts_s_loss, asr_s_loss, tts_session.lr, tts_session.path) step += 1 asr_path = f'checkpoint-27364' modelasr_folder = './checkpoints/sme_speech_tts.asr_forward/' new_check = modelasr_folder + asr_path os.makedirs(new_check, exist_ok=True) # asr_path, asr_step = get_last_checkpoint(modelasr_folder, modelasr_name) save_checkpoint('forward', self.paths, model_tts, optimizer_tts, is_silent=True) # asr_u_loss = 2 if "logs" not in asr_trainer_state: asr_trainer_state['logs'] = [] asr_trainer_state['logs'].append({ 'step': step, 'epoch': e, 'asr_s_loss': int(asr_s_loss), 'asr_u_loss': int(asr_u_loss), 'tts_s_loss': int(tts_s_loss), 'tts_u_loss': int(tts_u_loss), 'tts_eval_loss': int(tts_eval_loss), 'asr_eval_loss': int(asr_eval_loss), 'eval_wer': eval_wer }) with open(f'{modelasr_folder+ asr_path}/dt_trainer_state.json', 'w') as f: json.dump(asr_trainer_state, f) model_asr.save_pretrained(f'{new_check}') torch.save(optimizer_asr.state_dict(), f'{new_check}/optimizer.pt') print("Exiting due to cuda OOM!") exit(11)
def main():
    """Train a ForwardTacotron along ``hp.forward_schedule`` and create GTA features.

    Command-line flags select the hyperparameter file, force CPU training and
    allow skipping training (``--force_gta``). With CUDA in use, every
    scheduled batch size must be divisible by the number of GPUs.

    Raises:
        ValueError: If a scheduled batch size is not divisible by the GPU
            count.
    """
    parser = argparse.ArgumentParser(description='Train Tacotron TTS')
    parser.add_argument('--force_train', '-f', action='store_true',
                        help='Forces the model to train past total steps')
    parser.add_argument('--force_gta', '-g', action='store_true',
                        help='Force the model to create GTA features')
    parser.add_argument('--force_cpu', '-c', action='store_true',
                        help='Forces CPU-only training, even when in CUDA capable environment')
    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py',
                        help='The file to use for the hyperparameters')
    args = parser.parse_args()

    hp.configure(args.hp_file)  # Load hparams from file
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    force_gta = args.force_gta

    use_cuda = not args.force_cpu and torch.cuda.is_available()
    if use_cuda:
        device = torch.device('cuda')
        for _, _, batch_size in hp.forward_schedule:
            if batch_size % torch.cuda.device_count() != 0:
                raise ValueError(
                    '`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    print('\nInitialising Forward TTS Model...\n')
    model_kwargs = dict(embed_dims=hp.forward_embed_dims,
                        num_chars=len(symbols),
                        durpred_rnn_dims=hp.forward_durpred_rnn_dims,
                        durpred_conv_dims=hp.forward_durpred_conv_dims,
                        rnn_dim=hp.forward_rnn_dims,
                        postnet_k=hp.forward_postnet_K,
                        postnet_dims=hp.forward_postnet_dims,
                        prenet_k=hp.forward_prenet_K,
                        prenet_dims=hp.forward_prenet_dims,
                        highways=hp.forward_num_highways,
                        dropout=hp.forward_dropout,
                        n_mels=hp.num_mels)
    model = ForwardTacotron(**model_kwargs).to(device)

    # Count the trainable parameters for the log.
    trainable = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in trainable])
    print(f'num params {params}')

    optimizer = optim.Adam(model.parameters())
    restore_checkpoint('forward', paths, model, optimizer, create_if_missing=True)

    if not force_gta:
        for lr, max_step, batch_size in hp.forward_schedule:
            current_step = model.get_step()
            training_steps = max_step - current_step

            simple_table([(f'Steps', str(training_steps // 1000) + 'k Steps'),
                          ('Batch Size', batch_size),
                          ('Learning Rate', lr)])

            train_set, mel_example = get_tts_datasets(paths.data, batch_size, 1,
                                                      alignments=True)
            train_loop(paths, model, optimizer, train_set, lr, training_steps,
                       mel_example)

    # GTA features are created with a fixed batch size of 8.
    train_set, mel_example = get_tts_datasets(paths.data, 8, 1, alignments=True)
    create_gta_features(model, train_set, paths.gta)
    print('Training Complete.')