def generate_plots(self, model: ForwardTacotron, session: ForwardSession) -> None:
    """Log validation mel plots and reconstructed audio to TensorBoard.

    Runs the model on one validation sample in two modes and logs both:
      * 'Ground_Truth_Aligned/*' — teacher-forced (GTA) forward pass,
      * 'Generated/*'            — free-running generation from the text only.

    Args:
        model: The ForwardTacotron model to evaluate (left in its prior
            train/eval mode on return).
        session: Training session holding the validation sample
            ``(x, m, ids, x_lens, mel_lens, dur)``.
    """
    was_training = model.training
    model.eval()
    try:
        device = next(model.parameters()).device
        x, m, ids, x_lens, mel_lens, dur = session.val_sample
        x, m, dur, mel_lens = x.to(device), m.to(device), dur.to(device), mel_lens.to(device)

        # Plotting needs no gradients — skip building the autograd graph.
        with torch.no_grad():
            m1_hat, m2_hat, dur_hat = model(x, m, dur, mel_lens)

        # Keep only the first 600 frames of the first batch element so the
        # figures stay readable.
        m1_hat = np_now(m1_hat)[0, :600, :]
        m2_hat = np_now(m2_hat)[0, :600, :]
        m = np_now(m)[0, :600, :]

        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)
        m_fig = plot_mel(m)

        self.writer.add_figure('Ground_Truth_Aligned/target', m_fig, model.step)
        self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig, model.step)
        self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig, model.step)

        m2_hat_wav = reconstruct_waveform(m2_hat)
        target_wav = reconstruct_waveform(m)
        self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav', snd_tensor=target_wav,
                              global_step=model.step, sample_rate=hp.sample_rate)
        self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav', snd_tensor=m2_hat_wav,
                              global_step=model.step, sample_rate=hp.sample_rate)

        # Free-running generation from the unpadded input sequence.
        with torch.no_grad():
            m1_hat, m2_hat, dur_hat = model.generate(x[0, :x_lens[0]].tolist())
        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)

        # The GTA target figure/wav are re-logged here for easy side-by-side
        # comparison with the generated output.
        self.writer.add_figure('Generated/target', m_fig, model.step)
        self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
        self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

        m2_hat_wav = reconstruct_waveform(m2_hat)
        self.writer.add_audio(tag='Generated/target_wav', snd_tensor=target_wav,
                              global_step=model.step, sample_rate=hp.sample_rate)
        self.writer.add_audio(tag='Generated/postnet_wav', snd_tensor=m2_hat_wav,
                              global_step=model.step, sample_rate=hp.sample_rate)
    finally:
        # model.eval() above would otherwise leak into subsequent training
        # steps (dropout/batch-norm stuck in eval mode).
        model.train(was_training)
def generate_plots(self, model: ForwardTacotron, session: TTSSession) -> None:
    """Log validation mel plots and reconstructed audio to TensorBoard.

    Logs a teacher-forced (ground-truth-aligned) pass and a free-running
    generation pass for one validation sample. Figures are plotted from the
    raw (normalised) mels; waveforms are reconstructed after ``rescale_mel``.

    Args:
        model: The ForwardTacotron model to evaluate (left in its prior
            train/eval mode on return).
        session: Training session holding the validation sample
            ``(x, m, ids, lens, dur)``.
    """
    was_training = model.training
    model.eval()
    try:
        device = next(model.parameters()).device
        x, m, ids, lens, dur = session.val_sample
        x, m, dur = x.to(device), m.to(device), dur.to(device)

        # Plotting needs no gradients — skip building the autograd graph.
        with torch.no_grad():
            m1_hat, m2_hat, dur_hat = model(x, m, dur)

        # Keep only the first 600 frames of the first batch element so the
        # figures stay readable.
        m1_hat = np_now(m1_hat)[0, :600, :]
        m2_hat = np_now(m2_hat)[0, :600, :]
        m = np_now(m)[0, :600, :]

        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)
        m_fig = plot_mel(m)
        self.writer.add_figure('Ground_Truth_Aligned/target', m_fig, model.step)
        self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig, model.step)
        self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig, model.step)

        # Undo mel normalisation before waveform reconstruction.
        m1_hat, m2_hat, m = rescale_mel(m1_hat), rescale_mel(m2_hat), rescale_mel(m)
        m2_hat_wav = reconstruct_waveform(m2_hat)
        target_wav = reconstruct_waveform(m)
        self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav', snd_tensor=target_wav,
                              global_step=model.step, sample_rate=hp.sample_rate)
        self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav', snd_tensor=m2_hat_wav,
                              global_step=model.step, sample_rate=hp.sample_rate)

        # Free-running generation from the input sequence.
        with torch.no_grad():
            m1_hat, m2_hat, dur_hat = model.generate(x[0].tolist())
        m1_hat, m2_hat = rescale_mel(m1_hat), rescale_mel(m2_hat)
        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)

        # The GTA target figure/wav are re-logged here for easy side-by-side
        # comparison with the generated output.
        self.writer.add_figure('Generated/target', m_fig, model.step)
        self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
        self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

        m2_hat_wav = reconstruct_waveform(m2_hat)
        self.writer.add_audio(tag='Generated/target_wav', snd_tensor=target_wav,
                              global_step=model.step, sample_rate=hp.sample_rate)
        self.writer.add_audio(tag='Generated/postnet_wav', snd_tensor=m2_hat_wav,
                              global_step=model.step, sample_rate=hp.sample_rate)
    finally:
        # model.eval() above would otherwise leak into subsequent training
        # steps (dropout/batch-norm stuck in eval mode).
        model.train(was_training)
def synthesize(input_text: str,
               tts_model: ForwardTacotron,
               voc_model: torch.nn.Module,
               alpha: float = 1.0,
               pitch_function: Callable[[torch.Tensor], torch.Tensor] = lambda x: x):
    """Synthesize a waveform for the given text.

    Args:
        input_text: Raw text to synthesize; cleaned and converted to a
            symbol sequence before generation.
        tts_model: ForwardTacotron model producing the mel spectrogram.
        voc_model: Vocoder module, or the string ``'griffinlim'`` to use
            Griffin-Lim reconstruction instead of a neural vocoder.
        alpha: Duration scaling factor (speech speed), default 1.0.
        pitch_function: Transform applied to the predicted pitch contour;
            identity by default.

    Returns:
        The synthesized waveform as a numpy array.
    """
    text = clean_text(input_text.strip())
    x = text_to_sequence(text)
    _, m, _, _ = tts_model.generate(x, alpha=alpha, pitch_function=pitch_function)
    # Despite the torch.nn.Module type hint, voc_model may be the string
    # sentinel 'griffinlim' to select Griffin-Lim reconstruction.
    if voc_model == 'griffinlim':
        wav = reconstruct_waveform(m, n_iter=32)
    elif isinstance(voc_model, WaveRNN):
        m = torch.tensor(m).unsqueeze(0)
        # NOTE(review): hard-coded scratch path for the WaveRNN sample output;
        # consider making this configurable.
        wav = voc_model.generate(m, '/tmp/sample.wav', True, hp.voc_target,
                                 hp.voc_overlap, hp.mu_law)
    else:
        # MelGAN-style vocoder path; assumes a CUDA device is available —
        # TODO(review): confirm / fall back to CPU when CUDA is absent.
        m = torch.tensor(m).unsqueeze(0).cuda()
        with torch.no_grad():
            wav = voc_model.inference(m).cpu().numpy()
    return wav
simple_table([('Forward Tacotron', str(tts_k) + 'k'), ('Vocoder Type', 'Griffin-Lim'), ('GL Iters', args.iters)]) elif args.vocoder == 'melgan': simple_table([('Forward Tacotron', str(tts_k) + 'k'), ('Vocoder Type', 'MelGAN')]) # simpla amplification of pitch pitch_function = lambda x: x * args.amp for i, x in enumerate(inputs, 1): print(f'\n| Generating {i}/{len(inputs)}') _, m, dur, pitch = tts_model.generate(x, alpha=args.alpha, pitch_function=pitch_function) if args.vocoder == 'griffinlim': v_type = args.vocoder elif args.vocoder == 'wavernn' and args.batched: v_type = 'wavernn_batched' else: v_type = 'wavernn_unbatched' if input_text: save_path = paths.forward_output / f'{input_text[:10]}_{args.alpha}_{v_type}_{tts_k}k_amp{args.amp}.wav' else: save_path = paths.forward_output / f'{i}_{v_type}_{tts_k}k_alpha{args.alpha}_amp{args.amp}.wav' if args.vocoder == 'wavernn':
('Target Samples', target if batched else 'N/A'), ('Overlap Samples', overlap if batched else 'N/A')]) elif args.vocoder == 'griffinlim': simple_table([('Forward Tacotron', str(tts_k) + 'k'), ('Vocoder Type', 'Griffin-Lim'), ('GL Iters', args.iters)]) elif args.vocoder == 'melgan': simple_table([('Forward Tacotron', str(tts_k) + 'k'), ('Vocoder Type', 'MelGAN')]) for i, x in enumerate(inputs, 1): print(f'\n| Generating {i}/{len(inputs)}') _, m, _ = tts_model.generate(x, alpha=args.alpha) if args.vocoder == 'griffinlim': v_type = args.vocoder elif args.vocoder == 'wavernn' and args.batched: v_type = 'wavernn_batched' else: v_type = 'wavernn_unbatched' if input_text: save_path = paths.forward_output / f'{input_text[:10]}_{args.alpha}_{v_type}_{tts_k}k.wav' else: save_path = paths.forward_output / f'{i}_{v_type}_{tts_k}k_alpha{args.alpha}.wav' if args.vocoder == 'wavernn': m = torch.tensor(m).unsqueeze(0)
def generate_plots(self, model: ForwardTacotron, session: TTSSession) -> None:
    """Log pitch, energy and mel plots plus Griffin-Lim audio to TensorBoard.

    Runs the model on one validation batch in teacher-forced (GTA) mode and
    in free-running generation mode, logging figures under 'Pitch/*',
    'Energy/*', 'Ground_Truth_Aligned/*' and 'Generated/*'.

    Args:
        model: The ForwardTacotron model to evaluate (left in its prior
            train/eval mode on return).
        session: Training session whose ``val_sample`` is a batch dict with
            at least 'x', 'x_len', 'mel', 'pitch' and 'energy' entries.
    """
    was_training = model.training
    model.eval()
    try:
        device = next(model.parameters()).device
        batch = session.val_sample
        batch = to_device(batch, device=device)

        # Plotting needs no gradients — skip building the autograd graph.
        with torch.no_grad():
            pred = model(batch)

        # Keep only the first 600 frames of the first batch element so the
        # figures stay readable.
        m1_hat = np_now(pred['mel'])[0, :600, :]
        m2_hat = np_now(pred['mel_post'])[0, :600, :]
        m_target = np_now(batch['mel'])[0, :600, :]

        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)
        m_target_fig = plot_mel(m_target)

        pitch_fig = plot_pitch(np_now(batch['pitch'][0]))
        pitch_gta_fig = plot_pitch(np_now(pred['pitch'].squeeze()[0]))
        energy_fig = plot_pitch(np_now(batch['energy'][0]))
        energy_gta_fig = plot_pitch(np_now(pred['energy'].squeeze()[0]))

        self.writer.add_figure('Pitch/target', pitch_fig, model.step)
        self.writer.add_figure('Pitch/ground_truth_aligned', pitch_gta_fig, model.step)
        self.writer.add_figure('Energy/target', energy_fig, model.step)
        self.writer.add_figure('Energy/ground_truth_aligned', energy_gta_fig, model.step)
        self.writer.add_figure('Ground_Truth_Aligned/target', m_target_fig, model.step)
        self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig, model.step)
        self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig, model.step)

        m2_hat_wav = self.dsp.griffinlim(m2_hat)
        target_wav = self.dsp.griffinlim(m_target)
        self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav', snd_tensor=target_wav,
                              global_step=model.step, sample_rate=self.dsp.sample_rate)
        self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav', snd_tensor=m2_hat_wav,
                              global_step=model.step, sample_rate=self.dsp.sample_rate)

        # Free-running generation from the unpadded input sequence.
        with torch.no_grad():
            gen = model.generate(batch['x'][0:1, :batch['x_len'][0]])
        m1_hat_fig = plot_mel(np_now(gen['mel']))
        m2_hat_fig = plot_mel(np_now(gen['mel_post']))
        pitch_gen_fig = plot_pitch(np_now(gen['pitch'].squeeze()))
        energy_gen_fig = plot_pitch(np_now(gen['energy'].squeeze()))

        self.writer.add_figure('Pitch/generated', pitch_gen_fig, model.step)
        self.writer.add_figure('Energy/generated', energy_gen_fig, model.step)
        # The GTA target figure/wav are re-logged here for easy side-by-side
        # comparison with the generated output.
        self.writer.add_figure('Generated/target', m_target_fig, model.step)
        self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
        self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

        # BUG FIX: previously this reused the ground-truth-aligned mel
        # (m2_hat), so 'Generated/postnet_wav' logged GTA audio instead of
        # the generated output shown in the 'Generated/postnet' figure.
        # squeeze() drops the leading batch dim — assumes gen['mel_post'] is
        # (1, frames, n_mels); TODO(review) confirm against model.generate.
        m2_hat_wav = self.dsp.griffinlim(np_now(gen['mel_post']).squeeze())
        self.writer.add_audio(tag='Generated/target_wav', snd_tensor=target_wav,
                              global_step=model.step, sample_rate=self.dsp.sample_rate)
        self.writer.add_audio(tag='Generated/postnet_wav', snd_tensor=m2_hat_wav,
                              global_step=model.step, sample_rate=self.dsp.sample_rate)
    finally:
        # model.eval() above would otherwise leak into subsequent training
        # steps (dropout/batch-norm stuck in eval mode).
        model.train(was_training)