Beispiel #1
0
    def generate_plots(self, model: ForwardTacotron,
                       session: ForwardSession) -> None:
        model.eval()
        device = next(model.parameters()).device
        x, m, ids, x_lens, mel_lens, dur = session.val_sample
        x, m, dur, mel_lens = x.to(device), m.to(device), dur.to(
            device), mel_lens.to(device)

        m1_hat, m2_hat, dur_hat = model(x, m, dur, mel_lens)
        m1_hat = np_now(m1_hat)[0, :600, :]
        m2_hat = np_now(m2_hat)[0, :600, :]
        m = np_now(m)[0, :600, :]

        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)
        m_fig = plot_mel(m)
        # pitch_fig = plot_pitch(np_now(pitch[0]))
        # pitch_gta_fig = plot_pitch(np_now(pitch_hat.squeeze()[0]))

        # self.writer.add_figure('Pitch/target', pitch_fig, model.step)
        # self.writer.add_figure('Pitch/ground_truth_aligned', pitch_gta_fig, model.step)
        self.writer.add_figure('Ground_Truth_Aligned/target', m_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig,
                               model.step)

        m2_hat_wav = reconstruct_waveform(m2_hat)
        target_wav = reconstruct_waveform(m)

        self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
        self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)

        m1_hat, m2_hat, dur_hat = model.generate(x[0, :x_lens[0]].tolist())
        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)

        # pitch_gen_fig = plot_pitch(np_now(pitch_hat.squeeze()))

        # self.writer.add_figure('Pitch/generated', pitch_gen_fig, model.step)
        self.writer.add_figure('Generated/target', m_fig, model.step)
        self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
        self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

        m2_hat_wav = reconstruct_waveform(m2_hat)
        self.writer.add_audio(tag='Generated/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
        self.writer.add_audio(tag='Generated/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
Beispiel #2
0
    def generate_plots(self, model: ForwardTacotron,
                       session: TTSSession) -> None:
        model.eval()
        device = next(model.parameters()).device
        x, m, ids, lens, dur = session.val_sample
        x, m, dur = x.to(device), m.to(device), dur.to(device)

        m1_hat, m2_hat, dur_hat = model(x, m, dur)
        m1_hat = np_now(m1_hat)[0, :600, :]
        m2_hat = np_now(m2_hat)[0, :600, :]
        m = np_now(m)[0, :600, :]

        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)
        m_fig = plot_mel(m)

        self.writer.add_figure('Ground_Truth_Aligned/target', m_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig,
                               model.step)

        m1_hat, m2_hat, m = rescale_mel(m1_hat), rescale_mel(
            m2_hat), rescale_mel(m)
        m2_hat_wav = reconstruct_waveform(m2_hat)
        target_wav = reconstruct_waveform(m)

        self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
        self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)

        m1_hat, m2_hat, dur_hat = model.generate(x[0].tolist())
        m1_hat, m2_hat = rescale_mel(m1_hat), rescale_mel(m2_hat)
        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)

        self.writer.add_figure('Generated/target', m_fig, model.step)
        self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
        self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

        m2_hat_wav = reconstruct_waveform(m2_hat)

        self.writer.add_audio(tag='Generated/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
        self.writer.add_audio(tag='Generated/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
Beispiel #3
0
def synthesize(input_text: str,
               tts_model: ForwardTacotron,
               voc_model: torch.nn.Module,
               alpha=1.0,
               pitch_function: Callable[[torch.tensor],
                                        torch.tensor] = lambda x: x):
    text = clean_text(input_text.strip())
    x = text_to_sequence(text)
    _, m, _, _ = tts_model.generate(x,
                                    alpha=alpha,
                                    pitch_function=pitch_function)
    if voc_model == 'griffinlim':
        wav = reconstruct_waveform(m, n_iter=32)
    elif isinstance(voc_model, WaveRNN):
        m = torch.tensor(m).unsqueeze(0)
        wav = voc_model.generate(m, '/tmp/sample.wav', True, hp.voc_target,
                                 hp.voc_overlap, hp.mu_law)
    else:
        m = torch.tensor(m).unsqueeze(0).cuda()
        with torch.no_grad():
            wav = voc_model.inference(m).cpu().numpy()
    return wav
Beispiel #4
0
        simple_table([('Forward Tacotron', str(tts_k) + 'k'),
                      ('Vocoder Type', 'Griffin-Lim'),
                      ('GL Iters', args.iters)])

    elif args.vocoder == 'melgan':
        simple_table([('Forward Tacotron', str(tts_k) + 'k'),
                      ('Vocoder Type', 'MelGAN')])

    # simpla amplification of pitch
    pitch_function = lambda x: x * args.amp

    for i, x in enumerate(inputs, 1):

        print(f'\n| Generating {i}/{len(inputs)}')
        _, m, dur, pitch = tts_model.generate(x,
                                              alpha=args.alpha,
                                              pitch_function=pitch_function)

        if args.vocoder == 'griffinlim':
            v_type = args.vocoder
        elif args.vocoder == 'wavernn' and args.batched:
            v_type = 'wavernn_batched'
        else:
            v_type = 'wavernn_unbatched'

        if input_text:
            save_path = paths.forward_output / f'{input_text[:10]}_{args.alpha}_{v_type}_{tts_k}k_amp{args.amp}.wav'
        else:
            save_path = paths.forward_output / f'{i}_{v_type}_{tts_k}k_alpha{args.alpha}_amp{args.amp}.wav'

        if args.vocoder == 'wavernn':
Beispiel #5
0
                      ('Target Samples', target if batched else 'N/A'),
                      ('Overlap Samples', overlap if batched else 'N/A')])

    elif args.vocoder == 'griffinlim':
        simple_table([('Forward Tacotron', str(tts_k) + 'k'),
                      ('Vocoder Type', 'Griffin-Lim'),
                      ('GL Iters', args.iters)])

    elif args.vocoder == 'melgan':
        simple_table([('Forward Tacotron', str(tts_k) + 'k'),
                      ('Vocoder Type', 'MelGAN')])

    for i, x in enumerate(inputs, 1):

        print(f'\n| Generating {i}/{len(inputs)}')
        _, m, _ = tts_model.generate(x, alpha=args.alpha)

        if args.vocoder == 'griffinlim':
            v_type = args.vocoder
        elif args.vocoder == 'wavernn' and args.batched:
            v_type = 'wavernn_batched'
        else:
            v_type = 'wavernn_unbatched'

        if input_text:
            save_path = paths.forward_output / f'{input_text[:10]}_{args.alpha}_{v_type}_{tts_k}k.wav'
        else:
            save_path = paths.forward_output / f'{i}_{v_type}_{tts_k}k_alpha{args.alpha}.wav'

        if args.vocoder == 'wavernn':
            m = torch.tensor(m).unsqueeze(0)
Beispiel #6
0
    def generate_plots(self, model: ForwardTacotron,
                       session: TTSSession) -> None:
        model.eval()
        device = next(model.parameters()).device
        batch = session.val_sample
        batch = to_device(batch, device=device)

        pred = model(batch)
        m1_hat = np_now(pred['mel'])[0, :600, :]
        m2_hat = np_now(pred['mel_post'])[0, :600, :]
        m_target = np_now(batch['mel'])[0, :600, :]

        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)
        m_target_fig = plot_mel(m_target)
        pitch_fig = plot_pitch(np_now(batch['pitch'][0]))
        pitch_gta_fig = plot_pitch(np_now(pred['pitch'].squeeze()[0]))
        energy_fig = plot_pitch(np_now(batch['energy'][0]))
        energy_gta_fig = plot_pitch(np_now(pred['energy'].squeeze()[0]))

        self.writer.add_figure('Pitch/target', pitch_fig, model.step)
        self.writer.add_figure('Pitch/ground_truth_aligned', pitch_gta_fig,
                               model.step)
        self.writer.add_figure('Energy/target', energy_fig, model.step)
        self.writer.add_figure('Energy/ground_truth_aligned', energy_gta_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/target', m_target_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig,
                               model.step)

        m2_hat_wav = self.dsp.griffinlim(m2_hat)
        target_wav = self.dsp.griffinlim(m_target)

        self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=self.dsp.sample_rate)
        self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=self.dsp.sample_rate)

        gen = model.generate(batch['x'][0:1, :batch['x_len'][0]])

        m1_hat_fig = plot_mel(np_now(gen['mel']))
        m2_hat_fig = plot_mel(np_now(gen['mel_post']))

        pitch_gen_fig = plot_pitch(np_now(gen['pitch'].squeeze()))
        energy_gen_fig = plot_pitch(np_now(gen['energy'].squeeze()))

        self.writer.add_figure('Pitch/generated', pitch_gen_fig, model.step)
        self.writer.add_figure('Energy/generated', energy_gen_fig, model.step)
        self.writer.add_figure('Generated/target', m_target_fig, model.step)
        self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
        self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

        m2_hat_wav = self.dsp.griffinlim(m2_hat)

        self.writer.add_audio(tag='Generated/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=self.dsp.sample_rate)
        self.writer.add_audio(tag='Generated/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=self.dsp.sample_rate)