Example #1
    def generate_plots(self, model: ForwardTacotron,
                       session: ForwardSession) -> None:
        model.eval()
        device = next(model.parameters()).device
        x, m, ids, x_lens, mel_lens, dur = session.val_sample
        x, m, dur, mel_lens = (x.to(device), m.to(device),
                               dur.to(device), mel_lens.to(device))

        m1_hat, m2_hat, dur_hat = model(x, m, dur, mel_lens)
        m1_hat = np_now(m1_hat)[0, :600, :]
        m2_hat = np_now(m2_hat)[0, :600, :]
        m = np_now(m)[0, :600, :]

        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)
        m_fig = plot_mel(m)
        # pitch_fig = plot_pitch(np_now(pitch[0]))
        # pitch_gta_fig = plot_pitch(np_now(pitch_hat.squeeze()[0]))

        # self.writer.add_figure('Pitch/target', pitch_fig, model.step)
        # self.writer.add_figure('Pitch/ground_truth_aligned', pitch_gta_fig, model.step)
        self.writer.add_figure('Ground_Truth_Aligned/target', m_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig,
                               model.step)

        m2_hat_wav = reconstruct_waveform(m2_hat)
        target_wav = reconstruct_waveform(m)

        self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
        self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)

        m1_hat, m2_hat, dur_hat = model.generate(x[0, :x_lens[0]].tolist())
        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)

        # pitch_gen_fig = plot_pitch(np_now(pitch_hat.squeeze()))

        # self.writer.add_figure('Pitch/generated', pitch_gen_fig, model.step)
        self.writer.add_figure('Generated/target', m_fig, model.step)
        self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
        self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

        m2_hat_wav = reconstruct_waveform(m2_hat)
        self.writer.add_audio(tag='Generated/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
        self.writer.add_audio(tag='Generated/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
Example #2
    def generate_plots(self, model: Tacotron, session: TTSSession) -> None:
        model.eval()
        device = next(model.parameters()).device
        x, m, ids, x_lens, m_lens = session.val_sample
        x, m = x.to(device), m.to(device)

        m1_hat, m2_hat, att = model(x, m)
        att = np_now(att)[0]
        m1_hat = np_now(m1_hat)[0, :600, :]
        m2_hat = np_now(m2_hat)[0, :600, :]
        m = np_now(m)[0, :600, :]

        att_fig = plot_attention(att)
        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)
        m_fig = plot_mel(m)

        self.writer.add_figure('Ground_Truth_Aligned/attention', att_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/target', m_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig,
                               model.step)

        m2_hat_wav = reconstruct_waveform(m2_hat)
        target_wav = reconstruct_waveform(m)

        self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
        self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)

        m1_hat, m2_hat, att = model.generate(x[0].tolist(),
                                             steps=m_lens[0] + 20)
        att_fig = plot_attention(att)
        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)

        self.writer.add_figure('Generated/attention', att_fig, model.step)
        self.writer.add_figure('Generated/target', m_fig, model.step)
        self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
        self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

        m2_hat_wav = reconstruct_waveform(m2_hat)

        self.writer.add_audio(tag='Generated/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
        self.writer.add_audio(tag='Generated/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
Example #3
    def generate_plots(self, model: ForwardTacotron,
                       session: TTSSession) -> None:
        model.eval()
        device = next(model.parameters()).device
        x, m, ids, lens, dur = session.val_sample
        x, m, dur = x.to(device), m.to(device), dur.to(device)

        m1_hat, m2_hat, dur_hat = model(x, m, dur)
        m1_hat = np_now(m1_hat)[0, :600, :]
        m2_hat = np_now(m2_hat)[0, :600, :]
        m = np_now(m)[0, :600, :]

        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)
        m_fig = plot_mel(m)

        self.writer.add_figure('Ground_Truth_Aligned/target', m_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig,
                               model.step)

        m1_hat = rescale_mel(m1_hat)
        m2_hat = rescale_mel(m2_hat)
        m = rescale_mel(m)
        m2_hat_wav = reconstruct_waveform(m2_hat)
        target_wav = reconstruct_waveform(m)

        self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
        self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)

        m1_hat, m2_hat, dur_hat = model.generate(x[0].tolist())
        m1_hat, m2_hat = rescale_mel(m1_hat), rescale_mel(m2_hat)
        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)

        self.writer.add_figure('Generated/target', m_fig, model.step)
        self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
        self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

        m2_hat_wav = reconstruct_waveform(m2_hat)

        self.writer.add_audio(tag='Generated/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
        self.writer.add_audio(tag='Generated/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
Example #4
    def gen_tacotron(self, 華, inputs):
        for i, x in enumerate(inputs, 1):
            print(f'\n| Generating {i}/{len(inputs)}')
            _, m, attention = self.tts_model.generate(x)
            # Fix mel spectrogram scaling to be from 0 to 1
            m = (m + 4) / 8
            np.clip(m, 0, 1, out=m)

            if self.args.vocoder == 'griffinlim':
                v_type = self.args.vocoder
            elif self.args.vocoder == 'wavernn' and self.args.batched:
                v_type = 'wavernn_batched'
            else:
                v_type = 'wavernn_unbatched'
            # == define output name == #
            if len(華) == 0:
                output_name = re.split(r'\,|\.|\!|\?| ', input_text)[0]
            elif 1 <= len(華) <= 9:
                output_name = 華[:-1]
            elif 9 < len(華):
                output_name = 華[:8]
            print(output_name)
            save_path = "output/{}.wav".format(output_name)
            ##
            if self.args.vocoder == 'wavernn':
                m = torch.tensor(m).unsqueeze(0)
                self.voc_model.generate(m, save_path, self.args.batched,
                                        hp.voc_target, hp.voc_overlap,
                                        hp.mu_law)

            elif self.args.vocoder == 'griffinlim':
                wav = reconstruct_waveform(m, n_iter=self.args.iters)
                save_wav(wav, save_path)
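Note on the (m + 4) / 8 rescaling above (it reappears in Examples #5 and #6): it assumes the TTS model emits log-mel values in roughly the [-4, 4] range and maps them onto [0, 1] for the vocoder. A minimal standalone sketch of that step, with a hypothetical helper name:

import numpy as np

def normalise_mel(m: np.ndarray) -> np.ndarray:
    # Map the assumed [-4, 4] log-mel range onto [0, 1] and clip,
    # mirroring the (m + 4) / 8 rescaling in the surrounding examples.
    m = (m + 4) / 8
    return np.clip(m, 0, 1)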
Example #5
def tsau(input_text, save_path):
    if input_text:
        inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
    else:
        with open('sentences.txt') as f:
            inputs = [
                text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f
            ]

    if args.vocoder == 'wavernn':
        voc_k = voc_model.get_step() // 1000
        tts_k = tts_model.get_step() // 1000

        simple_table([
            ('Tacotron', str(tts_k) + 'k'), ('r', tts_model.r),
            ('Vocoder Type', 'WaveRNN'), ('WaveRNN', str(voc_k) + 'k'),
            ('Generation Mode', 'Batched' if batched else 'Unbatched'),
            ('Target Samples', target if batched else 'N/A'),
            ('Overlap Samples', overlap if batched else 'N/A')
        ])

    elif args.vocoder == 'griffinlim':
        tts_k = tts_model.get_step() // 1000
        simple_table([('Tacotron', str(tts_k) + 'k'), ('r', tts_model.r),
                      ('Vocoder Type', 'Griffin-Lim'),
                      ('GL Iters', args.iters)])

    for i, x in enumerate(inputs, 1):

        print(f'\n| Generating {i}/{len(inputs)}')
        _, m, attention = tts_model.generate(x)
        # Fix mel spectrogram scaling to be from 0 to 1
        m = (m + 4) / 8
        np.clip(m, 0, 1, out=m)

        if args.vocoder == 'griffinlim':
            v_type = args.vocoder
        elif args.vocoder == 'wavernn' and args.batched:
            v_type = 'wavernn_batched'
        else:
            v_type = 'wavernn_unbatched'

        if save_attn: save_attention(attention, save_path)

        if args.vocoder == 'wavernn':
            m = torch.tensor(m).unsqueeze(0)
            voc_model.generate(m, save_path, batched, target, overlap,
                               hp.mu_law)
        elif args.vocoder == 'griffinlim':
            wav = reconstruct_waveform(m, n_iter=args.iters)
            save_wav(wav, save_path)

    print('\n\nDone.\n')
Example #6
def synthesize(input_text, tts_model, voc_model, alpha=1.0, device=torch.device('cuda')):
    text = clean_text(input_text.strip())
    x = text_to_sequence(text)
    _, m, _ = tts_model.generate(x, alpha=alpha)
    if voc_model == 'griffinlim':
        wav = reconstruct_waveform(m, n_iter=32)
    elif isinstance(voc_model, WaveRNN):
        m = torch.tensor(m).unsqueeze(0)
        wav = voc_model.generate(m, '/tmp/sample.wav', True, hp.voc_target, hp.voc_overlap, hp.mu_law)
    else:
        m = torch.tensor(m).unsqueeze(0).to(device)
        with torch.no_grad():
            wav = voc_model.inference(m).cpu().numpy()
    return wav
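A hypothetical call to the synthesize() variant above, using the Griffin-Lim branch so no vocoder weights are needed (save_wav as used in Example #5; the input text and output path are placeholders):

# Sketch only: tts_model is a trained TTS model exposing generate();
# passing the string 'griffinlim' selects the Griffin-Lim branch.
wav = synthesize('Hello world.', tts_model, 'griffinlim', alpha=1.0)
save_wav(wav, 'output/hello_world.wav')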
Example #7
def synthesize(input_text, tts_model, voc_model, alpha=1.0):
    x = text_to_sequence(input_text.strip(), ['english_cleaners'])
    m = tts_model.generate(x, alpha=alpha)
    # Fix mel spectrogram scaling to be from 0 to 1
    m = (m + 4) / 8
    np.clip(m, 0, 1, out=m)
    if voc_model == 'griffinlim':
        wav = reconstruct_waveform(m, n_iter=32)
    else:
        m = torch.tensor(m).unsqueeze(0)
        wav = voc_model.generate(m, '/tmp/sample.wav', True, hp.voc_target,
                                 hp.voc_overlap, hp.mu_law)
        print()
    return wav
Example #8
    def gen_tacotron2(self, 華, inputs):
        for i, x in enumerate(inputs, 1):
            print(f'\n| Generating {i}/{len(inputs)}')
            print(x)

            x = np.array(x)[None, :]
            x = torch.from_numpy(x).cuda().long()

            self.tts_model.eval()
            mel_outputs, mel_outputs_postnet, _, alignments = self.tts_model.inference(
                x)
            if self.args.vocoder == 'griffinlim':
                v_type = self.args.vocoder
            elif self.args.vocoder == 'wavernn' and self.args.batched:
                v_type = 'wavernn_batched'
            else:
                v_type = 'wavernn_unbatched'

            # == define output name == #
            if len(華) == 0:
                output_name = re.split(r'\,|\.|\!|\?| ', input_text)[0]
            elif 1 <= len(華) <= 9:
                output_name = 華[:-1]
            elif 9 < len(華):
                output_name = 華[:8]
            print(output_name)
            save_path = "output/{}.wav".format(output_name)
            ##

            if self.args.vocoder == 'wavernn':
                m = mel_outputs_postnet
                self.voc_model.generate(m, save_path, self.args.batched,
                                        hp.voc_target, hp.voc_overlap,
                                        hp.mu_law)

            elif self.args.vocoder == 'griffinlim':
                m = torch.squeeze(mel_outputs_postnet).detach().cpu().numpy()
                wav = reconstruct_waveform(m, n_iter=self.args.iters)
                save_wav(wav, save_path)
Example #9
def synthesize(input_text: str,
               tts_model: ForwardTacotron,
               voc_model: torch.nn.Module,
               alpha=1.0,
               pitch_function: Callable[[torch.Tensor],
                                        torch.Tensor] = lambda x: x):
    text = clean_text(input_text.strip())
    x = text_to_sequence(text)
    _, m, _, _ = tts_model.generate(x,
                                    alpha=alpha,
                                    pitch_function=pitch_function)
    if voc_model == 'griffinlim':
        wav = reconstruct_waveform(m, n_iter=32)
    elif isinstance(voc_model, WaveRNN):
        m = torch.tensor(m).unsqueeze(0)
        wav = voc_model.generate(m, '/tmp/sample.wav', True, hp.voc_target,
                                 hp.voc_overlap, hp.mu_law)
    else:
        m = torch.tensor(m).unsqueeze(0).cuda()
        with torch.no_grad():
            wav = voc_model.inference(m).cpu().numpy()
    return wav
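The pitch_function hook above receives and returns the predicted pitch tensor, so a caller can reshape the contour before synthesis. A hypothetical usage that raises the pitch by 10% (the name pitch_up and the input text are placeholders):

# Sketch only: scale the predicted pitch contour up by 10% before decoding.
pitch_up = lambda pitch: pitch * 1.1
wav = synthesize('Hello world.', tts_model, voc_model,
                 alpha=1.0, pitch_function=pitch_up)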
Example #10
        _, m, dur, pitch = tts_model.generate(x,
                                              alpha=args.alpha,
                                              pitch_function=pitch_function)

        if args.vocoder == 'griffinlim':
            v_type = args.vocoder
        elif args.vocoder == 'wavernn' and args.batched:
            v_type = 'wavernn_batched'
        else:
            v_type = 'wavernn_unbatched'

        if input_text:
            save_path = paths.forward_output / f'{input_text[:10]}_{args.alpha}_{v_type}_{tts_k}k_amp{args.amp}.wav'
        else:
            save_path = paths.forward_output / f'{i}_{v_type}_{tts_k}k_alpha{args.alpha}_amp{args.amp}.wav'

        if args.vocoder == 'wavernn':
            m = torch.tensor(m).unsqueeze(0)
            voc_model.generate(m, save_path, batched, hp.voc_target,
                               hp.voc_overlap, hp.mu_law)
        elif args.vocoder == 'melgan':
            m = torch.tensor(m).unsqueeze(0)
            torch.save(
                m, paths.forward_output /
                f'{i}_{tts_k}_alpha{args.alpha}_amp{args.amp}.mel')
        elif args.vocoder == 'griffinlim':
            wav = reconstruct_waveform(m, n_iter=args.iters)
            save_wav(wav, save_path)

    print('\n\nDone.\n')
Example #11
    def dual_transform(self, model_tts, model_asr, optimizer_tts,
                       optimizer_asr, asr_test_set, m_loss_avg, dur_loss_avg,
                       device, asr_current_step, e, epochs, duration_avg,
                       total_iters, tts_s_loss, asr_s_loss, tts_lr,
                       tts_dt_path):
        print('\n\nStarting DualTransformation loop...\n')
        # exit()
        tmp_dir = './checkpoints/sme_speech_tts.asr_forward/dual_transform_tmp'
        os.makedirs(tmp_dir, exist_ok=True)
        # generate tmp ASR training data
        asr_train_data = []
        input_set = get_unpaired_txt(35)
        # print(input_set)
        text = [clean_text(v) for v in input_set]
        inputs = [text_to_sequence(t) for t in text]

        # generate unpaired data for ASR from TTS
        for i, x in enumerate(inputs, 1):
            _, m, dur = model_tts.generate(x, alpha=1.)
            wav = reconstruct_waveform(m, n_iter=32)
            wav_path = os.path.join(tmp_dir, f'{i}.wav')
            save_wav(wav, wav_path)
            asr_train_data.append((wav_path, text[i - 1]))

        # print(asr_train_data)
        dt_asr_data = load_dt_data(asr_train_data)
        # reinit trainer with only tmp train data
        asr_trainer_dt = init_trainer(dt_asr_data, None)
        dt_train = asr_trainer_dt.get_train_dataloader()

        # unsuper train loop for ASR
        for step, inputs in enumerate(dt_train, 1):
            # model_asr.cpu()
            model_asr.train()
            model_asr.to(device)
            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.to(device)
            # model_asr.train()
            outputs = model_asr(**inputs)
            asr_u_loss = outputs["loss"] if isinstance(outputs,
                                                       dict) else outputs[0]
            # asr_u_loss.detach()
            # asr_u_loss = asr_s_loss.mean()

            # model_name = step + asr_current_step
            msg_asr = f'| ASR MODEL (unsupervised training): ' \
                      f'| Epoch: {e}/{epochs} ({step}/{len(dt_train)}) | Loss ASR: {asr_u_loss:#.4} ' \
                      f' ||||||||||||||||||||||||||||||||||||||||||||||||'
            stream(msg_asr)

        # for f in os.listdir(tmp_dir):
        #     file_path = os.path.join(tmp_dir, f)
        #     if f.endswith('.wav'):
        #         os.unlink(file_path)

        # generate tmp TTS data from ASR
        # model_asr.to(device)
        asr_predict_for_dt(model_asr)

        subprocess.check_output(
            'python preprocess.py -p "./data/speech-sme-tts" -d=True',
            shell=True,
            stderr=subprocess.STDOUT)
        print('Finished preprocessing for tmp data!')

        tmp_tts_train = get_tts_datasets(tts_dt_path,
                                         batch_size=2,
                                         r=1,
                                         model_type='forward_dt')
        print("Loaded tmp dataset!")
        # unsuper TTS training

        for i, (x, m, ids, x_lens, mel_lens,
                dur) in enumerate(tmp_tts_train, 1):
            start = time.time()
            model_tts.to(device)
            model_tts.train()
            # optimizer_tts.zero_grad()
            x, m, dur, x_lens, mel_lens = x.to(device), m.to(device), dur.to(device),\
                                                 x_lens.to(device), mel_lens.to(device)

            m1_hat, m2_hat, dur_hat = model_tts(x, m, dur, mel_lens)

            m1_loss = self.l1_loss(m1_hat, m, mel_lens)
            m2_loss = self.l1_loss(m2_hat, m, mel_lens)

            dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1),
                                    x_lens)

            tts_u_loss = m1_loss + m2_loss + 0.1 * dur_loss
            # optimizer_tts.zero_grad()
            # tts_u_loss.backward()
            torch.nn.utils.clip_grad_norm_(model_tts.parameters(),
                                           hp.tts_clip_grad_norm)
            # optimizer_tts.step()
            m_loss_avg.add(m1_loss.item() + m2_loss.item())
            dur_loss_avg.add(dur_loss.item())
            step = model_tts.get_step()
            k = step // 1000

            duration_avg.add(time.time() - start)
            # pitch_loss_avg.add(pitch_loss.item())

            speed = 1. / duration_avg.get()
            msg_tts = f'| TTS MODEL (unsupervised training ): '\
                  f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                  f'| Dur Loss: {dur_loss_avg.get():#.4} ' \
                  f'| {speed:#.2} steps/s | Step: {k}k | '

            stream(msg_tts)
        # m_val_loss, dur_val_loss = self.evaluate(model_tts, tts_session.val_set)
        #TODO: combine L and update
        # asr_s_loss = torch.tensor(asr_s_loss).to(device)
        combined_loss = 0.5 * (tts_s_loss + asr_s_loss) + (tts_u_loss +
                                                           asr_u_loss)
        # backwards
        combined_loss = combined_loss.to(device)
        # print(combined_loss)
        combined_loss.backward()
        optimizer_tts.step()

        for state in optimizer_asr.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.to(device)

        optimizer_asr.step()

        m_loss_avg.reset()
        duration_avg.reset()
        # pitch_loss_avg.reset()
        dt_msg = f'\n\nFinished DT loop in epoch {e}!\n'
        stream(dt_msg)
        print(' ')
        return tts_u_loss, asr_u_loss