Beispiel #1
0
 def evaluate(self, model: ForwardTacotron,
              val_set: DataLoader) -> Dict[str, float]:
     """Average the validation losses of ``model`` over ``val_set``.

     Each batch is moved to the model's device with ``to_device`` and run
     through the model without gradient tracking. ``self.l1_loss`` is then
     evaluated for both mel outputs, the durations, the pitch and the
     energy.

     Args:
         model: Model to evaluate. It is switched to eval mode and left
             in that mode.
         val_set: Iterable of batches that also supports ``len()``.

     Returns:
         Dict with the keys ``'mel_loss'`` (sum of the ``'mel'`` and
         ``'mel_post'`` losses), ``'dur_loss'``, ``'pitch_loss'`` and
         ``'energy_loss'``. Every value is a Python float averaged over
         ``len(val_set)``.

     Raises:
         ZeroDivisionError: If ``val_set`` is empty.
     """
     model.eval()
     mel_total = 0.
     dur_total = 0.
     pitch_total = 0.
     energy_total = 0.
     device = next(model.parameters()).device
     for batch in val_set:
         batch = to_device(batch, device=device)
         with torch.no_grad():
             pred = model(batch)
             m1_loss = self.l1_loss(pred['mel'], batch['mel'],
                                    batch['mel_len'])
             m2_loss = self.l1_loss(pred['mel_post'], batch['mel'],
                                    batch['mel_len'])
             dur_loss = self.l1_loss(pred['dur'].unsqueeze(1),
                                     batch['dur'].unsqueeze(1),
                                     batch['x_len'])
             pitch_loss = self.l1_loss(pred['pitch'],
                                       batch['pitch'].unsqueeze(1),
                                       batch['x_len'])
             energy_loss = self.l1_loss(pred['energy'],
                                        batch['energy'].unsqueeze(1),
                                        batch['x_len'])
             # .item() converts every loss to a Python float so that the
             # running totals (and the returned dict) never hold tensors.
             mel_total += m1_loss.item() + m2_loss.item()
             dur_total += dur_loss.item()
             pitch_total += pitch_loss.item()
             energy_total += energy_loss.item()
     n_batches = len(val_set)
     return {
         'mel_loss': mel_total / n_batches,
         'dur_loss': dur_total / n_batches,
         'pitch_loss': pitch_total / n_batches,
         'energy_loss': energy_total / n_batches
     }
 def evaluate(self, model: ForwardTacotron, val_set: Dataset) -> Tuple[float, float,float]:
     """Average the mel, duration and pitch losses of ``model`` over ``val_set``.

     Args:
         model: Model to evaluate. It is switched to eval mode and left
             in that mode.
         val_set: Iterable yielding
             ``(x, m, ids, x_lens, mel_lens, dur, pitch, puncts)`` batches;
             it must also support ``len()``.

     Returns:
         ``(mel_loss, dur_loss, pitch_loss)`` as Python floats, each
         averaged over ``len(val_set)``. The mel loss is the sum of the
         losses of both mel outputs of the model.

     Raises:
         ZeroDivisionError: If ``val_set`` is empty.
     """
     model.eval()
     m_val_loss = 0.
     dur_val_loss = 0.
     pitch_val_loss = 0.
     device = next(model.parameters()).device
     for x, m, _ids, x_lens, mel_lens, dur, pitch, puncts in val_set:
         x = x.to(device)
         m = m.to(device)
         dur = dur.to(device)
         x_lens = x_lens.to(device)
         mel_lens = mel_lens.to(device)
         pitch = pitch.to(device)
         puncts = puncts.to(device)
         with torch.no_grad():
             m1_hat, m2_hat, dur_hat, pitch_hat = model(
                 x, m, dur, mel_lens, pitch, puncts
             )
             m1_loss = self.l1_loss(m1_hat, m, mel_lens)
             m2_loss = self.l1_loss(m2_hat, m, mel_lens)
             dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1), x_lens)
             pitch_loss = self.l1_loss(pitch_hat, pitch.unsqueeze(1), x_lens)
             # .item() keeps all three accumulators plain floats; the pitch
             # loss used to be summed as a tensor.
             m_val_loss += m1_loss.item() + m2_loss.item()
             dur_val_loss += dur_loss.item()
             pitch_val_loss += pitch_loss.item()
     n_batches = len(val_set)
     return (m_val_loss / n_batches,
             dur_val_loss / n_batches,
             pitch_val_loss / n_batches)
Beispiel #3
0
    def generate_plots(self, model: ForwardTacotron,
                       session: ForwardSession) -> None:
        """Log mel figures and reconstructed audio for the validation sample.

        Two passes are logged through ``self.writer``: a forward pass that is
        fed the target mel and durations (tags ``Ground_Truth_Aligned/*``)
        and a call to ``model.generate`` on the first input sequence (tags
        ``Generated/*``). The model is put into eval mode and left there.

        Args:
            model: Model to visualise.
            session: Session whose ``val_sample`` provides
                ``(x, m, ids, x_lens, mel_lens, dur)``.
        """
        model.eval()
        device = next(model.parameters()).device
        x, m, _ids, x_lens, mel_lens, dur = session.val_sample
        x = x.to(device)
        m = m.to(device)
        dur = dur.to(device)
        mel_lens = mel_lens.to(device)

        # Pass conditioned on the target mel and durations.
        gta_linear, gta_postnet, _ = model(x, m, dur, mel_lens)
        gta_linear = np_now(gta_linear)[0, :600, :]
        gta_postnet = np_now(gta_postnet)[0, :600, :]
        target = np_now(m)[0, :600, :]

        gta_linear_fig = plot_mel(gta_linear)
        gta_postnet_fig = plot_mel(gta_postnet)
        target_fig = plot_mel(target)

        # NOTE: pitch figures are not logged by this version.
        gta_figures = (
            ('Ground_Truth_Aligned/target', target_fig),
            ('Ground_Truth_Aligned/linear', gta_linear_fig),
            ('Ground_Truth_Aligned/postnet', gta_postnet_fig),
        )
        for tag, figure in gta_figures:
            self.writer.add_figure(tag, figure, model.step)

        gta_wav = reconstruct_waveform(gta_postnet)
        target_wav = reconstruct_waveform(target)

        gta_audio = (
            ('Ground_Truth_Aligned/target_wav', target_wav),
            ('Ground_Truth_Aligned/postnet_wav', gta_wav),
        )
        for tag, wav in gta_audio:
            self.writer.add_audio(tag=tag,
                                  snd_tensor=wav,
                                  global_step=model.step,
                                  sample_rate=hp.sample_rate)

        # Free-running generation from the first sequence, trimmed to its length.
        first_sequence = x[0, :x_lens[0]].tolist()
        gen_linear, gen_postnet, _ = model.generate(first_sequence)
        gen_linear_fig = plot_mel(gen_linear)
        gen_postnet_fig = plot_mel(gen_postnet)

        gen_figures = (
            ('Generated/target', target_fig),
            ('Generated/linear', gen_linear_fig),
            ('Generated/postnet', gen_postnet_fig),
        )
        for tag, figure in gen_figures:
            self.writer.add_figure(tag, figure, model.step)

        gen_wav = reconstruct_waveform(gen_postnet)
        gen_audio = (
            ('Generated/target_wav', target_wav),
            ('Generated/postnet_wav', gen_wav),
        )
        for tag, wav in gen_audio:
            self.writer.add_audio(tag=tag,
                                  snd_tensor=wav,
                                  global_step=model.step,
                                  sample_rate=hp.sample_rate)
Beispiel #4
0
    def generate_plots(self, model: ForwardTacotron,
                       session: TTSSession) -> None:
        """Log mel figures and reconstructed audio for the validation sample.

        A forward pass fed with the target mel and durations is logged under
        ``Ground_Truth_Aligned/*``; the output of ``model.generate`` for the
        first input sequence is logged under ``Generated/*``. Mels are passed
        through ``rescale_mel`` before waveform reconstruction. The model is
        put into eval mode and left there.

        Args:
            model: Model to visualise.
            session: Session whose ``val_sample`` provides
                ``(x, m, ids, lens, dur)``.
        """
        model.eval()
        device = next(model.parameters()).device
        x, m, _ids, _lens, dur = session.val_sample
        x = x.to(device)
        m = m.to(device)
        dur = dur.to(device)

        gta_linear, gta_postnet, _ = model(x, m, dur)
        gta_linear = np_now(gta_linear)[0, :600, :]
        gta_postnet = np_now(gta_postnet)[0, :600, :]
        target = np_now(m)[0, :600, :]

        gta_linear_fig = plot_mel(gta_linear)
        gta_postnet_fig = plot_mel(gta_postnet)
        # This figure of the un-rescaled target is reused for 'Generated/target'.
        target_fig = plot_mel(target)

        gta_figures = (
            ('Ground_Truth_Aligned/target', target_fig),
            ('Ground_Truth_Aligned/linear', gta_linear_fig),
            ('Ground_Truth_Aligned/postnet', gta_postnet_fig),
        )
        for tag, figure in gta_figures:
            self.writer.add_figure(tag, figure, model.step)

        # Same call order as before: linear, postnet, target.
        gta_linear = rescale_mel(gta_linear)
        gta_postnet = rescale_mel(gta_postnet)
        target = rescale_mel(target)
        gta_wav = reconstruct_waveform(gta_postnet)
        target_wav = reconstruct_waveform(target)

        gta_audio = (
            ('Ground_Truth_Aligned/target_wav', target_wav),
            ('Ground_Truth_Aligned/postnet_wav', gta_wav),
        )
        for tag, wav in gta_audio:
            self.writer.add_audio(tag=tag,
                                  snd_tensor=wav,
                                  global_step=model.step,
                                  sample_rate=hp.sample_rate)

        gen_linear, gen_postnet, _ = model.generate(x[0].tolist())
        gen_linear = rescale_mel(gen_linear)
        gen_postnet = rescale_mel(gen_postnet)
        gen_linear_fig = plot_mel(gen_linear)
        gen_postnet_fig = plot_mel(gen_postnet)

        gen_figures = (
            ('Generated/target', target_fig),
            ('Generated/linear', gen_linear_fig),
            ('Generated/postnet', gen_postnet_fig),
        )
        for tag, figure in gen_figures:
            self.writer.add_figure(tag, figure, model.step)

        gen_wav = reconstruct_waveform(gen_postnet)

        gen_audio = (
            ('Generated/target_wav', target_wav),
            ('Generated/postnet_wav', gen_wav),
        )
        for tag, wav in gen_audio:
            self.writer.add_audio(tag=tag,
                                  snd_tensor=wav,
                                  global_step=model.step,
                                  sample_rate=hp.sample_rate)
 def evaluate(self, model: ForwardTacotron, val_set: Dataset) -> Tuple[float, float]:
     """Return the mean mel loss and mean duration loss over ``val_set``.

     The mel loss of a batch is the sum of ``self.l1_loss`` for both mel
     outputs of the model; the duration loss is ``F.l1_loss`` between the
     predicted and target durations. Both totals are divided by
     ``len(val_set)``. The model is switched to eval mode and left there.
     """
     model.eval()
     device = next(model.parameters()).device
     mel_total, dur_total = 0, 0
     for x, m, _ids, lens, dur in val_set:
         x = x.to(device)
         m = m.to(device)
         dur = dur.to(device)
         lens = lens.to(device)
         with torch.no_grad():
             mel_linear, mel_post, dur_pred = model(x, m, dur)
             linear_loss = self.l1_loss(mel_linear, m, lens)
             post_loss = self.l1_loss(mel_post, m, lens)
             duration_loss = F.l1_loss(dur_pred, dur)
             mel_total += linear_loss.item() + post_loss.item()
             dur_total += duration_loss.item()
     n_batches = len(val_set)
     return mel_total / n_batches, dur_total / n_batches
def get_forward_model(model_path):
    """Build a ``ForwardTacotron`` from the ``hp`` settings and load its weights.

    Args:
        model_path: Location handed to ``model.load``.

    Returns:
        The model, placed on the CUDA device when one is available and on
        the CPU otherwise, after ``model.load(model_path)`` has been called.
    """
    # Fall back to the CPU instead of failing on hosts without CUDA.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = ForwardTacotron(embed_dims=hp.forward_embed_dims,
                            num_chars=len(symbols),
                            durpred_rnn_dims=hp.forward_durpred_rnn_dims,
                            durpred_conv_dims=hp.forward_durpred_conv_dims,
                            rnn_dim=hp.forward_rnn_dims,
                            postnet_k=hp.forward_postnet_K,
                            postnet_dims=hp.forward_postnet_dims,
                            prenet_k=hp.forward_prenet_K,
                            prenet_dims=hp.forward_prenet_dims,
                            highways=hp.forward_num_highways,
                            dropout=hp.forward_dropout,
                            n_mels=hp.num_mels).to(device)
    model.load(model_path)
    return model
Beispiel #7
0
 def train(self, model: ForwardTacotron, optimizer: Optimizer) -> None:
     """Run one training session per schedule entry the model has not reached.

     The schedule comes from ``self.train_cfg['schedule']`` and is parsed
     with ``parse_schedule`` into ``(lr, max_step, bs)`` entries. Entries
     whose ``max_step`` the model has already reached are skipped; for the
     others fresh datasets are created and ``self.train_session`` is run.
     """
     cfg = self.train_cfg
     schedule = parse_schedule(cfg['schedule'])
     for index, (lr, max_step, bs) in enumerate(schedule, 1):
         if model.get_step() < max_step:
             # Dataset filtering options are all read from the training config.
             dataset_kwargs = dict(
                 path=self.paths.data,
                 batch_size=bs,
                 r=1,
                 model_type='forward',
                 max_mel_len=cfg['max_mel_len'],
                 filter_attention=cfg['filter_attention'],
                 filter_min_alignment=cfg['min_attention_alignment'],
                 filter_min_sharpness=cfg['min_attention_sharpness'],
             )
             train_set, val_set = get_tts_datasets(**dataset_kwargs)
             session = TTSSession(index=index, r=1, lr=lr,
                                  max_step=max_step, bs=bs,
                                  train_set=train_set, val_set=val_set)
             self.train_session(model, optimizer, session)
def load_forward_taco(
        checkpoint_path: str) -> Tuple[ForwardTacotron, Dict[str, Any]]:
    """Restore a ``ForwardTacotron`` and its config from a checkpoint file.

    The checkpoint is loaded onto the CPU and must contain the keys
    ``'config'`` (passed to ``ForwardTacotron.from_config``) and ``'model'``
    (the state dict).

    Returns:
        ``(model, config)``.
    """
    print(f'Loading tts checkpoint {checkpoint_path}')
    cpu = torch.device('cpu')
    # NOTE: torch.load unpickles the file; only use trusted checkpoints.
    ckpt = torch.load(checkpoint_path, map_location=cpu)
    model_config = ckpt['config']
    forward_taco = ForwardTacotron.from_config(model_config)
    forward_taco.load_state_dict(ckpt['model'])
    step = forward_taco.get_step()
    print(f'Loaded forward taco with step {step}')
    return forward_taco, model_config
 def train(self, model: ForwardTacotron, optimizer: Optimizer) -> None:
     """Run one training session per ``hp.forward_schedule`` entry not yet reached.

     Each entry is ``(lr, max_step, bs)``. Entries whose ``max_step`` the
     model has already reached are skipped; for the others the datasets are
     rebuilt with the entry's batch size and ``self.train_session`` is run.
     """
     for index, (lr, max_step, bs) in enumerate(hp.forward_schedule, 1):
         if model.get_step() < max_step:
             train_set, val_set = get_tts_datasets(path=self.paths.data,
                                                   batch_size=bs,
                                                   r=1,
                                                   model_type='forward')
             session = TTSSession(index=index,
                                  r=1,
                                  lr=lr,
                                  max_step=max_step,
                                  bs=bs,
                                  train_set=train_set,
                                  val_set=val_set)
             self.train_session(model, optimizer, session)
Beispiel #10
0
 def __init__(self, tts_path: str, voc_path: str, device='cuda'):
     """Load the TTS model, the vocoders and the text/audio helpers.

     Args:
         tts_path: Checkpoint holding ``'config'`` and ``'model'`` entries
             for the ForwardTacotron model.
         voc_path: Checkpoint passed to ``WaveRNN.from_checkpoint``.
         device: Device specifier used when loading the TTS checkpoint
             and for placing the MelGAN vocoder.
     """
     self.device = torch.device(device)

     # TTS model, rebuilt from the config stored in its checkpoint.
     checkpoint = torch.load(tts_path, map_location=self.device)
     config = checkpoint['config']
     forward_taco = ForwardTacotron.from_config(config)
     forward_taco.load_state_dict(checkpoint['model'])
     self.tts_model = forward_taco

     # Vocoders: WaveRNN from a local checkpoint, MelGAN via torch.hub.
     self.wavernn = WaveRNN.from_checkpoint(voc_path)
     self.melgan = torch.hub.load('seungwonpark/melgan', 'melgan')
     self.melgan.to(device).eval()

     # Text and signal-processing helpers.
     self.cleaner = Cleaner.from_config(config)
     self.tokenizer = Tokenizer()
     self.dsp = DSP.from_config(config)
Beispiel #11
0
    def train(self, model_tts: ForwardTacotron, model_asr: Wav2Vec2ForCTC,
              optimizer_tts: Optimizer, optimizer_asr: Optimizer) -> None:
        """Train the TTS and ASR models together along ``hp.forward_schedule``.

        The pickled ASR train/test data is loaded once and used to build a
        trainer via ``init_trainer``. For every ``(lr, max_step, bs)``
        schedule entry the TTS model has not reached yet, the TTS datasets
        and ASR dataloaders are rebuilt and ``self.train_session`` is run.

        Args:
            model_tts: TTS model; its ``get_step()`` decides which schedule
                entries are skipped.
            model_asr: ASR model passed through to ``train_session``.
            optimizer_tts: Optimizer for ``model_tts``.
            optimizer_asr: Optimizer for ``model_asr``.
        """
        print("Loading ASR training data...")
        asr_train_data = unpickle_binary('./data/speech-sme-asr/train_asr.pkl')
        asr_test_data = unpickle_binary('./data/speech-sme-asr/test_asr.pkl')
        asr_trainer = init_trainer(asr_train_data, asr_test_data)

        for i, session_params in enumerate(hp.forward_schedule, 1):
            lr, max_step, bs = session_params
            if model_tts.get_step() < max_step:
                path = self.paths.data
                tts_train_set, tts_val_set = get_tts_datasets(
                    path=path,
                    batch_size=bs,
                    r=1,
                    model_type='forward')

                # The loaders get their own names so the raw test data is
                # not overwritten: every schedule entry must hand the
                # dataset, not the previous entry's dataloader, to
                # get_test_dataloader.
                asr_train_loader = asr_trainer.get_train_dataloader()
                asr_test_loader = asr_trainer.get_test_dataloader(
                    asr_test_data)
                asr_pr = Wav2Vec2Processor.from_pretrained(
                    './asr_output/pretrained_processor')

                tts_session = ForwardSession(
                    path,
                    index=i,
                    r=1,
                    lr=lr,
                    max_step=max_step,
                    bs=bs,
                    train_set=tts_train_set,
                    val_set=tts_val_set,
                )
                # The ASR session keeps its fixed batch size of 4.
                asr_session = ASRSession(asr_pr,
                                         index=i,
                                         r=1,
                                         lr=lr,
                                         max_step=max_step,
                                         bs=4,
                                         train_set=asr_train_loader,
                                         test_set=asr_test_loader)
                self.train_session(model_tts, model_asr, optimizer_tts,
                                   tts_session, asr_session, asr_trainer,
                                   optimizer_asr)
Beispiel #12
0
def synthesize(input_text: str,
               tts_model: ForwardTacotron,
               voc_model: torch.nn.Module,
               alpha=1.0,
               pitch_function: Callable[[torch.Tensor],
                                        torch.Tensor] = lambda x: x):
    """Synthesize a waveform for ``input_text``.

    The text is stripped, cleaned with ``clean_text`` and converted with
    ``text_to_sequence``; ``tts_model.generate`` then produces the mel that
    is handed to the vocoder.

    Args:
        input_text: Raw text to speak.
        tts_model: Model whose ``generate`` returns four values, the second
            of which is used as the mel.
        voc_model: Either the string ``'griffinlim'``, a ``WaveRNN``
            instance, or any other module exposing ``inference(mel)``.
        alpha: Forwarded unchanged to ``tts_model.generate``.
        pitch_function: Forwarded unchanged to ``tts_model.generate``;
            the default is the identity.

    Returns:
        The waveform produced by the selected vocoder.
    """
    text = clean_text(input_text.strip())
    x = text_to_sequence(text)
    _, m, _, _ = tts_model.generate(x,
                                    alpha=alpha,
                                    pitch_function=pitch_function)
    if voc_model == 'griffinlim':
        wav = reconstruct_waveform(m, n_iter=32)
    elif isinstance(voc_model, WaveRNN):
        m = torch.tensor(m).unsqueeze(0)
        wav = voc_model.generate(m, '/tmp/sample.wav', True, hp.voc_target,
                                 hp.voc_overlap, hp.mu_law)
    else:
        # Put the mel on the vocoder's own device instead of assuming CUDA;
        # a vocoder without parameters keeps the previous CUDA behaviour.
        first_param = next(voc_model.parameters(), None)
        if first_param is not None:
            voc_device = first_param.device
        else:
            voc_device = torch.device('cuda')
        m = torch.tensor(m).unsqueeze(0).to(voc_device)
        with torch.no_grad():
            wav = voc_model.inference(m).cpu().numpy()
    return wav
                    '`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    # Instantiate Forward TTS Model
    print('\nInitialising Forward TTS Model...\n')
    model = ForwardTacotron(embed_dims=hp.forward_embed_dims,
                            num_chars=len(phonemes),
                            durpred_rnn_dims=hp.forward_durpred_rnn_dims,
                            durpred_conv_dims=hp.forward_durpred_conv_dims,
                            durpred_dropout=hp.forward_durpred_dropout,
                            pitch_rnn_dims=hp.forward_pitch_rnn_dims,
                            pitch_conv_dims=hp.forward_pitch_conv_dims,
                            pitch_dropout=hp.forward_pitch_dropout,
                            pitch_emb_dims=hp.forward_pitch_emb_dims,
                            pitch_proj_dropout=hp.forward_pitch_proj_dropout,
                            rnn_dim=hp.forward_rnn_dims,
                            postnet_k=hp.forward_postnet_K,
                            postnet_dims=hp.forward_postnet_dims,
                            prenet_k=hp.forward_prenet_K,
                            prenet_dims=hp.forward_prenet_dims,
                            highways=hp.forward_num_highways,
                            dropout=hp.forward_dropout,
                            n_mels=hp.num_mels).to(device)

    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print(f'num params {params}')

    optimizer = optim.Adam(model.parameters())
    restore_checkpoint('forward',
    def train_session(self, model: ForwardTacotron,
                      optimizer: Optimizer, session: TTSSession) -> None:
        """Train ``model`` for one session of the schedule.

        Runs ``(session.max_step - model.get_step()) // len(session.train_set) + 1``
        epochs over ``session.train_set``. Each step optimises
        ``m1_loss + m2_loss + 0.3 * dur_loss + 0.1 * pitch_loss`` with
        gradient clipping, logs scalars to ``self.writer`` and periodically
        saves checkpoints and plots. After every epoch the model is
        evaluated on ``session.val_set``, a checkpoint is saved and the
        running averages are reset.

        Args:
            model: Model to train.
            optimizer: Optimizer; the learning rate of all its parameter
                groups is set to ``session.lr``.
            session: Provides ``train_set``, ``val_set``, ``lr``, ``bs`` and
                ``max_step``.

        Raises:
            ZeroDivisionError: If ``session.train_set`` is empty.
        """
        current_step = model.get_step()
        training_steps = session.max_step - current_step
        total_iters = len(session.train_set)
        epochs = training_steps // total_iters + 1
        simple_table([('Steps', str(training_steps // 1000) + 'k Steps'),
                      ('Batch Size', session.bs),
                      ('Learning Rate', session.lr)])

        for g in optimizer.param_groups:
            g['lr'] = session.lr

        m_loss_avg = Averager()
        dur_loss_avg = Averager()
        duration_avg = Averager()
        pitch_loss_avg = Averager()
        device = next(model.parameters()).device  # use same device as model parameters
        for e in range(1, epochs + 1):
            for i, (x, m, ids, x_lens, mel_lens, dur, pitch, puncts) in enumerate(
                session.train_set, 1
            ):
                start = time.time()
                # Re-enter training mode on every step; the plotting and
                # evaluation helpers may have left the model in eval mode.
                model.train()
                x, m, dur, x_lens, mel_lens, pitch, puncts = (
                    x.to(device),
                    m.to(device),
                    dur.to(device),
                    x_lens.to(device),
                    mel_lens.to(device),
                    pitch.to(device),
                    puncts.to(device),
                )
                m1_hat, m2_hat, dur_hat, pitch_hat = model(
                    x, m, dur, mel_lens, pitch, puncts
                )
                m1_loss = self.l1_loss(m1_hat, m, mel_lens)
                m2_loss = self.l1_loss(m2_hat, m, mel_lens)
                dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1), x_lens)
                pitch_loss = self.l1_loss(pitch_hat, pitch.unsqueeze(1), x_lens)
                loss = m1_loss + m2_loss + 0.3 * dur_loss + 0.1 * pitch_loss
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), hp.tts_clip_grad_norm)
                optimizer.step()
                m_loss_avg.add(m1_loss.item() + m2_loss.item())
                dur_loss_avg.add(dur_loss.item())
                step = model.get_step()
                k = step // 1000

                duration_avg.add(time.time() - start)
                pitch_loss_avg.add(pitch_loss.item())

                speed = 1. / duration_avg.get()
                msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                      f'| Dur Loss: {dur_loss_avg.get():#.4} | Pitch Loss: {pitch_loss_avg.get():#.4} ' \
                      f'| {speed:#.2} steps/s | Step: {k}k | '

                if step % hp.forward_checkpoint_every == 0:
                    ckpt_name = f'forward_step{k}K'
                    save_checkpoint('forward', self.paths, model, optimizer,
                                    name=ckpt_name, is_silent=True)

                if step % hp.forward_plot_every == 0:
                    self.generate_plots(model, session)

                self.writer.add_scalar('Mel_Loss/train', m1_loss + m2_loss, model.get_step())
                self.writer.add_scalar('Pitch_Loss/train', pitch_loss, model.get_step())
                self.writer.add_scalar('Duration_Loss/train', dur_loss, model.get_step())
                self.writer.add_scalar('Params/batch_size', session.bs, model.get_step())
                self.writer.add_scalar('Params/learning_rate', session.lr, model.get_step())

                stream(msg)

            m_val_loss, dur_val_loss, pitch_val_loss = self.evaluate(model, session.val_set)
            self.writer.add_scalar('Mel_Loss/val', m_val_loss, model.get_step())
            self.writer.add_scalar('Duration_Loss/val', dur_val_loss, model.get_step())
            self.writer.add_scalar('Pitch_Loss/val', pitch_val_loss, model.get_step())
            save_checkpoint('forward', self.paths, model, optimizer, is_silent=True)

            # Start every epoch with fresh running averages. The duration
            # loss average was previously left out and kept mixing in
            # values from earlier epochs.
            m_loss_avg.reset()
            dur_loss_avg.reset()
            duration_avg.reset()
            pitch_loss_avg.reset()
            print(' ')
Beispiel #15
0
                            mode=hp.voc_mode).to(device)

        voc_load_path = args.voc_weights if args.voc_weights else paths.voc_latest_weights
        voc_model.load(voc_load_path)

    print('\nInitialising Forward TTS Model...\n')
    tts_model = ForwardTacotron(
        embed_dims=hp.forward_embed_dims,
        num_chars=len(phonemes),
        durpred_rnn_dims=hp.forward_durpred_rnn_dims,
        durpred_conv_dims=hp.forward_durpred_conv_dims,
        durpred_dropout=hp.forward_durpred_dropout,
        pitch_rnn_dims=hp.forward_pitch_rnn_dims,
        pitch_conv_dims=hp.forward_pitch_conv_dims,
        pitch_dropout=hp.forward_pitch_dropout,
        pitch_emb_dims=hp.forward_pitch_emb_dims,
        pitch_proj_dropout=hp.forward_pitch_proj_dropout,
        rnn_dim=hp.forward_rnn_dims,
        postnet_k=hp.forward_postnet_K,
        postnet_dims=hp.forward_postnet_dims,
        prenet_k=hp.forward_prenet_K,
        prenet_dims=hp.forward_prenet_dims,
        highways=hp.forward_num_highways,
        dropout=hp.forward_dropout,
        n_mels=hp.num_mels).to(device)

    tts_load_path = tts_weights if tts_weights else paths.forward_latest_weights
    tts_model.load(tts_load_path)

    if input_text:
        text = clean_text(input_text.strip())
Beispiel #16
0
    def generate_plots(self, model: ForwardTacotron,
                       session: TTSSession) -> None:
        """Log mel, pitch and energy figures plus audio for the validation sample.

        A forward pass on the full validation batch is logged under the
        ``*/target``, ``*/ground_truth_aligned`` and ``Ground_Truth_Aligned/*``
        tags. The output of ``model.generate`` for the first sequence
        (trimmed to its length) is logged under ``*/generated`` and
        ``Generated/*``. Audio comes from ``self.dsp.griffinlim``. The model
        is put into eval mode and left there.

        Args:
            model: Model to visualise.
            session: Session whose ``val_sample`` is a batch with the keys
                ``'x'``, ``'x_len'``, ``'mel'``, ``'pitch'`` and ``'energy'``.
        """
        model.eval()
        device = next(model.parameters()).device
        batch = session.val_sample
        batch = to_device(batch, device=device)

        pred = model(batch)
        m1_hat = np_now(pred['mel'])[0, :600, :]
        m2_hat = np_now(pred['mel_post'])[0, :600, :]
        m_target = np_now(batch['mel'])[0, :600, :]

        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)
        m_target_fig = plot_mel(m_target)
        pitch_fig = plot_pitch(np_now(batch['pitch'][0]))
        pitch_gta_fig = plot_pitch(np_now(pred['pitch'].squeeze()[0]))
        # Energy curves are drawn with the pitch plotting helper as well.
        energy_fig = plot_pitch(np_now(batch['energy'][0]))
        energy_gta_fig = plot_pitch(np_now(pred['energy'].squeeze()[0]))

        self.writer.add_figure('Pitch/target', pitch_fig, model.step)
        self.writer.add_figure('Pitch/ground_truth_aligned', pitch_gta_fig,
                               model.step)
        self.writer.add_figure('Energy/target', energy_fig, model.step)
        self.writer.add_figure('Energy/ground_truth_aligned', energy_gta_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/target', m_target_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig,
                               model.step)

        m2_hat_wav = self.dsp.griffinlim(m2_hat)
        target_wav = self.dsp.griffinlim(m_target)

        self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=self.dsp.sample_rate)
        self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=self.dsp.sample_rate)

        gen = model.generate(batch['x'][0:1, :batch['x_len'][0]])
        gen_mel_post = gen['mel_post']

        m1_hat_fig = plot_mel(np_now(gen['mel']))
        m2_hat_fig = plot_mel(np_now(gen_mel_post))

        pitch_gen_fig = plot_pitch(np_now(gen['pitch'].squeeze()))
        energy_gen_fig = plot_pitch(np_now(gen['energy'].squeeze()))

        self.writer.add_figure('Pitch/generated', pitch_gen_fig, model.step)
        self.writer.add_figure('Energy/generated', energy_gen_fig, model.step)
        self.writer.add_figure('Generated/target', m_target_fig, model.step)
        self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
        self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

        # Vocode the generated postnet mel. Previously the stale
        # ground-truth-aligned ``m2_hat`` was reused here, so the
        # 'Generated/postnet_wav' clip never reflected the generated output.
        # NOTE(review): squeeze() is assumed to drop a leading batch axis of
        # size one so the mel matches what griffinlim gets above; confirm
        # against the shape returned by model.generate.
        m2_hat_wav = self.dsp.griffinlim(np_now(gen_mel_post.squeeze()))

        self.writer.add_audio(tag='Generated/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=self.dsp.sample_rate)
        self.writer.add_audio(tag='Generated/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=self.dsp.sample_rate)
def main():
    """Export a trained ForwardTacotron model as two ONNX graphs.

    The model is loaded on CPU and wrapped in ``DurationPredictor`` and
    ``Tacotron`` modules. A fixed sample sentence is converted to a symbol
    sequence and used as the example input for both exports:

    * ``./onnx/forward_tacotron_duration_prediction.onnx`` takes
      ``input_seq`` and yields ``embeddings`` and ``duration``.
    * ``./onnx/forward_tacotron_regression.onnx`` takes the
      duration-expanded embeddings (``data``) and yields ``mel``.

    Command line options: ``--tts_weights`` (optional weights path,
    otherwise ``paths.forward_latest_weights`` is used), ``--hp_file`` and
    ``--alpha``.

    NOTE(review): ``--alpha`` is parsed but never read in this function.
    """
    # Parse Arguments
    parser = argparse.ArgumentParser(description='TTS Generator')

    parser.add_argument(
        '--tts_weights',
        type=str,
        help='[string/path] Load in different FastSpeech weights')

    parser.add_argument('--hp_file',
                        metavar='FILE',
                        default='hparams.py',
                        help='The file to use for the hyperparameters')
    parser.add_argument(
        '--alpha',
        type=float,
        default=1.,
        help='Parameter for controlling length regulator for speedup '
        'or slow-down of generated speech, e.g. alpha=2.0 is double-time')

    args = parser.parse_args()

    # Create the output directory only once the arguments are known to be
    # valid, and without a check-then-create race.
    os.makedirs('onnx', exist_ok=True)

    hp.configure(args.hp_file)

    # Sample sentence used to build the example input for both exports.
    input_text = "the forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."
    tts_weights = args.tts_weights

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    device = torch.device('cpu')
    print('Using device:', device)

    print('\nInitialising Forward TTS Model...\n')
    tts_model = ForwardTacotron(embed_dims=hp.forward_embed_dims,
                                num_chars=len(symbols),
                                durpred_rnn_dims=hp.forward_durpred_rnn_dims,
                                durpred_conv_dims=hp.forward_durpred_conv_dims,
                                rnn_dim=hp.forward_rnn_dims,
                                postnet_k=hp.forward_postnet_K,
                                postnet_dims=hp.forward_postnet_dims,
                                prenet_k=hp.forward_prenet_K,
                                prenet_dims=hp.forward_prenet_dims,
                                highways=hp.forward_num_highways,
                                dropout=hp.forward_dropout,
                                n_mels=hp.num_mels).to(device)

    tts_load_path = tts_weights or paths.forward_latest_weights
    tts_model.load(tts_load_path)

    # Both wrappers are built around the same loaded model instance.
    encoder = DurationPredictor(tts_model)
    decoder = Tacotron(tts_model)

    tts_model.eval()
    encoder.eval()
    decoder.eval()

    opset_version = 10

    with torch.no_grad():
        input_seq = text_to_sequence(input_text.strip(), hp.tts_cleaner_names)
        input_seq = torch.as_tensor(input_seq, dtype=torch.long,
                                    device=device).unsqueeze(0)

        # FIRST STEP: predict symbols duration
        torch.onnx.export(encoder,
                          input_seq,
                          "./onnx/forward_tacotron_duration_prediction.onnx",
                          opset_version=opset_version,
                          do_constant_folding=True,
                          input_names=["input_seq"],
                          output_names=["embeddings", "duration"])

        x, durations = encoder(input_seq)

        # SECOND STEP: expand symbols by durations
        x = encoder.lr(x, durations)

        # THIRD STEP: generate mel
        torch.onnx.export(decoder,
                          x,
                          "./onnx/forward_tacotron_regression.onnx",
                          opset_version=opset_version,
                          do_constant_folding=True,
                          input_names=["data"],
                          output_names=["mel"])

    print('Done!')
                            res_out_dims=hp.voc_res_out_dims,
                            res_blocks=hp.voc_res_blocks,
                            hop_length=hp.hop_length,
                            sample_rate=hp.sample_rate,
                            mode=hp.voc_mode).to(device)

        voc_load_path = args.voc_weights if args.voc_weights else paths.voc_latest_weights
        voc_model.load(voc_load_path)

    print('\nInitialising Forward TTS Model...\n')
    tts_model = ForwardTacotron(embed_dims=hp.forward_embed_dims,
                                num_chars=len(symbols),
                                durpred_rnn_dims=hp.forward_durpred_rnn_dims,
                                durpred_conv_dims=hp.forward_durpred_conv_dims,
                                rnn_dim=hp.forward_rnn_dims,
                                postnet_k=hp.forward_postnet_K,
                                postnet_dims=hp.forward_postnet_dims,
                                prenet_k=hp.forward_prenet_K,
                                prenet_dims=hp.forward_prenet_dims,
                                highways=hp.forward_num_highways,
                                dropout=hp.forward_dropout,
                                n_mels=hp.num_mels).to(device)

    tts_load_path = tts_weights if tts_weights else paths.forward_latest_weights
    tts_model.load(tts_load_path)

    if input_text:
        inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
    else:
        with open('sentences.txt') as f:
            inputs = [
                text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f
Beispiel #19
0
    def train_session(self, model: ForwardTacotron, optimizer: Optimizer,
                      session: TTSSession) -> None:
        """Train ``model`` until ``session.max_step`` is reached.

        The number of epochs is derived from the steps remaining
        (``session.max_step - model.get_step()``) and the number of batches
        in ``session.train_set``. Each batch contributes two mel L1 losses
        plus duration, pitch and energy L1 losses weighted by the factors in
        ``self.train_cfg``. After every epoch the model is evaluated on
        ``session.val_set``, the validation scalars are written and
        ``latest_model.pt`` is saved.

        Args:
            model: Model to optimise; its parameters determine the device.
            optimizer: Optimizer whose learning rate is set to ``session.lr``.
            session: Provides ``train_set``, ``val_set``, ``bs``, ``lr`` and
                ``max_step``.
        """
        current_step = model.get_step()
        training_steps = session.max_step - current_step
        total_iters = len(session.train_set)
        epochs = training_steps // total_iters + 1
        simple_table([('Steps', str(training_steps // 1000) + 'k Steps'),
                      ('Batch Size', session.bs),
                      ('Learning Rate', session.lr)])

        for g in optimizer.param_groups:
            g['lr'] = session.lr

        m_loss_avg = Averager()
        dur_loss_avg = Averager()
        duration_avg = Averager()
        pitch_loss_avg = Averager()
        device = next(
            model.parameters()).device  # use same device as model parameters
        for e in range(1, epochs + 1):
            for i, batch in enumerate(session.train_set, 1):
                batch = to_device(batch, device=device)
                start = time.time()
                model.train()

                # Random masks: entries whose draw does not exceed the
                # configured zoneout value are zeroed in the model input.
                pitch_zoneout_mask = torch.rand(
                    batch['x'].size()) > self.train_cfg['pitch_zoneout']
                energy_zoneout_mask = torch.rand(
                    batch['x'].size()) > self.train_cfg['energy_zoneout']

                # Keep unmasked copies as loss targets before the inputs
                # are overwritten with their masked versions.
                pitch_target = batch['pitch'].detach().clone()
                energy_target = batch['energy'].detach().clone()
                batch['pitch'] = batch['pitch'] * pitch_zoneout_mask.to(
                    device).float()
                batch['energy'] = batch['energy'] * energy_zoneout_mask.to(
                    device).float()

                pred = model(batch)

                m1_loss = self.l1_loss(pred['mel'], batch['mel'],
                                       batch['mel_len'])
                m2_loss = self.l1_loss(pred['mel_post'], batch['mel'],
                                       batch['mel_len'])

                dur_loss = self.l1_loss(pred['dur'].unsqueeze(1),
                                        batch['dur'].unsqueeze(1),
                                        batch['x_len'])
                pitch_loss = self.l1_loss(pred['pitch'],
                                          pitch_target.unsqueeze(1),
                                          batch['x_len'])
                energy_loss = self.l1_loss(pred['energy'],
                                           energy_target.unsqueeze(1),
                                           batch['x_len'])

                loss = m1_loss + m2_loss \
                       + self.train_cfg['dur_loss_factor'] * dur_loss \
                       + self.train_cfg['pitch_loss_factor'] * pitch_loss \
                       + self.train_cfg['energy_loss_factor'] * energy_loss

                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(), self.train_cfg['clip_grad_norm'])
                optimizer.step()

                m_loss_avg.add(m1_loss.item() + m2_loss.item())
                dur_loss_avg.add(dur_loss.item())
                step = model.get_step()
                k = step // 1000

                duration_avg.add(time.time() - start)
                pitch_loss_avg.add(pitch_loss.item())

                speed = 1. / duration_avg.get()
                msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                      f'| Dur Loss: {dur_loss_avg.get():#.4} | Pitch Loss: {pitch_loss_avg.get():#.4} ' \
                      f'| {speed:#.2} steps/s | Step: {k}k | '

                if step % self.train_cfg['checkpoint_every'] == 0:
                    save_checkpoint(model=model,
                                    optim=optimizer,
                                    config=self.config,
                                    path=self.paths.forward_checkpoints /
                                    f'forward_step{k}k.pt')

                if step % self.train_cfg['plot_every'] == 0:
                    self.generate_plots(model, session)

                self.writer.add_scalar('Mel_Loss/train', m1_loss + m2_loss,
                                       model.get_step())
                self.writer.add_scalar('Pitch_Loss/train', pitch_loss,
                                       model.get_step())
                self.writer.add_scalar('Energy_Loss/train', energy_loss,
                                       model.get_step())
                self.writer.add_scalar('Duration_Loss/train', dur_loss,
                                       model.get_step())
                self.writer.add_scalar('Params/batch_size', session.bs,
                                       model.get_step())
                self.writer.add_scalar('Params/learning_rate', session.lr,
                                       model.get_step())

                stream(msg)

            # End of epoch: validate, log and refresh the rolling checkpoint.
            val_out = self.evaluate(model, session.val_set)
            self.writer.add_scalar('Mel_Loss/val', val_out['mel_loss'],
                                   model.get_step())
            self.writer.add_scalar('Duration_Loss/val', val_out['dur_loss'],
                                   model.get_step())
            self.writer.add_scalar('Pitch_Loss/val', val_out['pitch_loss'],
                                   model.get_step())
            self.writer.add_scalar('Energy_Loss/val', val_out['energy_loss'],
                                   model.get_step())
            save_checkpoint(model=model,
                            optim=optimizer,
                            config=self.config,
                            path=self.paths.forward_checkpoints /
                            'latest_model.pt')

            # Reset every running average so each epoch's progress line
            # reports that epoch only (the duration loss used to be left out).
            m_loss_avg.reset()
            dur_loss_avg.reset()
            duration_avg.reset()
            pitch_loss_avg.reset()
            print(' ')
        config['git_hash'] = try_get_git_hash()
    dsp = DSP.from_config(config)
    paths = Paths(config['data_path'], config['voc_model_id'],
                  config['tts_model_id'])

    assert len(os.listdir(paths.alg)) > 0, f'Could not find alignment files in {paths.alg}, please predict ' \
                                           f'alignments first with python train_tacotron.py --force_align!'

    force_gta = args.force_gta
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    print('Using device:', device)

    # Instantiate Forward TTS Model
    print('\nInitialising Forward TTS Model...\n')
    model = ForwardTacotron.from_config(config).to(device)
    optimizer = optim.Adam(model.parameters())
    restore_checkpoint(model=model,
                       optim=optimizer,
                       path=paths.forward_checkpoints / 'latest_model.pt',
                       device=device)

    if force_gta:
        print('Creating Ground Truth Aligned Dataset...\n')
        train_set, val_set = get_tts_datasets(paths.data,
                                              8,
                                              r=1,
                                              model_type='forward',
                                              filter_attention=False,
                                              max_mel_len=None)
        create_gta_features(model, train_set, val_set, paths.gta)
Beispiel #21
0
    def train_session(self, model_tts: ForwardTacotron,
                      model_asr: Wav2Vec2ForCTC, optimizer_tts: Optimizer,
                      tts_session: ForwardSession, asr_session: ASRSession,
                      asr_trainer, optimizer_asr) -> None:
        """Run a combined TTS/ASR session with a dual-transformation step.

        Per epoch the method:

        1. runs the supervised TTS loop over ``tts_session.train_set``,
        2. runs the supervised ASR loop over ``asr_session.train_set``,
        3. evaluates the TTS model via ``self.evaluate`` and the ASR model
           via ``asr_trainer.prediction_step`` / ``compute_metrics``,
        4. calls ``self.dual_transform``,
        5. saves the TTS checkpoint, the ASR model, the ASR optimizer state
           and a JSON log under a fixed checkpoint directory.

        NOTE(review): the method ends every epoch by printing an OOM message
        and calling ``exit(11)``, so only the first epoch ever runs. The
        backward/step calls of the supervised TTS loop are disabled and the
        supervised ASR loop has none, so no weights are updated in those
        loops here. Confirm whether this is intentional.
        """
        asr_trainer_state = {'logs': []}
        current_step = model_tts.get_step()
        tts_training_steps = tts_session.max_step - current_step
        try:
            _, asr_current_step = get_last_checkpoint(
                './checkpoints/sme_speech_tts.asr_forward/', 'model_at')
            asr_training_steps = tts_session.max_step - asr_current_step
        except Exception:
            # Fall back to a fresh ASR run when the checkpoint lookup fails.
            # Only Exception is caught so Ctrl-C / SystemExit still propagate.
            asr_current_step = 0
            asr_training_steps = tts_training_steps

        total_iters = len(tts_session.train_set)
        epochs = tts_training_steps // total_iters + 1
        simple_table([
            ('TTS Steps', str(tts_training_steps // 1000) + 'k Steps'),
            ('ASR Steps', str(asr_training_steps // 1000) + 'k Steps'),
            ('Batch Size TTS', tts_session.bs),
            ('Learning Rate', tts_session.lr)
        ])

        for g in optimizer_tts.param_groups:
            g['lr'] = tts_session.lr

        m_loss_avg = Averager()
        dur_loss_avg = Averager()
        duration_avg = Averager()

        device = next(model_tts.parameters()
                      ).device  # use same device as model parameters
        warnings.filterwarnings('ignore', category=UserWarning)
        for e in range(1, epochs + 1):

            # Supervised TTS loop for this epoch.
            for i, (x, m, ids, x_lens, mel_lens,
                    dur) in enumerate(tts_session.train_set, 1):
                start = time.time()
                model_tts.train()
                x, m, dur, x_lens, mel_lens = x.to(device), m.to(device), dur.to(device),\
                                                     x_lens.to(device), mel_lens.to(device)

                m1_hat, m2_hat, dur_hat = model_tts(x, m, dur, mel_lens)

                m1_loss = self.l1_loss(m1_hat, m, mel_lens)
                m2_loss = self.l1_loss(m2_hat, m, mel_lens)

                dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1),
                                        x_lens)

                tts_s_loss = m1_loss + m2_loss + 0.1 * dur_loss
                optimizer_tts.zero_grad()
                # NOTE(review): tts_s_loss.backward() and optimizer_tts.step()
                # are disabled here, so this loop only computes and logs the
                # losses; gradient clipping below acts on cleared gradients.
                torch.nn.utils.clip_grad_norm_(model_tts.parameters(),
                                               hp.tts_clip_grad_norm)
                m_loss_avg.add(m1_loss.item() + m2_loss.item())
                dur_loss_avg.add(dur_loss.item())
                step = model_tts.get_step()
                k = step // 1000

                duration_avg.add(time.time() - start)

                speed = 1. / duration_avg.get()
                msg_tts = f'| TTS MODEL (supervised training ): '\
                      f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                      f'| Dur Loss: {dur_loss_avg.get():#.4} ' \
                      f'| {speed:#.2} steps/s | Step: {k}k | '

                if step % hp.forward_checkpoint_every == 0:
                    ckpt_name = f'forward_step{k}K'
                    save_checkpoint('forward',
                                    self.paths,
                                    model_tts,
                                    optimizer_tts,
                                    name=ckpt_name,
                                    is_silent=True)

                if step % hp.forward_plot_every == 0:
                    self.generate_plots(model_tts, tts_session)

                self.writer.add_scalar('Mel_Loss/train', m1_loss + m2_loss,
                                       model_tts.get_step())
                self.writer.add_scalar('Duration_Loss/train', dur_loss,
                                       model_tts.get_step())
                self.writer.add_scalar('Params/batch_size', tts_session.bs,
                                       model_tts.get_step())
                self.writer.add_scalar('Params/learning_rate', tts_session.lr,
                                       model_tts.get_step())

                stream(msg_tts)

            # Supervised ASR loop for this epoch.
            # NOTE(review): the loss is computed and displayed but never
            # back-propagated in this loop.
            for step, inputs in enumerate(asr_session.train_set):

                optimizer_asr.zero_grad()

                model_asr.to(device)
                for key, value in inputs.items():
                    if isinstance(value, torch.Tensor):
                        inputs[key] = value.to(device)
                model_asr.train()
                outputs = model_asr(**inputs)
                asr_s_loss = outputs["loss"] if isinstance(
                    outputs, dict) else outputs[0]

                msg_asr =  f'| ASR MODEL (supervised training) : '\
                            f'| Epoch: {e}/{epochs} ({step}/{len(asr_session.train_set)}) | Loss ASR: {asr_s_loss:#.4} '\
                            f' ||||||||||||||||||||||'

                stream(msg_asr)

            m_val_loss, dur_val_loss = self.evaluate(model_tts,
                                                     tts_session.val_set)
            eval_tts_msg = f'| TTS MODEL (supervised eval ): '\
                        f'| Epoch: {e}/{epochs} | Val Loss: {m_val_loss:#.4} ' \
                        f'| Dur Val Loss: {dur_val_loss:#.4} '

            stream(eval_tts_msg)
            tts_eval_loss = m_val_loss + dur_val_loss

            # ASR eval supervised
            print('\nEvaluating ASR model ...')
            asr_eval_loss = 0.0
            eval_wer = 0.0
            eval_batches = 0

            for step, inputs in enumerate(asr_session.test_set):
                asr_eval_loss_i, logits_a, labels_a = asr_trainer.prediction_step(
                    model_asr, inputs, False)
                asr_eval_loss += asr_eval_loss_i
                # NOTE(review): the result of .to('cpu') is discarded, so
                # logits_a itself is passed on unchanged; confirm whether
                # `logits_a = logits_a.to('cpu')` was intended.
                logits_a.to('cpu')
                eval_wer_i = asr_trainer.compute_metrics(
                    EvalPrediction(predictions=logits_a, label_ids=labels_a))
                eval_wer += eval_wer_i['wer']
                eval_batches += 1

            # Average over the number of batches seen. Dividing by the last
            # 0-based index under-counted by one and failed for one batch.
            if eval_batches:
                eval_wer = eval_wer / eval_batches
                asr_eval_loss = asr_eval_loss / eval_batches

            msg_asr_eval = f'| ASR MODEL (supervised eval) : Epoch {e}/{epochs} | Loss ASR: {asr_eval_loss:#.4} | WER: {eval_wer} |||||||||||||||||||||||||||||||||||||||||||||||||||||'
            stream(msg_asr_eval)

            # Dual transformation loop; receives the losses of the last
            # supervised TTS and ASR batches of this epoch.
            tts_u_loss, asr_u_loss = self.dual_transform(
                model_tts, model_asr, optimizer_tts, optimizer_asr,
                asr_session.test_set, m_loss_avg, dur_loss_avg, device,
                asr_current_step, e, epochs, duration_avg, total_iters,
                tts_s_loss, asr_s_loss, tts_session.lr, tts_session.path)
            # `step` still holds the last index of the ASR eval loop.
            step += 1
            asr_path = 'checkpoint-27364'
            modelasr_folder = './checkpoints/sme_speech_tts.asr_forward/'
            new_check = modelasr_folder + asr_path
            os.makedirs(new_check, exist_ok=True)

            save_checkpoint('forward',
                            self.paths,
                            model_tts,
                            optimizer_tts,
                            is_silent=True)

            # Losses are truncated to int before being written to the log.
            asr_trainer_state['logs'].append({
                'step': step,
                'epoch': e,
                'asr_s_loss': int(asr_s_loss),
                'asr_u_loss': int(asr_u_loss),
                'tts_s_loss': int(tts_s_loss),
                'tts_u_loss': int(tts_u_loss),
                'tts_eval_loss': int(tts_eval_loss),
                'asr_eval_loss': int(asr_eval_loss),
                'eval_wer': eval_wer
            })

            with open(f'{new_check}/dt_trainer_state.json', 'w') as f:
                json.dump(asr_trainer_state, f)

            model_asr.save_pretrained(new_check)

            torch.save(optimizer_asr.state_dict(), f'{new_check}/optimizer.pt')

            # NOTE(review): unconditional exit at the end of the first epoch.
            print("Exiting due to cuda OOM!")
            exit(11)
def main():
    """Train ForwardTacotron along ``hp.forward_schedule`` and create GTA features.

    Flow: parse the CLI, load hyperparameters, select the device (on CUDA
    every scheduled batch size must divide evenly by the GPU count), build
    the model, restore or create the 'forward' checkpoint, run one
    ``train_loop`` per schedule entry unless ``--force_gta`` is set, then
    create the GTA features.

    NOTE(review): ``--force_train`` is parsed but not read in this function,
    and the GTA features are created whether or not ``--force_gta`` is set.
    """
    # Parse Arguments
    cli = argparse.ArgumentParser(description='Train Tacotron TTS')
    cli.add_argument('--force_train', '-f', action='store_true',
                     help='Forces the model to train past total steps')
    cli.add_argument('--force_gta', '-g', action='store_true',
                     help='Force the model to create GTA features')
    cli.add_argument(
        '--force_cpu', '-c', action='store_true',
        help='Forces CPU-only training, even when in CUDA capable environment')
    cli.add_argument('--hp_file', metavar='FILE', default='hparams.py',
                     help='The file to use for the hyperparameters')
    args = cli.parse_args()

    # Load hparams from file
    hp.configure(args.hp_file)

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)
    force_gta = args.force_gta

    use_cuda = not args.force_cpu and torch.cuda.is_available()
    if use_cuda:
        device = torch.device('cuda')
        # Each scheduled batch must split evenly across the visible GPUs.
        for _lr, _max_step, scheduled_bs in hp.forward_schedule:
            if scheduled_bs % torch.cuda.device_count() != 0:
                raise ValueError(
                    '`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    # Instantiate Forward TTS Model
    print('\nInitialising Forward TTS Model...\n')
    model_kwargs = {
        'embed_dims': hp.forward_embed_dims,
        'num_chars': len(symbols),
        'durpred_rnn_dims': hp.forward_durpred_rnn_dims,
        'durpred_conv_dims': hp.forward_durpred_conv_dims,
        'rnn_dim': hp.forward_rnn_dims,
        'postnet_k': hp.forward_postnet_K,
        'postnet_dims': hp.forward_postnet_dims,
        'prenet_k': hp.forward_prenet_K,
        'prenet_dims': hp.forward_prenet_dims,
        'highways': hp.forward_num_highways,
        'dropout': hp.forward_dropout,
        'n_mels': hp.num_mels,
    }
    model = ForwardTacotron(**model_kwargs).to(device)

    # Count the trainable parameters only.
    trainable = (p for p in model.parameters() if p.requires_grad)
    params = sum(np.prod(p.size()) for p in trainable)
    print(f'num params {params}')

    optimizer = optim.Adam(model.parameters())
    restore_checkpoint('forward',
                       paths,
                       model,
                       optimizer,
                       create_if_missing=True)

    if not force_gta:
        for lr, max_step, batch_size in hp.forward_schedule:
            # Steps left until this schedule entry's target step.
            training_steps = max_step - model.get_step()

            summary_rows = [
                ('Steps', str(training_steps // 1000) + 'k Steps'),
                ('Batch Size', batch_size),
                ('Learning Rate', lr),
            ]
            simple_table(summary_rows)

            train_set, mel_example = get_tts_datasets(
                paths.data, batch_size, 1, alignments=True)
            train_loop(paths, model, optimizer, train_set, lr, training_steps,
                       mel_example)

    # GTA features are built from a fixed batch size of 8.
    train_set, mel_example = get_tts_datasets(
        paths.data, 8, 1, alignments=True)
    create_gta_features(model, train_set, paths.gta)
    print('Training Complete.')