Example #1
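A train() method for ForwardTacotron: it parses the training schedule from the trainer config and, for each (lr, max_step, bs) entry the model has not yet completed, builds attention-filtered train/validation datasets and runs a TTSSession.
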
 def train(self, model: ForwardTacotron, optimizer: Optimizer) -> None:
     forward_schedule = self.train_cfg['schedule']
     forward_schedule = parse_schedule(forward_schedule)
     for i, session_params in enumerate(forward_schedule, 1):
         lr, max_step, bs = session_params
         if model.get_step() < max_step:
             train_set, val_set = get_tts_datasets(
                 path=self.paths.data,
                 batch_size=bs,
                 r=1,
                 model_type='forward',
                 max_mel_len=self.train_cfg['max_mel_len'],
                 filter_attention=self.train_cfg['filter_attention'],
                 filter_min_alignment=self.train_cfg['min_attention_alignment'],
                 filter_min_sharpness=self.train_cfg['min_attention_sharpness'])
             session = TTSSession(index=i,
                                  r=1,
                                  lr=lr,
                                  max_step=max_step,
                                  bs=bs,
                                  train_set=train_set,
                                  val_set=val_set)
             self.train_session(model, optimizer, session)
Example #2
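A more compact variant of the same loop that reads the (lr, max_step, bs) entries directly from hp.forward_schedule and omits the attention-filtering options.
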
 def train(self, model: ForwardTacotron, optimizer: Optimizer) -> None:
     for i, session_params in enumerate(hp.forward_schedule, 1):
         lr, max_step, bs = session_params
         if model.get_step() < max_step:
             train_set, val_set = get_tts_datasets(
                 path=self.paths.data, batch_size=bs, r=1, model_type='forward')
             session = TTSSession(
                 index=i, r=1, lr=lr, max_step=max_step,
                 bs=bs, train_set=train_set, val_set=val_set)
             self.train_session(model, optimizer, session)
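
Both examples above unpack each schedule entry into (lr, max_step, bs). For reference, a minimal sketch of the schedule shape they assume; the values below are illustrative only, not taken from any of the sources:

    # Hypothetical schedule: one (learning_rate, max_step, batch_size) tuple
    # per training session. Values are illustrative only.
    forward_schedule = [
        (1e-4, 50_000, 32),   # session 1: train until global step 50k
        (1e-5, 150_000, 16),  # session 2: continue until step 150k
    ]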
Example #3
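A dual-model variant that trains a ForwardTacotron TTS model jointly with a Wav2Vec2ForCTC ASR model: pickled ASR train/test sets are loaded, an ASR trainer is initialised, and each schedule entry produces a paired ForwardSession and ASRSession that are handed to train_session together.
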
    def train(self, model_tts: ForwardTacotron, model_asr: Wav2Vec2ForCTC,
              optimizer_tts: Optimizer, optimizer_asr: Optimizer) -> None:
        print("Loading ASR training data...")
        asr_train_set = unpickle_binary('./data/speech-sme-asr/train_asr.pkl')
        asr_test_set = unpickle_binary('./data/speech-sme-asr/test_asr.pkl')
        # exit()
        asr_trainer = init_trainer(asr_train_set, asr_test_set)

        for i, session_params in enumerate(hp.forward_schedule, 1):
            lr, max_step, bs = session_params
            if model_tts.get_step() < max_step:
                path = self.paths.data
                # print(path)
                tts_train_set, tts_val_set = get_tts_datasets(
                    path=self.paths.data,
                    batch_size=bs,
                    r=1,
                    model_type='forward')

                asr_train_set = asr_trainer.get_train_dataloader()
                asr_test_set = asr_trainer.get_test_dataloader(asr_test_set)
                asr_pr = Wav2Vec2Processor.from_pretrained(
                    './asr_output/pretrained_processor')

                tts_session = ForwardSession(
                    path,
                    index=i,
                    r=1,
                    lr=lr,
                    max_step=max_step,
                    bs=bs,
                    train_set=tts_train_set,
                    val_set=tts_val_set,
                )
                asr_session = ASRSession(asr_pr,
                                         index=i,
                                         r=1,
                                         lr=lr,
                                         max_step=max_step,
                                         bs=4,
                                         train_set=asr_train_set,
                                         test_set=asr_test_set)
                self.train_session(model_tts, model_asr, optimizer_tts,
                                   tts_session, asr_session, asr_trainer,
                                   optimizer_asr)
Example #4
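The autoregressive Tacotron counterpart: each schedule entry additionally carries the reduction factor r, which is forwarded to both the datasets and the TTSSession.
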
 def train(self, model: Tacotron, optimizer: Optimizer) -> None:
     tts_schedule = self.train_cfg['schedule']
     tts_schedule = parse_schedule(tts_schedule)
     for i, session_params in enumerate(tts_schedule, 1):
         r, lr, max_step, bs = session_params
         if model.get_step() < max_step:
             train_set, val_set = get_tts_datasets(
                 path=self.paths.data,
                 batch_size=bs,
                 r=r,
                 model_type='tacotron',
                 max_mel_len=self.train_cfg['max_mel_len'],
                 filter_attention=False)
             session = TTSSession(index=i,
                                  r=r,
                                  lr=lr,
                                  max_step=max_step,
                                  bs=bs,
                                  train_set=train_set,
                                  val_set=val_set)
             self.train_session(model, optimizer, session=session)
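
The only structural difference from the forward schedule is the leading reduction factor; a hypothetical entry list (again with illustrative values only):

    # Hypothetical entries: (reduction_factor, learning_rate, max_step, batch_size).
    schedule = [
        (7, 1e-3, 10_000, 32),   # coarse session: 7 mel frames per decoder step
        (2, 1e-4, 180_000, 16),  # fine session: 2 frames per step
    ]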
Example #5
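The tail of a training script: a ForwardTacotron model is built from hyperparameters (the snippet starts mid-constructor), its trainable parameters are counted, a checkpoint is restored, and the script either creates ground-truth-aligned (GTA) features for WaveRNN or hands off to a ForwardTrainer.
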
                            prenet_k=hp.forward_prenet_K,
                            prenet_dims=hp.forward_prenet_dims,
                            highways=hp.forward_num_highways,
                            dropout=hp.forward_dropout,
                            n_mels=hp.num_mels).to(device)

    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print(f'num params {params}')

    optimizer = optim.Adam(model.parameters())
    restore_checkpoint('forward',
                       paths,
                       model,
                       optimizer,
                       create_if_missing=True)

    if force_gta:
        print('Creating Ground Truth Aligned Dataset...\n')
        train_set, val_set = get_tts_datasets(paths.data,
                                              8,
                                              r=1,
                                              model_type='forward')
        create_gta_features(model, train_set, val_set, paths.gta)
        print(
            '\n\nYou can now train WaveRNN on GTA features - use python train_wavernn.py --gta\n'
        )
    else:
        trainer = ForwardTrainer(paths)
        trainer.train(model, optimizer)
Example #6
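A complete main() for Tacotron training: argument parsing, hyperparameter loading, device selection with a per-GPU batch-size divisibility check, model construction, the session loop (with --force_train to keep training past the schedule), and final GTA feature creation.
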
def main():
    # Parse Arguments
    parser = argparse.ArgumentParser(description='Train Tacotron TTS')
    parser.add_argument('--force_train',
                        '-f',
                        action='store_true',
                        help='Forces the model to train past total steps')
    parser.add_argument('--force_gta',
                        '-g',
                        action='store_true',
                        help='Force the model to create GTA features')
    parser.add_argument(
        '--force_cpu',
        '-c',
        action='store_true',
        help='Forces CPU-only training, even when in CUDA capable environment')
    parser.add_argument('--hp_file',
                        metavar='FILE',
                        default='hparams.py',
                        help='The file to use for the hyperparameters')
    args = parser.parse_args()

    hp.configure(args.hp_file)  # Load hparams from file
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    force_train = args.force_train
    force_gta = args.force_gta

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
        for session in hp.tts_schedule:
            _, _, _, batch_size = session
            if batch_size % torch.cuda.device_count() != 0:
                raise ValueError(
                    '`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    # Instantiate Tacotron Model
    print('\nInitialising Tacotron Model...\n')
    model = Tacotron(embed_dims=hp.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hp.tts_encoder_dims,
                     decoder_dims=hp.tts_decoder_dims,
                     n_mels=hp.num_mels,
                     fft_bins=hp.num_mels,
                     postnet_dims=hp.tts_postnet_dims,
                     encoder_K=hp.tts_encoder_K,
                     lstm_dims=hp.tts_lstm_dims,
                     postnet_K=hp.tts_postnet_K,
                     num_highways=hp.tts_num_highways,
                     dropout=hp.tts_dropout,
                     stop_threshold=hp.tts_stop_threshold).to(device)

    optimizer = optim.Adam(model.parameters())
    restore_checkpoint('tts', paths, model, optimizer, create_if_missing=True)

    if not force_gta:
        for i, session in enumerate(hp.tts_schedule):
            current_step = model.get_step()

            r, lr, max_step, batch_size = session

            training_steps = max_step - current_step

            # Do we need to change to the next session?
            if current_step >= max_step:
                # Are there no further sessions than the current one?
                if i == len(hp.tts_schedule) - 1:
                    # There are no more sessions. Check if we force training.
                    if force_train:
                        # Don't finish the loop - train forever
                        training_steps = 999_999_999
                    else:
                        # We have completed training. Breaking is the same as continuing.
                        break
                else:
                    # There is a following session, go to it
                    continue

            model.r = r

            simple_table([('Steps with r=%s' % (repr(r)),
                           str(training_steps // 1000) + 'k Steps'),
                          ('Batch Size', batch_size), ('Learning Rate', lr),
                          ('Outputs/Step (r)', model.r)])

            train_set, attn_example = get_tts_datasets(paths.data, batch_size,
                                                       r)
            tts_train_loop(paths, model, optimizer, train_set, lr,
                           training_steps, attn_example)

        print('Training Complete.')
        print(
            'To continue training increase tts_total_steps in hparams.py or use --force_train\n'
        )

    print('Creating Ground Truth Aligned Dataset...\n')

    train_set, attn_example = get_tts_datasets(paths.data, 8, model.r)
    create_gta_features(model, train_set, paths.gta)

    print(
        '\n\nYou can now train WaveRNN on GTA features - use python train_wavernn.py --gta\n'
    )
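
Note the fall-through at the end: GTA features are always created after the training loop finishes (or immediately when --force_gta is set), using a fixed batch size of 8 and the model's current reduction factor.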
Example #7
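Another script tail that branches on flags: --force_gta creates GTA features, --force_align extracts attention alignments (the pitch argument is commented out here), and the truncated fa_dt branch creates alignments for dual transformation.
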
                     stop_threshold=hp.tts_stop_threshold).to(device)

    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print(f'Num Params: {params}')
    optimizer = optim.Adam(model.parameters())
    restore_checkpoint('tts',
                       paths,
                       model,
                       optimizer,
                       create_if_missing=True,
                       device=device)

    if args.force_gta:
        print('Creating Ground Truth Aligned Dataset...\n')
        train_set, val_set = get_tts_datasets(paths.data, 8, model.r)
        create_gta_features(model, train_set, val_set, paths.gta)
        print(
            '\n\nYou can now train WaveRNN on GTA features - use python train_wavernn.py --gta\n'
        )
    elif args.force_align:
        print('Creating Attention Alignments and Pitch Values...')
        train_set, val_set = get_tts_datasets(paths.data, 1, model.r)
        create_align_features(model, train_set, val_set,
                              paths.alg)  # paths.phon_pitch)
        print(
            '\n\nYou can now train ForwardTacotron - use python train_forward.py\n'
        )
    elif args.fa_dt:

        print('Creating Attention Alignments for DT...')
Example #8
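A config-driven variant: the model comes from ForwardTacotron.from_config(config), the checkpoint is restored from an explicit latest_model.pt path, and the GTA branch disables attention filtering and the mel-length cap.
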
    force_gta = args.force_gta
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    print('Using device:', device)

    # Instantiate Forward TTS Model
    print('\nInitialising Forward TTS Model...\n')
    model = ForwardTacotron.from_config(config).to(device)
    optimizer = optim.Adam(model.parameters())
    restore_checkpoint(model=model,
                       optim=optimizer,
                       path=paths.forward_checkpoints / 'latest_model.pt',
                       device=device)

    if force_gta:
        print('Creating Ground Truth Aligned Dataset...\n')
        train_set, val_set = get_tts_datasets(paths.data,
                                              8,
                                              r=1,
                                              model_type='forward',
                                              filter_attention=False,
                                              max_mel_len=None)
        create_gta_features(model, train_set, val_set, paths.gta)
        print(
            '\n\nYou can now train WaveRNN on GTA features - use python train_wavernn.py --gta\n'
        )
    else:
        trainer = ForwardTrainer(paths=paths, dsp=dsp, config=config)
        trainer.train(model, optimizer)
Example #9
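A script tail similar to Example #7 that also writes pitch features (paths.phon_pitch) and runs extract_pitch before handing off to a TacoTrainer.
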
    print('Device:', device)
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print(f'Num Params: {params}')
    optimizer = optim.Adam(model.parameters())
    restore_checkpoint('tts',
                       paths,
                       model,
                       optimizer,
                       create_if_missing=True,
                       device=device)

    if args.force_gta:
        print('Creating Ground Truth Aligned Dataset...\n')
        train_set, val_set = get_tts_datasets(paths.data, 8, model.r)
        create_gta_features(model, train_set, val_set, paths.gta)
        print(
            '\n\nYou can now train WaveRNN on GTA features - use python train_wavernn.py --gta\n'
        )
    elif args.force_align:
        print('Creating Attention Alignments and Pitch Values...')
        train_set, val_set = get_tts_datasets(paths.data, 1, model.r)
        create_align_features(model, train_set, val_set, paths.alg,
                              paths.phon_pitch)
        extract_pitch(paths.phon_pitch)
        print(
            '\n\nYou can now train ForwardTacotron - use python train_forward.py\n'
        )
    else:
        trainer = TacoTrainer(paths)
Example #10
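The largest example: a dual_transform method implementing one round of dual transformation, followed by the main() of the same script. The TTS model synthesizes speech for unpaired text to train the ASR model unsupervised; the ASR model then transcribes unpaired audio to produce temporary TTS training data; finally a combined loss, 0.5 * (tts_s_loss + asr_s_loss) + (tts_u_loss + asr_u_loss), is backpropagated and both optimizers step. Note that only the unsupervised losses from the final batch of each inner loop enter the combined loss.
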
    def dual_transform(self, model_tts, model_asr, optimizer_tts,
                       optimizer_asr, asr_test_set, m_loss_avg, dur_loss_avg,
                       device, asr_current_step, e, epochs, duration_avg,
                       total_iters, tts_s_loss, asr_s_loss, tts_lr,
                       tts_dt_path):
        print('\n\nStarting DualTransformation loop...\n')
        # exit()
        tmp_dir = './checkpoints/sme_speech_tts.asr_forward/dual_transform_tmp'
        os.makedirs(tmp_dir, exist_ok=True)
        # generate tmp ASR training data
        asr_train_data = []
        input_set = get_unpaired_txt(35)
        # print(input_set)
        text = [clean_text(v) for v in input_set]
        inputs = [text_to_sequence(t) for t in text]

        # generate unpaired data for ASR from TTS
        for i, x in enumerate(inputs, 1):
            _, m, dur = model_tts.generate(x, alpha=1.)
            wav = reconstruct_waveform(m, n_iter=32)
            wav_path = os.path.join(tmp_dir, f'{i}.wav')
            save_wav(wav, wav_path)
            asr_train_data.append((wav_path, text[i - 1]))

        # print(asr_train_data)
        dt_asr_data = load_dt_data(asr_train_data)
        # reinit trainer with only tmp train data
        asr_trainer_dt = init_trainer(dt_asr_data, None)
        dt_train = asr_trainer_dt.get_train_dataloader()

        # unsuper train loop for ASR
        for step, inputs in enumerate(dt_train, 1):
            # model_asr.cpu()
            model_asr.train()
            model_asr.to(device)
            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.to(device)
            # model_asr.train()
            outputs = model_asr(**inputs)
            asr_u_loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
            # asr_u_loss.detach()
            # asr_u_loss = asr_s_loss.mean()

            # model_name = step + asr_current_step
            msg_asr = f'| ASR MODEL (unsupervised training) : ' \
                      f'| Epoch: {e}/{epochs} ({step}/{len(dt_train)}) | Loss ASR: {asr_u_loss:#.4} ' \
                      f' ||||||||||||||||||||||||||||||||||||||||||||||||'
            stream(msg_asr)

        # for f in os.listdir(tmp_dir):
        #     file_path = os.path.join(tmp_dir, f)
        #     if f.endswith('.wav'):
        #         os.unlink(file_path)

        # generate tmp TTS data from ASR
        # model_asr.to(device)
        asr_predict_for_dt(model_asr)

        subprocess.check_output(
            'python preprocess.py -p "./data/speech-sme-tts" -d=True',
            shell=True,
            stderr=subprocess.STDOUT)
        print('Finished preprocessing for tmp data!')

        tmp_tts_train = get_tts_datasets(tts_dt_path,
                                         batch_size=2,
                                         r=1,
                                         model_type='forward_dt')
        print("Loaded tmp dataset!")
        # unsuper TTS training

        for i, (x, m, ids, x_lens, mel_lens,
                dur) in enumerate(tmp_tts_train, 1):
            start = time.time()
            model_tts.to(device)
            model_tts.train()
            # optimizer_tts.zero_grad()
            x, m, dur, x_lens, mel_lens = x.to(device), m.to(device), dur.to(device), \
                                          x_lens.to(device), mel_lens.to(device)

            m1_hat, m2_hat, dur_hat = model_tts(x, m, dur, mel_lens)

            m1_loss = self.l1_loss(m1_hat, m, mel_lens)
            m2_loss = self.l1_loss(m2_hat, m, mel_lens)

            dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1),
                                    x_lens)

            tts_u_loss = m1_loss + m2_loss + 0.1 * dur_loss
            # optimizer_tts.zero_grad()
            # tts_u_loss.backward()
            torch.nn.utils.clip_grad_norm_(model_tts.parameters(),
                                           hp.tts_clip_grad_norm)
            # optimizer_tts.step()
            m_loss_avg.add(m1_loss.item() + m2_loss.item())
            dur_loss_avg.add(dur_loss.item())
            step = model_tts.get_step()
            k = step // 1000

            duration_avg.add(time.time() - start)
            # pitch_loss_avg.add(pitch_loss.item())

            speed = 1. / duration_avg.get()
            msg_tts = f'| TTS MODEL (unsupervised training ): '\
                  f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                  f'| Dur Loss: {dur_loss_avg.get():#.4} ' \
                  f'| {speed:#.2} steps/s | Step: {k}k | '

            stream(msg_tts)
        # m_val_loss, dur_val_loss = self.evaluate(model_tts, tts_session.val_set)
        # TODO: combine L and update
        # asr_s_loss = torch.tensor(asr_s_loss).to(device)
        combined_loss = 0.5 * (tts_s_loss + asr_s_loss) + (tts_u_loss +
                                                           asr_u_loss)
        # backwards
        combined_loss = combined_loss.to(device)
        # print(combined_loss)
        combined_loss.backward()
        optimizer_tts.step()

        for state in optimizer_asr.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.to(device)

        optimizer_asr.step()

        m_loss_avg.reset()
        duration_avg.reset()
        # pitch_loss_avg.reset()
        dt_msg = f'\n\nFinished DT loop in epoch {e}!\n'
        stream(dt_msg)
        print(' ')
        return tts_u_loss, asr_u_loss
def main():
    # Parse Arguments
    parser = argparse.ArgumentParser(description='Train Tacotron TTS')
    parser.add_argument('--force_train',
                        '-f',
                        action='store_true',
                        help='Forces the model to train past total steps')
    parser.add_argument('--force_gta',
                        '-g',
                        action='store_true',
                        help='Force the model to create GTA features')
    parser.add_argument(
        '--force_cpu',
        '-c',
        action='store_true',
        help='Forces CPU-only training, even when in CUDA capable environment')
    parser.add_argument('--hp_file',
                        metavar='FILE',
                        default='hparams.py',
                        help='The file to use for the hyperparameters')
    args = parser.parse_args()

    hp.configure(args.hp_file)  # Load hparams from file

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    force_gta = args.force_gta

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
        for session in hp.forward_schedule:
            _, _, batch_size = session
            if batch_size % torch.cuda.device_count() != 0:
                raise ValueError(
                    '`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    # Instantiate Forward TTS Model
    print('\nInitialising Forward TTS Model...\n')
    model = ForwardTacotron(embed_dims=hp.forward_embed_dims,
                            num_chars=len(symbols),
                            durpred_rnn_dims=hp.forward_durpred_rnn_dims,
                            durpred_conv_dims=hp.forward_durpred_conv_dims,
                            rnn_dim=hp.forward_rnn_dims,
                            postnet_k=hp.forward_postnet_K,
                            postnet_dims=hp.forward_postnet_dims,
                            prenet_k=hp.forward_prenet_K,
                            prenet_dims=hp.forward_prenet_dims,
                            highways=hp.forward_num_highways,
                            dropout=hp.forward_dropout,
                            n_mels=hp.num_mels).to(device)

    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print(f'num params {params}')

    optimizer = optim.Adam(model.parameters())
    restore_checkpoint('forward',
                       paths,
                       model,
                       optimizer,
                       create_if_missing=True)

    if not force_gta:
        for i, session in enumerate(hp.forward_schedule):
            current_step = model.get_step()

            lr, max_step, batch_size = session

            training_steps = max_step - current_step

            simple_table([('Steps', str(training_steps // 1000) + 'k Steps'),
                          ('Batch Size', batch_size), ('Learning Rate', lr)])

            train_set, mel_example = get_tts_datasets(paths.data,
                                                      batch_size,
                                                      1,
                                                      alignments=True)
            train_loop(paths, model, optimizer, train_set, lr, training_steps,
                       mel_example)

    train_set, mel_example = get_tts_datasets(paths.data,
                                              8,
                                              1,
                                              alignments=True)
    create_gta_features(model, train_set, paths.gta)
    print('Training Complete.')