def __init__(self, checkpoint_path, device="cuda"):
        """Build the Tacotron and mel-STFT modules and restore the checkpoint.

        Args:
            checkpoint_path: Path to an existing Tacotron checkpoint; an
                ``AssertionError`` is raised when ``exists`` reports it missing.
            device: Anything accepted by ``torch.device`` (defaults to "cuda").

        After construction the instance exposes ``tacotron`` (in eval mode),
        ``stft``, ``device``, ``checkpoint_path`` and ``tts_k`` (the restored
        step count divided by 1000).
        """
        self.checkpoint_path = checkpoint_path
        assert exists(checkpoint_path)
        self.device = torch.device(device)

        print('\nInitialising Tacotron Model...\n')

        # All Tacotron hyper-parameters come straight from the global hparams.
        tacotron_kwargs = dict(
            embed_dims=hp.embed_dims,
            num_chars=len(symbols),
            encoder_dims=hp.encoder_dims,
            decoder_dims=hp.decoder_dims,
            n_mels=hp.n_mels,
            fft_bins=hp.fft_bins,
            postnet_dims=hp.postnet_dims,
            encoder_K=hp.encoder_K,
            lstm_dims=hp.lstm_dims,
            postnet_K=hp.postnet_K,
            num_highways=hp.num_highways,
            dropout=hp.dropout,
            speaker_latent_dims=hp.speaker_latent_dims,
            speaker_encoder_dims=hp.speaker_encoder_dims,
            n_speakers=hp.n_speakers,
            noise_latent_dims=hp.noise_latent_dims,
            noise_encoder_dims=hp.noise_encoder_dims,
        )
        self.tacotron = Tacotron(**tacotron_kwargs).to(device=self.device)

        print("\nInitializing STFT Model...\n")

        stft_kwargs = dict(
            filter_length=hp.n_fft,
            hop_length=hp.hop_length,
            win_length=hp.win_length,
            n_mel_channels=hp.n_mels,
            sampling_rate=hp.sampling_rate,
            mel_fmin=hp.min_f,
            mel_fmax=hp.max_f,
        )
        self.stft = MelSTFT(**stft_kwargs).to(device=self.device)

        # Load the trained weights and switch to inference mode.
        self.tacotron.restore(self.checkpoint_path)
        self.tacotron.eval()

        # Summarise the restored model and the key hyper-parameters.
        self.tts_k = self.tacotron.get_step() // 1000
        reduction = self.tacotron.get_r()

        summary_rows = [
            (f'Tacotron(r={reduction})', str(self.tts_k) + 'k'),
            ("Sample Rate", hp.sampling_rate),
            ("NFFT", hp.n_fft),
            ("NMel", hp.n_mels),
            ("Speakers", hp.n_speakers),
            ("SPKD", hp.speaker_latent_dims),
            ("NOID", hp.noise_latent_dims),
        ]
        simple_table(summary_rows)
Esempio n. 2
0
def tsau(input_text, save_path):
    """Synthesise speech for ``input_text`` or for every line of sentences.txt.

    Relies on module-level state defined elsewhere: ``args``, ``hp``,
    ``tts_model``, ``voc_model``, ``batched``, ``target``, ``overlap`` and
    ``save_attn``.

    Args:
        input_text: Text to synthesise. When falsy, each line of
            ``sentences.txt`` (relative to the working directory) is used.
        save_path: Destination passed to the vocoder / ``save_wav`` and to
            ``save_attention``.

    NOTE(review): the same ``save_path`` is used for every input, so with
    several sentences each result presumably overwrites the previous one.
    """
    if input_text:
        inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
    else:
        with open('sentences.txt') as f:
            inputs = [
                text_to_sequence(line.strip(), hp.tts_cleaner_names)
                for line in f
            ]

    # Print a short summary of the models and generation settings.
    if args.vocoder == 'wavernn':
        voc_k = voc_model.get_step() // 1000
        tts_k = tts_model.get_step() // 1000

        simple_table([
            ('Tacotron', str(tts_k) + 'k'), ('r', tts_model.r),
            ('Vocoder Type', 'WaveRNN'), ('WaveRNN', str(voc_k) + 'k'),
            ('Generation Mode', 'Batched' if batched else 'Unbatched'),
            ('Target Samples', target if batched else 'N/A'),
            ('Overlap Samples', overlap if batched else 'N/A')
        ])

    elif args.vocoder == 'griffinlim':
        tts_k = tts_model.get_step() // 1000
        simple_table([('Tacotron', str(tts_k) + 'k'), ('r', tts_model.r),
                      ('Vocoder Type', 'Griffin-Lim'),
                      ('GL Iters', args.iters)])

    for i, x in enumerate(inputs, 1):

        print(f'\n| Generating {i}/{len(inputs)}')
        _, m, attention = tts_model.generate(x)
        # Fix mel spectrogram scaling to be from 0 to 1
        m = (m + 4) / 8
        np.clip(m, 0, 1, out=m)

        if save_attn:
            save_attention(attention, save_path)

        if args.vocoder == 'wavernn':
            # Add a leading axis of size 1 before handing the mel to WaveRNN.
            m = torch.tensor(m).unsqueeze(0)
            voc_model.generate(m, save_path, batched, target, overlap,
                               hp.mu_law)
        elif args.vocoder == 'griffinlim':
            wav = reconstruct_waveform(m, n_iter=args.iters)
            save_wav(wav, save_path)

    print('\n\nDone.\n')
Esempio n. 3
0
    if input_text:
        text = clean_text(input_text.strip())
        inputs = [text_to_sequence(text)]
    else:
        with open('sentences.txt') as f:
            inputs = [clean_text(l.strip()) for l in f]
        inputs = [text_to_sequence(t) for t in inputs]

    tts_k = tts_model.get_step() // 1000

    if args.vocoder == 'wavernn':
        voc_k = voc_model.get_step() // 1000
        simple_table([('Forward Tacotron', str(tts_k) + 'k'),
                      ('Vocoder Type', 'WaveRNN'),
                      ('WaveRNN', str(voc_k) + 'k'),
                      ('Generation Mode',
                       'Batched' if batched else 'Unbatched'),
                      ('Target Samples', target if batched else 'N/A'),
                      ('Overlap Samples', overlap if batched else 'N/A')])

    elif args.vocoder == 'griffinlim':
        simple_table([('Forward Tacotron', str(tts_k) + 'k'),
                      ('Vocoder Type', 'Griffin-Lim'),
                      ('GL Iters', args.iters)])

    elif args.vocoder == 'melgan':
        simple_table([('Forward Tacotron', str(tts_k) + 'k'),
                      ('Vocoder Type', 'MelGAN')])

    # simple amplification of pitch
    pitch_function = lambda x: x * args.amp
    def train_session(self, model: ForwardTacotron,
                      optimizer: Optimizer, session: TTSSession) -> None:
        """Train ``model`` on ``session.train_set`` up to ``session.max_step``.

        Args:
            model: Network to optimise; its parameters' device is used for
                all batch tensors.
            optimizer: Optimiser whose learning rate is overwritten with
                ``session.lr`` before training starts.
            session: Provides ``train_set``, ``val_set``, ``bs``, ``lr`` and
                ``max_step``.

        The total loss is ``m1 + m2 + 0.3 * duration + 0.1 * pitch`` (all
        computed with ``self.l1_loss``). Checkpoints are written every
        ``hp.forward_checkpoint_every`` steps and after each epoch; plots are
        generated every ``hp.forward_plot_every`` steps. Running averages are
        reset at the end of every epoch.
        """
        current_step = model.get_step()
        training_steps = session.max_step - current_step
        total_iters = len(session.train_set)
        epochs = training_steps // total_iters + 1
        simple_table([('Steps', str(training_steps // 1000) + 'k Steps'),
                      ('Batch Size', session.bs),
                      ('Learning Rate', session.lr)])

        for g in optimizer.param_groups:
            g['lr'] = session.lr

        m_loss_avg = Averager()
        dur_loss_avg = Averager()
        duration_avg = Averager()
        pitch_loss_avg = Averager()
        device = next(model.parameters()).device  # use same device as model parameters
        for e in range(1, epochs + 1):
            for i, (x, m, ids, x_lens, mel_lens, dur, pitch, puncts) in enumerate(
                session.train_set, 1
            ):
                start = time.time()
                model.train()
                x, m, dur, x_lens, mel_lens, pitch, puncts = (
                    x.to(device),
                    m.to(device),
                    dur.to(device),
                    x_lens.to(device),
                    mel_lens.to(device),
                    pitch.to(device),
                    puncts.to(device),
                )
                m1_hat, m2_hat, dur_hat, pitch_hat = model(
                    x, m, dur, mel_lens, pitch, puncts
                )
                m1_loss = self.l1_loss(m1_hat, m, mel_lens)
                m2_loss = self.l1_loss(m2_hat, m, mel_lens)
                dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1), x_lens)
                pitch_loss = self.l1_loss(pitch_hat, pitch.unsqueeze(1), x_lens)
                loss = m1_loss + m2_loss + 0.3 * dur_loss + 0.1 * pitch_loss
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), hp.tts_clip_grad_norm)
                optimizer.step()
                m_loss_avg.add(m1_loss.item() + m2_loss.item())
                dur_loss_avg.add(dur_loss.item())
                step = model.get_step()
                k = step // 1000

                duration_avg.add(time.time() - start)
                pitch_loss_avg.add(pitch_loss.item())

                speed = 1. / duration_avg.get()
                msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                      f'| Dur Loss: {dur_loss_avg.get():#.4} | Pitch Loss: {pitch_loss_avg.get():#.4} ' \
                      f'| {speed:#.2} steps/s | Step: {k}k | '

                if step % hp.forward_checkpoint_every == 0:
                    ckpt_name = f'forward_step{k}K'
                    save_checkpoint('forward', self.paths, model, optimizer,
                                    name=ckpt_name, is_silent=True)

                if step % hp.forward_plot_every == 0:
                    self.generate_plots(model, session)

                self.writer.add_scalar('Mel_Loss/train', m1_loss + m2_loss, model.get_step())
                self.writer.add_scalar('Pitch_Loss/train', pitch_loss, model.get_step())
                self.writer.add_scalar('Duration_Loss/train', dur_loss, model.get_step())
                self.writer.add_scalar('Params/batch_size', session.bs, model.get_step())
                self.writer.add_scalar('Params/learning_rate', session.lr, model.get_step())

                stream(msg)

            # End of epoch: validate, log, and write the rolling checkpoint.
            m_val_loss, dur_val_loss, pitch_val_loss = self.evaluate(model, session.val_set)
            self.writer.add_scalar('Mel_Loss/val', m_val_loss, model.get_step())
            self.writer.add_scalar('Duration_Loss/val', dur_val_loss, model.get_step())
            self.writer.add_scalar('Pitch_Loss/val', pitch_val_loss, model.get_step())
            save_checkpoint('forward', self.paths, model, optimizer, is_silent=True)

            # Reset every running average so the next epoch's progress line
            # starts fresh (the duration-loss average was previously left out).
            m_loss_avg.reset()
            dur_loss_avg.reset()
            duration_avg.reset()
            pitch_loss_avg.reset()
            print(' ')
Esempio n. 5
0
    def train_session(self, model: WaveRNN, optimizer: Optimizer,
                      session: VocSession, train_gta: bool) -> None:
        """Run one vocoder training session up to ``session.max_step``.

        Args:
            model: Vocoder to train; batches are moved to its parameters' device.
            optimizer: Optimiser; its learning rate is set to ``session.lr``.
            session: Supplies ``train_set``, ``val_set``, ``bs``, ``lr`` and
                ``max_step``.
            train_gta: Only shown in the summary table by this method.

        Samples are generated every ``train_cfg['gen_samples_every']`` steps
        and a step checkpoint is written every ``train_cfg['checkpoint_every']``
        steps. After each epoch the model is evaluated and saved as
        ``latest_model.pt``.
        """
        start_step = model.get_step()
        steps_left = session.max_step - start_step
        batches_per_epoch = len(session.train_set)
        n_epochs = steps_left // batches_per_epoch + 1

        summary_rows = [
            ('Steps ', str(steps_left // 1000) + 'k'),
            ('Batch Size', session.bs),
            ('Learning Rate', session.lr),
            ('Sequence Length', self.train_cfg['seq_len']),
            ('GTA Training', train_gta),
        ]
        simple_table(summary_rows)

        for param_group in optimizer.param_groups:
            param_group['lr'] = session.lr

        loss_avg = Averager()
        duration_avg = Averager()
        # Keep every batch on the same device as the model parameters.
        device = next(model.parameters()).device
        log_scalar = self.writer.add_scalar

        for epoch in range(1, n_epochs + 1):
            for batch_idx, batch in enumerate(session.train_set, 1):
                tic = time.time()
                model.train()
                batch = to_device(batch, device=device)

                target = batch['y']
                output = model(batch['x'], batch['mel'])
                if model.mode == 'RAW':
                    output = output.transpose(1, 2).unsqueeze(-1)
                elif model.mode == 'MOL':
                    target = target.float()
                # The target gets a trailing axis in every mode.
                target = target.unsqueeze(-1)

                loss = self.loss_func(output, target)
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(), self.train_cfg['clip_grad_norm'])
                optimizer.step()
                loss_avg.add(loss.item())

                step = model.get_step()
                k = step // 1000

                duration_avg.add(time.time() - tic)
                steps_per_sec = 1. / duration_avg.get()
                progress = (
                    f'| Epoch: {epoch}/{n_epochs} ({batch_idx}/{batches_per_epoch}) '
                    f'| Loss: {loss_avg.get():#.4} '
                    f'| {steps_per_sec:#.2} steps/s | Step: {k}k | '
                )

                if step % self.train_cfg['gen_samples_every'] == 0:
                    stream(progress + 'generating samples...')
                    generated = self.generate_samples(model, session)
                    if generated is not None:
                        mel_loss, gen_wav = generated
                        log_scalar('Loss/generated_mel_l1', mel_loss,
                                   model.get_step())
                        self.track_top_models(mel_loss, gen_wav, model)

                if step % self.train_cfg['checkpoint_every'] == 0:
                    step_ckpt = self.paths.voc_checkpoints / f'wavernn_step{k}k.pt'
                    save_checkpoint(model=model,
                                    optim=optimizer,
                                    config=self.config,
                                    path=step_ckpt)

                log_scalar('Loss/train', loss, model.get_step())
                log_scalar('Params/batch_size', session.bs, model.get_step())
                log_scalar('Params/learning_rate', session.lr, model.get_step())

                stream(progress)

            # End of epoch: validate, log, and refresh the rolling checkpoint.
            val_loss = self.evaluate(model, session.val_set)
            log_scalar('Loss/val', val_loss, model.get_step())
            latest_ckpt = self.paths.voc_checkpoints / 'latest_model.pt'
            save_checkpoint(model=model,
                            optim=optimizer,
                            config=self.config,
                            path=latest_ckpt)

            loss_avg.reset()
            duration_avg.reset()
            print(' ')
Esempio n. 6
0
    def __init__(self):
        """Load the vocoder and TTS models described by ``hparams.py``.

        The settings are fixed in code (WaveRNN vocoder, batched generation,
        latest weights, GPU when available) and stored on ``self.args``.
        After construction ``self.voc_model`` (for the WaveRNN vocoder) and
        ``self.tts_model`` (when ``hp.tts_model`` is ``'tacotron'`` or
        ``'tacotron2'``) are loaded, and a summary table is printed.

        Raises:
            argparse.ArgumentError: If ``self.args.vocoder`` is not a
                recognised vocoder name.
        """
        # Parse Arguments
        parser = argparse.ArgumentParser(description='TTS')
        self.args = parser.parse_args()
        self.args.vocoder = 'wavernn'
        self.args.hp_file = 'hparams.py'
        self.args.voc_weights = False
        self.args.tts_weights = False
        self.args.save_attn = False
        self.args.batched = True
        self.args.target = None
        self.args.overlap = None
        self.args.force_cpu = False
        #================ vocoder ================#
        if self.args.vocoder in ['griffinlim', 'gl']:
            self.args.vocoder = 'griffinlim'
        elif self.args.vocoder in ['wavernn', 'wr']:
            self.args.vocoder = 'wavernn'
        else:
            # ArgumentError takes (argument, message); None means the error is
            # not tied to a specific parser action.
            raise argparse.ArgumentError(
                None, 'Must provide a valid vocoder type!')

        hp.configure(self.args.hp_file)  # Load hparams from file

        # set defaults for any arguments that depend on hparams
        if self.args.vocoder == 'wavernn':
            if self.args.target is None:
                self.args.target = hp.voc_target
            if self.args.overlap is None:
                self.args.overlap = hp.voc_overlap
            if self.args.batched is None:
                self.args.batched = hp.voc_gen_batched

        #================ others ================#
        paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)
        print(paths.base)
        if not self.args.force_cpu and torch.cuda.is_available():
            device = torch.device('cuda')
        else:
            device = torch.device('cpu')
        print('Using device:', device)

        # === Wavernn === #
        if self.args.vocoder == 'wavernn':
            print('\nInitialising WaveRNN Model...\n')
            self.voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                                     fc_dims=hp.voc_fc_dims,
                                     bits=hp.bits,
                                     pad=hp.voc_pad,
                                     upsample_factors=hp.voc_upsample_factors,
                                     feat_dims=hp.num_mels,
                                     compute_dims=hp.voc_compute_dims,
                                     res_out_dims=hp.voc_res_out_dims,
                                     res_blocks=hp.voc_res_blocks,
                                     hop_length=hp.hop_length,
                                     sample_rate=hp.sample_rate,
                                     mode=hp.voc_mode).to(device)

            # Explicit weights win; otherwise fall back to the latest ones.
            voc_load_path = self.args.voc_weights or paths.voc_latest_weights
            self.voc_model.load(voc_load_path)

        # === Tacotron === #
        if hp.tts_model == 'tacotron':
            print('\nInitialising Tacotron Model...\n')
            self.tts_model = Tacotron(
                embed_dims=hp.tts_embed_dims,
                num_chars=len(symbols),
                encoder_dims=hp.tts_encoder_dims,
                decoder_dims=hp.tts_decoder_dims,
                n_mels=hp.num_mels,
                fft_bins=hp.num_mels,
                postnet_dims=hp.tts_postnet_dims,
                encoder_K=hp.tts_encoder_K,
                lstm_dims=hp.tts_lstm_dims,
                postnet_K=hp.tts_postnet_K,
                num_highways=hp.tts_num_highways,
                dropout=hp.tts_dropout,
                stop_threshold=hp.tts_stop_threshold).to(device)

            tts_load_path = self.args.tts_weights or paths.tts_latest_weights
            self.tts_model.load(tts_load_path)

        # === Tacotron2 === #
        elif hp.tts_model == 'tacotron2':
            print('\nInitializing Tacotron2 Model...\n')
            self.tts_model = Tacotron2().to(device)
            tts_load_path = self.args.tts_weights or paths.tts_latest_weights
            self.tts_model.load(tts_load_path)

        # === Information === #
        tts_labels = {'tacotron': 'Tacotron', 'tacotron2': 'Tacotron2'}
        if hp.tts_model in tts_labels:
            tts_k = self.tts_model.get_step() // 1000
            rows = [(tts_labels[hp.tts_model], str(tts_k) + 'k')]
            if hp.tts_model == 'tacotron':
                # Only the Tacotron table reports the model's ``r`` attribute.
                rows.append(('r', self.tts_model.r))

            if self.args.vocoder == 'wavernn':
                voc_k = self.voc_model.get_step() // 1000
                is_batched = self.args.batched
                rows += [
                    ('Vocoder Type', 'WaveRNN'),
                    ('WaveRNN', str(voc_k) + 'k'),
                    ('Generation Mode',
                     'Batched' if is_batched else 'Unbatched'),
                    ('Target Samples',
                     self.args.target if is_batched else 'N/A'),
                    ('Overlap Samples',
                     self.args.overlap if is_batched else 'N/A'),
                ]
                simple_table(rows)

            elif self.args.vocoder == 'griffinlim':
                # ``iters`` is never set by this constructor, so read it
                # defensively instead of raising AttributeError.
                rows += [
                    ('Vocoder Type', 'Griffin-Lim'),
                    ('GL Iters', getattr(self.args, 'iters', 'N/A')),
                ]
                simple_table(rows)
Esempio n. 7
0
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    tts_model.to(device)
    cleaner = Cleaner.from_config(config)
    tokenizer = Tokenizer()

    print('Using device:', device)
    if args.input_text:
        texts = [args.input_text]
    else:
        with open('sentences.txt', 'r', encoding='utf-8') as f:
            texts = f.readlines()

    tts_k = tts_model.get_step() // 1000

    if args.vocoder == 'griffinlim':
        simple_table([('Forward Tacotron', str(tts_k) + 'k'),
                      ('Vocoder Type', 'Griffin-Lim')])

    elif args.vocoder == 'melgan':
        simple_table([('Forward Tacotron', str(tts_k) + 'k'),
                      ('Vocoder Type', 'MelGAN')])

    # simple amplification of pitch
    pitch_function = lambda x: x * args.amp

    for i, x in enumerate(texts, 1):
        print(f'\n| Generating {i}/{len(texts)}')
        x = cleaner(x)
        x = tokenizer(x)
        x = torch.as_tensor(x, dtype=torch.long, device=device).unsqueeze(0)

        wav_name = f'{i}_taco_{tts_k}k_{args.vocoder}'
Esempio n. 8
0
    tts_model.restore(tts_restore_path)

    if input_text:
        inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
    else:
        with open('sentences.txt') as f:
            inputs = [
                text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f
            ]

    voc_k = voc_model.get_step() // 1000
    tts_k = tts_model.get_step() // 1000

    simple_table([('WaveRNN', str(voc_k) + 'k'),
                  ('Tacotron', str(tts_k) + 'k'), ('r', tts_model.r.item()),
                  ('Generation Mode', 'Batched' if batched else 'Unbatched'),
                  ('Target Samples', target if batched else 'N/A'),
                  ('Overlap Samples', overlap if batched else 'N/A')])

    for i, x in enumerate(inputs, 1):

        spk_embds, file_name = get_spk_embed(files, enc_path)

        print(f'\n| Generating {i}/{len(inputs)}')
        _, m, attention = tts_model.generate(x, spk_embds)

        if input_text:
            # save_path = f'{paths.tts_output}__input_{input_text[:10]}_{tts_k}k.wav'
            save_path = f'{out}{i}_{file_name}_batched{str(batched)}_{tts_k}k.wav'
        else:
            save_path = f'{out}{i}_{file_name}_batched{str(batched)}_{tts_k}k.wav'
Esempio n. 9
0
    # Check to make sure the hop length is correctly factorised
    assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    voc_model.restore(paths.voc_latest_weights)

    optimiser = optim.Adam(voc_model.parameters())

    train_set, test_set = get_vocoder_datasets(paths.data, batch_size,
                                               train_gta)

    total_steps = 10_000_000 if force_train else hp.voc_total_steps

    simple_table([
        ('Remaining', str(
            (total_steps - voc_model.get_step()) // 1000) + 'k Steps'),
        ('Batch Size', batch_size), ('LR', lr),
        ('Sequence Len', hp.voc_seq_len), ('GTA Train', train_gta)
    ])

    loss_func = F.cross_entropy if voc_model.mode == 'RAW' else discretized_mix_logistic_loss

    voc_train_loop(voc_model, loss_func, optimiser, train_set, test_set, lr,
                   total_steps)

    print('Training Complete.')
    print(
        'To continue training increase voc_total_steps in hparams.py or use --force_train'
    )
Esempio n. 10
0
    def TTS_Wave(self):
        """Synthesise speech with the bundled pretrained Tacotron + WaveRNN.

        Steps:
            1. Unpack the pretrained weights into ``quick_start/``.
            2. Parse command-line options (text, batching, target/overlap,
               CPU override).
            3. Build both models and restore their weights.
            4. Generate one wav per input line into ``./output_audio/`` and,
               from the second line on, merge each new clip into a single
               file named after ``self.path`` with its last four characters
               dropped (presumably a file extension; TODO confirm).

        With a single input line the result stays at ``./output_audio/1.wav``
        because merging only starts at the second clip.
        """
        os.makedirs('quick_start/tts_weights/', exist_ok=True)
        os.makedirs('quick_start/voc_weights/', exist_ok=True)
        # The generation loop writes here, so make sure it exists.
        os.makedirs('./output_audio/', exist_ok=True)

        # Context managers close the archives even if extraction fails.
        with zipfile.ZipFile('pretrained/ljspeech.wavernn.mol.800k.zip', 'r') as zip_ref:
            zip_ref.extractall('quick_start/voc_weights/')

        with zipfile.ZipFile('pretrained/ljspeech.tacotron.r2.180k.zip', 'r') as zip_ref:
            zip_ref.extractall('quick_start/tts_weights/')

        # Parse Arguments
        parser = argparse.ArgumentParser(description='TTS Generator')
        parser.add_argument('-name', metavar='name', type=str,help='name of pdf')
        parser.add_argument('--input_text', '-i', type=str, help='[string] Type in something here and TTS will generate it!')
        parser.add_argument('--batched', '-b', dest='batched', action='store_true', help='Fast Batched Generation (lower quality)')
        parser.add_argument('--unbatched', '-u', dest='batched', action='store_false', help='Slower Unbatched Generation (better quality)')
        parser.add_argument('--target', '-t', type=int, help='[int] number of samples in each batch index')
        parser.add_argument('--overlap', '-o', type=int, help='[int] number of crossover samples')
        parser.add_argument('--force_cpu', '-c', action='store_true', help='Forces CPU-only training, even when in CUDA capable environment')
        parser.set_defaults(batched=hp.voc_gen_batched)
        parser.set_defaults(target=hp.voc_target)
        parser.set_defaults(overlap=hp.voc_overlap)
        parser.set_defaults(input_text=None)
        parser.set_defaults(weights_path=None)
        args = parser.parse_args()

        batched = args.batched
        target = args.target
        overlap = args.overlap
        input_text = args.input_text

        if not args.force_cpu and torch.cuda.is_available():
            device = torch.device('cuda')
            torch.cuda.set_device(0)
        else:
            device = torch.device('cpu')
        print('Using device:', device)

        print('\nInitialising WaveRNN Model...\n')

        # Instantiate WaveRNN Model
        voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                            fc_dims=hp.voc_fc_dims,
                            bits=hp.bits,
                            pad=hp.voc_pad,
                            upsample_factors=hp.voc_upsample_factors,
                            feat_dims=hp.num_mels,
                            compute_dims=hp.voc_compute_dims,
                            res_out_dims=hp.voc_res_out_dims,
                            res_blocks=hp.voc_res_blocks,
                            hop_length=hp.hop_length,
                            sample_rate=hp.sample_rate,
                            mode='MOL').to(device)

        voc_model.restore('quick_start/voc_weights/latest_weights.pyt')

        print('\nInitialising Tacotron Model...\n')

        # Instantiate Tacotron Model
        tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                             num_chars=len(symbols),
                             encoder_dims=hp.tts_encoder_dims,
                             decoder_dims=hp.tts_decoder_dims,
                             n_mels=hp.num_mels,
                             fft_bins=hp.num_mels,
                             postnet_dims=hp.tts_postnet_dims,
                             encoder_K=hp.tts_encoder_K,
                             lstm_dims=hp.tts_lstm_dims,
                             postnet_K=hp.tts_postnet_K,
                             num_highways=hp.tts_num_highways,
                             dropout=hp.tts_dropout).to(device)

        tts_model.restore('quick_start/tts_weights/latest_weights.pyt')

        if input_text:
            inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
        else:
            with open('final.txt') as f:
                inputs = [text_to_sequence(line.strip(), hp.tts_cleaner_names) for line in f]

        voc_k = voc_model.get_step() // 1000
        tts_k = tts_model.get_step() // 1000

        r = tts_model.get_r()

        simple_table([('WaveRNN', str(voc_k) + 'k'),
                      (f'Tacotron(r={r})', str(tts_k) + 'k'),
                      ('Generation Mode', 'Batched' if batched else 'Unbatched'),
                      ('Target Samples', target if batched else 'N/A'),
                      ('Overlap Samples', overlap if batched else 'N/A')])

        for i, x in enumerate(inputs, 1):

            # Real f-string: the original printed the template text verbatim.
            print(f'\n| Generating {i}/{len(inputs)}')
            _, m, _ = tts_model.generate(x)

            save_path = './output_audio/' + str(i) + '.wav'

            m = torch.tensor(m).unsqueeze(0)
            m = (m + 4) / 8

            # Use the parsed --target/--overlap values so the CLI options
            # (and the table above) match what is actually generated.
            voc_model.generate(m, save_path, batched, target, overlap, hp.mu_law)

            if i >= 2:
                merged_path = "./output_audio/" + self.path[:-4] + ".wav"
                # The second clip is merged with clip 1; later clips are
                # appended to the already-merged file.
                previous_path = "./output_audio/" + str(i - 1) + ".wav" if i == 2 else merged_path

                previous_audio = AudioSegment.from_wav(previous_path)
                new_audio = AudioSegment.from_wav(save_path)

                combined_sounds = previous_audio + new_audio

                os.remove(previous_path)
                os.remove(save_path)

                combined_sounds.export(merged_path, format="wav")

        print("Done")
Esempio n. 11
0
    def train_session(self, model: ForwardTacotron, optimizer: Optimizer,
                      session: TTSSession) -> None:
        current_step = model.get_step()
        training_steps = session.max_step - current_step
        total_iters = len(session.train_set)
        epochs = training_steps // total_iters + 1
        simple_table([(f'Steps', str(training_steps // 1000) + 'k Steps'),
                      ('Batch Size', session.bs),
                      ('Learning Rate', session.lr)])

        for g in optimizer.param_groups:
            g['lr'] = session.lr

        m_loss_avg = Averager()
        dur_loss_avg = Averager()
        duration_avg = Averager()
        pitch_loss_avg = Averager()
        device = next(
            model.parameters()).device  # use same device as model parameters
        for e in range(1, epochs + 1):
            for i, batch in enumerate(session.train_set, 1):
                batch = to_device(batch, device=device)
                start = time.time()
                model.train()

                pitch_zoneout_mask = torch.rand(
                    batch['x'].size()) > self.train_cfg['pitch_zoneout']
                energy_zoneout_mask = torch.rand(
                    batch['x'].size()) > self.train_cfg['energy_zoneout']

                pitch_target = batch['pitch'].detach().clone()
                energy_target = batch['energy'].detach().clone()
                batch['pitch'] = batch['pitch'] * pitch_zoneout_mask.to(
                    device).float()
                batch['energy'] = batch['energy'] * energy_zoneout_mask.to(
                    device).float()

                pred = model(batch)

                m1_loss = self.l1_loss(pred['mel'], batch['mel'],
                                       batch['mel_len'])
                m2_loss = self.l1_loss(pred['mel_post'], batch['mel'],
                                       batch['mel_len'])

                dur_loss = self.l1_loss(pred['dur'].unsqueeze(1),
                                        batch['dur'].unsqueeze(1),
                                        batch['x_len'])
                pitch_loss = self.l1_loss(pred['pitch'],
                                          pitch_target.unsqueeze(1),
                                          batch['x_len'])
                energy_loss = self.l1_loss(pred['energy'],
                                           energy_target.unsqueeze(1),
                                           batch['x_len'])

                loss = m1_loss + m2_loss \
                       + self.train_cfg['dur_loss_factor'] * dur_loss \
                       + self.train_cfg['pitch_loss_factor'] * pitch_loss \
                       + self.train_cfg['energy_loss_factor'] * energy_loss

                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(), self.train_cfg['clip_grad_norm'])
                optimizer.step()

                m_loss_avg.add(m1_loss.item() + m2_loss.item())
                dur_loss_avg.add(dur_loss.item())
                step = model.get_step()
                k = step // 1000

                duration_avg.add(time.time() - start)
                pitch_loss_avg.add(pitch_loss.item())

                speed = 1. / duration_avg.get()
                msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                      f'| Dur Loss: {dur_loss_avg.get():#.4} | Pitch Loss: {pitch_loss_avg.get():#.4} ' \
                      f'| {speed:#.2} steps/s | Step: {k}k | '

                if step % self.train_cfg['checkpoint_every'] == 0:
                    save_checkpoint(model=model,
                                    optim=optimizer,
                                    config=self.config,
                                    path=self.paths.forward_checkpoints /
                                    f'forward_step{k}k.pt')

                if step % self.train_cfg['plot_every'] == 0:
                    self.generate_plots(model, session)

                self.writer.add_scalar('Mel_Loss/train', m1_loss + m2_loss,
                                       model.get_step())
                self.writer.add_scalar('Pitch_Loss/train', pitch_loss,
                                       model.get_step())
                self.writer.add_scalar('Energy_Loss/train', energy_loss,
                                       model.get_step())
                self.writer.add_scalar('Duration_Loss/train', dur_loss,
                                       model.get_step())
                self.writer.add_scalar('Params/batch_size', session.bs,
                                       model.get_step())
                self.writer.add_scalar('Params/learning_rate', session.lr,
                                       model.get_step())

                stream(msg)

            val_out = self.evaluate(model, session.val_set)
            self.writer.add_scalar('Mel_Loss/val', val_out['mel_loss'],
                                   model.get_step())
            self.writer.add_scalar('Duration_Loss/val', val_out['dur_loss'],
                                   model.get_step())
            self.writer.add_scalar('Pitch_Loss/val', val_out['pitch_loss'],
                                   model.get_step())
            self.writer.add_scalar('Energy_Loss/val', val_out['energy_loss'],
                                   model.get_step())
            save_checkpoint(model=model,
                            optim=optimizer,
                            config=self.config,
                            path=self.paths.forward_checkpoints /
                            'latest_model.pt')

            m_loss_avg.reset()
            duration_avg.reset()
            pitch_loss_avg.reset()
            print(' ')
Esempio n. 12
0
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    tts_model.to(device)
    cleaner = Cleaner.from_config(config)
    tokenizer = Tokenizer()

    print(f'Using device: {device}\n')
    if args.input_text:
        texts = [args.input_text]
    else:
        with open('sentences.txt', 'r', encoding='utf-8') as f:
            texts = f.readlines()

    tts_k = tts_model.get_step() // 1000

    simple_table([('Forward Tacotron', str(tts_k) + 'k'),
                  ('Vocoder Type', args.vocoder)])

    # simple amplification of pitch
    pitch_function = lambda x: x * args.amp
    energy_function = lambda x: x

    for i, x in enumerate(texts, 1):
        print(f'\n| Generating {i}/{len(texts)}')
        text = x
        x = cleaner(x)
        x = tokenizer(x)
        x = torch.as_tensor(x, dtype=torch.long, device=device).unsqueeze(0)

        wav_name = f'{i}_forward_{tts_k}k_alpha{args.alpha}_amp{args.amp}_{args.vocoder}'

        gen = tts_model.generate(x=x,
Esempio n. 13
0
                        sample_rate=hp.sample_rate,
                        pad_val=hp.voc_pad_val,
                        mode=hp.voc_mode).cuda()

    # Check to make sure the hop length is correctly factorised
    assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    voc_model.restore(paths.voc_latest_weights)

    optimizer = torch.optim.Adam(voc_model.parameters())

    train_set, test_set = get_vocoder_datasets(paths.data, batch_size, train_gta)

    total_steps = 10_000_000 if force_train else hp.voc_total_steps

    simple_table([('Remaining', str((total_steps - voc_model.get_step())//1000) + 'k Steps'),
                  ('Batch Size', batch_size),
                  ('Initial learning rate', init_lr),
                  ('Final learnging rate', final_lr),
                  ('Sequence Len', hp.voc_seq_len),
                  ('GTA Train', train_gta)])

    loss_func = torch.nn.functional.cross_entropy if voc_model.mode == 'RAW' else discretized_mix_logistic_loss

    voc_train_loop(voc_model, loss_func, optimizer, train_set, test_set, init_lr, final_lr, total_steps)

    print('Training Complete.')
    print('To continue training increase voc_total_steps in hparams.py or use --force_train')
Esempio n. 14
0
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    model.restore(paths.tts_latest_weights)

    optimiser = optim.Adam(model.parameters())

    train_set = get_tts_dataset(paths.data, batch_size)

    if not force_gta:

        total_steps = 10_000_000 if force_train else hp.tts_total_steps

        simple_table([
            ('Remaining', str(
                (total_steps - model.get_step()) // 1000) + 'k Steps'),
            ('Batch Size', batch_size), ('Learning Rate', lr)
        ])

        tts_train_loop(model, optimiser, train_set, lr, total_steps)

        print('Training Complete.')
        print(
            'To continue training increase tts_total_steps in hparams.py or use --force_train\n'
        )

    print('Creating Ground Truth Aligned Dataset...\n')

    create_gta_features(model, train_set, paths.gta)

    print(
Esempio n. 15
0
    if input_text:
        inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
    else:
        with open('sentences.txt') as f:
            inputs = [
                text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f
            ]

    voc_k = voc_model.get_step() // 1000
    tts_k = tts_model.get_step() // 1000

    r = tts_model.r

    simple_table([('WaveRNN', str(voc_k) + 'k'),
                  ('Tacotron(r=%s)' % (repr1(r)), str(tts_k) + 'k'),
                  ('Generation Mode', 'Batched' if batched else 'Unbatched'),
                  ('Target Samples', target if batched else 'N/A'),
                  ('Overlap Samples', overlap if batched else 'N/A')])

    for i, x in enumerate(inputs, 1):

        print('\n| Generating %s/%s' % (repr1(i), repr1(len(inputs))))
        _, m, attention = tts_model.generate(x)

        if input_text:
            save_path = 'quick_start/__input_%s_%sk.wav' % (repr1(
                input_text[:10]), repr1(tts_k))
        else:
            save_path = 'quick_start/%s_batched%s_%sk.wav' % (
                repr1(i), repr1(str(batched)), repr1(tts_k))
Esempio n. 16
0
                  fc_dims=hp.voc_fc_dims,
                  bits=hp.bits,
                  pad=hp.voc_pad,
                  upsample_factors=hp.voc_upsample_factors,
                  feat_dims=hp.num_mels,
                  compute_dims=hp.voc_compute_dims,
                  res_out_dims=hp.voc_res_out_dims,
                  res_blocks=hp.voc_res_blocks,
                  hop_length=hp.hop_length,
                  sample_rate=hp.sample_rate).cuda()

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    restore_path = args.weights if args.weights else paths.voc_latest_weights

    model.restore(restore_path)

    simple_table([('Generation Mode', 'Batched' if batched else 'Unbatched'),
                  ('Target Samples', target if batched else 'N/A'),
                  ('Overlap Samples', overlap if batched else 'N/A')])

    _, test_set = get_vocoder_datasets(paths.data, 1, gta)

    if file:
        gen_from_file(model, file, paths.voc_output, batched, target, overlap)
    else:
        gen_testset(model, test_set, samples, batched, target, overlap,
                    paths.voc_output)

    print('\n\nExiting...\n')
Esempio n. 17
0
    def train_session(self, model: Tacotron, optimizer: Optimizer,
                      session: TTSSession) -> None:
        """Train ``model`` on ``session.train_set`` until roughly ``session.max_step``.

        The number of epochs is ``(session.max_step - model.get_step()) //
        len(session.train_set) + 1``; whole epochs are always run. The
        model's reduction factor and the optimizer learning rate are taken
        from ``session`` before training starts.

        Per iteration: forward pass, summed L1 loss of both mel outputs
        against the target, gradient clipping, optimizer step, optional
        checkpoint/plot, and scalar logging to ``self.writer``. After each
        epoch the validation set is evaluated and a checkpoint is saved.

        Args:
            model: Tacotron model to optimise (modified in place).
            optimizer: Optimizer whose param-group learning rates are
                overwritten with ``session.lr``.
            session: Provides ``train_set``, ``val_set``, ``r``, ``lr``,
                ``bs`` and ``max_step``.
        """
        steps_remaining = session.max_step - model.get_step()
        total_iters = len(session.train_set)
        epochs = steps_remaining // total_iters + 1

        model.r = session.r
        summary_rows = [
            (f'Steps with r={session.r}',
             str(steps_remaining // 1000) + 'k Steps'),
            ('Batch Size', session.bs),
            ('Learning Rate', session.lr),
            ('Outputs/Step (r)', model.r),
        ]
        simple_table(summary_rows)

        for param_group in optimizer.param_groups:
            param_group['lr'] = session.lr

        loss_avg = Averager()
        duration_avg = Averager()
        # Batches are moved to wherever the model's parameters live.
        device = next(model.parameters()).device

        for e in range(1, epochs + 1):
            for i, batch in enumerate(session.train_set, 1):
                tokens, mels, _ids, _token_lens, mel_lens = batch
                start = time.time()
                # Re-enter train mode every iteration.
                model.train()
                tokens = tokens.to(device)
                mels = mels.to(device)

                mel_pre, mel_post, attention = model(tokens, mels)

                pre_loss = F.l1_loss(mel_pre, mels)
                post_loss = F.l1_loss(mel_post, mels)
                loss = pre_loss + post_loss

                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               hp.tts_clip_grad_norm)
                optimizer.step()

                loss_avg.add(loss.item())
                step = model.get_step()
                k = step // 1000

                duration_avg.add(time.time() - start)
                speed = 1. / duration_avg.get()
                msg = (f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {loss_avg.get():#.4} '
                       f'| {speed:#.2} steps/s | Step: {k}k | ')

                if step % hp.tts_checkpoint_every == 0:
                    save_checkpoint('tts',
                                    self.paths,
                                    model,
                                    optimizer,
                                    name=f'taco_step{k}K',
                                    is_silent=True)

                if step % hp.tts_plot_every == 0:
                    self.generate_plots(model, session)

                _, att_score = attention_score(attention, mel_lens)
                train_scalars = (
                    ('Attention_Score/train', torch.mean(att_score)),
                    ('Loss/train', loss),
                    ('Params/reduction_factor', session.r),
                    ('Params/batch_size', session.bs),
                    ('Params/learning_rate', session.lr),
                )
                for tag, value in train_scalars:
                    self.writer.add_scalar(tag, value, model.get_step())

                stream(msg)

            # End of epoch: validate, log, and overwrite the latest checkpoint.
            val_loss, val_att_score = self.evaluate(model, session.val_set)
            self.writer.add_scalar('Loss/val', val_loss, model.get_step())
            self.writer.add_scalar('Attention_Score/val', val_att_score,
                                   model.get_step())
            save_checkpoint('tts',
                            self.paths,
                            model,
                            optimizer,
                            is_silent=True)

            loss_avg.reset()
            duration_avg.reset()
            print(' ')
Esempio n. 18
0
    def train_session(self, model_tts: ForwardTacotron,
                      model_asr: Wav2Vec2ForCTC, optimizer_tts: Optimizer,
                      tts_session: ForwardSession, asr_session: ASRSession,
                      asr_trainer, optimizer_asr) -> None:
        """Run supervised TTS/ASR passes, evaluation and dual transformation.

        For each epoch this method:

        1. iterates ``tts_session.train_set`` computing the TTS mel and
           duration losses (no backward pass / optimizer step is performed
           here; only the last batch's loss is kept),
        2. iterates ``asr_session.train_set`` computing the ASR loss (again
           without an optimizer step),
        3. evaluates the TTS model on ``tts_session.val_set`` and the ASR
           model on ``asr_session.test_set`` (mean loss and mean WER),
        4. calls ``self.dual_transform`` with the last supervised losses,
        5. saves the TTS checkpoint, the ASR model, the ASR optimizer state
           and a ``dt_trainer_state.json`` log entry,
        6. terminates the process with exit code 11.

        NOTE(review): because of step 6 the epoch loop never runs more than
        once per process; presumably an external launcher restarts training
        from the saved checkpoints -- confirm.

        Args:
            model_tts: ForwardTacotron model.
            model_asr: Wav2Vec2 CTC model.
            optimizer_tts: Optimizer for ``model_tts``; its learning rate is
                overwritten with ``tts_session.lr``.
            tts_session: Provides ``train_set``, ``val_set``, ``lr``, ``bs``,
                ``max_step`` and ``path``.
            asr_session: Provides ``train_set`` and ``test_set``.
            asr_trainer: Object exposing ``prediction_step`` and
                ``compute_metrics``.
            optimizer_asr: Optimizer for ``model_asr``.
        """
        asr_trainer_state = {'logs': []}
        current_step = model_tts.get_step()
        tts_training_steps = tts_session.max_step - current_step
        try:
            _, asr_current_step = get_last_checkpoint(
                './checkpoints/sme_speech_tts.asr_forward/', 'model_at')
            asr_training_steps = tts_session.max_step - asr_current_step
        except Exception:
            # Best effort: without a usable ASR checkpoint start from step 0.
            # `Exception` (not a bare except) so Ctrl-C / SystemExit propagate.
            asr_current_step = 0
            asr_training_steps = tts_training_steps

        total_iters = len(tts_session.train_set)
        epochs = tts_training_steps // total_iters + 1
        simple_table([
            ('TTS Steps', str(tts_training_steps // 1000) + 'k Steps'),
            ('ASR Steps', str(asr_training_steps // 1000) + 'k Steps'),
            ('Batch Size TTS', tts_session.bs),
            ('Learning Rate', tts_session.lr)
        ])

        for g in optimizer_tts.param_groups:
            g['lr'] = tts_session.lr

        m_loss_avg = Averager()
        dur_loss_avg = Averager()
        duration_avg = Averager()

        # Use the same device as the TTS model parameters.
        device = next(model_tts.parameters()).device
        warnings.filterwarnings('ignore', category=UserWarning)
        for e in range(1, epochs + 1):

            # ---- Supervised TTS pass over the training set ----
            for i, (x, m, ids, x_lens, mel_lens,
                    dur) in enumerate(tts_session.train_set, 1):
                start = time.time()
                model_tts.train()
                x, m, dur = x.to(device), m.to(device), dur.to(device)
                x_lens, mel_lens = x_lens.to(device), mel_lens.to(device)

                m1_hat, m2_hat, dur_hat = model_tts(x, m, dur, mel_lens)

                m1_loss = self.l1_loss(m1_hat, m, mel_lens)
                m2_loss = self.l1_loss(m2_hat, m, mel_lens)

                dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1),
                                        x_lens)

                tts_s_loss = m1_loss + m2_loss + 0.1 * dur_loss
                optimizer_tts.zero_grad()
                # NOTE(review): backward() and optimizer_tts.step() are not
                # called here; the last batch's `tts_s_loss` is forwarded to
                # `dual_transform` below, which presumably performs the
                # update -- confirm. The clipping call therefore only sees
                # the gradients left after zero_grad().
                torch.nn.utils.clip_grad_norm_(model_tts.parameters(),
                                               hp.tts_clip_grad_norm)
                m_loss_avg.add(m1_loss.item() + m2_loss.item())
                dur_loss_avg.add(dur_loss.item())
                step = model_tts.get_step()
                k = step // 1000

                duration_avg.add(time.time() - start)

                speed = 1. / duration_avg.get()
                msg_tts = f'| TTS MODEL (supervised training ): '\
                      f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                      f'| Dur Loss: {dur_loss_avg.get():#.4} ' \
                      f'| {speed:#.2} steps/s | Step: {k}k | '

                if step % hp.forward_checkpoint_every == 0:
                    ckpt_name = f'forward_step{k}K'
                    save_checkpoint('forward',
                                    self.paths,
                                    model_tts,
                                    optimizer_tts,
                                    name=ckpt_name,
                                    is_silent=True)

                if step % hp.forward_plot_every == 0:
                    self.generate_plots(model_tts, tts_session)

                self.writer.add_scalar('Mel_Loss/train', m1_loss + m2_loss,
                                       model_tts.get_step())
                self.writer.add_scalar('Duration_Loss/train', dur_loss,
                                       model_tts.get_step())
                self.writer.add_scalar('Params/batch_size', tts_session.bs,
                                       model_tts.get_step())
                self.writer.add_scalar('Params/learning_rate', tts_session.lr,
                                       model_tts.get_step())

                stream(msg_tts)

            # ---- Supervised ASR pass over the training set ----
            # As above, only the loss is computed; no optimizer step is taken.
            for asr_iter, inputs in enumerate(asr_session.train_set):

                optimizer_asr.zero_grad()

                model_asr.to(device)
                for key, value in inputs.items():
                    if isinstance(value, torch.Tensor):
                        inputs[key] = value.to(device)
                model_asr.train()
                outputs = model_asr(**inputs)
                asr_s_loss = outputs["loss"] if isinstance(
                    outputs, dict) else outputs[0]

                msg_asr =  f'| ASR MODEL (supervised training) : '\
                            f'| Epoch: {e}/{epochs} ({asr_iter}/{len(asr_session.train_set)}) | Loss ASR: {asr_s_loss:#.4} '\
                            f' ||||||||||||||||||||||'

                stream(msg_asr)

            # ---- TTS evaluation ----
            m_val_loss, dur_val_loss = self.evaluate(model_tts,
                                                     tts_session.val_set)
            eval_tts_msg = f'| TTS MODEL (supervised eval ): '\
                        f'| Epoch: {e}/{epochs} | Val Loss: {m_val_loss:#.4} ' \
                        f'| Dur Val Loss: {dur_val_loss:#.4} '

            stream(eval_tts_msg)
            tts_eval_loss = m_val_loss + dur_val_loss

            # ---- ASR evaluation ----
            print('\nEvaluating ASR model ...')
            # Float initial values keep the `#.4` format spec valid even when
            # the evaluation set yields no batches.
            asr_eval_loss = 0.0
            eval_wer = 0.0
            n_eval_batches = 0

            for inputs in asr_session.test_set:
                asr_eval_loss_i, logits_a, labels_a = asr_trainer.prediction_step(
                    model_asr, inputs, False)
                asr_eval_loss += asr_eval_loss_i
                # NOTE(review): Tensor.to() is not in-place, so this result is
                # discarded and `logits_a` stays on its original device. Left
                # unchanged because `compute_metrics` is not visible here.
                logits_a.to('cpu')
                eval_wer_i = asr_trainer.compute_metrics(
                    EvalPrediction(predictions=logits_a, label_ids=labels_a))
                eval_wer += eval_wer_i['wer']
                n_eval_batches += 1

            # Average over the number of batches. The zero-based enumerate
            # index used before under-counted by one and divided by zero when
            # there was exactly one batch.
            if n_eval_batches:
                eval_wer = eval_wer / n_eval_batches
                asr_eval_loss = asr_eval_loss / n_eval_batches

            msg_asr_eval = f'| ASR MODEL (supervised eval) : Epoch {e}/{epochs} | Loss ASR: {asr_eval_loss:#.4} | WER: {eval_wer} |||||||||||||||||||||||||||||||||||||||||||||||||||||'
            stream(msg_asr_eval)

            # ---- Dual transformation step ----
            tts_u_loss, asr_u_loss = self.dual_transform(
                model_tts, model_asr, optimizer_tts, optimizer_asr,
                asr_session.test_set, m_loss_avg, dur_loss_avg, device,
                asr_current_step, e, epochs, duration_avg, total_iters,
                tts_s_loss, asr_s_loss, tts_session.lr, tts_session.path)

            # NOTE(review): the checkpoint directory name is hard-coded.
            asr_path = 'checkpoint-27364'
            modelasr_folder = './checkpoints/sme_speech_tts.asr_forward/'
            new_check = modelasr_folder + asr_path
            os.makedirs(new_check, exist_ok=True)

            save_checkpoint('forward',
                            self.paths,
                            model_tts,
                            optimizer_tts,
                            is_silent=True)

            # Losses are stored as floats; int() would truncate e.g. 0.73 to 0.
            asr_trainer_state.setdefault('logs', []).append({
                # NOTE(review): this field has always held the number of ASR
                # eval batches rather than a global training step; the value
                # is kept for compatibility with existing log readers.
                'step': n_eval_batches,
                'epoch': e,
                'asr_s_loss': float(asr_s_loss),
                'asr_u_loss': float(asr_u_loss),
                'tts_s_loss': float(tts_s_loss),
                'tts_u_loss': float(tts_u_loss),
                'tts_eval_loss': float(tts_eval_loss),
                'asr_eval_loss': float(asr_eval_loss),
                'eval_wer': eval_wer
            })

            with open(f'{new_check}/dt_trainer_state.json', 'w') as f:
                json.dump(asr_trainer_state, f)

            model_asr.save_pretrained(f'{new_check}')

            torch.save(optimizer_asr.state_dict(), f'{new_check}/optimizer.pt')

            # The process ends here after every epoch, with a fixed exit code.
            print("Exiting due to cuda OOM!")
            exit(11)
Esempio n. 19
0
def main():
    """Command-line entry point: configure and run WaveRNN vocoder training.

    Parses the CLI flags, loads hyperparameters from ``--hp_file``, fills in
    learning rate / batch size from ``hp`` when not given on the command
    line, selects the device, builds the WaveRNN model, restores (or creates)
    the 'voc' checkpoint, prints a summary table and hands over to
    ``voc_train_loop``.

    Raises:
        ValueError: if CUDA is used and the batch size is not divisible by
            the number of visible GPUs.
        AssertionError: if the product of ``hp.voc_upsample_factors`` does
            not equal ``hp.hop_length``.
    """
    # ---- Command line ----
    cli = argparse.ArgumentParser(description='Train WaveRNN Vocoder')
    cli.add_argument('--lr', '-l', type=float,
                     help='[float] override hparams.py learning rate')
    cli.add_argument('--batch_size', '-b', type=int,
                     help='[int] override hparams.py batch size')
    cli.add_argument('--force_train', '-f', action='store_true',
                     help='Forces the model to train past total steps')
    cli.add_argument('--gta', '-g', action='store_true',
                     help='train wavernn on GTA features')
    cli.add_argument(
        '--force_cpu', '-c', action='store_true',
        help='Forces CPU-only training, even when in CUDA capable environment')
    cli.add_argument('--hp_file', metavar='FILE', default='hparams.py',
                     help='The file to use for the hyperparameters')
    args = cli.parse_args()

    # ---- Hyperparameters, with CLI overrides taking precedence ----
    hp.configure(args.hp_file)
    lr = hp.voc_lr if args.lr is None else args.lr
    batch_size = hp.voc_batch_size if args.batch_size is None else args.batch_size
    force_train = args.force_train
    train_gta = args.gta

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    # ---- Device selection ----
    use_cuda = not args.force_cpu and torch.cuda.is_available()
    if use_cuda and batch_size % torch.cuda.device_count() != 0:
        raise ValueError(
            '`batch_size` must be evenly divisible by n_gpus!')
    device = torch.device('cuda' if use_cuda else 'cpu')
    print('Using device:', device)

    print('\nInitialising Model...\n')

    # ---- Model ----
    model_kwargs = dict(rnn_dims=hp.voc_rnn_dims,
                        fc_dims=hp.voc_fc_dims,
                        bits=hp.bits,
                        pad=hp.voc_pad,
                        upsample_factors=hp.voc_upsample_factors,
                        feat_dims=hp.num_mels,
                        compute_dims=hp.voc_compute_dims,
                        res_out_dims=hp.voc_res_out_dims,
                        res_blocks=hp.voc_res_blocks,
                        hop_length=hp.hop_length,
                        sample_rate=hp.sample_rate,
                        mode=hp.voc_mode)
    voc_model = WaveRNN(**model_kwargs).to(device)

    # The upsample factors must multiply out to exactly one hop.
    assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

    optimizer = optim.Adam(voc_model.parameters())
    restore_checkpoint('voc', paths, voc_model, optimizer,
                       create_if_missing=True)

    train_set, test_set = get_vocoder_datasets(paths.data, batch_size,
                                               train_gta)

    if force_train:
        total_steps = 10_000_000
    else:
        total_steps = hp.voc_total_steps

    # ---- Summary ----
    remaining = str((total_steps - voc_model.get_step()) // 1000) + 'k Steps'
    simple_table([
        ('Remaining', remaining),
        ('Batch Size', batch_size),
        ('LR', lr),
        ('Sequence Len', hp.voc_seq_len),
        ('GTA Train', train_gta),
    ])

    # Loss depends on the vocoder output mode.
    if voc_model.mode == 'RAW':
        loss_func = F.cross_entropy
    else:
        loss_func = discretized_mix_logistic_loss

    voc_train_loop(paths, voc_model, loss_func, optimizer, train_set, test_set,
                   lr, total_steps)

    print('Training Complete.')
    print(
        'To continue training increase voc_total_steps in hparams.py or use --force_train'
    )
Esempio n. 20
0
              bits=hp.bits,
              pad=hp.pad,
              upsample_factors=hp.upsample_factors,
              feat_dims=hp.num_mels,
              compute_dims=hp.compute_dims,
              res_out_dims=hp.res_out_dims,
              res_blocks=hp.res_blocks,
              hop_length=hp.hop_length,
              sample_rate=hp.sample_rate).cuda()

paths = Paths(hp.data_path, hp.model_id)

model.restore(paths.latest_weights)

optimiser = optim.Adam(model.parameters())

train_set, test_set = get_datasets(paths.data, batch_size)

total_steps = 10_000_000 if force_train else hp.total_steps

simple_table([('Steps Left', str(
    (total_steps - model.get_step()) // 1000) + 'k'),
              ('Batch Size', batch_size), ('LR', lr),
              ('Sequence Len', hp.seq_len)])

train_loop(model, optimiser, train_set, test_set, lr, total_steps)

print(
    'Training Complete. To continue training increase total_steps in hparams.py or use --force_train'
)