コード例 #1
0
def main():
    """Restore the latest Tacotron checkpoint and synthesize the test set.

    Builds a non-shuffled loader over ``args.testset``, loads the last
    ``model-*.tar`` file (in sorted filename order) found under
    ``<args.logdir>/<model.name>``, stores the checkpoint's global step on
    ``args``, makes sure ``<args.sampledir>/A`` exists and finally calls
    ``synthesize``.

    Raises:
        FileNotFoundError: if no checkpoint matches the glob pattern.
    """
    testset = TextDataset(args.testset)
    test_loader = DataLoader(dataset=testset,
                             batch_size=args.test_batch,
                             drop_last=False,
                             shuffle=False,
                             collate_fn=synth_collate_fn,
                             pin_memory=True)

    model = Tacotron().to(DEVICE)

    pattern = os.path.join(args.logdir, model.name, 'model-*.tar')
    # NOTE(review): "latest" is decided by lexicographic filename order; this
    # presumes checkpoint names sort chronologically -- confirm against the
    # code that writes them.
    checkpoints = sorted(glob.glob(pattern))
    if not checkpoints:
        raise FileNotFoundError(
            'No checkpoint found matching {}'.format(pattern))
    model_path = checkpoints[-1]  # latest model

    # Deserialize onto CPU so a checkpoint saved on a GPU machine can still be
    # restored on a CPU-only host; load_state_dict copies the values into the
    # parameters that already live on DEVICE.
    state = torch.load(model_path, map_location=lambda storage, loc: storage)
    model.load_state_dict(state['model'])
    args.global_step = state['global_step']

    print('The model is loaded. Step: {}'.format(args.global_step))

    model.eval()

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(os.path.join(args.sampledir, 'A'), exist_ok=True)
    synthesize(model, test_loader, args.test_batch)
コード例 #2
0
def main():
    """Synthesize ``args.text`` with a trained Tacotron and save it as a wav.

    Loads the weights stored under the ``'model'`` key of the checkpoint at
    ``args.model_path``, runs ``tts`` on the text and writes the result to
    ``<args.out_path>/<text with spaces replaced by underscores>.wav``.
    """
    ap = AudioProcessor()

    # load model
    num_chars = len(phonemes)
    model = Tacotron(num_chars).to(device)
    # Deserialize onto CPU so a checkpoint saved on a GPU machine can still be
    # restored on a CPU-only host; load_state_dict copies the values into the
    # parameters that already live on `device`.
    checkpoint = torch.load(args.model_path,
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint['model'])
    model.eval()

    print('Text: {}'.format(args.text))
    wav = tts(model, args.text, ap)

    # The output file is named after the input text itself.
    file_name = args.text.replace(' ', '_') + '.wav'
    out_path = os.path.join(args.out_path, file_name)
    ap.save_wav(wav, out_path)
コード例 #3
0
ファイル: generate.py プロジェクト: ktho22/vctts
# GPU is used only when --gpu was supplied; otherwise fall back to CPU and
# normalise args.gpu to an empty list.
args.use_gpu = args.gpu is not None
if args.use_gpu:
    torch.cuda.manual_seed(args.seed)
    torch.cuda.set_device(args.gpu[0])
else:
    args.gpu = []

# Build the network and, when a checkpoint path was given, restore its weights
# and reset the decoder states.
model = Tacotron(args)
if args.init_from:
    model.load_state_dict(checkpoint['state_dict'])
    model.reset_decoder_states()
    print('loaded checkpoint %s' % (args.init_from))

stft = STFT(filter_length=args.n_fft)
model = model.eval()
if args.use_gpu:
    # Move both modules to the selected CUDA device.
    model, stft = model.cuda(), stft.cuda()


def main():
    """Build the one-item-per-batch data loader and derive the model name."""
    dataset = TTSDataset()
    batcher = collate_class(use_txt=args.use_txt)
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=1,
        shuffle=False,
        collate_fn=batcher.fn,
        drop_last=True,
    )
    # Last '/'-separated component of the checkpoint path, minus its final
    # three characters.
    checkpoint_file = args.init_from.split('/')[-1]
    model_name = checkpoint_file[:-3]
コード例 #4
0
ファイル: generate.py プロジェクト: root20/Tacotron_pytorch
def main():
    """Generate speech from a trained Tacotron checkpoint.

    Parses the command line, restores the ``args`` object stored in the
    checkpoint given by ``--init_from`` (values from the command line override
    the stored ones), loads the model weights and then either

    * synthesizes ``--caption`` and writes one wav plus one attention plot, or
    * when no caption is given, skips ``--batch_idx`` training batches, runs
      the model on the next one with ``target_mel[:, :-1]`` as decoder input,
      prints the L1 loss and writes a wav and an attention plot per batch
      item.

    All output paths are built from ``--out_dir`` and ``--exp_no``.
    """
    parser = argparse.ArgumentParser(description='training script')
    # data load
    parser.add_argument('--data', type=str, default='blizzard', help='blizzard / etri')
    parser.add_argument('--batch_size', type=int, default=6, help='batch size')
    parser.add_argument('--text_limit', type=int, default=1500, help='maximum length of text to include in training set')
    parser.add_argument('--wave_limit', type=int, default=800, help='maximum length of spectrogram to include in training set')
    parser.add_argument('--shuffle_data', type=int, default=0, help='whether to shuffle data loader')
    parser.add_argument('--batch_idx', type=int, default=0, help='n-th batch of the dataset')
    parser.add_argument('--load_queue_size', type=int, default=1, help='maximum number of batches to load on the memory')
    parser.add_argument('--n_workers', type=int, default=1, help='number of workers used in data loader')
    # generation option
    parser.add_argument('--exp_no', type=int, default=0, help='')
    parser.add_argument('--out_dir', type=str, default='generated', help='')
    parser.add_argument('--init_from', type=str, default='', help='load parameters from...')
    parser.add_argument('--caption', type=str, default='', help='text to generate speech')
    parser.add_argument('--teacher_forcing_ratio', type=float, default=0, help='value between 0~1, use this for scheduled sampling')
    # audio related option
    parser.add_argument('--n_fft', type=int, default=2048, help='fft bin size')
    parser.add_argument('--sample_rate', type=int, default=16000, help='sampling rate')
    parser.add_argument('--frame_len_inMS', type=int, default=50, help='used to determine window size of fft')
    # float, not int: the default itself is fractional (12.5 ms) and an int
    # converter would reject such a value when passed on the command line.
    parser.add_argument('--frame_shift_inMS', type=float, default=12.5, help='used to determine stride in sfft')
    parser.add_argument('--num_recon_iters', type=int, default=50, help='# of iteration in griffin-lim recon')
    # misc
    parser.add_argument('--gpu', type=int, nargs='+', help='index of gpu machines to run')
    parser.add_argument('--seed', type=int, default=0, help='random seed')
    new_args = vars(parser.parse_args())

    # The stored training arguments live inside the checkpoint, so nothing can
    # proceed without one; fail with a usage message instead of letting
    # torch.load choke on an empty path.
    if not new_args['init_from']:
        parser.error('--init_from is required (path to a trained checkpoint)')

    # load and override some arguments
    # map_location keeps every tensor on CPU regardless of where it was saved.
    checkpoint = torch.load(new_args['init_from'], map_location=lambda storage, loc: storage)
    args = checkpoint['args']
    vars(args).update(new_args)

    torch.manual_seed(args.seed)

    # set dataset option
    if args.data == 'blizzard':
        args.dir_bin = '/data2/lyg0722/TTS_corpus/blizzard/segmented/bin/'
    elif args.data == 'etri':
        args.dir_bin = '/data2/lyg0722/TTS_corpus/etri/bin/'
    else:
        print('no dataset')
        return

    if args.gpu is None:
        args.use_gpu = False
        args.gpu = []
    else:
        args.use_gpu = True
        # Seed CUDA with the user-selected seed, matching the CPU generator.
        torch.cuda.manual_seed(args.seed)
        torch.cuda.set_device(args.gpu[0])

    model = Tacotron(args)
    criterion_mel = nn.L1Loss(size_average=False)
    criterion_lin = nn.L1Loss(size_average=False)

    # Convert the millisecond settings into sample counts.
    window_len = int(np.ceil(args.frame_len_inMS * args.sample_rate / 1000))
    hop_length = int(np.ceil(args.frame_shift_inMS * args.sample_rate / 1000))

    # --init_from is guaranteed non-empty by the check above.
    model.load_state_dict(checkpoint['state_dict'])
    print('loaded checkpoint %s' % (args.init_from))

    model = model.eval()

    if args.use_gpu:
        model = model.cuda()
        criterion_mel = criterion_mel.cuda()
        criterion_lin = criterion_lin.cuda()

    if args.caption:
        text_raw = args.caption

        if args.data == 'etri':
            text_raw = decompose_hangul(text_raw)       # For Korean dataset

        vocab_dict = torch.load(args.dir_bin + 'vocab.t7')

        # Map every character to its vocabulary index.
        enc_input = [vocab_dict[ch] for ch in text_raw]
        enc_input = enc_input + [0]                                   # null-padding at tail
        text_lengths = [len(enc_input)]
        enc_input = Variable(torch.LongTensor(enc_input).view(1, -1))

        dec_input = torch.Tensor(1, 1, args.dec_out_size).fill_(0)          # null-padding for start flag
        dec_input = Variable(dec_input)
        wave_lengths = [args.wave_limit]        # TODO: use <EOS> later...

        prev_h = (None, None, None)  # set prev_h = h_0 when new sentences are loaded

        if args.use_gpu:
            enc_input = enc_input.cuda()
            dec_input = dec_input.cuda()

        _, pred_lin, prev_h = model(enc_input, dec_input, wave_lengths, text_lengths, prev_h)

        # start generation
        wave = spectrogram2wav(
            pred_lin.data.view(-1, args.post_out_size).cpu().numpy(),
            n_fft=args.n_fft,
            win_length=window_len,
            hop_length=hop_length,
            num_iters=args.num_recon_iters
        )

        # write to file
        outpath1 = '%s/%s_%s.wav' % (args.out_dir, args.exp_no, args.caption)
        outpath2 = '%s/%s_%s.png' % (args.out_dir, args.exp_no, args.caption)
        # Use the configured rate: window and hop length above were derived
        # from it, so a hard-coded value would disagree with them.
        librosa.output.write_wav(outpath1, wave, args.sample_rate)
        saveAttention(text_raw, torch.cat(model.attn_weights, dim=-1).squeeze(), outpath2)
    else:
        loader = DataLoader(args)
        args.vocab_size = loader.get_num_vocab()

        # Bind prev_h up front so it exists even if the loader does not report
        # a sub-batch boundary on the first pass.
        prev_h = (None, None, None)

        for _ in range(1, loader.iter_per_epoch + 1):
            if loader.is_subbatch_end:
                prev_h = (None, None, None)  # set prev_h = h_0 when new sentences are loaded

            # Skip ahead so that the batch fetched below is the requested one.
            for _skip in range(args.batch_idx):
                loader.next_batch('train')

            enc_input, target_mel, target_lin, wave_lengths, text_lengths = loader.next_batch('train')
            enc_input = Variable(enc_input, volatile=True)
            target_mel = Variable(target_mel, volatile=True)
            target_lin = Variable(target_lin, volatile=True)

            prev_h = loader.mask_prev_h(prev_h)

            if args.use_gpu:
                enc_input = enc_input.cuda()
                target_mel = target_mel.cuda()
                target_lin = target_lin.cuda()

            # Decoder input is the target shifted by one frame; the loss is
            # taken against the frames from index 1 onwards.
            pred_mel, pred_lin, prev_h = model(enc_input, target_mel[:, :-1], wave_lengths, text_lengths, prev_h)

            loss_mel = criterion_mel(pred_mel, target_mel[:, 1:]) \
                .div(max(wave_lengths) * args.batch_size * args.dec_out_size)
            loss_linear = criterion_lin(pred_lin, target_lin[:, 1:]) \
                .div(max(wave_lengths) * args.batch_size * args.post_out_size)
            loss = torch.sum(loss_mel + loss_linear)

            print('loss:' , loss.data[0])

            attentions = torch.cat(model.attn_weights, dim=-1)

            # write to file
            for n in range(enc_input.size(0)):
                wave = spectrogram2wav(
                    pred_lin.data[n].view(-1, args.post_out_size).cpu().numpy(),
                    n_fft=args.n_fft,
                    win_length=window_len,
                    hop_length=hop_length,
                    num_iters=args.num_recon_iters
                )
                outpath1 = '%s/%s_%s_%s.wav' % (args.out_dir, args.exp_no, n, args.caption)
                librosa.output.write_wav(outpath1, wave, args.sample_rate)
                outpath2 = '%s/%s_%s_%s.png' % (args.out_dir, args.exp_no, n, args.caption)
                saveAttention(None, attentions[n], outpath2)

            # showPlot(plot_losses)
            # Only a single batch is generated per run.
            break