Example #1
def main():
    model = Tacotron().to(DEVICE)
    print('Model {} is working...'.format(model.name))
    print('{} threads are used...'.format(torch.get_num_threads()))
    ckpt_dir = os.path.join(args.logdir, model.name)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer,
                       step_size=args.lr_decay_step // 10,
                       gamma=0.933)  # 0.933**10 ≈ 0.5, so lr roughly halves every lr_decay_step steps

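    # Start fresh if no checkpoint CSV exists yet; otherwise resume from the
    # lowest-loss checkpoint recorded in ckpt.csv.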
    if not os.path.exists(ckpt_dir):
        os.makedirs(os.path.join(ckpt_dir, 'A', 'train'))
    elif not os.path.exists(os.path.join(ckpt_dir, 'ckpt.csv')):
        shutil.rmtree(ckpt_dir)
        os.makedirs(os.path.join(ckpt_dir, 'A', 'train'))
    else:
        print('Checkpoint directory already exists. Resuming training.')
        ckpt = pd.read_csv(os.path.join(ckpt_dir, 'ckpt.csv'),
                           sep=',',
                           header=None)
        ckpt.columns = ['models', 'loss']
        ckpt = ckpt.sort_values(by='loss', ascending=True)
        state = torch.load(os.path.join(ckpt_dir, ckpt.models.iloc[0]))  # lowest-loss checkpoint
        model.load_state_dict(state['model'])
        args.global_step = state['global_step']
        optimizer.load_state_dict(state['optimizer'])
        scheduler.load_state_dict(state['scheduler'])

    # model = torch.nn.DataParallel(model, device_ids=list(range(args.no_gpu))).to(DEVICE)

    dataset = SpeechDataset(args.data_path,
                            args.meta_train,
                            model.name,
                            mem_mode=args.mem_mode)
    validset = SpeechDataset(args.data_path,
                             args.meta_eval,
                             model.name,
                             mem_mode=args.mem_mode)
    data_loader = DataLoader(dataset=dataset,
                             batch_size=args.batch_size,
                             shuffle=True,
                             collate_fn=collate_fn,
                             drop_last=True,
                             pin_memory=True)
    valid_loader = DataLoader(dataset=validset,
                              batch_size=args.test_batch,
                              shuffle=False,
                              collate_fn=collate_fn,
                              pin_memory=True)

    writer = SummaryWriter(ckpt_dir)
    train(model,
          data_loader,
          valid_loader,
          optimizer,
          scheduler,
          batch_size=args.batch_size,
          ckpt_dir=ckpt_dir,
          writer=writer)
    return None
Example #2
def main():
    testset = TextDataset(args.testset)
    test_loader = DataLoader(dataset=testset,
                             batch_size=args.test_batch,
                             drop_last=False,
                             shuffle=False,
                             collate_fn=synth_collate_fn,
                             pin_memory=True)

    model = Tacotron().to(DEVICE)

    model_path = sorted(
        glob.glob(os.path.join(args.logdir, model.name,
                               'model-*.tar')))[-1]  # latest model
    state = torch.load(model_path)
    model.load_state_dict(state['model'])
    args.global_step = state['global_step']

    print('The model is loaded. Step: {}'.format(args.global_step))

    model.eval()

    if not os.path.exists(os.path.join(args.sampledir, 'A')):
        os.makedirs(os.path.join(args.sampledir, 'A'))
    synthesize(model, test_loader, args.test_batch)
Example #3
def main():
    ap = AudioProcessor()

    # load model
    num_chars = len(phonemes)
    model = Tacotron(num_chars).to(device)
    cp = torch.load(args.model_path)
    model.load_state_dict(cp['model'])
    model.eval()

    print('Text: {}'.format(args.text))
    wav = tts(model, args.text, ap)

    file_name = args.text.replace(' ', '_') + '.wav'
    out_path = os.path.join(args.out_path, file_name)
    ap.save_wav(wav, out_path)
Example #4
                                         pin_memory=hparams.pin_memory)

    valset = PyTorchDataset(X_val, Mel_val, Y_val)
    val_loader = data_utils.DataLoader(valset,
                                       batch_size=hparams.batch_size,
                                       num_workers=hparams.num_workers,
                                       shuffle=True,
                                       collate_fn=collate_fn_phonesNqF0s,
                                       pin_memory=hparams.pin_memory)

    # Model
    model = Tacotron(
        n_vocab=1 + len(ph_ids),
        embedding_dim=256,
        mel_dim=hparams.num_mels,
        linear_dim=hparams.num_freq,
        r=hparams.outputs_per_step,
        padding_idx=hparams.padding_idx,
        use_memory_mask=hparams.use_memory_mask,
    )
    model = model.cuda()
    #model = DataParallelFix(model)

    optimizer = optim.Adam(model.parameters(),
                           lr=hparams.initial_learning_rate,
                           betas=(hparams.adam_beta1, hparams.adam_beta2),
                           weight_decay=hparams.weight_decay)

    # Load checkpoint
    if checkpoint_path:
        print("Load checkpoint from: {}".format(checkpoint_path))
Example #5
def main():
    # DataSet Loader
    if args.dataset == "ljspeech":
        from datasets.ljspeech import LJSpeech

        # LJSpeech-1.1 dataset loader
        ljs = LJSpeech(
            path=cfg.dataset_path,
            save_to='npy',
            load_from=None
            if not os.path.exists(cfg.dataset_path + "/npy") else "npy",
            verbose=cfg.verbose)
    else:
        raise NotImplementedError("[-] Not Implemented Yet...")

    # Train/Test split
    tr_size = int(len(ljs) * (1. - cfg.test_size))

    tr_text_data, va_text_data = \
        ljs.text_data[:tr_size], ljs.text_data[tr_size:]
    tr_text_len_data, va_text_len_data = \
        ljs.text_len_data[:tr_size], ljs.text_len_data[tr_size:]
    tr_mels, va_mels = ljs.mels[:tr_size], ljs.mels[tr_size:]
    tr_mags, va_mags = ljs.mags[:tr_size], ljs.mags[tr_size:]

    del ljs  # memory release

    # Data Iterator
    di = DataIterator(text=tr_text_data,
                      text_len=tr_text_len_data,
                      mel=tr_mels,
                      mag=tr_mags,
                      batch_size=cfg.batch_size)

    if cfg.verbose:
        print("[*] Train/Test split : %d/%d (%.2f/%.2f)" %
              (tr_text_data.shape[0], va_text_data.shape[0],
               1. - cfg.test_size, cfg.test_size))
        print("  Train")
        print("\ttext     : ", tr_text_data.shape)
        print("\ttext_len : ", tr_text_len_data.shape)
        print("\tmels     : ", tr_mels.shape)
        print("\tmags     : ", tr_mags.shape)
        print("  Test")
        print("\ttext     : ", va_text_data.shape)
        print("\ttext_len : ", va_text_len_data.shape)
        print("\tmels     : ", va_mels.shape)
        print("\tmags     : ", va_mags.shape)

    # Model Loading
    gpu_config = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False,
                            gpu_options=gpu_config)

    with tf.Session(config=config) as sess:
        if cfg.model == "Tacotron":
            model = Tacotron(sess=sess,
                             mode=args.mode,
                             sample_rate=cfg.sample_rate,
                             vocab_size=cfg.vocab_size,
                             embed_size=cfg.embed_size,
                             n_mels=cfg.n_mels,
                             n_fft=cfg.n_fft,
                             reduction_factor=cfg.reduction_factor,
                             n_encoder_banks=cfg.n_encoder_banks,
                             n_decoder_banks=cfg.n_decoder_banks,
                             n_highway_blocks=cfg.n_highway_blocks,
                             lr=cfg.lr,
                             lr_decay=cfg.lr_decay,
                             optimizer=cfg.optimizer,
                             grad_clip=cfg.grad_clip,
                             model_path=cfg.model_path)
        else:
            raise NotImplementedError("[-] Not Implemented Yet...")

        if cfg.verbose:
            print("[*] %s model is loaded!" % cfg.model)

        # Initializing
        sess.run(tf.global_variables_initializer())

        # Load model & Graph & Weights
        global_step = 0
        ckpt = tf.train.get_checkpoint_state(cfg.model_path)
        if ckpt and ckpt.model_checkpoint_path:
            # Restores from checkpoint
            model.saver.restore(sess, ckpt.model_checkpoint_path)

            global_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            print("[+] global step : %d" % global_step, " successfully loaded")
        else:
            print('[-] No checkpoint file found')

        start_time = time.time()

        best_loss = np.inf
        batch_size = cfg.batch_size
        sess.run(model.global_step.assign(global_step))
        increment_global_step = model.global_step.assign_add(1)  # build this op once; creating it inside the loop would grow the graph
        restored_epochs = global_step // (di.text.shape[0] // batch_size)
        for epoch in range(restored_epochs, cfg.epochs):
            for text, text_len, mel, mag in di.iterate():
                batch_start = time.time()
                _, y_loss, z_loss = sess.run(
                    [model.train_op, model.y_loss, model.z_loss],
                    feed_dict={
                        model.x: text,
                        model.x_len: text_len,
                        model.y: mel,
                        model.z: mag,
                    })
                batch_end = time.time()

                if global_step and global_step % cfg.logging_step == 0:
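                    # Validate on the held-out split, write summaries, plot alignments, and checkpoint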
                    va_y_loss, va_z_loss = 0., 0.

                    va_batch = 20
                    n_va_batches = len(va_text_data) // va_batch
                    for idx in range(n_va_batches):
                        lo, hi = va_batch * idx, va_batch * (idx + 1)
                        va_y, va_z = sess.run(
                            [model.y_loss, model.z_loss],
                            feed_dict={
                                model.x: va_text_data[lo:hi],
                                model.x_len: va_text_len_data[lo:hi],
                                model.y: va_mels[lo:hi],
                                model.z: va_mags[lo:hi],
                            })

                        va_y_loss += va_y
                        va_z_loss += va_z

                    va_y_loss /= n_va_batches
                    va_z_loss /= n_va_batches

                    print("[*] epoch %03d global step %07d [%.03f sec/step]" %
                          (epoch, global_step, batch_end - batch_start))
                    print("  Train y_loss : {:.6f} z_loss : {:.6f}".format(y_loss, z_loss))
                    print("  Valid y_loss : {:.6f} z_loss : {:.6f}".format(va_y_loss, va_z_loss))

                    # summary
                    summary = sess.run(model.merged,
                                       feed_dict={
                                           model.x:
                                           va_text_data[:batch_size],
                                           model.x_len:
                                           va_text_len_data[:batch_size],
                                           model.y:
                                           va_mels[:batch_size],
                                           model.z:
                                           va_mags[:batch_size],
                                       })

                    # getting/plotting alignment (important)
                    alignment = sess.run(model.alignments,
                                         feed_dict={
                                             model.x:
                                             va_text_data[:batch_size],
                                             model.x_len:
                                             va_text_len_data[:batch_size],
                                             model.y:
                                             va_mels[:batch_size],
                                         })

                    plot_alignment(alignments=alignment,
                                   gs=global_step,
                                   path=os.path.join(cfg.model_path,
                                                     "alignments"))

                    # Summary saver
                    model.writer.add_summary(summary, global_step)

                    # Model save
                    model.saver.save(sess,
                                     os.path.join(cfg.model_path, '%s.ckpt' % cfg.model),
                                     global_step=global_step)

                    if va_y_loss + va_z_loss < best_loss:
                        model.best_saver.save(sess,
                                              os.path.join(cfg.model_path,
                                                           '%s-best_loss.ckpt' % cfg.model),
                                              global_step=global_step)
                        best_loss = va_y_loss + va_z_loss

                sess.run(increment_global_step)
                global_step += 1

        end_time = time.time()

        print("[+] Training Done! Elapsed {:.8f}s".format(end_time -
                                                          start_time))
Example #6
    file_name_suffix = args["--file-name-suffix"]

    checkpoint = torch.load(checkpoint_path)
    checkpoints_dir = os.path.dirname(checkpoint_path)
    with open(checkpoints_dir + '/ids_phones.json') as f:
        phids = dict(json.load(f))

    with open(checkpoints_dir + '/spk_ids') as f:
        speakers_dict = json.load(f)

    model = Tacotron(n_vocab=len(phids) + 1,
                     embedding_dim=256,
                     mel_dim=hparams.num_mels,
                     linear_dim=hparams.num_freq,
                     r=hparams.outputs_per_step,
                     padding_idx=hparams.padding_idx,
                     use_memory_mask=hparams.use_memory_mask,
                     num_spk=len(speakers_dict))

    model.load_state_dict(checkpoint["state_dict"])
    model.decoder.max_decoder_steps = max_decoder_steps

    ids2speakers = {v: k for (k, v) in speakers_dict.items()}
Example #7
                                         collate_fn=collate_fn_spk,
                                         pin_memory=hparams.pin_memory)

    phi_loader = data_utils.DataLoader(phiset,
                                       batch_size=hparams.batch_size,
                                       num_workers=hparams.num_workers,
                                       shuffle=True,
                                       collate_fn=collate_fn_spk,
                                       pin_memory=hparams.pin_memory)

    # Model
    theta_model = learn2learn.algorithms.MAML(
        Tacotron(
            n_vocab=1 + len(ph_ids),
            num_spk=2,
            embedding_dim=256,
            mel_dim=hparams.num_mels,
            linear_dim=hparams.num_freq,
            r=hparams.outputs_per_step,
            padding_idx=hparams.padding_idx,
            use_memory_mask=hparams.use_memory_mask,
        ),
        lr=0.01,
        allow_unused=True)
    theta_model = theta_model.cuda()

    phi_model = Tacotron(
        n_vocab=1 + len(ph_ids),
        num_spk=2,
        embedding_dim=256,
        mel_dim=hparams.num_mels,
        linear_dim=hparams.num_freq,
        r=hparams.outputs_per_step,
Example #8
def main():
    ap = AudioProcessor()

    train_dataset = TTSDataset('data/LJSpeech-1.1',
                               'train.list',
                               outputs_per_step=r)
    valid_dataset = TTSDataset('data/LJSpeech-1.1',
                               'valid.list',
                               outputs_per_step=r)

    print('train data:', len(train_dataset))
    print('valid data:', len(valid_dataset))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=train_dataset.collate_fn,
        drop_last=False,
        num_workers=0,
        pin_memory=False)

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=valid_dataset.collate_fn,
        drop_last=False,
        num_workers=0,
        pin_memory=False)

    # Create models
    num_chars = len(phonemes)
    model = Tacotron(num_chars, r=r).to(device)

    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=0.0)
    # StopNet is a binary classification task, so it is trained with its own optimizer
    optimizer_st = optim.Adam(model.decoder.stopnet.parameters(),
                              lr=lr,
                              weight_decay=0.0)

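    # Masked L1 loss for the spectrogram targets; BCE-with-logits for the stop token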
    criterion = L1LossMasked()
    criterion_st = nn.BCEWithLogitsLoss()

    num_params = count_parameters(model)
    print('Model has {} parameters'.format(num_params))

    # Training
    best_loss = float('inf')
    global_step = 0
    for epoch in range(0, epochs + 1):
        train_loss, global_step = train(train_loader, model, criterion,
                                        criterion_st, optimizer, optimizer_st,
                                        ap, global_step, epoch)

        valid_loss = evaluate(valid_loader, model, criterion, criterion_st, ap,
                              global_step, epoch)

        print('Epoch [{}/{}] train_loss: {:.5f} valid_loss: {:.5f}'.format(
            epoch, epochs, train_loss, valid_loss))

        if valid_loss < best_loss:
            print('  => valid_loss improved from {:.5f} to {:.5f}!'.format(
                best_loss, valid_loss))
            new_state_dict = model.state_dict()
            state = {
                'model': new_state_dict,
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
                'linear_loss': valid_loss
            }
            best_loss = valid_loss
            best_model_path = os.path.join(writer.logdir, 'best_model.pth')
            torch.save(state, best_model_path)
Example #9
                        map_location=lambda storage, loc: storage)
args = checkpoint['args']
for i in new_args:
    args.__dict__[i] = new_args[i]

torch.manual_seed(args.seed)

if args.gpu is None:
    args.use_gpu = False
    args.gpu = []
else:
    args.use_gpu = True
    torch.cuda.manual_seed(args.seed)
    torch.cuda.set_device(args.gpu[0])

model = Tacotron(args)
if args.init_from:
    model.load_state_dict(checkpoint['state_dict'])
    model.reset_decoder_states()
    print('loaded checkpoint %s' % (args.init_from))

stft = STFT(filter_length=args.n_fft)
model = model.eval()
if args.use_gpu:
    model = model.cuda()
    stft = stft.cuda()


def main():
    db = TTSDataset()
    collate = collate_class(use_txt=args.use_txt)
Example #10
    checkpoint_path = args["<checkpoint>"]
    text_list_file_path = args["<text_list_file>"]
    dst_dir = args["<dst_dir>"]
    max_decoder_steps = int(args["--max-decoder-steps"])
    file_name_suffix = args["--file-name-suffix"]

    checkpoints_dir = os.path.dirname(checkpoint_path)
    with open(checkpoints_dir + '/ids_phones.json') as f:
        phids = json.load(f)

    with open(checkpoints_dir + '/ids_tones.json') as f:
        toneids = json.load(f)
    toneids = dict(toneids)

    checkpoint = torch.load(checkpoint_path)
    model = Tacotron(n_vocab=len(phids) + 1, n_tones=1 + len(toneids))
    model.load_state_dict(checkpoint["state_dict"])
    #model.decoder.max_decoder_steps = max_decoder_steps

    os.makedirs(dst_dir, exist_ok=True)

    with open(text_list_file_path, "rb") as f:
        lines = f.readlines()
        for idx, line in enumerate(lines):

            fname = line.decode("utf-8").split()[0].zfill(8)
            cmd = 'cp vox/wav/' + fname + '.wav ' + dst_dir + '/' + fname + '_original.wav'
            print(cmd)
            os.system(cmd)

            # Load phones
Example #11
def main():
    parser = argparse.ArgumentParser(description='training script')
    # data load
    parser.add_argument('--data', type=str, default='blizzard', help='blizzard / nancy')
    parser.add_argument('--batch_size', type=int, default=32, help='batch size')
    parser.add_argument('--text_limit', type=int, default=1000, help='maximum length of text to include in training set')
    parser.add_argument('--wave_limit', type=int, default=1400, help='maximum length of spectrogram to include in training set')
    parser.add_argument('--trunc_size', type=int, default=700, help='used for truncated-BPTT when memory is not enough.')
    parser.add_argument('--shuffle_data', type=int, default=1, help='whether to shuffle data loader')
    parser.add_argument('--load_queue_size', type=int, default=8, help='maximum number of batches to load on the memory')
    parser.add_argument('--n_workers', type=int, default=2, help='number of workers used in data loader')
    # model
    parser.add_argument('--charvec_dim', type=int, default=256, help='')
    parser.add_argument('--hidden_size', type=int, default=128, help='')
    parser.add_argument('--dec_out_size', type=int, default=80, help='decoder output size')
    parser.add_argument('--post_out_size', type=int, default=1025, help='should be n_fft / 2 + 1 (check n_fft from "input_specL")')
    parser.add_argument('--num_filters', type=int, default=16, help='number of filters in filter bank of CBHG')
    parser.add_argument('--r_factor', type=int, default=5, help='reduction factor (number of output frames per decoder step)')
    parser.add_argument('--dropout', type=float, default=0.5, help='')
    # optimization
    parser.add_argument('--max_epochs', type=int, default=100000, help='maximum epoch to train')
    parser.add_argument('--grad_clip', type=float, default=1, help='gradient clipping')
    parser.add_argument('--learning_rate', type=float, default=1e-3, help='2e-3 from Ito, I used to use 5e-4')
    parser.add_argument('--lr_decay_every', type=int, default=25000, help='decay learning rate every...')
    parser.add_argument('--lr_decay_factor', type=float, default=0.5, help='decay learning rate by this factor')
    parser.add_argument('--teacher_forcing_ratio', type=float, default=1, help='value between 0~1, use this for scheduled sampling')
    # loading
    parser.add_argument('--init_from', type=str, default='', help='load parameters from...')
    parser.add_argument('--resume', type=int, default=0, help='1 for resume from saved epoch')
    # misc
    parser.add_argument('--exp_no', type=int, default=0, help='')
    parser.add_argument('--print_every', type=int, default=-1, help='')
    parser.add_argument('--plot_every', type=int, default=-1, help='')
    parser.add_argument('--save_every', type=int, default=-1, help='')
    parser.add_argument('--save_dir', type=str, default='checkpoint', help='')
    parser.add_argument('--pinned_memory', type=int, default=1, help='1 to use pinned memory')
    parser.add_argument('--gpu', type=int, nargs='+', help='index of gpu machines to run')
    # debug
    parser.add_argument('--debug', type=int, default=0, help='1 for debug mode')
    args = parser.parse_args()

    torch.manual_seed(0)

    # set dataset option
    if args.data == 'blizzard':
        args.dir_bin = '/home/lyg0722/TTS_corpus/blizzard/segmented/bin/'
    elif args.data == 'etri':
        args.dir_bin = '/data2/lyg0722/TTS_corpus/etri/bin/'
    else:
        print('no dataset')
        return

    if args.gpu is None:
        args.use_gpu = False
        args.gpu = []
    else:
        args.use_gpu = True
        torch.cuda.manual_seed(0)
        torch.cuda.set_device(args.gpu[0])

    loader = DataLoader(args)

    # set misc options
    args.vocab_size = loader.get_num_vocab()
    if args.print_every == -1:
        args.print_every = loader.iter_per_epoch
    if args.plot_every == -1:
        args.plot_every = args.print_every
    if args.save_every == -1:
        args.save_every = loader.iter_per_epoch * 10    # save every 10 epochs by default

    model = Tacotron(args)
    model_optim = optim.Adam(model.parameters(), args.learning_rate)
    criterion_mel = nn.L1Loss(reduction='sum')
    criterion_lin = nn.L1Loss(reduction='sum')

    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    start_epoch = 0
    iter = 1

    if args.init_from:
        checkpoint = torch.load(args.init_from, map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint['state_dict'])
        if args.resume != 0:
            start_epoch = checkpoint['epoch']
            plot_losses = checkpoint['plot_losses']
        print('loaded checkpoint %s (epoch %d)' % (args.init_from, start_epoch))

    model = model.train()
    if args.use_gpu:
        model = model.cuda()
        criterion_mel = criterion_mel.cuda()
        criterion_lin = criterion_lin.cuda()

    print('Start training... (1 epoch = %s iters)' % (loader.iter_per_epoch))
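    # Each iteration consumes one truncated-BPTT sub-batch; decoder state (prev_h)
    # carries over between sub-batches and is reset when new sentences are loaded.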
    while iter < args.max_epochs * loader.iter_per_epoch + 1:
        if loader.is_subbatch_end:
            prev_h = (None, None, None)             # set prev_h = h_0 when new sentences are loaded
        enc_input, target_mel, target_lin, wave_lengths, text_lengths = loader.next_batch('train')

        max_wave_len = max(wave_lengths)

        enc_input = Variable(enc_input, requires_grad=False)
        target_mel = Variable(target_mel, requires_grad=False)
        target_lin = Variable(target_lin, requires_grad=False)

        prev_h = loader.mask_prev_h(prev_h)

        model_optim.zero_grad()
        pred_mel, pred_lin, prev_h = model(enc_input, target_mel[:, :-1], wave_lengths, text_lengths, prev_h)

        loss_mel = criterion_mel(pred_mel, target_mel[:, 1:])\
                        .div(max_wave_len * args.batch_size * args.dec_out_size)
        loss_linear = criterion_lin(pred_lin, target_lin[:, 1:])\
                        .div(max_wave_len * args.batch_size * args.post_out_size)
        loss = torch.sum(loss_mel + loss_linear)

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)         # gradient clipping
        model_optim.step()

        print_loss_total += loss.item()
        plot_loss_total += loss.item()

        if iter % args.print_every == 0:
            print_loss_avg = print_loss_total / args.print_every
            print_loss_total = 0
            progress = iter / (args.max_epochs * loader.iter_per_epoch)
            print('%s (%d %d%%) %.4f' % (timeSince(start, progress),
                                         iter, progress * 100, print_loss_avg))
        if iter % args.plot_every == 0:
            plot_loss_avg = plot_loss_total / args.plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

            save_name = '%s/%dth_exp_loss.png' % (args.save_dir, args.exp_no)
            savePlot(plot_losses, save_name)


        if iter % args.save_every == 0:
            epoch = start_epoch + iter // loader.iter_per_epoch
            save_name = '%s/%d_%dth.t7' % (args.save_dir, args.exp_no, epoch)
            state = {
                'epoch': epoch,
                'args': args,
                'state_dict': model.state_dict(),
                'optimizer': model_optim.state_dict(),
                'plot_losses': plot_losses
            }
            torch.save(state, save_name)
            print('model saved to', save_name)
            # if is_best:               # TODO: implement saving best model.
            #     shutil.copyfile(save_name, '%s/%d_best.t7' % (args.save_dir, args.exp_no))

        iter += 1
Example #12
        trainset, batch_size=hparams.batch_size,
        num_workers=hparams.num_workers, shuffle=True,
        collate_fn=collate_fn, pin_memory=hparams.pin_memory)

    valset = PyTorchDataset(X_val, Mel_val, Y_val)
    val_loader = data_utils.DataLoader(
        valset, batch_size=hparams.batch_size,
        num_workers=hparams.num_workers, shuffle=True,
        collate_fn=collate_fn, pin_memory=hparams.pin_memory)

    # Model
    model = Tacotron(n_vocab=1 + len(ph_ids),
                     embedding_dim=256,
                     mel_dim=hparams.num_mels,
                     linear_dim=hparams.num_freq,
                     r=hparams.outputs_per_step,
                     num_attention_heads=4,
                     num_encoder_layers=4,
                     padding_idx=hparams.padding_idx,
                     use_memory_mask=hparams.use_memory_mask,
                     )
    model = model.cuda()
    #model = DataParallelFix(model)

    optimizer = optim.Adam(model.parameters(),
                           lr=hparams.initial_learning_rate, betas=(
                               hparams.adam_beta1, hparams.adam_beta2),
                           weight_decay=hparams.weight_decay)

    # Load checkpoint
    if checkpoint_path:
        print("Load checkpoint from: {}".format(checkpoint_path))
Example #13
    checkpoint_path = args["<checkpoint>"]
    text_list_file_path = args["<text_list_file>"]
    dst_dir = args["<dst_dir>"]
    max_decoder_steps = int(args["--max-decoder-steps"])
    file_name_suffix = args["--file-name-suffix"]

    checkpoint = torch.load(checkpoint_path)
    checkpoints_dir = os.path.dirname(checkpoint_path)
    with open(checkpoints_dir + '/ids_phones.json') as f:
        phids = dict(json.load(f))

    model = Tacotron(n_vocab=len(phids) + 1,
                     embedding_dim=256,
                     mel_dim=80,
                     linear_dim=1025,
                     r=5,
                     padding_idx=hparams.padding_idx,
                     use_memory_mask=hparams.use_memory_mask,
                     )

    model.load_state_dict(checkpoint["state_dict"])
    model.decoder.max_decoder_steps = max_decoder_steps

    os.makedirs(dst_dir, exist_ok=True)

    with open(text_list_file_path, "rb") as f:
Example #14
def main():
    parser = argparse.ArgumentParser(description='training script')
    # data load
    parser.add_argument('--data', type=str, default='blizzard', help='blizzard / nancy')
    parser.add_argument('--batch_size', type=int, default=6, help='batch size')
    parser.add_argument('--text_limit', type=int, default=1500, help='maximum length of text to include in training set')
    parser.add_argument('--wave_limit', type=int, default=800, help='maximum length of spectrogram to include in training set')
    parser.add_argument('--shuffle_data', type=int, default=0, help='whether to shuffle data loader')
    parser.add_argument('--batch_idx', type=int, default=0, help='n-th batch of the dataset')
    parser.add_argument('--load_queue_size', type=int, default=1, help='maximum number of batches to load on the memory')
    parser.add_argument('--n_workers', type=int, default=1, help='number of workers used in data loader')
    # generation option
    parser.add_argument('--exp_no', type=int, default=0, help='')
    parser.add_argument('--out_dir', type=str, default='generated', help='')
    parser.add_argument('--init_from', type=str, default='', help='load parameters from...')
    parser.add_argument('--caption', type=str, default='', help='text to generate speech')
    parser.add_argument('--teacher_forcing_ratio', type=float, default=0, help='value between 0~1, use this for scheduled sampling')
    # audio related option
    parser.add_argument('--n_fft', type=int, default=2048, help='fft bin size')
    parser.add_argument('--sample_rate', type=int, default=16000, help='sampling rate')
    parser.add_argument('--frame_len_inMS', type=int, default=50, help='used to determine window size of fft')
    parser.add_argument('--frame_shift_inMS', type=float, default=12.5, help='used to determine stride in stft')
    parser.add_argument('--num_recon_iters', type=int, default=50, help='# of iteration in griffin-lim recon')
    # misc
    parser.add_argument('--gpu', type=int, nargs='+', help='index of gpu machines to run')
    parser.add_argument('--seed', type=int, default=0, help='random seed')
    new_args = vars(parser.parse_args())

    # load and override some arguments
    checkpoint = torch.load(new_args['init_from'], map_location=lambda storage, loc: storage)
    args = checkpoint['args']
    for i in new_args:
        args.__dict__[i] = new_args[i]

    torch.manual_seed(args.seed)

    # set dataset option
    if args.data == 'blizzard':
        args.dir_bin = '/data2/lyg0722/TTS_corpus/blizzard/segmented/bin/'
    elif args.data == 'etri':
        args.dir_bin = '/data2/lyg0722/TTS_corpus/etri/bin/'
    else:
        print('no dataset')
        return

    if args.gpu is None:
        args.use_gpu = False
        args.gpu = []
    else:
        args.use_gpu = True
        torch.cuda.manual_seed(args.seed)
        torch.cuda.set_device(args.gpu[0])

    model = Tacotron(args)
    criterion_mel = nn.L1Loss(reduction='sum')
    criterion_lin = nn.L1Loss(reduction='sum')

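    # Convert frame length/shift from milliseconds to samples for the STFT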
    window_len = int(np.ceil(args.frame_len_inMS * args.sample_rate / 1000))
    hop_length = int(np.ceil(args.frame_shift_inMS * args.sample_rate / 1000))

    if args.init_from:
        model.load_state_dict(checkpoint['state_dict'])
        print('loaded checkpoint %s' % (args.init_from))

    model = model.eval()

    if args.use_gpu:
        model = model.cuda()
        criterion_mel = criterion_mel.cuda()
        criterion_lin = criterion_lin.cuda()

    if args.caption:
        text_raw = args.caption

        if args.data == 'etri':
            text_raw = decompose_hangul(text_raw)       # For Korean dataset

        vocab_dict = torch.load(args.dir_bin + 'vocab.t7')

        enc_input = [vocab_dict[i] for i in text_raw]
        enc_input = enc_input + [0]                                   # null-padding at tail
        text_lengths = [len(enc_input)]
        enc_input = Variable(torch.LongTensor(enc_input).view(1,-1))

        dec_input = torch.Tensor(1, 1, args.dec_out_size).fill_(0)          # null-padding for start flag
        dec_input = Variable(dec_input)
        wave_lengths = [args.wave_limit]        # TODO: use <EOS> later...

        prev_h = (None, None, None)  # set prev_h = h_0 when new sentences are loaded

        if args.gpu:
            enc_input = enc_input.cuda()
            dec_input = dec_input.cuda()

        _, pred_lin, prev_h = model(enc_input, dec_input, wave_lengths, text_lengths, prev_h)

        # start generation
        wave = spectrogram2wav(
            pred_lin.data.view(-1, args.post_out_size).cpu().numpy(),
            n_fft=args.n_fft,
            win_length=window_len,
            hop_length=hop_length,
            num_iters=args.num_recon_iters
        )

        # write to file
        outpath1 = '%s/%s_%s.wav' % (args.out_dir, args.exp_no, args.caption)
        outpath2 = '%s/%s_%s.png' % (args.out_dir, args.exp_no, args.caption)
        librosa.output.write_wav(outpath1, wave, args.sample_rate)
        saveAttention(text_raw, torch.cat(model.attn_weights, dim=-1).squeeze(), outpath2)
    else:
        loader = DataLoader(args)
        args.vocab_size = loader.get_num_vocab()

        for iter in range(1, loader.iter_per_epoch + 1):
            if loader.is_subbatch_end:
                prev_h = (None, None, None)  # set prev_h = h_0 when new sentences are loaded

            for i in range(args.batch_idx):
                loader.next_batch('train')

            enc_input, target_mel, target_lin, wave_lengths, text_lengths = loader.next_batch('train')
            enc_input = Variable(enc_input, requires_grad=False)
            target_mel = Variable(target_mel, requires_grad=False)
            target_lin = Variable(target_lin, requires_grad=False)

            prev_h = loader.mask_prev_h(prev_h)

            if args.gpu:
                enc_input = enc_input.cuda()
                target_mel = target_mel.cuda()
                target_lin = target_lin.cuda()

            pred_mel, pred_lin, prev_h = model(enc_input, target_mel[:, :-1], wave_lengths, text_lengths, prev_h)

            loss_mel = criterion_mel(pred_mel, target_mel[:, 1:]) \
                .div(max(wave_lengths) * args.batch_size * args.dec_out_size)
            loss_linear = criterion_lin(pred_lin, target_lin[:, 1:]) \
                .div(max(wave_lengths) * args.batch_size * args.post_out_size)
            loss = torch.sum(loss_mel + loss_linear)

            print('loss:', loss.item())

            attentions = torch.cat(model.attn_weights, dim=-1)

            # write to file
            for n in range(enc_input.size(0)):
                wave = spectrogram2wav(
                    pred_lin.data[n].view(-1, args.post_out_size).cpu().numpy(),
                    n_fft=args.n_fft,
                    win_length=window_len,
                    hop_length=hop_length,
                    num_iters=args.num_recon_iters
                )
                outpath1 = '%s/%s_%s_%s.wav' % (args.out_dir, args.exp_no, n, args.caption)
                librosa.output.write_wav(outpath1, wave, args.sample_rate)
                outpath2 = '%s/%s_%s_%s.png' % (args.out_dir, args.exp_no, n, args.caption)
                saveAttention(None, attentions[n], outpath2)


            # showPlot(plot_losses)
            break
Example #15
    args = docopt(__doc__)
    print("Command line args:\n", args)

    checkpoint_path = args["<checkpoint>"]
    text_list_file_path = args["<text_list_file>"]
    dst_dir = args["<dst_dir>"]
    max_decoder_steps = int(args["--max-decoder-steps"])
    file_name_suffix = args["--file-name-suffix"]

    checkpoints_dir = os.path.dirname(checkpoint_path)
    with open(checkpoints_dir + '/ids_phones.json') as f:
        phids = dict(json.load(f))

    model = Tacotron(n_vocab=len(phids) + 1)
    checkpoint = torch.load(checkpoint_path)

    model.load_state_dict(checkpoint["state_dict"])
    #model.decoder.max_decoder_steps = max_decoder_steps

    os.makedirs(dst_dir, exist_ok=True)

    with open(text_list_file_path, "rb") as f:
        lines = f.readlines()
        for idx, line in enumerate(lines):
Example #16
def main():
    parser = argparse.ArgumentParser(description='training script')
    # Mandatory arguments
    parser.add_argument('--data', type=str, default='KEspeech', help='dataset type')
    parser.add_argument('-m', '--message', type=str, help='')

    # data load
    parser.add_argument('--batch_size', type=int, default=32, help='batch size')
    # model
    parser.add_argument('--charvec_dim', type=int, default=256, help='')
    parser.add_argument('--hidden_size', type=int, default=128, help='')
    parser.add_argument('--dec_out_size', type=int, default=80, help='decoder output size')
    parser.add_argument('--post_out_size', type=int, default=1025, help='should be n_fft / 2 + 1 (check n_fft from "input_specL")')
    parser.add_argument('--style_embed_size', type=int, default=32, help='size of the style embedding')
    parser.add_argument('--num_filters', type=int, default=16, help='number of filters in filter bank of CBHG')
    parser.add_argument('--r_factor', type=int, default=5, help='reduction factor (number of output frames per decoder step)')
    parser.add_argument('--use_txt', type=float, default=0.5, help='0~1, higher value means y_t batch is more sampled')
    # optimization
    parser.add_argument('--max_epochs', type=int, default=100000, help='maximum epoch to train')
    parser.add_argument('--grad_clip', type=float, default=1., help='gradient clipping')
    parser.add_argument('--learning_rate', type=float, default=1e-3, help='2e-3 from Ito, I used to use 5e-4')
    parser.add_argument('--teacher_forcing_ratio', type=float, default=1, help='value between 0~1, use this for scheduled sampling')
    # loading
    parser.add_argument('--init_from', type=str, default='', help='load parameters from...')
    parser.add_argument('--resume', type=int, default=0, help='1 for resume from saved epoch')
    # misc
    parser.add_argument('--print_every', type=int, default=10, help='')
    parser.add_argument('--save_every', type=int, default=10, help='')
    parser.add_argument('--save_dir', type=str, default='result', help='')
    parser.add_argument('-g', '--gpu', type=int, nargs='+', help='index of gpu machines to run')
    args = parser.parse_args()

    torch.manual_seed(0)

    kwargs = {'num_workers': 0, 'pin_memory': True}

    if args.gpu is None:
        args.use_gpu = False
        args.gpu = []
    else:
        args.use_gpu = True
        torch.cuda.manual_seed(0)
        torch.cuda.set_device(args.gpu[0])

    if args.data == 'KEspeech':
        dataset = TTSDataset()
    
    print('[*] Dataset: {}'.format(args.data))

    assert args.message is not None, "You have to set message"

    today = time.strftime('%y%m%d')
    savepath = join('result', '{}_{}'.format(today, args.message))
    if not exists(savepath):
        os.makedirs(savepath)
    elif args.message=='test':
        os.system("rm -rf {}/*".format(savepath))
    else:
        input("Path already exists, wish to continue?")
        os.system("rm -rf {}/*".format(savepath))
        os.system("rm -rf wandb/*{}*{}*".format(today, args.message))
    
    collate = collate_class(use_txt=args.use_txt)
    loader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, \
            shuffle=True, collate_fn=collate.fn, drop_last=True, **kwargs)

    # set misc options
    args.vocab_size = dataset.get_vocab_size()
    args.gender_num = len(dataset.gen_lu)
    args.age_num = len(dataset.age_lu)
    args.emotion_num = len(dataset.emo_lu)

    # model define
    model = Tacotron(args)
    model_optim = optim.Adam(model.parameters(), args.learning_rate)
    scheduler = lr_scheduler.StepLR(model_optim, step_size=10)
    criterion_mel = nn.L1Loss()
    criterion_lin = nn.L1Loss()

    # wandb
    wandb.init(project='disentangle_tts', name=args.message)
    wandb.config['hostname'] = os.uname()[1]
    wandb.config.update(args)
    wandb.watch(model)
    with open(join(savepath, 'model.txt'), 'w') as f:
        f.write(str(model))
    torch.save(args, join(savepath, 'arg.pt'))
    os.system('cp *.py {}'.format(savepath))

    start = time.time()
    iter_per_epoch = len(dataset)//args.batch_size
    losses = []
    loss_total = 0
    start_epoch = 0
    it = 1

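    # Warm-start from a pretrained checkpoint: keep the current model's weights
    # where shapes differ and drop checkpoint keys the model does not have.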
    if args.init_from:
        checkpoint = torch.load(args.init_from, map_location=lambda storage, loc: storage)
        pretrained_weight = checkpoint['state_dict'].copy()
        
        our_state_dict = model.state_dict()

        for k, v in checkpoint['state_dict'].items():
            if k in our_state_dict.keys():
                if checkpoint['state_dict'][k].shape != our_state_dict[k].shape:
                    pretrained_weight[k] = our_state_dict[k]
            else:
                del pretrained_weight[k]

        for k, v in our_state_dict.items():
            if k not in pretrained_weight.keys():
                pretrained_weight[k] = v

        model.load_state_dict(pretrained_weight)  # apply the merged weights

        for name, param in model.named_parameters():
            print('{}\t{}'.format(name, param.requires_grad))

        if args.resume:
            start_epoch = checkpoint['epoch']
            model_optim.load_state_dict(checkpoint['optimizer'])
            losses = checkpoint['plot_losses']
        print('loaded checkpoint %s (epoch %d)' % (args.init_from, start_epoch))

    epoch = start_epoch
    model = model.train()
    if args.use_gpu:
        model = model.cuda()
        criterion_mel = criterion_mel.cuda()
        criterion_lin = criterion_lin.cuda()


    print('Start training... {} iter per epoch'.format(iter_per_epoch))
    for epoch in range(start_epoch, args.max_epochs):
        for it, this_batch in enumerate(loader):
            start_it = time.time()

            if args.use_gpu:
                for k, v in this_batch.items():
                    try:
                        this_batch[k] = Variable(v.cuda(), requires_grad=False)
                    except AttributeError:
                        pass

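            # Manual per-iteration learning-rate decay (the StepLR scheduler above is never stepped)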
            for param_group in model_optim.param_groups:
                param_group['lr'] = decay_learning_rate(args.learning_rate, it, iter_per_epoch, start_epoch)

            model.reset_decoder_states()
            model.mask_decoder_states()
            model_optim.zero_grad()
            
            pred_mel, pred_lin, att = model(**this_batch)

            loss_mel = criterion_mel(pred_mel, this_batch['target_mel'])
            loss_linear = criterion_lin(pred_lin, this_batch['lin'])

            loss = loss_mel + loss_linear
            loss.backward()

            nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
            model_optim.step()
            #scheduler.step()

            losses.append(loss.data.item())
            loss_total += loss.data.item()

            if it % args.print_every == 0:
                seen_it = iter_per_epoch * epoch + it
                seen_samples = epoch * len(loader.dataset) + it * args.batch_size
                seen_epochs = seen_samples / float(len(loader.dataset))
                print('epoch: {:2d} iter: {:3d} loss: {:5.3f} elapsed: {}  periter: {:4.2f}s'.format(
                    epoch, it, np.mean(losses[-args.print_every:]), asHMS(time.time()-start), time.time()-start_it))

                log_dict = {
                        'epoch/train': seen_epochs,
                        'mel_loss/train': loss_mel.item(),
                        'lin_loss/train': loss_linear.item(),
                        'total_loss/train': loss.item(),
                        'att': wandb.Image(torch.cat(att, dim=-1)[0].detach().cpu().numpy().T, caption='Attention graph'),
                        }
                wandb.log(log_dict, step=seen_it)


        if epoch % args.save_every == 0:
            save_name = '{}/model_{}th.pt'.format(savepath, epoch)
            state = {
                'epoch': epoch,
                'args': args,
                'state_dict': model.state_dict(),
                'optimizer': model_optim.state_dict(),
                'plot_losses': losses
            }
            torch.save(state, save_name)
            print('model saved to', save_name)