Example #1
def main(args):
    # Check if the output folder exists
    if not os.path.exists(args.folder):
        os.mkdir(args.folder)

    # Load data
    torch.manual_seed(args.seed)
    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    train_loader = torch.utils.data.DataLoader(datasets.MNIST(
        './data', train=True, download=True, transform=transforms.ToTensor()),
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               **kwargs)

    # Build the model and optimizer
    model = CVAE().cuda() if torch.cuda.is_available() else CVAE()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # Train and generate samples every epoch
    loss_list = []
    for epoch in range(1, args.epochs + 1):
        model.train()
        _loss = train(epoch, model, train_loader, optimizer)
        loss_list.append(_loss)
        model.eval()
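        # Draw 100 latent vectors paired with digit labels 0-9 (ten of each) so
        # the decoder produces a 10x10 grid of class-conditional samples.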
        sample = torch.randn(100, 20)
        label = torch.from_numpy(np.asarray(list(range(10)) * 10))
        sample = Variable(
            sample).cuda() if torch.cuda.is_available() else Variable(sample)
        sample = model.decode(sample, label).cpu()
        save_image(sample.view(100, 1, 28, 28).data,
                   os.path.join(args.folder, 'sample_' + str(epoch) + '.png'),
                   nrow=10)
    plt.plot(range(len(loss_list)), loss_list, '-o')
    plt.savefig(os.path.join(args.folder, 'cvae_loss_curve.png'))
    torch.save(model.state_dict(), os.path.join(args.folder, 'cvae.pth'))
Example #2
def train(train_A_dir,
          train_B_dir,
          model_dir,
          model_name,
          random_seed,
          val_A_dir,
          val_B_dir,
          output_dir,
          tensorboard_dir,
          load_path,
          gen_eval=True):
    np.random.seed(random_seed)

    # For now, copy hyperparams used in the CycleGAN
    num_epochs = 100000
    mini_batch_size = 1  # mini_batch_size = 1 is better
    learning_rate = 0.0002
    learning_rate_decay = learning_rate / 200000
    sampling_rate = 16000
    num_mcep = 24
    frame_period = 5.0
    n_frames = 128
    lambda_cycle = 10
    lambda_identity = 5
    device = 'cuda'

    # Use the same pre-processing as the CycleGAN
    print("Begin Preprocessing")

    wavs_A = load_wavs(wav_dir=train_A_dir, sr=sampling_rate)
    wavs_B = load_wavs(wav_dir=train_B_dir, sr=sampling_rate)
    print("Finished Loading")

    f0s_A, timeaxes_A, sps_A, aps_A, coded_sps_A = world_encode_data(
        wavs=wavs_A,
        fs=sampling_rate,
        frame_period=frame_period,
        coded_dim=num_mcep)
    f0s_B, timeaxes_B, sps_B, aps_B, coded_sps_B = world_encode_data(
        wavs=wavs_B,
        fs=sampling_rate,
        frame_period=frame_period,
        coded_dim=num_mcep)
    print("Finished Encoding")

    log_f0s_mean_A, log_f0s_std_A = logf0_statistics(f0s_A)
    log_f0s_mean_B, log_f0s_std_B = logf0_statistics(f0s_B)

    print('Log Pitch A')
    print('Mean: %f, Std: %f' % (log_f0s_mean_A, log_f0s_std_A))
    print('Log Pitch B')
    print('Mean: %f, Std: %f' % (log_f0s_mean_B, log_f0s_std_B))

    coded_sps_A_transposed = transpose_in_list(lst=coded_sps_A)
    coded_sps_B_transposed = transpose_in_list(lst=coded_sps_B)

    coded_sps_A_norm, coded_sps_A_mean, coded_sps_A_std = coded_sps_normalization_fit_transoform(
        coded_sps=coded_sps_A_transposed)
    print("Input data fixed.")
    coded_sps_B_norm, coded_sps_B_mean, coded_sps_B_std = coded_sps_normalization_fit_transoform(
        coded_sps=coded_sps_B_transposed)

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    np.savez(os.path.join(model_dir, 'logf0s_normalization.npz'),
             mean_A=log_f0s_mean_A,
             std_A=log_f0s_std_A,
             mean_B=log_f0s_mean_B,
             std_B=log_f0s_std_B)
    np.savez(os.path.join(model_dir, 'mcep_normalization.npz'),
             mean_A=coded_sps_A_mean,
             std_A=coded_sps_A_std,
             mean_B=coded_sps_B_mean,
             std_B=coded_sps_B_std)

    if val_A_dir is not None:
        validation_A_output_dir = os.path.join(output_dir, 'converted_A')
        if not os.path.exists(validation_A_output_dir):
            os.makedirs(validation_A_output_dir)

    if val_B_dir is not None:
        validation_B_output_dir = os.path.join(output_dir, 'converted_B')
        if not os.path.exists(validation_B_output_dir):
            os.makedirs(validation_B_output_dir)

    print("End Preprocessing")

    if load_path is not None:
        model = CVAE(num_mcep, 128, num_mcep, 2)
        model.load_state_dict(torch.load(load_path))
        model.eval()
        if device == 'cuda':
            model.cuda()
        print("Loaded Model from path %s" % load_path)
        if val_A_dir is not None and gen_eval:
            print("Generating Evaluation Data")
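            # Conversion pipeline: WORLD analysis, log-F0 statistics matching
            # (A to B), MCEP normalization with the A statistics, CVAE conversion
            # from class 0 to class 1, de-normalization with the B statistics,
            # and WORLD re-synthesis.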
            for file in os.listdir(val_A_dir):
                filepath = os.path.join(val_A_dir, file)
                print(
                    "Converting {0} from Class 0 to Class 1".format(filepath))
                wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
                wav = wav_padding(wav=wav,
                                  sr=sampling_rate,
                                  frame_period=frame_period,
                                  multiple=4)
                f0, timeaxis, sp, ap = world_decompose(
                    wav=wav, fs=sampling_rate, frame_period=frame_period)
                f0_converted = pitch_conversion(f0=f0,
                                                mean_log_src=log_f0s_mean_A,
                                                std_log_src=log_f0s_std_A,
                                                mean_log_target=log_f0s_mean_B,
                                                std_log_target=log_f0s_std_B)
                coded_sp = world_encode_spectral_envelop(sp=sp,
                                                         fs=sampling_rate,
                                                         dim=num_mcep)
                coded_sp_transposed = coded_sp.T
                coded_sp_norm = (coded_sp_transposed -
                                 coded_sps_A_mean) / coded_sps_A_std
                coded_sp_converted_norm, _, _ = model.convert(
                    np.array([coded_sp_norm]), 0, 1, device)
                coded_sp_converted_norm = coded_sp_converted_norm.cpu().numpy()
                coded_sp_converted_norm = np.squeeze(coded_sp_converted_norm)
                coded_sp_converted = coded_sp_converted_norm * coded_sps_B_std + coded_sps_B_mean
                coded_sp_converted = coded_sp_converted.T
                coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
                decoded_sp_converted = world_decode_spectral_envelop(
                    coded_sp=coded_sp_converted, fs=sampling_rate)
                wav_transformed = world_speech_synthesis(
                    f0=f0_converted,
                    decoded_sp=decoded_sp_converted,
                    ap=ap,
                    fs=sampling_rate,
                    frame_period=frame_period)
                librosa.output.write_wav(
                    os.path.join(validation_A_output_dir,
                                 'eval_' + os.path.basename(file)),
                    wav_transformed, sampling_rate)
            exit(0)

    print("Begin Training")

    model = CVAE(num_mcep, 128, num_mcep, 2)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    writer = SummaryWriter(tensorboard_dir)

    if device == 'cuda':
        model.cuda()

    for epoch in tqdm(range(num_epochs)):
        dataset_A, dataset_B = sample_train_data(dataset_A=coded_sps_A_norm,
                                                 dataset_B=coded_sps_B_norm,
                                                 n_frames=n_frames)
        dataset_A = torch.tensor(dataset_A).to(torch.float)
        dataset_B = torch.tensor(dataset_B).to(torch.float)

        n_samples, input_dim, depth = dataset_A.shape
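        # Build per-frame one-hot class labels (class 0 for A, class 1 for B),
        # shaped (n_samples, 2, depth) to line up with the feature tensors.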
        y_A = F.one_hot(torch.zeros(depth).to(torch.int64),
                        num_classes=2).to(torch.float).T
        y_B = F.one_hot(torch.ones(depth).to(torch.int64),
                        num_classes=2).to(torch.float).T
        (y_A, y_B) = (y_A.reshape((1, 2, depth)), y_B.reshape((1, 2, depth)))
        y_A = torch.cat([y_A] * n_samples)
        y_B = torch.cat([y_B] * n_samples)

        # dataset_A = torch.cat((dataset_A, y_A), axis=1)
        # dataset_B = torch.cat((dataset_B, y_B), axis=1)

        X = torch.cat((dataset_A, dataset_B)).to(device)
        Y = torch.cat((y_A, y_B)).to(device)

        # out, z_mu, z_var = model(dataset_A, y_A)
        # rec_loss = F.binary_cross_entropy(out, dataset_A, size_average=False)
        # kl_diver = -0.5 * torch.sum(1 + z_var - z_mu.pow(2) - z_var.exp())
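        # Forward pass through the CVAE; the loss is the usual VAE objective:
        # reconstruction (binary cross-entropy) plus the KL divergence of the
        # approximate posterior from a standard normal prior.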
        out, z_mu, z_var = model(X, Y)

        rec_loss = F.binary_cross_entropy(out, X, size_average=False)
        kl_diver = -0.5 * torch.sum(1 + z_var - z_mu.pow(2) - z_var.exp())

        loss = rec_loss + kl_diver

        writer.add_scalar('Reconstruction Loss', rec_loss, epoch)
        writer.add_scalar('KL-Divergence', kl_diver, epoch)
        writer.add_scalar('Total Loss', loss, epoch)

        # print("loss = {0} || rec = {1} || kl = {2}".format(loss, rec_loss, kl_diver))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if val_A_dir is not None:
            if epoch % 1000 == 0:
                print('Generating Validation Data...')
                for file in os.listdir(val_A_dir):
                    filepath = os.path.join(val_A_dir, file)
                    print("Converting {0} from Class 0 to Class 1".format(
                        filepath))
                    wav, _ = librosa.load(filepath,
                                          sr=sampling_rate,
                                          mono=True)
                    wav = wav_padding(wav=wav,
                                      sr=sampling_rate,
                                      frame_period=frame_period,
                                      multiple=4)
                    f0, timeaxis, sp, ap = world_decompose(
                        wav=wav, fs=sampling_rate, frame_period=frame_period)
                    f0_converted = pitch_conversion(
                        f0=f0,
                        mean_log_src=log_f0s_mean_A,
                        std_log_src=log_f0s_std_A,
                        mean_log_target=log_f0s_mean_B,
                        std_log_target=log_f0s_std_B)
                    coded_sp = world_encode_spectral_envelop(sp=sp,
                                                             fs=sampling_rate,
                                                             dim=num_mcep)
                    coded_sp_transposed = coded_sp.T
                    coded_sp_norm = (coded_sp_transposed -
                                     coded_sps_A_mean) / coded_sps_A_std
                    coded_sp_converted_norm, _, _ = model.convert(
                        np.array([coded_sp_norm]), 0, 1, device)
                    coded_sp_converted_norm = coded_sp_converted_norm.cpu(
                    ).numpy()
                    coded_sp_converted_norm = np.squeeze(
                        coded_sp_converted_norm)
                    coded_sp_converted = coded_sp_converted_norm * coded_sps_B_std + coded_sps_B_mean
                    coded_sp_converted = coded_sp_converted.T
                    coded_sp_converted = np.ascontiguousarray(
                        coded_sp_converted)
                    decoded_sp_converted = world_decode_spectral_envelop(
                        coded_sp=coded_sp_converted, fs=sampling_rate)
                    wav_transformed = world_speech_synthesis(
                        f0=f0_converted,
                        decoded_sp=decoded_sp_converted,
                        ap=ap,
                        fs=sampling_rate,
                        frame_period=frame_period)
                    librosa.output.write_wav(
                        os.path.join(validation_A_output_dir,
                                     str(epoch) + '_' +
                                     os.path.basename(file)), wav_transformed,
                        sampling_rate)
                    break
        if epoch % 1000 == 0:
            print('Saving Checkpoint')
            filepath = os.path.join(model_dir, model_name)
            if not os.path.exists(filepath):
                os.makedirs(filepath)
            torch.save(model.state_dict(),
                       os.path.join(filepath, '{0}.ckpt'.format(epoch)))
Example #3
def test():
    model.eval()
    te_loss = 0
    batch_x, batch_y = create_batch(testx, testy)
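    # Accumulate the VAE loss over the test batches without tracking gradients.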
    with th.no_grad():
        for x, y in zip(batch_x, batch_y):
            if gpu:
                x, y = V(th.Tensor(x).cuda()), V(th.Tensor(y).cuda())
            else:
                x, y = V(th.Tensor(x)), V(th.Tensor(y))
            x_hat, mu, sigma = model(x, y)
            loss = Loss_function(x_hat, x, mu, sigma)
            te_loss += loss.item()

    return te_loss / test_N


tr_loss, te_loss = [], []
for epoch in range(max_epoch):
    trl = train()
    tel = test()
    tr_loss.append(trl)
    te_loss.append(tel)
    if epoch % 2 == 0:
        print(epoch, trl, tel)

th.save(model.state_dict(), f"save_model/vae_adadelta_{max_epoch}.pth")
plt.plot(tr_loss)
plt.plot(te_loss)
plt.show()
Example #4
            state_dict['outputs2vocab.weight'] = torch.randn(
                len(i2w), args.hidden_size * model.hidden_factor)
            state_dict['outputs2vocab.bias'] = torch.randn(len(i2w))

            print(state_dict['embedding.weight'].size(),
                  model.embedding.weight.size())
        model.load_state_dict(state_dict)
    else:
        model.embedding.weight.data.copy_(vocab.vectors)

    model = to_device(model)
    print(model)

    train(model, datasets, args)
    if args.save_model is not None:
        torch.save(model.state_dict(), args.save_model)

    if args.n_generated > 0:

        model.eval()
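        # Sample from the prior: inference() returns generated token ids,
        # latent codes, and one-hot intent labels for each sampled utterance.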

        samples, z, y_onehot = model.inference(n=args.n_generated)
        intent = y_onehot.data.max(1)[1].cpu().numpy()
        delexicalised = idx2word(samples, i2w=i2w, pad_idx=pad_idx)
        if args.input_type == 'delexicalised':
            labelling, utterance = surface_realisation(samples,
                                                       i2w=i2w,
                                                       pad_idx=pad_idx)
        print('----------GENERATED----------')
        for i in range(args.n_generated):
            print('Intent : ', i2int[intent[i]])
Example #5
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    pad_idx = vocab.word2idx['<pad>']
    sos_idx = vocab.word2idx['<start>']
    eos_idx = vocab.word2idx['<end>']
    unk_idx = vocab.word2idx['<unk>']

    # Build data loader
    train_data_loader, valid_data_loader = get_loader(
        args.train_image_dir,
        args.val_image_dir,
        args.train_caption_path,
        args.val_caption_path,
        vocab,
        args.batch_size,
        shuffle=True,
        num_workers=args.num_workers)

    def kl_anneal_function(anneal_function, step, k, x0):
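        # Anneal the KL weight toward 1 as training progresses, using either a
        # logistic or a linear schedule.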
        if anneal_function == 'logistic':
            # return float(1 / (1 + np.exp(-k * (step - x0))))
            return float(expit(k * (step - x0)))
        elif anneal_function == 'linear':
            return min(1, step / x0)

    nll = torch.nn.NLLLoss(ignore_index=pad_idx)

    def loss_fn(logp, target, length, mean, logv, anneal_function, step, k,
                x0):
        # cut-off unnecessary padding from target, and flatten
        target = target[:, :torch.max(length).item()].contiguous().view(-1)
        logp = logp.view(-1, logp.size(2))

        # Negative Log Likelihood
        nll_loss = nll(logp, target)

        # KL Divergence
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
        KL_weight = kl_anneal_function(anneal_function, step, k, x0)

        return nll_loss, KL_loss, KL_weight

    # Build the models
    model = CVAE(vocab_size=len(vocab),
                 embedding_size=args.embedding_size,
                 rnn_type=args.rnn_type,
                 hidden_size=args.hidden_size,
                 word_dropout=args.word_dropout,
                 embedding_dropout=args.embedding_dropout,
                 latent_size=args.latent_size,
                 max_sequence_length=args.max_sequence_length,
                 num_layers=args.num_layers,
                 bidirectional=args.bidirectional,
                 pad_idx=pad_idx,
                 sos_idx=sos_idx,
                 eos_idx=eos_idx,
                 unk_idx=unk_idx)
    model.to(device)
    # Loss and optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    # Train the models
    total_step = len(train_data_loader)
    step_for_kl_annealing = 0
    best_valid_loss = float("inf")
    patience = 0
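    # Early stopping: keep the best validation loss and stop once it has not
    # improved for 5 consecutive epochs.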

    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(train_data_loader):

            # Set mini-batch dataset
            images = images.to(device)
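            # Shift the captions for next-word prediction: the decoder input
            # drops the last token and the target drops the first (<start>) token.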
            captions_src = captions[:, :captions.size()[1] - 1]
            captions_tgt = captions[:, 1:]
            captions_src = captions_src.to(device)
            captions_tgt = captions_tgt.to(device)
            lengths = lengths - 1
            lengths = lengths.to(device)

            # Forward, backward and optimize
            logp, mean, logv, z = model(images, captions_src, lengths)

            # Loss calculation
            NLL_loss, KL_loss, KL_weight = loss_fn(logp, captions_tgt, lengths,
                                                   mean, logv,
                                                   args.anneal_function,
                                                   step_for_kl_annealing,
                                                   args.k, args.x0)

            loss = (NLL_loss + KL_weight * KL_loss) / args.batch_size

            # backward + optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            step_for_kl_annealing += 1

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch, args.num_epochs, i, total_step, loss.item(),
                            np.exp(loss.item())))
                outputs = model._sample(logp)
                outputs = outputs.cpu().numpy()

                # Convert word_ids to words
                sampled_caption = []
                ground_truth_caption = []
                for word_id in outputs[-1]:
                    word = vocab.idx2word[word_id]
                    sampled_caption.append(word)
                    if word == '<end>':
                        break

                captions_tgt = captions_tgt.cpu().numpy()
                for word_id in captions_tgt[-1]:
                    word = vocab.idx2word[word_id]
                    ground_truth_caption.append(word)
                    if word == '<end>':
                        break
                reconstructed = ' '.join(sampled_caption)
                ground_truth = ' '.join(ground_truth_caption)
                print("ground_truth: {0} \n reconstructed: {1}\n".format(
                    ground_truth, reconstructed))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(
                    model.state_dict(),
                    os.path.join(args.model_path,
                                 'model-{}-{}.ckpt'.format(epoch + 1, i + 1)))

        torch.save(
            model.state_dict(),
            os.path.join(args.model_path,
                         'model-{}-epoch.ckpt'.format(epoch + 1)))

        valid_loss = 0

        # Check against the validation set; stop early if the validation score does not improve within the patience period
        for j, (images, captions, lengths) in enumerate(valid_data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions_src = captions[:, :captions.size()[1] - 1]
            captions_tgt = captions[:, 1:]
            captions_src = captions_src.to(device)
            captions_tgt = captions_tgt.to(device)
            lengths = lengths - 1
            lengths = lengths.to(device)

            # Forward, backward and optimize
            logp, mean, logv, z = model(images, captions_src, lengths)

            # loss calculation
            NLL_loss, KL_loss, KL_weight = loss_fn(logp, captions_tgt, lengths,
                                                   mean, logv,
                                                   args.anneal_function,
                                                   step_for_kl_annealing,
                                                   args.k, args.x0)

            valid_loss += (NLL_loss + KL_weight * KL_loss) / args.batch_size

            if j == 2:
                break
        print("validation loss for epoch {}: {}".format(epoch + 1, valid_loss))
        print("patience is at {}".format(patience))
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            patience = 0
        else:
            patience += 1

        if patience == 5:
            print("early stopping at epoch {}".format(epoch + 1))
            break