def main(args):
    """Train a CVAE on MNIST and write per-epoch conditional samples,
    a loss curve, and the final weights into ``args.folder``.

    Args:
        args: parsed CLI namespace; uses ``folder``, ``seed``, ``cuda``,
            ``batch_size`` and ``epochs``.
    """
    # Ensure the output folder exists (exist_ok avoids the check/create race).
    os.makedirs(args.folder, exist_ok=True)

    # Load data.
    torch.manual_seed(args.seed)
    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=True, download=True,
                       transform=transforms.ToTensor()),
        batch_size=args.batch_size, shuffle=True, **kwargs)

    # Model and optimizer on a single resolved device.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = CVAE().to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # Train, then sample a 10x10 grid (ten samples per digit class) each epoch.
    loss_list = []
    for epoch in range(1, args.epochs + 1):
        model.train()
        loss_list.append(train(epoch, model, train_loader, optimizer))

        model.eval()
        # Sampling needs no gradients; no_grad avoids building a graph.
        with torch.no_grad():
            # assumes the CVAE latent dimension is 20 — TODO confirm.
            sample = torch.randn(100, 20, device=device)
            # BUG FIX: labels must live on the same device as the latent
            # sample, otherwise decode() crashes when running on CUDA.
            label = torch.from_numpy(
                np.asarray(list(range(10)) * 10)).to(device)
            sample = model.decode(sample, label).cpu()
        save_image(sample.view(100, 1, 28, 28),
                   os.path.join(args.folder, 'sample_' + str(epoch) + '.png'),
                   nrow=10)

    # Persist the loss curve and the trained weights.
    plt.plot(range(len(loss_list)), loss_list, '-o')
    plt.savefig(os.path.join(args.folder, 'cvae_loss_curve.png'))
    torch.save(model.state_dict(), os.path.join(args.folder, 'cvae.pth'))
def train(train_A_dir, train_B_dir, model_dir, model_name, random_seed,
          val_A_dir, val_B_dir, output_dir, tensorboard_dir, load_path,
          gen_eval=True):
    """Train a CVAE for two-speaker voice conversion (class 0 = A, 1 = B).

    Preprocessing (WORLD vocoder features, MCEP normalization, log-F0
    statistics) mirrors the CycleGAN-VC pipeline.  If ``load_path`` is given
    the function instead loads that checkpoint, optionally converts the
    validation-A set to speaker B, and exits the process.

    Args:
        train_A_dir / train_B_dir: directories of training wavs per speaker.
        model_dir: where normalization stats and checkpoints are written.
        model_name: sub-directory name for checkpoints.
        random_seed: numpy RNG seed.
        val_A_dir / val_B_dir: optional validation wav directories.
        output_dir: root for converted validation audio.
        tensorboard_dir: SummaryWriter log directory.
        load_path: optional checkpoint for evaluation-only mode.
        gen_eval: in evaluation-only mode, also convert the validation set.
    """
    np.random.seed(random_seed)

    # Hyperparameters taken from the CycleGAN-VC recipe (unused CycleGAN
    # leftovers — mini_batch_size, lr decay, cycle/identity lambdas — removed).
    num_epochs = 100000
    learning_rate = 0.0002
    sampling_rate = 16000
    num_mcep = 24
    frame_period = 5.0
    n_frames = 128
    device = 'cuda'

    # ---------- Preprocessing (same pipeline as the CycleGAN) ----------
    print("Begin Preprocessing")
    wavs_A = load_wavs(wav_dir=train_A_dir, sr=sampling_rate)
    wavs_B = load_wavs(wav_dir=train_B_dir, sr=sampling_rate)
    print("Finished Loading")
    f0s_A, timeaxes_A, sps_A, aps_A, coded_sps_A = world_encode_data(
        wavs=wavs_A, fs=sampling_rate, frame_period=frame_period,
        coded_dim=num_mcep)
    f0s_B, timeaxes_B, sps_B, aps_B, coded_sps_B = world_encode_data(
        wavs=wavs_B, fs=sampling_rate, frame_period=frame_period,
        coded_dim=num_mcep)
    print("Finished Encoding")

    log_f0s_mean_A, log_f0s_std_A = logf0_statistics(f0s_A)
    log_f0s_mean_B, log_f0s_std_B = logf0_statistics(f0s_B)
    print('Log Pitch A')
    print('Mean: %f, Std: %f' % (log_f0s_mean_A, log_f0s_std_A))
    print('Log Pitch B')
    print('Mean: %f, Std: %f' % (log_f0s_mean_B, log_f0s_std_B))

    coded_sps_A_transposed = transpose_in_list(lst=coded_sps_A)
    coded_sps_B_transposed = transpose_in_list(lst=coded_sps_B)
    coded_sps_A_norm, coded_sps_A_mean, coded_sps_A_std = \
        coded_sps_normalization_fit_transoform(
            coded_sps=coded_sps_A_transposed)
    print("Input data fixed.")
    coded_sps_B_norm, coded_sps_B_mean, coded_sps_B_std = \
        coded_sps_normalization_fit_transoform(
            coded_sps=coded_sps_B_transposed)

    # Persist normalization statistics so conversion can be reproduced later.
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    np.savez(os.path.join(model_dir, 'logf0s_normalization.npz'),
             mean_A=log_f0s_mean_A, std_A=log_f0s_std_A,
             mean_B=log_f0s_mean_B, std_B=log_f0s_std_B)
    np.savez(os.path.join(model_dir, 'mcep_normalization.npz'),
             mean_A=coded_sps_A_mean, std_A=coded_sps_A_std,
             mean_B=coded_sps_B_mean, std_B=coded_sps_B_std)

    if val_A_dir is not None:
        validation_A_output_dir = os.path.join(output_dir, 'converted_A')
        if not os.path.exists(validation_A_output_dir):
            os.makedirs(validation_A_output_dir)
    if val_B_dir is not None:
        validation_B_output_dir = os.path.join(output_dir, 'converted_B')
        if not os.path.exists(validation_B_output_dir):
            os.makedirs(validation_B_output_dir)
    print("End Preprocessing")

    # ---------- Evaluation-only mode: load checkpoint, convert, exit ----------
    if load_path is not None:
        model = CVAE(num_mcep, 128, num_mcep, 2)
        model.load_state_dict(torch.load(load_path))
        model.eval()
        if device == 'cuda':
            model.cuda()
        print("Loaded Model from path %s" % load_path)
        if val_A_dir is not None and gen_eval:
            print("Generating Evaluation Data")
            for file in os.listdir(val_A_dir):
                filepath = os.path.join(val_A_dir, file)
                print(
                    "Converting {0} from Class 0 to Class 1".format(filepath))
                wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
                wav = wav_padding(wav=wav, sr=sampling_rate,
                                  frame_period=frame_period, multiple=4)
                f0, timeaxis, sp, ap = world_decompose(
                    wav=wav, fs=sampling_rate, frame_period=frame_period)
                f0_converted = pitch_conversion(
                    f0=f0, mean_log_src=log_f0s_mean_A,
                    std_log_src=log_f0s_std_A,
                    mean_log_target=log_f0s_mean_B,
                    std_log_target=log_f0s_std_B)
                coded_sp = world_encode_spectral_envelop(
                    sp=sp, fs=sampling_rate, dim=num_mcep)
                coded_sp_transposed = coded_sp.T
                # Normalize with SOURCE (A) stats, de-normalize with TARGET (B).
                coded_sp_norm = (coded_sp_transposed -
                                 coded_sps_A_mean) / coded_sps_A_std
                coded_sp_converted_norm, _, _ = model.convert(
                    np.array([coded_sp_norm]), 0, 1, device)
                coded_sp_converted_norm = coded_sp_converted_norm.cpu().numpy()
                coded_sp_converted_norm = np.squeeze(coded_sp_converted_norm)
                coded_sp_converted = (coded_sp_converted_norm *
                                      coded_sps_B_std + coded_sps_B_mean)
                coded_sp_converted = coded_sp_converted.T
                coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
                decoded_sp_converted = world_decode_spectral_envelop(
                    coded_sp=coded_sp_converted, fs=sampling_rate)
                wav_transformed = world_speech_synthesis(
                    f0=f0_converted, decoded_sp=decoded_sp_converted, ap=ap,
                    fs=sampling_rate, frame_period=frame_period)
                librosa.output.write_wav(
                    os.path.join(validation_A_output_dir,
                                 'eval_' + os.path.basename(file)),
                    wav_transformed, sampling_rate)
        exit(0)

    # ---------- Training ----------
    print("Begin Training")
    model = CVAE(num_mcep, 128, num_mcep, 2)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    writer = SummaryWriter(tensorboard_dir)
    if device == 'cuda':
        model.cuda()
    for epoch in tqdm(range(num_epochs)):
        dataset_A, dataset_B = sample_train_data(
            dataset_A=coded_sps_A_norm, dataset_B=coded_sps_B_norm,
            n_frames=n_frames)
        dataset_A = torch.tensor(dataset_A).to(torch.float)
        dataset_B = torch.tensor(dataset_B).to(torch.float)
        n_samples, input_dim, depth = dataset_A.shape
        # Per-frame one-hot class labels: class 0 for speaker A, 1 for B,
        # shaped (n_samples, 2, depth) to concatenate along the batch.
        y_A = F.one_hot(torch.zeros(depth).to(torch.int64),
                        num_classes=2).to(torch.float).T
        y_B = F.one_hot(torch.ones(depth).to(torch.int64),
                        num_classes=2).to(torch.float).T
        y_A, y_B = y_A.reshape((1, 2, depth)), y_B.reshape((1, 2, depth))
        y_A = torch.cat([y_A] * n_samples)
        y_B = torch.cat([y_B] * n_samples)
        X = torch.cat((dataset_A, dataset_B)).to(device)
        Y = torch.cat((y_A, y_B)).to(device)

        out, z_mu, z_var = model(X, Y)
        # NOTE(review): X is z-normalized and can fall outside [0, 1], while
        # binary_cross_entropy expects targets in [0, 1] — confirm the
        # model's output/target ranges.  (reduction='sum' replaces the
        # deprecated size_average=False with identical behavior.)
        rec_loss = F.binary_cross_entropy(out, X, reduction='sum')
        kl_diver = -0.5 * torch.sum(1 + z_var - z_mu.pow(2) - z_var.exp())
        loss = rec_loss + kl_diver
        writer.add_scalar('Reconstruction Loss', rec_loss, epoch)
        writer.add_scalar('KL-Divergence', kl_diver, epoch)
        writer.add_scalar('Total Loss', loss, epoch)
        # BUG FIX: gradients must be cleared every iteration; without
        # zero_grad() they accumulate across epochs and the updates diverge.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Periodically convert one validation file so progress is audible.
        if val_A_dir is not None:
            if epoch % 1000 == 0:
                print('Generating Validation Data...')
                for file in os.listdir(val_A_dir):
                    filepath = os.path.join(val_A_dir, file)
                    print("Converting {0} from Class 0 to Class 1".format(
                        filepath))
                    wav, _ = librosa.load(filepath, sr=sampling_rate,
                                          mono=True)
                    wav = wav_padding(wav=wav, sr=sampling_rate,
                                      frame_period=frame_period, multiple=4)
                    f0, timeaxis, sp, ap = world_decompose(
                        wav=wav, fs=sampling_rate, frame_period=frame_period)
                    f0_converted = pitch_conversion(
                        f0=f0, mean_log_src=log_f0s_mean_A,
                        std_log_src=log_f0s_std_A,
                        mean_log_target=log_f0s_mean_B,
                        std_log_target=log_f0s_std_B)
                    coded_sp = world_encode_spectral_envelop(
                        sp=sp, fs=sampling_rate, dim=num_mcep)
                    coded_sp_transposed = coded_sp.T
                    coded_sp_norm = (coded_sp_transposed -
                                     coded_sps_A_mean) / coded_sps_A_std
                    coded_sp_converted_norm, _, _ = model.convert(
                        np.array([coded_sp_norm]), 0, 1, device)
                    coded_sp_converted_norm = coded_sp_converted_norm.cpu(
                    ).numpy()
                    coded_sp_converted_norm = np.squeeze(
                        coded_sp_converted_norm)
                    coded_sp_converted = (coded_sp_converted_norm *
                                          coded_sps_B_std + coded_sps_B_mean)
                    coded_sp_converted = coded_sp_converted.T
                    coded_sp_converted = np.ascontiguousarray(
                        coded_sp_converted)
                    decoded_sp_converted = world_decode_spectral_envelop(
                        coded_sp=coded_sp_converted, fs=sampling_rate)
                    wav_transformed = world_speech_synthesis(
                        f0=f0_converted, decoded_sp=decoded_sp_converted,
                        ap=ap, fs=sampling_rate, frame_period=frame_period)
                    librosa.output.write_wav(
                        os.path.join(validation_A_output_dir,
                                     str(epoch) + '_' +
                                     os.path.basename(file)),
                        wav_transformed, sampling_rate)
                    # Only the first validation file per checkpoint.
                    break
        if epoch % 1000 == 0:
            print('Saving Checkpoint')
            filepath = os.path.join(model_dir, model_name)
            if not os.path.exists(filepath):
                os.makedirs(filepath)
            torch.save(model.state_dict(),
                       os.path.join(filepath, '{0}.ckpt'.format(epoch)))
def test():
    """Evaluate the model on the held-out batches and return the mean loss."""
    model.eval()
    running = 0.0
    batch_x, batch_y = create_batch(testx, testy)
    # Inference only — skip autograd bookkeeping.
    with th.no_grad():
        for bx, by in zip(batch_x, batch_y):
            tx, ty = th.Tensor(bx), th.Tensor(by)
            if gpu:
                tx, ty = tx.cuda(), ty.cuda()
            tx, ty = V(tx), V(ty)
            recon, mu, sigma = model(tx, ty)
            running += Loss_function(recon, tx, mu, sigma).item()
    return running / test_N


# Fit for max_epoch epochs, recording one train/test loss pair per epoch.
tr_loss, te_loss = [], []
for epoch in range(max_epoch):
    epoch_train = train()
    epoch_test = test()
    tr_loss.append(epoch_train)
    te_loss.append(epoch_test)
    # Log every other epoch.
    if epoch % 2 == 0:
        print(epoch, epoch_train, epoch_test)

# Persist the trained weights, then show both loss curves.
th.save(model.state_dict(), f"save_model/vae_adadelta_{max_epoch}.pth")
plt.plot(tr_loss)
plt.plot(te_loss)
plt.show()
state_dict['outputs2vocab.weight'] = torch.randn( len(i2w), args.hidden_size * model.hidden_factor) state_dict['outputs2vocab.bias'] = torch.randn(len(i2w)) print(state_dict['embedding.weight'].size(), model.embedding.weight.size()) model.load_state_dict(state_dict) else: model.embedding.weight.data.copy_(vocab.vectors) model = to_device(model) print(model) train(model, datasets, args) if args.save_model is not None: torch.save(model.state_dict(), args.save_model) if args.n_generated > 0: model.eval() samples, z, y_onehot = model.inference(n=args.n_generated) intent = y_onehot.data.max(1)[1].cpu().numpy() delexicalised = idx2word(samples, i2w=i2w, pad_idx=pad_idx) if args.input_type == 'delexicalised': labelling, utterance = surface_realisation(samples, i2w=i2w, pad_idx=pad_idx) print('----------GENERATED----------') for i in range(args.n_generated): print('Intent : ', i2int[intent[i]])
def main(args):
    """Train an image-captioning CVAE with KL-annealing and early stopping.

    Trains on ``train_data_loader``; after every epoch, evaluates on three
    validation batches and stops early once validation loss has not improved
    for 5 consecutive epochs.  Checkpoints go to ``args.model_path``.
    """
    # Create model directory.
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    pad_idx = vocab.word2idx['<pad>']
    sos_idx = vocab.word2idx['<start>']
    eos_idx = vocab.word2idx['<end>']
    unk_idx = vocab.word2idx['<unk>']

    # Build data loaders.
    train_data_loader, valid_data_loader = get_loader(
        args.train_image_dir, args.val_image_dir, args.train_caption_path,
        args.val_caption_path, vocab, args.batch_size, shuffle=True,
        num_workers=args.num_workers)

    def kl_anneal_function(anneal_function, step, k, x0):
        # KL-weight schedule: logistic (sigmoid) ramp or linear ramp to 1.
        if anneal_function == 'logistic':
            return float(expit(k * (step - x0)))
        elif anneal_function == 'linear':
            return min(1, step / x0)

    nll = torch.nn.NLLLoss(ignore_index=pad_idx)

    def loss_fn(logp, target, length, mean, logv, anneal_function, step, k,
                x0):
        # Cut off unnecessary padding from target, and flatten.
        # BUG FIX: `.data[0]` on a 0-dim tensor fails on PyTorch >= 0.4;
        # `.item()` is the supported scalar accessor.
        target = target[:, :torch.max(length).item()].contiguous().view(-1)
        logp = logp.view(-1, logp.size(2))
        # Negative log likelihood over non-pad tokens.
        nll_loss = nll(logp, target)
        # KL divergence of the approximate posterior from the unit Gaussian.
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
        KL_weight = kl_anneal_function(anneal_function, step, k, x0)
        return nll_loss, KL_loss, KL_weight

    # Build the model.
    model = CVAE(vocab_size=len(vocab), embedding_size=args.embedding_size,
                 rnn_type=args.rnn_type, hidden_size=args.hidden_size,
                 word_dropout=args.word_dropout,
                 embedding_dropout=args.embedding_dropout,
                 latent_size=args.latent_size,
                 max_sequence_length=args.max_sequence_length,
                 num_layers=args.num_layers,
                 bidirectional=args.bidirectional, pad_idx=pad_idx,
                 sos_idx=sos_idx, eos_idx=eos_idx, unk_idx=unk_idx)
    model.to(device)

    # Loss and optimizer.
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    # Train the model.
    total_step = len(train_data_loader)
    step_for_kl_annealing = 0
    best_valid_loss = float("inf")
    patience = 0
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(train_data_loader):
            # Teacher forcing: input is the caption minus its last token,
            # target is the caption shifted by one.
            images = images.to(device)
            captions_src = captions[:, :captions.size()[1] - 1]
            captions_tgt = captions[:, 1:]
            captions_src = captions_src.to(device)
            captions_tgt = captions_tgt.to(device)
            lengths = lengths - 1
            lengths = lengths.to(device)

            # Forward, loss, backward, optimize.
            logp, mean, logv, z = model(images, captions_src, lengths)
            NLL_loss, KL_loss, KL_weight = loss_fn(
                logp, captions_tgt, lengths, mean, logv,
                args.anneal_function, step_for_kl_annealing, args.k, args.x0)
            loss = (NLL_loss + KL_weight * KL_loss) / args.batch_size
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            step_for_kl_annealing += 1

            # Print log info plus a sampled reconstruction for eyeballing.
            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch, args.num_epochs, i, total_step,
                            loss.item(), np.exp(loss.item())))
                outputs = model._sample(logp)
                outputs = outputs.cpu().numpy()
                # Convert word_ids to words (last item of the batch only).
                sampled_caption = []
                ground_truth_caption = []
                for word_id in outputs[-1]:
                    word = vocab.idx2word[word_id]
                    sampled_caption.append(word)
                    if word == '<end>':
                        break
                captions_tgt = captions_tgt.cpu().numpy()
                for word_id in captions_tgt[-1]:
                    word = vocab.idx2word[word_id]
                    ground_truth_caption.append(word)
                    if word == '<end>':
                        break
                reconstructed = ' '.join(sampled_caption)
                ground_truth = ' '.join(ground_truth_caption)
                print("ground_truth: {0} \n reconstructed: {1}\n".format(
                    ground_truth, reconstructed))

            # Save mid-epoch model checkpoints.
            if (i + 1) % args.save_step == 0:
                torch.save(
                    model.state_dict(),
                    os.path.join(args.model_path,
                                 'model-{}-{}.ckpt'.format(epoch + 1, i + 1)))
        torch.save(
            model.state_dict(),
            os.path.join(args.model_path,
                         'model-{}-epoch.ckpt'.format(epoch + 1)))

        # Validate on three batches; early-stop when not improving.
        # BUG FIX: validation must run in eval mode and without autograd —
        # the original kept dropout active and built graphs it never freed.
        model.eval()
        valid_loss = 0
        with torch.no_grad():
            for j, (images, captions, lengths) in enumerate(
                    valid_data_loader):
                images = images.to(device)
                captions_src = captions[:, :captions.size()[1] - 1]
                captions_tgt = captions[:, 1:]
                captions_src = captions_src.to(device)
                captions_tgt = captions_tgt.to(device)
                lengths = lengths - 1
                lengths = lengths.to(device)
                logp, mean, logv, z = model(images, captions_src, lengths)
                NLL_loss, KL_loss, KL_weight = loss_fn(
                    logp, captions_tgt, lengths, mean, logv,
                    args.anneal_function, step_for_kl_annealing, args.k,
                    args.x0)
                valid_loss += (NLL_loss +
                               KL_weight * KL_loss) / args.batch_size
                if j == 2:
                    break
        model.train()
        print("validation loss for epoch {}: {}".format(epoch + 1,
                                                        valid_loss))
        print("patience is at {}".format(patience))
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            patience = 0
        else:
            patience += 1
        if patience == 5:
            print("early stopping at epoch {}".format(epoch + 1))
            break