def val_test(args):
    """Validation/test-only loop for a pretrained image VAE.

    Builds the model (and, for the GAN reconstruction loss, a discriminator),
    optionally restores saved weights, then repeatedly evaluates on the
    validation set, logging reconstruction/interpolation grids and losses to
    TensorBoard and checkpointing via ``train_util.save_state``.

    Relies on module-level names: SummaryWriter, train_util, VAE,
    Discriminator, get_losses, tqdm, np, sys, torch.
    """
    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_filename = './models/{0}'.format(args.output_folder)

    train_loader, valid_loader, test_loader = train_util.get_dataloaders(args)
    # Fixed batch of input images reused for every reconstruction/interp grid.
    recons_input_img = train_util.log_input_img_grid(test_loader, writer)

    input_dim = 3  # RGB channels
    model = VAE(input_dim, args.hidden_size, args.enc_type, args.dec_type)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    # discriminators maps name -> [module, optimizer]; only populated for the
    # adversarial ("gan") reconstruction loss.
    discriminators = {}
    if args.recons_loss == "gan":
        recons_disc = Discriminator(input_dim, args.img_res, args.input_type).to(args.device)
        recons_disc_opt = torch.optim.Adam(recons_disc.parameters(), lr=args.disc_lr, amsgrad=True)
        discriminators["recons_disc"] = [recons_disc, recons_disc_opt]

    model.to(args.device)
    for disc in discriminators:
        discriminators[disc][0].to(args.device)

    # "load" resumes from the checkpoint directory; otherwise start fresh.
    if args.weights == "load":
        start_epoch = train_util.load_state(save_filename, model, optimizer, discriminators)
    else:
        start_epoch = 0

    stop_patience = args.stop_patience  # NOTE(review): read but never used in this function
    best_loss = torch.tensor(np.inf)

    # NOTE(review): epoch bound is hard-coded to 4 — looks like a leftover
    # debug value; presumably this should be driven by args. TODO confirm.
    for epoch in tqdm(range(start_epoch, 4), file=sys.stdout):
        # Final positional flag presumably asks train_util.test to also return
        # latents z — TODO confirm against train_util's signature.
        val_loss_dict, z = train_util.test(get_losses, model, valid_loader, args, discriminators, True)

        train_util.log_recons_img_grid(recons_input_img, model, epoch + 1, args.device, writer)
        train_util.log_interp_img_grid(recons_input_img, model, epoch + 1, args.device, writer)

        train_util.log_losses("val", val_loss_dict, epoch + 1, writer)
        train_util.log_latent_metrics("val", z, epoch + 1, writer)
        train_util.save_state(model, optimizer, discriminators, val_loss_dict["recons_loss"], best_loss, args.recons_loss, epoch, save_filename)
        print(val_loss_dict)
def main(args):
    """Train a text VAE (LSTM encoder/decoder) with optional aggressive
    (lagging-encoder) inference-network training.

    Flow: read MonoTextData corpora, build VAE, optionally run eval-only mode,
    then SGD training with KL annealing; in aggressive mode the encoder is
    burned in with extra inner updates until its loss stops improving.
    Finishes by reloading the best checkpoint and computing the IW-NLL.

    NOTE(review): relies on module-level config globals `clip_grad`,
    `decay_epoch`, `lr_decay`, `max_decay` — TODO confirm they are defined at
    file scope.
    """
    class uniform_initializer(object):
        # Initializes a tensor uniformly in [-stdv, stdv].
        def __init__(self, stdv):
            self.stdv = stdv
        def __call__(self, tensor):
            nn.init.uniform_(tensor, -self.stdv, self.stdv)

    class xavier_normal_initializer(object):
        # NOTE(review): defined but never used in this function.
        def __call__(self, tensor):
            nn.init.xavier_normal_(tensor)

    if args.cuda:
        print('using cuda')
    print(args)

    # Plateau/decay bookkeeping for the learning-rate schedule below.
    opt_dict = {"not_improved": 0, "lr": 1., "best_loss": 1e4}

    train_data = MonoTextData(args.train_data, label=args.label)
    vocab = train_data.vocab
    vocab_size = len(vocab)
    val_data = MonoTextData(args.val_data, label=args.label, vocab=vocab)
    test_data = MonoTextData(args.test_data, label=args.label, vocab=vocab)

    print('Train data: %d samples' % len(train_data))
    print('finish reading datasets, vocab size is %d' % len(vocab))
    print('dropped sentences: %d' % train_data.dropped)
    sys.stdout.flush()

    # Log roughly 10 times per epoch.
    log_niter = (len(train_data) // args.batch_size) // 10

    model_init = uniform_initializer(0.01)
    emb_init = uniform_initializer(0.1)

    if args.enc_type == 'lstm':
        encoder = LSTMEncoder(args, vocab_size, model_init, emb_init)
        args.enc_nh = args.dec_nh
    else:
        raise ValueError("the specified encoder type is not supported")

    decoder = LSTMDecoder(args, vocab, model_init, emb_init)

    device = torch.device("cuda" if args.cuda else "cpu")
    args.device = device
    vae = VAE(encoder, decoder, args).to(device)

    # Eval-only mode: load checkpoint, report test metrics + IW-NLL, and exit.
    if args.eval:
        print('begin evaluation')
        vae.load_state_dict(torch.load(args.load_path))
        vae.eval()
        with torch.no_grad():
            test_data_batch = test_data.create_data_batch(
                batch_size=args.batch_size, device=device, batch_first=True)
            test(vae, test_data_batch, "TEST", args)
            au, au_var = calc_au(vae, test_data_batch)
            print("%d active units" % au)
            # batch_size=1 so the importance-weighted estimate sees one
            # sentence at a time.
            test_data_batch = test_data.create_data_batch(batch_size=1, device=device, batch_first=True)
            calc_iwnll(vae, test_data_batch, args)
        return

    # Separate optimizers so the encoder can be updated alone in burn-in.
    enc_optimizer = optim.SGD(vae.encoder.parameters(), lr=1.0, momentum=args.momentum)
    dec_optimizer = optim.SGD(vae.decoder.parameters(), lr=1.0, momentum=args.momentum)
    opt_dict['lr'] = 1.0

    iter_ = decay_cnt = 0
    best_loss = 1e4
    best_kl = best_nll = best_ppl = 0
    pre_mi = 0
    aggressive_flag = True if args.aggressive else False
    vae.train()
    start = time.time()

    # Linear KL annealing from kl_start to 1.0 over warm_up epochs.
    kl_weight = args.kl_start
    anneal_rate = (1.0 - args.kl_start) / (args.warm_up * (len(train_data) / args.batch_size))

    train_data_batch = train_data.create_data_batch(batch_size=args.batch_size, device=device, batch_first=True)
    val_data_batch = val_data.create_data_batch(batch_size=args.batch_size, device=device, batch_first=True)
    test_data_batch = test_data.create_data_batch(batch_size=args.batch_size, device=device, batch_first=True)

    for epoch in range(args.epochs):
        report_kl_loss = report_rec_loss = 0
        report_num_words = report_num_sents = 0
        for i in np.random.permutation(len(train_data_batch)):
            batch_data = train_data_batch[i]
            batch_size, sent_len = batch_data.size()

            # not predict start symbol
            report_num_words += (sent_len - 1) * batch_size
            report_num_sents += batch_size

            kl_weight = min(1.0, kl_weight + anneal_rate)

            sub_iter = 1
            batch_data_enc = batch_data
            burn_num_words = 0
            burn_pre_loss = 1e4
            burn_cur_loss = 0
            # Aggressive burn-in: update ONLY the encoder on fresh random
            # batches until the per-word loss stops improving (checked every
            # 15 sub-iterations) or 100 sub-iterations elapse.
            while aggressive_flag and sub_iter < 100:
                enc_optimizer.zero_grad()
                dec_optimizer.zero_grad()

                burn_batch_size, burn_sents_len = batch_data_enc.size()
                burn_num_words += (burn_sents_len - 1) * burn_batch_size

                loss, loss_rc, loss_kl = vae.loss(batch_data_enc, kl_weight, nsamples=args.nsamples)
                burn_cur_loss += loss.sum().item()
                loss = loss.mean(dim=-1)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(vae.parameters(), clip_grad)

                enc_optimizer.step()  # encoder only — decoder frozen here

                # NOTE(review): np.random.random_integers is deprecated in
                # NumPy; np.random.randint(0, len(train_data_batch)) is the
                # modern equivalent.
                id_ = np.random.random_integers(0, len(train_data_batch) - 1)
                batch_data_enc = train_data_batch[id_]

                if sub_iter % 15 == 0:
                    burn_cur_loss = burn_cur_loss / burn_num_words
                    if burn_pre_loss - burn_cur_loss < 0:
                        break
                    burn_pre_loss = burn_cur_loss
                    burn_cur_loss = burn_num_words = 0

                sub_iter += 1

            # Regular joint step on the original batch.
            enc_optimizer.zero_grad()
            dec_optimizer.zero_grad()

            loss, loss_rc, loss_kl = vae.loss(batch_data, kl_weight, nsamples=args.nsamples)
            loss = loss.mean(dim=-1)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(vae.parameters(), clip_grad)

            loss_rc = loss_rc.sum()
            loss_kl = loss_kl.sum()

            # During aggressive burn-in the encoder was already updated above,
            # so only the decoder steps here.
            if not aggressive_flag:
                enc_optimizer.step()

            dec_optimizer.step()

            report_rec_loss += loss_rc.item()
            report_kl_loss += loss_kl.item()

            if iter_ % log_niter == 0:
                train_loss = (report_rec_loss + report_kl_loss) / report_num_sents
                if aggressive_flag or epoch == 0:
                    vae.eval()
                    with torch.no_grad():
                        mi = calc_mi(vae, val_data_batch)
                        au, _ = calc_au(vae, val_data_batch)
                    vae.train()

                    print('epoch: %d, iter: %d, avg_loss: %.4f, kl: %.4f, mi: %.4f, recon: %.4f,' \
                          'au %d, time elapsed %.2fs' %
                          (epoch, iter_, train_loss, report_kl_loss / report_num_sents, mi,
                           report_rec_loss / report_num_sents, au, time.time() - start))
                else:
                    print('epoch: %d, iter: %d, avg_loss: %.4f, kl: %.4f, recon: %.4f,' \
                          'time elapsed %.2fs' %
                          (epoch, iter_, train_loss, report_kl_loss / report_num_sents,
                           report_rec_loss / report_num_sents, time.time() - start))

                sys.stdout.flush()

                report_rec_loss = report_kl_loss = 0
                report_num_words = report_num_sents = 0

            iter_ += 1

            # Once per epoch worth of iterations: stop aggressive training
            # when validation mutual information stops improving.
            if aggressive_flag and (iter_ % len(train_data_batch)) == 0:
                vae.eval()
                cur_mi = calc_mi(vae, val_data_batch)
                vae.train()
                print("pre mi:%.4f. cur mi:%.4f" % (pre_mi, cur_mi))
                if cur_mi - pre_mi < 0:
                    aggressive_flag = False
                    print("STOP BURNING")

                pre_mi = cur_mi

        print('kl weight %.4f' % kl_weight)

        vae.eval()
        with torch.no_grad():
            loss, nll, kl, ppl, mi = test(vae, val_data_batch, "VAL", args)
            au, au_var = calc_au(vae, val_data_batch)
            print("%d active units" % au)

        if loss < best_loss:
            print('update best loss')
            best_loss = loss
            best_nll = nll
            best_kl = kl
            best_ppl = ppl
            torch.save(vae.state_dict(), args.save_path)

        # LR decay on plateau: after `decay_epoch` non-improving epochs
        # (and epoch >= 15), reload the best checkpoint and shrink the LR.
        if loss > opt_dict["best_loss"]:
            opt_dict["not_improved"] += 1
            if opt_dict["not_improved"] >= decay_epoch and epoch >= 15:
                opt_dict["best_loss"] = loss
                opt_dict["not_improved"] = 0
                opt_dict["lr"] = opt_dict["lr"] * lr_decay
                vae.load_state_dict(torch.load(args.save_path))
                print('new lr: %f' % opt_dict["lr"])
                decay_cnt += 1
                enc_optimizer = optim.SGD(vae.encoder.parameters(), lr=opt_dict["lr"], momentum=args.momentum)
                dec_optimizer = optim.SGD(vae.decoder.parameters(), lr=opt_dict["lr"], momentum=args.momentum)
        else:
            opt_dict["not_improved"] = 0
            opt_dict["best_loss"] = loss

        if decay_cnt == max_decay:
            break

        if epoch % args.test_nepoch == 0:
            with torch.no_grad():
                loss, nll, kl, ppl, _ = test(vae, test_data_batch, "TEST", args)

        vae.train()

    # compute importance weighted estimate of log p(x)
    vae.load_state_dict(torch.load(args.save_path))
    vae.eval()
    with torch.no_grad():
        loss, nll, kl, ppl, _ = test(vae, test_data_batch, "TEST", args)
        au, au_var = calc_au(vae, test_data_batch)
        print("%d active units" % au)

    test_data_batch = test_data.create_data_batch(batch_size=1, device=device, batch_first=True)
    with torch.no_grad():
        calc_iwnll(vae, test_data_batch, args)
def main(args):
    """Train an image VAE on MNIST / Fashion-MNIST / CIFAR-10 / miniImagenet.

    Builds dataset-specific transforms and loaders, logs a fixed image grid
    plus reconstructions to TensorBoard each epoch, and checkpoints both the
    best model (by validation loss) and a per-epoch snapshot.

    NOTE(review): if args.dataset is none of the handled values,
    `train_dataset`/`num_channels` are never bound and the loader creation
    below raises NameError — TODO confirm upstream arg validation.
    """
    writer = SummaryWriter('./logs_vae/{0}'.format(args.output_folder))
    save_filename = './models_vae/{0}'.format(args.output_folder)

    if args.dataset in ['mnist', 'fashion-mnist', 'cifar10']:
        # Default (MNIST) normalization; overridden per-dataset below.
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])
        if args.dataset == 'mnist':
            # Define the train & test datasets
            train_dataset = datasets.MNIST(args.data_folder, train=True, download=True, transform=transform)
            test_dataset = datasets.MNIST(args.data_folder, train=False, transform=transform)
            num_channels = 1
        elif args.dataset == 'fashion-mnist':
            # NOTE(review): 3-channel normalization stats on a 1-channel
            # dataset — presumably works only if ToTensor yields 3 channels;
            # TODO confirm.
            transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
            ])
            # Define the train & test datasets
            train_dataset = datasets.FashionMNIST(args.data_folder, train=True, download=True, transform=transform)
            test_dataset = datasets.FashionMNIST(args.data_folder, train=False, transform=transform)
            num_channels = 1
        elif args.dataset == 'cifar10':
            transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
            ])
            # Define the train & test datasets
            train_dataset = datasets.CIFAR10(args.data_folder, train=True, download=True, transform=transform)
            test_dataset = datasets.CIFAR10(args.data_folder, train=False, transform=transform)
            num_channels = 3
        # These datasets ship no separate validation split; reuse test.
        valid_dataset = test_dataset
    elif args.dataset == 'miniimagenet':
        transform = transforms.Compose([
            transforms.RandomResizedCrop(128),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
        # Define the train, valid & test datasets
        train_dataset = MiniImagenet(args.data_folder, train=True, download=True, transform=transform)
        valid_dataset = MiniImagenet(args.data_folder, valid=True, download=True, transform=transform)
        test_dataset = MiniImagenet(args.data_folder, test=True, download=True, transform=transform)
        num_channels = 3

    # Define the data loaders
    train_loader = torch.utils.data.DataLoader(train_dataset,
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.num_workers, pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(valid_dataset,
        batch_size=args.batch_size, shuffle=False, drop_last=True,
        num_workers=args.num_workers, pin_memory=True)
    test_loader = torch.utils.data.DataLoader(test_dataset,
        batch_size=16, shuffle=True)

    # Fixed images for Tensorboard
    # NOTE(review): make_grid's `range=` kwarg is deprecated in newer
    # torchvision in favor of `value_range=`.
    fixed_images, _ = next(iter(test_loader))
    fixed_grid = make_grid(fixed_images, nrow=8, range=(-1, 1), normalize=True)
    writer.add_image('original', fixed_grid, 0)

    model = VAE(num_channels, args.hidden_size, args.z).to(args.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)

    # Generate the samples first once
    reconstruction = generate_samples(fixed_images, model, args)
    grid = make_grid(reconstruction.cpu(), nrow=8, range=(-1, 1), normalize=True)
    writer.add_image('reconstruction', grid, 0)

    # best_loss sentinel; real value installed on epoch 0 by the `epoch == 0`
    # branch below.
    best_loss = -1.
    for epoch in range(args.num_epochs):
        train(epoch, train_loader, model, optimizer, args, writer)
        loss = test(valid_loader, model, args, writer)

        reconstruction = generate_samples(fixed_images, model, args)
        grid = make_grid(reconstruction.cpu(), nrow=8, range=(-1, 1), normalize=True)
        writer.add_image('reconstruction', grid, epoch + 1)

        if (epoch == 0) or (loss < best_loss):
            best_loss = loss
            with open('{0}/best.pt'.format(save_filename), 'wb') as f:
                torch.save(model.state_dict(), f)
        # Also keep a per-epoch snapshot regardless of validation loss.
        with open('{0}/model_{1}.pt'.format(save_filename, epoch + 1), 'wb') as f:
            torch.save(model.state_dict(), f)
def main(args):
    """Train a binary-image VAE (ResNet encoder + PixelCNN decoder) with
    optional aggressive (lagging-encoder) training.

    Data is a pre-binarizable tensor triple loaded from args.data_file;
    batches are re-sampled with torch.bernoulli each step (dynamic
    binarization). Supports sample-only mode (args.sample_from) and eval-only
    mode (args.eval); otherwise trains with Adam, KL annealing, and
    LR decay on validation plateau, finishing with an IW-NLL estimate.

    NOTE(review): relies on module-level globals `clip_grad`, `decay_epoch`,
    `lr_decay`, `max_decay` — TODO confirm they exist at file scope.
    """
    if args.save_path == '':
        make_savepath(args)

    seed(args)

    if args.cuda:
        print('using cuda')
    print(args)

    device = torch.device("cuda" if args.cuda else "cpu")
    args.device = device

    opt_dict = {"not_improved": 0, "lr": 1., "best_loss": 1e4}

    # all_data is a (train, val, test) tuple of image tensors.
    all_data = torch.load(args.data_file)
    x_train, x_val, x_test = all_data

    x_train = x_train.to(device)
    x_val = x_val.to(device)
    x_test = x_test.to(device)

    # Dummy labels: TensorDataset requires a target tensor.
    y_size = 1
    y_train = x_train.new_zeros(x_train.size(0), y_size)
    y_val = x_train.new_zeros(x_val.size(0), y_size)
    y_test = x_train.new_zeros(x_test.size(0), y_size)
    print(torch.__version__)
    train_data = torch.utils.data.TensorDataset(x_train, y_train)
    val_data = torch.utils.data.TensorDataset(x_val, y_val)
    test_data = torch.utils.data.TensorDataset(x_test, y_test)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_data, batch_size=args.batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=args.batch_size, shuffle=True)

    print('Train data: %d batches' % len(train_loader))
    print('Val data: %d batches' % len(val_loader))
    print('Test data: %d batches' % len(test_loader))
    sys.stdout.flush()

    # Log 5 times per epoch.
    log_niter = len(train_loader) // 5

    encoder = ResNetEncoderV2(args)
    decoder = PixelCNNDecoderV2(args)
    vae = VAE(encoder, decoder, args).to(device)

    # Sample-only mode: draw 400 prior samples, save binary + continuous grids.
    if args.sample_from != '':
        save_dir = "samples/%s" % args.dataset
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        vae.load_state_dict(torch.load(args.sample_from))
        vae.eval()
        with torch.no_grad():
            sample_z = vae.sample_from_prior(400).to(device)
            sample_x, sample_probs = vae.decode(sample_z, False)
        image_file = 'sample_binary_from_%s.png' % (args.sample_from.split('/')[-1][:-3])
        save_image(sample_x.data.cpu(), os.path.join(save_dir, image_file), nrow=20)
        image_file = 'sample_cont_from_%s.png' % (args.sample_from.split('/')[-1][:-3])
        save_image(sample_probs.data.cpu(), os.path.join(save_dir, image_file), nrow=20)
        return

    # Eval-only mode: report test metrics and IW-NLL, then exit.
    if args.eval:
        print('begin evaluation')
        test_loader = torch.utils.data.DataLoader(test_data, batch_size=50, shuffle=True)
        vae.load_state_dict(torch.load(args.load_path))
        vae.eval()
        with torch.no_grad():
            test(vae, test_loader, "TEST", args)
            au, au_var = calc_au(vae, test_loader)
            print("%d active units" % au)
            calc_iwnll(vae, test_loader, args)
        return

    # Separate optimizers so the encoder can be updated alone during burn-in.
    enc_optimizer = optim.Adam(vae.encoder.parameters(), lr=0.001)
    dec_optimizer = optim.Adam(vae.decoder.parameters(), lr=0.001)
    opt_dict['lr'] = 0.001

    iter_ = 0
    best_loss = 1e4
    best_kl = best_nll = best_ppl = 0
    decay_cnt = pre_mi = best_mi = mi_not_improved = 0
    aggressive_flag = True if args.aggressive else False
    vae.train()
    start = time.time()

    # Linear KL annealing from kl_start to 1.0 over warm_up epochs.
    kl_weight = args.kl_start
    anneal_rate = (1.0 - args.kl_start) / (args.warm_up * len(train_loader))

    for epoch in range(args.epochs):
        report_kl_loss = report_rec_loss = 0
        report_num_examples = 0
        for datum in train_loader:
            batch_data, _ = datum
            # Dynamic binarization: resample each pixel as Bernoulli.
            batch_data = torch.bernoulli(batch_data)
            batch_size = batch_data.size(0)

            report_num_examples += batch_size

            kl_weight = min(1.0, kl_weight + anneal_rate)

            sub_iter = 1
            batch_data_enc = batch_data
            burn_num_examples = 0
            burn_pre_loss = 1e4
            burn_cur_loss = 0
            # Aggressive burn-in: encoder-only updates on fresh random batches
            # until the per-example loss stops improving (checked every 10
            # sub-iterations) or 100 sub-iterations elapse.
            while aggressive_flag and sub_iter < 100:
                enc_optimizer.zero_grad()
                dec_optimizer.zero_grad()

                burn_num_examples += batch_data_enc.size(0)
                loss, loss_rc, loss_kl = vae.loss(batch_data_enc, kl_weight, nsamples=args.nsamples)

                burn_cur_loss += loss.sum().item()
                loss = loss.mean(dim=-1)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(vae.parameters(), clip_grad)

                enc_optimizer.step()  # encoder only

                id_ = np.random.choice(x_train.size(0), args.batch_size, replace=False)
                batch_data_enc = torch.bernoulli(x_train[id_])

                if sub_iter % 10 == 0:
                    burn_cur_loss = burn_cur_loss / burn_num_examples
                    if burn_pre_loss - burn_cur_loss < 0:
                        break
                    burn_pre_loss = burn_cur_loss
                    burn_cur_loss = burn_num_examples = 0

                sub_iter += 1

            # Regular joint step on the original batch.
            enc_optimizer.zero_grad()
            dec_optimizer.zero_grad()

            loss, loss_rc, loss_kl = vae.loss(batch_data, kl_weight, nsamples=args.nsamples)
            loss = loss.mean(dim=-1)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(vae.parameters(), clip_grad)

            loss_rc = loss_rc.sum()
            loss_kl = loss_kl.sum()

            # During burn-in the encoder already stepped above.
            if not aggressive_flag:
                enc_optimizer.step()

            dec_optimizer.step()

            report_rec_loss += loss_rc.item()
            report_kl_loss += loss_kl.item()

            if iter_ % log_niter == 0:
                train_loss = (report_rec_loss + report_kl_loss) / report_num_examples
                if aggressive_flag or epoch == 0:
                    vae.eval()
                    with torch.no_grad():
                        mi = calc_mi(vae, val_loader)
                        au, _ = calc_au(vae, val_loader)
                    vae.train()

                    print('epoch: %d, iter: %d, avg_loss: %.4f, kl: %.4f, mi: %.4f, recon: %.4f,' \
                          'au %d, time elapsed %.2fs' %
                          (epoch, iter_, train_loss, report_kl_loss / report_num_examples, mi,
                           report_rec_loss / report_num_examples, au, time.time() - start))
                else:
                    print('epoch: %d, iter: %d, avg_loss: %.4f, kl: %.4f, recon: %.4f,' \
                          'time elapsed %.2fs' %
                          (epoch, iter_, train_loss, report_kl_loss / report_num_examples,
                           report_rec_loss / report_num_examples, time.time() - start))
                sys.stdout.flush()

                report_rec_loss = report_kl_loss = 0
                report_num_examples = 0

            iter_ += 1

            # Stop aggressive training after validation MI fails to beat the
            # best seen value 5 times.
            if aggressive_flag and (iter_ % len(train_loader)) == 0:
                vae.eval()
                cur_mi = calc_mi(vae, val_loader)
                vae.train()
                if cur_mi - best_mi < 0:
                    mi_not_improved += 1
                    if mi_not_improved == 5:
                        aggressive_flag = False
                        print("STOP BURNING")
                else:
                    best_mi = cur_mi

                pre_mi = cur_mi

        print('kl weight %.4f' % kl_weight)
        print('epoch: %d, VAL' % epoch)

        vae.eval()
        with torch.no_grad():
            loss, nll, kl = test(vae, val_loader, "VAL", args)
            au, au_var = calc_au(vae, val_loader)
            print("%d active units" % au)

        if loss < best_loss:
            print('update best loss')
            best_loss = loss
            best_nll = nll
            best_kl = kl
            torch.save(vae.state_dict(), args.save_path)

        # NOTE(review): sibling variants compare against
        # opt_dict["best_loss"] here; comparing against the just-updated
        # best_loss makes this branch unreachable after an improvement —
        # looks like an inconsistency. TODO confirm intended behavior.
        if loss > best_loss:
            opt_dict["not_improved"] += 1
            if opt_dict["not_improved"] >= decay_epoch:
                opt_dict["best_loss"] = loss
                opt_dict["not_improved"] = 0
                opt_dict["lr"] = opt_dict["lr"] * lr_decay
                vae.load_state_dict(torch.load(args.save_path))
                decay_cnt += 1
                print('new lr: %f' % opt_dict["lr"])
                enc_optimizer = optim.Adam(vae.encoder.parameters(), lr=opt_dict["lr"])
                dec_optimizer = optim.Adam(vae.decoder.parameters(), lr=opt_dict["lr"])
        else:
            opt_dict["not_improved"] = 0
            opt_dict["best_loss"] = loss

        if decay_cnt == max_decay:
            break

        if epoch % args.test_nepoch == 0:
            with torch.no_grad():
                loss, nll, kl = test(vae, test_loader, "TEST", args)

        vae.train()

    # compute importance weighted estimate of log p(x)
    vae.load_state_dict(torch.load(args.save_path))
    vae.eval()
    with torch.no_grad():
        loss, nll, kl = test(vae, test_loader, "TEST", args)
        au, au_var = calc_au(vae, test_loader)
        print("%d active units" % au)

    test_loader = torch.utils.data.DataLoader(test_data, batch_size=50, shuffle=True)
    with torch.no_grad():
        calc_iwnll(vae, test_loader, args)
def main(args):
    """Train a text VAE while tracking/plotting posterior statistics.

    In 'single' plot mode, the model/inference posterior means of one fixed
    plot batch are recorded after every update during epoch 0 and plotted at
    args.plot_niter intervals (then training exits); in 'multiple' mode a grid
    of latents is plotted instead. Otherwise this mirrors the standard
    aggressive-VAE training loop with KL annealing and LR decay on plateau.

    NOTE(review): relies on module-level globals `clip_grad`, `decay_epoch`,
    `lr_decay`, `max_decay`; and if args.plot_mode is neither 'multiple' nor
    'single', `plot_fn`/`grid_z` are never bound — TODO confirm.
    """
    class uniform_initializer(object):
        # Initializes a tensor uniformly in [-stdv, stdv].
        def __init__(self, stdv):
            self.stdv = stdv
        def __call__(self, tensor):
            nn.init.uniform_(tensor, -self.stdv, self.stdv)

    class xavier_normal_initializer(object):
        # NOTE(review): defined but never used in this function.
        def __call__(self, tensor):
            nn.init.xavier_normal_(tensor)

    if args.cuda:
        print('using cuda')
    print(args)

    opt_dict = {"not_improved": 0, "lr": 1., "best_loss": 1e4}

    train_data = MonoTextData(args.train_data)
    vocab = train_data.vocab
    vocab_size = len(vocab)
    val_data = MonoTextData(args.val_data, vocab=vocab)
    test_data = MonoTextData(args.test_data, vocab=vocab)

    print('Train data: %d samples' % len(train_data))
    print('finish reading datasets, vocab size is %d' % len(vocab))
    print('dropped sentences: %d' % train_data.dropped)
    sys.stdout.flush()

    # Log roughly 10 times per epoch.
    log_niter = (len(train_data) // args.batch_size) // 10

    model_init = uniform_initializer(0.01)
    emb_init = uniform_initializer(0.1)

    device = torch.device("cuda" if args.cuda else "cpu")
    args.device = device

    encoder = LSTMEncoder(args, vocab_size, model_init, emb_init)
    args.enc_nh = args.dec_nh

    decoder = LSTMDecoder(args, vocab, model_init, emb_init)
    vae = VAE(encoder, decoder, args).to(device)

    if args.optim == 'sgd':
        enc_optimizer = optim.SGD(vae.encoder.parameters(), lr=1.0)
        dec_optimizer = optim.SGD(vae.decoder.parameters(), lr=1.0)
        opt_dict['lr'] = 1.0
    else:
        enc_optimizer = optim.Adam(vae.encoder.parameters(), lr=0.001, betas=(0.9, 0.999))
        dec_optimizer = optim.Adam(vae.decoder.parameters(), lr=0.001, betas=(0.9, 0.999))
        opt_dict['lr'] = 0.001

    iter_ = decay_cnt = 0
    best_loss = 1e4
    best_kl = best_nll = best_ppl = 0
    pre_mi = -1
    aggressive_flag = True if args.aggressive else False
    vae.train()
    start = time.time()

    # Linear KL annealing from kl_start to 1.0 over warm_up epochs.
    kl_weight = args.kl_start
    anneal_rate = (1.0 - args.kl_start) / (args.warm_up * (len(train_data) / args.batch_size))

    # Fixed sample used for all posterior-mean plots.
    plot_data = train_data.data_sample(nsample=args.num_plot, device=device, batch_first=True)

    if args.plot_mode == 'multiple':
        grid_z = generate_grid(args.zmin, args.zmax, args.dz, device, ndim=1)
        plot_fn = plot_multiple
    elif args.plot_mode == 'single':
        grid_z = generate_grid(args.zmin, args.zmax, args.dz, device, ndim=1)
        plot_fn = plot_single
        # Trajectories of the model posterior mean and the inference
        # network's mean for the fixed plot batch.
        posterior_mean = []
        infer_mean = []
        posterior_mean.append(vae.calc_model_posterior_mean(plot_data[0], grid_z))
        infer_mean.append(vae.calc_infer_mean(plot_data[0]))

    train_data_batch = train_data.create_data_batch(batch_size=args.batch_size, device=device, batch_first=True)
    val_data_batch = val_data.create_data_batch(batch_size=args.batch_size, device=device, batch_first=True)
    test_data_batch = test_data.create_data_batch(batch_size=args.batch_size, device=device, batch_first=True)

    for epoch in range(args.epochs):
        report_kl_loss = report_rec_loss = 0
        report_num_words = report_num_sents = 0
        for i in np.random.permutation(len(train_data_batch)):
            # In 'single' mode every step trains on the fixed plot batch so
            # its posterior trajectory can be traced.
            if args.plot_mode == "single":
                batch_data, _ = plot_data
            else:
                batch_data = train_data_batch[i]
            batch_size, sent_len = batch_data.size()

            # not predict start symbol
            report_num_words += (sent_len - 1) * batch_size
            report_num_sents += batch_size

            kl_weight = min(1.0, kl_weight + anneal_rate)

            sub_iter = 1
            batch_data_enc = batch_data
            burn_num_words = 0
            burn_pre_loss = 1e4
            burn_cur_loss = 0
            # Aggressive burn-in: encoder-only updates until the per-word loss
            # stops improving (checked every 15 sub-iterations) or 100 elapse.
            while aggressive_flag and sub_iter < 100:
                enc_optimizer.zero_grad()
                dec_optimizer.zero_grad()

                burn_batch_size, burn_sents_len = batch_data_enc.size()
                burn_num_words += (burn_sents_len - 1) * burn_batch_size

                loss, loss_rc, loss_kl = vae.loss(batch_data_enc, kl_weight, nsamples=args.nsamples)
                burn_cur_loss += loss.sum().item()
                loss = loss.mean(dim=-1)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(vae.parameters(), clip_grad)

                enc_optimizer.step()  # encoder only

                if args.plot_mode == "single":
                    batch_data_enc, _ = plot_data
                else:
                    # NOTE(review): np.random.random_integers is deprecated in
                    # NumPy; np.random.randint is the modern equivalent.
                    id_ = np.random.random_integers(0, len(train_data_batch) - 1)
                    batch_data_enc = train_data_batch[id_]

                if sub_iter % 15 == 0:
                    burn_cur_loss = burn_cur_loss / burn_num_words
                    if burn_pre_loss - burn_cur_loss < 0:
                        break
                    burn_pre_loss = burn_cur_loss
                    burn_cur_loss = burn_num_words = 0

                sub_iter += 1

            # Model posterior unchanged by encoder-only burn-in: repeat last
            # value, but re-measure the inference mean that just moved.
            if args.plot_mode == 'single' and epoch == 0 and aggressive_flag:
                vae.eval()
                with torch.no_grad():
                    posterior_mean.append(posterior_mean[-1])
                    infer_mean.append(vae.calc_infer_mean(plot_data[0]))
                vae.train()

            # Regular joint step.
            enc_optimizer.zero_grad()
            dec_optimizer.zero_grad()

            loss, loss_rc, loss_kl = vae.loss(batch_data, kl_weight, nsamples=args.nsamples)
            loss = loss.mean(dim=-1)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(vae.parameters(), clip_grad)

            loss_rc = loss_rc.sum()
            loss_kl = loss_kl.sum()

            if not aggressive_flag:
                enc_optimizer.step()

            dec_optimizer.step()

            # After a decoder update the model posterior moved; the inference
            # mean moved only if the encoder also stepped.
            if args.plot_mode == 'single' and epoch == 0:
                vae.eval()
                with torch.no_grad():
                    posterior_mean.append(vae.calc_model_posterior_mean(plot_data[0], grid_z))
                    if aggressive_flag:
                        infer_mean.append(infer_mean[-1])
                    else:
                        infer_mean.append(vae.calc_infer_mean(plot_data[0]))
                vae.train()

            report_rec_loss += loss_rc.item()
            report_kl_loss += loss_kl.item()

            if iter_ % log_niter == 0:
                train_loss = (report_rec_loss + report_kl_loss) / report_num_sents
                if aggressive_flag or epoch == 0:
                    vae.eval()
                    mi = calc_mi(vae, val_data_batch)
                    vae.train()

                    print('epoch: %d, iter: %d, avg_loss: %.4f, kl: %.4f, mi: %.4f, recon: %.4f,' \
                          'time elapsed %.2fs' %
                          (epoch, iter_, train_loss, report_kl_loss / report_num_sents, mi,
                           report_rec_loss / report_num_sents, time.time() - start))
                else:
                    print('epoch: %d, iter: %d, avg_loss: %.4f, kl: %.4f, recon: %.4f,' \
                          'time elapsed %.2fs' %
                          (epoch, iter_, train_loss, report_kl_loss / report_num_sents,
                           report_rec_loss / report_num_sents, time.time() - start))
                sys.stdout.flush()

                report_rec_loss = report_kl_loss = 0
                report_num_words = report_num_sents = 0

            # Periodic plotting during epoch 0; in 'single' mode training
            # stops entirely after the first plot (early return).
            if iter_ % args.plot_niter == 0 and epoch == 0:
                vae.eval()
                with torch.no_grad():
                    if args.plot_mode == 'single' and iter_ != 0:
                        plot_fn(infer_mean, posterior_mean, args)
                        return
                    elif args.plot_mode == "multiple":
                        plot_fn(vae, plot_data, grid_z, iter_, args)
                vae.train()

            iter_ += 1

            # Stop aggressive training when validation MI stops improving.
            if aggressive_flag and (iter_ % len(train_data_batch)) == 0:
                vae.eval()
                cur_mi = calc_mi(vae, val_data_batch)
                vae.train()
                if cur_mi - pre_mi < 0:
                    aggressive_flag = False
                    print("STOP BURNING")

                pre_mi = cur_mi

        print('kl weight %.4f' % kl_weight)
        print('epoch: %d, VAL' % epoch)

        # NOTE(review): this call matches plot_multiple's signature; in
        # 'single' mode plot_fn is plot_single with a different signature —
        # presumably only reached in 'multiple' mode. TODO confirm.
        with torch.no_grad():
            plot_fn(vae, plot_data, grid_z, iter_, args)

        vae.eval()
        with torch.no_grad():
            loss, nll, kl, ppl = test(vae, val_data_batch, "VAL", args)

        if loss < best_loss:
            print('update best loss')
            best_loss = loss
            best_nll = nll
            best_kl = kl
            best_ppl = ppl
            torch.save(vae.state_dict(), args.save_path)

        # LR decay on plateau: reload best checkpoint and shrink the LR.
        if loss > opt_dict["best_loss"]:
            opt_dict["not_improved"] += 1
            if opt_dict["not_improved"] >= decay_epoch:
                opt_dict["best_loss"] = loss
                opt_dict["not_improved"] = 0
                opt_dict["lr"] = opt_dict["lr"] * lr_decay
                vae.load_state_dict(torch.load(args.save_path))
                print('new lr: %f' % opt_dict["lr"])
                decay_cnt += 1
                if args.optim == 'sgd':
                    enc_optimizer = optim.SGD(vae.encoder.parameters(), lr=opt_dict["lr"])
                    dec_optimizer = optim.SGD(vae.decoder.parameters(), lr=opt_dict["lr"])
                else:
                    enc_optimizer = optim.Adam(vae.encoder.parameters(), lr=opt_dict["lr"], betas=(0.5, 0.999))
                    dec_optimizer = optim.Adam(vae.decoder.parameters(), lr=opt_dict["lr"], betas=(0.5, 0.999))
        else:
            opt_dict["not_improved"] = 0
            opt_dict["best_loss"] = loss

        if decay_cnt == max_decay:
            break

        if epoch % args.test_nepoch == 0:
            with torch.no_grad():
                loss, nll, kl, ppl = test(vae, test_data_batch, "TEST", args)

        vae.train()

    print('best_loss: %.4f, kl: %.4f, nll: %.4f, ppl: %.4f' \
          % (best_loss, best_kl, best_nll, best_ppl))

    sys.stdout.flush()

    # compute importance weighted estimate of log p(x)
    vae.load_state_dict(torch.load(args.save_path))
    vae.eval()

    test_data_batch = test_data.create_data_batch(batch_size=1, device=device, batch_first=True)
    with torch.no_grad():
        calc_iwnll(vae, test_data_batch, args)
def main(args):
    """Train a beta-VAE style text model (fixed KL weight args.beta) with a
    Gaussian LSTM encoder, experiment-dir logging, and Ctrl-C-safe training.

    Supports eval-only (args.eval) and reconstruction-only
    (args.reconstruct_from) modes. Training uses SGD or Adam, optional
    importance-weighted training loss, LR decay on validation plateau, and
    ends with an importance-weighted NLL estimate from the best checkpoint.

    NOTE(review): relies on module-level globals `uniform_initializer`, `ns`,
    `clip_grad`, `decay_epoch`, `lr_decay`, `max_decay` — TODO confirm they
    are defined at file scope.
    """
    global logging

    debug = (args.reconstruct_from != "" or args.eval == True)  # don't make exp dir for reconstruction
    logging = create_exp_dir(args.exp_dir, scripts_to_save=None, debug=debug)

    if args.cuda:
        logging('using cuda')
    logging(str(args))

    opt_dict = {"not_improved": 0, "lr": 1., "best_loss": 1e4}

    train_data = MonoTextData(args.train_data, label=args.label)
    vocab = train_data.vocab
    vocab_size = len(vocab)
    val_data = MonoTextData(args.val_data, label=args.label, vocab=vocab)
    test_data = MonoTextData(args.test_data, label=args.label, vocab=vocab)

    logging('Train data: %d samples' % len(train_data))
    logging('finish reading datasets, vocab size is %d' % len(vocab))
    logging('dropped sentences: %d' % train_data.dropped)
    #sys.stdout.flush()

    # Log roughly 10 times per epoch.
    log_niter = (len(train_data) // args.batch_size) // 10

    model_init = uniform_initializer(0.01)
    emb_init = uniform_initializer(0.1)

    #device = torch.device("cuda" if args.cuda else "cpu")
    device = "cuda" if args.cuda else "cpu"
    args.device = device

    if args.enc_type == 'lstm':
        encoder = GaussianLSTMEncoder(args, vocab_size, model_init, emb_init)
        args.enc_nh = args.dec_nh
    else:
        raise ValueError("the specified encoder type is not supported")

    decoder = LSTMDecoder(args, vocab, model_init, emb_init)
    vae = VAE(encoder, decoder, args).to(device)

    if args.load_path:
        loaded_state_dict = torch.load(args.load_path)
        #curr_state_dict = vae.state_dict()
        #curr_state_dict.update(loaded_state_dict)
        vae.load_state_dict(loaded_state_dict)
        logging("%s loaded" % args.load_path)

        # Optionally re-randomize the decoder after loading (e.g. to retrain
        # it against a frozen encoder).
        if args.reset_dec:
            vae.decoder.reset_parameters(model_init, emb_init)

    # Eval-only mode: report test metrics and IW-NLL, then exit.
    if args.eval:
        logging('begin evaluation')
        vae.load_state_dict(torch.load(args.load_path))
        vae.eval()
        with torch.no_grad():
            test_data_batch = test_data.create_data_batch(
                batch_size=args.batch_size, device=device, batch_first=True)
            test(vae, test_data_batch, "TEST", args)
            au, au_var = calc_au(vae, test_data_batch)
            logging("%d active units" % au)

            test_data_batch = test_data.create_data_batch(batch_size=1, device=device, batch_first=True)
            nll, ppl = calc_iwnll(vae, test_data_batch, args)
            logging('iw nll: %.4f, iw ppl: %.4f' % (nll, ppl))
        return

    # Reconstruction-only mode: decode the test set and exit.
    if args.reconstruct_from != "":
        print("begin decoding")
        sys.stdout.flush()

        vae.load_state_dict(torch.load(args.reconstruct_from))
        vae.eval()
        with torch.no_grad():
            test_data_batch = test_data.create_data_batch(
                batch_size=args.batch_size, device=device, batch_first=True)
            # test(vae, test_data_batch, "TEST", args)
            reconstruct(vae, test_data_batch, vocab, args.decoding_strategy, args.reconstruct_to)
        return

    if args.opt == "sgd":
        enc_optimizer = optim.SGD(vae.encoder.parameters(), lr=args.lr, momentum=args.momentum)
        dec_optimizer = optim.SGD(vae.decoder.parameters(), lr=args.lr, momentum=args.momentum)
        opt_dict['lr'] = args.lr
    elif args.opt == "adam":
        enc_optimizer = optim.Adam(vae.encoder.parameters(), lr=0.001)
        dec_optimizer = optim.Adam(vae.decoder.parameters(), lr=0.001)
        opt_dict['lr'] = 0.001
    else:
        raise ValueError("optimizer not supported")

    iter_ = decay_cnt = 0
    best_loss = 1e4
    best_kl = best_nll = best_ppl = 0
    pre_mi = 0
    vae.train()
    start = time.time()

    train_data_batch = train_data.create_data_batch(batch_size=args.batch_size, device=device, batch_first=True)
    val_data_batch = val_data.create_data_batch(batch_size=args.batch_size, device=device, batch_first=True)
    test_data_batch = test_data.create_data_batch(batch_size=args.batch_size, device=device, batch_first=True)

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        for epoch in range(args.epochs):
            report_kl_loss = report_rec_loss = report_loss = 0
            report_num_words = report_num_sents = 0

            for i in np.random.permutation(len(train_data_batch)):
                batch_data = train_data_batch[i]
                batch_size, sent_len = batch_data.size()

                # not predict start symbol
                report_num_words += (sent_len - 1) * batch_size
                report_num_sents += batch_size

                # Fixed KL weight (beta-VAE) — no annealing in this variant.
                kl_weight = args.beta

                enc_optimizer.zero_grad()
                dec_optimizer.zero_grad()

                # Negative iw_train_nsamples selects the plain ELBO loss;
                # otherwise use the importance-weighted training objective.
                if args.iw_train_nsamples < 0:
                    loss, loss_rc, loss_kl = vae.loss(batch_data, kl_weight, nsamples=args.nsamples)
                else:
                    loss, loss_rc, loss_kl = vae.loss_iw(
                        batch_data, kl_weight, nsamples=args.iw_train_nsamples, ns=ns)

                loss = loss.mean(dim=-1)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(vae.parameters(), clip_grad)

                loss_rc = loss_rc.sum()
                loss_kl = loss_kl.sum()

                enc_optimizer.step()
                dec_optimizer.step()

                report_rec_loss += loss_rc.item()
                report_kl_loss += loss_kl.item()
                report_loss += loss.item() * batch_size

                if iter_ % log_niter == 0:
                    #train_loss = (report_rec_loss  + report_kl_loss) / report_num_sents
                    train_loss = report_loss / report_num_sents

                    logging('epoch: %d, iter: %d, avg_loss: %.4f, kl: %.4f, recon: %.4f,' \
                            'time elapsed %.2fs, kl_weight %.4f' %
                            (epoch, iter_, train_loss, report_kl_loss / report_num_sents,
                             report_rec_loss / report_num_sents, time.time() - start, kl_weight))

                    #sys.stdout.flush()

                    report_rec_loss = report_kl_loss = report_loss = 0
                    report_num_words = report_num_sents = 0

                iter_ += 1

            logging('kl weight %.4f' % kl_weight)

            vae.eval()
            with torch.no_grad():
                loss, nll, kl, ppl, mi = test(vae, val_data_batch, "VAL", args)
                au, au_var = calc_au(vae, val_data_batch)
                logging("%d active units" % au)

            # Early-epoch checkpoints for later analysis.
            if args.save_ckpt > 0 and epoch <= args.save_ckpt:
                logging('save checkpoint')
                torch.save(
                    vae.state_dict(),
                    os.path.join(args.exp_dir, f'model_ckpt_{epoch}.pt'))

            if loss < best_loss:
                logging('update best loss')
                best_loss = loss
                best_nll = nll
                best_kl = kl
                best_ppl = ppl
                torch.save(vae.state_dict(), args.save_path)

            # LR decay on plateau (only after args.load_best_epoch epochs):
            # reload the best checkpoint and shrink the LR.
            if loss > opt_dict["best_loss"]:
                opt_dict["not_improved"] += 1
                if opt_dict["not_improved"] >= decay_epoch and epoch >= args.load_best_epoch:
                    opt_dict["best_loss"] = loss
                    opt_dict["not_improved"] = 0
                    opt_dict["lr"] = opt_dict["lr"] * lr_decay
                    vae.load_state_dict(torch.load(args.save_path))
                    logging('new lr: %f' % opt_dict["lr"])
                    decay_cnt += 1
                    # NOTE(review): decay always switches to SGD even when
                    # args.opt == "adam" — presumably intentional (SGD
                    # fine-tuning); TODO confirm.
                    enc_optimizer = optim.SGD(vae.encoder.parameters(), lr=opt_dict["lr"], momentum=args.momentum)
                    dec_optimizer = optim.SGD(vae.decoder.parameters(), lr=opt_dict["lr"], momentum=args.momentum)
            else:
                opt_dict["not_improved"] = 0
                opt_dict["best_loss"] = loss

            if decay_cnt == max_decay:
                break

            if epoch % args.test_nepoch == 0:
                with torch.no_grad():
                    loss, nll, kl, ppl, _ = test(vae, test_data_batch, "TEST", args)

            if args.save_latent > 0 and epoch <= args.save_latent:
                visualize_latent(args, epoch, vae, "cuda", test_data)

            vae.train()

    except KeyboardInterrupt:
        logging('-' * 100)
        logging('Exiting from training early')

    # compute importance weighted estimate of log p(x)
    vae.load_state_dict(torch.load(args.save_path))

    vae.eval()
    with torch.no_grad():
        loss, nll, kl, ppl, _ = test(vae, test_data_batch, "TEST", args)
        au, au_var = calc_au(vae, test_data_batch)
        logging("%d active units" % au)

    test_data_batch = test_data.create_data_batch(batch_size=1, device=device, batch_first=True)
    with torch.no_grad():
        nll, ppl = calc_iwnll(vae, test_data_batch, args)
        logging('iw nll: %.4f, iw ppl: %.4f' % (nll, ppl))
batch_size=batch_size, shuffle=True, num_workers=1, pin_memory=True) test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=1, pin_memory=True) print('train_loader', len(train_loader)) print('test_loader', len(test_loader)) #%% model = VAE().to(device) # optimizer = optim.Adam(model.parameters(), lr=1e-3) optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.999), weight_decay=0.0005) #%% epochs = 100 viz = Visdom() global plotter, recon plotter = utils.VisdomLinePlotter(env_name='main') sample_image = utils.VisdomImage(env_name='main') recon = utils.VisdomImage(env_name='main') for epoch in range(1, epochs + 1): with torch.no_grad(): sample = torch.randn(32, 32).to(device)
def main(args):
    """Train an image VAE with an optional adversarial/comparator reconstruction loss.

    Builds the model and (if ``args.recons_loss != "mse"``) the matching
    discriminator, wraps both in DataParallel on multi-GPU hosts, prepares the
    log/model output folders, optionally resumes from a checkpoint, then runs
    the train/val loop, dumping reconstruction and interpolation grids each epoch.

    Args:
        args: parsed CLI namespace; fields used here include hidden_size,
            enc_type, dec_type, lr, disc_lr, recons_loss, img_res, input_type,
            device, dataset, output_folder, weights, num_epochs.
    """
    train_loader, val_loader, test_loader = train_util.get_dataloaders(args)

    input_dim = 3  # RGB input
    model = VAE(input_dim, args.hidden_size, args.enc_type, args.dec_type)
    # BUG FIX: previously used the module-global `LR`; use the configured
    # learning rate, consistent with `args.disc_lr` below and with the
    # sibling trainer in this file.
    opt = torch.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)

    # Build the auxiliary discriminator/comparator matching the chosen
    # reconstruction loss; "mse" needs none.
    discriminators = {}
    if args.recons_loss != "mse":
        if args.recons_loss == "gan":
            recons_disc = Discriminator(input_dim, args.img_res, args.input_type).to(args.device)
        elif args.recons_loss == "comp":
            recons_disc = AnchorComparator(input_dim*2, args.img_res, args.input_type).to(args.device)
        elif "comp_2" in args.recons_loss:
            recons_disc = ClubbedPermutationComparator(input_dim*2, args.img_res, args.input_type).to(args.device)
        elif "comp_6" in args.recons_loss:
            recons_disc = FullPermutationComparator(input_dim*2, args.img_res, args.input_type).to(args.device)
        recons_disc_opt = torch.optim.Adam(recons_disc.parameters(), lr=args.disc_lr, amsgrad=True)
        discriminators["recons_disc"] = [recons_disc, recons_disc_opt]

    # Multi-GPU: wrap model and discriminators before moving to device.
    if torch.cuda.device_count() > 1:
        model = train_util.ae_data_parallel(model)
        for disc in discriminators:
            discriminators[disc][0] = torch.nn.DataParallel(discriminators[disc][0])
    model.to(args.device)

    # Derive a default output folder name from the configuration.
    model_name = f"vae_{args.recons_loss}"
    if args.output_folder is None:
        args.output_folder = os.path.join(model_name, args.dataset, f"depth_{args.enc_type}_{args.dec_type}_hs_{args.img_res}_{args.hidden_size}")

    log_save_path = os.path.join("./logs", args.output_folder)
    model_save_path = os.path.join("./models", args.output_folder)

    if not os.path.exists(log_save_path):
        os.makedirs(log_save_path)
    print(f"log:{log_save_path}", file=sys.stderr)
    sys.stderr.flush()
    if not os.path.exists(model_save_path):
        os.makedirs(model_save_path)

    writer = SummaryWriter(log_save_path)
    print(f"train loader length:{len(train_loader)}", file=sys.stderr)

    best_loss = torch.tensor(np.inf)
    if args.weights == "load":
        start_epoch = train_util.load_state(model_save_path, model, opt, discriminators)
    else:
        start_epoch = 0

    # Log an input grid once, and a pre-training reconstruction as a baseline.
    recons_input_img = train_util.log_input_img_grid(test_loader, writer)
    train_util.save_recons_img_grid("val", recons_input_img, model, 0, args)

    # BUG FIX: loop previously hard-coded `range(1, args.num_epochs)`, which
    # discarded `start_epoch` and broke checkpoint resumption.
    for epoch in range(start_epoch, args.num_epochs):
        print("Epoch {}:".format(epoch))
        train(model, opt, train_loader)
        curr_loss = val(model, val_loader)
        # val_loss_dict, z = train_util.test(get_losses, model, val_loader, args, discriminators)
        print(f"epoch val loss:{curr_loss}", file=sys.stderr)
        sys.stderr.flush()

        train_util.save_recons_img_grid("val", recons_input_img, model, epoch+1, args)
        train_util.save_interp_img_grid("val", recons_input_img, model, epoch+1, args)
# --- Fragment: the DataLoader call these first lines close starts before this
# span, and `train()` is truncated mid-loop at the end.
transform=preproc_transform,
), batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)
# Test split of the same dataset, resolved dynamically from the DATASET name.
# NOTE(review): `eval('datasets.' + DATASET)` executes a constructed string —
# fine for a trusted constant, but worth replacing with getattr(datasets, DATASET).
test_loader = torch.utils.data.DataLoader(eval('datasets.' + DATASET)(
    '../data/{}/'.format(DATASET), train=False, transform=preproc_transform),
    batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

model = VAE(INPUT_DIM, DIM, Z_DIM).cuda()
print(model)
opt = torch.optim.Adam(model.parameters(), lr=LR, amsgrad=True)


def train():
    """One training pass over train_loader (fragment — body truncated below)."""
    train_loss = []
    model.train()
    for batch_idx, (x, _) in enumerate(train_loader):
        start_time = time.time()
        x = x.cuda()
        # Forward pass: reconstruction and KL divergence term.
        x_tilde, kl_d = model(x)
        # NOTE(review): `size_average=False` is deprecated in modern PyTorch;
        # the equivalent is reduction='sum'. Mean per-sample loss via / x.size(0).
        loss_recons = F.mse_loss(x_tilde, x, size_average=False) / x.size(0)
        loss = loss_recons + kl_d
        # Gaussian (unit variance) NLL of the input under the reconstruction;
        # the np.log(128) offset presumably rescales for pixel quantization —
        # TODO confirm against the original derivation.
        nll = -Normal(x_tilde, torch.ones_like(x_tilde)).log_prob(x)
        log_px = nll.mean().item() - np.log(128) + kl_d.item()
def main(args):
    """Train an image VAE with an optional adversarial/comparator reconstruction
    loss; logs metrics and image grids to TensorBoard and saves checkpoints.

    (The epoch loop ends at the edge of the visible span and may continue in
    the original file.)
    """
    input_dim = 3  # RGB input
    model = VAE(input_dim, args.hidden_size, args.enc_type, args.dec_type)
    opt = torch.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, "min", patience=args.lr_patience, factor=0.5,
    # threshold=args.threshold, threshold_mode="abs", min_lr=1e-6)
    # ae_lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, "min", patience=args.lr_patience, factor=0.5,
    # threshold=args.threshold, threshold_mode="abs", min_lr=1e-7)

    # Auxiliary discriminator/comparator matching the reconstruction loss;
    # "mse" needs none. Stored as {name: [module, optimizer]}.
    discriminators = {}
    if args.recons_loss != "mse":
        if args.recons_loss == "gan":
            recons_disc = Discriminator(input_dim, args.img_res,
                                        args.input_type).to(args.device)
        elif args.recons_loss == "comp":
            recons_disc = AnchorComparator(input_dim * 2, args.img_res,
                                           args.input_type).to(args.device)
        elif "comp_2" in args.recons_loss:
            recons_disc = ClubbedPermutationComparator(
                input_dim * 2, args.img_res, args.input_type).to(args.device)
        elif "comp_6" in args.recons_loss:
            recons_disc = FullPermutationComparator(
                input_dim * 2, args.img_res, args.input_type).to(args.device)
        recons_disc_opt = torch.optim.Adam(recons_disc.parameters(),
                                           lr=args.disc_lr,
                                           amsgrad=True)
        discriminators["recons_disc"] = [recons_disc, recons_disc_opt]

    # Multi-GPU: wrap in DataParallel before moving everything to the device.
    if torch.cuda.device_count() > 1:
        model = train_util.ae_data_parallel(model)
        for disc in discriminators:
            discriminators[disc][0] = torch.nn.DataParallel(
                discriminators[disc][0])
    model.to(args.device)
    for disc in discriminators:
        discriminators[disc][0].to(args.device)
    print("model built", file=sys.stderr)
    #print("model created")
    train_loader, val_loader, test_loader = train_util.get_dataloaders(args)
    print("loaders acquired", file=sys.stderr)
    #print("loaders acquired")

    # Derive a default output folder name from the configuration.
    model_name = f"vae_{args.recons_loss}"
    if args.output_folder is None:
        args.output_folder = os.path.join(
            model_name, args.dataset,
            f"depth_{args.enc_type}_{args.dec_type}_hs_{args.img_res}_{args.hidden_size}"
        )

    log_save_path = os.path.join("./logs", args.output_folder)
    model_save_path = os.path.join("./models", args.output_folder)

    if not os.path.exists(log_save_path):
        os.makedirs(log_save_path)
    print(f"log:{log_save_path}", file=sys.stderr)
    sys.stderr.flush()
    if not os.path.exists(model_save_path):
        os.makedirs(model_save_path)

    writer = SummaryWriter(log_save_path)
    print(f"train loader length:{len(train_loader)}", file=sys.stderr)

    best_loss = torch.tensor(np.inf)
    # Resume from checkpoint when requested; load_state restores model,
    # optimizer, and discriminator state and returns the epoch to resume from.
    if args.weights == "load":
        start_epoch = train_util.load_state(model_save_path, model, opt,
                                            discriminators)
    else:
        start_epoch = 0

    # Log an input grid once, and a pre-training reconstruction as a baseline.
    recons_input_img = train_util.log_input_img_grid(test_loader, writer)
    train_util.log_recons_img_grid(recons_input_img, model, 0, args.device,
                                   writer)

    # NOTE(review): stop_patience is read but not used in the visible span —
    # presumably consumed by early-stopping logic further down; confirm.
    stop_patience = args.stop_patience
    for epoch in range(start_epoch, args.num_epochs):
        try:
            train(model, train_loader, opt, epoch, writer, args,
                  discriminators)
        except RuntimeError as err:
            # Dump the full traceback to stderr before bailing out.
            print("".join(
                traceback.TracebackException.from_exception(err).format()),
                  file=sys.stderr)
            print("*******", file=sys.stderr)
            print(err, file=sys.stderr)
            # NOTE(review): exit(0) signals success to the shell despite the
            # fatal error — sys.exit(1) would be the conventional fix.
            exit(0)

        # Validation pass: loss dict plus latent codes for metric logging.
        val_loss_dict, z = train_util.test(get_losses, model, val_loader,
                                           args, discriminators)
        print(f"epoch loss:{val_loss_dict['recons_loss'].item()}")

        train_util.save_recons_img_grid("test", recons_input_img, model,
                                        epoch + 1, args)
        train_util.save_interp_img_grid("test", recons_input_img, model,
                                        epoch + 1, args)

        train_util.log_losses("val", val_loss_dict, epoch + 1, writer)
        train_util.log_latent_metrics("val", z, epoch + 1, writer)
        train_util.save_state(model, opt, discriminators,
                              val_loss_dict["recons_loss"], best_loss,
                              args.recons_loss, epoch, model_save_path)