def eval_elbo_l(model, guide, num_pars, vec_pars, svi_arg_l, param_state_l):
    # return [elbo(`model`, `guide`, _,
    #              ELBO(num_particles=`num_pars`, vectorize_particles=`vec_pars`))
    #         when params are set to param_state,
    #         for param_state in `param_state_l`]
    svi = SVI(model, guide, pyro.optim.Adam({}),
              loss=Trace_ELBO(num_particles=num_pars, vectorize_particles=vec_pars))
    elbo_l = []
    cnt, cnt_prog = 0, max(1, int(len(param_state_l) / 20))
    for param_state in param_state_l:
        # set params
        pyro.get_param_store().set_state(param_state)
        # compute elbo
        loss = svi.evaluate_loss(*svi_arg_l)
        elbo_l.append(-loss)
        # print
        cnt += 1
        if cnt % cnt_prog == 0:
            print('.', end='')
    return elbo_l
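# A minimal, self-contained sketch (an illustration, not taken from any of the
# snippets here) of how the `param_state_l` argument to `eval_elbo_l` above could
# be produced: train briefly, snapshot the Pyro param store with `get_state()` at
# intervals, then score each snapshot. The toy model, guide, and data below are
# assumptions made purely for demonstration.
import copy

import torch
import pyro
import pyro.distributions as dist
from pyro.distributions import constraints
from pyro.infer import SVI, Trace_ELBO

def toy_model(data):
    loc = pyro.sample("loc", dist.Normal(0., 10.))
    with pyro.plate("data", len(data)):
        pyro.sample("obs", dist.Normal(loc, 1.), obs=data)

def toy_guide(data):
    q_loc = pyro.param("q_loc", torch.tensor(0.))
    q_scale = pyro.param("q_scale", torch.tensor(1.), constraint=constraints.positive)
    pyro.sample("loc", dist.Normal(q_loc, q_scale))

pyro.clear_param_store()
data = torch.randn(100) + 3.0
svi = SVI(toy_model, toy_guide, pyro.optim.Adam({"lr": 0.05}), loss=Trace_ELBO())
param_state_l = []
for step in range(200):
    svi.step(data)
    if step % 50 == 0:
        # deepcopy the param store state so later optimizer updates
        # do not overwrite the snapshot; eval_elbo_l restores it via set_state()
        param_state_l.append(copy.deepcopy(pyro.get_param_store().get_state()))

# ELBO estimates (higher is better) for each saved parameter snapshot
elbos = eval_elbo_l(toy_model, toy_guide, num_pars=10, vec_pars=False,
                    svi_arg_l=[data], param_state_l=param_state_l)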
def _valid_epoch(self, epoch): """ Validate after training an epoch :param epoch: Integer, current training epoch. :return: A log that contains information about validation """ elbo = TraceGraph_ELBO(vectorize_particles=False, num_particles=4) svi = SVI(self.model.model, self.model.guide, self.optimizer, loss=elbo) imps = ImportanceSampler(self.model.model, self.model.guide, num_samples=4) self.model.eval() self.valid_metrics.reset() with torch.no_grad(): for batch_idx, (data, target) in enumerate(self.valid_data_loader): data, target = data.to(self.device), target.to(self.device) loss = svi.evaluate_loss(observations=data) / data.shape[0] imps.sample(observations=data) log_likelihood = imps.get_log_likelihood().item() / data.shape[0] log_marginal = imps.get_log_normalizer().item() / data.shape[0] self.writer.set_step((epoch - 1) * len(self.valid_data_loader) + batch_idx, 'valid') self.valid_metrics.update('loss', loss) self.valid_metrics.update('log_likelihood', log_likelihood) self.valid_metrics.update('log_marginal', log_marginal) for met in self.metric_ftns: metric_val = met(self.model.model, self.model.guide, data, target, 4) self.valid_metrics.update(met.__name__, metric_val) if self.log_images: self.writer.add_image('input', make_grid(data.cpu(), nrow=8, normalize=True)) return self.valid_metrics.result()
def main(args): # load data print('loading training data...') dataset_directory = get_data_directory(__file__) dataset_path = os.path.join(dataset_directory, 'faces_training.csv') if not os.path.exists(dataset_path): try: os.makedirs(dataset_directory) except OSError as e: if e.errno != errno.EEXIST: raise pass wget.download( 'https://d2hg8soec8ck9v.cloudfront.net/datasets/faces_training.csv', dataset_path) data = torch.tensor(np.loadtxt(dataset_path, delimiter=',')).float() sparse_gamma_def = SparseGammaDEF() # Due to the special logic in the custom guide (e.g. parameter clipping), the custom guide # seems to be more amenable to higher learning rates. # Nevertheless, the easy guide performs the best (presumably because of numerical instabilities # related to the gamma distribution in the custom guide). learning_rate = 0.2 if args.guide in ['auto', 'easy'] else 4.5 momentum = 0.05 if args.guide in ['auto', 'easy'] else 0.1 opt = optim.AdagradRMSProp({"eta": learning_rate, "t": momentum}) # use one of our three different guide types if args.guide == 'auto': guide = AutoDiagonalNormal(sparse_gamma_def.model, init_loc_fn=init_to_feasible) elif args.guide == 'easy': guide = MyEasyGuide(sparse_gamma_def.model) else: guide = sparse_gamma_def.guide # this is the svi object we use during training; we use TraceMeanField_ELBO to # get analytic KL divergences svi = SVI(sparse_gamma_def.model, guide, opt, loss=TraceMeanField_ELBO()) # we use svi_eval during evaluation; since we took care to write down our model in # a fully vectorized way, this computation can be done efficiently with large tensor ops svi_eval = SVI(sparse_gamma_def.model, guide, opt, loss=TraceMeanField_ELBO(num_particles=args.eval_particles, vectorize_particles=True)) print('\nbeginning training with %s guide...' % args.guide) # the training loop for k in range(args.num_epochs): loss = svi.step(data) # for the custom guide we clip parameters after each gradient step if args.guide == 'custom': clip_params() if k % args.eval_frequency == 0 and k > 0 or k == args.num_epochs - 1: loss = svi_eval.evaluate_loss(data) print("[epoch %04d] training elbo: %.4g" % (k, -loss))
def train_vae( model: BaseAutoEncoder, epochs: int, train_loader: DataLoader, test_loader: DataLoader, lr: float, loss_fn: callable, ) -> Tuple[Dict[str, List[float]], Dict[str, List[float]]]: """ Train VAE model. :param model: VAE model :param epochs: number of epochs to train :param train_loader: train dataset loader :param test_loader: test dataset loader :param lr: learning rate :param loss_fn: loss function to be applied :return: training results; see train_metrics and test_metrics """ train_metrics = { "loss": [], "step": [], } test_metrics = { "loss": [], "step": [], } global_step = 0 optimizer = optim.Adam({"lr": lr}) svi = SVI(model.model, model.guide, optimizer, loss=loss_fn) for epoch in trange(epochs): print(f"Epoch: {epoch + 1} / {epochs}.") # training step pbar = tqdm(train_loader) for inputs, _ in pbar: # we are not using labels for training inputs = inputs.view((-1, 28 * 28)) loss = svi.step(inputs) train_metrics["loss"].append(loss / 32) train_metrics["step"].append(global_step) global_step += 1 pbar.update(1) pbar.close() # validation step val_loss = 0.0 for inputs, _ in test_loader: inputs = inputs.view((-1, 28 * 28)) val_loss += svi.evaluate_loss(inputs) test_metrics["loss"].append(val_loss / len(test_loader.dataset)) test_metrics["step"].append(global_step) return train_metrics, test_metrics
def train(device, dataloaders, dataset_sizes, learning_rate, num_epochs, early_stop_patience, model_path, pre_trained_baseline_net): # clear param store pyro.clear_param_store() cvae_net = CVAE(200, 500, 500, pre_trained_baseline_net) cvae_net.to(device) optimizer = pyro.optim.Adam({"lr": learning_rate}) svi = SVI(cvae_net.model, cvae_net.guide, optimizer, loss=Trace_ELBO()) best_loss = np.inf early_stop_count = 0 Path(model_path).parent.mkdir(parents=True, exist_ok=True) for epoch in range(num_epochs): # Each epoch has a training and validation phase for phase in ['train', 'val']: running_loss = 0.0 num_preds = 0 # Iterate over data. bar = tqdm(dataloaders[phase], desc='CVAE Epoch {} {}'.format(epoch, phase).ljust(20)) for i, batch in enumerate(bar): inputs = batch['input'].to(device) outputs = batch['output'].to(device) if phase == 'train': loss = svi.step(inputs, outputs) else: loss = svi.evaluate_loss(inputs, outputs) # statistics running_loss += loss / inputs.size(0) num_preds += 1 if i % 10 == 0: bar.set_postfix(loss='{:.2f}'.format(running_loss / num_preds), early_stop_count=early_stop_count) epoch_loss = running_loss / dataset_sizes[phase] # deep copy the model if phase == 'val': if epoch_loss < best_loss: best_loss = epoch_loss torch.save(cvae_net.state_dict(), model_path) early_stop_count = 0 else: early_stop_count += 1 if early_stop_count >= early_stop_patience: break # Save model weights cvae_net.load_state_dict(torch.load(model_path)) cvae_net.eval() return cvae_net
def main(args):
    train_loader, test_loader = get_data()
    vae = VAE(use_cuda=False)
    optimizer = Adam({"lr": 0.0001})
    # Trace_ELBO replaces the deprecated string-based loss="ELBO" API
    svi = SVI(vae.model, vae.guide, optimizer, loss=Trace_ELBO())

    # setup visdom for visualization
    if args.visdom_flag:
        vis = visdom.Visdom()

    train_elbo = []
    test_elbo = []
    # training loop
    for epoch in range(args.num_epochs):
        # initialize loss accumulator
        epoch_loss = 0.
        # do a training epoch over each mini-batch x returned
        # by the data loader
        for x, _ in train_loader:
            # do ELBO gradient and accumulate loss
            epoch_loss += svi.step(x)

        # report training diagnostics
        normalizer_train = len(train_loader.dataset)
        total_epoch_loss_train = epoch_loss / normalizer_train
        train_elbo.append(total_epoch_loss_train)
        print("[epoch %03d] average training loss: %.4f" % (epoch, total_epoch_loss_train))

        if epoch % args.test_frequency == 0:
            # initialize loss accumulator
            test_loss = 0.
            # compute the loss over the entire test set
            for i, (x, _) in enumerate(test_loader):
                # compute ELBO estimate and accumulate loss
                test_loss += svi.evaluate_loss(x)

                # visualize how well we're reconstructing them
                if i == 0:
                    if args.visdom_flag:
                        plot_vae_samples(vae, vis)

            # report test diagnostics
            normalizer_test = len(test_loader.dataset)
            total_epoch_loss_test = test_loss / normalizer_test
            test_elbo.append(total_epoch_loss_test)
            print("[epoch %03d] average test loss: %.4f" % (epoch, total_epoch_loss_test))
def main(args): # load data print('loading training data...') dataset_directory = get_data_directory(__file__) dataset_path = os.path.join(dataset_directory, 'faces_training.csv') if not os.path.exists(dataset_path): try: os.makedirs(dataset_directory) except OSError as e: if e.errno != errno.EEXIST: raise pass wget.download('https://d2fefpcigoriu7.cloudfront.net/datasets/faces_training.csv', dataset_path) data = torch.tensor(np.loadtxt(dataset_path, delimiter=',')).float() sparse_gamma_def = SparseGammaDEF() # due to the special logic in the custom guide (e.g. parameter clipping), the custom guide # is more numerically stable and enables us to use a larger learning rate (and consequently # achieves better results) learning_rate = 0.2 if args.auto_guide else 4.5 momentum = 0.05 if args.auto_guide else 0.1 opt = optim.AdagradRMSProp({"eta": learning_rate, "t": momentum}) # either use an automatically constructed guide (see pyro.contrib.autoguide for details) or our custom guide guide = AutoDiagonalNormal(sparse_gamma_def.model) if args.auto_guide else sparse_gamma_def.guide # this is the svi object we use during training; we use TraceMeanField_ELBO to # get analytic KL divergences svi = SVI(sparse_gamma_def.model, guide, opt, loss=TraceMeanField_ELBO()) # we use svi_eval during evaluation; since we took care to write down our model in # a fully vectorized way, this computation can be done efficiently with large tensor ops svi_eval = SVI(sparse_gamma_def.model, guide, opt, loss=TraceMeanField_ELBO(num_particles=args.eval_particles, vectorize_particles=True)) guide_description = 'automatically constructed' if args.auto_guide else 'custom' print('\nbeginning training with %s guide...' % guide_description) # the training loop for k in range(args.num_epochs): loss = svi.step(data) if not args.auto_guide: # for the custom guide we clip parameters after each gradient step sparse_gamma_def.clip_params() if k % args.eval_frequency == 0 and k > 0 or k == args.num_epochs - 1: loss = svi_eval.evaluate_loss(data) print("[epoch %04d] training elbo: %.4g" % (k, -loss))
def fit(self, optim=Adam({'lr': 1e-3}), loss=Trace_ELBO(num_particles=1), max_iter=5000, random_instance=None):
    svi = SVI(self.model, self.guide, optim=optim, loss=loss)
    with trange(max_iter) as t:
        for i in t:
            t.set_description(f'Iteration: {i}')
            svi.step(self.data)
            loss = svi.evaluate_loss(self.data)
            with torch.no_grad():
                postfix_kwargs = {}
                if random_instance is not None:
                    g = pyro.param('g')
                    s = pyro.param('s')
                    postfix_kwargs.update({
                        'g': '{0}'.format((g - random_instance.g).abs().mean()),
                        's': '{0}'.format((s - random_instance.s).abs().mean())
                    })
                t.set_postfix(loss=loss, **postfix_kwargs)
def main(args): # load data print('loading training data...') dataset_directory = get_data_directory(__file__) dataset_path = os.path.join(dataset_directory, 'faces_training.csv') if not os.path.exists(dataset_path): try: os.makedirs(dataset_directory) except OSError as e: if e.errno != errno.EEXIST: raise pass wget.download( 'https://d2fefpcigoriu7.cloudfront.net/datasets/faces_training.csv', dataset_path) data = torch.tensor(np.loadtxt(dataset_path, delimiter=',')).float() learning_rate = 4.5 momentum = 0.1 opt = optim.AdagradRMSProp({"eta": learning_rate, "t": momentum}) # this is the svi object we use during training; we use TraceMeanField_ELBO to # get analytic KL divergences svi = SVI(model, guide, opt, loss=TraceMeanField_ELBO()) # we use svi_eval during evaluation; since we took care to write down our model in # a fully vectorized way, this computation can be done efficiently with large tensor ops svi_eval = SVI(model_original, guide, opt, loss=TraceMeanField_ELBO(num_particles=args.eval_particles, vectorize_particles=True)) guide_description = 'custom' print('\nbeginning training with %s guide...' % guide_description) # the training loop for k in range(args.num_epochs): loss = svi.step(data) clip_params() if k % args.eval_frequency == 0 and k > 0 or k == args.num_epochs - 1: loss = svi_eval.evaluate_loss(data) print("[epoch %04d] training elbo: %.4g" % (k, -loss))
def fit(self, optim=Adam({'lr': 5e-2}), loss=Trace_ELBO(num_particles=1), max_iter=5000, random_instance=None):
    svi = SVI(self.model, self.guide, optim=optim, loss=loss)
    with trange(max_iter) as t:
        for i in t:
            t.set_description(f'Iteration: {i}')
            svi.step(self.data)
            loss = svi.evaluate_loss(self.data)
            with torch.no_grad():
                postfix_kwargs = {}
                if random_instance is not None:
                    b = pyro.param('b')
                    postfix_kwargs['threshold_error'] = '{0}'.format((b - random_instance.b).abs().mean())
                    if self._model in ('irt_2pl', 'irt_3pl', 'irt_4pl'):
                        a = pyro.param('a')
                        postfix_kwargs['slop_error'] = '{0}'.format((a - random_instance.a).abs().mean())
                    if self._model in ('irt_3pl', 'irt_4pl'):
                        c = pyro.param('c')
                        postfix_kwargs['guess_error'] = '{0}'.format((c - random_instance.c).abs().mean())
                    if self._model == 'irt_4pl':
                        d = pyro.param('d')
                        postfix_kwargs['slip_error'] = '{0}'.format((d - random_instance.d).abs().mean())
                t.set_postfix(loss=loss, **postfix_kwargs)
def _valid_epoch(self, epoch):
    """
    Validate after training an epoch

    :param epoch: Integer, current training epoch.
    :return: A log that contains information about validation
    """
    if self.jit:
        elbo = JitTraceGraph_ELBO(vectorize_particles=False, num_particles=self.num_particles)
    else:
        elbo = TraceGraph_ELBO(vectorize_particles=False, num_particles=self.num_particles)
    svi = SVI(self.model.model, self.model.guide, self.optimizer, loss=elbo)

    self.model.eval()
    self.valid_metrics.reset()
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(self.valid_data_loader):
            data, target = data.to(self.device), target.to(self.device)
            # evaluate_loss returns a Python float, so no .item() call is needed
            loss = svi.evaluate_loss(observations=data)

            self.writer.set_step(
                (epoch - 1) * len(self.valid_data_loader) + batch_idx, 'valid')
            self.valid_metrics.update('loss', loss)
            for met in self.metric_ftns:
                self.valid_metrics.update(met.__name__, met(target))
            self.writer.add_image(
                'input', make_grid(data.cpu(), nrow=8, normalize=True))

    # add histogram of model parameters to the tensorboard
    for name, p in self.model.named_parameters():
        self.writer.add_histogram(name, p, bins='auto')

    return self.valid_metrics.result()
def main(args): # clear param store pyro.clear_param_store() # setup MNIST data loaders # train_loader, test_loader train_loader, test_loader = setup_data_loaders(MNIST, use_cuda=args.cuda, batch_size=256) # setup the VAE vae = VAE(use_cuda=args.cuda) # setup the optimizer adam_args = {"lr": args.learning_rate} optimizer = Adam(adam_args) # setup the inference algorithm elbo = JitTrace_ELBO() if args.jit else Trace_ELBO() svi = SVI(vae.model, vae.guide, optimizer, loss=elbo) # setup visdom for visualization if args.visdom_flag: vis = visdom.Visdom() train_elbo = [] test_elbo = [] # training loop for epoch in range(args.num_epochs): # initialize loss accumulator epoch_loss = 0. # do a training epoch over each mini-batch x returned # by the data loader for x, _ in train_loader: # if on GPU put mini-batch into CUDA memory if args.cuda: x = x.cuda() # do ELBO gradient and accumulate loss epoch_loss += svi.step(x) # report training diagnostics normalizer_train = len(train_loader.dataset) total_epoch_loss_train = epoch_loss / normalizer_train train_elbo.append(total_epoch_loss_train) print("[epoch %03d] average training loss: %.4f" % (epoch, total_epoch_loss_train)) if epoch % args.test_frequency == 0: # initialize loss accumulator test_loss = 0. # compute the loss over the entire test set for i, (x, _) in enumerate(test_loader): # if on GPU put mini-batch into CUDA memory if args.cuda: x = x.cuda() # compute ELBO estimate and accumulate loss test_loss += svi.evaluate_loss(x) # pick three random test images from the first mini-batch and # visualize how well we're reconstructing them if i == 0: if args.visdom_flag: plot_vae_samples(vae, vis) reco_indices = np.random.randint(0, x.shape[0], 3) for index in reco_indices: test_img = x[index, :] reco_img = vae.reconstruct_img(test_img) vis.image(test_img.reshape( 28, 28).detach().cpu().numpy(), opts={'caption': 'test image'}) vis.image(reco_img.reshape( 28, 28).detach().cpu().numpy(), opts={'caption': 'reconstructed image'}) # report test diagnostics normalizer_test = len(test_loader.dataset) total_epoch_loss_test = test_loss / normalizer_test test_elbo.append(total_epoch_loss_test) print("[epoch %03d] average test loss: %.4f" % (epoch, total_epoch_loss_test)) if epoch == args.tsne_iter: mnist_test_tsne(vae=vae, test_loader=test_loader) plot_llk(np.array(train_elbo), np.array(test_elbo)) return vae
train_loss += svi.step(x) if args.verbose and i % 1e3 == 0: print(">>> [{:03d}%] current training ELBO: {:.3f}".format( np.round(100 * (i+1) * args.batch_size / len(train_loader.dataset)).astype(int), train_loss)) # testing for i, (x, _) in enumerate(test_loader): if opts['use_cuda']: x = x.cuda() # wrap mini-batch in pytorch variable, # compute ELBO estimate and accumulate loss x = Variable(x) test_loss += svi.evaluate_loss(x) if args.verbose and i % 1e3 == 0: print(">>> [{:03d}%] current testing ELBO: {:.3f}".format( np.round(100 * (i+1) * args.batch_size / len(test_loader.dataset)).astype(int), test_loss)) # record mean training and testing losses train_elbo[epoch] = -train_loss / len(train_loader.dataset) test_elbo[epoch] = -test_loss / len(test_loader.dataset) # logging log = '[Epoch {:03d}/{:03d}] Training ELBO: {:.4f}, Testing ELBO: {:.4f}, Mins: {:.1f}'.format( epoch + 1, args.epochs, train_elbo[epoch], test_elbo[epoch], (dt.now() - start_time).total_seconds() / 60) print('>>> {}'.format(log))
for i, data in enumerate(train_loader):
    x, targets = data
    targets = targets.view(-1)
    loss = svi.step(x.to(device), targets.to(device))
    train_props['loss'] += loss
L = len(train_loader)
train_props = {k: v / L for k, v in train_props.items()}
cv_props = {k: 0 for k in status_properties}
for j, data in enumerate(cv_loader):
    x, targets = data
    targets = targets.view(-1)
    # .to(device) is not in-place; the results must be assigned back
    x = x.to(device)
    targets = targets.to(device)
    preds = clf.predict(x)
    cv_props['loss'] += svi.evaluate_loss(x, targets)
    cv_props['accuracy'] += accuracy(preds.to(device), targets)
L = len(cv_loader)
cv_props = {k: v / L for k, v in cv_props.items()}
if cv_props['loss'] < best_loss:
    print('Saving state')
    state = {
        'state_dict': clf.state_dict(),
        'train_props': train_props,
        'cv_props': cv_props
    }
    torch.save(state, 'nn_state.pth.tar')
    torch.save(opt, 'nn_opt.pth.tar')
status(epoch, train_props, cv_props)
except KeyboardInterrupt:
train_elbo.append(-total_epoch_loss_train)

# --------------------------Do testing for each epoch here--------------------------------
test_loss = 0.
# compute the loss over the entire test set
for x_test, y_test in test_loader:
    x_test = x_test.cuda()
    y_test = y_test.cuda()
    # compute ELBO estimate and accumulate loss
    y_test_2 = torch.Tensor.cpu(y_test.reshape(1, y_test.size()[0])[0]).numpy().astype(int)
    labels_y_test = torch.from_numpy(np.eye(2)[y_test_2])
    test_loss += svi.evaluate_loss(x_test.reshape(-1, 10000), labels_y_test.cuda().float())

normalizer_test = len(test_loader.dataset)
total_epoch_loss_test = test_loss / normalizer_test

print("[epoch %03d] average training loss: %.4f testing loss: %.4f" % (epoch, total_epoch_loss_train, total_epoch_loss_test))

df['learning_rate'][count] = LEARNING_RATE
df['train_loss'][count] = total_epoch_loss_train
count = count + 1
print('+++++++++++++++++++++++++++++++++++++Incrementing Learning Rate++++++++++++++++++++++++++++++++++++')
learning_rates.append(LEARNING_RATE)
train_losses.append(total_epoch_loss_train)
df.to_csv('data_lr_experiment_sup_d'+str(d)+'.csv')
def main(): # parse command line arguments parser = argparse.ArgumentParser(description="parse args") parser.add_argument('-n', '--num-epochs', default=101, type=int, help='number of training epochs') parser.add_argument('-tf', '--test-frequency', default=5, type=int, help='how often we evaluate the test set') parser.add_argument('-lr', '--learning-rate', default=1.0e-3, type=float, help='learning rate') parser.add_argument('-b1', '--beta1', default=0.95, type=float, help='beta1 adam hyperparameter') parser.add_argument('--cuda', action='store_true', default=False, help='whether to use cuda') parser.add_argument('-visdom', '--visdom_flag', default=False, help='Whether plotting in visdom is desired') parser.add_argument('-i-tsne', '--tsne_iter', default=100, type=int, help='epoch when tsne visualization runs') args = parser.parse_args() # setup MNIST data loaders # train_loader, test_loader train_loader, test_loader = setup_data_loaders(MNIST, use_cuda=args.cuda, batch_size=256) # setup the VAE vae = VAE(use_cuda=args.cuda) # setup the optimizer adam_args = {"lr": args.learning_rate} optimizer = Adam(adam_args) # setup the inference algorithm svi = SVI(vae.model, vae.guide, optimizer, loss="ELBO") # setup visdom for visualization if args.visdom_flag: vis = visdom.Visdom() train_elbo = [] test_elbo = [] # training loop for epoch in range(args.num_epochs): # initialize loss accumulator epoch_loss = 0. # do a training epoch over each mini-batch x returned # by the data loader for _, (x, _) in enumerate(train_loader): # if on GPU put mini-batch into CUDA memory if args.cuda: x = x.cuda() # wrap the mini-batch in a PyTorch Variable x = Variable(x) # do ELBO gradient and accumulate loss epoch_loss += svi.step(x) # report training diagnostics normalizer_train = len(train_loader.dataset) total_epoch_loss_train = epoch_loss / normalizer_train train_elbo.append(total_epoch_loss_train) print("[epoch %03d] average training loss: %.4f" % (epoch, total_epoch_loss_train)) if epoch % args.test_frequency == 0: # initialize loss accumulator test_loss = 0. # compute the loss over the entire test set for i, (x, _) in enumerate(test_loader): # if on GPU put mini-batch into CUDA memory if args.cuda: x = x.cuda() # wrap the mini-batch in a PyTorch Variable x = Variable(x) # compute ELBO estimate and accumulate loss test_loss += svi.evaluate_loss(x) # pick three random test images from the first mini-batch and # visualize how well we're reconstructing them if i == 0: if args.visdom_flag: plot_vae_samples(vae, vis) reco_indices = np.random.randint(0, x.size(0), 3) for index in reco_indices: test_img = x[index, :] reco_img = vae.reconstruct_img(test_img) vis.image(test_img.contiguous().view(28, 28).data.cpu().numpy(), opts={'caption': 'test image'}) vis.image(reco_img.contiguous().view(28, 28).data.cpu().numpy(), opts={'caption': 'reconstructed image'}) # report test diagnostics normalizer_test = len(test_loader.dataset) total_epoch_loss_test = test_loss / normalizer_test test_elbo.append(total_epoch_loss_test) print("[epoch %03d] average test loss: %.4f" % (epoch, total_epoch_loss_test)) if epoch == args.tsne_iter: mnist_test_tsne(vae=vae, test_loader=test_loader) plot_llk(np.array(train_elbo), np.array(test_elbo)) return vae
def evaluate(dmm: nn.Module, svi: SVI, data_loader: DataLoader) -> float: dmm.eval() return sum(svi.evaluate_loss(x) for x in data_loader) / len(data_loader.dataset)
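# A hedged variant of the `evaluate` helper above (an assumption, not the original
# authors' code): `svi.evaluate_loss` only estimates the ELBO and never takes an
# optimizer step, so wrapping the loop in `torch.no_grad()` avoids building
# autograd graphs during validation.
import torch
from torch import nn
from torch.utils.data import DataLoader
from pyro.infer import SVI

def evaluate_no_grad(dmm: nn.Module, svi: SVI, data_loader: DataLoader) -> float:
    dmm.eval()  # switch the underlying networks to eval mode (dropout, batch norm)
    with torch.no_grad():
        total = sum(svi.evaluate_loss(x) for x in data_loader)
    return total / len(data_loader.dataset)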
iaf_dim=50, use_cuda=True) learning_rate = 0.01 beta1 = 0.9 beta2 = 0.999 clip_norm = 10.0 lr_decay = 1.0 weight_decay = 0 adam_params = { "lr": learning_rate, "betas": (beta1, beta2), "clip_norm": clip_norm, "lrd": lr_decay, "weight_decay": weight_decay } adam = ClippedAdam(adam_params) elbo = Trace_ELBO() svi = SVI(dmm.model, dmm.guide, adam, loss=elbo) for i in range(100): loss = svi.step(input_tensor, input_tensor_reversed, input_tensor_mask) val_nll = svi.evaluate_loss(input_tensor, input_tensor_reversed, input_tensor_mask) print(val_nll) _, _, loss_loc, loss_scale = do_prediction(dmm, pred_tensor, pred_tensor_reversed, pred_tensor_mask, 5, ground_truth) print(loss_loc, loss_scale)
class SVIExperiment(BaseCovariateExperiment): def __init__(self, hparams, pyro_model: BaseSEM): super().__init__(hparams, pyro_model) self.svi_loss = CustomELBO(num_particles=hparams.num_svi_particles) self._build_svi() def _build_svi(self, loss=None): def per_param_callable(module_name, param_name): params = { 'eps': 1e-5, 'amsgrad': self.hparams.use_amsgrad, 'weight_decay': self.hparams.l2 } if 'flow_components' in module_name or 'sex_logits' in param_name: params['lr'] = self.hparams.pgm_lr else: params['lr'] = self.hparams.lr print( f'building opt for {module_name} - {param_name} with p: {params}' ) return params if loss is None: loss = self.svi_loss if self.hparams.use_cf_guide: def guide(*args, **kwargs): return self.pyro_model.counterfactual_guide( *args, **kwargs, counterfactual_type=self.hparams.cf_elbo_type) self.svi = SVI(self.pyro_model.svi_model, guide, Adam(per_param_callable), loss) else: self.svi = SVI(self.pyro_model.svi_model, self.pyro_model.svi_guide, Adam(per_param_callable), loss) self.svi.loss_class = loss def backward(self, *args, **kwargs): pass # No loss to backpropagate since we're using Pyro's optimisation machinery def print_trace_updates(self, batch): with torch.no_grad(): print('Traces:\n' + ('#' * 10)) guide_trace = pyro.poutine.trace( self.pyro_model.svi_guide).get_trace(**batch) model_trace = pyro.poutine.trace( pyro.poutine.replay(self.pyro_model.svi_model, trace=guide_trace)).get_trace(**batch) guide_trace = pyro.poutine.util.prune_subsample_sites(guide_trace) model_trace = pyro.poutine.util.prune_subsample_sites(model_trace) model_trace.compute_log_prob() guide_trace.compute_score_parts() print(f'model: {model_trace.nodes.keys()}') for name, site in model_trace.nodes.items(): if site["type"] == "sample": fn = site['fn'] if isinstance(fn, Independent): fn = fn.base_dist print(f'{name}: {fn} - {fn.support}') log_prob_sum = site["log_prob_sum"] is_obs = site["is_observed"] print( f'model - log p({name}) = {log_prob_sum} | obs={is_obs}' ) if torch.isnan(log_prob_sum): value = site['value'][0] conc0 = fn.concentration0 conc1 = fn.concentration1 print(f'got:\n{value}\n{conc0}\n{conc1}') raise Exception() print(f'guide: {guide_trace.nodes.keys()}') for name, site in guide_trace.nodes.items(): if site["type"] == "sample": fn = site['fn'] if isinstance(fn, Independent): fn = fn.base_dist print(f'{name}: {fn} - {fn.support}') entropy = site["score_parts"].entropy_term.sum() is_obs = site["is_observed"] print(f'guide - log q({name}) = {entropy} | obs={is_obs}') def get_trace_metrics(self, batch): metrics = {} model = self.svi.loss_class.trace_storage['model'] guide = self.svi.loss_class.trace_storage['guide'] metrics['log p(x)'] = model.nodes['x']['log_prob'].mean() metrics['log p(age)'] = model.nodes['age']['log_prob'].mean() metrics['log p(sex)'] = model.nodes['sex']['log_prob'].mean() metrics['log p(ventricle_volume)'] = model.nodes['ventricle_volume'][ 'log_prob'].mean() metrics['log p(brain_volume)'] = model.nodes['brain_volume'][ 'log_prob'].mean() metrics['p(z)'] = model.nodes['z']['log_prob'].mean() metrics['q(z)'] = guide.nodes['z']['log_prob'].mean() metrics['log p(z) - log q(z)'] = metrics['p(z)'] - metrics['q(z)'] return metrics def prep_batch(self, batch): x = batch['image'] * 255. 
age = batch['age'].unsqueeze(1).float() sex = batch['sex'].unsqueeze(1).float() ventricle_volume = batch['ventricle_volume'].unsqueeze(1).float() brain_volume = batch['brain_volume'].unsqueeze(1).float() x = x.float() if self.training: x += torch.rand_like(x) return { 'x': x, 'age': age, 'sex': sex, 'ventricle_volume': ventricle_volume, 'brain_volume': brain_volume } def training_step(self, batch, batch_idx): batch = self.prep_batch(batch) if self.hparams.validate: print('Validation:') self.print_trace_updates(batch) loss = self.svi.step(**batch) metrics = self.get_trace_metrics(batch) if np.isnan(loss): self.logger.experiment.add_text( 'nan', f'nand at {self.current_epoch}:\n{metrics}') raise ValueError( 'loss went to nan with metrics:\n{}'.format(metrics)) tensorboard_logs = {('train/' + k): v for k, v in metrics.items()} tensorboard_logs['train/loss'] = loss return {'loss': torch.Tensor([loss]), 'log': tensorboard_logs} def validation_step(self, batch, batch_idx): batch = self.prep_batch(batch) loss = self.svi.evaluate_loss(**batch) metrics = self.get_trace_metrics(batch) return {'loss': loss, **metrics} def test_step(self, batch, batch_idx): batch = self.prep_batch(batch) loss = self.svi.evaluate_loss(**batch) metrics = self.get_trace_metrics(batch) samples = self.build_test_samples(batch) return {'loss': loss, **metrics, 'samples': samples} @classmethod def add_arguments(cls, parser): parser = super().add_arguments(parser) parser.add_argument( '--num_svi_particles', default=4, type=int, help="number of particles to use for ELBO (default: %(default)s)") parser.add_argument( '--num_sample_particles', default=32, type=int, help= "number of particles to use for MC sampling (default: %(default)s)" ) parser.add_argument( '--use_cf_guide', default=False, action='store_true', help="whether to use counterfactual guide (default: %(default)s)") parser.add_argument( '--cf_elbo_type', default=-1, choices=[-1, 0, 1, 2], help= "-1: randomly select per batch, 0: shuffle thickness, 1: shuffle intensity, 2: shuffle both (default: %(default)s)" ) return parser
train_props['accuracy'] += a.item() # train_props['accuracy_1'] += a1 # train_props['accuracy_2'] += a2 # train_props['accuracy_3'] += a3 L = len(train_loader) train_props = {k:v/L for k,v in train_props.items()} cv_props = {k:0 for k in status_properties} for j, data in enumerate(cv_loader): x, targets = data targets = targets.view(-1) x = x.to(device) targets = targets.to(device) clf.eval() preds = clf.predict(x) cv_props['loss'] += svi.evaluate_loss(x, targets) # preds = F.log_softmax(preds, dim=1) # preds = torch.argmax(preds, dim=1) # a, a1, a2, a3 = accuracy(preds, targets) # a = accuracy(preds, targets) a = (preds == targets).float().mean() cv_props['accuracy'] += a.item() # cv_props['accuracy_1'] += a1 # cv_props['accuracy_2'] += a2 # cv_props['accuracy_3'] += a3 L = len(cv_loader) cv_props = {k:v/L for k,v in cv_props.items()} # if cv_props['loss'] < best_loss: if cv_props['accuracy'] > best_acc: print('Saving state') state = {'state_dict': clf.state_dict(), 'train_props': train_props, 'cv_props': cv_props, 'epoch': epoch}
class SVIExperiment(BaseCovariateExperiment): def __init__(self, hparams, pyro_model: BaseSEM): super().__init__(hparams, pyro_model) if hparams.tracegraph_elbo: self.svi_loss = StorageTraceGraph_ELBO( num_particles=hparams.num_svi_particles) else: self.svi_loss = StorageTrace_ELBO( num_particles=hparams.num_svi_particles) self._build_svi() def _build_svi(self, loss=None): def per_param_callable(module_name, param_name): if self.hparams.use_adagrad_rmsprop: params = { 'eta': self.hparams.eta, 'delta': self.hparams.delta, 't': self.hparams.t } else: params = { 'weight_decay': self.hparams.weight_decay, 'betas': self.hparams.betas, 'eps': 1e-5 } if any([(pn in module_name) for pn in ('prior_flow', 'posterior_flow')]): params['lr'] = self.hparams.lr elif 'affine' in module_name: params['lr'] = self.hparams.lr params['weight_decay'] = 0. elif 'flow_components' in module_name: params['lr'] = self.hparams.pgm_lr elif 'sex_logits' in param_name: params['lr'] = self.hparams.pgm_lr params['weight_decay'] = 0. elif 'decoder' in module_name and 'logstd_head' in param_name: params['weight_decay'] = self.hparams.logstd_weight_decay else: params['lr'] = self.hparams.lr logger.info( f'building opt for {module_name} - {param_name} with p: {params}' ) return params def per_param_clip_args(module_name, param_name): clip_args = defaultdict(lambda: None) if any([(pn in module_name) for pn in ('prior_flow', 'posterior_flow')]): clip_args['clip_norm'] = self.hparams.flow_clip_norm elif any([(pn in param_name) for pn in ('affine', 'sex_logits', 'flow_components')]): clip_args['clip_norm'] = self.hparams.pgm_clip_norm else: clip_args['clip_norm'] = self.hparams.clip_norm logger.info( f'building clip args for {module_name} - {param_name} with p: {clip_args}' ) return clip_args if loss is None: loss = self.svi_loss optimizer = AdagradRMSProp if self.hparams.use_adagrad_rmsprop else AdamW verbose = self.hparams.verbosity > 1 # only print lr in debug mode if self.hparams.use_exponential_lr: self.scheduler = ExponentialLR( { 'optimizer': optimizer, 'optim_args': per_param_callable, 'gamma': self.hparams.lrd, 'verbose': verbose }, clip_args=per_param_clip_args) else: self.scheduler = OneCycleLR( { 'optimizer': optimizer, 'optim_args': per_param_callable, 'epochs': self.hparams.n_epochs, 'steps_per_epoch': self._steps_per_epoch(), 'pct_start': self.hparams.pct_start, 'div_factor': self.hparams.div_factor, 'final_div_factor': self.hparams.final_div_factor, 'verbose': verbose }, clip_args=per_param_clip_args) if self.hparams.use_cf_guide: def guide(*args, **kwargs): return self.pyro_model.counterfactual_guide( *args, **kwargs, counterfactual_type=self.hparams.cf_elbo_type) self.svi = SVI(self.pyro_model.svi_model, guide, self.scheduler, loss) else: self.svi = SVI(self.pyro_model.svi_model, self.pyro_model.svi_guide, self.scheduler, loss) self.svi.loss_class = loss def backward(self, *args, **kwargs): pass # No loss to backpropagate since we're using Pyro's optimisation machinery def print_trace_updates(self, batch): with torch.no_grad(): logger.info('Traces:\n' + ('#' * 10)) guide_trace = pyro.poutine.trace( self.pyro_model.svi_guide).get_trace(batch) model_trace = pyro.poutine.trace( pyro.poutine.replay(self.pyro_model.svi_model, trace=guide_trace)).get_trace(batch) guide_trace = pyro.poutine.util.prune_subsample_sites(guide_trace) model_trace = pyro.poutine.util.prune_subsample_sites(model_trace) model_trace.compute_log_prob() guide_trace.compute_score_parts() logging.info(f'model: {model_trace.nodes.keys()}') for name, 
site in model_trace.nodes.items(): if site["type"] == "sample": fn = site['fn'] if isinstance(fn, Independent): fn = fn.base_dist try: logging.info(f'{name}: {fn} - {fn.support}') except NotImplementedError: logging.info(f'{name}: {fn}') log_prob_sum = site["log_prob_sum"] is_obs = site["is_observed"] logging.info( f'model - log p({name}) = {log_prob_sum} | obs={is_obs}' ) if torch.isnan(log_prob_sum): value = site['value'][0] conc0 = fn.concentration0 conc1 = fn.concentration1 raise RuntimeError( f'Error: \n{value}\n{conc0}\n{conc1}') logging.info(f'guide: {guide_trace.nodes.keys()}') for name, site in guide_trace.nodes.items(): if site["type"] == "sample": fn = site['fn'] if isinstance(fn, Independent): fn = fn.base_dist try: logging.info(f'{name}: {fn} - {fn.support}') except NotImplementedError: logging.info(f'{name}: {fn}') entropy = site["score_parts"].entropy_term.sum() is_obs = site["is_observed"] logging.info( f'guide - log q({name}) = {entropy} | obs={is_obs}') def get_trace_metrics(self, batch): metrics = {} model = self.svi.loss_class.trace_storage['model'] guide = self.svi.loss_class.trace_storage['guide'] for k in self.required_data: metrics[f'log p({k})'] = model.nodes[k]['log_prob'].mean() if self.pyro_model.n_levels > 0: metrics['log p(z) - log q(z)'] = 0. for i in range(self.pyro_model.n_levels): metrics[f'log p(z{i})'] = model.nodes[f'z{i}'][ 'log_prob'].mean() metrics[f'log q(z{i})'] = guide.nodes[f'z{i}'][ 'log_prob'].mean() metrics['log p(z) - log q(z)'] += metrics[ f'log p(z{i})'] - metrics[f'log q(z{i})'] else: metrics['log p(z)'] = model.nodes['z']['log_prob'].mean() metrics['log q(z)'] = guide.nodes['z']['log_prob'].mean() metrics['log p(z) - log q(z)'] = metrics['log p(z)'] - metrics[ 'log q(z)'] return metrics def _theis_noise(self, obs): """ add noise to discrete variables per Theis 2016 """ if self.training: obs['x'] += (torch.rand_like(obs['x']) - 0.5) obs['slice_number'] += (torch.rand_like(obs['slice_number']) - 0.5) obs['duration'] += torch.rand_like(obs['duration'] - 0.5) obs['duration'].clamp_(min=1e-4) obs['edss'] += ((torch.rand_like(obs['edss']) / 2.) - 0.25) obs['edss'].clamp_(min=1e-4) return obs @property def pseudo3d(self): return self.pyro_model.pseudo3d def prep_batch(self, batch): x = 255. * batch['image'].float( ) # multiply by 255 b/c preprocess tfms out = dict(x=x) for k in self.required_data: if k in batch: out[k] = batch[k].unsqueeze(1).float() out = self._theis_noise(out) return out def _steps_per_epoch(self): return len(self.calabresi_train ) // self.train_batch_size # integer div b/c drop_last used def _set_annealing_factor(self, batch_idx=None): steps_per_epoch = self._steps_per_epoch() if batch_idx is None: batch_idx = steps_per_epoch not_in_sanity_check = self.hparams.annealing_epochs > 0 in_annealing_epochs = self.current_epoch < self.hparams.annealing_epochs n_levels = max(self.pyro_model.n_levels, 1) self.pyro_model.annealing_factor = [1. 
for _ in range(n_levels)] for i in range(n_levels): if not_in_sanity_check and in_annealing_epochs and self.training: min_af = self.hparams.min_annealing_factor[i] max_af = self.hparams.max_annealing_factor[i] self.pyro_model.annealing_factor[i] = min_af + (max_af - min_af) * \ (float(batch_idx + self.current_epoch * steps_per_epoch + 1) / float(self.hparams.annealing_epochs * steps_per_epoch)) else: self.pyro_model.annealing_factor[ i] = self.hparams.max_annealing_factor[i] if self.training: self.log(f'annealing_factor/af{i}', self.pyro_model.annealing_factor[i], on_step=False, on_epoch=True) def training_step(self, batch, batch_idx): self._set_annealing_factor(batch_idx) batch = self.prep_batch(batch) if self.hparams.validate: logging.info('Validation:') self.print_trace_updates(batch) loss = self.svi.step(batch) self.scheduler.step() loss = torch.as_tensor(loss) self.log('train_loss', loss, on_step=False, on_epoch=True) metrics = self.get_trace_metrics(batch) if np.isnan(loss): self.logger.experiment.add_text( 'nan', f'nand at {self.current_epoch}:\n{metrics}') raise ValueError( 'loss went to nan with metrics:\n{}'.format(metrics)) for k, v in metrics.items(): self.log('train/' + k, v, on_step=False, on_epoch=True) return loss def validation_step(self, batch, batch_idx): self._set_annealing_factor() batch = self.prep_batch(batch) loss = self.svi.evaluate_loss(batch) self.log('val_loss', loss, on_step=False, on_epoch=True) metrics = self.get_trace_metrics(batch) for k, v in metrics.items(): self.log('val/' + k, v, on_step=False, on_epoch=True) return metrics def test_step(self, batch, batch_idx): import nibabel as nib self._set_annealing_factor() subject = int(batch['subject'][0]) scan = int(batch['scan'][0]) batch = self.prep_batch(batch) loss = self.svi.evaluate_loss(batch) self.log('test_loss', loss, on_step=False, on_epoch=True) metrics = self.get_trace_metrics(batch) for k, v in metrics.items(): self.log('test/' + k, v, on_step=False, on_epoch=True) samples = self.build_test_samples(batch) for intervention, data in samples.items(): cf = data['x'].detach().cpu().numpy() if self.hparams.pseudo3d: cf = cf[:, 1, ...] 
# get the middle slices cf = cf.squeeze() fn = os.path.join(self.hparams.test_dir, f'{subject}_{scan}_{intervention}.nii.gz') nib.Nifti1Image(cf, None).to_filename(fn) return {'samples': samples, 'metrics': metrics} @classmethod def add_arguments(cls, parser): parser = super().add_arguments(parser) parser.add_argument( '--num-svi-particles', default=4, type=int, help="number of particles to use for ELBO (default: %(default)s)") parser.add_argument( '--num-sample-particles', default=32, type=int, help= "number of particles to use for MC sampling (default: %(default)s)" ) parser.add_argument( '--use-cf-guide', default=False, action='store_true', help="whether to use counterfactual guide (default: %(default)s)") parser.add_argument( '--cf-elbo-type', default=-1, choices=[-1, 0, 1, 2], help= "-1: randomly select per batch, 0: shuffle thickness, 1: shuffle intensity, 2: shuffle both (default: %(default)s)" ) parser.add_argument( '--annealing-epochs', default=50, type=int, help="anneal kl div in z for this # epochs (default: %(default)s)") parser.add_argument( '--min-annealing-factor', default=[0.2], type=float, nargs='+', help= "anneal kl div in z starting here (per level for hierarchical) (default: %(default)s)" ) parser.add_argument( '--max-annealing-factor', default=[1.0], type=float, nargs='+', help= "anneal kl div in z ending here (per level for hierarchical) (default: %(default)s)" ) parser.add_argument( '--tracegraph-elbo', default=False, action='store_true', help= "use tracegraph elbo (much more computationally expensive) (default: %(default)s)" ) return parser
def train(args, DATA_PATH): # clear param store pyro.clear_param_store() #pyro.enable_validation(True) # train_loader, test_loader transform = {} transform["train"] = transforms.Compose([ transforms.Resize((400, 400)), transforms.ToTensor(), ]) transform["test"] = transforms.Compose( [transforms.Resize((400, 400)), transforms.ToTensor()]) train_loader, test_loader = setup_data_loaders( dataset=GameCharacterFullData, root_path=DATA_PATH, batch_size=32, transforms=transform) # setup the VAE vae = VAE(use_cuda=args.cuda, num_labels=17) # setup the exponential learning rate scheduler optimizer = torch.optim.Adam scheduler = pyro.optim.ExponentialLR({ 'optimizer': optimizer, 'optim_args': { 'lr': args.learning_rate }, 'gamma': 0.1 }) # setup the inference algorithm elbo = JitTrace_ELBO() if args.jit else Trace_ELBO() svi = SVI(vae.model, vae.guide, scheduler, loss=elbo) # setup visdom for visualization if args.visdom_flag: vis = visdom.Visdom(port='8097') train_elbo = [] test_elbo = [] # training loop for epoch in range(args.num_epochs): # initialize loss accumulator epoch_loss = 0. # do a training epoch over each mini-batch x returned # by the data loader for x, y, actor, reactor, actor_type, reactor_type, action, reaction in train_loader: # if on GPU put mini-batch into CUDA memory if args.cuda: x = x.cuda() y = y.cuda() actor = actor.cuda() reactor = reactor.cuda() actor_type = actor_type.cuda() reactor_type = reactor_type.cuda() action = action.cuda() reaction = reaction.cuda() # do ELBO gradient and accumulate loss epoch_loss += svi.step(x, y, actor, reactor, actor_type, reactor_type, action, reaction) # report training diagnostics normalizer_train = len(train_loader.dataset) total_epoch_loss_train = epoch_loss / normalizer_train train_elbo.append(total_epoch_loss_train) print("[epoch %03d] average training loss: %.4f" % (epoch, total_epoch_loss_train)) if epoch % args.test_frequency == 0: # initialize loss accumulator test_loss = 0. # compute the loss over the entire test set for i, (x, y, actor, reactor, actor_type, reactor_type, action, reaction) in enumerate(test_loader): # if on GPU put mini-batch into CUDA memory if args.cuda: x = x.cuda() y = y.cuda() actor = actor.cuda() reactor = reactor.cuda() actor_type = actor_type.cuda() reactor_type = reactor_type.cuda() action = action.cuda() reaction = reaction.cuda() # compute ELBO estimate and accumulate loss test_loss += svi.evaluate_loss(x, y, actor, reactor, actor_type, reactor_type, action, reaction) # pick three random test images from the first mini-batch and # visualize how well we're reconstructing them if i == 0: if args.visdom_flag: plot_vae_samples(vae, vis) reco_indices = np.random.randint(0, x.shape[0], 3) for index in reco_indices: test_img = x[index, :] reco_img = vae.reconstruct_img(test_img) vis.image(test_img.reshape( 400, 400).detach().cpu().numpy(), opts={'caption': 'test image'}) vis.image(reco_img.reshape( 400, 400).detach().cpu().numpy(), opts={'caption': 'reconstructed image'}) # report test diagnostics normalizer_test = len(test_loader.dataset) total_epoch_loss_test = test_loss / normalizer_test test_elbo.append(total_epoch_loss_test) print("[epoch %03d] average test loss: %.4f" % (epoch, total_epoch_loss_test)) return vae, optimizer
def main(args): pyro.set_rng_seed(0) pyro.clear_param_store() pyro.enable_validation(__debug__) # load data if args.dataset == "dipper": capture_history_file = os.path.dirname( os.path.abspath(__file__)) + '/dipper_capture_history.csv' elif args.dataset == "vole": capture_history_file = os.path.dirname( os.path.abspath(__file__)) + '/meadow_voles_capture_history.csv' else: raise ValueError("Available datasets are \'dipper\' and \'vole\'.") capture_history = torch.tensor( np.genfromtxt(capture_history_file, delimiter=',')).float()[:, 1:] N, T = capture_history.shape print( "Loaded {} capture history for {} individuals collected over {} time periods." .format(args.dataset, N, T)) if args.dataset == "dipper" and args.model in ["4", "5"]: sex_file = os.path.dirname( os.path.abspath(__file__)) + '/dipper_sex.csv' sex = torch.tensor(np.genfromtxt(sex_file, delimiter=',')).float()[:, 1] print("Loaded dipper sex data.") elif args.dataset == "vole" and args.model in ["4", "5"]: raise ValueError( "Cannot run model_{} on meadow voles data, since we lack sex " + "information for these animals.".format(args.model)) else: sex = None model = models[args.model] # we use poutine.block to only expose the continuous latent variables # in the models to AutoDiagonalNormal (all of which begin with 'phi' # or 'rho') def expose_fn(msg): return msg["name"][0:3] in ['phi', 'rho'] # we use a mean field diagonal normal variational distributions (i.e. guide) # for the continuous latent variables. guide = AutoDiagonalNormal(poutine.block(model, expose_fn=expose_fn)) # since we enumerate the discrete random variables, # we need to use TraceEnum_ELBO or TraceTMC_ELBO. optim = Adam({'lr': args.learning_rate}) if args.tmc: elbo = TraceTMC_ELBO(max_plate_nesting=1) tmc_model = poutine.infer_config(model, lambda msg: { "num_samples": args.tmc_num_samples, "expand": False } if msg["infer"].get("enumerate", None) == "parallel" else {} ) # noqa: E501 svi = SVI(tmc_model, guide, optim, elbo) else: elbo = TraceEnum_ELBO(max_plate_nesting=1, num_particles=20, vectorize_particles=True) svi = SVI(model, guide, optim, elbo) losses = [] print( "Beginning training of model_{} with Stochastic Variational Inference." .format(args.model)) for step in range(args.num_steps): loss = svi.step(capture_history, sex) losses.append(loss) if step % 20 == 0 and step > 0 or step == args.num_steps - 1: print("[iteration %03d] loss: %.3f" % (step, np.mean(losses[-20:]))) # evaluate final trained model elbo_eval = TraceEnum_ELBO(max_plate_nesting=1, num_particles=2000, vectorize_particles=True) svi_eval = SVI(model, guide, optim, elbo_eval) print("Final loss: %.4f" % svi_eval.evaluate_loss(capture_history, sex))
total_epoch_loss_train = epoch_loss / normalizer_train train_elbo.append(-total_epoch_loss_train) # --------------------------Do testing for each epoch here-------------------------------- # initialize loss accumulator test_loss = 0. # compute the loss over the entire test set for x_test in test_loader: # if on GPU put mini-batch into CUDA memory x_test = x_test[0].cuda() # compute ELBO estimate and accumulate loss test_loss += svi.evaluate_loss( x_test ) #Data entry point <---------------------------------Data Entry Point normalizer_test = len(test_loader.dataset) total_epoch_loss_test = test_loss / normalizer_test incept_score = 0 # This loop fixes the limits for the random number generator #On first run the limits = np.zeros((2, d)) for i in range(0, d): limits[0, i] = -4 limits[1, i] = 4 incept_score = inception_scoring( d, limits) #Calls the inception score and calculates it.
def main_sVAE(arr):
    X_DIM = 10000
    Y_DIM = 2
    Z_DIM = 16
    ALPHA_ENCO = int("".join(str(i) for i in arr[0:10]), 2)
    BETA_ENCO = int("".join(str(i) for i in arr[10:18]), 2)
    ALPHA_DECO = int("".join(str(i) for i in arr[18:28]), 2)
    BETA_DECO = int("".join(str(i) for i in arr[28:37]), 2)
    H_DIM_ENCO_1 = ALPHA_ENCO + BETA_ENCO
    H_DIM_ENCO_2 = ALPHA_ENCO
    H_DIM_DECO_1 = ALPHA_DECO
    H_DIM_DECO_2 = ALPHA_DECO + BETA_DECO
    print(str(H_DIM_ENCO_1))
    print(str(H_DIM_ENCO_2))
    print(str(H_DIM_DECO_1))
    print(str(H_DIM_DECO_2))
    print('-----------')

    # Run options
    LEARNING_RATE = 1.0e-3
    USE_CUDA = True

    # Run only for a single iteration for testing
    NUM_EPOCHS = 501
    TEST_FREQUENCY = 5

    train_loader, test_loader = dataloader_first()

    # clear param store
    pyro.clear_param_store()

    # setup the VAE
    vae = VAE(x_dim=X_DIM, y_dim=Y_DIM, h_dim_enco_1=H_DIM_ENCO_1, h_dim_enco_2=H_DIM_ENCO_2,
              h_dim_deco_1=H_DIM_DECO_1, h_dim_deco_2=H_DIM_DECO_2, z_dim=Z_DIM, use_cuda=USE_CUDA)

    # setup the optimizer
    adagrad_params = {"lr": 0.00003}
    optimizer = Adagrad(adagrad_params)

    svi = SVI(vae.model, vae.guide, optimizer, loss=Trace_ELBO())

    train_elbo = []
    test_elbo = []
    # training loop
    for epoch in range(NUM_EPOCHS):
        total_epoch_loss_train = train(svi, train_loader, use_cuda=USE_CUDA)
        train_elbo.append(-total_epoch_loss_train)
        print("[epoch %03d] average training loss: %.4f" % (epoch, total_epoch_loss_train))

        if epoch == 500:
            # --------------------------Do testing for each epoch here--------------------------------
            # initialize loss accumulator
            test_loss = 0.
            # compute the loss over the entire test set
            for x_test, y_test in test_loader:
                x_test = x_test.cuda()
                y_test = y_test.cuda()
                # compute ELBO estimate and accumulate loss
                y_test_2 = torch.Tensor.cpu(y_test.reshape(1, y_test.size()[0])[0]).numpy().astype(int)
                labels_y_test = torch.from_numpy(np.eye(2)[y_test_2])
                test_loss += svi.evaluate_loss(x_test.reshape(-1, 10000), labels_y_test.cuda().float())  # Data entry point <---------------------------------Data Entry Point
            normalizer_test = len(test_loader.dataset)
            total_epoch_loss_test = test_loss / normalizer_test
            print("[epoch %03d] average test loss: %.4f" % (epoch, total_epoch_loss_test))

    return total_epoch_loss_test
def main(args): # clear param store pyro.clear_param_store() ### SETUP train_loader, test_loader = get_data() # setup the VAE vae = VAE(use_cuda=args.cuda) # setup the optimizer adam_args = {"lr": args.learning_rate} optimizer = Adam(adam_args) # setup the inference algorithm elbo = JitTrace_ELBO() if args.jit else Trace_ELBO() svi = SVI(vae.model, vae.guide, optimizer, loss=elbo) inputSize = 0 # setup visdom for visualization if args.visdom_flag: vis = visdom.Visdom() train_elbo = [] test_elbo = [] for epoch in range(args.num_epochs): # initialize loss accumulator epoch_loss = 0. # do a training epoch over each mini-batch x returned # by the data loader for step, batch in enumerate(train_loader): x, adj = 0, 0 # if on GPU put mini-batch into CUDA memory if args.cuda: x = batch['x'].cuda() adj = batch['edge_index'].cuda() else: x = batch['x'] adj = batch['edge_index'] print("x_shape", x.shape) print("adj_shape", adj.shape) inputSize = x.shape[0] * x.shape[1] epoch_loss += svi.step(x, adj) # report training diagnostics normalizer_train = len(train_loader.dataset) total_epoch_loss_train = epoch_loss / normalizer_train train_elbo.append(total_epoch_loss_train) print("[epoch %03d] average training loss: %.4f" % (epoch, total_epoch_loss_train)) if True: # if epoch % args.test_frequency == 0: # initialize loss accumulator test_loss = 0. # compute the loss over the entire test set for step, batch in enumerate(test_loader): x, adj = 0, 0 # if on GPU put mini-batch into CUDA memory if args.cuda: x = batch['x'].cuda() adj = batch['edge_index'].cuda() else: x = batch['x'] adj = batch['edge_index'] # compute ELBO estimate and accumulate loss # print('before evaluating test loss') test_loss += svi.evaluate_loss(x, adj) # print('after evaluating test loss') # pick three random test images from the first mini-batch and # visualize how well we're reconstructing them # if i == 0: # if args.visdom_flag: # plot_vae_samples(vae, vis) # reco_indices = np.random.randint(0, x.shape[0], 3) # for index in reco_indices: # test_img = x[index, :] # reco_img = vae.reconstruct_img(test_img) # vis.image(test_img.reshape(28, 28).detach().cpu().numpy(), # opts={'caption': 'test image'}) # vis.image(reco_img.reshape(28, 28).detach().cpu().numpy(), # opts={'caption': 'reconstructed image'}) if args.visdom_flag: plot_vae_samples(vae, vis) reco_indices = np.random.randint(0, x.shape[0], 3) for index in reco_indices: test_img = x[index, :] reco_img = vae.reconstruct_graph(test_img) vis.image(test_img.reshape(28, 28).detach().cpu().numpy(), opts={'caption': 'test image'}) vis.image(reco_img.reshape(28, 28).detach().cpu().numpy(), opts={'caption': 'reconstructed image'}) # report test diagnostics normalizer_test = len(test_loader.dataset) total_epoch_loss_test = test_loss / normalizer_test test_elbo.append(total_epoch_loss_test) print("[epoch %03d] average test loss: %.4f" % (epoch, total_epoch_loss_test)) # if epoch == args.tsne_iter: # mnist_test_tsne(vae=vae, test_loader=test_loader) # plot_llk(np.array(train_elbo), np.array(test_elbo)) if args.save: torch.save( { 'epoch': epoch, 'model_state_dict': vae.state_dict(), 'optimzier_state_dict': optimizer.get_state(), 'train_loss': total_epoch_loss_train, 'test_loss': total_epoch_loss_test }, 'vae_' + args.name + str(args.time) + '.pt') return vae
def train(device, dataloaders, dataset_sizes, learning_rate, num_epochs, early_stop_patience, model_path, pre_trained_baseline_net): # clear param store pyro.clear_param_store() cvae_net = CVAE(200, 500, 500, pre_trained_baseline_net) cvae_net.to(device) optimizer = pyro.optim.Adam({"lr": learning_rate}) svi = SVI(cvae_net.model, cvae_net.guide, optimizer, loss=Trace_ELBO()) best_loss = np.inf early_stop_count = 0 Path(model_path).parent.mkdir(parents=True, exist_ok=True) # to track evolution val_inp, digits = get_val_images(num_quadrant_inputs=1, num_images=30, shuffle=False) val_inp = val_inp.to(device) samples = [] losses = [] for epoch in range(num_epochs): # Each epoch has a training and validation phase for phase in ['train', 'val']: running_loss = 0.0 # Iterate over data. bar = tqdm(dataloaders[phase], desc='CVAE Epoch {} {}'.format(epoch, phase).ljust(20)) for i, batch in enumerate(bar): inputs = batch['input'].to(device) outputs = batch['output'].to(device) if phase == 'train': loss = svi.step(inputs, outputs) / inputs.size(0) else: loss = svi.evaluate_loss(inputs, outputs) / inputs.size(0) # statistics running_loss += loss if i % 10 == 0: bar.set_postfix(loss='{:.2f}'.format(loss), early_stop_count=early_stop_count) # track evolution if phase == 'train': df = pd.DataFrame(columns=['epoch', 'loss']) df.loc[0] = [epoch + float(i) / len(dataloaders[phase]), loss] losses.append(df) if i % 47 == 0: # every 10% of training (469) dfs = predict_samples( val_inp, digits, cvae_net, epoch + float(i) / len(dataloaders[phase]), ) samples.append(dfs) epoch_loss = running_loss / dataset_sizes[phase] # deep copy the model if phase == 'val': if epoch_loss < best_loss: best_loss = epoch_loss cvae_net.save(model_path) early_stop_count = 0 else: early_stop_count += 1 if early_stop_count >= early_stop_patience: break # Save model weights cvae_net.load(model_path) # record evolution samples = pd.concat(samples, axis=0, ignore_index=True) samples.to_csv('samples.csv', index=False) losses = pd.concat(losses, axis=0, ignore_index=True) losses.to_csv('losses.csv', index=False) return cvae_net
def main(args):
    # Init tensorboard
    writer = SummaryWriter('./runs/' + args.runname + str(args.trialnumber))
    model_name = 'VanillaDMM'

    # Set evaluation log file
    evaluation_logpath = './logs/{}/evaluation_result.log'.format(model_name.lower())
    log_evaluation(evaluation_logpath,
                   'Evaluation Trial - {}\n'.format(args.trialnumber))

    # Constants
    time_length = 30
    input_length_for_pred = 20
    pred_length = time_length - input_length_for_pred
    train_batch_size = 16
    valid_batch_size = 1

    # For model
    input_channels = 1
    z_channels = 50
    emission_channels = [64, 32]
    transition_channels = 64
    encoder_channels = [32, 64]
    rnn_input_dim = 256
    rnn_channels = 128
    kernel_size = 3
    pred_length = 0

    # Device checking
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")

    # Make dataset
    logging.info("Generate data")
    train_datapath = args.datapath / 'train'
    valid_datapath = args.datapath / 'valid'
    train_dataset = DiffusionDataset(train_datapath)
    valid_dataset = DiffusionDataset(valid_datapath)

    # Create data loaders from pickle data
    logging.info("Generate data loaders")
    train_dataloader = DataLoader(
        train_dataset, batch_size=train_batch_size, shuffle=True, num_workers=8)
    valid_dataloader = DataLoader(
        valid_dataset, batch_size=valid_batch_size, num_workers=4)

    # Training parameters
    width = 100
    height = 100
    input_dim = width * height

    # Create model
    logging.warning("Generate model")
    logging.warning(input_dim)
    pred_input_dim = 10
    dmm = DMM(input_channels=input_channels, z_channels=z_channels,
              emission_channels=emission_channels,
              transition_channels=transition_channels,
              encoder_channels=encoder_channels, rnn_input_dim=rnn_input_dim,
              rnn_channels=rnn_channels, kernel_size=kernel_size,
              height=height, width=width, pred_input_dim=pred_input_dim,
              num_layers=1, rnn_dropout_rate=0.0, num_iafs=0, iaf_dim=50,
              use_cuda=use_cuda)

    # Initialize model
    logging.info("Initialize model")
    epochs = args.endepoch
    learning_rate = 0.0001
    beta1 = 0.9
    beta2 = 0.999
    clip_norm = 10.0
    lr_decay = 1.0
    weight_decay = 0
    adam_params = {"lr": learning_rate, "betas": (beta1, beta2),
                   "clip_norm": clip_norm, "lrd": lr_decay,
                   "weight_decay": weight_decay}
    adam = ClippedAdam(adam_params)
    elbo = Trace_ELBO()
    svi = SVI(dmm.model, dmm.guide, adam, loss=elbo)

    # saves the model and optimizer states to disk
    save_model = Path('./checkpoints/' + model_name)

    def save_checkpoint(epoch):
        save_dir = save_model / '{}.model'.format(epoch)
        save_opt_dir = save_model / '{}.opt'.format(epoch)
        logging.info("saving model to %s..." % save_dir)
        torch.save(dmm.state_dict(), save_dir)
        logging.info("saving optimizer states to %s..." % save_opt_dir)
        adam.save(save_opt_dir)
        logging.info("done saving model and optimizer checkpoints to disk.")

    # Starting epoch
    start_epoch = args.startepoch

    # loads the model and optimizer states from disk
    if start_epoch != 0:
        load_opt = './checkpoints/' + model_name + \
            '/e{}-i188-opt-tn{}.opt'.format(start_epoch - 1, args.trialnumber)
        load_model = './checkpoints/' + model_name + \
            '/e{}-i188-tn{}.pt'.format(start_epoch - 1, args.trialnumber)

        def load_checkpoint():
            # assert exists(load_opt) and exists(load_model), \
            #     "--load-model and/or --load-opt misspecified"
            logging.info("loading model from %s..." % load_model)
            dmm.load_state_dict(torch.load(load_model, map_location=device))
            # logging.info("loading optimizer states from %s..." % load_opt)
            # adam.load(load_opt)
            # logging.info("done loading model and optimizer states.")

        if load_model != '':
            logging.info('Load checkpoint')
            load_checkpoint()

    # Validation only?
    validation_only = args.validonly

    # Train the model
    if not validation_only:
        logging.info("Training model")
        annealing_epochs = 1000
        minimum_annealing_factor = 0.2
        N_train_size = 3000
        N_mini_batches = int(N_train_size / train_batch_size +
                             int(N_train_size % train_batch_size > 0))

        for epoch in tqdm(range(start_epoch, epochs), desc='Epoch', leave=True):
            r_loss_train = 0
            dmm.train(True)
            idx = 0
            mov_avg_loss = 0
            mov_data_len = 0
            for which_mini_batch, data in enumerate(tqdm(train_dataloader, desc='Train', leave=True)):
                if annealing_epochs > 0 and epoch < annealing_epochs:
                    # compute the KL annealing factor appropriate for the current mini-batch in the current epoch
                    min_af = minimum_annealing_factor
                    annealing_factor = min_af + (1.0 - min_af) * \
                        (float(which_mini_batch + epoch * N_mini_batches + 1) /
                         float(annealing_epochs * N_mini_batches))
                else:
                    # by default the KL annealing factor is unity
                    annealing_factor = 1.0

                data['observation'] = normalize(
                    data['observation'].unsqueeze(2).to(device))
                batch_size, length, _, w, h = data['observation'].shape
                data_reversed = reverse_sequences(data['observation'])
                data_mask = torch.ones(
                    batch_size, length, input_channels, w, h).cuda()

                loss = svi.step(data['observation'], data_reversed,
                                data_mask, annealing_factor)

                # Running losses
                mov_avg_loss += loss
                mov_data_len += batch_size

                r_loss_train += loss
                idx += 1

            # Average losses
            train_loss_avg = r_loss_train / (len(train_dataset) * time_length)
            writer.add_scalar('Loss/train', train_loss_avg, epoch)
            logging.info("Epoch: %d, Training loss: %1.5f", epoch, train_loss_avg)

            # Time to time evaluation
            if epoch == epochs - 1:
                for temp_pred_length in [20]:
                    r_loss_valid = 0
                    r_loss_loc_valid = 0
                    r_loss_scale_valid = 0
                    r_loss_latent_valid = 0
                    dmm.train(False)
                    val_pred_length = temp_pred_length
                    val_pred_input_length = 10
                    with torch.no_grad():
                        for i, data in enumerate(tqdm(valid_dataloader, desc='Eval', leave=True)):
                            data['observation'] = normalize(
                                data['observation'].unsqueeze(2).to(device))
                            batch_size, length, _, w, h = data['observation'].shape
                            data_reversed = reverse_sequences(data['observation'])
                            data_mask = torch.ones(
                                batch_size, length, input_channels, w, h).cuda()

                            pred_tensor = data['observation'][:, :input_length_for_pred, :, :, :]
                            pred_tensor_reversed = reverse_sequences(pred_tensor)
                            pred_tensor_mask = torch.ones(
                                batch_size, input_length_for_pred, input_channels, w, h).cuda()

                            ground_truth = data['observation'][:, input_length_for_pred:, :, :, :]

                            val_nll = svi.evaluate_loss(
                                data['observation'], data_reversed, data_mask)

                            preds, _, loss_loc, loss_scale = do_prediction_rep_inference(
                                dmm, pred_tensor_mask, val_pred_length,
                                val_pred_input_length, data['observation'])

                            ground_truth = denormalize(
                                data['observation'].squeeze().cpu().detach())
                            pred_with_input = denormalize(
                                torch.cat(
                                    [data['observation'][:, :-val_pred_length, :, :, :].squeeze(),
                                     preds.squeeze()],
                                    dim=0).cpu().detach())

                            # Running losses
                            r_loss_valid += val_nll
                            r_loss_loc_valid += loss_loc
                            r_loss_scale_valid += loss_scale

                    # Average losses
                    valid_loss_avg = r_loss_valid / (len(valid_dataset) * time_length)
                    valid_loss_loc_avg = r_loss_loc_valid / \
                        (len(valid_dataset) * val_pred_length * width * height)
                    valid_loss_scale_avg = r_loss_scale_valid / \
                        (len(valid_dataset) * val_pred_length * width * height)
                    writer.add_scalar('Loss/test', valid_loss_avg, epoch)
                    writer.add_scalar('Loss/test_obs', valid_loss_loc_avg, epoch)
                    writer.add_scalar('Loss/test_scale', valid_loss_scale_avg, epoch)
                    logging.info("Validation loss: %1.5f", valid_loss_avg)
logging.info("Validation obs loss: %1.5f", valid_loss_loc_avg) logging.info("Validation scale loss: %1.5f", valid_loss_scale_avg) log_evaluation(evaluation_logpath, "Validation obs loss for {}s pred {}: {}\n".format( val_pred_length, args.trialnumber, valid_loss_loc_avg)) log_evaluation(evaluation_logpath, "Validation scale loss for {}s pred {}: {}\n".format( val_pred_length, args.trialnumber, valid_loss_scale_avg)) # Save model if epoch % 50 == 0 or epoch == epochs - 1: torch.save(dmm.state_dict(), args.modelsavepath / model_name / 'e{}-i{}-tn{}.pt'.format(epoch, idx, args.trialnumber)) adam.save(args.modelsavepath / model_name / 'e{}-i{}-opt-tn{}.opt'.format(epoch, idx, args.trialnumber)) # Last validation after training test_samples_indices = range(100) total_n = 0 if validation_only: r_loss_loc_valid = 0 r_loss_scale_valid = 0 r_loss_latent_valid = 0 dmm.train(False) val_pred_length = args.validpredlength val_pred_input_length = 10 with torch.no_grad(): for i in tqdm(test_samples_indices, desc='Valid', leave=True): # Data processing data = valid_dataset[i] if torch.isnan(torch.sum(data['observation'])): print("Skip {}".format(i)) continue else: total_n += 1 data['observation'] = normalize( data['observation'].unsqueeze(0).unsqueeze(2).to(device)) batch_size, length, _, w, h = data['observation'].shape data_reversed = reverse_sequences(data['observation']) data_mask = torch.ones( batch_size, length, input_channels, w, h).to(device) # Prediction pred_tensor_mask = torch.ones( batch_size, input_length_for_pred, input_channels, w, h).to(device) preds, _, loss_loc, loss_scale = do_prediction_rep_inference( dmm, pred_tensor_mask, val_pred_length, val_pred_input_length, data['observation']) ground_truth = denormalize( data['observation'].squeeze().cpu().detach() ) pred_with_input = denormalize( torch.cat( [data['observation'][:, :-val_pred_length, :, :, :].squeeze(), preds.squeeze()], dim=0 ).cpu().detach() ) # Save samples if i < 5: save_dir_samples = Path('./samples/more_variance_long') with open(save_dir_samples / '{}-gt-test.pkl'.format(i), 'wb') as fout: pickle.dump(ground_truth, fout) with open(save_dir_samples / '{}-vanilladmm-pred-test.pkl'.format(i), 'wb') as fout: pickle.dump(pred_with_input, fout) # Running losses r_loss_loc_valid += loss_loc r_loss_scale_valid += loss_scale r_loss_latent_valid += np.sum((preds.squeeze().detach().cpu().numpy( ) - data['latent'][time_length - val_pred_length:, :, :].detach().cpu().numpy()) ** 2) # Average losses test_samples_indices = range(total_n) print(total_n) valid_loss_loc_avg = r_loss_loc_valid / \ (total_n * val_pred_length * width * height) valid_loss_scale_avg = r_loss_scale_valid / \ (total_n * val_pred_length * width * height) valid_loss_latent_avg = r_loss_latent_valid / \ (total_n * val_pred_length * width * height) logging.info("Validation obs loss for %ds pred VanillaDMM: %f", val_pred_length, valid_loss_loc_avg) logging.info("Validation latent loss: %f", valid_loss_latent_avg) with open('VanillaDMMResult.log', 'a+') as fout: validation_log = 'Pred {}s VanillaDMM: {}\n'.format( val_pred_length, valid_loss_loc_avg) fout.write(validation_log)
def train():
    parser = argparse.ArgumentParser(description='Train VAE.')
    parser.add_argument('-c', '--config', default='train_config.json', help='Config file.')
    args = parser.parse_args()
    print(args)
    c = json.load(open(args.config))
    print(c)

    pyro.clear_param_store()

    # TODO: Move to config file.
    lookback = 50
    max_n_files = None

    train_start_date = datetime.strptime(c['train_start_date'], '%Y/%m/%d')
    train_end_date = datetime.strptime(c['train_end_date'], '%Y/%m/%d')
    val_start_date = datetime.strptime(c['val_start_date'], '%Y/%m/%d')
    val_end_date = datetime.strptime(c['val_end_date'], '%Y/%m/%d')
    min_sequence_length_train = 2 * (c['series_length'] + lookback)
    min_sequence_length_test = 2 * (c['series_length'] + lookback)

    out_path = Path(c['out_dir'])
    out_path.mkdir(exist_ok=True)

    dataset_train = create_ticker_dataset(c['in_dir'], c['series_length'], lookback,
                                          min_sequence_length_train,
                                          start_date=train_start_date,
                                          end_date=train_end_date,
                                          normalised_returns=c['normalised_returns'],
                                          max_n_files=max_n_files)
    dataset_val = create_ticker_dataset(c['in_dir'], c['series_length'], lookback,
                                        min_sequence_length_test,
                                        start_date=val_start_date,
                                        end_date=val_end_date,
                                        fixed_start_date=True,
                                        normalised_returns=c['normalised_returns'],
                                        max_n_files=max_n_files)
    train_loader = DataLoader(dataset_train, batch_size=c['batch_size'],
                              shuffle=True, num_workers=0, drop_last=True)
    val_loader = DataLoader(dataset_val, batch_size=c['batch_size'],
                            shuffle=False, num_workers=0, drop_last=True)

    N_train_data = len(dataset_train)
    N_val_data = len(dataset_val)
    N_mini_batches = N_train_data // c['batch_size']
    N_train_time_slices = c['batch_size'] * N_mini_batches
    print(f'N_train_data: {N_train_data}, N_val_data: {N_val_data}')

    # setup the VAE
    vae = VAE(c['series_length'], z_dim=c['z_dim'],
              hidden_dims=c['hidden_dims'], use_cuda=c['cuda'])

    # setup the optimizer
    adam_args = {"lr": c['learning_rate']}
    optimizer = Adam(adam_args)

    # setup the inference algorithm
    elbo = JitTrace_ELBO() if c['jit'] else Trace_ELBO()
    svi = SVI(vae.model, vae.guide, optimizer, loss=elbo)

    if c['checkpoint_load']:
        checkpoint = torch.load(c['checkpoint_load'])
        vae.load_state_dict(checkpoint['model_state_dict'])
        # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    train_elbo = []
    val_elbo = []
    # training loop
    for epoch in range(c['n_epochs']):
        # initialize loss accumulator
        epoch_loss = 0.
        # do a training epoch over each mini-batch x returned
        # by the data loader
        for batch in train_loader:
            x = batch['series']
            # if on GPU put mini-batch into CUDA memory
            if c['cuda']:
                x = x.cuda()
            # do ELBO gradient and accumulate loss
            epoch_loss += svi.step(x.float())

        # report training diagnostics
        normalizer_train = len(train_loader.dataset)
        total_epoch_loss_train = epoch_loss / normalizer_train
        train_elbo.append(total_epoch_loss_train)
        print("[epoch %03d] average training loss: %.4f" % (epoch, total_epoch_loss_train))

        torch.save({
            'epoch': epoch,
            'model_state_dict': vae.state_dict(),
            # 'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': epoch_loss
        }, out_path / c['checkpoint_save'].format(epoch))

        if epoch % c['val_frequency'] == 0:
            # initialize loss accumulator
            val_loss = 0.
            # compute the loss over the entire test set
            for i, batch in enumerate(val_loader):
                x = batch['series']
                # if on GPU put mini-batch into CUDA memory
                if c['cuda']:
                    x = x.cuda()
                x = x.float()
                # compute ELBO estimate and accumulate loss
                val_loss += svi.evaluate_loss(x)

                if i == 0:
                    # Visualise first batch.
                    x_reconst = vae.reconstruct_img(x)
                    x = x.cpu().numpy()
                    x_reconst = x_reconst.cpu().detach().numpy()
                    n = min(5, x.shape[0])
                    fig, axes = plt.subplots(n, 1, squeeze=False)
                    for s in range(n):
                        ax = axes[s, 0]
                        ax.plot(x[s])
                        ax.plot(x_reconst[s])
                    fig.savefig(out_path / f'val_{epoch:03d}.png')
                    plt.close(fig)

            # report test diagnostics
            normalizer_val = len(val_loader.dataset)
            total_epoch_loss_val = val_loss / normalizer_val
            val_elbo.append(total_epoch_loss_val)
            print("[epoch %03d] average val loss: %.4f" % (epoch, total_epoch_loss_val))

            # t-SNE.
            all_z_latents = []
            for batch in val_loader:
                x = batch['series']
                # z_latents = minibatch_inference(dmm, test_batch)
                # z_latents = encode_x_to_z(dmm, test_batch, sample_z_t=False)
                # x, z, x_reconst = test_minibatch(dmm, test_batch, args, sample_z=True)
                if c['cuda']:
                    x = x.cuda()
                z_loc, z_scale, z = vae.encode_x(x.float())
                all_z_latents.append(z.cpu().numpy())

            # all_latents = torch.cat(all_z_latents, dim=0)
            all_latents = np.concatenate(all_z_latents, axis=0)

            # Run t-SNE with 2 output dimensions.
            from sklearn.manifold import TSNE
            model_tsne = TSNE(n_components=2, random_state=0)
            # z_states = all_latents.detach().cpu().numpy()
            z_states = all_latents
            z_embed = model_tsne.fit_transform(z_states)

            # Plot t-SNE embedding.
            fig = plt.figure()
            plt.scatter(z_embed[:, 0], z_embed[:, 1], s=10)
            fig.savefig(out_path / f'tsne_{epoch:03d}.png')
            plt.close(fig)

    print('Finished training.')
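# The train() function above reads its settings from a JSON config file. This sketch
# writes a minimal config containing exactly the keys the function accesses; every value
# is an illustrative placeholder, not the project's actual configuration.
import json

example_config = {
    "in_dir": "data/tickers",                    # directory scanned by create_ticker_dataset
    "out_dir": "runs/vae",
    "series_length": 100,
    "normalised_returns": True,
    "batch_size": 64,
    "z_dim": 16,
    "hidden_dims": [256, 128],
    "cuda": False,
    "jit": False,
    "learning_rate": 1.0e-3,
    "n_epochs": 50,
    "val_frequency": 5,
    "checkpoint_load": "",                       # empty string -> start from scratch
    "checkpoint_save": "checkpoint_{:03d}.pt",   # formatted with the epoch number
    "train_start_date": "2010/01/01",            # parsed with '%Y/%m/%d'
    "train_end_date": "2017/12/31",
    "val_start_date": "2018/01/01",
    "val_end_date": "2018/12/31",
}

with open('train_config.json', 'w') as f:
    json.dump(example_config, f, indent=2)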
class SVILossCompute(LossCompute):
    """A simple loss compute and train function."""

    def __init__(self, generator, model, guide, optimizer, optim_params,
                 elbo_type='TraceELBO', num_particles=1, eval=False,
                 step=1. / 30000.0, aux_model=None, aux_guide=None):
        optim = self.getOptimizer(optimizer, optim_params)
        elbo = self.getELBO(elbo_type, num_particles)
        criterion = SVI(model, guide, optim, loss=elbo)
        super(SVILossCompute, self).__init__(generator, criterion, optim)
        self.eval = eval
        self.guide = guide
        self.model = model
        self.kl_anneal = step
        self.step = step
        self.aux_criterion = None

        # hack to get only the KL term: block the observation sites of the model
        self.model_no_obs = poutine.block(model, hide=["preds", 'lm_preds'])
        optim = self.getOptimizer(optimizer, optim_params)
        elbo = self.getELBO(elbo_type, num_particles)
        self.kl_eval_svi = SVI(self.model_no_obs, self.guide, optim, loss=elbo)

        # aux model and guide are for calculating additional loss terms...
        if aux_model is not None and aux_guide is not None:
            print('setting aux loss, ')
            logging.info("setting aux loss")
            optim = self.getOptimizer(optimizer, optim_params)
            elbo = self.getELBO(elbo_type, num_particles)
            self.aux_criterion = SVI(aux_model, aux_guide, optim, loss=elbo)
            self.aux_guide = aux_guide
            self.aux_model = aux_model

    def setKLAnnealingSchedule(self, step_size, kl_anneal):
        """
        step_size: how much to increase the weight of the KL term at each step
        kl_anneal: current weight of the KL term
        """
        self.step = step_size
        self.kl_anneal = kl_anneal

    def getKLAnnealingSchedule(self):
        return self.step, self.kl_anneal

    def getOptimizerStateDict(self):
        return self.criterion.optim.get_state()

    def setOptimizerStateDict(self, state_dict):
        return self.criterion.optim.set_state(state_dict)

    def getELBO(self, elbo_type, particles):
        if elbo_type == 'TraceELBO':
            return Trace_ELBO(num_particles=particles)
        elif elbo_type == "MeanFieldELBO":
            return TraceMeanField_ELBO(num_particles=particles)
        else:
            raise ValueError("{} ELBO not supported".format(elbo_type))

    def getOptimizer(self, optimizer, optim_params):
        if optimizer == 'clippedadam':
            return PyroOptim(ClippedAdam, optim_params)
        elif optimizer == 'adadelta':
            # not 100% on this but pretty sure ** "dereferences" the dictionary
            return Adadelta(optim_params)
        elif optimizer == 'clippedadadelta':
            # since it's custom, gotta set it up in the way Pyro expects
            return PyroOptim(ClippedAdadelta, optim_params)
        else:
            raise ValueError("{} optimizer not supported".format(optimizer))

    def __call__(self, src, trg, src_mask, trg_mask, src_lengths, trg_lengths, trg_y, norm):
        # x = self.generator(x)
        kl_anneal = self.kl_anneal
        if self.eval:
            # you could also do .eval_loss or something but this allows a bit more probing of results
            with torch.no_grad():
                elbo = self.criterion.evaluate_loss(
                    src, trg, src_mask, trg_mask, src_lengths, trg_lengths, trg_y) * norm
                kl_term = self.kl_eval_svi.evaluate_loss(
                    src, trg, src_mask, trg_mask, src_lengths, trg_lengths, trg_y) * norm
                nll = elbo - kl_term

                def torch_item(x):
                    return x if isinstance(x, numbers.Number) else x.item()

                if self.aux_criterion is not None:
                    aux_loss = self.aux_criterion.evaluate_loss(
                        src, trg, src_mask, trg_mask, src_lengths, trg_lengths, trg_y)
                else:
                    aux_loss = -1.0
                loss = {
                    'elbo': elbo,
                    'nll': nll,
                    'approx_kl': kl_term,
                    'aux_loss': aux_loss
                }
        else:
            loss = self.criterion.step(src, trg, src_mask, trg_mask,
                                       src_lengths, trg_lengths, trg_y, kl_anneal)
            if self.aux_criterion is not None:
                aux_loss = self.aux_criterion.step(src, trg, src_mask, trg_mask,
                                                   src_lengths, trg_lengths, trg_y, kl_anneal)
            loss = loss * norm
            self.kl_anneal = min(self.kl_anneal + self.step, 1.0)
        return loss
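# Sketch of the "KL only" trick used by SVILossCompute above: hiding the model's
# observation sites with poutine.block makes an ELBO evaluation return (a Monte Carlo
# estimate of) just the KL(q(z) || p(z)) term, since the likelihood terms are removed.
# The toy model/guide below are stand-ins invented for this sketch, not the project's model.
import torch
import pyro
import pyro.distributions as dist
from pyro import poutine
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam

def toy_model(x):
    z = pyro.sample("z", dist.Normal(0., 1.))
    pyro.sample("obs", dist.Normal(z, 1.), obs=x)

def toy_guide(x):
    loc = pyro.param("loc", torch.tensor(0.5))
    pyro.sample("z", dist.Normal(loc, 1.))

x = torch.tensor(1.0)
full_elbo = SVI(toy_model, toy_guide, Adam({}), loss=Trace_ELBO(num_particles=100))
kl_only = SVI(poutine.block(toy_model, hide=["obs"]), toy_guide, Adam({}),
              loss=Trace_ELBO(num_particles=100))
print("full -ELBO:", full_elbo.evaluate_loss(x))
print("approx KL term:", kl_only.evaluate_loss(x))   # ~0.125 for loc=0.5 vs. the N(0, 1) prior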
def main(args):
    # setup MNIST data loaders
    # train_loader, test_loader
    train_loader, test_loader = setup_data_loaders(MNIST, use_cuda=args.cuda, batch_size=256)

    # setup the VAE
    vae = VAE(use_cuda=args.cuda)

    # setup the optimizer
    adam_args = {"lr": args.learning_rate}
    optimizer = Adam(adam_args)

    # setup the inference algorithm
    svi = SVI(vae.model, vae.guide, optimizer, loss=Trace_ELBO())

    # setup visdom for visualization
    if args.visdom_flag:
        vis = visdom.Visdom()

    train_elbo = []
    test_elbo = []
    # training loop
    for epoch in range(args.num_epochs):
        # initialize loss accumulator
        epoch_loss = 0.
        # do a training epoch over each mini-batch x returned
        # by the data loader
        for x, _ in train_loader:
            # if on GPU put mini-batch into CUDA memory
            if args.cuda:
                x = x.cuda()
            # do ELBO gradient and accumulate loss
            epoch_loss += svi.step(x)

        # report training diagnostics
        normalizer_train = len(train_loader.dataset)
        total_epoch_loss_train = epoch_loss / normalizer_train
        train_elbo.append(total_epoch_loss_train)
        print("[epoch %03d] average training loss: %.4f" % (epoch, total_epoch_loss_train))

        if epoch % args.test_frequency == 0:
            # initialize loss accumulator
            test_loss = 0.
            # compute the loss over the entire test set
            for i, (x, _) in enumerate(test_loader):
                # if on GPU put mini-batch into CUDA memory
                if args.cuda:
                    x = x.cuda()
                # compute ELBO estimate and accumulate loss
                test_loss += svi.evaluate_loss(x)

                # pick three random test images from the first mini-batch and
                # visualize how well we're reconstructing them
                if i == 0:
                    if args.visdom_flag:
                        plot_vae_samples(vae, vis)
                        reco_indices = np.random.randint(0, x.size(0), 3)
                        for index in reco_indices:
                            test_img = x[index, :]
                            reco_img = vae.reconstruct_img(test_img)
                            vis.image(test_img.reshape(28, 28).detach().cpu().numpy(),
                                      opts={'caption': 'test image'})
                            vis.image(reco_img.reshape(28, 28).detach().cpu().numpy(),
                                      opts={'caption': 'reconstructed image'})

            # report test diagnostics
            normalizer_test = len(test_loader.dataset)
            total_epoch_loss_test = test_loss / normalizer_test
            test_elbo.append(total_epoch_loss_test)
            print("[epoch %03d] average test loss: %.4f" % (epoch, total_epoch_loss_test))

        if epoch == args.tsne_iter:
            mnist_test_tsne(vae=vae, test_loader=test_loader)
            plot_llk(np.array(train_elbo), np.array(test_elbo))

    return vae