        loss.backward()
        optimizer.step()
        minibatch_iter.set_postfix(loss=loss.item())


def test():
    model.eval()
    likelihood.eval()

    correct = 0
    with torch.no_grad(), num_likelihood_samples(16):
        for data, target in test_loader:
            if torch.cuda.is_available():
                data, target = data.cuda(), target.cuda()
            output = likelihood(model(data))  # This gives us 16 samples from the predictive distribution
            pred = output.probs.mean(0).argmax(-1)  # Taking the mean over all of the samples we've drawn
            correct += pred.eq(target.view_as(pred)).cpu().sum()
    print('Test set: Accuracy: {}/{} ({}%)'.format(
        correct, len(test_loader.dataset), 100. * correct / float(len(test_loader.dataset))
    ))


for epoch in range(1, n_epochs + 1):
    with use_toeplitz(False):
        train(epoch)
        test()
    scheduler.step()
    state_dict = model.state_dict()
    likelihood_state_dict = likelihood.state_dict()
    torch.save(
        {'model': state_dict, 'likelihood': likelihood_state_dict},
        'dkl_cifar_checkpoint.dat',
    )
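# The checkpoint written above can be restored for later evaluation. This is a
# minimal sketch, assuming the same model/likelihood classes (and constructor
# arguments) used during training are instantiated first.
checkpoint = torch.load('dkl_cifar_checkpoint.dat')
model.load_state_dict(checkpoint['model'])
likelihood.load_state_dict(checkpoint['likelihood'])
model.eval()
likelihood.eval()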
            acq_value.item(),
            pred_rmse.item(),
            pred_avg_variance.item(),
        ]
        print("Step RMSE: ", pred_rmse)
        all_outputs.append(step_output_list)

        start_ind = end_ind
        end_ind = int(end_ind + args.batch_size)

    output_dict = {
        "model_state_dict": model.cpu().state_dict(),
        "queried_points": {
            'x': model.cpu().train_inputs[0],
            'y': model.cpu().train_targets,
        },
        "results": DataFrame(all_outputs),
    }
    torch.save(output_dict, args.output)


if __name__ == "__main__":
    args = parse()

    with fast_pred_var(True), \
            use_toeplitz(args.toeplitz), \
            detach_test_caches(True), \
            max_cholesky_size(args.cholesky_size), \
            max_root_decomposition_size(args.sketch_size), \
            root_pred_var(True):
        main(args)
def main(args):
    if args.cuda and torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")

    init_dict, train_dict, test_dict = prepare_data(
        args.data_loc, args.num_init, args.num_total, test_is_year=False)

    init_x, init_y, init_y_var = (
        init_dict["x"].to(device),
        init_dict["y"].to(device),
        init_dict["y_var"].to(device),
    )
    train_x, train_y, train_y_var = (
        train_dict["x"].to(device),
        train_dict["y"].to(device),
        train_dict["y_var"].to(device),
    )
    test_x, test_y, test_y_var = (
        test_dict["x"].to(device),
        test_dict["y"].to(device),
        test_dict["y_var"].to(device),
    )

    model = FixedNoiseOnlineSKIGP(
        init_x,
        init_y.view(-1, 1),
        init_y_var.view(-1, 1),
        GridInterpolationKernel(
            base_kernel=ScaleKernel(
                MaternKernel(
                    ard_num_dims=2,
                    nu=0.5,
                    lengthscale_prior=GammaPrior(3.0, 6.0),
                ),
                outputscale_prior=GammaPrior(2.0, 0.15),
            ),
            grid_size=30,
            num_dims=2,
            grid_bounds=torch.tensor([[0.0, 1.0], [0.0, 1.0]]),
        ),
        learn_additional_noise=False,
    ).to(device)

    mll = BatchedWoodburyMarginalLogLikelihood(model.likelihood, model)

    print("---- Fitting initial model ----")
    start = time.time()
    with skip_logdet_forward(True), max_root_decomposition_size(
            args.sketch_size), use_toeplitz(args.toeplitz):
        fit_gpytorch_torch(mll, options={"lr": 0.1, "maxiter": 1000})
    end = time.time()
    print("Elapsed fitting time: ", end - start)
    model.zero_grad()
    model.eval()

    print("--- Generating initial predictions on test set ----")
    start = time.time()
    with detach_test_caches(True), max_root_decomposition_size(
            args.sketch_size), max_cholesky_size(
            args.cholesky_size), use_toeplitz(args.toeplitz):
        pred_dist = model(test_x)
        pred_mean = pred_dist.mean.detach()
        # pred_var = pred_dist.variance.detach()
    end = time.time()
    print("Elapsed initial prediction time: ", end - start)

    rmse_initial = ((pred_mean.view(-1) - test_y.view(-1)) ** 2).mean().sqrt()
    print("Initial RMSE: ", rmse_initial.item())

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

    mll_time_list = []
    rmse_list = []
    for i in range(500, train_x.shape[0]):
        model.zero_grad()
        model.train()

        start = time.time()
        with skip_logdet_forward(True), max_root_decomposition_size(
                args.sketch_size), max_cholesky_size(
                args.cholesky_size), use_toeplitz(args.toeplitz):
            loss = -mll(model(train_x[:i]), train_y[:i]).sum()
            loss.backward()
        mll_time = time.time() - start

        optimizer.step()

        model.zero_grad()
        optimizer.zero_grad()

        start = time.time()
        with torch.no_grad():
            model.condition_on_observations(
                train_x[i].unsqueeze(0),
                train_y[i].view(1, 1),
                train_y_var[i].view(-1, 1),
                inplace=True,
            )
        fantasy_time = time.time() - start
        mll_time_list.append([mll_time, fantasy_time])

        if i % 25 == 0:
            start = time.time()
            model.eval()
            model.zero_grad()

            with detach_test_caches(), max_root_decomposition_size(
                    args.sketch_size), max_cholesky_size(args.cholesky_size):
                pred_dist = model(test_x)
            end = time.time()

            rmse = ((pred_dist.mean - test_y.view(-1)) ** 2).mean().sqrt().item()
            rmse_list.append([rmse, end - start])
            print("Current RMSE: ", rmse)
            print("Outputscale: ", model.covar_module.base_kernel.raw_outputscale)
            print(
                "Lengthscale: ",
                model.covar_module.base_kernel.base_kernel.raw_lengthscale,
            )

            print("Step: ", i, "Train Loss: ", loss)
            optimizer.param_groups[0]["lr"] *= 0.9

    torch.save({
        "training": mll_time_list,
        "predictions": rmse_list,
    }, args.output)
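# The timing and accuracy traces saved above can be reloaded with torch.load
# for analysis. A minimal sketch; "online_ski_results.pt" is a hypothetical
# placeholder for whatever path was passed as args.output.
results = torch.load("online_ski_results.pt")
mll_times = results["training"]        # per-step [mll_time, fantasy_time]
rmse_trace = results["predictions"]    # per-evaluation [rmse, prediction_time]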
def main():
    parser = argparse.ArgumentParser(
        description='Deep Kernel Learning with synthetic data.')
    parser.add_argument('--datapath', type=str,
                        help='Path to data directory.')
    parser.add_argument('--batchsize', type=int, default=10,
                        help='Batch size.')
    parser.add_argument('--n_epochs', type=int, default=10,
                        help='Number of epochs.')
    parser.add_argument('--lr', type=float, default=0.1,
                        help='Learning rate.')
    parser.add_argument('--seed', type=int, default=42,
                        help='Random seed.')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    traindata = Synthetic(args.datapath, 'train', download=True)
    train_loader = DataLoader(traindata, batch_size=args.batchsize)
    num_classes = len(np.unique(traindata.targets))

    testdata = Synthetic(args.datapath, 'test')
    test_loader = DataLoader(testdata, batch_size=args.batchsize)

    feature_extractor = ConvFeatureExtractor().to(device)
    num_features = feature_extractor._filter_sum

    model = DKLModel(feature_extractor, num_dim=5).to(device)
    likelihood = SoftmaxLikelihood(num_features=model.num_dim,
                                   n_classes=num_classes).to(device)

    optimizer = SGD([
        {'params': model.feature_extractor.parameters()},
        {'params': model.gp_layer.hyperparameters(), 'lr': args.lr * 0.01},
        {'params': model.gp_layer.variational_parameters()},
        {'params': likelihood.parameters()},
    ], lr=args.lr, momentum=0.9, nesterov=True, weight_decay=0)
    scheduler = MultiStepLR(
        optimizer,
        milestones=[0.5 * args.n_epochs, 0.75 * args.n_epochs],
        gamma=0.1)

    for epoch in range(1, args.n_epochs + 1):
        scheduler.step()
        with settings.use_toeplitz(False), settings.max_preconditioner_size(0):
            train(epoch, train_loader, optimizer, likelihood, model, device)
            test(test_loader, likelihood, model, device)

    state_dict = model.state_dict()
    likelihood_state_dict = likelihood.state_dict()
    torch.save({
        'model': state_dict,
        'likelihood': likelihood_state_dict,
    }, 'dkl_synthetic_checkpoint.dat')
def main_loop(flags):
    random.seed(flags.manual_seed)
    np.random.seed(flags.manual_seed)
    torch.manual_seed(flags.manual_seed)
    torch.cuda.manual_seed_all(flags.manual_seed)

    # Check if CUDA is available
    device_str = f"cuda:{flags.gpu}" if (torch.cuda.is_available() and flags.gpu > -1) else "cpu"
    flags.device = torch.device(device_str)
    print(flags)  # print the configuration

    # Construct model and all other necessary objects
    model, likelihood, mll, optimizer, train_ds, test_ds = construct_model(flags)
    print(f"Number of training samples: {len(train_ds)}")

    # don't shuffle if each batch equals the dataset
    shuff = len(train_ds) != flags.batch_size
    train_loader = torch.utils.data.DataLoader(
        train_ds, batch_size=flags.batch_size, shuffle=shuff)
    test_loader = torch.utils.data.DataLoader(test_ds, batch_size=flags.batch_size)

    # Set checkpoint path
    if flags.save_dir:
        save_dir = Path(flags.save_dir) / flags.model_name
        save_dir.mkdir(parents=True, exist_ok=True)
    else:
        save_dir = Path(mkdtemp())  # Create temporary directory

    best_loss = np.inf
    start_epoch = 1

    # Restore from checkpoint if one exists
    best_checkpoint = save_dir / "model_best.pth.tar"
    previous_checkpoints = list(save_dir.glob("checkpoint_*.pth.tar"))
    if previous_checkpoints:
        latest_chkpt = max(previous_checkpoints)  # `max()` is here equivalent to `sorted(...)[-1]`
        print(f"===> Restoring from '{latest_chkpt}'")
        start_epoch, best_loss = utils.load_checkpoint(
            latest_chkpt, model, likelihood, mll, optimizer)

    print(f"Training for {flags.epochs} epochs")
    # Main training loop
    for epoch in range(start_epoch, start_epoch + flags.epochs):
        print(f"Training on epoch {epoch}")
        start = time.time()
        step_counter = (epoch - 1) * len(train_loader)
        with settings.use_toeplitz(device_str == "cpu"):
            # settings.fast_computations(covar_root_decomposition=False),\
            # settings.lazily_evaluate_kernels(state=False),\
            # settings.tridiagonal_jitter(1e-2),\
            # settings.max_cholesky_numel(4096),\
            # settings.max_preconditioner_size(10),\
            train(model, optimizer, train_loader, mll, step_counter, flags)
        end = time.time()
        print(f"Train time for epoch {epoch}: {end - start:0.2f}s")

        if epoch % flags.eval_epochs == 0:
            # do evaluation and update the best loss
            val_loss = evaluate(model, likelihood, test_loader, mll, step_counter, flags)
            if flags.save_best and val_loss < best_loss:
                best_loss = val_loss
                print(f"Best loss yet. Saving in '{best_checkpoint}'")
                utils.save_checkpoint(best_checkpoint, model, likelihood, mll,
                                      optimizer, epoch, best_loss)

        if epoch % flags.chkpt_epochs == 0:
            # Save checkpoint
            chkpt_path = save_dir / f"checkpoint_{epoch:04d}.pth.tar"
            print(f"===> Saving checkpoint in '{chkpt_path}'")
            utils.save_checkpoint(chkpt_path, model, likelihood, mll,
                                  optimizer, epoch, best_loss)

    # if predictions are to be saved or plotted, make predictions on the test set
    if flags.preds_path or flags.plot:
        # print("Loading best model...")
        # utils.load_checkpoint(best_checkpoint, model, likelihood)
        print("Making predictions...")
        pred_mean, pred_var = predict(model, likelihood, test_loader, flags.device)
        utils.save_predictions(pred_mean, pred_var, save_dir, flags)
        if flags.plot:
            getattr(plot, flags.plot)(pred_mean, pred_var, train_ds, test_ds)
    for key in y_means:
        y_means[key] = y_means[key].cpu()

    output_dict = {
        "observations": {
            "x": train_x.cpu(),
            "y": train_y.cpu(),
            "means": y_means,
            "latent_y": latent_y.cpu(),
        },
        "results": DataFrame(all_outputs),
        "args": args,
    }
    torch.save(output_dict, args.output)


if __name__ == "__main__":
    args = parse()

    use_fast_pred_var = not args.use_exact

    with use_toeplitz(args.toeplitz), \
            max_cholesky_size(args.cholesky_size), \
            max_root_decomposition_size(args.sketch_size), \
            cholesky_jitter(1e-3), \
            fast_pred_var(use_fast_pred_var), \
            fast_pred_samples(True):
        main(args)
    {'params': model.covar.parameters()},
    {'params': model.mean.parameters()},
    {'params': model.likelihood.parameters()},
], lr=0.01)

# "Loss" for GPs - the marginal log likelihood
mll = ExactMarginalLogLikelihood(likelihood, model)

training_iterations = 60


def train():
    iterator = tqdm(range(training_iterations))
    for i in iterator:
        # Zero backprop gradients
        optimizer.zero_grad()
        # Get output from model
        output = model(x_train)
        # Calc loss and backprop derivatives
        loss = -mll(output, y_train)
        loss.backward()
        iterator.set_postfix(loss=loss.item())
        optimizer.step()


train()

model.eval()
likelihood.eval()
with torch.no_grad(), use_toeplitz(False), fast_pred_var():
    preds = model(x_test)
print('Test MAE: {}'.format(torch.mean(torch.abs(preds.mean - y_test))))
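# Beyond the point-estimate MAE above, the predictive distribution also carries
# uncertainty. A minimal sketch, assuming `preds` is the GPyTorch
# MultivariateNormal returned by the model above:
with torch.no_grad():
    lower, upper = preds.confidence_region()  # mean +/- 2 standard deviations
    coverage = ((y_test >= lower) & (y_test <= upper)).float().mean()
    print('Empirical coverage of the 2-sigma region: {:.3f}'.format(coverage.item()))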