def evaluate(method: storch.method.Method, model: DiscreteVAE, data, optimizer):
    # Compute expected gradient
    optimizer.zero_grad()
    z = generative_story(storch.method.Expect("z"), model, data)
    storch.backward()
    expected_gradient = z.param_grads["probs"]

    # Collect gradient samples
    gradients = []
    for i in range(100):
        optimizer.zero_grad()
        z = generative_story(method, model, data)
        elbo = storch.backward()
        gradients.append(z.param_grads["probs"])

    gradients = storch.gather_samples(gradients, "gradients")
    mean_gradient = storch.reduce_plates(gradients, "gradients")
    bias_gradient = (
        storch.reduce_plates((mean_gradient - expected_gradient) ** 2)
    ).sum()
    print(
        "Training ELBO "
        + str(elbo.item())
        + " Gradient variance "
        + str(storch.variance(gradients, "gradients")._tensor.item())
        + " Gradient bias "
        + str(bias_gradient._tensor.item())
    )
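# A hypothetical sketch of the generative_story helper used above and in the
# training loop further down; the actual DiscreteVAE may differ. The names
# model.encode and model.decode, the flattened 784-pixel input, and the uniform
# prior are assumptions for illustration, not taken from the original.
import math

def generative_story(method: storch.method.Method, model: DiscreteVAE, data):
    # Register the minibatch dimension as an independent plate.
    x = storch.denote_independent(data.view(-1, 784), 0, "data")
    # Discrete variational posterior q(z|x) over one-hot latents.
    q_probs = model.encode(x).softmax(dim=-1)
    q = OneHotCategorical(probs=q_probs)
    # Sample z with the chosen gradient estimation method.
    z = method(q)
    # Analytic KL between q(z|x) and a uniform prior over the latent classes.
    kl = (q_probs * (q_probs.log() - math.log(1.0 / q_probs.shape[-1]))).sum(-1)
    storch.add_cost(kl, "KL")
    # Bernoulli reconstruction term, summed over pixels.
    recon = model.decode(z)
    bce = torch.nn.functional.binary_cross_entropy(recon, x, reduction="none").sum(-1)
    storch.add_cost(bce, "reconstruction")
    return z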
def experiment(method):
    optim.zero_grad()
    b = Bernoulli(logits=eta.repeat(3))
    x = method(b)
    cost = torch.sum((x - p) ** 2, -1)
    storch.add_cost(cost, "cost")
    storch.backward()
    return eta.grad.clone()
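# The experiment above assumes eta, p and optim exist in the enclosing scope.
# A minimal, hypothetical setup and comparison of two estimators; the values
# and the plate name "x" are illustrative, not from the original.
eta = torch.zeros(1, requires_grad=True)   # logit of the Bernoulli being optimized
p = torch.tensor([1.0, 0.0, 1.0])          # targets the samples are compared against
optim = torch.optim.SGD([eta], lr=0.1)

print("expect   ", experiment(storch.method.Expect("x")))
print("score fn ", experiment(storch.method.ScoreFunction("x", n_samples=10)))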
def experiment(method):
    for i in range(2000):
        optim.zero_grad()
        b = Bernoulli(logits=eta)
        x = method(b)
        cost = torch.sum((x - p) ** 2, -1)
        storch.add_cost(cost, "cost")
        storch.backward()
        optim.step()

        if i % 100 == 0:
            print(eta)
def estimate_variance(method):
    gradient_samples = []
    for i in range(1000):
        f, c = compute_f(method)
        storch.add_cost(f, "f")
        storch.backward()
        gradient_samples.append(c.grad)

    gradients = storch.gather_samples(gradient_samples, "gradients")
    # print(gradients)
    print("variance", storch.variance(gradients, "gradients"))
    print("mean", storch.reduce_plates(gradients, "gradients"))
    print("st dev", torch.sqrt(storch.variance(gradients, "gradients")))
    print(type(gradients))
    print(gradients.shape)
    print(gradients.plates)
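# Hypothetical usage: estimate the gradient variance of two different estimators
# for the same stochastic computation. The plate name "e" matches the sample
# site inside compute_f; ScoreFunction is included purely as a point of comparison.
estimate_variance(storch.method.Reparameterization("e", n_samples=1))
estimate_variance(storch.method.ScoreFunction("e", n_samples=1))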
def train(method: storch.method.Method, train_loader):
    model = DiscreteVAE()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    for epoch in range(5):
        print("Epoch: " + str(epoch + 1))
        for i, (data, _) in enumerate(train_loader):
            # if i % 300 == 0:
            #     evaluate(method, model, data, optimizer)
            optimizer.zero_grad()
            generative_story(method, model, data)
            elbo = storch.backward()
            optimizer.step()
            if i % 300 == 0:
                print("Training ELBO " + str(elbo.item()))
import storch
import torch
from torch.distributions import Bernoulli
from storch.method import GumbelSoftmax

torch.manual_seed(0)
p = torch.tensor(0.5, requires_grad=True)

for i in range(10000):
    sample = GumbelSoftmax(f"sample_{i}")(Bernoulli(p))
    storch.add_cost(sample, f"cost_{i}")

storch.backward()
print("Finished")
# The names theta, mu_prior, lax_method, expect, Normal, Categorical and
# Iterable are assumed to be defined or imported earlier in the surrounding test.
score_method = storch.method.ScoreFunction("white_noise_1", n_samples=2)
infer_method = storch.method.Infer("white_noise_2", Normal)


def loss(v):
    return torch.nn.MSELoss(reduction="none")(v, theta).mean(dim=-1)


mu = lax_method(Normal(mu_prior, 1))
k = expect(
    Categorical(
        probs=torch.tensor([[0.1, 0.3, 0.6], [0.1, 0.8, 0.1]], requires_grad=True)
    ),
)
agg_v = 0.0
s1 = 1.0
for i in range(2):
    k1, k2 = 0, 0
    if i == 1:
        k1 = k[:, 0]
        k2 = k[:, 1]
    s1 = score_method(Normal(mu + k1, 1))
    aaa = -mu + s1 * k2
    s2 = infer_method(Normal(-mu + s1 * k2, 1))
    # plus = lambda a, b: a + b
    # plus = storch.deterministic(plus)
    agg_v = agg_v + s1 + s2 * mu

print(isinstance(agg_v, Iterable))
storch.add_cost(loss(agg_v), "loss")
storch.backward(debug=False, print_costs=True)
import storch
import torch
from torch.distributions import Bernoulli, OneHotCategorical

expect = storch.method.Expect("x")
probs = torch.tensor([0.95, 0.01, 0.01, 0.01, 0.01, 0.01], requires_grad=True)
indices = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])

b = OneHotCategorical(probs=probs)
z = expect.sample(b)
c = (2.4 * z * indices).sum(-1)
storch.add_cost(c, "no_baseline_cost")
storch.backward()
expect_grad = z.grad["probs"].clone()


def eval(grads):
    print("----------------------------------")
    grad_samples = storch.gather_samples(grads, "variance")
    mean = storch.reduce_plates(grad_samples, plates=["variance"])
    print("mean grad", mean)
    print("expected grad", expect_grad)
    print("specific_diffs", (mean - expect_grad) ** 2)
    mse = storch.reduce_plates((grad_samples - expect_grad) ** 2).sum()
    print("MSE", mse)
    bias = (storch.reduce_plates((mean - expect_grad) ** 2)).sum()
    print("bias", bias)
    return bias
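# Hypothetical sketch of how eval() might be called: draw repeated single-sample
# gradient estimates of the same cost with a score-function estimator and compare
# them against the exact Expect gradient computed above. The number of
# repetitions (100) is illustrative.
score = storch.method.ScoreFunction("x", n_samples=1)
grad_list = []
for _ in range(100):
    probs.grad = None                  # clear gradients accumulated on probs
    z = score.sample(b)
    c = (2.4 * z * indices).sum(-1)
    storch.add_cost(c, "no_baseline_cost")
    storch.backward()
    grad_list.append(z.grad["probs"])
eval(grad_list)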
# Body of compute_f; a, b and c are assumed to be scalar tensors with
# requires_grad=True that were created earlier (their values are omitted here).
def compute_f(method):
    d = a + b

    # Sample e from a normal distribution using reparameterization
    normal_distribution = Normal(b + c, 1)
    e = method(normal_distribution)

    f = d * e * e
    return f, c


# e*e follows a noncentral chi-squared distribution
# https://en.wikipedia.org/wiki/Noncentral_chi-squared_distribution
# exp_f = d * (1 + mu * mu)

repar = Reparameterization("e", n_samples=1)

f, c = compute_f(repar)
storch.add_cost(f, "f")
print(storch.backward())
print("first derivative estimate", c.grad)

f, c = compute_f(repar)
storch.add_cost(f, "f")
print(storch.backward())
print("second derivative estimate", c.grad)
def train(epoch, model, train_loader, device, optimizer, args, writer):
    model.train()
    train_loss = 0
    for batch_idx, (data, _) in enumerate(train_loader):
        data = data.to(device)
        optimizer.zero_grad()
        storch.reset()

        # Denote the minibatch dimension as being independent
        data = storch.denote_independent(data.view(-1, 784), 0, "data")
        recon_batch, KLD, z = model(data)
        storch.add_cost(loss_function(recon_batch, data), "reconstruction")
        cost = backward()
        train_loss += cost.item()
        optimizer.step()
        cond_log = batch_idx % args.log_interval == 0

        if cond_log:
            step = 100.0 * batch_idx / len(train_loader)
            global_step = 100 * (epoch - 1) + step

            # Variance of expect method is 0 by definition.
            variances = {}
            if args.method != "expect" and args.variance_samples > 1:
                _consider_param = "probs"
                if args.latents < 3:
                    old_method = model.sampling_method
                    model.sampling_method = Expect("z")
                    optimizer.zero_grad()
                    recon_batch, _, z = model(data)
                    storch.add_cost(loss_function(recon_batch, data), "reconstruction")
                    backward()
                    expect_grad = storch.reduce_plates(
                        z.grad[_consider_param]
                    ).detach_tensor()
                    optimizer.zero_grad()
                    model.sampling_method = old_method

                grads = {n: [] for n in z.grad}
                for i in range(args.variance_samples):
                    optimizer.zero_grad()
                    recon_batch, _, z = model(data)
                    storch.add_cost(loss_function(recon_batch, data), "reconstruction")
                    backward()
                    for param_name, grad in z.grad.items():
                        # Make sure to reduce the data dimension and detach, for memory reasons.
                        grads[param_name].append(
                            storch.reduce_plates(grad).detach_tensor()
                        )

                variances = {}
                for param_name, gradz in grads.items():
                    # Create a new independent dimension for the different gradient samples
                    grad_samples = storch.gather_samples(gradz, "variance")
                    # Compute the variance over this independent dimension
                    variances[param_name] = storch.variance(
                        grad_samples, "variance"
                    )._tensor
                    if param_name == _consider_param and args.latents < 3:
                        mean = storch.reduce_plates(grad_samples, "variance")
                        mse = storch.reduce_plates(
                            (grad_samples - expect_grad) ** 2
                        ).sum()
                        bias = (storch.reduce_plates((mean - expect_grad) ** 2)).sum()
                        print("mse", mse._tensor.item())
                        # Should approach 0 when increasing variance_samples for unbiased estimators.
                        print("bias", bias._tensor.item())
                        writer.add_scalar("train/probs_bias", bias._tensor, global_step)
                        writer.add_scalar("train/probs_mse", mse._tensor, global_step)

            print(
                "Train Epoch: {} [{}/{} ({:.0f}%)]\tCost: {:.6f}\t Logits var {}".format(
                    epoch,
                    batch_idx * len(data),
                    len(train_loader.dataset),
                    step,
                    cost.item(),
                    variances,
                )
            )
            writer.add_scalar("train/ELBO", cost, global_step)
            for param_name, var in variances.items():
                writer.add_scalar("train/variance/" + param_name, var, global_step)

    avg_train_loss = train_loss / (batch_idx + 1)
    print("====> Epoch: {} Average loss: {:.4f}".format(epoch, avg_train_loss))
    return avg_train_loss
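# The training loop above relies on a loss_function helper defined elsewhere in
# the example. A minimal sketch, assuming a Bernoulli decoder over flattened
# 784-pixel binary inputs (the actual definition may differ):
def loss_function(recon_x, x):
    # Per-datapoint binary cross entropy, summed over pixels.
    return torch.nn.functional.binary_cross_entropy(recon_x, x, reduction="none").sum(-1)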