def forward(nu_value: Tensor, sigma_unconstrained_value: Tensor,
            beta_value: Tensor) -> Tensor:
    sigma_constrained_value = sigma_unconstrained_value.exp()
    mu = X.mm(beta_value)

    # For this model, we need to compute the following three scores:

    # We need to compute the first and second gradient of this score with respect
    # to nu_value.
    nu_score = dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(Y).sum() \
        + nu.log_prob(nu_value)

    # We need to compute the first and second gradient of this score with respect
    # to sigma_unconstrained_value.
    sigma_score = dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(Y).sum() \
        + sigma.log_prob(sigma_constrained_value) \
        + sigma_unconstrained_value

    # We need to compute the first and second gradient of this score with respect
    # to beta_value.
    beta_score = dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(Y).sum() \
        + beta.log_prob(beta_value)

    return nu_score.sum() + sigma_score.sum() + beta_score.sum()
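The comments above call for the first and second derivatives of each score. A minimal, self-contained sketch of how such derivatives can be taken with `torch.autograd.grad` (not from the original code; the scalar `nu_value` and the toy data are assumptions made here for illustration):

import torch
import torch.distributions as dist

# Hypothetical toy setup: a scalar degrees-of-freedom parameter and fake data.
nu_value = torch.tensor(5.0, requires_grad=True)
y = torch.randn(10)

# Score analogous to `nu_score` above, reduced to a scalar.
score = dist.StudentT(nu_value, loc=0.0, scale=1.0).log_prob(y).sum()

# First derivative w.r.t. nu_value; keep the graph so it can be differentiated again.
(first,) = torch.autograd.grad(score, nu_value, create_graph=True)
# Second derivative w.r.t. nu_value.
(second,) = torch.autograd.grad(first, nu_value)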
def miwae_impute(iota_x, mask, L, d, p_z, encoder, decoder):
    batch_size = iota_x.shape[0]
    p = iota_x.shape[1]

    out_encoder = encoder(iota_x)
    q_zgivenxobs = td.Independent(
        td.Normal(loc=out_encoder[..., :d],
                  scale=torch.nn.Softplus()(out_encoder[..., d:(2 * d)])), 1)

    zgivenx = q_zgivenxobs.rsample([L])
    zgivenx_flat = zgivenx.reshape([L * batch_size, d])

    out_decoder = decoder(zgivenx_flat)
    all_means_obs_model = out_decoder[..., :p]
    all_scales_obs_model = torch.nn.Softplus()(out_decoder[..., p:(2 * p)]) + 0.001
    all_degfreedom_obs_model = torch.nn.Softplus()(
        out_decoder[..., (2 * p):(3 * p)]) + 3

    data_flat = torch.Tensor.repeat(iota_x, [L, 1]).reshape([-1, 1]).cuda()
    tiledmask = torch.Tensor.repeat(mask, [L, 1]).cuda()

    all_log_pxgivenz_flat = torch.distributions.StudentT(
        loc=all_means_obs_model.reshape([-1, 1]),
        scale=all_scales_obs_model.reshape([-1, 1]),
        df=all_degfreedom_obs_model.reshape([-1, 1])).log_prob(data_flat)
    all_log_pxgivenz = all_log_pxgivenz_flat.reshape([L * batch_size, p])

    logpxobsgivenz = torch.sum(all_log_pxgivenz * tiledmask,
                               1).reshape([L, batch_size])
    logpz = p_z.log_prob(zgivenx)
    logq = q_zgivenxobs.log_prob(zgivenx)

    xgivenz = td.Independent(
        td.StudentT(loc=all_means_obs_model,
                    scale=all_scales_obs_model,
                    df=all_degfreedom_obs_model), 1)

    imp_weights = torch.nn.functional.softmax(
        logpxobsgivenz + logpz - logq,
        0)  # these are w_1,....,w_L for all observations in the batch
    xms = xgivenz.sample().reshape([L, batch_size, p])
    xm = torch.einsum('ki,kij->ij', imp_weights, xms)
    return xm
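A hypothetical smoke test for `miwae_impute`, not part of the original code. It assumes a CUDA device is available (the function calls `.cuda()` internally) and uses single linear layers as stand-ins for the encoder and decoder, with the decoder width following the 3*p convention (means, scales, degrees of freedom) used above:

import torch
import torch.distributions as td

p, d, L, n = 5, 2, 10, 8  # features, latent dim, importance samples, batch size
encoder = torch.nn.Linear(p, 2 * d).cuda()   # loc and pre-softplus scale of q(z|x)
decoder = torch.nn.Linear(d, 3 * p).cuda()   # means, scales, degrees of freedom
p_z = td.Independent(
    td.Normal(torch.zeros(d).cuda(), torch.ones(d).cuda()), 1)

x = torch.randn(n, p).cuda()
mask = torch.bernoulli(torch.full((n, p), 0.8)).cuda()  # 1 = observed, 0 = missing

x_imputed = miwae_impute(x * mask, mask, L, d, p_z, encoder, decoder)
print(x_imputed.shape)  # torch.Size([8, 5])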
def kl_grad_shift_plot(
    ax: Axes,
    model: VariationalRegressor,
    training_dataset: Tuple[torch.Tensor],
    plot_dataset: Tuple[torch.Tensor] = plot_dataset,
) -> Axes:
    # Unpacking
    x_plot, y_plot, _ = plot_dataset
    x_train, _ = training_dataset

    # Plot X OOD
    with torch.set_grad_enabled(True):
        x_train.requires_grad = True
        μ_x, α_x, β_x = model(x_train)
        kl_divergence = model.kl(α_x, β_x, model.prior_α, model.prior_β)
        x_out = model.ood_x(
            x_train,
            kl=kl_divergence,
        )
        x_train, x_out = (
            x_train.detach().numpy().flatten(),
            x_out.detach().numpy().flatten(),
        )

    # Reduce clutter by limiting number of points displayed
    N_display = 100
    if x_out is not None and x_out.size > 0:
        ax.scatter(
            np.random.choice(x_out, N_display),
            np.zeros((N_display, )),
            color=colours["primaryRed"],
            alpha=0.5,
            marker="x",
            s=8,
            label=r"$\hat{x}_{n}$",
        )
    ax.scatter(
        np.random.choice(x_train, N_display),
        np.zeros((N_display, )),
        color=colours["navyBlue"],
        alpha=0.5,
        marker="x",
        s=8,
        label=r"$x_{n}$",
    )

    # Plot KL for reference
    # Plot box
    top_kl_plot = 3.5
    plot_x_range = [data_range_plot[0] - 1, data_range_plot[1] + 1]

    with torch.set_grad_enabled(False):
        # Forward pass
        μ_x, α_x, β_x = model(torch.Tensor(x_plot))
        kl = model.kl(α_x, β_x, model.prior_α, model.prior_β)
        ellk = model.ellk(μ_x, α_x, β_x, torch.Tensor(y_plot))
        mllk = D.StudentT(2 * α_x, μ_x,
                          torch.sqrt(β_x / α_x)).log_prob(y_plot)

        # TODO likelihood remove once study over
        gm = GaussianMixture(n_components=5).fit(x_train.reshape(-1, 1))
        llk = np.exp(gm.score_samples(x_plot.reshape(-1, 1))).reshape(-1, 1)
        kl_llk = kl - llk

    # KL
    ax.plot(
        x_plot,
        kl,
        "o",
        label=r"KL(q($\lambda\mid$x)$\Vert$p($\lambda$))",
        markersize=2,
        markerfacecolor=(*colours_rgb["navyBlue"], 0.6),
        markeredgewidth=1,
        markeredgecolor=(*colours_rgb["navyBlue"], 0.1),
    )
    # # ELLK
    ax.plot(
        x_plot,
        ellk,
        "o",
        label=r"ELLK(x,y,$\lambda$)",
        markersize=2,
        markerfacecolor=(*colours_rgb["orange"], 0.6),
        markeredgewidth=1,
        markeredgecolor=(*colours_rgb["orange"], 0.1),
    )
    # # MLLK
    # ax.plot(
    #     x_plot,
    #     mllk,
    #     "o",
    #     label=r"MLLK(x,y)",
    #     markersize=2,
    #     markerfacecolor=(*colours_rgb["red"], 0.6),
    #     markeredgewidth=1,
    #     markeredgecolor=(*colours_rgb["red"], 0.1),
    # )
    # # KL penalised by density llk
    # ax.plot(
    #     x_plot,
    #     kl_llk,
    #     "o",
    #     label=r"KL(x)-LLK(x)",
    #     markersize=2,
    #     markerfacecolor=(*colours_rgb["brightGreen"], 0.6),
    #     markeredgewidth=1,
    #     markeredgecolor=(*colours_rgb["brightGreen"], 0.1),
    # )
    # # Density likelihood
    # ax.plot(
    #     x_plot,
    #     llk,
    #     "o",
    #     label=r"LLK(x)",
    #     markersize=2,
    #     markerfacecolor=(*colours_rgb["purple"], 0.6),
    #     markeredgewidth=1,
    #     markeredgecolor=(*colours_rgb["purple"], 0.1),
    # )
    # Gamma parameters
    # ax.plot(
    #     x_plot,
    #     α_x,
    #     "o",
    #     label=r"$\alpha$(x)",
    #     markersize=2,
    #     markerfacecolor=(*colours_rgb["pink"], 0.6),
    #     markeredgewidth=1,
    #     markeredgecolor=(*colours_rgb["pink"], 0.1),
    # )
    # ax.plot(
    #     x_plot,
    #     β_x,
    #     "o",
    #     label=r"$\beta$(x)",
    #     markersize=2,
    #     markerfacecolor=(*colours_rgb["purple"], 0.6),
    #     markeredgewidth=1,
    #     markeredgecolor=(*colours_rgb["purple"], 0.1),
    # )
    # Gamma split aleatoric epistemic
    # ax.plot(
    #     x_plot,
    #     β_x / α_x,
    #     "o",
    #     label=r"$\sigma_{aleatoric}(x) = \frac{\beta(x)}{\alpha(x)}$",
    #     markersize=2,
    #     markerfacecolor=(*colours_rgb["pink"], 0.6),
    #     markeredgewidth=1,
    #     markeredgecolor=(*colours_rgb["pink"], 0.1),
    # )
    # ax.plot(
    #     x_plot,
    #     α_x / (α_x - 1),
    #     "o",
    #     label=r"$\sigma_{epistemic}(x) = \frac{\alpha(x)}{\alpha(x) - 1}$",
    #     markersize=2,
    #     markerfacecolor=(*colours_rgb["purple"], 0.6),
    #     markeredgewidth=1,
    #     markeredgecolor=(*colours_rgb["purple"], 0.1),
    # )
    # ax.plot(
    #     x_plot,
    #     α_x / β_x,
    #     "o",
    #     label=r"$\frac{\alpha(x)}{\beta(x)}$",
    #     markersize=2,
    #     markerfacecolor=(*colours_rgb["primaryRed"], 0.6),
    #     markeredgewidth=1,
    #     markeredgecolor=(*colours_rgb["primaryRed"], 0.1),
    # )

    # Misc
    ax.grid(True)
    ax.set_xlim(plot_x_range)
    ax.set_ylim([-top_kl_plot, top_kl_plot])
    ax.set_xlabel("x")
    ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0.0)

    return ax
def log_likelihood_student(x, mu, sigma_square, df=2.0):
    sigma = sqrt(sigma_square)
    dist = distributions.StudentT(df=df, loc=mu, scale=sigma)
    return torch.sum(dist.log_prob(x), dim=1)
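A small usage sketch, not from the source, assuming that `sqrt` and `distributions` in the surrounding module refer to `math.sqrt` and `torch.distributions` (which implies a scalar `sigma_square`) and that `x` and `mu` are shaped (batch, features):

import torch

x = torch.randn(4, 3)   # 4 observations, 3 features each
mu = torch.zeros(4, 3)  # per-feature location

ll = log_likelihood_student(x, mu, sigma_square=1.0, df=2.0)
print(ll.shape)         # torch.Size([4]): one log-likelihood per observation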
def Y(self) -> dist.Distribution:
    mu = torch.mv(self.X(), self.beta()) + self.alpha()
    return dist.StudentT(self.nu(), mu, self.sigma())
def get_ns_model_source(prob_label):
    """
    Given the problem key prob_label, return (ns, p, rx, cs), a tuple of

    - ns: a list of sample sizes n's
    - p: a kcgof.cdensity.UnnormalizedCondDensity representing the model p
    - rx: a callable object that takes n (sample size) and returns a torch
        tensor of size n x d, where d is the appropriate dimension.
        Represents the marginal distribution of x.
    - cs: a kcgof.cdata.CondSource. The CondSource generates samples from the
        distribution r.

    * (p, cs) together specify a conditional goodness-of-fit testing problem.
    """
    slope_h0_d5 = torch.arange(5) + 1.0
    # slope_h0_d20 = torch.arange(20) + 1.0
    prob2tuples = {
        # A case where H0 is true. Gaussian least squares model.
        'gaussls_h0_d5': (
            [200, 300, 400, 500],
            # p
            cden.CDGaussianOLS(slope=slope_h0_d5, c=0, variance=1.0),
            # rx
            cden.RXIsotropicGaussian(dx=5),
            # CondSource for r
            cdat.CSGaussianOLS(slope=slope_h0_d5, c=0, variance=1.0),
        ),
        # Simplest case where H0 is true.
        'gaussls_h0_d1': (
            [200, 300, 500],
            # p
            cden.CDGaussianOLS(slope=torch.tensor(1.0), c=1.0, variance=1.0),
            # rx
            cden.RXIsotropicGaussian(dx=1),
            # CondSource for r
            cdat.CSGaussianOLS(slope=torch.tensor(1.0), c=1.0, variance=1.0),
        ),
        # An obvious case for the Gauss LS problem. H1 true. Very easy.
        'gaussls_h1_d1_easy': (
            [100, 200, 300],
            # p
            cden.CDGaussianOLS(slope=torch.tensor(1.0), c=1.0, variance=1.0),
            # rx
            cden.RXIsotropicGaussian(dx=1),
            # CondSource for r
            cdat.CSGaussianOLS(slope=torch.tensor(2.0), c=-1.0, variance=1.0),
        ),
        # H1 case
        # r(y|x) = same model with a slightly different m (slope).
        # p(y|x) = Gaussian pdf[y - mx - q*x^2 - c]. Least squares with Gaussian noise.
        # r(x) = Gaussian N(0,1)?
        'quad_quad_d1': (
            [100, 300, 500],
            # p
            cden.CDAdditiveNoiseRegression(f=lambda X: 1.8 * X + X**2 + 1.0,
                                           noise=dists.Normal(0, 1),
                                           dx=1),
            # rx (prior on x)
            cden.RXIsotropicGaussian(dx=1),
            # CondSource for r
            cdat.CSAdditiveNoiseRegression(f=lambda X: 2.0 * X + X**2 + 1.0,
                                           noise=dists.Normal(0, 1),
                                           dx=1)),
        # H1 case. dx=dy=1. t(5) noise. Gaussian ordinary LS.
        # r(y|x) = t(5) noise + mx + c, with m = c = 1
        # p(y|x) = Gaussian pdf[y - (mx + c)], same m and c
        # r(x) can be any, N(0,1)?
        'gauss_t_d1': (
            [100, 300, 500],
            # p
            cden.CDGaussianOLS(slope=torch.ones(1),
                               c=torch.ones(1),
                               variance=1.0),
            # rx
            cden.RXIsotropicGaussian(dx=1),
            # CondSource for r
            cdat.CSAdditiveNoiseRegression(f=lambda X: 1.0 + X,
                                           noise=dists.StudentT(df=5),
                                           dx=1)),
        # H1 case (same as Zheng's):
        # r(y|x) = Gaussian pdf[y - (mx + q*x^2 + c)], m = 1, c = 1, q should be low
        # p(y|x) = Gaussian pdf[y - (mx + c)], m = 1 and c = 1
        # r(x) = U[-3,3] (linearity breaks down from approximately |X| > 2)
        'quad_vs_lin_d1': (
            [100, 400, 700, 1000],
            # p(y|x)
            cden.CDGaussianOLS(slope=torch.tensor([1.0]),
                               c=torch.tensor([1.0]),
                               variance=1.0),
            # rx
            lambda n: dists.Uniform(low=-2.0, high=2.0).sample((n, 1)),
            # CondSource for r(y|x)
            cdat.CSAdditiveNoiseRegression(
                f=lambda X: 1.0 * X + 0.1 * X**2 + 1.0,
                noise=dists.Normal(0, 1.0),
                dx=1)),
    }  # end of prob2tuples

    # add more problems to prob2tuples
    prob2tuples['g_het_dx3'] = create_prob_g_het(dx=3)
    prob2tuples['g_het_dx4'] = create_prob_g_het(dx=4)
    prob2tuples['g_het_dx5'] = create_prob_g_het(dx=5)
    prob2tuples['g_het_dx10'] = create_prob_g_het(dx=10)

    if prob_label not in prob2tuples:
        raise ValueError('Unknown problem label. Need to be one of %s' %
                         str(list(prob2tuples.keys())))
    return prob2tuples[prob_label]
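A hypothetical way to consume the returned tuple, based only on the docstring above (the problem key and sample size are arbitrary choices for illustration):

ns, p, rx, cs = get_ns_model_source('gaussls_h0_d1')
n = ns[0]
X = rx(n)  # an n x d tensor drawn from the marginal r(x), per the docstring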
def test_step(self, batch, batch_idx):
    x, y = batch
    μ_x, α_x, β_x = self(x)
    log_likelihood = self.ellk(μ_x, α_x, β_x, y)
    kl_divergence = self.kl(α_x, β_x, self.prior_α, self.prior_β)
    if self.mse_mode:
        loss = F.mse_loss(μ_x, y)
    else:
        loss = -self.elbo(log_likelihood, kl_divergence, train=False)
    y_pred = self.predictive_mean(x)
    m_p = D.StudentT(2 * α_x, loc=μ_x, scale=torch.sqrt(β_x / α_x))

    # ---------
    # Metrics
    self.log(TEST_LOSS, loss, on_epoch=True)
    self.log(TEST_ELBO, -loss, on_epoch=True)
    self.log(TEST_MLLK, torch.sum(m_p.log_prob(y)),
             on_epoch=True)  # i.i.d assumption

    # Mean fit
    self.log(TEST_MEAN_FIT_MAE, F.l1_loss(y_pred, y), on_epoch=True)
    self.log(TEST_MEAN_FIT_RMSE,
             torch.sqrt(F.mse_loss(y_pred, y)),
             on_epoch=True)

    # Variance fit
    pred_var = self.predictive_std(x)**2
    empirical_var = (y_pred - y)**2
    self.log(TEST_VARIANCE_FIT_MAE,
             F.l1_loss(pred_var, empirical_var),
             on_epoch=True)
    self.log(
        TEST_VARIANCE_FIT_RMSE,
        torch.sqrt(F.mse_loss(pred_var, empirical_var)),
        on_epoch=True,
    )

    # Sample fit
    ancestral = False
    if ancestral:
        lbds = D.Gamma(α_x, β_x).sample((1, ))
        samples_y = (D.Normal(μ_x, 1 / torch.sqrt(lbds)).sample(
            (1, )).reshape(y.shape))
    else:
        samples_y = m_p.sample((1, )).reshape(y.shape)
    self.log(TEST_SAMPLE_FIT_MAE, F.l1_loss(samples_y, y), on_epoch=True)
    self.log(TEST_SAMPLE_FIT_RMSE,
             torch.sqrt(F.mse_loss(samples_y, y)),
             on_epoch=True)

    # Model expected log likelihood
    self.log(TEST_ELLK, torch.mean(log_likelihood), on_epoch=True)

    # Model KL
    self.log(TEST_KL, torch.mean(kl_divergence), on_epoch=True)

    # Noise
    x_noisy = generate_noise_for_model_test(x)
    μ_x, α_x, β_x = self(x_noisy)
    sigma = self.predictive_std(x_noisy)
    kl_divergence = self.kl(α_x, β_x, self.prior_α, self.prior_β)

    # Noise likelihood
    self.log(NOISE_UNCERTAINTY, torch.mean(sigma), on_epoch=True)

    # Noise KL
    self.log(NOISE_KL, torch.mean(kl_divergence), on_epoch=True)

    return loss
def beta_baseline(self) -> dist.Distribution:
    return dist.StudentT(self.dof_baseline, 0.0, self.scale_baseline)
n_samples = 500
cols = 2
rows = 2
fig = plt.figure(figsize=(7 + cols, 2 + rows), facecolor='white', dpi=150)

x = np.linspace(-8, 8, 200)
x = torch.from_numpy(x).float()
# print(m.log_prob(x))

m = d.Cauchy(torch.tensor([0.0]), torch.tensor([1.]))
probs = torch.exp(m.log_prob(x))
m = d.Normal(torch.tensor([0.0]), torch.tensor([1.]))
probs2 = torch.exp(m.log_prob(x))
m = d.StudentT(torch.tensor([2.0]))
probs3 = torch.exp(m.log_prob(x))

ax = plt.subplot2grid((rows, cols), (0, 0), frameon=False)
ax.plot(numpy(x), numpy(probs), label='Cauchy')
ax.plot(numpy(x), numpy(probs2), label='Normal')
ax.plot(numpy(x), numpy(probs3), label='StudentT')
ax.legend()

m = d.Cauchy(torch.tensor([0.0]), torch.tensor([1.]))
samps = m.sample([n_samples])
m = d.Normal(torch.tensor([0.0]), torch.tensor([1.]))
samps2 = m.sample([n_samples])
def variance(self):
    return dists.StudentT(self.df, self.loc, self.scale).variance
def expectation(self):
    return dists.StudentT(self.df, self.loc, self.scale).mean
def entropy(self):
    return dists.StudentT(self.df, self.loc, self.scale).entropy()
def sample(self, batch_size):
    model = dists.StudentT(self.df, self.loc, self.scale)
    return model.rsample((batch_size, ))
def log_prob(self, value):
    model = dists.StudentT(self.df, self.loc, self.scale)
    return model.log_prob(value).sum(-1)
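The last five methods appear to belong to a thin wrapper around `torch.distributions.StudentT` that stores `df`, `loc` and `scale`. A self-contained sketch of such a wrapper, assembled here purely for illustration (the class name and constructor are assumptions, not from the source):

import torch
import torch.distributions as dists


class StudentTWrapper:
    """Hypothetical container tying the methods above together."""

    def __init__(self, df, loc, scale):
        self.df, self.loc, self.scale = df, loc, scale

    def sample(self, batch_size):
        # Reparameterised samples: (batch_size,) prepended to the batch shape.
        return dists.StudentT(self.df, self.loc, self.scale).rsample((batch_size, ))

    def log_prob(self, value):
        # Sum the per-dimension log densities over the last axis.
        return dists.StudentT(self.df, self.loc, self.scale).log_prob(value).sum(-1)


p = StudentTWrapper(df=torch.tensor(4.0), loc=torch.zeros(2), scale=torch.ones(2))
z = p.sample(batch_size=8)   # shape (8, 2)
print(p.log_prob(z).shape)   # torch.Size([8])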