Example #1
    def forward(nu_value: Tensor, sigma_unconstrained_value: Tensor, beta_value: Tensor) -> Tensor:
        sigma_constrained_value = sigma_unconstrained_value.exp()
        mu = X.mm(beta_value)

        # For this model, we need to compute the following three scores.
        # We need to compute the first and second gradients of this score with
        # respect to nu_value.
        nu_score = dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(Y).sum() \
            + nu.log_prob(nu_value)

        # We need to compute the first and second gradients of this score with
        # respect to sigma_unconstrained_value.
        sigma_score = dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(Y).sum() \
            + sigma.log_prob(sigma_constrained_value) \
            + sigma_unconstrained_value

        # We need to compute the first and second gradients of this score with
        # respect to beta_value.
        beta_score = dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(Y).sum() \
            + beta.log_prob(beta_value)

        return nu_score.sum() + sigma_score.sum() + beta_score.sum()
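A minimal sketch (not part of the original example) of how the gradients mentioned in the comments could be obtained with torch.autograd, assuming forward and the three leaf tensors are defined as above with requires_grad=True:

import torch

score = forward(nu_value, sigma_unconstrained_value, beta_value)
# First gradients of the total score with respect to each latent value;
# create_graph=True keeps the graph so that second derivatives can be taken.
grad_nu, grad_sigma, grad_beta = torch.autograd.grad(
    score, (nu_value, sigma_unconstrained_value, beta_value), create_graph=True)
# Second derivative with respect to nu_value (the gradient of the summed first
# gradient, i.e. a Hessian-vector product with a vector of ones).
hess_nu, = torch.autograd.grad(grad_nu.sum(), nu_value)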
Example #2
def miwae_impute(iota_x, mask, L, d, p_z, encoder, decoder):
    batch_size = iota_x.shape[0]
    p = iota_x.shape[1]
    out_encoder = encoder(iota_x)
    q_zgivenxobs = td.Independent(
        td.Normal(loc=out_encoder[..., :d],
                  scale=torch.nn.Softplus()(out_encoder[..., d:(2 * d)])), 1)

    zgivenx = q_zgivenxobs.rsample([L])
    zgivenx_flat = zgivenx.reshape([L * batch_size, d])

    out_decoder = decoder(zgivenx_flat)
    all_means_obs_model = out_decoder[..., :p]
    all_scales_obs_model = torch.nn.Softplus()(out_decoder[...,
                                                           p:(2 * p)]) + 0.001
    all_degfreedom_obs_model = torch.nn.Softplus()(
        out_decoder[..., (2 * p):(3 * p)]) + 3

    data_flat = torch.Tensor.repeat(iota_x, [L, 1]).reshape([-1, 1]).cuda()
    tiledmask = torch.Tensor.repeat(mask, [L, 1]).cuda()

    all_log_pxgivenz_flat = torch.distributions.StudentT(
        loc=all_means_obs_model.reshape([-1, 1]),
        scale=all_scales_obs_model.reshape([-1, 1]),
        df=all_degfreedom_obs_model.reshape([-1, 1])).log_prob(data_flat)
    all_log_pxgivenz = all_log_pxgivenz_flat.reshape([L * batch_size, p])

    logpxobsgivenz = torch.sum(all_log_pxgivenz * tiledmask,
                               1).reshape([L, batch_size])
    logpz = p_z.log_prob(zgivenx)
    logq = q_zgivenxobs.log_prob(zgivenx)

    xgivenz = td.Independent(
        td.StudentT(loc=all_means_obs_model,
                    scale=all_scales_obs_model,
                    df=all_degfreedom_obs_model), 1)

    imp_weights = torch.nn.functional.softmax(
        logpxobsgivenz + logpz - logq,
        0)  # these are w_1,....,w_L for all observations in the batch
    xms = xgivenz.sample().reshape([L, batch_size, p])
    xm = torch.einsum('ki,kij->ij', imp_weights, xms)

    return xm
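For reference (an addition, not part of the snippet): the softmax above forms self-normalised importance weights, and the returned imputation is their weighted combination of model samples,

\[
w_l = \frac{p(x_{\mathrm{obs}} \mid z_l)\, p(z_l)\, /\, q(z_l \mid x_{\mathrm{obs}})}
           {\sum_{m=1}^{L} p(x_{\mathrm{obs}} \mid z_m)\, p(z_m)\, /\, q(z_m \mid x_{\mathrm{obs}})},
\qquad
\hat{x} = \sum_{l=1}^{L} w_l\, x_l, \quad x_l \sim p(x \mid z_l),
\]

which is exactly what the einsum over imp_weights and xms computes.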
Example #3
def kl_grad_shift_plot(
    ax: Axes,
    model: VariationalRegressor,
    training_dataset: Tuple[torch.Tensor],
    plot_dataset: Tuple[torch.Tensor] = plot_dataset,
) -> Axes:
    # Unpacking
    x_plot, y_plot, _ = plot_dataset
    x_train, _ = training_dataset

    # Plot X OOD
    with torch.set_grad_enabled(True):
        x_train.requires_grad = True
        μ_x, α_x, β_x = model(x_train)
        kl_divergence = model.kl(α_x, β_x, model.prior_α, model.prior_β)
        x_out = model.ood_x(
            x_train,
            kl=kl_divergence,
        )
        x_train, x_out = (
            x_train.detach().numpy().flatten(),
            x_out.detach().numpy().flatten(),
        )

    # Reduce clutter by limiting number of points displayed
    N_display = 100

    if x_out is not None and x_out.size > 0:
        ax.scatter(
            np.random.choice(x_out, N_display),
            np.zeros((N_display, )),
            color=colours["primaryRed"],
            alpha=0.5,
            marker="x",
            s=8,
            label=r"$\hat{x}_{n}$",
        )
    ax.scatter(
        np.random.choice(x_train, N_display),
        np.zeros((N_display, )),
        color=colours["navyBlue"],
        alpha=0.5,
        marker="x",
        s=8,
        label=r"$x_{n}$",
    )

    # Plot KL for reference

    # Plot box
    top_kl_plot = 3.5
    plot_x_range = [data_range_plot[0] - 1, data_range_plot[1] + 1]

    with torch.set_grad_enabled(False):
        # Forward pass
        μ_x, α_x, β_x = model(torch.Tensor(x_plot))
        kl = model.kl(α_x, β_x, model.prior_α, model.prior_β)
        ellk = model.ellk(μ_x, α_x, β_x, torch.Tensor(y_plot))
        mllk = D.StudentT(2 * α_x, μ_x, torch.sqrt(β_x / α_x)).log_prob(torch.Tensor(y_plot))

        # TODO likelihood remove once study over
        gm = GaussianMixture(n_components=5).fit(x_train.reshape(-1, 1))
        llk = np.exp(gm.score_samples(x_plot.reshape(-1, 1))).reshape(-1, 1)
        kl_llk = kl - llk

    # KL
    ax.plot(
        x_plot,
        kl,
        "o",
        label=r"KL(q($\lambda\mid$x)$\Vert$p($\lambda$))",
        markersize=2,
        markerfacecolor=(*colours_rgb["navyBlue"], 0.6),
        markeredgewidth=1,
        markeredgecolor=(*colours_rgb["navyBlue"], 0.1),
    )
    # # ELLK
    ax.plot(
        x_plot,
        ellk,
        "o",
        label=r"ELLK(x,y,$\lambda$)",
        markersize=2,
        markerfacecolor=(*colours_rgb["orange"], 0.6),
        markeredgewidth=1,
        markeredgecolor=(*colours_rgb["orange"], 0.1),
    )
    # # MLLK
    # ax.plot(
    #     x_plot,
    #     mllk,
    #     "o",
    #     label=r"MLLK(x,y)",
    #     markersize=2,
    #     markerfacecolor=(*colours_rgb["red"], 0.6),
    #     markeredgewidth=1,
    #     markeredgecolor=(*colours_rgb["red"], 0.1),
    # )
    # # KL penalised by density llk
    # ax.plot(
    #     x_plot,
    #     kl_llk,
    #     "o",
    #     label=r"KL(x)-LLK(x)",
    #     markersize=2,
    #     markerfacecolor=(*colours_rgb["brightGreen"], 0.6),
    #     markeredgewidth=1,
    #     markeredgecolor=(*colours_rgb["brightGreen"], 0.1),
    # )
    # # Density likelihood
    # ax.plot(
    #     x_plot,
    #     llk,
    #     "o",
    #     label=r"LLK(x)",
    #     markersize=2,
    #     markerfacecolor=(*colours_rgb["purple"], 0.6),
    #     markeredgewidth=1,
    #     markeredgecolor=(*colours_rgb["purple"], 0.1),
    # )

    # Gamma parameters
    # ax.plot(
    #     x_plot,
    #     α_x,
    #     "o",
    #     label=r"$\alpha$(x)",
    #     markersize=2,
    #     markerfacecolor=(*colours_rgb["pink"], 0.6),
    #     markeredgewidth=1,
    #     markeredgecolor=(*colours_rgb["pink"], 0.1),
    # )
    # ax.plot(
    #     x_plot,
    #     β_x,
    #     "o",
    #     label=r"$\beta$(x)",
    #     markersize=2,
    #     markerfacecolor=(*colours_rgb["purple"], 0.6),
    #     markeredgewidth=1,
    #     markeredgecolor=(*colours_rgb["purple"], 0.1),
    # )

    # Gamma split aleatoric epistemic
    # ax.plot(
    #     x_plot,
    #     β_x / α_x,
    #     "o",
    #     label=r"$\sigma_{aleatoric}(x) = \frac{\beta(x)}{\alpha(x)}$",
    #     markersize=2,
    #     markerfacecolor=(*colours_rgb["pink"], 0.6),
    #     markeredgewidth=1,
    #     markeredgecolor=(*colours_rgb["pink"], 0.1),
    # )
    # ax.plot(
    #     x_plot,
    #     α_x / (α_x - 1),
    #     "o",
    #     label=r"$\sigma_{epistemic}(x) = \frac{\alpha(x)}{\alpha(x) - 1}$",
    #     markersize=2,
    #     markerfacecolor=(*colours_rgb["purple"], 0.6),
    #     markeredgewidth=1,
    #     markeredgecolor=(*colours_rgb["purple"], 0.1),
    # )
    # ax.plot(
    #     x_plot,
    #     α_x / β_x,
    #     "o",
    #     label=r"$\frac{\alpha(x)}{\beta(x)}$",
    #     markersize=2,
    #     markerfacecolor=(*colours_rgb["primaryRed"], 0.6),
    #     markeredgewidth=1,
    #     markeredgecolor=(*colours_rgb["primaryRed"], 0.1),
    # )

    # Misc
    ax.grid(True)
    ax.set_xlim(plot_x_range)
    ax.set_ylim([-top_kl_plot, top_kl_plot])
    ax.set_xlabel("x")
    ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0.0)

    return ax
Example #4
def log_likelihood_student(x, mu, sigma_square, df=2.0):
    sigma = sqrt(sigma_square)
    dist = distributions.StudentT(df=df, loc=mu, scale=sigma)
    return torch.sum(dist.log_prob(x), dim=1)
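A hypothetical usage sketch for log_likelihood_student above, assuming sqrt is torch.sqrt and distributions is torch.distributions in that module:

import torch

x = torch.randn(8, 3)              # batch of 8 observations with 3 features
mu = torch.zeros(8, 3)
sigma_square = torch.ones(8, 3)
ll = log_likelihood_student(x, mu, sigma_square, df=4.0)
print(ll.shape)                    # torch.Size([8]); log-densities summed over dim=1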
Example #5
 def Y(self) -> dist.Distribution:
     mu = torch.mv(self.X(), self.beta()) + self.alpha()
     return dist.StudentT(self.nu(), mu, self.sigma())
Example #6
def get_ns_model_source(prob_label):
    """
    Given the problem key prob_label, return (ns, p, cs), a tuple of
    - ns: a list of sample sizes n's
    - p: a kcgof.cdensity.UnnormalizedCondDensity representing the model p
    - rx: a callable object that takes n (sample size) and return a torch
        tensor of size n x d where d is the appropriate dimension. Represent the
        marginal distribuiton of x
    - cs: a kcgof.cdata.CondSource. The CondSource generates sample from the
        distribution r.

    * (p, cs) together specifies a conditional goodness-of-fit testing problem.
    """
    slope_h0_d5 = torch.arange(5) + 1.0
    # slope_h0_d20 = torch.arange(20) + 1.0
    prob2tuples = {
        # A case where H0 is true. Gaussian least squares model.
        'gaussls_h0_d5': (
            [200, 300, 400, 500],
            # p
            cden.CDGaussianOLS(slope=slope_h0_d5, c=0, variance=1.0),
            # rx
            cden.RXIsotropicGaussian(dx=5),
            # CondSource for r
            cdat.CSGaussianOLS(slope=slope_h0_d5, c=0, variance=1.0),
        ),
        # simplest case where H0 is true.
        'gaussls_h0_d1': (
            [200, 300, 500],
            # p
            cden.CDGaussianOLS(slope=torch.tensor(1.0), c=1.0, variance=1.0),
            # rx
            cden.RXIsotropicGaussian(dx=1),
            # CondSource for r
            cdat.CSGaussianOLS(slope=torch.tensor(1.0), c=1.0, variance=1.0),
        ),

        # an obvious case for Gauss LS problem. H1 true. Very easy
        'gaussls_h1_d1_easy': (
            [100, 200, 300],
            # p
            cden.CDGaussianOLS(slope=torch.tensor(1.0), c=1.0, variance=1.0),
            # rx
            cden.RXIsotropicGaussian(dx=1),
            # CondSource for r
            cdat.CSGaussianOLS(slope=torch.tensor(2.0), c=-1.0, variance=1.0),
        ),
        # H1 case
        # r(y|x) = same model with a slightly different m (slope).
        # p(y|x) = Gaussian pdf[y - mx - q*x^2 - c]. Least squares with Gaussian noise.
        # r(x) = Gaussian N(0,1)?
        'quad_quad_d1': (
            [100, 300, 500],
            # p
            cden.CDAdditiveNoiseRegression(f=lambda X: 1.8 * X + X**2 + 1.0,
                                           noise=dists.Normal(0, 1),
                                           dx=1),
            # rx (prior on x)
            cden.RXIsotropicGaussian(dx=1),
            # CondSource for r
            cdat.CSAdditiveNoiseRegression(f=lambda X: 2.0 * X + X**2 + 1.0,
                                           noise=dists.Normal(0, 1),
                                           dx=1)),

        # H1 case. dx=dy=1. T(5) noise. Gaussian ordinary LS.
        # Or r(y|x) = t(5) noise + mx + c, m = c =1
        # p(y|x) =  Gaussian pdf[y - (mx + c), same m and c
        # r(x) can be any, N(0,1)?
        'gauss_t_d1': (
            [100, 300, 500],
            # p
            cden.CDGaussianOLS(slope=torch.ones(1),
                               c=torch.ones(1),
                               variance=1.0),
            # rx
            cden.RXIsotropicGaussian(dx=1),
            # CondSource for r
            cdat.CSAdditiveNoiseRegression(f=lambda X: 1.0 + X,
                                           noise=dists.StudentT(df=5),
                                           dx=1)),

        # H1 case (same as Zheng’s):
        # r(y|x) = Gaussian pdf[y - (mx + q*x^2 + c)], m = 1. c =1. q should be low
        # p(y|x) =  Gaussian pdf[y - (mx + c), m=1.  and c=1
        # r(x) = U[-3,3] (linearity breaks down from approximately |X| > 2)
        'quad_vs_lin_d1': (
            [100, 400, 700, 1000],
            # p(y|x)
            cden.CDGaussianOLS(slope=torch.tensor([1.0]),
                               c=torch.tensor([1.0]),
                               variance=1.0),
            # rx
            lambda n: dists.Uniform(low=-2.0, high=2.0).sample((n, 1)),
            # CondSource for r(y|x)
            cdat.CSAdditiveNoiseRegression(
                f=lambda X: 1.0 * X + 0.1 * X**2 + 1.0,
                noise=dists.Normal(0, 1.0),
                dx=1)),
    }  # end of prob2tuples

    # add more problems to prob2tuples
    prob2tuples['g_het_dx3'] = create_prob_g_het(dx=3)
    prob2tuples['g_het_dx4'] = create_prob_g_het(dx=4)
    prob2tuples['g_het_dx5'] = create_prob_g_het(dx=5)
    prob2tuples['g_het_dx10'] = create_prob_g_het(dx=10)

    if prob_label not in prob2tuples:
        raise ValueError('Unknown problem label. Need to be one of %s' %
                         str(list(prob2tuples.keys())))
    return prob2tuples[prob_label]
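A hypothetical usage sketch, relying only on what the docstring above guarantees (rx is callable with a sample size; cs is a kcgof.cdata.CondSource whose exact API is not shown here):

ns, p, rx, cs = get_ns_model_source('gaussls_h0_d1')
n = ns[0]      # first of the listed sample sizes
X = rx(n)      # torch tensor of shape (n, d) from the marginal of x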
Example #7
    def test_step(self, batch, batch_idx):
        x, y = batch
        μ_x, α_x, β_x = self(x)
        log_likelihood = self.ellk(μ_x, α_x, β_x, y)
        kl_divergence = self.kl(α_x, β_x, self.prior_α, self.prior_β)

        if self.mse_mode:
            loss = F.mse_loss(μ_x, y)
        else:
            loss = -self.elbo(log_likelihood, kl_divergence, train=False)

        y_pred = self.predictive_mean(x)

        m_p = D.StudentT(2 * α_x, loc=μ_x, scale=torch.sqrt(β_x / α_x))

        # ---------
        # Metrics
        self.log(TEST_LOSS, loss, on_epoch=True)
        self.log(TEST_ELBO, -loss, on_epoch=True)
        self.log(TEST_MLLK, torch.sum(m_p.log_prob(y)),
                 on_epoch=True)  # i.i.d assumption

        # Mean fit
        self.log(TEST_MEAN_FIT_MAE, F.l1_loss(y_pred, y), on_epoch=True)
        self.log(TEST_MEAN_FIT_RMSE,
                 torch.sqrt(F.mse_loss(y_pred, y)),
                 on_epoch=True)

        # Variance fit
        pred_var = self.predictive_std(x)**2
        empirical_var = (y_pred - y)**2
        self.log(TEST_VARIANCE_FIT_MAE,
                 F.l1_loss(pred_var, empirical_var),
                 on_epoch=True)
        self.log(
            TEST_VARIANCE_FIT_RMSE,
            torch.sqrt(F.mse_loss(pred_var, empirical_var)),
            on_epoch=True,
        )

        # Sample fit
        ancestral = False
        if ancestral:
            lbds = D.Gamma(α_x, β_x).sample((1, ))
            samples_y = (D.Normal(μ_x, 1 / torch.sqrt(lbds)).sample(
                (1, )).reshape(y.shape))
        else:
            samples_y = m_p.sample((1, )).reshape(y.shape)

        self.log(TEST_SAMPLE_FIT_MAE, F.l1_loss(samples_y, y), on_epoch=True)
        self.log(TEST_SAMPLE_FIT_RMSE,
                 torch.sqrt(F.mse_loss(samples_y, y)),
                 on_epoch=True)

        # Model expected log likelihood
        self.log(TEST_ELLK, torch.mean(log_likelihood), on_epoch=True)

        # Model KL
        self.log(TEST_KL, torch.mean(kl_divergence), on_epoch=True)

        # Noise
        x_noisy = generate_noise_for_model_test(x)
        μ_x, α_x, β_x = self(x_noisy)
        sigma = self.predictive_std(x_noisy)
        kl_divergence = self.kl(α_x, β_x, self.prior_α, self.prior_β)

        # Noise likelihood
        self.log(NOISE_UNCERTAINTY, torch.mean(sigma), on_epoch=True)

        # Noise KL
        self.log(NOISE_KL, torch.mean(kl_divergence), on_epoch=True)
        return loss
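For context (an addition, not part of the example): the Student-t predictive m_p built above is the usual Gamma-Normal compound. With a Gamma(α(x), β(x)) model on the noise precision λ and y | λ ~ N(μ(x), 1/λ), marginalising λ gives

\[
p(y \mid x) = \int \mathcal{N}\!\big(y \mid \mu(x), \lambda^{-1}\big)\,
\mathrm{Gamma}\!\big(\lambda \mid \alpha(x), \beta(x)\big)\, d\lambda
= \mathrm{St}\!\big(y \mid 2\alpha(x),\ \mu(x),\ \sqrt{\beta(x)/\alpha(x)}\big),
\]

which is why the ancestral-sampling branch (Gamma then Normal) and sampling from m_p directly target the same distribution.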
Example #8
 def beta_baseline(self) -> dist.Distribution:
     return dist.StudentT(self.dof_baseline, 0.0, self.scale_baseline)
Example #9
n_samples = 500

cols = 2
rows = 2
fig = plt.figure(figsize=(7 + cols, 2 + rows), facecolor='white', dpi=150)

x = np.linspace(-8, 8, 200)
x = torch.from_numpy(x).float()
# print (m.log_prob(x))
m = d.Cauchy(torch.tensor([0.0]), torch.tensor([1.]))
probs = torch.exp(m.log_prob(x))

m = d.Normal(torch.tensor([0.0]), torch.tensor([1.]))
probs2 = torch.exp(m.log_prob(x))

m = d.StudentT(torch.tensor([2.0]))
probs3 = torch.exp(m.log_prob(x))

ax = plt.subplot2grid((rows, cols), (0, 0), frameon=False)

ax.plot(numpy(x), numpy(probs), label='Cauchy')
ax.plot(numpy(x), numpy(probs2), label='Normal')
ax.plot(numpy(x), numpy(probs3), label='StudentT')
ax.legend()

m = d.Cauchy(torch.tensor([0.0]), torch.tensor([1.]))
samps = m.sample([n_samples])

m = d.Normal(torch.tensor([0.0]), torch.tensor([1.]))
samps2 = m.sample([n_samples])
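A hypothetical continuation of the truncated snippet above: drawing Student-t samples and adding a histogram panel (numpy() is assumed to be the same tensor-to-array helper used earlier):

m = d.StudentT(torch.tensor([2.0]))
samps3 = m.sample([n_samples])

ax = plt.subplot2grid((rows, cols), (0, 1), frameon=False)
ax.hist(numpy(samps).flatten(), bins=50, range=(-8, 8), alpha=0.5, label='Cauchy')
ax.hist(numpy(samps2).flatten(), bins=50, range=(-8, 8), alpha=0.5, label='Normal')
ax.hist(numpy(samps3).flatten(), bins=50, range=(-8, 8), alpha=0.5, label='StudentT')
ax.legend()
plt.show()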
Example #10
 def variance(self):
     return dists.StudentT(self.df, self.loc, self.scale).variance
Example #11
 def expectation(self):
     return dists.StudentT(self.df, self.loc, self.scale).mean
Example #12
 def entropy(self):
     return dists.StudentT(self.df, self.loc, self.scale).entropy()
Example #13
 def sample(self, batch_size):
     model = dists.StudentT(self.df, self.loc, self.scale)
     return model.rsample((batch_size, ))
Example #14
 def log_prob(self, value):
     model = dists.StudentT(self.df, self.loc, self.scale)
     return model.log_prob(value).sum(-1)
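Examples #10 through #14 all assume an object carrying df, loc and scale attributes; a minimal self-contained sketch (the class name and constructor are hypothetical) tying those methods together:

import torch
import torch.distributions as dists

class StudentTWrapper:
    # Hypothetical container exposing the df/loc/scale attributes the snippets assume.
    def __init__(self, df, loc, scale):
        self.df, self.loc, self.scale = df, loc, scale

    def sample(self, batch_size):
        return dists.StudentT(self.df, self.loc, self.scale).rsample((batch_size, ))

    def log_prob(self, value):
        return dists.StudentT(self.df, self.loc, self.scale).log_prob(value).sum(-1)

wrapper = StudentTWrapper(df=torch.tensor(4.0), loc=torch.tensor(0.0), scale=torch.tensor(1.0))
samples = wrapper.sample(batch_size=16)   # shape: (16,)
print(wrapper.log_prob(samples))          # scalar tensor: summed log density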