Example #1
 def test(self):
     kern1 = RBF(1)
     kern2 = RBF(2)
     lik = Gaussian()
     X = np.zeros((1, 1))
     model = DGP(X, X, X, [kern1, kern2], lik)
     model.compute_log_likelihood()
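The snippets in this section are shown without their import blocks. A minimal preamble they appear to assume (GPflow 1.x plus the doubly_stochastic_dgp package; exact module paths may vary between versions):

import numpy as np
import gpflow
from scipy.cluster.vq import kmeans2
from gpflow.kernels import RBF, White, Matern52
from gpflow.likelihoods import Gaussian
from gpflow.train import AdamOptimizer, ScipyOptimizer, NatGradOptimizer
from doubly_stochastic_dgp.dgp import DGP
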
def make_dgp(X, Y, Z, L):
    D = X.shape[1]

    # the layer shapes are defined by the kernel input dims; here they are hard-coded to 5, 2 and 9 (the L argument is unused)
    kernels = []
    #for l in range(L):
    kernels.append(RBF(5))
    kernels.append(RBF(2))
    kernels.append(RBF(9))

    # between layer noise (doesn't actually make much difference but we include it anyway)
    #for kernel in kernels[:-1]:
    #    kernel += White(D, variance=1e-5)

    mb = 1000 if X.shape[0] > 1000 else None
    model = DGP(X,
                Y,
                Z,
                kernels,
                Gaussian(),
                num_samples=10,
                minibatch_size=mb)

    # start the inner layers almost deterministically
    for layer in model.layers[:-1]:
        layer.q_sqrt = layer.q_sqrt.value * 1e-5

    return model
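
A minimal usage sketch for make_dgp, assuming the preamble above; the data and the inducing-point count are purely illustrative, and the L argument is ignored by this variant because the kernel list is hard-coded.

X = np.random.randn(200, 5)                       # toy inputs, dim 5 to match RBF(5)
Y = np.sin(X[:, :1]) + 0.1 * np.random.randn(200, 1)
Z = kmeans2(X, 50, minit='points')[0]             # 50 inducing points

model = make_dgp(X, Y, Z, L=3)
AdamOptimizer(0.01).minimize(model, maxiter=500)  # short run, illustrative only
mean, var = model.predict_y(X[:10], 10)           # 10 posterior samples per point
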
    def compare_to_single_layer(self, Y, Ys, lik, L, num_outputs=None):
        kern = Matern52(self.X.shape[1], lengthscales=0.1)

        m_svgp = SVGP(self.X, Y, kern, lik, Z=self.X, num_latent=num_outputs)
        m_svgp.q_mu = self.q_mu
        m_svgp.q_sqrt = self.q_sqrt

        L_svgp = m_svgp.compute_log_likelihood()
        mean_svgp, var_svgp = m_svgp.predict_y(self.Xs)
        test_lik_svgp = m_svgp.predict_density(self.Xs, Ys)
        pred_m_svgp, pred_v_svgp = m_svgp.predict_f(self.Xs)
        pred_mfull_svgp, pred_vfull_svgp = m_svgp.predict_f_full_cov(self.Xs)

        kerns = []
        for _ in range(L - 1):
            kerns.append(
                Matern52(self.X.shape[1], lengthscales=0.1, variance=2e-6))
        kerns.append(Matern52(self.X.shape[1], lengthscales=0.1))

        m_dgp = DGP(self.X,
                    Y,
                    self.X,
                    kerns,
                    lik,
                    num_samples=2,
                    num_outputs=num_outputs)
        m_dgp.layers[-1].q_mu = self.q_mu
        m_dgp.layers[-1].q_sqrt = self.q_sqrt

        L_dgp = m_dgp.compute_log_likelihood()
        mean_dgp, var_dgp = m_dgp.predict_y(self.Xs, 1)
        test_lik_dgp = m_dgp.predict_density(self.Xs, Ys, 1)

        pred_m_dgp, pred_v_dgp = m_dgp.predict_f(self.Xs, 1)
        pred_mfull_dgp, pred_vfull_dgp = m_dgp.predict_f_full_cov(self.Xs, 1)

        if L == 1:  # these should all be exactly the same
            atol = 1e-7
            rtol = 1e-7
        else:  # jitter makes these not exactly equal
            atol = 1e-1
            rtol = 1e-2

        assert_allclose(L_svgp, L_dgp, rtol=rtol, atol=atol)

        assert_allclose(mean_svgp, mean_dgp[0], rtol=rtol, atol=atol)
        assert_allclose(var_svgp, var_dgp[0], rtol=rtol, atol=atol)
        assert_allclose(test_lik_svgp, test_lik_dgp, rtol=rtol, atol=atol)

        assert_allclose(pred_m_dgp[0], pred_m_svgp, rtol=rtol, atol=atol)
        assert_allclose(pred_v_dgp[0], pred_v_svgp, rtol=rtol, atol=atol)
        assert_allclose(pred_mfull_dgp[0],
                        pred_mfull_svgp,
                        rtol=rtol,
                        atol=atol)
        assert_allclose(pred_vfull_dgp[0],
                        pred_vfull_svgp,
                        rtol=rtol,
                        atol=atol)
def make_DGP(L, D_problem, D_hidden, X, Y, Z):
    kernels = []
    # First layer
    kernels.append(RBF(D_problem, lengthscales=0.2, variance=1.) + White(D_problem, variance=1e-5))
    for l in range(L-1):
        k = RBF(D_hidden, lengthscales=0.2, variance=1.) + White(D_hidden, variance=1e-5)
        kernels.append(k)

    m_dgp = DGP(X, Y, Z, kernels, Gaussian(), num_samples=10)

    # init the inner layers to be near deterministic
    for layer in m_dgp.layers[:-1]:
        layer.q_sqrt = layer.q_sqrt.value * 1e-5
    return m_dgp
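
As above, a hypothetical call: a 3-layer DGP on 4-dimensional inputs with 2-dimensional hidden layers, trained briefly with Adam (shapes and iteration counts are illustrative only).

X = np.random.randn(500, 4)
Y = np.random.randn(500, 1)
Z = kmeans2(X, 100, minit='points')[0]

m = make_DGP(L=3, D_problem=4, D_hidden=2, X=X, Y=Y, Z=Z)
AdamOptimizer(0.01).minimize(m, maxiter=2000)
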
Example #5
        def make_dgp_as_sgp(kernels):
            m_dgp = DGP(X, Y, Z, kernels, Gaussian())
            
            # set the final layer to match the single-layer SGP
            m_dgp.layers[-1].kern.lengthscales = ls
            m_dgp.layers[-1].kern.variance = s
            m_dgp.likelihood.variance = noise
            m_dgp.layers[-1].q_mu = q_mu
            m_dgp.layers[-1].q_sqrt = q_sqrt
            
            # set other layers to identity 
            for layer in m_dgp.layers[:-1]:
                # 1e-6 gives errors of 1e-3, so the variance needs to be set right down
                layer.kern.variance.transform._lower = 1e-18
                layer.kern.variance = 1e-18
                
            return m_dgp
    def _fit(self, X, Y, Lik, **kwargs):
        if X.shape[0] > num_inducing:
            Z = kmeans2(X, num_inducing, minit='points')[0]
        else:
            # pad with random values
            Z = np.concatenate([X, np.random.randn(num_inducing - X.shape[0], X.shape[1])], 0)

        if not self.model:
            kerns = []
            for _ in range(2):
                kerns.append(gpflow.kernels.RBF(X.shape[1], lengthscales=float(X.shape[1])**0.5))

            mb_size = minibatch_size if X.shape[0] > 5000 else None

            self.model = DGP(X, Y, Z, kerns, Lik(),
                             minibatch_size=mb_size,
                             **kwargs)

            self.model.layers[0].q_sqrt = self.model.layers[0].q_sqrt.read_value() * 1e-5

            if isinstance(self.model.likelihood, gpflow.likelihoods.Gaussian):
                var_list = [[self.model.layers[-1].q_mu, self.model.layers[-1].q_sqrt]]
                self.model.layers[-1].q_mu.set_trainable(False)
                self.model.layers[-1].q_sqrt.set_trainable(False)
                self.ng = gpflow.train.NatGradOptimizer(gamma=gamma).make_optimize_tensor(self.model, var_list=var_list)
            else:
                self.ng = None

            self.adam = gpflow.train.AdamOptimizer(adam_lr).make_optimize_tensor(self.model)

            iters = iterations
            self.sess = self.model.enquire_session()
        else:
            iters = small_iterations  # after first time use fewer iterations

        # we might have new data
        self.model.X.assign(X, session=self.sess)
        self.model.Y.assign(Y, session=self.sess)

        self.model.layers[0].feature.Z.assign(Z, session=self.sess)
        self.model.layers[0].q_mu.assign(np.zeros((num_inducing, X.shape[1])), session=self.sess)
        self.model.layers[0].q_sqrt.assign(1e-5*np.tile(np.eye(num_inducing)[None], [X.shape[1], 1, 1]), session=self.sess)

        self.model.layers[1].feature.Z.assign(Z, session=self.sess)
        num_outputs = self.model.layers[1].q_sqrt.shape[0]
        self.model.layers[1].q_mu.assign(np.zeros((num_inducing, num_outputs)), session=self.sess)
        self.model.layers[1].q_sqrt.assign(np.tile(np.eye(num_inducing)[None], [num_outputs, 1, 1]), session=self.sess)

        try:
            for _ in range(iters):

                if _ % 100 == 0:
                    print('{} {}'.format(_, self.sess.run(self.model.likelihood_tensor)))
                if self.ng:
                    self.sess.run(self.ng)
                self.sess.run(self.adam)

        except KeyboardInterrupt:
            pass

        self.model.anchor(session=self.sess)
class RegressionModel(object):
    def __init__(self):
        self.model = None

    def fit(self, X, Y):
        class Lik(gpflow.likelihoods.Gaussian):
            def __init__(self):
                gpflow.likelihoods.Gaussian.__init__(self)
                self.variance = initial_likelihood_var
        return self._fit(X, Y, Lik)

    def _fit(self, X, Y, Lik, **kwargs):
        if X.shape[0] > num_inducing:
            Z = kmeans2(X, num_inducing, minit='points')[0]
        else:
            # pad with random values
            Z = np.concatenate([X, np.random.randn(num_inducing - X.shape[0], X.shape[1])], 0)

        if not self.model:
            kerns = []
            for _ in range(2):
                kerns.append(gpflow.kernels.RBF(X.shape[1], lengthscales=float(X.shape[1])**0.5))

            mb_size = minibatch_size if X.shape[0] > 5000 else None

            self.model = DGP(X, Y, Z, kerns, Lik(),
                             minibatch_size=mb_size,
                             **kwargs)

            self.model.layers[0].q_sqrt = self.model.layers[0].q_sqrt.read_value() * 1e-5

            if isinstance(self.model.likelihood, gpflow.likelihoods.Gaussian):
                var_list = [[self.model.layers[-1].q_mu, self.model.layers[-1].q_sqrt]]
                self.model.layers[-1].q_mu.set_trainable(False)
                self.model.layers[-1].q_sqrt.set_trainable(False)
                self.ng = gpflow.train.NatGradOptimizer(gamma=gamma).make_optimize_tensor(self.model, var_list=var_list)
            else:
                self.ng = None

            self.adam = gpflow.train.AdamOptimizer(adam_lr).make_optimize_tensor(self.model)

            iters = iterations
            self.sess = self.model.enquire_session()
        else:
            iters = small_iterations  # after first time use fewer iterations

        # we might have new data
        self.model.X.assign(X, session=self.sess)
        self.model.Y.assign(Y, session=self.sess)

        self.model.layers[0].feature.Z.assign(Z, session=self.sess)
        self.model.layers[0].q_mu.assign(np.zeros((num_inducing, X.shape[1])), session=self.sess)
        self.model.layers[0].q_sqrt.assign(1e-5*np.tile(np.eye(num_inducing)[None], [X.shape[1], 1, 1]), session=self.sess)

        self.model.layers[1].feature.Z.assign(Z, session=self.sess)
        num_outputs = self.model.layers[1].q_sqrt.shape[0]
        self.model.layers[1].q_mu.assign(np.zeros((num_inducing, num_outputs)), session=self.sess)
        self.model.layers[1].q_sqrt.assign(np.tile(np.eye(num_inducing)[None], [num_outputs, 1, 1]), session=self.sess)

        try:
            for _ in range(iters):

                if _ % 100 == 0:
                    print('{} {}'.format(_, self.sess.run(self.model.likelihood_tensor)))
                if self.ng:
                    self.sess.run(self.ng)
                self.sess.run(self.adam)

        except KeyboardInterrupt:
            pass

        self.model.anchor(session=self.sess)

    def _predict(self, Xs, S):
        ms, vs = [], []
        n = max(int(len(Xs) / 100), 1)  # predict in small batches
        for xs in np.array_split(Xs, n):
            m, v = self.model.predict_y(xs, S, session=self.sess)
            ms.append(m)
            vs.append(v)

        return np.concatenate(ms, 1), np.concatenate(vs, 1)  # num_posterior_samples, N_test, D_y

    def predict(self, Xs):
        ms, vs = self._predict(Xs, num_posterior_samples)

        # the first two moments
        m = np.average(ms, 0)
        v = np.average(vs + ms**2, 0) - m**2
        return m, v

    def sample(self, Xs, S):
        ms, vs = self._predict(Xs, S)
        return ms + vs**0.5 * np.random.randn(*ms.shape)
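
RegressionModel relies on module-level constants that this snippet does not show (num_inducing, minibatch_size, iterations, small_iterations, adam_lr, gamma, initial_likelihood_var, num_posterior_samples). A hedged usage sketch with purely illustrative values for them:

# Illustrative values for the unshown module-level settings.
num_inducing = 100
minibatch_size = 1000
iterations = 1000
small_iterations = 300
adam_lr = 0.01
gamma = 0.1
initial_likelihood_var = 0.01
num_posterior_samples = 25

X = np.random.randn(300, 6)
Y = np.random.randn(300, 1)

reg = RegressionModel()
reg.fit(X, Y)
mean, var = reg.predict(X[:20])     # predictive moments averaged over posterior samples
samples = reg.sample(X[:20], S=10)  # one draw per posterior sample of the predictive
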
Example #8
    def _build_dgp_model(self,
                         depth,
                         sess,
                         weight,
                         X,
                         Y,
                         ls_scale,
                         y_scale,
                         minibatch_size=500,
                         Z=None,
                         M=100,
                         feature_trainable=False,
                         ls_init=(None, None, None),
                         ls_trainable=(True, True, True),
                         likelihood_var_trainable=True,
                         verbose=False):
        """
        Build svgp model
        """
        N, num_latent = Y.shape
        Z = kmeans2(X, M, minit='points')[0]  # note: this overrides any Z passed in
        with gp.defer_build():
            k_time = gp.kernels.RBF(
                1,
                active_dims=[0],
                lengthscales=[
                    0.3 if ls_init[0] is None else ls_init[0] / ls_scale[0]
                ])
            k_space = gp.kernels.RBF(
                2,
                active_dims=[1, 2],
                lengthscales=[
                    0.3 if ls_init[1] is None else ls_init[1] / ls_scale[1]
                ])
            k_freq = gp.kernels.RBF(
                1,
                active_dims=[3],
                lengthscales=[
                    10. if ls_init[2] is None else ls_init[2] / ls_scale[2]
                ])
            for k, f in zip([k_time, k_space, k_freq], ls_trainable):
                k.lengthscales.set_trainable(f)
                if not f:
                    logging.warning("Setting {} non-trainable".format(k))
            k_time.lengthscales.prior = gp.priors.Gaussian(0, 1. / 3.)
            k_space.lengthscales.prior = gp.priors.Gaussian(
                1. / ls_scale[1], 0.5 / ls_scale[1])
            kern = k_time * k_space * k_freq

            mean = gp.mean_functions.Zero()
            kernels = [kern]
            for l in range(1, depth):
                kernels.append(
                    RBF(4 - l, lengthscales=2., variance=2., ARD=True))
                #kernels[-1].lengthscales.prior = gp.priors.Gaussian(0,1./3.)
            m = DGP(X,
                    Y,
                    Z,
                    kernels,
                    gp.likelihoods.Gaussian(),
                    minibatch_size=minibatch_size,
                    num_outputs=num_latent,
                    num_samples=1)

            # start things deterministic
            for layer in m.layers[:-1]:
                layer.q_sqrt = layer.q_sqrt.value * 1e-5
            for layer in m.layers:
                layer.feature.Z.set_trainable(feature_trainable)
            m.compile()
        if verbose:
            logging.warning(m)
        return m
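
A standalone sketch of how the active_dims arguments above split a 4-column input (time, x, y, frequency) across the three RBF factors of the product kernel; GPflow 1.x is assumed and the values are illustrative.

import gpflow as gp

k_time = gp.kernels.RBF(1, active_dims=[0], lengthscales=[0.3])
k_space = gp.kernels.RBF(2, active_dims=[1, 2], lengthscales=[0.3])
k_freq = gp.kernels.RBF(1, active_dims=[3], lengthscales=[10.0])
kern = k_time * k_space * k_freq        # product kernel over [time, x, y, freq]

X_demo = np.random.rand(5, 4)
K = kern.compute_K_symm(X_demo)         # 5 x 5 Gram matrix
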
Example #9
def get_test_error(i,
                   dataset,
                   alpha,
                   learning_rate=0.001,
                   iterations=20000,
                   white=True,
                   normalized=True,
                   num_inducing=100,
                   beta=None,
                   gamma=None,
                   div_weights=None):
    """STEP (1) Read in the data via the helpful 'Dataset' object"""
    data = datasets.all_datasets[dataset].get_data(seed=0, split=i, prop=0.9)
    X_train, Y_train, X_test, Y_test, Y_std = [
        data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']
    ]
    print('N: {}, D: {}, Ns: {}, Y_std: {}'.format(X_train.shape[0],
                                                   X_train.shape[1],
                                                   X_test.shape[0], Y_std))

    Z = kmeans2(X_train, num_inducing, minit='points')[0]

    #Dimensionality of X
    D = X_train.shape[1]

    # the layer shapes are defined by the kernel dims, so here all
    # hidden layers are D dimensional
    kernels = []
    for l in range(L):
        kernels.append(RBF(D))

    # between layer noise (doesn't actually make much difference but we include it anyway)
    for kernel in kernels[:-1]:
        kernel += White(D, variance=1e-5)

    mb = 1000 if X_train.shape[0] > 1000 else None

    # get the likelihood model (possibly a robust one)
    if gamma is None and beta is None:
        #standard likelihood
        lklh = Gaussian()
    elif beta is not None and gamma is None:
        #beta-divergence robustified likelihood
        lklh = betaDivGaussian(beta)
    elif gamma is not None and beta is None:
        # gamma-divergence robustified likelihood
        lklh = gammaDivGaussian(gamma)
    else:
        print(
            "ERROR! You have specified both beta and gamma. Either specify " +
            "both as None (for standard Gaussian likelihood) or one of them " +
            "as None (to use the other)")
        sys.exit()
    """STEP (2): Call 'DGP' for split i, which together with ADAM is 
                 responsible for the inference"""
    model = DGP(
        X_train,
        Y_train,
        Z,
        kernels,
        lklh,  # likelihood chosen above (standard or divergence-robustified)
        num_samples=K,
        minibatch_size=mb,
        alpha=alpha,
        white=white,
        div_weights=div_weights)

    # start the inner layers almost deterministically
    for layer in model.layers[:-1]:
        layer.q_sqrt = layer.q_sqrt.value * 1e-5

    # Build functions for evaluating test errors
    S = 100

    def batch_assess(model, assess_model, X, Y):
        n_batches = max(int(X.shape[0] / 1000.), 1)
        lik, sq_diff = [], []
        for X_batch, Y_batch in zip(np.array_split(X, n_batches),
                                    np.array_split(Y, n_batches)):
            l, sq = assess_model(model, X_batch, Y_batch)
            lik.append(l)
            sq_diff.append(sq)
        lik = np.concatenate(lik, 0)
        sq_diff = np.array(np.concatenate(sq_diff, 0), dtype=float)
        return np.average(lik), np.average(sq_diff)**0.5

    def assess_single_layer(model, X_batch, Y_batch):
        m, v = model.predict_y(X_batch)
        lik = np.sum(
            norm.logpdf(Y_batch * Y_std, loc=m * Y_std, scale=Y_std * v**0.5),
            1)
        sq_diff = Y_std**2 * ((m - Y_batch)**2)
        return lik, sq_diff

    def assess_sampled(model, X_batch, Y_batch):
        m, v = model.predict_y(X_batch, S)
        S_lik = np.sum(
            norm.logpdf(Y_batch * Y_std, loc=m * Y_std, scale=Y_std * v**0.5),
            2)
        lik = logsumexp(S_lik, 0, b=1 / float(S))

        mean = np.average(m, 0)
        sq_diff = Y_std**2 * ((mean - Y_batch)**2)
        return lik, sq_diff

    #Get start time
    start_time = time.time()

    #Fit to training set via ADAM
    np.random.seed(1)
    AdamOptimizer(learning_rate).minimize(model, maxiter=iterations)

    #get running time
    running_time = time.time() - start_time
    s = 'time: {:.4f},  lik: {:.4f}, rmse: {:.4f}'
    """STEP (3): Extract and return test performancee metrics to 'main'."""
    #Get test errors
    lik, rmse = batch_assess(model, assess_sampled, X_test, Y_test)
    print(s.format(running_time, lik, rmse))

    return -lik, rmse, running_time
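
For reference, the test log-likelihood in assess_sampled is a Monte Carlo mixture estimate, log p(y) ≈ log((1/S) Σ_s N(y | m_s, v_s)), evaluated stably with logsumexp. A standalone numpy sketch with made-up shapes (ignoring the Y_std rescaling):

from scipy.special import logsumexp
from scipy.stats import norm

S, N, D = 100, 7, 1                     # samples, test points, output dim (illustrative)
m = np.random.randn(S, N, D)            # per-sample predictive means
v = 0.1 * np.ones((S, N, D))            # per-sample predictive variances
y = np.random.randn(N, D)

per_sample = np.sum(norm.logpdf(y, loc=m, scale=v ** 0.5), 2)  # (S, N)
log_density = logsumexp(per_sample, 0, b=1.0 / S)              # (N,) per-point log p(y)
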
Example #10
#np.save('plots/svgp',m_sgp)

Yp_mat = np.reshape(m_sgp, (n, n))  # doing this for heat map
Yp_mat_tr = np.transpose(np.fliplr(Yp_mat))

diff_sgp = m_sgp - Y_truth  # difference between truth and predicted values
diff_mat = Yt_mat_tr - Yp_mat_tr

############################################################################

# Prediction using one single layer DGP

kernels = [RBF(d, lengthscales=0.2, variance=1.)]

m_dgp = DGP(
    X, Y, Z, kernels, Gaussian_lik(), minibatch_size=None
)  # Making a one single layer DGP model using DGP in doubly_stochastic_dgp package
m_dgp.likelihood.likelihood.variance = 1e-2

for layer in m_dgp.layers[:-1]:
    layer.q_sqrt = layer.q_sqrt.value * 1e-5  # initializing covariance matrix of variational distribution q(u) (in each layer)

iterations = 2000
start_time = time.time()
ScipyOptimizer().minimize(m_dgp, maxiter=iterations)
#AdamOptimizer(0.001).minimize(m_dgp, maxiter=iterations)  #Scipy is much better than Adam here for one single layer
t_op = time.time() - start_time
print("--- %s seconds ---" % (t_op))  # 21.741 seconds ---

m_dgp.read_trainables()
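
A hedged follow-up: evaluating the fitted single-layer model on hypothetical test inputs Xs (same input dimension d as above) by averaging S posterior samples into predictive moments, as in the earlier examples.

S = 100
Xs = np.random.rand(50, d)            # hypothetical test inputs
m_s, v_s = m_dgp.predict_y(Xs, S)     # each of shape (S, 50, 1)
mean = np.average(m_s, 0)
var = np.average(v_s + m_s ** 2, 0) - mean ** 2
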
Example #11
    dataset_name, L, split))
print('N: {}, D: {}, Ns: {}'.format(X.shape[0], X.shape[1], Xs.shape[0]))

Z = kmeans2(X, 100, minit='points')[0]

D = X.shape[1]

kernels = []
for l in range(L):
    kernels.append(RBF(D))

for kernel in kernels[:-1]:
    kernel += White(D, variance=2e-6)

mb = minibatch_size if X.shape[0] > minibatch_size else None
model = DGP(X, Y, Z, kernels, Gaussian(), num_samples=1, minibatch_size=mb)

# start the inner layers almost deterministically
for layer in model.layers[:-1]:
    layer.q_sqrt = layer.q_sqrt.value * 1e-5
model.likelihood.variance = 0.05

global_step = tf.Variable(0, trainable=False, name="global_step")
model.enquire_session().run(global_step.initializer)

s = "{}/{}_L{}_split{}".format(results_path, dataset_name, L, split)
fw = tf.summary.FileWriter(os.path.join(s.format(dataset_name, L)),
                           model.enquire_session().graph)

opt_method = gpflow_monitor.ManagedOptimisation(model, AdamOptimizer(0.01),
                                                global_step)
        def test_vs_DGP2(self):
            lik = Gaussian()
            lik_var = 0.1
            lik.variance = lik_var
            N, Ns, D_Y, D_X = self.X.shape[0], self.Xs.shape[0], self.D_Y, self.X.shape[1]

            q_mu = np.random.randn(N, D_X)

            Y = np.random.randn(N, D_Y)
            Ys = np.random.randn(Ns, D_Y)

            kern1 = Matern52(self.X.shape[1], lengthscales=0.5)
            kern2 = Matern52(self.X.shape[1], lengthscales=0.5)
            kerns = [kern1, kern2]
            # mf = Linear(A=np.random.randn(D_X, D_Y), b=np.random.randn(D_Y))

            mf = Zero()
            m_dgp = DGP(self.X,
                        Y,
                        self.X,
                        kerns,
                        lik,
                        mean_function=mf,
                        white=True)
            m_dgp.layers[0].q_mu = q_mu
            m_dgp.layers[0].q_sqrt = m_dgp.layers[0].q_sqrt.read_value() * 1e-24

            Fs, ms, vs = m_dgp.predict_all_layers(self.Xs, 1)
            Z = self.X.copy()
            Z[:len(self.Xs)] = ms[0][0]
            m_dgp.layers[1].feature.Z = Z  # need to put the inducing points in the right place

            var_list = [[m_dgp.layers[1].q_mu, m_dgp.layers[1].q_sqrt]]
            NatGradOptimizer(gamma=1).minimize(m_dgp,
                                               var_list=var_list,
                                               maxiter=1)

            mean_dgp, var_dgp = m_dgp.predict_y(self.Xs, 1)
            test_lik_dgp = m_dgp.predict_density(self.Xs, Ys, 1)
            pred_m_dgp, pred_v_gpr = m_dgp.predict_f(self.Xs, 1)
            pred_mfull_dgp, pred_vfull_dgp = m_dgp.predict_f_full_cov(
                self.Xs, 1)

            # mean_functions = [Identity(), mf]
            layer0 = GPMC_Layer(kerns[0], self.X.copy(), D_X, Identity())
            layer1 = GPR_Layer(kerns[1], mf, D_Y)

            m_heinonen = DGP_Heinonen(self.X, Y, lik, [layer0, layer1])

            m_heinonen.layers[0].q_mu = q_mu

            mean_heinonen, var_heinonen = m_heinonen.predict_y(self.Xs, 1)
            test_lik_heinonen = m_heinonen.predict_density(self.Xs, Ys, 1)
            pred_m_heinonen, pred_v_heinonen = m_heinonen.predict_f(self.Xs, 1)
            pred_mfull_heinonen, pred_vfull_heinonen = m_heinonen.predict_f_full_cov(
                self.Xs, 1)

            tol = 1e-4
            assert_allclose(mean_dgp, mean_heinonen, atol=tol, rtol=tol)
            assert_allclose(test_lik_dgp,
                            test_lik_heinonen,
                            atol=tol,
                            rtol=tol)
            assert_allclose(pred_m_dgp, pred_m_heinonen, atol=tol, rtol=tol)
            assert_allclose(pred_mfull_dgp,
                            pred_mfull_heinonen,
                            atol=tol,
                            rtol=tol)
            assert_allclose(pred_vfull_dgp,
                            pred_vfull_heinonen,
                            atol=tol,
                            rtol=tol)
Example #13
        def compare_to_single_layer(self,
                                    Y,
                                    Ys,
                                    lik,
                                    L,
                                    white,
                                    num_outputs=None):
            kern = Matern52(self.X.shape[1], lengthscales=0.5)

            m_svgp = SVGP(self.X,
                          Y,
                          kern,
                          lik,
                          Z=self.X,
                          whiten=white,
                          num_latent=num_outputs)
            m_svgp.q_mu = self.q_mu
            m_svgp.q_sqrt = self.q_sqrt

            L_svgp = m_svgp.compute_log_likelihood()
            mean_svgp, var_svgp = m_svgp.predict_y(self.Xs)
            test_lik_svgp = m_svgp.predict_density(self.Xs, Ys)
            pred_m_svgp, pred_v_svgp = m_svgp.predict_f(self.Xs)
            pred_mfull_svgp, pred_vfull_svgp = m_svgp.predict_f_full_cov(
                self.Xs)

            kerns = []
            for _ in range(L - 1):

                class NoTransformMatern52(Matern52):
                    def __init__(self, *args, variance=1., **kwargs):
                        Matern52.__init__(self, *args, **kwargs)
                        del self.variance
                        self.variance = Parameter(variance)

                kerns.append(
                    NoTransformMatern52(self.X.shape[1],
                                        variance=1e-24,
                                        lengthscales=0.5))
            kerns.append(kern)

            m_dgp = DGP(self.X,
                        Y,
                        self.X,
                        kerns,
                        lik,
                        white=white,
                        num_samples=2,
                        num_outputs=num_outputs)
            m_dgp.layers[-1].q_mu = self.q_mu
            m_dgp.layers[-1].q_sqrt = self.q_sqrt

            L_dgp = m_dgp.compute_log_likelihood()
            mean_dgp, var_dgp = m_dgp.predict_y(self.Xs, 1)
            test_lik_dgp = m_dgp.predict_density(self.Xs, Ys, 1)

            pred_m_dgp, pred_v_dgp = m_dgp.predict_f(self.Xs, 1)
            pred_mfull_dgp, pred_vfull_dgp = m_dgp.predict_f_full_cov(
                self.Xs, 1)

            if L == 1:  # these should all be exactly the same
                atol = 1e-7
                rtol = 1e-7
            else:  # jitter makes these not exactly equal
                atol = 1e-6
                rtol = 1e-6

            assert_allclose(L_svgp, L_dgp, rtol=rtol, atol=atol)

            assert_allclose(mean_svgp, mean_dgp[0], rtol=rtol, atol=atol)
            assert_allclose(var_svgp, var_dgp[0], rtol=rtol, atol=atol)
            assert_allclose(test_lik_svgp, test_lik_dgp, rtol=rtol, atol=atol)

            assert_allclose(pred_m_dgp[0], pred_m_svgp, rtol=rtol, atol=atol)
            assert_allclose(pred_v_dgp[0], pred_v_svgp, rtol=rtol, atol=atol)
            assert_allclose(pred_mfull_dgp[0],
                            pred_mfull_svgp,
                            rtol=rtol,
                            atol=atol)
            assert_allclose(pred_vfull_dgp[0],
                            pred_vfull_svgp,
                            rtol=rtol,
                            atol=atol)