Example #1
def test_mixed_mok_with_Id_vs_independent_mok():
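    """
    Compare a SharedIndependent multioutput kernel against a LinearCoregionalization
    of identical SquaredExponential kernels using the DataMixedKernelWithEye setup,
    whose mixing matrix W is the identity. Both constructions describe the same model,
    so after optimizing q_sqrt they should produce identical predictions.
    """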
    data = DataMixedKernelWithEye
    # Independent model
    k1 = mk.SharedIndependent(SquaredExponential(variance=0.5, lengthscales=1.2), data.L)
    f1 = InducingPoints(data.X[: data.M, ...])
    model_1 = SVGP(k1, Gaussian(), f1, q_mu=data.mu_data_full, q_sqrt=data.sqrt_data_full)
    set_trainable(model_1, False)
    set_trainable(model_1.q_sqrt, True)

    gpflow.optimizers.Scipy().minimize(
        model_1.training_loss_closure(Data.data),
        variables=model_1.trainable_variables,
        method="BFGS",
        compile=True,
    )

    # Mixed Model
    kern_list = [SquaredExponential(variance=0.5, lengthscales=1.2) for _ in range(data.L)]
    k2 = mk.LinearCoregionalization(kern_list, data.W)
    f2 = InducingPoints(data.X[: data.M, ...])
    model_2 = SVGP(k2, Gaussian(), f2, q_mu=data.mu_data_full, q_sqrt=data.sqrt_data_full)
    set_trainable(model_2, False)
    set_trainable(model_2.q_sqrt, True)

    gpflow.optimizers.Scipy().minimize(
        model_2.training_loss_closure(Data.data),
        variables=model_2.trainable_variables,
        method="BFGS",
        compile=True,
    )

    check_equality_predictions(Data.data, [model_1, model_2])
Example #2
def test_separate_independent_mok():
    """
    We use different independent kernels for each of the output dimensions.
    We can achieve this in two ways:
        1) efficient: SeparateIndependentMok with Shared/SeparateIndependentMof
        2) inefficient: SeparateIndependentMok with InducingPoints
    However, both methods should return the same conditional,
    and after optimization return the same log likelihood.
    """
    # Model 1 (Inefficient)
    q_mu_1 = np.random.randn(Data.M * Data.P, 1)
    q_sqrt_1 = np.tril(np.random.randn(Data.M * Data.P, Data.M * Data.P))[None, ...]  # 1 x MP x MP

    kern_list_1 = [SquaredExponential(variance=0.5, lengthscales=1.2) for _ in range(Data.P)]
    kernel_1 = mk.SeparateIndependent(kern_list_1)
    inducing_variable_1 = InducingPoints(Data.X[: Data.M, ...])
    model_1 = SVGP(
        kernel_1, Gaussian(), inducing_variable_1, num_latent_gps=1, q_mu=q_mu_1, q_sqrt=q_sqrt_1,
    )
    set_trainable(model_1, False)
    set_trainable(model_1.q_sqrt, True)
    set_trainable(model_1.q_mu, True)

    gpflow.optimizers.Scipy().minimize(
        model_1.training_loss_closure(Data.data),
        variables=model_1.trainable_variables,
        method="BFGS",
        compile=True,
    )

    # Model 2 (efficient)
    q_mu_2 = np.random.randn(Data.M, Data.P)
    q_sqrt_2 = np.array(
        [np.tril(np.random.randn(Data.M, Data.M)) for _ in range(Data.P)]
    )  # P x M x M
    kern_list_2 = [SquaredExponential(variance=0.5, lengthscales=1.2) for _ in range(Data.P)]
    kernel_2 = mk.SeparateIndependent(kern_list_2)
    inducing_variable_2 = mf.SharedIndependentInducingVariables(
        InducingPoints(Data.X[: Data.M, ...])
    )
    model_2 = SVGP(
        kernel_2,
        Gaussian(),
        inducing_variable_2,
        num_latent_gps=Data.P,
        q_mu=q_mu_2,
        q_sqrt=q_sqrt_2,
    )
    set_trainable(model_2, False)
    set_trainable(model_2.q_sqrt, True)
    set_trainable(model_2.q_mu, True)

    gpflow.optimizers.Scipy().minimize(
        model_2.training_loss_closure(Data.data),
        variables=model_2.trainable_variables,
        method="BFGS",
        compile=True,
    )

    check_equality_predictions(Data.data, [model_1, model_2])
Example #3
# %% [markdown]
# Analytically optimal sparse model ELBO:

# %%
sgpr.elbo().numpy()

# %% [markdown]
# SVGP ELBO before natural gradient step:

# %%
svgp.elbo(data).numpy()

# %%
variational_params = [(svgp.q_mu, svgp.q_sqrt)]

natgrad_opt = NaturalGradient(gamma=1.0)
natgrad_opt.minimize(svgp.training_loss_closure(data),
                     var_list=variational_params)

# %% [markdown]
# SVGP ELBO after a single natural gradient step:

# %%
svgp.elbo(data).numpy()

# %% [markdown]
# ### Minibatches
# A crucial property of the natural gradient method is that it still works with minibatches.
# In practice, though, we need to use a smaller gamma.

# %%
natgrad_opt = NaturalGradient(gamma=0.1)
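
# %% [markdown]
# A minimal sketch of such a minibatch loop (the batch size and number of steps here
# are arbitrary choices, and for a correctly scaled minibatch ELBO the SVGP should
# have been built with `num_data` set):

# %%
import tensorflow as tf

batch_size = 64
train_dataset = tf.data.Dataset.from_tensor_slices(data).repeat().shuffle(1024).batch(batch_size)
minibatch_loss = svgp.training_loss_closure(iter(train_dataset))

for _ in range(100):
    # Each evaluation of the loss closure consumes a fresh minibatch from the iterator.
    natgrad_opt.minimize(minibatch_loss, var_list=variational_params)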
Example #4
def test_separate_independent_mof():
    """
    Same test as above but we use different (i.e. separate) inducing inducing
    for each of the output dimensions.
    """
    np.random.seed(0)

    # Model 1 (inefficient)
    q_mu_1 = np.random.randn(Data.M * Data.P, 1)
    q_sqrt_1 = np.tril(np.random.randn(Data.M * Data.P, Data.M * Data.P))[None, ...]  # 1 x MP x MP

    kernel_1 = mk.SharedIndependent(
        SquaredExponential(variance=0.5, lengthscales=1.2), Data.P)
    inducing_variable_1 = InducingPoints(Data.X[:Data.M, ...])
    model_1 = SVGP(kernel_1,
                   Gaussian(),
                   inducing_variable_1,
                   q_mu=q_mu_1,
                   q_sqrt=q_sqrt_1)
    set_trainable(model_1, False)
    set_trainable(model_1.q_sqrt, True)
    set_trainable(model_1.q_mu, True)

    gpflow.optimizers.Scipy().minimize(
        model_1.training_loss_closure(Data.data),
        variables=model_1.trainable_variables,
        method="BFGS",
        compile=True,
    )

    # Model 2 (efficient)
    q_mu_2 = np.random.randn(Data.M, Data.P)
    q_sqrt_2 = np.array([
        np.tril(np.random.randn(Data.M, Data.M)) for _ in range(Data.P)
    ])  # P x M x M
    kernel_2 = mk.SharedIndependent(
        SquaredExponential(variance=0.5, lengthscales=1.2), Data.P)
    inducing_variable_list_2 = [
        InducingPoints(Data.X[:Data.M, ...]) for _ in range(Data.P)
    ]
    inducing_variable_2 = mf.SeparateIndependentInducingVariables(
        inducing_variable_list_2)
    model_2 = SVGP(kernel_2,
                   Gaussian(),
                   inducing_variable_2,
                   q_mu=q_mu_2,
                   q_sqrt=q_sqrt_2)
    set_trainable(model_2, False)
    set_trainable(model_2.q_sqrt, True)
    set_trainable(model_2.q_mu, True)

    gpflow.optimizers.Scipy().minimize(
        model_2.training_loss_closure(Data.data),
        variables=model_2.trainable_variables,
        method="BFGS",
        compile=True,
    )

    # Model 3 (inefficient): an identical inducing variable is used P times,
    # but treated as P separate ones.
    q_mu_3 = np.random.randn(Data.M, Data.P)
    q_sqrt_3 = np.array([
        np.tril(np.random.randn(Data.M, Data.M)) for _ in range(Data.P)
    ])  # P x M x M
    kern_list = [
        SquaredExponential(variance=0.5, lengthscales=1.2)
        for _ in range(Data.P)
    ]
    kernel_3 = mk.SeparateIndependent(kern_list)
    inducing_variable_list_3 = [
        InducingPoints(Data.X[:Data.M, ...]) for _ in range(Data.P)
    ]
    inducing_variable_3 = mf.SeparateIndependentInducingVariables(
        inducing_variable_list_3)
    model_3 = SVGP(kernel_3,
                   Gaussian(),
                   inducing_variable_3,
                   q_mu=q_mu_3,
                   q_sqrt=q_sqrt_3)
    set_trainable(model_3, False)
    set_trainable(model_3.q_sqrt, True)
    set_trainable(model_3.q_mu, True)

    gpflow.optimizers.Scipy().minimize(
        model_3.training_loss_closure(Data.data),
        variables=model_3.trainable_variables,
        method="BFGS",
        compile=True,
    )

    check_equality_predictions(Data.data, [model_1, model_2, model_3])
Example #5
def test_shared_independent_mok():
    """
    In this test we use the same kernel and the same inducing inducing
    for each of the outputs. The outputs are considered to be uncorrelated.
    This is how GPflow handled multiple outputs before the multioutput framework was added.
    We compare three models here:
        1) an ineffient one, where we use a SharedIndepedentMok with InducingPoints.
           This combination will uses a Kff of size N x P x N x P, Kfu if size N x P x M x P
           which is extremely inefficient as most of the elements are zero.
        2) efficient: SharedIndependentMok and SharedIndependentMof
           This combinations uses the most efficient form of matrices
        3) the old way, efficient way: using Kernel and InducingPoints
        Model 2) and 3) follow more or less the same code path.
    """
    np.random.seed(0)
    # Model 1
    q_mu_1 = np.random.randn(Data.M * Data.P, 1)  # MP x 1
    q_sqrt_1 = np.tril(np.random.randn(Data.M * Data.P, Data.M * Data.P))[None, ...]  # 1 x MP x MP
    kernel_1 = mk.SharedIndependent(
        SquaredExponential(variance=0.5, lengthscales=1.2), Data.P)
    inducing_variable = InducingPoints(Data.X[:Data.M, ...])
    model_1 = SVGP(
        kernel_1,
        Gaussian(),
        inducing_variable,
        q_mu=q_mu_1,
        q_sqrt=q_sqrt_1,
        num_latent_gps=Data.Y.shape[-1],
    )
    set_trainable(model_1, False)
    set_trainable(model_1.q_sqrt, True)

    gpflow.optimizers.Scipy().minimize(
        model_1.training_loss_closure(Data.data),
        variables=model_1.trainable_variables,
        options=dict(maxiter=500),
        method="BFGS",
        compile=True,
    )

    # Model 2
    q_mu_2 = np.reshape(q_mu_1, [Data.M, Data.P])  # M x P
    q_sqrt_2 = np.array([
        np.tril(np.random.randn(Data.M, Data.M)) for _ in range(Data.P)
    ])  # P x M x M
    kernel_2 = SquaredExponential(variance=0.5, lengthscales=1.2)
    inducing_variable_2 = InducingPoints(Data.X[:Data.M, ...])
    model_2 = SVGP(
        kernel_2,
        Gaussian(),
        inducing_variable_2,
        num_latent_gps=Data.P,
        q_mu=q_mu_2,
        q_sqrt=q_sqrt_2,
    )
    set_trainable(model_2, False)
    set_trainable(model_2.q_sqrt, True)

    gpflow.optimizers.Scipy().minimize(
        model_2.training_loss_closure(Data.data),
        variables=model_2.trainable_variables,
        options=dict(maxiter=500),
        method="BFGS",
        compile=True,
    )

    # Model 3
    q_mu_3 = np.reshape(q_mu_1, [Data.M, Data.P])  # M x P
    q_sqrt_3 = np.array([
        np.tril(np.random.randn(Data.M, Data.M)) for _ in range(Data.P)
    ])  # P x M x M
    kernel_3 = mk.SharedIndependent(
        SquaredExponential(variance=0.5, lengthscales=1.2), Data.P)
    inducing_variable_3 = mf.SharedIndependentInducingVariables(
        InducingPoints(Data.X[:Data.M, ...]))
    model_3 = SVGP(
        kernel_3,
        Gaussian(),
        inducing_variable_3,
        num_latent_gps=Data.P,
        q_mu=q_mu_3,
        q_sqrt=q_sqrt_3,
    )
    set_trainable(model_3, False)
    set_trainable(model_3.q_sqrt, True)

    gpflow.optimizers.Scipy().minimize(
        model_3.training_loss_closure(Data.data),
        variables=model_3.trainable_variables,
        options=dict(maxiter=500),
        method="BFGS",
        compile=True,
    )

    check_equality_predictions(Data.data, [model_1, model_2, model_3])
Example #6
class TrainableSVGP:
    def __init__(self,
                 kernel,
                 inducing_points,
                 batch_size,
                 num_iter,
                 err_fn,
                 var_dist,
                 classif=None,
                 error_every=100,
                 train_hyperparams: bool = True,
                 lr: float = 0.001,
                 natgrad_lr: float = 0.01):
        self.train_hyperparams = train_hyperparams
        self.lr = lr
        self.natgrad_lr = natgrad_lr
        self.kernel = kernel
        self.Z = inducing_points.copy()
        self.batch_size = batch_size
        self.num_iter = num_iter
        self.err_fn = err_fn
        self.error_every = error_every
        self.do_classif = classif is not None and classif > 0
        self.num_classes = 1
        if self.do_classif:
            self.num_classes = int(classif)
        self.model = None
        self.var_dist = var_dist

    def fit(self, X, Y, Xval, Yval):
        N = X.shape[0]

        if self.var_dist == "diag":
            q_diag = True
        elif self.var_dist == "full":
            q_diag = False
        else:
            raise NotImplementedError(
                "GPflow does not support the %s variational distribution" %
                (self.var_dist))

        if self.do_classif:
            if self.num_classes == 2:
                likelihood = gpflow.likelihoods.Bernoulli()
                num_latent = 1
            else:
                # Softmax is reportedly better than RobustMax (per the GPflow Slack)
                #likelihood = gpflow.likelihoods.MultiClass(self.num_classes, invlink=invlink)  # Multiclass likelihood
                likelihood = gpflow.likelihoods.Softmax(self.num_classes)
                num_latent = self.num_classes
                # Y must be 1D for the multiclass model to actually work.
                Y = np.argmax(Y, 1).reshape((-1, 1)).astype(int)
        else:
            num_latent = 1
            likelihood = gpflow.likelihoods.Gaussian()

        self.model = SVGP(kernel=self.kernel,
                          likelihood=likelihood,
                          inducing_variable=self.Z,
                          num_data=N,
                          num_latent_gps=num_latent,
                          whiten=False,
                          q_diag=q_diag)
        # Setup training
        if not self.train_hyperparams:
            set_trainable(self.model.inducing_variable.Z, False)
            set_trainable(self.kernel.lengthscales, False)
            set_trainable(self.kernel.variance, False)
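        # With natural gradients enabled, keep q_mu and q_sqrt out of the Adam update;
        # they are optimized separately by NaturalGradient below.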
        if self.natgrad_lr > 0:
            set_trainable(self.model.q_mu, False)
            set_trainable(self.model.q_sqrt, False)
            variational_params = [(self.model.q_mu, self.model.q_sqrt)]
        # Create the optimizers
        adam_opt = tf.optimizers.Adam(self.lr)
        if self.natgrad_lr > 0:
            natgrad_opt = NaturalGradient(gamma=self.natgrad_lr)

        # Print
        gpflow.utilities.print_summary(self.model)
        print("", flush=True)

        # Giacomo: if the shuffle buffer is too large it will run out of memory (OOM)
        if self.num_classes == 2:
            Y = (Y + 1) / 2
            Yval = (Yval + 1) / 2
        generator = partial(data_generator, X, Y)
        # Alternative: train_dataset = tf.data.Dataset.from_tensor_slices((X, Y))
        train_dataset = tf.data.Dataset.from_generator(generator, args=(self.batch_size, ), output_types=(tf.float32, tf.float32)) \
            .prefetch(self.batch_size * 10) \
            .repeat() \
            .shuffle(min(N // self.batch_size, 1_000_000 // self.batch_size)) \
            .batch(1)
        train_iter = iter(train_dataset)

        loss = self.model.training_loss_closure(train_iter)
        t_elapsed = 0
        for step in range(self.num_iter):
            t_s = time.time()
            if self.natgrad_lr > 0:
                natgrad_opt.minimize(loss, var_list=variational_params)
            adam_opt.minimize(loss, var_list=self.model.trainable_variables)
            t_elapsed += time.time() - t_s
            if step % 700 == 0:
                print("Step %d -- Elapsed %.2fs" % (step, t_elapsed),
                      flush=True)
            if (step + 1) % self.error_every == 0:
                preds = self.predict(Xval)
                val_err, err_name = self.err_fn(Yval, preds)
                print(
                    f"Step {step + 1} - {t_elapsed:7.2f}s Elapsed - "
                    f"Validation {err_name} {val_err:7.5f}",
                    flush=True)

        preds = self.predict(Xval)
        val_err, err_name = self.err_fn(Yval, preds)
        print(
            f"Finished optimization - {t_elapsed:7.2f}s Elapsed - "
            f"Validation {err_name} {val_err:7.5f}",
            flush=True)
        print("Final model is ")
        gpflow.utilities.print_summary(self.model)
        print("", flush=True)
        return self

    def predict(self, X):
        preds = []
        dset = tf.data.Dataset.from_tensor_slices((X, )).batch(self.batch_size)
        for X_batch in iter(dset):
            batch_preds = self.model.predict_y(X_batch[0])[0].numpy()
            if self.do_classif:
                batch_preds = batch_preds.reshape((X_batch[0].shape[0], -1))
            preds.append(batch_preds)
        preds = np.concatenate(preds, axis=0)
        return preds

    @property
    def inducing_points(self):
        return self.model.inducing_variable.Z.numpy()

    def __str__(self):
        return ((
            "TrainableSVGP<kernel=%s, num_inducing_points=%d, batch_size=%d, "
            "num_iter=%d, lr=%f, natgrad_lr=%f, error_every=%d, train_hyperparams=%s, "
            "var_dist=%s, do_classif=%s, model=%s") %
                (self.kernel, self.Z.shape[0], self.batch_size, self.num_iter,
                 self.lr, self.natgrad_lr, self.error_every,
                 self.train_hyperparams, self.var_dist, self.do_classif,
                 self.model))
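

# A minimal usage sketch (illustrative only): the data, kernel, and error function
# below are placeholder assumptions, and `data_generator` plus the module's imports
# (gpflow, tf, np, time, partial) are assumed to be available as in the original script.
def _rmse(y_true, y_pred):
    # Hypothetical error function matching the (value, name) contract expected by err_fn.
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2))), "rmse"


def example_usage():
    rng = np.random.default_rng(0)
    X = rng.standard_normal((500, 3))
    Y = X @ rng.standard_normal((3, 1)) + 0.1 * rng.standard_normal((500, 1))
    Xval, Yval = X[:100], Y[:100]

    trainer = TrainableSVGP(
        kernel=gpflow.kernels.SquaredExponential(lengthscales=np.ones(3)),
        inducing_points=X[:50],
        batch_size=64,
        num_iter=200,
        err_fn=_rmse,
        var_dist="full",  # full-covariance variational distribution (q_diag=False)
    )
    trainer.fit(X, Y, Xval, Yval)  # regression path: classif=None -> Gaussian likelihood
    return trainer.predict(Xval)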