    def test_regression_error(self,
                              cuda=False,
                              skip_logdet_forward=False,
                              cholesky=False):
        train_x, train_y = train_data(cuda=cuda)
        likelihood = GaussianLikelihood()
        model = SVGPRegressionModel(torch.linspace(0, 1, 25))
        mll = gpytorch.mlls.VariationalELBO(likelihood,
                                            model,
                                            num_data=len(train_y))
        if cuda:
            likelihood = likelihood.cuda()
            model = model.cuda()
            mll = mll.cuda()

        # Find optimal model hyperparameters
        model.train()
        likelihood.train()
        optimizer = optim.Adam([{
            "params": model.parameters()
        }, {
            "params": likelihood.parameters()
        }],
                               lr=0.01)

        _wrapped_cg = MagicMock(wraps=gpytorch.utils.linear_cg)
        with gpytorch.settings.max_cholesky_size(
                math.inf if cholesky else 0
        ), gpytorch.settings.skip_logdet_forward(
                skip_logdet_forward), warnings.catch_warnings(
                    record=True) as w, patch(
                        "gpytorch.utils.linear_cg",
                        new=_wrapped_cg) as linear_cg_mock:
            for _ in range(150):
                optimizer.zero_grad()
                output = model(train_x)
                loss = -mll(output, train_y)
                loss.backward()
                optimizer.step()

            # Make sure CG was called (or not), and no warnings were thrown
            self.assertEqual(len(w), 0)
            if cholesky:
                self.assertFalse(linear_cg_mock.called)
            else:
                self.assertTrue(linear_cg_mock.called)

        for param in model.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)
        for param in likelihood.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)

        # Set back to eval mode
        model.eval()
        likelihood.eval()
        test_preds = likelihood(model(train_x)).mean.squeeze()
        mean_abs_error = torch.mean(torch.abs(train_y - test_preds) / 2)
        self.assertLess(mean_abs_error.item(), 1e-1)
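
# The SVGP tests above rely on an SVGPRegressionModel and a train_data helper
# defined elsewhere in the test suite. Below is a minimal sketch of what such a
# model could look like with gpytorch's variational API; the class name, kernel
# choice, and inducing-point shape are assumptions, not the test suite's code.
import gpytorch


class MinimalSVGPModel(gpytorch.models.ApproximateGP):
    def __init__(self, inducing_points):
        # inducing_points is assumed to have shape (num_inducing, num_dims)
        variational_distribution = gpytorch.variational.CholeskyVariationalDistribution(
            inducing_points.size(0)
        )
        variational_strategy = gpytorch.variational.VariationalStrategy(
            self, inducing_points, variational_distribution, learn_inducing_locations=True
        )
        super().__init__(variational_strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        # Return the GP prior at x; the VariationalELBO ties it to the data
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x), self.covar_module(x)
        )
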
    def test_kissgp_gp_mean_abs_error(self):
        likelihood = GaussianLikelihood()
        gp_model = GPRegressionModel(train_x, train_y, likelihood)
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model)

        # Optimize the model
        gp_model.train()
        likelihood.train()

        with gpytorch.settings.max_preconditioner_size(
                5), gpytorch.settings.use_toeplitz(True):
            optimizer = optim.Adam(gp_model.parameters(), lr=0.1)
            optimizer.n_iter = 0
            for _ in range(8):
                optimizer.zero_grad()
                output = gp_model(train_x)
                loss = -mll(output, train_y)
                loss.backward()
                optimizer.n_iter += 1
                optimizer.step()

            for param in gp_model.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)

        # Test the model
        # (use the same preconditioner and toeplitz settings as during training)
        with gpytorch.settings.max_preconditioner_size(
                5), gpytorch.settings.use_toeplitz(True):
            gp_model.eval()
            likelihood.eval()

            test_preds = likelihood(gp_model(test_x)).mean
            mean_abs_error = torch.mean(torch.abs(test_y - test_preds))
            self.assertLess(mean_abs_error.squeeze().item(), 0.2)
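
# The GPRegressionModel used in the KISS-GP tests above is not shown in this
# snippet. A minimal sketch (assumed, following the standard gpytorch KISS-GP
# pattern of wrapping a base kernel in a GridInterpolationKernel):
import gpytorch


class MinimalKISSGPModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        # Interpolate the kernel onto a fixed grid so Toeplitz algebra can be used
        self.covar_module = gpytorch.kernels.GridInterpolationKernel(
            gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel()),
            grid_size=100,
            num_dims=1,
        )

    def forward(self, x):
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x), self.covar_module(x)
        )
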
Example #3
def main():
    # Initialize the likelihood and model
    # We use a Gaussian likelihood for regression so we have both a predictive
    # mean and variance for our predictions
    likelihood = GaussianLikelihood().cuda()
    train_x, train_y = prepare_training_data()
    model = GPRegressionModel(train_x.data, train_y.data, likelihood).cuda()

    # Use the adam optimizer
    optimizer = torch.optim.Adam(
        [
            {
                'params': model.parameters()
            },  # Includes GaussianLikelihood parameters
        ],
        lr=0.1)

    if mode == 'Train':
        # Find optimal model hyperparameters
        model.train()
        likelihood.train()

        # "Loss" for GPs - the marginal log likelihood
        # n_data refers to the amount of training data
        # "Loss" for GPs - the marginal log likelihood
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

        train(train_x, train_y, model, optimizer, mll)

    elif mode == 'Eval':
        print("start to test the model")
        predictions = eval_superpixels(model, likelihood)
        plot_result(predictions)
    else:
        raise Exception("No such mode")
Example #4
    def test_train_on_batch_test_on_batch(self):
        # We're manually going to set the hyperparameters to something they shouldn't be
        likelihood = GaussianLikelihood(
            noise_prior=gpytorch.priors.NormalPrior(loc=torch.zeros(2),
                                                    scale=torch.ones(2)),
            batch_shape=torch.Size([2]))
        gp_model = ExactGPModel(train_x12,
                                train_y12,
                                likelihood,
                                batch_shape=torch.Size([2]))
        mll = gpytorch.ExactMarginalLogLikelihood(likelihood, gp_model)

        # Find optimal model hyperparameters
        gp_model.train()
        likelihood.train()
        optimizer = optim.Adam(gp_model.parameters(), lr=0.1)
        for _ in range(50):
            optimizer.zero_grad()
            output = gp_model(train_x12)
            loss = -mll(output, train_y12, train_x12).sum()
            loss.backward()
            optimizer.step()

            for param in gp_model.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)
            for param in likelihood.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)

        # Test the model
        gp_model.eval()
        likelihood.eval()

        # First test on non-batch
        non_batch_predictions = likelihood(gp_model(test_x1))
        preds1 = non_batch_predictions.mean
        mean_abs_error1 = torch.mean(torch.abs(test_y1 - preds1[0]))
        self.assertLess(mean_abs_error1.squeeze().item(), 0.1)

        # Make predictions for both sets of test points, and check MAEs.
        batch_predictions = likelihood(gp_model(test_x12))
        preds1 = batch_predictions.mean[0]
        preds2 = batch_predictions.mean[1]
        mean_abs_error1 = torch.mean(torch.abs(test_y1 - preds1))
        mean_abs_error2 = torch.mean(torch.abs(test_y2 - preds2))
        self.assertLess(mean_abs_error1.squeeze().item(), 0.1)
        self.assertLess(mean_abs_error2.squeeze().item(), 0.1)

        # Smoke test for batch mode derivatives failing
        test_x_param = torch.nn.Parameter(test_x12.data)
        batch_predictions = likelihood(gp_model(test_x_param))
        batch_predictions.mean.sum().backward()
        self.assertTrue(test_x_param.grad is not None)

        # Smoke test for non-batch mode derivatives failing
        test_x_param = torch.nn.Parameter(test_x1.data)
        batch_predictions = likelihood(gp_model(test_x_param))
        batch_predictions.mean.sum().backward()
        self.assertTrue(test_x_param.grad is not None)
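
# The batch-mode ExactGPModel above is defined elsewhere. A minimal sketch of a
# batch-shaped exact GP (names assumed; the key point is passing batch_shape to
# the mean, the kernel, and the likelihood) could look like this:
import torch
import gpytorch


class MinimalBatchExactGP(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood, batch_shape=torch.Size([2])):
        # train_x: (b, n, d) or (b, n); train_y: (b, n)
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean(batch_shape=batch_shape)
        self.covar_module = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.RBFKernel(batch_shape=batch_shape),
            batch_shape=batch_shape,
        )

    def forward(self, x):
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x), self.covar_module(x)
        )


# Matching likelihood, as in the test above:
# likelihood = gpytorch.likelihoods.GaussianLikelihood(batch_shape=torch.Size([2]))
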
    def test_posterior_latent_gp_and_likelihood_with_optimization(self):
        # We're manually going to set the hyperparameters to something they shouldn't be
        likelihood = GaussianLikelihood(log_noise_bounds=(-3, 3))
        gp_model = ExactGPModel(train_x.data, train_y.data, likelihood)
        mll = gpytorch.ExactMarginalLogLikelihood(likelihood, gp_model)
        gp_model.covar_module.initialize(log_lengthscale=1)
        gp_model.mean_module.initialize(constant=0)
        likelihood.initialize(log_noise=1)

        # Find optimal model hyperparameters
        gp_model.train()
        likelihood.train()
        optimizer = optim.Adam(
            list(gp_model.parameters()) + list(likelihood.parameters()),
            lr=0.1,
        )
        optimizer.n_iter = 0
        for _ in range(50):
            optimizer.zero_grad()
            output = gp_model(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.n_iter += 1
            optimizer.step()

        # Test the model
        gp_model.eval()
        likelihood.eval()
        test_function_predictions = likelihood(gp_model(test_x))
        mean_abs_error = torch.mean(
            torch.abs(test_y - test_function_predictions.mean()))

        self.assertLess(mean_abs_error.data.squeeze()[0], 0.05)
def test_kissgp_gp_fast_pred_var():
    with gpytorch.fast_pred_var():
        train_x, train_y, test_x, test_y = make_data()
        likelihood = GaussianLikelihood()
        gp_model = GPRegressionModel(train_x.data, train_y.data, likelihood)
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model)

        # Optimize the model
        gp_model.train()
        likelihood.train()

        optimizer = optim.Adam(list(gp_model.parameters()) + list(likelihood.parameters()), lr=0.1)
        optimizer.n_iter = 0
        for i in range(25):
            optimizer.zero_grad()
            output = gp_model(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.n_iter += 1
            optimizer.step()

        # Test the model
        gp_model.eval()
        likelihood.eval()
        # Set the cache
        test_function_predictions = likelihood(gp_model(train_x))

        # Now bump up the likelihood to something huge
        # This will make it easy to calculate the variance
        likelihood.log_noise.data.fill_(3)
        test_function_predictions = likelihood(gp_model(train_x))

        noise = likelihood.log_noise.exp()
        var_diff = (test_function_predictions.var() - noise).abs()
        assert(torch.max(var_diff.data / noise.data) < 0.05)
Example #7
    def test_kissgp_gp_mean_abs_error(self):
        likelihood = GaussianLikelihood()
        gp_model = GPRegressionModel(train_x.data, train_y.data, likelihood)
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model)

        # Optimize the model
        gp_model.train()
        likelihood.train()

        optimizer = optim.Adam(
            list(gp_model.parameters()) + list(likelihood.parameters()),
            lr=0.2,
        )
        optimizer.n_iter = 0
        for _ in range(20):
            optimizer.zero_grad()
            output = gp_model(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.n_iter += 1
            optimizer.step()

        # Test the model
        gp_model.eval()
        likelihood.eval()

        test_preds = likelihood(gp_model(test_x)).mean()
        mean_abs_error = torch.mean(torch.abs(test_y - test_preds))
        self.assertLess(mean_abs_error.data.squeeze()[0], 0.1)
Example #8
    def test_regression_error_shared_inducing_locations(self):
        train_x, train_y = train_data()
        likelihood = GaussianLikelihood()
        inducing_points = torch.linspace(0, 1, 25).unsqueeze(-1)
        model = SVGPRegressionModel(inducing_points)
        mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.size(-1))

        # Find optimal model hyperparameters
        model.train()
        likelihood.train()
        optimizer = optim.Adam([{"params": model.parameters()}, {"params": likelihood.parameters()}], lr=0.01)
        for _ in range(200):
            optimizer.zero_grad()
            output = model(train_x)
            loss = -mll(output, train_y)
            loss = loss.sum()
            loss.backward()
            optimizer.step()

        for param in model.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)
        for param in likelihood.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)

        # Set back to eval mode
        model.eval()
        likelihood.eval()
        test_preds = likelihood(model(train_x)).mean.squeeze()
        mean_abs_error = torch.mean(torch.abs(train_y[0, :] - test_preds[0, :]) / 2)
        mean_abs_error2 = torch.mean(torch.abs(train_y[1, :] - test_preds[1, :]) / 2)
        self.assertLess(mean_abs_error.item(), 1e-1)
        self.assertLess(mean_abs_error2.item(), 1e-1)
def test_kissgp_gp_mean_abs_error_cuda():
    if torch.cuda.is_available():
        train_x, train_y, test_x, test_y = make_data(cuda=True)
        likelihood = GaussianLikelihood().cuda()
        gp_model = GPRegressionModel(train_x.data, train_y.data, likelihood).cuda()
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model)

        # Optimize the model
        gp_model.train()
        likelihood.train()

        optimizer = optim.Adam(list(gp_model.parameters()) + list(likelihood.parameters()), lr=0.1)
        optimizer.n_iter = 0
        for i in range(25):
            optimizer.zero_grad()
            output = gp_model(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.n_iter += 1
            optimizer.step()

        # Test the model
        gp_model.eval()
        likelihood.eval()
        test_preds = likelihood(gp_model(test_x)).mean()
        mean_abs_error = torch.mean(torch.abs(test_y - test_preds))

        assert(mean_abs_error.data.squeeze()[0] < 0.02)
Example #10
    def test_sgpr_mean_abs_error(self):
        train_x, train_y, test_x, test_y = make_data()
        likelihood = GaussianLikelihood()
        gp_model = GPRegressionModel(train_x, train_y, likelihood)
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model)

        # Optimize the model
        gp_model.train()
        likelihood.train()

        optimizer = optim.Adam(gp_model.parameters(), lr=0.1)
        for _ in range(30):
            optimizer.zero_grad()
            output = gp_model(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.step()

        for param in gp_model.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)
        for param in likelihood.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)

        # Test the model
        gp_model.eval()
        likelihood.eval()

        test_preds = likelihood(gp_model(test_x)).mean
        mean_abs_error = torch.mean(torch.abs(test_y - test_preds))

        self.assertLess(mean_abs_error.squeeze().item(), 0.05)
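
# The GPRegressionModel in the SGPR test above is assumed to wrap its base
# kernel in an InducingPointKernel. A minimal sketch, taking the inducing
# points as a subset of the training inputs (train_x assumed of shape (n, d)):
import gpytorch


class MinimalSGPRModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        base_covar = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())
        self.covar_module = gpytorch.kernels.InducingPointKernel(
            base_covar,
            inducing_points=train_x[:10].clone(),
            likelihood=likelihood,
        )

    def forward(self, x):
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x), self.covar_module(x)
        )
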
Example #11
    def test_indexed_train_and_eval(self):
        likelihood = GaussianLikelihood()
        model = LMCModel()

        # Find optimal model hyperparameters
        model.train()
        likelihood.train()
        optimizer = torch.optim.Adam(
            [
                {
                    "params": model.parameters()
                },
                {
                    "params": likelihood.parameters()
                },
            ],
            lr=0.01,
        )

        # Our loss object. We're using the VariationalELBO, which essentially just computes the ELBO
        mll = gpytorch.mlls.VariationalELBO(likelihood,
                                            model,
                                            num_data=train_y.size(0))

        # Create some task indices
        arange = torch.arange(train_x.size(0))
        train_i = torch.rand(train_x.size(0)).mul(4).floor().long()

        # We use more CG iterations here because the preconditioner introduced in the NeurIPS paper seems to be less
        # effective for VI.
        for i in range(400):
            # Each iteration uses the full training set (no minibatching here)
            optimizer.zero_grad()
            output = model(train_x, task_indices=train_i)
            loss = -mll(output, train_y[arange, train_i])
            loss.backward()
            optimizer.step()

            for param in model.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)
            for param in likelihood.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)

        # Test the model
        model.eval()
        likelihood.eval()

        # Make predictions for both sets of test points, and check MAEs.
        with torch.no_grad(), gpytorch.settings.max_eager_kernel_size(1):
            predictions = likelihood(model(train_x, task_indices=train_i))
            mean_abs_error = torch.mean(
                torch.abs(train_y[arange, train_i] - predictions.mean))
            self.assertLess(mean_abs_error.squeeze().item(), 0.15)

            # Smoke test for getting predictive uncertainties
            lower, upper = predictions.confidence_region()
            self.assertEqual(lower.shape, train_i.shape)
            self.assertEqual(upper.shape, train_i.shape)
Example #12
    def test_fantasy_updates_batch(self, cuda=False):
        train_x, test_x, train_y, test_y = self._get_data(cuda=cuda)
        # We're manually going to set the hyperparameters to something they shouldn't be
        likelihood = GaussianLikelihood()
        gp_model = ExactGPModel(train_x, train_y, likelihood)
        mll = gpytorch.ExactMarginalLogLikelihood(likelihood, gp_model)
        gp_model.covar_module.base_kernel.initialize(lengthscale=exp(1))
        gp_model.mean_module.initialize(constant=0)
        likelihood.initialize(noise=exp(1))

        if cuda:
            gp_model.cuda()
            likelihood.cuda()

        # Find optimal model hyperparameters
        gp_model.train()
        likelihood.train()
        optimizer = optim.Adam(list(gp_model.parameters()) +
                               list(likelihood.parameters()),
                               lr=0.15)
        optimizer.n_iter = 0
        for _ in range(50):
            optimizer.zero_grad()
            with gpytorch.settings.debug(False):
                output = gp_model(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.n_iter += 1
            optimizer.step()

        for param in gp_model.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)
        for param in likelihood.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)
        optimizer.step()

        with gpytorch.settings.fast_pred_var():
            # Test the model
            gp_model.eval()
            likelihood.eval()
            test_function_predictions = likelihood(gp_model(test_x))

            # Cut data down, and then add back via the fantasy interface
            gp_model.set_train_data(train_x[:5], train_y[:5], strict=False)
            likelihood(gp_model(test_x))

            fantasy_x = train_x[5:].clone().unsqueeze(0).unsqueeze(-1).repeat(
                3, 1, 1).requires_grad_(True)
            fantasy_y = train_y[5:].unsqueeze(0).repeat(3, 1)
            fant_model = gp_model.get_fantasy_model(fantasy_x, fantasy_y)
            fant_function_predictions = likelihood(fant_model(test_x))

            self.assertTrue(
                approx_equal(test_function_predictions.mean,
                             fant_function_predictions.mean[0]))

            fant_function_predictions.mean.sum().backward()
            self.assertTrue(fantasy_x.grad is not None)
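
# A compact, self-contained sketch of the fantasy-update pattern exercised
# above. The model class here is a generic exact GP, not the test suite's
# ExactGPModel. get_fantasy_model requires that a posterior has already been
# computed, hence the prediction before the update.
import math
import torch
import gpytorch


class TinyExactGP(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x), self.covar_module(x)
        )


train_x = torch.linspace(0, 1, 10)
train_y = torch.sin(2 * math.pi * train_x)
likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = TinyExactGP(train_x, train_y, likelihood)

model.eval()
likelihood.eval()
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    test_x = torch.linspace(0, 1, 5)
    _ = likelihood(model(test_x))  # populate the prediction caches

    # Condition on three extra observations without refitting hyperparameters
    new_x = torch.rand(3)
    new_y = torch.sin(2 * math.pi * new_x)
    fant_model = model.get_fantasy_model(new_x, new_y)
    fant_preds = likelihood(fant_model(test_x))
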
    def test_kissgp_gp_mean_abs_error(self):
        likelihood = GaussianLikelihood()
        gp_model = GPRegressionModel(train_x.data, train_y.data, likelihood)
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model)

        # Optimize the model
        gp_model.train()
        likelihood.train()

        optimizer = optim.Adam(list(gp_model.parameters()) + list(likelihood.parameters()), lr=0.2)
        optimizer.n_iter = 0
        for _ in range(15):
            optimizer.zero_grad()
            output = gp_model(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.n_iter += 1
            optimizer.step()

        for param in gp_model.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)
        for param in likelihood.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)
        optimizer.step()

        # Test the model
        gp_model.eval()
        likelihood.eval()

        with gpytorch.fast_pred_var():
            test_preds = likelihood(gp_model(test_x)).mean()
        mean_abs_error = torch.mean(torch.abs(test_y - test_preds))
        self.assertLess(mean_abs_error.data.squeeze().item(), 0.15)
    def test_spectral_mixture_gp_mean_abs_error(self):
        likelihood = GaussianLikelihood(log_noise_bounds=(-5, 5))
        gp_model = SpectralMixtureGPModel(train_x.data, train_y.data,
                                          likelihood)
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model)

        # Optimize the model
        gp_model.train()
        likelihood.train()
        optimizer = optim.Adam(
            list(gp_model.parameters()) + list(likelihood.parameters()),
            lr=0.1,
        )
        optimizer.n_iter = 0

        with gpytorch.settings.num_trace_samples(100):
            for _ in range(50):
                optimizer.zero_grad()
                output = gp_model(train_x)
                loss = -mll(output, train_y)
                loss.backward()
                optimizer.n_iter += 1
                optimizer.step()

            # Test the model
            gp_model.eval()
            likelihood.eval()
            test_preds = likelihood(gp_model(test_x)).mean()
            mean_abs_error = torch.mean(torch.abs(test_y - test_preds))

        # The spectral mixture kernel should be trivially able to
        # extrapolate the sine function.
        self.assertLess(mean_abs_error.data.squeeze()[0], 0.15)
Example #15
def train_gp(train_x, train_y, use_ard, num_steps, hypers={}):
    """Fit a GP model where train_x is in [0, 1]^d and train_y is standardized."""
    assert train_x.ndim == 2
    assert train_y.ndim == 1
    assert train_x.shape[0] == train_y.shape[0]

    # Create hyper parameter bounds
    noise_constraint = Interval(5e-4, 0.2)
    if use_ard:
        lengthscale_constraint = Interval(0.005, 2.0)
    else:
        lengthscale_constraint = Interval(0.005, math.sqrt(
            train_x.shape[1]))  # [0.005, sqrt(dim)]
    outputscale_constraint = Interval(0.05, 20.0)

    # Create models
    likelihood = GaussianLikelihood(noise_constraint=noise_constraint).to(
        device=train_x.device, dtype=train_y.dtype)
    ard_dims = train_x.shape[1] if use_ard else None
    model = GP(
        train_x=train_x,
        train_y=train_y,
        likelihood=likelihood,
        lengthscale_constraint=lengthscale_constraint,
        outputscale_constraint=outputscale_constraint,
        ard_dims=ard_dims,
    ).to(device=train_x.device, dtype=train_x.dtype)

    # Find optimal model hyperparameters
    model.train()
    likelihood.train()

    # "Loss" for GPs - the marginal log likelihood
    mll = ExactMarginalLogLikelihood(likelihood, model)

    # Initialize model hypers
    if hypers:
        model.load_state_dict(hypers)
    else:
        hypers = {}
        hypers["covar_module.outputscale"] = 1.0
        hypers["covar_module.base_kernel.lengthscale"] = 0.5
        hypers["likelihood.noise"] = 0.005
        model.initialize(**hypers)

    # Use the adam optimizer
    optimizer = torch.optim.Adam([{"params": model.parameters()}], lr=0.1)

    for _ in range(num_steps):
        optimizer.zero_grad()
        output = model(train_x)
        loss = -mll(output, train_y)
        loss.backward()
        optimizer.step()

    # Switch to eval mode
    model.eval()
    likelihood.eval()

    return model
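
# train_gp above expects a GP class (not shown here) built from the constraint
# objects it constructs. A plausible sketch, assuming a Matern-5/2 kernel with
# optional ARD; the exact class used by the original module may differ:
from gpytorch.distributions import MultivariateNormal
from gpytorch.kernels import MaternKernel, ScaleKernel
from gpytorch.means import ConstantMean
from gpytorch.models import ExactGP


class GPSketch(ExactGP):
    def __init__(self, train_x, train_y, likelihood,
                 lengthscale_constraint, outputscale_constraint, ard_dims):
        super().__init__(train_x, train_y, likelihood)
        self.ard_dims = ard_dims
        self.mean_module = ConstantMean()
        base_kernel = MaternKernel(
            nu=2.5,
            ard_num_dims=ard_dims,
            lengthscale_constraint=lengthscale_constraint,
        )
        self.covar_module = ScaleKernel(
            base_kernel, outputscale_constraint=outputscale_constraint
        )

    def forward(self, x):
        return MultivariateNormal(self.mean_module(x), self.covar_module(x))
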
Example #16
    def test_regression_error_full(self, skip_logdet_forward=False):
        train_x, train_y = train_data()
        likelihood = GaussianLikelihood()
        model = SVGPRegressionModel(inducing_points=train_x, learn_locs=False)
        mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=len(train_y))

        # Find optimal model hyperparameters
        model.train()
        likelihood.train()
        optimizer = optim.Adam([{"params": model.parameters()}, {"params": likelihood.parameters()}], lr=0.01)
        with gpytorch.settings.skip_logdet_forward(skip_logdet_forward):
            for _ in range(200):
                optimizer.zero_grad()
                output = model(train_x)
                loss = -mll(output, train_y)
                loss.backward()
                optimizer.step()

        for param in model.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)
        for param in likelihood.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)

        # Set back to eval mode
        model.eval()
        likelihood.eval()
        test_preds = likelihood(model(train_x)).mean.squeeze()
        mean_abs_error = torch.mean(torch.abs(train_y - test_preds) / 2)
        self.assertLess(mean_abs_error.item(), 1e-1)
    def test_kissgp_gp_mean_abs_error(self):
        likelihood = GaussianLikelihood()
        gp_model = GPRegressionModel(train_x, train_y, likelihood)
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model)

        with gpytorch.settings.max_preconditioner_size(10), gpytorch.settings.max_cg_iterations(50):
            with gpytorch.beta_features.fast_pred_var():
                # Optimize the model
                gp_model.train()
                likelihood.train()

                optimizer = optim.Adam(gp_model.parameters(), lr=0.01)
                optimizer.n_iter = 0
                for _ in range(15):
                    optimizer.zero_grad()
                    output = gp_model(train_x)
                    loss = -mll(output, train_y)
                    loss.backward()
                    optimizer.n_iter += 1
                    optimizer.step()

                for param in gp_model.parameters():
                    self.assertTrue(param.grad is not None)
                    self.assertGreater(param.grad.norm().item(), 0)
                for param in likelihood.parameters():
                    self.assertTrue(param.grad is not None)
                    self.assertGreater(param.grad.norm().item(), 0)

                # Test the model
                gp_model.eval()
                likelihood.eval()

                test_preds = likelihood(gp_model(test_x)).mean()
                mean_abs_error = torch.mean(torch.abs(test_y - test_preds))
                self.assertLess(mean_abs_error.squeeze().item(), 0.2)
Example #18
    def test_sgpr_mean_abs_error(self):
        # Suppress numerical warnings
        warnings.simplefilter("ignore", NumericalWarning)

        train_x, train_y, test_x, test_y = make_data()
        likelihood = GaussianLikelihood()
        gp_model = GPRegressionModel(train_x, train_y, likelihood)
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model)

        # Optimize the model
        gp_model.train()
        likelihood.train()

        optimizer = optim.Adam(gp_model.parameters(), lr=0.1)
        for _ in range(30):
            optimizer.zero_grad()
            output = gp_model(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.step()

            # Check that we have the right LazyTensor type
            kernel = likelihood(
                gp_model(train_x)).lazy_covariance_matrix.evaluate_kernel()
            self.assertIsInstance(kernel,
                                  gpytorch.lazy.LowRankRootAddedDiagLazyTensor)

        for param in gp_model.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)

        # Test the model
        gp_model.eval()
        likelihood.eval()

        test_preds = likelihood(gp_model(test_x)).mean
        mean_abs_error = torch.mean(torch.abs(test_y - test_preds))

        self.assertLess(mean_abs_error.squeeze().item(), 0.1)

        # Test variances
        test_vars = likelihood(gp_model(test_x)).variance
        self.assertAllClose(
            test_vars,
            likelihood(gp_model(test_x)).covariance_matrix.diagonal(dim1=-1,
                                                                    dim2=-2))
        self.assertGreater(test_vars.min().item() + 0.1,
                           likelihood.noise.item())
        self.assertLess(
            test_vars.max().item() - 0.05,
            likelihood.noise.item() +
            gp_model.covar_module.base_kernel.outputscale.item())

        # Test on training data
        test_outputs = likelihood(gp_model(train_x))
        self.assertLess((test_outputs.mean - train_y).max().item(), 0.1)
        self.assertLess(test_outputs.variance.max().item(),
                        likelihood.noise.item() * 2)
Example #19
    def test_posterior_latent_gp_and_likelihood_fast_pred_var(
            self, cuda=False):
        train_x, test_x, train_y, test_y = self._get_data(cuda=cuda)
        with gpytorch.settings.fast_pred_var(), gpytorch.settings.debug(False):
            # We're manually going to set the hyperparameters to
            # something they shouldn't be
            likelihood = GaussianLikelihood(
                noise_prior=SmoothedBoxPrior(exp(-3), exp(3), sigma=0.1))
            gp_model = ExactGPModel(train_x, train_y, likelihood)
            mll = gpytorch.mlls.ExactMarginalLogLikelihood(
                likelihood, gp_model)
            gp_model.covar_module.base_kernel.initialize(lengthscale=exp(1))
            gp_model.mean_module.initialize(constant=0)
            likelihood.initialize(noise=exp(1))

            if cuda:
                gp_model.cuda()
                likelihood.cuda()

            # Find optimal model hyperparameters
            gp_model.train()
            likelihood.train()
            optimizer = optim.Adam(list(gp_model.parameters()) +
                                   list(likelihood.parameters()),
                                   lr=0.1)
            optimizer.n_iter = 0
            for _ in range(50):
                optimizer.zero_grad()
                output = gp_model(train_x)
                loss = -mll(output, train_y)
                loss.backward()
                optimizer.n_iter += 1
                optimizer.step()

            for param in gp_model.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)
            for param in likelihood.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)
            optimizer.step()

            # Test the model
            gp_model.eval()
            likelihood.eval()
            # Set the cache
            test_function_predictions = likelihood(gp_model(train_x))

            # Now bump up the likelihood to something huge
            # This will make it easy to calculate the variance
            likelihood.noise_covar.raw_noise.data.fill_(3)
            test_function_predictions = likelihood(gp_model(train_x))

            noise = likelihood.noise_covar.noise
            var_diff = (test_function_predictions.variance - noise).abs()

            self.assertLess(torch.max(var_diff / noise), 0.05)
    def test_kissgp_gp_mean_abs_error(self):
        train_x, train_y, test_x, test_y = make_data()
        train_dataset = TensorDataset(train_x, train_y)
        loader = DataLoader(train_dataset, shuffle=True, batch_size=64)

        gp_model = GPRegressionModel()
        likelihood = GaussianLikelihood()
        mll = gpytorch.mlls.VariationalMarginalLogLikelihood(
            likelihood,
            gp_model,
            n_data=len(train_y),
        )

        # Optimize the model
        gp_model.train()
        likelihood.train()

        with gpytorch.beta_features.diagonal_correction():
            optimizer = optim.SGD(
                list(gp_model.parameters()) + list(likelihood.parameters()),
                lr=0.1,
            )
            scheduler = optim.lr_scheduler.MultiStepLR(
                optimizer,
                milestones=[15],
                gamma=0.1,
            )
            for _ in range(20):
                scheduler.step()
                for x_batch, y_batch in loader:
                    x_batch = Variable(x_batch.float())
                    y_batch = Variable(y_batch.float())

                    optimizer.zero_grad()
                    output = gp_model(x_batch)
                    loss = -mll(output, y_batch)
                    loss.backward()
                    optimizer.step()

            for param in gp_model.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)
            for param in likelihood.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)
            optimizer.step()

            # Test the model
            gp_model.eval()
            likelihood.eval()

            test_preds = likelihood(gp_model(Variable(test_x))).mean()
            mean_abs_error = torch.mean(
                torch.abs(Variable(test_y) - test_preds))

        self.assertLess(mean_abs_error.data.squeeze().item(), 0.1)
    def test_regression_error(
        self,
        cuda=False,
        mll_cls=gpytorch.mlls.VariationalELBO,
        distribution_cls=gpytorch.variational.CholeskyVariationalDistribution,
    ):
        train_x, train_y = train_data(cuda=cuda)
        likelihood = GaussianLikelihood()
        model = SVGPRegressionModel(torch.linspace(0, 1, 25), distribution_cls)
        mll = mll_cls(likelihood, model, num_data=len(train_y))
        if cuda:
            likelihood = likelihood.cuda()
            model = model.cuda()
            mll = mll.cuda()

        # Find optimal model hyperparameters
        model.train()
        likelihood.train()
        optimizer = optim.Adam([{
            "params": model.parameters()
        }, {
            "params": likelihood.parameters()
        }],
                               lr=0.01)

        for _ in range(200):
            optimizer.zero_grad()
            output = model(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.step()

        for param in model.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)
        for param in likelihood.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)

        # Set back to eval mode
        model.eval()
        likelihood.eval()
        test_preds = likelihood(model(train_x)).mean.squeeze()
        mean_abs_error = torch.mean(torch.abs(train_y - test_preds) / 2)
        self.assertLess(mean_abs_error.item(), 0.014)

        if distribution_cls is gpytorch.variational.CholeskyVariationalDistribution:
            # finally, test fantasization
            # we only check that tossing the entire training set into the model will reduce the mae
            model.likelihood = likelihood
            fant_model = model.get_fantasy_model(train_x, train_y)
            fant_preds = fant_model.likelihood(
                fant_model(train_x)).mean.squeeze()
            updated_abs_error = torch.mean(torch.abs(train_y - fant_preds) / 2)
            # TODO: figure out why this error is worse than before
            self.assertLess(updated_abs_error.item(), 0.15)
    def test_posterior_latent_gp_and_likelihood_with_optimization(self):
        # We're manually going to set the hyperparameters to something they shouldn't be
        likelihood = GaussianLikelihood(log_noise_bounds=(-3, 3))
        gp_model = ExactGPModel(train_x1.data, train_y1.data, likelihood)
        mll = gpytorch.ExactMarginalLogLikelihood(likelihood, gp_model)
        gp_model.covar_module.initialize(log_lengthscale=1)
        gp_model.mean_module.initialize(constant=0)
        likelihood.initialize(log_noise=1)

        # Find optimal model hyperparameters
        gp_model.train()
        likelihood.train()
        optimizer = optim.Adam(list(gp_model.parameters()) +
                               list(likelihood.parameters()),
                               lr=0.1)
        optimizer.n_iter = 0
        for _ in range(50):
            optimizer.zero_grad()
            output = gp_model(train_x1)
            loss = -mll(output, train_y1)
            loss.backward()
            optimizer.n_iter += 1
            optimizer.step()

        for param in gp_model.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)
        for param in likelihood.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)
        optimizer.step()

        # Test the model
        gp_model.eval()
        likelihood.eval()

        # Create data batches
        train_x12 = torch.cat((train_x1.unsqueeze(0), train_x2.unsqueeze(0)),
                              dim=0).contiguous()
        train_y12 = torch.cat((train_y1.unsqueeze(0), train_y2.unsqueeze(0)),
                              dim=0).contiguous()
        test_x12 = torch.cat((test_x1.unsqueeze(0), test_x2.unsqueeze(0)),
                             dim=0).contiguous()

        # Update gp model to use both sine and cosine training data as train data
        gp_model.set_train_data(train_x12, train_y12, strict=False)

        # Make predictions for both sets of test points, and check MAEs.
        batch_predictions = likelihood(gp_model(test_x12))
        preds1 = batch_predictions.mean()[0]
        preds2 = batch_predictions.mean()[1]
        mean_abs_error1 = torch.mean(torch.abs(test_y1 - preds1))
        mean_abs_error2 = torch.mean(torch.abs(test_y2 - preds2))
        self.assertLess(mean_abs_error1.data.squeeze().item(), 0.05)
        self.assertLess(mean_abs_error2.data.squeeze().item(), 0.05)
Example #23
    def test_regression_error(
        self,
        cuda=False,
        mll_cls=gpytorch.mlls.VariationalELBO,
        distribution_cls=gpytorch.variational.CholeskyVariationalDistribution,
    ):
        train_x, train_y = train_data(cuda=cuda)
        likelihood = GaussianLikelihood()
        model = SVGPRegressionModel(torch.linspace(0, 1, 25), distribution_cls)
        mll = mll_cls(likelihood, model, num_data=len(train_y))
        if cuda:
            likelihood = likelihood.cuda()
            model = model.cuda()
            mll = mll.cuda()

        # Find optimal model hyperparameters
        model.train()
        likelihood.train()
        optimizer = optim.Adam([{
            "params": model.parameters()
        }, {
            "params": likelihood.parameters()
        }],
                               lr=0.01)

        _wrapped_cg = MagicMock(wraps=gpytorch.utils.linear_cg)
        _cg_mock = patch("gpytorch.utils.linear_cg", new=_wrapped_cg)
        with warnings.catch_warnings(record=True) as ws, _cg_mock as cg_mock:
            for _ in range(150):
                optimizer.zero_grad()
                output = model(train_x)
                loss = -mll(output, train_y)
                loss.backward()
                optimizer.step()

            for param in model.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)
            for param in likelihood.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)

            # Set back to eval mode
            model.eval()
            likelihood.eval()
            test_preds = likelihood(model(train_x)).mean.squeeze()
            mean_abs_error = torch.mean(torch.abs(train_y - test_preds) / 2)
            self.assertLess(mean_abs_error.item(), 1e-1)

            # Make sure CG was called (or not), and no warnings were thrown
            self.assertFalse(cg_mock.called)
            self.assertFalse(
                any(
                    issubclass(w.category, ExtraComputationWarning)
                    for w in ws))
    def test_posterior_latent_gp_and_likelihood_with_optimization(
            self, cuda=False, checkpoint=0):
        train_x, test_x, train_y, test_y = self._get_data(
            cuda=cuda,
            num_data=(1000 if checkpoint else 11),
            add_noise=bool(checkpoint),
        )
        # We're manually going to set the hyperparameters to something they shouldn't be
        likelihood = GaussianLikelihood(
            noise_prior=SmoothedBoxPrior(exp(-3), exp(3), sigma=0.1))
        gp_model = ExactGPModel(train_x, train_y, likelihood)
        mll = gpytorch.ExactMarginalLogLikelihood(likelihood, gp_model)
        gp_model.covar_module.base_kernel.initialize(lengthscale=exp(1))
        gp_model.mean_module.initialize(constant=0)
        likelihood.initialize(noise=exp(1))

        if cuda:
            gp_model.cuda()
            likelihood.cuda()

        # Find optimal model hyperparameters
        gp_model.train()
        likelihood.train()
        optimizer = optim.Adam(list(gp_model.parameters()) +
                               list(likelihood.parameters()),
                               lr=0.15)
        optimizer.n_iter = 0
        with gpytorch.beta_features.checkpoint_kernel(
                checkpoint), gpytorch.settings.fast_pred_var():
            for _ in range(20 if checkpoint else 50):
                optimizer.zero_grad()
                output = gp_model(train_x)
                loss = -mll(output, train_y)
                loss.backward()
                optimizer.n_iter += 1
                optimizer.step()

            for param in gp_model.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)
            for param in likelihood.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)
            optimizer.step()

        # Test the model
        gp_model.eval()
        likelihood.eval()
        with gpytorch.settings.skip_posterior_variances(True):
            test_function_predictions = likelihood(gp_model(test_x))
        mean_abs_error = torch.mean(
            torch.abs(test_y - test_function_predictions.mean))

        self.assertLess(mean_abs_error.item(), 0.05)
Example #25
class GPSurrogate(Surrogate):
    def __init__(self, **hps):
        super().__init__()

        defaults = {
            'gp_noise': 1e-4,
            'opt_iters': 200,
            'lr': 0.05,
        }
        defaults.update(hps)
        self.hps = defaults

        self.likelihood = GaussianLikelihood()
        self.likelihood.initialize(noise=self.hps['gp_noise'])

    def fit(self, x, y, iters=None):
        if not iters:
            iters = self.hps['opt_iters']

        train_x = torch.from_numpy(x.astype(np.float32))
        train_y = torch.from_numpy(y.astype(np.float32))

        self.regressor = GPRegressionModel(train_x, train_y, self.likelihood)

        # set train mode
        self.regressor.train()
        self.likelihood.train()

        optimizer = torch.optim.Adam(self.regressor.parameters(),
                                     lr=self.hps['lr'])

        mll = ExactMarginalLogLikelihood(self.likelihood, self.regressor)

        for i in range(iters):
            optimizer.zero_grad()
            output = self.regressor(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.step()

        return self.regressor

    def predict(self, x, var=False):
        with torch.no_grad(), gpytorch.settings.fast_pred_var():
            pred_x = torch.from_numpy(x.astype(np.float32))

            # set eval mode
            self.regressor.eval()
            self.likelihood.eval()

            prediction = self.likelihood(self.regressor(pred_x))
            if var:
                return prediction.mean.numpy(), prediction.variance.numpy()
            return prediction.mean.numpy()
Example #26
    def test_sgpr_mean_abs_error_cuda(self):
        # Suppress numerical warnings
        warnings.simplefilter("ignore", NumericalWarning)

        if not torch.cuda.is_available():
            return
        with least_used_cuda_device():
            train_x, train_y, test_x, test_y = make_data(cuda=True)
            likelihood = GaussianLikelihood().cuda()
            gp_model = GPRegressionModel(train_x, train_y, likelihood).cuda()
            mll = gpytorch.mlls.ExactMarginalLogLikelihood(
                likelihood, gp_model)

            # Optimize the model
            gp_model.train()
            likelihood.train()

            optimizer = optim.Adam(gp_model.parameters(), lr=0.1)
            optimizer.n_iter = 0
            for _ in range(25):
                optimizer.zero_grad()
                output = gp_model(train_x)
                loss = -mll(output, train_y)
                loss.backward()
                optimizer.n_iter += 1
                optimizer.step()

            for param in gp_model.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)

            # Test the model
            gp_model.eval()
            likelihood.eval()
            test_preds = likelihood(gp_model(test_x)).mean
            mean_abs_error = torch.mean(torch.abs(test_y - test_preds))

            self.assertLess(mean_abs_error.squeeze().item(), 0.02)

            # Test variances
            test_vars = likelihood(gp_model(test_x)).variance
            self.assertAllClose(
                test_vars,
                likelihood(gp_model(test_x)).covariance_matrix.diagonal(
                    dim1=-1, dim2=-2))
            self.assertGreater(test_vars.min().item() + 0.05,
                               likelihood.noise.item())
            self.assertLess(
                test_vars.max().item() - 0.05,
                likelihood.noise.item() +
                gp_model.covar_module.base_kernel.outputscale.item())
Example #27
    def test_posterior_latent_gp_and_likelihood_fast_pred_var(self):
        with gpytorch.fast_pred_var():
            # We're manually going to set the hyperparameters to
            # something they shouldn't be
            likelihood = GaussianLikelihood(log_noise_bounds=(-3, 3))
            gp_model = ExactGPModel(train_x.data, train_y.data, likelihood)
            mll = gpytorch.mlls.ExactMarginalLogLikelihood(
                likelihood, gp_model)
            gp_model.rbf_covar_module.initialize(log_lengthscale=1)
            gp_model.mean_module.initialize(constant=0)
            likelihood.initialize(log_noise=1)

            # Find optimal model hyperparameters
            gp_model.train()
            likelihood.train()
            optimizer = optim.Adam(list(gp_model.parameters()) +
                                   list(likelihood.parameters()),
                                   lr=0.1)
            optimizer.n_iter = 0
            for _ in range(50):
                optimizer.zero_grad()
                output = gp_model(train_x)
                loss = -mll(output, train_y)
                loss.backward()
                optimizer.n_iter += 1
                optimizer.step()

            for param in gp_model.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)
            for param in likelihood.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)
            optimizer.step()

            # Test the model
            gp_model.eval()
            likelihood.eval()
            # Set the cache
            test_function_predictions = likelihood(gp_model(train_x))

            # Now bump up the likelihood to something huge
            # This will make it easy to calculate the variance
            likelihood.log_noise.data.fill_(3)
            test_function_predictions = likelihood(gp_model(train_x))

            noise = likelihood.log_noise.exp()
            var_diff = (test_function_predictions.var() - noise).abs()

            self.assertLess(torch.max(var_diff.data / noise.data), 0.05)
Example #28
def svgp(args, dataloader, test_x, kernel=None):
    N = len(dataloader.dataset)

    inducing_points, _ = kmeans2(dataloader.dataset.train_x.numpy(),
                                 args.n_inducing,
                                 minit='points')
    inducing_points = torch.from_numpy(inducing_points).squeeze(-1)

    model = SVGP(inducing_points, kernel)
    # p(y|f)
    likelihood = GaussianLikelihood()

    model.train()
    likelihood.train()

    optimizer = optim.Adam([{
        'params': model.parameters()
    }, {
        'params': likelihood.parameters()
    }],
                           lr=args.learning_rate)

    mll = VariationalELBO(likelihood, model, N, combine_terms=False)

    for epoch in range(args.n_iters):
        for train_x, train_y in dataloader:
            train_x, train_y = train_x.squeeze(), train_y.squeeze()
            optimizer.zero_grad()
            output = model(train_x)

            log_ll, kl_div, log_prior = mll(output, train_y)
            loss = -(log_ll - kl_div + log_prior)
            loss.backward()
            optimizer.step()
        if epoch % 50 == 0:
            print("Iter {}, lower bound = {:.4f}, obs_var = {:.4f}".format(
                epoch, -loss.item(), likelihood.noise.item()))

    test_stats = TestStats(None, None)
    model.eval()
    likelihood.eval()
    with torch.no_grad():
        observed_pred = likelihood(model(test_x))
        test_y_mean = observed_pred.mean
        test_y_var = observed_pred.variance
    test_stats = test_stats._replace(test_y_mean=test_y_mean,
                                     test_y_var=test_y_var)

    return test_stats
Example #29
    def test_posterior_latent_gp_and_likelihood_with_optimization(
            self, cuda=False):
        train_x, test_x, train_y, test_y = self._get_data(cuda=cuda)
        # We're manually going to set the hyperparameters to something they shouldn't be
        likelihood = GaussianLikelihood(
            noise_prior=SmoothedBoxPrior(exp(-3), exp(3), sigma=0.1))
        gp_model = ExactGPModel(train_x, train_y, likelihood)
        mll = gpytorch.ExactMarginalLogLikelihood(likelihood, gp_model)
        gp_model.rbf_covar_module.initialize(log_lengthscale=1)
        gp_model.mean_module.initialize(constant=0)
        likelihood.initialize(log_noise=1)

        if cuda:
            gp_model.cuda()
            likelihood.cuda()

        # Find optimal model hyperparameters
        gp_model.train()
        likelihood.train()

        optimizer = optim.Adam(list(gp_model.parameters()) +
                               list(likelihood.parameters()),
                               lr=0.1)
        optimizer.n_iter = 0
        with gpytorch.settings.debug(False):
            for _ in range(75):
                optimizer.zero_grad()
                output = gp_model(train_x)
                loss = -mll(output, train_y)
                loss.backward()
                optimizer.n_iter += 1
                optimizer.step()

            for param in gp_model.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)
            for param in likelihood.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)
            optimizer.step()

            # Test the model
            gp_model.eval()
            likelihood.eval()
            test_function_predictions = likelihood(gp_model(test_x))
            mean_abs_error = torch.mean(
                torch.abs(test_y - test_function_predictions.mean))

        self.assertLess(mean_abs_error.squeeze().item(), 0.05)
Example #30
def exact_gp(data, n_epochs, kernel=None):
    train_x, train_y, test_x = data
    N = train_x.size(0)

    if N > 1000:
        train_x = train_x[:1000]
        train_y = train_y[:1000]

    # p(y|x, f)
    likelihood = GaussianLikelihood()
    # p(f|X, Y)
    model = ExactGPModel(train_x, train_y, likelihood, kernel)

    model.train()
    likelihood.train()

    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # p(y | x, X, Y) = ∫p(y|x, f)p(f|X, Y)df
    mll = ExactMarginalLogLikelihood(likelihood, model)

    for epoch in range(1, n_epochs + 1):
        optimizer.zero_grad()
        output = model(train_x)
        loss = -mll(output, train_y)
        loss.backward()

        if epoch % 50 == 0:
            print(
                'Iter {}/{} - Loss: {:.3f}   lengthscale: {:.3f}   noise: {:.3f}'
                .format(epoch, n_epochs, loss.item(),
                        model.covar_module.base_kernel.lengthscale.item(),
                        model.likelihood.noise.item()))

        optimizer.step()

    # test
    test_stats = TestStats(None, None)
    if N <= 1000:
        model.eval()
        likelihood.eval()
        with torch.no_grad(), gpytorch.settings.fast_pred_var():
            observed_pred = likelihood(model(test_x))
            test_y_mean = observed_pred.mean
            test_y_var = observed_pred.variance
        test_stats = test_stats._replace(test_y_mean=test_y_mean,
                                         test_y_var=test_y_var)

    return model, test_stats
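
# Both svgp and exact_gp above assume model classes defined elsewhere that
# accept an optional kernel argument. A minimal sketch of the exact-GP variant
# (names assumed) that would plug into exact_gp, including the
# covar_module.base_kernel.lengthscale access used in its logging:
import gpytorch


class ExactGPSketch(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood, kernel=None):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        # Fall back to an RBF kernel when no base kernel is supplied
        self.covar_module = gpytorch.kernels.ScaleKernel(
            kernel if kernel is not None else gpytorch.kernels.RBFKernel()
        )

    def forward(self, x):
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x), self.covar_module(x)
        )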