Example #1
    def test_optimize(self):
        with defer_build():
            input_layer = InputLayer(input_dim=1,
                                     output_dim=1,
                                     num_inducing=self.M,
                                     kernel=RBF(1) + White(1),
                                     multitask=True)
            output_layer = OutputLayer(input_dim=1,
                                       output_dim=1,
                                       num_inducing=self.M,
                                       kernel=RBF(1) + White(1),
                                       multitask=True)

            seq = MultitaskSequential([input_layer, output_layer])

            model = MultitaskDSDGP(X=self.X,
                                   Y=self.Y,
                                   Z=self.Z,
                                   layers=seq,
                                   likelihood=SwitchedLikelihood(
                                       [Gaussian(), Gaussian()]),
                                   num_latent=1)
        model.compile()
        before = model.compute_log_likelihood()
        opt = gpflow.train.AdamOptimizer(0.01)
        opt.minimize(model, maxiter=100)
        after = model.compute_log_likelihood()
        self.assertGreaterEqual(after, before)
Example #2
def test_latent_kernels():
    kernel_list = [SquaredExponential(), White(), White() + Linear()]

    multioutput_kernel_list = [
        SharedIndependent(SquaredExponential(), 3),
        SeparateIndependent(kernel_list),
        LinearCoregionalization(kernel_list, np.random.random((5, 3))),
    ]
    assert len(multioutput_kernel_list[0].latent_kernels) == 1
    assert multioutput_kernel_list[1].latent_kernels == tuple(kernel_list)
    assert multioutput_kernel_list[2].latent_kernels == tuple(kernel_list)
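A quick interactive check of the same property outside the test harness, assuming the gpflow 2.x imports used above (SharedIndependent, SquaredExponential) are in scope:

k = SharedIndependent(SquaredExponential(), 3)
print(k.latent_kernels)  # a one-element tuple holding the single shared latent kernel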
Example #3
def make_DGP(L, D_problem, D_hidden, X, Y, Z):
    kernels = []
    # First layer
    kernels.append(RBF(D_problem, lengthscales=0.2, variance=1.) + White(D_problem, variance=1e-5))
    for l in range(L-1):
        k = RBF(D_hidden, lengthscales=0.2, variance=1.) + White(D_hidden, variance=1e-5)
        kernels.append(k)

    m_dgp = DGP(X, Y, Z, kernels, Gaussian(), num_samples=10)

    # initialise the layers to be nearly deterministic
    for layer in m_dgp.layers[:-1]:
        layer.q_sqrt = layer.q_sqrt.value * 1e-5
    return m_dgp
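A minimal usage sketch for make_DGP on toy data, assuming the names used in the body (RBF, White, DGP, Gaussian from the gpflow 1.x / Doubly-Stochastic-DGP stack) are already imported:

import numpy as np

X = np.random.randn(50, 1)
Y = np.random.randn(50, 1)
Z = X[:10].copy()  # inducing inputs taken from the data
m_dgp = make_DGP(L=2, D_problem=1, D_hidden=1, X=X, Y=Y, Z=Z)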
Example #4
def make_dgp(X, Y, Z, L):
    D = X.shape[1]
    Y_mean, Y_std = np.average(Y), np.std(Y)

    # the layer shapes are defined by the kernel dims, so here all hidden layers are D dimensional
    kernels = []
    for l in range(L):
        kernels.append(RBF(D, lengthscales=1., variance=1.))

    # between layer noise (doesn't actually make much difference but we include it anyway)
    for kernel in kernels[:-1]:
        kernel += White(D, variance=1e-5)

    mb = 10000 if X.shape[0] > 10000 else None
    model = DGP(X, Y, Z, kernels, Gaussian(), num_samples=1, minibatch_size=mb)

    # same final layer inits we used for the single layer model
    model.layers[-1].kern.variance = Y_std**2
    model.likelihood.variance = Y_std * 0.1
    model.layers[-1].mean_function = Constant(Y_mean)
    model.layers[-1].mean_function.fixed = True

    # start the inner layers almost deterministically
    for layer in model.layers[:-1]:
        layer.q_sqrt = layer.q_sqrt.value * 1e-5

    return model
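A hedged toy call for make_dgp under the same gpflow 1.x assumptions; Constant (for the mean function) must also be in scope:

import numpy as np

X = np.random.randn(100, 2)
Y = np.random.randn(100, 1)
Z = X[:10].copy()
model = make_dgp(X, Y, Z, L=2)  # two layers, D-dimensional hidden layer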
Example #5
def residual_kernel(K_Y: np.ndarray, K_X: np.ndarray, use_expectation=True, with_gp=True, sigma_squared=1e-3, return_learned_K_X=False):
    """Kernel matrix of residual of Y given X based on their kernel matrices, Y=f(X)"""
    import gpflow
    from gpflow.kernels import White, Linear
    from gpflow.models import GPR

    K_Y, K_X = centering(K_Y), centering(K_X)
    T = len(K_Y)

    if with_gp:
        eig_Ky, eiy = truncated_eigen(*eigdec(K_Y, min(100, T // 4)))
        eig_Kx, eix = truncated_eigen(*eigdec(K_X, min(100, T // 4)))

        X = eix @ diag(sqrt(eig_Kx))  # X @ X.T is close to K_X
        Y = eiy @ diag(sqrt(eig_Ky))
        n_feats = X.shape[1]

        linear = Linear(n_feats, ARD=True)
        white = White(n_feats)
        gp_model = GPR(X, Y, linear + white)
        gpflow.train.ScipyOptimizer().minimize(gp_model)

        K_X = linear.compute_K_symm(X)
        sigma_squared = white.variance.value

    P = pdinv(np.eye(T) + K_X / sigma_squared)  # == I-K @ inv(K+Sigma) in Zhang et al. 2011
    if use_expectation:  # Flaxman et al. 2016 Gaussian Processes for Independence Tests with Non-iid Data in Causal Inference.
        RK = (K_X + P @ K_Y) @ P
    else:  # Zhang et al. 2011. Kernel-based Conditional Independence Test and Application in Causal Discovery.
        RK = P @ K_Y @ P

    if return_learned_K_X:
        return RK, K_X
    else:
        return RK
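A toy invocation of residual_kernel; passing with_gp=False skips the GP fit, so only the centering and pdinv helpers (assumed in scope) are exercised. K_X and K_Y are positive semi-definite by construction:

import numpy as np

A = np.random.randn(40, 3)
B = np.random.randn(40, 3)
K_X, K_Y = A @ A.T, B @ B.T  # toy PSD kernel matrices
RK = residual_kernel(K_Y, K_X, use_expectation=True, with_gp=False)
print(RK.shape)  # (40, 40)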
Example #6
def regression_distance_k(Kx: np.ndarray, Ky: np.ndarray):
    warnings.warn('not tested yet!')
    import gpflow
    from gpflow.kernels import White, Linear
    from gpflow.models import GPR

    T = len(Kx)

    eig_Ky, eiy = truncated_eigen(*eigdec(Ky, min(100, T // 4)))
    eig_Kx, eix = truncated_eigen(*eigdec(Kx, min(100, T // 4)))

    X = eix @ diag(sqrt(eig_Kx))  # X @ X.T is close to K_X
    Y = eiy @ diag(sqrt(eig_Ky))
    n_feats = X.shape[1]

    linear = Linear(n_feats, ARD=True)
    white = White(n_feats)
    gp_model = GPR(X, Y, linear + white)
    gpflow.train.ScipyOptimizer().minimize(gp_model)

    Kx = linear.compute_K_symm(X)
    sigma_squared = white.variance.value

    P = Kx @ pdinv(Kx + sigma_squared * np.eye(T))

    M = P @ Ky @ P
    O = np.ones((T, 1))
    N = O @ (np.diag(M)[:, None]).T  # np.diag(M) is 1-D; reshape before transposing
    D = np.sqrt(N + N.T - 2 * M)
    return D
Example #7
    def temporal_kernel(self):
        kernel = White(variance=self.model_config['noise_inner'])
        m_inds = list(range(self.m))
        # Initialize a non-linear kernel over the inputs
        if self.model_config['input_nonlinear']:
            scales = ([self.model_config['scale']] * self.m
                      if self.model_config['scale_tie']
                      else self.model_config['scale'])
            if self.model_config['rq']:
                kernel += RationalQuadratic(active_dims=m_inds,
                                            variance=1.0,
                                            lengthscales=scales,
                                            alpha=1e-2)
            else:
                kernel += SquaredExponential(active_dims=m_inds,
                                             variance=1.0,
                                             lengthscales=scales)
        # Add a periodic kernel over inputs
        # TODO: consider adding a decay component here?
        if self.model_config['per']:
            scales = [self.model_config['per_scale']] * self.m
            periods = [self.model_config['per_period']] * self.m
            base_kernel = SquaredExponential(active_dims=m_inds,
                                             variance=1.0,
                                             lengthscales=scales)
            kernel += Periodic(base_kernel, period=periods)

        # Add a linear kernel over inputs
        if self.model_config['input_linear']:
            variances = [self.model_config['input_linear_scale']] * self.m
            kernel += LinearKernel(active_dims=m_inds, variance=variances)
        return kernel
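For reference, a sketch of the model_config dictionary this method appears to expect; the key names are taken from the code above, the values are purely illustrative:

model_config = {
    'noise_inner': 1e-2,      # variance of the White kernel
    'input_nonlinear': True,  # add an RQ/SE kernel over the inputs
    'scale': 1.0,             # lengthscale(s) of the non-linear kernel
    'scale_tie': True,        # share one lengthscale across all m inputs
    'rq': False,              # RationalQuadratic instead of SquaredExponential
    'per': False,             # add a periodic kernel over the inputs
    'per_scale': 1.0,
    'per_period': 1.0,
    'input_linear': False,    # add a linear kernel over the inputs
    'input_linear_scale': 1.0,
}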
Example #8
    def prepare(self):
        N = 100
        M = 10
        rng = np.random.RandomState(42)
        X = rng.randn(N, 2)
        Y = rng.randn(N, 1)
        Z = rng.randn(M, 2)

        X_ind = rng.randint(0, 2, (N, 1))
        Z_ind = rng.randint(0, 2, (M, 1))

        X = np.hstack([X, X_ind])
        Y = np.hstack([Y, X_ind])
        Z = np.hstack([Z, Z_ind])

        Xs = rng.randn(M, 2)
        Xs_ind = rng.randint(0, 2, (M, 1))
        Xs = np.hstack([Xs, Xs_ind])

        with defer_build():
            lik = SwitchedLikelihood([Gaussian(), Gaussian()])

            input_layer = InputLayer(input_dim=2,
                                     output_dim=1,
                                     num_inducing=M,
                                     kernel=RBF(2) + White(2),
                                     mean_function=Linear(A=np.ones((3, 1))),
                                     multitask=True)
            output_layer = OutputLayer(input_dim=1,
                                       output_dim=1,
                                       num_inducing=M,
                                       kernel=RBF(1) + White(1),
                                       multitask=True)

            seq = MultitaskSequential([input_layer, output_layer])

            model = MultitaskDSDGP(X=X,
                                   Y=Y,
                                   Z=Z,
                                   layers=seq,
                                   likelihood=lik,
                                   num_latent=1)
        model.compile()
        return model, Xs
Example #9
    def __init__(self,
                 X,
                 Y,
                 inducing_points,
                 final_inducing_points,
                 hidden_units,
                 units,
                 share_inducing_inputs=True):
        Model.__init__(self)

        assert X.shape[0] == Y.shape[0]

        self.num_data, D_X = X.shape
        self.D_Y = 1
        self.num_samples = 100

        kernels = []
        for l in range(hidden_units + 1):
            ks = []
            if (l > 0):
                D = units
            else:
                D = D_X
            if (l < hidden_units):
                for w in range(units):
                    ks.append(
                        RBF(D, lengthscales=1., variance=1.) +
                        White(D, variance=1e-5))
            else:
                ks.append(RBF(D, lengthscales=1., variance=1.))
            kernels.append(ks)

        self.dims_in = [D_X] + [units] * hidden_units
        self.dims_out = [units] * hidden_units + [1]
        q_mus, q_sqrts, Zs, mean_functions = init_layers(
            X, self.dims_in, self.dims_out, inducing_points,
            final_inducing_points, share_inducing_inputs)

        layers = []
        for q_mu, q_sqrt, Z, mean_function, kernel in zip(
                q_mus, q_sqrts, Zs, mean_functions, kernels):
            layers.append(Layer(kernel, q_mu, q_sqrt, Z, mean_function))
        self.layers = ParamList(layers)

        for layer in self.layers[:-1]:  # fix the inner layer mean functions
            layer.mean_function.fixed = True

        self.likelihood = Gaussian()

        minibatch_size = 10000 if X.shape[0] > 10000 else None
        if minibatch_size is not None:
            self.X = MinibatchData(X, minibatch_size)
            self.Y = MinibatchData(Y, minibatch_size)
        else:
            self.X = DataHolder(X)
            self.Y = DataHolder(Y)
Example #10
    def make_mf_dgp(cls, X, Y, Z, add_linear=True, minibatch_size=None):
        """
        Constructor for convenience. Constructs a mf-dgp model from training data and inducing point locations

        :param X: List of target
        :param Y:
        :param Z:
        :param add_linear:
        :return:
        """

        n_fidelities = len(X)

        Din = X[0].shape[1]
        Dout = Y[0].shape[1]

        kernels = [RBF(Din, active_dims=list(range(Din)), variance=1., lengthscales=1, ARD=True)]
        for l in range(1, n_fidelities):
            D = Din + Dout
            D_range = list(range(D))
            k_corr = RBF(Din, active_dims=D_range[:Din], lengthscales=1, variance=1.0, ARD=True)
            k_prev = RBF(Dout, active_dims=D_range[Din:], variance=1., lengthscales=1.0)
            k_in = RBF(Din, active_dims=D_range[:Din], variance=1., lengthscales=1, ARD=True)
            if add_linear:
                k_l = k_corr * (k_prev + Linear(Dout, active_dims=D_range[Din:], variance=1.)) + k_in
            else:
                k_l = k_corr * k_prev + k_in
            kernels.append(k_l)

        """
        A White noise kernel is currently expected by Mf-DGP at all layers except the last.
        In cases where no noise is desired, this should be set to 0 and fixed, as follows:

            white = White(1, variance=0.)
            white.variance.trainable = False
            kernels[i] += white
        """
        for i, kernel in enumerate(kernels[:-1]):
            kernels[i] += White(1, variance=1e-6)

        num_data = 0
        for i in range(len(X)):
            _log.info('\nData at Fidelity {}'.format(i + 1))
            _log.info('X - {}'.format(X[i].shape))
            _log.info('Y - {}'.format(Y[i].shape))
            _log.info('Z - {}'.format(Z[i].shape))
            num_data += X[i].shape[0]

        layers = init_layers_mf(Y, Z, kernels, num_outputs=Dout)

        model = DGP_Base(X, Y, Gaussian(), layers, num_samples=10, minibatch_size=minibatch_size)

        return model
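An illustrative call with two fidelity levels. X, Y and Z are lists with one array per fidelity, matching the shapes logged above; MfDgpModel is a hypothetical stand-in for the enclosing class (not shown in this excerpt), and the Z shapes depend on what init_layers_mf expects:

import numpy as np

X = [np.random.randn(30, 2), np.random.randn(10, 2)]
Y = [np.random.randn(30, 1), np.random.randn(10, 1)]
Z = [np.random.randn(5, 2), np.random.randn(5, 3)]  # layer 2 acts on Din + Dout dims
model = MfDgpModel.make_mf_dgp(X, Y, Z, add_linear=True)  # hypothetical class name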
Example #11
    def test_constructor(self):
        input_layer = InputLayer(input_dim=1,
                                 output_dim=1,
                                 num_inducing=self.M,
                                 kernel=RBF(1) + White(1))
        output_layer = OutputLayer(input_dim=1,
                                   output_dim=1,
                                   num_inducing=self.M,
                                   kernel=RBF(1) + White(1))

        seq = Sequential([input_layer, output_layer])

        try:
            model = DSDGP(X=self.X,
                          Y=self.Y,
                          Z=self.Z,
                          layers=seq,
                          likelihood=Gaussian())
        except Exception as e:
            print(e)
            self.fail('DSDGP constructor fails')
Example #12
def make_deep_GP(num_layers, X, Y, Z):
    kernels = []
    layer_sizes = []
    for l in range(num_layers):
        kernel = RBF(lengthscale=0.2, variance=1.0) + White(variance=1e-5)
        kernels.append(kernel)
        layer_sizes.append(1)

    dgp = DeepGP(X, Y, Z, kernels, layer_sizes, Gaussian(), num_samples=100)

    # initialise hidden layers to be nearly deterministic
    for layer in dgp.layers[:-1]:
        layer.q_sqrt.assign(layer.q_sqrt * 1e-5)
    return dgp
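A toy call, assuming the DeepGP class and the kernels imported above (note the lengthscale=/variance= keywords, i.e. a gpflow 2.0-era API):

import numpy as np

X = np.random.randn(60, 1)
Y = np.random.randn(60, 1)
Z = X[:10].copy()
dgp = make_deep_GP(num_layers=2, X=X, Y=Y, Z=Z)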
Example #13
    def prepare(self):
        N = 100
        M = 10
        rng = np.random.RandomState(42)
        X = rng.randn(N, 2)
        Y = rng.randn(N, 1)
        Z = rng.randn(M, 2)
        Xs = rng.randn(M, 2)
        lik = Gaussian()
        input_layer = InputLayer(input_dim=2,
                                 output_dim=1,
                                 num_inducing=M,
                                 kernel=RBF(2) + White(2),
                                 mean_function=Linear(A=np.ones((2, 1))))
        output_layer = OutputLayer(input_dim=1,
                                   output_dim=1,
                                   num_inducing=M,
                                   kernel=RBF(1) + White(1))

        seq = Sequential([input_layer, output_layer])

        model = DSDGP(X=X, Y=Y, Z=Z, layers=seq, likelihood=lik)
        model.compile()
        return model, Xs
Example #14
def compute_residual_eig(Y: np.ndarray, Kx: np.ndarray) -> np.ndarray:
    """Residual of Y based on Kx, a kernel matrix of X"""
    assert len(Y) == len(Kx)

    eig_Kx, eix = truncated_eigen(*eigdec(Kx, min(100, len(Kx) // 4)))
    phi_X = eix @ np.diag(np.sqrt(eig_Kx))  # X @ X.T is close to K_X
    n_feats = phi_X.shape[1]

    linear_kernel = Linear(n_feats, ARD=True)
    gp_model = GPR(phi_X, Y, linear_kernel + White(n_feats))
    gp_model.optimize()

    new_Kx = linear_kernel.compute_K_symm(phi_X)
    sigma_squared = gp_model.kern.white.variance.value[0]

    return (pdinv(np.eye(len(Kx)) + new_Kx / sigma_squared) @ Y).squeeze()
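A toy call; the eigendecomposition helpers (truncated_eigen, eigdec, pdinv) and the gpflow-style Linear, White and GPR used in the body are assumed to be in scope:

import numpy as np

A = np.random.randn(40, 3)
Kx = A @ A.T                    # toy PSD kernel matrix of X
Y = np.random.randn(40, 1)
residual = compute_residual_eig(Y, Kx)
print(residual.shape)           # (40,)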
Example #15
def residual_kernel_matrix_kernel_real(Kx, Z, num_eig, ARD=True):
    """K_X|Z"""
    assert len(Kx) == len(Z)
    assert num_eig <= len(Kx)

    T = len(Kx)
    D = Z.shape[1]
    I = eye(T)
    eig_Kx, eix = truncated_eigen(*eigdec(Kx, num_eig))

    rbf = RBF(D, ARD=ARD)
    white = White(D)
    gp_model = GPR(Z, 2 * sqrt(T) * eix @ diag(sqrt(eig_Kx)) / sqrt(eig_Kx[0]),
                   rbf + white)
    gpflow.train.ScipyOptimizer().minimize(gp_model)

    sigma_squared = white.variance.value
    Kz_x = rbf.compute_K_symm(Z)

    P = I - Kz_x @ pdinv(Kz_x + sigma_squared * I)
    return P @ Kx @ P.T
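Similarly, a hedged toy call for the conditional kernel matrix K_X|Z, with the same helpers assumed in scope:

import numpy as np

A = np.random.randn(40, 3)
Kx = A @ A.T
Z = np.random.randn(40, 2)
K_x_given_z = residual_kernel_matrix_kernel_real(Kx, Z, num_eig=10)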
Example #16
def regression_distance(Y: np.ndarray, Z: np.ndarray, ard=True):
    """d(z,z') = |f(z)-f(z')| where Y=f(Z) + noise and f ~ GP"""
    import gpflow
    from gpflow.kernels import White, RBF
    from gpflow.models import GPR

    n, dims = Z.shape

    rbf = RBF(dims, ARD=ard)
    rbf_white = rbf + White(dims)

    gp_model = GPR(Z, Y, rbf_white)
    gpflow.train.ScipyOptimizer().minimize(gp_model)

    Kz_y = rbf.compute_K_symm(Z)
    Ry = pdinv(rbf_white.compute_K_symm(Z))
    Fy = Y.T @ Ry @ Kz_y  # F(z)

    M = Fy.T @ Fy
    O = np.ones((n, 1))
    N = O @ (np.diag(M)[:, None]).T
    D = np.sqrt(N + N.T - 2 * M)

    return D, Kz_y
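A toy call for regression_distance; beyond the local imports, only numpy and the pdinv helper are assumed:

import numpy as np

Z = np.random.randn(30, 2)
Y = np.sin(Z[:, :1]) + 0.1 * np.random.randn(30, 1)
D, Kz_y = regression_distance(Y, Z)
print(D.shape, Kz_y.shape)  # (30, 30) (30, 30)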
Example #17
    def _kernels_generator(self):
        def _determine_indices(m, pi, markov):
            # Build in the Markov structure: juggle with the indices of the outputs.
            p_last = pi - 1  # Index of last output that is given as input.
            p_start = 0 if markov is None else max(p_last - (markov - 1), 0)
            p_num = p_last - p_start + 1

            # Determine the indices corresponding to the outputs and inputs.
            m_inds = list(range(m))
            p_inds = list(range(m + p_start, m + p_last + 1))

            return m_inds, p_inds, p_num

        kernels = []
        for pi in range(self.num_outputs):
            m_inds, p_inds, p_num = _determine_indices(
                self.m, pi, self.model_config['markov'])
            # Construct inner-layers noise kernel
            kernel = White(variance=self.model_config['noise_inner'])
            # Initialize a non-linear kernel over the inputs
            scales = ([self.model_config['scale']] * self.m
                      if self.model_config['scale_tie']
                      else self.model_config['scale'])
            if self.model_config['rq']:
                kernel += RationalQuadratic(active_dims=m_inds,
                                            variance=1.0,
                                            lengthscales=scales,
                                            alpha=1e-2)
            else:
                kernel += SquaredExponential(active_dims=m_inds,
                                             variance=1.0,
                                             lengthscales=scales)
            # Add a periodic kernel over inputs
            # TODO: consider adding a decay component here?
            if self.model_config['per']:
                scales = [self.model_config['per_scale']] * self.m
                periods = [self.model_config['per_period']] * self.m
                base_kernel = SquaredExponential(active_dims=m_inds,
                                                 variance=1.0,
                                                 lengthscales=scales)
                kernel += Periodic(base_kernel, period=periods)

            # Add a linear kernel over inputs
            if self.model_config['input_linear']:
                variances = [self.model_config['input_linear_scale']] * self.m
                kernel += LinearKernel(active_dims=m_inds, variance=variances)
            # Add a linear kernel over outputs
            if self.model_config['linear'] and pi > 0:
                variances = [self.model_config['linear_scale']] * p_num
                kernel += LinearKernel(active_dims=p_inds, variance=variances)

            # Add a non-linear kernel over outputs
            if self.model_config['nonlinear'] and pi > 0:
                if self.model_config['nonlinear_dependent']:
                    active_dims = m_inds + p_inds  # concatenate; list.extend would return None
                    scales = [self.model_config['scale']] * self.m
                    scales.extend([self.model_config['nonlinear_scale']] *
                                  p_num)
                else:
                    active_dims = p_inds
                    scales = [self.model_config['nonlinear_scale']] * p_num
                if self.model_config['rq']:
                    kernel += RationalQuadratic(active_dims=active_dims,
                                                variance=1.0,
                                                lengthscales=scales,
                                                alpha=1e-2)
                else:
                    kernel += SquaredExponential(active_dims=active_dims,
                                                 variance=1.0,
                                                 lengthscales=scales)

            kernels.append(kernel)

        return kernels
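Relative to the temporal_kernel example above, this generator reads several additional model_config keys; an illustrative superset (all values hypothetical):

model_config = {
    'noise_inner': 1e-2,
    'markov': 1,                 # how many previous outputs each output conditions on
    'scale': 1.0, 'scale_tie': True, 'rq': False,
    'per': False, 'per_scale': 1.0, 'per_period': 1.0,
    'input_linear': False, 'input_linear_scale': 1.0,
    'linear': True, 'linear_scale': 1.0,           # linear kernel over outputs
    'nonlinear': False, 'nonlinear_scale': 1.0,    # non-linear kernel over outputs
    'nonlinear_dependent': False,
}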
Example #18
def prepare_model(name,
                  X,
                  y,
                  Z,
                  num_samples_train=5,
                  minibatch=None,
                  M=30,
                  small_architecture=True):
    """
    Initialize three layer deep GPs with different architectures,
    variational families, and inference methods.
    
    name can be one of {'fc', 'star', 'mf', 'fc_sampled'} and gives
    the fully-coupled, stripes-and-arrow, or mean-field dgp with analytical
    marginalisation of the inducing outputs, or the fully-coupled dgp with
    marginalisation by Monte Carlo sampling, respectively.
    
    The variational parameters are initialized as described in e.g.
    https://github.com/ICL-SML/Doubly-Stochastic-DGP/blob/master/demos/demo_regression_UCI.ipynb
    making the training more effective in the beginning.
    """
    # Prepare the kernels (3 layers):
    # use RBF kernels in all layers, plus white noise kernels in all but the last layer;
    # disable training the variance of the RBF kernel in the intermediate layers.
    # If small_architecture is True, use 2 GPs in each hidden layer, otherwise 5.
    dim_X = X.shape[1]

    k = RBF(dim_X, ARD=True, lengthscales=1)
    k.variance.set_trainable(False)
    k += White(dim_X, variance=1e-3)

    Ks = [k]
    if small_architecture:
        k = RBF(2, ARD=True, lengthscales=1)
        k.variance.set_trainable(False)
        k += White(2, variance=1e-3)
        Ks += [k, RBF(2, ARD=True, lengthscales=1)]
    else:
        k = RBF(5, ARD=True, lengthscales=1)
        k.variance.set_trainable(False)
        k += White(5, variance=1e-3)
        Ks += [k, RBF(5, ARD=True, lengthscales=1)]

    assert name in ['fc', 'star', 'mf',
                    'fc_sampled'], 'Unknown name of dgp model used'

    if name == 'fc':
        #fully-coupled
        model = Full_DGP(X,
                         y,
                         Z.copy(),
                         Ks.copy(),
                         Gaussian(0.01),
                         minibatch_size=minibatch,
                         num_samples=num_samples_train)
    elif name == 'star':
        #stripes-and-arrow
        model = Fast_Approx_Full_DGP(X,
                                     y,
                                     Z.copy(),
                                     Ks.copy(),
                                     Gaussian(0.01),
                                     stripes=True,
                                     arrow=True,
                                     minibatch_size=minibatch,
                                     num_samples=num_samples_train)
    elif name == 'mf':
        #mean-field
        model = Mean_Field_DGP(X,
                               y,
                               Z.copy(),
                               Ks.copy(),
                               Gaussian(0.01),
                               minibatch_size=minibatch,
                               num_samples=num_samples_train)
    elif name == 'fc_sampled':
        #fully-coupled with marginalisation by Monte Carlo sampling
        model = Full_DGP_Sampled(X,
                                 y,
                                 Z.copy(),
                                 Ks.copy(),
                                 Gaussian(0.01),
                                 minibatch_size=minibatch,
                                 num_samples=num_samples_train)

    if name in ['fc', 'fc_sampled']:
        #start the inner layers almost deterministically,
        #this is done by default for mf and star dgp
        SM_prior = model.layers.S_M_sqrt.value
        SM_det = block_diag(SM_prior[0, :-M, :-M] * 1e-5,
                            SM_prior[0, -M:, -M:])
        model.layers.S_M_sqrt = [SM_det]

    return model
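A hedged toy call; Full_DGP, Fast_Approx_Full_DGP, Mean_Field_DGP and Full_DGP_Sampled come from the fully-coupled DGP codebase this helper targets and are assumed importable:

import numpy as np

X = np.random.randn(200, 3)
y = np.random.randn(200, 1)
Z = X[:30].copy()  # M = 30 inducing inputs
model = prepare_model('mf', X, y, Z, minibatch=None, M=30)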
Example #19
def default_gp_kernel(X: np.ndarray):
    from gpflow.kernels import White, RBF

    _, n_feats = X.shape
    return RBF(n_feats, ARD=True) + White(n_feats)
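Usage is direct; the returned sum kernel can be passed straight to a gpflow 1.x model:

import numpy as np

X = np.random.randn(20, 4)
kernel = default_gp_kernel(X)  # RBF with ARD lengthscales plus White noise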
Example #20
def run_gp_optim(target_column: str, split_perc: float, imputation: str,
                 featureset: str):
    """
    Run whole GPR optimization loop
    :param target_column: target variable for predictions
    :param split_perc: percentage of samples to use for train set
    :param imputation: imputation method for missing values
    :param featureset: featureset to use
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column, split_perc=split_perc)
    # load datasets
    datasets = TrainHelper.load_datasets(config=config,
                                         target_column=target_column)
    # prepare parameter grid
    kernels = []
    base_kernels = [
        SquaredExponential(),
        Matern52(),
        White(),
        RationalQuadratic(),
        Polynomial()
    ]
    for kern in list(base_kernels):  # iterate over a copy; the loop appends to the list
        if isinstance(kern, IsotropicStationary):
            base_kernels.append(Periodic(kern, period=seasonal_periods))
    TrainHelper.extend_kernel_combinations(kernels=kernels,
                                           base_kernels=base_kernels)
    param_grid = {
        'dataset': datasets,
        'imputation': [imputation],
        'featureset': [featureset],
        'dim_reduction': ['None', 'pca'],
        'kernel': kernels,
        'mean_function': [None, gpflow.mean_functions.Constant()],
        'noise_variance': [0.01, 1, 10, 100],
        'optimizer': [gpflow.optimizers.Scipy()],
        'standardize_x': [False, True],
        'standardize_y': [False, True],
        'osa': [True]
    }
    # random sample from parameter grid
    params_lst = TrainHelper.random_sample_parameter_grid(
        param_grid=param_grid, sample_share=0.2)

    doc_results = None
    best_rmse = 5000000.0
    best_mape = 5000000.0
    best_smape = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'

    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = (None if params_lst[i]['dim_reduction'] == 'None'
                         else params_lst[i]['dim_reduction'])
        # deepcopy to prevent impact of previous optimizations
        kernel = gpflow.utilities.deepcopy(params_lst[i]['kernel'])
        mean_fct = gpflow.utilities.deepcopy(params_lst[i]['mean_function'])
        noise_var = params_lst[i]['noise_variance']
        optimizer = gpflow.utilities.deepcopy(params_lst[i]['optimizer'])
        stand_x = params_lst[i]['standardize_x']
        stand_y = params_lst[i]['standardize_y']
        one_step_ahead = params_lst[i]['osa']

        # dim_reduction only done without NaNs
        if imputation is None and dim_reduction is not None:
            continue
        # dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue

        if not ((dataset.name == dataset_last_name) and
                (imputation == imputation_last) and
                (dim_reduction == dim_reduction_last) and
                (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' +
                  str('None' if imputation is None else imputation) + ' ' +
                  str('None' if dim_reduction is None else dim_reduction) +
                  ' ' + featureset + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(
                dataset=dataset,
                config=config,
                init_train_len=init_train_len,
                test_len=test_len,
                split_perc=split_perc,
                imputation=imputation,
                target_column=target_column,
                dimensionality_reduction=dim_reduction,
                featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
                best_mape = 5000000.0
                best_smape = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset

        kernel_string, mean_fct_string, optimizer_string = get_docresults_strings(
            kernel=kernel, mean_function=mean_fct, optimizer=optimizer)
        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelsGPR.GaussianProcessRegressionGPFlow(
                    target_column=target_column,
                    seasonal_periods=seasonal_periods,
                    kernel=kernel,
                    mean_function=mean_fct,
                    noise_variance=noise_var,
                    optimizer=optimizer,
                    standardize_x=stand_x,
                    standardize_y=stand_y,
                    one_step_ahead=one_step_ahead)
                cross_val_dict = model.train(train=train, cross_val_call=False)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {
                k: v / len(train_test_list)
                for k, v in sum_dict.items()
            }
            params_dict = {
                'dataset': dataset.name,
                'featureset': featureset,
                'imputation': str('None' if imputation is None else imputation),
                'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                'init_train_len': init_train_len,
                'test_len': test_len,
                'split_perc': split_perc,
                'kernel': kernel_string,
                'mean_function': mean_fct_string,
                'noise_variance': noise_var,
                'optimizer': optimizer_string,
                'standardize_x': stand_x,
                'standardize_y': stand_y,
                'one_step_ahead': one_step_ahead,
                'optim_mod_params': model.model.parameters
            }
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse, best_mape, best_smape = TrainHelper.print_best_vals(
                evaluation_dict=evaluation_dict,
                best_rmse=best_rmse,
                best_mape=best_mape,
                best_smape=best_smape,
                run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            # print(exc)
            params_dict = {
                'dataset': 'Failure',
                'featureset': featureset,
                'imputation': str('None' if imputation is None else imputation),
                'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                'init_train_len': init_train_len,
                'test_len': test_len,
                'split_perc': split_perc,
                'kernel': kernel_string,
                'mean_function': mean_fct_string,
                'noise_variance': noise_var,
                'optimizer': optimizer_string,
                'standardize_x': stand_x,
                'standardize_y': stand_y,
                'one_step_ahead': one_step_ahead,
                'optim_mod_params': 'failed'
            }
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results,
                                 save_dir=base_dir + 'OptimResults/',
                                 company_model_desc='gpr',
                                 target_column=target_column,
                                 seasonal_periods=seasonal_periods,
                                 datasets=datasets,
                                 featuresets=param_grid['featureset'],
                                 imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')
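An illustrative invocation; all argument values are hypothetical and depend on the Configs/dataset_specific_config.ini setup the function reads:

run_gp_optim(target_column='demand',
             split_perc=0.8,
             imputation='mean',
             featureset='full')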
Example #21
def main(args):
    datasets = Datasets(data_path=args.data_path)

    # prepare output files
    outname1 = args.results_dir + args.dataset + '_' + str(args.num_layers) + '_'\
            + str(args.num_inducing) + '.rmse'
    if not os.path.exists(os.path.dirname(outname1)):
        os.makedirs(os.path.dirname(outname1))
    outfile1 = open(outname1, 'w')

    outname2 = args.results_dir + args.dataset + '_' + str(args.num_layers) + '_'\
            + str(args.num_inducing) + '.nll'
    outfile2 = open(outname2, 'w')

    outname3 = args.results_dir + args.dataset + '_' + str(args.num_layers) + '_'\
            + str(args.num_inducing) + '.time'
    outfile3 = open(outname3, 'w')

    # =========================================================================
    # CROSS-VALIDATION LOOP
    # =========================================================================
    running_err = 0
    running_loss = 0
    running_time = 0
    test_errs = np.zeros(args.splits)
    test_nlls = np.zeros(args.splits)
    test_times = np.zeros(args.splits)
    for i in range(args.splits):
        # =====================================================================
        # MODEL CONSTRUCTION
        # =====================================================================
        print('Split: {}'.format(i))
        print('Getting dataset...')
        # get dataset
        data = datasets.all_datasets[args.dataset].get_data(
            i, normalize=args.normalize_data)
        X, Y, Xs, Ys, Y_std = [
            data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']
        ]

        # inducing points via k-means
        Z = kmeans2(X, args.num_inducing, minit='points')[0]

        # set up batches
        batch_size = args.M if args.M < X.shape[0] else X.shape[0]
        train_dataset = tf.data.Dataset.from_tensor_slices((X, Y)).repeat()\
            .prefetch(X.shape[0]//2)\
            .shuffle(buffer_size=(X.shape[0]//2))\
            .batch(batch_size)

        print('Setting up DGP model...')
        kernels = []
        dims = []

        # hidden_dim = min(args.max_dim, X.shape[1])
        hidden_dim = X.shape[1] if X.shape[1] < args.max_dim else args.max_dim
        for l in range(args.num_layers):
            if l == 0:
                dim = X.shape[1]
                dims.append(dim)
            else:
                dim = hidden_dim
                dims.append(dim)

            if args.ard:
                # SE kernel with lengthscale per dimension
                kernels.append(
                    SquaredExponential(lengthscale=[1.] * dim) +
                    White(variance=1e-5))
            else:
                # SE kernel with single lengthscale
                kernels.append(
                    SquaredExponential(lengthscale=1.) + White(variance=1e-5))

        # output dim
        dims.append(Y.shape[1])

        dgp_model = DGP(X,
                        Y,
                        Z,
                        dims,
                        kernels,
                        Gaussian(variance=0.05),
                        num_samples=args.num_samples,
                        num_data=X.shape[0])

        # initialise inner layers almost deterministically
        for layer in dgp_model.layers[:-1]:
            layer.q_sqrt = Parameter(layer.q_sqrt.value() * 1e-5,
                                     transform=triangular())

        # =====================================================================
        # TRAINING
        # =====================================================================
        optimiser = tf.optimizers.Adam(args.learning_rate)

        print('Training DGP model...')
        t0 = time.time()
        # training loop
        monitored_training_loop(dgp_model,
                                train_dataset,
                                optimiser=optimiser,
                                logdir=args.log_dir,
                                iterations=args.iterations,
                                logging_iter_freq=args.logging_iter_freq)
        t1 = time.time()

        # =====================================================================
        # TESTING
        # =====================================================================
        test_times[i] = t1 - t0
        print('Time taken to train: {}'.format(t1 - t0))
        outfile3.write('Split {}: {}\n'.format(i + 1, t1 - t0))
        outfile3.flush()
        os.fsync(outfile3.fileno())
        running_time += t1 - t0

        # minibatch test predictions
        means, vars = [], []
        test_batch_size = args.test_batch_size
        if len(Xs) > test_batch_size:
            for mb in range(-(-len(Xs) // test_batch_size)):
                m, v = dgp_model.predict_y(Xs[mb * test_batch_size:(mb + 1) *
                                              test_batch_size, :],
                                           num_samples=args.test_samples)
                means.append(m)
                vars.append(v)
        else:
            m, v = dgp_model.predict_y(Xs, num_samples=args.test_samples)
            means.append(m)
            vars.append(v)

        mean_SND = np.concatenate(means, 1)  # [S, N, D]
        var_SND = np.concatenate(vars, 1)  # [S, N, D]
        mean_ND = np.mean(mean_SND, 0)  # [N, D]

        # rmse
        test_err = np.mean(Y_std * np.mean((Ys - mean_ND)**2.0)**0.5)
        test_errs[i] = test_err
        print('Average RMSE: {}'.format(test_err))
        outfile1.write('Split {}: {}\n'.format(i + 1, test_err))
        outfile1.flush()
        os.fsync(outfile1.fileno())
        running_err += test_err

        # nll
        test_nll = np.mean(
            logsumexp(norm.logpdf(Ys * Y_std, mean_SND * Y_std,
                                  var_SND**0.5 * Y_std),
                      0,
                      b=1 / float(args.test_samples)))
        test_nlls[i] = test_nll
        print('Average test log likelihood: {}'.format(test_nll))
        outfile2.write('Split {}: {}\n'.format(i + 1, test_nll))
        outfile2.flush()
        os.fsync(outfile2.fileno())
        running_loss += test_nll

    outfile1.write('Average: {}\n'.format(running_err / args.splits))
    outfile1.write('Standard deviation: {}\n'.format(np.std(test_errs)))
    outfile2.write('Average: {}\n'.format(running_loss / args.splits))
    outfile2.write('Standard deviation: {}\n'.format(np.std(test_nlls)))
    outfile3.write('Average: {}\n'.format(running_time / args.splits))
    outfile3.write('Standard deviation: {}\n'.format(np.std(test_times)))
    outfile1.close()
    outfile2.close()
    outfile3.close()
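For reference, a sketch of the argparse namespace main() reads; the attribute names are taken from the body above, the values are illustrative only:

from types import SimpleNamespace

args = SimpleNamespace(
    data_path='./data/', results_dir='./results/', dataset='boston',
    num_layers=2, num_inducing=100, splits=5, normalize_data=True,
    M=1000, max_dim=30, ard=False, num_samples=1,
    learning_rate=0.01, iterations=10000, log_dir='./logs/',
    logging_iter_freq=100, test_batch_size=1000, test_samples=50)
main(args)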
Example #22
def get_test_error(i,
                   dataset,
                   alpha,
                   learning_rate=0.001,
                   iterations=20000,
                   white=True,
                   normalized=True,
                   num_inducing=100,
                   beta=None,
                   gamma=None,
                   div_weights=None):
    """STEP (1) Read in the data via the helpful 'Dataset' object"""
    data = datasets.all_datasets[dataset].get_data(seed=0, split=i, prop=0.9)
    X_train, Y_train, X_test, Y_test, Y_std = [
        data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']
    ]
    print('N: {}, D: {}, Ns: {}, Y_std: {}'.format(X_train.shape[0],
                                                   X_train.shape[1],
                                                   X_test.shape[0], Y_std))

    Z = kmeans2(X_train, num_inducing, minit='points')[0]

    #Dimensionality of X
    D = X_train.shape[1]

    # the layer shapes are defined by the kernel dims, so here all
    # hidden layers are D dimensional
    kernels = []
    for l in range(L):
        kernels.append(RBF(D))

    # between layer noise (doesn't actually make much difference but we include it anyway)
    for kernel in kernels[:-1]:
        kernel += White(D, variance=1e-5)

    mb = 1000 if X_train.shape[0] > 1000 else None

    # get the likelihood model (possibly a robust one)
    if gamma is None and beta is None:
        #standard likelihood
        lklh = Gaussian()
    elif beta is not None and gamma is None:
        #beta-divergence robustified likelihood
        lklh = betaDivGaussian(beta)
    elif gamma is not None and beta is None:
        #gamma-divergence robustified likelihood
        lklh = gammaDivGaussian(gamma)
    else:
        print(
            "ERROR! You have specified both beta and gamma. Either specify " +
            "both as None (for standard Gaussian likelihood) or one of them " +
            "as None (to use the other)")
        sys.exit()
    """STEP (2): Call 'DGP' for split i, which together with ADAM is 
                 responsible for the inference"""
    model = DGP(
        X_train,
        Y_train,
        Z,
        kernels,
        lklh,
        num_samples=K,
        minibatch_size=mb,
        alpha=alpha,
        white=white,
        div_weights=div_weights)

    # start the inner layers almost deterministically
    for layer in model.layers[:-1]:
        layer.q_sqrt = layer.q_sqrt.value * 1e-5

    #Build functions for evaluation test errors
    S = 100

    def batch_assess(model, assess_model, X, Y):
        n_batches = max(int(X.shape[0] / 1000.), 1)
        lik, sq_diff = [], []
        for X_batch, Y_batch in zip(np.array_split(X, n_batches),
                                    np.array_split(Y, n_batches)):
            l, sq = assess_model(model, X_batch, Y_batch)
            lik.append(l)
            sq_diff.append(sq)
        lik = np.concatenate(lik, 0)
        sq_diff = np.array(np.concatenate(sq_diff, 0), dtype=float)
        return np.average(lik), np.average(sq_diff)**0.5

    def assess_single_layer(model, X_batch, Y_batch):
        m, v = model.predict_y(X_batch)
        lik = np.sum(
            norm.logpdf(Y_batch * Y_std, loc=m * Y_std, scale=Y_std * v**0.5),
            1)
        sq_diff = Y_std**2 * ((m - Y_batch)**2)
        return lik, sq_diff

    def assess_sampled(model, X_batch, Y_batch):
        m, v = model.predict_y(X_batch, S)
        S_lik = np.sum(
            norm.logpdf(Y_batch * Y_std, loc=m * Y_std, scale=Y_std * v**0.5),
            2)
        lik = logsumexp(S_lik, 0, b=1 / float(S))

        mean = np.average(m, 0)
        sq_diff = Y_std**2 * ((mean - Y_batch)**2)
        return lik, sq_diff

    #Get start time
    start_time = time.time()

    #Fit to training set via ADAM
    np.random.seed(1)
    AdamOptimizer(learning_rate).minimize(model, maxiter=iterations)

    #get running time
    running_time = time.time() - start_time
    s = 'time: {:.4f},  lik: {:.4f}, rmse: {:.4f}'
    """STEP (3): Extract and return test performancee metrics to 'main'."""
    #Get test errors
    lik, rmse = batch_assess(model, assess_sampled, X_test, Y_test)
    print(s.format(running_time, lik, rmse))

    return -lik, rmse, running_time
Example #23
X, Y, Xs, Ys, Y_std = [data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']]

print('############################ {} L={} split={}'.format(
    dataset_name, L, split))
print('N: {}, D: {}, Ns: {}'.format(X.shape[0], X.shape[1], Xs.shape[0]))

Z = kmeans2(X, 100, minit='points')[0]

D = X.shape[1]

kernels = []
for l in range(L):
    kernels.append(RBF(D))

for kernel in kernels[:-1]:
    kernel += White(D, variance=2e-6)

mb = minibatch_size if X.shape[0] > minibatch_size else None
model = DGP(X, Y, Z, kernels, Gaussian(), num_samples=1, minibatch_size=mb)

# start the inner layers almost deterministically
for layer in model.layers[:-1]:
    layer.q_sqrt = layer.q_sqrt.value * 1e-5
model.likelihood.variance = 0.05

global_step = tf.Variable(0, trainable=False, name="global_step")
model.enquire_session().run(global_step.initializer)

s = "{}/{}_L{}_split{}".format(results_path, dataset_name, L, split)
fw = tf.summary.FileWriter(s, model.enquire_session().graph)
Example #24
def main(args):
    datasets = Datasets(data_path=args.data_path)

    # Prepare output files
    outname1 = '../tmp/' + args.dataset + '_' + str(args.num_layers) + '_'\
            + str(args.num_inducing) + '.nll'
    if not os.path.exists(os.path.dirname(outname1)):
        os.makedirs(os.path.dirname(outname1))
    outfile1 = open(outname1, 'w')
    outname2 = '../tmp/' + args.dataset + '_' + str(args.num_layers) + '_'\
            + str(args.num_inducing) + '.time'
    outfile2 = open(outname2, 'w')

    running_loss = 0
    running_time = 0
    for i in range(args.splits):
        print('Split: {}'.format(i))
        print('Getting dataset...')
        data = datasets.all_datasets[args.dataset].get_data(i)
        X, Y, Xs, Ys, Y_std = [
            data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']
        ]
        Z = kmeans2(X, args.num_inducing, minit='points')[0]

        # set up batches
        batch_size = args.M if args.M < X.shape[0] else X.shape[0]
        train_dataset = tf.data.Dataset.from_tensor_slices((X, Y)).repeat()\
                .prefetch(X.shape[0]//2)\
                .shuffle(buffer_size=(X.shape[0]//2))\
                .batch(batch_size)

        print('Setting up DGP model...')
        kernels = []
        for l in range(args.num_layers):
            kernels.append(SquaredExponential() + White(variance=1e-5))

        dgp_model = DGP(X.shape[1],
                        kernels,
                        Gaussian(variance=0.05),
                        Z,
                        num_outputs=Y.shape[1],
                        num_samples=args.num_samples,
                        num_data=X.shape[0])

        # initialise inner layers almost deterministically
        for layer in dgp_model.layers[:-1]:
            layer.q_sqrt = Parameter(layer.q_sqrt.value() * 1e-5,
                                     transform=triangular())

        optimiser = tf.optimizers.Adam(args.learning_rate)

        def optimisation_step(model, X, Y):
            with tf.GradientTape() as tape:
                tape.watch(model.trainable_variables)
                obj = -model.elbo(X, Y, full_cov=False)
                grad = tape.gradient(obj, model.trainable_variables)
            optimiser.apply_gradients(zip(grad, model.trainable_variables))

        def monitored_training_loop(model, train_dataset, logdir, iterations,
                                    logging_iter_freq):
            # TODO: use tensorboard to log trainables and performance
            tf_optimisation_step = tf.function(optimisation_step)
            batches = iter(train_dataset)

            for i in range(iterations):
                X, Y = next(batches)
                tf_optimisation_step(model, X, Y)

                iter_id = i + 1
                if iter_id % logging_iter_freq == 0:
                    tf.print(
                        f'Epoch {iter_id}: ELBO (batch) {model.elbo(X, Y)}')

        print('Training DGP model...')
        t0 = time.time()
        monitored_training_loop(dgp_model,
                                train_dataset,
                                logdir=args.log_dir,
                                iterations=args.iterations,
                                logging_iter_freq=args.logging_iter_freq)
        t1 = time.time()
        print('Time taken to train: {}'.format(t1 - t0))
        outfile2.write('Split {}: {}\n'.format(i + 1, t1 - t0))
        outfile2.flush()
        os.fsync(outfile2.fileno())
        running_time += t1 - t0

        m, v = dgp_model.predict_y(Xs, num_samples=args.test_samples)
        test_nll = np.mean(
            logsumexp(norm.logpdf(Ys * Y_std, m * Y_std, v**0.5 * Y_std),
                      0,
                      b=1 / float(args.test_samples)))
        print('Average test log likelihood: {}'.format(test_nll))
        outfile1.write('Split {}: {}\n'.format(i + 1, test_nll))
        outfile1.flush()
        os.fsync(outfile1.fileno())
        running_loss += test_nll

    outfile1.write('Average: {}\n'.format(running_loss / args.splits))
    outfile2.write('Average: {}\n'.format(running_time / args.splits))
    outfile1.close()
    outfile2.close()
Example #25
def main(args):
    datasets = Datasets(data_path=args.data_path)

    # prepare output files
    outname1 = '../svgp_ard_tmp/svgp_ard_' + args.dataset + '_' + str(
        args.num_inducing) + '.rmse'
    if not os.path.exists(os.path.dirname(outname1)):
        os.makedirs(os.path.dirname(outname1))
    outfile1 = open(outname1, 'w')

    outname2 = '../svgp_ard_tmp/svgp_ard_' + args.dataset + '_' + str(
        args.num_inducing) + '.nll'
    outfile2 = open(outname2, 'w')

    outname3 = '../svgp_ard_tmp/svgp_ard_' + args.dataset + '_' + str(
        args.num_inducing) + '.time'
    outfile3 = open(outname3, 'w')

    # =========================================================================
    # CROSS-VALIDATION LOOP
    # =========================================================================
    running_err = 0
    running_loss = 0
    running_time = 0
    test_errs = np.zeros(args.splits)
    test_nlls = np.zeros(args.splits)
    test_times = np.zeros(args.splits)
    for i in range(args.splits):
        # =====================================================================
        # MODEL CONSTRUCTION
        # =====================================================================
        print('Split: {}'.format(i))
        print('Getting dataset...')
        data = datasets.all_datasets[args.dataset].get_data(
            i, normalize=args.normalize_data)
        X, Y, Xs, Ys, Y_std = [
            data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']
        ]
        Z = kmeans2(X, args.num_inducing, minit='points')[0]

        # set up batches
        batch_size = args.batch_size if args.batch_size < X.shape[0]\
            else X.shape[0]
        train_dataset = tf.data.Dataset.from_tensor_slices((X, Y)).repeat()\
            .prefetch(X.shape[0]//2)\
            .shuffle(buffer_size=(X.shape[0]//2))\
            .batch(batch_size)

        print('Setting up SVGP model...')
        if args.ard:
            # SE kernel with lengthscale per dimension
            kernel = SquaredExponential(lengthscale=[1.] *
                                        X.shape[1]) + White(variance=1e-5)
        else:
            # SE kernel with single lengthscale
            kernel = SquaredExponential(lengthscale=1.) + White(variance=1e-5)
        likelihood = Gaussian(variance=0.05)

        model = gpflow.models.SVGP(kernel=kernel,
                                   likelihood=likelihood,
                                   inducing_variable=Z)

        # =====================================================================
        # TRAINING
        # =====================================================================
        print('Training SVGP model...')
        optimiser = tf.optimizers.Adam(args.learning_rate)
        t0 = time.time()
        monitored_training_loop(model,
                                train_dataset,
                                optimiser=optimiser,
                                logdir=args.log_dir,
                                iterations=args.iterations,
                                logging_iter_freq=args.logging_iter_freq)
        t1 = time.time()

        # =====================================================================
        # TESTING
        # =====================================================================
        test_times[i] = t1 - t0
        print('Time taken to train: {}'.format(t1 - t0))
        outfile3.write('Split {}: {}\n'.format(i + 1, t1 - t0))
        outfile3.flush()
        os.fsync(outfile3.fileno())
        running_time += t1 - t0

        # minibatch test predictions
        means, vars = [], []
        test_batch_size = args.test_batch_size
        if len(Xs) > test_batch_size:
            for mb in range(-(-len(Xs) // test_batch_size)):
                m, v = model.predict_y(Xs[mb * test_batch_size:(mb + 1) *
                                          test_batch_size, :])
                means.append(m)
                vars.append(v)
        else:
            m, v = model.predict_y(Xs)
            means.append(m)
            vars.append(v)

        mean_ND = np.concatenate(means, 0)  # [N, D]
        var_ND = np.concatenate(vars, 0)  # [N, D]

        # rmse
        test_err = np.mean(Y_std * np.mean((Ys - mean_ND)**2.0)**0.5)
        test_errs[i] = test_err
        print('Average RMSE: {}'.format(test_err))
        outfile1.write('Split {}: {}\n'.format(i + 1, test_err))
        outfile1.flush()
        os.fsync(outfile1.fileno())
        running_err += test_err

        # nll
        test_nll = np.mean(
            norm.logpdf(Ys * Y_std, mean_ND * Y_std, var_ND**0.5 * Y_std))
        test_nlls[i] = test_nll
        print('Average test log likelihood: {}'.format(test_nll))
        outfile2.write('Split {}: {}\n'.format(i + 1, test_nll))
        outfile2.flush()
        os.fsync(outfile2.fileno())
        running_loss += test_nll

    outfile1.write('Average: {}\n'.format(running_err / args.splits))
    outfile1.write('Standard deviation: {}\n'.format(np.std(test_errs)))
    outfile2.write('Average: {}\n'.format(running_loss / args.splits))
    outfile2.write('Standard deviation: {}\n'.format(np.std(test_nlls)))
    outfile3.write('Average: {}\n'.format(running_time / args.splits))
    outfile3.write('Standard deviation: {}\n'.format(np.std(test_times)))
    outfile1.close()
    outfile2.close()
    outfile3.close()