Example #1
def test_mixed_mok_with_Id_vs_independent_mok(session_tf):
    data = DataMixedKernelWithEye
    # Independent model
    k1 = mk.SharedIndependentMok(RBF(data.D, variance=0.5, lengthscales=1.2),
                                 data.L)
    f1 = InducingPoints(data.X[:data.M, ...].copy())
    m1 = SVGP(data.X,
              data.Y,
              k1,
              Gaussian(),
              f1,
              q_mu=data.mu_data_full,
              q_sqrt=data.sqrt_data_full)
    m1.set_trainable(False)
    m1.q_sqrt.set_trainable(True)
    gpflow.training.ScipyOptimizer().minimize(m1, maxiter=data.MAXITER)

    # Mixed Model
    kern_list = [
        RBF(data.D, variance=0.5, lengthscales=1.2) for _ in range(data.L)
    ]
    k2 = mk.SeparateMixedMok(kern_list, data.W)
    f2 = InducingPoints(data.X[:data.M, ...].copy())
    m2 = SVGP(data.X,
              data.Y,
              k2,
              Gaussian(),
              f2,
              q_mu=data.mu_data_full,
              q_sqrt=data.sqrt_data_full)
    m2.set_trainable(False)
    m2.q_sqrt.set_trainable(True)
    gpflow.training.ScipyOptimizer().minimize(m2, maxiter=data.MAXITER)

    check_equality_predictions(session_tf, [m1, m2])
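The comparison above hinges on a SeparateMixedMok with an identity-like mixing matrix W collapsing to independent outputs, which is presumably what DataMixedKernelWithEye encodes. A minimal sketch of that pairing, assuming the GPflow 1.x multioutput API used by these examples and made-up sizes:

import numpy as np
import gpflow.multioutput.kernels as mk
from gpflow.kernels import RBF

D, L = 2, 3
W_eye = np.eye(L)  # identity mixing: each output keeps its own latent GP
k_shared = mk.SharedIndependentMok(RBF(D, variance=0.5, lengthscales=1.2), L)
k_mixed = mk.SeparateMixedMok(
    [RBF(D, variance=0.5, lengthscales=1.2) for _ in range(L)], W_eye)
# With W = I and identical per-output kernels, both objects describe the same
# L uncorrelated outputs, which is what the test checks after training.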
Example #2
def test_compare_mixed_kernel(session_tf):
    data = DataMixedKernel

    kern_list = [RBF(data.D) for _ in range(data.L)]
    k1 = mk.SeparateMixedMok(kern_list, W=data.W)
    f1 = mf.SharedIndependentMof(InducingPoints(data.X[:data.M, ...].copy()))
    m1 = SVGP(data.X,
              data.Y,
              k1,
              Gaussian(),
              feat=f1,
              q_mu=data.mu_data,
              q_sqrt=data.sqrt_data)

    kern_list = [RBF(data.D) for _ in range(data.L)]
    k2 = mk.SeparateMixedMok(kern_list, W=data.W)
    f2 = mf.MixedKernelSharedMof(InducingPoints(data.X[:data.M, ...].copy()))
    m2 = SVGP(data.X,
              data.Y,
              k2,
              Gaussian(),
              feat=f2,
              q_mu=data.mu_data,
              q_sqrt=data.sqrt_data)

    check_equality_predictions(session_tf, [m1, m2])
Example #3
def test_multioutput_with_diag_q_sqrt(session_tf):
    data = DataMixedKernel

    q_sqrt_diag = np.ones((data.M, data.L)) * 2
    q_sqrt = np.repeat(np.eye(data.M)[None, ...], data.L,
                       axis=0) * 2  # L x M x M

    kern_list = [RBF(data.D) for _ in range(data.L)]
    k1 = mk.SeparateMixedMok(kern_list, W=data.W)
    f1 = mf.SharedIndependentMof(InducingPoints(data.X[:data.M, ...].copy()))
    m1 = SVGP(data.X,
              data.Y,
              k1,
              Gaussian(),
              feat=f1,
              q_mu=data.mu_data,
              q_sqrt=q_sqrt_diag,
              q_diag=True)

    kern_list = [RBF(data.D) for _ in range(data.L)]
    k2 = mk.SeparateMixedMok(kern_list, W=data.W)
    f2 = mf.SharedIndependentMof(InducingPoints(data.X[:data.M, ...].copy()))
    m2 = SVGP(data.X,
              data.Y,
              k2,
              Gaussian(),
              feat=f2,
              q_mu=data.mu_data,
              q_sqrt=q_sqrt,
              q_diag=False)

    check_equality_predictions(session_tf, [m1, m2])
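A quick sanity check on the two q_sqrt parametrisations compared above (plain NumPy, made-up sizes): the columns of the (M, L) diagonal form are exactly the diagonals of the L full (M, M) factors.

import numpy as np

M, L = 3, 2
q_sqrt_diag = np.ones((M, L)) * 2                              # M x L
q_sqrt_full = np.repeat(np.eye(M)[None, ...], L, axis=0) * 2   # L x M x M
diags = np.stack([np.diag(q_sqrt_full[l]) for l in range(L)], axis=1)  # M x L
assert np.allclose(diags, q_sqrt_diag)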
Example #4
    def __init__(self,
                 layer_id,
                 kern,
                 U,
                 Z,
                 num_outputs,
                 mean_function,
                 white=False,
                 **kwargs):
        """
        A sparse variational GP layer in whitened representation. This layer holds the kernel,
        variational parameters, inducing points and mean function.
        The underlying model at inputs X is
        f = Lv + mean_function(X), where v \sim N(0, I) and LL^T = kern.K(X)
        The variational distribution over the inducing points is
        q(v) = N(q_mu, q_sqrt q_sqrt^T)
        The layer holds D_out independent GPs with the same kernel and inducing points.
        :param kern: The kernel for the layer (input_dim = D_in)
        :param Z: Inducing points (M, D_in)
        :param num_outputs: The number of GP outputs (q_mu is shape (M, num_outputs))
        :param mean_function: The mean function
        :return:
        """
        Layer.__init__(self, layer_id, U, num_outputs, **kwargs)

        #Initialize using kmeans

        self.dim_in = U[0].shape[1] if layer_id == 0 else num_outputs
        self.Z = Z if Z is not None else np.random.normal(
            0, 0.01, (100, self.dim_in))

        self.num_inducing = self.Z.shape[0]

        q_mu = np.zeros((self.num_inducing, num_outputs))
        self.q_mu = Parameter(q_mu)

        q_sqrt = np.tile(
            np.eye(self.num_inducing)[None, :, :], [num_outputs, 1, 1])
        transform = transforms.LowerTriangular(self.num_inducing,
                                               num_matrices=num_outputs)
        self.q_sqrt = Parameter(q_sqrt, transform=transform)

        self.feature = InducingPoints(self.Z)
        self.kern = kern
        self.mean_function = mean_function

        self.num_outputs = num_outputs
        self.white = white

        if not self.white:  # initialize to prior
            Ku = self.kern.compute_K_symm(self.Z)
            Lu = np.linalg.cholesky(Ku +
                                    np.eye(self.Z.shape[0]) * settings.jitter)
            self.q_sqrt = np.tile(Lu[None, :, :], [num_outputs, 1, 1])

        self.needs_build_cholesky = True
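The "initialize to prior" branch above sets q_sqrt to a tiled Cholesky factor of Kuu. A standalone NumPy sketch of that step, with a hand-rolled unit-variance RBF Gram matrix standing in for kern.compute_K_symm and made-up sizes:

import numpy as np

M, D_out, jitter = 5, 3, 1e-6
Z = np.random.randn(M, 2)
sqdist = np.sum((Z[:, None, :] - Z[None, :, :]) ** 2, axis=-1)
Ku = np.exp(-0.5 * sqdist)                        # unit-variance RBF Gram matrix at Z
Lu = np.linalg.cholesky(Ku + np.eye(M) * jitter)  # jittered Cholesky factor
q_sqrt = np.tile(Lu[None, :, :], [D_out, 1, 1])   # one copy per output GP: D_out x M x M
assert q_sqrt.shape == (D_out, M, M)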
Example #5
def test_separate_independent_mok(session_tf):
    """
    We use different independent kernels for each of the output dimensions.
    We can achieve this in two ways:
        1) efficient: SeparateIndependentMok with Shared/SeparateIndependentMof
        2) inefficient: SeparateIndependentMok with InducingPoints
    Both methods should nonetheless return the same conditional,
    and after optimization reach the same log likelihood.
    """
    # Model 1 (INefficient)
    q_mu_1 = np.random.randn(Data.M * Data.P, 1)
    q_sqrt_1 = np.tril(np.random.randn(Data.M * Data.P,
                                       Data.M * Data.P))[None,
                                                         ...]  # 1 x MP x MP
    kern_list_1 = [
        RBF(Data.D, variance=0.5, lengthscales=1.2) for _ in range(Data.P)
    ]
    kernel_1 = mk.SeparateIndependentMok(kern_list_1)
    feature_1 = InducingPoints(Data.X[:Data.M, ...].copy())
    m1 = SVGP(Data.X,
              Data.Y,
              kernel_1,
              Gaussian(),
              feature_1,
              q_mu=q_mu_1,
              q_sqrt=q_sqrt_1)
    m1.set_trainable(False)
    m1.q_sqrt.set_trainable(True)
    m1.q_mu.set_trainable(True)
    gpflow.training.ScipyOptimizer().minimize(m1, maxiter=Data.MAXITER)

    # Model 2 (efficient)
    q_mu_2 = np.random.randn(Data.M, Data.P)
    q_sqrt_2 = np.array([
        np.tril(np.random.randn(Data.M, Data.M)) for _ in range(Data.P)
    ])  # P x M x M
    kern_list_2 = [
        RBF(Data.D, variance=0.5, lengthscales=1.2) for _ in range(Data.P)
    ]
    kernel_2 = mk.SeparateIndependentMok(kern_list_2)
    feature_2 = mf.SharedIndependentMof(
        InducingPoints(Data.X[:Data.M, ...].copy()))
    m2 = SVGP(Data.X,
              Data.Y,
              kernel_2,
              Gaussian(),
              feature_2,
              q_mu=q_mu_2,
              q_sqrt=q_sqrt_2)
    m2.set_trainable(False)
    m2.q_sqrt.set_trainable(True)
    m2.q_mu.set_trainable(True)
    gpflow.training.ScipyOptimizer().minimize(m2, maxiter=Data.MAXITER)

    check_equality_predictions(session_tf, [m1, m2])
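Shape bookkeeping for the two parametrisations compared above (made-up sizes): the inefficient route treats all outputs as one stacked MP-dimensional GP, while the efficient route keeps P independent M-dimensional ones. The reshape mirrors the one used in test_shared_independent_mok further down this page.

import numpy as np

M, P = 4, 3
q_mu_full = np.random.randn(M * P, 1)                            # one stacked GP: MP x 1
q_sqrt_full = np.tril(np.random.randn(M * P, M * P))[None, ...]  # 1 x MP x MP
q_mu_indep = q_mu_full.reshape(M, P)                             # P independent GPs: M x P
assert q_mu_indep.shape == (M, P)
assert q_sqrt_full.shape == (1, M * P, M * P)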
Example #6
    def __init__(self, kern, Z, num_outputs, mean_function):
        """
        A sparse variational GP layer in whitened representation. This layer holds the kernel,
        variational parameters, inducing points and mean function.

        The underlying model at inputs X is
        f = Lv + mean_function(X), where v \sim N(0, I) and LL^T = kern.K(X)

        The variational distribution over the inducing points is
        q(v) = N(q_mu, q_sqrt q_sqrt^T)

        The layer holds D_out independent GPs with the same kernel and inducing points.

        :param kern: The kernel for the layer (input_dim = D_in)
        :param Z: Inducing points (M, D_in)
        :param num_outputs: Number of GP outputs D_out; q_mu is initialized to zeros
            of shape (M, D_out) and q_sqrt to identity factors of shape (D_out, M, M)
        :param mean_function: The mean function
        :return:
        """
        Parameterized.__init__(self)
        M = Z.shape[0]

        q_mu = np.zeros((M, num_outputs))
        self.q_mu = Parameter(q_mu)

        q_sqrt = np.tile(np.eye(M)[None, :, :], [num_outputs, 1, 1])
        transform = transforms.LowerTriangular(M, num_matrices=num_outputs)
        self.q_sqrt = Parameter(q_sqrt, transform=transform)

        self.feature = InducingPoints(Z)
        self.kern = kern
        self.mean_function = mean_function
Example #7
def test_sample_conditional_mixedkernel(session_tf):
    q_mu = np.random.randn(Data.M, Data.L)  # M x L
    q_sqrt = np.array([
        np.tril(np.random.randn(Data.M, Data.M)) for _ in range(Data.L)
    ])  # L x M x M
    Z = Data.X[:Data.M, ...]  # M x D
    N = int(10e5)
    Xs = np.ones((N, Data.D), dtype=float_type)

    values = {"Xnew": Xs, "q_mu": q_mu, "q_sqrt": q_sqrt}
    placeholders = _create_placeholder_dict(values)
    feed_dict = _create_feed_dict(placeholders, values)

    # Path 1: mixed kernel: most efficient route
    W = np.random.randn(Data.P, Data.L)
    mixed_kernel = mk.SeparateMixedMok([RBF(Data.D) for _ in range(Data.L)], W)
    mixed_feature = mf.MixedKernelSharedMof(InducingPoints(Z.copy()))

    sample = sample_conditional(placeholders["Xnew"],
                                mixed_feature,
                                mixed_kernel,
                                placeholders["q_mu"],
                                q_sqrt=placeholders["q_sqrt"],
                                white=True)
    value = session_tf.run(sample, feed_dict=feed_dict)

    # Path 2: independent kernels, mixed later
    separate_kernel = mk.SeparateIndependentMok(
        [RBF(Data.D) for _ in range(Data.L)])
    shared_feature = mf.SharedIndependentMof(InducingPoints(Z.copy()))
    sample2 = sample_conditional(placeholders["Xnew"],
                                 shared_feature,
                                 separate_kernel,
                                 placeholders["q_mu"],
                                 q_sqrt=placeholders["q_sqrt"],
                                 white=True)
    value2 = session_tf.run(sample2, feed_dict=feed_dict)
    value2 = np.matmul(value2, W.T)
    # check if mean and covariance of samples are similar
    np.testing.assert_array_almost_equal(np.mean(value, axis=0),
                                         np.mean(value2, axis=0),
                                         decimal=1)
    np.testing.assert_array_almost_equal(np.cov(value, rowvar=False),
                                         np.cov(value2, rowvar=False),
                                         decimal=1)
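The "mix afterwards" step above (value2 = np.matmul(value2, W.T)) works because mixing L independent latents g with W gives f = g W^T, whose covariance is W Cov(g) W^T. A tiny NumPy check of that identity, with made-up sizes and the same tolerance style as the test:

import numpy as np

rng = np.random.RandomState(0)
N, L, P = 100000, 2, 3
W = rng.randn(P, L)
g = rng.randn(N, L)                 # samples of L independent unit-variance latents
f = g @ W.T                         # mixed outputs, N x P
np.testing.assert_array_almost_equal(np.cov(f, rowvar=False), W @ W.T, decimal=1)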
Example #8
    def _build_model(self, Y_var, freqs, X, Y, kern_params=None, Z=None, q_mu = None, q_sqrt = None, M=None, P=None, L=None, W=None, num_data=None, jitter=1e-6, tec_scale=None, W_trainable=False, use_mc=False, **kwargs):
        """
        Build the model from the data.
        X,Y: tensors the X and Y of data

        Returns:
        gpflow.models.Model
        """

        settings.numerics.jitter = jitter

        with gp.defer_build():
            # Define the likelihood
            likelihood = ComplexHarmonicPhaseOnlyGaussianEncodedHetero(tec_scale=tec_scale)
#            likelihood.variance = 0.3**2#(5.*np.pi/180.)**2
#            likelihood_var = log_normal_solve((5.*np.pi/180.)**2, 0.5*(5.*np.pi/180.)**2)
#            likelihood.variance.prior = LogNormal(likelihood_var[0],likelihood_var[1]**2)
#            likelihood.variance.transform = gp.transforms.positiveRescale(np.exp(likelihood_var[0]))
            likelihood.variance.trainable = False


            q_mu = q_mu / tec_scale      # M, L
            q_sqrt = q_sqrt / tec_scale  # L, M, M



            kern = mk.SeparateMixedMok([self._build_kernel(None, None, None, #kern_params[l].w, kern_params[l].mu, kern_params[l].v, 
                kern_var = np.var(q_mu[:,l]), **kwargs.get("priors",{})) for l in range(L)], W)
            kern.W.trainable = W_trainable
            kern.W.prior = gp.priors.Gaussian(W, 0.01**2)

            
            feature = mf.MixedKernelSeparateMof([InducingPoints(Z) for _ in range(L)])
            mean = Zero()
            model = HeteroscedasticPhaseOnlySVGP(Y_var, freqs, X, Y, kern, likelihood, 
                        feat = feature,
                        mean_function=mean, 
                        minibatch_size=None,
                        num_latent = P, 
                        num_data = num_data,
                        whiten = False, 
                        q_mu = None, 
                        q_sqrt = None, 
                        q_diag = True)
            for feat in feature.feat_list:
                feat.Z.trainable = True #True
            model.q_mu.trainable = True
            model.q_mu.prior = gp.priors.Gaussian(0., 0.05**2)
            model.q_sqrt.trainable = True
#            model.q_sqrt.prior = gp.priors.Gaussian(0., (0.005/tec_scale)**2)
            model.compile()
            tf.summary.image('W',kern.W.constrained_tensor[None,:,:,None])
            tf.summary.image('q_mu',model.q_mu.constrained_tensor[None,:,:,None])
#            tf.summary.image('q_sqrt',model.q_sqrt.constrained_tensor[:,:,:,None])

            return model
Example #9
def test_sample_conditional(session_tf, whiten, full_cov, full_output_cov):
    q_mu = np.random.randn(Data.M, Data.P)  # M x P
    q_sqrt = np.array([
        np.tril(np.random.randn(Data.M, Data.M)) for _ in range(Data.P)
    ])  # P x M x M
    Z = Data.X[:Data.M, ...]  # M x D
    Xs = np.ones((Data.N, Data.D), dtype=float_type)

    feature = InducingPoints(Z.copy())
    kernel = RBF(Data.D)

    values = {"Z": Z, "Xnew": Xs, "q_mu": q_mu, "q_sqrt": q_sqrt}
    placeholders = _create_placeholder_dict(values)
    feed_dict = _create_feed_dict(placeholders, values)

    # Path 1
    sample_f = sample_conditional(placeholders["Xnew"],
                                  feature,
                                  kernel,
                                  placeholders["q_mu"],
                                  q_sqrt=placeholders["q_sqrt"],
                                  white=whiten,
                                  full_cov=full_cov,
                                  full_output_cov=full_output_cov,
                                  num_samples=int(1e5))
    value_f, mean_f, var_f = session_tf.run(sample_f, feed_dict=feed_dict)
    value_f = value_f.reshape((-1, ) + value_f.shape[2:])

    # Path 2
    if full_output_cov:
        pytest.skip(
            "sample_conditional with X instead of feature does not support full_output_cov"
        )

    sample_x = sample_conditional(placeholders["Xnew"],
                                  placeholders["Z"],
                                  kernel,
                                  placeholders["q_mu"],
                                  q_sqrt=placeholders["q_sqrt"],
                                  white=whiten,
                                  full_cov=full_cov,
                                  full_output_cov=full_output_cov,
                                  num_samples=int(1e5))
    value_x, mean_x, var_x = session_tf.run(sample_x, feed_dict=feed_dict)
    value_x = value_x.reshape((-1, ) + value_x.shape[2:])

    # check if mean and covariance of samples are similar
    np.testing.assert_array_almost_equal(np.mean(value_x, axis=0),
                                         np.mean(value_f, axis=0),
                                         decimal=1)
    np.testing.assert_array_almost_equal(np.cov(value_x, rowvar=False),
                                         np.cov(value_f, rowvar=False),
                                         decimal=1)
    np.testing.assert_allclose(mean_x, mean_f)
    np.testing.assert_allclose(var_x, var_f)
Example #10
    def __init__(self, kern, num_outputs, mean_function,
                Z=None,
                feature=None,
                white=False, input_prop_dim=None,
                q_mu=None,
                q_sqrt=None, **kwargs):
        """
        A sparse variational GP layer in whitened representation. This layer holds the kernel,
        variational parameters, inducing points and mean function.

        The underlying model at inputs X is
        f = Lv + mean_function(X), where v \sim N(0, I) and LL^T = kern.K(X)

        The variational distribution over the inducing points is
        q(v) = N(q_mu, q_sqrt q_sqrt^T)

        The layer holds D_out independent GPs with the same kernel and inducing points.

        :param kern: The kernel for the layer (input_dim = D_in)
        :param Z: Inducing points (M, D_in)
        :param num_outputs: The number of GP outputs (q_mu is shape (M, num_outputs))
        :param mean_function: The mean function
        :return:
        """
        Layer.__init__(self, input_prop_dim, **kwargs)
        if feature is None:
            feature = InducingPoints(Z)

        self.num_inducing = len(feature)

        self.feature = feature
        self.kern = kern
        self.mean_function = mean_function

        self.num_outputs = num_outputs
        self.white = white

        if q_mu is None:
            q_mu = np.zeros((self.num_inducing, num_outputs), dtype=settings.float_type)
        self.q_mu = Parameter(q_mu)

        if q_sqrt is None:
            if not self.white:  # initialize to prior
                with gpflow.params_as_tensors_for(feature):
                    Ku = conditionals.Kuu(feature, self.kern, jitter=settings.jitter)
                    Lu = tf.linalg.cholesky(Ku)
                    Lu = self.enquire_session().run(Lu)
                    q_sqrt = np.tile(Lu[None, :, :], [num_outputs, 1, 1])
            else:
                q_sqrt = np.tile(np.eye(self.num_inducing, dtype=settings.float_type)[None, :, :], [num_outputs, 1, 1])

        transform = transforms.LowerTriangular(self.num_inducing, num_matrices=num_outputs)
        self.q_sqrt = Parameter(q_sqrt, transform=transform)

        self.needs_build_cholesky = True
Example #11
    def _build_model(self, Y_var, X, Y, Z=None, q_mu = None, q_sqrt = None, M=None, P=None, L=None, W=None, num_data=None, jitter=1e-6, tec_scale=None, W_diag=False, **kwargs):
        """
        Build the model from the data.
        X,Y: tensors the X and Y of data

        Returns:
        gpflow.models.Model
        """

        
        settings.numerics.jitter = jitter

        with gp.defer_build():
            # Define the likelihood
            likelihood = GaussianTecHetero(tec_scale=tec_scale)
 
            q_mu = q_mu / tec_scale      # M, L
            q_sqrt = q_sqrt / tec_scale  # L, M, M

            kern = mk.SeparateMixedMok([self._build_kernel(kern_var = np.var(q_mu[:,l]), **kwargs.get("priors",{})) for l in range(L)], W)
            if W_diag:
#                kern.W.transform = Reshape(W.shape,(P,L,L))(gp.transforms.DiagMatrix(L)(gp.transforms.positive))
                kern.W.trainable = False
            else:
                kern.W.transform = Reshape(W.shape,(P//L,L,L))(MatrixSquare()(gp.transforms.LowerTriangular(L,P//L)))
                kern.W.trainable = True
            
            feature = mf.MixedKernelSeparateMof([InducingPoints(Z) for _ in range(L)])
            mean = Zero()
            model = HeteroscedasticTecSVGP(Y_var, X, Y, kern, likelihood, 
                        feat = feature,
                        mean_function=mean, 
                        minibatch_size=None,
                        num_latent = P, 
                        num_data = num_data,
                        whiten = False, 
                        q_mu = q_mu, 
                        q_sqrt = q_sqrt)
            for feat in feature.feat_list:
                feat.Z.trainable = True
            model.q_mu.trainable = True
            model.q_sqrt.trainable = True
#            model.q_sqrt.prior = gp.priors.Gaussian(q_sqrt, 0.005**2)
            model.compile()
            tf.summary.image('W',kern.W.constrained_tensor[None,:,:,None])
            tf.summary.image('q_mu',model.q_mu.constrained_tensor[None,:,:,None])
            tf.summary.image('q_sqrt',model.q_sqrt.constrained_tensor[:,:,:,None])
        return model
Example #12
    def __init__(self, kern, Z, num_outputs, mean_function, **kwargs):
        """
        A sparse variational GP layer with a Gaussian likelihood, where the 
        GP is integrated out

        :param kern: The kernel for the layer (input_dim = D_in)
        :param Z: Inducing points (M, D_in)
        :param mean_function: The mean function
        :return:
        """

        Collapsed_Layer.__init__(self, **kwargs)
        self.feature = InducingPoints(Z)
        self.kern = kern
        self.mean_function = mean_function
        self.num_outputs = num_outputs
Example #13
    def __init__(self, Z, mean_function, kern, num_latent=1, whiten=True, name=None):
        super(Latent, self).__init__(name=name)
        self.mean_function = mean_function
        self.kern = kern
        self.num_latent = num_latent
        M = Z.shape[0]
        # M = tf.print(M,[M,'any thing i want'],message='Debug message:',summarize=100)

        self.feature = InducingPoints(Z)
        num_inducing = len(self.feature)
        self.whiten = whiten

        self.q_mu = Parameter(np.zeros((num_inducing, self.num_latent), dtype=settings.float_type))

        q_sqrt = np.tile(np.eye(M)[None, :, :], [self.num_latent, 1, 1])
        transform = transforms.LowerTriangular(M, num_matrices=self.num_latent)
        self.q_sqrt = Parameter(q_sqrt, transform=transform)
Example #14
def test_sample_conditional(session_tf, whiten):
    q_mu = np.random.randn(Data.M, Data.P)  # M x P
    q_sqrt = np.array([
        np.tril(np.random.randn(Data.M, Data.M)) for _ in range(Data.P)
    ])  # P x M x M
    Z = Data.X[:Data.M, ...]  # M x D
    Xs = np.ones((int(10e5), Data.D), dtype=float_type)

    feature = InducingPoints(Z.copy())
    kernel = RBF(Data.D)

    values = {"Z": Z, "Xnew": Xs, "q_mu": q_mu, "q_sqrt": q_sqrt}
    placeholders = _create_placeholder_dict(values)
    feed_dict = _create_feed_dict(placeholders, values)

    # Path 1
    sample = sample_conditional(placeholders["Xnew"],
                                placeholders["Z"],
                                kernel,
                                placeholders["q_mu"],
                                q_sqrt=placeholders["q_sqrt"],
                                white=whiten)
    value = session_tf.run(sample, feed_dict=feed_dict)

    # Path 2
    sample2 = sample_conditional(placeholders["Xnew"],
                                 feature,
                                 kernel,
                                 placeholders["q_mu"],
                                 q_sqrt=placeholders["q_sqrt"],
                                 white=whiten)
    value2 = session_tf.run(sample2, feed_dict=feed_dict)

    # check if mean and covariance of samples are similar
    np.testing.assert_array_almost_equal(np.mean(value, axis=0),
                                         np.mean(value2, axis=0),
                                         decimal=1)
    np.testing.assert_array_almost_equal(np.cov(value, rowvar=False),
                                         np.cov(value2, rowvar=False),
                                         decimal=1)
Example #15
class SVGP_Layer(Layer):
    def __init__(self,
                 layer_id,
                 kern,
                 U,
                 Z,
                 num_outputs,
                 mean_function,
                 white=False,
                 **kwargs):
        """
        A sparse variational GP layer in whitened representation. This layer holds the kernel,
        variational parameters, inducing points and mean function.
        The underlying model at inputs X is
        f = Lv + mean_function(X), where v \sim N(0, I) and LL^T = kern.K(X)
        The variational distribution over the inducing points is
        q(v) = N(q_mu, q_sqrt q_sqrt^T)
        The layer holds D_out independent GPs with the same kernel and inducing points.
        :param kern: The kernel for the layer (input_dim = D_in)
        :param Z: Inducing points (M, D_in)
        :param num_outputs: The number of GP outputs (q_mu is shape (M, num_outputs))
        :param mean_function: The mean function
        :return:
        """
        Layer.__init__(self, layer_id, U, num_outputs, **kwargs)

        #Initialize using kmeans

        self.dim_in = U[0].shape[1] if layer_id == 0 else num_outputs
        self.Z = Z if Z is not None else np.random.normal(
            0, 0.01, (100, self.dim_in))

        self.num_inducing = self.Z.shape[0]

        q_mu = np.zeros((self.num_inducing, num_outputs))
        self.q_mu = Parameter(q_mu)

        q_sqrt = np.tile(
            np.eye(self.num_inducing)[None, :, :], [num_outputs, 1, 1])
        transform = transforms.LowerTriangular(self.num_inducing,
                                               num_matrices=num_outputs)
        self.q_sqrt = Parameter(q_sqrt, transform=transform)

        self.feature = InducingPoints(self.Z)
        self.kern = kern
        self.mean_function = mean_function

        self.num_outputs = num_outputs
        self.white = white

        if not self.white:  # initialize to prior
            Ku = self.kern.compute_K_symm(self.Z)
            Lu = np.linalg.cholesky(Ku +
                                    np.eye(self.Z.shape[0]) * settings.jitter)
            self.q_sqrt = np.tile(Lu[None, :, :], [num_outputs, 1, 1])

        self.needs_build_cholesky = True

    @params_as_tensors
    def build_cholesky_if_needed(self):
        # make sure we only compute this once
        if self.needs_build_cholesky:
            self.Ku = self.feature.Kuu(self.kern, jitter=settings.jitter)
            self.Lu = tf.cholesky(self.Ku)
            self.Ku_tiled = tf.tile(self.Ku[None, :, :],
                                    [self.num_outputs, 1, 1])
            self.Lu_tiled = tf.tile(self.Lu[None, :, :],
                                    [self.num_outputs, 1, 1])
            self.needs_build_cholesky = False

    def conditional_ND(self, X, full_cov=False):
        self.build_cholesky_if_needed()

        # mmean, vvar = conditional(X, self.feature.Z, self.kern,
        #             self.q_mu, q_sqrt=self.q_sqrt,
        #             full_cov=full_cov, white=self.white)
        Kuf = self.feature.Kuf(self.kern, X)

        A = tf.matrix_triangular_solve(self.Lu, Kuf, lower=True)
        if not self.white:
            A = tf.matrix_triangular_solve(tf.transpose(self.Lu),
                                           A,
                                           lower=False)

        mean = tf.matmul(A, self.q_mu, transpose_a=True)

        A_tiled = tf.tile(A[None, :, :], [self.num_outputs, 1, 1])
        I = tf.eye(self.num_inducing, dtype=settings.float_type)[None, :, :]

        if self.white:
            SK = -I
        else:
            SK = -self.Ku_tiled

        if self.q_sqrt is not None:
            SK += tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True)

        B = tf.matmul(SK, A_tiled)

        if full_cov:
            # (num_latent, num_X, num_X)
            delta_cov = tf.matmul(A_tiled, B, transpose_a=True)
            Kff = self.kern.K(X)
        else:
            # (num_latent, num_X)
            delta_cov = tf.reduce_sum(A_tiled * B, 1)
            Kff = self.kern.Kdiag(X)

        # either (1, num_X) + (num_latent, num_X) or (1, num_X, num_X) + (num_latent, num_X, num_X)
        var = tf.expand_dims(Kff, 0) + delta_cov
        var = tf.transpose(var)

        return mean + self.mean_function(X), var

    def KL(self):
        """
        The KL divergence from the variational distribution to the prior
        :return: KL divergence from N(q_mu, q_sqrt q_sqrt^T) to N(0, I) if whitened, and to N(0, Kuu) otherwise, independently for each GP
        """
        # if self.white:
        #     return gauss_kl(self.q_mu, self.q_sqrt)
        # else:
        #     return gauss_kl(self.q_mu, self.q_sqrt, self.Ku)

        self.build_cholesky_if_needed()

        KL = -0.5 * self.num_outputs * self.num_inducing
        KL -= 0.5 * tf.reduce_sum(tf.log(tf.matrix_diag_part(self.q_sqrt)**2))

        if not self.white:
            KL += tf.reduce_sum(tf.log(tf.matrix_diag_part(
                self.Lu))) * self.num_outputs
            KL += 0.5 * tf.reduce_sum(
                tf.square(
                    tf.matrix_triangular_solve(
                        self.Lu_tiled, self.q_sqrt, lower=True)))
            Kinv_m = tf.cholesky_solve(self.Lu, self.q_mu)
            KL += 0.5 * tf.reduce_sum(self.q_mu * Kinv_m)
        else:
            KL += 0.5 * tf.reduce_sum(tf.square(self.q_sqrt))
            KL += 0.5 * tf.reduce_sum(self.q_mu**2)

        return KL
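The whitened branch of KL() above is the closed form KL(N(m, LL^T) || N(0, I)) = 0.5 * (tr(LL^T) + m^T m - M - log|LL^T|), rearranged term by term. A small NumPy check for a single GP (made-up sizes):

import numpy as np

rng = np.random.RandomState(0)
M = 4
m = rng.randn(M, 1)
L = np.tril(rng.randn(M, M), -1) + np.diag(np.abs(rng.randn(M)) + 0.5)  # lower-triangular factor
S = L @ L.T
kl_closed = 0.5 * (np.trace(S) + float(m.T @ m) - M - np.linalg.slogdet(S)[1])
kl_as_coded = (-0.5 * M
               - 0.5 * np.sum(np.log(np.diag(L) ** 2))   # -0.5 * log|S|
               + 0.5 * np.sum(L ** 2)                    #  0.5 * tr(S)
               + 0.5 * np.sum(m ** 2))                   #  0.5 * m^T m
assert np.isclose(kl_closed, kl_as_coded)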
Example #16
def init_layers(graph_adj, node_feature, kernels, n_layers, all_layers_dim, num_inducing,
                gc_kernel=True, mean_function="linear", white=False, q_diag=False):

    assert mean_function in ["linear", "zero"]  # mean function must be linear or zero

    layers = []

    # get initial Z
    sparse_adj = tuple_to_sparse_matrix(graph_adj[0], graph_adj[1], graph_adj[2])
    X_running = node_feature.copy()

    for i in range(n_layers):

        tf.logging.info("initialize {}th layer".format(i + 1))

        dim_in = all_layers_dim[i]
        dim_out = all_layers_dim[i + 1]

        conv_X = sparse_adj.dot(X_running)
        Z_running = kmeans2(conv_X, num_inducing[i], minit="points")[0]

        kernel = kernels[i]

        if gc_kernel and kernel.gc_weight:
            # Z_running = pca(Z_running, kernel.base_kernel.input_dim)  # reduce the dimensionality to match the output dimension
            X_dim = X_running.shape[1]
            kernel_input_dim = kernel.base_kernel.input_dim
            if X_dim > kernel_input_dim:
                Z_running = pca(Z_running, kernel.base_kernel.input_dim)  # reduce the dimensionality to match the output dimension
            elif X_dim < kernel_input_dim:
                Z_running = np.concatenate([Z_running, np.zeros((Z_running.shape[0], kernel_input_dim - X_dim))], axis=1)

        # print(type(Z_running))
        # print(Z_running)

        if dim_in > dim_out:
            _, _, V = np.linalg.svd(X_running, full_matrices=False)
            W = V[:dim_out, :].T
        elif dim_in < dim_out:
            W = np.concatenate([np.eye(dim_in), np.zeros((dim_in, dim_out - dim_in))], 1)

        if mean_function == "zero":
            mf = Zero()
        else:

            if dim_in == dim_out:
                mf = Identity()
            else:
                mf = Linear(W)
                mf.set_trainable(False)

        # self.Ku = Kuu(GraphConvolutionInducingpoints(Z_running), kernel, jitter=settings.jitter)
        # print("successfully calculate Ku")
        if gc_kernel:
            feature = GraphConvolutionInducingpoints(Z_running)
        else:
            feature = InducingPoints(Z_running)

        layers.append(svgp_layer(kernel, Z_running, feature, dim_out, mf, gc_kernel, white=white, q_diag=q_diag))

        if dim_in != dim_out:
            # Z_running = Z_running.dot(W)
            X_running = X_running.dot(W)

    return layers
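The dimension-matching logic above picks the linear mean function's weight matrix from an SVD when narrowing and zero-pads an identity when widening. A standalone NumPy sketch of both branches, with made-up sizes:

import numpy as np

rng = np.random.RandomState(0)
X_running = rng.randn(50, 5)

dim_in, dim_out = 5, 3                       # narrowing: project onto top right-singular vectors
_, _, V = np.linalg.svd(X_running, full_matrices=False)
W = V[:dim_out, :].T
assert W.shape == (dim_in, dim_out)

dim_in, dim_out = 3, 5                       # widening: identity padded with zero columns
W = np.concatenate([np.eye(dim_in), np.zeros((dim_in, dim_out - dim_in))], 1)
assert W.shape == (dim_in, dim_out)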
Example #17
    def build_model(self,
                    ARGS,
                    X,
                    Y,
                    conditioning=False,
                    apply_name=True,
                    noise_var=None,
                    mean_function=None):

        if conditioning == False:
            N, D = X.shape

            # first layer inducing points
            if N > ARGS.M:
                Z = kmeans2(X, ARGS.M, minit='points')[0]
            else:
                # This is the old way of initializing Zs
                # M_pad = ARGS.M - N
                # Z = np.concatenate([X.copy(), np.random.randn(M_pad, D)], 0)

                # This is the new way of initializing Zs
                min_x, max_x = self.bounds[0]
                min_x = (min_x - self.x_mean) / self.x_std
                max_x = (max_x - self.x_mean) / self.x_std

                Z = np.linspace(min_x, max_x, num=ARGS.M)  # * X.shape[1])
                Z = Z.reshape((-1, X.shape[1]))
                #print(min_x)
                #print(max_x)
                #print(Z)

            #################################### layers
            P = np.linalg.svd(X, full_matrices=False)[2]
            # PX = P.copy()

            layers = []
            # quad_layers = []

            DX = D
            DY = 1

            D_in = D
            D_out = D

            with defer_build():

                # variance initialization
                lik = Gaussian()
                lik.variance = ARGS.likelihood_variance

                if len(ARGS.configuration) > 0:
                    for c, d in ARGS.configuration.split('_'):
                        if c == 'G':
                            num_gps = int(d)
                            A = np.zeros((D_in, D_out))
                            D_min = min(D_in, D_out)
                            A[:D_min, :D_min] = np.eye(D_min)
                            mf = Linear(A=A)
                            mf.b.set_trainable(False)

                            def make_kern():
                                k = RBF(D_in,
                                        lengthscales=float(D_in)**0.5,
                                        variance=1.,
                                        ARD=True)
                                k.variance.set_trainable(False)
                                return k

                            PP = np.zeros((D_out, num_gps))
                            PP[:, :min(num_gps, DX)] = P[:, :min(num_gps, DX)]
                            ZZ = np.random.randn(ARGS.M, D_in)
                            # print(Z.shape)
                            # print(ZZ.shape)
                            ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]

                            kern = SharedMixedMok(make_kern(), W=PP)
                            inducing = MixedKernelSharedMof(InducingPoints(ZZ))

                            l = GPLayer(kern,
                                        inducing,
                                        num_gps,
                                        mean_function=mf)
                            if ARGS.fix_linear is True:
                                kern.W.set_trainable(False)
                                mf.set_trainable(False)

                            layers.append(l)

                            D_in = D_out

                        elif c == 'L':
                            d = int(d)
                            D_in += d
                            layers.append(LatentVariableLayer(d,
                                                              XY_dim=DX + 1))

                # kernel initialization
                kern = RBF(D_in,
                           lengthscales=float(D_in)**0.5,
                           variance=1.,
                           ARD=True)
                ZZ = np.random.randn(ARGS.M, D_in)
                ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]
                layers.append(GPLayer(kern, InducingPoints(ZZ), DY))
                self.layers = layers
                self.lik = lik

            # global_step = tf.Variable(0, dtype=tf.int32)
            # self.global_step = global_step
        else:
            lik = self._gp.likelihood
            layers = self._gp.layers._list
            # val = self.session.run(self.global_step)
            # global_step = tf.Variable(val, dtype=tf.int32)
            # self.global_step = global_step
            self._gp.clear()

        with defer_build():

            #################################### model
            name = 'Model' if apply_name else None

            if ARGS.mode == 'VI':
                model = DGP_VI(X,
                               Y,
                               layers,
                               lik,
                               minibatch_size=ARGS.minibatch_size,
                               name=name)

            elif ARGS.mode == 'SGHMC':
                for layer in layers:
                    if hasattr(layer, 'q_sqrt'):
                        del layer.q_sqrt
                        layer.q_sqrt = None
                        layer.q_mu.set_trainable(False)

                model = DGP_VI(X,
                               Y,
                               layers,
                               lik,
                               minibatch_size=ARGS.minibatch_size,
                               name=name)

            elif ARGS.mode == 'IWAE':
                model = DGP_IWVI(X,
                                 Y,
                                 layers,
                                 lik,
                                 minibatch_size=ARGS.minibatch_size,
                                 num_samples=ARGS.num_IW_samples,
                                 name=name)

        global_step = tf.Variable(0, dtype=tf.int32)
        op_increment = tf.assign_add(global_step, 1)

        if not ('SGHMC' == ARGS.mode):
            for layer in model.layers[:-1]:
                if isinstance(layer, GPLayer):
                    layer.q_sqrt = layer.q_sqrt.read_value() * 1e-5

            model.compile()

            #################################### optimization

            var_list = [[model.layers[-1].q_mu, model.layers[-1].q_sqrt]]

            model.layers[-1].q_mu.set_trainable(False)
            model.layers[-1].q_sqrt.set_trainable(False)

            gamma = tf.cast(tf.train.exponential_decay(ARGS.gamma,
                                                       global_step,
                                                       1000,
                                                       ARGS.gamma_decay,
                                                       staircase=True),
                            dtype=tf.float64)
            lr = tf.cast(tf.train.exponential_decay(ARGS.lr,
                                                    global_step,
                                                    1000,
                                                    ARGS.lr_decay,
                                                    staircase=True),
                         dtype=tf.float64)

            op_ng = NatGradOptimizer(gamma=gamma).make_optimize_tensor(
                model, var_list=var_list)

            op_adam = AdamOptimizer(lr).make_optimize_tensor(model)

            def train(s):
                s.run(op_increment)
                s.run(op_ng)
                s.run(op_adam)

            model.train_op = train
            model.init_op = lambda s: s.run(
                tf.variables_initializer([global_step]))
            model.global_step = global_step

        else:
            model.compile()

            sghmc_vars = []
            for layer in layers:
                if hasattr(layer, 'q_mu'):
                    sghmc_vars.append(layer.q_mu.unconstrained_tensor)

            hyper_train_op = AdamOptimizer(ARGS.lr).make_optimize_tensor(model)

            self.sghmc_optimizer = SGHMC(model, sghmc_vars, hyper_train_op,
                                         100)

            def train_op(s):
                s.run(op_increment),
                self.sghmc_optimizer.sghmc_step(s),
                self.sghmc_optimizer.train_hypers(s)

            model.train_op = train_op
            model.sghmc_optimizer = self.sghmc_optimizer

            def init_op(s):
                epsilon = 0.01
                mdecay = 0.05
                with tf.variable_scope('sghmc'):
                    self.sghmc_optimizer.generate_update_step(epsilon, mdecay)
                v = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                      scope='sghmc')
                s.run(tf.variables_initializer(v))
                s.run(tf.variables_initializer([global_step]))

            # Added jitter due to input matrix invertibility problems
            custom_config = gpflow.settings.get_settings()
            custom_config.numerics.jitter_level = 1e-8

            model.init_op = init_op
            model.global_step = global_step

        # build the computation graph for the gradient
        self.X_placeholder = tf.placeholder(tf.float64,
                                            shape=[None, X.shape[1]])
        self.Fs, Fmu, Fvar = model._build_predict(self.X_placeholder)
        self.mean_grad = tf.gradients(Fmu, self.X_placeholder)
        self.var_grad = tf.gradients(Fvar, self.X_placeholder)

        # calculated the gradient of the mean for the quantile-filtered distribution
        # print(Fs)
        # q = np.quantile(Fs, self.quantile, axis=0)
        # qFs = [f for f in Fs if f < q]
        # q_mean = np.mean(qFs, axis=0)
        # q_var = np.var(qFs, axis=0)
        # self.qmean_grad = tf.gradients(q_mean, self.X_placeholder)
        # self.qvar_grad = tf.gradients(q_var, self.X_placeholder)

        return model
Example #18
def test_separate_independent_mof(session_tf):
    """
    Same test as above but we use different (i.e. separate) inducing features
    for each of the output dimensions.
    """
    np.random.seed(0)

    # Model 1 (INefficient)
    q_mu_1 = np.random.randn(Data.M * Data.P, 1)
    q_sqrt_1 = np.tril(np.random.randn(Data.M * Data.P,
                                       Data.M * Data.P))[None,
                                                         ...]  # 1 x MP x MP
    kernel_1 = mk.SharedIndependentMok(
        RBF(Data.D, variance=0.5, lengthscales=1.2), Data.P)
    feature_1 = InducingPoints(Data.X[:Data.M, ...].copy())
    m1 = SVGP(Data.X,
              Data.Y,
              kernel_1,
              Gaussian(),
              feature_1,
              q_mu=q_mu_1,
              q_sqrt=q_sqrt_1)
    m1.set_trainable(False)
    m1.q_sqrt.set_trainable(True)
    m1.q_mu.set_trainable(True)
    gpflow.training.ScipyOptimizer().minimize(m1, maxiter=Data.MAXITER)

    # Model 2 (efficient)
    q_mu_2 = np.random.randn(Data.M, Data.P)
    q_sqrt_2 = np.array([
        np.tril(np.random.randn(Data.M, Data.M)) for _ in range(Data.P)
    ])  # P x M x M
    kernel_2 = mk.SharedIndependentMok(
        RBF(Data.D, variance=0.5, lengthscales=1.2), Data.P)
    feat_list_2 = [
        InducingPoints(Data.X[:Data.M, ...].copy()) for _ in range(Data.P)
    ]
    feature_2 = mf.SeparateIndependentMof(feat_list_2)
    m2 = SVGP(Data.X,
              Data.Y,
              kernel_2,
              Gaussian(),
              feature_2,
              q_mu=q_mu_2,
              q_sqrt=q_sqrt_2)
    m2.set_trainable(False)
    m2.q_sqrt.set_trainable(True)
    m2.q_mu.set_trainable(True)
    gpflow.training.ScipyOptimizer().minimize(m2, maxiter=Data.MAXITER)

    # Model 3 (Inefficient): an identical feature is used P times,
    # and treated as a separate feature.
    q_mu_3 = np.random.randn(Data.M, Data.P)
    q_sqrt_3 = np.array([
        np.tril(np.random.randn(Data.M, Data.M)) for _ in range(Data.P)
    ])  # P x M x M
    kern_list = [
        RBF(Data.D, variance=0.5, lengthscales=1.2) for _ in range(Data.P)
    ]
    kernel_3 = mk.SeparateIndependentMok(kern_list)
    feat_list_3 = [
        InducingPoints(Data.X[:Data.M, ...].copy()) for _ in range(Data.P)
    ]
    feature_3 = mf.SeparateIndependentMof(feat_list_3)
    m3 = SVGP(Data.X,
              Data.Y,
              kernel_3,
              Gaussian(),
              feature_3,
              q_mu=q_mu_3,
              q_sqrt=q_sqrt_3)
    m3.set_trainable(False)
    m3.q_sqrt.set_trainable(True)
    m3.q_mu.set_trainable(True)
    gpflow.training.ScipyOptimizer().minimize(m3, maxiter=Data.MAXITER)

    check_equality_predictions(session_tf, [m1, m2, m3])
Example #19
def test_shared_independent_mok(session_tf):
    """
    In this test we use the same kernel and the same inducing features
    for each of the outputs. The outputs are considered to be uncorrelated.
    This is how GPflow handled multiple outputs before the multioutput framework was added.
    We compare three models here:
        1) an inefficient one, where we use a SharedIndependentMok with InducingPoints.
           This combination uses a Kff of size N x P x N x P and a Kfu of size N x P x M x P,
           which is extremely inefficient as most of the elements are zero.
        2) efficient: SharedIndependentMok and SharedIndependentMof.
           This combination uses the most efficient form of the matrices.
        3) the old but still efficient way: using a plain Kernel with InducingPoints.
        Models 2) and 3) follow more or less the same code path.
    """
    # Model 1
    q_mu_1 = np.random.randn(Data.M * Data.P, 1)  # MP x 1
    q_sqrt_1 = np.tril(np.random.randn(Data.M * Data.P,
                                       Data.M * Data.P))[None,
                                                         ...]  # 1 x MP x MP
    kernel_1 = mk.SharedIndependentMok(
        RBF(Data.D, variance=0.5, lengthscales=1.2), Data.P)
    feature_1 = InducingPoints(Data.X[:Data.M, ...].copy())
    m1 = SVGP(Data.X,
              Data.Y,
              kernel_1,
              Gaussian(),
              feature_1,
              q_mu=q_mu_1,
              q_sqrt=q_sqrt_1)
    m1.set_trainable(False)
    m1.q_sqrt.set_trainable(True)
    gpflow.training.ScipyOptimizer().minimize(m1, maxiter=Data.MAXITER)

    # Model 2
    q_mu_2 = np.reshape(q_mu_1, [Data.M, Data.P])  # M x P
    q_sqrt_2 = np.array([
        np.tril(np.random.randn(Data.M, Data.M)) for _ in range(Data.P)
    ])  # P x M x M
    kernel_2 = RBF(Data.D, variance=0.5, lengthscales=1.2)
    feature_2 = InducingPoints(Data.X[:Data.M, ...].copy())
    m2 = SVGP(Data.X,
              Data.Y,
              kernel_2,
              Gaussian(),
              feature_2,
              q_mu=q_mu_2,
              q_sqrt=q_sqrt_2)
    m2.set_trainable(False)
    m2.q_sqrt.set_trainable(True)
    gpflow.training.ScipyOptimizer().minimize(m2, maxiter=Data.MAXITER)

    # Model 3
    q_mu_3 = np.reshape(q_mu_1, [Data.M, Data.P])  # M x P
    q_sqrt_3 = np.array([
        np.tril(np.random.randn(Data.M, Data.M)) for _ in range(Data.P)
    ])  # P x M x M
    kernel_3 = mk.SharedIndependentMok(
        RBF(Data.D, variance=0.5, lengthscales=1.2), Data.P)
    feature_3 = mf.SharedIndependentMof(
        InducingPoints(Data.X[:Data.M, ...].copy()))
    m3 = SVGP(Data.X,
              Data.Y,
              kernel_3,
              Gaussian(),
              feature_3,
              q_mu=q_mu_3,
              q_sqrt=q_sqrt_3)
    m3.set_trainable(False)
    m3.q_sqrt.set_trainable(True)
    gpflow.training.ScipyOptimizer().minimize(m3, maxiter=Data.MAXITER)

    check_equality_predictions(session_tf, [m1, m2, m3])
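A minimal, self-contained version of the "efficient" Model 3 setup above, assuming the GPflow 1.x import paths used by these examples and made-up toy data; the variational parameters use the same M x P / P x M x M shapes as in the test:

import numpy as np
import gpflow
import gpflow.multioutput.features as mf
import gpflow.multioutput.kernels as mk
from gpflow.features import InducingPoints

N, D, M, P = 100, 2, 10, 3
X = np.random.randn(N, D)
Y = np.random.randn(N, P)

kernel = mk.SharedIndependentMok(gpflow.kernels.RBF(D, variance=0.5, lengthscales=1.2), P)
feature = mf.SharedIndependentMof(InducingPoints(X[:M].copy()))
q_mu = np.zeros((M, P))                                   # M x P
q_sqrt = np.tile(np.eye(M)[None, :, :], [P, 1, 1])        # P x M x M
model = gpflow.models.SVGP(X, Y, kernel, gpflow.likelihoods.Gaussian(),
                           feat=feature, q_mu=q_mu, q_sqrt=q_sqrt)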
Example #20
class SVGP_Layer(Layer):
    def __init__(self,
                 kern,
                 Z,
                 num_outputs,
                 mean_function,
                 white=False,
                 input_prop_dim=None,
                 **kwargs):
        """
        A sparse variational GP layer in whitened representation. This layer holds the kernel,
        variational parameters, inducing points and mean function.

        The underlying model at inputs X is
        f = Lv + mean_function(X), where v \sim N(0, I) and LL^T = kern.K(X)

        The variational distribution over the inducing points is
        q(v) = N(q_mu, q_sqrt q_sqrt^T)

        The layer holds D_out independent GPs with the same kernel and inducing points.

        :param kern: The kernel for the layer (input_dim = D_in)
        :param Z: Inducing points (M, D_in)
        :param num_outputs: The number of GP outputs (q_mu is shape (M, num_outputs))
        :param mean_function: The mean function
        :return:
        """
        Layer.__init__(self, input_prop_dim, **kwargs)
        self.num_inducing = Z.shape[0]

        q_mu = np.zeros((self.num_inducing, num_outputs))
        self.q_mu = Parameter(q_mu)

        q_sqrt = np.tile(
            np.eye(self.num_inducing)[None, :, :], [num_outputs, 1, 1])
        transform = transforms.LowerTriangular(self.num_inducing,
                                               num_matrices=num_outputs)
        self.q_sqrt = Parameter(q_sqrt, transform=transform)

        self.feature = InducingPoints(Z)
        self.kern = kern
        self.mean_function = mean_function

        self.num_outputs = num_outputs
        self.white = white  #tf.constant(white, shape=(), dtype = tf.bool)        #white #

        if not self.white:  # initialize to prior
            Ku = self.kern.compute_K_symm(Z)
            Lu = np.linalg.cholesky(Ku + np.eye(Z.shape[0]) * settings.jitter)
            self.q_sqrt = np.tile(Lu[None, :, :], [num_outputs, 1, 1])

        self.needs_build_cholesky = True

    @params_as_tensors
    def build_cholesky_if_needed(self):
        # make sure we only compute this once
        if self.needs_build_cholesky:
            self.Ku = self.feature.Kuu(self.kern, jitter=settings.jitter)
            self.Lu = tf.cholesky(self.Ku)
            self.Ku_tiled = tf.tile(self.Ku[None, :, :],
                                    [self.num_outputs, 1, 1])
            self.Lu_tiled = tf.tile(self.Lu[None, :, :],
                                    [self.num_outputs, 1, 1])
            #also compute K_inverse and it's det
            if not self.white:
                inp_ = (self.Ku + tf.eye(self.num_inducing, dtype=tf.float64) *
                        settings.jitter * 10)
                self.K_inv = tf.linalg.inv(tf.cast(inp_, dtype=tf.float64))

            self.needs_build_cholesky = False

    def conditional_ND(self, X, full_cov=False):
        self.build_cholesky_if_needed()

        # mmean, vvar = conditional(X, self.feature.Z, self.kern,
        #             self.q_mu, q_sqrt=self.q_sqrt,
        #             full_cov=full_cov, white=self.white)
        Kuf = self.feature.Kuf(self.kern, X)

        A = tf.matrix_triangular_solve(self.Lu, Kuf, lower=True)
        if not self.white:
            A = tf.matrix_triangular_solve(tf.transpose(self.Lu),
                                           A,
                                           lower=False)

        mean = tf.matmul(A, self.q_mu, transpose_a=True)

        A_tiled = tf.tile(A[None, :, :], [self.num_outputs, 1, 1])
        I = tf.eye(self.num_inducing, dtype=settings.float_type)[None, :, :]

        if self.white:
            SK = -I
        else:
            SK = -self.Ku_tiled

        if self.q_sqrt is not None:
            SK += tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True)

        B = tf.matmul(SK, A_tiled)

        if full_cov:
            # (num_latent, num_X, num_X)
            delta_cov = tf.matmul(A_tiled, B, transpose_a=True)
            Kff = self.kern.K(X)
        else:
            # (num_latent, num_X)
            delta_cov = tf.reduce_sum(A_tiled * B, 1)
            Kff = self.kern.Kdiag(X)

        # either (1, num_X) + (num_latent, num_X) or (1, num_X, num_X) + (num_latent, num_X, num_X)
        var = tf.expand_dims(Kff, 0) + delta_cov
        var = tf.transpose(var)

        return mean + self.mean_function(X), var

    def KL(self):
        """
        The KL divergence from the variational distribution to the prior
        (notation in the paper: KL[q(u)||p(u)]), or, if self.alpha is set,
        the alpha-Renyi divergence from the variational distribution to the prior.

        :return: KL divergence from N(q_mu, q_sqrt * q_sqrt^T) to N(0, I) (if whitened)
                and to N(mu(Z), K(Z)) otherwise, independently for each GP
        """
        # if self.white:
        #     return gauss_kl(self.q_mu, self.q_sqrt)
        # else:
        #     return gauss_kl(self.q_mu, self.q_sqrt, self.Ku)
        #

        self.build_cholesky_if_needed()
        if self.alpha is None:
            """Get KL regularizer"""

            KL = -0.5 * self.num_outputs * self.num_inducing
            KL -= 0.5 * tf.reduce_sum(
                tf.log(tf.matrix_diag_part(self.q_sqrt)**2))

            if not self.white:
                # Whitening is relative to the prior. Here, the prior is NOT
                # whitened, meaning that we have N(0, K(Z,Z)) as prior.
                KL += tf.reduce_sum(tf.log(tf.matrix_diag_part(
                    self.Lu))) * self.num_outputs
                KL += 0.5 * tf.reduce_sum(
                    tf.square(
                        tf.matrix_triangular_solve(
                            self.Lu_tiled, self.q_sqrt, lower=True)))
                Kinv_m = tf.cholesky_solve(self.Lu, self.q_mu)
                KL += 0.5 * tf.reduce_sum(self.q_mu * Kinv_m)
            else:
                KL += 0.5 * tf.reduce_sum(tf.square(self.q_sqrt))
                KL += 0.5 * tf.reduce_sum(self.q_mu**2)

            return self.weight * KL
        else:
            """Get AR regularizer. For the normal, this means 
            log(Normalizing Constant[alpha * eta_q + (1-alpha) * eta_0 ]) - 
            alpha*log(Normalizing Constant[eta_q]) - 
            (1-alpha)*log(Normalizing Constant[eta_0]).
            
            NOTE: the 2*pi factor will cancel, as well as the 0.5 * factor.
            NOTE: q_sqrt is s.t. q_sqrt * q_sqrt^T = variational variance, i.e.
                  q(v) = N(q_mu, q_sqrt q_sqrt^T).
            NOTE: self.Lu is cholesky decomp of self.Ku
            NOTE: self.feature are the inducing points Z, and 
                  self.Ku = self.feature.Kuu(kernel), meaning that self.Ku is
                  the kernel matrix computed at the inducing points Z.
            NOTE: We need the alpha-renyi div between prior and GP-variational
                  posterior for EACH of the GPs in this layer.
                  
            Shapes:
                q_sqrt:                                 13 x 100 x 100
                q_mu:                                   100 x 13
                tf.matrix_diag_part(self.q_sqrt):       13 x 100
                q_sqrt_inv:                             13 x 100 x 100
                Ku, Lu:                                 100 x 100
                num_inducing:                           100
                num_outputs:                            13
            """

            #convenience
            alpha = self.alpha

            #INEFFICIENT, can probably be done much better with cholesky solve
            inp_ = (tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True) +
                    tf.eye(self.num_inducing, dtype=tf.float64) *
                    settings.jitter * 100)
            q_inv = tf.linalg.inv(tf.cast(inp_, dtype=tf.float64))

            #gives Sigma_q^-1 * mu_q
            q_var_x_q_mu = tf.matmul(
                q_inv,
                tf.reshape(self.q_mu,
                           shape=(self.num_outputs, self.num_inducing, 1)))

            #Get the two log-normalizers for the variational posteriors
            q_component_1 = 0.5 * tf.reduce_sum(
                tf.log(tf.matrix_diag_part(self.q_sqrt)**2))
            q_component_2 = 0.5 * tf.reduce_sum(q_var_x_q_mu * self.q_mu)

            logZq = (q_component_1 + q_component_2)

            if not self.white:
                #prior using self.Lu, still 0 mean fct
                logZpi = 0.5 * tf.reduce_sum(
                    tf.log(tf.matrix_diag_part(self.Lu)**2)) * self.num_outputs

                new_Sigma_inv = (alpha * q_inv + (1.0 - alpha) * self.K_inv +
                                 tf.eye(self.num_inducing, dtype=tf.float64) *
                                 settings.jitter)  # +

            else:
                logZpi = 0.0  #* self.num_outputs * self.num_inducing  - but that is still 0.
                new_Sigma_inv = (alpha * q_inv +
                                 (1.0 - alpha + settings.jitter) *
                                 tf.eye(self.num_inducing, dtype=tf.float64))

            new_Sigma_inv_chol = tf.cholesky(tf.cast(new_Sigma_inv,
                                                     tf.float64))
            log_det = -tf.reduce_sum(
                tf.log(tf.matrix_diag_part(new_Sigma_inv_chol)**2))

            # Get the new inverse covariance of the exponential-family member
            # corresponding to alpha * eta_q + (1 - alpha) * eta_0.

            # Compute mu_new: we know that
            #   Sigma_new^{-1} * mu_new = alpha * Sigma_q^{-1} * mu_q + (1 - alpha) * 0,
            # so solve against Sigma_new^{-1} instead of forming Sigma_new explicitly.
            mu_new = tf.linalg.solve(
                tf.cast(new_Sigma_inv, dtype=tf.float64),
                tf.cast(alpha * q_var_x_q_mu, dtype=tf.float64))

            #Note: Sigma^{-1}_new * mu_new = Sigma^{-1}_q * mu_q, so
            #      mu_new' * Sigma^{-1}_new * mu_new = mu_new' * (Sigma^{-1}_q * mu_q)
            mu_new_x_new_Sigma_inv = tf.reduce_sum(alpha * q_var_x_q_mu *
                                                   mu_new)

            #Observing that log(|Sigma|) = - log(|Sigma|^-1), we can now get
            #the normalizing constant of the new exp. fam member.
            logZnew = (0.5 * mu_new_x_new_Sigma_inv + 0.5 * log_det)

            # Return the AR (alpha-Renyi) divergence between the Gaussians, i.e.
            # (1 / (alpha * (1 - alpha))) * log(D), where
            # D = new normalizer / (q normalizer^alpha * prior normalizer^(1 - alpha))
            AR = (1.0 / (alpha * (1.0 - alpha))) * (logZnew - alpha * logZq -
                                                    (1.0 - alpha) * logZpi)

            return self.weight * AR
Exemple #21
0
def build_model(ARGS, X, Y, apply_name=True):

    if ARGS.mode == 'CVAE':

        layers = []
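        # ARGS.configuration presumably encodes the CVAE hidden-layer widths as an
        # underscore-separated string (e.g. '500_500'); non-integer tokens are skipped.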
        for l in ARGS.configuration.split('_'):
            try:
                layers.append(int(l))
            except ValueError:
                pass

        with defer_build():
            name = 'CVAE' if apply_name else None
            model = CVAE(X, Y, 1, layers, batch_size=ARGS.minibatch_size, name=name)

        model.compile()

        global_step = tf.Variable(0, dtype=tf.int32)
        op_increment = tf.assign_add(global_step, 1)

        lr = tf.cast(tf.train.exponential_decay(ARGS.lr, global_step, 1000, 0.98, staircase=True), dtype=tf.float64)
        op_adam = AdamOptimizer(lr).make_optimize_tensor(model)

        model.train_op = lambda s: s.run([op_adam, op_increment])
        model.init_op = lambda s: s.run(tf.variables_initializer([global_step]))
        model.global_step = global_step

        model.compile()

    else:
        N, D = X.shape

        # first layer inducing points
        if N > ARGS.M:
            Z = kmeans2(X, ARGS.M, minit='points')[0]
        else:
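            # fewer data points than inducing points: use all of X and pad the
            # remaining inducing inputs with random draws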
            M_pad = ARGS.M - N
            Z = np.concatenate([X.copy(), np.random.randn(M_pad, D)], 0)

        #################################### layers
        P = np.linalg.svd(X, full_matrices=False)[2]
        # PX = P.copy()

        layers = []
        # quad_layers = []

        DX = D
        DY = 1

        D_in = D
        D_out = D
        with defer_build():
            lik = Gaussian()
            lik.variance = ARGS.likelihood_variance

            if len(ARGS.configuration) > 0:
                for c, d in ARGS.configuration.split('_'):
                    if c == 'G':
                        num_gps = int(d)
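                        # linear mean function initialised as a (truncated)
                        # identity map from D_in to D_out; the bias is kept fixed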
                        A = np.zeros((D_in, D_out))
                        D_min = min(D_in, D_out)
                        A[:D_min, :D_min] = np.eye(D_min)
                        mf = Linear(A=A)
                        mf.b.set_trainable(False)

                        def make_kern():
                            k = RBF(D_in, lengthscales=float(D_in) ** 0.5, variance=1., ARD=True)
                            k.variance.set_trainable(False)
                            return k

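                        # initialise the mixing matrix W from the SVD of X and
                        # seed the inducing inputs with Z on the dimensions that
                        # correspond to the observed inputs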
                        PP = np.zeros((D_out, num_gps))
                        PP[:, :min(num_gps, DX)] = P[:, :min(num_gps, DX)]
                        ZZ = np.random.randn(ARGS.M, D_in)
                        ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]

                        kern = SharedMixedMok(make_kern(), W=PP)
                        inducing = MixedKernelSharedMof(InducingPoints(ZZ))

                        l = GPLayer(kern, inducing, num_gps, mean_function=mf)
                        if ARGS.fix_linear is True:
                            kern.W.set_trainable(False)
                            mf.set_trainable(False)

                        layers.append(l)

                        D_in = D_out

                    elif c == 'L':
                        d = int(d)
                        D_in += d
                        layers.append(LatentVariableLayer(d, XY_dim=DX+1))

            kern = RBF(D_in, lengthscales=float(D_in)**0.5, variance=1., ARD=True)
            ZZ = np.random.randn(ARGS.M, D_in)
            ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]
            layers.append(GPLayer(kern, InducingPoints(ZZ), DY))


            #################################### model
            name = 'Model' if apply_name else None

            if ARGS.mode == 'VI':
                model = DGP_VI(X, Y, layers, lik,
                               minibatch_size=ARGS.minibatch_size,
                               name=name)

            elif ARGS.mode == 'SGHMC':
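                # SGHMC samples the inducing outputs directly, so the Gaussian
                # variational parameters are dropped and q_mu becomes the
                # sampler's state (see hmc_vars below)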
                for layer in layers:
                    if hasattr(layer, 'q_sqrt'):
                        del layer.q_sqrt
                        layer.q_sqrt = None
                        layer.q_mu.set_trainable(False)

                model = DGP_VI(X, Y, layers, lik,
                               minibatch_size=ARGS.minibatch_size,
                               name=name)


            elif ARGS.mode == 'IWAE':
                model = DGP_IWVI(X, Y, layers, lik,
                                 minibatch_size=ARGS.minibatch_size,
                                 num_samples=ARGS.num_IW_samples,
                                 name=name)



        global_step = tf.Variable(0, dtype=tf.int32)
        op_increment = tf.assign_add(global_step, 1)

        if ARGS.mode != 'SGHMC':
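            # initialise the inner-layer posteriors close to deterministic by
            # shrinking q_sqrt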
            for layer in model.layers[:-1]:
                if isinstance(layer, GPLayer):
                    layer.q_sqrt = layer.q_sqrt.read_value() * 1e-5

            model.compile()

            #################################### optimization
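            # the final-layer q(u) is optimised with natural gradients; all
            # other parameters are optimised with Adam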

            var_list = [[model.layers[-1].q_mu, model.layers[-1].q_sqrt]]

            model.layers[-1].q_mu.set_trainable(False)
            model.layers[-1].q_sqrt.set_trainable(False)

            gamma = tf.cast(tf.train.exponential_decay(ARGS.gamma, global_step, 1000, ARGS.gamma_decay, staircase=True),
                            dtype=tf.float64)
            lr = tf.cast(tf.train.exponential_decay(ARGS.lr, global_step, 1000, ARGS.lr_decay, staircase=True), dtype=tf.float64)

            op_ng = NatGradOptimizer(gamma=gamma).make_optimize_tensor(model, var_list=var_list)

            op_adam = AdamOptimizer(lr).make_optimize_tensor(model)

            def train(s):
                s.run(op_increment)
                s.run(op_ng)
                s.run(op_adam)

            model.train_op = train
            model.init_op = lambda s: s.run(tf.variables_initializer([global_step]))
            model.global_step = global_step

        else:
            model.compile()

            hmc_vars = []
            for layer in layers:
                if hasattr(layer, 'q_mu'):
                    hmc_vars.append(layer.q_mu.unconstrained_tensor)

            hyper_train_op = AdamOptimizer(ARGS.lr).make_optimize_tensor(model)

            sghmc_optimizer = SGHMC(model, hmc_vars, hyper_train_op, 100)

            def train_op(s):
                s.run(op_increment)
                sghmc_optimizer.sghmc_step(s)
                sghmc_optimizer.train_hypers(s)

            model.train_op = train_op
            model.sghmc_optimizer = sghmc_optimizer
            def init_op(s):
                epsilon = 0.01
                mdecay = 0.05
                with tf.variable_scope('hmc'):
                    sghmc_optimizer.generate_update_step(epsilon, mdecay)
                v = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='hmc')
                s.run(tf.variables_initializer(v))
                s.run(tf.variables_initializer([global_step]))

            model.init_op = init_op
            model.global_step = global_step

    return model
Exemple #22
0
def build_model(ARGS, X, Y, apply_name=True):
    N, D = X.shape

    # first layer inducing points
    if N > ARGS.M:
        Z = kmeans2(X, ARGS.M, minit="points")[0]
    else:
        M_pad = ARGS.M - N
        Z = np.concatenate([X.copy(), np.random.randn(M_pad, D)], 0)

    #################################### layers
    P = np.linalg.svd(X, full_matrices=False)[2]

    layers = []

    DX = D
    DY = 1

    D_in = D
    D_out = D
    with defer_build():
        lik = Gaussian()
        lik.variance = ARGS.likelihood_variance

        if len(ARGS.configuration) > 0:
            for c, d in ARGS.configuration.split("_"):
                if c == "G":
                    num_gps = int(d)
                    A = np.zeros((D_in, D_out))
                    D_min = min(D_in, D_out)
                    A[:D_min, :D_min] = np.eye(D_min)
                    mf = Linear(A=A)
                    mf.b.set_trainable(False)

                    def make_kern():
                        k = RBF(D_in,
                                lengthscales=float(D_in)**0.5,
                                variance=1.0,
                                ARD=True)
                        k.variance.set_trainable(False)
                        return k

                    PP = np.zeros((D_out, num_gps))
                    PP[:, :min(num_gps, DX)] = P[:, :min(num_gps, DX)]
                    ZZ = np.random.randn(ARGS.M, D_in)
                    ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]

                    kern = SharedMixedMok(make_kern(), W=PP)
                    inducing = MixedKernelSharedMof(InducingPoints(ZZ))

                    l = GPLayer(kern,
                                inducing,
                                num_gps,
                                layer_num=len(layers),
                                mean_function=mf)
                    if ARGS.fix_linear is True:
                        kern.W.set_trainable(False)
                        mf.set_trainable(False)

                    layers.append(l)

                    D_in = D_out

                elif c == "L":
                    d = int(d)
                    D_in += d
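                    # the latent variable layer appends d latent dimensions to
                    # the input of the following layer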
                    encoder_dims = [
                        int(dim.strip())
                        for dim in ARGS.encoder_dims.split(",")
                    ]
                    layers.append(
                        LatentVariableLayer(d,
                                            XY_dim=DX + 1,
                                            encoder_dims=encoder_dims,
                                            qz_mode=ARGS.qz_mode))

        kern = RBF(D_in, lengthscales=float(D_in)**0.5, variance=1.0, ARD=True)
        ZZ = np.random.randn(ARGS.M, D_in)
        ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]
        layers.append(GPLayer(kern, InducingPoints(ZZ), DY))

        #################################### model
        name = "Model" if apply_name else None

        if ARGS.mode == "VI":
            model = DGP_VI(X,
                           Y,
                           layers,
                           lik,
                           minibatch_size=ARGS.minibatch_size,
                           name=name)

        elif ARGS.mode == "IWAE":
            model = DGP_IWVI(
                X=X,
                Y=Y,
                layers=layers,
                likelihood=lik,
                minibatch_size=ARGS.minibatch_size,
                num_samples=ARGS.num_IW_samples,
                name=name,
                encoder_minibatch_size=ARGS.encoder_minibatch_size,
            )

        elif ARGS.mode == "CIWAE":
            model = DGP_CIWAE(
                X,
                Y,
                layers,
                lik,
                minibatch_size=ARGS.minibatch_size,
                num_samples=ARGS.num_IW_samples,
                name=name,
                beta=ARGS.beta,
            )

        else:
            raise ValueError(f"Unknown mode {ARGS.mode}.")

    global_step = tf.Variable(0, dtype=tf.int32)
    op_increment = tf.assign_add(global_step, 1)

    for layer in model.layers[:-1]:
        if isinstance(layer, GPLayer):
            layer.q_sqrt = layer.q_sqrt.read_value() * 1e-5

    model.compile()

    #################################### optimization

    # Whether to train the final layer with the other parameters, using Adam, or by itself, using natural
    # gradients.
    if ARGS.use_nat_grad_for_final_layer:
        # Turn off training so the parameters are not optimised by Adam. We pass them directly to the natgrad
        # optimiser, which bypasses this flag.
        model.layers[-1].q_mu.set_trainable(False)
        model.layers[-1].q_sqrt.set_trainable(False)

        gamma = tf.cast(
            tf.train.exponential_decay(ARGS.gamma,
                                       global_step,
                                       1000,
                                       ARGS.gamma_decay,
                                       staircase=True),
            dtype=tf.float64,
        )
        final_layer_vars = [[model.layers[-1].q_mu, model.layers[-1].q_sqrt]]
        final_layer_opt_op = NatGradOptimizer(
            gamma=gamma).make_optimize_tensor(model, var_list=final_layer_vars)
    else:
        final_layer_opt_op = NoOp()

    lr = tf.cast(
        tf.train.exponential_decay(ARGS.lr,
                                   global_step,
                                   decay_steps=1000,
                                   decay_rate=ARGS.lr_decay,
                                   staircase=True),
        dtype=tf.float64,
    )

    encoder_lr = tf.cast(
        tf.train.exponential_decay(
            ARGS.encoder_lr,
            global_step,
            decay_steps=1000,
            decay_rate=ARGS.encoder_lr_decay,
            staircase=True,
        ),
        dtype=tf.float64,
    )

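    # 'DReG' presumably refers to the doubly reparameterised gradient estimator,
    # enabled for the encoder parameters when ARGS.use_dreg is set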
    dreg_optimizer = DregOptimizer(
        enable_dreg=ARGS.use_dreg,
        optimizer=ARGS.optimizer,
        encoder_optimizer=ARGS.encoder_optimizer,
        learning_rate=lr,
        encoder_learning_rate=encoder_lr,
        assert_no_nans=ARGS.assert_no_nans,
        encoder_grad_clip_value=ARGS.clip_encoder_grads,
    )
    other_layers_opt_op = dreg_optimizer.make_optimize_tensor(model)

    model.lr = lr
    model.train_op = tf.group(op_increment, final_layer_opt_op,
                              other_layers_opt_op)
    model.init_op = lambda s: s.run(tf.variables_initializer([global_step]))
    model.global_step = global_step

    return model
Exemple #23
0
    def _make_part_model(self,
                         X,
                         Y,
                         weights,
                         Z,
                         q_mu,
                         q_sqrt,
                         W,
                         freqs,
                         minibatch_size=None,
                         priors=None):
        """
        Create a gpflow model for a selection of data
        X: array (N, Din)
        Y: array (N, P, Nf)
        weights: array like Y the statistical weights of each datapoint
        minibatch_size : int 
        Z: list of array (M, Din)
            The inducing points mean locations.
        q_mu: list of array (M, L)
        q_sqrt: list of array (L, M, M)
        W: array [P,L]
        freqs: array [Nf,] the freqs
        priors : dict of priors for the global model
        Returns:
        model : gpflow.models.Model 
        """
        N, P, Nf = Y.shape
        _, Din = X.shape

        assert priors is not None
        likelihood_var = priors['likelihood_var']
        tec_kern_time_ls = priors['tec_kern_time_ls']
        tec_kern_dir_ls = priors['tec_kern_dir_ls']
        tec_kern_var = priors['tec_kern_var']
        tec_mean = priors['tec_mean']
        Z_var = priors['Z_var']

        P, L = W.shape

        with defer_build():

            # Define the likelihood
            likelihood = WrappedPhaseGaussianMulti(
                tec_scale=priors['tec_scale'], freqs=freqs)
            likelihood.variance = np.exp(likelihood_var[0])  #median as initial
            likelihood.variance.prior = LogNormal(likelihood_var[0],
                                                  likelihood_var[1]**2)
            likelihood.variance.set_trainable(True)

            def _kern():
                kern_thin_layer = ThinLayer(np.array([0., 0., 0.]),
                                            priors['tec_scale'],
                                            active_dims=slice(2, 6, 1))
                kern_time = Matern32(1, active_dims=slice(6, 7, 1))
                kern_dir = Matern32(2, active_dims=slice(0, 2, 1))

                ###
                # time kern
                kern_time.lengthscales = np.exp(tec_kern_time_ls[0])
                kern_time.lengthscales.prior = LogNormal(
                    tec_kern_time_ls[0], tec_kern_time_ls[1]**2)
                kern_time.lengthscales.set_trainable(True)

                kern_time.variance = 1.  #np.exp(tec_kern_var[0])
                #kern_time.variance.prior = LogNormal(tec_kern_var[0],tec_kern_var[1]**2)
                kern_time.variance.set_trainable(False)  #

                ###
                # directional kern
                kern_dir.variance = np.exp(tec_kern_var[0])
                kern_dir.variance.prior = LogNormal(tec_kern_var[0],
                                                    tec_kern_var[1]**2)
                kern_dir.variance.set_trainable(True)

                kern_dir.lengthscales = np.exp(tec_kern_dir_ls[0])
                kern_dir.lengthscales.prior = LogNormal(
                    tec_kern_dir_ls[0], tec_kern_dir_ls[1]**2)
                kern_dir.lengthscales.set_trainable(True)

                kern = kern_dir * kern_time  #(kern_thin_layer + kern_dir)*kern_time
                return kern

            kern = mk.SeparateMixedMok([_kern() for _ in range(L)], W)

            feature_list = []
            for _ in range(L):
                feat = InducingPoints(Z)
                #feat.Z.prior = Gaussian(Z,Z_var)
                feature_list.append(feat)
            feature = mf.MixedKernelSeparateMof(feature_list)

            mean = Zero()

            model = HomoscedasticPhaseOnlySVGP(weights,
                                               X,
                                               Y,
                                               kern,
                                               likelihood,
                                               feat=feature,
                                               mean_function=mean,
                                               minibatch_size=minibatch_size,
                                               num_latent=P,
                                               num_data=N,
                                               whiten=False,
                                               q_mu=q_mu,
                                               q_sqrt=q_sqrt)
            model.compile()
        return model
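
# A minimal usage sketch (not part of the original example): it only assembles
# placeholder inputs with the shapes described in the docstring above, passing
# q_mu/q_sqrt as plain arrays for brevity. The actual call is left commented out
# because the enclosing class and the custom GPflow extensions
# (HomoscedasticPhaseOnlySVGP, WrappedPhaseGaussianMulti, ThinLayer) are defined
# elsewhere; 'solver' below is a hypothetical instance of that class.
import numpy as np

N, Din, P, L, M, Nf = 200, 7, 10, 3, 50, 4

X = np.random.randn(N, Din)                    # inputs, (N, Din)
Y = np.random.randn(N, P, Nf)                  # wrapped-phase observations, (N, P, Nf)
weights = np.ones_like(Y)                      # statistical weights, same shape as Y
Z = np.random.randn(M, Din)                    # inducing input locations, (M, Din)
q_mu = np.zeros((M, L))                        # variational means, (M, L)
q_sqrt = np.tile(np.eye(M)[None], (L, 1, 1))   # variational sqrt-covariances, (L, M, M)
W = np.random.randn(P, L)                      # mixing matrix, (P, L)
freqs = np.linspace(120e6, 160e6, Nf)          # observing frequencies, (Nf,), placeholders

# Placeholder priors: each (log-median, spread) pair feeds the LogNormal priors
# used above; tec_scale and Z_var are plain scalars.
priors = dict(likelihood_var=(np.log(0.1), 1.0),
              tec_kern_time_ls=(np.log(50.0), 1.0),
              tec_kern_dir_ls=(np.log(1.0), 1.0),
              tec_kern_var=(np.log(1.0), 1.0),
              tec_mean=(0.0, 1.0),
              Z_var=1.0,
              tec_scale=1e-3)

# model = solver._make_part_model(X, Y, weights, Z, q_mu, q_sqrt, W, freqs,
#                                 minibatch_size=128, priors=priors)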