def __init__(self, input_dim, output_dim, kern, Z, beta=10.0, natgrads=False, S_param='chol', name='layer'):
    """Construct a sparse-GP layer.

    Stores the kernel, the inducing inputs Z and the noise precision beta
    as linked GPy parameters, then initializes the variational posterior
    q(U) — either with explicit mean/covariance parameters, or (when
    `natgrads` is True) through the natural-parameter representation.

    :param input_dim: dimensionality of this layer's inputs
    :param output_dim: dimensionality of this layer's outputs
    :param kern: a GPy kernel object
    :param Z: inducing inputs, shape (num_inducing, input_dim)
    :param beta: initial noise precision (kept positive via Logexp)
    :param natgrads: if True, parameterize q(U) for natural-gradient updates
    :param S_param: 'chol' or 'diag' — covariance parameterization of q(U)
    :param name: name for this Parameterized object
    """
    super(Layer, self).__init__(name)

    self.input_dim = input_dim
    self.output_dim = output_dim
    self.num_inducing = Z.shape[0]

    # a factor by which to multiply the KL (only used in parallel implementations)
    self.KL_scaling = 1.

    # store Z, kern, beta in this Parameterized object.
    assert Z.shape[1] == self.input_dim
    self.kern = kern
    self.Z = GPy.core.Param('Z', Z)
    self.beta = GPy.core.Param('beta', beta, GPy.core.parameterization.transformations.Logexp())
    self.link_parameters(self.Z, self.kern, self.beta)

    self.natgrads = natgrads

    # initialize q(U)
    if not self.natgrads:
        # mean: a random draw from I
        self.q_of_U_mean = GPy.core.Param('q(U)_mean', np.random.randn(self.num_inducing, self.output_dim))
        self.link_parameter(self.q_of_U_mean)
        # alternative — mean as a random draw from Kmm:
        # self.q_of_U_mean = GPy.core.Param('q(U)_mean', np.random.multivariate_normal(np.zeros(self.num_inducing), self.kern.K(self.Z), self.output_dim).T)

        self.S_param = S_param
        if S_param == 'chol':
            # one small (0.1*I) Cholesky factor per output dimension, flattened
            init_chols = choleskies.triang_to_flat(
                np.dstack([np.eye(self.num_inducing) * 0.1 for _ in range(self.output_dim)]))
            self.q_of_U_choleskies = GPy.core.Param('q(U)_chol', init_chols)
            self.link_parameter(self.q_of_U_choleskies)
        elif S_param == 'diag':
            # positive diagonal covariance entries, one column per output dim
            self.q_of_U_diags = GPy.core.Param('q(U)_diag',
                                               np.ones((self.num_inducing, self.output_dim)),
                                               GPy.core.parameterization.transformations.Logexp())
            self.link_parameter(self.q_of_U_diags)
        else:
            raise NotImplementedError
    else:
        # initialize using the natural gradient method:
        # natural parameters are (S^{-1} m, -0.5 S^{-1})
        init_mean = np.random.randn(self.num_inducing, self.output_dim)
        init_precision = np.dstack([np.eye(self.num_inducing) * 10 for _ in range(self.output_dim)])
        Sim = np.einsum('ijk,jk->ik', init_precision, init_mean)
        self.set_vb_param(np.hstack((Sim.flatten(), -0.5 * init_precision.flatten())))

    # an empty list to contain the lower layers
    self.lower_layers = []
def gradient_updates(self):
    """Set the derivatives in the kernel, in Z, and in the q(U) parameters.

    Assumes the dL_d* partial derivatives (dL_dKmm, dL_dpsi0/1/2, dL_dEu,
    dL_duuT) have already been computed and stored on self by the caller.
    This variant propagates through a variational q(X) (self.q_of_X_in).
    """
    # kernel gradients: accumulate the Kmm contribution and the
    # psi-statistics (expectation) contributions.
    self.kern.update_gradients_full(self.dL_dKmm, self.Z)
    g = self.kern._gradient_array_.copy()
    # self.dL_dpsi2 = np.repeat(self.dL_dpsi2[None,:,:], self.q_of_X_in.shape[0], axis=0)
    self.kern.update_gradients_expectations(Z=self.Z,
                                            variational_posterior=self.q_of_X_in,
                                            dL_dpsi0=self.dL_dpsi0,
                                            dL_dpsi1=self.dL_dpsi1,
                                            dL_dpsi2=self.dL_dpsi2)
    self.kern._gradient_array_ += g

    # gradients for the inducing inputs Z (Kmm term plus psi-stat terms)
    self.Z.gradient = self.kern.gradients_X(self.dL_dKmm, self.Z)
    self.Z.gradient += self.kern.gradients_Z_expectations(Z=self.Z,
                                                          variational_posterior=self.q_of_X_in,
                                                          dL_dpsi1=self.dL_dpsi1,
                                                          dL_dpsi2=self.dL_dpsi2,
                                                          dL_dpsi0=self.dL_dpsi0)

    if not self.natgrads:
        # d/dm [ m^T A m + b^T m ] -> dL_dEu + 2 * dL_duuT m (per output dim)
        self.q_of_U_mean.gradient = self.dL_dEu + 2. * np.einsum('ijk,jk->ik', self.dL_duuT, self.q_of_U_mean)
        # FIX: compare strings with ==, not 'is' (identity on literals is
        # an interning accident, not a contract).
        if self.S_param == 'chol':
            L = choleskies.flat_to_triang(self.q_of_U_choleskies)
            # chain rule through S = L L^T: dL/dL = 2 * dL_duuT L
            dL_dchol = 2. * np.einsum('ijk,jlk->ilk', self.dL_duuT, L)
            self.q_of_U_choleskies.gradient = choleskies.triang_to_flat(dL_dchol)
        else:
            # diagonal parameterization: only the diagonal of dL_duuT matters.
            # FIX: range instead of xrange — works on Python 2 and 3.
            self.q_of_U_diags.gradient = np.vstack([np.diag(self.dL_duuT[:, :, i]) for i in range(self.output_dim)]).T
def gradient_updates(self):
    """Set the derivatives in the kernel, in Z, and in the q(U) parameters.

    Note that the kernel gradients are a little different from the
    variational variant because there is no q(X) here, just a fixed X:
    the psi statistics reduce to plain (cross-)covariances.
    Assumes the dL_d* partials (dL_dKmm, dL_dpsi0/1/2, psi1, dL_dEu,
    dL_duuT) have already been stored on self by the caller.
    """
    # kernel gradients: Kmm term, then the Knm and Knn-diagonal terms,
    # accumulated because each update_gradients_* call overwrites the array.
    self.kern.update_gradients_full(self.dL_dKmm, self.Z)
    g = self.kern._gradient_array_.copy()
    # with fixed X, psi2 = Knm^T Knm, so its gradient folds into dL/dKnm
    dL_dKnm = self.dL_dpsi1 + 2. * self.psi1.dot(self.dL_dpsi2)
    self.kern.update_gradients_full(dL_dKnm, self.X, self.Z)
    g += self.kern._gradient_array_.copy()
    self.kern.update_gradients_diag(self.dL_dpsi0, self.X)
    self.kern._gradient_array_ += g

    # gradients for the inducing inputs Z (Kmm term plus cross-term)
    self.Z.gradient = self.kern.gradients_X(self.dL_dKmm, self.Z)
    self.Z.gradient += self.kern.gradients_X(dL_dKnm.T, self.Z, self.X)

    # d/dm [ m^T A m + b^T m ] -> dL_dEu + 2 * dL_duuT m (per output dim)
    self.q_of_U_mean.gradient = self.dL_dEu + 2. * np.einsum('ijk,jk->ik', self.dL_duuT, self.q_of_U_mean)
    # FIX: compare strings with ==, not 'is' (identity on literals is an
    # interning accident, not a contract).
    if self.S_param == 'chol':
        L = choleskies.flat_to_triang(self.q_of_U_choleskies)
        # chain rule through S = L L^T: dL/dL = 2 * dL_duuT L
        dL_dchol = 2. * np.einsum('ijk,jlk->ilk', self.dL_duuT, L)
        self.q_of_U_choleskies.gradient = choleskies.triang_to_flat(dL_dchol)
    else:
        # diagonal parameterization: only the diagonal of dL_duuT matters.
        # FIX: range instead of xrange — works on Python 2 and 3.
        self.q_of_U_diags.gradient = np.vstack([np.diag(self.dL_duuT[:, :, i]) for i in range(self.output_dim)]).T