def parameters_changed(self):
    f_index = self.Y_metadata['function_index'].flatten()
    d_index = self.Y_metadata['d_index'].flatten()
    T = len(self.likelihood.likelihoods_list)

    # Per-task scale N_t / N_batch_t used to correct stochastic (minibatch) gradients
    self.batch_scale = [float(self.Xmulti_all[t].shape[0]) / self.Xmulti[t].shape[0]
                        for t in range(T)]

    self._log_marginal_likelihood, gradients, self.posteriors, _ = \
        self.inference_method.inference(q_u_means=self.q_u_means,
                                        q_u_chols=self.q_u_chols,
                                        X=self.Xmulti,
                                        Y=self.Ymulti,
                                        Z=self.Z,
                                        kern_list=self.kern_list,
                                        likelihood=self.likelihood,
                                        B_list=self.B_list,
                                        Y_metadata=self.Y_metadata,
                                        batch_scale=self.batch_scale)

    D = self.likelihood.num_output_functions(self.Y_metadata)
    N = self.X.shape[0]
    M = self.num_inducing
    _, B_list = util.LCM(input_dim=self.Xdim,
                         output_dim=D,
                         rank=1,
                         kernels_list=self.kern_list,
                         W_list=self.W_list,
                         kappa_list=self.kappa_list)
    Z_grad = np.zeros_like(self.Z.values)

    for q, kern_q in enumerate(self.kern_list):
        # Update the variational parameter gradients.
        # SVI + VEM: in a VM-step (vem_step=False) the variational parameters q(u) are frozen.
        if self.stochastic and not self.vem_step:
            self.q_u_means[:, q:q + 1].gradient = np.zeros(gradients['dL_dmu_u'][q].shape)
            self.q_u_chols[:, q:q + 1].gradient = np.zeros(gradients['dL_dL_u'][q].shape)
        else:
            self.q_u_means[:, q:q + 1].gradient = gradients['dL_dmu_u'][q]
            self.q_u_chols[:, q:q + 1].gradient = gradients['dL_dL_u'][q]

        # Update kernel hyperparameters: lengthscale and variance
        kern_q.update_gradients_full(gradients['dL_dKmm'][q],
                                     self.Z[:, q * self.Xdim:q * self.Xdim + self.Xdim])
        grad = kern_q.gradient.copy()

        # Update coregionalization hyperparameters: W + kappa
        Kffdiag = []
        KuqF = []
        for d in range(D):
            Kffdiag.append(gradients['dL_dKdiag'][q][d])
            KuqF.append(gradients['dL_dKmn'][q][d] *
                        kern_q.K(self.Z[:, q * self.Xdim:q * self.Xdim + self.Xdim],
                                 self.Xmulti[f_index[d]]))

        util.update_gradients_diag(self.B_list[q], Kffdiag)
        Bgrad = self.B_list[q].gradient.copy()
        util.update_gradients_Kmn(self.B_list[q], KuqF, D)
        Bgrad += self.B_list[q].gradient.copy()

        # SVI + VEM: in a VE-step (vem_step=True) the hyperparameters are frozen.
        if self.stochastic and self.vem_step:
            self.B_list[q].gradient = np.zeros(Bgrad.shape)
        else:
            self.B_list[q].gradient = Bgrad

        for d in range(self.likelihood.num_output_functions(self.Y_metadata)):
            kern_q.update_gradients_full(gradients['dL_dKmn'][q][d],
                                         self.Z[:, q * self.Xdim:q * self.Xdim + self.Xdim],
                                         self.Xmulti[f_index[d]])
            grad += B_list[q].W[d] * kern_q.gradient.copy()
            kern_q.update_gradients_diag(gradients['dL_dKdiag'][q][d],
                                         self.Xmulti[f_index[d]])
            grad += B_list[q].B[d, d] * kern_q.gradient.copy()

        # SVI + VEM
        if self.stochastic and self.vem_step:
            kern_q.gradient = np.zeros(grad.shape)
        else:
            kern_q.gradient = grad

        if not self.Z.is_fixed:
            Z_grad[:, q * self.Xdim:q * self.Xdim + self.Xdim] += kern_q.gradients_X(
                gradients['dL_dKmm'][q],
                self.Z[:, q * self.Xdim:q * self.Xdim + self.Xdim])
            for d in range(self.likelihood.num_output_functions(self.Y_metadata)):
                Z_grad[:, q * self.Xdim:q * self.Xdim + self.Xdim] += \
                    B_list[q].W[d] * kern_q.gradients_X(
                        gradients['dL_dKmn'][q][d],
                        self.Z[:, q * self.Xdim:q * self.Xdim + self.Xdim],
                        self.Xmulti[f_index[d]])

    if not self.Z.is_fixed:
        # SVI + VEM
        if self.stochastic and self.vem_step:
            self.Z.gradient[:] = np.zeros(Z_grad.shape)
        else:
            self.Z.gradient[:] = Z_grad
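
# Illustrative sketch (not part of the original source): one way the SVI + VEM
# gating above could be driven from an outer loop when the model was built with
# a batch_size (i.e. self.stochastic is True). The helper name below is
# hypothetical, and the paramz-style optimize() call is only an assumption about
# how the model is trained; the repository's own training script may differ.
def _example_vem_loop(model, vem_iters=5, step_iters=100):
    """Alternate VE-steps (variational parameters) and VM-steps (hyperparameters)."""
    for _ in range(vem_iters):
        model.vem_step = True    # VE-step: only dL/dm_u and dL/dL_u are propagated
        model.optimize(max_iters=step_iters, messages=False)
        model.vem_step = False   # VM-step: only kernel, B (W, kappa) and Z gradients are propagated
        model.optimize(max_iters=step_iters, messages=False)
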
def __init__(self, X, Y, Z, kern_list, likelihood, Y_metadata, name='SVMOGP',
             batch_size=None):
    self.batch_size = batch_size
    self.kern_list = kern_list
    self.likelihood = likelihood
    self.Y_metadata = Y_metadata

    self.num_inducing = Z.shape[0]          # M
    self.num_latent_funcs = len(kern_list)  # Q
    self.num_output_funcs = likelihood.num_output_functions(self.Y_metadata)
    self.W_list, self.kappa_list = util.random_W_kappas(self.num_latent_funcs,
                                                        self.num_output_funcs,
                                                        rank=1)

    self.Xmulti = X
    self.Ymulti = Y

    # Batch the data
    self.Xmulti_all, self.Ymulti_all = X, Y
    if batch_size is None:
        self.stochastic = False
        Xmulti_batch, Ymulti_batch = X, Y
    else:
        # Makes a climin slicer to make drawing minibatches much quicker
        self.stochastic = True
        self.slicer_list = [draw_mini_slices(Xmulti_task.shape[0], self.batch_size)
                            for Xmulti_task in self.Xmulti]
        Xmulti_batch, Ymulti_batch = self.new_batch()
        self.Xmulti, self.Ymulti = Xmulti_batch, Ymulti_batch

    # Initialize inducing points Z
    #Z = kmm_init(self.X_all, self.num_inducing)
    self.Xdim = Z.shape[1]
    Z = np.tile(Z, (1, self.num_latent_funcs))

    inference_method = SVMOGPInf()

    super(SVMOGP, self).__init__(X=Xmulti_batch[0][1:10],
                                 Y=Ymulti_batch[0][1:10],
                                 Z=Z,
                                 kernel=kern_list[0],
                                 likelihood=likelihood,
                                 mean_function=None,
                                 X_variance=None,
                                 inference_method=inference_method,
                                 Y_metadata=Y_metadata,
                                 name=name,
                                 normalizer=False)
    self.unlink_parameter(self.kern)  # Unlink SparseGP default param kernel

    _, self.B_list = util.LCM(input_dim=self.Xdim,
                              output_dim=self.num_output_funcs,
                              rank=1,
                              kernels_list=self.kern_list,
                              W_list=self.W_list,
                              kappa_list=self.kappa_list)

    # Set up optimization parameters: [Z, m_u, L_u]
    self.q_u_means = Param(
        'm_u',
        5 * np.random.randn(self.num_inducing, self.num_latent_funcs) +
        np.tile(np.random.randn(1, self.num_latent_funcs), (self.num_inducing, 1)))
    chols = choleskies.triang_to_flat(
        np.tile(np.eye(self.num_inducing)[None, :, :], (self.num_latent_funcs, 1, 1)))
    self.q_u_chols = Param('L_u', chols)

    self.link_parameter(self.Z, index=0)
    self.link_parameter(self.q_u_means)
    self.link_parameters(self.q_u_chols)
    for kern_q in kern_list:  # link all kernels
        self.link_parameter(kern_q)
    for B_q in self.B_list:
        self.link_parameter(B_q)

    self.vem_step = True  # [True = VE-step, False = VM-step]
    self.ve_count = 0
    self.elbo = np.zeros((1, 1))
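
# Illustrative sketch (not part of the original source): how a model of this class
# might be assembled. The heterogeneous `likelihood` object (it must expose
# `likelihoods_list` and `num_output_functions`) and the matching `Y_metadata`
# dict (with at least 'function_index' and 'd_index') are assumed to come from the
# accompanying likelihood utilities of this code base, which are not shown here;
# the helper name and default sizes below are purely illustrative.
def _example_build_svmogp(X_list, Y_list, likelihood, Y_metadata, Q=2, M=20,
                          batch_size=None):
    import numpy as np
    import GPy

    # One kernel per latent function u_q(x)
    kern_list = [GPy.kern.RBF(X_list[0].shape[1]) for _ in range(Q)]
    # M inducing inputs drawn from the first task's inputs
    Z = X_list[0][np.random.permutation(X_list[0].shape[0])[:M], :].copy()
    return SVMOGP(X=X_list, Y=Y_list, Z=Z, kern_list=kern_list,
                  likelihood=likelihood, Y_metadata=Y_metadata,
                  batch_size=batch_size)  # batch_size=None disables stochastic mode
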
def __init__(self, X, Y, Z, kern_list, likelihood, Y_metadata, name='SVMOGP',
             batch_size=None, non_chained=True):
    self.batch_size = batch_size
    self.kern_list = kern_list
    self.likelihood = likelihood
    self.Y_metadata = Y_metadata

    self.num_inducing = Z.shape[0]          # M
    self.num_latent_funcs = len(kern_list)  # Q
    self.num_output_funcs = likelihood.num_output_functions(Y_metadata)
    if not non_chained:
        assert self.num_output_funcs == self.num_latent_funcs, \
            "we need one latent function per likelihood parameter"

    if non_chained:
        self.W_list, self.kappa_list = util.random_W_kappas(self.num_latent_funcs,
                                                            self.num_output_funcs,
                                                            rank=1)
    else:
        self.W_list, self.kappa_list = util.Chained_W_kappas(self.num_latent_funcs,
                                                             self.num_output_funcs,
                                                             rank=1)

    self.Xmulti = X
    self.Ymulti = Y
    self.iAnnMulti = Y_metadata['iAnn']

    # Batch the data
    self.Xmulti_all, self.Ymulti_all, self.iAnn_all = X, Y, Y_metadata['iAnn']
    if batch_size is None:
        #self.stochastic = False
        Xmulti_batch, Ymulti_batch, iAnnmulti_batch = X, Y, Y_metadata['iAnn']
    else:
        # Makes a climin slicer to make drawing minibatches much quicker
        #self.stochastic = False  # note: this was True in Pablo's original version
        self.slicer_list = [draw_mini_slices(Xmulti_task.shape[0], self.batch_size)
                            for Xmulti_task in self.Xmulti]
        Xmulti_batch, Ymulti_batch, iAnnmulti_batch = self.new_batch()
        self.Xmulti, self.Ymulti, self.iAnnMulti = Xmulti_batch, Ymulti_batch, iAnnmulti_batch
        self.Y_metadata.update(iAnn=iAnnmulti_batch)

    # Initialize inducing points Z
    #Z = kmm_init(self.X_all, self.num_inducing)
    self.Xdim = Z.shape[1]
    Z = np.tile(Z, (1, self.num_latent_funcs))

    inference_method = SVMOGPInf()

    super(SVMOGP, self).__init__(X=Xmulti_batch[0][1:10],
                                 Y=Ymulti_batch[0][1:10],
                                 Z=Z,
                                 kernel=kern_list[0],
                                 likelihood=likelihood,
                                 mean_function=None,
                                 X_variance=None,
                                 inference_method=inference_method,
                                 Y_metadata=Y_metadata,
                                 name=name,
                                 normalizer=False)
    self.unlink_parameter(self.kern)  # Unlink SparseGP default param kernel

    _, self.B_list = util.LCM(input_dim=self.Xdim,
                              output_dim=self.num_output_funcs,
                              rank=1,
                              kernels_list=self.kern_list,
                              W_list=self.W_list,
                              kappa_list=self.kappa_list)

    # Set up optimization parameters: [Z, m_u, L_u]
    self.q_u_means = Param(
        'm_u',
        0.0 * np.random.randn(self.num_inducing, self.num_latent_funcs) +
        0.0 * np.tile(np.random.randn(1, self.num_latent_funcs), (self.num_inducing, 1)))
    chols = choleskies.triang_to_flat(
        np.tile(np.eye(self.num_inducing)[None, :, :], (self.num_latent_funcs, 1, 1)))
    self.q_u_chols = Param('L_u', chols)

    self.link_parameter(self.Z, index=0)
    self.link_parameter(self.q_u_means)
    self.link_parameters(self.q_u_chols)
    for kern_q in kern_list:  # link all kernels
        self.link_parameter(kern_q)
    for B_q in self.B_list:
        self.link_parameter(B_q)

    self.vem_step = True  # [True = VE-step, False = VM-step]
    self.ve_count = 0
    self.elbo = np.zeros((1, 1))
    self.index_VEM = 0          # index used to fill self.elbo correctly during VEM
    self.Gauss_Newton = False   # flag to use the Gauss-Newton approximation when dL_dV is needed
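
# Illustrative sketch (not part of the original source): the main structural
# difference from the constructor above is that Y_metadata must also carry an
# 'iAnn' entry, one array per task, which is sliced into minibatches together
# with X and Y. The interpretation below (a binary mask marking which of R
# annotators labelled each sample) is an assumption drawn from how iAnn is
# handled here; the real construction lives elsewhere in this code base.
def _example_add_iAnn(Y_metadata, Y_list, R=3):
    import numpy as np

    # One (N_t x R) indicator matrix per task; entry (n, r) = 1 if annotator r labelled sample n.
    Y_metadata = dict(Y_metadata)
    Y_metadata['iAnn'] = [np.ones((Y_t.shape[0], R)) for Y_t in Y_list]
    return Y_metadata
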