def kl_divergences(self, q_U, p_U, dims):
    r"""
    Description: Returns the sum of KL divergences
    Equation:    \sum_q KL[q(u_q) || p(u_q)]
    Paper:       In Section 2.2.2 / Variational Bounds and Appendix 1
    """
    Q = dims['Q']
    M = dims['M']

    #------------------------------------------# ALGEBRA #-----------------------------------------------#
    ####### Algebra for q(u) #######
    m_u = q_U.mu_u.copy()
    L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
    S_u = np.empty((Q, M, M))
    for q in range(Q):
        np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :])  # S_q = L_q L_q^T, written in place

    ####### Algebra for p(u) #######
    Luu = p_U.Luu.copy()
    Kuui = p_U.Kuui.copy()

    #----------------------------# KL DIVERGENCE BETWEEN TWO GAUSSIANS #-----------------------------------#
    # KL[q||p] = 0.5*(tr(Kuu^{-1} S_q) + m_q^T Kuu^{-1} m_q - M + log|Kuu| - log|S_q|)
    KL = 0
    for q in range(Q):
        KL += 0.5 * np.sum(Kuui[q, :, :] * S_u[q, :, :]) \
              + 0.5 * np.dot(m_u[:, q, None].T, np.dot(Kuui[q, :, :], m_u[:, q, None])) \
              - 0.5 * M \
              + 0.5 * 2. * np.sum(np.log(np.abs(np.diag(Luu[q, :, :])))) \
              - 0.5 * 2. * np.sum(np.log(np.abs(np.diag(L_u[q, :, :]))))
    return KL
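# A minimal, standalone sketch (not from this repo; all variables are hypothetical
# stand-ins for Kuu[q], S_u[q] and m_u[:, q]) checking the Cholesky-based KL form
# used in kl_divergences against a direct slogdet evaluation of
# KL[N(m, S) || N(0, K)] = 0.5*(tr(K^{-1}S) + m^T K^{-1} m - M + log|K| - log|S|).
import numpy as np

rng = np.random.default_rng(0)
M = 5
A = rng.standard_normal((M, M)); Kuu = A @ A.T + M * np.eye(M)   # prior covariance
B = rng.standard_normal((M, M)); S = B @ B.T + M * np.eye(M)     # q(u) covariance
m = rng.standard_normal((M, 1))                                  # q(u) mean

Luu = np.linalg.cholesky(Kuu)
L = np.linalg.cholesky(S)
Kuui = np.linalg.inv(Kuu)

# Cholesky-based form, as in kl_divergences: log|K| = 2*sum(log(diag(Luu)))
kl_chol = 0.5 * np.sum(Kuui * S) \
          + 0.5 * float(m.T @ Kuui @ m) \
          - 0.5 * M \
          + np.sum(np.log(np.diag(Luu))) \
          - np.sum(np.log(np.diag(L)))

# Direct slogdet form
kl_direct = 0.5 * (np.trace(Kuui @ S) + float(m.T @ Kuui @ m) - M
                   + np.linalg.slogdet(Kuu)[1] - np.linalg.slogdet(S)[1])

assert np.isclose(kl_chol, kl_direct)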
def calculate_KL(self, q_U, p_U, M, J):
    r"""
    Calculates the KL divergence (see KL-div for multivariate normals)
    Equation: \sum_J KL{q(u_j) || p(u_j)}
    """
    # Algebra for q(u):
    m_u = q_U.mu_u.copy()
    L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
    S_u = np.empty((J, M, M))
    for j in range(J):
        np.dot(L_u[j, :, :], L_u[j, :, :].T, S_u[j, :, :])  # S_j = L_j L_j^T, written in place

    # Algebra for p(u):
    Kuu = p_U.Kuu.copy()
    Luu = p_U.Luu.copy()
    Kuui = p_U.Kuui.copy()

    KL = 0
    for j in range(J):
        KL += 0.5 * np.sum(Kuui[j, :, :] * S_u[j, :, :]) \
              + 0.5 * np.dot(m_u[:, j, None].T, np.dot(Kuui[j, :, :], m_u[:, j, None])) \
              - 0.5 * M \
              + 0.5 * 2. * np.sum(np.log(np.abs(np.diag(Luu[j, :, :])))) \
              - 0.5 * 2. * np.sum(np.log(np.abs(np.diag(L_u[j, :, :]))))
    return KL
def natural_grad_qu(model, n_iter=1, step_size=step_rate, momentum=0.0):
    """Natural-gradient update of the variational posterior q(u) means and Cholesky factors."""
    global mk_ant, mk_aux, mk, V_i, Vk, Lk, Vki_ant

    # Initialize the step-sizes
    beta2_k = step_size   # use step_size*0.1 for Convolutional MOGP
    gamma2_k = momentum
    alpha2_k = step_size
    N_posteriors = model.q_u_means.shape[1]

    if n_iter == 1:
        V_i = choleskies.multiple_dpotri(choleskies.flat_to_triang(model.q_u_chols.values)).copy()
        Vk = np.zeros_like(V_i)
        for i in range(N_posteriors):
            Vk[i, :, :] = 0.5 * (model.posteriors[i].covariance.copy() + model.posteriors[i].covariance.T.copy())
        Lk = np.zeros_like(Vk)
        mk = model.q_u_means.values.copy()
        Vki_ant = V_i.copy()
        mk_aux = mk.copy()

    dL_dm, dL_dV = compute_stoch_grads_for_qu_HetMOGP(model=model)

    mk_ant = mk_aux.copy()
    mk_aux = mk.copy()
    if not model.q_u_means.is_fixed and not model.q_u_chols.is_fixed:
        mk_ant = mk_aux.copy()
        mk_aux = mk.copy()
        for i in range(N_posteriors):
            try:
                # Natural step in the precision: V^{-1} <- V^{-1} + 2*beta*dL/dV
                V_i[i, :, :] = V_i[i, :, :] + 2 * beta2_k * dL_dV[i]  # + 1.0e-6*np.eye(*Vk[i,:,:].shape)
                Vk[i, :, :] = np.linalg.inv(V_i[i, :, :])
                Vk[i, :, :] = 0.5 * (np.array(Vk[i, :, :]) + np.array(Vk[i, :, :].T))  # enforce symmetry
                Lk[i, :, :] = np.linalg.cholesky(Vk[i, :, :])
                # Preconditioned step in the mean, with a momentum term
                mk[:, i] = mk[:, i] - alpha2_k * np.dot(Vk[i, :, :], dL_dm[i]) \
                           + gamma2_k * np.dot(np.dot(Vk[i, :, :], Vki_ant[i, :, :]), (mk[:, i] - mk_ant[:, i]))
            except np.linalg.LinAlgError:
                # Recover from a failed Cholesky by resetting this posterior
                print("Overflow")
                Vk[i, :, :] = np.linalg.inv(V_i[i, :, :])
                Vk[i, :, :] = 1.0e-1 * np.eye(*Vk[i, :, :].shape)  # nearestPD(Vk[i,:,:]) # + 1.0e-3*np.eye(*Vk[i,:,:].shape)
                Lk[i, :, :] = linalg.jitchol(Vk[i, :, :])
                V_i[i, :, :] = np.linalg.inv(Vk[i, :, :])
                mk[:, i] = mk[:, i] * 0.0

    Vki_ant = V_i.copy()
    model.L_u.setfield(choleskies.triang_to_flat(Lk.copy()), np.float64)
    model.m_u.setfield(mk.copy(), np.float64)
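# A toy, standalone sketch (hypothetical variables, momentum omitted) of the update
# rule applied per posterior above: a natural step in the precision,
# V^{-1} <- V^{-1} + 2*beta*dL/dV, followed by a V-preconditioned step in the mean,
# m <- m - alpha*V*dL/dm. On the objective L = KL(q||p), whose exact gradients are
# dL/dm = K^{-1}(m - mu) and dL/dV = 0.5*(K^{-1} - V^{-1}), unit steps recover the
# target N(mu, K) in a single iteration.
import numpy as np

rng = np.random.default_rng(1)
M = 4
A = rng.standard_normal((M, M)); K = A @ A.T + M * np.eye(M)   # target covariance
mu = rng.standard_normal((M, 1))                               # target mean
Ki = np.linalg.inv(K)

m = np.zeros((M, 1)); Vi = np.eye(M)    # q = N(m, V) with precision Vi = V^{-1}
beta, alpha = 1.0, 1.0

dL_dm = Ki @ (m - mu)                   # gradient of KL(q||p) w.r.t. m
dL_dV = 0.5 * (Ki - Vi)                 # gradient of KL(q||p) w.r.t. V
Vi = Vi + 2.0 * beta * dL_dV            # natural step in the precision
V = np.linalg.inv(Vi)
m = m - alpha * V @ dL_dm               # preconditioned step in the mean

assert np.allclose(V, K) and np.allclose(m, mu)   # one-step convergence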
def calculate_q_f(self, X, Z, q_U, p_U, kern_list, kern_list_Gdj, kern_aux, B, M, N, j):
    r"""
    Calculates the mean and variance of q(f_d) as
    Equation: E_{q(U)}\{p(f_d|U)\}
    """
    # Algebra for q(u):
    m_u = q_U.mu_u.copy()
    L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
    S_u = np.dot(L_u[j, :, :], L_u[j, :, :].T) + 1e-6 * np.eye(M)  # jitter keeps S_u strictly positive definite

    # Algebra for p(f_d|u):
    #Kfdu = util.conv_cross_covariance_full(X, Z, B, kern_list, kern_list_Gdj, kern_aux, j)
    #Kff = util.conv_function_covariance(X, B, kern_list, kern_list_Gdj, kern_aux, j)
    Kff, Kfdu = util.both_convoled_Kff_and_Kfu_full(X, Z, B, kern_list, kern_list_Gdj, kern_aux, j)
    Kuu = p_U.Kuu.copy()
    Luu = p_U.Luu.copy()
    Kuui = p_U.Kuui.copy()
    Kff_diag = np.diag(Kff)

    # Algebra for q(f_d) = E_{q(u)}[p(f_d|u)]
    m_fd = np.zeros((N, 1))
    v_fd = np.zeros((N, 1))
    S_fd = np.zeros((N, N))
    v_fd += Kff_diag[:, None]
    S_fd += Kff

    # Expectation part
    #R, _ = linalg.dpotrs(np.asfortranarray(Luu[j, :, :]), Kfdu.T)
    R = np.linalg.solve(Kuu[j, :, :], Kfdu.T)
    Afdu = R.T  # Afdu = K_{fdu_j} K_{u_j u_j}^{-1}
    m_fd += np.dot(Afdu, m_u[:, j, None])
    S_fd += np.dot(np.dot(R.T, S_u), R) - np.dot(Kfdu, R)
    v_fd = np.diag(S_fd)[:, None]
    if (v_fd < 0).any():
        print('v negative!')

    q_fd = qfd(m_fd=m_fd, v_fd=v_fd, Kfdu=Kfdu, Afdu=Afdu, S_fd=S_fd)
    return q_fd
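# A tiny, standalone illustration (hypothetical matrices) of why the 1e-6*np.eye(M)
# jitter above matters: a PSD-but-singular S_u makes a Cholesky factorization fail,
# while a small diagonal jitter restores strict positive definiteness.
import numpy as np

M = 4
L = np.tril(np.ones((M, M))); L[-1, :] = 0.0   # rank-deficient triangular factor
S = L @ L.T                                    # PSD but singular
try:
    np.linalg.cholesky(S)
except np.linalg.LinAlgError:
    pass                                       # fails: S is not strictly positive definite
np.linalg.cholesky(S + 1e-6 * np.eye(M))       # succeeds with a small jitter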
def calculate_mu_var(self, X, Y, Z, q_u_mean, q_u_chol, kern, mean_function,
                     num_inducing, num_data, num_outputs):
    """
    Calculate the posterior mean and variance of the latent function values,
    for use in the expectation over the likelihood
    """
    # expand cholesky representation
    L = choleskies.flat_to_triang(q_u_chol)
    S = np.empty((num_outputs, num_inducing, num_inducing))
    for i in range(num_outputs):
        np.dot(L[i, :, :], L[i, :, :].T, S[i, :, :])  # S_i = L_i L_i^T, written in place
    logdetS = np.array([2. * np.sum(np.log(np.abs(np.diag(L[i, :, :])))) for i in range(L.shape[0])])

    # compute mean function stuff
    if mean_function is not None:
        prior_mean_u = mean_function.f(Z)
        prior_mean_f = mean_function.f(X)
    else:
        prior_mean_u = np.zeros((num_inducing, num_outputs))
        prior_mean_f = np.zeros((num_data, num_outputs))

    # compute kernel related stuff
    Kmm = kern.K(Z)
    Kmn = kern.K(Z, X)
    Knn_diag = kern.Kdiag(X)
    Lm = linalg.jitchol(Kmm)
    logdetKmm = 2. * np.sum(np.log(np.diag(Lm)))
    Kmmi, _ = linalg.dpotri(Lm)

    # compute the marginal means and variances of q(f)
    A, _ = linalg.dpotrs(Lm, Kmn)   # A = Kmm^{-1} Kmn
    mu = prior_mean_f + np.dot(A.T, q_u_mean - prior_mean_u)
    v = np.empty((num_data, num_outputs))
    for i in range(num_outputs):
        tmp = dtrmm(1.0, L[i].T, A, lower=0, trans_a=0)   # L_i^T A
        v[:, i] = np.sum(np.square(tmp), 0)               # diag(A^T S_i A)
    v += (Knn_diag - np.sum(A * Kmn, 0))[:, None]

    # compute the KL term
    Kmmim = np.dot(Kmmi, q_u_mean)
    KLs = -0.5 * logdetS - 0.5 * num_inducing + 0.5 * logdetKmm \
          + 0.5 * np.sum(Kmmi[None, :, :] * S, 1).sum(1) + 0.5 * np.sum(q_u_mean * Kmmim, 0)
    KL = KLs.sum()

    latent_detail = LatentFunctionDetails(q_u_mean=q_u_mean, q_u_chol=q_u_chol,
                                          mean_function=mean_function, mu=mu, v=v,
                                          prior_mean_u=prior_mean_u, L=L, A=A, S=S,
                                          Kmm=Kmm, Kmmi=Kmmi, Kmmim=Kmmim, KL=KL)
    return latent_detail
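# A standalone sketch (hypothetical L and A) of the identity behind the dtrmm trick
# above: summing the squared columns of L^T A equals the diagonal of A^T S A with
# S = L L^T, which is exactly the per-point marginal variance contribution of q(u).
import numpy as np

rng = np.random.default_rng(2)
M, N = 4, 7
L = np.linalg.cholesky(np.eye(M) + 0.1 * np.ones((M, M)))  # q(u) covariance factor
A = rng.standard_normal((M, N))                            # stand-in for Kmm^{-1} Kmn
S = L @ L.T

via_trmm = np.sum(np.square(L.T @ A), axis=0)   # what dtrmm + sum-of-squares computes
via_diag = np.diag(A.T @ S @ A)                 # the quantity it represents
assert np.allclose(via_trmm, via_diag)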
def calculate_KL(self, q_U, p_U_new, p_U_old, p_U_var, M, Mold, Q):
    r"""
    Calculates the KL divergences of q(u) w.r.t. the new prior, the old prior,
    and the old variational posterior (see KL-div for multivariate normals)
    Equation: \sum_q KL{q(u_q) || p(u_q)}
    """
    # Algebra for q(u):
    m_u = q_U.mu_u.copy()
    L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
    S_u = np.empty((Q, M, M))
    for q in range(Q):
        np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :])  # S_q = L_q L_q^T, written in place

    # Algebra for p(u|psi_new):
    Kuu_new = p_U_new.Kuu.copy()
    Luu_new = p_U_new.Luu.copy()
    Kuui_new = p_U_new.Kuui.copy()

    # Algebra for p(u|psi_old):
    Kuu_old = p_U_old.Kuu.copy()
    Luu_old = p_U_old.Luu.copy()
    Kuui_old = p_U_old.Kuui.copy()

    # Algebra for q(u|phi_old):
    Mu_var = p_U_var.Mu.copy()
    Kuu_var = p_U_var.Kuu.copy()
    Luu_var = p_U_var.Luu.copy()
    Kuui_var = p_U_var.Kuui.copy()

    KLnew = 0
    KLold = 0
    KLvar = 0
    for q in range(Q):
        KLnew += 0.5 * np.sum(Kuui_new[q, :, :] * S_u[q, :, :]) \
                 + 0.5 * np.dot(m_u[:, q, None].T, np.dot(Kuui_new[q, :, :], m_u[:, q, None])) \
                 - 0.5 * M \
                 + 0.5 * 2. * np.sum(np.log(np.abs(np.diag(Luu_new[q, :, :])))) \
                 - 0.5 * 2. * np.sum(np.log(np.abs(np.diag(L_u[q, :, :]))))
        KLold += 0.5 * np.sum(Kuui_old[q, :, :] * S_u[q, :, :]) \
                 + 0.5 * np.dot(m_u[:, q, None].T, np.dot(Kuui_old[q, :, :], m_u[:, q, None])) \
                 - 0.5 * M \
                 + 0.5 * 2. * np.sum(np.log(np.abs(np.diag(Luu_old[q, :, :])))) \
                 - 0.5 * 2. * np.sum(np.log(np.abs(np.diag(L_u[q, :, :]))))
        KLvar += 0.5 * np.sum(Kuui_var[q, :, :] * S_u[q, :, :]) \
                 + 0.5 * np.dot((Mu_var[q, :, :] - m_u[:, q, None]).T, np.dot(Kuui_var[q, :, :], (Mu_var[q, :, :] - m_u[:, q, None]))) \
                 - 0.5 * M \
                 + 0.5 * 2. * np.sum(np.log(np.abs(np.diag(Luu_var[q, :, :])))) \
                 - 0.5 * 2. * np.sum(np.log(np.abs(np.diag(L_u[q, :, :]))))
    return KLnew, KLold, KLvar
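# A standalone Monte Carlo sanity check (hypothetical variables) of the mean-shifted
# Gaussian KL accumulated in KLvar above, KL[N(m, S) || N(mu, K)], estimated as
# E_{x~q}[log q(x) - log p(x)] (the 2*pi normalizers cancel in the difference).
import numpy as np

rng = np.random.default_rng(6)
M = 5
A = rng.standard_normal((M, M)); K = A @ A.T + M * np.eye(M)    # prior covariance
B = rng.standard_normal((M, M)); S = B @ B.T + M * np.eye(M)    # q covariance
m = rng.standard_normal((M, 1)); mu = rng.standard_normal((M, 1))
Ki = np.linalg.inv(K)

# Mean-shifted closed form, as in the KLvar accumulation above
kl = 0.5 * np.sum(Ki * S) \
     + 0.5 * float((mu - m).T @ Ki @ (mu - m)) \
     - 0.5 * M \
     + 0.5 * np.linalg.slogdet(K)[1] \
     - 0.5 * np.linalg.slogdet(S)[1]

# Monte Carlo estimate
Ls = np.linalg.cholesky(S)
xs = m + Ls @ rng.standard_normal((M, 200000))
Si = np.linalg.inv(S)
logq = -0.5 * np.sum((xs - m) * (Si @ (xs - m)), axis=0) - 0.5 * np.linalg.slogdet(S)[1]
logp = -0.5 * np.sum((xs - mu) * (Ki @ (xs - mu)), axis=0) - 0.5 * np.linalg.slogdet(K)[1]
assert np.isclose(np.mean(logq - logp), kl, atol=0.05)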
def variational_q_fd(self, X, Z, q_U, p_U, kern_list, B, N, dims, d):
    r"""
    Description: Returns the posterior approximation q(f) for the latent output functions (LOFs)
    Equation:    q(f) = \int p(f|u) q(u) du
    Paper:       In Section 2.2.2 / Variational Bounds
    """
    Q = dims['Q']
    M = dims['M']

    #-----------------------------------------# POSTERIOR ALGEBRA #-------------------------------------#
    ####### Algebra for q(u) #######
    m_u = q_U.mu_u.copy()
    L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
    S_u = np.empty((Q, M, M))
    for q in range(Q):
        np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :])  # S_q = L_q L_q^T, written in place

    ####### Algebra for p(f_d|u) #######
    Kfdu = multi_output.cross_covariance(X, Z, B, kern_list, d)
    Luu = p_U.Luu.copy()
    Kff = multi_output.function_covariance(X, B, kern_list, d)
    Kff_diag = np.diag(Kff)

    ####### Algebra for q(f_d) = E_{q(u)}[p(f_d|u)] #######
    Afdu = np.empty((Q, N, M))  # Afdu = K_{fdu_q} K_{u_q u_q}^{-1}
    m_fd = np.zeros((N, 1))
    v_fd = np.zeros((N, 1))
    S_fd = np.zeros((N, N))
    v_fd += Kff_diag[:, None]
    S_fd += Kff
    for q in range(Q):
        ####### Expectation w.r.t. u_q part #######
        R, _ = linalg.dpotrs(np.asfortranarray(Luu[q, :, :]), Kfdu[:, q * M:(q * M) + M].T)
        Afdu[q, :, :] = R.T
        m_fd += np.dot(Afdu[q, :, :], m_u[:, q, None])
        tmp = dtrmm(alpha=1.0, a=L_u[q, :, :].T, b=R, lower=0, trans_a=0)
        v_fd += np.sum(np.square(tmp), 0)[:, None] - np.sum(R * Kfdu[:, q * M:(q * M) + M].T, 0)[:, None]
        S_fd += np.dot(np.dot(R.T, S_u[q, :, :]), R) - np.dot(Kfdu[:, q * M:(q * M) + M], R)

    if (v_fd < 0).any():
        print('v negative!')

    #--------------------------------------# VARIATIONAL POSTERIOR (LOFs) #-----------------------------------#
    ####### Variational output distribution q_fd() #######
    q_fd = qfd(m_fd=m_fd, v_fd=v_fd, Kfdu=Kfdu, Afdu=Afdu, S_fd=S_fd)
    return q_fd
def calculate_q_f(self, X, Z, q_U, p_U, kern_list, B, M, N, Q, D, d):
    r"""
    Calculates the mean and variance of q(f_d) as
    Equation: E_{q(U)}\{p(f_d|U)\}
    """
    # Algebra for q(u):
    m_u = q_U.mu_u.copy()
    L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
    S_u = np.empty((Q, M, M))
    for q in range(Q):
        np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :])  # S_q = L_q L_q^T, written in place

    # Algebra for p(f_d|u):
    Kfdu = util.cross_covariance(X, Z, B, kern_list, d)
    Kuu = p_U.Kuu.copy()
    Luu = p_U.Luu.copy()
    Kuui = p_U.Kuui.copy()
    Kff = util.function_covariance(X, B, kern_list, d)
    Kff_diag = np.diag(Kff)

    # Algebra for q(f_d) = E_{q(u)}[p(f_d|u)]
    Afdu = np.empty((Q, N, M))  # Afdu = K_{fdu_q} K_{u_q u_q}^{-1}
    m_fd = np.zeros((N, 1))
    v_fd = np.zeros((N, 1))
    S_fd = np.zeros((N, N))
    v_fd += Kff_diag[:, None]
    S_fd += Kff
    for q in range(Q):
        # Expectation part
        R, _ = linalg.dpotrs(np.asfortranarray(Luu[q, :, :]), Kfdu[:, q * M:(q * M) + M].T)
        Afdu[q, :, :] = R.T
        m_fd += np.dot(Afdu[q, :, :], m_u[:, q, None])
        tmp = dtrmm(alpha=1.0, a=L_u[q, :, :].T, b=R, lower=0, trans_a=0)
        v_fd += np.sum(np.square(tmp), 0)[:, None] - np.sum(R * Kfdu[:, q * M:(q * M) + M].T, 0)[:, None]
        S_fd += np.dot(np.dot(R.T, S_u[q, :, :]), R) - np.dot(Kfdu[:, q * M:(q * M) + M], R)

    if (v_fd < 0).any():
        print('v negative!')

    q_fd = qfd(m_fd=m_fd, v_fd=v_fd, Kfdu=Kfdu, Afdu=Afdu, S_fd=S_fd)
    return q_fd
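# A standalone Monte Carlo check (toy kernel and hypothetical names) of the q(f)
# moments computed above for a single latent function: with A = Kfu Kuu^{-1},
# the mean is A m_u and, by the law of total covariance, the covariance is
# Kff - A Kuf + A S_u A^T, matching the m_fd / S_fd accumulation in calculate_q_f.
import numpy as np

rng = np.random.default_rng(3)
M, N = 3, 5
X = np.linspace(0, 1, N)[:, None]; Z = np.linspace(0, 1, M)[:, None]
k = lambda a, b: np.exp(-0.5 * (a - b.T) ** 2 / 0.3 ** 2)   # toy RBF kernel
Kff, Kfu, Kuu = k(X, X), k(X, Z), k(Z, Z) + 1e-8 * np.eye(M)

m_u = rng.standard_normal((M, 1))
Lq = 0.3 * np.tril(rng.standard_normal((M, M))) + np.eye(M)
S_u = Lq @ Lq.T                                              # q(u) = N(m_u, S_u)

A = Kfu @ np.linalg.inv(Kuu)
m_f = A @ m_u                                                # closed-form mean of q(f)
S_f = Kff - A @ Kfu.T + A @ S_u @ A.T                        # closed-form covariance of q(f)

# Monte Carlo: average the conditional moments of p(f|u) over u ~ q(u)
us = m_u + Lq @ rng.standard_normal((M, 20000))
cond_means = A @ us                                          # E[f|u] per sample
mc_mean = cond_means.mean(axis=1, keepdims=True)
mc_cov = (Kff - A @ Kfu.T) + np.cov(cond_means)              # law of total covariance
assert np.allclose(mc_mean, m_f, atol=0.05)
assert np.allclose(mc_cov, S_f, atol=0.05)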
def posteriors_F(self, Xnew, which_out=None):
    # This function returns all the q(f*) posteriors associated with each output.
    # Xnew can be a list with one entry per likelihood defined for the HetMOGP,
    # or a single numpy array that is replicated for each output.
    if isinstance(Xnew, list):
        Xmulti_all_new = Xnew
    else:
        Xmulti_all_new = []
        for i in range(self.num_output_funcs):
            Xmulti_all_new.append(Xnew.copy())

    M = self.Z.shape[0]
    Q = len(self.kern_list)
    D = self.likelihood.num_output_functions(self.Y_metadata)
    Kuu, Luu, Kuui = util.VIK_covariance(self.Z, self.kern_list, self.kern_list_Tq, self.kern_aux)
    p_U = pu(Kuu=Kuu, Luu=Luu, Kuui=Kuui)
    q_U = qu(mu_u=self.q_u_means, chols_u=self.q_u_chols)
    S_u = np.empty((Q, M, M))
    L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
    for q in range(Q):
        np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :])  # S_q = L_q L_q^T, written in place

    # for every latent function f_d calculate q(f_d) and keep it as q(F):
    posteriors_F = []
    f_index = self.Y_metadata['function_index'].flatten()
    d_index = self.Y_metadata['d_index'].flatten()
    if which_out is None:
        indix_aux = f_index.copy()
    else:
        which_out = np.array(which_out)
        indix_aux = -1 * np.ones_like(f_index)
        for i in range(which_out.shape[0]):
            posix = np.where(f_index == which_out[i])
            indix_aux[posix] = f_index[posix].copy()

    for d in range(D):
        if f_index[d] == indix_aux[d]:
            Xtask = Xmulti_all_new[f_index[d]]
            q_fd, _ = self.inference_method.calculate_q_f(X=Xtask, Z=self.Z, q_U=q_U, S_u=S_u, p_U=p_U,
                                                          kern_list=self.kern_list,
                                                          kern_list_Gdj=self.kern_list_Gdj,
                                                          kern_list_Tq=self.kern_list_Tq,
                                                          kern_aux=self.kern_aux, B=self.B_list,
                                                          M=M, N=Xtask.shape[0], Q=Q, D=D, d=d)
            # Posterior objects for output functions (used in prediction)
            posterior_fd = Posterior(mean=q_fd.m_fd.copy(), cov=q_fd.S_fd.copy(),
                                     K=util.function_covariance(X=Xtask, B=self.B_list,
                                                                kernel_list=self.kern_list, d=d),
                                     prior_mean=np.zeros(q_fd.m_fd.shape))
            posteriors_F.append(posterior_fd)
        else:
            posteriors_F.append([])  # placeholder for outputs that were not requested
    return posteriors_F
def inference(self, q_u_means, q_u_chols, X, Y, Z, kern_list, kern_list_Gdj, kern_aux,
              likelihood, B_list, Y_metadata, KL_scale=1.0, batch_scale=None,
              predictive=False, Gauss_Newton=False):
    M = Z.shape[0]
    T = len(Y)
    if batch_scale is None:
        batch_scale = [1.0] * T
    Ntask = [Y[t].shape[0] for t in range(T)]
    Q = len(kern_list)
    D = likelihood.num_output_functions(Y_metadata)
    Kuu, Luu, Kuui = util.latent_funs_cov(Z, kern_list)
    p_U = pu(Kuu=Kuu, Luu=Luu, Kuui=Kuui)
    q_U = qu(mu_u=q_u_means.copy(), chols_u=q_u_chols.copy())
    S_u = np.empty((Q, M, M))
    L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
    for q in range(Q):
        np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :])  # S_q = L_q L_q^T, written in place
    Su_add_Kuu = np.zeros((Q, M, M))
    Su_add_Kuu_chol = np.zeros((Q, M, M))
    for q in range(Q):
        Su_add_Kuu[q, :, :] = S_u[q, :, :] + Kuu[q, :, :]
        Su_add_Kuu_chol[q, :, :] = linalg.jitchol(Su_add_Kuu[q, :, :])

    # for every latent function f_d calculate q(f_d) and keep it as q(F):
    q_F = []
    posteriors_F = []
    f_index = Y_metadata['function_index'].flatten()
    d_index = Y_metadata['d_index'].flatten()

    for d in range(D):
        Xtask = X[f_index[d]]
        q_fd, q_U = self.calculate_q_f(X=Xtask, Z=Z, q_U=q_U, S_u=S_u, p_U=p_U,
                                       kern_list=kern_list, kern_list_Gdj=kern_list_Gdj,
                                       kern_aux=kern_aux, B=B_list, M=M,
                                       N=Xtask.shape[0], Q=Q, D=D, d=d)
        # Posterior objects for output functions (used in prediction)
        # TODO: the Posterior object below may be unnecessary and could be removed
        posterior_fd = Posterior(mean=q_fd.m_fd.copy(), cov=q_fd.S_fd.copy(),
                                 K=util.conv_function_covariance(X=Xtask, B=B_list,
                                                                 kernel_list=kern_list,
                                                                 kernel_list_Gdj=kern_list_Gdj,
                                                                 kff_aux=kern_aux, d=d),
                                 prior_mean=np.zeros(q_fd.m_fd.shape))
        posteriors_F.append(posterior_fd)
        q_F.append(q_fd)

    mu_F = []
    v_F = []
    for t in range(T):
        mu_F_task = np.empty((X[t].shape[0], 1))
        v_F_task = np.empty((X[t].shape[0], 1))
        for d, q_fd in enumerate(q_F):
            if f_index[d] == t:
                mu_F_task = np.hstack((mu_F_task, q_fd.m_fd))
                v_F_task = np.hstack((v_F_task, q_fd.v_fd))
        mu_F.append(mu_F_task[:, 1:])
        v_F.append(v_F_task[:, 1:])

    # posterior_Fnew for predictive
    if predictive:
        return posteriors_F
    # inference for rest of cases
    else:
        # Variational Expectations
        VE = likelihood.var_exp(Y, mu_F, v_F, Y_metadata)
        VE_dm, VE_dv = likelihood.var_exp_derivatives(Y, mu_F, v_F, Y_metadata, Gauss_Newton)
        for t in range(T):
            VE[t] = VE[t] * batch_scale[t]
            VE_dm[t] = VE_dm[t] * batch_scale[t]
            VE_dv[t] = VE_dv[t] * batch_scale[t]

        # KL Divergence
        KL = self.calculate_KL(q_U=q_U, Su_add_Kuu=Su_add_Kuu, Su_add_Kuu_chol=Su_add_Kuu_chol,
                               p_U=p_U, M=M, Q=Q, D=D)

        # Log Marginal log(p(Y))
        F = 0
        for t in range(T):
            F += VE[t].sum()
        log_marginal = F - KL

        # Gradients and Posteriors
        dL_dS_u = []
        dL_dmu_u = []
        dL_dL_u = []
        dL_dKmm = []
        dL_dKmn = []
        dL_dKdiag = []
        posteriors = []
        for q in range(Q):
            (dL_dmu_q, dL_dL_q, dL_dS_q, posterior_q,
             dL_dKqq, dL_dKdq, dL_dKdiag_q) = self.calculate_gradients(
                q_U=q_U, S_u=S_u, Su_add_Kuu_chol=Su_add_Kuu_chol, p_U=p_U, q_F=q_F,
                VE_dm=VE_dm, VE_dv=VE_dv, Ntask=Ntask, M=M, Q=Q, D=D,
                f_index=f_index, d_index=d_index, q=q)
            dL_dmu_u.append(dL_dmu_q)
            dL_dL_u.append(dL_dL_q)
            dL_dS_u.append(dL_dS_q)
            dL_dKmm.append(dL_dKqq)
            dL_dKmn.append(dL_dKdq)
            dL_dKdiag.append(dL_dKdiag_q)
            posteriors.append(posterior_q)

        gradients = {'dL_dmu_u': dL_dmu_u, 'dL_dL_u': dL_dL_u, 'dL_dS_u': dL_dS_u,
                     'dL_dKmm': dL_dKmm, 'dL_dKmn': dL_dKmn, 'dL_dKdiag': dL_dKdiag}

        return log_marginal, gradients, posteriors, posteriors_F
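# A standalone sketch of the batch_scale convention assumed above: scaling each
# minibatch variational-expectation term by N_t / |batch_t| makes the stochastic
# ELBO an unbiased estimate of the full-data sum. Names below are hypothetical.
import numpy as np

rng = np.random.default_rng(4)
ve_full = rng.standard_normal(1000)        # per-datum variational expectations
N, B = ve_full.size, 50
batch_scale = N / B

batches = ve_full.reshape(-1, B)           # disjoint minibatches covering the data
scaled = batch_scale * batches.sum(axis=1) # one stochastic-ELBO term per batch
assert np.isclose(np.mean(scaled), ve_full.sum())   # unbiased on average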
def calculate_gradients(self, q_U, S_u, Su_add_Kuu_chol, p_U, q_F, VE_dm, VE_dv, Ntask,
                        M, Q, D, f_index, d_index, q):
    """
    Calculates gradients of the log-marginal distribution p(Y) w.r.t. the
    variational parameters mu_q, S_q
    """
    # Algebra for q(u) and p(u):
    m_u = q_U.mu_u.copy()
    Kuu = p_U.Kuu.copy()
    Luu = p_U.Luu.copy()
    Kuui = p_U.Kuui.copy()
    S_qi, _ = linalg.dpotri(np.asfortranarray(Su_add_Kuu_chol[q, :, :]))

    if np.any(np.isinf(S_qi)):
        raise ValueError("Sqi: Cholesky representation unstable")

    # KL Terms
    dKL_dmu_q = []
    dKL_dKqq = 0
    for d in range(D):
        dKL_dmu_q.append(np.dot(Kuui[q, :, :], m_u[d][:, q, None]))
        dKL_dKqq += -0.5 * S_qi + 0.5 * Kuui[q, :, :] \
                    - 0.5 * Kuui[q, :, :].dot(S_u[q, :, :]).dot(Kuui[q, :, :]) \
                    - 0.5 * np.dot(Kuui[q, :, :], np.dot(m_u[d][:, q, None], m_u[d][:, q, None].T)).dot(Kuui[q, :, :].T)
    dKL_dS_q = 0.5 * (Kuui[q, :, :] - S_qi) * D

    # VE Terms
    dVE_dmu_q = []
    dVE_dS_q = np.zeros((M, M))
    dVE_dKqq = np.zeros((M, M))
    dVE_dKqd = []
    dVE_dKdiag = []
    dL_dmu_q = []

    for d, q_fd in enumerate(q_F):
        Nt = Ntask[f_index[d]]
        dVE_dmu_q.append(np.dot(q_fd.Afdu[q, :, :].T, VE_dm[f_index[d]][:, d_index[d]])[:, None])
        dL_dmu_q.append(dVE_dmu_q[d] - dKL_dmu_q[d])
        Adv = q_fd.Afdu[q, :, :].T * VE_dv[f_index[d]][:, d_index[d], None].T
        Adv = np.ascontiguousarray(Adv)
        AdvA = np.dot(Adv.reshape(-1, Nt), q_fd.Afdu[q, :, :]).reshape(M, M)
        dVE_dS_q += AdvA

        # Derivatives dKuquq
        tmp_dv = np.dot(AdvA, S_u[q, :, :]).dot(Kuui[q, :, :])
        dVE_dKqq += -tmp_dv - tmp_dv.T  # + AdvA: last term not included in this derivative
        Adm = np.dot(q_fd.Afdu[q, :, :].T, VE_dm[f_index[d]][:, d_index[d], None])
        dVE_dKqq += -np.dot(Adm, np.dot(Kuui[q, :, :], m_u[d][:, q, None]).T)

        # Derivatives dKuqfd
        tmp = np.dot(S_u[q, :, :], Kuui[q, :, :])
        tmp = 2. * tmp  # 2.*(tmp - np.eye(M)): the -2Adv term is not included here
        dve_kqd = np.dot(np.dot(Kuui[q, :, :], m_u[d][:, q, None]), VE_dm[f_index[d]][:, d_index[d], None].T)
        dve_kqd += np.dot(tmp.T, Adv)
        dVE_dKqd.append(dve_kqd)

        # Derivatives dKdiag
        dVE_dKdiag.append(VE_dv[f_index[d]][:, d_index[d]])

    dVE_dKqq = 0.5 * (dVE_dKqq + dVE_dKqq.T)

    # Sum of VE and KL terms
    dL_dS_q = dVE_dS_q - dKL_dS_q
    dL_dKqq = dVE_dKqq - dKL_dKqq
    dL_dKdq = dVE_dKqd
    dL_dKdiag = dVE_dKdiag

    # Pass S_q gradients to its lower-triangular representation L_q
    chol_u = q_U.chols_u.copy()
    L_q = choleskies.flat_to_triang(chol_u[:, q:q + 1])
    dL_dL_q = 2. * np.array([np.dot(a, b) for a, b in zip(dL_dS_q[None, :, :], L_q)])
    dL_dL_q = choleskies.triang_to_flat(dL_dL_q)

    # Posterior
    posterior_q = []
    for d in range(D):
        posterior_q.append(Posterior(mean=m_u[d][:, q, None], cov=S_u[q, :, :] + Kuu[q, :, :],
                                     K=Kuu[q, :, :], prior_mean=np.zeros(m_u[d][:, q, None].shape)))
    return dL_dmu_q, dL_dL_q, dL_dS_q, posterior_q, dL_dKqq, dL_dKdq, dL_dKdiag
def calculate_gradients(self, q_U, p_U, q_F, VE_dm, VE_dv, Ntask, M, Q, D, f_index, d_index, j):
    """
    Calculates gradients of the log-marginal distribution p(Y) w.r.t. the
    variational parameters mu_j, S_j
    """
    # Algebra for q(u) and p(u):
    m_u = q_U.mu_u.copy()
    L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
    S_u = np.dot(L_u[j, :, :], L_u[j, :, :].T)  # could be computed outside and passed in to reduce computation
    Kuu = p_U.Kuu.copy()
    Luu = p_U.Luu.copy()
    Kuui = p_U.Kuui.copy()
    S_qi, _ = linalg.dpotri(np.asfortranarray(L_u[j, :, :]))

    if np.any(np.isinf(S_qi)):
        raise ValueError("Sqi: Cholesky representation unstable")

    # KL Terms
    dKL_dmu_j = np.dot(Kuui[j, :, :], m_u[:, j, None])
    dKL_dS_j = 0.5 * (Kuui[j, :, :] - S_qi)
    dKL_dKjj = 0.5 * Kuui[j, :, :] - 0.5 * Kuui[j, :, :].dot(S_u).dot(Kuui[j, :, :]) \
               - 0.5 * np.dot(Kuui[j, :, :], np.dot(m_u[:, j, None], m_u[:, j, None].T)).dot(Kuui[j, :, :].T)

    # VE Terms
    dVE_dmu_j = np.zeros((M, 1))
    dVE_dS_j = np.zeros((M, M))
    dVE_dKjj = np.zeros((M, M))
    dVE_dKjd = []
    dVE_dKdiag = []

    Nt = Ntask[f_index[j]]
    dVE_dmu_j += np.dot(q_F[j].Afdu.T, VE_dm[f_index[j]][:, d_index[j]])[:, None]
    Adv = q_F[j].Afdu.T * VE_dv[f_index[j]][:, d_index[j], None].T
    Adv = np.ascontiguousarray(Adv)
    AdvA = np.dot(Adv.reshape(-1, Nt), q_F[j].Afdu).reshape(M, M)
    dVE_dS_j += AdvA

    # Derivatives dKuquq
    tmp_dv = np.dot(AdvA, S_u).dot(Kuui[j, :, :])
    dVE_dKjj += AdvA - tmp_dv - tmp_dv.T
    Adm = np.dot(q_F[j].Afdu.T, VE_dm[f_index[j]][:, d_index[j], None])
    dVE_dKjj += -np.dot(Adm, np.dot(Kuui[j, :, :], m_u[:, j, None]).T)

    # Derivatives dKuqfd
    tmp = np.dot(S_u, Kuui[j, :, :])
    tmp = 2. * (tmp - np.eye(M))
    dve_kjd = np.dot(np.dot(Kuui[j, :, :], m_u[:, j, None]), VE_dm[f_index[j]][:, d_index[j], None].T)
    dve_kjd += np.dot(tmp.T, Adv)
    dVE_dKjd.append(dve_kjd)

    # Derivatives dKdiag
    dVE_dKdiag.append(VE_dv[f_index[j]][:, d_index[j]])

    dVE_dKjj = 0.5 * (dVE_dKjj + dVE_dKjj.T)

    # Sum of VE and KL terms
    dL_dmu_j = dVE_dmu_j - dKL_dmu_j
    dL_dS_j = dVE_dS_j - dKL_dS_j
    dL_dKjj = dVE_dKjj - dKL_dKjj
    dL_dKdj = dVE_dKjd[0].copy()      # just pass the unique position
    dL_dKdiag = dVE_dKdiag[0].copy()  # just pass the unique position

    # Pass S_j gradients to its lower-triangular representation L_j
    chol_u = q_U.chols_u.copy()
    L_j = choleskies.flat_to_triang(chol_u[:, j:j + 1])
    dL_dL_j = 2. * np.array([np.dot(a, b) for a, b in zip(dL_dS_j[None, :, :], L_j)])
    dL_dL_j = choleskies.triang_to_flat(dL_dL_j)

    # Posterior
    posterior_j = Posterior(mean=m_u[:, j, None], cov=S_u, K=Kuu[j, :, :],
                            prior_mean=np.zeros(m_u[:, j, None].shape))
    return dL_dmu_j, dL_dL_j, dL_dS_j, posterior_j, dL_dKjj, dL_dKdj, dL_dKdiag
def elbo_derivatives(self, q_U, p_U, q_F, VE_dm, VE_dv, Ntask, dims, f_index, d_index, q):
    """
    Description: Returns ELBO derivatives w.r.t. variational parameters and hyperparameters
    Equation:    gradients = {dL/dmu, dL/dS, dL/dKmm, dL/dKmn, dL/dKdiag}
    Paper:       In Appendix 4 and 5
    Extra_Info:  Gradients w.r.t. hyperparameters use the chain rule and GPy.
                 Note that Kmm, Kmn, Kdiag are matrices
    """
    Q = dims['Q']
    M = dims['M']

    #------------------------------------# ALGEBRA FOR DERIVATIVES #------------------------------------#
    ####### Algebra for q(u) and p(u) #######
    m_u = q_U.mu_u.copy()
    L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
    S_u = np.empty((Q, M, M))
    [np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :]) for q in range(Q)]  # S_q = L_q L_q^T (comprehension scope keeps the argument q intact)
    Kuu = p_U.Kuu.copy()
    Kuui = p_U.Kuui.copy()
    S_qi, _ = linalg.dpotri(np.asfortranarray(L_u[q, :, :]))

    if np.any(np.isinf(S_qi)):
        raise ValueError("Sqi: Cholesky representation unstable")

    #-------------------------------------# DERIVATIVES OF ELBO TERMS #----------------------------------#
    ####### KL Terms #######
    dKL_dmu_q = np.dot(Kuui[q, :, :], m_u[:, q, None])
    dKL_dS_q = 0.5 * (Kuui[q, :, :] - S_qi)
    dKL_dKqq = 0.5 * Kuui[q, :, :] - 0.5 * Kuui[q, :, :].dot(S_u[q, :, :]).dot(Kuui[q, :, :]) \
               - 0.5 * np.dot(Kuui[q, :, :], np.dot(m_u[:, q, None], m_u[:, q, None].T)).dot(Kuui[q, :, :].T)

    ####### Variational Expectation (VE) Terms #######
    dVE_dmu_q = np.zeros((M, 1))
    dVE_dS_q = np.zeros((M, M))
    dVE_dKqq = np.zeros((M, M))
    dVE_dKqd = []
    dVE_dKdiag = []

    for d, q_fd in enumerate(q_F):
        Nt = Ntask[f_index[d]]
        dVE_dmu_q += np.dot(q_fd.Afdu[q, :, :].T, VE_dm[f_index[d]][:, d_index[d]])[:, None]
        Adv = q_fd.Afdu[q, :, :].T * VE_dv[f_index[d]][:, d_index[d], None].T
        Adv = np.ascontiguousarray(Adv)
        AdvA = np.dot(Adv.reshape(-1, Nt), q_fd.Afdu[q, :, :]).reshape(M, M)
        dVE_dS_q += AdvA

        ####### Derivatives dKuquq #######
        tmp_dv = np.dot(AdvA, S_u[q, :, :]).dot(Kuui[q, :, :])
        dVE_dKqq += AdvA - tmp_dv - tmp_dv.T
        Adm = np.dot(q_fd.Afdu[q, :, :].T, VE_dm[f_index[d]][:, d_index[d], None])
        dVE_dKqq += -np.dot(Adm, np.dot(Kuui[q, :, :], m_u[:, q, None]).T)

        ####### Derivatives dKuqfd #######
        tmp = np.dot(S_u[q, :, :], Kuui[q, :, :])
        tmp = 2. * (tmp - np.eye(M))
        dve_kqd = np.dot(np.dot(Kuui[q, :, :], m_u[:, q, None]), VE_dm[f_index[d]][:, d_index[d], None].T)
        dve_kqd += np.dot(tmp.T, Adv)
        dVE_dKqd.append(dve_kqd)

        ####### Derivatives dKdiag #######
        dVE_dKdiag.append(VE_dv[f_index[d]][:, d_index[d]])

    dVE_dKqq = 0.5 * (dVE_dKqq + dVE_dKqq.T)

    #--------------------------------------# FINAL ELBO DERIVATIVES #------------------------------------#
    ####### ELBO derivatives ---> sum of VE and KL terms #######
    dL_dmu_q = dVE_dmu_q - dKL_dmu_q
    dL_dS_q = dVE_dS_q - dKL_dS_q
    dL_dKqq = dVE_dKqq - dKL_dKqq
    dL_dKdq = dVE_dKqd
    dL_dKdiag = dVE_dKdiag

    ####### Pass S_q gradients to its lower-triangular representation L_q #######
    chol_u = q_U.chols_u.copy()
    L_q = choleskies.flat_to_triang(chol_u[:, q:q + 1])
    dL_dL_q = 2. * np.array([np.dot(a, b) for a, b in zip(dL_dS_q[None, :, :], L_q)])
    dL_dL_q = choleskies.triang_to_flat(dL_dL_q)

    return dL_dmu_q, dL_dL_q, dL_dS_q, dL_dKqq, dL_dKdq, dL_dKdiag
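# A standalone finite-difference check (hypothetical scalar objective) of the
# chain rule used above to map dL/dS onto the triangular factor: for S = L L^T
# and a symmetric G = df/dS, df/dL = (G + G^T)L = 2GL, with only the lower-tri
# entries of L free.
import numpy as np

rng = np.random.default_rng(5)
M = 4
L = np.tril(rng.standard_normal((M, M))) + 2 * np.eye(M)
C = rng.standard_normal((M, M)); C = 0.5 * (C + C.T)     # symmetric weight matrix

f = lambda Lmat: np.sum(C * (Lmat @ Lmat.T))             # f(S) = tr(C S), so df/dS = C
analytic = np.tril(2.0 * C @ L)                          # only lower-tri entries are free

fd = np.zeros_like(L); eps = 1e-6
for i in range(M):
    for j in range(i + 1):
        E = np.zeros((M, M)); E[i, j] = eps
        fd[i, j] = (f(L + E) - f(L - E)) / (2 * eps)     # central difference
assert np.allclose(fd, analytic, atol=1e-5)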
def calculate_gradients(self, q_U, p_U_new, p_U_old, p_U_var, q_F, VE_dm, VE_dv,
                        Ntask, M, Q, D, f_index, d_index, q):
    """
    Calculates gradients of the log-marginal distribution p(Y) w.r.t. the
    variational parameters mu_q, S_q
    """
    # Algebra for q(u):
    m_u = q_U.mu_u.copy()
    L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
    S_u = np.empty((Q, M, M))
    [np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :]) for q in range(Q)]  # S_q = L_q L_q^T (comprehension scope keeps the argument q intact)
    S_qi, _ = linalg.dpotri(np.asfortranarray(L_u[q, :, :]))

    if np.any(np.isinf(S_qi)):
        raise ValueError("Sqi: Cholesky representation unstable")

    # Algebra for p(u):
    Kuu_new = p_U_new.Kuu.copy()
    Luu_new = p_U_new.Luu.copy()
    Kuui_new = p_U_new.Kuui.copy()

    Kuu_old = p_U_old.Kuu.copy()
    Luu_old = p_U_old.Luu.copy()
    Kuui_old = p_U_old.Kuui.copy()

    Mu_var = p_U_var.Mu.copy()
    Kuu_var = p_U_var.Kuu.copy()
    Luu_var = p_U_var.Luu.copy()
    Kuui_var = p_U_var.Kuui.copy()

    # KL Terms
    dKLnew_dmu_q = np.dot(Kuui_new[q, :, :], m_u[:, q, None])
    dKLnew_dS_q = 0.5 * (Kuui_new[q, :, :] - S_qi)
    dKLold_dmu_q = np.dot(Kuui_old[q, :, :], m_u[:, q, None])
    dKLold_dS_q = 0.5 * (Kuui_old[q, :, :] - S_qi)
    dKLvar_dmu_q = np.dot(Kuui_var[q, :, :], (m_u[:, q, None] - Mu_var[q, :, :]))  # important!! (Eq. 69 MCB)
    dKLvar_dS_q = 0.5 * (Kuui_var[q, :, :] - S_qi)

    dKLnew_dKqq = 0.5 * Kuui_new[q, :, :] - 0.5 * Kuui_new[q, :, :].dot(S_u[q, :, :]).dot(Kuui_new[q, :, :]) \
                  - 0.5 * np.dot(Kuui_new[q, :, :], np.dot(m_u[:, q, None], m_u[:, q, None].T)).dot(Kuui_new[q, :, :].T)
    dKLold_dKqq = 0.5 * Kuui_old[q, :, :] - 0.5 * Kuui_old[q, :, :].dot(S_u[q, :, :]).dot(Kuui_old[q, :, :]) \
                  - 0.5 * np.dot(Kuui_old[q, :, :], np.dot(m_u[:, q, None], m_u[:, q, None].T)).dot(Kuui_old[q, :, :].T)
    #dKLvar_dKqq = 0.5 * Kuui_var[q,:,:] - 0.5 * Kuui_var[q,:,:].dot(S_u[q, :, :]).dot(Kuui_var[q,:,:]) \
    #              - 0.5 * np.dot(Kuui_var[q,:,:], np.dot(m_u[:, q, None], m_u[:, q, None].T)).dot(Kuui_var[q,:,:].T) \
    #              + 0.5 * np.dot(Kuui_var[q,:,:], np.dot(m_u[:, q, None], Mu_var[q,:,:].T)).dot(Kuui_var[q,:,:].T) \
    #              + 0.5 * np.dot(Kuui_var[q,:,:], np.dot(Mu_var[q,:,:], m_u[:, q, None].T)).dot(Kuui_var[q,:,:].T) \
    #              - 0.5 * np.dot(Kuui_var[q,:,:], np.dot(Mu_var[q,:,:], Mu_var[q,:,:].T)).dot(Kuui_var[q,:,:].T)

    # VE Terms
    dVE_dmu_q = np.zeros((M, 1))
    dVE_dS_q = np.zeros((M, M))
    dVE_dKqq = np.zeros((M, M))
    dVE_dKqd = []
    dVE_dKdiag = []

    for d, q_fd in enumerate(q_F):
        Nt = Ntask[f_index[d]]
        dVE_dmu_q += np.dot(q_fd.Afdu[q, :, :].T, VE_dm[f_index[d]][:, d_index[d]])[:, None]
        Adv = q_fd.Afdu[q, :, :].T * VE_dv[f_index[d]][:, d_index[d], None].T
        Adv = np.ascontiguousarray(Adv)
        AdvA = np.dot(Adv.reshape(-1, Nt), q_fd.Afdu[q, :, :]).reshape(M, M)
        dVE_dS_q += AdvA

        # Derivatives dKuquq
        tmp_dv = np.dot(AdvA, S_u[q, :, :]).dot(Kuui_new[q, :, :])
        dVE_dKqq += AdvA - tmp_dv - tmp_dv.T
        Adm = np.dot(q_fd.Afdu[q, :, :].T, VE_dm[f_index[d]][:, d_index[d], None])
        dVE_dKqq += -np.dot(Adm, np.dot(Kuui_new[q, :, :], m_u[:, q, None]).T)

        # Derivatives dKuqfd
        tmp = np.dot(S_u[q, :, :], Kuui_new[q, :, :])
        tmp = 2. * (tmp - np.eye(M))
        dve_kqd = np.dot(np.dot(Kuui_new[q, :, :], m_u[:, q, None]), VE_dm[f_index[d]][:, d_index[d], None].T)
        dve_kqd += np.dot(tmp.T, Adv)
        dVE_dKqd.append(dve_kqd)

        # Derivatives dKdiag
        dVE_dKdiag.append(VE_dv[f_index[d]][:, d_index[d]])

    dVE_dKqq = 0.5 * (dVE_dKqq + dVE_dKqq.T)

    # Derivatives of variational parameters
    dL_dmu_q = dVE_dmu_q - dKLnew_dmu_q + dKLold_dmu_q - dKLvar_dmu_q
    dL_dS_q = dVE_dS_q - dKLnew_dS_q + dKLold_dS_q - dKLvar_dS_q

    # Derivatives of prior hyperparameters
    # if using Zgrad: dL_dKqq = dVE_dKqq - dKLnew_dKqq + dKLold_dKqq - dKLvar_dKqq
    # otherwise, for hyperparameters:
    dL_dKqq = dVE_dKqq - dKLnew_dKqq  # dKLold_dKqq only for Zgrad; dKLvar_dKqq still to be derived (for Zgrad)
    dL_dKdq = dVE_dKqd
    dL_dKdiag = dVE_dKdiag

    # Pass S_q gradients to its lower-triangular representation L_q
    chol_u = q_U.chols_u.copy()
    L_q = choleskies.flat_to_triang(chol_u[:, q:q + 1])
    dL_dL_q = 2. * np.array([np.dot(a, b) for a, b in zip(dL_dS_q[None, :, :], L_q)])
    dL_dL_q = choleskies.triang_to_flat(dL_dL_q)

    # Posterior
    posterior_q = Posterior(mean=m_u[:, q, None], cov=S_u[q, :, :], K=Kuu_new[q, :, :],
                            prior_mean=np.zeros(m_u[:, q, None].shape))
    return dL_dmu_q, dL_dL_q, dL_dS_q, posterior_q, dL_dKqq, dL_dKdq, dL_dKdiag
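# A standalone finite-difference check (hypothetical variables) of the Gaussian-KL
# gradients used throughout these routines: for KL[N(m, S) || N(mu, K)],
# dKL/dm = K^{-1}(m - mu) and dKL/dS = 0.5*(K^{-1} - S^{-1}).
import numpy as np

rng = np.random.default_rng(7)
M = 4
A = rng.standard_normal((M, M)); K = A @ A.T + M * np.eye(M)
Sm = rng.standard_normal((M, M)); S = Sm @ Sm.T + M * np.eye(M)
m = rng.standard_normal((M, 1)); mu = rng.standard_normal((M, 1))
Ki = np.linalg.inv(K)

def kl(m, S):
    return 0.5 * (np.trace(Ki @ S) + float((m - mu).T @ Ki @ (m - mu)) - M
                  + np.linalg.slogdet(K)[1] - np.linalg.slogdet(S)[1])

dm = Ki @ (m - mu)                     # analytic mean gradient
dS = 0.5 * (Ki - np.linalg.inv(S))     # analytic covariance gradient (entrywise)
eps = 1e-6

fd_m = np.array([(kl(m + eps * np.eye(M)[:, [i]], S)
                  - kl(m - eps * np.eye(M)[:, [i]], S)) / (2 * eps) for i in range(M)])
assert np.allclose(fd_m[:, None], dm, atol=1e-5)

E = np.zeros((M, M)); E[0, 1] = E[1, 0] = eps           # symmetric perturbation of S
fd_s = (kl(m, S + E) - kl(m, S - E)) / (2 * eps)
assert np.isclose(fd_s, 2 * dS[0, 1], atol=1e-5)        # symmetric entry counted twice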