def infLaplace(hyp, meanfunc, covfunc, likfunc, x, y, nargout=1):
    """
    Laplace approximation to the posterior Gaussian process. The function takes
    a specified covariance function (see kernels.py) and likelihood function
    (see likelihoods.py).
    """
    tol = 1e-6                          # tolerance for when to stop the Newton iterations
    smax = 2; Nline = 20; thr = 1e-4    # line search parameters
    maxit = 20                          # max number of Newton steps in f
    inffunc = "inferences.infLaplace"

    K = src.Tools.general.feval(covfunc, hyp.cov, x)    # evaluate the covariance matrix
    m = src.Tools.general.feval(meanfunc, hyp.mean, x)  # evaluate the mean vector
    n, D = x.shape

    Psi_old = np.inf    # make sure the while loop starts with the largest old objective value
    if "last_alpha" not in infLaplace.__dict__:     # find a good starting point for alpha and f
        alpha = np.zeros((n, 1))
        f = np.dot(K, alpha) + m                    # start at the mean if sizes do not match
        vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
        lp, dlp, d2lp = vargout[0], vargout[1], vargout[2]
        W = -d2lp
        Psi_new = -lp.sum()
    else:
        alpha = infLaplace.last_alpha               # try the alpha remembered from the last call
        f = np.dot(K, alpha) + m
        vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
        lp, dlp, d2lp = vargout[0], vargout[1], vargout[2]
        W = -d2lp
        Psi_new = np.dot(alpha.T, (f - m)) / 2.0 - lp.sum()     # objective for last alpha
        vargout = src.Tools.general.feval(likfunc, hyp.lik, y, m, None, inffunc, None, 1)
        Psi_def = -vargout[0].sum()                 # objective for the default init f == m
        if Psi_def < Psi_new:                       # if default is better, we use it
            alpha = np.zeros((n, 1))
            f = np.dot(K, alpha) + m
            vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
            lp, dlp, d2lp = vargout[0], vargout[1], vargout[2]
            W = -d2lp
            Psi_new = -lp.sum()

    isWneg = np.any(W < 0)      # flag indicating whether we found negative values of W;
    it = 0                      # this happens for the Student's t likelihood

    while (Psi_old - Psi_new > tol) and it < maxit:     # begin Newton iterations
        Psi_old = Psi_new
        it += 1
        if isWneg:      # stabilise the Newton direction in case W has negative values
            W = np.maximum(W, 0)    # stabilise the Hessian to guarantee positive definiteness
            tol = 1e-10             # increase accuracy to also get the derivatives right
            # In Vanhatalo et al., GPR with Student's t likelihood, NIPS 2009, a more
            # conservative strategy is used, equivalent to the two lines below:
            # nu = exp(hyp.lik(1));            % degree-of-freedom hyperparameter
            # W = W + 2/(nu+1)*dlp.^2;         % add ridge according to Vanhatalo
        sW = np.sqrt(W)
        L = np.linalg.cholesky(np.eye(n) + np.dot(sW, sW.T) * K).T
        b = W * (f - m) + dlp
        dalpha = b - sW * solve_chol(L, sW * np.dot(K, b)) - alpha      # Newton direction
        # line search along the Newton direction
        vargout = brentmin(0, smax, Nline, thr, _Psi_line, 4, dalpha, alpha, hyp, K, m, likfunc, y, inffunc)
        s = vargout[0]
        Psi_new = vargout[1]
        Nfun = vargout[2]
        alpha = vargout[3]
        f = vargout[4]
        dlp = vargout[5]
        W = vargout[6]
        isWneg = np.any(W < 0)

    infLaplace.last_alpha = alpha       # remember alpha for the next call
    vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 4)
    lp, dlp, d2lp, d3lp = vargout[0], vargout[1], vargout[2], vargout[3]
    W = -d2lp
    isWneg = np.any(W < 0)

    post = postStruct()
    post.alpha = alpha                              # return the posterior parameters
    post.sW = np.sqrt(np.abs(W)) * np.sign(W)       # preserve the sign in case of negative W
    if isWneg:
        # B = I + sW*K*sW is not guaranteed to be positive definite here, so switch
        # to an LU-based computation of log|B| and inv(K+inv(W))
        [ldA, iA, post.L] = _logdetA(K, W, 3)
        nlZ = np.dot(alpha.T, (f - m)) / 2.0 - lp.sum() + ldA / 2.0
        nlZ = nlZ[0]
    else:
        sW = post.sW
        post.L = np.linalg.cholesky(np.eye(n) + np.dot(sW, sW.T) * K).T
        nlZ = np.dot(alpha.T, (f - m)) / 2.0 + (np.log(np.diag(post.L)) - np.reshape(lp, (lp.shape[0],))).sum()
        nlZ = nlZ[0]

    if nargout > 2:                                 # do we want derivatives?
        dnlZ = dnlzStruct(hyp)                      # allocate space for derivatives
        if isWneg:                                  # switch between Cholesky and LU decomposition mode
            Z = -post.L                             # inv(K+inv(W))
            g = np.atleast_2d((iA * K).sum(axis=1)).T / 2   # deriv. of ln|B| wrt W; g = diag(inv(inv(K)+diag(W)))/2
        else:
            Z = np.tile(sW, (1, n)) * solve_chol(post.L, np.diag(np.reshape(sW, (sW.shape[0],))))   # sW*inv(B)*sW = inv(K+inv(W))
            C = np.linalg.solve(post.L.T, np.tile(sW, (1, n)) * K)              # deriv. of ln|B| wrt W
            g = np.atleast_2d((np.diag(K) - (C ** 2).sum(axis=0).T)).T / 2.0    # g = diag(inv(inv(K)+W))/2
        dfhat = g * d3lp                            # deriv. of nlZ wrt fhat
        for ii in range(len(hyp.cov)):              # covariance hypers
            dK = src.Tools.general.feval(covfunc, hyp.cov, x, None, ii)
            dnlZ.cov[ii] = (Z * dK).sum() / 2.0 - np.dot(alpha.T, np.dot(dK, alpha)) / 2.0  # explicit part
            b = np.dot(dK, dlp)
            dnlZ.cov[ii] -= np.dot(dfhat.T, b - np.dot(K, np.dot(Z, b)))[0, 0]              # implicit part
        for ii in range(len(hyp.lik)):              # likelihood hypers
            [lp_dhyp, dlp_dhyp, d2lp_dhyp] = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, ii, 3)
            dnlZ.lik[ii] = -np.dot(g.T, d2lp_dhyp) - lp_dhyp.sum()                          # explicit part
            b = np.dot(K, dlp_dhyp)
            dnlZ.lik[ii] -= np.dot(dfhat.T, b - np.dot(K, np.dot(Z, b)))[0, 0]              # implicit part
        for ii in range(len(hyp.mean)):             # mean hypers
            dm = src.Tools.general.feval(meanfunc, hyp.mean, x, ii)
            dnlZ.mean[ii] = -np.dot(alpha.T, dm)                                            # explicit part
            dnlZ.mean[ii] -= np.dot(dfhat.T, dm - np.dot(K, np.dot(Z, dm)))[0, 0]           # implicit part
        vargout = [post, nlZ, dnlZ]
    else:
        vargout = [post, nlZ]
    return vargout
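
# ----------------------------------------------------------------------------
# Helper sketches. Neither _Psi_line nor _logdetA is defined in this section;
# the versions below are minimal sketches reconstructed from the call sites in
# infLaplace, not the library's own implementations. _Psi_line is assumed to
# return the line-search objective first, followed by the 4 extra outputs
# (alpha, f, dlp, W) that brentmin is asked to pass through via its argument 4;
# _logdetA assumes scipy.linalg is available for the pivoted LU decomposition.

def _Psi_line(s, dalpha, alpha, hyp, K, m, likfunc, y, inffunc):
    """Line-search objective Psi(alpha + s*dalpha) used by brentmin (sketch)."""
    alpha = alpha + s * dalpha                          # step along the Newton direction
    f = np.dot(K, alpha) + m                            # implied latent function values
    vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
    lp, dlp, d2lp = vargout[0], vargout[1], vargout[2]
    W = -d2lp
    Psi = np.dot(alpha.T, (f - m)) / 2.0 - lp.sum()     # penalised negative log likelihood
    return Psi, alpha, f, dlp, W


def _logdetA(K, W, nargout=3):
    """log|A|, inv(A) and -diag(W)*inv(A) for A = I + K*diag(W) (sketch).

    An LU decomposition is used because A is not symmetric positive definite
    when W has negative entries; log|A| is only real if det(A) > 0.
    """
    import scipy.linalg                                 # assumed available
    n = K.shape[0]
    A = np.eye(n) + K * np.tile(W.T, (n, 1))            # A = I + K*diag(W)
    P, L, U = scipy.linalg.lu(A)                        # pivoted LU: A = P*L*U
    u = np.diag(U)                                      # pivots of U
    if np.prod(np.sign(u)) * np.sign(np.linalg.det(P)) < 0:
        raise Exception("W is too negative; log|A| is not real.")
    ldA = np.log(np.abs(u)).sum()                       # log|det(A)|
    iA = np.linalg.inv(A)                               # inv(I + K*diag(W))
    mwiA = -np.tile(W, (1, n)) * iA                     # -diag(W)*inv(A), stored as post.L
    return ldA, iA, mwiA
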
def infFITC_Laplace(hyp, meanfunc, covfunc, likfunc, x, y, nargout=1):
    """
    FITC-Laplace approximation to the posterior Gaussian process. The function
    is equivalent to infLaplace with the covariance function
        Kt = Q + G;  G = diag(g);  g = diag(K-Q);  Q = Ku'*inv(Kuu+snu2*eye(nu))*Ku,
    where Ku and Kuu are covariances w.r.t. the inducing inputs xu, and
    snu2 = sn2/1e6 is the noise of the inducing inputs. We fix the standard
    deviation of the inducing inputs snu to be one per mil of the measurement
    noise standard deviation sn. In case of a likelihood without a noise
    parameter sn2, we simply use snu2 = 1e-6.

    The implementation exploits the Woodbury matrix identity
        inv(Kt) = inv(G) - inv(G)*Ku'*inv(Kuu+Ku*inv(G)*Ku')*Ku*inv(G)
    in order to be applicable to large datasets. The computational complexity
    is O(n nu^2), where n is the number of data points x and nu is the number
    of inducing inputs in xu. The posterior N(f|h,Sigma) is given by h = m+mu
    with mu = nn+P'*gg and Sigma = inv(inv(K)+diag(W)) = diag(d)+P'*R0'*R'*R*R0*P.

    The function takes a specified covariance function (see kernels.py) and
    likelihood function (see likelihoods.py), and is designed to be used with
    gp.py and in conjunction with covFITC.
    """
    cov1 = covfunc[0]
    if not cov1 == ["kernels.covFITC"]:
        raise Exception("Only covFITC supported.")      # check cov

    tol = 1e-6                          # tolerance for when to stop the Newton iterations
    smax = 2; Nline = 100; thr = 1e-4   # line search parameters
    maxit = 20                          # max number of Newton steps in f
    inffunc = "inferences.infLaplace"

    diagK, Kuu, Ku = src.Tools.general.feval(covfunc, hyp.cov, x)   # evaluate the covariance
    m = src.Tools.general.feval(meanfunc, hyp.mean, x)              # evaluate the mean vector
    if hyp.lik:                         # hard coded inducing inputs noise
        sn2 = np.exp(2.0 * hyp.lik[-1])
        snu2 = 1.0e-6 * sn2             # similar to infFITC
    else:
        snu2 = 1.0e-6
    n, D = x.shape
    nu = Kuu.shape[0]

    rot180 = lambda A: np.rot90(np.rot90(A))    # little helper functions
    chol_inv = lambda A: np.linalg.solve(rot180(np.linalg.cholesky(rot180(A))), np.eye(nu))     # chol(inv(A))
    R0 = chol_inv(Kuu + snu2 * np.eye(nu))      # initial R, used for refresh O(nu^3)
    V = np.dot(R0, Ku)
    d0 = diagK - np.array([(V * V).sum(axis=0)]).T      # initial d, needed for refresh

    Psi_old = np.inf    # make sure the while loop starts with the largest old objective value
    if "last_alpha" not in infFITC_Laplace.__dict__:    # find a good starting point for alpha and f
        alpha = np.zeros((n, 1))
        f = _mvmK(alpha, V, d0) + m                     # start at the mean if sizes do not match
        vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
        lp, dlp, d2lp = vargout[0], vargout[1], vargout[2]
        W = -d2lp
        Psi_new = -lp.sum()
    else:
        alpha = infFITC_Laplace.last_alpha              # try the alpha remembered from the last call
        f = _mvmK(alpha, V, d0) + m
        vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
        lp, dlp, d2lp = vargout[0], vargout[1], vargout[2]
        W = -d2lp
        Psi_new = np.dot(alpha.T, (f - m)) / 2.0 - lp.sum()     # objective for last alpha
        vargout = src.Tools.general.feval(likfunc, hyp.lik, y, m, None, inffunc, None, 1)
        Psi_def = -vargout[0].sum()                     # objective for the default init f == m
        if Psi_def < Psi_new:                           # if default is better, we use it
            alpha = np.zeros((n, 1))
            f = _mvmK(alpha, V, d0) + m
            vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
            lp, dlp, d2lp = vargout[0], vargout[1], vargout[2]
            W = -d2lp
            Psi_new = -lp.sum()

    isWneg = np.any(W < 0)      # flag indicating whether we found negative values of W;
    it = 0                      # this happens for the Student's t likelihood

    while (Psi_old - Psi_new > tol) and it < maxit:     # begin Newton iterations
        Psi_old = Psi_new
        it += 1
        if isWneg:      # stabilise the Newton direction in case W has negative values
            W = np.maximum(W, 0)    # stabilise the Hessian to guarantee positive definiteness
            tol = 1e-8              # increase accuracy to also get the derivatives right
            # In Vanhatalo et al., GPR with Student's t likelihood, NIPS 2009, a more
            # conservative strategy is used, equivalent to the two lines below:
            # nu = exp(hyp.lik(1));            % degree-of-freedom hyperparameter
            # W = W + 2/(nu+1)*dlp.^2;         % add ridge according to Vanhatalo
        b = W * (f - m) + dlp
        dd = 1 / (1 + W * d0)
        RV = np.dot(chol_inv(np.eye(nu) + np.dot(V * np.tile((W * dd).T, (nu, 1)), V.T)), V)
        dalpha = dd * b - (W * dd) * np.dot(RV.T, np.dot(RV, (dd * b))) - alpha     # Newton direction
        # line search along the Newton direction
        vargout = brentmin(0, smax, Nline, thr, _Psi_lineFITC, 4, dalpha, alpha, hyp, V, d0, m, likfunc, y, inffunc)
        s = vargout[0]
        Psi_new = vargout[1]
        Nfun = vargout[2]
        alpha = vargout[3]
        f = vargout[4]
        dlp = vargout[5]
        W = vargout[6]
        isWneg = np.any(W < 0)

    infFITC_Laplace.last_alpha = alpha      # remember alpha for the next call
    vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 4)
    lp, dlp, d2lp, d3lp = vargout[0], vargout[1], vargout[2], vargout[3]
    W = -d2lp
    isWneg = np.any(W < 0)

    post = postStruct()
    post.alpha = np.dot(R0.T, np.dot(V, alpha))     # return the posterior parameters
    post.sW = np.sqrt(np.abs(W)) * np.sign(W)       # preserve the sign in case of negative W
    dd = 1 / (1 + d0 * W)                           # temporary variable O(n)
    A = np.eye(nu) + np.dot(V * np.tile((W * dd).T, (nu, 1)), V.T)  # temporary variable O(n*nu^2)
    R0tV = np.dot(R0.T, V)
    B = R0tV * np.tile((W * dd).T, (nu, 1))         # temporary variable O(n*nu^2)
    post.L = -np.dot(B, R0tV.T)     # L = -R0'*V*inv(Kt+diag(1./ttau))*V'*R0, first part
    if np.any(1 + d0 * W < 0):
        # B = np.dot(B, V.T); post.L += np.dot(np.dot(B, np.linalg.inv(A)), B.T)
        # nlZ = np.nan; dnlZ = struct('cov',0*hyp.cov, 'mean',0*hyp.mean, 'lik',0*hyp.lik)
        raise Exception("W is too negative; nlZ and dnlZ cannot be computed.")
    nlZ = (
        np.dot(alpha.T, (f - m)) / 2.0
        - lp.sum()
        - np.log(dd).sum() / 2.0
        + np.log(np.diag(np.linalg.cholesky(A).T)).sum()
    )
    RV = np.dot(chol_inv(A), V)
    RVdd = RV * np.tile((W * dd).T, (nu, 1))        # RVdd is needed for dnlZ
    B = np.dot(B, RV.T)
    post.L += np.dot(B, B.T)

    if nargout > 2:             # do we want derivatives?
        dnlZ = dnlzStruct(hyp)  # allocate space for derivatives
        [d, P, R] = _fitcRefresh(d0, Ku, R0, V, W)
        # g = diag(inv(inv(K)+W))/2
        g = d / 2 + 0.5 * np.atleast_2d((np.dot(np.dot(R, R0), P) ** 2).sum(axis=0)).T
        t = W / (1 + W * d0)
        dfhat = g * d3lp        # deriv. of nlZ wrt fhat: dfhat = diag(inv(inv(K)+W)).*d3lp/2
        for ii in range(len(hyp.cov)):      # covariance hypers
            ddiagK, dKuu, dKu = src.Tools.general.feval(covfunc, hyp.cov, x, None, ii)  # eval cov derivatives
            dA = 2.0 * dKu.T - np.dot(R0tV.T, dKuu)             # dQ = dA*R0tV
            w = np.atleast_2d((dA * R0tV.T).sum(axis=1)).T      # w = diag(dQ)
            v = ddiagK - w                                      # v = diag(dK) - diag(dQ)
            dnlZ.cov[ii] = np.dot(ddiagK.T, t) - np.dot((RVdd * RVdd).sum(axis=0), v)   # explicit part
            dnlZ.cov[ii] -= (np.dot(RVdd, dA) * np.dot(RVdd, R0tV.T)).sum()             # explicit part
            dnlZ.cov[ii] = 0.5 * dnlZ.cov[ii] - np.dot(alpha.T, np.dot(dA, np.dot(R0tV, alpha)) + v * alpha) / 2.0  # explicit part
            b = np.dot(dA, np.dot(R0tV, dlp)) + v * dlp         # b-K*(Z*b) = inv(eye(n)+K*diag(W))*b
            KZb = _mvmK(_mvmZ(b, RVdd, t), V, d0)
            dnlZ.cov[ii] -= np.dot(dfhat.T, (b - KZb))          # implicit part
        for ii in range(len(hyp.lik)):      # likelihood hypers
            vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, ii, 3)
            lp_dhyp, dlp_dhyp, d2lp_dhyp = vargout[0], vargout[1], vargout[2]
            dnlZ.lik[ii] = -np.dot(g.T, d2lp_dhyp) - lp_dhyp.sum()      # explicit part
            b = _mvmK(dlp_dhyp, V, d0)                                  # implicit part
            dnlZ.lik[ii] -= np.dot(dfhat.T, b - _mvmK(_mvmZ(b, RVdd, t), V, d0))
            if ii == len(hyp.lik) - 1:
                # since snu2 is a fixed fraction of sn2, there is a covariance-like
                # term in the derivative as well (st2 is used here so that the
                # variable t needed by _mvmZ is not overwritten)
                snu = np.sqrt(snu2)
                T = chol_inv(Kuu + snu2 * np.eye(nu))
                T = np.dot(T.T, np.dot(T, snu * Ku))
                st2 = np.array([(T * T).sum(axis=0)]).T
                z = np.dot(alpha.T, np.dot(T.T, np.dot(T, alpha)) - st2 * alpha) - np.dot(np.array([(RVdd * RVdd).sum(axis=0)]), st2)
                z += (np.dot(RVdd, T.T) ** 2).sum()
                b = (st2 * dlp - np.dot(T.T, np.dot(T, dlp))) / 2.0
                KZb = _mvmK(_mvmZ(b, RVdd, t), V, d0)
                z -= np.dot(dfhat.T, b - KZb)
                dnlZ.lik[ii] += z
        for ii in range(len(hyp.mean)):     # mean hypers
            dm = src.Tools.general.feval(meanfunc, hyp.mean, x, ii)
            dnlZ.mean[ii] = -np.dot(alpha.T, dm)    # explicit part
            Zdm = _mvmZ(dm, RVdd, t)
            dnlZ.mean[ii] -= np.dot(dfhat.T, (dm - _mvmK(Zdm, V, d0)))  # implicit part
        vargout = [post, nlZ[0, 0], dnlZ]
    else:
        vargout = [post, nlZ[0, 0]]
    return vargout
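
# ----------------------------------------------------------------------------
# FITC helper sketches. _mvmK, _mvmZ, _Psi_lineFITC and _fitcRefresh are
# referenced above but not defined in this section. The versions below are
# minimal sketches consistent with the identities used by infFITC_Laplace
# (Kt = V'*V + diag(d0) and Z = inv(Kt + inv(W))), not the library's own
# implementations; _fitcRefresh follows the GPML fitcRefresh routine.

def _mvmK(al, V, d0):
    """Matrix-vector product Kt*al with Kt = V'*V + diag(d0), in O(n*nu) (sketch)."""
    return np.dot(V.T, np.dot(V, al)) + d0 * al


def _mvmZ(x, RVdd, t):
    """Matrix-vector product Z*x with Z = inv(Kt + inv(W)) via Woodbury (sketch);
    t = W/(1+W*d0) and RVdd = (W*dd)'.*(chol_inv(A)*V) as computed above."""
    return t * x - np.dot(RVdd.T, np.dot(RVdd, x))


def _Psi_lineFITC(s, dalpha, alpha, hyp, V, d0, m, likfunc, y, inffunc):
    """Line-search objective Psi(alpha + s*dalpha) for the FITC case (sketch)."""
    alpha = alpha + s * dalpha                          # step along the Newton direction
    f = _mvmK(alpha, V, d0) + m                         # implied latent function values
    vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
    lp, dlp, d2lp = vargout[0], vargout[1], vargout[2]
    W = -d2lp
    Psi = np.dot(alpha.T, (f - m)) / 2.0 - lp.sum()     # penalised negative log likelihood
    return Psi, alpha, f, dlp, W


def _fitcRefresh(d0, P0, R0, R0P0, w):
    """Refresh the O(n*nu^2) representation (d, P, R) of the posterior from the
    initial parameters and the site precisions w (sketch)."""
    nu = R0.shape[0]                                    # number of inducing points
    rot180 = lambda A: np.rot90(np.rot90(A))
    chol_inv = lambda A: np.linalg.solve(rot180(np.linalg.cholesky(rot180(A))), np.eye(nu))
    t = 1.0 / (1.0 + d0 * w)                            # temporary variable O(n)
    d = d0 * t                                          # O(n)
    P = np.tile(t.T, (nu, 1)) * P0                      # O(n*nu)
    T = np.tile((w * t).T, (nu, 1)) * R0P0              # temporary variable O(n*nu^2)
    R = chol_inv(np.eye(nu) + np.dot(R0P0, T.T))        # O(n*nu^2)
    return d, P, R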