import numpy as np
from copy import copy

# solve_chol, postStruct, dnlzStruct, nearPD, brentmin, _Psi_line, _logdetA,
# epComputeParams and the generic dispatcher src.Tools.general.feval are
# assumed to be provided elsewhere in this package.


def infExact(hyp, meanfunc, covfunc, likfunc, x, y, nargout=1):
    """
    Exact inference for a GP with Gaussian likelihood. Compute a
    parametrization of the posterior, the negative log marginal likelihood
    and its derivatives w.r.t. the hyperparameters.
    """
    if not (likfunc[0] == "likelihoods.likGauss"):  # NOTE: no explicit call to likGauss
        raise Exception("Exact inference only possible with Gaussian likelihood")

    n, D = x.shape
    K = src.Tools.general.feval(covfunc, hyp.cov, x)    # evaluate covariance matrix
    m = src.Tools.general.feval(meanfunc, hyp.mean, x)  # evaluate mean vector
    sn2 = np.exp(2.0 * hyp.lik)                         # noise variance of likGauss

    try:
        # Cholesky factor of covariance with noise
        L = np.linalg.cholesky(K / sn2 + np.eye(n)).T
    except np.linalg.LinAlgError:
        # fall back to the nearest positive definite matrix
        L = np.linalg.cholesky(nearPD(K / sn2 + np.eye(n))).T
    alpha = solve_chol(L, y - m) / sn2

    post = postStruct()
    post.alpha = alpha                        # return the posterior parameters
    post.sW = np.ones((n, 1)) / np.sqrt(sn2)  # sqrt of noise precision vector
    post.L = L                                # L = chol(eye(n)+sW*sW'.*K)

    if nargout > 1:  # do we want the marginal likelihood?
        nlZ = (
            np.dot((y - m).T, alpha / 2)
            + np.log(np.diag(L)).sum()
            + n * np.log(2 * np.pi * sn2) / 2.0
        )  # negative log marginal likelihood
        if nargout > 2:  # do we want derivatives?
            dnlZ = dnlzStruct(hyp)  # allocate space for derivatives
            Q = solve_chol(L, np.eye(n)) / sn2 - np.dot(alpha, alpha.T)  # precompute for convenience
            for ii in range(len(hyp.cov)):
                dnlZ.cov[ii] = (Q * src.Tools.general.feval(covfunc, hyp.cov, x, None, ii)).sum() / 2.0
            dnlZ.lik = sn2 * np.trace(Q)
            for ii in range(len(hyp.mean)):
                dnlZ.mean[ii] = np.dot(-src.Tools.general.feval(meanfunc, hyp.mean, x, ii).T, alpha)
            return [post, nlZ[0][0], dnlZ]
        return [post, nlZ[0][0]]
    return [post]
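# A minimal reference sketch of the solve_chol helper used throughout this
# module (the real implementation lives elsewhere in the package): given an
# upper triangular factor L with A = L'*L, as produced by
# np.linalg.cholesky(A).T above, solve A*X = B by two triangular solves.
# The scipy dependency and the name solve_chol_sketch are assumptions here.
from scipy.linalg import solve_triangular

def solve_chol_sketch(L, B):
    Z = solve_triangular(L, B, trans='T')  # forward solve L'*Z = B
    return solve_triangular(L, Z)          # back solve L*X = Z, hence (L'*L)*X = B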
def infEP(hyp, meanfunc, covfunc, likfunc, x, y, nargout=1):
    """
    Expectation Propagation approximation to the posterior Gaussian process.
    The function takes a specified covariance function (see kernels.py) and
    likelihood function (see likelihoods.py), and is designed to be used with
    gp.py. In the EP algorithm, the sites are updated in random order, for
    better performance when cases are ordered according to the targets.
    """
    tol = 1e-4      # tolerance to stop EP iterations
    max_sweep = 10  # maximum number of EP sweeps
    min_sweep = 2   # minimum number of EP sweeps
    inffunc = "inferences.infEP"
    n = x.shape[0]
    K = src.Tools.general.feval(covfunc, hyp.cov, x)    # evaluate the covariance matrix
    m = src.Tools.general.feval(meanfunc, hyp.mean, x)  # evaluate the mean vector

    # A note on naming: variables are given short but descriptive names in
    # accordance with Rasmussen & Williams "GPs for Machine Learning" (2006):
    # mu and s2 are mean and variance, nu and tau are natural parameters.
    # A leading t means tilde, a subscript _ni means "not i" (for cavity
    # parameters), and _n is used for a vector of cavity parameters.

    # marginal likelihood for ttau = tnu = zeros(n,1); equals n*log(2) for likCum*
    nlZ0 = -src.Tools.general.feval(
        likfunc, hyp.lik, y, m, np.reshape(np.diag(K), (np.diag(K).shape[0], 1)), inffunc
    ).sum()

    if "last_ttau" not in infEP.__dict__:  # find starting point for tilde parameters
        ttau = np.zeros((n, 1))            # initialize to zero if we have no better guess
        tnu = np.zeros((n, 1))
        Sigma = K                          # initialize Sigma and mu, the parameters of ..
        mu = np.zeros((n, 1))              # .. the Gaussian posterior approximation
        nlZ = nlZ0
    else:
        ttau = infEP.last_ttau             # try the tilde values from the previous call
        tnu = infEP.last_tnu
        [Sigma, mu, nlZ, L] = epComputeParams(K, y, ttau, tnu, likfunc, hyp, m, inffunc)
        if nlZ > nlZ0:                     # if zero is better ..
            ttau = np.zeros((n, 1))        # .. then initialize with zero instead
            tnu = np.zeros((n, 1))
            Sigma = K                      # initialize Sigma and mu, the parameters of ..
            mu = np.zeros((n, 1))          # .. the Gaussian posterior approximation
            nlZ = nlZ0

    nlZ_old = np.inf
    sweep = 0  # converged, max. sweeps or min. sweeps?
    while (np.abs(nlZ - nlZ_old) > tol and sweep < max_sweep) or (sweep < min_sweep):
        nlZ_old = nlZ
        sweep += 1
        rperm = range(n)  # sequential order here; GPML uses randperm(n)
        for ii in rperm:  # iterate EP updates over examples
            tau_ni = 1 / Sigma[ii, ii] - ttau[ii]  # first find the cavity distribution ..
            nu_ni = mu[ii] / Sigma[ii, ii] + m[ii] * tau_ni - tnu[ii]  # .. params tau_ni and nu_ni

            # compute the desired derivatives of the individual log partition function
            vargout = src.Tools.general.feval(likfunc, hyp.lik, y[ii], nu_ni / tau_ni, 1 / tau_ni, inffunc, None, 3)
            lZ = vargout[0]
            dlZ = vargout[1]
            d2lZ = vargout[2]

            ttau_old = copy(ttau[ii])  # then find the new tilde parameters, keep a copy of the old
            ttau[ii] = -d2lZ / (1.0 + d2lZ / tau_ni)
            ttau[ii] = max(ttau[ii], 0)  # enforce positivity, i.e. lower bound ttau by zero
            tnu[ii] = (dlZ + (m[ii] - nu_ni / tau_ni) * d2lZ) / (1.0 + d2lZ / tau_ni)

            ds2 = ttau[ii] - ttau_old  # finally rank-1 update Sigma ..
            si = np.reshape(Sigma[:, ii], (Sigma.shape[0], 1))
            Sigma = Sigma - ds2 / (1.0 + ds2 * si[ii]) * np.dot(si, si.T)  # takes 70% of total time
            mu = np.dot(Sigma, tnu)  # .. and recompute mu

        # recompute since repeated rank-one updates can destroy numerical precision
        [Sigma, mu, nlZ, L] = epComputeParams(K, y, ttau, tnu, likfunc, hyp, m, inffunc)

    if sweep == max_sweep:
        raise Exception("maximum number of sweeps reached in function infEP")

    infEP.last_ttau = ttau  # remember for next call
    infEP.last_tnu = tnu

    sW = np.sqrt(ttau)
    alpha = tnu - sW * solve_chol(L, sW * np.dot(K, tnu))
    post = postStruct()
    post.alpha = alpha  # return the posterior parameters
    post.sW = sW
    post.L = L

    if nargout > 1:
        if nargout > 2:  # do we want derivatives?
            dnlZ = dnlzStruct(hyp)  # allocate space for derivatives
            ssi = np.sqrt(ttau)
            V = np.linalg.solve(L.T, np.tile(ssi, (1, n)) * K)
            Sigma = K - np.dot(V.T, V)
            mu = np.dot(Sigma, tnu)
            Dsigma = np.reshape(np.diag(Sigma), (np.diag(Sigma).shape[0], 1))
            tau_n = 1 / Dsigma - ttau  # compute the log marginal likelihood
            nu_n = mu / Dsigma - tnu   # vectors of cavity parameters
            F = np.dot(alpha, alpha.T) - np.tile(sW, (1, n)) * solve_chol(
                L, np.diag(np.reshape(sW, (sW.shape[0],)))
            )
            for ii in range(len(hyp.cov)):  # covariance hypers
                dK = src.Tools.general.feval(covfunc, hyp.cov, x, None, ii)
                dnlZ.cov[ii] = -(F * dK).sum() / 2.0
            for ii in range(len(hyp.lik)):  # likelihood hypers
                dlik = src.Tools.general.feval(likfunc, hyp.lik, y, nu_n / tau_n, 1 / tau_n, inffunc, ii, 1)
                dnlZ.lik[ii] = -dlik.sum()
            [junk, dlZ] = src.Tools.general.feval(
                likfunc, hyp.lik, y, nu_n / tau_n, 1 / tau_n, inffunc, None, 2
            )
            for ii in range(len(hyp.mean)):  # mean hypers
                dm = src.Tools.general.feval(meanfunc, hyp.mean, x, ii)
                dnlZ.mean[ii] = -np.dot(dlZ.T, dm)[0, 0]
            vargout = [post, nlZ, dnlZ]
        else:
            vargout = [post, nlZ]
    else:
        vargout = [post]
    return vargout
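# A minimal sketch of the (Sigma, mu, L) part of the epComputeParams step used
# above, following the GPML convention L = chol(eye(n) + sW*sW'.*K); the real
# helper also recomputes the EP marginal likelihood nlZ and handles a nonzero
# prior mean, both omitted here. The function name is hypothetical.
def ep_compute_params_sketch(K, ttau, tnu):
    n = K.shape[0]
    sW = np.sqrt(ttau)                                           # sqrt of the site precisions
    L = np.linalg.cholesky(np.eye(n) + np.dot(sW, sW.T) * K).T   # B = I + sW*K*sW
    V = np.linalg.solve(L.T, np.tile(sW, (1, n)) * K)
    Sigma = K - np.dot(V.T, V)                                   # posterior covariance
    mu = np.dot(Sigma, tnu)                                      # posterior mean (zero-mean case)
    return Sigma, mu, L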
def infLaplace(hyp, meanfunc, covfunc, likfunc, x, y, nargout=1):
    """
    Laplace approximation to the posterior Gaussian process. The function
    takes a specified covariance function (see kernels.py) and likelihood
    function (see likelihoods.py).
    """
    tol = 1e-6    # tolerance for when to stop the Newton iterations
    smax = 2      # line search parameters ..
    Nline = 20
    thr = 1e-4    # .. passed to brentmin below
    maxit = 20    # max number of Newton steps in f
    inffunc = "inferences.infLaplace"
    K = src.Tools.general.feval(covfunc, hyp.cov, x)    # evaluate the covariance matrix
    m = src.Tools.general.feval(meanfunc, hyp.mean, x)  # evaluate the mean vector
    n, D = x.shape

    Psi_old = np.inf  # make sure the while loop starts with the largest old objective value

    if "last_alpha" not in infLaplace.__dict__:  # find a good starting point for alpha and f
        alpha = np.zeros((n, 1))
        f = np.dot(K, alpha) + m                 # start at the mean if sizes do not match
        vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
        lp = vargout[0]
        dlp = vargout[1]
        d2lp = vargout[2]
        W = -d2lp
        Psi_new = -lp.sum()
    else:
        alpha = infLaplace.last_alpha            # try the alpha from the previous call
        f = np.dot(K, alpha) + m
        vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
        lp = vargout[0]
        dlp = vargout[1]
        d2lp = vargout[2]
        W = -d2lp
        Psi_new = np.dot(alpha.T, (f - m)) / 2.0 - lp.sum()  # objective for last alpha
        vargout = -src.Tools.general.feval(likfunc, hyp.lik, y, m, None, inffunc, None, 1)
        Psi_def = vargout[0]                     # objective for the default init f == m
        if Psi_def < Psi_new:                    # if the default is better, we use it
            alpha = np.zeros((n, 1))
            f = np.dot(K, alpha) + m
            vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
            lp = vargout[0]
            dlp = vargout[1]
            d2lp = vargout[2]
            W = -d2lp
            Psi_new = -lp.sum()

    isWneg = np.any(W < 0)  # flag indicating whether we found negative values of W;
    it = 0                  # this happens for the Student's t likelihood

    while (Psi_old - Psi_new > tol) and it < maxit:  # begin Newton iterations
        Psi_old = Psi_new
        it += 1
        if isWneg:  # stabilise the Newton direction in case W has negative values
            W = np.maximum(W, 0)  # stabilise the Hessian to guarantee positive definiteness
            tol = 1e-10           # increase accuracy to also get the derivatives right
            # In Vanhatalo et al., "GPR with Student's t likelihood", NIPS 2009,
            # they use a more conservative strategy than we do, equivalent to
            # the 2 lines below:
            #   nu = exp(hyp.lik(1))      # degrees of freedom hyperparameter
            #   W = W + 2/(nu+1)*dlp.^2   # add ridge according to Vanhatalo
        sW = np.sqrt(W)
        L = np.linalg.cholesky(np.eye(n) + np.dot(sW, sW.T) * K).T
        b = W * (f - m) + dlp
        dalpha = b - sW * solve_chol(L, sW * np.dot(K, b)) - alpha
        vargout = brentmin(0, smax, Nline, thr, _Psi_line, 4, dalpha, alpha, hyp, K, m, likfunc, y, inffunc)
        s = vargout[0]
        Psi_new = vargout[1]
        Nfun = vargout[2]
        alpha = vargout[3]
        f = vargout[4]
        dlp = vargout[5]
        W = vargout[6]
        isWneg = np.any(W < 0)

    infLaplace.last_alpha = alpha  # remember for next call
    vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 4)
    lp = vargout[0]
    dlp = vargout[1]
    d2lp = vargout[2]
    d3lp = vargout[3]
    W = -d2lp
    isWneg = np.any(W < 0)

    post = postStruct()
    post.alpha = alpha                         # return the posterior parameters
    post.sW = np.sqrt(np.abs(W)) * np.sign(W)  # preserve sign in case of negative W

    if isWneg:
        [ldA, iA, post.L] = _logdetA(K, W, 3)
        nlZ = np.dot(alpha.T, (f - m)) / 2.0 - lp.sum() + ldA / 2.0
        nlZ = nlZ[0]
    else:
        sW = post.sW
        post.L = np.linalg.cholesky(np.eye(n) + np.dot(sW, sW.T) * K).T
        nlZ = np.dot(alpha.T, (f - m)) / 2.0 + (np.log(np.diag(post.L)) - np.reshape(lp, (lp.shape[0],))).sum()
        nlZ = nlZ[0]

    if nargout > 2:  # do we want derivatives?
        dnlZ = dnlzStruct(hyp)  # allocate space for derivatives
        if isWneg:  # switch between Cholesky and LU decomposition mode
            Z = -post.L  # inv(K+inv(W))
            g = np.atleast_2d((iA * K).sum(axis=1)).T / 2  # deriv. of ln|B| wrt W; g = diag(inv(inv(K)+diag(W)))/2
        else:
            Z = np.tile(sW, (1, n)) * solve_chol(
                post.L, np.diag(np.reshape(sW, (sW.shape[0],)))
            )  # sW*inv(B)*sW = inv(K+inv(W))
            C = np.linalg.solve(post.L.T, np.tile(sW, (1, n)) * K)  # deriv. of ln|B| wrt W
            g = np.atleast_2d((np.diag(K) - (C ** 2).sum(axis=0).T)).T / 2.0  # g = diag(inv(inv(K)+W))/2
        dfhat = g * d3lp  # deriv. of nlZ wrt fhat
        for ii in range(len(hyp.cov)):  # covariance hypers
            dK = src.Tools.general.feval(covfunc, hyp.cov, x, None, ii)
            dnlZ.cov[ii] = (Z * dK).sum() / 2.0 - np.dot(alpha.T, np.dot(dK, alpha)) / 2.0  # explicit part
            b = np.dot(dK, dlp)
            dnlZ.cov[ii] -= np.dot(dfhat.T, b - np.dot(K, np.dot(Z, b)))[0, 0]  # implicit part
        for ii in range(len(hyp.lik)):  # likelihood hypers
            [lp_dhyp, dlp_dhyp, d2lp_dhyp] = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, ii, 3)
            dnlZ.lik[ii] = -np.dot(g.T, d2lp_dhyp) - lp_dhyp.sum()  # explicit part
            b = np.dot(K, dlp_dhyp)
            dnlZ.lik[ii] -= np.dot(dfhat.T, b - np.dot(K, np.dot(Z, b)))[0, 0]  # implicit part
        for ii in range(len(hyp.mean)):  # mean hypers
            dm = src.Tools.general.feval(meanfunc, hyp.mean, x, ii)
            dnlZ.mean[ii] = -np.dot(alpha.T, dm)  # explicit part
            dnlZ.mean[ii] -= np.dot(dfhat.T, dm - np.dot(K, np.dot(Z, dm)))[0, 0]  # implicit part
        vargout = [post, nlZ, dnlZ]
    else:
        vargout = [post, nlZ]
    return vargout
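# A minimal sketch of the _Psi_line objective minimized by brentmin above:
# evaluate the penalized negative log likelihood Psi along the Newton
# direction, alpha(s) = alpha + s*dalpha. The name _psi_line_sketch is
# hypothetical, and the return layout merely mirrors how vargout[0..6]
# (minus the function-evaluation count Nfun, which brentmin itself adds)
# is unpacked in infLaplace.
def _psi_line_sketch(s, dalpha, alpha, hyp, K, m, likfunc, y, inffunc):
    alpha_s = alpha + s * dalpha
    f = np.dot(K, alpha_s) + m                       # latent function values
    vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
    lp, dlp, d2lp = vargout[0], vargout[1], vargout[2]
    W = -d2lp
    Psi = np.dot(alpha_s.T, f - m) / 2.0 - lp.sum()  # objective value
    return Psi, alpha_s, f, dlp, W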
def infFITC(hyp, meanfunc, covfunc, likfunc, x, y, nargout=1):
    """
    FITC approximation to the posterior Gaussian process. The function is
    equivalent to infExact with the covariance function
        Kt = Q + G;  G = diag(g);  g = diag(K-Q);  Q = Ku' * inv(Quu) * Ku;
    where Ku and Kuu are covariances w.r.t. the inducing inputs xu,
    snu2 = sn2/1e6 is the noise of the inducing inputs and
    Quu = Kuu + snu2 * eye(nu). We fix the standard deviation of the inducing
    inputs snu to be one per mil of the measurement noise standard deviation
    sn. The implementation exploits the Woodbury matrix identity
        inv(Kt) = inv(G) - inv(G) * V' * inv(eye(nu) + V * inv(G) * V') * V * inv(G)
    in order to be applicable to large datasets. The computational complexity
    is O(n nu^2), where n is the number of data points x and nu the number of
    inducing inputs in xu. The function takes a specified covariance function
    (see kernels.py) and likelihood function (see likelihoods.py), and is
    designed to be used with gp.py and in conjunction with covFITC and
    likGauss.
    """
    if not (likfunc[0] == "likelihoods.likGauss"):  # NOTE: no explicit call to likGauss
        raise Exception("FITC inference only possible with Gaussian likelihood")
    cov1 = covfunc[0]
    if not cov1 == ["kernels.covFITC"]:
        raise Exception("Only covFITC supported.")  # check cov

    diagK, Kuu, Ku = src.Tools.general.feval(covfunc, hyp.cov, x)  # evaluate covariance matrix
    m = src.Tools.general.feval(meanfunc, hyp.mean, x)             # evaluate mean vector
    n, D = x.shape
    nu = Kuu.shape[0]

    sn2 = np.exp(2 * hyp.lik[0])  # noise variance of likGauss
    snu2 = 1.0e-6 * sn2           # hard coded inducing inputs noise
    Luu = np.linalg.cholesky(Kuu + snu2 * np.eye(nu)).T      # Kuu + snu2*I = Luu'*Luu
    V = np.linalg.solve(Luu.T, Ku)                           # V = inv(Luu')*Ku => V'*V = Q
    g_sn2 = diagK + sn2 - np.array([(V * V).sum(axis=0)]).T  # g + sn2 = diag(K) + sn2 - diag(Q)
    Lu = np.linalg.cholesky(np.eye(nu) + np.dot(V / np.tile(g_sn2.T, (nu, 1)), V.T)).T  # Lu'*Lu = I + V*diag(1/g_sn2)*V'
    r = (y - m) / np.sqrt(g_sn2)
    be = np.linalg.solve(Lu.T, np.dot(V, r / np.sqrt(g_sn2)))
    iKuu = solve_chol(Luu, np.eye(nu))  # inv(Kuu + snu2*I) = iKuu

    post = postStruct()
    post.alpha = np.linalg.solve(Luu, np.linalg.solve(Lu, be))  # return the posterior parameters
    post.L = solve_chol(np.dot(Lu, Luu), np.eye(nu)) - iKuu     # Sigma - inv(Kuu)
    post.sW = np.ones((n, 1)) / np.sqrt(sn2)                    # unused for FITC prediction with gp.py

    if nargout > 1:  # do we want the marginal likelihood?
        nlZ = (
            np.log(np.diag(Lu)).sum()
            + (np.log(g_sn2).sum() + n * np.log(2 * np.pi) + np.dot(r.T, r) - np.dot(be.T, be)) / 2.0
        )
        if nargout > 2:  # do we want derivatives?
            dnlZ = dnlzStruct(hyp)  # allocate space for derivatives
            al = r / np.sqrt(g_sn2) - np.dot(V.T, np.linalg.solve(Lu, be)) / g_sn2  # al = (Kt+sn2*eye(n))\y
            B = np.dot(iKuu, Ku)
            w = np.dot(B, al)
            W = np.linalg.solve(Lu.T, V / np.tile(g_sn2.T, (nu, 1)))
            for ii in range(len(hyp.cov)):  # covariance hypers
                [ddiagKi, dKuui, dKui] = src.Tools.general.feval(covfunc, hyp.cov, x, None, ii)  # eval cov derivatives
                R = 2.0 * dKui - np.dot(dKuui, B)
                v = ddiagKi - np.array([(R * B).sum(axis=0)]).T  # diag part of cov deriv
                dnlZ.cov[ii] = (
                    np.dot(ddiagKi.T, 1.0 / g_sn2)
                    + np.dot(w.T, (np.dot(dKuui, w) - 2.0 * np.dot(dKui, al)))
                    - np.dot(al.T, (v * al))
                    - np.dot(np.array([(W * W).sum(axis=0)]), v)
                    - (np.dot(R, W.T) * np.dot(B, W.T)).sum()
                ) / 2.0
            dnlZ.lik = sn2 * ((1.0 / g_sn2).sum() - (np.array([(W * W).sum(axis=0)])).sum() - np.dot(al.T, al))
            # since snu2 is a fixed fraction of sn2, there is a covariance-like term in the derivative as well
            dKuui = 2 * snu2
            R = -dKuui * B
            v = -np.array([(R * B).sum(axis=0)]).T  # diag part of cov deriv
            dnlZ.lik += (
                np.dot(w.T, np.dot(dKuui, w))
                - np.dot(al.T, (v * al))
                - np.dot(np.array([(W * W).sum(axis=0)]), v)
                - (np.dot(R, W.T) * np.dot(B, W.T)).sum()
            ) / 2.0
            dnlZ.lik = dnlZ.lik[0]
            for ii in range(len(hyp.mean)):  # mean hypers
                dnlZ.mean[ii] = np.dot(-src.Tools.general.feval(meanfunc, hyp.mean, x, ii).T, al)[0, 0]
            return [post, nlZ[0, 0], dnlZ]
        return [post, nlZ[0, 0]]
    return [post]
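# A small self-contained numerical check (illustrative only, with made-up
# values) of the Woodbury identity quoted in the infFITC docstring,
#     inv(Kt) = inv(G) - inv(G)*V'*inv(I + V*inv(G)*V')*V*inv(G)
# with Kt = V'*V + G and G diagonal, which is what makes the O(n nu^2)
# complexity possible: only nu x nu systems are ever solved.
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    n_chk, nu_chk = 6, 3
    V_chk = rng.randn(nu_chk, n_chk)               # plays the role of inv(Luu')*Ku
    g_chk = 0.5 + rng.rand(n_chk)                  # positive diagonal entries of G
    Kt = np.dot(V_chk.T, V_chk) + np.diag(g_chk)
    iG = np.diag(1.0 / g_chk)
    inner = np.eye(nu_chk) + np.dot(np.dot(V_chk, iG), V_chk.T)
    woodbury = iG - np.dot(np.dot(iG, V_chk.T), np.linalg.solve(inner, np.dot(V_chk, iG)))
    assert np.allclose(woodbury, np.linalg.inv(Kt))  # the two inverses agree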