Example #1
def infExact(hyp, meanfunc, covfunc, likfunc, x, y, nargout=1):
    """ Exact inference for a GP with Gaussian likelihood. Compute a parametrization
     of the posterior, the negative log marginal likelihood and its derivatives
     w.r.t. the hyperparameters.
    """

    if not (likfunc[0] == "likelihoods.likGauss"):  # NOTE: no explicit call to likGauss
        raise Exception("Exact inference only possible with Gaussian likelihood")

    n, D = x.shape
    K = src.Tools.general.feval(covfunc, hyp.cov, x)  # evaluate covariance matrix
    m = src.Tools.general.feval(meanfunc, hyp.mean, x)  # evaluate mean vector

    sn2 = np.exp(2.0 * hyp.lik)  # noise variance of likGauss
    try:
        L = np.linalg.cholesky(K / sn2 + np.eye(n)).T  # Cholesky factor of covariance with noise
    except np.linalg.LinAlgError:
        L = np.linalg.cholesky(nearPD(K / sn2 + np.eye(n))).T
        print "okay now"
        assert False
    alpha = solve_chol(L, y - m) / sn2

    post = postStruct()

    post.alpha = alpha  # return the posterior parameters
    post.sW = np.ones((n, 1)) / np.sqrt(sn2)  # sqrt of noise precision vector
    post.L = L  # L = chol(eye(n)+sW*sW'.*K)

    if nargout > 1:  # do we want the marginal likelihood?
        nlZ = (
            np.dot((y - m).T, alpha / 2) + np.log(np.diag(L)).sum() + n * np.log(2 * np.pi * sn2) / 2.0
        )  # -log marg lik
        if nargout > 2:  # do we want derivatives?
            dnlZ = dnlzStruct(hyp)  # allocate space for derivatives
            Q = solve_chol(L, np.eye(n)) / sn2 - np.dot(alpha, alpha.T)  # precompute for convenience
            for ii in range(len(hyp.cov)):
                dnlZ.cov[ii] = (Q * src.Tools.general.feval(covfunc, hyp.cov, x, None, ii)).sum() / 2.0

            dnlZ.lik = sn2 * np.trace(Q)
            for ii in range(len(hyp.mean)):
                dnlZ.mean[ii] = np.dot(-src.Tools.general.feval(meanfunc, hyp.mean, x, ii).T, alpha)

            return [post, nlZ[0][0], dnlZ]

        return [post, nlZ[0][0]]

    return [post]
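The helper solve_chol used above comes from the surrounding library. A minimal sketch of what it computes, inferred from its usage here rather than taken from the library's source: given the upper-triangular factor L with L'*L = A, it solves A*X = B by two triangular solves.

import numpy as np

def solve_chol_sketch(L, B):
    # Hypothetical stand-in for the library's solve_chol helper:
    # solve (L' * L) * X = B for X, where L is the upper-triangular
    # factor returned by np.linalg.cholesky(A).T.
    return np.linalg.solve(L, np.linalg.solve(L.T, B))

With L'*L = K/sn2 + I as in infExact, solve_chol(L, y - m)/sn2 then equals inv(K + sn2*I)*(y - m), the usual GP posterior parameter alpha.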
Example #2
def infLaplace(hyp, meanfunc, covfunc, likfunc, x, y, nargout=1):
    """ Laplace approximation to the posterior Gaussian process.
     The function takes a specified covariance function (see kernels.py) and
     likelihood function (see likelihoods.py).
    """

    tol = 1e-6  # tolerance for when to stop the Newton iterations
    smax = 2; Nline = 20; thr = 1e-4  # line search parameters
    maxit = 20  # max number of Newton steps in f

    inffunc = "inferences.infLaplace"

    K = src.Tools.general.feval(covfunc, hyp.cov, x)  # evaluate the covariance
    m = src.Tools.general.feval(meanfunc, hyp.mean, x)  # evaluate the mean vector

    n, D = x.shape

    Psi_old = np.inf  # make sure while loop starts by the largest old objective val
    if "last_alpha" not in infLaplace.__dict__:  # find a good starting point for alpha and f
        alpha = np.zeros((n, 1))
        f = np.dot(K, alpha) + m  # start at the mean if sizes do not match
        vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
        lp = vargout[0]
        dlp = vargout[1]
        d2lp = vargout[2]
        W = -d2lp
        Psi_new = -lp.sum()
    else:
        alpha = infLaplace.last_alpha
        f = np.dot(K, alpha) + m  # try last one
        vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
        lp = vargout[0]
        dlp = vargout[1]
        d2lp = vargout[2]
        W = -d2lp
        Psi_new = np.dot(alpha.T, (f - m)) / 2.0 - lp.sum()  # objective for last alpha
        vargout = src.Tools.general.feval(likfunc, hyp.lik, y, m, None, inffunc, None, 1)
        Psi_def = -vargout.sum()  # objective for default init f==m
        if Psi_def < Psi_new:  # if default is better, we use it
            alpha = np.zeros((n, 1))
            f = np.dot(K, alpha) + m
            vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
            lp = vargout[0]
            dlp = vargout[1]
            d2lp = vargout[2]
            W = -d2lp
            Psi_new = -lp.sum()

    isWneg = np.any(W < 0)  # flag whether we found negative values of W; this happens for the Student's t likelihood
    it = 0

    while (Psi_old - Psi_new > tol) and it < maxit:  # begin Newton
        Psi_old = Psi_new
        it += 1
        if isWneg:  # stabilise the Newton direction in case W has negative values
            W = np.maximum(W, 0)  # stabilise the Hessian to guarantee positive definiteness
            tol = 1e-10
            # increase accuracy to also get the derivatives right
            # In Vanhatalo et al., GPR with Student's t likelihood, NIPS 2009, they use
            # a more conservative strategy than we do, equivalent to the 2 lines below.
            # nu  = exp(hyp.lik(1));                    # degree of freedom hyperparameter
            # W  = W + 2/(nu+1)*dlp.^2;                 # add ridge according to Vanhatalo

        sW = np.sqrt(W)
        L = np.linalg.cholesky(np.eye(n) + np.dot(sW, sW.T) * K).T
        b = W * (f - m) + dlp
        dalpha = b - sW * solve_chol(L, sW * np.dot(K, b)) - alpha
        vargout = brentmin(0, smax, Nline, thr, _Psi_line, 4, dalpha, alpha, hyp, K, m, likfunc, y, inffunc)
        s = vargout[0]
        Psi_new = vargout[1]
        Nfun = vargout[2]
        alpha = vargout[3]
        f = vargout[4]
        dlp = vargout[5]
        W = vargout[6]
        isWneg = np.any(W < 0)

    infLaplace.last_alpha = alpha  # remember for next call
    vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 4)
    lp = vargout[0]
    dlp = vargout[1]
    d2lp = vargout[2]
    d3lp = vargout[3]

    W = -d2lp
    isWneg = np.any(W < 0)
    post = postStruct()
    post.alpha = alpha  # return the posterior parameters
    post.sW = np.sqrt(np.abs(W)) * np.sign(W)  # preserve sign in case of negative
    if isWneg:
        [ldA, iA, post.L] = _logdetA(K, W, 3)
        nlZ = np.dot(alpha.T, (f - m)) / 2.0 - lp.sum() + ldA / 2.0
        nlZ = nlZ[0]
    else:
        sW = post.sW
        post.L = np.linalg.cholesky(np.eye(n) + np.dot(sW, sW.T) * K).T
        nlZ = np.dot(alpha.T, (f - m)) / 2.0 + (np.log(np.diag(post.L)) - np.reshape(lp, (lp.shape[0],))).sum()
        nlZ = nlZ[0]

    if nargout > 2:  # do we want derivatives?
        dnlZ = dnlzStruct(hyp)  # allocate space for derivatives
        if isWneg:  # switch between Cholesky and LU decomposition mode
            Z = -post.L  # inv(K+inv(W))
            g = np.atleast_2d((iA * K).sum(axis=1)).T / 2  # deriv. of ln|B| wrt W; g = diag(inv(inv(K)+diag(W)))/2
        else:
            Z = np.tile(sW, (1, n)) * solve_chol(
                post.L, np.diag(np.reshape(sW, (sW.shape[0],)))
            )  # sW*inv(B)*sW=inv(K+inv(W))
            C = np.linalg.solve(post.L.T, np.tile(sW, (1, n)) * K)  # deriv. of ln|B| wrt W
            g = np.atleast_2d((np.diag(K) - (C ** 2).sum(axis=0).T)).T / 2.0  # g = diag(inv(inv(K)+W))/2

        dfhat = g * d3lp  # deriv. of nlZ wrt. fhat
        for ii in range(len(hyp.cov)):  # covariance hypers
            dK = src.Tools.general.feval(covfunc, hyp.cov, x, None, ii)
            dnlZ.cov[ii] = (Z * dK).sum() / 2.0 - np.dot(alpha.T, np.dot(dK, alpha)) / 2.0  # explicit part
            b = np.dot(dK, dlp)
            tmp = np.dot(dfhat.T, b - np.dot(K, np.dot(Z, b)))
            dnlZ.cov[ii] -= np.dot(dfhat.T, b - np.dot(K, np.dot(Z, b)))[0, 0]  # implicit part

        for ii in range(len(hyp.lik)):  # likelihood hypers
            [lp_dhyp, dlp_dhyp, d2lp_dhyp] = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, ii, 3)
            dnlZ.lik[ii] = -np.dot(g.T, d2lp_dhyp) - lp_dhyp.sum()  # explicit part
            b = np.dot(K, dlp_dhyp)
            dnlZ.lik[ii] -= np.dot(dfhat.T, b - np.dot(K, np.dot(Z, b)))[0, 0]  # implicit part

        for ii in range(len(hyp.mean)):  # mean hypers
            dm = src.Tools.general.feval(meanfunc, hyp.mean, x, ii)
            dnlZ.mean[ii] = -np.dot(alpha.T, dm)  # explicit part
            dnlZ.mean[ii] -= np.dot(dfhat.T, dm - np.dot(K, np.dot(Z, dm)))[0, 0]  # implicit part

        vargout = [post, nlZ, dnlZ]
    else:
        vargout = [post, nlZ]

    return vargout
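The line search is delegated to brentmin with the library objective _Psi_line. A minimal sketch of that objective under the definitions used above (hypothetical signature; the real helper also returns the updated f, dlp and W): Psi(alpha) = alpha'*(f - m)/2 - sum(log p(y|f)) with f = K*alpha + m, evaluated along the Newton direction dalpha at step size s.

def psi_line_sketch(s, dalpha, alpha, K, m, loglik):
    # Hypothetical sketch of the objective minimised over the step size s.
    # loglik(f) is assumed to return the vector of log p(y_i | f_i) values.
    a = alpha + s * dalpha
    f = np.dot(K, a) + m
    return np.dot(a.T, f - m) / 2.0 - loglik(f).sum()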
Example #3
def infEP(hyp, meanfunc, covfunc, likfunc, x, y, nargout=1):
    """ Expectation Propagation approximation to the posterior Gaussian Process.
     The function takes a specified covariance function (see kernels.py) and
     likelihood function (see likelihoods.py), and is designed to be used with
     gp.py. In the EP algorithm, the sites are 
     updated in random order, for better performance when cases are ordered
     according to the targets.
    """

    tol = 1e-4
    max_sweep = 10
    min_sweep = 2  # tolerance to stop EP iterations

    inffunc = "inferences.infEP"
    n = x.shape[0]

    K = src.Tools.general.feval(covfunc, hyp.cov, x)  # evaluate the covariance matrix

    m = src.Tools.general.feval(meanfunc, hyp.mean, x)  # evaluate the mean vector

    # A note on naming: variables are given short but descriptive names in
    # accordance with Rasmussen & Williams "GPs for Machine Learning" (2006): mu
    # and s2 are mean and variance, nu and tau are natural parameters. A leading t
    # means tilde, a subscript _ni means "not i" (for cavity parameters), or _n
    # for a vector of cavity parameters.

    # marginal likelihood for ttau = tnu = zeros(n,1); equals n*log(2) for likCum*
    nlZ0 = -src.Tools.general.feval(
        likfunc, hyp.lik, y, m, np.reshape(np.diag(K), (np.diag(K).shape[0], 1)), inffunc
    ).sum()
    if "last_ttau" not in infEP.__dict__:  # find starting point for tilde parameters
        ttau = np.zeros((n, 1))  # initialize to zero if we have no better guess
        tnu = np.zeros((n, 1))
        Sigma = K  # initialize Sigma and mu, the parameters of ..
        mu = np.zeros((n, 1))  # .. the Gaussian posterior approximation
        nlZ = nlZ0
    else:
        ttau = infEP.last_ttau  # try the tilde values from previous call
        tnu = infEP.last_tnu
        [Sigma, mu, nlZ, L] = epComputeParams(K, y, ttau, tnu, likfunc, hyp, m, inffunc)
        if nlZ > nlZ0:  # if zero is better ..
            ttau = np.zeros((n, 1))  # .. then initialize with zero instead
            tnu = np.zeros((n, 1))
            Sigma = K  # initialize Sigma and mu, the parameters of ..
            mu = np.zeros((n, 1))  # .. the Gaussian posterior approximation
            nlZ = nlZ0

    nlZ_old = np.inf
    sweep = 0  # converged, max. sweeps or min. sweeps?
    while (np.abs(nlZ - nlZ_old) > tol and sweep < max_sweep) or (sweep < min_sweep):
        nlZ_old = nlZ
        sweep += 1
        rperm = range(n)  # sequential order here; a random permutation would give the update order described in the docstring
        for ii in rperm:  # iterate EP updates (in random order) over examples
            tau_ni = 1 / Sigma[ii, ii] - ttau[ii]  #  first find the cavity distribution ..
            nu_ni = mu[ii] / Sigma[ii, ii] + m[ii] * tau_ni - tnu[ii]  # .. params tau_ni and nu_ni
            # compute the desired derivatives of the individual log partition function
            vargout = src.Tools.general.feval(likfunc, hyp.lik, y[ii], nu_ni / tau_ni, 1 / tau_ni, inffunc, None, 3)
            lZ = vargout[0]
            dlZ = vargout[1]
            d2lZ = vargout[2]
            ttau_old = copy(ttau[ii])  # then find the new tilde parameters, keep copy of old

            ttau[ii] = -d2lZ / (1.0 + d2lZ / tau_ni)
            ttau[ii] = max(ttau[ii], 0)  # enforce positivity i.e. lower bound ttau by zero
            tnu[ii] = (dlZ + (m[ii] - nu_ni / tau_ni) * d2lZ) / (1.0 + d2lZ / tau_ni)

            ds2 = ttau[ii] - ttau_old  # finally rank-1 update Sigma ..
            si = np.reshape(Sigma[:, ii], (Sigma.shape[0], 1))
            Sigma = Sigma - ds2 / (1.0 + ds2 * si[ii]) * np.dot(si, si.T)  # takes 70% of total time
            mu = np.dot(Sigma, tnu)  # .. and recompute mu

        # recompute since repeated rank-one updates can destroy numerical precision
        [Sigma, mu, nlZ, L] = epComputeParams(K, y, ttau, tnu, likfunc, hyp, m, inffunc)

    if sweep == max_sweep:
        raise Exception("maximum number of sweeps reached in function infEP")

    infEP.last_ttau = ttau
    infEP.last_tnu = tnu  # remember for next call

    sW = np.sqrt(ttau)
    alpha = tnu - sW * solve_chol(L, sW * np.dot(K, tnu))

    post = postStruct()

    post.alpha = alpha  # return the posterior params
    post.sW = sW
    post.L = L

    if nargout > 1:
        if nargout > 2:  # do we want derivatives?
            dnlZ = dnlzStruct(hyp)  # allocate space for derivatives
            ssi = np.sqrt(ttau)
            V = np.linalg.solve(L.T, np.tile(ssi, (1, n)) * K)
            Sigma = K - np.dot(V.T, V)
            mu = np.dot(Sigma, tnu)
            Dsigma = np.reshape(np.diag(Sigma), (np.diag(Sigma).shape[0], 1))
            tau_n = 1 / Dsigma - ttau  # compute the log marginal likelihood
            nu_n = mu / Dsigma - tnu  # vectors of cavity parameters
            F = np.dot(alpha, alpha.T) - np.tile(sW, (1, n)) * solve_chol(
                L, np.diag(np.reshape(sW, (sW.shape[0],)))
            )  # covariance hypers
            for ii in range(len(hyp.cov)):
                dK = src.Tools.general.feval(covfunc, hyp.cov, x, None, ii)
                dnlZ.cov[ii] = -(F * dK).sum() / 2.0

            for ii in range(len(hyp.lik)):
                dlik = src.Tools.general.feval(likfunc, hyp.lik, y, nu_n / tau_n, 1 / tau_n, inffunc, ii, 1)
                dnlZ.lik[ii] = -dlik.sum()

            [junk, dlZ] = src.Tools.general.feval(
                likfunc, hyp.lik, y, nu_n / tau_n, 1 / tau_n, inffunc, None, 2
            )  # mean hyps
            for ii in range(len(hyp.mean)):
                dm = src.Tools.general.feval(meanfunc, hyp.mean, x, ii)
                dnlZ.mean[ii] = -np.dot(dlZ.T, dm)[0, 0]

            vargout = [post, nlZ, dnlZ]
        else:
            vargout = [post, nlZ]
    else:
        vargout = [post]

    return vargout
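epComputeParams is a library helper that rebuilds the posterior from the site parameters after each sweep. A minimal sketch of its Sigma/mu part, mirroring the recomputation in the derivative branch above (the real helper also returns nlZ and L):

def ep_refresh_sketch(K, ttau, tnu):
    # Sketch: Sigma = inv(inv(K) + diag(ttau)) via the stable Cholesky route
    # Sigma = K - V'*V with V = inv(L')*diag(sW)*K, then mu = Sigma*tnu.
    n = K.shape[0]
    sW = np.sqrt(ttau)  # sqrt of the site precisions
    L = np.linalg.cholesky(np.eye(n) + np.dot(sW, sW.T) * K).T
    V = np.linalg.solve(L.T, np.tile(sW, (1, n)) * K)
    Sigma = K - np.dot(V.T, V)
    mu = np.dot(Sigma, tnu)
    return Sigma, mu, L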
Example #4
def infFITC_EP(hyp, meanfunc, covfunc, likfunc, x, y, nargout=1):
    """ FITC-EP approximation to the posterior Gaussian process. The function is
     equivalent to infEP with the covariance function:
    
     Kt = Q + G; G = diag(g); g = diag(K-Q);  Q = Ku' * inv(Kuu + snu2 * eye(nu)) * Ku;
    
     where Ku and Kuu are covariances w.r.t. to inducing inputs xu and
     snu2 = sn2/1e6 is the noise of the inducing inputs. We fixed the standard
     deviation of the inducing inputs snu to be one per mil of the measurement
     noise's standard deviation sn. In case of a likelihood without noise
     parameter sn2, we simply use snu2 = 1e-6.
     For details, see The Generalized FITC Approximation, Andrew Naish-Guzman and Sean Holden, NIPS, 2007.
    
     The implementation exploits the Woodbury matrix identity
     inv(Kt) = inv(G) - inv(G) * Ku' * inv(Kuu+Ku * inv(G) * Ku') * Ku * inv(G)
     in order to be applicable to large datasets. The computational complexity
     is O(n nu^2) where n is the number of data points x and nu the number of
     inducing inputs in xu.
     The posterior N(f|h,Sigma) is given by h = m+mu with mu = nn + P' * gg and
     Sigma = inv(inv(K)+diag(W)) = diag(d) + P' * R0' * R' * R * R0 * P. Here the
     code variables map to the site parameters as: ttau, tnu = pi, b; P = P'; nn = nu; gg = gamma.
                 
     The function takes a specified covariance function (see kernels.py) and
     likelihood function (see likelihoods.py), and is designed to be used with
     gp.py and in conjunction with covFITC. 
    """

    cov1 = covfunc[0]
    if cov1 != ["kernels.covFITC"]:  # check cov
        print("cov1 =", cov1)
        raise Exception("Only covFITC supported.")

    tol = 1e-4
    max_sweep = 10
    min_sweep = 2  # tolerance to stop EP iterations

    inffunc = "inferences.infEP"

    diagK, Kuu, Ku = src.Tools.general.feval(covfunc, hyp.cov, x)  # evaluate the covariance matrix
    m = src.Tools.general.feval(meanfunc, hyp.mean, x)  # evaluate the mean vector

    if hyp.lik:  # hard coded inducing inputs noise
        sn2 = np.exp(2.0 * hyp.lik[-1])
        snu2 = 1.0e-6 * sn2  # similar to infFITC
    else:
        snu2 = 1.0e-6

    n, D = x.shape
    nu = Kuu.shape[0]

    rot180 = lambda A: np.rot90(np.rot90(A))  # little helper functions
    chol_inv = lambda A: np.linalg.solve(rot180(np.linalg.cholesky(rot180(A))), np.eye(nu))  # chol(inv(A))

    R0 = chol_inv(Kuu + snu2 * np.eye(nu))  # initial R, used for refresh O(nu^3)
    V = np.dot(R0, Ku)
    d0 = diagK - np.array([(V * V).sum(axis=0)]).T  # initial d, needed for refresh O(n*nu^2)

    # A note on naming: variables are given short but descriptive names in
    # accordance with Rasmussen & Williams "GPs for Machine Learning" (2006): mu
    # and s2 are mean and variance, nu and tau are natural parameters. A leading t
    # means tilde, a subscript _ni means "not i" (for cavity parameters), or _n
    # for a vector of cavity parameters.

    # marginal likelihood for ttau = tnu = zeros(n,1); equals n*log(2) for likCum*
    nlZ0 = -1.0 * src.Tools.general.feval(likfunc, hyp.lik, y, m, np.reshape(diagK, (diagK.shape[0], 1)), inffunc).sum()
    if "last_ttau" not in infFITC_EP.__dict__:  # find starting point for tilde parameters
        ttau = np.zeros((n, 1))  # initialize to zero if we have no better guess
        tnu = np.zeros((n, 1))
        [d, P, R, nn, gg] = _epfitcRefresh(d0, Ku, R0, V, ttau, tnu)  # compute initial repres.
        nlZ = nlZ0
    else:
        ttau = infFITC_EP.last_ttau  # try the tilde values from previous call
        tnu = infFITC_EP.last_tnu

        [d, P, R, nn, gg] = _epfitcRefresh(d0, Ku, R0, V, ttau, tnu)  # compute initial repres.
        nlZ = _epfitcZ(d, P, R, nn, gg, ttau, tnu, d0, R0, Ku, y, likfunc, hyp, m, inffunc)[0]
        if nlZ > nlZ0:  # if zero is better ..
            ttau = np.zeros((n, 1))  # .. then initialize with zero instead
            tnu = np.zeros((n, 1))
            [d, P, R, nn, gg] = _epfitcRefresh(d0, Ku, R0, V, ttau, tnu)  # initial repres.
            nlZ = nlZ0

    nlZ_old = np.inf
    sweep = 0  # converged, max. sweeps or min. sweeps?
    while (np.abs(nlZ - nlZ_old) > tol and sweep < max_sweep) or (sweep < min_sweep):
        nlZ_old = nlZ
        sweep += 1
        rperm = range(n)  # sequential order here; a random permutation would give the update order described in the docstring
        for ii in rperm:  # iterate EP updates (in random order) over examples
            p_i = np.reshape(P[:, ii], (P.shape[0], 1))
            t = np.dot(R, np.dot(R0, p_i))  # temporary variables
            sigma_i = d[ii] + np.dot(t.T, t)
            mu_i = nn[ii] + np.dot(p_i.T, gg)  # post moments O(nu^2)
            tau_ni = 1 / sigma_i - ttau[ii]  #  first find the cavity distribution ..
            nu_ni = mu_i / sigma_i + m[ii] * tau_ni - tnu[ii]  # .. params tau_ni and nu_ni
            # compute the desired derivatives of the individual log partition function
            vargout = src.Tools.general.feval(likfunc, hyp.lik, y[ii], nu_ni / tau_ni, 1 / tau_ni, inffunc, None, 3)
            lZ = vargout[0]
            dlZ = vargout[1]
            d2lZ = vargout[2]
            ttau_i = -d2lZ / (1.0 + d2lZ / tau_ni)
            ttau_i = max(ttau_i, 0)  # enforce positivity i.e. lower bound ttau by zero
            tnu_i = (dlZ + (m[ii] - nu_ni / tau_ni) * d2lZ) / (1.0 + d2lZ / tau_ni)
            [d, P[:, ii], R, nn, gg, ttau, tnu] = _epfitcUpdate(
                d, P[:, ii], R, nn, gg, ttau, tnu, ii, ttau_i, tnu_i, m, d0, Ku, R0
            )  # update representation

        # recompute since repeated rank-one updates can destroy numerical precision
        [d, P, R, nn, gg] = _epfitcRefresh(d0, Ku, R0, V, ttau, tnu)
        [nlZ, nu_n, tau_n] = _epfitcZ(d, P, R, nn, gg, ttau, tnu, d0, R0, Ku, y, likfunc, hyp, m, inffunc)

    if sweep == max_sweep:
        raise Exception("maximum number of sweeps reached in function infEP")

    infFITC_EP.last_ttau = ttau
    infFITC_EP.last_tnu = tnu  # remember for next call
    post = postStruct()

    post.sW = np.sqrt(ttau)  # unused for FITC_EP prediction with gp.py
    dd = 1 / (d0 + 1 / ttau)
    alpha = tnu / ttau * dd
    RV = np.dot(R, V)
    R0tV = np.dot(R0.T, V)

    alpha = alpha - np.dot(RV.T, np.dot(RV, alpha)) * dd  # long alpha vector for ordinary infEP
    post.alpha = np.dot(R0tV, alpha)
    B = R0tV * np.tile(dd.T, (nu, 1))
    L = np.dot(B, R0tV.T)
    B = np.dot(B, RV.T)
    post.L = np.dot(B, B.T) - L

    if nargout > 2:  # do we want derivatives?
        dnlZ = dnlzStruct(hyp)  # allocate space for derivatives
        RVdd = RV * np.tile(dd.T, (nu, 1))
        for ii in range(len(hyp.cov)):
            ddiagK, dKuu, dKu = src.Tools.general.feval(covfunc, hyp.cov, x, None, ii)
            dA = 2 * dKu.T - np.dot(R0tV.T, dKuu)
            w = np.atleast_2d((dA * R0tV.T).sum(axis=1)).T
            v = ddiagK - w
            z = (
                np.dot(dd.T, (v + w))
                - np.dot(np.atleast_2d((RVdd * RVdd).sum(axis=0)), v)
                - (np.dot(RVdd, dA).T * np.dot(R0tV, RVdd.T)).sum()
            )
            dnlZ.cov[ii] = (z - np.dot(alpha.T, (alpha * v)) - np.dot(np.dot(alpha.T, dA), np.dot(R0tV, alpha))) / 2.0
        for ii in range(len(hyp.lik)):  # likelihood hypers
            dlik = src.Tools.general.feval(likfunc, hyp.lik, y, nu_n / tau_n + m, 1 / tau_n, inffunc, ii, 1)
            dnlZ.lik[ii] = -dlik.sum()
            if ii == len(hyp.lik) - 1:
                # since snu2 is a fixed fraction of sn2, there is a covariance-like term in the derivative as well
                v = np.atleast_2d((R0tV * R0tV).sum(axis=0)).T
                z = (np.dot(RVdd, R0tV.T) ** 2).sum() - np.dot(np.atleast_2d((RVdd * RVdd).sum(axis=0)), v)
                z = z + np.dot(post.alpha.T, post.alpha) - np.dot(alpha.T, (v * alpha))
                dnlZ.lik[ii] += snu2 * z
        [junk, dlZ] = src.Tools.general.feval(
            likfunc, hyp.lik, y, nu_n / tau_n, 1 / tau_n, inffunc, None, 2
        )  # mean hyps
        for ii in range(len(hyp.mean)):
            dm = src.Tools.general.feval(meanfunc, hyp.mean, x, ii)
            dnlZ.mean[ii] = -np.dot(dlZ.T, dm)

        vargout = [post, nlZ[0][0], dnlZ]
    else:
        vargout = [post, nlZ[0][0]]

    return vargout
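The Woodbury identity quoted in the docstring is what makes the method scale; it can be checked numerically on toy matrices (a self-contained illustration, nothing here is part of the library):

import numpy as np

np.random.seed(0)
n, nu = 6, 3
Ku = np.random.randn(nu, n)              # toy cross-covariance Ku
Kuu = np.dot(Ku, Ku.T) + np.eye(nu)      # toy inducing covariance Kuu
g = 0.5 + np.random.rand(n, 1)           # toy diagonal G = diag(g)
iG = np.diag(1.0 / g.ravel())
Kt = np.dot(Ku.T, np.linalg.solve(Kuu, Ku)) + np.diag(g.ravel())  # Kt = Q + G
inner = Kuu + np.dot(Ku, np.dot(iG, Ku.T))
iKt = iG - np.dot(iG, np.dot(Ku.T, np.linalg.solve(inner, np.dot(Ku, iG))))
print(np.allclose(iKt, np.linalg.inv(Kt)))  # True: the identity holds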
Example #5
def infFITC_Laplace(hyp, meanfunc, covfunc, likfunc, x, y, nargout=1):
    """ infFITC_Laplace - FITC-Laplace approximation to the posterior Gaussian process. The function is
     equivalent to infLaplace with the covariance function:
    
       Kt = Q + G; G = diag(g); g = diag(K-Q);  Q = Ku' * inv(Kuu + snu2 * eye(nu)) * Ku
    
     where Ku and Kuu are covariances w.r.t. to inducing inputs xu and
     snu2 = sn2/1e6 is the noise of the inducing inputs. We fixed the standard
     deviation of the inducing inputs snu to be one per mil of the measurement
     noise's standard deviation sn. In case of a likelihood without noise
     parameter sn2, we simply use snu2 = 1e-6.
    
     The implementation exploits the Woodbury matrix identity
     inv(Kt) = inv(G) - inv(G) * Ku' * inv(Kuu+Ku * inv(G) * Ku') * Ku * inv(G)
     in order to be applicable to large datasets. The computational complexity
     is O(n nu^2) where n is the number of data points x and nu the number of
     inducing inputs in xu.
     The posterior N(f|h,Sigma) is given by h = m+mu with mu = nn + P' * gg and
     Sigma = inv(inv(K)+diag(W)) = diag(d) + P' * R0' * R' * R * R0 * P.
                 
     The function takes a specified covariance function (see kernels.py) and
     likelihood function (likelihoods.py), and is designed to be used with
     gp.py and in conjunction with covFITC. 
    """

    cov1 = covfunc[0]
    if cov1 != ["kernels.covFITC"]:  # check cov
        raise Exception("Only covFITC supported.")

    tol = 1e-6  # tolerance for when to stop the Newton iterations
    smax = 2; Nline = 100; thr = 1e-4  # line search parameters
    maxit = 20  # max number of Newton steps in f

    inffunc = "inferences.infLaplace"

    diagK, Kuu, Ku = src.Tools.general.feval(covfunc, hyp.cov, x)  # evaluate the covariance
    m = src.Tools.general.feval(meanfunc, hyp.mean, x)  # evaluate the mean vector

    if hyp.lik:  # hard coded inducing inputs noise
        sn2 = np.exp(2.0 * hyp.lik[-1])
        snu2 = 1.0e-6 * sn2  # similar to infFITC
    else:
        snu2 = 1.0e-6

    n, D = x.shape
    nu = Kuu.shape[0]

    rot180 = lambda A: np.rot90(np.rot90(A))  # little helper functions
    chol_inv = lambda A: np.linalg.solve(rot180(np.linalg.cholesky(rot180(A))), np.eye(nu))  # chol(inv(A))

    R0 = chol_inv(Kuu + snu2 * np.eye(nu))  # initial R, used for refresh O(nu^3)
    V = np.dot(R0, Ku)
    d0 = diagK - np.array([(V * V).sum(axis=0)]).T  # initial d, needed

    Psi_old = np.inf  # make sure while loop starts by the largest old objective val
    if "last_alpha" not in infFITC_Laplace.__dict__:  # find a good starting point for alpha and f
        alpha = np.zeros((n, 1))
        f = _mvmK(alpha, V, d0) + m  # start at the mean if sizes do not match
        vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
        lp = vargout[0]
        dlp = vargout[1]
        d2lp = vargout[2]
        W = -d2lp
        Psi_new = -lp.sum()
    else:
        alpha = infFITC_Laplace.last_alpha
        f = _mvmK(alpha, V, d0) + m  # try last one
        vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
        lp = vargout[0]
        dlp = vargout[1]
        d2lp = vargout[2]
        W = -d2lp
        Psi_new = np.dot(alpha.T, (f - m)) / 2.0 - lp.sum()  # objective for last alpha
        vargout = src.Tools.general.feval(likfunc, hyp.lik, y, m, None, inffunc, None, 1)
        Psi_def = -vargout.sum()  # objective for default init f==m
        if Psi_def < Psi_new:  # if default is better, we use it
            alpha = np.zeros((n, 1))
            f = _mvmK(alpha, V, d0) + m
            vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
            lp = vargout[0]
            dlp = vargout[1]
            d2lp = vargout[2]
            W = -d2lp
            Psi_new = -lp.sum()

    isWneg = np.any(W < 0)  # flag indicating whether we found negative values of W
    it = 0  # this happens for the Student's t likelihood

    while (Psi_old - Psi_new > tol) and it < maxit:  # begin Newton
        Psi_old = Psi_new
        it += 1
        if isWneg:  # stabilise the Newton direction in case W has negative values
            W = np.maximum(W, 0)  # stabilise the Hessian to guarantee positive definiteness
            tol = 1e-8
            # increase accuracy to also get the derivatives right
            # In Vanhatalo et al., GPR with Student's t likelihood, NIPS 2009, they use
            # a more conservative strategy than we do, equivalent to the 2 lines below.
            # nu  = exp(hyp.lik(1));                  # degree of freedom hyperparameter
            # W  = W + 2/(nu+1)*dlp.^2;               # add ridge according to Vanhatalo

        b = W * (f - m) + dlp
        dd = 1 / (1 + W * d0)
        RV = np.dot(chol_inv(np.eye(nu) + np.dot(V * np.tile((W * dd).T, (nu, 1)), V.T)), V)
        dalpha = dd * b - (W * dd) * np.dot(RV.T, np.dot(RV, (dd * b))) - alpha  # Newt dir + line search
        vargout = brentmin(0, smax, Nline, thr, _Psi_lineFITC, 4, dalpha, alpha, hyp, V, d0, m, likfunc, y, inffunc)
        s = vargout[0]
        Psi_new = vargout[1]
        Nfun = vargout[2]
        alpha = vargout[3]
        f = vargout[4]
        dlp = vargout[5]
        W = vargout[6]

        isWneg = np.any(W < 0)

    infFITC_Laplace.last_alpha = alpha  # remember for next call
    vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 4)
    lp = vargout[0]
    dlp = vargout[1]
    d2lp = vargout[2]
    d3lp = vargout[3]

    W = -d2lp
    isWneg = np.any(W < 0)
    post = postStruct()
    post.alpha = np.dot(R0.T, np.dot(V, alpha))  # return the posterior parameters
    post.sW = np.sqrt(np.abs(W)) * np.sign(W)  # preserve sign in case of negative
    dd = 1 / (1 + d0 * W)  # temporary variable O(n)
    A = np.eye(nu) + np.dot(V * np.tile((W * dd).T, (nu, 1)), V.T)  # temporary variable O(n*nu^2)
    R0tV = np.dot(R0.T, V)
    B = R0tV * np.tile((W * dd).T, (nu, 1))  # temporary variables O(n*nu^2)
    post.L = -np.dot(B, R0tV.T)  # L = -R0'*V*inv(Kt+diag(1./ttau))*V'*R0, first part
    if np.any(1 + d0 * W < 0):
        # B = np.dot(B, V.T); post.L += np.dot(np.dot(B, np.linalg.inv(A)), B.T)
        # nlZ = np.nan; dnlZ = struct('cov',0*hyp.cov, 'mean',0*hyp.mean, 'lik',0*hyp.lik);
        raise Exception("W is too negative; nlZ and dnlZ cannot be computed.")

    nlZ = (
        np.dot(alpha.T, (f - m)) / 2.0
        - lp.sum()
        - np.log(dd).sum() / 2.0
        + np.log(np.diag(np.linalg.cholesky(A).T)).sum()
    )
    RV = np.dot(chol_inv(A), V)
    RVdd = RV * np.tile((W * dd).T, (nu, 1))  # RVdd needed for dnlZ
    B = np.dot(B, RV.T)
    post.L += np.dot(B, B.T)

    if nargout > 2:  # do we want derivatives?
        dnlZ = dnlzStruct(hyp)  # allocate space for derivatives
        [d, P, R] = _fitcRefresh(d0, Ku, R0, V, W)  # g = diag(inv(inv(K)+W))/2
        g = d / 2 + 0.5 * np.atleast_2d((np.dot(np.dot(R, R0), P) ** 2).sum(axis=0)).T
        t = W / (1 + W * d0)

        dfhat = g * d3lp  # deriv. of nlZ wrt. fhat: dfhat=diag(inv(inv(K)+W)).*d3lp/2
        for ii in range(len(hyp.cov)):  # covariance hypers
            ddiagK, dKuu, dKu = src.Tools.general.feval(covfunc, hyp.cov, x, None, ii)  # eval cov derivatives
            dA = 2.0 * dKu.T - np.dot(R0tV.T, dKuu)  # dQ = dA*R0tV
            w = np.atleast_2d((dA * R0tV.T).sum(axis=1)).T
            v = ddiagK - w  # w = diag(dQ); v = diag(dK)-diag(dQ);
            dnlZ.cov[ii] = np.dot(ddiagK.T, t) - np.dot((RVdd * RVdd).sum(axis=0), v)  # explicit part
            dnlZ.cov[ii] -= (np.dot(RVdd, dA) * np.dot(RVdd, R0tV.T)).sum()  # explicit part
            dnlZ.cov[ii] = (
                0.5 * dnlZ.cov[ii] - np.dot(alpha.T, np.dot(dA, np.dot(R0tV, alpha)) + v * alpha) / 2.0
            )  # explicit
            b = np.dot(dA, np.dot(R0tV, dlp)) + v * dlp  # b-K*(Z*b) = inv(eye(n)+K*diag(W))*b
            KZb = _mvmK(_mvmZ(b, RVdd, t), V, d0)
            dnlZ.cov[ii] -= np.dot(dfhat.T, (b - KZb))  # implicit part

        for ii in range(len(hyp.lik)):  # likelihood hypers
            vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, ii, 3)
            lp_dhyp = vargout[0]
            dlp_dhyp = vargout[1]
            d2lp_dhyp = vargout[2]
            dnlZ.lik[ii] = -np.dot(g.T, d2lp_dhyp) - lp_dhyp.sum()  # explicit part
            b = _mvmK(dlp_dhyp, V, d0)  # implicit part
            dnlZ.lik[ii] -= np.dot(dfhat.T, b - _mvmK(_mvmZ(b, RVdd, t), V, d0))
            if ii == len(hyp.lik) - 1:
                # since snu2 is a fixed fraction of sn2, there is a covariance-like term in the derivative as well
                snu = np.sqrt(snu2)
                T = chol_inv(Kuu + snu2 * np.eye(nu))
                T = np.dot(T.T, np.dot(T, snu * Ku))
                t = np.array([(T * T).sum(axis=0)]).T
                z = np.dot(alpha.T, np.dot(T.T, np.dot(T, alpha)) - t * alpha) - np.dot(
                    np.array([(RVdd * RVdd).sum(axis=0)]), t
                )
                z += (np.dot(RVdd, T.T) ** 2).sum()
                b = (t * dlp - np.dot(T.T, np.dot(T, dlp))) / 2.0
                KZb = _mvmK(_mvmZ(b, RVdd, t), V, d0)
                z -= np.dot(dfhat.T, b - KZb)
                dnlZ.lik[ii] += z

        for ii in range(len(hyp.mean)):  # mean hypers
            dm = src.Tools.general.feval(meanfunc, hyp.mean, x, ii)
            dnlZ.mean[ii] = -np.dot(alpha.T, dm)  # explicit part
            Zdm = _mvmZ(dm, RVdd, t)
            dnlZ.mean[ii] -= np.dot(dfhat.T, (dm - _mvmK(Zdm, V, d0)))  # implicit part

        vargout = [post, nlZ[0, 0], dnlZ]
    else:
        vargout = [post, nlZ[0, 0]]

    return vargout
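_mvmK and _mvmZ are library helpers. Given how V and d0 are constructed above, the FITC covariance is Kt = V'*V + diag(d0), so a plausible sketch of _mvmK (an inference from usage, not the library's actual code) is:

def mvmK_sketch(al, V, d0):
    # Sketch: multiply Kt = V'*V + diag(d0) by al in O(n*nu),
    # without ever forming the dense n x n matrix Kt.
    return np.dot(V.T, np.dot(V, al)) + d0 * al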
Example #6
def infFITC(hyp, meanfunc, covfunc, likfunc, x, y, nargout=1):
    """ FITC approximation to the posterior Gaussian process. The function is
     equivalent to infExact with the covariance function:
    
     Kt = Q + G; G = diag(g); g = diag(K-Q);  Q = Ku' * inv(Quu) * Ku;
    
     where Ku and Kuu are covariances w.r.t. to inducing inputs xu, snu2 = sn2/1e6
     is the noise of the inducing inputs and Quu = Kuu + snu2 * eye(nu).
     We fixed the standard deviation of the inducing inputs snu to be a one per mil
     of the measurement noise's standard deviation sn.
     The implementation exploits the Woodbury matrix identity
     inv(Kt) = inv(G) - inv(G) * V' * inv(eye(nu) + V * inv(G) * V') * V * inv(G)
     in order to be applicable to large datasets. The computational complexity
     is O(n nu^2) where n is the number of data points x and nu the number of
     inducing inputs in xu.
     The function takes a specified covariance function (see kernels.py) and
     likelihood function (see likelihoods.py), and is designed to be used with
     gp.py and in conjunction with covFITC and likGauss. 
    """

    if not (likfunc[0] == "likelihoods.likGauss"):  # NOTE: no explicit call to likGauss
        raise Exception("Exact inference only possible with Gaussian likelihood")

    cov1 = covfunc[0]
    if cov1 != ["kernels.covFITC"]:  # check cov
        raise Exception("Only covFITC supported.")

    diagK, Kuu, Ku = src.Tools.general.feval(covfunc, hyp.cov, x)  # evaluate covariance matrix
    m = src.Tools.general.feval(meanfunc, hyp.mean, x)  # evaluate mean vector
    n, D = x.shape
    nu = Kuu.shape[0]

    sn2 = np.exp(2 * hyp.lik[0])  # noise variance of likGauss
    snu2 = 1.0e-6 * sn2  # hard coded inducing inputs noise
    Luu = np.linalg.cholesky(Kuu + snu2 * np.eye(nu)).T  # Kuu + snu2*I = Luu'*Luu
    V = np.linalg.solve(Luu.T, Ku)  # V = inv(Luu')*Ku => V'*V = Q
    g_sn2 = diagK + sn2 - np.array([(V * V).sum(axis=0)]).T  # g + sn2 = diag(K) + sn2 - diag(Q)
    Lu = np.linalg.cholesky(np.eye(nu) + np.dot(V / np.tile(g_sn2.T, (nu, 1)), V.T)).T  # Lu'*Lu=I+V*diag(1/g_sn2)*V'
    r = (y - m) / np.sqrt(g_sn2)
    be = np.linalg.solve(Lu.T, np.dot(V, r / np.sqrt(g_sn2)))
    iKuu = solve_chol(Luu, np.eye(nu))  # inv(Kuu + snu2*I) = iKuu

    post = postStruct()

    post.alpha = np.linalg.solve(Luu, np.linalg.solve(Lu, be))  # return the posterior parameters
    post.L = solve_chol(np.dot(Lu, Luu), np.eye(nu)) - iKuu  # Sigma-inv(Kuu)
    post.sW = np.ones((n, 1)) / np.sqrt(sn2)  # unused for FITC prediction with gp.py

    if nargout > 1:  # do we want the marginal likelihood
        nlZ = (
            np.log(np.diag(Lu)).sum()
            + (np.log(g_sn2).sum() + n * np.log(2 * np.pi) + np.dot(r.T, r) - np.dot(be.T, be)) / 2.0
        )
        if nargout > 2:  # do we want derivatives?
            dnlZ = dnlzStruct(hyp)  # allocate space for derivatives
            al = r / np.sqrt(g_sn2) - np.dot(V.T, np.linalg.solve(Lu, be)) / g_sn2  # al = (Kt+sn2*eye(n))\y
            B = np.dot(iKuu, Ku)
            w = np.dot(B, al)
            W = np.linalg.solve(Lu.T, V / np.tile(g_sn2.T, (nu, 1)))
            for ii in range(len(hyp.cov)):
                [ddiagKi, dKuui, dKui] = src.Tools.general.feval(covfunc, hyp.cov, x, None, ii)  # eval cov deriv
                R = 2.0 * dKui - np.dot(dKuui, B)
                v = ddiagKi - np.array([(R * B).sum(axis=0)]).T  # diag part of cov deriv
                dnlZ.cov[ii] = (
                    np.dot(ddiagKi.T, 1.0 / g_sn2)
                    + np.dot(w.T, (np.dot(dKuui, w) - 2.0 * np.dot(dKui, al)))
                    - np.dot(al.T, (v * al))
                    - np.dot(np.array([(W * W).sum(axis=0)]), v)
                    - (np.dot(R, W.T) * np.dot(B, W.T)).sum()
                ) / 2.0

            dnlZ.lik = sn2 * ((1.0 / g_sn2).sum() - (np.array([(W * W).sum(axis=0)])).sum() - np.dot(al.T, al))
            # since snu2 is a fixed fraction of sn2, there is a covariance-like term in the derivative as well
            dKuui = 2 * snu2
            R = -dKuui * B
            v = -np.array([(R * B).sum(axis=0)]).T  # diag part of cov deriv
            dnlZ.lik += (
                np.dot(w.T, np.dot(dKuui, w))
                - np.dot(al.T, (v * al))
                - np.dot(np.array([(W * W).sum(axis=0)]), v)
                - (np.dot(R, W.T) * np.dot(B, W.T)).sum()
            ) / 2.0
            dnlZ.lik = dnlZ.lik[0]
            for ii in range(len(hyp.mean)):
                dnlZ.mean[ii] = np.dot(-src.Tools.general.feval(meanfunc, hyp.mean, x, ii).T, al)[0, 0]

            return [post, nlZ[0, 0], dnlZ]

        return [post, nlZ[0, 0]]

    return [post]
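To see what the FITC construction in the docstring amounts to, one can form Kt = Q + diag(g) explicitly for a toy kernel and compare it with the full covariance (a self-contained illustration; the function above never builds the dense Kt):

import numpy as np

def rbf(a, b):
    # toy squared-exponential kernel with unit lengthscale, for illustration
    d2 = ((a[:, None, :] - b[None, :, :]) ** 2).sum(-1)
    return np.exp(-0.5 * d2)

np.random.seed(1)
x = np.random.randn(20, 1)   # toy inputs
xu = x[::4]                  # toy inducing inputs: every 4th point
K, Kuu, Ku = rbf(x, x), rbf(xu, xu), rbf(xu, x)
snu2 = 1e-6                  # hard coded inducing-input noise, as above
Q = np.dot(Ku.T, np.linalg.solve(Kuu + snu2 * np.eye(len(xu)), Ku))
Kt = Q + np.diag(np.diag(K - Q))  # exact diagonal, low-rank off-diagonal
print(np.abs(Kt - K).max())       # off-diagonal approximation error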