def finalize(self):
    """Finalize the fit, utilizing the already inverted Sigma matrix"""
    # Calculate the prediction quantities
    if len(self.dtp) > 0:
        self.MNt_n = np.dot(self.Mp_n.T, sl.cho_solve(self.Np_cf, self.dtp))
        dtNdt = np.dot(self.dtp, sl.cho_solve(self.Np_cf, self.dtp))
    else:
        self.MNt_n = np.zeros(self.Mp_n.shape[1])
        dtNdt = 0.0

    self.dpars_n = np.dot(self.Sigma_n, self.MNt_n + self.phipar_n)

    # TODO: should use dpars, instead of MNt below here???
    self.rp = np.dot(self.Mtot_n, np.dot(self.Sigma_n, self.MNt_n))  # Should be approx ~0.0
    self.rr = np.dot(self.Mtot_n, np.dot(self.Sigma_n, self.Mtot_n.T))

    # Calculate the log-likelihood
    logdetN2 = np.sum(np.log(np.diag(self.Np_cf[0])))
    logdetphi2 = 0.5 * np.sum(np.log(self.Phivec_n))
    chi2dt = 0.5 * dtNdt
    chi2phi = 0.5 * np.sum(self.prpars_delta_n**2 / self.Phivec_n)
    chi2phi1 = 0.5 * np.dot(self.dpars_n, np.dot(self.Sigma_inv_n, self.dpars_n))
    chi2_active = 0.5 * np.dot(self.dpars_n, np.dot(self.Sigma_inv_n, self.dpars_n))

    # NOTE: chi2_active is zero _if_ we move to the ML solution. We are dpars
    #       away from there. That's why we subtract it from loglik.
    #       Note also that, in this rescaling, chi2phi1 and chi2_active are
    #       the same.
    self.loglik = -logdetN2 - logdetphi2 - chi2dt - chi2phi + chi2phi1 - chi2_active
    self.loglik_ml = -logdetN2 - logdetphi2 - chi2dt - chi2phi + chi2phi1
def predict(self, y, t):
    """
    Compute the conditional predictive distribution of the model.

    :param y: ``(nsamples,)``
        The observations to condition the model on.

    :param t: ``(ntest,)`` or ``(ntest, ndim)``
        The coordinates where the predictive distribution should be computed.

    Returns a tuple ``(mu, cov)`` where

    * **mu** ``(ntest,)`` is the mean of the predictive distribution, and
    * **cov** ``(ntest, ntest)`` is the predictive covariance.

    """
    self.recompute()
    r = self._check_dimensions(y)[self.inds] - self.mean(self._x)
    xs, i = self.parse_samples(t, False)
    alpha = cho_solve(self._factor, r)

    # Compute the predictive mean.
    Kxs = self.kernel(self._x[None, :], xs[:, None])
    mu = np.dot(Kxs, alpha) + self.mean(xs)

    # Compute the predictive covariance.
    cov = self.kernel(xs[:, None], xs[None, :])
    cov -= np.dot(Kxs, cho_solve(self._factor, Kxs.T))

    return mu, cov
def predict(self, pv, flux=None, inputs=None, inputs_pred=None,
            mean_only=True, splits=None):
    flux = flux if flux is not None else self.data.masked_flux
    iptr = inputs if inputs is not None else self.data.masked_inputs
    ippr = inputs_pred if inputs_pred is not None else iptr

    K0 = self.compute_cmat(pv, iptr, iptr, add_wn=False, splits=splits)
    K = K0 + self._pv[-1]**2 * identity(K0.shape[0])

    if inputs_pred is None:
        Ks = K0.copy()
        Kss = K.copy()
    else:
        Ks = self.compute_cmat(pv, ippr, ippr, add_wn=False, splits=splits)
        Kss = self.compute_cmat(pv, ippr, ippr, add_wn=True, splits=splits)

    L = sla.cho_factor(K)
    b = sla.cho_solve(L, flux)
    mu = dot(Ks, b)

    if mean_only:
        return mu
    else:
        b = sla.cho_solve(L, Ks.T)
        cov = Kss - dot(Ks, b)
        err = np.sqrt(diag(cov))
        return mu, err
def cho_solve_ATAI(A, rho, b, c, lwr, check_finite=True):
    r"""
    Solve the linear system :math:`(A^T A + \rho I)\mathbf{x} = \mathbf{b}`
    or :math:`(A^T A + \rho I)X = B` using :func:`scipy.linalg.cho_solve`.

    Parameters
    ----------
    A : array_like
      Matrix :math:`A`
    rho : float
      Scalar :math:`\rho`
    b : array_like
      Vector :math:`\mathbf{b}` or matrix :math:`B`
    c : array_like
      Matrix containing lower or upper triangular Cholesky factor,
      as returned by :func:`scipy.linalg.cho_factor`
    lwr : bool
      Flag indicating whether the factor is lower or upper triangular

    Returns
    -------
    x : ndarray
      Solution to the linear system
    """

    N, M = A.shape
    if N >= M:
        x = linalg.cho_solve((c, lwr), b, check_finite=check_finite)
    else:
        # Matrix inversion lemma: here c factors the smaller N x N system
        # (A A^T + rho I), and the solution of the M x M system is recovered
        # via x = (b - A^T (A A^T + rho I)^{-1} A b) / rho.
        x = (b - A.T.dot(linalg.cho_solve((c, lwr), A.dot(b),
                                          check_finite=check_finite))) / rho
    return x
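# A minimal usage sketch (editorial addition, not part of the original
# module): check that the matrix-inversion-lemma branch of cho_solve_ATAI
# matches a direct dense solve when A has fewer rows than columns. All
# names below are illustrative.
import numpy as np
from scipy import linalg

A = np.random.randn(5, 20)   # N < M triggers the Woodbury branch
rho = 0.1
b = np.random.randn(20)
# For N < M the factor must be of the smaller system (A A^T + rho I)
c, lwr = linalg.cho_factor(A.dot(A.T) + rho * np.identity(5))
x = cho_solve_ATAI(A, rho, b, c, lwr)
x_ref = np.linalg.solve(A.T.dot(A) + rho * np.identity(20), b)
assert np.allclose(x, x_ref)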
def rakeDistortionlessFilters(self, source, interferer, R_n, delay=0.03,
                              epsilon=5e-3):
    '''
    Compute time-domain filters of a beamformer minimizing noise and
    interference while forcing a distortionless response towards the
    source.
    '''

    H = buildRIRMatrix(self.R, (source, interferer), self.Lg, self.Fs,
                       epsilon=epsilon, unit_damping=True)
    L = H.shape[1] // 2  # integer division for use as an index

    # We first assume the samples are uncorrelated
    K_nq = np.dot(H[:, L:], H[:, L:].T) + R_n

    # constraint
    kappa = int(delay * self.Fs)
    A = H[:, :L]
    b = np.zeros((L, 1))
    b[kappa, 0] = 1

    # filter computation
    C = la.cho_factor(K_nq, overwrite_a=True, check_finite=False)
    B = la.cho_solve(C, A)
    D = np.dot(A.T, B)
    C = la.cho_factor(D, overwrite_a=True, check_finite=False)
    x = la.cho_solve(C, b)
    g_val = np.dot(B, x)

    # reshape and store
    self.filters = g_val.reshape((self.M, self.Lg))

    # compute and return SNR
    A = np.dot(g_val.T, H[:, :L])
    num = np.dot(A, A.T)
    denom = np.dot(np.dot(g_val.T, K_nq), g_val)

    return num / denom
def _calculate_log_likelihood(self):
    # if self.m is None:
    #     Give error message
    R = zeros((self.n, self.n))
    X, Y = array(self.X), array(self.Y)
    thetas = 10.**self.thetas
    for i in range(self.n):
        for j in arange(i + 1, self.n):
            # weighted distance formula
            R[i, j] = (1 - self.nugget) * e**(-sum(thetas * (X[i] - X[j])**2.))
    R = R + R.T + eye(self.n)
    self.R = R
    one = ones(self.n)
    try:
        self.R_fact = cho_factor(R)
        rhs = vstack([Y, one]).T
        R_fact = (self.R_fact[0].T, not self.R_fact[1])
        cho = cho_solve(R_fact, rhs).T

        self.mu = dot(one, cho[0]) / dot(one, cho[1])
        self.sig2 = dot(Y - dot(one, self.mu),
                        cho_solve(self.R_fact, (Y - dot(one, self.mu)))) / self.n
        # self.log_likelihood = -self.n/2.*log(self.sig2) \
        #     - 1./2.*log(abs(det(self.R)+1.e-16)) - sum(thetas)
        self.log_likelihood = -self.n/2.*log(self.sig2) \
            - 1./2.*log(abs(det(self.R) + 1.e-16))
    except (linalg.LinAlgError, ValueError):
        # ------ LSTSQ ---------
        self.R_fact = None  # reset this to None, so we know not to use Cholesky
        # self.R = self.R + diag([10e-6]*self.n)  # improve conditioning [Booker et al., 1999]
        rhs = vstack([Y, one]).T
        lsq = lstsq(self.R.T, rhs)[0].T

        self.mu = dot(one, lsq[0]) / dot(one, lsq[1])
        self.sig2 = dot(Y - dot(one, self.mu),
                        lstsq(self.R, Y - dot(one, self.mu))[0]) / self.n
        self.log_likelihood = -self.n/2.*log(self.sig2) \
            - 1./2.*log(abs(det(self.R) + 1.e-16))
def loglik_full(self, l_a, l_rho, Agw, gammagw):
    """
    Given all these parameters, calculate the full likelihood

    @param l_a:     List of Fourier coefficient arrays for all pulsars
    @param l_rho:   List of arrays of log10(PSD) amplitudes for all pulsars
    @param Agw:     log10(GW amplitude)
    @param gammagw: GWB spectral index

    @return:    Log-likelihood
    """
    # Transform the GWB parameters to PSD coefficients (pc)
    pc_gw = self.gwPSD(Agw, gammagw)

    rv = 0.0
    for ii, freq in enumerate(self.freqs):
        a_cos = l_a[:, 2*ii]      # Cosine modes for f=freq
        a_sin = l_a[:, 2*ii+1]    # Sine modes for f=freq
        rho = l_rho[:, ii]        # PSD amp for f=freq

        # Covariance matrix is the same for sine and cosine modes
        cov = np.diag(10**rho) + self.hdmat * pc_gw[ii]
        cf = sl.cho_factor(cov)
        logdet = 2 * np.sum(np.log(np.diag(cf[0])))

        # Add the log-likelihood for the cosine and the sine modes
        rv += -0.5 * np.dot(a_cos, sl.cho_solve(cf, a_cos)) - \
            0.5 * np.dot(a_sin, sl.cho_solve(cf, a_sin)) - \
            2*self.Npsr*np.log(2*np.pi) - logdet

    return rv
def _update_cache(self):
    """
    INPUT: hyperparams: dictionary
    OUTPUT: dictionary with the fields
        K:     kernel
        Kinv:  inverse of the kernel
        L:     chol(K)
        alpha: solve(K,y)
        W:     t*Kinv - alpha*alpha^T
    """
    cov_params_have_changed = self.covar.params_have_changed

    if cov_params_have_changed or self.Y_has_changed:
        K = self.covar.K()
        L = LA.cholesky(K).T  # lower triangular
        Kinv = LA.cho_solve((L, True), SP.eye(L.shape[0]))
        alpha = LA.cho_solve((L, True), self.Y)
        W = self.t * Kinv - SP.dot(alpha, alpha.T)
        self._covar_cache = {}
        self._covar_cache['K'] = K
        self._covar_cache['Kinv'] = Kinv
        self._covar_cache['L'] = L
        self._covar_cache['alpha'] = alpha
        self._covar_cache['W'] = W

    return self._covar_cache
def cal_varcov(self, θ2_vec):
    """calculate variance covariance matrix"""
    θ2, ix_θ2_T, Z, LinvW, X1 = (
        self.θ2, self.ix_θ2_T, self.Z, self.LinvW, self.X1)

    θ2.T[ix_θ2_T] = θ2_vec

    # update δ
    δ = self.cal_δ(θ2)

    jacob = self.cal_jacobian(θ2, δ)

    θ1, ξ = self.cal_θ1_and_ξ(δ)

    Zres = Z * ξ.reshape(-1, 1)
    Ω = Zres.T @ Zres  # covariance of the momconds

    G = (np.c_[X1, jacob].T @ Z).T  # gradient of the momconds

    WG = cho_solve(LinvW, G)
    WΩ = cho_solve(LinvW, Ω)

    tmp = solve(G.T @ WG, G.T @ WΩ @ WG).T  # G'WΩWG(G'WG)^(-1)

    varcov = solve((G.T @ WG), tmp)

    return varcov
def get_covariances(self, hyperparams):
    """
    INPUT: hyperparams: dictionary
    OUTPUT: dictionary with the fields
        K:     kernel
        Kinv:  inverse of the kernel
        L:     chol(K)
        alpha: solve(K,y)
        W:     t*Kinv - alpha*alpha^T
    """
    if self._is_cached(hyperparams):
        return self._covar_cache

    K = self.covar.K(hyperparams['covar'])
    if self.likelihood is not None:
        Knoise = self.likelihood.K(hyperparams['lik'], self.n)
        K += Knoise
    L = LA.cholesky(K).T  # lower triangular
    alpha = LA.cho_solve((L, True), self.Y)
    Kinv = LA.cho_solve((L, True), SP.eye(L.shape[0]))
    W = self.t * Kinv - SP.dot(alpha, alpha.T)
    self._covar_cache = {}
    self._covar_cache['K'] = K
    self._covar_cache['Kinv'] = Kinv
    self._covar_cache['L'] = L
    self._covar_cache['alpha'] = alpha
    self._covar_cache['W'] = W
    self._covar_cache['hyperparams'] = copy.deepcopy(hyperparams)
    return self._covar_cache
def _LMLgrad_covar_debug(self, covar):

    assert self.N * self.P < 2000, 'gp2kronSum:: N*P>=2000'

    y = SP.reshape(self.Y, (self.N * self.P), order='F')

    K = SP.kron(self.Cg.K(), self.XX)
    K += SP.kron(self.Cn.K() + self.offset * SP.eye(self.P), SP.eye(self.N))

    cholK = LA.cholesky(K).T
    Ki = LA.cho_solve((cholK, True), SP.eye(y.shape[0]))
    Kiy = LA.cho_solve((cholK, True), y)

    if covar == 'Cr':
        n_params = self.Cr.getNumberParams()
    elif covar == 'Cg':
        n_params = self.Cg.getNumberParams()
    elif covar == 'Cn':
        n_params = self.Cn.getNumberParams()

    RV = SP.zeros(n_params)

    for i in range(n_params):
        # 0. calc grad_i
        if covar == 'Cg':
            C = self.Cg.Kgrad_param(i)
            Kgrad = SP.kron(C, self.XX)
        elif covar == 'Cn':
            C = self.Cn.Kgrad_param(i)
            Kgrad = SP.kron(C, SP.eye(self.N))

        # 1. der of log det
        RV[i] = 0.5 * (Ki * Kgrad).sum()

        # 2. der of quad form
        RV[i] -= 0.5 * (Kiy * SP.dot(Kgrad, Kiy)).sum()

    return RV
def elbo(params, mask, *args):
    """ELBO with full posterior covariance matrix"""
    t, mu, post_cov = args
    K, dK = kernel(t, params)
    dK *= mask[np.newaxis, np.newaxis, :]
    try:
        L = cholesky(K, lower=True)
    except LinAlgError:
        return -np.inf, np.zeros_like(params)

    Kinv = cho_solve((L, True), np.eye(K.shape[0]))  # K inverse

    if mu.ndim == 1:
        mu = mu[:, np.newaxis]

    alpha = cho_solve((L, True), mu)
    ll_dims = -0.5 * np.einsum("ik,ik->k", mu, alpha)
    tmp = np.einsum("ik,jk->ijk", alpha, alpha)
    tmp -= Kinv[:, :, np.newaxis]

    for i in range(post_cov.shape[-1]):
        KinvSigma = cho_solve((L, True), post_cov[:, :, i])
        ll_dims[i] -= 0.5 * np.trace(KinvSigma)
        tmp[:, :, i] += KinvSigma @ Kinv

    ll_dims -= np.log(np.diag(L)).sum()
    ll = ll_dims.sum(-1)

    dll_dims = 0.5 * np.einsum("ijl,ijk->kl", tmp, dK)
    dll = dll_dims.sum(-1)

    return ll, dll
def predict(self, y, t):
    """
    Compute the conditional predictive distribution of the model.

    :param y: ``(nsamples, )``
        The observations to condition the model on.

    :param t: ``(ntest, )``
        The coordinates where the predictive distribution should be
        computed.

    :returns mu: ``(ntest, )``
        The mean of the predictive distribution.

    :returns cov: ``(ntest, ntest)``
        The predictive covariance.

    """
    r = self._check_dimensions(y)
    xs, i = self._parse_samples(t, False)
    alpha = cho_solve(self._factor, r)

    # Compute the predictive mean.
    Kxs = self._kernel(self._x[None, :], xs[:, None])
    mu = np.dot(Kxs, alpha)

    # Compute the predictive covariance.
    cov = self._kernel(xs[:, None], xs[None, :])
    cov -= np.dot(Kxs, cho_solve(self._factor, Kxs.T))

    return mu, cov
def grad_nlogprob(hypers):
    amp2 = np.exp(hypers[0])
    noise = np.exp(hypers[1])
    ls = np.exp(hypers[2:])

    chol, corr, grad_corr = memoize(amp2, noise, ls)
    solve = spla.cho_solve((chol, True), diffs)
    inv_cov = spla.cho_solve((chol, True), np.eye(chol.shape[0]))

    jacobian = np.outer(solve, solve) - inv_cov

    grad = np.zeros(self.D + 2)

    # Log amplitude gradient.
    grad[0] = 0.5 * np.trace(np.dot(
        jacobian, corr + 1e-6 * np.eye(chol.shape[0]))) * amp2

    # Log noise gradient.
    grad[1] = 0.5 * np.trace(np.dot(jacobian, np.eye(chol.shape[0]))) * noise

    # Log length scale gradients.
    for dd in range(self.D):
        grad[dd + 2] = 1 * np.trace(np.dot(
            jacobian,
            -amp2 * grad_corr[:, :, dd] * comp[:, dd][:, np.newaxis]
            / (np.exp(ls[dd])))) * np.exp(ls[dd])

    # Roll in the prior variance.
    # grad -= 2*hypers/self.hyper_prior

    return -grad
def compute_logprod_derivative(Alup, dA, B, dB):
    """ I = logdet(A) + Tr(inv(A)*B)
        dI/dx = Tr(inv(A)*(dA - dA*inv(A)*B + dB))
    """
    tmp = lalg.cho_solve(Alup, B, check_finite=False)
    tmp2 = dA + dB - dA.dot(tmp)
    return np.trace(lalg.cho_solve(Alup, tmp2, check_finite=False))
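# A minimal check (editorial sketch, not from the original module) that
# compute_logprod_derivative matches a central finite difference of
# I(x) = logdet(A + x*dA) + Tr(inv(A + x*dA) @ (B + x*dB)) at x = 0.
import numpy as np
from scipy import linalg as lalg

rng = np.random.default_rng(0)
A = np.identity(4) + 0.1 * rng.standard_normal((4, 4))
A = A @ A.T                          # symmetric positive definite
dA = rng.standard_normal((4, 4)); dA = dA + dA.T
B = rng.standard_normal((4, 4))
dB = rng.standard_normal((4, 4))

def I(x):
    Ax = A + x * dA
    _, ld = np.linalg.slogdet(Ax)
    return ld + np.trace(np.linalg.solve(Ax, B + x * dB))

Alup = lalg.cho_factor(A)
eps = 1e-6
assert np.isclose(compute_logprod_derivative(Alup, dA, B, dB),
                  (I(eps) - I(-eps)) / (2 * eps), rtol=1e-4)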
def LMLdebug(self):
    """
    LML function for debug
    """
    assert self.N * self.P < 5000, 'gp2kronSum:: N*P>=5000'

    y = SP.reshape(self.Y, (self.N * self.P), order='F')
    V = SP.kron(SP.eye(self.P), self.F)

    XX = SP.dot(self.Xr, self.Xr.T)
    K = SP.kron(self.Cr.K(), XX)
    K += SP.kron(self.Cn.K() + self.offset * SP.eye(self.P), SP.eye(self.N))

    # inverse of K
    cholK = LA.cholesky(K)
    Ki = LA.cho_solve((cholK, False), SP.eye(self.N * self.P))

    # Areml and inverse
    Areml = SP.dot(V.T, SP.dot(Ki, V))
    cholAreml = LA.cholesky(Areml)
    Areml_i = LA.cho_solve((cholAreml, False), SP.eye(self.K * self.P))

    # effect sizes and z
    b = SP.dot(Areml_i, SP.dot(V.T, SP.dot(Ki, y)))
    z = y - SP.dot(V, b)
    Kiz = SP.dot(Ki, z)

    # lml
    lml = y.shape[0] * SP.log(2 * SP.pi)
    lml += 2 * SP.log(SP.diag(cholK)).sum()
    lml += 2 * SP.log(SP.diag(cholAreml)).sum()
    lml += SP.dot(z, Kiz)
    lml *= 0.5

    return lml
def find_likelihood_der(self, X, y):
    """
    Find the negative log likelihood and its partial derivatives.

    Parameters
    ----------
    X : array
        Input coordinates.
    y : array
        Target values.

    Returns
    -------
    nll : float
        Negative log marginal likelihood.
    ders : array
        Partial derivatives with respect to the kernel parameters.
    """
    n = len(X)
    K = self.cf.eval(X)
    # if len(self.krnds) != K.shape[0]:
    #     print("Created new self.krnds!")
    #     self.krnds = np.random.randn(K.shape[0])*10**-6
    # K = K + np.eye(K.shape[0])*self.krnds
    L = np.linalg.cholesky(K)
    # Problems using scipy's cholesky on the cluster - bad scaling! Running
    # time becomes really bad with large N. Solution: update ATLAS.
    # L = la.cholesky(K)
    # a = np.linalg.solve(L.T, np.linalg.solve(L, y))
    a = la.cho_solve((L, True), y)
    nll = 0.5 * np.dot(y.T, a) + np.sum(np.log(np.diag(L))) \
        + 0.5 * n * np.log(2 * np.pi)
    ders = np.zeros(len(self.cf.get_params()))
    # W = np.linalg.solve(L.T, np.linalg.solve(L, np.eye(n))) - a*a.T
    W = la.cho_solve((L, True), np.eye(n)) - a * a.T
    for i in range(len(self.cf.get_params())):
        ders[i] = np.sum(W * self.cf.derivative(X, i)) / 2
    return nll[0, 0], ders
def predict_variance(self, X1, X2):
    if self.m is None:
        print("ERROR: Model has to be trained first.")
        return None

    KX1 = self.kernel.getCovMatrix(self.X, X1, "cross")
    # K^{-1} k(X, X2) via the Cholesky factor of the training covariance
    KiX2 = spla.cho_solve((self.m.posterior.L, True),
                          self.kernel.getCovMatrix(self.X, X2, "cross"))
    # Posterior covariance: k(X1, X2) - k(X, X1)^T K^{-1} k(X, X2)
    var = self.kernel.getCovMatrix(X1, X2, "cross") - np.dot(KX1.T, KiX2)
    return var
def E_step(self):
    M = np.dot(self.W.T, self.W) + np.eye(self.q) * self.sigma2
    # M_inv = np.linalg.inv(M)
    # self.m_Z = np.dot(M_inv, np.dot(self.W.T, self.X2.T)).T
    # self.S_z = M_inv*self.sigma2
    # lower=True so the factor matches the lower flag passed to cho_solve
    M_chol = linalg.cholesky(M, lower=True)
    M_inv = linalg.cho_solve((M_chol, True), np.eye(self.q))
    self.m_Z = linalg.cho_solve((M_chol, True), np.dot(self.W.T, self.X2.T)).T
    self.S_z = M_inv * self.sigma2
def remove_affine(p, q, q_factor=None, skip_factorization=False):
    """Removes an (unknown) affine transform between two matrixes.

    Given two arrays of the same size, `p` and `q`, finds a matrix `A` and
    column vector `t` such that `p = A * q + t` in the least-squares sense,
    and then computes `qnew = A * q + t`.

    (Notation: `matrix + vector` implies the vector is added to each column
    of the matrix.)

    NB: `p` and the returned `qnew` will be equal if and only if `p` is
    generated from `q` via an affine transform (no noise).

    Returns `(qnew, forward, inverse, Ahat, that)`. `forward(x)` applies the
    estimated transform `Ahat @ x + that`, and `inverse(t)` inverts it in the
    least-squares sense. `Ahat` and `that` are the estimated values of `A`
    and `t`.

    `q_factor`, if provided, is a Cholesky factorization of the Gram matrix
    of the augmented `q` (as returned by `scipy.linalg.cho_factor`) that can
    greatly speed up repeated calls to remove_affine *with the same `q`*;
    it won't change from call to call. If your `q` changes from call to
    call, ignore `q_factor` and pass in `skip_factorization=True` to avoid
    even calculating it.

    NB2: the default `q_factor=None` will trigger computation of the
    factorization unless `skip_factorization=True`. Non-`None` `q_factor`
    will be trusted: no checks will be performed to make sure the given
    `q_factor` is indeed generated by the `q` you pass in. (Example: for
    `q.shape` of (2, 22), the speedup from using `q_factor` is 1.4x with
    skip_factorization=False, and 1.3x the case with
    skip_factorization=True, on a 2009 Mac Book Pro.)

    Implements the algorithm described in H. Spath, "Fitting affine and
    orthogonal transformations between two sets of points" in *Mathematical
    Communications*, vol. 9 (2004), pp. 27--34. http://hrcak.srce.hr/file/1425
    """
    qaug = np.vstack([q, np.ones_like(q[0, :])])
    if q_factor is None:
        Q = np.dot(qaug, qaug.T)
        if skip_factorization:
            sol = la.lstsq(Q, np.dot(qaug, p.T))[0]
            q_factor = None
        else:
            q_factor = scila.cho_factor(Q)
            sol = scila.cho_solve(q_factor, np.dot(qaug, p.T))
    else:
        sol = scila.cho_solve(q_factor, np.dot(qaug, p.T))

    # sol.shape is (n+1, n), for n=p.shape[0]
    Ahat = sol[:-1, :].T  # top square matrix of sol, transposed
    that = sol[-1:, :].T  # bottom row vector of sol, transposed
    qnew = np.dot(Ahat, q) + that

    return (qnew,
            lambda x: Ahat @ x + that,
            lambda t: np.linalg.lstsq(Ahat, t - that, rcond=None)[0],
            Ahat, that)
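# Illustrative usage (editorial sketch, with made-up data): recover an
# affine map between two point sets and check the round trip.
import numpy as np

rng = np.random.default_rng(1)
q = rng.standard_normal((2, 22))
A_true = np.array([[1.0, 0.3], [-0.2, 0.8]])
t_true = np.array([[2.0], [-1.0]])
p = A_true @ q + t_true

qnew, forward, inverse, Ahat, that = remove_affine(p, q)
assert np.allclose(qnew, p)        # noise-free case: exact recovery
assert np.allclose(Ahat, A_true)
assert np.allclose(inverse(p), q)  # inverse undoes the estimated transform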
def multivariate_t_pdf(self, nu, cov_det, d, scaleT, centered, L):
    L *= scaleT
    linalg.cho_solve((L, True), centered, overwrite_b=True,
                     check_finite=False)
    inv = centered.T.dot(centered)  # (L^-1 b)^T (L^-1 b)

    # Log Multivariate T - PDF
    return gammaln((nu + d) / 2.) - \
        (gammaln(nu / 2.) + (d / 2.) * (log(nu) + log(pi)) +
         (0.5 * cov_det) + ((nu + d) / 2.) * log(1. + inv / nu))
def log_marginal_likelihood(self, theta=None, eval_gradient=False):
    if theta is None:
        if eval_gradient:
            raise ValueError(
                "Gradient can only be evaluated for theta!=None")
        return self.log_marginal_likelihood_value_

    kernel = self.kernel_.clone_with_theta(theta)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        if eval_gradient:
            K, K_gradient = kernel(self.X_train_, eval_gradient=True)
        else:
            K = kernel(self.X_train_)

    # check finite
    if np.isnan(K).any() or np.isinf(K).any():
        return (-np.inf, np.zeros_like(theta)) \
            if eval_gradient else -np.inf

    K[np.diag_indices_from(K)] += self.alpha
    try:
        L = cholesky(K, lower=True)
    except np.linalg.LinAlgError:
        return (-np.inf, np.zeros_like(theta)) \
            if eval_gradient else -np.inf

    # Support multi-dimensional output of self.y_train_
    y_train = self.y_train_
    if y_train.ndim == 1:
        y_train = y_train[:, np.newaxis]

    alpha = cho_solve((L, True), y_train)  # Line 3

    # Compute log-likelihood (compare line 7)
    log_likelihood_dims = -0.5 * np.einsum("ik,ik->k", y_train, alpha)
    log_likelihood_dims -= np.log(np.diag(L)).sum()
    log_likelihood_dims -= K.shape[0] / 2 * np.log(2 * np.pi)
    log_likelihood = log_likelihood_dims.sum(-1)  # sum over dimensions

    if eval_gradient:  # compare Equation 5.9 from GPML
        tmp = np.einsum("ik,jk->ijk", alpha, alpha)  # k: output-dimension
        tmp -= cho_solve((L, True), np.eye(K.shape[0]))[:, :, np.newaxis]
        # Compute "0.5 * trace(tmp.dot(K_gradient))" without
        # constructing the full matrix tmp.dot(K_gradient) since only
        # its diagonal is required
        log_likelihood_gradient_dims = \
            0.5 * np.einsum("ijl,ijk->kl", tmp, K_gradient)
        log_likelihood_gradient = log_likelihood_gradient_dims.sum(-1)

    if eval_gradient:
        return log_likelihood, log_likelihood_gradient
    else:
        return log_likelihood
def likelihood_prior(self, mu, Sigma, k, R_S_mu=None, log_det_Q=None,
                     R_S=None, switchprior=False):
    r"""
    Computes the prior that is

    \pi(\mu | \theta[k], \Sigma[k]) \pi(\Sigma | Q[k], \nu[k])
        = N(\mu; \theta[k], \Sigma[k]) IW(\Sigma; Q[k], \nu[k])

    If switchprior = True, special values of nu and Sigma_mu are used if
    the parameters nu_sw and Sigma_mu_sw are set respectively. This
    enables use of "relaxed" priors facilitating label switch. NB! This
    makes the kernel non-symmetric, hence it cannot be used in a
    stationary state.
    """
    if switchprior:
        try:
            nu = self.nu_sw
        except AttributeError:
            nu = self.prior[k]['sigma']['nu']
        try:
            Sigma_mu = self.Sigma_mu_sw
        except AttributeError:
            Sigma_mu = self.prior[k]['mu']['Sigma']
        Q = self.prior[k]['sigma']['Q'] * nu / self.prior[k]['sigma']['nu']
    else:
        nu = self.prior[k]['sigma']['nu']
        Sigma_mu = self.prior[k]['mu']['Sigma']
        Q = self.prior[k]['sigma']['Q']

    if np.isnan(mu[0]):
        return 0, None, None, None

    if R_S_mu is None:
        R_S_mu = sla.cho_factor(Sigma_mu, check_finite=False)
    log_det_Sigma_mu = 2 * np.sum(np.log(np.diag(R_S_mu[0])))

    if log_det_Q is None:
        R_Q = sla.cho_factor(Q, check_finite=False)
        log_det_Q = 2 * np.sum(np.log(np.diag(R_Q[0])))

    if R_S is None:
        R_S = sla.cho_factor(Sigma, check_finite=False)
    log_det_Sigma = 2 * np.sum(np.log(np.diag(R_S[0])))

    mu_theta = mu - self.prior[k]['mu']['theta'].reshape(self.d)

    # N(\mu; \theta[k], \Sigma[k])
    lik = -np.dot(mu_theta.T,
                  sla.cho_solve(R_S_mu, mu_theta, check_finite=False)) / 2
    lik = lik - 0.5 * (nu + self.d + 1.) * log_det_Sigma
    lik = lik + (nu * 0.5) * log_det_Q
    lik = lik - 0.5 * log_det_Sigma_mu
    lik = lik - self.ln_gamma_d(0.5 * nu) - 0.5 * np.log(2) * (nu * self.d)
    lik = lik - 0.5 * np.sum(np.diag(sla.cho_solve(R_S, Q)))

    return lik, R_S_mu, log_det_Q, R_S
def draw_new_wt_assgns(self, word, topic_id, new_doc=False, wvmodel=None):
    """
    Log of the probability density function for the Student-t distribution.

    Provides a PDF for a word (really a word-vector) in a given topic
    distribution.

    :param word: string of the word to find probability of word-topic assignment
    :param topic_id: Integer, a topic id to reference a topic distribution and its params
    :param new_doc: False (default), optional. True if predicting topics from unseen document/not currently training
    :param wvmodel: None by default. If predicting topics from an unseen document, requires a loaded word2vec model from GenSim
    :type wvmodel: gensim.models.word2vec.Word2Vec
    :return: log of PDF from t-distribution for a given word. Type: Float
    """

    if not new_doc:
        # Getting params for calculating PDF of T-Dist for a word
        cov_det = self.topic_params[topic_id]["Chol Det"]
        Nk = self.topic_params[topic_id]["Topic Count"]

        # Precalculating some terms (V_di - Mu)
        centered = np.copy(self.word_vecs[word]
                           - self.topic_params[topic_id]["Topic Mean"])

        if np.isnan(centered).any() or np.isinf(centered).any():
            print(centered)
            print(topic_id)
            print(Nk)
            print(word)
            print(self.word_vecs[word])
            print(self.topic_params[topic_id]["Topic Mean"])

        # (L^-1 b)^T (L^-1 b)
        linalg.cho_solve((self.topic_params[topic_id]["Lower Triangle"], True),
                         centered, overwrite_b=True, check_finite=True)
        LLcomp = centered.T.dot(centered)
        # SHOULD THIS BE centered.dot(inv_cov).dot(centered.T)????

        d = self.word_vec_size  # dimensionality of word vector
        nu = self.priors.nu + Nk - d + 1.

        # Log PDF of multivariate Student-t distribution
        log_prob = gammaln((nu + d) / 2.) - \
            (gammaln(nu / 2.) + d / 2. * (log(nu) + log(pi))
             + 0.5 * cov_det + ((nu + d) / 2.) * log((1. + LLcomp) / nu))

        return log_prob

    if new_doc:
        cov_det = self.topic_params[topic_id]["Chol Det"]
        Nk = self.topic_params[topic_id]["Topic Count"]
        centered = self.word_vecs[word] - self.topic_params[topic_id]["Topic Mean"]

        cholesky_solution = linalg.cho_solve(
            (self.topic_params[topic_id]["Lower Triangle"], True), centered)
        LLcomp = cholesky_solution.T.dot(cholesky_solution)
        # TODO: update to be like the branch above

        d = wvmodel.vector_size
        nu = self.priors.nu + Nk - d + 1.
        log_prob = gammaln((nu + d) / 2.) - \
            (gammaln(nu / 2.) + d / 2. * (log(nu) + log(pi))
             + 0.5 * np.log(cov_det) + ((nu + d) / 2.) * log((1. + LLcomp) / nu))

        return log_prob
def prediction(self, data=None):
    """
    Evaluates the posterior GP mean and covariance functions.

    This method computes the mean and covariance matrix of the posterior
    predictive distribution of the GP. The mean and covariance matrix are
    incorporated as attributes of the class and can be subsequently used
    to draw samples of the function values corresponding to the input
    values.

    If no data array is passed as argument, then the data attribute is
    used.

    :param np.array data: a `(N x 2)` or `(N x 3)` array of N data inputs:
        (data coordinate, data value, data error (optional)).

    :return: mean and covariance matrix of posterior predictive.
    """
    if data is None and self.data is None:
        raise TypeError('Data array cannot be None, unless you want your '
                        'predictions to look like your prior. In that '
                        'case, better use the `sample` method.')
    elif data is not None:
        if self.data is not None:
            print('Data given. Overriding previous data.')
        self.data = data

    # Compute covariance matrices
    cov_test_data, cov_data = self.computecovariances(self.data)
    self.covariance_test_data = cov_test_data
    self.covariance_data = cov_data

    # If errors are provided for data, add them to the covariance diagonal
    if self.data.shape[0] > 2:
        dataerror = np.diag(np.atleast_1d(self.data[2] ** 2))
    else:
        dataerror = np.diag(np.zeros_like(self.data[0]))

    # Use Cholesky decomposition on covariance of data inputs.
    factor, flag = cho_factor(self.covariance_data + dataerror)

    # Compute posterior mean (eq. 2.23 Rasmussen)
    a = cho_solve((factor, flag), self.data[1])
    self.predmean = np.dot(self.covariance_test_data, np.array(a))

    # Compute posterior covariance (eq. 2.24 Rasmussen)
    alpha = cho_solve((factor, flag), self.covariance_test_data.T)
    beta = np.dot(self.covariance_test_data, np.array(alpha))
    self.predcov = self.covariance - beta

    return self.predmean, self.predcov
def _alpha(self, L):
    """
    Covariance-derived term to construct expectations. See Rasmussen &
    Williams.

    Parameters
    ----------
    L : np.ndarray
        Cholesky triangular

    Returns
    -------
    np.ndarray (alpha)
    """
    return la.cho_solve((L.T, True),
                        la.cho_solve((L, True), np.transpose(self.data)))
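# A note on the cho_solve contract, with an editorial sketch (illustrative
# names only): scipy's cho_solve applies *both* triangular solves
# internally, so cho_solve((L, True), y) already returns K^{-1} y for
# K = L L^T, matching the usual two-step alpha = L^T \ (L \ y) written with
# solve_triangular.
import numpy as np
from scipy import linalg as la

rng = np.random.default_rng(2)
A = rng.standard_normal((4, 4))
K = A @ A.T + 4 * np.eye(4)      # symmetric positive definite
y = rng.standard_normal(4)
L = np.linalg.cholesky(K)

alpha1 = la.cho_solve((L, True), y)
alpha2 = la.solve_triangular(L.T, la.solve_triangular(L, y, lower=True),
                             lower=False)
assert np.allclose(alpha1, alpha2)   # both equal K^{-1} y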
def _GMM(self, theta_vec):
    """GMM objective function"""
    _blp, theta, delta, v, D, x2, nmkt, nsimind, nbrand = self.set_aliases()

    theta[self.ix_theta] = theta_vec
    theta_v = theta[:, 0]
    theta_D = theta[:, 1:]

    # adaptive etol
    if self.GMM_diff < 1e-6:
        etol = self.etol = 1e-13
    elif self.GMM_diff < 1e-3:
        etol = self.etol = 1e-12
    else:
        etol = self.etol = 1e-9

    if self.cython:
        _blp.cal_delta(delta, theta_v, theta_D, self.ln_s_jt, v, D, x2,
                       nmkt, nsimind, nbrand, etol, self.iter_limit)
    else:
        self.cal_delta(theta)

    if np.isnan(delta).sum():
        return 1e+10

    Z_x1 = self.Z_x1
    LW = self.LW

    # Z'delta
    Z_delta = self.Z.T.dot(delta)

    # \[ \theta_1 = (\tilde{X}'ZW^{-1}Z'\tilde{X})^{-1}\tilde{X}'ZW^{-1}Z'\delta \]
    theta1 = solve(Z_x1.T.dot(cho_solve(LW, Z_x1)),
                   Z_x1.T.dot(cho_solve(LW, Z_delta)))

    xi = self.xi = delta - self.x1.dot(theta1)

    # Z'xi
    Z_xi = self.Z.T.dot(xi)

    # \[ (\delta - \tilde{X}\theta_1)'ZW^{-1}Z'(\delta-\tilde{X}\theta_1) \]
    GMM = Z_xi.T.dot(cho_solve(LW, Z_xi))

    self.GMM_diff = abs(self.GMM_old - GMM)
    self.GMM_old = GMM

    print('GMM value: {}'.format(GMM))

    return GMM
def mark2loglikelihood(psr, Aw, Ar, Si):
    """
    Log-likelihood for our pulsar

    This likelihood does marginalize over the timing model. Calculate the
    covariance matrix in the time-domain with:

    ll = -0.5 * res^{T} (C^{-1} - C^{-1} M (M^{T} C^{-1} M)^{-1} M^{T} C^{-1}) res
         - 0.5 * log(det(C)) - 0.5 * log(det(M^{T} C^{-1} M))

    In relation to 'mark1loglikelihood', this likelihood has but a simple
    addition:

        res' = res - M xi

    where M is a (n x m) matrix, with m < n, and xi is a vector of length
    m. The xi are analytically marginalised over, yielding the above
    equation (up to constants).

    :param psr: pulsar object, containing the data and stuff
    :param Aw:  White noise amplitude, model parameter
    :param Ar:  Red noise amplitude, model parameter
    :param Si:  Spectral index of red noise, model parameter
    """
    Mmat = psr.Mmat
    Cov = Aw**2 * np.eye(len(psr.toas)) + \
        PL_covmat(psr.toas, Ar, alpha=0.5 * (3 - Si), fL=1.0 / (year * 20))

    cfC = sl.cho_factor(Cov)
    Cinv = sl.cho_solve(cfC, np.eye(len(psr.toas)))
    ldetC = 2 * np.sum(np.log(np.diag(cfC[0])))

    MCM = np.dot(Mmat.T, np.dot(Cinv, Mmat))
    cfM = sl.cho_factor(MCM)
    ldetM = 2 * np.sum(np.log(np.diag(cfM[0])))

    wr = np.dot(Cinv, psr.residuals)
    rCr = np.dot(psr.residuals, wr)
    MCr = np.dot(Mmat.T, wr)

    return (-0.5 * rCr
            + 0.5 * np.dot(MCr, sl.cho_solve(cfM, MCr))
            - 0.5 * ldetC - 0.5 * ldetM
            - 0.5 * len(psr.residuals) * np.log(2 * np.pi))
def solve(self, other):
    if other.ndim == 1:
        Nx = np.array(other / self.N)
    elif other.ndim == 2:
        Nx = np.array(other / self.N[:, None])

    UNx = np.dot(self.U.T, Nx)

    Sigma = np.diag(1 / self.J) + np.dot(self.U.T, self.U / self.N[:, None])
    cf = sl.cho_factor(Sigma)

    if UNx.ndim == 1:
        tmp = np.dot(self.U, sl.cho_solve(cf, UNx)) / self.N
    else:
        tmp = np.dot(self.U, sl.cho_solve(cf, UNx)) / self.N[:, None]

    return Nx - tmp
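# Editorial sketch (assumed attribute names): the solve above applies the
# Woodbury identity for K = diag(N) + U diag(J) U^T,
#   K^{-1} x = N^{-1} x - N^{-1} U (J^{-1} + U^T N^{-1} U)^{-1} U^T N^{-1} x,
# so only the small (len(J) x len(J)) system is factorized. A quick check
# against a dense solve, assuming the function above is in scope:
import numpy as np
import scipy.linalg as sl

class Woodbury:
    def __init__(self, N, U, J):
        self.N, self.U, self.J = N, U, J
    solve = solve   # reuse the method defined above

rng = np.random.default_rng(3)
N = rng.uniform(1, 2, size=50)          # diagonal of the large term
U = rng.standard_normal((50, 3))        # low-rank factor
J = rng.uniform(0.5, 1.5, size=3)       # small diagonal
x = rng.standard_normal(50)

K = np.diag(N) + U @ np.diag(J) @ U.T
wb = Woodbury(N, U, J)
assert np.allclose(wb.solve(x), np.linalg.solve(K, x))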
def testGetGradients(self):
    '''
    Compares the gradients computed as done originally in spearmint
    with our implementation.
    '''
    xstar = scale * npr.random((1, d))
    (mg, vg) = self.gp.getGradients(xstar[0])

    ##########################################################################
    # Spearmint Code
    # The code below is taken from GPEIOptChooser and adapted to the
    # variables here.
    cand_cross_grad = self.amp2 * self.cov_grad_func(self.ls, self.X, xstar)
    comp_cov = cov(self, self.X)
    cand_cross = cov(self, self.X, xstar)

    # Compute the required Cholesky.
    obsv_cov = comp_cov + self.noise * np.eye(self.X.shape[0])
    obsv_chol = spla.cholesky(obsv_cov, lower=True)

    # Predictive things.
    # Solve the linear systems.
    alpha = spla.cho_solve((obsv_chol, True), self.y - self.mean)

    # Apply covariance function
    grad_cross = np.squeeze(cand_cross_grad)
    grad_xp_m = np.dot(alpha.transpose(), grad_cross)
    grad_xp_v = -2 * np.dot(spla.cho_solve((obsv_chol, True),
                                           cand_cross).transpose(),
                            grad_cross)
    ##########################################################################
    # End of Spearmint Code
    # It seems the gradient of the spearmint code is set up for minimization
    # and therefore differs by sign; however, the gradient of our
    # implementation agrees with the first order approximation.
    grad_xp_m = -grad_xp_m
    grad_xp_v = -grad_xp_v
    assert spla.norm(mg - grad_xp_m) < 1e-50
    assert spla.norm(vg[0] - grad_xp_v[0]) < 1e-50

    # Test against first order approximation
    epsilon = 1e-6
    vg = np.array([vg])  # needs to be in the format [[d0,...,dn]]

    def get_variance(x):
        return self.gp.predict(x, True)[1]

    self.assert_first_order_gradient_approximation(get_variance, xstar,
                                                   vg, epsilon)

    mg = np.array([np.array([mg])])  # we need mg in the format [[d0, d1, ..., dn]]

    def get_mean(x):
        return np.array([self.gp.predict(x)])

    self.assert_first_order_gradient_approximation(self.gp.predict, xstar,
                                                   mg, epsilon)
def krr(descriptors, labels, training_size=1500, test_size=None, sigma=1000.0, opt=True, identifier=None, kernel='gaussian', use_tf=True, show_msgs=True): """ Basic krr methodology for a single descriptor type. descriptors: array of descriptors. labels: array of labels. training_size: size of the training set to use. test_size: size of the test set to use. If no size is given, the last remaining molecules are used. sigma: depth of the kernel. opt: if the optimized algorithm should be used. For benchmarking purposes. identifier: string with the name of the descriptor used. kernel: which kernel to use. use_tf: if tensorflow should be used. show_msgs: if debug messages should be shown. NOTE: identifier is just a string and is only for identification purposes. Also, training is done with the first part of the data and testing with the ending part of the data. """ tic = time.perf_counter() # Initial calculations for later use. data_size = descriptors.shape[0] if not identifier: identifier = 'NOT SPECIFIED' if not data_size == labels.shape[0]: raise ValueError('Labels size is different than descriptors size.') if training_size >= data_size: raise ValueError('Training size is greater or equal to the data size.') # If tf is to be used but couldn't be imported, don't try to use it. if use_tf and not TF_AV: use_tf = False # If test_size is not set, it is set to a maximum size of 1500. # Also, no overlapping with training data is achieved. if not test_size: test_size = data_size - training_size if test_size > 1500: test_size = 1500 if show_msgs: printc(f'{identifier} ML started.', 'GREEN') printc(f'\tTraining size: {training_size}', 'CYAN') printc(f'\tTest size: {test_size}', 'CYAN') printc(f'\tSigma: {sigma}', 'CYAN') printc(f'\tKernel: {kernel}', 'CYAN') printc(f'\tUse tf: {use_tf}', 'CYAN') if use_tf: if tf.config.experimental.list_physical_devices('GPU'): with tf.device('GPU:0'): X_tr = descriptors[:training_size] Y_tr = labels[:training_size] if kernel == 'gaussian': K_tr = gaussian_kernel(X_tr, X_tr, sigma, use_tf=use_tf) elif kernel == 'laplacian': K_tr = laplacian_kernel(X_tr, X_tr, sigma, use_tf=use_tf) elif kernel == 'wasserstein': K_tr = wasserstein_kernel(X_tr, X_tr, sigma, use_tf=use_tf) else: raise TypeError(f'{kernel} kernel not found.') # Adding a small value on the diagonal for cho_solve. dv = tf.linalg.tensor_diag(tf.constant(1e-8, shape=(training_size), dtype=tf.float64)) K_tr += dv Y_tr = tf.expand_dims(Y_tr, 1) alpha = tf.linalg.cholesky_solve(tf.linalg.cholesky(K_tr), Y_tr) X_te = descriptors[-test_size:] Y_te = labels[-test_size:] if kernel == 'gaussian': K_te = gaussian_kernel(X_te, X_tr, sigma, use_tf=use_tf) elif kernel == 'laplacian': K_te = laplacian_kernel(X_te, X_tr, sigma, use_tf=use_tf) elif kernel == 'wasserstein': K_te = wasserstein_kernel(X_te, X_tr, sigma, use_tf=use_tf) else: raise TypeError(f'{kernel} kernel not found.') Y_te = tf.expand_dims(Y_te, 1) Y_pr = tf.tensordot(K_te, alpha, 1) mae = tf.reduce_mean(tf.abs(Y_pr - Y_te)) else: raise TypeError('No GPU found, could not create Tensor objects.') else: X_tr = descriptors[:training_size] Y_tr = labels[:training_size] if kernel == 'gaussian': K_tr = gaussian_kernel(X_tr, X_tr, sigma, use_tf=use_tf) elif kernel == 'laplacian': K_tr = laplacian_kernel(X_tr, X_tr, sigma, use_tf=use_tf) elif kernel == 'wasserstein': K_tr = wasserstein_kernel(X_tr, X_tr, sigma, use_tf=use_tf) else: raise TypeError(f'{kernel} kernel not found.') # Adding a small value on the diagonal for cho_solve. 
K_tr[np.diag_indices_from(K_tr)] += 1e-8 alpha = LA.cho_solve(LA.cho_factor(K_tr), Y_tr) X_te = descriptors[-test_size:] Y_te = labels[-test_size:] if kernel == 'gaussian': K_te = gaussian_kernel(X_te, X_tr, sigma, use_tf=use_tf) elif kernel == 'laplacian': K_te = laplacian_kernel(X_te, X_tr, sigma, use_tf=use_tf) elif kernel == 'wasserstein': K_te = wasserstein_kernel(X_te, X_tr, sigma, use_tf=use_tf) else: raise TypeError(f'{kernel} kernel not found.') Y_pr = np.dot(K_te, alpha) mae = np.mean(np.abs(Y_pr - Y_te)) toc = time.perf_counter() tictoc = toc - tic if show_msgs: printc(f'\tMAE for {identifier}: {mae:.4f}', 'GREEN') printc(f'\t{identifier} ML took {tictoc:.4f} seconds.', 'GREEN') return mae, tictoc
def predict(self, X, return_std=False, return_cov=False,
            return_mean_grad=False, return_std_grad=False):
    """
    In addition to the mean of the predictive distribution, also its
    standard deviation (return_std=True) or covariance (return_cov=True),
    the gradient of the mean and the standard-deviation with respect to X
    can be optionally provided.

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_features)
        Query points where the GP is evaluated.

    return_std : bool, default: False
        If True, the standard-deviation of the predictive distribution at
        the query points is returned along with the mean.

    return_cov : bool, default: False
        If True, the covariance of the joint predictive distribution at
        the query points is returned along with the mean.

    return_mean_grad : bool, default: False
        Whether or not to return the gradient of the mean.
        Only valid when X is a single point.

    return_std_grad : bool, default: False
        Whether or not to return the gradient of the std.
        Only valid when X is a single point.

    Returns
    -------
    y_mean : array, shape = (n_samples, [n_output_dims])
        Mean of predictive distribution at query points.

    y_std : array, shape = (n_samples,), optional
        Standard deviation of predictive distribution at query points.
        Only returned when return_std is True.

    y_cov : array, shape = (n_samples, n_samples), optional
        Covariance of joint predictive distribution at query points.
        Only returned when return_cov is True.

    y_mean_grad : shape = (n_samples, n_features)
        The gradient of the predicted mean.

    y_std_grad : shape = (n_samples, n_features)
        The gradient of the predicted std.
    """
    if return_std and return_cov:
        raise RuntimeError(
            "Not returning standard deviation of predictions when "
            "returning full covariance.")

    if return_std_grad and not return_std:
        raise ValueError("Not returning std_gradient without returning "
                         "the std.")

    X = check_array(X)
    if X.shape[0] != 1 and (return_mean_grad or return_std_grad):
        raise ValueError("Not implemented for n_samples > 1")

    if not hasattr(self, "X_train_"):  # Unfitted; predict based on GP prior
        y_mean = np.zeros(X.shape[0])
        if return_cov:
            y_cov = self.kernel(X)
            return y_mean, y_cov
        elif return_std:
            y_var = self.kernel.diag(X)
            return y_mean, np.sqrt(y_var)
        else:
            return y_mean

    else:  # Predict based on GP posterior
        K_trans = self.kernel_(X, self.X_train_)
        y_mean = K_trans.dot(self.alpha_)  # Line 4 (y_mean = f_star)
        y_mean = self.y_train_mean + y_mean  # undo normalisation

        if return_cov:
            v = cho_solve((self.L_, True), K_trans.T)  # Line 5
            y_cov = self.kernel_(X) - K_trans.dot(v)   # Line 6
            return y_mean, y_cov

        elif return_std:
            # compute inverse K_inv of K based on its Cholesky
            # decomposition L and its inverse L_inv
            L_inv = solve_triangular(self.L_.T, np.eye(self.L_.shape[0]))
            K_inv = L_inv.dot(L_inv.T)

            # Compute variance of predictive distribution
            y_var = self.kernel_.diag(X)
            y_var -= np.einsum("ki,kj,ij->k", K_trans, K_trans, K_inv)

            # Check if any of the variances is negative because of
            # numerical issues. If yes: set the variance to 0.
            y_var_negative = y_var < 0
            if np.any(y_var_negative):
                warnings.warn("Predicted variances smaller than 0. "
                              "Setting those variances to 0.")
                y_var[y_var_negative] = 0.0
            y_std = np.sqrt(y_var)

        if return_mean_grad:
            grad = self.kernel_.gradient_x(X[0], self.X_train_)
            grad_mean = np.dot(grad.T, self.alpha_)

            if return_std_grad:
                # XXX: Cache np.dot(K_trans, K_inv) from above
                grad_std = np.zeros(X.shape[1])
                if not np.allclose(y_std, grad_std):
                    grad_std = -np.dot(K_trans,
                                       np.dot(K_inv, grad))[0] / y_std
                return y_mean, y_std, grad_mean, grad_std

            if return_std:
                return y_mean, y_std, grad_mean
            else:
                return y_mean, grad_mean

        else:
            if return_std:
                return y_mean, y_std
            else:
                return y_mean
def test_cho_solve(self):
    x = array([[2, -1, 0],
               [-1, 2, -1],
               [0, -1, 2]])
    xcho = cho_factor(x)
    assert_no_overwrite(lambda b: cho_solve(xcho, b), [(3,)])
def get_d(hessian, grad: np.ndarray):
    # Explicit inverse of the Hessian via its Cholesky factorization
    df2_i = cho_solve(cho_factor(hessian), np.eye(len(hessian)))
    d = np.matmul(grad, df2_i)
    return d
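# Editorial note with a sketch: for a symmetric Hessian, the same direction
# can be obtained without forming the explicit inverse, by solving H d = g
# directly from the factor (grad @ H^{-1} equals (H^{-1} @ grad) when H is
# symmetric). This is cheaper and numerically preferable.
import numpy as np
from scipy.linalg import cho_factor, cho_solve

def get_d_direct(hessian, grad: np.ndarray):
    # Solve H d = g instead of computing H^{-1} and multiplying.
    return cho_solve(cho_factor(hessian), grad)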
sigma_values = np.logspace(min_sigma, max_sigma, num_parameters)
lam = 1.0
lam_values = np.logspace(-7, 2, num_parameters)

# construct kernel matrices
K_train = rbf_kernel(X=x_train, gamma=mean_sigma)
K_test = rbf_kernel(X=x_train, Y=x_test, gamma=mean_sigma)

# slow method: solve problem
t0 = time()
alpha = scio.linalg.solve(K_train + lam * np.eye(x_train.shape[0]), y_train)
t1 = time() - t0
print('Time taken for solve: {}'.format(t1))

# fast method: cholesky decomposition manually
t0 = time()
R = cholesky(K_train + lam * np.eye(x_train.shape[0]))
alpha = scio.linalg.solve(R, scio.linalg.solve(R.T, y_train))
t1 = time() - t0
print('Time taken for cholesky manually: {}'.format(t1))

# fast method: cholesky decomposition with functions
t0 = time()
R, lower = cho_factor(K_train + lam * np.eye(x_train.shape[0]))
alpha = cho_solve((R, lower), y_train)
t1 = time() - t0
print('\nTime taken for cholesky with functions: {:.4f} secs\n'.format(t1))

# project the data
y_pred = (K_test.T @ alpha).squeeze()
def _update_full_coef(self, cho, x_transpose_y, coef_sparse):
    """Update the unregularized weight vector"""
    b = x_transpose_y + coef_sparse / self.nu
    coef_full = cho_solve(cho, b)
    self.iters += 1
    return coef_full
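# Context sketch (editorial, illustrative names; assumes an SR3-style
# relaxed regression in which the cached factor `cho` corresponds to the
# regularized normal equations). The update above then solves
# (X^T X + I/nu) w = X^T y + w_sparse/nu.
import numpy as np
from scipy.linalg import cho_factor, cho_solve

rng = np.random.default_rng(5)
X = rng.standard_normal((30, 4))
y = X @ np.array([1.0, 0.0, -2.0, 0.0]) + 0.01 * rng.standard_normal(30)
nu = 1.0
cho = cho_factor(X.T @ X + np.eye(4) / nu)   # factored once, reused per step
coef_sparse = np.zeros(4)
w_full = cho_solve(cho, X.T @ y + coef_sparse / nu)  # mirrors the update above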
def get_d(x, grad, oracle):
    upper_triangle, _ = cho_factor(oracle.hess(x), lower=False,
                                   overwrite_a=True, check_finite=True)
    direction = cho_solve((upper_triangle, False), -grad,
                          overwrite_b=False, check_finite=True)
    return direction
def log_marginal_likeli(self, theta=None, eval_gradient=False,
                        clone_kernel=True):
    """Returns log-marginal likelihood of theta for training data.

    Parameters
    ----------
    theta : array-like of shape (n_kernel_params,), default=None
        Kernel hyperparameters for which the log-marginal likelihood is
        evaluated. If None, the precomputed log_marginal_likelihood
        of ``self.kernel_.theta`` is returned.

    eval_gradient : bool, default=False
        If True, the gradient of the log-marginal likelihood with respect
        to the kernel hyperparameters at position theta is returned
        additionally. If True, theta must not be None.

    clone_kernel : bool, default=True
        If True, the kernel attribute is copied. If False, the kernel
        attribute is modified, but may result in a performance
        improvement.

    Returns
    -------
    log_likelihood : float
        Log-marginal likelihood of theta for training data.

    log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional
        Gradient of the log-marginal likelihood with respect to the kernel
        hyperparameters at position theta.
        Only returned when eval_gradient is True.
    """
    if theta is None:
        if eval_gradient:
            raise ValueError(
                "Gradient can only be evaluated for theta!=None")
        return self.log_marginal_likelihood_value_

    if clone_kernel:
        kernel = self.kernel_.clone_with_theta(theta)
    else:
        kernel = self.kernel_
        kernel.theta = theta

    if eval_gradient:
        K, K_gradient = kernel(self.X_train_, eval_gradient=True)
    else:
        K = kernel(self.X_train_)

    K[np.diag_indices_from(K)] += self.alpha
    try:
        L = cholesky(K, lower=True)  # Line 2
    except np.linalg.LinAlgError:
        return (-np.inf, np.zeros_like(theta)) \
            if eval_gradient else -np.inf

    # Support multi-dimensional output of self.y_train_
    y_train = self.y_train_
    if y_train.ndim == 1:
        y_train = y_train[:, np.newaxis]

    alpha = cho_solve((L, True), y_train)  # Line 3

    # Compute log-likelihood (compare line 7)
    log_likelihood_dims = -0.5 * np.einsum("ik,ik->k", y_train, alpha)
    log_likelihood_dims -= np.log(np.diag(L)).sum()
    log_likelihood_dims -= K.shape[0] / 2 * np.log(2 * np.pi)
    log_likelihood = log_likelihood_dims.sum(-1)  # sum over dimensions

    # Precompute quantities required for predictions which are independent
    # of actual query points
    K = self.kernel_(self.X_train_)
    K[np.diag_indices_from(K)] += self.alpha
    try:
        self.L_ = cholesky(K, lower=True)  # Line 2
        # self.L_ changed, self._K_inv needs to be recomputed
        self._K_inv = None
    except np.linalg.LinAlgError as exc:
        exc.args = ("The kernel, %s, is not returning a "
                    "positive definite matrix. Try gradually "
                    "increasing the 'alpha' parameter of your "
                    "GaussianProcessRegressor estimator."
                    % self.kernel_,) + exc.args
        raise
    self.alpha_ = cho_solve((self.L_, True), self.y_train_)  # Line 3

    # Physics-informed penalty on unlabeled points
    pred1 = self.predict(x_unlabeled)
    pred1 = scaler_y.inverse_transform(pred1)
    phyloss = np.mean(poros(init_poro, pred1))
    log_likelihood -= 500000 * phyloss
    # print(log_likelihood)

    if eval_gradient:  # compare Equation 5.9 from GPML
        tmp = np.einsum("ik,jk->ijk", alpha, alpha)  # k: output-dimension
        tmp -= cho_solve((L, True), np.eye(K.shape[0]))[:, :, np.newaxis]
        # Compute "0.5 * trace(tmp.dot(K_gradient))" without
        # constructing the full matrix tmp.dot(K_gradient) since only
        # its diagonal is required
        log_likelihood_gradient_dims = \
            0.5 * np.einsum("ijl,jik->kl", tmp, K_gradient)
        log_likelihood_gradient = log_likelihood_gradient_dims.sum(-1)

    if eval_gradient:
        return log_likelihood, log_likelihood_gradient
    else:
        return log_likelihood
def log_marginal_likelihood(self, theta=None, eval_gradient=False,
                            clone_kernel=True):
    """Returns log-marginal likelihood of theta for training data.

    Parameters
    ----------
    theta : array-like of shape (n_kernel_params,), default=None
        Kernel hyperparameters for which the log-marginal likelihood is
        evaluated. If None, the precomputed log_marginal_likelihood
        of ``self.kernel_.theta`` is returned.

    eval_gradient : bool, default=False
        If True, the gradient of the log-marginal likelihood with respect
        to the kernel hyperparameters at position theta is returned
        additionally. If True, theta must not be None.

    clone_kernel : bool, default=True
        If True, the kernel attribute is copied. If False, the kernel
        attribute is modified, but may result in a performance
        improvement.

    Returns
    -------
    log_likelihood : float
        Log-marginal likelihood of theta for training data.

    log_likelihood_gradient : ndarray of shape (n_kernel_params,), \
            optional
        Gradient of the log-marginal likelihood with respect to the kernel
        hyperparameters at position theta.
        Only returned when `eval_gradient` is True.
    """
    if theta is None:
        if eval_gradient:
            raise ValueError(
                "Gradient can only be evaluated for theta!=None")
        return self.log_marginal_likelihood_value_

    if clone_kernel:
        kernel = self.kernel_.clone_with_theta(theta)
    else:
        kernel = self.kernel_
        kernel.theta = theta

    if eval_gradient:
        K, K_gradient = kernel(self.X_train_, eval_gradient=True)
    else:
        K = kernel(self.X_train_)

    # Compute log-marginal-likelihood Z and also store some temporaries
    # which can be reused for computing Z's gradient
    Z, (pi, W_sr, L, b, a) = \
        self._posterior_mode(K, return_temporaries=True)

    if not eval_gradient:
        return Z

    # Compute gradient based on Algorithm 5.1 of GPML
    d_Z = np.empty(theta.shape[0])
    # XXX: Get rid of the np.diag() in the next line
    R = W_sr[:, np.newaxis] * cho_solve((L, True), np.diag(W_sr))  # Line 7
    C = solve(L, W_sr[:, np.newaxis] * K)  # Line 8
    # Line 9: (use einsum to compute np.diag(C.T.dot(C)))
    s_2 = -0.5 * (np.diag(K) - np.einsum('ij, ij -> j', C, C)) \
        * (pi * (1 - pi) * (1 - 2 * pi))  # third derivative

    for j in range(d_Z.shape[0]):
        C = K_gradient[:, :, j]  # Line 11
        # Line 12: (R.T.ravel().dot(C.ravel()) = np.trace(R.dot(C)))
        s_1 = .5 * a.T.dot(C).dot(a) - .5 * R.T.ravel().dot(C.ravel())

        b = C.dot(self.y_train_ - pi)  # Line 13
        s_3 = b - K.dot(R.dot(b))  # Line 14

        d_Z[j] = s_1 + s_2.T.dot(s_3)  # Line 15

    return Z, d_Z
def fit(self, X, y):
    """Fit Gaussian process regression model.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features) or list of object
        Feature vectors or other representations of training data.

    y : array-like of shape (n_samples,) or (n_samples, n_targets)
        Target values

    Returns
    -------
    self : returns an instance of self.
    """
    if self.kernel is None:  # Use an RBF kernel as default
        self.kernel_ = C(1.0, constant_value_bounds="fixed") \
            * RBF(1.0, length_scale_bounds="fixed")
    else:
        self.kernel_ = clone(self.kernel)

    self._rng = check_random_state(self.random_state)

    if self.kernel_.requires_vector_input:
        X, y = self._validate_data(X, y, multi_output=True, y_numeric=True,
                                   ensure_2d=True, dtype="numeric")
    else:
        X, y = self._validate_data(X, y, multi_output=True, y_numeric=True,
                                   ensure_2d=False, dtype=None)

    # Normalize target value
    if self.normalize_y:
        self._y_train_mean = np.mean(y, axis=0)
        self._y_train_std = np.std(y, axis=0)

        # Remove mean and make unit variance
        y = (y - self._y_train_mean) / self._y_train_std

    else:
        self._y_train_mean = np.zeros(1)
        self._y_train_std = 1

    if np.iterable(self.alpha) \
       and self.alpha.shape[0] != y.shape[0]:
        if self.alpha.shape[0] == 1:
            self.alpha = self.alpha[0]
        else:
            raise ValueError(
                "alpha must be a scalar or an array"
                " with same number of entries as y.(%d != %d)"
                % (self.alpha.shape[0], y.shape[0]))

    self.X_train_ = np.copy(X) if self.copy_X_train else X
    self.y_train_ = np.copy(y) if self.copy_X_train else y

    if self.optimizer is not None and self.kernel_.n_dims > 0:
        # Choose hyperparameters based on maximizing the log-marginal
        # likelihood (potentially starting from several initial values)
        def obj_func(theta, eval_gradient=True):
            if eval_gradient:
                lml, grad = self.log_marginal_likelihood(
                    theta, eval_gradient=True, clone_kernel=False)
                return -lml, -grad
            else:
                return -self.log_marginal_likelihood(theta,
                                                     clone_kernel=False)

        # First optimize starting from theta specified in kernel
        optima = [(self._constrained_optimization(obj_func,
                                                  self.kernel_.theta,
                                                  self.kernel_.bounds))]

        # Additional runs are performed from log-uniform chosen initial
        # theta
        if self.n_restarts_optimizer > 0:
            if not np.isfinite(self.kernel_.bounds).all():
                raise ValueError(
                    "Multiple optimizer restarts (n_restarts_optimizer>0) "
                    "requires that all bounds are finite.")
            bounds = self.kernel_.bounds
            for iteration in range(self.n_restarts_optimizer):
                theta_initial = \
                    self._rng.uniform(bounds[:, 0], bounds[:, 1])
                optima.append(
                    self._constrained_optimization(obj_func, theta_initial,
                                                   bounds))
        # Select result from run with minimal (negative) log-marginal
        # likelihood
        lml_values = list(map(itemgetter(1), optima))
        self.kernel_.theta = optima[np.argmin(lml_values)][0]
        self.kernel_._check_bounds_params()

        self.log_marginal_likelihood_value_ = -np.min(lml_values)
    else:
        self.log_marginal_likelihood_value_ = \
            self.log_marginal_likelihood(self.kernel_.theta,
                                         clone_kernel=False)

    # Precompute quantities required for predictions which are independent
    # of actual query points
    K = self.kernel_(self.X_train_)
    K[np.diag_indices_from(K)] += self.alpha
    try:
        self.L_ = cholesky(K, lower=True)  # Line 2
        # self.L_ changed, self._K_inv needs to be recomputed
        self._K_inv = None
    except np.linalg.LinAlgError as exc:
        exc.args = ("The kernel, %s, is not returning a "
                    "positive definite matrix. Try gradually "
                    "increasing the 'alpha' parameter of your "
                    "GaussianProcessRegressor estimator."
                    % self.kernel_,) + exc.args
        raise
    self.alpha_ = cho_solve((self.L_, True), self.y_train_)  # Line 3
    return self
def predict(self, xnew, merrorsnew=False, derivs=0, addnoise=False):
    """
    Determines the predicted mean latent function (.f) and its variance
    (.fvar) and potentially the predicted mean first derivative (.df) and
    its variance (.dfvar) and the predicted mean second derivative (.ddf)
    and its variance (.ddfvar). Also .mnp is the predicted combined array
    of the mean latent function and its mean derivatives and .covp is the
    corresponding covariance matrix.

    Arguments
    --
    xnew: abscissa values for which predicted ordinate values are desired
    merrorsnew: if specified, the expected measurement errors at xnew
        (need not be specified if xnew = x)
    derivs: if 0, only the latent function is inferred; if 1, the latent
        function and the first derivative are inferred; if 2, the latent
        function and the first and second derivatives are inferred
    addnoise: if True, add measurement noise to the predicted variance
    """
    if len(self.x) == len(xnew) and (self.x == xnew).all():
        xold = True
    else:
        xold = False
    if np.any(self.merrors) and not np.any(merrorsnew) and not xold:
        print('Length of xnew is different from x.')
        raise gaussianprocessException(
            'Measurement errors were used to find the hyperparameters and'
            ' measurement errors are therefore required for any'
            ' predictions.')
    elif not hasattr(self, 'lth_opt'):
        raise gaussianprocessException(
            'Run gp.findhyperparameters() first before making predictions.')
    else:
        # set up
        self.xnew = xnew
        lth, x, y = self.lth_opt, self.x, self.y
        # work with an array of length 3*N: the first N values being the
        # function, the second N values being the first derivative, and
        # the last N values being the second derivative
        Knewold = np.empty((len(xnew), len(x)))
        Knewnew = np.empty((len(xnew), len(xnew)))
        if derivs > 0:
            d1Knewold = np.empty((len(xnew), len(x)))
            d1Knewnew = np.empty((len(xnew), len(xnew)))
            d1d2Knewnew = np.empty((len(xnew), len(xnew)))
        if derivs > 1:
            d12Knewold = np.empty((len(xnew), len(x)))
            d12Knewnew = np.empty((len(xnew), len(xnew)))
            d12d2Knewnew = np.empty((len(xnew), len(xnew)))
            d12d22Knewnew = np.empty((len(xnew), len(xnew)))
        for i in range(len(xnew)):
            Knewold[i, :] = self.covfn(xnew[i], x, lth)[0]
            Knewnew[i, :] = self.covfn(xnew[i], xnew, lth)[0]
            if derivs > 0:
                d1Knewold[i, :] = self.d1covfn(xnew[i], x, lth)[0]
                d1Knewnew[i, :] = self.d1covfn(xnew[i], xnew, lth)[0]
                d1d2Knewnew[i, :] = self.d1d2covfn(xnew[i], xnew, lth)[0]
            if derivs > 1:
                d12Knewold[i, :] = self.d12covfn(xnew[i], x, lth)[0]
                d12Knewnew[i, :] = self.d12covfn(xnew[i], xnew, lth)[0]
                d12d2Knewnew[i, :] = self.d12d2covfn(xnew[i], xnew, lth)[0]
                d12d22Knewnew[i, :] = self.d12d22covfn(xnew[i], xnew, lth)[0]
        if derivs == 0:
            kv = Knewold
            km = Knewnew
        elif derivs == 1:
            kv = np.bmat([[Knewold], [d1Knewold]])
            km = np.bmat([[Knewnew, np.transpose(d1Knewnew)],
                          [d1Knewnew, d1d2Knewnew]])
        elif derivs == 2:
            kv = np.bmat([[Knewold], [d1Knewold], [d12Knewold]])
            km = np.bmat([[Knewnew, np.transpose(d1Knewnew),
                           np.transpose(d12Knewnew)],
                          [d1Knewnew, d1d2Knewnew,
                           np.transpose(d12d2Knewnew)],
                          [d12Knewnew, d12d2Knewnew, d12d22Knewnew]])
        # find mean prediction
        k, L = self.kernelmatrix(lth, x)
        m = np.dot(kv, linalg.cho_solve(L, y))
        mnp = np.reshape(np.array(m), np.size(m))
        self.mnp = mnp
        # find variance of prediction
        covp = km - np.dot(kv, linalg.cho_solve(L, np.transpose(kv)))
        self.covp = covp
        varp = np.diag(covp)
        # for user
        self.f = mnp[:len(xnew)]
        self.fvar = varp[:len(xnew)]
        fvar = varp[:len(xnew)]
        if addnoise:
            # add measurement error to the variance of the latent function
            if np.any(self.merrors):
                if xold:
                    self.fvar = fvar + np.exp(lth[-1]) * np.diag(self.merrors)
                else:
                    self.fvar = fvar + merrorsnew
            else:
                self.fvar = fvar + np.exp(lth[-1]) * np.identity(len(xnew))
        else:
            # just take the variance of the latent function
            self.fvar = fvar
        if derivs > 0:
            self.df = mnp[len(xnew):2 * len(xnew)]
            self.dfvar = varp[len(xnew):2 * len(xnew)]
        if derivs > 1:
            self.ddf = mnp[2 * len(xnew):]
            self.ddfvar = varp[2 * len(xnew):]
def predict(self, X, return_std=False, return_cov=False):
    """Predict using the Gaussian process regression model

    We can also predict based on an unfitted model by using the GP prior.
    In addition to the mean of the predictive distribution, also its
    standard deviation (return_std=True) or covariance (return_cov=True).
    Note that at most one of the two can be requested.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features) or list of object
        Query points where the GP is evaluated.

    return_std : bool, default=False
        If True, the standard-deviation of the predictive distribution at
        the query points is returned along with the mean.

    return_cov : bool, default=False
        If True, the covariance of the joint predictive distribution at
        the query points is returned along with the mean.

    Returns
    -------
    y_mean : ndarray of shape (n_samples, [n_output_dims])
        Mean of predictive distribution at query points.

    y_std : ndarray of shape (n_samples,), optional
        Standard deviation of predictive distribution at query points.
        Only returned when `return_std` is True.

    y_cov : ndarray of shape (n_samples, n_samples), optional
        Covariance of joint predictive distribution at query points.
        Only returned when `return_cov` is True.
    """
    if return_std and return_cov:
        raise RuntimeError(
            "Not returning standard deviation of predictions when "
            "returning full covariance.")

    if self.kernel is None or self.kernel.requires_vector_input:
        X = self._validate_data(X, ensure_2d=True, dtype="numeric",
                                reset=False)
    else:
        X = self._validate_data(X, ensure_2d=False, dtype=None,
                                reset=False)

    if not hasattr(self, "X_train_"):  # Unfitted; predict based on GP prior
        if self.kernel is None:
            kernel = (C(1.0, constant_value_bounds="fixed") *
                      RBF(1.0, length_scale_bounds="fixed"))
        else:
            kernel = self.kernel
        y_mean = np.zeros(X.shape[0])
        if return_cov:
            y_cov = kernel(X)
            return y_mean, y_cov
        elif return_std:
            y_var = kernel.diag(X)
            return y_mean, np.sqrt(y_var)
        else:
            return y_mean
    else:  # Predict based on GP posterior
        K_trans = self.kernel_(X, self.X_train_)
        y_mean = K_trans.dot(self.alpha_)  # Line 4 (y_mean = f_star)

        # undo normalisation
        y_mean = self._y_train_std * y_mean + self._y_train_mean

        if return_cov:
            v = cho_solve((self.L_, True), K_trans.T)  # Line 5
            y_cov = self.kernel_(X) - K_trans.dot(v)   # Line 6

            # undo normalisation
            y_cov = y_cov * self._y_train_std**2

            return y_mean, y_cov
        elif return_std:
            # cache result of K_inv computation
            if self._K_inv is None:
                # compute inverse K_inv of K based on its Cholesky
                # decomposition L and its inverse L_inv
                L_inv = solve_triangular(self.L_.T,
                                         np.eye(self.L_.shape[0]))
                self._K_inv = L_inv.dot(L_inv.T)

            # Compute variance of predictive distribution
            y_var = self.kernel_.diag(X)
            y_var -= np.einsum("ij,ij->i",
                               np.dot(K_trans, self._K_inv), K_trans)

            # Check if any of the variances is negative because of
            # numerical issues. If yes: set the variance to 0.
            y_var_negative = y_var < 0
            if np.any(y_var_negative):
                warnings.warn("Predicted variances smaller than 0. "
                              "Setting those variances to 0.")
                y_var[y_var_negative] = 0.0

            # undo normalisation
            y_var = y_var * self._y_train_std**2

            return y_mean, np.sqrt(y_var)
        else:
            return y_mean
def log_marginal_likelihood(self, theta=None, eval_gradient=False,
                            clone_kernel=True):
    """Returns log-marginal likelihood of theta for training data.

    Parameters
    ----------
    theta : array-like of shape (n_kernel_params,), default=None
        Kernel hyperparameters for which the log-marginal likelihood is
        evaluated. If None, the precomputed log_marginal_likelihood
        of ``self.kernel_.theta`` is returned.

    eval_gradient : bool, default=False
        If True, the gradient of the log-marginal likelihood with respect
        to the kernel hyperparameters at position theta is returned
        additionally. If True, theta must not be None.

    clone_kernel : bool, default=True
        If True, the kernel attribute is copied. If False, the kernel
        attribute is modified, but may result in a performance
        improvement.

    Returns
    -------
    log_likelihood : float
        Log-marginal likelihood of theta for training data.

    log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional
        Gradient of the log-marginal likelihood with respect to the kernel
        hyperparameters at position theta.
        Only returned when eval_gradient is True.
    """
    if theta is None:
        if eval_gradient:
            raise ValueError(
                "Gradient can only be evaluated for theta!=None")
        return self.log_marginal_likelihood_value_

    if clone_kernel:
        kernel = self.kernel_.clone_with_theta(theta)
    else:
        kernel = self.kernel_
        kernel.theta = theta

    if eval_gradient:
        K, K_gradient = kernel(self.X_train_, eval_gradient=True)
    else:
        K = kernel(self.X_train_)

    K[np.diag_indices_from(K)] += self.alpha
    try:
        L = cholesky(K, lower=True)  # Line 2
    except np.linalg.LinAlgError:
        return (-np.inf, np.zeros_like(theta)) \
            if eval_gradient else -np.inf

    # Support multi-dimensional output of self.y_train_
    y_train = self.y_train_
    if y_train.ndim == 1:
        y_train = y_train[:, np.newaxis]

    alpha = cho_solve((L, True), y_train)  # Line 3

    # Compute log-likelihood (compare line 7)
    log_likelihood_dims = -0.5 * np.einsum("ik,ik->k", y_train, alpha)
    log_likelihood_dims -= np.log(np.diag(L)).sum()
    log_likelihood_dims -= K.shape[0] / 2 * np.log(2 * np.pi)
    log_likelihood = log_likelihood_dims.sum(-1)  # sum over dimensions

    if eval_gradient:  # compare Equation 5.9 from GPML
        tmp = np.einsum("ik,jk->ijk", alpha, alpha)  # k: output-dimension
        tmp -= cho_solve((L, True), np.eye(K.shape[0]))[:, :, np.newaxis]
        # Compute "0.5 * trace(tmp.dot(K_gradient))" without
        # constructing the full matrix tmp.dot(K_gradient) since only
        # its diagonal is required
        log_likelihood_gradient_dims = \
            0.5 * np.einsum("ijl,jik->kl", tmp, K_gradient)
        log_likelihood_gradient = log_likelihood_gradient_dims.sum(-1)

    if eval_gradient:
        return log_likelihood, log_likelihood_gradient
    else:
        return log_likelihood
def _update_cache(self):
    """ Update cache """
    cov_params_have_changed = (self.Cr.params_have_changed or
                               self.Cg.params_have_changed or
                               self.Cn.params_have_changed)

    if self.XX_has_changed:
        start = TIME.time()
        """ Row SVD Bg + Noise """
        self.cache['Srstar'], Urstar = LA.eigh(self.XX)
        self.cache['Lr'] = Urstar.T
        self.mean.setRowRotation(Lr=self.cache['Lr'])

        smartSum(self.time, 'cache_XXchanged', TIME.time() - start)
        smartSum(self.count, 'cache_XXchanged', 1)

    if self.Xr_has_changed or self.XX_has_changed:
        start = TIME.time()
        """ rotate Xr and XrXr """
        self.cache['LXr'] = SP.dot(self.cache['Lr'], self.Xr)

        smartSum(self.time, 'cache_Xrchanged', TIME.time() - start)
        smartSum(self.count, 'cache_Xrchanged', 1)

    if cov_params_have_changed:
        start = TIME.time()
        """ Col SVD Bg + Noise """
        S2, U2 = LA.eigh(self.Cn.K() + self.offset * SP.eye(self.P))
        self.cache['Sc2'] = S2
        US2 = SP.dot(U2, SP.diag(SP.sqrt(S2)))
        USi2 = SP.dot(U2, SP.diag(SP.sqrt(1. / S2)))
        Cstar = SP.dot(USi2.T, SP.dot(self.Cg.K(), USi2))
        self.cache['Scstar'], Ucstar = LA.eigh(Cstar)
        self.cache['Lc'] = SP.dot(Ucstar.T, USi2.T)

        """ pheno """
        self.mean.setColRotation(self.cache['Lc'])

        """ region part """
        self.cache['A'] = SP.reshape(self.Cr.getParams(),
                                     (self.P, self.rank), order='F')
        self.cache['LAc'] = SP.dot(self.cache['Lc'], self.cache['A'])

    if cov_params_have_changed or self.XX_has_changed:
        """ S """
        self.cache['s'] = SP.kron(self.cache['Scstar'],
                                  self.cache['Srstar']) + 1
        self.cache['d'] = 1. / self.cache['s']
        self.cache['D'] = SP.reshape(self.cache['d'], (self.N, self.P),
                                     order='F')

        """ pheno """
        self.cache['LY'] = self.mean.evaluate()
        self.cache['DLY'] = self.cache['D'] * self.cache['LY']

        smartSum(self.time, 'cache_colSVDpRot', TIME.time() - start)
        smartSum(self.count, 'cache_colSVDpRot', 1)

    if cov_params_have_changed or self.XX_has_changed or self.Xr_has_changed:
        """ calculate B = I + kron(LcA,LrXr).T*D*kron(LcA,LrXr) """
        start = TIME.time()
        W = SP.kron(self.cache['LAc'], self.cache['LXr'])
        self.cache['DW'] = W * self.cache['d'][:, SP.newaxis]
        self.cache['DWt'] = self.cache['DW'].reshape(
            (self.N, self.P, self.rank * self.S), order='F')
        #B = NP.einsum('ijk,jl->ilk',self.cache['DWt'],self.cache['LAc'])
        #B = NP.einsum('ji,jlk->ilk',self.cache['LXr'],B)
        B = SP.tensordot(self.cache['DWt'], self.cache['LAc'], axes=(1, 0))
        B = NP.transpose(B, (0, 2, 1))
        B = SP.tensordot(self.cache['LXr'], B, axes=(0, 0))
        B = B.reshape((self.rank * self.S, self.rank * self.S), order='F')
        B += SP.eye(self.rank * self.S)
        smartSum(self.time, 'cache_calcB', TIME.time() - start)
        smartSum(self.count, 'cache_calcB', 1)

        """ invert B """
        start = TIME.time()
        self.cache['cholB'] = LA.cholesky(B).T
        self.cache['Bi'] = LA.cho_solve((self.cache['cholB'], True),
                                        SP.eye(self.S * self.rank))
        smartSum(self.time, 'cache_invB', TIME.time() - start)
        smartSum(self.count, 'cache_invB', 1)

        """ pheno """
        start = TIME.time()
        Z = SP.dot(self.cache['LXr'].T,
                   SP.dot(self.cache['DLY'], self.cache['LAc']))
        self.cache['z'] = SP.reshape(Z, (self.S * self.rank), order='F')
        self.cache['Biz'] = LA.cho_solve((self.cache['cholB'], True),
                                         self.cache['z'])
        BiZ = SP.reshape(self.cache['Biz'], (self.S, self.rank), order='F')
        self.cache['DLYpDLXBiz'] = SP.dot(self.cache['LXr'],
                                          SP.dot(BiZ, self.cache['LAc'].T))
        self.cache['DLYpDLXBiz'] *= -self.cache['D']
        self.cache['DLYpDLXBiz'] += self.cache['DLY']
        smartSum(self.time, 'cache_phenoCalc', TIME.time() - start)
        smartSum(self.count, 'cache_phenoCalc', 1)

    self.XX_has_changed = False
    self.Xr_has_changed = False
    self.Y_has_changed = False
    self.Cr.params_have_changed = False
    self.Cg.params_have_changed = False
    self.Cn.params_have_changed = False
def _applyConstraints(blockVectorV, factYBY, blockVectorBY, blockVectorY): """Changes blockVectorV in place.""" gramYBV = np.dot(blockVectorBY.T, blockVectorV) tmp = cho_solve(factYBY, gramYBV) blockVectorV -= np.dot(blockVectorY, tmp)
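# Sketch of what _applyConstraints computes, assuming B = I so that
# blockVectorBY equals blockVectorY: V is projected onto the complement of
# span(Y), i.e. V <- V - Y (Y^T Y)^{-1} Y^T V. Assumes the function above
# (and its np/cho_solve imports) is in scope; data is random.
import numpy as np
from scipy.linalg import cho_factor

rng = np.random.default_rng(0)
Y = rng.standard_normal((50, 3))
V = rng.standard_normal((50, 5))
factYBY = cho_factor(Y.T @ Y)      # Cholesky factor of Y^T B Y with B = I
_applyConstraints(V, factYBY, Y, Y)
assert np.allclose(Y.T @ V, 0.0)   # V is now B-orthogonal to Y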
def kinetic_energy(self, pos, mom, cache={}): return 0.5 * mom.dot(la.cho_solve((self.mass_matrix_chol, True), mom))
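# Standalone numerical check (sketch) of the identity behind kinetic_energy:
# cho_solve((chol(M), True), mom) solves M x = mom, so the expression equals
# 0.5 * mom^T M^{-1} mom for an SPD mass matrix M.
import numpy as np
import scipy.linalg as la

rng = np.random.default_rng(1)
A = rng.standard_normal((4, 4))
M = A @ A.T + 4.0 * np.eye(4)                 # SPD mass matrix
mass_matrix_chol = la.cholesky(M, lower=True)
mom = np.arange(4.0)
ke = 0.5 * mom.dot(la.cho_solve((mass_matrix_chol, True), mom))
assert np.isclose(ke, 0.5 * mom @ np.linalg.solve(M, mom))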
def newton(oracle, x_0, tolerance=1e-5, max_iter=100,
           line_search_options=None, trace=False, display=False):
    """
    Newton's optimization method.

    Parameters
    ----------
    oracle : BaseSmoothOracle-descendant object
        Oracle with .func(), .grad() and .hess() methods implemented for
        computing function value, its gradient and Hessian respectively.
        If the Hessian returned by the oracle is not positive-definite,
        the method stops with message="newton_direction_error".
    x_0 : np.array
        Starting point for optimization algorithm.
    tolerance : float
        Epsilon value for stopping criterion.
    max_iter : int
        Maximum number of iterations.
    line_search_options : dict, LineSearchTool or None
        Dictionary with line search options. See LineSearchTool class
        for details.
    trace : bool
        If True, the progress information is appended into history
        dictionary during training. Otherwise None is returned instead
        of history.
    display : bool
        If True, debug information is displayed during optimization.

    Returns
    -------
    x_star : np.array
        The point found by the optimization procedure.
    message : string
        'success' or the description of error:
        - 'iterations_exceeded': if after max_iter iterations of the
          method x_k still doesn't satisfy the stopping criterion.
        - 'newton_direction_error': in case of failure of solving the
          linear system with the Hessian matrix (e.g. non-invertible
          matrix).
        - 'computational_error': in case of getting Infinity or None
          value during the computations.
    history : dictionary of lists or None
        Dictionary containing the progress information or None if
        trace=False. Dictionary has to be organized as follows:
        - history['time'] : list of floats, containing time passed from
          the start of the method
        - history['func'] : list of function values f(x_k) on every step
          of the algorithm
        - history['grad_norm'] : list of values of Euclidean norms
          ||g(x_k)|| of the gradient on every step of the algorithm
        - history['x'] : list of np.arrays, containing the trajectory of
          the algorithm. ONLY STORE IF x.size <= 2

    Example:
    --------
    >>> oracle = QuadraticOracle(np.eye(5), np.arange(5))
    >>> x_opt, message, history = newton(oracle, np.zeros(5), line_search_options={'method': 'Constant', 'c': 1.0})
    >>> print('Found optimal point: {}'.format(x_opt))
    Found optimal point: [ 0.  1.  2.  3.  4.]
    """
    history = defaultdict(list) if trace else None
    line_search_tool = LineSearchTool(**(line_search_options or {}))
    x_k = np.copy(x_0)

    def fill_history():
        if not trace:
            return
        history['time'].append((datetime.now() - t_0).total_seconds())
        history['func'].append(func_k)
        history['grad_norm'].append(grad_k_norm)
        if len(x_k) <= 2:
            history['x'].append(np.copy(x_k))

    def get_alpha(x_concat, d_concat):
        x, u = np.array_split(x_concat, 2)
        grad_x, grad_u = np.array_split(d_concat, 2)
        alphas = [1.]
        THETA = 0.99
        for i in range(len(grad_x)):
            if grad_x[i] > grad_u[i]:
                alphas.append(THETA * (u[i] - x[i]) / (grad_x[i] - grad_u[i]))
            if grad_x[i] < -grad_u[i]:
                alphas.append(THETA * (x[i] + u[i]) / (-grad_x[i] - grad_u[i]))
        return min(alphas)

    t_0 = datetime.now()
    func_k = oracle.func(x_k)
    grad_k = oracle.grad(x_k)
    hess_k = oracle.hess(x_k)
    grad_0_norm = grad_k_norm = np.linalg.norm(grad_k)
    fill_history()

    if display:
        print('Begin new NM')

    for i in range(max_iter):
        if display:
            print('i = {} grad_norm = {} func = {} x = {} grad = {}'.format(
                i, grad_k_norm, func_k, x_k, grad_k))
        if grad_k_norm**2 <= tolerance * grad_0_norm**2:
            break
        try:
            d_k = cho_solve(cho_factor(hess_k), -grad_k)
        except LinAlgError:
            return x_k, 'newton_direction_error', history
        a_k = line_search_tool.line_search(oracle, x_k, d_k,
                                           previous_alpha=get_alpha(x_k, d_k))
        x_k += a_k * d_k
        func_k = oracle.func(x_k)
        grad_k = oracle.grad(x_k)
        hess_k = oracle.hess(x_k)
        grad_k_norm = np.linalg.norm(grad_k)
        fill_history()

    if grad_k_norm**2 <= tolerance * grad_0_norm**2:
        return x_k, 'success', history
    else:
        return x_k, 'iterations_exceeded', history
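# One Cholesky-based Newton step on a quadratic f(x) = 0.5 x^T A x - b^T x
# (standalone sketch, independent of the oracle classes above). For a
# quadratic with SPD A, a single full step lands on the minimizer A^{-1} b.
import numpy as np
from scipy.linalg import cho_factor, cho_solve

A = np.diag([1.0, 2.0, 3.0])
b = np.array([1.0, 1.0, 1.0])
x = np.zeros(3)
grad = A @ x - b
x = x + cho_solve(cho_factor(A), -grad)   # Newton direction solves A d = -grad
assert np.allclose(A @ x, b)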
def submatrix_inv_mult(M, Minv, imask, Y, MinvY, pad=True, bruteforce=False):
    """
    Returns (inverse of submatrix of M) * Y

    Parameters:
        M (np.ndarray) N x N: symmetric and positive semi-definite matrix
        Minv (np.ndarray) N x N: inverse of M
        imask (np.ndarray) N x N: mask of rows/columns to use
            (1 == keep, 0 == remove); contains Nk ones and Nr zeros
        Y (np.ndarray) N x Nspec: matrix to multiply Ainv by; assumed to be
            zero-padded at the removed rows
        MinvY (np.ndarray) N x Nspec: matrix Minv * Y
        pad (bool): flag for zero-padding
        bruteforce (bool): flag for using bruteforce approach

    Returns:
        AinvY (np.ndarray): inverse of A (submatrix of M) times Y, of shape
            (N - Nr) x Nspec where Nr = number of removed rows.
            - If pad is True, zero-padded to N x Nspec with 0 at each
              removed row

    Comments:
        Let M be the block matrix given by

                |A    B|                  |P    Q|
            M = |      |   and   M^{-1} = |      |
                |B.T  D|                  |Q.T  U|

        Then the inverse of A is given by the Schur complement of U:

            A^{-1} = P - Q U^{-1} Q.T

        U and M must be invertible and positive semi-definite.
        This returns

            A^{-1} Y = P Y - (Q U^{-1} Q.T) Y

        Y is assumed to be zero-padded at bad rows.
    """
    #verify proper dimensionalities
    assert M.shape[0] == M.shape[1], "M must be a square matrix."
    assert imask.shape[0] == M.shape[0], \
        "M and imask have incompatible dimensions."
    assert Y.ndim == 2, "Y must be a 2-D array."
    assert MinvY.ndim == 2, "MinvY must be a 2-D array."

    #rows/columns to keep (k) and remove (r)
    k = np.where(imask.any(axis=1))[0]  #?? assume imask symmetric
    nk = len(k)
    r = np.where(~imask.any(axis=1))[0]  #must convert to bool since ~ is bitwise complement
    nr = len(r)

    if bruteforce:
        A = (M[k, :])[:, k]
        Ainv = cholesky_inv(A)
        Ainvy = Ainv.T @ Y[k, :]
        return Ainvy

    if (nr == 0):
        print("imask does not remove any rows or columns.")
        return MinvY

    #Use Q.T y = Minv y - U y  ?? why? known result?
    U = (Minv[r, :])[:, r]
    Yr = Y[r, :]
    Qty = MinvY[r, :] - np.dot(U, Yr)
    Qt = (Minv[r, :])[:, k]

    #evaluate A^{-1} Y = P Y - Q U^{-1} Q.T Y
    #Faster for big U (and fast enough for small U)
    if (U.shape[0] == 1):
        UinvQtY = Qty / U[0]
    else:
        L = linalg.cho_factor(U, lower=False, check_finite=False)
        UinvQtY = linalg.cho_solve(L, Qty, overwrite_b=False)

    #Evaluate A^{-1} Y = P Y - Q U^{-1} Q^T Y
    #using P Y = Minv Y - Q Y
    if (pad):
        AinvY0 = deepcopy(MinvY)
        AinvY0[k, :] -= Qt.T @ (UinvQtY + Yr)
        AinvY0[r, :] = 0
        return AinvY0
    else:
        AinvY = MinvY[k, :] - ((UinvQtY + Yr).T @ Qt).T
        return AinvY
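# Consistency sketch for submatrix_inv_mult (assumes the function above and
# its numpy/scipy imports are in scope; data is random). The Schur-complement
# route should match direct inversion of the kept submatrix.
import numpy as np

rng = np.random.default_rng(2)
G = rng.standard_normal((6, 6))
M = G @ G.T + 6.0 * np.eye(6)
Minv = np.linalg.inv(M)
imask = np.ones((6, 6), dtype=bool)
imask[4:, :] = imask[:, 4:] = False          # remove the last two rows/cols
Y = rng.standard_normal((6, 2))
Y[4:, :] = 0.0                               # zero-padded at removed rows
AinvY = submatrix_inv_mult(M, Minv, imask, Y, Minv @ Y, pad=False)
assert np.allclose(AinvY, np.linalg.inv(M[:4, :4]) @ Y[:4, :])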
def multi_harmonic_fit(time, data, error, freq, nharm=4, return_model=False,
                       freq_sep=0.01, fit_mean=True, fit_slope=False):
    """
    Simultaneous fit of a sum of sinusoids by weighted, linear least squares.

        model(t) = C0 + C1*(t-t0) +
                   Sum_i Sum_j Aij sin(2*pi*j*fi*(t-t0)+phij),
                   i=[1,nfreq], j=[1,nharm]

    [t0 defined such that ph11=0]

    Input:
        time: x vector
        data: y vector
        error: uncertainty on data
        freq: one or more frequencies freq_i to fit
        nharm: number of harmonics of each frequency to fit (nharm=1 is
            just the fundamental) fij = fi, 2*fi, ... nharm*fi
        freq_sep: freq_ij separated by less than this are ignored (should
            be the search grid spacing)
        fit_slope=False, then C1=0
        fit_mean=False, then C0=0

    Output:
        A dictionary containing the model evaluated on the time grid (if
        return_model==True) and the model amplitudes Aij, phases phij, and
        their uncertainties.
    """
    t = time.astype('float64')
    r = data.astype('float64')
    dr = error.astype('float64')
    numt = len(t)

    wt = 1. / dr**2
    s0 = wt.sum()
    t0 = (t * wt).sum() / s0
    t -= t0
    dr *= sqrt(s0)

    r0 = (r * wt).sum() / s0
    r -= r0

    nfit = 0
    if (fit_mean == True):
        nfit = 1
    if (fit_slope == True):
        fit_mean = True
        nfit = 2

    tm = t.max()
    s1 = ((t / tm)**2 * wt).sum()
    sb = ((t / tm) * r * wt).sum()
    slope = sb / s1
    s1 /= s0
    r -= slope * t / tm

    tt = t / tm / dr
    rr = r / dr
    chi0 = dot(rr, rr) * s0

    matr = empty((nfit + 2 * nharm, nfit + 2 * nharm), dtype='float64')
    vec = empty(nfit + 2 * nharm, dtype='float64')
    sx = empty((nharm, numt), dtype='float64')
    cx = empty((nharm, numt), dtype='float64')

    #
    # We will solve matr*res = vec, for res.  Define matr and vec.
    #
    sx0, cx0 = sin(2 * pi * t * freq), cos(2 * pi * t * freq)
    sx[0, :] = sx0 / dr
    cx[0, :] = cx0 / dr
    for i in range(nharm - 1):
        sx[i + 1, :] = cx0 * sx[i, :] + sx0 * cx[i, :]
        cx[i + 1, :] = -sx0 * sx[i, :] + cx0 * cx[i, :]

    if (nfit > 0):
        vec[0] = 0.
        matr[0, 0] = 1.
        if (nfit > 1):
            vec[1] = matr[0, 1] = matr[1, 0] = 0.
            matr[1, 1] = s1

    for i in range(nharm):
        vec[i + nfit] = dot(sx[i, :], rr)
        vec[nharm + i + nfit] = dot(cx[i, :], rr)
        if (nfit > 0):
            matr[0, i + nfit] = matr[i + nfit, 0] = dot(sx[i, :], 1. / dr)
            matr[0, nharm + i + nfit] = matr[nharm + i + nfit, 0] = dot(cx[i, :], 1. / dr)
        if (nfit > 1):
            matr[1, i + nfit] = matr[i + nfit, 1] = dot(sx[i, :], tt)
            matr[1, nharm + i + nfit] = matr[nharm + i + nfit, 1] = dot(cx[i, :], tt)
        for j in range(i + 1):
            matr[j + nfit, i + nfit] = matr[i + nfit, j + nfit] = dot(sx[i, :], sx[j, :])
            matr[j + nfit, nharm + i + nfit] = matr[nharm + i + nfit, j + nfit] = dot(cx[i, :], sx[j, :])
            matr[nharm + j + nfit, i + nfit] = matr[i + nfit, nharm + j + nfit] = dot(sx[i, :], cx[j, :])
            matr[nharm + j + nfit, nharm + i + nfit] = matr[nharm + i + nfit, nharm + j + nfit] = dot(cx[i, :], cx[j, :])

    out_dict = {}

    #
    # Convert to amplitudes and phases and propagate errors
    #
    out_dict['cn0'] = r0
    out_dict['cn0_error'] = 1. / sqrt(s0)
    out_dict['trend'] = 0.
    out_dict['trend_error'] = 0.
    A0, B0, vA0, vB0, covA0B0 = zeros((5, nharm), dtype='float64')
    amp, phase, rel_phase = zeros((3, nharm), dtype='float64')
    damp, dphase = zeros((2, nharm), dtype='float64')
    covA0B0 = zeros(nharm, dtype='float64')
    res = zeros(nfit + 2 * nharm, dtype='float64')
    err2 = zeros(nfit + 2 * nharm, dtype='float64')
    out_dict['bayes_factor'] = 0.

    try:
        #
        # solve the equation and replace matr with its inverse
        #
        m0 = cho_factor(matr, lower=False)
        out_dict['bayes_factor'] = -log(trace(m0[0]))
        res = cho_solve(m0, vec)
        CholeskyInverse(m0[0], matr)

        A0, B0 = res[nfit:nharm + nfit], res[nharm + nfit:]
        amp = sqrt(A0**2 + B0**2)
        phase = arctan2(B0, A0)

        err2 = diag(matr) / s0
        vA0, vB0 = err2[nfit:nharm + nfit], err2[nharm + nfit:]
        for i in range(nharm):
            covA0B0[i] = matr[nfit + i, nharm + nfit + i] / s0

        damp = sqrt(A0**2 * vA0 + B0**2 * vB0 + 2. * A0 * B0 * covA0B0) / amp
        dphase = sqrt(A0**2 * vB0 + B0**2 * vA0 - 2. * A0 * B0 * covA0B0) / amp**2

        rel_phase = phase - phase[0] * (1. + arange(nharm))
        rel_phase = arctan2(sin(rel_phase), cos(rel_phase))
    except LinAlgError:
        print("Failed: singular matrix! (Are your frequencies unique/non-harmonic?)")

    out_dict['time0'] = t0 - phase[0] / (2 * pi * freq)
    out_dict["amplitude"] = amp
    out_dict["amplitude_error"] = damp
    out_dict["rel_phase"] = rel_phase
    out_dict["rel_phase_error"] = dphase

    modl = r0 + dot(A0, sx * dr) + dot(B0, cx * dr)
    if (nfit > 0):
        out_dict['cn0'] += res[0]
        out_dict['cn0_error'] = sqrt(err2[0])
        modl += res[0]
        if (nfit > 1):
            out_dict['trend'] = (res[1] + slope) / tm
            out_dict['trend_error'] = sqrt(err2[1]) / tm
            modl += out_dict['trend'] * t
            resid = (modl - r - r0 - slope * tt * dr) / dr
            out_dict['chi2'] = dot(resid, resid) * s0
            out_dict['cn0'] += out_dict['trend'] * (out_dict['time0'] - t0)
        else:
            resid = (modl - r - r0) / dr
            out_dict['chi2'] = dot(resid, resid) * s0

    out_dict['nu'] = numt - 2 * nharm - nfit
    out_dict['signif'] = chi2sigma(chi0, out_dict['chi2'], numt - nfit, nharm)

    if (return_model):
        out_dict['model'] = modl

    return out_dict
def _precond(LorU, lower, x): y = cho_solve((LorU, lower), x) return _as2d(y)
def solve(self, tr_radius):
    """Solve quadratic subproblem"""

    lambda_current, lambda_lb, lambda_ub = self._initial_values(tr_radius)
    n = self.dimension
    hits_boundary = True
    already_factorized = False
    self.niter = 0

    while True:

        # Compute Cholesky factorization
        if already_factorized:
            already_factorized = False
        else:
            H = self.hess + lambda_current * np.eye(n)
            U, info = self.cholesky(H, lower=False,
                                    overwrite_a=False, clean=True)

        self.niter += 1

        # Check if factorization succeeded
        if info == 0 and self.jac_mag > self.CLOSE_TO_ZERO:
            # Successful factorization

            # Solve `U.T U p = -g`
            p = cho_solve((U, False), -self.jac)

            p_norm = norm(p)

            # Check for interior convergence
            if p_norm <= tr_radius and lambda_current == 0:
                hits_boundary = False
                break

            # Solve `U.T w = p`
            w = solve_triangular(U, p, trans='T')

            w_norm = norm(w)

            # Compute Newton step according to
            # formula (4.44) p.87 from ref [2]_.
            delta_lambda = (p_norm / w_norm)**2 * (p_norm - tr_radius) / tr_radius
            lambda_new = lambda_current + delta_lambda

            if p_norm < tr_radius:  # Inside boundary
                s_min, z_min = estimate_smallest_singular_value(U)

                ta, tb = self.get_boundaries_intersections(p, z_min,
                                                           tr_radius)

                # Choose `step_len` with the smallest magnitude.
                # The reason for this choice is explained at
                # ref [3]_, p. 6 (Immediately before the formula
                # for `tau`).
                step_len = min([ta, tb], key=abs)

                # Compute the quadratic term  (p.T*H*p)
                quadratic_term = np.dot(p, np.dot(H, p))

                # Check stop criteria
                relative_error = (step_len**2 * s_min**2) / (
                    quadratic_term + lambda_current * tr_radius**2)
                if relative_error <= self.k_hard:
                    p += step_len * z_min
                    break

                # Update uncertainty bounds
                lambda_ub = lambda_current
                lambda_lb = max(lambda_lb, lambda_current - s_min**2)

                # Compute Cholesky factorization
                H = self.hess + lambda_new * np.eye(n)
                c, info = self.cholesky(H, lower=False,
                                        overwrite_a=False, clean=True)

                # Check if the factorization has succeeded
                if info == 0:
                    # Successful factorization

                    # Update damping factor
                    lambda_current = lambda_new
                    already_factorized = True
                else:
                    # Unsuccessful factorization

                    # Update uncertainty bounds
                    lambda_lb = max(lambda_lb, lambda_new)

                    # Update damping factor
                    lambda_current = max(
                        np.sqrt(lambda_lb * lambda_ub),
                        lambda_lb + self.UPDATE_COEFF * (lambda_ub - lambda_lb))

            else:  # Outside boundary
                # Check stop criteria
                relative_error = abs(p_norm - tr_radius) / tr_radius
                if relative_error <= self.k_easy:
                    break

                # Update uncertainty bounds
                lambda_lb = lambda_current

                # Update damping factor
                lambda_current = lambda_new

        elif info == 0 and self.jac_mag <= self.CLOSE_TO_ZERO:
            # jac_mag very close to zero

            # Check for interior convergence
            if lambda_current == 0:
                p = np.zeros(n)
                hits_boundary = False
                break

            s_min, z_min = estimate_smallest_singular_value(U)
            step_len = tr_radius

            # Check stop criteria
            if step_len**2 * s_min**2 <= self.k_hard * lambda_current * tr_radius**2:
                p = step_len * z_min
                break

            # Update uncertainty bounds
            lambda_ub = lambda_current
            lambda_lb = max(lambda_lb, lambda_current - s_min**2)

            # Update damping factor
            lambda_current = max(
                np.sqrt(lambda_lb * lambda_ub),
                lambda_lb + self.UPDATE_COEFF * (lambda_ub - lambda_lb))

        else:  # Unsuccessful factorization
            # Compute auxiliary terms
            delta, v = singular_leading_submatrix(H, U, info)
            v_norm = norm(v)

            # Update uncertainty interval
            lambda_lb = max(lambda_lb, lambda_current + delta / v_norm**2)

            # Update damping factor
            lambda_current = max(
                np.sqrt(lambda_lb * lambda_ub),
                lambda_lb + self.UPDATE_COEFF * (lambda_ub - lambda_lb))

    self.lambda_lb = lambda_lb
    self.lambda_current = lambda_current
    self.previous_tr_radius = tr_radius

    return p, hits_boundary
def gp_fit_demo(f, pars, xrng=(-1, 1, 50), save_figs=False, alpha=1.0, el=1.0):
    xs = np.linspace(*xrng)  # test set
    fx = np.apply_along_axis(f, 0, xs[na, :], pars).squeeze()
    xtr = np.sqrt(3) * np.array([-1, 1], dtype=float)  # train set
    ytr = np.apply_along_axis(f, 0, xtr[na, :], pars).squeeze()  # function observations + np.random.randn(xtr.shape[0])
    dtr = np.apply_along_axis(f, 0, xtr[na, :], pars, dx=True).squeeze()  # derivative observations
    y = np.hstack((ytr, dtr))
    m, n = len(xs), len(xtr)  # train and test points
    jitter = 1e-8

    # evaluate kernel matrices
    kss, kfd, kdd = kern_rbf_der(xs, xs, alpha=alpha, el=el)
    kff, kfd, kdd = kern_rbf_der(xs, xtr, alpha=alpha, el=el)
    kfy = np.hstack((kff, kfd))
    Kff, Kfd, Kdd = kern_rbf_der(xtr, xtr, alpha=alpha, el=el)
    K = np.vstack((np.hstack((Kff, Kfd)), np.hstack((Kfd.T, Kdd))))

    # GP fit w/ function values only
    kff_iK = cho_solve(cho_factor(Kff + jitter * np.eye(n)), kff.T).T
    gp_mean = kff_iK.dot(ytr)
    gp_var = np.diag(kss - kff_iK.dot(kff.T))
    gp_std = np.sqrt(gp_var)

    # GP fit w/ function values and derivatives
    kfy_iK = cho_solve(cho_factor(K + jitter * np.eye(n + n * 1)), kfy.T).T  # kx.dot(inv(K))
    gp_mean_d = kfy_iK.dot(y)
    gp_var_d = np.diag(kss - kfy_iK.dot(kfy.T))
    gp_std_d = np.sqrt(gp_var_d)

    # setup plotting
    fmin, fmax, fp2p = np.min(fx), np.max(fx), np.ptp(fx)
    axis_limits = [-3, 3, fmin - 0.2 * fp2p, fmax + 0.2 * fp2p]
    tick_settings = {'which': 'both', 'bottom': 'off', 'top': 'off',
                     'left': 'off', 'right': 'off',
                     'labelleft': 'off', 'labelbottom': 'off'}

    # use tex to render text in the figure
    mpl.rc('text', usetex=True)
    # use lmodern font package which is also used in the paper
    mpl.rc('text.latex', preamble=[r'\usepackage{lmodern}'])
    # sans serif font for figure, size 10pt
    mpl.rc('font', family='sans-serif', size=10)
    plt.style.use('seaborn-paper')

    # set figure width to fit the column width of the article
    pti = 1.0 / 72.0  # 1 inch = 72 points
    fig_width_pt = 244  # obtained from latex using \the\columnwidth
    golden_mean = (np.sqrt(5.0) - 1.0) / 2.0

    # two figure version
    scale = 0.5
    fig_width_pt = 244 / 2
    fig_w = fig_width_pt * pti
    fig_h = fig_w * golden_mean * 1

    # plot ordinary GP regression fit
    plt.figure(figsize=(fig_w, fig_h))
    plt.axis(axis_limits)
    plt.tick_params(**tick_settings)
    plt.plot(xs, fx, 'r--', label='true')
    plt.plot(xtr, ytr, 'ko', ms=8, label='observed fcn values')
    plt.plot(xs, gp_mean, 'k-', lw=2, label='GP mean')
    plt.fill_between(xs, gp_mean - 2 * gp_std, gp_mean + 2 * gp_std,
                     color='k', alpha=0.15)
    plt.tight_layout(pad=0.5)
    if save_figs:
        plt.savefig('{}_gpr_fcn_obs_small.pdf'.format(f.__name__), format='pdf')
    else:
        plt.show()

    # plot GP regression fit w/ derivative observations
    plt.figure(figsize=(fig_w, fig_h))
    plt.axis(axis_limits)
    plt.tick_params(**tick_settings)
    plt.plot(xs, fx, 'r--', label='true')
    plt.plot(xtr, ytr, 'ko', ms=8, label='observed fcn values')
    plt.plot(xs, gp_mean_d, 'k-', lw=2, label='GP mean')
    plt.fill_between(xs, gp_mean_d - 2 * gp_std_d, gp_mean_d + 2 * gp_std_d,
                     color='k', alpha=0.15)
    # plot line segments to indicate derivative observations
    h = 0.15
    for i in range(len(dtr)):
        x0, x1 = xtr[i] - h, xtr[i] + h
        y0 = dtr[i] * (x0 - xtr[i]) + ytr[i]
        y1 = dtr[i] * (x1 - xtr[i]) + ytr[i]
        plt.gca().add_line(Line2D([x0, x1], [y0, y1], linewidth=6, color='k'))
    plt.tight_layout(pad=0.5)
    if save_figs:
        plt.savefig('{}_gpr_grad_obs_small.pdf'.format(f.__name__), format='pdf')
    else:
        plt.show()
def compute_ei(self, comp, pend, cand, vals): if pend.shape[0] == 0: # If there are no pending, don't do anything fancy. # Current best. best = np.min(vals) # The primary covariances for prediction. comp_cov = self.cov(comp) cand_cross = self.cov(comp, cand) # Compute the required Cholesky. obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0]) obsv_chol = spla.cholesky(obsv_cov, lower=True) # Solve the linear systems. alpha = spla.cho_solve((obsv_chol, True), vals - self.mean) beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True) # Predict the marginal means and variances at candidates. func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0) # Expected improvement func_s = np.sqrt(func_v) u = (best - func_m) / func_s ncdf = sps.norm.cdf(u) npdf = sps.norm.pdf(u) ei = func_s * (u * ncdf + npdf) return ei else: # If there are pending experiments, fantasize their outcomes. # Create a composite vector of complete and pending. comp_pend = np.concatenate((comp, pend)) # Compute the covariance and Cholesky decomposition. comp_pend_cov = self.cov(comp_pend) + self.noise * np.eye( comp_pend.shape[0]) comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True) # Compute submatrices. pend_cross = self.cov(comp, pend) pend_kappa = self.cov(pend) # Use the sub-Cholesky. obsv_chol = comp_pend_chol[:comp.shape[0], :comp.shape[0]] # Solve the linear systems. alpha = spla.cho_solve((obsv_chol, True), vals - self.mean) beta = spla.cho_solve((obsv_chol, True), pend_cross) # Finding predictive means and variances. pend_m = np.dot(pend_cross.T, alpha) + self.mean pend_K = pend_kappa - np.dot(pend_cross.T, beta) # Take the Cholesky of the predictive covariance. pend_chol = spla.cholesky(pend_K, lower=True) # Make predictions. pend_fant = (np.dot( pend_chol, npr.randn(pend.shape[0], self.pending_samples)) + self.mean) # Include the fantasies. fant_vals = np.concatenate( (np.tile(vals[:, np.newaxis], (1, self.pending_samples)), pend_fant)) # Compute bests over the fantasies. bests = np.min(fant_vals, axis=0) # Now generalize from these fantasies. cand_cross = self.cov(comp_pend, cand) # Solve the linear systems. alpha = spla.cho_solve((comp_pend_chol, True), fant_vals - self.mean) beta = spla.solve_triangular(comp_pend_chol, cand_cross, lower=True) # Predict the marginal means and variances at candidates. func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0) # Expected improvement func_s = np.sqrt(func_v[:, np.newaxis]) u = (bests[np.newaxis, :] - func_m) / func_s ncdf = sps.norm.cdf(u) npdf = sps.norm.pdf(u) ei = func_s * (u * ncdf + npdf) return np.mean(ei, axis=1)
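# Standalone sketch of the closed-form expected improvement used in both
# branches above (values are illustrative): EI = s * (u * Phi(u) + phi(u))
# with u = (best - mu) / s, where Phi and phi are the standard normal CDF
# and PDF.
import numpy as np
import scipy.stats as sps

best = 0.0
func_m = np.array([0.1, -0.2])   # predictive means at two candidates
func_s = np.array([0.5, 0.3])    # predictive standard deviations
u = (best - func_m) / func_s
ei = func_s * (u * sps.norm.cdf(u) + sps.norm.pdf(u))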
def calculate_continuous_ancestral_states(tree, char_mtx, sum_to_one=False,
                                          calc_std_err=False):
    """Calculates the continuous ancestral states for the nodes in a tree.

    Args:
        tree (Tree): A dendropy tree or TreeWrapper object.
        char_mtx (Matrix): A Matrix object with character information. Each
            row should represent a tip in the tree and each column should be
            a variable to calculate ancestral state for.
        calc_std_err (:obj:`bool`, optional): If True, calculate standard
            error for each variable. Defaults to False.
        sum_to_one (:obj:`bool`, optional): If True, standardize the
            character matrix so that the values in a row sum to one.
            Defaults to False.

    Raises:
        ValueError: Raised if none of the tree tips were found in the
            character data.

    Returns:
        The (pruned) tree and a matrix of character data with the following
        dimensions:
            * rows: nodes / tips in the tree
            * columns: character variables
            * depth: first is the calculated value, second layer is standard
              error if desired

    Todo:
        * Add function for consistent label handling.
    """
    # Wrap tree if dendropy tree
    if not isinstance(tree, TreeWrapper):
        tree = TreeWrapper.from_base_tree(tree)

    # Assign labels to nodes that don't have them
    tree.add_node_labels()

    # Synchronize tree and character data
    # Prune tree
    prune_taxa = []
    keep_taxon_labels = []
    init_row_headers = char_mtx.get_row_headers()
    for taxon in tree.taxon_namespace:
        label = taxon.label.replace(' ', '_')
        if label not in init_row_headers:
            prune_taxa.append(taxon)
            print('Could not find {} in character matrix, pruning'.format(label))
        else:
            keep_taxon_labels.append(label)

    if len(keep_taxon_labels) == 0:
        raise ValueError('None of the tree tips were found in the character data')

    tree.prune_taxa(prune_taxa)
    tree.purge_taxon_namespace()

    # Prune character data
    keep_rows = []
    for i, label in enumerate(init_row_headers):
        if label in keep_taxon_labels:
            keep_rows.append(i)
        else:
            print('Could not find {} in tree tips, pruning'.format(label))
    char_mtx = char_mtx.slice(keep_rows)

    # Standardize character matrix if requested
    tip_count, num_vars = char_mtx.shape
    if sum_to_one:
        for i in range(tip_count):
            sc = 1.0 / np.sum(char_mtx[i])
            for j in range(num_vars):
                char_mtx[i, j] *= sc

    # Initialize data matrix
    num_nodes = len(tree.nodes())
    data_shape = (num_nodes, num_vars, 2 if calc_std_err else 1)
    data = np.zeros(data_shape, dtype=float)

    # Initialize headers
    row_headers = []
    tip_col_headers = char_mtx.get_column_headers()
    tip_row_headers = char_mtx.get_row_headers()
    tip_lookup = dict([(tip_row_headers[i].replace('_', ' '), i)
                       for i in range(tip_count)])

    # Get the number of internal nodes in the tree
    internal_node_count = num_nodes - tip_count

    # Loop through the tree and set the matrix index for each node
    # Also set data values
    node_headers = []
    node_i = tip_count
    tip_i = 0
    node_index_lookup = {}
    for node in tree.nodes():
        label = _get_node_label(node)
        if len(node.child_nodes()) == 0:
            # Tip
            node_index_lookup[label] = tip_i
            row_headers.append(label)
            data[tip_i, :, 0] = char_mtx[tip_lookup[label]]
            tip_i += 1
        else:
            # Internal node
            node_index_lookup[label] = node_i
            node_headers.append(label)
            data[node_i, :, 0] = np.zeros((1, num_vars), dtype=float)
            node_i += 1

    # Row headers should be extended with node headers
    row_headers.extend(node_headers)

    # For each variable
    for x in range(num_vars):
        # Compute the ML estimate of the root
        full_mcp = np.zeros((internal_node_count, internal_node_count),
                            dtype=float)
        full_vcp = np.zeros(internal_node_count, dtype=float)

        for k in tree.postorder_edge_iter():
            i = k.head_node
            if len(i.child_nodes()) != 0:
                node_num_i = node_index_lookup[_get_node_label(i)] - tip_count
                for j in i.child_nodes():
                    tbl = 2. / j.edge_length
                    full_mcp[node_num_i][node_num_i] += tbl
                    node_num_j = node_index_lookup[_get_node_label(j)]
                    if len(j.child_nodes()) == 0:
                        full_vcp[node_num_i] += (data[node_num_j, x, 0] * tbl)
                    else:
                        node_num_j -= tip_count
                        full_mcp[node_num_i][node_num_j] -= tbl
                        full_mcp[node_num_j][node_num_i] -= tbl
                        full_mcp[node_num_j][node_num_j] += tbl

        b = la.cho_factor(full_mcp)

        # these are the ML estimates for the ancestral states
        ml_est = la.cho_solve(b, full_vcp)

        sos = 0
        for k in tree.postorder_edge_iter():
            i = k.head_node
            node_num_i = node_index_lookup[_get_node_label(i)]
            if len(i.child_nodes()) != 0:
                data[node_num_i, x, 0] = ml_est[node_num_i - tip_count]

                if calc_std_err:
                    for j in i.child_nodes():
                        node_num_j = node_index_lookup[_get_node_label(j)]
                        temp = data[node_num_i, x, 0] - data[node_num_j, x, 0]
                        sos += temp * temp / j.edge_length

                    # nni is node_num_i adjusted for internal nodes only
                    nni = node_num_i - tip_count
                    qpq = full_mcp[nni][nni]
                    tm1 = np.delete(full_mcp, (nni), axis=0)
                    tm = np.delete(tm1, (nni), axis=1)
                    b = la.cho_factor(tm)
                    sol = la.cho_solve(b, tm1[:, nni])
                    temp_std_err = qpq - np.inner(tm1[:, nni], sol)
                    data[node_num_i, x, 1] = math.sqrt(
                        2.0 * sos / ((internal_node_count - 1) * temp_std_err))

    depth_headers = ['maximum_likelihood']
    if calc_std_err:
        depth_headers.append('standard_error')

    mtx_headers = {'0': row_headers, '1': tip_col_headers, '2': depth_headers}
    return tree, Matrix(data, headers=mtx_headers)
def log_marginal_likelihood(self, theta=None, eval_gradient=False,
                            clone_kernel=True):
    """Returns log-marginal likelihood of theta for training data.

    Parameters
    ----------
    theta : array-like of shape (n_kernel_params,), default=None
        Kernel hyperparameters for which the log-marginal likelihood is
        evaluated. If None, the precomputed log_marginal_likelihood
        of ``self.kernel_.theta`` is returned.

    eval_gradient : bool, default=False
        If True, the gradient of the log-marginal likelihood with respect
        to the kernel hyperparameters at position theta is returned
        additionally. If True, theta must not be None.

    clone_kernel : bool, default=True
        If True, the kernel attribute is copied. If False, the kernel
        attribute is modified, but may result in a performance
        improvement.

    Returns
    -------
    log_likelihood : float
        Log-marginal likelihood of theta for training data.

    log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional
        Gradient of the log-marginal likelihood with respect to the kernel
        hyperparameters at position theta.
        Only returned when eval_gradient is True.
    """
    if theta is None:
        if eval_gradient:
            raise ValueError(
                "Gradient can only be evaluated for theta!=None")
        return self.log_marginal_likelihood_value_

    if clone_kernel:
        kernel = self.kernel_.clone_with_theta(theta)
    else:
        kernel = self.kernel_
        kernel.theta = theta

    if eval_gradient:
        K, K_gradient = kernel(self.X_train_, eval_gradient=True)
    else:
        K = kernel(self.X_train_)

    # Alg. 2.1, page 19, line 2 -> L = cholesky(K + sigma^2 I)
    K[np.diag_indices_from(K)] += self.alpha
    try:
        L = cholesky(K, lower=GPR_CHOLESKY_LOWER, check_finite=False)
    except np.linalg.LinAlgError:
        return (-np.inf, np.zeros_like(theta)) if eval_gradient else -np.inf

    # Support multi-dimensional output of self.y_train_
    y_train = self.y_train_
    if y_train.ndim == 1:
        y_train = y_train[:, np.newaxis]

    # Alg 2.1, page 19, line 3 -> alpha = L^T \ (L \ y)
    alpha = cho_solve((L, GPR_CHOLESKY_LOWER), y_train, check_finite=False)

    # Alg 2.1, page 19, line 7
    # -0.5 . y^T . alpha - sum(log(diag(L))) - n_samples / 2 log(2*pi)
    # y is originally thought to be a (1, n_samples) row vector. However,
    # in multioutputs, y is of shape (n_samples, 2) and we need to compute
    # y^T . alpha for each output, independently using einsum. Thus, it
    # is equivalent to:
    # for output_idx in range(n_outputs):
    #     log_likelihood_dims[output_idx] = (
    #         y_train[:, [output_idx]] @ alpha[:, [output_idx]]
    #     )
    log_likelihood_dims = -0.5 * np.einsum("ik,ik->k", y_train, alpha)
    log_likelihood_dims -= np.log(np.diag(L)).sum()
    log_likelihood_dims -= K.shape[0] / 2 * np.log(2 * np.pi)
    # the log likelihood is summed up across the outputs
    log_likelihood = log_likelihood_dims.sum(axis=-1)

    if eval_gradient:
        # Eq. 5.9, p. 114, and footnote 5 in p. 114
        # 0.5 * trace((alpha . alpha^T - K^-1) . K_gradient)
        # alpha is supposed to be a vector of (n_samples,) elements. With
        # multioutputs, alpha is a matrix of size (n_samples, n_outputs).
        # Therefore, we want to construct a matrix of
        # (n_samples, n_samples, n_outputs) equivalent to
        # for output_idx in range(n_outputs):
        #     output_alpha = alpha[:, [output_idx]]
        #     inner_term[..., output_idx] = output_alpha @ output_alpha.T
        inner_term = np.einsum("ik,jk->ijk", alpha, alpha)
        # compute K^-1 of shape (n_samples, n_samples)
        K_inv = cho_solve((L, GPR_CHOLESKY_LOWER), np.eye(K.shape[0]),
                          check_finite=False)
        # create a new axis to use broadcasting between inner_term and
        # K_inv
        inner_term -= K_inv[..., np.newaxis]
        # Since we are interested about the trace of
        # inner_term @ K_gradient, we don't explicitly compute the
        # matrix-by-matrix operation and instead use an einsum. Therefore
        # it is equivalent to:
        # for param_idx in range(n_kernel_params):
        #     for output_idx in range(n_output):
        #         log_likelihood_gradient_dims[param_idx, output_idx] = (
        #             inner_term[..., output_idx] @
        #             K_gradient[..., param_idx]
        #         )
        log_likelihood_gradient_dims = 0.5 * np.einsum(
            "ijl,jik->kl", inner_term, K_gradient)
        # the log likelihood gradient is summed up across the outputs
        log_likelihood_gradient = log_likelihood_gradient_dims.sum(axis=-1)

    if eval_gradient:
        return log_likelihood, log_likelihood_gradient
    else:
        return log_likelihood
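# Sketch verifying the einsum contraction used above: for each output l and
# hyperparameter k, np.einsum("ijl,jik->kl", T, G)[k, l] equals
# trace(T[:, :, l] @ G[:, :, k]); shapes here are illustrative.
import numpy as np

rng = np.random.default_rng(3)
T = rng.standard_normal((5, 5, 2))   # plays the role of inner_term
G = rng.standard_normal((5, 5, 3))   # plays the role of K_gradient
E = np.einsum("ijl,jik->kl", T, G)
for k in range(3):
    for l in range(2):
        assert np.isclose(E[k, l], np.trace(T[:, :, l] @ G[:, :, k]))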
def chol_solve(U, b, out=None):
    if isinstance(U, np.ndarray):
        if sparse.issparse(b):
            b = b.toarray()

        # Allocate memory
        U = np.atleast_2d(U)
        B = np.atleast_1d(b)
        sh_u = U.shape[:-2]
        sh_b = B.shape[:-1]
        l_u = len(sh_u)
        l_b = len(sh_b)

        # Check which axes are iterated over with B along with U
        ind_b = [Ellipsis] * l_b
        l_min = min(l_u, l_b)
        jnd_b = tuple(i for i in range(-l_min, 0) if sh_b[i] == sh_u[i])

        # Shape of the result (broadcasting rules)
        sh = broadcasted_shape(sh_u, sh_b)
        if out is None:
            #out = np.zeros(np.shape(B))
            out = np.zeros(sh + B.shape[-1:])

        for i in nested_iterator(np.shape(U)[:-2]):

            # The goal is to run the Cholesky solver once for all vectors
            # of B for which the matrices of U are the same (according to
            # the broadcasting rules). Thus, we collect all the axes of B
            # for which U is singleton and form them as a 2-D matrix and
            # then run the solver once.

            # Select those axes of B for which U and B are not singleton
            for j in jnd_b:
                ind_b[j] = i[j]

            # Collect all the axes for which U is singleton
            b = B[tuple(ind_b) + (Ellipsis, )]

            # Reshape it to a 2-D (or 1-D) array
            orig_shape = b.shape
            if b.ndim > 1:
                b = b.reshape((-1, b.shape[-1]))

            # Ellipsis to all preceding axes and ellipsis for the last
            # axis:
            if len(ind_b) < len(sh):
                ind_out = (Ellipsis, ) + tuple(ind_b) + (Ellipsis, )
            else:
                ind_out = tuple(ind_b) + (Ellipsis, )

            out[ind_out] = linalg.cho_solve((U[i], False),
                                            b.T).T.reshape(orig_shape)

        return out

    elif isinstance(U, cholmod.Factor):
        if sparse.issparse(b):
            b = b.toarray()
        return U.solve_A(b)

    else:
        raise ValueError("Unknown type of Cholesky factor")
def Neglikelihood(self, theta):
    """Negative log-likelihood function

    Input
    -----
    theta (array): correlation lengths for different dimensions

    Output
    ------
    NegLnLike: Negative log-likelihood value"""

    theta = 10**theta  # Correlation length
    n = self.X.shape[0]  # Number of training instances
    k = self.X.shape[1]  # Number of dimensions

    if self.trend == 'Const':
        F = np.vstack((np.ones((n, 1)), np.zeros((n * k, 1))))
    else:
        print('Other trends are currently not available, switching to "Const" instead')
        F = np.vstack((np.ones((n, 1)), np.zeros((n * k, 1))))

    # Construct correlation matrix
    PsiDot = np.zeros(((k + 1) * n, (k + 1) * n))

    # 1-Build normal Psi matrix
    Psi = np.zeros((n, n))
    for i in range(n):
        Psi[i, :] = np.exp(-np.sum(theta * (self.X[i, :] - self.X)**2, axis=1))
    Psi = Psi + np.eye(n) * self.nugget
    # To avoid duplicate addition
    PsiDot[:n, :n] = Psi / 2

    # 2-Build dPsidX
    for i in range(k):
        PsiDot[:n, (i + 1) * n:(i + 2) * n] = 2 * theta[i] * self.diff_list[i] * Psi

    # 3-Build d2PsidX2
    for i in range(k):
        # To avoid duplicate addition
        PsiDot[(i+1)*n:(i+2)*n, (i+1)*n:(i+2)*n] = \
            (2*theta[i] - 4*theta[i]**2*self.diff_list[i]**2)*Psi/2

    # 4-Build d2PsidXdX
    for i in range(k - 1):
        for j in range(i + 1, k):
            PsiDot[(i+1)*n:(i+2)*n, (j+1)*n:(j+2)*n] = \
                -4*theta[i]*theta[j]*self.diff_list[i]*self.diff_list[j]*Psi

    # 5-Compile PsiDot
    PsiDot = PsiDot + PsiDot.T
    L = np.linalg.cholesky(PsiDot)

    # Mean estimation
    mu = np.linalg.solve(
        F.T @ (cho_solve((L, True), F)),
        F.T @ (cho_solve((L, True), np.vstack((self.y, self.grad)))))

    # Variance estimation
    SigmaSqr = (np.vstack((self.y, self.grad)) - F@mu).T @ \
        (cho_solve((L, True), np.vstack((self.y, self.grad)) - F@mu)) / ((k+1)*n)

    # Compute log-likelihood
    LnDetK = 2 * np.sum(np.log(np.abs(np.diag(L))))
    NegLnLike = ((k + 1) * n / 2) * np.log(SigmaSqr) + 0.5 * LnDetK

    # Update attributes
    self.PsiDot, self.F, self.L, self.mu, self.SigmaSqr = PsiDot, F, L, mu, SigmaSqr

    return NegLnLike.flatten()
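# Detached sketch of the generalized-least-squares estimates computed above:
# with SPD K = L L^T, mu = (F^T K^{-1} F)^{-1} F^T K^{-1} y and
# sigma^2 = (y - F mu)^T K^{-1} (y - F mu) / n, reusing one Cholesky factor
# for every solve (data and sizes here are illustrative).
import numpy as np
from scipy.linalg import cho_solve

rng = np.random.default_rng(5)
n = 10
F = np.ones((n, 1))
K = 2.0 * np.eye(n)
L = np.linalg.cholesky(K)
y = rng.standard_normal((n, 1))
mu = np.linalg.solve(F.T @ cho_solve((L, True), F),
                     F.T @ cho_solve((L, True), y))
sigma_sqr = (y - F @ mu).T @ cho_solve((L, True), y - F @ mu) / n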
def solve(self, M): return LA.cho_solve((self.chol(), True), M)
def cal_lf0(config):
    base_path = config['base_path']
    label_path = config['label_path']
    name = config['name']
    outfilepath = config['outfilepath']
    var_path = config['var_path']
    syllable_base_path = config['syllable_base_path']
    syllable_var_path = config['syllable_var_path']
    original = config['original']
    koriyama_gen = config['koriyama_gen']
    figure_path = config['figure_path']
    ph_in_syl_object_path = config['phone_in_syllable_object_path']
    stress = config['stress']
    original_vuv = config['original_vuv']

    p_in_s_file = Utility.load_obj(ph_in_syl_object_path)

    # vuv = np.load('{}/class.npy'.format(config['vuv_path']))
    vuv = original_vuv

    #--------Frame-------#

    lf0_mean = np.load('{}/mean.npy'.format(base_path))
    lf0_cov = np.load('{}/cov.npy'.format(base_path))
    var = np.load('{}'.format(var_path))

    if len(lf0_cov) > len(vuv):
        for i in range(len(lf0_cov) - len(vuv)):
            vuv = np.append(vuv, -1)
    elif len(lf0_cov) < len(vuv):
        vuv = vuv[0:len(lf0_cov)]

    lf0_var = np.sum(var, axis=0)
    lf0_mean = np.array([lf0_mean[:, 0], lf0_mean[:, 1], lf0_mean[:, 2]])

    lf0_w = PoGUtility.generate_W_for_GPR_generate_features(len(lf0_cov), vuv)

    frame_B = alpha * PoGUtility.cal_sum_of_mean_part(lf0_var, lf0_w, lf0_cov, lf0_mean)
    frame_A = alpha * PoGUtility.cal_sum_of_weight_part(lf0_var, lf0_w, lf0_cov)

    L = linalg.cholesky(frame_A, lower=True)
    lf0 = linalg.cho_solve((L, True), frame_B)
    # lf0 = lf0_gen_with_vuv(lf0, vuv)
    print(lf0.shape)
    frame_lf0_nomask = lf0

    # lf0 = lf0_gen_with_vuv(lf0, vuv)
    lf0[lf0 < 1] = np.nan
    frame_lf0 = np.copy(lf0)

    #----------Syllable level--------#

    dur_list, names = PoGUtility.gen_dur_and_name_list(label_path, name)
    # print(np.sum(dur_list))
    if np.sum(dur_list) < len(original):
        dur_list[0] = dur_list[0] + len(original) - np.sum(dur_list)
    # print(np.sum(dur_list))

    syl_mean = np.load('{}/mean.npy'.format(syllable_base_path))
    syl_cov = np.load('{}/cov.npy'.format(syllable_base_path))
    s_mean = syl_mean
    var = np.load('{}'.format(syllable_var_path))
    syl_var = np.sum(var, axis=0)

    temp_mean = []
    for i in range(len(syl_mean[0])):
        temp_mean.append(syl_mean[:, i])
    syl_mean = np.array(temp_mean)

    syl_w = PoGUtility.generate_DCT_W_without_consonant_on_stress(
        len(lf0_cov), dur_list, num_coeff, p_in_s_file, stress)

    syl_B = beta * PoGUtility.cal_sum_of_mean_part(syl_var, syl_w, syl_cov, syl_mean)
    syl_A = beta * PoGUtility.cal_sum_of_weight_part(syl_var, syl_w, syl_cov)

    #----------Combine Model--------#

    L = linalg.cholesky(frame_A + syl_A, lower=True)
    lf0 = linalg.cho_solve((L, True), frame_B + syl_B)
    # print(lf0.shape)

    lf0[lf0 < 1] = np.nan
    PlotUtility.plot([lf0, original, frame_lf0_nomask],
                     ['Multi', 'original', 'Single'],
                     '{}/{}_no_mask.eps'.format(figure_path, name))

    lf0 = lf0_gen_with_vuv(lf0, vuv)
    lf0[lf0 < 1] = np.nan

    frame_lf0 = lf0_gen_with_vuv(frame_lf0, vuv)
    frame_lf0[frame_lf0 < 1] = np.nan

    np.save(outfilepath, lf0)
    print(min(lf0))
    PlotUtility.plot([lf0, original, frame_lf0],
                     ['Multi', 'original', 'Single'],
                     '{}/{}_multi.eps'.format(figure_path, name))

    #----------Combine Model--------#

    o = []
    for data_dct, dur in zip(s_mean, dur_list):
        i_dct = PoGUtility.generate_inverse_DCT(data_dct, dur)
        o = o + i_dct
    o = np.concatenate((np.zeros(len(original) - len(o)), np.array(o)), axis=0)
    o = lf0_gen_with_vuv(o, vuv)
    o[o <= 1] = np.nan
    # print(o.shape)

    PlotUtility.plot([o, original, lf0, frame_lf0],
                     ['dct', 'original', 'Multi', 'frame_lf0'],
                     '{}/{}_dct.eps'.format(figure_path, name))
# so we need to do some tempering, plus a little # nonlinear refinement. T = 5000.0 dlogT = -0.025 niter = 2000 lr = 3e-4 # Pre-compute the GP on the spectral components if s0_rho > 0.0: kernel = celerite.terms.Matern32Term(np.log(s0_sig), np.log(s0_rho)) gp = celerite.GP(kernel) s0_C = gp.get_matrix(lnlam_padded) else: s0_C = np.eye(Kp) * s0_sig**2 s0_cho_C = cho_factor(s0_C) s0_CInv = cho_solve(s0_cho_C, np.eye(Kp)) s0_CInvmu = cho_solve(s0_cho_C, np.ones(Kp) * s0_mu) if s1_rho > 0.0: kernel = celerite.terms.Matern32Term(np.log(s1_sig), np.log(s1_rho)) gp = celerite.GP(kernel) s1_C = gp.get_matrix(lnlam_padded) else: s1_C = np.eye(Kp) * s1_sig**2 s1_cho_C = cho_factor(s1_C) s1_CInv = cho_solve(s1_cho_C, np.eye(Kp)) s1_CInvmu = cho_solve(s1_cho_C, np.ones(Kp) * s1_mu) s_CInv = dense_block_diag(s0_CInv, s1_CInv) s_CInvmu = np.append(s0_CInvmu, s1_CInvmu) # Define the model
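# Minimal sketch of the precomputation pattern above without celerite: build
# an SPD Matern-3/2 prior covariance directly, factor it once, and reuse the
# factor for both C^{-1} and C^{-1} mu (all names and values here are
# illustrative).
import numpy as np
from scipy.linalg import cho_factor, cho_solve

Kp = 50
lnlam = np.linspace(0.0, 1.0, Kp)
sig, rho, mu = 0.1, 0.2, 1.0
r = np.abs(lnlam[:, None] - lnlam[None, :]) / rho
C = sig**2 * (1.0 + np.sqrt(3.0) * r) * np.exp(-np.sqrt(3.0) * r)
cho_C = cho_factor(C + 1e-12 * np.eye(Kp))   # jitter for numerical safety
CInv = cho_solve(cho_C, np.eye(Kp))
CInvmu = cho_solve(cho_C, mu * np.ones(Kp))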