def log_likelihood(test_series, cov, maps, residues):
    """ Return the log likelihood of test_series under the model
        described by cov, maps, and residues.
    """
    # This makes heavy use of the matrix inversion lemma
    # test_series = np.concatenate(test_series, axis=0)
    n_samples = test_series.shape[0]
    white_test_series = test_series / residues
    residues_fit = np.sum(white_test_series ** 2)
    white_test_series /= residues
    white_projection = np.dot(white_test_series, maps)
    del white_test_series
    prec_maps = linalg.inv(cov)
    prec_maps += np.dot(maps.T / residues ** 2, maps)
    residues_fit -= np.trace(
        np.dot(np.dot(white_projection.T, white_projection),
               linalg.inv(prec_maps)))
    del white_projection
    white_maps = maps / residues[:, np.newaxis]
    prec_maps += np.dot(white_maps.T, white_maps)
    del white_maps
    det = fast_logdet(prec_maps)
    del prec_maps
    return (-residues_fit / n_samples - fast_logdet(cov) - det
            - 2 * np.sum(np.log(residues)))
def objectiveFLGL(emp_cov, K, R, T, H, U, mu, eta, rho):
    res = -fast_logdet(R) + np.sum(R * emp_cov)
    res += rho / 2. * squared_norm(
        R - T + U + np.linalg.multi_dot((K.T, linalg.pinvh(H), K)))
    res += mu * l1_od_norm(H)
    res += eta * l1_od_norm(T)
    return res
def score_samples(self, X):
    """Return the log-likelihood of each sample

    See. "Pattern Recognition and Machine Learning"
    by C. Bishop, 12.2.1 p. 574
    or http://www.miketipping.com/papers/met-mppca.pdf

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The data.

    Returns
    -------
    ll : array, shape (n_samples,)
        Log-likelihood of each sample under the current model
    """
    X = array2d(X)
    Xr = X - self.mean_
    n_features = X.shape[1]
    log_like = np.zeros(X.shape[0])
    precision = self.get_precision()
    log_like = -.5 * (Xr * (np.dot(Xr, precision))).sum(axis=1)
    log_like -= .5 * (n_features * log(2. * np.pi)
                      - fast_logdet(precision))
    return log_like
def bound(self, doc, lamda=None, nu2=None):
    """Estimate the variational bound of a document."""
    if lamda is None:
        lamda = self.lamda
    if nu2 is None:
        nu2 = self.nu2

    N = sum([cnt for _, cnt in doc])  # number of words in the document
    bound = 0.0

    # E[log p(\eta | \mu, \Sigma)] + H(q(\eta | \lamda, \nu))
    # + sum_{n,i} { \phi_{n,i} * log(\phi_{n,i}) }
    bound = -np.sum(np.diag(nu2) * self.sigma_inverse) \
        + fast_logdet(self.sigma_inverse)
    bound -= (lamda - self.mu).transpose().dot(self.sigma_inverse)\
        .dot(lamda - self.mu)
    bound += np.sum(np.log(nu2)) + self.num_topics  # TODO safe_log
    bound /= 2

    # \sum_n { E[log p(z_n | \eta)] } - sum_i { \lamda_i * \phi_{n,i} }
    sum_exp = np.exp(lamda + 0.5 * nu2).sum()
    bound -= N * (sum_exp / self.zeta - 1. + np.log(self.zeta))

    # E[log p(w_n | z_n, \beta)] - sum_{n,i} { \phi_{n,i} * log(\phi_{n,i}) }
    bound += sum(
        c * (self.phi[n] * (lamda + np.log(self.beta[:, n])
                            - np.log(self.phi[n]))).sum()
        for (n, c) in doc)
    return bound
def log_likelihood(covariance, precision):
    """Computes the log-likelihood between the covariance and precision
    estimate.

    Parameters
    ----------
    covariance : 2D ndarray (n_features, n_features)
        Maximum Likelihood Estimator of covariance

    precision : 2D ndarray (n_features, n_features)
        The precision matrix of the covariance model to be tested

    Returns
    -------
    log-likelihood
    """
    assert covariance.shape == precision.shape
    dim, _ = precision.shape
    log_likelihood_ = (
        -np.sum(covariance * precision)
        + fast_logdet(precision)
        - dim * np.log(2 * np.pi)
    )
    log_likelihood_ /= 2.
    return log_likelihood_
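# Illustrative usage of the two-argument log_likelihood(covariance, precision)
# above; a hedged sketch only (numpy and scikit-learn's empirical_covariance
# are assumed available, and the data is a toy example). The score is highest
# when the precision equals the inverse of the sample covariance.
import numpy as np
from sklearn.covariance import empirical_covariance

rng = np.random.RandomState(0)
X = rng.randn(200, 5)                                     # toy data
emp_cov = empirical_covariance(X, assume_centered=True)   # sample covariance
model_prec = np.linalg.inv(emp_cov)                       # candidate precision

print(log_likelihood(emp_cov, model_prec))  # best achievable for this emp_cov
print(log_likelihood(emp_cov, np.eye(5)))   # a mismatched precision scores lower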
def score(self, X, y=None):
    """Return a score associated to new data

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        The data to test

    Returns
    -------
    ll : array of shape (n_samples,)
        log-likelihood of each row of X under the current model
    """
    Xr = X - self.mean_
    n_features = X.shape[1]
    log_like = np.zeros(X.shape[0])
    if (self.precision_ is None) and (self.covariance_ is None):
        XrP, ldet = self.dot_precision(X=Xr, logdet=True)
    else:
        if self.precision_ is None:
            self.precision_ = linalg.inv(self.covariance_)
        XrP = np.dot(Xr, self.precision_)
        ldet = fast_logdet(self.covariance_)
    log_like = -.5 * (Xr * XrP).sum(axis=1)
    log_like -= .5 * (ldet + n_features * log(2. * np.pi))
    return log_like
def log_likelihood(self, X):
    """Equivalent to scipy:

        from scipy.stats import invwishart
        invwishart.logpdf(X, nu, S)
    """
    nu = self.nu
    n_dim = X.shape[0]

    logp = nu * fast_logdet(self.S)
    logp -= np.sum(self.S * linalg.pinvh(X))
    logp -= (nu + n_dim + 1) * fast_logdet(X)
    logp -= nu * n_dim * np.log(2)
    logp -= 2 * multigammaln(0.5 * nu, n_dim)
    logp /= 2.0
    return logp
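# A hedged consistency check against scipy, assuming `nu` plays the role of
# scipy's `df` and `S` of its `scale` (as the docstring above suggests). The
# helper below simply mirrors the method body in standalone form.
import numpy as np
from scipy import linalg
from scipy.special import multigammaln
from scipy.stats import invwishart
from sklearn.utils.extmath import fast_logdet


def invwishart_logpdf(X, nu, S):
    n_dim = X.shape[0]
    logp = nu * fast_logdet(S)
    logp -= np.sum(S * linalg.pinvh(X))
    logp -= (nu + n_dim + 1) * fast_logdet(X)
    logp -= nu * n_dim * np.log(2)
    logp -= 2 * multigammaln(0.5 * nu, n_dim)
    return logp / 2.0


rng = np.random.RandomState(0)
A = rng.randn(4, 4)
S = A @ A.T + 4 * np.eye(4)   # SPD scale matrix
X = A @ A.T + 6 * np.eye(4)   # SPD point at which the density is evaluated
nu = 7                        # degrees of freedom (>= n_dim)

assert np.isclose(invwishart_logpdf(X, nu, S),
                  invwishart.logpdf(X, df=nu, scale=S))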
def score_samples(self, X):
    """Return the log-likelihood of each sample.

    See. "Pattern Recognition and Machine Learning"
    by C. Bishop, 12.2.1 p. 574
    or http://www.miketipping.com/papers/met-mppca.pdf

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The data.

    Returns
    -------
    ll : array, shape (n_samples,)
        Log-likelihood of each sample under the current model
    """
    check_is_fitted(self, "mean_")
    # X = check_array(X)

    Xr = X - self.mean_
    n_features = X.shape[1]
    precision = self.get_precision()  # [n_features, n_features]
    log_like = -.5 * (Xr * (da.dot(Xr, precision))).sum(axis=1)
    log_like -= .5 * (n_features * da.log(2. * np.pi)
                      - fast_logdet(precision))
    return log_like
def _ridge_smooth_fun_grad(theta, X, y, D, verbose, other):
    nv = theta[0]
    alphas = theta[1:]
    XX, Xy, yy, fit_intercept = other
    N, p = X.shape
    nD = D.shape[2]
    I = np.eye(p)

    # Prior covariance matrix for current parameter setting
    Cprior, invC = _ridge_smooth_inverse(D, alphas,
                                         fit_intercept=fit_intercept)

    # Posterior covariance and mean
    SS = linalg.pinv(XX / nv + invC)
    mu = np.dot(SS, Xy) / nv

    # Compute log-evidence
    term1 = .5 * (extmath.fast_logdet(2 * np.pi * SS)
                  - p * np.log(2 * np.pi * nv)
                  - p * np.log(2 * np.pi)
                  - 1. / extmath.fast_logdet(invC))
    term2 = -.5 * (yy / nv - np.dot(Xy.T, np.dot(SS, Xy)) / nv ** 2)
    logE = term1 + term2

    # Derivative with respect to covariance hyperparameters
    dAlphas = np.zeros((nD,))
    for i in range(nD):
        A = Cprior - SS - np.outer(mu, mu)
        dAlphas[i] = .5 * np.trace(np.dot(A, D[:, :, i]))

    # Gradient with respect to the noise variance
    SSinvC = np.dot(SS, invC)
    rss = yy - 2 * np.dot(mu.T, Xy) + np.dot(mu.T, XX).dot(mu)
    dNsevar = -N / nv + np.trace(I - SSinvC) / nv + rss / nv ** 2

    dEE = np.append(dNsevar.item(), dAlphas)

    if verbose:
        ss = ("-logE: %0.3f | nv: %0.3f | alphas: (" % (-logE, nv))
        for alpha in alphas:
            ss += ("%0.3g, " % alpha)
        print(ss[:-2] + ")")

    return -logE, -dEE
def log_likelihood(self, X):
    """Equivalent to scipy:

        from scipy.stats import wishart
        wishart.logpdf(X, nu, S)
    """
    nu = self.nu
    n_dim = X.shape[0]
    inv_S = self.inv_S

    logp = (nu - n_dim - 1) * fast_logdet(X)
    logp -= np.sum(X * inv_S)
    logp -= nu * n_dim * np.log(2)
    logp -= 2 * multigammaln(0.5 * nu, n_dim)
    logp -= nu * fast_logdet(self.S)
    logp /= 2.0
    return logp
def dot_precision(self, X, logdet=False):
    """Compute the dot product of a matrix X by the data precision matrix
    with the generative model.

    Returns
    -------
    Y : array, shape=(n_samples, n_features)
        X * precision
    """
    n_features = self.components_.shape[1]

    # handle corner cases first
    if self.n_components_ == 0:
        if logdet:
            if np.isscalar(self.noise_variance_):
                logdet_cov = np.log(self.noise_variance_) * X.shape[1]
            else:
                assert self.noise_variance_.shape[0] == X.shape[1], \
                    "self.noise_variance_.shape[0] == X.shape[1]"
                logdet_cov = np.log(self.noise_variance_).sum()
            return X / self.noise_variance_, logdet_cov
        else:
            return X / self.noise_variance_
    if self.n_components_ == n_features:
        covariance = self.get_covariance()
        if logdet:
            return X.dot(linalg.inv(covariance)), fast_logdet(covariance)
        else:
            return X.dot(linalg.inv(covariance))

    # Get precision using matrix inversion lemma
    components_ = self.components_
    exp_var = self.explained_variance_
    if self.whiten:
        components_ = components_ * np.sqrt(exp_var[:, np.newaxis])
    exp_var_diff = np.maximum(exp_var - self.noise_variance_, 0.)
    Xprecision = (1.0 / exp_var_diff) + (1.0 / self.noise_variance_)
    if logdet:
        if np.isscalar(self.noise_variance_):
            logdet_cov = np.log(self.noise_variance_) * X.shape[1]
        else:
            assert self.noise_variance_.shape[0] == X.shape[1], \
                "self.noise_variance_.shape[0] == X.shape[1]"
            logdet_cov = np.log(self.noise_variance_).sum()
        logdet_cov += np.log((1.0 / exp_var_diff)
                             + (1.0 / self.noise_variance_)).sum()
        logdet_cov += np.log(exp_var_diff).sum()
    Xprecision *= (self.noise_variance_ * self.noise_variance_)
    Xprecision = (X.dot(components_.T / (-Xprecision))).dot(components_)
    Xprecision += X / self.noise_variance_
    # Equivalent formulation via an explicit precision matrix:
    # cprecision = ((1.0 / exp_var_diff) + (1.0 / self.noise_variance_)) \
    #     * (-self.noise_variance_ * self.noise_variance_)
    # cprecision = (components_.T / cprecision).dot(components_)
    # cprecision.flat[::len(cprecision) + 1] += 1. / self.noise_variance_
    # Xcprecision = X.dot(cprecision)
    if logdet:
        return Xprecision, logdet_cov
    else:
        return Xprecision
def log_likelihood_full(test_series, full_cov):
    """ Return the log likelihood of test_series under the model
        described by the full covariance matrix full_cov.
    """
    # Without the matrix inversion lemma
    n_samples = test_series.shape[0]
    return -fast_logdet(full_cov) - 1. / n_samples * \
        np.trace(np.dot(np.dot(test_series, linalg.inv(full_cov)),
                        test_series.T))
def objective_function(self, data, location, covariance):
    """Objective function minimized at each step of the MCD algorithm."""
    precision = pinvh(covariance)
    det = fast_logdet(precision)
    trace = np.trace(
        np.dot(empirical_covariance(data - location, assume_centered=True),
               precision))
    pen = self.shrinkage * np.trace(precision)
    return -det + trace + pen
def _score_samples(self, X, session=None):
    check_is_fitted(self, "mean_")

    X = check_array(X)
    Xr = X - self.mean_
    n_features = X.shape[1]
    precision = self.get_precision().fetch(session=session)
    log_like = -0.5 * (Xr * (mt.dot(Xr, precision))).sum(axis=1)
    log_like -= 0.5 * (n_features * log(2.0 * mt.pi)
                       - fast_logdet(precision))
    return log_like
def ebic(covariance, precision, n_samples, n_features, gamma=0):
    '''Extended Bayesian Information Criteria for model selection.

    When using path mode, use this as an alternative to cross-validation for
    finding lambda.

    See:
        "Extended Bayesian Information Criteria for Gaussian Graphical Models"
        R. Foygel and M. Drton, NIPS 2010

    Parameters
    ----------
    covariance : 2D ndarray (n_features, n_features)
        Maximum Likelihood Estimator of covariance (sample covariance)

    precision : 2D ndarray (n_features, n_features)
        The precision matrix of the model to be tested

    n_samples : int
        Number of examples.

    n_features : int
        Dimension of an example.

    gamma : float in [0, 1)
        gamma=0 leads to the classical BIC; positive gamma leads to stronger
        penalization of large graphs.

    Returns
    -------
    ebic score (float).  Caller should minimize this score.
    '''
    l_theta = -np.sum(covariance * precision) + fast_logdet(precision)
    l_theta *= n_features / 2.

    # if something goes wrong with fast_logdet, return a large value
    if np.isinf(l_theta) or np.isnan(l_theta):
        return 1e10

    mask = np.abs(precision.flat) > np.finfo(precision.dtype).eps
    precision_nnz = (np.sum(mask) - n_features) / 2.0  # lower off-diagonal tri

    return (
        -2.0 * l_theta
        + precision_nnz * np.log(n_samples)
        + 4.0 * precision_nnz * np.log(n_features) * gamma
    )
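# A hedged sketch of model selection with ebic along a regularisation path.
# Assumptions: scikit-learn's GraphicalLasso is available, and the alpha grid
# and gamma=0.5 are arbitrary illustrative choices (not from the original code).
import numpy as np
from sklearn.covariance import GraphicalLasso, empirical_covariance

rng = np.random.RandomState(0)
X = rng.randn(100, 8)
S = empirical_covariance(X)
n_samples, n_features = X.shape

scores = {}
for alpha in (0.05, 0.1, 0.2):
    prec = GraphicalLasso(alpha=alpha).fit(X).precision_
    scores[alpha] = ebic(S, prec, n_samples, n_features, gamma=0.5)
best_alpha = min(scores, key=scores.get)   # lowest EBIC wins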
def decision_function(self, X, raw_values=True):
    """ """
    n_features = self.cov_.shape[0]
    prec_ = linalg.pinv(self.cov_)
    dist = np.zeros((X.shape[0], self.support.shape[0]))
    for i, x in enumerate(X):
        for j, t in enumerate(self.support):
            dist[i, j] = distance.mahalanobis(x, t, prec_)
    a = fast_logdet(self.cov_)
    density = np.log(np.ravel(np.exp(-.5 * dist).mean(1))) \
        - 0.5 * a - (.5 * n_features) * np.log(2. * np.pi)
    return -density
def _decision_function2(self, X):
    check_is_fitted(self, "classes_")
    X = check_array(X)
    precisions = self.get_observed_precision()
    norm2 = []
    for i in range(len(self.classes_)):
        Xm = X - self.means_[i]
        # X2 = np.dot(Xm, R * (S ** (-0.5)))
        X2 = np.linalg.multi_dot((Xm, precisions[i], Xm.T))
        norm2.append(np.diag(X2))
    norm2 = np.array(norm2).T  # shape = [len(X), n_classes]
    u = np.asarray([-fast_logdet(s) for s in precisions])
    return -0.5 * (norm2 + u) + np.log(self.priors_)
def _gaussian_likelihood(self, S_test, prec):
    """Estimates the likelihood of the neighbourhood selection using the
    Gaussian log-likelihood model.

    Parameters
    ----------
    S_test : array_like
        n by p matrix - data matrix of test data

    prec : array_like
        p by p matrix - estimated precision matrix
    """
    p = S_test.shape[0]
    log_likelihood_ = -fast_logdet(prec) + np.trace(S_test @ prec)
    log_likelihood_ -= p * np.log(2 * np.pi)
    return log_likelihood_
def test_bayesian_ridge_score_values():
    """Check value of score on toy example.

    Compute log marginal likelihood with equation (36) in Sparse Bayesian
    Learning and the Relevance Vector Machine (Tipping, 2001):

    - 0.5 * (log |Id/alpha + X.X^T/lambda|
             + y^T.(Id/alpha + X.X^T/lambda).y + n * log(2 * pi))
    + lambda_1 * log(lambda) - lambda_2 * lambda
    + alpha_1 * log(alpha) - alpha_2 * alpha

    and check equality with the score computed during training.
    """
    X, y = diabetes.data, diabetes.target
    n_samples = X.shape[0]
    # check with initial values of alpha and lambda (see code for the values)
    eps = np.finfo(np.float64).eps
    alpha_ = 1.0 / (np.var(y) + eps)
    lambda_ = 1.0

    # value of the parameters of the Gamma hyperpriors
    alpha_1 = 0.1
    alpha_2 = 0.1
    lambda_1 = 0.1
    lambda_2 = 0.1

    # compute score using formula of docstring
    score = lambda_1 * log(lambda_) - lambda_2 * lambda_
    score += alpha_1 * log(alpha_) - alpha_2 * alpha_
    M = 1.0 / alpha_ * np.eye(n_samples) + 1.0 / lambda_ * np.dot(X, X.T)
    M_inv = pinvh(M)
    score += -0.5 * (
        fast_logdet(M) + np.dot(y.T, np.dot(M_inv, y))
        + n_samples * log(2 * np.pi)
    )

    # compute score with BayesianRidge
    clf = BayesianRidge(
        alpha_1=alpha_1,
        alpha_2=alpha_2,
        lambda_1=lambda_1,
        lambda_2=lambda_2,
        n_iter=1,
        fit_intercept=False,
        compute_score=True,
    )
    clf.fit(X, y)

    assert_almost_equal(clf.scores_[0], score, decimal=9)
def pison_correction(n, p):
    """ """
    repeat = 100
    pth_roots = np.zeros(repeat)
    for i in range(repeat):
        print(i)
        data = np.dot(np.random.randn(n, p), np.eye(p))
        mcd = MCD(h=None).fit(data)
        covariance = mcd.raw_covariance_
        pth_roots[i] = np.exp(fast_logdet(covariance))
    res_inv = (1. / repeat) * np.sum(pth_roots ** (1. / p))
    return 1. / res_inv
def likelihood(self, S, theta):
    """Likelihood function for a Gaussian model.

    Parameters
    ----------
    S : array_like
        p by p matrix - Covariance matrix of problem

    theta : array_like
        estimated precision matrix

    Returns
    -------
    float - Gaussian loglikelihood of the estimated model
    """
    p = S.shape[0]
    log_likelihood_ = -fast_logdet(theta) + np.trace(S @ theta)
    log_likelihood_ -= p * np.log(2 * np.pi)
    return log_likelihood_
def test_gaussian_mixture_aic_bic():
    # Test the aic and bic criteria
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 50, 3, 2
    X = rng.randn(n_samples, n_features)
    # standard gaussian entropy
    sgh = 0.5 * (fast_logdet(np.cov(X.T, bias=1))
                 + n_features * (1 + np.log(2 * np.pi)))
    for cv_type in COVARIANCE_TYPE:
        g = GaussianMixture(
            n_components=n_components, covariance_type=cv_type,
            random_state=rng, max_iter=200)
        g.fit(X)
        aic = 2 * n_samples * sgh + 2 * g._n_parameters()
        bic = 2 * n_samples * sgh + np.log(n_samples) * g._n_parameters()
        bound = n_features / np.sqrt(n_samples)
        assert (g.aic(X) - aic) / n_samples < bound
        assert (g.bic(X) - bic) / n_samples < bound
def _ridge_evidence_fun_grad(theta, X, y, verbose, other):
    nv = theta[0]
    alpha = theta[1]
    XX, Xy, yy = other
    N, p = X.shape
    I = np.eye(p)

    # Prior covariance matrix for current parameter setting
    Cprior = 1. / alpha * I
    Cprior[0, 0] = 0
    invCprior = alpha * I
    invCprior[0, 0] = 0

    # Posterior covariance and mean
    SS = linalg.pinv(XX / nv + invCprior)
    mu = np.dot(SS, Xy) / nv

    # (1) Compute log-evidence
    term1 = .5 * (fast_logdet(2 * np.pi * SS)
                  - p * np.log(2 * np.pi / alpha)
                  - p * np.log(2 * np.pi * nv))
    term2 = -.5 * (yy / nv - np.dot(Xy.T, np.dot(SS, Xy)) / nv ** 2)
    logE = term1 + term2

    # Gradient with respect to the ridge parameter
    # dAlpha = .5 * np.trace(1. / alpha * I + SS + np.outer(mu, mu))
    dAlpha = p / (2 * alpha) - .5 * np.sum(mu * mu) - .5 * np.trace(SS)

    # Gradient with respect to the noise variance
    SSinvC = np.dot(SS, invCprior)
    rss = yy - 2 * np.dot(mu.T, Xy) + np.dot(mu.T, XX).dot(mu)
    dNsevar = -N / nv + np.trace(I - SSinvC) / nv + rss / nv ** 2

    dEE = np.array([dNsevar.item(), dAlpha])

    if verbose:
        print("-logE: %0.3f | nv: %0.3f | alpha: %0.3f" % (-logE, nv, alpha))

    return -logE, -dEE
def score(self, X, y=None):
    """Return a score associated to new data

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        The data to test

    Returns
    -------
    ll : array of shape (n_samples,)
        log-likelihood of each row of X under the current model
    """
    Xr = X - self.mean_
    n_features = X.shape[1]
    log_like = np.zeros(X.shape[0])
    self.precision_ = linalg.inv(self.covariance_)
    log_like = -.5 * (Xr * (np.dot(Xr, self.precision_))).sum(axis=1)
    log_like -= .5 * (fast_logdet(self.covariance_)
                      + n_features * log(2. * np.pi))
    return log_like
def samplewise_log_likelihood(X, mean, precision):
    """Return the log-likelihood of each sample.

    See - http://www.miketipping.com/papers/met-mppca.pdf

    code adapted from
    https://github.com/scikit-learn/scikit-learn/blob/ed5e127b/sklearn/decomposition/pca.py#L516

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        the sample data

    mean : float
        the mean of the current model

    precision : array, shape (n_features, n_features)
        precision matrix of the current model

    Returns
    -------
    ll : array, shape (n_samples,)
        Log-likelihood of each sample under the current model
    """
    Xr = X - mean
    n_features = X.shape[1]
    log_like = np.zeros(X.shape[0])
    log_like = -.5 * (Xr * (np.dot(Xr, precision))).sum(axis=1)
    log_like -= .5 * (n_features * log(2. * np.pi) - fast_logdet(precision))
    return log_like.reshape(-1, 1)
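# Minimal usage sketch for samplewise_log_likelihood, assuming numpy as np and
# log imported from math (as in the snippets above); the toy data is
# illustrative only.
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(10, 3)
mean = X.mean(axis=0)
precision = np.linalg.inv(np.cov(X, rowvar=False))

ll = samplewise_log_likelihood(X, mean, precision)
print(ll.shape)   # (10, 1): one log-likelihood per sample, as a column vector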
def covsel(x, p, nonZero, C):
    """Objective and gradient for MLE of precision given empirical covariance.

    nonZero is a list of non-zero upper triangle precision matrix entries.
    Based on sparse GGM estimation code by Mark Schmidt.
    """
    X = np.zeros((p, p))
    X[nonZero] = x          # fill the diagonal and upper triangle
    X += np.triu(X, 1).T    # fill the lower triangle

    # Fast way to compute -logdet(X) + tr(X*C)
    # f = -2*sum(log(diag(R))) + sum(sum(C.*X)) + (lambda/2)*sum(X(:).^2);
    f = -fast_logdet(X) + np.sum(C * X)

    if f < np.inf:
        g = C - linalg.pinvh(X)
        g += np.tril(g, -1).T  # add contribution from lower to upper triangle
        g = g[nonZero]
    else:
        g = 0

    return f, g
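# Since covsel returns both the objective and its gradient, a finite-difference
# check is a natural sanity test. A hedged sketch, assuming nonZero is an
# upper-triangle index such as np.triu_indices(p), and using the identity
# precision as the evaluation point so the log-determinant is finite.
import numpy as np
from scipy.optimize import check_grad
from sklearn.covariance import empirical_covariance

rng = np.random.RandomState(0)
p = 5
C = empirical_covariance(rng.randn(500, p))   # empirical covariance

nonZero = np.triu_indices(p)   # estimate every diagonal / upper-triangle entry
x0 = np.eye(p)[nonZero]        # identity precision as starting point

# The reported error should be tiny (on the order of sqrt(machine epsilon)).
err = check_grad(lambda x: covsel(x, p, nonZero, C)[0],
                 lambda x: covsel(x, p, nonZero, C)[1],
                 x0)
print(err)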
def score_samples(self, X):
    """Compute the log-likelihood of each sample

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The data

    Returns
    -------
    ll : array, shape (n_samples,)
        Log-likelihood of each sample under the current model
    """
    check_is_fitted(self, 'components_')

    Xr = X - self.mean_
    precision = self.get_precision()
    n_features = X.shape[1]
    log_like = np.zeros(X.shape[0])
    log_like = -.5 * (Xr * (np.dot(Xr, precision))).sum(axis=1)
    log_like -= .5 * (n_features * log(2. * np.pi) - fast_logdet(precision))
    return log_like
def kl_loss(covariance, precision):
    """Computes the KL divergence between precision estimate and reference
    covariance.

    The loss is computed as:

        0.5 * (Trace(Theta_1 * Sigma_0) - log det(Theta_1 * Sigma_0) - dim(Sigma))

    where Sigma_0 is the reference covariance and Theta_1 the precision
    estimate.

    Parameters
    ----------
    covariance : 2D ndarray (n_features, n_features)
        Maximum Likelihood Estimator of covariance

    precision : 2D ndarray (n_features, n_features)
        The precision matrix of the covariance model to be tested

    Returns
    -------
    KL-divergence
    """
    assert covariance.shape == precision.shape
    dim, _ = precision.shape
    logdet_p_dot_c = fast_logdet(np.dot(precision, covariance))
    return 0.5 * (np.sum(precision * covariance) - logdet_p_dot_c - dim)
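# A small sanity check for kl_loss using numpy only: the KL divergence is zero
# when the precision is exactly the inverse of the reference covariance, and
# strictly positive for any other (SPD) estimate.
import numpy as np

rng = np.random.RandomState(0)
A = rng.randn(5, 5)
cov = A @ A.T + 5 * np.eye(5)   # an SPD reference covariance
true_prec = np.linalg.inv(cov)

print(kl_loss(cov, true_prec))                    # ~ 0 (up to round-off)
print(kl_loss(cov, true_prec + 0.1 * np.eye(5)))  # > 0 for a perturbed estimate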
def _c_step(X, n_support, random_state, remaining_iterations=30,
            initial_estimates=None, verbose=False,
            cov_computation_method=empirical_covariance):
    n_samples, n_features = X.shape
    dist = np.inf

    # Initialisation
    support = np.zeros(n_samples, dtype=bool)
    if initial_estimates is None:
        # compute initial robust estimates from a random subset
        support[random_state.permutation(n_samples)[:n_support]] = True
    else:
        # get initial robust estimates from the function parameters
        location = initial_estimates[0]
        covariance = initial_estimates[1]
        # run a special iteration for that case (to get an initial support)
        precision = linalg.pinvh(covariance)
        X_centered = X - location
        dist = (np.dot(X_centered, precision) * X_centered).sum(1)
        # compute new estimates
        support[np.argsort(dist)[:n_support]] = True

    X_support = X[support]
    location = X_support.mean(0)
    covariance = cov_computation_method(X_support)

    # Iterative procedure for Minimum Covariance Determinant computation
    det = fast_logdet(covariance)
    # If the data already has singular covariance, calculate the precision,
    # as the loop below will not be entered.
    if np.isinf(det):
        precision = linalg.pinvh(covariance)

    previous_det = np.inf
    while (det < previous_det and remaining_iterations > 0
            and not np.isinf(det)):
        # save old estimates values
        previous_location = location
        previous_covariance = covariance
        previous_det = det
        previous_support = support
        # compute a new support from the full data set mahalanobis distances
        precision = linalg.pinvh(covariance)
        X_centered = X - location
        dist = (np.dot(X_centered, precision) * X_centered).sum(axis=1)
        # compute new estimates
        support = np.zeros(n_samples, dtype=bool)
        support[np.argsort(dist)[:n_support]] = True
        X_support = X[support]
        location = X_support.mean(axis=0)
        covariance = cov_computation_method(X_support)
        det = fast_logdet(covariance)
        # update remaining iterations for early stopping
        remaining_iterations -= 1

    previous_dist = dist
    dist = (np.dot(X - location, precision) * (X - location)).sum(axis=1)
    # Check if best fit already found (det => 0, logdet => -inf)
    if np.isinf(det):
        results = location, covariance, det, support, dist
    # Check convergence
    if np.allclose(det, previous_det):
        # c_step procedure converged
        if verbose:
            print("Optimal couple (location, covariance) found before"
                  " ending iterations (%d left)" % (remaining_iterations))
        results = location, covariance, det, support, dist
    elif det > previous_det:
        # determinant has increased (should not happen)
        warnings.warn(
            "Determinant has increased; this should not happen: "
            "log(det) > log(previous_det) (%.15f > %.15f). "
            "You may want to try with a higher value of "
            "support_fraction (current value: %.3f)."
            % (det, previous_det, n_support / n_samples),
            RuntimeWarning)
        results = previous_location, previous_covariance, \
            previous_det, previous_support, previous_dist

    # Check early stopping
    if remaining_iterations == 0:
        if verbose:
            print('Maximum number of iterations reached')
        results = location, covariance, det, support, dist

    return results
def objective_function(self, data, location, covariance):
    """ """
    det = fast_logdet(covariance)
    return det
def c_step(X, h, objective_function, initial_estimates, verbose=False,
           cov_computation_method=empirical_covariance):
    """C_step procedure described in [1] aiming at computing the MCD

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)
        Data set in which we look for the h observations whose scatter
        matrix has minimum determinant

    h: int, > n_samples / 2
        Number of observations to compute the robust estimates of location
        and covariance from.

    objective_function: callable
        Function of (data, location, covariance) minimized at each
        iteration.

    initial_estimates: 2-tuple
        Initial estimates of location and shape from which to run the
        c_step procedure:
        - initial_estimates[0]: an initial location estimate
        - initial_estimates[1]: an initial covariance estimate

    verbose: boolean
        Verbose mode

    Returns
    -------
    location: array-like, shape (n_features,)
        Robust location estimates

    covariance: array-like, shape (n_features, n_features)
        Robust covariance estimates

    support: array-like, shape (n_samples,)
        A mask for the `h` observations whose scatter matrix has minimum
        determinant

    Notes
    -----
    At most 30 iterations are performed.  According to Rousseeuw [1], two
    iterations are sufficient to get close to the minimum, and we never need
    more than 30 to reach convergence.

    References:
    [1] A Fast Algorithm for the Minimum Covariance Determinant Estimator,
        1999, American Statistical Association and the American Society
        for Quality, TECHNOMETRICS
    """
    n_samples, n_features = X.shape
    n_iter = 30
    remaining_iterations = 30

    # Get initial robust estimates from the function parameters
    location = initial_estimates[0]
    covariance = initial_estimates[1]
    # run a special iteration for that case (to get an initial support)
    precision = pinvh(covariance)
    X_centered = X - location
    dist = (np.dot(X_centered, precision) * X_centered).sum(1)
    # compute new estimates
    support = np.zeros(n_samples).astype(bool)
    support[np.argsort(dist)[:h]] = True
    location = X[support].mean(0)
    covariance = cov_computation_method(X[support])

    previous_obj = np.inf

    # Iterative procedure for Minimum Covariance Determinant computation
    obj = objective_function(X[support], location, covariance)
    while (obj < previous_obj) and (remaining_iterations > 0):
        # save old estimates values
        previous_location = location
        previous_covariance = covariance
        previous_obj = obj
        previous_support = support
        # compute a new support from the full data set mahalanobis distances
        precision = pinvh(covariance)
        X_centered = X - location
        dist = (np.dot(X_centered, precision) * X_centered).sum(1)
        # compute new estimates
        support = np.zeros(n_samples).astype(bool)
        support[np.argsort(dist)[:h]] = True
        location = X[support].mean(axis=0)
        covariance = cov_computation_method(X[support])
        obj = objective_function(X[support], location, covariance)
        # update remaining iterations for early stopping
        remaining_iterations -= 1

    # Catch computation errors
    if np.isinf(obj):
        raise ValueError(
            "Singular covariance matrix. "
            "Please check that the covariance matrix corresponding "
            "to the dataset is full rank and that MCD is used with "
            "Gaussian-distributed data (or at least data drawn from a "
            "unimodal, symmetric distribution.")

    # Check convergence
    if np.allclose(obj, previous_obj):
        # c_step procedure converged
        if verbose:
            print("Optimal couple (location, covariance) found before "
                  "ending iterations (%d left)" % remaining_iterations)
        results = location, covariance, obj, support
    elif obj > previous_obj:
        # objective function has increased (should not happen)
        current_iter = n_iter - remaining_iterations
        warnings.warn("Warning! obj > previous_obj (%.15f > %.15f, iter=%d)"
                      % (obj, previous_obj, current_iter), RuntimeWarning)
        results = previous_location, previous_covariance, \
            previous_obj, previous_support

    # Check early stopping
    if remaining_iterations == 0:
        if verbose:
            print('Maximum number of iterations reached')
        obj = fast_logdet(covariance)
        results = location, covariance, obj, support

    return results
def _lca(self, X, max_iter=100, regularization=0, tol=1e-10):
    """ """
    n_samples, n_features = X.shape
    if regularization == np.inf:
        # Use identity matrix if Ledoit-Wolf shrinkage == 1
        print("/!\\ use identity matrix")
        coeff = np.trace(empirical_covariance(X)) / float(n_features)
        self.cov_ = coeff * np.eye(n_features)
        prec_ = self.cov_
        # learn the kernel
        dist = np.zeros((n_samples, self.support.shape[0]))
        for i, x in enumerate(X):
            for j, t in enumerate(self.support):
                dist[i, j] = distance.mahalanobis(x, t, prec_)
        self.kernel = np.exp(-.5 * dist)
        # decompose the kernel
        U, D, V = linalg.svd(self.kernel)
        self.U = U
        self.D = D
        return self.cov_

    # LCA algorithm starts
    cov_gauss = empirical_covariance(X)
    cov_gauss.flat[::n_features + 1] += regularization

    # EM loop
    # The last iteration is there to compute the final log-likelihood
    mean_loglike = -np.inf
    for l in range(max_iter + 1):
        xax = np.dot(X, np.dot(linalg.pinv(cov_gauss), X.T))
        dxax = np.diag(xax).reshape((-1, 1))
        logK = -.5 * (dxax + dxax.T - 2. * xax)
        # each datapoint cannot use itself
        logK.flat[::n_samples + 1] = -np.inf
        K = np.exp(logK)
        loglik1 = -.5 * fast_logdet(cov_gauss)
        loglik2 = np.log(np.sum(K)) - np.log(n_samples - 1)
        loglik3 = -n_features / (2. * np.log(2. * np.pi))
        loglike = loglik1 + loglik2 + loglik3
        old_mean_loglike = mean_loglike
        mean_loglike = np.mean(loglike)
        if self.verbose:
            print("\tIteration %d, loglike = %g" % (l, mean_loglike))
        if l < max_iter:
            if mean_loglike - old_mean_loglike < tol:
                # Convergence reached (iteration l)
                break
            # row-normalize the responsibilities
            B = K / np.sum(K, 1)
            Bsum = np.sum(B, 0) + np.sum(B, 1)
            cov_gauss = np.dot(X.T, np.dot(np.diag(Bsum) - B - B.T, X)) \
                / float(n_samples)
            cov_gauss.flat[::n_features + 1] += regularization

    self.responsibilities = K
    self.cov_ = cov_gauss

    # learn the kernel for further decision/prediction
    prec_ = linalg.pinv(self.cov_)
    dist = np.zeros((n_samples, self.support.shape[0]))
    for i, x in enumerate(X):
        for j, t in enumerate(self.support):
            dist[i, j] = distance.mahalanobis(x, t, prec_)
    self.kernel = np.exp(-.5 * dist)
    # decompose the kernel
    U, D, V = linalg.svd(self.kernel)
    self.U = U
    self.D = D

    return cov_gauss
def log_likelihood_t(emp_cov, precision):
    """Gaussian log-likelihood without constant term, in time."""
    score = 0
    for e, p in zip(emp_cov, precision):
        score += fast_logdet(p) - np.sum(e * p)
    return score
def group_sparse_scores(precisions, n_samples, emp_covs, alpha,
                        duality_gap=False, debug=False):
    """Compute scores used by group_sparse_covariance.

    The log-likelihood of a given list of empirical covariances /
    precisions.

    Parameters
    ----------
    precisions : numpy.ndarray, shape (n_features, n_features, n_subjects)
        estimated precisions.

    n_samples : array-like, shape (n_subjects,)
        number of samples used in estimating each subject in "precisions".
        n_samples.sum() must be equal to 1.

    emp_covs : numpy.ndarray, shape (n_features, n_features, n_subjects)
        empirical covariance matrix

    alpha : float
        regularization parameter

    duality_gap : bool, optional
        if True, also returns a duality gap upper bound.

    debug : bool, optional
        if True, some consistency checks are performed to help solve
        numerical problems.

    Returns
    -------
    log_lik : float
        log-likelihood of precisions on the given covariances. This is the
        opposite of the loss function, without the regularization term

    objective : float
        value of objective function. This is the value minimized by
        group_sparse_covariance()

    duality_gap : float
        duality gap upper bound. The returned bound is tight: it vanishes
        for the optimal precision matrices
    """
    n_features, _, n_subjects = emp_covs.shape
    log_lik = 0
    for k in range(n_subjects):
        log_lik_k = -np.sum(emp_covs[..., k] * precisions[..., k])
        log_lik_k += fast_logdet(precisions[..., k])
        log_lik += n_samples[k] * log_lik_k

    l2 = np.sqrt((precisions ** 2).sum(axis=-1))
    l12 = l2.sum() - np.diag(l2).sum()   # Do not count diagonal terms
    objective = alpha * l12 - log_lik
    ret = (log_lik, objective)

    # Compute duality gap if requested
    if duality_gap is True:
        A = np.empty(precisions.shape, dtype=float, order="F")
        for k in range(n_subjects):
            # TODO: can be computed more efficiently using W_inv. See
            # Friedman, Jerome, Trevor Hastie, and Robert Tibshirani.
            # 'Sparse Inverse Covariance Estimation with the Graphical
            # Lasso'. Biostatistics 9, no. 3 (1 July 2008): 432-441.
            precisions_inv = scipy.linalg.inv(precisions[..., k])
            if debug:
                assert is_spd(precisions_inv)
            A[..., k] = n_samples[k] * (precisions_inv - emp_covs[..., k])
            if debug:
                np.testing.assert_almost_equal(A[..., k], A[..., k].T)

        # Project A on the set of feasible points
        alpha_max = np.sqrt((A ** 2).sum(axis=-1))
        mask = alpha_max > alpha
        for k in range(A.shape[-1]):
            A[mask, k] *= alpha / alpha_max[mask]
            # Set zeros on diagonals. Essential to get an always positive
            # duality gap.
            A[..., k].flat[::A.shape[0] + 1] = 0
        alpha_max = np.sqrt((A ** 2).sum(axis=-1)).max()

        dual_obj = 0  # dual objective
        for k in range(n_subjects):
            B = emp_covs[..., k] + A[..., k] / n_samples[k]
            dual_obj += n_samples[k] * (n_features + fast_logdet(B))

        # The previous computation can lead to a non-feasible point, because
        # one of the Bs may not be positive definite.
        # Use another value in this case, that ensures positive definiteness
        # of B. The upper bound on the duality gap is not tight in the
        # following, but is smaller than infinity, which is better in any
        # case.
        if not np.isfinite(dual_obj):
            for k in range(n_subjects):
                A[..., k] = -n_samples[k] * emp_covs[..., k]
                A[..., k].flat[::A.shape[0] + 1] = 0
            alpha_max = np.sqrt((A ** 2).sum(axis=-1)).max()
            # the second value (0.05) is arbitrary: positive in ]0, 1[
            gamma = min((alpha / alpha_max, 0.05))
            dual_obj = 0
            for k in range(n_subjects):
                # add gamma on the diagonal
                B = ((1. - gamma) * emp_covs[..., k]
                     + gamma * np.eye(emp_covs.shape[0]))
                dual_obj += n_samples[k] * (n_features + fast_logdet(B))

        gap = objective - dual_obj
        ret = ret + (gap,)
    return ret