def objective_kernel(theta, K, psi, kernel, times):
    """Kernel-weighted temporal penalty of ``K`` for a kernel parameter.

    Parameters
    ----------
    theta : float
        Kernel parameter; tried first as ``length_scale`` and, if the kernel
        class rejects that keyword, as ``constant_value``.
    K : ndarray
        Stack of matrices indexed by time along the first axis.
    psi : str or callable
        Penalty specification resolved through ``check_norm_prox``.
    kernel : callable
        Kernel class (not instance); it is instantiated with ``theta`` and
        then evaluated on ``times``.
    times : ndarray
        Points on which the kernel matrix is evaluated.

    Returns
    -------
    float
        Sum over every lag ``m >= 1`` of ``psi(K[t + m] - K[t])`` weighted by
        the ``m``-th super-diagonal of the kernel matrix.
    """
    penalty, _, _ = check_norm_prox(psi)

    try:
        # ExpSineSquared and RBF kernels are parameterised by a length scale
        gram = kernel(length_scale=theta)(times)
    except TypeError:
        # otherwise assume a ConstantKernel-like parameterisation
        gram = kernel(constant_value=theta)(times)

    n_times = K.shape[0]
    total = 0
    for lag in range(1, n_times):
        # penalty of every pair of matrices that are `lag` steps apart
        pair_penalties = np.array(
            [penalty(diff) for diff in K[lag:] - K[:-lag]])
        total += np.sum(pair_penalties * np.diag(gram, lag))
    return total
def _fit(self, emp_cov, n_samples):
    """Fit the kernel latent time graphical lasso on empirical covariances.

    Two regimes are supported:

    * ``self.kernel_psi is None``: the temporal kernel is discovered with an
      EM-like loop.  Each outer iteration clusters the time points on the
      pairwise similarity of the current precision matrices (E step) and
      then solves the graphical lasso with the kernel induced by the cluster
      labels (M step), warm-starting from the previous precision matrices.
      The loop stops when the labels stop changing (normalised difference
      below ``self.eps``) or after ``self.max_iter_ext`` iterations, in
      which case a warning is issued.
    * otherwise: ``self.kernel_phi`` and ``self.kernel_psi`` are used as
      given.  Callables are instantiated with ``self.ker_phi_param`` /
      ``self.ker_psi_param`` and evaluated on ``self.classes_``.

    Parameters
    ----------
    emp_cov : ndarray
        Empirical covariances; the first axis indexes the time points.
    n_samples : ndarray
        Forwarded unchanged to ``kernel_latent_time_graphical_lasso``.

    Returns
    -------
    self
        With ``precision_``, ``latent_``, ``covariance_``, ``n_iter_`` (and
        ``history_`` when ``self.return_history``) set.  The EM regime also
        sets ``similarity_matrix_``.

    Raises
    ------
    ValueError
        If a provided kernel does not have one row per class.
    """

    def _solve(kernel_phi, kernel_psi, init):
        """Run the solver once and store its outputs on the estimator."""
        out = kernel_latent_time_graphical_lasso(
            emp_cov,
            alpha=self.alpha,
            tau=self.tau,
            rho=self.rho,
            kernel_phi=kernel_phi,
            kernel_psi=kernel_psi,
            n_samples=n_samples,
            tol=self.tol,
            rtol=self.rtol,
            psi=self.psi,
            max_iter=self.max_iter,
            verbose=self.verbose,
            return_n_iter=True,
            return_history=self.return_history,
            update_rho_options=self.update_rho_options,
            compute_objective=self.compute_objective,
            init=init,
        )
        if self.return_history:
            (self.precision_, self.latent_, self.covariance_,
             self.history_, self.n_iter_) = out
        else:
            (self.precision_, self.latent_, self.covariance_,
             self.n_iter_) = out

    if self.kernel_psi is None:
        n_times = emp_cov.shape[0]
        if self.kernel_phi is None or callable(self.kernel_phi):
            # mimic LTGL: identity with `eta` on the first off-diagonals
            kernel_phi = np.eye(n_times)
            np.fill_diagonal(kernel_phi[:, 1:], self.eta)
            np.fill_diagonal(kernel_phi[1:], self.eta)
        else:
            kernel_phi = self.kernel_phi

        # discover best kernel parameter via EM
        # initialise precision matrices, as warm start
        self.precision_ = init_precision(emp_cov, mode=self.init)
        self.latent_ = np.zeros_like(self.precision_)

        kernel_psi = np.eye(n_times)
        psi, _, _ = check_norm_prox(self.psi)
        if self.n_clusters is None:
            self.n_clusters = n_times

        for i in range(self.max_iter_ext):
            # E step - discover best kernel
            theta = precision_similarity(self.get_precision(), psi)
            kernel_psi = theta
            labels_pred = AgglomerativeClustering(
                n_clusters=self.n_clusters, affinity="precomputed",
                linkage="complete").fit_predict(kernel_psi)
            # `labels_pred_old` is only read from the second iteration on
            if i > 0 and np.linalg.norm(
                    labels_pred - labels_pred_old
            ) / labels_pred.size < self.eps:
                break
            kernel_psi = kernels.RBF(0.0001)(
                labels_pred[:, None]) + kernels.RBF(self.beta)(
                    np.arange(n_times)[:, None])

            # M step - fix the kernel matrix.  The kernel_phi built above is
            # passed on (the original forwarded self.kernel_phi, which
            # discarded the LTGL-like kernel and therefore `eta`).
            _solve(kernel_phi, kernel_psi, self.precision_)
            labels_pred_old = labels_pred
        else:
            warnings.warn("theta did not converge.")
        self.similarity_matrix_ = kernel_psi
    else:
        if callable(self.kernel_phi):
            try:
                # this works if it is a ExpSineSquared or RBF kernel
                kernel_phi = self.kernel_phi(
                    length_scale=self.ker_phi_param)(self.classes_[:, None])
            except TypeError:
                # maybe it's a ConstantKernel
                kernel_phi = self.kernel_phi(
                    constant_value=self.ker_phi_param)(
                        self.classes_[:, None])
        else:
            kernel_phi = self.kernel_phi
            if kernel_phi.shape[0] != self.classes_.size:
                raise ValueError(
                    "kernel_phi size does not match classes of samples, "
                    "got {} classes and kernel_phi has shape {}".format(
                        self.classes_.size, kernel_phi.shape[0]))

        if callable(self.kernel_psi):
            try:
                # this works if it is a ExpSineSquared kernel
                kernel_psi = self.kernel_psi(
                    length_scale=self.ker_psi_param)(self.classes_[:, None])
            except TypeError:
                # maybe it's a ConstantKernel
                kernel_psi = self.kernel_psi(
                    constant_value=self.ker_psi_param)(
                        self.classes_[:, None])
        else:
            kernel_psi = self.kernel_psi
            if kernel_psi.shape[0] != self.classes_.size:
                raise ValueError(
                    "kernel_psi size does not match classes of samples, "
                    "got {} classes and kernel_psi has shape {}".format(
                        self.classes_.size, kernel_psi.shape[0]))

        _solve(kernel_phi, kernel_psi, self.init)
    return self
def _fit_time_poisson_model(
    X,
    alpha=0.01,
    rho=1,
    kernel=None,
    max_iter=100,
    verbose=False,
    psi="laplacian",
    gamma=0.1,
    tol=1e-4,
    rtol=1e-4,
    return_history=False,
    return_n_iter=True,
    compute_objective=True,
    stop_at=None,
    stop_when=1e-4,
    n_cores=-1,
    update_rho_options=None,
    init=None,
):
    """Time-varying graphical model solver.

    Solves the following problem via ADMM:
        min sum_{i=1}^T -n_i log_likelihood(K_i, X_i) + alpha ||K_i||_{od,1}
            + sum_{s>t}^T k(s,t) Psi(K_s - K_t)

    where X is a matrix n_i x D, the observations at time i and the
    log-likelihood changes according to the distribution.

    Parameters
    ----------
    X : ndarray, shape (n_times, n_samples, n_features)
        Data matrix.
        NOTE(review): the previous documentation stated the data must contain
        only two values (0/1 or -1/1); confirm whether that restriction
        applies to this model.
    alpha : float, optional
        Regularisation parameter.
    rho : float, optional
        Augmented Lagrangian parameter.
    kernel : ndarray, shape (n_times, n_times), optional
        Temporal kernel; its ``m``-th super-diagonal weights the penalty
        between matrices ``m`` steps apart.  Defaults to the identity.
    max_iter : int, optional
        Maximum number of iterations.
    verbose : bool or int, optional
        Print diagnostics at each iteration; ``verbose - 1`` is forwarded to
        the per-variable fits.
    psi : str or callable, optional
        Temporal penalty, resolved through ``check_norm_prox``.
    gamma, n_cores : optional
        Accepted but not used by this function.
    tol : float, optional
        Absolute tolerance for convergence.
    rtol : float, optional
        Relative tolerance for convergence.
    return_history : bool, optional
        Return the history of computed values.
    return_n_iter : bool, optional
        Return the number of iterations performed.
    compute_objective : bool, optional
        Compute the objective at each iteration (``nan`` otherwise).
    stop_at, stop_when : float, optional
        If ``stop_at`` is given, stop as soon as the relative distance of the
        objective from ``stop_at`` falls below ``stop_when``.
    update_rho_options : dict, optional
        Keyword arguments forwarded to ``update_rho``.
    init : ndarray, shape (n_times, n_features, n_features), optional
        Warm start for the precision matrices.  Any other value (including
        None, or an array of a different shape) starts from zeros.

    Returns
    -------
    K : ndarray, shape (n_times, n_features, n_features)
        Solution to the problem.
    history : list
        If return_history, then also a structure that contains the
        objective value, the primal and dual residual norms, and tolerances
        for the primal and dual residual norms at each iteration.
    n_iter : int
        If return_n_iter, the number of iterations performed.
    """
    psi, prox_psi, psi_node_penalty = check_norm_prox(psi)
    n_times, _, n_features = X.shape

    if kernel is None:
        kernel = np.eye(n_times)

    K = np.zeros((n_times, n_features, n_features))
    if isinstance(init, np.ndarray) and init.shape == K.shape:
        # copy: K is overwritten in place below and must not alias the input
        K = np.array(init, dtype=float)

    Z_M = {}
    U_M = {}
    Z_M_old = {}
    for m in range(1, n_times):
        # all possible non markovians jumps
        Z_L = K.copy()[:-m]
        Z_R = K.copy()[m:]
        Z_M[m] = (Z_L, Z_R)

        U_L = np.zeros_like(Z_L)
        U_R = np.zeros_like(Z_R)
        U_M[m] = (U_L, U_R)

        Z_L_old = np.zeros_like(Z_L)
        Z_R_old = np.zeros_like(Z_R)
        Z_M_old[m] = (Z_L_old, Z_R_old)

    checks = [convergence(obj=objective(X, K, Z_M, alpha, kernel, psi))]
    inner_verbose = max(0, verbose - 1)
    for iteration_ in range(max_iter):
        # update K
        A = np.zeros_like(K)
        for m in range(1, n_times):
            A[:-m] += Z_M[m][0] - U_M[m][0]
            A[m:] += Z_M[m][1] - U_M[m][1]

        A /= n_times
        # symmetrise
        A += A.transpose(0, 2, 1)
        A /= 2.0

        for t in range(n_times):
            thetas_pred = []
            for v in range(n_features):
                res = fit_each_variable(
                    X[t, :, :], v, alpha, tol=tol, verbose=inner_verbose,
                    A=A[t, :, :], T=n_times, rho=rho)
                thetas_pred.append(res[0])
            K[t, :, :] = build_adjacency_matrix(thetas_pred, "union")

        # other Zs
        for m in range(1, n_times):
            U_L, U_R = U_M[m]
            A_L = K[:-m] + U_L
            A_R = K[m:] + U_R
            if not psi_node_penalty:
                prox_e = prox_psi(
                    A_R - A_L,
                    lamda=2.0 * np.diag(kernel, m)[:, None, None] / rho)
                Z_L = 0.5 * (A_L + A_R - prox_e)
                Z_R = 0.5 * (A_L + A_R + prox_e)
            else:
                Z_L, Z_R = prox_psi(
                    np.concatenate((A_L, A_R), axis=1),
                    lamda=0.5 * np.diag(kernel, m)[:, None, None] / rho,
                    rho=rho,
                    tol=tol,
                    rtol=rtol,
                    max_iter=max_iter,
                )
            Z_M[m] = (Z_L, Z_R)

            # update other residuals (in place: U_L, U_R alias U_M[m])
            U_L += K[:-m] - Z_L
            U_R += K[m:] - Z_R

        # diagnostics, reporting, termination checks
        rnorm = np.sqrt(
            sum(
                squared_norm(K[:-m] - Z_M[m][0])
                + squared_norm(K[m:] - Z_M[m][1])
                for m in range(1, n_times)))

        snorm = rho * np.sqrt(
            sum(
                squared_norm(Z_M[m][0] - Z_M_old[m][0])
                + squared_norm(Z_M[m][1] - Z_M_old[m][1])
                for m in range(1, n_times)))

        obj = objective(
            X, K, Z_M, alpha, kernel, psi) if compute_objective else np.nan

        check = convergence(
            obj=obj,
            rnorm=rnorm,
            snorm=snorm,
            e_pri=n_features * n_times * tol + rtol * max(
                np.sqrt(
                    sum(
                        squared_norm(Z_M[m][0]) + squared_norm(Z_M[m][1])
                        for m in range(1, n_times))),
                np.sqrt(
                    squared_norm(K) + sum(
                        squared_norm(K[:-m]) + squared_norm(K[m:])
                        for m in range(1, n_times))),
            ),
            e_dual=n_features * n_times * tol + rtol * rho * np.sqrt(
                sum(
                    squared_norm(U_M[m][0]) + squared_norm(U_M[m][1])
                    for m in range(1, n_times))),
        )
        for m in range(1, n_times):
            Z_M_old[m] = (Z_M[m][0].copy(), Z_M[m][1].copy())

        if verbose:
            print("obj: %.4f, rnorm: %.4f, snorm: %.4f,"
                  "eps_pri: %.4f, eps_dual: %.4f" % check[:5])

        checks.append(check)
        if stop_at is not None:
            if abs(check.obj - stop_at) / abs(stop_at) < stop_when:
                break

        if check.rnorm <= check.e_pri and check.snorm <= check.e_dual:
            break

        rho_new = update_rho(
            rho, rnorm, snorm, iteration=iteration_,
            **(update_rho_options or {}))
        # scaled dual variables should be also rescaled
        for m in range(1, n_times):
            U_L, U_R = U_M[m]
            U_L *= rho / rho_new
            U_R *= rho / rho_new
        rho = rho_new
    else:
        warnings.warn("Objective did not converge.")

    return_list = [K]
    if return_history:
        return_list.append(checks)
    if return_n_iter:
        return_list.append(iteration_ + 1)
    return return_list
def fit(self, X, y):
    """Fit the temporal model, optionally learning the temporal kernel.

    Samples are grouped by the label in ``y`` (one group per time point /
    class).  When ``self.kernel`` is None the kernel is discovered with an
    EM-like loop: the time points are clustered on the similarity of the
    current precision matrices, the kernel induced by the cluster labels is
    built, and the model is refitted, until the labels stop changing
    (normalised difference below ``self.eps``) or ``self.max_iter_ext``
    iterations have run.  Otherwise the provided kernel is used for a single
    fit.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Data; must contain exactly two distinct values.
    y : array-like, shape (n_samples,)
        Class (time point) of each sample.

    Returns
    -------
    self
        With ``classes_``, ``data``, ``precision_`` and ``n_iter_`` set
        (plus ``history_`` when ``self.return_history`` and
        ``similarity_matrix_`` when the kernel is learnt).

    Raises
    ------
    ValueError
        If the data does not contain exactly two distinct values, or the
        provided kernel does not have one row per class.
    """
    X, y = check_X_y(X, y, accept_sparse=False, dtype=np.float64,
                     order="C", ensure_min_features=2, estimator=self)

    self.classes_, n_samples = np.unique(y, return_counts=True)
    self.data = X.copy()
    if np.unique(self.data).size != 2:
        raise ValueError("Using the ising distribution your data has "
                         "to contain only two values, either 0 and 1 "
                         "or -1, 1")
    # NOTE(review): this stacks one group per class, so it presumably
    # requires every class to have the same number of samples.
    X = np.array([X[y == cl] for cl in self.classes_])

    def _solve(kernel, init):
        """Run the solver once and store its outputs on the estimator."""
        # NOTE(review): the solver must accept the `update_rho_options` and
        # `init` keywords for this call to succeed.
        out = _fit_time_poisson_model(
            X,
            alpha=self.alpha,
            rho=self.rho,
            kernel=kernel,
            tol=self.tol,
            rtol=self.rtol,
            psi=self.psi,
            max_iter=self.max_iter,
            verbose=self.verbose,
            return_n_iter=True,
            return_history=self.return_history,
            update_rho_options=self.update_rho_options,
            compute_objective=self.compute_objective,
            init=init,
        )
        if self.return_history:
            (self.precision_, self.history_, self.n_iter_) = out
        else:
            self.precision_, self.n_iter_ = out

    if self.kernel is None:
        # discover best kernel parameter via EM
        # initialise precision matrices, as warm start
        self.precision_ = np.random.rand(X.shape[0], X.shape[0])
        n_times = self.precision_.shape[0]
        kernel = np.eye(n_times)
        psi, _, _ = check_norm_prox(self.psi)
        if self.n_clusters is None:
            self.n_clusters = n_times

        labels_pred_old = 0
        for i in range(self.max_iter_ext):
            # E step - cluster time points on precision similarity
            theta = precision_similarity(self.precision_, psi)
            kernel = theta
            labels_pred = AgglomerativeClustering(
                n_clusters=self.n_clusters, affinity="precomputed",
                linkage="complete").fit_predict(kernel)
            if i > 0 and np.linalg.norm(
                    labels_pred - labels_pred_old
            ) / labels_pred.size < self.eps:
                break
            kernel = kernels.RBF(0.0001)(
                labels_pred[:, None]) + kernels.RBF(self.beta)(
                    np.arange(n_times)[:, None])

            # M step - refit with the kernel fixed
            _solve(kernel, self.precision_)
            labels_pred_old = labels_pred
        else:
            warnings.warn("theta did not converge.")
        self.similarity_matrix_ = kernel
    else:
        kernel = self.kernel
        if kernel.shape[0] != self.classes_.size:
            raise ValueError(
                "Kernel size does not match classes of samples, "
                "got {} classes and kernel has shape {}".format(
                    self.classes_.size, kernel.shape[0]))

        # `precision_` only exists after a previous fit; a fresh estimator
        # must not fail with AttributeError here.
        _solve(kernel, getattr(self, "precision_", None))

    return self
def gradient_equal_time_graphical_lasso(S, K_init, max_iter, loss, C, theta,
                                        rho, mult, weights, m, eps, psi,
                                        gamma, tol, rtol, verbose,
                                        return_history, return_n_iter, mode,
                                        compute_objective, stop_at, stop_when,
                                        update_rho_options):
    """Equality constrained time-varying graphical LASSO, gradient solver.

    Targets the problem
        min theta * sum_{i=1}^T ||K_i||_{od,1}
            + (1 - theta) * sum_{i=2}^T Psi(K_i - K_{i-1})
        s.t. loss(S_i, K_i) = C_i for i = 1, ..., T

    with a fixed number of proximal-gradient-like steps.  At each iteration
    the two step sizes (constraint and penalty) are chosen by a Nelder-Mead
    search that minimises the squared constraint residual plus the normalised
    squared change introduced by the soft-thresholding.

    Parameters
    ----------
    S : ndarray, shape (T, d, d)
        Empirical covariance for each time point.
    K_init : ndarray, shape (T, d, d)
        Initial precision matrices.
    max_iter : int
        Number of iterations; there is no early stopping.
    loss : str
        ``'LL'`` selects ``neg_logl``; anything else selects ``dtrace``.
    C : ndarray, shape (T,)
        Target loss levels, subtracted from the loss to form the residuals.
    theta : float
        Trade-off between the l1 term (``theta``) and the temporal term
        (``1 - theta``).
    psi : str or callable
        Temporal penalty used for the reported objective, resolved through
        ``check_norm_prox``.  The gradient step itself always uses
        ``grad_laplacian``.
    return_history : bool
        Also return the list of convergence records (only the initial
        objective is recorded by this solver).
    return_n_iter : bool
        Also return the number of iterations performed.
    rho, mult, weights, m, eps, gamma, tol, rtol, verbose, mode, \
compute_objective, stop_at, stop_when, update_rho_options
        Not used by this solver.
        NOTE(review): presumably kept so the signature matches the sibling
        solvers; confirm against the callers.

    Returns
    -------
    K : ndarray, shape (T, d, d)
        Solution to the problem for each time t=1...T.
    covariance : ndarray, shape (T, d, d)
        Pseudo-inverse of each ``K_t``.
    history : list
        If return_history.
    n_iter : int
        If return_n_iter.
    """
    psi, prox_psi, psi_node_penalty = check_norm_prox(psi)

    if loss == 'LL':
        loss_func = neg_logl
    else:
        loss_func = dtrace

    I = np.eye(S.shape[1])

    Z_0 = K_init
    out_obj = []

    # Same argument convention as the per-iteration objective below; the
    # original passed (Z_0, Z_0, Z_0) here.
    checks = [
        convergence(
            obj=penalty_objective(Z_0, Z_0[:-1], Z_0[1:], psi, theta))
    ]

    def _Z_0(x1, x2, Z_0, loss_res, nabla_con, nabla_pen):
        """Take one step of sizes (x1, x2); return (thresholded, raw)."""
        A = Z_0 - x2 * (1 - theta) * nabla_pen
        A -= x1 * loss_res[:, None, None] * nabla_con
        return soft_thresholding_od(A, lamda=x2 * theta), A

    # constrained optimisation via line search
    def _f(x, step, Z_0, loss_res, nabla_con, nabla_pen, loss_func, S, C):
        """Line-search criterion for the step sizes ``x``."""
        Z_new, A = step(x[0], x[1], Z_0, loss_res, nabla_con, nabla_pen)
        residual = loss_gen(loss_func, S, Z_new) - C
        return squared_norm(residual) + squared_norm(Z_new - A) / (
            S.shape[1] * S.shape[2])

    loss_res = loss_gen(loss_func, S, Z_0) - C

    # Plain loop: the original `for ... else` had no `break`, so its
    # "did not converge" warning fired on every call.
    for iteration_ in range(max_iter):
        # gradient of the constraint loss w.r.t. each precision matrix
        if loss_func.__name__ == 'neg_logl':
            nabla_con = np.array(
                [S_t - np.linalg.inv(A_t) for (S_t, A_t) in zip(S, Z_0)])
        elif loss_func.__name__ == 'dtrace':
            nabla_con = np.array(
                [(2 * A_t @ S_t - I) for (S_t, A_t) in zip(S, Z_0)])
        nabla_pen = grad_laplacian(Z_0)

        out = minimize(
            partial(_f, step=_Z_0, Z_0=Z_0, loss_res=loss_res,
                    nabla_con=nabla_con, nabla_pen=nabla_pen,
                    loss_func=loss_func, S=S, C=C),
            x0=np.zeros(2), method='Nelder-Mead', tol=1e-4)
        Z_0, _ = _Z_0(out.x[0], out.x[1], Z_0, loss_res, nabla_con,
                      nabla_pen)

        loss_res = loss_gen(loss_func, S, Z_0) - C

        out_obj.append(
            penalty_objective(Z_0, Z_0[:-1], Z_0[1:], psi, theta))
        if not iteration_ % 100:
            print(iteration_)
            print(np.max(loss_res), np.mean(loss_res))
            print(out_obj[-1])

    # One objective is recorded per iteration, so this also covers
    # max_iter == 0, where `iteration_` would be unbound.
    n_iter = len(out_obj)
    if out_obj:
        print(n_iter - 1, out_obj[-1])

    covariance_ = np.array([linalg.pinvh(x) for x in Z_0])
    return_list = [Z_0, covariance_]
    if return_history:
        return_list.append(checks)
    if return_n_iter:
        return_list.append(n_iter)
    return return_list
def latent_time_matrix_decomposition(emp_cov, alpha=0.01, tau=1., rho=1.,
                                     beta=1., eta=1., max_iter=100,
                                     verbose=False, psi='laplacian',
                                     phi='laplacian', mode='admm', tol=1e-4,
                                     rtol=1e-4, assume_centered=False,
                                     return_history=False, return_n_iter=True,
                                     update_rho_options=None,
                                     compute_objective=True):
    r"""Latent variable time-varying matrix decomposition solver.

    Solves the following problem via ADMM:
        min sum_{i=1}^T || S_i-(K_i-L_i)||^2 + alpha ||K_i||_{od,1}
            + tau ||L_i||_*
            + beta sum_{i=2}^T Psi(K_i - K_{i-1})
            + eta sum_{i=2}^T Phi(L_i - L_{i-1})

    where S is the matrix to decompose.

    Parameters
    ----------
    emp_cov : ndarray, shape (n_times, n_features, n_features)
        Matrices to decompose, one per time point.
    alpha, tau, beta, eta : float, optional
        Regularisation parameters.
    rho : float, optional
        Augmented Lagrangian parameter.
    max_iter : int, optional
        Maximum number of iterations.
    verbose : bool, optional
        Print diagnostics at each iteration.
    psi, phi : str or callable, optional
        Temporal penalties on K and L, resolved through ``check_norm_prox``.
    mode, assume_centered : optional
        Accepted but not used by this function.
    tol : float, optional
        Absolute tolerance for convergence.
    rtol : float, optional
        Relative tolerance for convergence.
    return_history : bool, optional
        Return the history of computed values.
    return_n_iter : bool, optional
        Return the number of iterations performed.
    update_rho_options : dict, optional
        Keyword arguments forwarded to ``update_rho``.
    compute_objective : bool, optional
        Compute the objective at each iteration (``nan`` otherwise).

    Returns
    -------
    K, L : numpy.array, 3-dimensional (T x d x d)
        Solution to the problem for each time t=1...T .
    history : list
        If return_history, then also a structure that contains the
        objective value, the primal and dual residual norms, and tolerances
        for the primal and dual residual norms at each iteration.
    n_iter : int
        If return_n_iter, the number of iterations performed.
    """
    psi, prox_psi, psi_node_penalty = check_norm_prox(psi)
    phi, prox_phi, phi_node_penalty = check_norm_prox(phi)

    Z_0 = np.zeros_like(emp_cov)
    Z_1 = np.zeros_like(Z_0)[:-1]
    Z_2 = np.zeros_like(Z_0)[1:]
    W_0 = np.zeros_like(Z_0)
    W_1 = np.zeros_like(Z_1)
    W_2 = np.zeros_like(Z_2)

    X_0 = np.zeros_like(Z_0)
    X_1 = np.zeros_like(Z_1)
    X_2 = np.zeros_like(Z_2)
    U_1 = np.zeros_like(W_1)
    U_2 = np.zeros_like(W_2)

    R_old = np.zeros_like(Z_0)
    Z_1_old = np.zeros_like(Z_1)
    Z_2_old = np.zeros_like(Z_2)
    W_1_old = np.zeros_like(W_1)
    W_2_old = np.zeros_like(W_2)

    # divisor for consensus variables, accounting for two less matrices
    divisor = np.full(emp_cov.shape[0], 3, dtype=float)
    divisor[0] -= 1
    divisor[-1] -= 1

    checks = []
    for iteration_ in range(max_iter):
        # update R
        A = Z_0 - W_0 - X_0
        R = (rho * A + 2 * emp_cov) / (2 + rho)

        # update Z_0
        A = R + W_0 + X_0
        A[:-1] += Z_1 - X_1
        A[1:] += Z_2 - X_2
        A /= divisor[:, None, None]
        Z_0 = soft_thresholding(
            A, lamda=alpha / (rho * divisor[:, None, None]))

        # update Z_1, Z_2
        A_1 = Z_0[:-1] + X_1
        A_2 = Z_0[1:] + X_2
        if not psi_node_penalty:
            prox_e = prox_psi(A_2 - A_1, lamda=2. * beta / rho)
            Z_1 = .5 * (A_1 + A_2 - prox_e)
            Z_2 = .5 * (A_1 + A_2 + prox_e)
        else:
            Z_1, Z_2 = prox_psi(np.concatenate((A_1, A_2), axis=1),
                                lamda=.5 * beta / rho, rho=rho, tol=tol,
                                rtol=rtol, max_iter=max_iter)

        # update W_0
        A = Z_0 - R - X_0
        A[:-1] += W_1 - U_1
        A[1:] += W_2 - U_2
        A /= divisor[:, None, None]
        # symmetrise
        A += A.transpose(0, 2, 1)
        A /= 2.

        W_0 = np.array([
            prox_trace_indicator(a, lamda=tau / (rho * div))
            for a, div in zip(A, divisor)
        ])

        # update W_1, W_2
        A_1 = W_0[:-1] + U_1
        A_2 = W_0[1:] + U_2
        if not phi_node_penalty:
            prox_e = prox_phi(A_2 - A_1, lamda=2. * eta / rho)
            W_1 = .5 * (A_1 + A_2 - prox_e)
            W_2 = .5 * (A_1 + A_2 + prox_e)
        else:
            W_1, W_2 = prox_phi(np.concatenate((A_1, A_2), axis=1),
                                lamda=.5 * eta / rho, rho=rho, tol=tol,
                                rtol=rtol, max_iter=max_iter)

        # update residuals
        X_0 += R - Z_0 + W_0
        X_1 += Z_0[:-1] - Z_1
        X_2 += Z_0[1:] - Z_2
        U_1 += W_0[:-1] - W_1
        U_2 += W_0[1:] - W_2

        # diagnostics, reporting, termination checks
        rnorm = np.sqrt(
            squared_norm(R - Z_0 + W_0) + squared_norm(Z_0[:-1] - Z_1)
            + squared_norm(Z_0[1:] - Z_2) + squared_norm(W_0[:-1] - W_1)
            + squared_norm(W_0[1:] - W_2))

        snorm = rho * np.sqrt(
            squared_norm(R - R_old) + squared_norm(Z_1 - Z_1_old)
            + squared_norm(Z_2 - Z_2_old) + squared_norm(W_1 - W_1_old)
            + squared_norm(W_2 - W_2_old))

        obj = (objective(emp_cov, R, Z_0, Z_1, Z_2, W_0, W_1, W_2,
                         alpha, tau, beta, eta, psi, phi)
               if compute_objective else np.nan)

        check = convergence(
            obj=obj, rnorm=rnorm, snorm=snorm,
            e_pri=np.sqrt(R.size + 4 * Z_1.size) * tol + rtol * max(
                np.sqrt(
                    squared_norm(R) + squared_norm(Z_1) + squared_norm(Z_2)
                    + squared_norm(W_1) + squared_norm(W_2)),
                np.sqrt(
                    squared_norm(Z_0 - W_0) + squared_norm(Z_0[:-1])
                    + squared_norm(Z_0[1:]) + squared_norm(W_0[:-1])
                    + squared_norm(W_0[1:]))),
            e_dual=np.sqrt(R.size + 4 * Z_1.size) * tol + rtol * rho * (
                np.sqrt(
                    squared_norm(X_0) + squared_norm(X_1)
                    + squared_norm(X_2) + squared_norm(U_1)
                    + squared_norm(U_2))))

        R_old = R.copy()
        Z_1_old = Z_1.copy()
        Z_2_old = Z_2.copy()
        W_1_old = W_1.copy()
        W_2_old = W_2.copy()

        if verbose:
            # Only the first five fields are numeric diagnostics; formatting
            # the whole record fails if `convergence` carries extra fields.
            print("obj: %.4f, rnorm: %.4f, snorm: %.4f,"
                  "eps_pri: %.4f, eps_dual: %.4f" % check[:5])

        checks.append(check)
        if check.rnorm <= check.e_pri and check.snorm <= check.e_dual:
            break

        rho_new = update_rho(rho, rnorm, snorm, iteration=iteration_,
                             **(update_rho_options or {}))
        # scaled dual variables should be also rescaled
        X_0 *= rho / rho_new
        X_1 *= rho / rho_new
        X_2 *= rho / rho_new
        U_1 *= rho / rho_new
        U_2 *= rho / rho_new
        rho = rho_new
    else:
        warnings.warn("Objective did not converge.")

    return_list = [Z_0, W_0]
    if return_history:
        return_list.append(checks)
    if return_n_iter:
        # iteration_ is zero-based: report the count, as the sibling
        # solvers do.
        return_list.append(iteration_ + 1)
    return return_list
def time_graphical_lasso(
    emp_cov,
    alpha=0.01,
    rho=1,
    beta=1,
    max_iter=100,
    n_samples=None,
    verbose=False,
    psi="laplacian",
    tol=1e-4,
    rtol=1e-4,
    return_history=False,
    return_n_iter=True,
    mode="admm",
    compute_objective=True,
    stop_at=None,
    stop_when=1e-4,
    update_rho_options=None,
    init="empirical",
):
    """Time-varying graphical lasso solver.

    Solves the following problem via ADMM:
        min sum_{i=1}^T -n_i log_likelihood(S_i, K_i) + alpha*||K_i||_{od,1}
            + beta sum_{i=2}^T Psi(K_i - K_{i-1})

    where S_i = (1/n_i) X_i^T X_i is the empirical covariance of data
    matrix X (training observations by features).

    Parameters
    ----------
    emp_cov : ndarray, shape (n_times, n_features, n_features)
        Empirical covariance of data, one matrix per time point.
    alpha, beta : float, optional
        Regularisation parameter.
    rho : float, optional
        Augmented Lagrangian parameter.
    max_iter : int, optional
        Maximum number of iterations.
    n_samples : ndarray
        Number of samples available for each time point.  Defaults to one
        for every time point.
    verbose : bool, default False
        Print info at each iteration.
    psi : str or callable, optional
        Temporal penalty, resolved through ``check_norm_prox``.
    tol : float, optional
        Absolute tolerance for convergence.
    rtol : float, optional
        Relative tolerance for convergence.
    return_history : bool, optional
        Return the history of computed values.
    return_n_iter : bool, optional
        Return the number of iteration before convergence.
    mode : str, optional
        Not used by this function.
    compute_objective : bool, default True
        Choose to compute the objective value.
    stop_at, stop_when : float, optional
        If ``stop_at`` is given, stop as soon as the relative distance of the
        objective from ``stop_at`` falls below ``stop_when``.
    update_rho_options : dict, optional
        Arguments for the rho update.
        See regain.update_rules.update_rho function for more information.
    init : {'empirical', 'zero', ndarray}
        Choose how to initialize the precision matrix, with the inverse
        empirical covariance, zero matrix or precomputed.

    Returns
    -------
    K : numpy.array, 3-dimensional (T x d x d)
        Solution to the problem for each time t=1...T .
    covariance : numpy.array, 3-dimensional (T x d x d)
        Pseudo-inverse of each returned precision matrix.
    history : list
        If return_history, then also a structure that contains the
        objective value, the primal and dual residual norms, and tolerances
        for the primal and dual residual norms at each iteration.
    n_iter : int
        If return_n_iter, the number of iterations performed.

    """
    psi, prox_psi, psi_node_penalty = check_norm_prox(psi)

    # Z_0 is the sparse copy of K; Z_1 / Z_2 are the copies of K[:-1] / K[1:]
    # that enter the temporal penalty.
    Z_0 = init_precision(emp_cov, mode=init)
    Z_1 = Z_0.copy()[:-1]  # np.zeros_like(emp_cov)[:-1]
    Z_2 = Z_0.copy()[1:]  # np.zeros_like(emp_cov)[1:]

    # scaled dual variables, one per consensus constraint
    U_0 = np.zeros_like(Z_0)
    U_1 = np.zeros_like(Z_1)
    U_2 = np.zeros_like(Z_2)

    # previous iterates, used for the dual residual
    Z_0_old = np.zeros_like(Z_0)
    Z_1_old = np.zeros_like(Z_1)
    Z_2_old = np.zeros_like(Z_2)

    # divisor for consensus variables, accounting for two less matrices
    divisor = np.full(emp_cov.shape[0], 3, dtype=float)
    divisor[0] -= 1
    divisor[-1] -= 1

    if n_samples is None:
        n_samples = np.ones(emp_cov.shape[0])

    # history starts with the objective at the initial point
    checks = [convergence(obj=objective(n_samples, emp_cov, Z_0, Z_0, Z_1, Z_2, alpha, beta, psi))]
    for iteration_ in range(max_iter):
        # update K
        A = Z_0 - U_0
        A[:-1] += Z_1 - U_1
        A[1:] += Z_2 - U_2
        A /= divisor[:, None, None]
        # soft_thresholding_ = partial(soft_thresholding, lamda=alpha / rho)
        # K = np.array(map(soft_thresholding_, A))
        # symmetrise the consensus average
        A += A.transpose(0, 2, 1)
        A /= 2.0

        A *= -rho * divisor[:, None, None] / n_samples[:, None, None]
        A += emp_cov

        K = np.array([prox_logdet(a, lamda=ni / (rho * div)) for a, div, ni in zip(A, divisor, n_samples)])

        # update Z_0
        A = K + U_0
        A += A.transpose(0, 2, 1)
        A /= 2.0
        Z_0 = soft_thresholding(A, lamda=alpha / rho)

        # other Zs
        A_1 = K[:-1] + U_1
        A_2 = K[1:] + U_2
        if not psi_node_penalty:
            # element-wise penalty: prox on the difference, then split it
            # symmetrically around the mean of the two copies
            prox_e = prox_psi(A_2 - A_1, lamda=2.0 * beta / rho)
            Z_1 = 0.5 * (A_1 + A_2 - prox_e)
            Z_2 = 0.5 * (A_1 + A_2 + prox_e)
        else:
            # node penalty: the prox returns both copies directly
            Z_1, Z_2 = prox_psi(
                np.concatenate((A_1, A_2), axis=1),
                lamda=0.5 * beta / rho,
                rho=rho,
                tol=tol,
                rtol=rtol,
                max_iter=max_iter,
            )

        # update residuals
        U_0 += K - Z_0
        U_1 += K[:-1] - Z_1
        U_2 += K[1:] - Z_2

        # diagnostics, reporting, termination checks
        rnorm = np.sqrt(squared_norm(K - Z_0) + squared_norm(K[:-1] - Z_1) + squared_norm(K[1:] - Z_2))

        snorm = rho * np.sqrt(squared_norm(Z_0 - Z_0_old) + squared_norm(Z_1 - Z_1_old) + squared_norm(Z_2 - Z_2_old))

        obj = objective(n_samples, emp_cov, Z_0, K, Z_1, Z_2, alpha, beta, psi) if compute_objective else np.nan

        # if np.isinf(obj):
        #     Z_0 = Z_0_old
        #     break

        check = convergence(
            obj=obj,
            rnorm=rnorm,
            snorm=snorm,
            e_pri=np.sqrt(K.size + 2 * Z_1.size) * tol
            + rtol
            * max(
                np.sqrt(squared_norm(Z_0) + squared_norm(Z_1) + squared_norm(Z_2)),
                np.sqrt(squared_norm(K) + squared_norm(K[:-1]) + squared_norm(K[1:])),
            ),
            e_dual=np.sqrt(K.size + 2 * Z_1.size) * tol
            + rtol * rho * np.sqrt(squared_norm(U_0) + squared_norm(U_1) + squared_norm(U_2)),
            # precision=Z_0.copy()
        )
        Z_0_old = Z_0.copy()
        Z_1_old = Z_1.copy()
        Z_2_old = Z_2.copy()

        if verbose:
            # only the first five fields of the record are formatted
            print("obj: %.4f, rnorm: %.4f, snorm: %.4f," "eps_pri: %.4f, eps_dual: %.4f" % check[:5])

        checks.append(check)
        if stop_at is not None:
            if abs(check.obj - stop_at) / abs(stop_at) < stop_when:
                break

        if check.rnorm <= check.e_pri and check.snorm <= check.e_dual:
            break

        rho_new = update_rho(rho, rnorm, snorm, iteration=iteration_, **(update_rho_options or {}))
        # scaled dual variables should be also rescaled
        U_0 *= rho / rho_new
        U_1 *= rho / rho_new
        U_2 *= rho / rho_new
        rho = rho_new

        # assert is_pos_def(Z_0)
    else:
        # reached only when the loop ran to max_iter without a break
        warnings.warn("Objective did not converge.")

    covariance_ = np.array([linalg.pinvh(x) for x in Z_0])
    return_list = [Z_0, covariance_]
    if return_history:
        return_list.append(checks)
    if return_n_iter:
        return_list.append(iteration_ + 1)
    return return_list
# Fit the gradient-based equality-constrained estimator and time it.
# NOTE(review): `max_iter`, `loss`, `c_level`, `theta`, `rho`, `mult`,
# `weights`, `m`, `eps`, `psi`, `X_cov`, `tic` and `pre_` are defined outside
# this fragment — confirm against the surrounding code.
tgl_g = GradientEqualTimeGraphicalLasso(max_iter=max_iter, loss=loss, c_level=c_level, theta=theta, rho=rho, mult=mult, weights=weights, m=m, eps=eps, psi=psi)
emp_inv_score_g, baseline_score_g, fit_score_g, pre_g = tgl_g.fit_cov(
    X_cov).eval_cov_pre()
toc = time.perf_counter()
print('Gradient Running Time :{}'.format(toc - tic))

psi, prox_psi, psi_node_penalty = check_norm_prox(tgl_g.psi)

# Re-evaluate the precision matrices in `pre_` after zeroing entries whose
# magnitude is below each threshold (a threshold of 0 keeps everything).
pre_tgl = {}
fit_score_tgl_thres = {}
for i in [1e-4, 0]:
    pre_tgl[i] = np.array([k * (np.abs(k) >= i) for k in pre_])
    # the estimator's precision is overwritten so that eval_cov_pre scores
    # the thresholded matrices
    tgl_g.precision_ = pre_tgl[i]
    emp_inv_score, baseline_score, fit_score_tgl_thres[
        i], _ = tgl_g.eval_cov_pre()
    print(
        'Vanilla Objective',
        penalty_objective(pre_tgl[i], pre_tgl[i][:-1], pre_tgl[i][1:], psi, tgl_g.theta))

pre = {}
fit_score_thres = {}
def taylor_time_graphical_lasso(
        S, K_init, max_iter, loss, C, theta, rho, mult, weights, m, eps, psi,
        gamma, tol, rtol, verbose, return_history, return_n_iter, mode,
        compute_objective, stop_at, stop_when, update_rho_options
):
    """Equality constrained time-varying graphical LASSO solver.

    Solves the following problem via ADMM:
        min theta * sum_{i=1}^T ||K_i||_{od,1}
            + (1 - theta) * sum_{i=2}^T Psi(K_i - K_{i-1})
        s.t. loss(S_i, K_i) = C_i for i = 1, ..., T

    The constraint is handled through a first-order (Taylor) expansion of the
    loss around the previous iterate; for each time point the step along the
    loss gradient is chosen by a scalar line search.

    Parameters
    ----------
    S : ndarray, shape (T, d, d)
        Empirical covariance for each time point.
    K_init : ndarray, shape (T, d, d)
        Initial precision matrices (copied, not modified).
    max_iter : int
        Maximum number of iterations.
    loss : str
        ``'LL'`` selects ``neg_logl``; anything else selects ``dtrace``.
    C : ndarray, shape (T,)
        Target loss levels, subtracted from the loss to form the residuals.
    theta : float
        Trade-off between the l1 term (``theta``) and the temporal term
        (``1 - theta``).
    rho : float
        Augmented Lagrangian parameter; expanded to one value per time point.
    mult : float
        Factor by which ``rho`` is multiplied when the constraint residual
        stops improving (global scheme), also passed to the weight builders.
    weights : sequence
        ``weights[0]`` selects the per-time weighting scheme (``'rbf'``,
        ``'exp'`` or ``'lin'``) and ``weights[1]`` is passed to the matching
        weight builder.  ``weights[0] is None`` selects the global scheme.
    m : int
        Length of the window over which residual stagnation is assessed.
    eps : float
        Residual level above which ``rho`` may be increased.
    psi : str or callable
        Temporal penalty, resolved through ``check_norm_prox``.  When it is a
        node penalty, Z_1 and Z_2 are not updated (not implemented).
    tol : float
        Absolute tolerance for convergence.
    rtol : float
        Relative tolerance for convergence.
    verbose : bool
        Print the convergence record at each iteration.
    return_history : bool
        Return the history of computed values.
    return_n_iter : bool
        Return the number of iterations performed.
    stop_at, stop_when : float
        If ``stop_at`` is not None, stop as soon as the relative distance of
        the objective from ``stop_at`` falls below ``stop_when``.
    gamma, mode, compute_objective, update_rho_options
        Not used by this solver.

    Returns
    -------
    K : numpy.array, 3-dimensional (T x d x d)
        Solution to the problem for each time t=1...T .
    covariance : numpy.array, 3-dimensional (T x d x d)
        Pseudo-inverse of each returned precision matrix.
    history : list
        If return_history, then also a structure that contains the
        objective value, the primal and dual residual norms, and tolerances
        for the primal and dual residual norms at each iteration.
    n_iter : int
        If return_n_iter.
    """
    psi, prox_psi, psi_node_penalty = check_norm_prox(psi)

    if loss == 'LL':
        loss_func = neg_logl
    else:
        loss_func = dtrace

    T = S.shape[0]
    S_flat = S.copy().reshape(T, S.shape[1] * S.shape[2])
    # identity used by the D-trace gradient (the original referenced an
    # undefined name `I` and built a 1x1 array with np.diagflat)
    I = np.eye(S.shape[1])

    K = K_init.copy()
    Z_0 = K_init.copy()
    Z_1 = Z_0.copy()[:-1]
    Z_2 = Z_0.copy()[1:]

    U_0 = np.zeros_like(Z_0)
    U_1 = np.zeros_like(Z_1)
    U_2 = np.zeros_like(Z_2)

    Z_0_old = Z_0.copy()
    Z_1_old = np.zeros_like(Z_1)
    Z_2_old = np.zeros_like(Z_2)

    # divisor for consensus variables, accounting for one less matrix
    # for t = 0 and t = T
    divisor = np.full(T, 3, dtype=float)
    divisor[0] -= 1
    divisor[-1] -= 1

    # one augmented-Lagrangian parameter per time point
    rho = rho * np.ones(T)
    if weights[0] is not None:
        if weights[0] == 'rbf':
            weights = rbf_weights(T, weights[1], mult)
        elif weights[0] == 'exp':
            weights = exp_weights(T, weights[1], mult)
        elif weights[0] == 'lin':
            weights = lin_weights(T, weights[1], mult)

    # per-time and aggregate histories of the constraint residual
    con_obj = {}
    for t in range(T):
        con_obj[t] = []

    con_obj_mean = []
    con_obj_max = []

    # loss residuals
    loss_res = np.zeros(T)
    loss_init = loss_gen(loss_func, S, Z_0_old)
    loss_res_old = loss_init - C

    out_obj = []

    checks = [
        convergence(
            obj=penalty_objective(Z_0, Z_1, Z_2, psi, theta))
    ]

    def _K(x, A_t, g_t, nabla_t, nabla_t_T_A_t, nabla_t_T_nabla_t, rho_t,
           divisor_t):
        """Closed-form K_t update for step size ``x``, symmetrised."""
        _K_t = (A_t + x * g_t * nabla_t
                - (x * nabla_t_T_A_t + x ** 2 * g_t * nabla_t_T_nabla_t)
                * nabla_t / (divisor_t * rho_t + x * nabla_t_T_nabla_t)
                ).reshape(S.shape[1], S.shape[2])
        _K_t /= (rho_t * divisor_t)
        return 0.5 * (_K_t + _K_t.transpose(1, 0))

    # constrained optimisation via line search
    def _f(x, _K, A_t, g_t, nabla_t, nabla_t_T_A_t, nabla_t_T_nabla_t, rho_t,
           divisor_t, loss_func, S_t, c_t, loss_res_old_t,
           nabla_t_T_K_old_t):
        """Squared residual plus squared linearisation error at step ``x``."""
        _K_t = _K(x, A_t, g_t, nabla_t, nabla_t_T_A_t, nabla_t_T_nabla_t,
                  rho_t, divisor_t)
        loss_res_t = loss_func(S_t, _K_t) - c_t
        return loss_res_t ** 2 + (
            loss_res_t - loss_res_old_t - nabla_t @ _K_t.ravel()
            + nabla_t_T_K_old_t) ** 2

    for iteration_ in range(max_iter):
        # update K
        A = rho[:, None, None] * (Z_0 - U_0)
        A[:-1] += rho[:-1, None, None] * (Z_1 - U_1)
        A[1:] += rho[1:, None, None] * (Z_2 - U_2)

        # flattened gradient of the loss at the previous K, one row per time
        if loss_func.__name__ == 'neg_logl':
            nabla = np.array(
                [S_t - np.linalg.inv(K_t).ravel()
                 for (S_t, K_t) in zip(S_flat, K)])
        elif loss_func.__name__ == 'dtrace':
            # the gradient must be formed with matrix products on the
            # (d, d) matrices and only then flattened
            nabla = np.array(
                [(2 * K_t @ S_t - I).ravel() for (S_t, K_t) in zip(S, K)])
        nabla_T_K_old = np.array(
            [nabla_t @ K_t.ravel() for (nabla_t, K_t) in zip(nabla, K)])
        g = nabla_T_K_old - loss_res_old
        nabla_T_A = np.array(
            [nabla_t @ A_t.ravel() for (nabla_t, A_t) in zip(nabla, A)])
        nabla_T_nabla = np.einsum('ij,ij->i', nabla, nabla)
        if iteration_ == 0:
            # no linearisation on the first iteration
            nabla = np.zeros_like(S_flat)
            nabla_T_K_old = np.zeros(T)
            g = np.zeros(T)
            nabla_T_A = np.zeros(T)
            nabla_T_nabla = np.zeros(T)

        # time points whose residual stagnated during this iteration
        col = []
        for t in range(T):
            out = minimize_scalar(
                partial(_f, _K=_K, A_t=A[t].ravel(), g_t=g[t],
                        nabla_t=nabla[t], nabla_t_T_A_t=nabla_T_A[t],
                        nabla_t_T_nabla_t=nabla_T_nabla[t], rho_t=rho[t],
                        divisor_t=divisor[t], loss_func=loss_func,
                        S_t=S[t], c_t=C[t], loss_res_old_t=loss_res_old[t],
                        nabla_t_T_K_old_t=nabla_T_K_old[t])
            )
            K[t] = _K(out.x, A[t].ravel(), g[t], nabla[t], nabla_T_A[t],
                      nabla_T_nabla[t], rho[t], divisor[t])
            loss_res[t] = loss_func(S[t], K[t]) - C[t]

            if weights[0] is not None:
                con_obj[t].append(loss_res[t] ** 2)
                if (len(con_obj[t]) > m
                        and np.mean(con_obj[t][-m:-int(m / 2)])
                        < np.mean(con_obj[t][-int(m / 2):])
                        and loss_res[t] > eps):
                    col.append(t)

        # update Z_0
        _Z_0 = K + U_0
        _Z_0 += _Z_0.transpose(0, 2, 1)
        _Z_0 /= 2.
        Z_0 = soft_thresholding_od(_Z_0, lamda=theta / rho[:, None, None])

        # update Z_1, Z_2
        A_1 = Z_0[:-1] + U_1
        A_2 = Z_0[1:] + U_2
        if not psi_node_penalty:
            A_add = A_2 + A_1
            A_sub = A_2 - A_1
            prox_e_1 = prox_psi(
                A_sub, lamda=2. * (1 - theta) / rho[:-1, None, None])
            prox_e_2 = prox_psi(
                A_sub, lamda=2. * (1 - theta) / rho[1:, None, None])
            Z_1 = .5 * (A_add - prox_e_1)
            Z_2 = .5 * (A_add + prox_e_2)
        # TODO: node penalties are not supported with a vector-valued rho

        # update residuals
        con_obj_mean.append(np.mean(loss_res) ** 2)
        con_obj_max.append(np.max(loss_res))

        U_0 += K - Z_0
        U_1 += K[:-1] - Z_1
        U_2 += K[1:] - Z_2

        # diagnostics, reporting, termination checks
        rnorm = np.sqrt(
            squared_norm(K - Z_0)
            + squared_norm(K[:-1] - Z_1)
            + squared_norm(K[1:] - Z_2)
        )

        loss_res_old = loss_res.copy()

        snorm = np.sqrt(
            squared_norm(rho[:, None, None] * (Z_0 - Z_0_old))
            + squared_norm(rho[:-1, None, None] * (Z_1 - Z_1_old))
            + squared_norm(rho[1:, None, None] * (Z_2 - Z_2_old))
        )

        e_dual = np.sqrt(Z_0.size + 2 * Z_1.size) * tol + rtol * np.sqrt(
            squared_norm(rho[:, None, None] * U_0)
            + squared_norm(rho[:-1, None, None] * U_1)
            + squared_norm(rho[1:, None, None] * U_2)
        )

        obj = objective(loss_res, Z_0, Z_1, Z_2, psi, theta)

        check = convergence(
            obj=obj,
            rnorm=rnorm,
            snorm=snorm,
            e_pri=np.sqrt(loss_res.size + Z_0.size + 2 * Z_1.size) * tol
            + rtol * (
                max(np.sqrt(squared_norm(Z_0)), np.sqrt(squared_norm(K)))
                + max(np.sqrt(squared_norm(Z_1)),
                      np.sqrt(squared_norm(K[:-1])))
                + max(np.sqrt(squared_norm(Z_2)),
                      np.sqrt(squared_norm(K[1:])))
            ),
            e_dual=e_dual
        )

        Z_0_old = Z_0.copy()
        Z_1_old = Z_1.copy()
        Z_2_old = Z_2.copy()

        if verbose:
            print(
                "obj: %.4f, rnorm: %.4f, snorm: %.4f,"
                "eps_pri: %.4f, eps_dual: %.4f" % check[:5])

        out_obj.append(
            penalty_objective(Z_0, Z_0[:-1], Z_0[1:], psi, theta))
        if not iteration_ % 100:
            print(iteration_)
            print(np.max(con_obj_max[-1]), np.mean(loss_res))
            print(out_obj[-1])
        checks.append(check)

        if stop_at is not None:
            if abs(check.obj - stop_at) / abs(stop_at) < stop_when:
                break

        if check.rnorm <= check.e_pri and check.snorm <= check.e_dual:
            break

        if weights[0] is None:
            # global scheme: scale every rho when the mean residual of the
            # recent half-window is worse than the one before it
            if len(con_obj_mean) > m:
                if (np.mean(con_obj_mean[-m:-int(m / 2)])
                        < np.mean(con_obj_mean[-int(m / 2):])
                        and np.max(loss_res) > eps):
                    print("Rho Mult", mult * rho[0], iteration_,
                          np.mean(loss_res), con_obj_max[-1])
                    # rescale scaled dual variables
                    rho = mult * rho
                    U_0 /= mult
                    U_1 /= mult
                    U_2 /= mult
                    con_obj_mean = []
                    con_obj_max = []
        else:
            # per-time scheme: scale rho with the weights of each stagnating
            # time point and rescale the scaled duals accordingly
            for t in col:
                rho *= weights[t]
                U_0 /= weights[t][:, None, None]
                U_1 /= weights[t][:-1, None, None]
                U_2 /= weights[t][1:, None, None]
                con_obj[t] = []
                print('Mult', iteration_, t, rho[t])
    else:
        warnings.warn("Objective did not converge.")

    print(iteration_, out_obj[-1])
    print(check.rnorm, check.e_pri)
    print(check.snorm, check.e_dual)

    covariance_ = np.array([linalg.pinvh(x) for x in Z_0])
    return_list = [Z_0, covariance_]
    if return_history:
        return_list.append(checks)
    if return_n_iter:
        return_list.append(iteration_ + 1)
    return return_list
def latent_time_graph_lasso(
        emp_cov, alpha=1, tau=1, rho=1, beta=1., eta=1., max_iter=1000,
        verbose=False, psi='laplacian', phi='laplacian', mode=None,
        tol=1e-4, rtol=1e-2, assume_centered=False,
        return_history=False, return_n_iter=True):
    r"""Time-varying latent variable graphical lasso solver.

    Solves the following problem via ADMM:
        min sum_{i=1}^T -n_i log_likelihood(K_i-L_i) + alpha ||K_i||_{od,1}
            + tau ||L_i||_*
            + beta sum_{i=2}^T Psi(K_i - K_{i-1})
            + eta sum_{i=2}^T Phi(L_i - L_{i-1})

    where S is the empirical covariance of the data
    matrix D (training observations by features).

    Parameters
    ----------
    emp_cov : ndarray, 3-dimensional
        Empirical covariances, one matrix per time point along the first
        axis.
    alpha, tau : float, optional
        Regularisation parameters.
    rho : float, optional
        Augmented Lagrangian parameter.
    beta, eta : float, optional
        Weights of the temporal penalties Psi and Phi. When a weight is 0
        the corresponding pair of auxiliary variables is copied from
        ``Z_0`` / ``W_0`` instead of going through the proximal step.
    max_iter : int, optional
        Maximum number of iterations.
    verbose : bool, optional
        Print the convergence diagnostics at each iteration.
    psi, phi : optional
        Forwarded to ``check_norm_prox`` to select each temporal penalty
        and its proximal operator.
    mode, assume_centered : optional
        Accepted for interface compatibility; not used by this solver.
    tol : float, optional
        Absolute tolerance for convergence.
    rtol : float, optional
        Relative tolerance for convergence.
    return_history : bool, optional
        Return the history of computed values.
    return_n_iter : bool, optional
        Append the index of the last iteration executed.

    Returns
    -------
    list
        ``[Z_consensus, W_0, W_1, W_2, emp_cov]``, followed by the list of
        per-iteration ``convergence`` records if ``return_history`` and by
        the zero-based index of the last iteration if ``return_n_iter``.
    """
    # Only the penalty and its prox are needed here; slicing keeps this
    # working whether check_norm_prox returns two or three values (the
    # other solvers in this file unpack three).
    psi, prox_psi = check_norm_prox(psi)[:2]
    phi, prox_phi = check_norm_prox(phi)[:2]

    # K is only used as a shape/dtype template.
    K = np.zeros_like(emp_cov)
    Z_0 = np.zeros_like(K)
    Z_1 = np.zeros_like(K)[:-1]
    Z_2 = np.zeros_like(K)[1:]
    W_0 = np.zeros_like(K)
    W_1 = np.zeros_like(K)[:-1]
    W_2 = np.zeros_like(K)[1:]
    X_0 = np.zeros_like(K)
    X_1 = np.zeros_like(K)[:-1]
    X_2 = np.zeros_like(K)[1:]

    Z_consensus = np.zeros_like(K)
    W_consensus = np.zeros_like(K)
    R_old = np.zeros_like(K)

    # divisor for consensus variables, accounting for two less matrices
    divisor = np.full(K.shape[0], 3, dtype=float)
    divisor[0] -= 1
    divisor[-1] -= 1

    checks = []
    for iteration_ in range(max_iter):
        # update R
        A = Z_0 - W_0 - X_0
        A[:-1] += Z_1 - W_1 - X_1
        A[1:] += Z_2 - W_2 - X_2
        A /= divisor[:, None, None]
        A *= - rho
        A += emp_cov

        R = np.array([prox_logdet(a, lamda=1. / rho) for a in A])

        # update Z_0
        # A list comprehension is required: under Python 3
        # ``np.array(map(...))`` wraps the lazy map object in a 0-d object
        # array instead of applying the operator to every matrix.
        Z_0 = np.array([
            soft_thresholding_sign(a, lamda=alpha / rho)
            for a in R + W_0 + X_0
        ])

        # update Z_1, Z_2
        if beta != 0:
            A_1 = R[:-1] + W_1 + X_1
            A_2 = R[1:] + W_2 + X_2
            prox_e = prox_psi(A_2 - A_1, lamda=2. * beta / rho)
            Z_1 = .5 * (A_1 + A_2 - prox_e)
            Z_2 = .5 * (A_1 + A_2 + prox_e)
        else:
            Z_1 = Z_0[:-1].copy()
            Z_2 = Z_0[1:].copy()

        # update W_0
        A = Z_0 - R - X_0
        W_0 = np.array([
            prox_trace_indicator(a, lamda=tau / rho) for a in A
        ])

        # update W_1, W_2
        if eta != 0:
            A_1 = Z_1 - R[:-1] - X_1
            A_2 = Z_2 - R[1:] - X_2
            prox_e = prox_phi(A_2 - A_1, lamda=2. * eta / rho)
            W_1 = .5 * (A_1 + A_2 - prox_e)
            W_2 = .5 * (A_1 + A_2 + prox_e)
        else:
            W_1 = W_0[:-1].copy()
            W_2 = W_0[1:].copy()

        # update residuals
        X_0 += R - Z_0 + W_0
        X_1 += R[:-1] - Z_1 + W_1
        X_2 += R[1:] - Z_2 + W_2

        # diagnostics, reporting, termination checks
        X_consensus = X_0.copy()
        X_consensus[:-1] += X_1
        X_consensus[1:] += X_2
        X_consensus /= divisor[:, None, None]

        Z_consensus = Z_0.copy()
        Z_consensus[:-1] += Z_1
        Z_consensus[1:] += Z_2
        Z_consensus /= divisor[:, None, None]

        W_consensus = W_0.copy()
        W_consensus[:-1] += W_1
        W_consensus[1:] += W_2
        W_consensus /= divisor[:, None, None]

        # NOTE(review): the second argument of max() takes the square root
        # of a difference of squared norms, which is NaN whenever
        # W_consensus is the larger of the two -- confirm this is intended.
        check = convergence(
            obj=objective(emp_cov, R, Z_0, Z_1, Z_2, W_0, W_1, W_2,
                          alpha, tau, beta, eta, psi, phi),
            rnorm=np.linalg.norm(R - Z_consensus + W_consensus),
            snorm=np.linalg.norm(rho * (R - R_old)),
            e_pri=np.sqrt(np.prod(K.shape)) * tol + rtol * max(
                np.linalg.norm(R),
                np.sqrt(squared_norm(Z_consensus)
                        - squared_norm(W_consensus))),
            e_dual=np.sqrt(np.prod(K.shape)) * tol + rtol * np.linalg.norm(
                rho * X_consensus)
        )
        R_old = R.copy()

        if verbose:
            # Only the first five fields are formatted, matching the other
            # solvers in this file.
            print("obj: %.4f, rnorm: %.4f, snorm: %.4f,"
                  "eps_pri: %.4f, eps_dual: %.4f" % check[:5])

        checks.append(check)
        if check.rnorm <= check.e_pri and check.snorm <= check.e_dual:
            break
    else:
        warnings.warn("Objective did not converge.")

    return_list = [Z_consensus, W_0, W_1, W_2, emp_cov]
    if return_history:
        return_list.append(checks)
    if return_n_iter:
        return_list.append(iteration_)
    return return_list
def kernel_time_graphical_lasso(
    emp_cov,
    alpha=0.01,
    rho=1,
    kernel=None,
    max_iter=100,
    n_samples=None,
    verbose=False,
    psi="laplacian",
    tol=1e-4,
    rtol=1e-4,
    return_history=False,
    return_n_iter=True,
    mode="admm",
    update_rho_options=None,
    compute_objective=True,
    stop_at=None,
    stop_when=1e-4,
    init="empirical",
):
    """Kernel time-varying graphical lasso solver (ADMM).

    Solves the following problem via ADMM:
        min sum_{i=1}^T -n_i log_likelihood(S_i, K_i) + alpha ||K_i||_{od,1}
            + sum_{s>t}^T k_psi(s,t) Psi(K_s - K_t)

    where S_i is the empirical covariance at time i and the weight of the
    pair (t, t + m) is read from the m-th upper diagonal of ``kernel``.

    Parameters
    ----------
    emp_cov : ndarray, 3-dimensional
        Empirical covariances stacked along the first (time) axis.
    alpha : float, optional
        Regularisation parameter.
    rho : float, optional
        Augmented Lagrangian parameter.
    kernel : ndarray, optional
        Square matrix of temporal weights; identity when None.
    max_iter : int, optional
        Maximum number of iterations.
    n_samples : ndarray, optional
        Per-time-point weights; all ones when None.
    verbose : bool, optional
        Print the convergence diagnostics at each iteration.
    psi : optional
        Forwarded to ``check_norm_prox`` to pick the temporal penalty.
    tol : float, optional
        Absolute tolerance for convergence.
    rtol : float, optional
        Relative tolerance for convergence.
    return_history : bool, optional
        Return the history of computed values.
    return_n_iter : bool, optional
        Return the number of iterations performed.
    mode : optional
        Not used by this solver.
    update_rho_options : dict, optional
        Extra keyword arguments for ``update_rho``.
    compute_objective : bool, optional
        When False the objective is recorded as NaN.
    stop_at, stop_when : float, optional
        If ``stop_at`` is given, stop as soon as the relative distance of
        the objective from ``stop_at`` falls below ``stop_when``.
    init : {'empirical', 'zeros', ndarray}, default 'empirical'
        Forwarded to ``init_precision`` as ``mode``.

    Returns
    -------
    list
        ``[Z_0, covariance_]`` where ``covariance_`` stacks the
        pseudo-inverse of every matrix in ``Z_0``; followed by the
        convergence history if ``return_history`` and by the number of
        iterations run if ``return_n_iter``.
    """

    def _symmetrise(stack):
        # In place: average every matrix of the stack with its transpose.
        stack += stack.transpose(0, 2, 1)
        stack /= 2.0

    psi, prox_psi, psi_node_penalty = check_norm_prox(psi)
    n_times, _, n_features = emp_cov.shape
    lags = range(1, n_times)  # all possible markovian jumps

    if kernel is None:
        kernel = np.eye(n_times)
    if n_samples is None:
        n_samples = np.ones(n_times)

    Z_0 = init_precision(emp_cov, mode=init)
    U_0 = np.zeros_like(Z_0)
    Z_0_old = np.zeros_like(Z_0)

    # One (left, right) pair of auxiliary variables per time lag.
    Z_M = {lag: (Z_0.copy()[:-lag], Z_0.copy()[lag:]) for lag in lags}
    U_M = {
        lag: (np.zeros_like(left), np.zeros_like(right))
        for lag, (left, right) in Z_M.items()
    }
    Z_M_old = {
        lag: (np.zeros_like(left), np.zeros_like(right))
        for lag, (left, right) in Z_M.items()
    }

    checks = [
        convergence(
            obj=objective(n_samples, emp_cov, Z_0, Z_0, Z_M, alpha, kernel,
                          psi))
    ]
    abs_eps = n_features * n_times * tol

    for iteration_ in range(max_iter):
        # update K
        target = Z_0 - U_0
        for lag in lags:
            z_left, z_right = Z_M[lag]
            u_left, u_right = U_M[lag]
            target[:-lag] += z_left - u_left
            target[lag:] += z_right - u_right
        target /= n_times
        _symmetrise(target)
        target *= -rho * n_times / n_samples[:, None, None]
        target += emp_cov

        K = np.array([
            prox_logdet(mat, lamda=count / (rho * n_times))
            for mat, count in zip(target, n_samples)
        ])

        # update Z_0 and its scaled dual
        shifted = K + U_0
        _symmetrise(shifted)
        Z_0 = soft_thresholding(shifted, lamda=alpha / rho)
        U_0 += K - Z_0

        # update the lagged pairs and their scaled duals
        for lag in lags:
            u_left, u_right = U_M[lag]
            weights = np.diag(kernel, lag)[:, None, None]
            left = K[:-lag] + u_left
            right = K[lag:] + u_right
            if psi_node_penalty:
                z_left, z_right = prox_psi(
                    np.concatenate((left, right), axis=1),
                    lamda=0.5 * weights / rho,
                    rho=rho,
                    tol=tol,
                    rtol=rtol,
                    max_iter=max_iter,
                )
            else:
                shrunk = prox_psi(right - left, lamda=2.0 * weights / rho)
                z_left = 0.5 * (left + right - shrunk)
                z_right = 0.5 * (left + right + shrunk)
            Z_M[lag] = (z_left, z_right)

            # the dual arrays live inside U_M, so update them in place
            u_left += K[:-lag] - z_left
            u_right += K[lag:] - z_right

        # diagnostics, reporting, termination checks
        primal_sq = squared_norm(K - Z_0) + sum(
            squared_norm(K[:-lag] - Z_M[lag][0])
            + squared_norm(K[lag:] - Z_M[lag][1])
            for lag in lags)
        rnorm = np.sqrt(primal_sq)

        dual_sq = squared_norm(Z_0 - Z_0_old) + sum(
            squared_norm(Z_M[lag][0] - Z_M_old[lag][0])
            + squared_norm(Z_M[lag][1] - Z_M_old[lag][1])
            for lag in lags)
        snorm = rho * np.sqrt(dual_sq)

        if compute_objective:
            obj = objective(n_samples, emp_cov, Z_0, K, Z_M, alpha, kernel,
                            psi)
        else:
            obj = np.nan

        z_scale = np.sqrt(
            squared_norm(Z_0) + sum(
                squared_norm(Z_M[lag][0]) + squared_norm(Z_M[lag][1])
                for lag in lags))
        k_scale = np.sqrt(
            squared_norm(K) + sum(
                squared_norm(K[:-lag]) + squared_norm(K[lag:])
                for lag in lags))
        u_scale = np.sqrt(
            squared_norm(U_0) + sum(
                squared_norm(U_M[lag][0]) + squared_norm(U_M[lag][1])
                for lag in lags))

        check = convergence(
            obj=obj,
            rnorm=rnorm,
            snorm=snorm,
            e_pri=abs_eps + rtol * max(z_scale, k_scale),
            e_dual=abs_eps + rtol * rho * u_scale,
        )

        Z_0_old = Z_0.copy()
        for lag in lags:
            z_left, z_right = Z_M[lag]
            Z_M_old[lag] = (z_left.copy(), z_right.copy())

        if verbose:
            print("obj: %.4f, rnorm: %.4f, snorm: %.4f,"
                  "eps_pri: %.4f, eps_dual: %.4f" % check[:5])

        checks.append(check)
        if (stop_at is not None
                and abs(check.obj - stop_at) / abs(stop_at) < stop_when):
            break

        if check.rnorm <= check.e_pri and check.snorm <= check.e_dual:
            break

        rho_new = update_rho(rho, rnorm, snorm, iteration=iteration_,
                             **(update_rho_options or {}))
        # scaled dual variables should be also rescaled
        rescale = rho / rho_new
        U_0 *= rescale
        for lag in lags:
            u_left, u_right = U_M[lag]
            u_left *= rescale
            u_right *= rescale
        rho = rho_new
    else:
        warnings.warn("Objective did not converge.")

    covariance_ = np.array([linalg.pinvh(precision) for precision in Z_0])
    return_list = [Z_0, covariance_]
    if return_history:
        return_list.append(checks)
    if return_n_iter:
        return_list.append(iteration_ + 1)
    return return_list
def _fit(self, emp_cov, n_samples):
    """Fit the precision matrices, estimating the kernel when none is set.

    When ``self.kernel`` is given, its first dimension is checked against
    ``self.classes_.size`` and ``kernel_time_graphical_lasso`` is run once.

    When ``self.kernel`` is None, an alternating scheme is run for at most
    ``self.max_iter_ext`` rounds:

    * E step: compute ``precision_similarity`` of the current precisions,
      cluster it (agglomerative, complete linkage, precomputed matrix) and
      build a kernel as the sum of an RBF kernel on the cluster labels and
      an RBF kernel (length scale ``self.beta``) on the time index;
    * M step: run ``kernel_time_graphical_lasso`` with that kernel, warm
      started from the current precisions.

    The scheme stops early when the labels change by less than
    ``self.eps`` (normalised by their number) between two rounds;
    otherwise a warning is emitted.

    Parameters
    ----------
    emp_cov : ndarray
        Empirical covariances, forwarded to the solver.
    n_samples : ndarray
        Forwarded to the solver as ``n_samples``.

    Returns
    -------
    self
        With ``precision_``, ``covariance_``, ``n_iter_`` (and ``history_``
        when ``self.return_history``) set by the solver. When the kernel
        is estimated, ``similarity_matrix_`` is set as well and
        ``self.n_clusters`` is replaced by the number of time points if it
        was None.

    Raises
    ------
    ValueError
        If a user-supplied kernel does not match ``self.classes_.size``.
    """

    def _solve(kernel, init):
        # Run the inner solver and store its outputs on the estimator.
        out = kernel_time_graphical_lasso(
            emp_cov,
            alpha=self.alpha,
            rho=self.rho,
            kernel=kernel,
            n_samples=n_samples,
            tol=self.tol,
            rtol=self.rtol,
            psi=self.psi,
            max_iter=self.max_iter,
            verbose=self.verbose,
            return_n_iter=True,
            return_history=self.return_history,
            update_rho_options=self.update_rho_options,
            compute_objective=self.compute_objective,
            init=init,
        )
        if self.return_history:
            (self.precision_, self.covariance_, self.history_,
             self.n_iter_) = out
        else:
            self.precision_, self.covariance_, self.n_iter_ = out

    def _cluster(matrix):
        # Complete-linkage clustering on a precomputed matrix.
        # scikit-learn renamed ``affinity`` to ``metric`` (deprecated in
        # 1.2, removed in 1.4); older releases reject ``metric`` with a
        # TypeError, so fall back to the old spelling there.
        try:
            model = AgglomerativeClustering(
                n_clusters=self.n_clusters, metric="precomputed",
                linkage="complete")
        except TypeError:
            model = AgglomerativeClustering(
                n_clusters=self.n_clusters, affinity="precomputed",
                linkage="complete")
        return model.fit_predict(matrix)

    if self.kernel is not None:
        kernel = self.kernel
        if kernel.shape[0] != self.classes_.size:
            raise ValueError(
                "Kernel size does not match classes of samples, "
                "got {} classes and kernel has shape {}".format(
                    self.classes_.size, kernel.shape[0]))
        _solve(kernel, self.init)
        return self

    # discover best kernel parameter via EM
    # initialise precision matrices, as warm start
    self.precision_ = init_precision(emp_cov, mode=self.init)
    n_times = self.precision_.shape[0]
    kernel = np.eye(n_times)

    psi, _, _ = check_norm_prox(self.psi)
    if self.n_clusters is None:
        # Kept on the estimator, as before, so it stays visible to callers.
        self.n_clusters = n_times

    labels_pred_old = None
    for i in range(self.max_iter_ext):
        # E step - discover best kernel
        theta = precision_similarity(self.precision_, psi)
        kernel = theta
        labels_pred = _cluster(kernel)
        if i > 0 and (np.linalg.norm(labels_pred - labels_pred_old)
                      / labels_pred.size) < self.eps:
            # NOTE(review): on this exit ``kernel`` still holds ``theta``,
            # so ``similarity_matrix_`` is the similarity matrix here but
            # the RBF sum when the loop runs out -- confirm intended.
            break

        kernel = kernels.RBF(0.0001)(labels_pred[:, None]) + kernels.RBF(
            self.beta)(np.arange(n_times)[:, None])

        # M step - fix the kernel matrix
        _solve(kernel, self.precision_)
        labels_pred_old = labels_pred
    else:
        warnings.warn("theta did not converge.")
    self.similarity_matrix_ = kernel

    return self
def latent_time_graphical_lasso(emp_cov, alpha=0.01, tau=1., rho=1.,
                                beta=1., eta=1., max_iter=100,
                                n_samples=None, verbose=False,
                                psi='laplacian', phi='laplacian',
                                mode='admm', tol=1e-4, rtol=1e-4,
                                return_history=False, return_n_iter=True,
                                update_rho_options=None,
                                compute_objective=True, init='empirical'):
    r"""Latent variable time-varying graphical lasso solver (ADMM).

    Solves the following problem via ADMM:
        min sum_{i=1}^T -n_i log_likelihood(S_i, K_i-L_i)
            + alpha ||K_i||_{od,1} + tau ||L_i||_*
            + beta sum_{i=2}^T Psi(K_i - K_{i-1})
            + eta sum_{i=2}^T Phi(L_i - L_{i-1})

    where S_i = (1/n_i) X_i^T \times X_i is the empirical covariance of
    data matrix X (training observations by features).

    Parameters
    ----------
    emp_cov : ndarray, 3-dimensional
        Empirical covariances stacked along the first (time) axis.
    alpha, tau, beta, eta : float, optional
        Regularisation parameters.
    rho : float, optional
        Augmented Lagrangian parameter.
    max_iter : int, optional
        Maximum number of iterations.
    n_samples : ndarray, optional
        Number of samples available for each time point; all ones when
        None.
    verbose : bool, default False
        Print info at each iteration.
    psi, phi : optional
        Forwarded to ``check_norm_prox`` to pick the temporal penalties.
    mode : optional
        Not used by this solver.
    tol : float, optional
        Absolute tolerance for convergence.
    rtol : float, optional
        Relative tolerance for convergence.
    return_history : bool, optional
        Return the history of computed values.
    return_n_iter : bool, optional
        Append the index of the last iteration executed.
    update_rho_options : dict, optional
        Extra keyword arguments for ``update_rho``.
    compute_objective : bool, default True
        When False the objective is recorded as NaN.
    init : {'empirical', 'zeros', ndarray}, default 'empirical'
        Forwarded to ``init_precision`` as ``mode``.

    Returns
    -------
    list
        ``[Z_0, W_0, covariance_]`` where ``covariance_`` stacks the
        pseudo-inverse of every matrix in ``Z_0``; followed by the
        convergence history if ``return_history`` and by the zero-based
        index of the last iteration if ``return_n_iter``.
    """

    def _symmetrise(stack):
        # In place: average every matrix of the stack with its transpose.
        stack += stack.transpose(0, 2, 1)
        stack /= 2.

    def _fused_pair(prox, node_penalty, weight, first, second, penalty):
        # Proximal step shared by the (Z_1, Z_2) and (W_1, W_2) updates.
        if node_penalty:
            return prox(np.concatenate((first, second), axis=1),
                        lamda=.5 * weight / penalty, rho=penalty, tol=tol,
                        rtol=rtol, max_iter=max_iter)
        shrunk = prox(second - first, lamda=2. * weight / penalty)
        return (.5 * (first + second - shrunk),
                .5 * (first + second + shrunk))

    def _sq_total(*arrays):
        # Sum of squared norms, accumulated left to right.
        return sum(squared_norm(item) for item in arrays)

    psi, prox_psi, psi_node_penalty = check_norm_prox(psi)
    phi, prox_phi, phi_node_penalty = check_norm_prox(phi)
    n_times = emp_cov.shape[0]

    Z_0 = init_precision(emp_cov, mode=init)
    Z_1, Z_2 = Z_0.copy()[:-1], Z_0.copy()[1:]
    W_0, X_0, R_old = (np.zeros_like(Z_0) for _ in range(3))
    W_1, X_1, U_1, Z_1_old, W_1_old = (
        np.zeros_like(Z_1) for _ in range(5))
    W_2, X_2, U_2, Z_2_old, W_2_old = (
        np.zeros_like(Z_2) for _ in range(5))

    # divisor for consensus variables, accounting for two less matrices
    divisor = np.full(n_times, 3, dtype=float)
    divisor[0] -= 1
    divisor[-1] -= 1
    divisor_3d = divisor[:, None, None]

    if n_samples is None:
        n_samples = np.ones(n_times)

    checks = []
    for iteration_ in range(max_iter):
        # update R
        target = Z_0 - W_0 - X_0
        _symmetrise(target)
        target *= -rho / n_samples[:, None, None]
        target += emp_cov
        R = np.array([
            prox_logdet(mat, lamda=count / rho)
            for mat, count in zip(target, n_samples)
        ])

        # update Z_0
        target = R + W_0 + X_0
        target[:-1] += Z_1 - X_1
        target[1:] += Z_2 - X_2
        target /= divisor_3d
        Z_0 = soft_thresholding(target, lamda=alpha / (rho * divisor_3d))

        # update Z_1, Z_2
        Z_1, Z_2 = _fused_pair(prox_psi, psi_node_penalty, beta,
                               Z_0[:-1] + X_1, Z_0[1:] + X_2, rho)

        # update W_0
        target = Z_0 - R - X_0
        target[:-1] += W_1 - U_1
        target[1:] += W_2 - U_2
        target /= divisor_3d
        _symmetrise(target)
        W_0 = np.array([
            prox_trace_indicator(mat, lamda=tau / (rho * div))
            for mat, div in zip(target, divisor)
        ])

        # update W_1, W_2
        W_1, W_2 = _fused_pair(prox_phi, phi_node_penalty, eta,
                               W_0[:-1] + U_1, W_0[1:] + U_2, rho)

        # update residuals
        X_0 += R - Z_0 + W_0
        X_1 += Z_0[:-1] - Z_1
        X_2 += Z_0[1:] - Z_2
        U_1 += W_0[:-1] - W_1
        U_2 += W_0[1:] - W_2

        # diagnostics, reporting, termination checks
        rnorm = np.sqrt(_sq_total(
            R - Z_0 + W_0, Z_0[:-1] - Z_1, Z_0[1:] - Z_2,
            W_0[:-1] - W_1, W_0[1:] - W_2))
        snorm = rho * np.sqrt(_sq_total(
            R - R_old, Z_1 - Z_1_old, Z_2 - Z_2_old,
            W_1 - W_1_old, W_2 - W_2_old))

        if compute_objective:
            obj = objective(emp_cov, n_samples, R, Z_0, Z_1, Z_2, W_0, W_1,
                            W_2, alpha, tau, beta, eta, psi, phi)
        else:
            obj = np.nan

        abs_eps = np.sqrt(R.size + 4 * Z_1.size) * tol
        split_scale = np.sqrt(_sq_total(R, Z_1, Z_2, W_1, W_2))
        consensus_scale = np.sqrt(_sq_total(
            Z_0 - W_0, Z_0[:-1], Z_0[1:], W_0[:-1], W_0[1:]))
        dual_scale = np.sqrt(_sq_total(X_0, X_1, X_2, U_1, U_2))

        check = convergence(
            obj=obj, rnorm=rnorm, snorm=snorm,
            e_pri=abs_eps + rtol * max(split_scale, consensus_scale),
            e_dual=abs_eps + rtol * rho * dual_scale)

        R_old = R.copy()
        Z_1_old, Z_2_old = Z_1.copy(), Z_2.copy()
        W_1_old, W_2_old = W_1.copy(), W_2.copy()

        if verbose:
            print("obj: %.4f, rnorm: %.4f, snorm: %.4f,"
                  "eps_pri: %.4f, eps_dual: %.4f" % check[:5])

        checks.append(check)
        if check.rnorm <= check.e_pri and check.snorm <= check.e_dual:
            break

        rho_new = update_rho(rho, rnorm, snorm, iteration=iteration_,
                             **(update_rho_options or {}))
        # scaled dual variables should be also rescaled
        rescale = rho / rho_new
        for dual in (X_0, X_1, X_2, U_1, U_2):
            dual *= rescale
        rho = rho_new
    else:
        warnings.warn("Objective did not converge.")

    covariance_ = np.array([linalg.pinvh(precision) for precision in Z_0])
    return_list = [Z_0, W_0, covariance_]
    if return_history:
        return_list.append(checks)
    if return_n_iter:
        return_list.append(iteration_)
    return return_list
def equality_time_graphical_lasso(
        S, K_init, max_iter, loss, C, rho,  # n_samples=None,
        psi, gamma, tol, rtol, verbose, return_history, return_n_iter, mode,
        compute_objective, stop_at, stop_when, update_rho_options, init):
    """Equality constrained time-varying graphical LASSO solver.

    Solves the following problem via ADMM:
        min sum_{i=1}^T ||K_i||_{od,1} + beta sum_{i=2}^T Psi(K_i - K_{i-1})
        s.t. objective = c_i for i = 1, ..., T

    where S_i = (1/n_i) X_i^T X_i is the empirical covariance of data
    matrix X (training observations by features).

    Parameters
    ----------
    S : ndarray, 3-dimensional
        Empirical covariances stacked along the first (time) axis.
    K_init : ndarray
        Initial precision matrices; copied into the auxiliary variables.
    max_iter : int
        Maximum number of iterations.
    loss : str
        ``'LL'`` selects ``neg_logl``; any other value selects ``dtrace``.
    C : float or ndarray
        Loss value each time point is constrained to.
    rho : float
        Augmented Lagrangian parameter.
    psi :
        Forwarded to ``check_norm_prox`` to pick the temporal penalty.
    gamma, mode, init :
        Accepted for interface compatibility; not used by this solver.
    tol : float
        Absolute tolerance for convergence.
    rtol : float
        Relative tolerance for convergence.
    verbose : bool
        Print the constraint residuals and the convergence diagnostics at
        each iteration.
    return_history : bool
        Return the history of computed values.
    return_n_iter : bool
        Return the number of iterations performed.
    compute_objective : bool
        When False the objective is recorded as NaN.
    stop_at, stop_when : float
        If ``stop_at`` is not None, stop as soon as the relative distance
        of the objective from ``stop_at`` falls below ``stop_when``.
    update_rho_options : dict or None
        Extra keyword arguments for ``update_rho``.

    Returns
    -------
    list
        ``[K, covariance_]`` where ``covariance_`` stacks the
        pseudo-inverse of every matrix in ``K``; followed by the
        convergence history if ``return_history`` and by the number of
        iterations run if ``return_n_iter``.
    """
    psi, prox_psi, psi_node_penalty = check_norm_prox(psi)

    loss_function = neg_logl if loss == 'LL' else dtrace

    K = K_init
    Z_0 = K.copy()
    Z_1 = K.copy()[:-1]
    Z_2 = K.copy()[1:]
    # u is the scaled dual of the per-time-point loss constraint.
    u = np.zeros((S.shape[0]))
    U_0 = np.zeros_like(Z_0)
    U_1 = np.zeros_like(Z_1)
    U_2 = np.zeros_like(Z_2)

    Z_0_old = np.zeros_like(Z_0)
    Z_1_old = np.zeros_like(Z_1)
    Z_2_old = np.zeros_like(Z_2)

    I = np.eye(S.shape[1])

    checks = [
        convergence(
            obj=equality_objective(loss_function, S, K, C, Z_0, Z_1, Z_2,
                                   psi))
    ]
    for iteration_ in range(max_iter):
        # update K
        # NOTE(review): this starts from ``U_0 - Z_0`` whereas
        # kernel_time_graphical_lasso in this file starts its K-update
        # from ``Z_0 - U_0`` -- confirm the sign is intended.
        A_K = U_0 - Z_0
        A_K[:-1] += Z_1 - U_1
        A_K[1:] += Z_2 - U_2
        A_K += A_K.transpose(0, 2, 1)
        A_K /= 2.
        K = soft_thresholding_od(A_K, lamda=1. / rho)

        # update Z_0
        residual_loss_constraint_u = loss_gen(loss_function, S, Z_0) - C + u
        A_Z = K + U_0
        A_Z += A_Z.transpose(0, 2, 1)
        A_Z /= 2.
        if loss_function == neg_logl:
            A_Z -= residual_loss_constraint_u[:, None, None] * S
            Z_0 = np.array([
                prox_logdet_constrained(_A, _a, I)
                for _A, _a in zip(A_Z, residual_loss_constraint_u)
            ])
        elif loss_function == dtrace:
            Z_0 = np.array([
                prox_dtrace_constrained(_A, _S, _a, I)
                for _A, _S, _a in zip(A_Z, S, residual_loss_constraint_u)
            ])

        # other Zs
        A_1 = K[:-1] + U_1
        A_2 = K[1:] + U_2
        if not psi_node_penalty:
            prox_e = prox_psi(A_2 - A_1, lamda=2. / rho)
            Z_1 = .5 * (A_1 + A_2 - prox_e)
            Z_2 = .5 * (A_1 + A_2 + prox_e)
        else:
            Z_1, Z_2 = prox_psi(np.concatenate((A_1, A_2), axis=1),
                                lamda=.5 / rho, rho=rho, tol=tol, rtol=rtol,
                                max_iter=max_iter)

        # update residuals
        residual_loss_constraint = loss_gen(loss_function, S, Z_0) - C
        u += residual_loss_constraint
        U_0 += K - Z_0
        U_1 += K[:-1] - Z_1
        U_2 += K[1:] - Z_2
        if verbose:
            # Previously printed on every iteration regardless of the flag.
            print(residual_loss_constraint)

        # diagnostics, reporting, termination checks
        rnorm = np.sqrt(
            np.sum(residual_loss_constraint**2) + squared_norm(K - Z_0)
            + squared_norm(K[:-1] - Z_1) + squared_norm(K[1:] - Z_2))

        snorm = rho * np.sqrt(
            squared_norm(Z_0 - Z_0_old) + squared_norm(Z_1 - Z_1_old)
            + squared_norm(Z_2 - Z_2_old))

        obj = equality_objective(loss_function, S, K, C, Z_0, Z_1, Z_2,
                                 psi) if compute_objective else np.nan

        check = convergence(
            obj=obj,
            rnorm=rnorm,
            snorm=snorm,
            e_pri=np.sqrt(Z_0.size + 2 * Z_1.size + S.shape[0]) * tol
            + rtol * max(
                np.sqrt(
                    np.sum(C**2) + squared_norm(Z_0) + squared_norm(Z_1)
                    + squared_norm(Z_2)),
                np.sqrt(
                    np.sum((residual_loss_constraint + C)**2)
                    + squared_norm(K) + squared_norm(K[:-1])
                    + squared_norm(K[1:]))),
            e_dual=np.sqrt(Z_0.size + 2 * Z_1.size) * tol
            + rtol * rho * np.sqrt(
                squared_norm(U_0) + squared_norm(U_1) + squared_norm(U_2)),
        )
        Z_0_old = Z_0.copy()
        Z_1_old = Z_1.copy()
        Z_2_old = Z_2.copy()

        if verbose:
            print("obj: %.4f, rnorm: %.4f, snorm: %.4f,"
                  "eps_pri: %.4f, eps_dual: %.4f" % check[:5])

        checks.append(check)
        if stop_at is not None:
            if abs(check.obj - stop_at) / abs(stop_at) < stop_when:
                break

        if check.rnorm <= check.e_pri and check.snorm <= check.e_dual:
            break

        rho_new = update_rho(rho, rnorm, snorm, iteration=iteration_,
                             **(update_rho_options or {}))
        # scaled dual variables should be also rescaled
        u *= rho / rho_new
        U_0 *= rho / rho_new
        U_1 *= rho / rho_new
        U_2 *= rho / rho_new
        rho = rho_new
    else:
        warnings.warn("Objective did not converge.")

    covariance_ = np.array([linalg.pinvh(x) for x in K])
    return_list = [K, covariance_]
    if return_history:
        return_list.append(checks)
    if return_n_iter:
        return_list.append(iteration_ + 1)
    return return_list
def kernel_latent_time_graphical_lasso(
    emp_cov,
    alpha=0.01,
    tau=1.0,
    rho=1.0,
    kernel_psi=None,
    kernel_phi=None,
    max_iter=100,
    verbose=False,
    psi="laplacian",
    phi="laplacian",
    mode="admm",
    tol=1e-4,
    rtol=1e-4,
    assume_centered=False,
    n_samples=None,
    return_history=False,
    return_n_iter=True,
    update_rho_options=None,
    compute_objective=True,
    init="empirical",
):
    r"""Kernel time-varying latent variable graphical lasso solver (ADMM).

    Solves the following problem via ADMM:
        min sum_{i=1}^T -n_i log_likelihood(K_i-L_i) + alpha ||K_i||_{od,1}
            + tau ||L_i||_*
            + sum_{s>t}^T k_psi(s,t) Psi(K_s - K_t)
            + sum_{s>t}^T k_phi(s,t)(L_s - L_t)

    where S is the empirical covariance of the data matrix D (training
    observations by features). The weight of the pair (t, t + m) is read
    from the m-th upper diagonal of the corresponding kernel matrix.

    Parameters
    ----------
    emp_cov : ndarray, 3-dimensional
        Empirical covariances stacked along the first (time) axis.
    alpha, tau : float, optional
        Regularisation parameters.
    rho : float, optional
        Augmented Lagrangian parameter.
    kernel_psi, kernel_phi : ndarray, optional
        Square matrices of temporal weights; identity when None.
    max_iter : int, optional
        Maximum number of iterations.
    verbose : bool, optional
        Print the convergence diagnostics at each iteration.
    psi, phi : optional
        Forwarded to ``check_norm_prox`` to pick the temporal penalties.
    mode, assume_centered : optional
        Not used by this solver.
    tol : float, optional
        Absolute tolerance for convergence.
    rtol : float, optional
        Relative tolerance for convergence.
    n_samples : ndarray, optional
        Per-time-point weights; all ones when None.
    return_history : bool, optional
        Return the history of computed values.
    return_n_iter : bool, optional
        Append the index of the last iteration executed.
    update_rho_options : dict, optional
        Extra keyword arguments for ``update_rho``.
    compute_objective : bool, optional
        When False the objective is recorded as NaN.
    init : optional
        Forwarded to ``init_precision`` as ``mode``.

    Returns
    -------
    list
        ``[Z_0, W_0, covariance_]`` where ``covariance_`` stacks the
        pseudo-inverse of every matrix in ``Z_0``; followed by the
        convergence history if ``return_history`` and by the zero-based
        index of the last iteration if ``return_n_iter``.
    """

    def _symmetrise(stack):
        # In place: average every matrix of the stack with its transpose.
        stack += stack.transpose(0, 2, 1)
        stack /= 2.0

    def _split_pair(prox, node_penalty, weights, left, right, penalty):
        # Proximal step shared by the lagged Z and W pair updates.
        if node_penalty:
            return prox(
                np.concatenate((left, right), axis=1),
                lamda=0.5 * weights / penalty,
                rho=penalty,
                tol=tol,
                rtol=rtol,
                max_iter=max_iter,
            )
        shrunk = prox(right - left, lamda=2.0 * weights / penalty)
        return 0.5 * (left + right - shrunk), 0.5 * (left + right + shrunk)

    psi, prox_psi, psi_node_penalty = check_norm_prox(psi)
    phi, prox_phi, phi_node_penalty = check_norm_prox(phi)
    n_times, _, n_features = emp_cov.shape
    lags = range(1, n_times)

    if kernel_psi is None:
        kernel_psi = np.eye(n_times)
    if kernel_phi is None:
        kernel_phi = np.eye(n_times)
    if n_samples is None:
        n_samples = np.ones(n_times)

    Z_0 = init_precision(emp_cov, mode=init)
    W_0 = np.zeros_like(Z_0)
    X_0 = np.zeros_like(Z_0)
    R_old = np.zeros_like(Z_0)

    # One (left, right) pair per time lag for every auxiliary variable.
    Z_M = {lag: (Z_0.copy()[:-lag], Z_0.copy()[lag:]) for lag in lags}

    def _zero_pairs():
        return {
            lag: (np.zeros_like(left), np.zeros_like(right))
            for lag, (left, right) in Z_M.items()
        }

    W_M, Y_M, U_M = _zero_pairs(), _zero_pairs(), _zero_pairs()
    Z_M_old, W_M_old = _zero_pairs(), _zero_pairs()

    abs_eps = n_features * np.sqrt(n_times * (2 * n_times - 1)) * tol

    checks = []
    for iteration_ in range(max_iter):
        # update R
        target = Z_0 - W_0 - X_0
        _symmetrise(target)
        target *= -rho / n_samples[:, None, None]
        target += emp_cov
        R = np.array([
            prox_logdet(mat, lamda=count / rho)
            for mat, count in zip(target, n_samples)
        ])

        # update Z_0
        target = R + W_0 + X_0
        for lag in lags:
            target[:-lag] += Z_M[lag][0] - Y_M[lag][0]
            target[lag:] += Z_M[lag][1] - Y_M[lag][1]
        target /= n_times
        Z_0 = soft_thresholding(target, lamda=alpha / (rho * n_times))

        # update W_0
        target = Z_0 - R - X_0
        for lag in lags:
            target[:-lag] += W_M[lag][0] - U_M[lag][0]
            target[lag:] += W_M[lag][1] - U_M[lag][1]
        target /= n_times
        _symmetrise(target)
        W_0 = np.array([
            prox_trace_indicator(mat, lamda=tau / (rho * n_times))
            for mat in target
        ])

        # update residuals
        X_0 += R - Z_0 + W_0

        for lag in lags:
            # other Zs
            y_left, y_right = Y_M[lag]
            z_left, z_right = _split_pair(
                prox_psi, psi_node_penalty,
                np.diag(kernel_psi, lag)[:, None, None],
                Z_0[:-lag] + y_left, Z_0[lag:] + y_right, rho)
            Z_M[lag] = (z_left, z_right)

            # the dual arrays live inside Y_M, so update them in place
            y_left += Z_0[:-lag] - z_left
            y_right += Z_0[lag:] - z_right

            # other Ws
            u_left, u_right = U_M[lag]
            w_left, w_right = _split_pair(
                prox_phi, phi_node_penalty,
                np.diag(kernel_phi, lag)[:, None, None],
                W_0[:-lag] + u_left, W_0[lag:] + u_right, rho)
            W_M[lag] = (w_left, w_right)

            # the dual arrays live inside U_M, so update them in place
            u_left += W_0[:-lag] - w_left
            u_right += W_0[lag:] - w_right

        # diagnostics, reporting, termination checks
        primal_sq = squared_norm(R - Z_0 + W_0) + sum(
            squared_norm(Z_0[:-lag] - Z_M[lag][0])
            + squared_norm(Z_0[lag:] - Z_M[lag][1])
            + squared_norm(W_0[:-lag] - W_M[lag][0])
            + squared_norm(W_0[lag:] - W_M[lag][1])
            for lag in lags)
        rnorm = np.sqrt(primal_sq)

        dual_sq = squared_norm(R - R_old) + sum(
            squared_norm(Z_M[lag][0] - Z_M_old[lag][0])
            + squared_norm(Z_M[lag][1] - Z_M_old[lag][1])
            + squared_norm(W_M[lag][0] - W_M_old[lag][0])
            + squared_norm(W_M[lag][1] - W_M_old[lag][1])
            for lag in lags)
        snorm = rho * np.sqrt(dual_sq)

        if compute_objective:
            obj = objective(emp_cov, n_samples, R, Z_0, Z_M, W_0, W_M,
                            alpha, tau, kernel_psi, kernel_phi, psi, phi)
        else:
            obj = np.nan

        split_scale = np.sqrt(
            squared_norm(R) + sum(
                squared_norm(Z_M[lag][0]) + squared_norm(Z_M[lag][1])
                + squared_norm(W_M[lag][0]) + squared_norm(W_M[lag][1])
                for lag in lags))
        consensus_scale = np.sqrt(
            squared_norm(Z_0 - W_0) + sum(
                squared_norm(Z_0[:-lag]) + squared_norm(Z_0[lag:])
                + squared_norm(W_0[:-lag]) + squared_norm(W_0[lag:])
                for lag in lags))
        dual_scale = np.sqrt(
            squared_norm(X_0) + sum(
                squared_norm(Y_M[lag][0]) + squared_norm(Y_M[lag][1])
                + squared_norm(U_M[lag][0]) + squared_norm(U_M[lag][1])
                for lag in lags))

        check = convergence(
            obj=obj,
            rnorm=rnorm,
            snorm=snorm,
            e_pri=abs_eps + rtol * max(split_scale, consensus_scale),
            e_dual=abs_eps + rtol * rho * dual_scale,
        )

        R_old = R.copy()
        for lag in lags:
            Z_M_old[lag] = (Z_M[lag][0].copy(), Z_M[lag][1].copy())
            W_M_old[lag] = (W_M[lag][0].copy(), W_M[lag][1].copy())

        if verbose:
            print("obj: %.4f, rnorm: %.4f, snorm: %.4f,"
                  "eps_pri: %.4f, eps_dual: %.4f" % check[:5])

        checks.append(check)
        if check.rnorm <= check.e_pri and check.snorm <= check.e_dual:
            break

        rho_new = update_rho(rho, rnorm, snorm, iteration=iteration_,
                             **(update_rho_options or {}))
        # scaled dual variables should be also rescaled
        rescale = rho / rho_new
        X_0 *= rescale
        for lag in lags:
            y_left, y_right = Y_M[lag]
            y_left *= rescale
            y_right *= rescale
            u_left, u_right = U_M[lag]
            u_left *= rescale
            u_right *= rescale
        rho = rho_new
    else:
        warnings.warn("Objective did not converge.")

    covariance_ = np.array([linalg.pinvh(precision) for precision in Z_0])
    return_list = [Z_0, W_0, covariance_]
    if return_history:
        return_list.append(checks)
    if return_n_iter:
        return_list.append(iteration_)
    return return_list
def inequality_time_graphical_lasso(S, K_init, max_iter, loss, C, theta,
                                    c_prox, rho, div, psi, gamma, tol, rtol,
                                    verbose, return_history, return_n_iter,
                                    mode, compute_objective, stop_at,
                                    stop_when, update_rho_options, init):
    """Inequality constrained time-varying graphical LASSO solver.

    Solves the following problem via ADMM:
        min  theta * sum_{i=1}^T ||K_i||_{od,1}
             + (1 - theta) * sum_{i=2}^T Psi(K_i - K_{i-1})
        s.t. loss(S_i, K_i) <= C_i for i = 1, ..., T

    where S_i = (1/n_i) X_i^T X_i is the empirical covariance of data matrix X
    (training observations by features). The theta / (1 - theta) weights are
    the ones used by the two proximal steps in the loop below.

    Parameters
    ----------
    S : ndarray
        Empirical covariances, indexed by time along the first axis
        (presumably shape (n_times, n_features, n_features) -- TODO confirm).
    K_init : ndarray
        Initial precision matrices, same layout as ``S``. Used as the
        starting point of the consensus variable.
    max_iter : int
        Maximum number of iterations.
    loss : str
        ``'LL'`` selects ``neg_logl``; any other value selects ``dtrace``.
    C : array-like
        Upper bounds on the loss; ``C[i]`` is the bound for time ``i``.
    theta : float
        Trade-off weight: the l1 term is thresholded with ``theta`` and the
        temporal term with ``1 - theta``.
    c_prox : {'cvx', 'grad'}
        Routine used to pull an infeasible iterate back (``prox_cvx`` or
        ``prox_grad``). Any other value leaves infeasible iterates untouched.
    rho : float
        Augmented Lagrangian parameter. It is kept fixed for the whole run.
    div : object
        Forwarded unchanged to ``prox_cvx``.
    psi : object
        Temporal penalty specification, resolved by ``check_norm_prox``.
    tol : float
        Absolute tolerance for convergence.
    rtol : float
        Relative tolerance for convergence.
    verbose : bool
        Print diagnostics at each iteration and a summary at the end.
    return_history : bool
        Return the history of computed values.
    return_n_iter : bool
        Return the number of completed iterations.
    stop_at : float or None
        If not None, stop as soon as the relative distance between the
        current objective and ``stop_at`` is below ``stop_when``.
    stop_when : float
        Relative threshold used together with ``stop_at``.
    gamma, mode, compute_objective, update_rho_options, init
        Not used by this function; accepted for interface compatibility.

    Returns
    -------
    list
        ``[K, covariance]`` where ``K`` holds the precision matrices and
        ``covariance`` their pseudo-inverses, followed by the list of
        convergence records if ``return_history`` and by the number of
        completed iterations if ``return_n_iter``.

    """
    psi, prox_psi, psi_node_penalty = check_norm_prox(psi)
    loss_function = neg_logl if loss == 'LL' else dtrace

    Z_0 = K_init  # init_precision(S, mode=init)
    Z_1 = Z_0.copy()[:-1]
    Z_2 = Z_0.copy()[1:]
    U_1 = np.zeros_like(Z_1)
    U_2 = np.zeros_like(Z_2)

    Z_0_old = np.zeros_like(Z_0)
    Z_1_old = np.zeros_like(Z_1)
    Z_2_old = np.zeros_like(Z_2)

    # divisor for consensus variables, accounting for one less matrix
    # for t = 0 and t = T
    divisor = np.full(S.shape[0], 2, dtype=float)
    divisor[0] -= 1
    divisor[-1] -= 1

    checks = [convergence(obj=penalty_objective(Z_0, Z_1, Z_2, psi, theta))]
    # defined up front so that max_iter == 0 does not raise a NameError below
    check = None
    iteration_ = -1
    for iteration_ in range(max_iter):
        # consensus update: symmetrised average followed by soft thresholding
        A_K_pen = np.zeros_like(Z_0)
        A_K_pen[:-1] += Z_1 - U_1
        A_K_pen[1:] += Z_2 - U_2
        A_K_pen += A_K_pen.transpose(0, 2, 1)
        A_K_pen /= 2.

        Z_0 = soft_thresholding_od(A_K_pen / divisor[:, None, None],
                                   lamda=theta / (rho * divisor))

        # check feasibility and perform line search if necessary
        losses_all = loss_gen(loss_function, S, Z_0)
        feasibility_check = losses_all > C
        infeasible_indices = list(
            compress(range(len(feasibility_check)), feasibility_check))

        for i in infeasible_indices:
            if c_prox == 'cvx':
                Z_0[i], _ = prox_cvx(loss_function, S[i], Z_0[i],
                                     Z_0_old[i], C[i], div)
            elif c_prox == 'grad':
                Z_0[i], _ = prox_grad(loss_function, S[i], Z_0[i],
                                      Z_0_old[i], C[i], 0.)

        # bail out if losses post-correction blow up: fall back on the
        # previous iterate, so only `iteration_` iterations were completed
        losses_all_new = loss_gen(loss_function, S, Z_0)
        if np.inf in losses_all_new:
            warnings.warn(
                "Loss is infinite at iteration %d; returning the previous "
                "iterate." % iteration_)
            covariance_ = np.array([linalg.pinvh(x) for x in Z_0_old])
            return_list = [Z_0_old, covariance_]
            if return_history:
                return_list.append(checks)
            if return_n_iter:
                return_list.append(iteration_)
            return return_list

        # other Zs
        A_1 = Z_0[:-1] + U_1
        A_2 = Z_0[1:] + U_2
        if not psi_node_penalty:
            prox_e = prox_psi(A_2 - A_1, lamda=2. * (1 - theta) / rho)
            Z_1 = .5 * (A_1 + A_2 - prox_e)
            Z_2 = .5 * (A_1 + A_2 + prox_e)
        else:
            Z_1, Z_2 = prox_psi(np.concatenate((A_1, A_2), axis=1),
                                lamda=.5 * (1 - theta) / rho, rho=rho,
                                tol=tol, rtol=rtol, max_iter=max_iter)

        # update residuals
        U_1 += Z_0[:-1] - Z_1
        U_2 += Z_0[1:] - Z_2

        # diagnostics, reporting, termination checks
        rnorm = np.sqrt(
            squared_norm(Z_0[:-1] - Z_1) + squared_norm(Z_0[1:] - Z_2))

        snorm = rho * np.sqrt(
            squared_norm(Z_1 - Z_1_old) + squared_norm(Z_2 - Z_2_old))

        obj = penalty_objective(Z_0, Z_1, Z_2, psi, theta)

        check = convergence(
            obj=obj, rnorm=rnorm, snorm=snorm,
            e_pri=np.sqrt(losses_all_new.size + 2 * Z_1.size) * tol + rtol *
            (max(np.sqrt(squared_norm(losses_all_new)),
                 np.sqrt(squared_norm(C))) +
             max(np.sqrt(squared_norm(Z_1)),
                 np.sqrt(squared_norm(Z_0[:-1]))) +
             max(np.sqrt(squared_norm(Z_2)),
                 np.sqrt(squared_norm(Z_0[1:])))),
            e_dual=np.sqrt(2 * Z_1.size) * tol + rtol * rho *
            np.sqrt(squared_norm(U_1) + squared_norm(U_2)))

        Z_0_old = Z_0.copy()
        Z_1_old = Z_1.copy()
        Z_2_old = Z_2.copy()

        if verbose:
            print("obj: %.4f, rnorm: %.4f, snorm: %.4f,"
                  "eps_pri: %.4f, eps_dual: %.4f" % check[:5])

        checks.append(check)

        if stop_at is not None:
            # multiplied form of |obj - stop_at| / |stop_at| < stop_when,
            # which stays well defined when stop_at == 0
            if abs(check.obj - stop_at) < stop_when * abs(stop_at):
                break

        if check.rnorm <= check.e_pri and check.snorm <= check.e_dual:
            break

        # rho is intentionally not updated between iterations, hence the
        # scaled dual variables U_1 and U_2 need no rescaling
    else:
        warnings.warn("Objective did not converge.")

    if verbose and check is not None:
        # final summary; the objective is evaluated on the consensus
        # variable alone (Z_0 plays the role of Z_1 and Z_2)
        print(iteration_,
              penalty_objective(Z_0, Z_0[:-1], Z_0[1:], psi, theta))
        print(check.rnorm, check.e_pri)
        print(check.snorm, check.e_dual)

    covariance_ = np.array([linalg.pinvh(x) for x in Z_0])
    return_list = [Z_0, covariance_]
    if return_history:
        return_list.append(checks)
    if return_n_iter:
        return_list.append(iteration_ + 1)
    return return_list
def time_latent_graph_lasso(emp_cov, alpha=1., tau=1., rho=1., beta=1.,
                            eta=1., max_iter=1000, verbose=False,
                            psi='laplacian', phi='laplacian',
                            assume_centered=False, tol=1e-4, rtol=1e-2,
                            return_history=False, return_n_iter=True,
                            mode=None):
    r"""Time-varying latent variable graphical lasso solver.

    Solves the following problem via ADMM:
        min sum_{i=1}^T -n_i log_likelihood(K_i-L_i) + alpha ||K_i||_{od,1}
            + tau ||L_i||_*
            + beta sum_{i=2}^T Psi(K_i - K_{i-1})
            + eta sum_{i=2}^T Phi(L_i - L_{i-1})

    where S is the empirical covariance of the data matrix D (training
    observations by features). Every time point is given unit weight
    (n_i = 1) because only the empirical covariances are available here.

    Parameters
    ----------
    emp_cov : ndarray
        Empirical covariances, indexed by time along the first axis
        (presumably shape (n_times, n_features, n_features) -- TODO confirm).
    alpha, tau : float, optional
        Regularisation parameters.
    rho : float, optional
        Augmented Lagrangian parameter.
    beta, eta : float, optional
        Weights of the temporal penalties Psi and Phi respectively.
    max_iter : int, optional
        Maximum number of iterations.
    verbose : bool, optional
        Print info at each iteration.
    psi, phi : optional
        Temporal penalty specifications, resolved by ``check_norm_prox``.
    tol : float, optional
        Absolute tolerance for convergence.
    rtol : float, optional
        Relative tolerance for convergence.
    return_history : bool, optional
        Return the history of computed values.
    return_n_iter : bool, optional
        Return the index of the last iteration performed.
    assume_centered, mode
        Not used by this function; accepted for interface compatibility.

    Returns
    -------
    K, L : numpy.array, 3-dimensional (T x d x d)
        Solution to the problem for each time t=1...T .
    emp_cov : ndarray
        The input empirical covariances, returned unchanged.
    history : list
        If return_history, then also a structure that contains the
        objective value, the primal and dual residual norms, and tolerances
        for the primal and dual residual norms at each iteration.
    n_iter : int
        If return_n_iter, the zero-based index of the last iteration.

    """
    # check_norm_prox yields three values everywhere else in this file:
    # the norm, its prox and a flag telling whether it is a node penalty
    psi, prox_psi, psi_node_penalty = check_norm_prox(psi)
    phi, prox_phi, phi_node_penalty = check_norm_prox(phi)

    n_times = emp_cov.shape[0]
    # one weight per time point, so that it pairs up with every matrix
    n_samples = np.ones(n_times)

    K = np.zeros_like(emp_cov)
    L = np.zeros_like(emp_cov)
    X = np.zeros_like(emp_cov)
    Z_0 = np.zeros_like(emp_cov)
    Z_1 = np.zeros_like(emp_cov)[:-1]
    Z_2 = np.zeros_like(emp_cov)[1:]
    W_0 = np.zeros_like(emp_cov)
    W_1 = np.zeros_like(emp_cov)[:-1]
    W_2 = np.zeros_like(emp_cov)[1:]
    U_0 = np.zeros_like(emp_cov)
    U_1 = np.zeros_like(emp_cov)[:-1]
    U_2 = np.zeros_like(emp_cov)[1:]
    Y_0 = np.zeros_like(emp_cov)
    Y_1 = np.zeros_like(emp_cov)[:-1]
    Y_2 = np.zeros_like(emp_cov)[1:]

    Z_consensus_old = np.zeros_like(emp_cov)
    W_consensus_old = np.zeros_like(emp_cov)
    R_old = np.zeros_like(emp_cov)

    # divisor for consensus variables, accounting for two less matrices
    divisor = np.zeros(n_times) + 3
    divisor[0] -= 1
    divisor[-1] -= 1

    checks = []
    iteration_ = 0  # keeps the return value defined when max_iter == 0
    for iteration_ in range(max_iter):
        # update R
        A = K - L - X
        A *= -rho / n_samples[:, None, None]
        A += emp_cov
        # explicit list: np.array(map(...)) does not build a numeric
        # array on Python 3, where map returns an iterator
        R = np.array([
            prox_logdet(a, lamda)
            for a, lamda in zip(A, n_samples / rho)])

        # update K, L
        K = L + R + X + Z_0 - U_0
        K[:-1] += Z_1 - U_1
        K[1:] += Z_2 - U_2
        K /= divisor[:, None, None] + 1

        L = K - R - X + W_0 - Y_0
        L[:-1] += W_1 - Y_1
        L[1:] += W_2 - Y_2
        L /= divisor[:, None, None] + 1

        # update Z_0
        Z_0 = np.array([
            soft_thresholding_sign(k, lamda=alpha / rho) for k in K + U_0])

        # update Z_1, Z_2
        A_1 = K[:-1] + U_1
        A_2 = K[1:] + U_2
        if not psi_node_penalty:
            prox_e = prox_psi(A_2 - A_1, lamda=2. * beta / rho)
            Z_1 = .5 * (A_1 + A_2 - prox_e)
            Z_2 = .5 * (A_1 + A_2 + prox_e)
        else:
            # same calling convention as the other solvers in this file
            Z_1, Z_2 = prox_psi(np.concatenate((A_1, A_2), axis=1),
                                lamda=.5 * beta / rho, rho=rho, tol=tol,
                                rtol=rtol, max_iter=max_iter)

        # update W_0
        W_0 = np.array([
            prox_trace_indicator(a, lamda=tau / rho) for a in L + Y_0])

        # update W_1, W_2
        A_1 = L[:-1] + Y_1
        A_2 = L[1:] + Y_2
        if not phi_node_penalty:
            prox_e = prox_phi(A_2 - A_1, lamda=2. * eta / rho)
            W_1 = .5 * (A_1 + A_2 - prox_e)
            W_2 = .5 * (A_1 + A_2 + prox_e)
        else:
            W_1, W_2 = prox_phi(np.concatenate((A_1, A_2), axis=1),
                                lamda=.5 * eta / rho, rho=rho, tol=tol,
                                rtol=rtol, max_iter=max_iter)

        # update residuals
        X += R - K + L
        U_0 += (K - Z_0)
        U_1 += (K[:-1] - Z_1)
        U_2 += (K[1:] - Z_2)
        Y_0 += (L - W_0)
        Y_1 += (L[:-1] - W_1)
        Y_2 += (L[1:] - W_2)

        # diagnostics, reporting, termination checks
        Z_consensus = Z_0.copy()
        Z_consensus[:-1] += Z_1
        Z_consensus[1:] += Z_2
        Z_consensus /= divisor[:, None, None]

        U_consensus = U_0.copy()
        U_consensus[:-1] += U_1
        U_consensus[1:] += U_2
        U_consensus /= divisor[:, None, None]

        W_consensus = W_0.copy()
        W_consensus[:-1] += W_1
        W_consensus[1:] += W_2
        W_consensus /= divisor[:, None, None]

        Y_consensus = Y_0.copy()
        Y_consensus[:-1] += Y_1
        Y_consensus[1:] += Y_2
        Y_consensus /= divisor[:, None, None]

        # NOTE(review): the other `objective` call in this file passes
        # (emp_cov, n_samples, R, Z_0, Z_M, W_0, W_M, ...); confirm that
        # this argument layout matches the `objective` in scope.
        check = convergence(
            obj=objective(n_samples, emp_cov, R, Z_0, Z_1, Z_2, W_0, W_1,
                          W_2, alpha, tau, beta, eta, psi, phi),
            rnorm=np.sqrt(
                squared_norm(K - Z_consensus) +
                squared_norm(L - W_consensus) +
                squared_norm(K - L - R)),
            snorm=np.sqrt(
                squared_norm(rho * (Z_consensus - Z_consensus_old)) +
                squared_norm(rho * (W_consensus - W_consensus_old)) +
                squared_norm(rho * (R - R_old))),
            e_pri=np.sqrt(np.prod(K.shape) * 3) * tol + rtol * max(
                np.sqrt(
                    squared_norm(K) + squared_norm(L) +
                    squared_norm(K - L)),
                np.sqrt(
                    squared_norm(Z_consensus) +
                    squared_norm(W_consensus) + squared_norm(R))),
            e_dual=np.sqrt(np.prod(K.shape) * 3) * tol + rtol * np.sqrt(
                squared_norm(rho * (U_consensus)) +
                squared_norm(rho * (Y_consensus)) +
                squared_norm(rho * (X))))

        Z_consensus_old = Z_consensus.copy()
        W_consensus_old = W_consensus.copy()
        R_old = R.copy()

        if verbose:
            # only the first five fields are formatted, as the other
            # solvers in this file do
            print("obj: %.4f, rnorm: %.4f, snorm: %.4f,"
                  "eps_pri: %.4f, eps_dual: %.4f" % check[:5])

        checks.append(check)
        if check.rnorm <= check.e_pri and check.snorm <= check.e_dual:
            break
    else:
        warnings.warn("Objective did not converge.")

    return_list = [K, L, emp_cov]
    if return_history:
        return_list.append(checks)
    if return_n_iter:
        return_list.append(iteration_)
    return return_list