def test_soft_thresholding_od():
    """Test soft_thresholding_od function."""
    # matrix OD
    array = np.arange(9).reshape(3, 3)
    output = np.array([[0, 0, 1], [2, 4, 4], [5, 6, 8]])
    assert_array_equal(prox.soft_thresholding_od(array, 1), output)

    # tensor OD
    array = np.arange(27).reshape(3, 3, 3)
    output = np.array(
        [
            [[0, 0, 1], [2, 4, 4], [5, 6, 8]],
            [[9, 9, 10], [11, 13, 13], [14, 15, 17]],
            [[18, 18, 19], [20, 22, 22], [23, 24, 26]],
        ])
    assert_array_equal(prox.soft_thresholding_od(array, 1), output)

    # tensor OD, lamda is a list
    array = np.arange(27).reshape(3, 3, 3)
    output = np.array(
        [
            [[0, 0, 1], [2, 4, 4], [5, 6, 8]],
            [[9, 8, 9], [10, 13, 12], [13, 14, 17]],
            [[18, 16, 17], [18, 22, 20], [21, 22, 26]],
        ])
    assert_array_equal(
        prox.soft_thresholding_od(array, np.arange(1, 4)), output)
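# A minimal reference sketch of the operator exercised above (hypothetical,
# for illustration only; the actual `prox.soft_thresholding_od` may differ
# in broadcasting details): soft-threshold every off-diagonal entry towards
# zero and leave the diagonal untouched, with one lamda per time point in
# the tensor case.
def _soft_thresholding_od_sketch(a, lamda):
    import numpy as np  # local import so the sketch stands alone

    a = np.asarray(a, dtype=float)
    if a.ndim == 2:
        a = a[None, ...]
    # one threshold per leading (time) slice; a scalar broadcasts to all
    lamda = np.broadcast_to(
        np.asarray(lamda, dtype=float).reshape(-1, 1, 1),
        (a.shape[0], 1, 1))
    out = np.sign(a) * np.maximum(np.abs(a) - lamda, 0.)
    for t in range(a.shape[0]):
        out[t][np.diag_indices(a.shape[1])] = np.diag(a[t])  # keep diagonal
    return np.squeeze(out)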
def choose_gamma(
        gamma, x, beta, alpha, lamda, grad, function_f=None, delta=1e-4,
        eps=0.5, max_iter=1000, p=1, x_inv=None, choose="gamma",
        laplacian_penalty=False):
    """Choose gamma for backtracking.

    References
    ----------
    Salzo S. (2017). https://doi.org/10.1137/16M1073741
    """
    fx = function_f(K=x)
    for i in range(max_iter):
        if laplacian_penalty:
            prox = soft_thresholding_od(x - gamma * grad, alpha * gamma)
        else:
            prox = prox_FL(
                x - gamma * grad, beta * gamma, alpha * gamma, p=p,
                symmetric=True)

        if positive_definite(prox) and choose != "gamma":
            break

        if choose == "gamma":
            y_minus_x = prox - x
            loss_diff = function_f(K=x + lamda * y_minus_x) - fx

            tolerance = _scalar_product(y_minus_x, grad)
            tolerance += delta / gamma * _scalar_product(y_minus_x, y_minus_x)
            if loss_diff <= lamda * tolerance:
                break
        gamma *= eps

    return gamma, prox
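# A note on the search above (reading inferred from the code, not from a
# separate specification): with y = prox(x - gamma * grad), the
# `choose == "gamma"` branch is an Armijo-style sufficient-decrease test,
#
#     f(x + lamda*(y - x)) - f(x)
#         <= lamda * (<grad, y - x> + (delta/gamma) * ||y - x||^2),
#
# and gamma is shrunk by the factor eps (default 0.5) until it holds,
# following the backtracking analysis of Salzo (2017) cited above.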
def tgl_forward_backward(
        emp_cov, alpha=0.01, beta=1., max_iter=100, n_samples=None,
        verbose=False, tol=1e-4, delta=1e-4, gamma=1., lamda=1., eps=0.5,
        debug=False, return_history=False, return_n_iter=True,
        choose='gamma', lamda_criterion='b', time_norm=1,
        compute_objective=True, return_n_linesearch=False, vareps=1e-5,
        stop_at=None, stop_when=1e-4, laplacian_penalty=False,
        init='empirical'):
    """Time-varying graphical lasso solver with forward-backward splitting.

    Solves the following problem via FBS:
        min sum_{i=1}^T -n_i log_likelihood(S_i, K_i) + alpha*||K_i||_{od,1}
            + beta sum_{i=2}^T Psi(K_i - K_{i-1})

    where S_i = (1/n_i) X_i^T X_i is the empirical covariance of data
    matrix X (training observations by features).

    Parameters
    ----------
    emp_cov : ndarray, shape (n_times, n_features, n_features)
        Empirical covariance of data.
    alpha, beta : float, optional
        Regularisation parameters.
    max_iter : int, optional
        Maximum number of iterations.
    n_samples : ndarray
        Number of samples available for each time point.
    verbose : bool, default False
        Print info at each iteration.
    tol : float, optional
        Absolute tolerance for convergence.
    delta, gamma, lamda, eps : float, optional
        FBS parameters.
    debug : bool, default False
        Run in debug mode.
    return_history : bool, optional
        Return the history of computed values.
    return_n_iter : bool, optional
        Return the number of iterations before convergence.
    choose : ('gamma', 'lamda', 'fixed', 'both')
        Search iteratively gamma / lamda / none / both.
    lamda_criterion : ('a', 'b', 'c')
        Criterion to choose lamda. See ref for details.
    time_norm : float, optional
        Choose the temporal norm between points.
    compute_objective : bool, default True
        Choose to compute the objective value.
    return_n_linesearch : bool, optional
        Return the number of line-search iterations before convergence.
    vareps : float, optional
        Jitter for the loss.
    stop_at, stop_when : float, optional
        Other convergence criteria, as used in the paper.
    laplacian_penalty : bool, default False
        Use Laplacian penalty.
    init : {'empirical', 'zero', ndarray}
        Choose how to initialize the precision matrix, with the inverse
        empirical covariance, zero matrix or precomputed.

    Returns
    -------
    K, covariance : numpy.array, 3-dimensional (T x d x d)
        Solution to the problem for each time t=1...T.
    history : list
        If return_history, then also a structure that contains the
        objective value, the primal and dual residual norms, and tolerances
        for the primal and dual residual norms at each iteration.

    """
    available_choose = ('gamma', 'lamda', 'fixed', 'both')
    if choose not in available_choose:
        raise ValueError(
            "`choose` parameter must be one of %s." % available_choose)

    n_times, _, n_features = emp_cov.shape
    K = init_precision(emp_cov, mode=init)

    if laplacian_penalty:
        obj_partial = partial(
            objective_laplacian, n_samples=n_samples, emp_cov=emp_cov,
            alpha=alpha, beta=beta, vareps=vareps)
        function_f = partial(
            loss_laplacian, beta=beta, n_samples=n_samples, S=emp_cov,
            vareps=vareps)
        gradient_f = partial(
            grad_loss_laplacian, emp_cov=emp_cov, beta=beta,
            n_samples=n_samples, vareps=vareps)
        function_g = partial(penalty_laplacian, alpha=alpha)
    else:
        psi = partial(vector_p_norm, p=time_norm)
        obj_partial = partial(
            objective, n_samples=n_samples, emp_cov=emp_cov, alpha=alpha,
            beta=beta, psi=psi, vareps=vareps)
        function_f = partial(
            loss, n_samples=n_samples, S=emp_cov, vareps=vareps)
        gradient_f = partial(
            grad_loss, emp_cov=emp_cov, n_samples=n_samples, vareps=vareps)
        function_g = partial(penalty, alpha=alpha, beta=beta, psi=psi)

    max_residual = -np.inf
    n_linesearch = 0
    checks = [convergence(obj=obj_partial(precision=K))]
    for iteration_ in range(max_iter):
        k_previous = K.copy()
        x_inv = np.array([linalg.pinvh(x) for x in K])
        grad = gradient_f(K, x_inv=x_inv)

        if choose in ['gamma', 'both']:
            gamma, y = choose_gamma(
                gamma / eps if iteration_ > 0 else gamma, K,
                function_f=function_f, beta=beta, alpha=alpha, lamda=lamda,
                grad=grad, delta=delta, eps=eps, max_iter=200, p=time_norm,
                x_inv=x_inv, choose=choose,
                laplacian_penalty=laplacian_penalty)

        x_hat = K - gamma * grad
        if choose not in ['gamma', 'both']:
            if laplacian_penalty:
                y = soft_thresholding_od(x_hat, alpha * gamma)
            else:
                y = prox_FL(
                    x_hat, beta * gamma, alpha * gamma, p=time_norm,
                    symmetric=True)

        if choose in ('lamda', 'both'):
            lamda, n_ls = choose_lamda(
                min(lamda / eps if iteration_ > 0 else lamda, 1), K,
                function_f=function_f, objective_f=obj_partial,
                gradient_f=gradient_f, function_g=function_g, gamma=gamma,
                delta=delta, eps=eps, criterion=lamda_criterion,
                max_iter=200, p=time_norm, grad=grad, prox=y, vareps=vareps)
            n_linesearch += n_ls

        K = K + min(max(lamda, 0), 1) * (y - K)
        # K, t = fista_step(Y, Y - Y_old, t)

        check = convergence(
            obj=obj_partial(precision=K),
            rnorm=np.linalg.norm(
                upper_diag_3d(K) - upper_diag_3d(k_previous)),
            snorm=np.linalg.norm(
                obj_partial(precision=K) - obj_partial(precision=k_previous)),
            e_pri=np.sqrt(upper_diag_3d(K).size) * tol + tol * max(
                np.linalg.norm(upper_diag_3d(K)),
                np.linalg.norm(upper_diag_3d(k_previous))),
            e_dual=tol)

        if verbose and iteration_ % (50 if verbose < 2 else 1) == 0:
            print(
                "obj: %.4f, rnorm: %.7f, snorm: %.4f,"
                "eps_pri: %.4f, eps_dual: %.4f" % check[:5])

        if return_history:
            checks.append(check)

        if np.isnan(check.rnorm) or np.isnan(check.snorm):
            warnings.warn("precision is not positive definite.")

        if stop_at is not None:
            if abs(check.obj - stop_at) / abs(stop_at) < stop_when:
                break
        else:
            # use this convergence criterion
            subgrad = (x_hat - K) / gamma
            if 0:
                if laplacian_penalty:
                    grad = grad_loss_laplacian(
                        K, emp_cov, n_samples, vareps=vareps)
                else:
                    grad = grad_loss(K, emp_cov, n_samples, vareps=vareps)
                res_norm = np.linalg.norm(grad + subgrad)

                if iteration_ == 0:
                    normalizer = res_norm + 1e-6
                    max_residual = max(
                        np.linalg.norm(grad),
                        np.linalg.norm(subgrad)) + 1e-6
            else:
                res_norm = np.linalg.norm(K - k_previous) / gamma
                max_residual = max(max_residual, res_norm)
                normalizer = max(
                    np.linalg.norm(grad), np.linalg.norm(subgrad)) + 1e-6

            r_rel = res_norm / max_residual
            r_norm = res_norm / normalizer

            if not debug and (r_rel <= tol or r_norm <= tol) \
                    and iteration_ > 0:
                # or (check.rnorm <= check.e_pri and iteration_ > 0):
                break
    else:
        warnings.warn("Objective did not converge.")

    covariance_ = np.array([linalg.pinvh(k) for k in K])
    return_list = [K, covariance_]
    if return_history:
        return_list.append(checks)
    if return_n_iter:
        return_list.append(iteration_ + 1)
    if return_n_linesearch:
        return_list.append(n_linesearch)
    return return_list
def _fit(X, alpha=1e-2, gamma=1e-3, tol=1e-3, max_iter=1000, verbose=0,
         return_history=True, compute_objective=True, warm_start=None,
         return_n_iter=False, adjust_gamma=False, A=None, T=0, rho=1,
         update_gamma=0.5, line_search=False):
    n, d = X.shape
    if warm_start is None:
        theta = np.zeros((d, d))
    else:
        theta = check_array(warm_start)

    thetas = [theta]
    theta_new = theta.copy()
    checks = []
    for iter_ in range(max_iter):
        theta_old = thetas[-1]
        if not line_search:
            grad = _gradient_ising(X, theta, n, A, rho, T)
            theta_new = theta - gamma * grad
            theta = (theta_new + theta_new.T) / 2
            theta = soft_thresholding_od(theta, alpha * gamma)
        else:
            while True:
                grad = _gradient_ising(X, theta, n, A, rho, T)
                theta_new = theta - gamma * grad
                theta = (theta_new + theta_new.T) / 2
                theta = soft_thresholding_od(theta, alpha * gamma)

                loss_new = loss(X, theta)
                loss_old = loss(X, theta_old)

                # line search: shrink gamma until sufficient decrease holds
                diff_theta2 = np.linalg.norm(theta_old - theta)**2
                grad_diff = np.trace(grad.dot(theta_old - theta))
                diff = loss_old - grad_diff + (diff_theta2 / (2 * gamma))
                if loss_new > diff or np.isinf(loss_new) or np.isnan(loss_new):
                    gamma = update_gamma * gamma
                    theta = theta_old - gamma * grad
                    theta = soft_thresholding_od(theta, alpha * gamma)
                    loss_new = loss(X, theta)
                    diff = loss_old - grad_diff + (diff_theta2 / (2 * gamma))
                else:
                    break

        thetas.append(theta)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            check = convergence(
                iter=iter_,
                obj=objective(X, theta, alpha),
                iter_norm=np.linalg.norm(thetas[-2] - thetas[-1]),
                iter_r_norm=(np.linalg.norm(thetas[-2] - thetas[-1]) /
                             np.linalg.norm(thetas[-1])))
        checks.append(check)
        # if adjust_gamma: # TODO multiply or divide
        if verbose:
            print('Iter: %d, objective: %.4f, iter_norm %.4f' %
                  (check[0], check[1], check[2]))

        if np.abs(check[2]) < tol:
            break

    return_list = [thetas[-1]]
    if return_history:
        return_list.append(thetas)
        return_list.append(checks)
    if return_n_iter:
        return_list.append(iter_)
    return return_list
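# Hypothetical usage sketch for `_fit` (not part of the original module).
# It assumes the module's `_gradient_ising` accepts the default `A=None`;
# the +/-1 spin data below is illustrative only.
def _example_fit():
    import numpy as np  # local import so the sketch stands alone

    X = np.sign(np.random.RandomState(0).randn(50, 5) + 1e-12)  # +/-1 spins
    theta, thetas, checks = _fit(X, alpha=1e-2, gamma=1e-3, max_iter=100)
    return theta, checks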
def inequality_time_graphical_lasso(
        S, K_init, max_iter, loss, C, theta, c_prox, rho, div, psi, gamma,
        tol, rtol, verbose, return_history, return_n_iter, mode,
        compute_objective, stop_at, stop_when, update_rho_options, init):
    """Inequality constrained time-varying graphical LASSO solver.

    Solves the following problem via ADMM:
        min sum_{i=1}^T theta * ||K_i||_{od,1}
            + (1 - theta) * sum_{i=2}^T Psi(K_i - K_{i-1})
        s.t. objective <= c_i for i = 1, ..., T

    where S_i = (1/n_i) X_i^T X_i is the empirical covariance of data
    matrix X (training observations by features).

    Parameters
    ----------
    S : ndarray, shape (n_times, n_features, n_features)
        Empirical covariance of data.
    K_init : ndarray, shape (n_times, n_features, n_features)
        Initial guess for the precision matrices.
    C : ndarray, shape (n_times,)
        Constraint values c_i.
    theta : float
        Trade-off between the ||.||_{od,1} penalty and the temporal
        penalty Psi.
    c_prox : ('cvx', 'grad')
        How to project infeasible time points back onto the constraint.
    rho : float, optional
        Augmented Lagrangian parameter.
    max_iter : int, optional
        Maximum number of iterations.
    gamma : float, optional
        Kernel parameter when psi is chosen to be 'kernel'.
    tol : float, optional
        Absolute tolerance for convergence.
    rtol : float, optional
        Relative tolerance for convergence.
    return_history : bool, optional
        Return the history of computed values.
    return_n_iter : bool, optional
        Return the number of iterations before convergence.
    verbose : bool, default False
        Print info at each iteration.
    update_rho_options : dict, optional
        Arguments for the rho update.
        See regain.update_rules.update_rho function for more information.
    compute_objective : bool, default True
        Choose to compute the objective value.
    init : {'empirical', 'zero', ndarray}
        Choose how to initialize the precision matrix, with the inverse
        empirical covariance, zero matrix or precomputed.

    Returns
    -------
    K : numpy.array, 3-dimensional (T x d x d)
        Solution to the problem for each time t=1...T.
    history : list
        If return_history, then also a structure that contains the
        objective value, the primal and dual residual norms, and tolerances
        for the primal and dual residual norms at each iteration.

    """
    psi, prox_psi, psi_node_penalty = check_norm_prox(psi)
    psi_name = psi.__name__

    if loss == 'LL':
        loss_function = neg_logl
    else:
        loss_function = dtrace

    Z_0 = K_init  # init_precision(S, mode=init)
    Z_1 = Z_0.copy()[:-1]
    Z_2 = Z_0.copy()[1:]

    U_1 = np.zeros_like(Z_1)
    U_2 = np.zeros_like(Z_2)

    Z_0_old = np.zeros_like(Z_0)
    Z_1_old = np.zeros_like(Z_1)
    Z_2_old = np.zeros_like(Z_2)

    # divisor for consensus variables, accounting for one less matrix
    # for t = 0 and t = T
    divisor = np.full(S.shape[0], 2, dtype=float)
    divisor[0] -= 1
    divisor[-1] -= 1

    out_obj = []

    checks = [convergence(obj=penalty_objective(Z_0, Z_1, Z_2, psi, theta))]

    for iteration_ in range(max_iter):
        A_K_pen = np.zeros_like(Z_0)
        A_K_pen[:-1] += Z_1 - U_1
        A_K_pen[1:] += Z_2 - U_2
        A_K_pen += A_K_pen.transpose(0, 2, 1)
        A_K_pen /= 2.

        Z_0 = soft_thresholding_od(
            A_K_pen / divisor[:, None, None],
            lamda=theta / (rho * divisor))

        # check feasibility and perform line search if necessary
        losses_all = loss_gen(loss_function, S, Z_0)
        feasibility_check = losses_all > C
        infeasible_indices = list(
            compress(range(len(feasibility_check)), feasibility_check))

        for i in infeasible_indices:
            if c_prox == 'cvx':
                Z_0[i], loss_i = prox_cvx(
                    loss_function, S[i], Z_0[i], Z_0_old[i], C[i], div)
            elif c_prox == 'grad':
                # the original branched on i > 0 with identical calls;
                # collapsed into one projection step
                Z_0[i], loss_i = prox_grad(
                    loss_function, S[i], Z_0[i], Z_0_old[i], C[i], 0.)

        # break if losses post-correction blow up
        losses_all_new = loss_gen(loss_function, S, Z_0)
        if np.inf in losses_all_new:
            print(iteration_, 'Inf')
            covariance_ = np.array([linalg.pinvh(x) for x in Z_0_old])
            return_list = [Z_0_old, covariance_]
            if return_history:
                return_list.append(checks)
            if return_n_iter:
                return_list.append(iteration_)
            return return_list

        # other Zs
        A_1 = Z_0[:-1] + U_1
        A_2 = Z_0[1:] + U_2
        if not psi_node_penalty:
            prox_e = prox_psi(A_2 - A_1, lamda=2. * (1 - theta) / rho)
            Z_1 = .5 * (A_1 + A_2 - prox_e)
            Z_2 = .5 * (A_1 + A_2 + prox_e)
        else:
            Z_1, Z_2 = prox_psi(
                np.concatenate((A_1, A_2), axis=1),
                lamda=.5 * (1 - theta) / rho,
                rho=rho, tol=tol, rtol=rtol, max_iter=max_iter)

        # update residuals
        U_1 += Z_0[:-1] - Z_1
        U_2 += Z_0[1:] - Z_2

        # diagnostics, reporting, termination checks
        rnorm = np.sqrt(
            squared_norm(Z_0[:-1] - Z_1) + squared_norm(Z_0[1:] - Z_2))

        snorm = rho * np.sqrt(
            squared_norm(Z_1 - Z_1_old) + squared_norm(Z_2 - Z_2_old))

        obj = penalty_objective(Z_0, Z_1, Z_2, psi, theta)

        check = convergence(
            obj=obj,
            rnorm=rnorm,
            snorm=snorm,
            e_pri=np.sqrt(losses_all_new.size + 2 * Z_1.size) * tol + rtol * (
                max(np.sqrt(squared_norm(losses_all_new)),
                    np.sqrt(squared_norm(C))) +
                max(np.sqrt(squared_norm(Z_1)),
                    np.sqrt(squared_norm(Z_0[:-1]))) +
                max(np.sqrt(squared_norm(Z_2)),
                    np.sqrt(squared_norm(Z_0[1:])))),
            e_dual=np.sqrt(2 * Z_1.size) * tol + rtol * rho *
            np.sqrt(squared_norm(U_1) + squared_norm(U_2)))

        Z_0_old = Z_0.copy()
        Z_1_old = Z_1.copy()
        Z_2_old = Z_2.copy()

        if verbose:
            print(
                "obj: %.4f, rnorm: %.4f, snorm: %.4f,"
                "eps_pri: %.4f, eps_dual: %.4f" % check[:5])

        out_obj.append(penalty_objective(Z_0, Z_0[:-1], Z_0[1:], psi, theta))
        checks.append(check)

        # if len(out_obj) > 100 and c_prox == 'grad':
        #     if (np.mean(out_obj[-11:-1]) - np.mean(out_obj[-10:])) < stop_when:
        #         print('obj break')
        #         break

        if stop_at is not None:
            if abs(check.obj - stop_at) / abs(stop_at) < stop_when:
                break

        if check.rnorm <= check.e_pri and check.snorm <= check.e_dual:
            break

        # rho_new = update_rho(
        #     rho, rnorm, snorm, iteration=iteration_,
        #     mu=1e2, tau_inc=1.01, tau_dec=1.01)
        # # **(update_rho_options or {}))
        # # scaled dual variables should be also rescaled
        # U_1 *= rho / rho_new
        # U_2 *= rho / rho_new
        # rho = rho_new
    else:
        warnings.warn("Objective did not converge.")
        print(iteration_, out_obj[-1])
        # print(out_obj)
        print(check.rnorm, check.e_pri)
        print(check.snorm, check.e_dual)

    covariance_ = np.array([linalg.pinvh(x) for x in Z_0])
    return_list = [Z_0, covariance_]
    if return_history:
        return_list.append(checks)
    if return_n_iter:
        return_list.append(iteration_ + 1)
    return return_list
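# A note on the `divisor` arrays used by this and the following solvers
# (reading inferred from the code): each Z_0[t] is the consensus average of
# its coupled copies, Z_1[t] (for t < T-1) and Z_2[t-1] (for t > 0).
# Interior time points receive two contributions and the endpoints only
# one, hence np.full(T, 2.) with divisor[0] -= 1 and divisor[-1] -= 1,
# i.e. divisor = [1, 2, ..., 2, 1]. The solvers below that also keep a
# separate K start from 3 instead of 2, because K[t] = Z_0[t] adds one
# more copy per time point.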
def _Z_0(x1, x2, Z_0, loss_res, nabla_con, nabla_pen):
    # nested helper; `theta` is taken from the enclosing scope
    A = Z_0 - x2 * (1 - theta) * nabla_pen
    # A = Z_0 - x1 * nabla_con - x2 * (1 - theta) * nabla_pen
    A -= x1 * loss_res[:, None, None] * nabla_con
    return soft_thresholding_od(A, lamda=x2 * theta), A
def taylor_time_graphical_lasso(
        S, K_init, max_iter, loss, C, theta, rho, mult, weights, m, eps,
        psi, gamma, tol, rtol, verbose, return_history, return_n_iter,
        mode, compute_objective, stop_at, stop_when, update_rho_options):
    """Equality constrained time-varying graphical LASSO solver.

    Solves the following problem via ADMM:
        min sum_{i=1}^T theta * ||K_i||_{od,1}
            + (1 - theta) * sum_{i=2}^T Psi(K_i - K_{i-1})
        s.t. objective = c_i for i = 1, ..., T

    where S_i = (1/n_i) X_i^T X_i is the empirical covariance of data
    matrix X (training observations by features).

    Parameters
    ----------
    S : ndarray, shape (n_times, n_features, n_features)
        Empirical covariance of data.
    K_init : ndarray, shape (n_times, n_features, n_features)
        Initial guess for the precision matrices.
    C : ndarray, shape (n_times,)
        Constraint values c_i.
    theta : float
        Trade-off between the ||.||_{od,1} penalty and the temporal
        penalty Psi.
    rho : float, optional
        Augmented Lagrangian parameter.
    mult : float, optional
        Multiplier used to rescale rho when the constraint residuals
        stagnate.
    weights : tuple
        Kernel specification ('rbf', 'exp' or 'lin' plus its parameter)
        for per-time-point rho weights, or (None, ...) to disable.
    m, eps : int, float
        Window length and residual threshold for the stagnation check.
    max_iter : int, optional
        Maximum number of iterations.
    gamma : float, optional
        Kernel parameter when psi is chosen to be 'kernel'.
    tol : float, optional
        Absolute tolerance for convergence.
    rtol : float, optional
        Relative tolerance for convergence.
    return_history : bool, optional
        Return the history of computed values.
    return_n_iter : bool, optional
        Return the number of iterations before convergence.
    verbose : bool, default False
        Print info at each iteration.
    update_rho_options : dict, optional
        Arguments for the rho update.
        See regain.update_rules.update_rho function for more information.
    compute_objective : bool, default True
        Choose to compute the objective value.

    Returns
    -------
    K : numpy.array, 3-dimensional (T x d x d)
        Solution to the problem for each time t=1...T.
    history : list
        If return_history, then also a structure that contains the
        objective value, the primal and dual residual norms, and tolerances
        for the primal and dual residual norms at each iteration.

    """
    psi, prox_psi, psi_node_penalty = check_norm_prox(psi)

    if loss == 'LL':
        loss_func = neg_logl
    else:
        loss_func = dtrace

    T = S.shape[0]
    S_flat = S.copy().reshape(T, S.shape[1] * S.shape[2])
    I_flat = np.eye(S.shape[1]).ravel()  # flattened identity
    # (the original read `np.diagflat(S.shape[1])`, which builds a 1x1
    # array of the integer d rather than the d x d identity)

    K = K_init.copy()
    Z_0 = K_init.copy()
    Z_1 = Z_0.copy()[:-1]
    Z_2 = Z_0.copy()[1:]

    u = np.zeros(T)
    U_0 = np.zeros_like(Z_0)
    U_1 = np.zeros_like(Z_1)
    U_2 = np.zeros_like(Z_2)

    Z_0_old = Z_0.copy()
    Z_1_old = np.zeros_like(Z_1)
    Z_2_old = np.zeros_like(Z_2)

    # divisor for consensus variables, accounting for one less matrix
    # for t = 0 and t = T
    divisor = np.full(T, 3, dtype=float)
    divisor[0] -= 1
    divisor[-1] -= 1

    rho = rho * np.ones(T)

    if weights[0] is not None:
        if weights[0] == 'rbf':
            weights = rbf_weights(T, weights[1], mult)
        elif weights[0] == 'exp':
            weights = exp_weights(T, weights[1], mult)
        elif weights[0] == 'lin':
            weights = lin_weights(T, weights[1], mult)

    con_obj = {}
    for t in range(T):
        con_obj[t] = []

    con_obj_mean = []
    con_obj_max = []

    # loss residuals
    loss_res = np.zeros(T)
    loss_init = loss_gen(loss_func, S, Z_0_old)
    loss_res_old = loss_init - C

    # loss_diff = C - loss_init
    # C_ = C - loss_diff

    out_obj = []

    checks = [
        convergence(obj=penalty_objective(Z_0, Z_1, Z_2, psi, theta))
    ]

    def _K(x, A_t, g_t, nabla_t, nabla_t_T_A_t, nabla_t_T_nabla_t,
           rho_t, divisor_t):
        _K_t = (A_t + x * g_t * nabla_t -
                (x * nabla_t_T_A_t + x ** 2 * g_t * nabla_t_T_nabla_t) *
                nabla_t / (divisor_t * rho_t + x * nabla_t_T_nabla_t)
                ).reshape(S.shape[1], S.shape[2])
        _K_t /= (rho_t * divisor_t)
        return 0.5 * (_K_t + _K_t.transpose(1, 0))

    # def _K(x, A_t, nabla_t):
    #     _A_t = A_t - x * nabla_t
    #     return _A_t

    # constrained optimisation via line search
    def _f(x, _K, A_t, g_t, nabla_t, nabla_t_T_A_t, nabla_t_T_nabla_t,
           rho_t, divisor_t, loss_func, S_t, c_t, loss_res_old_t,
           nabla_t_T_K_old_t):
        _K_t = _K(x, A_t, g_t, nabla_t, nabla_t_T_A_t, nabla_t_T_nabla_t,
                  rho_t, divisor_t)
        loss_res_t = loss_func(S_t, _K_t) - c_t
        return loss_res_t ** 2 + (
            loss_res_t - loss_res_old_t - nabla_t @ _K_t.ravel() +
            nabla_t_T_K_old_t) ** 2

    # # constrained optimisation via line search
    # def _f(x, _K, A_t, nabla_t, loss_func, S_t, c_t, loss_res_old_t):
    #     _K_t = _K(x, A_t, nabla_t)
    #     loss_res_t = loss_func(S_t, _K_t) - c_t
    #     return loss_res_t ** 2 + (loss_res_t - loss_res_old_t -
    #                               np.sum(nabla_t * (_K_t - A_t))) ** 2

    for iteration_ in range(max_iter):
        # update K
        A = rho[:, None, None] * (Z_0 - U_0)
        A[:-1] += rho[:-1, None, None] * (Z_1 - U_1)
        A[1:] += rho[1:, None, None] * (Z_2 - U_2)
        # A += A.transpose(0, 2, 1)
        # A /= 2.
        # A /= (rho * divisor)[:, None, None]

        # loss_res_pre = loss_gen(loss_func, S, A) - C

        if loss_func.__name__ == 'neg_logl':
            nabla = np.array([S_t - np.linalg.inv(K_t).ravel()
                              for (S_t, K_t) in zip(S_flat, K)])
            # nabla = np.array([S_t - np.linalg.inv(K_t)
            #                   for (S_t, K_t) in zip(S, A)])
        elif loss_func.__name__ == 'dtrace':
            nabla = np.array([(2 * K_t.ravel() @ S_t - I_flat)
                              for (S_t, K_t) in zip(S_flat, K)])
            # nabla = np.array([(2 * K_t @ S_t - I)
            #                   for (S_t, K_t) in zip(S, K)])

        nabla_T_K_old = np.array([nabla_t @ K_t.ravel()
                                  for (nabla_t, K_t) in zip(nabla, K)])
        # nabla_T_K_old = np.array([np.sum(nabla_t * K_t)
        #                           for (nabla_t, K_t) in zip(nabla, K)])
        g = nabla_T_K_old - loss_res_old
        nabla_T_A = np.array([nabla_t @ A_t.ravel()
                              for (nabla_t, A_t) in zip(nabla, A)])
        nabla_T_nabla = np.einsum('ij,ij->i', nabla, nabla)

        if iteration_ == 0:
            nabla = np.zeros_like(S_flat)
            # nabla = np.zeros_like(S)
            nabla_T_K_old = np.zeros(T)
            g = np.zeros(T)
            nabla_T_A = np.zeros(T)
            nabla_T_nabla = np.zeros(T)

        col = []
        for t in range(T):
            out = minimize_scalar(
                partial(_f, _K=_K, A_t=A[t].ravel(), g_t=g[t],
                        nabla_t=nabla[t], nabla_t_T_A_t=nabla_T_A[t],
                        nabla_t_T_nabla_t=nabla_T_nabla[t], rho_t=rho[t],
                        divisor_t=divisor[t], loss_func=loss_func,
                        S_t=S[t], c_t=C[t],
                        loss_res_old_t=loss_res_old[t],
                        nabla_t_T_K_old_t=nabla_T_K_old[t]))
            # out = minimize_scalar(
            #     partial(_f, _K=_K, A_t=A[t], nabla_t=nabla[t],
            #             loss_func=loss_func, S_t=S[t], c_t=C[t],
            #             loss_res_old_t=loss_res_pre[t]))

            K[t] = _K(out.x, A[t].ravel(), g[t], nabla[t], nabla_T_A[t],
                      nabla_T_nabla[t], rho[t], divisor[t])
            # K[t] = _K(out.x, A[t], nabla[t])
            loss_res[t] = loss_func(S[t], K[t]) - C[t]
            # u[t] += loss_res[t]
            if weights[0] is not None:
                con_obj[t].append(loss_res[t] ** 2)
                if (len(con_obj[t]) > m and
                        np.mean(con_obj[t][-m:-int(m / 2)]) <
                        np.mean(con_obj[t][-int(m / 2):]) and
                        loss_res[t] > eps):
                    col.append(t)

        # update Z_0
        _Z_0 = K + U_0
        _Z_0 += _Z_0.transpose(0, 2, 1)
        _Z_0 /= 2.
        Z_0 = soft_thresholding_od(_Z_0, lamda=theta / rho[:, None, None])

        # update Z_1, Z_2
        A_1 = Z_0[:-1] + U_1
        A_2 = Z_0[1:] + U_2
        if not psi_node_penalty:
            A_add = A_2 + A_1
            A_sub = A_2 - A_1
            prox_e_1 = prox_psi(
                A_sub, lamda=2. * (1 - theta) / rho[:-1, None, None])
            prox_e_2 = prox_psi(
                A_sub, lamda=2. * (1 - theta) / rho[1:, None, None])
            Z_1 = .5 * (A_add - prox_e_1)
            Z_2 = .5 * (A_add + prox_e_2)
        # TODO: Fix for rho vector
        # else:
        #     if weights is not None:
        #         Z_1, Z_2 = prox_psi(
        #             np.concatenate((A_1, A_2), axis=1),
        #             lamda=.5 * (1 - theta) / rho[t],
        #             rho=rho[t], tol=tol, rtol=rtol, max_iter=max_iter)

        # update residuals
        con_obj_mean.append(np.mean(loss_res) ** 2)
        con_obj_max.append(np.max(loss_res))

        U_0 += K - Z_0
        U_1 += K[:-1] - Z_1
        U_2 += K[1:] - Z_2

        # diagnostics, reporting, termination checks
        rnorm = np.sqrt(
            squared_norm(K - Z_0) + squared_norm(K[:-1] - Z_1) +
            squared_norm(K[1:] - Z_2))

        loss_res_old = loss_res.copy()

        snorm = np.sqrt(
            squared_norm(rho[:, None, None] * (Z_0 - Z_0_old)) +
            squared_norm(rho[:-1, None, None] * (Z_1 - Z_1_old)) +
            squared_norm(rho[1:, None, None] * (Z_2 - Z_2_old)))

        e_dual = np.sqrt(Z_0.size + 2 * Z_1.size) * tol + rtol * np.sqrt(
            squared_norm(rho[:, None, None] * U_0) +
            squared_norm(rho[:-1, None, None] * U_1) +
            squared_norm(rho[1:, None, None] * U_2))

        obj = objective(loss_res, Z_0, Z_1, Z_2, psi, theta)

        check = convergence(
            obj=obj,
            rnorm=rnorm,
            snorm=snorm,
            e_pri=np.sqrt(loss_res.size + Z_0.size + 2 * Z_1.size) * tol +
            rtol * (
                max(np.sqrt(squared_norm(Z_0)), np.sqrt(squared_norm(K))) +
                max(np.sqrt(squared_norm(Z_1)),
                    np.sqrt(squared_norm(K[:-1]))) +
                max(np.sqrt(squared_norm(Z_2)),
                    np.sqrt(squared_norm(K[1:])))),
            e_dual=e_dual)

        Z_0_old = Z_0.copy()
        Z_1_old = Z_1.copy()
        Z_2_old = Z_2.copy()

        if verbose:
            print(
                "obj: %.4f, rnorm: %.4f, snorm: %.4f,"
                "eps_pri: %.4f, eps_dual: %.4f" % check[:5])

        out_obj.append(penalty_objective(Z_0, Z_0[:-1], Z_0[1:], psi, theta))
        if not iteration_ % 100:
            print(iteration_)
            print(np.max(con_obj_max[-1]), np.mean(loss_res))
            print(out_obj[-1])
        checks.append(check)

        if stop_at is not None:
            if abs(check.obj - stop_at) / abs(stop_at) < stop_when:
                break

        if check.rnorm <= check.e_pri and check.snorm <= check.e_dual:
            break

        if weights[0] is None:
            if len(con_obj_mean) > m:
                if (np.mean(con_obj_mean[-m:-int(m / 2)]) <
                        np.mean(con_obj_mean[-int(m / 2):]) and
                        np.max(loss_res) > eps):
                    # or np.mean(con_obj_max[-100:-50]) < np.mean(con_obj_max[-50:]))
                    # np.mean(loss_res) > 0.25:
                    print("Rho Mult", mult * rho[0], iteration_,
                          np.mean(loss_res), con_obj_max[-1])
                    # loss_diff /= 5
                    # C_ = C - loss_diff
                    # rescale scaled dual variables
                    rho = mult * rho
                    # u /= mult
                    U_0 /= mult
                    U_1 /= mult
                    U_2 /= mult
                    con_obj_mean = []
                    con_obj_max = []
        else:
            for t in col:
                rho *= weights[t]
                # u /= weights[t]
                U_0 /= weights[t][:, None, None]
                U_1 /= weights[t][:-1, None, None]
                U_2 /= weights[t][1:, None, None]
                con_obj[t] = []
                print('Mult', iteration_, t, rho[t])
    else:
        warnings.warn("Objective did not converge.")
        print(iteration_, out_obj[-1])
        # print(out_obj)
        print(check.rnorm, check.e_pri)
        print(check.snorm, check.e_dual)

    covariance_ = np.array([linalg.pinvh(x) for x in Z_0])
    return_list = [Z_0, covariance_]
    if return_history:
        return_list.append(checks)
    if return_n_iter:
        return_list.append(iteration_ + 1)
    return return_list
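# A note on the K-update above (reading inferred from the code): for each
# time point, `minimize_scalar` searches a single scalar x so that the
# updated K[t] both satisfies the constraint loss(S_t, K_t) ~= c_t and
# stays consistent with the first-order Taylor expansion of the loss
# around the previous iterate; the two squared terms in `_f` penalise the
# constraint residual and the linearisation error respectively, which is
# what gives this solver its name.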
def time_graphical_lasso(
        emp_cov, alpha=0.01, rho=1, beta=1, theta=0.5, max_iter=100,
        n_samples=None, verbose=False, psi='laplacian', tol=1e-4, rtol=1e-4,
        return_history=False, return_n_iter=True, mode='admm',
        compute_objective=True, stop_at=None, stop_when=1e-4,
        update_rho_options=None, init='empirical'):
    """Time-varying graphical lasso solver.

    Solves the following problem via ADMM:
        min sum_{i=1}^T -n_i log_likelihood(S_i, K_i) + alpha*||K_i||_{od,1}
            + beta sum_{i=2}^T Psi(K_i - K_{i-1})

    where S_i = (1/n_i) X_i^T X_i is the empirical covariance of data
    matrix X (training observations by features).

    Parameters
    ----------
    emp_cov : ndarray, shape (n_times, n_features, n_features)
        Empirical covariance of data.
    alpha, beta : float, optional
        Regularisation parameters.
    rho : float, optional
        Augmented Lagrangian parameter.
    max_iter : int, optional
        Maximum number of iterations.
    n_samples : ndarray
        Number of samples available for each time point.
    tol : float, optional
        Absolute tolerance for convergence.
    rtol : float, optional
        Relative tolerance for convergence.
    return_history : bool, optional
        Return the history of computed values.
    return_n_iter : bool, optional
        Return the number of iterations before convergence.
    verbose : bool, default False
        Print info at each iteration.
    update_rho_options : dict, optional
        Arguments for the rho update.
        See regain.update_rules.update_rho function for more information.
    compute_objective : bool, default True
        Choose to compute the objective value.
    init : {'empirical', 'zero', ndarray}
        Choose how to initialize the precision matrix, with the inverse
        empirical covariance, zero matrix or precomputed.

    Returns
    -------
    K : numpy.array, 3-dimensional (T x d x d)
        Solution to the problem for each time t=1...T.
    history : list
        If return_history, then also a structure that contains the
        objective value, the primal and dual residual norms, and tolerances
        for the primal and dual residual norms at each iteration.

    """
    psi, prox_psi, psi_node_penalty = check_norm_prox(psi)

    Z_0 = init_precision(emp_cov, mode=init)
    Z_1 = Z_0.copy()[:-1]  # np.zeros_like(emp_cov)[:-1]
    Z_2 = Z_0.copy()[1:]  # np.zeros_like(emp_cov)[1:]

    U_0 = np.zeros_like(Z_0)
    U_1 = np.zeros_like(Z_1)
    U_2 = np.zeros_like(Z_2)

    Z_0_old = np.zeros_like(Z_0)
    Z_1_old = np.zeros_like(Z_1)
    Z_2_old = np.zeros_like(Z_2)

    # divisor for consensus variables, accounting for two less matrices
    divisor = np.full(emp_cov.shape[0], 3, dtype=float)
    divisor[0] -= 1
    divisor[-1] -= 1

    if n_samples is None:
        n_samples = np.ones(emp_cov.shape[0])

    checks = [
        convergence(obj=objective(
            n_samples, emp_cov, Z_0, Z_0, Z_1, Z_2, alpha, beta, psi))
    ]
    for iteration_ in range(max_iter):
        # update K
        A = Z_0 - U_0
        A[:-1] += Z_1 - U_1
        A[1:] += Z_2 - U_2
        A += A.transpose(0, 2, 1)
        A /= 2.

        A *= -rho / n_samples[:, None, None]
        A += emp_cov

        K = np.array([
            prox_logdet_alt(a, lamda=rho * div)
            for a, div in zip(A, divisor)
        ])

        # update Z_0
        A = K + U_0
        A += A.transpose(0, 2, 1)
        A /= 2.
        Z_0 = soft_thresholding_od(A, lamda=alpha / rho)

        # other Zs
        A_1 = K[:-1] + U_1
        A_2 = K[1:] + U_2
        if not psi_node_penalty:
            prox_e = prox_psi(A_2 - A_1, lamda=2. * beta / rho)
            Z_1 = .5 * (A_1 + A_2 - prox_e)
            Z_2 = .5 * (A_1 + A_2 + prox_e)
        else:
            Z_1, Z_2 = prox_psi(
                np.concatenate((A_1, A_2), axis=1), lamda=.5 * beta / rho,
                rho=rho, tol=tol, rtol=rtol, max_iter=max_iter)

        # update residuals
        U_0 += K - Z_0
        U_1 += K[:-1] - Z_1
        U_2 += K[1:] - Z_2

        # diagnostics, reporting, termination checks
        rnorm = np.sqrt(
            squared_norm(K - Z_0) + squared_norm(K[:-1] - Z_1) +
            squared_norm(K[1:] - Z_2))

        snorm = rho * np.sqrt(
            squared_norm(Z_0 - Z_0_old) + squared_norm(Z_1 - Z_1_old) +
            squared_norm(Z_2 - Z_2_old))

        obj = objective(
            n_samples, emp_cov, Z_0, K, Z_1, Z_2, alpha, beta, psi) \
            if compute_objective else np.nan

        check = convergence(
            obj=obj,
            rnorm=rnorm,
            snorm=snorm,
            e_pri=np.sqrt(K.size + 2 * Z_1.size) * tol + rtol * max(
                np.sqrt(
                    squared_norm(Z_0) + squared_norm(Z_1) +
                    squared_norm(Z_2)),
                np.sqrt(
                    squared_norm(K) + squared_norm(K[:-1]) +
                    squared_norm(K[1:]))),
            e_dual=np.sqrt(K.size + 2 * Z_1.size) * tol + rtol * rho *
            np.sqrt(
                squared_norm(U_0) + squared_norm(U_1) + squared_norm(U_2)),
            # precision=Z_0.copy()
        )
        Z_0_old = Z_0.copy()
        Z_1_old = Z_1.copy()
        Z_2_old = Z_2.copy()

        if verbose:
            print("obj: %.4f, rnorm: %.4f, snorm: %.4f,"
                  "eps_pri: %.4f, eps_dual: %.4f" % check[:5])

        checks.append(check)
        if stop_at is not None:
            if abs(check.obj - stop_at) / abs(stop_at) < stop_when:
                break

        if check.rnorm <= check.e_pri and check.snorm <= check.e_dual:
            break

        rho_new = update_rho(
            rho, rnorm, snorm, iteration=iteration_,
            **(update_rho_options or {}))
        # scaled dual variables should be also rescaled
        U_0 *= rho / rho_new
        U_1 *= rho / rho_new
        U_2 *= rho / rho_new
        rho = rho_new

        # assert is_pos_def(Z_0)
    else:
        warnings.warn("Objective did not converge.")
        print(iteration_,
              penalty_objective(Z_0, Z_0[:-1], Z_0[1:], psi, theta))

    covariance_ = np.array([linalg.pinvh(x) for x in Z_0])
    return_list = [Z_0, covariance_]
    if return_history:
        return_list.append(checks)
    if return_n_iter:
        return_list.append(iteration_ + 1)
    return return_list
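# Hypothetical usage sketch for `time_graphical_lasso` (not part of the
# original module). Shapes, seed and parameter values are illustrative.
def _example_time_graphical_lasso():
    import numpy as np  # local import so the sketch stands alone

    T, n, d = 4, 200, 8
    X = np.random.RandomState(42).randn(T, n, d)
    # empirical covariance S_i = (1/n_i) X_i^T X_i for each time point
    emp_cov = np.array([x.T.dot(x) / n for x in X])
    K, covariance, n_iter = time_graphical_lasso(
        emp_cov, alpha=0.1, beta=1., psi='laplacian',
        n_samples=np.full(T, n))
    return K, covariance, n_iter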
def graphical_lasso(
        emp_cov, alpha=0.01, rho=1, over_relax=1, max_iter=100,
        verbose=False, tol=1e-4, rtol=1e-4, return_history=False,
        return_n_iter=True, update_rho_options=None, compute_objective=True,
        init="empirical"):
    r"""Graphical lasso solver via ADMM.

    Solves the following problem:
        minimize trace(S*K) - log det K + alpha ||K||_{od,1}

    where S = (1/n) X^T X is the empirical covariance of the data
    matrix X (training observations by features).

    Parameters
    ----------
    emp_cov : array-like
        Empirical covariance matrix.
    alpha : float, optional
        Regularisation parameter.
    rho : float, optional
        Augmented Lagrangian parameter.
    over_relax : float, optional
        Over-relaxation parameter (typically between 1.0 and 1.8).
    max_iter : int, optional
        Maximum number of iterations.
    tol : float, optional
        Absolute tolerance for convergence.
    rtol : float, optional
        Relative tolerance for convergence.
    return_history : bool, optional
        Return the history of computed values.
    return_n_iter : bool, optional
        Return the number of iterations before convergence.
    verbose : bool, default False
        Print info at each iteration.
    update_rho_options : dict, optional
        Arguments for the rho update.
        See regain.update_rules.update_rho function for more information.
    compute_objective : bool, default True
        Choose to compute the objective value.
    init : {'empirical', 'zeros', ndarray}, default 'empirical'
        How to initialise the inverse covariance matrix. Default is to
        take the empirical covariance and invert it.

    Returns
    -------
    precision_ : numpy.array, 2-dimensional
        Solution to the problem.
    covariance_ : np.array, 2-dimensional
        Empirical covariance matrix.
    n_iter_ : int
        If return_n_iter, returns the number of iterations before
        convergence.
    history_ : list
        If return_history, then also a structure that contains the
        objective value, the primal and dual residual norms, and tolerances
        for the primal and dual residual norms at each iteration.

    """
    Z = init_precision(emp_cov, mode=init)
    U = np.zeros_like(emp_cov)
    Z_old = np.zeros_like(Z)

    checks = []
    for iteration_ in range(max_iter):
        # x-update
        A = Z - U
        A += A.T
        A /= 2.0
        K = prox_logdet(emp_cov - rho * A, lamda=1.0 / rho)

        # z-update with relaxation
        K_hat = over_relax * K - (1 - over_relax) * Z
        Z = soft_thresholding_od(K_hat + U, lamda=alpha / rho)

        # update residuals
        U += K_hat - Z

        # diagnostics, reporting, termination checks
        obj = objective(emp_cov, K, Z, alpha) if compute_objective else np.nan
        rnorm = np.linalg.norm(K - Z, "fro")
        snorm = rho * np.linalg.norm(Z - Z_old, "fro")
        check = convergence(
            obj=obj,
            rnorm=rnorm,
            snorm=snorm,
            e_pri=np.sqrt(K.size) * tol + rtol * max(
                np.linalg.norm(K, "fro"), np.linalg.norm(Z, "fro")),
            e_dual=np.sqrt(K.size) * tol + rtol * rho * np.linalg.norm(U),
        )
        Z_old = Z.copy()
        if verbose:
            print("obj: %.4f, rnorm: %.4f, snorm: %.4f,"
                  "eps_pri: %.4f, eps_dual: %.4f" % check[:5])

        checks.append(check)
        if check.rnorm <= check.e_pri and check.snorm <= check.e_dual:
            break

        rho_new = update_rho(
            rho, rnorm, snorm, iteration=iteration_,
            **(update_rho_options or {}))
        # scaled dual variables should be also rescaled
        U *= rho / rho_new
        rho = rho_new
    else:
        warnings.warn("Objective did not converge.")

    return_list = [Z, emp_cov]
    if return_history:
        return_list.append(checks)
    if return_n_iter:
        return_list.append(iteration_)
    return return_list
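# Hypothetical usage sketch for the static `graphical_lasso` (not part of
# the original module). Data and parameter values are illustrative.
def _example_graphical_lasso():
    import numpy as np  # local import so the sketch stands alone

    X = np.random.RandomState(0).randn(500, 10)
    emp_cov = X.T.dot(X) / X.shape[0]  # S = (1/n) X^T X
    precision, covariance, n_iter = graphical_lasso(
        emp_cov, alpha=0.1, rho=1., max_iter=200)
    return precision, covariance, n_iter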
def equality_time_graphical_lasso(
        S, K_init, max_iter, loss, C, rho,  # n_samples=None,
        psi, gamma, tol, rtol, verbose, return_history, return_n_iter,
        mode, compute_objective, stop_at, stop_when, update_rho_options,
        init):
    """Equality constrained time-varying graphical LASSO solver.

    Solves the following problem via ADMM:
        min sum_{i=1}^T ||K_i||_{od,1} + sum_{i=2}^T Psi(K_i - K_{i-1})
        s.t. objective = c_i for i = 1, ..., T

    where S_i = (1/n_i) X_i^T X_i is the empirical covariance of data
    matrix X (training observations by features).

    Parameters
    ----------
    S : ndarray, shape (n_times, n_features, n_features)
        Empirical covariance of data.
    K_init : ndarray, shape (n_times, n_features, n_features)
        Initial guess for the precision matrices.
    C : float or ndarray, shape (n_times,)
        Log likelihood constraints for K_i.
    rho : float, optional
        Augmented Lagrangian parameter.
    max_iter : int, optional
        Maximum number of iterations.
    gamma : float, optional
        Kernel parameter when psi is chosen to be 'kernel'.
    tol : float, optional
        Absolute tolerance for convergence.
    rtol : float, optional
        Relative tolerance for convergence.
    return_history : bool, optional
        Return the history of computed values.
    return_n_iter : bool, optional
        Return the number of iterations before convergence.
    verbose : bool, default False
        Print info at each iteration.
    update_rho_options : dict, optional
        Arguments for the rho update.
        See regain.update_rules.update_rho function for more information.
    compute_objective : bool, default True
        Choose to compute the objective value.
    init : {'empirical', 'zero', ndarray}
        Choose how to initialize the precision matrix, with the inverse
        empirical covariance, zero matrix or precomputed.

    Returns
    -------
    K : numpy.array, 3-dimensional (T x d x d)
        Solution to the problem for each time t=1...T.
    history : list
        If return_history, then also a structure that contains the
        objective value, the primal and dual residual norms, and tolerances
        for the primal and dual residual norms at each iteration.

    """
    psi, prox_psi, psi_node_penalty = check_norm_prox(psi)
    psi_name = psi.__name__

    if loss == 'LL':
        loss_function = neg_logl
    else:
        loss_function = dtrace

    K = K_init
    Z_0 = K.copy()
    Z_1 = K.copy()[:-1]
    Z_2 = K.copy()[1:]

    u = np.zeros(S.shape[0])
    U_0 = np.zeros_like(Z_0)
    U_1 = np.zeros_like(Z_1)
    U_2 = np.zeros_like(Z_2)

    Z_0_old = np.zeros_like(Z_0)
    Z_1_old = np.zeros_like(Z_1)
    Z_2_old = np.zeros_like(Z_2)

    I = np.eye(S.shape[1])

    checks = [
        convergence(
            obj=equality_objective(
                loss_function, S, K, C, Z_0, Z_1, Z_2, psi))
    ]

    for iteration_ in range(max_iter):
        # update K
        A_K = U_0 - Z_0
        A_K[:-1] += Z_1 - U_1
        A_K[1:] += Z_2 - U_2
        A_K += A_K.transpose(0, 2, 1)
        A_K /= 2.

        K = soft_thresholding_od(A_K, lamda=1. / rho)

        # update Z_0
        residual_loss_constraint_u = loss_gen(loss_function, S, Z_0) - C + u

        A_Z = K + U_0
        A_Z += A_Z.transpose(0, 2, 1)
        A_Z /= 2.

        if loss_function == neg_logl:
            A_Z -= residual_loss_constraint_u[:, None, None] * S
            Z_0 = np.array([
                prox_logdet_constrained(_A, _a, I)
                for _A, _a in zip(A_Z, residual_loss_constraint_u)
            ])
        elif loss_function == dtrace:
            Z_0 = np.array([
                prox_dtrace_constrained(_A, _S, _a, I)
                for _A, _S, _a in zip(A_Z, S, residual_loss_constraint_u)
            ])

        # other Zs
        A_1 = K[:-1] + U_1
        A_2 = K[1:] + U_2
        if not psi_node_penalty:
            prox_e = prox_psi(A_2 - A_1, lamda=2. / rho)
            Z_1 = .5 * (A_1 + A_2 - prox_e)
            Z_2 = .5 * (A_1 + A_2 + prox_e)
        else:
            Z_1, Z_2 = prox_psi(
                np.concatenate((A_1, A_2), axis=1), lamda=.5 / rho,
                rho=rho, tol=tol, rtol=rtol, max_iter=max_iter)

        # update residuals
        residual_loss_constraint = loss_gen(loss_function, S, Z_0) - C
        u += residual_loss_constraint
        U_0 += K - Z_0
        U_1 += K[:-1] - Z_1
        U_2 += K[1:] - Z_2

        if verbose:
            print(residual_loss_constraint)

        # diagnostics, reporting, termination checks
        rnorm = np.sqrt(
            np.sum(residual_loss_constraint**2) + squared_norm(K - Z_0) +
            squared_norm(K[:-1] - Z_1) + squared_norm(K[1:] - Z_2))

        snorm = rho * np.sqrt(
            squared_norm(Z_0 - Z_0_old) + squared_norm(Z_1 - Z_1_old) +
            squared_norm(Z_2 - Z_2_old))

        obj = equality_objective(
            loss_function, S, K, C, Z_0, Z_1, Z_2,
            psi) if compute_objective else np.nan

        check = convergence(
            obj=obj,
            rnorm=rnorm,
            snorm=snorm,
            e_pri=np.sqrt(Z_0.size + 2 * Z_1.size + S.shape[0]) * tol +
            rtol * max(
                np.sqrt(
                    np.sum(C**2) + squared_norm(Z_0) + squared_norm(Z_1) +
                    squared_norm(Z_2)),
                np.sqrt(
                    np.sum((residual_loss_constraint + C)**2) +
                    squared_norm(K) + squared_norm(K[:-1]) +
                    squared_norm(K[1:]))),
            e_dual=np.sqrt(Z_0.size + 2 * Z_1.size) * tol + rtol * rho *
            np.sqrt(squared_norm(U_0) + squared_norm(U_1) +
                    squared_norm(U_2)))

        Z_0_old = Z_0.copy()
        Z_1_old = Z_1.copy()
        Z_2_old = Z_2.copy()

        if verbose:
            print("obj: %.4f, rnorm: %.4f, snorm: %.4f,"
                  "eps_pri: %.4f, eps_dual: %.4f" % check[:5])

        checks.append(check)
        if stop_at is not None:
            if abs(check.obj - stop_at) / abs(stop_at) < stop_when:
                break

        if check.rnorm <= check.e_pri and check.snorm <= check.e_dual:
            break

        rho_new = update_rho(
            rho, rnorm, snorm, iteration=iteration_,
            **(update_rho_options or {}))
        # scaled dual variables should be also rescaled
        u *= rho / rho_new
        U_0 *= rho / rho_new
        U_1 *= rho / rho_new
        U_2 *= rho / rho_new
        rho = rho_new

        # assert is_pos_def(Z_0)
    else:
        warnings.warn("Objective did not converge.")

    covariance_ = np.array([linalg.pinvh(x) for x in K])
    return_list = [K, covariance_]
    if return_history:
        return_list.append(checks)
    if return_n_iter:
        return_list.append(iteration_ + 1)
    return return_list
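# A note on the rho update above (standard scaled-form ADMM bookkeeping):
# the stored duals are the scaled variables U = Y / rho, with Y the
# unscaled multipliers, so when rho changes to rho_new the code multiplies
# u, U_0, U_1, U_2 by rho / rho_new to keep the underlying multipliers Y
# unchanged. The same pattern appears in `time_graphical_lasso` and
# `graphical_lasso` above.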
def _Z_0(x, A_t, nabla_t, rho_t, divisor_t):
    # nested helper; `theta` is taken from the enclosing scope
    _A_t = A_t - x * nabla_t
    return soft_thresholding_od(_A_t, lamda=theta / (rho_t * divisor_t))
def _Z_0(x, A_t, g_t, nabla_t, rho_t, divisor_t):
    # nested helper; `theta` is taken from the enclosing scope
    _A_t = A_t + x * g_t * nabla_t
    A_t = 0.5 * (_A_t + _A_t.transpose(1, 0))
    return soft_thresholding_od(A_t, lamda=theta / (rho_t * divisor_t))