def test_soft_thresholding(): """Test soft_thresholding function.""" # array array = np.arange(3) output = np.array([0, 0.5, 1.5]) assert_array_equal(prox.soft_thresholding(array, .5), output) # matrix array = np.arange(9).reshape(3, 3) output = np.array([[0, 0, 1], [2, 3, 4], [5, 6, 7]]) assert_array_equal(prox.soft_thresholding(array, 1), output) # tensor array = np.arange(27).reshape(3, 3, 3) output = array - 1 output[0, 0, 0] = 0 assert_array_equal(prox.soft_thresholding(array, 1), output) # tensor, lamda is a matrix array = np.arange(27).reshape(3, 3, 3) output = array - 1 output[0, 0, 0] = 0 output[1] -= 1 output[2] -= 2 assert_array_equal( prox.soft_thresholding(array, np.arange(1, 4)[:, None, None]), output)
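# For reference, the operator exercised by this test is element-wise soft
# thresholding, the proximal operator of the l1 norm. A minimal NumPy sketch that
# matches the expected outputs above (assuming `prox.soft_thresholding` follows
# this definition; `_soft_thresholding_reference` is a hypothetical name):
def _soft_thresholding_reference(x, lamda):
    """Element-wise soft thresholding: sign(x) * max(|x| - lamda, 0).

    `lamda` may be a scalar or an array broadcastable against `x`, which is what
    the last test case (an array-valued lamda over a tensor) relies on.
    """
    import numpy as np
    # e.g. _soft_thresholding_reference(np.arange(3), 0.5) -> [0, 0.5, 1.5]
    return np.sign(x) * np.maximum(np.abs(x) - lamda, 0)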
def fit_each_variable( X, ix, alpha=1e-2, gamma=1e-3, tol=1e-3, max_iter=1000, verbose=0, return_history=True, compute_objective=True, return_n_iter=False, adjust_gamma=False, ): n, d = X.shape theta = np.zeros(d - 1) + 1e-15 selector = [i for i in range(d) if i != ix] def gradient(X, theta, r, selector, n): XX = X[:, r].T.dot(X[:, selector]) XXT = X[:, selector].T.dot(X[:, selector]).dot(theta) return -(1 / n) * XX + (1 / n) * XXT thetas = [theta] checks = [] for iter_ in range(max_iter): theta_new = theta - gamma * gradient(X, theta, ix, selector, n) theta = soft_thresholding(theta_new, alpha * gamma) thetas.append(theta) check = convergence( iter=iter_, obj=objective(X, theta, n, ix, selector, alpha), iter_norm=np.linalg.norm(thetas[-2] - thetas[-1]), iter_r_norm=(np.linalg.norm(thetas[-2] - thetas[-1]) / np.linalg.norm(thetas[-1])), ) checks.append(check) # if adjust_gamma: # TODO multiply or divide if verbose: print("Iter: %d, objective: %.4f, iter_norm %.4f" % (check[0], check[1], check[2])) if check[-2] < tol: break return_list = [thetas[-1]] if return_history: return_list.append(thetas) return_list.append(checks) if return_n_iter: return_list.append(iter_) return return_list
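# Illustrative usage sketch (not part of the original module; synthetic data and a
# hypothetical demo name). `fit_each_variable` performs an ISTA-style proximal
# gradient lasso regression of variable `ix` on the remaining columns and relies
# on the module helpers `soft_thresholding`, `objective` and `convergence`.
def _example_fit_each_variable():
    import numpy as np
    rng = np.random.RandomState(0)
    X = rng.randn(100, 5)  # 100 samples, 5 variables

    # with the default return flags the function returns the final coefficients,
    # the full iterate history and the per-iteration convergence checks
    theta, thetas, checks = fit_each_variable(X, ix=0, alpha=1e-2, gamma=1e-3)
    print(theta.shape)  # (4,), one coefficient per remaining variable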
def lasso(A, b, lamda=1.0, rho=1.0, alpha=1.0, max_iter=1000, tol=1e-4,
          rtol=1e-2, return_history=False):
    r"""Solves the following problem via ADMM:

        minimize 1/2*|| Ax - b ||_2^2 + \lambda || x ||_1

    Parameters
    ----------
    A : array-like, 2-dimensional
        Input matrix.
    b : array-like, 1-dimensional
        Output vector.
    lamda : float, optional
        Regularisation parameter.
    rho : float, optional
        Augmented Lagrangian parameter.
    alpha : float, optional
        Over-relaxation parameter (typically between 1.0 and 1.8).
    max_iter : int, optional
        Maximum number of iterations.
    tol : float, optional
        Absolute tolerance for convergence.
    rtol : float, optional
        Relative tolerance for convergence.
    return_history : bool, optional
        Return the history of computed values.

    Returns
    -------
    x : numpy.array
        Solution to the problem.
    history : list
        If return_history, then also a structure that contains the
        objective value, the primal and dual residual norms, and
        tolerances for the primal and dual residual norms at each
        iteration.
    """
    n_samples, n_features = A.shape

    # save a matrix-vector multiply
    Atb = A.T.dot(b)

    # ADMM solver
    x = np.zeros(n_features)
    z = np.zeros(n_features)
    u = np.zeros(n_features)

    # cache the factorization
    L, U = lu_factor(A, rho)

    hist = []
    for _ in range(max_iter):
        # x-update
        q = Atb + rho * (z - u)  # temporary value
        if n_samples >= n_features:
            x = np.linalg.lstsq(U, np.linalg.lstsq(L, q)[0])[0]
        else:
            x = q - A.T.dot(
                np.linalg.lstsq(U, np.linalg.lstsq(L, A.dot(q))[0])[0]) / rho
            x /= rho

        # z-update with relaxation
        zold = z
        x_hat = alpha * x + (1 - alpha) * zold
        z = soft_thresholding(x_hat + u, lamda / rho)

        # u-update
        u += (x_hat - z)

        # diagnostics, reporting, termination checks
        history = (
            objective(A, b, lamda, x, z),  # obj
            np.linalg.norm(x - z),  # r norm
            np.linalg.norm(-rho * (z - zold)),  # s norm
            np.sqrt(n_features) * tol + rtol * max(
                np.linalg.norm(x), np.linalg.norm(-z)),  # eps pri
            np.sqrt(n_features) * tol + rtol * np.linalg.norm(rho * u)  # eps dual
        )
        hist.append(history)
        if history[1] < history[3] and history[2] < history[4]:
            break

    return (z, hist) if return_history else z
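# Illustrative usage sketch (hypothetical data and demo name): `lasso` relies on
# the module helpers `lu_factor`, `soft_thresholding` and `objective`.
def _example_lasso():
    import numpy as np
    rng = np.random.RandomState(42)
    A = rng.randn(50, 20)
    x_true = np.zeros(20)
    x_true[:3] = [1.0, -2.0, 0.5]  # sparse ground truth
    b = A.dot(x_true) + 0.01 * rng.randn(50)

    # with return_history=True the per-iteration diagnostics are returned too
    x_hat, history = lasso(A, b, lamda=0.1, return_history=True)
    print(np.round(x_hat[:5], 2))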
def lasso_kernel_admm(K, y, lamda=0.01, rho=1., max_iter=100, verbose=0, rtol=1e-4, tol=1e-4, return_n_iter=True, update_rho_options=None, sample_weight=None): """Elastic Net kernel learning. Solve the following problem via ADMM: min sum_{i=1}^p 1/2 ||y_i - alpha_i * sum_{k=1}^{n_k} w_k * K_{ik}||^2 + lamda ||w||_1 + beta sum_{j=1}^{c_i}||alpha_j||_2^2 """ n_kernels, n_samples, n_features = K.shape coef = np.ones(n_kernels) # alpha = [np.zeros(K[j].shape[2]) for j in range(n_patients)] # u = [np.zeros(K[j].shape[1]) for j in range(n_patients)] w_1 = coef.copy() u_1 = np.zeros(n_kernels) # x_old = [np.zeros(K[0].shape[1]) for j in range(n_patients)] w_1_old = w_1.copy() Y = y[:, None].dot(y[:, None].T) checks = [] for iteration_ in range(max_iter): # update w KK = 2 * np.tensordot(K, K.T, axes=([1, 2], [0, 1])) yy = 2 * np.tensordot(Y, K, axes=([0, 1], [1, 2])) yy += rho * (w_1 - u_1) coef = _solve_cholesky_kernel(KK, yy[..., None], rho).ravel() w_1 = soft_thresholding(coef + u_1, lamda / rho) # w_2 = prox_laplacian(coef + u_2, beta / rho) u_1 += coef - w_1 # diagnostics, reporting, termination checks rnorm = np.sqrt(squared_norm(coef - w_1)) snorm = rho * np.sqrt(squared_norm(w_1 - w_1_old)) obj = lasso_objective(Y, coef, K, w_1, lamda) check = convergence( obj=obj, rnorm=rnorm, snorm=snorm, e_pri=np.sqrt(coef.size) * tol + rtol * max(np.sqrt(squared_norm(coef)), np.sqrt(squared_norm(w_1))), e_dual=np.sqrt(coef.size) * tol + rtol * rho * (np.sqrt(squared_norm(u_1)))) w_1_old = w_1.copy() if verbose: print("obj: %.4f, rnorm: %.4f, snorm: %.4f," "eps_pri: %.4f, eps_dual: %.4f" % check) checks.append(check) if check.rnorm <= check.e_pri and check.snorm <= check.e_dual and iteration_ > 1: break rho_new = update_rho(rho, rnorm, snorm, iteration=iteration_, **(update_rho_options or {})) # scaled dual variables should be also rescaled u_1 *= rho / rho_new rho = rho_new else: warnings.warn("Objective did not converge.") return_list = [coef] if return_n_iter: return_list.append(iteration_) return return_list
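# Illustrative usage sketch (synthetic positive semi-definite kernels, hypothetical
# demo name): `lasso_kernel_admm` relies on the module helpers
# `_solve_cholesky_kernel`, `soft_thresholding`, `lasso_objective`, `convergence`
# and `update_rho`.
def _example_lasso_kernel_admm():
    import numpy as np
    rng = np.random.RandomState(0)
    n_kernels, n_samples = 4, 30
    K = np.array([k.dot(k.T) for k in rng.randn(n_kernels, n_samples, n_samples)])
    y = rng.randn(n_samples)

    # sparse weights, one per kernel, plus the number of ADMM iterations
    w, n_iter = lasso_kernel_admm(K, y, lamda=0.1, max_iter=50)
    print(w.shape, n_iter)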
def latent_time_matrix_decomposition(emp_cov, alpha=0.01, tau=1., rho=1., beta=1., eta=1., max_iter=100, verbose=False, psi='laplacian', phi='laplacian', mode='admm', tol=1e-4, rtol=1e-4, assume_centered=False, return_history=False, return_n_iter=True, update_rho_options=None, compute_objective=True): r"""Latent variable time-varying matrix decomposition solver. Solves the following problem via ADMM: min sum_{i=1}^T || S_i-(K_i-L_i)||^2 + alpha ||K_i||_{od,1} + tau ||L_i||_* + beta sum_{i=2}^T Psi(K_i - K_{i-1}) + eta sum_{i=2}^T Phi(L_i - L_{i-1}) where S is the matrix to decompose. Parameters ---------- emp_cov : ndarray, shape (n_features, n_features) Matrix to decompose. alpha, tau, beta, eta : float, optional Regularisation parameters. rho : float, optional Augmented Lagrangian parameter. max_iter : int, optional Maximum number of iterations. tol : float, optional Absolute tolerance for convergence. rtol : float, optional Relative tolerance for convergence. return_history : bool, optional Return the history of computed values. Returns ------- K, L : numpy.array, 3-dimensional (T x d x d) Solution to the problem for each time t=1...T . history : list If return_history, then also a structure that contains the objective value, the primal and dual residual norms, and tolerances for the primal and dual residual norms at each iteration. """ psi, prox_psi, psi_node_penalty = check_norm_prox(psi) phi, prox_phi, phi_node_penalty = check_norm_prox(phi) Z_0 = np.zeros_like(emp_cov) Z_1 = np.zeros_like(Z_0)[:-1] Z_2 = np.zeros_like(Z_0)[1:] W_0 = np.zeros_like(Z_0) W_1 = np.zeros_like(Z_1) W_2 = np.zeros_like(Z_2) X_0 = np.zeros_like(Z_0) X_1 = np.zeros_like(Z_1) X_2 = np.zeros_like(Z_2) U_1 = np.zeros_like(W_1) U_2 = np.zeros_like(W_2) R_old = np.zeros_like(Z_0) Z_1_old = np.zeros_like(Z_1) Z_2_old = np.zeros_like(Z_2) W_1_old = np.zeros_like(W_1) W_2_old = np.zeros_like(W_2) # divisor for consensus variables, accounting for two less matrices divisor = np.full(emp_cov.shape[0], 3, dtype=float) divisor[0] -= 1 divisor[-1] -= 1 checks = [] for iteration_ in range(max_iter): # update R A = Z_0 - W_0 - X_0 R = (rho * A + 2 * emp_cov) / (2 + rho) # update Z_0 A = R + W_0 + X_0 A[:-1] += Z_1 - X_1 A[1:] += Z_2 - X_2 A /= divisor[:, None, None] # soft_thresholding_ = partial(soft_thresholding, lamda=alpha / rho) # Z_0 = np.array(map(soft_thresholding_, A)) Z_0 = soft_thresholding(A, lamda=alpha / (rho * divisor[:, None, None])) # update Z_1, Z_2 A_1 = Z_0[:-1] + X_1 A_2 = Z_0[1:] + X_2 if not psi_node_penalty: prox_e = prox_psi(A_2 - A_1, lamda=2. * beta / rho) Z_1 = .5 * (A_1 + A_2 - prox_e) Z_2 = .5 * (A_1 + A_2 + prox_e) else: Z_1, Z_2 = prox_psi(np.concatenate((A_1, A_2), axis=1), lamda=.5 * beta / rho, rho=rho, tol=tol, rtol=rtol, max_iter=max_iter) # update W_0 A = Z_0 - R - X_0 A[:-1] += W_1 - U_1 A[1:] += W_2 - U_2 A /= divisor[:, None, None] A += A.transpose(0, 2, 1) A /= 2. W_0 = np.array([ prox_trace_indicator(a, lamda=tau / (rho * div)) for a, div in zip(A, divisor) ]) # update W_1, W_2 A_1 = W_0[:-1] + U_1 A_2 = W_0[1:] + U_2 if not phi_node_penalty: prox_e = prox_phi(A_2 - A_1, lamda=2. 
* eta / rho) W_1 = .5 * (A_1 + A_2 - prox_e) W_2 = .5 * (A_1 + A_2 + prox_e) else: W_1, W_2 = prox_phi(np.concatenate((A_1, A_2), axis=1), lamda=.5 * eta / rho, rho=rho, tol=tol, rtol=rtol, max_iter=max_iter) # update residuals X_0 += R - Z_0 + W_0 X_1 += Z_0[:-1] - Z_1 X_2 += Z_0[1:] - Z_2 U_1 += W_0[:-1] - W_1 U_2 += W_0[1:] - W_2 # diagnostics, reporting, termination checks rnorm = np.sqrt( squared_norm(R - Z_0 + W_0) + squared_norm(Z_0[:-1] - Z_1) + squared_norm(Z_0[1:] - Z_2) + squared_norm(W_0[:-1] - W_1) + squared_norm(W_0[1:] - W_2)) snorm = rho * np.sqrt( squared_norm(R - R_old) + squared_norm(Z_1 - Z_1_old) + squared_norm(Z_2 - Z_2_old) + squared_norm(W_1 - W_1_old) + squared_norm(W_2 - W_2_old)) obj = objective(emp_cov, R, Z_0, Z_1, Z_2, W_0, W_1, W_2, alpha, tau, beta, eta, psi, phi) \ if compute_objective else np.nan check = convergence( obj=obj, rnorm=rnorm, snorm=snorm, e_pri=np.sqrt(R.size + 4 * Z_1.size) * tol + rtol * max( np.sqrt( squared_norm(R) + squared_norm(Z_1) + squared_norm(Z_2) + squared_norm(W_1) + squared_norm(W_2)), np.sqrt( squared_norm(Z_0 - W_0) + squared_norm(Z_0[:-1]) + squared_norm(Z_0[1:]) + squared_norm(W_0[:-1]) + squared_norm(W_0[1:]))), e_dual=np.sqrt(R.size + 4 * Z_1.size) * tol + rtol * rho * (np.sqrt( squared_norm(X_0) + squared_norm(X_1) + squared_norm(X_2) + squared_norm(U_1) + squared_norm(U_2)))) R_old = R.copy() Z_1_old = Z_1.copy() Z_2_old = Z_2.copy() W_1_old = W_1.copy() W_2_old = W_2.copy() if verbose: print("obj: %.4f, rnorm: %.4f, snorm: %.4f," "eps_pri: %.4f, eps_dual: %.4f" % check) checks.append(check) if check.rnorm <= check.e_pri and check.snorm <= check.e_dual: break rho_new = update_rho(rho, rnorm, snorm, iteration=iteration_, **(update_rho_options or {})) # scaled dual variables should be also rescaled X_0 *= rho / rho_new X_1 *= rho / rho_new X_2 *= rho / rho_new U_1 *= rho / rho_new U_2 *= rho / rho_new rho = rho_new else: warnings.warn("Objective did not converge.") return_list = [Z_0, W_0] if return_history: return_list.append(checks) if return_n_iter: return_list.append(iteration_) return return_list
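# Illustrative usage sketch (synthetic data, hypothetical demo name): the solver
# relies on the module helpers `check_norm_prox`, `soft_thresholding`,
# `prox_trace_indicator`, `objective`, `convergence`, `update_rho` and
# `squared_norm`.
def _example_latent_time_matrix_decomposition():
    import numpy as np
    rng = np.random.RandomState(0)
    T, d = 5, 10
    S = np.array([np.cov(rng.randn(50, d), rowvar=False) for _ in range(T)])

    # sparse component K, low-rank component L, and the number of iterations
    K, L, n_iter = latent_time_matrix_decomposition(
        S, alpha=0.1, tau=0.5, beta=1.0, eta=1.0, max_iter=50)
    print(K.shape, L.shape)  # (5, 10, 10) each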
def time_graphical_lasso( emp_cov, alpha=0.01, rho=1, beta=1, max_iter=100, n_samples=None, verbose=False, psi="laplacian", tol=1e-4, rtol=1e-4, return_history=False, return_n_iter=True, mode="admm", compute_objective=True, stop_at=None, stop_when=1e-4, update_rho_options=None, init="empirical", ): """Time-varying graphical lasso solver. Solves the following problem via ADMM: min sum_{i=1}^T -n_i log_likelihood(S_i, K_i) + alpha*||K_i||_{od,1} + beta sum_{i=2}^T Psi(K_i - K_{i-1}) where S_i = (1/n_i) X_i^T \times X_i is the empirical covariance of data matrix X (training observations by features). Parameters ---------- emp_cov : ndarray, shape (n_features, n_features) Empirical covariance of data. alpha, beta : float, optional Regularisation parameter. rho : float, optional Augmented Lagrangian parameter. max_iter : int, optional Maximum number of iterations. n_samples : ndarray Number of samples available for each time point. tol : float, optional Absolute tolerance for convergence. rtol : float, optional Relative tolerance for convergence. return_history : bool, optional Return the history of computed values. return_n_iter : bool, optional Return the number of iteration before convergence. verbose : bool, default False Print info at each iteration. update_rho_options : dict, optional Arguments for the rho update. See regain.update_rules.update_rho function for more information. compute_objective : bool, default True Choose to compute the objective value. init : {'empirical', 'zero', ndarray} Choose how to initialize the precision matrix, with the inverse empirical covariance, zero matrix or precomputed. Returns ------- K : numpy.array, 3-dimensional (T x d x d) Solution to the problem for each time t=1...T . history : list If return_history, then also a structure that contains the objective value, the primal and dual residual norms, and tolerances for the primal and dual residual norms at each iteration. 
""" psi, prox_psi, psi_node_penalty = check_norm_prox(psi) Z_0 = init_precision(emp_cov, mode=init) Z_1 = Z_0.copy()[:-1] # np.zeros_like(emp_cov)[:-1] Z_2 = Z_0.copy()[1:] # np.zeros_like(emp_cov)[1:] U_0 = np.zeros_like(Z_0) U_1 = np.zeros_like(Z_1) U_2 = np.zeros_like(Z_2) Z_0_old = np.zeros_like(Z_0) Z_1_old = np.zeros_like(Z_1) Z_2_old = np.zeros_like(Z_2) # divisor for consensus variables, accounting for two less matrices divisor = np.full(emp_cov.shape[0], 3, dtype=float) divisor[0] -= 1 divisor[-1] -= 1 if n_samples is None: n_samples = np.ones(emp_cov.shape[0]) checks = [convergence(obj=objective(n_samples, emp_cov, Z_0, Z_0, Z_1, Z_2, alpha, beta, psi))] for iteration_ in range(max_iter): # update K A = Z_0 - U_0 A[:-1] += Z_1 - U_1 A[1:] += Z_2 - U_2 A /= divisor[:, None, None] # soft_thresholding_ = partial(soft_thresholding, lamda=alpha / rho) # K = np.array(map(soft_thresholding_, A)) A += A.transpose(0, 2, 1) A /= 2.0 A *= -rho * divisor[:, None, None] / n_samples[:, None, None] A += emp_cov K = np.array([prox_logdet(a, lamda=ni / (rho * div)) for a, div, ni in zip(A, divisor, n_samples)]) # update Z_0 A = K + U_0 A += A.transpose(0, 2, 1) A /= 2.0 Z_0 = soft_thresholding(A, lamda=alpha / rho) # other Zs A_1 = K[:-1] + U_1 A_2 = K[1:] + U_2 if not psi_node_penalty: prox_e = prox_psi(A_2 - A_1, lamda=2.0 * beta / rho) Z_1 = 0.5 * (A_1 + A_2 - prox_e) Z_2 = 0.5 * (A_1 + A_2 + prox_e) else: Z_1, Z_2 = prox_psi( np.concatenate((A_1, A_2), axis=1), lamda=0.5 * beta / rho, rho=rho, tol=tol, rtol=rtol, max_iter=max_iter, ) # update residuals U_0 += K - Z_0 U_1 += K[:-1] - Z_1 U_2 += K[1:] - Z_2 # diagnostics, reporting, termination checks rnorm = np.sqrt(squared_norm(K - Z_0) + squared_norm(K[:-1] - Z_1) + squared_norm(K[1:] - Z_2)) snorm = rho * np.sqrt(squared_norm(Z_0 - Z_0_old) + squared_norm(Z_1 - Z_1_old) + squared_norm(Z_2 - Z_2_old)) obj = objective(n_samples, emp_cov, Z_0, K, Z_1, Z_2, alpha, beta, psi) if compute_objective else np.nan # if np.isinf(obj): # Z_0 = Z_0_old # break check = convergence( obj=obj, rnorm=rnorm, snorm=snorm, e_pri=np.sqrt(K.size + 2 * Z_1.size) * tol + rtol * max( np.sqrt(squared_norm(Z_0) + squared_norm(Z_1) + squared_norm(Z_2)), np.sqrt(squared_norm(K) + squared_norm(K[:-1]) + squared_norm(K[1:])), ), e_dual=np.sqrt(K.size + 2 * Z_1.size) * tol + rtol * rho * np.sqrt(squared_norm(U_0) + squared_norm(U_1) + squared_norm(U_2)), # precision=Z_0.copy() ) Z_0_old = Z_0.copy() Z_1_old = Z_1.copy() Z_2_old = Z_2.copy() if verbose: print("obj: %.4f, rnorm: %.4f, snorm: %.4f," "eps_pri: %.4f, eps_dual: %.4f" % check[:5]) checks.append(check) if stop_at is not None: if abs(check.obj - stop_at) / abs(stop_at) < stop_when: break if check.rnorm <= check.e_pri and check.snorm <= check.e_dual: break rho_new = update_rho(rho, rnorm, snorm, iteration=iteration_, **(update_rho_options or {})) # scaled dual variables should be also rescaled U_0 *= rho / rho_new U_1 *= rho / rho_new U_2 *= rho / rho_new rho = rho_new # assert is_pos_def(Z_0) else: warnings.warn("Objective did not converge.") covariance_ = np.array([linalg.pinvh(x) for x in Z_0]) return_list = [Z_0, covariance_] if return_history: return_list.append(checks) if return_n_iter: return_list.append(iteration_ + 1) return return_list
def logistic_alternating(K, y, lamda=0.01, beta=0.01, gamma=.5, max_iter=100, l1_ratio_lamda=0.1, l1_ratio_beta=0.1, deep=True, verbose=0, tol=1e-4, return_n_iter=True, fit_intercept=True, lr_p2=None): # multiple patient n_patients = len(K) n_kernels = len(K[0]) coef = np.ones(n_kernels) alpha = [np.zeros(K[j].shape[2]) for j in range(n_patients)] objective_new = 0 max_iter_deep = max_iter // 3 if deep else 1 if lr_p2 is None: raise ValueError("lr_p2 cant be None") for iteration_ in range(max_iter): w_old = coef.copy() alpha_old = [a.copy() for a in alpha] objective_old = objective_new for i in range(n_patients): lr_p2[i].fit(np.tensordot(coef, K[i], axes=1), y[i]) alpha = [log.coef_.ravel() for log in lr_p2] intercepts = [log.intercept_.ravel() for log in lr_p2] alpha_intercept = [np.hstack((a, c)) for a, c in zip(alpha, intercepts)] # X = np.tensordot(alpha, K, axes=([0], [2])).T # X = sum(K[j].dot(alpha[j]).T for j in range(n_patients)) # coef = lr_p1.fit(X, y).coef_.ravel() for it in range(max_iter_deep): coef_old = coef.copy() l2_reg = beta * (1 - l1_ratio_beta) loss, gradient = _logistic_loss_and_grad( coef, alpha_intercept, K, y, l2_reg) l1_reg = beta * l1_ratio_beta coef = soft_thresholding(coef - gamma * gradient, gamma * l1_reg) coef = np.maximum(coef, 0.) if np.linalg.norm(coef - coef_old) < tol: break obj = logistic_objective(K, y, alpha, coef, lamda, beta) objective_difference = abs(objective_new - objective_old) # snorm = np.sqrt(squared_norm(coef - w_old) + # squared_norm(alpha - alpha_old)) diff_w = np.linalg.norm(coef - w_old) diff_a = np.sqrt( sum(squared_norm(a - a_old) for a, a_old in zip(alpha, alpha_old))) if verbose:# and iteration_ % 10 == 0: # print("obj: %.4f, snorm: %.4f" % (obj, snorm)) print("obj: %.4f, loss: %.4f, diff_w: %.4f, diff_a: %.4f" % ( obj, logistic_loss(K, y, alpha, coef, lamda, beta), diff_w, diff_a)) if diff_a < tol and objective_difference < tol: break if np.isnan(diff_w) or np.isnan(diff_a) or np.isnan(objective_difference): raise ValueError('something is nan') else: warnings.warn("Objective did not converge.") return_list = [alpha, coef, intercepts] if return_n_iter: return_list.append(iteration_) return return_list
def enet_kernel_learning_admm2( K, y, lamda=0.01, beta=0.01, rho=1., max_iter=100, verbose=0, rtol=1e-4, tol=1e-4, return_n_iter=True, update_rho_options=None): """Elastic Net kernel learning. Solve the following problem via ADMM: min sum_{i=1}^p 1/2 ||y_i - alpha_i * sum_{k=1}^{n_k} w_k * K_{ik}||^2 + lamda ||w||_1 + beta sum_{j=1}^{c_i}||alpha_j||_2^2 """ n_patients = len(K) n_kernels = len(K[0]) coef = np.ones(n_kernels) alpha = [np.zeros(K[j].shape[2]) for j in range(n_patients)] u = [np.zeros(K[j].shape[1]) for j in range(n_patients)] u_1 = np.zeros(n_kernels) w_1 = np.zeros(n_kernels) x_old = [np.zeros(K[0].shape[1]) for j in range(n_patients)] w_1_old = w_1.copy() # w_2_old = w_2.copy() checks = [] for iteration_ in range(max_iter): # update x A = [K[j].T.dot(coef) for j in range(n_patients)] x = [prox_laplacian(y[j] + rho * (A[j].T.dot(alpha[j]) - u[j]), rho / 2.) for j in range(n_patients)] # update alpha # solve (AtA + 2I)^-1 (Aty) with A = wK KK = [rho * A[j].dot(A[j].T) for j in range(n_patients)] yy = [rho * A[j].dot(x[j] + u[j]) for j in range(n_patients)] alpha = [_solve_cholesky_kernel( KK[j], yy[j][..., None], 2 * beta).ravel() for j in range(n_patients)] # equivalent to alpha_dot_K # solve (sum(AtA) + 2*rho I)^-1 (sum(Aty) + rho(w1+w2-u1-u2)) # with A = K * alpha A = [K[j].dot(alpha[j]) for j in range(n_patients)] KK = sum(A[j].dot(A[j].T) for j in range(n_patients)) yy = sum(A[j].dot(x[j] + u[j]) for j in range(n_patients)) yy += w_1 - u_1 coef = _solve_cholesky_kernel(KK, yy[..., None], 1).ravel() w_1 = soft_thresholding(coef + u_1, lamda / rho) # w_2 = prox_laplacian(coef + u_2, beta / rho) # update residuals alpha_coef_K = [ alpha[j].dot(K[j].T.dot(coef)) for j in range(n_patients)] residuals = [x[j] - alpha_coef_K[j] for j in range(n_patients)] u = [u[j] + residuals[j] for j in range(n_patients)] u_1 += coef - w_1 # diagnostics, reporting, termination checks rnorm = np.sqrt( squared_norm(coef - w_1) + sum(squared_norm(residuals[j]) for j in range(n_patients))) snorm = rho * np.sqrt( squared_norm(w_1 - w_1_old) + sum(squared_norm(x[j] - x_old[j]) for j in range(n_patients))) obj = objective_admm2(x, y, alpha, lamda, beta, w_1) check = convergence( obj=obj, rnorm=rnorm, snorm=snorm, e_pri=np.sqrt(coef.size + sum( x[j].size for j in range(n_patients))) * tol + rtol * max( np.sqrt(squared_norm(coef) + sum(squared_norm( alpha_coef_K[j]) for j in range(n_patients))), np.sqrt(squared_norm(w_1) + sum(squared_norm( x[j]) for j in range(n_patients)))), e_dual=np.sqrt(coef.size + sum( x[j].size for j in range(n_patients))) * tol + rtol * rho * ( np.sqrt(squared_norm(u_1) + sum(squared_norm( u[j]) for j in range(n_patients))))) w_1_old = w_1.copy() x_old = [x[j].copy() for j in range(n_patients)] if verbose: print("obj: %.4f, rnorm: %.4f, snorm: %.4f," "eps_pri: %.4f, eps_dual: %.4f" % check) checks.append(check) if check.rnorm <= check.e_pri and check.snorm <= check.e_dual and iteration_ > 1: break rho_new = update_rho(rho, rnorm, snorm, iteration=iteration_, **(update_rho_options or {})) # scaled dual variables should be also rescaled u = [u[j] * (rho / rho_new) for j in range(n_patients)] u_1 *= rho / rho_new rho = rho_new else: warnings.warn("Objective did not converge.") return_list = [alpha, coef] if return_n_iter: return_list.append(iteration_) return return_list
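# Illustrative usage sketch (synthetic kernels per patient, hypothetical demo
# name): relies on the module helpers `prox_laplacian`, `_solve_cholesky_kernel`,
# `soft_thresholding`, `objective_admm2`, `convergence` and `update_rho`.
def _example_enet_kernel_learning_admm2():
    import numpy as np
    rng = np.random.RandomState(0)
    n_patients, n_kernels, n_samples = 3, 4, 30
    K = [np.array([k.dot(k.T) for k in rng.randn(n_kernels, n_samples, n_samples)])
         for _ in range(n_patients)]
    y = [rng.randn(n_samples) for _ in range(n_patients)]

    # per-patient coefficients alpha, shared kernel weights coef, iterations used
    alpha, coef, n_iter = enet_kernel_learning_admm2(
        K, y, lamda=0.1, beta=0.1, max_iter=50)
    print(coef.shape)  # (4,)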
def kernel_latent_time_graphical_lasso( emp_cov, alpha=0.01, tau=1.0, rho=1.0, kernel_psi=None, kernel_phi=None, max_iter=100, verbose=False, psi="laplacian", phi="laplacian", mode="admm", tol=1e-4, rtol=1e-4, assume_centered=False, n_samples=None, return_history=False, return_n_iter=True, update_rho_options=None, compute_objective=True, init="empirical", ): r"""Time-varying latent variable graphical lasso solver. Solves the following problem via ADMM: min sum_{i=1}^T -n_i log_likelihood(K_i-L_i) + alpha ||K_i||_{od,1} + tau ||L_i||_* + sum_{s>t}^T k_psi(s,t) Psi(K_s - K_t) + sum_{s>t}^T k_phi(s,t)(L_s - L_t) where S is the empirical covariance of the data matrix D (training observations by features). Parameters ---------- emp_cov : ndarray, shape (n_features, n_features) Empirical covariance of data. alpha, tau, beta, eta : float, optional Regularisation parameters. rho : float, optional Augmented Lagrangian parameter. max_iter : int, optional Maximum number of iterations. tol : float, optional Absolute tolerance for convergence. rtol : float, optional Relative tolerance for convergence. return_history : bool, optional Return the history of computed values. Returns ------- K, L : numpy.array, 3-dimensional (T x d x d) Solution to the problem for each time t=1...T . history : list If return_history, then also a structure that contains the objective value, the primal and dual residual norms, and tolerances for the primal and dual residual norms at each iteration. """ psi, prox_psi, psi_node_penalty = check_norm_prox(psi) phi, prox_phi, phi_node_penalty = check_norm_prox(phi) n_times, _, n_features = emp_cov.shape if kernel_psi is None: kernel_psi = np.eye(n_times) if kernel_phi is None: kernel_phi = np.eye(n_times) Z_0 = init_precision(emp_cov, mode=init) W_0 = np.zeros_like(Z_0) X_0 = np.zeros_like(Z_0) R_old = np.zeros_like(Z_0) Z_M, Z_M_old = {}, {} Y_M = {} W_M, W_M_old = {}, {} U_M = {} for m in range(1, n_times): Z_L = Z_0.copy()[:-m] Z_R = Z_0.copy()[m:] Z_M[m] = (Z_L, Z_R) W_L = np.zeros_like(Z_L) W_R = np.zeros_like(Z_R) W_M[m] = (W_L, W_R) Y_L = np.zeros_like(Z_L) Y_R = np.zeros_like(Z_R) Y_M[m] = (Y_L, Y_R) U_L = np.zeros_like(W_L) U_R = np.zeros_like(W_R) U_M[m] = (U_L, U_R) Z_L_old = np.zeros_like(Z_L) Z_R_old = np.zeros_like(Z_R) Z_M_old[m] = (Z_L_old, Z_R_old) W_L_old = np.zeros_like(W_L) W_R_old = np.zeros_like(W_R) W_M_old[m] = (W_L_old, W_R_old) if n_samples is None: n_samples = np.ones(n_times) checks = [] for iteration_ in range(max_iter): # update R A = Z_0 - W_0 - X_0 A += A.transpose(0, 2, 1) A /= 2.0 A *= -rho / n_samples[:, None, None] A += emp_cov # A = emp_cov / rho - A R = np.array( [prox_logdet(a, lamda=ni / rho) for a, ni in zip(A, n_samples)]) # update Z_0 A = R + W_0 + X_0 for m in range(1, n_times): A[:-m] += Z_M[m][0] - Y_M[m][0] A[m:] += Z_M[m][1] - Y_M[m][1] A /= n_times Z_0 = soft_thresholding(A, lamda=alpha / (rho * n_times)) # update W_0 A = Z_0 - R - X_0 for m in range(1, n_times): A[:-m] += W_M[m][0] - U_M[m][0] A[m:] += W_M[m][1] - U_M[m][1] A /= n_times A += A.transpose(0, 2, 1) A /= 2.0 W_0 = np.array( [prox_trace_indicator(a, lamda=tau / (rho * n_times)) for a in A]) # update residuals X_0 += R - Z_0 + W_0 for m in range(1, n_times): # other Zs Y_L, Y_R = Y_M[m] A_L = Z_0[:-m] + Y_L A_R = Z_0[m:] + Y_R if not psi_node_penalty: prox_e = prox_psi(A_R - A_L, lamda=2.0 * np.diag(kernel_psi, m)[:, None, None] / rho) Z_L = 0.5 * (A_L + A_R - prox_e) Z_R = 0.5 * (A_L + A_R + prox_e) else: Z_L, Z_R = prox_psi( np.concatenate((A_L, A_R), axis=1), 
lamda=0.5 * np.diag(kernel_psi, m)[:, None, None] / rho, rho=rho, tol=tol, rtol=rtol, max_iter=max_iter, ) Z_M[m] = (Z_L, Z_R) # update other residuals Y_L += Z_0[:-m] - Z_L Y_R += Z_0[m:] - Z_R # other Ws U_L, U_R = U_M[m] A_L = W_0[:-m] + U_L A_R = W_0[m:] + U_R if not phi_node_penalty: prox_e = prox_phi(A_R - A_L, lamda=2.0 * np.diag(kernel_phi, m)[:, None, None] / rho) W_L = 0.5 * (A_L + A_R - prox_e) W_R = 0.5 * (A_L + A_R + prox_e) else: W_L, W_R = prox_phi( np.concatenate((A_L, A_R), axis=1), lamda=0.5 * np.diag(kernel_phi, m)[:, None, None] / rho, rho=rho, tol=tol, rtol=rtol, max_iter=max_iter, ) W_M[m] = (W_L, W_R) # update other residuals U_L += W_0[:-m] - W_L U_R += W_0[m:] - W_R # diagnostics, reporting, termination checks rnorm = np.sqrt( squared_norm(R - Z_0 + W_0) + sum( squared_norm(Z_0[:-m] - Z_M[m][0]) + squared_norm(Z_0[m:] - Z_M[m][1]) + squared_norm(W_0[:-m] - W_M[m][0]) + squared_norm(W_0[m:] - W_M[m][1]) for m in range(1, n_times))) snorm = rho * np.sqrt( squared_norm(R - R_old) + sum( squared_norm(Z_M[m][0] - Z_M_old[m][0]) + squared_norm(Z_M[m][1] - Z_M_old[m][1]) + squared_norm(W_M[m][0] - W_M_old[m][0]) + squared_norm(W_M[m][1] - W_M_old[m][1]) for m in range(1, n_times))) obj = (objective(emp_cov, n_samples, R, Z_0, Z_M, W_0, W_M, alpha, tau, kernel_psi, kernel_phi, psi, phi) if compute_objective else np.nan) check = convergence( obj=obj, rnorm=rnorm, snorm=snorm, e_pri=n_features * np.sqrt(n_times * (2 * n_times - 1)) * tol + rtol * max( np.sqrt( squared_norm(R) + sum( squared_norm(Z_M[m][0]) + squared_norm(Z_M[m][1]) + squared_norm(W_M[m][0]) + squared_norm(W_M[m][1]) for m in range(1, n_times))), np.sqrt( squared_norm(Z_0 - W_0) + sum( squared_norm(Z_0[:-m]) + squared_norm(Z_0[m:]) + squared_norm(W_0[:-m]) + squared_norm(W_0[m:]) for m in range(1, n_times))), ), e_dual=n_features * np.sqrt(n_times * (2 * n_times - 1)) * tol + rtol * rho * np.sqrt( squared_norm(X_0) + sum( squared_norm(Y_M[m][0]) + squared_norm(Y_M[m][1]) + squared_norm(U_M[m][0]) + squared_norm(U_M[m][1]) for m in range(1, n_times))), ) R_old = R.copy() for m in range(1, n_times): Z_M_old[m] = (Z_M[m][0].copy(), Z_M[m][1].copy()) W_M_old[m] = (W_M[m][0].copy(), W_M[m][1].copy()) if verbose: print("obj: %.4f, rnorm: %.4f, snorm: %.4f," "eps_pri: %.4f, eps_dual: %.4f" % check[:5]) checks.append(check) if check.rnorm <= check.e_pri and check.snorm <= check.e_dual: break rho_new = update_rho(rho, rnorm, snorm, iteration=iteration_, **(update_rho_options or {})) # scaled dual variables should be also rescaled X_0 *= rho / rho_new for m in range(1, n_times): Y_L, Y_R = Y_M[m] Y_L *= rho / rho_new Y_R *= rho / rho_new U_L, U_R = U_M[m] U_L *= rho / rho_new U_R *= rho / rho_new rho = rho_new else: warnings.warn("Objective did not converge.") covariance_ = np.array([linalg.pinvh(x) for x in Z_0]) return_list = [Z_0, W_0, covariance_] if return_history: return_list.append(checks) if return_n_iter: return_list.append(iteration_) return return_list
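# Illustrative usage sketch (synthetic covariances and an illustrative temporal
# kernel, hypothetical demo name): relies on the module helpers `check_norm_prox`,
# `init_precision`, `prox_logdet`, `prox_trace_indicator`, `soft_thresholding`,
# `objective`, `convergence` and `update_rho`.
def _example_kernel_latent_time_graphical_lasso():
    import numpy as np
    rng = np.random.RandomState(0)
    T, n, d = 4, 100, 8
    emp_cov = np.array([np.cov(rng.randn(n, d), rowvar=False) for _ in range(T)])

    # similarity between time points decays with their distance
    kernel = np.exp(-np.abs(np.subtract.outer(np.arange(T), np.arange(T))))

    K, L, covariance, n_iter = kernel_latent_time_graphical_lasso(
        emp_cov, alpha=0.1, tau=0.5, kernel_psi=kernel, kernel_phi=kernel,
        n_samples=np.full(T, n), max_iter=50)
    print(K.shape, L.shape)  # (4, 8, 8) sparse and low-rank components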
def group_lasso(A, b, lamda=1.0, groups=None, rho=1.0, alpha=1.0,
                max_iter=1000, tol=1e-4, rtol=1e-2, return_history=False):
    r"""Group Lasso solver.

    Solves the following problem via ADMM

        minimize 1/2*|| Ax - b ||_2^2 + \lambda sum(norm(x_i))

    The `groups` argument is a list of index blocks, so that x_i collects the
    variables in block i.

    Parameters
    ----------
    A : array-like, 2-dimensional
        Input matrix.
    b : array-like, 1-dimensional
        Output vector.
    lamda : float, optional
        Regularisation parameter.
    groups : list
        Groups of variables.
    rho : float, optional
        Augmented Lagrangian parameter.
    alpha : float, optional
        Over-relaxation parameter (typically between 1.0 and 1.8).
    max_iter : int, optional
        Maximum number of iterations.
    tol : float, optional
        Absolute tolerance for convergence.
    rtol : float, optional
        Relative tolerance for convergence.
    return_history : bool, optional
        Return the history of computed values.

    Returns
    -------
    x : numpy.array
        Solution to the problem.
    history : list
        If return_history, then also a structure that contains the
        objective value, the primal and dual residual norms, and
        tolerances for the primal and dual residual norms at each
        iteration.
    """
    n_samples, n_features = A.shape

    # check valid partition
    if not np.allclose(flatten(groups), np.arange(n_features)):
        raise ValueError("Invalid partition in groups. Groups must be "
                         "non-overlapping and each variable must be selected.")

    # save a matrix-vector multiply
    Atb = A.T.dot(b)

    # ADMM solver
    x = np.zeros(n_features)
    z = np.zeros(n_features)
    u = np.zeros(n_features)

    # pre-factor
    L, U = lu_factor(A, rho)

    hist = []
    for _ in range(max_iter):
        # x-update
        q = Atb + rho * (z - u)  # temporary value
        if n_samples >= n_features:
            x = np.linalg.lstsq(U, np.linalg.lstsq(L, q)[0])[0]
        else:
            x = q - A.T.dot(
                np.linalg.lstsq(U, np.linalg.lstsq(L, A.dot(q))[0])[0]) / rho
            x /= rho

        # z-update with relaxation
        zold = z.copy()  # copy: z is updated in place, group by group, below
        x_hat = alpha * x + (1 - alpha) * zold
        for group in groups:
            z[group] = soft_thresholding(x_hat[group] + u[group], lamda / rho)

        # u-update
        u += (x_hat - z)

        # diagnostics, reporting, termination checks
        history = (
            objective(A, b, lamda, groups, x, z),  # obj
            np.linalg.norm(x - z),  # r norm
            np.linalg.norm(-rho * (z - zold)),  # s norm
            np.sqrt(n_features) * tol + rtol * max(
                np.linalg.norm(x), np.linalg.norm(-z)),  # eps pri
            np.sqrt(n_features) * tol + rtol * np.linalg.norm(rho * u)  # eps dual
        )
        hist.append(history)
        if history[1] < history[3] and history[2] < history[4]:
            break

    return (z, hist) if return_history else z
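# Illustrative usage sketch (synthetic data, hypothetical demo name): `groups`
# must be a non-overlapping partition of the feature indices, as enforced by the
# `flatten(groups)` check above.
def _example_group_lasso():
    import numpy as np
    rng = np.random.RandomState(0)
    A = rng.randn(40, 9)
    x_true = np.zeros(9)
    x_true[3:6] = [1.0, -1.5, 2.0]  # only the second group is active
    b = A.dot(x_true) + 0.01 * rng.randn(40)

    groups = [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
    x_hat, history = group_lasso(A, b, lamda=0.5, groups=groups,
                                 return_history=True)
    print(np.round(x_hat, 2))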
def group_lasso_overlap(A, b, lamda=1.0, groups=None, rho=1.0, max_iter=100, tol=1e-4, verbose=False, rtol=1e-2): r"""Group Lasso with Overlap solver. Solves the following problem via ADMM minimize 1/2*|| Ax - b ||_2^2 + \lambda sum(norm(x_i)) The input p is a K-element vector giving the block sizes n_i, so that x_i is in R^{n_i}. Parameters ---------- A : array-like, 2-dimensional Input matrix. b : array-like, 1-dimensional Output vector. lamda : float, optional Regularisation parameter. groups : list Groups of variables. rho : float, optional Augmented Lagrangian parameter. alpha : float, optional Over-relaxation parameter (typically between 1.0 and 1.8). max_iter : int, optional Maximum number of iterations. tol : float, optional Absolute tolerance for convergence. rtol : float, optional Relative tolerance for convergence. return_history : bool, optional Return the history of computed values. Returns ------- x : numpy.array Solution to the problem. history : list If return_history, then also a structure that contains the objective value, the primal and dual residual norms, and tolerances for the primal and dual residual norms at each iteration. """ n, d = A.shape x = [np.zeros(len(g)) for g in groups] # local variables z = np.zeros(d) y = [np.zeros(len(g)) for g in groups] D = np.diag(D_function(d, groups)) Atb = A.T.dot(b) inv = np.linalg.inv(A.T.dot(A) + rho * D) hist = [] count = 0 for k in range(max_iter): # x update for i, g in enumerate(groups): x[i] = soft_thresholding(x[i] - y[i] / rho, lamda / rho) # z update zold = z x_consensus = P_star_x_bar_function(x, d, groups) y_consensus = P_star_x_bar_function(y, d, groups) z = inv.dot(Atb + D.dot(y_consensus + rho * x_consensus)) for i, g in enumerate(groups): y[i] += rho * (x[i] - z[g]) # diagnostics, reporting, termination checks history = ( objective(A, b, lamda, x, z), # objective np.linalg.norm(x_consensus - z), # rnorm np.linalg.norm(-rho * (z - zold)), # snorm np.sqrt(d) * tol + rtol * max(np.linalg.norm(x_consensus), np.linalg.norm(-z)), # eps primal np.sqrt(d) * tol + rtol * np.linalg.norm(rho * y_consensus) # eps dual ) if verbose: print("obj: %.4f, rnorm: %.4f, snorm: %.4f," "eps_pri: %.4f, eps_dual: %.4f" % history) hist.append(history) if history[1] < history[3] and history[2] < history[4]: if count > 10: break else: count += 1 else: count = 0 return z, hist, k
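# Illustrative usage sketch (synthetic data, hypothetical demo name): unlike
# `group_lasso`, overlapping groups are allowed here; the solver relies on the
# module helpers `D_function`, `P_star_x_bar_function`, `soft_thresholding` and
# `objective`.
def _example_group_lasso_overlap():
    import numpy as np
    rng = np.random.RandomState(0)
    A = rng.randn(40, 6)
    b = rng.randn(40)

    groups = [[0, 1, 2], [2, 3, 4, 5]]  # variable 2 belongs to both groups
    z, history, n_iter = group_lasso_overlap(
        A, b, lamda=0.5, groups=groups, max_iter=200)
    print(z.shape)  # (6,)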
def kernel_time_graphical_lasso( emp_cov, alpha=0.01, rho=1, kernel=None, max_iter=100, n_samples=None, verbose=False, psi="laplacian", tol=1e-4, rtol=1e-4, return_history=False, return_n_iter=True, mode="admm", update_rho_options=None, compute_objective=True, stop_at=None, stop_when=1e-4, init="empirical", ): """Time-varying graphical lasso solver. Solves the following problem via ADMM: min sum_{i=1}^T -n_i log_likelihood(K_i-L_i) + alpha ||K_i||_{od,1} + sum_{s>t}^T k_psi(s,t) Psi(K_s - K_t) where S is the empirical covariance of the data matrix D (training observations by features). Parameters ---------- emp_cov : ndarray, shape (n_features, n_features) Empirical covariance of data. alpha, beta : float, optional Regularisation parameter. rho : float, optional Augmented Lagrangian parameter. max_iter : int, optional Maximum number of iterations. tol : float, optional Absolute tolerance for convergence. rtol : float, optional Relative tolerance for convergence. return_history : bool, optional Return the history of computed values. init : {'empirical', 'zeros', ndarray}, default 'empirical' How to initialise the inverse covariance matrix. Default is take the empirical covariance and inverting it. Returns ------- X : numpy.array, 2-dimensional Solution to the problem. history : list If return_history, then also a structure that contains the objective value, the primal and dual residual norms, and tolerances for the primal and dual residual norms at each iteration. """ psi, prox_psi, psi_node_penalty = check_norm_prox(psi) n_times, _, n_features = emp_cov.shape if kernel is None: kernel = np.eye(n_times) Z_0 = init_precision(emp_cov, mode=init) U_0 = np.zeros_like(Z_0) Z_0_old = np.zeros_like(Z_0) Z_M, Z_M_old = {}, {} U_M = {} for m in range(1, n_times): # all possible markovians jumps Z_L = Z_0.copy()[:-m] Z_R = Z_0.copy()[m:] Z_M[m] = (Z_L, Z_R) U_L = np.zeros_like(Z_L) U_R = np.zeros_like(Z_R) U_M[m] = (U_L, U_R) Z_L_old = np.zeros_like(Z_L) Z_R_old = np.zeros_like(Z_R) Z_M_old[m] = (Z_L_old, Z_R_old) if n_samples is None: n_samples = np.ones(n_times) checks = [ convergence(obj=objective(n_samples, emp_cov, Z_0, Z_0, Z_M, alpha, kernel, psi)) ] for iteration_ in range(max_iter): # update K A = Z_0 - U_0 for m in range(1, n_times): A[:-m] += Z_M[m][0] - U_M[m][0] A[m:] += Z_M[m][1] - U_M[m][1] A /= n_times # soft_thresholding_ = partial(soft_thresholding, lamda=alpha / rho) # K = np.array(map(soft_thresholding_, A)) A += A.transpose(0, 2, 1) A /= 2.0 A *= -rho * n_times / n_samples[:, None, None] A += emp_cov K = np.array([ prox_logdet(a, lamda=ni / (rho * n_times)) for a, ni in zip(A, n_samples) ]) # update Z_0 A = K + U_0 A += A.transpose(0, 2, 1) A /= 2.0 Z_0 = soft_thresholding(A, lamda=alpha / rho) # update residuals U_0 += K - Z_0 # other Zs for m in range(1, n_times): U_L, U_R = U_M[m] A_L = K[:-m] + U_L A_R = K[m:] + U_R if not psi_node_penalty: prox_e = prox_psi(A_R - A_L, lamda=2.0 * np.diag(kernel, m)[:, None, None] / rho) Z_L = 0.5 * (A_L + A_R - prox_e) Z_R = 0.5 * (A_L + A_R + prox_e) else: Z_L, Z_R = prox_psi( np.concatenate((A_L, A_R), axis=1), lamda=0.5 * np.diag(kernel, m)[:, None, None] / rho, rho=rho, tol=tol, rtol=rtol, max_iter=max_iter, ) Z_M[m] = (Z_L, Z_R) # update other residuals U_L += K[:-m] - Z_L U_R += K[m:] - Z_R # diagnostics, reporting, termination checks rnorm = np.sqrt( squared_norm(K - Z_0) + sum( squared_norm(K[:-m] - Z_M[m][0]) + squared_norm(K[m:] - Z_M[m][1]) for m in range(1, n_times))) snorm = rho * np.sqrt( squared_norm(Z_0 - Z_0_old) + sum( 
squared_norm(Z_M[m][0] - Z_M_old[m][0]) + squared_norm(Z_M[m][1] - Z_M_old[m][1]) for m in range(1, n_times))) obj = objective(n_samples, emp_cov, Z_0, K, Z_M, alpha, kernel, psi) if compute_objective else np.nan check = convergence( obj=obj, rnorm=rnorm, snorm=snorm, e_pri=n_features * n_times * tol + rtol * max( np.sqrt( squared_norm(Z_0) + sum( squared_norm(Z_M[m][0]) + squared_norm(Z_M[m][1]) for m in range(1, n_times))), np.sqrt( squared_norm(K) + sum( squared_norm(K[:-m]) + squared_norm(K[m:]) for m in range(1, n_times))), ), e_dual=n_features * n_times * tol + rtol * rho * np.sqrt( squared_norm(U_0) + sum( squared_norm(U_M[m][0]) + squared_norm(U_M[m][1]) for m in range(1, n_times))), ) Z_0_old = Z_0.copy() for m in range(1, n_times): Z_M_old[m] = (Z_M[m][0].copy(), Z_M[m][1].copy()) if verbose: print("obj: %.4f, rnorm: %.4f, snorm: %.4f," "eps_pri: %.4f, eps_dual: %.4f" % check[:5]) checks.append(check) if stop_at is not None: if abs(check.obj - stop_at) / abs(stop_at) < stop_when: break if check.rnorm <= check.e_pri and check.snorm <= check.e_dual: break rho_new = update_rho(rho, rnorm, snorm, iteration=iteration_, **(update_rho_options or {})) # scaled dual variables should be also rescaled U_0 *= rho / rho_new for m in range(1, n_times): U_L, U_R = U_M[m] U_L *= rho / rho_new U_R *= rho / rho_new rho = rho_new else: warnings.warn("Objective did not converge.") covariance_ = np.array([linalg.pinvh(x) for x in Z_0]) return_list = [Z_0, covariance_] if return_history: return_list.append(checks) if return_n_iter: return_list.append(iteration_ + 1) return return_list
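# Illustrative usage sketch (synthetic covariances and an illustrative temporal
# kernel, hypothetical demo name): the kernel entry k(s, t) weights the penalty on
# K_s - K_t; relies on the same module helpers as `time_graphical_lasso`.
def _example_kernel_time_graphical_lasso():
    import numpy as np
    rng = np.random.RandomState(0)
    T, n, d = 4, 100, 8
    emp_cov = np.array([np.cov(rng.randn(n, d), rowvar=False) for _ in range(T)])

    kernel = np.exp(-np.abs(np.subtract.outer(np.arange(T), np.arange(T))))

    K, covariance, n_iter = kernel_time_graphical_lasso(
        emp_cov, alpha=0.1, kernel=kernel, n_samples=np.full(T, n), max_iter=50)
    print(K.shape)  # (4, 8, 8)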
def latent_time_graphical_lasso(emp_cov, alpha=0.01, tau=1., rho=1., beta=1., eta=1., max_iter=100, n_samples=None, verbose=False, psi='laplacian', phi='laplacian', mode='admm', tol=1e-4, rtol=1e-4, return_history=False, return_n_iter=True, update_rho_options=None, compute_objective=True, init='empirical'): r"""Latent variable time-varying graphical lasso solver. Solves the following problem via ADMM: min sum_{i=1}^T -n_i log_likelihood(S_i, K_i-L_i) + alpha ||K_i||_{od,1} + tau ||L_i||_* + beta sum_{i=2}^T Psi(K_i - K_{i-1}) + eta sum_{i=2}^T Phi(L_i - L_{i-1}) where S_i = (1/n_i) X_i^T \times X_i is the empirical covariance of data matrix X (training observations by features). Parameters ---------- emp_cov : ndarray, shape (n_features, n_features) Empirical covariance of data. alpha, tau, beta, eta : float, optional Regularisation parameters. rho : float, optional Augmented Lagrangian parameter. max_iter : int, optional Maximum number of iterations. n_samples : ndarray Number of samples available for each time point. tol : float, optional Absolute tolerance for convergence. rtol : float, optional Relative tolerance for convergence. return_history : bool, optional Return the history of computed values. return_n_iter : bool, optional Return the number of iteration before convergence. verbose : bool, default False Print info at each iteration. update_rho_options : dict, optional Arguments for the rho update. See regain.update_rules.update_rho function for more information. compute_objective : bool, default True Choose to compute the objective value. init : {'empirical', 'zeros', ndarray}, default 'empirical' How to initialise the inverse covariance matrix. Default is take the empirical covariance and inverting it. Returns ------- K, L : numpy.array, 3-dimensional (T x d x d) Solution to the problem for each time t=1...T . history : list If return_history, then also a structure that contains the objective value, the primal and dual residual norms, and tolerances for the primal and dual residual norms at each iteration. """ psi, prox_psi, psi_node_penalty = check_norm_prox(psi) phi, prox_phi, phi_node_penalty = check_norm_prox(phi) Z_0 = init_precision(emp_cov, mode=init) Z_1 = Z_0.copy()[:-1] Z_2 = Z_0.copy()[1:] W_0 = np.zeros_like(Z_0) W_1 = np.zeros_like(Z_1) W_2 = np.zeros_like(Z_2) X_0 = np.zeros_like(Z_0) X_1 = np.zeros_like(Z_1) X_2 = np.zeros_like(Z_2) U_1 = np.zeros_like(W_1) U_2 = np.zeros_like(W_2) R_old = np.zeros_like(Z_0) Z_1_old = np.zeros_like(Z_1) Z_2_old = np.zeros_like(Z_2) W_1_old = np.zeros_like(W_1) W_2_old = np.zeros_like(W_2) # divisor for consensus variables, accounting for two less matrices divisor = np.full(emp_cov.shape[0], 3, dtype=float) divisor[0] -= 1 divisor[-1] -= 1 if n_samples is None: n_samples = np.ones(emp_cov.shape[0]) checks = [] for iteration_ in range(max_iter): # update R A = Z_0 - W_0 - X_0 A += A.transpose(0, 2, 1) A /= 2. A *= -rho / n_samples[:, None, None] A += emp_cov # A = emp_cov / rho - A R = np.array( [prox_logdet(a, lamda=ni / rho) for a, ni in zip(A, n_samples)]) # update Z_0 A = R + W_0 + X_0 A[:-1] += Z_1 - X_1 A[1:] += Z_2 - X_2 A /= divisor[:, None, None] # soft_thresholding_ = partial(soft_thresholding, lamda=alpha / rho) # Z_0 = np.array(map(soft_thresholding_, A)) Z_0 = soft_thresholding(A, lamda=alpha / (rho * divisor[:, None, None])) # update Z_1, Z_2 A_1 = Z_0[:-1] + X_1 A_2 = Z_0[1:] + X_2 if not psi_node_penalty: prox_e = prox_psi(A_2 - A_1, lamda=2. 
* beta / rho) Z_1 = .5 * (A_1 + A_2 - prox_e) Z_2 = .5 * (A_1 + A_2 + prox_e) else: Z_1, Z_2 = prox_psi(np.concatenate((A_1, A_2), axis=1), lamda=.5 * beta / rho, rho=rho, tol=tol, rtol=rtol, max_iter=max_iter) # update W_0 A = Z_0 - R - X_0 A[:-1] += W_1 - U_1 A[1:] += W_2 - U_2 A /= divisor[:, None, None] A += A.transpose(0, 2, 1) A /= 2. W_0 = np.array([ prox_trace_indicator(a, lamda=tau / (rho * div)) for a, div in zip(A, divisor) ]) # update W_1, W_2 A_1 = W_0[:-1] + U_1 A_2 = W_0[1:] + U_2 if not phi_node_penalty: prox_e = prox_phi(A_2 - A_1, lamda=2. * eta / rho) W_1 = .5 * (A_1 + A_2 - prox_e) W_2 = .5 * (A_1 + A_2 + prox_e) else: W_1, W_2 = prox_phi(np.concatenate((A_1, A_2), axis=1), lamda=.5 * eta / rho, rho=rho, tol=tol, rtol=rtol, max_iter=max_iter) # update residuals X_0 += R - Z_0 + W_0 X_1 += Z_0[:-1] - Z_1 X_2 += Z_0[1:] - Z_2 U_1 += W_0[:-1] - W_1 U_2 += W_0[1:] - W_2 # diagnostics, reporting, termination checks rnorm = np.sqrt( squared_norm(R - Z_0 + W_0) + squared_norm(Z_0[:-1] - Z_1) + squared_norm(Z_0[1:] - Z_2) + squared_norm(W_0[:-1] - W_1) + squared_norm(W_0[1:] - W_2)) snorm = rho * np.sqrt( squared_norm(R - R_old) + squared_norm(Z_1 - Z_1_old) + squared_norm(Z_2 - Z_2_old) + squared_norm(W_1 - W_1_old) + squared_norm(W_2 - W_2_old)) obj = objective(emp_cov, n_samples, R, Z_0, Z_1, Z_2, W_0, W_1, W_2, alpha, tau, beta, eta, psi, phi) \ if compute_objective else np.nan check = convergence( obj=obj, rnorm=rnorm, snorm=snorm, e_pri=np.sqrt(R.size + 4 * Z_1.size) * tol + rtol * max( np.sqrt( squared_norm(R) + squared_norm(Z_1) + squared_norm(Z_2) + squared_norm(W_1) + squared_norm(W_2)), np.sqrt( squared_norm(Z_0 - W_0) + squared_norm(Z_0[:-1]) + squared_norm(Z_0[1:]) + squared_norm(W_0[:-1]) + squared_norm(W_0[1:]))), e_dual=np.sqrt(R.size + 4 * Z_1.size) * tol + rtol * rho * (np.sqrt( squared_norm(X_0) + squared_norm(X_1) + squared_norm(X_2) + squared_norm(U_1) + squared_norm(U_2)))) R_old = R.copy() Z_1_old = Z_1.copy() Z_2_old = Z_2.copy() W_1_old = W_1.copy() W_2_old = W_2.copy() if verbose: print("obj: %.4f, rnorm: %.4f, snorm: %.4f," "eps_pri: %.4f, eps_dual: %.4f" % check[:5]) checks.append(check) if check.rnorm <= check.e_pri and check.snorm <= check.e_dual: break rho_new = update_rho(rho, rnorm, snorm, iteration=iteration_, **(update_rho_options or {})) # scaled dual variables should be also rescaled X_0 *= rho / rho_new X_1 *= rho / rho_new X_2 *= rho / rho_new U_1 *= rho / rho_new U_2 *= rho / rho_new rho = rho_new else: warnings.warn("Objective did not converge.") covariance_ = np.array([linalg.pinvh(x) for x in Z_0]) return_list = [Z_0, W_0, covariance_] if return_history: return_list.append(checks) if return_n_iter: return_list.append(iteration_) return return_list
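# Illustrative usage sketch (synthetic covariances, hypothetical demo name): the
# solver returns the sparse precision matrices, the latent (low-rank) components,
# the corresponding covariances and the number of iterations.
def _example_latent_time_graphical_lasso():
    import numpy as np
    rng = np.random.RandomState(0)
    T, n, d = 4, 100, 8
    emp_cov = np.array([np.cov(rng.randn(n, d), rowvar=False) for _ in range(T)])

    K, L, covariance, n_iter = latent_time_graphical_lasso(
        emp_cov, alpha=0.1, tau=0.5, beta=1.0, eta=1.0,
        n_samples=np.full(T, n), max_iter=50)
    print(K.shape, L.shape)  # (4, 8, 8) precision and latent components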
def enet_kernel_learning_admm(K, y, lamda=0.01, beta=0.01, rho=1., max_iter=100, verbose=0, rtol=1e-4, tol=1e-4, return_n_iter=True, update_rho_options=None): """Elastic Net kernel learning. Solve the following problem via ADMM: min sum_{i=1}^p 1/2 ||alpha_i * w * K_i - y_i||^2 + lamda ||w||_1 + + beta||w||_2^2 """ n_patients = len(K) n_kernels = len(K[0]) coef = np.ones(n_kernels) u_1 = np.zeros(n_kernels) u_2 = np.zeros(n_kernels) w_1 = np.zeros(n_kernels) w_2 = np.zeros(n_kernels) w_1_old = w_1.copy() w_2_old = w_2.copy() checks = [] for iteration_ in range(max_iter): # update alpha # solve (AtA + 2I)^-1 (Aty) with A = wK A = [K[j].T.dot(coef) for j in range(n_patients)] KK = [A[j].dot(A[j].T) for j in range(n_patients)] yy = [y[j].dot(A[j]) for j in range(n_patients)] alpha = [ _solve_cholesky_kernel(KK[j], yy[j][..., None], 2).ravel() for j in range(n_patients) ] # alpha = [_solve_cholesky_kernel( # K_dot_coef[j], y[j][..., None], 0).ravel() for j in range(n_patients)] w_1 = soft_thresholding(coef + u_1, lamda / rho) w_2 = prox_laplacian(coef + u_2, beta / rho) # equivalent to alpha_dot_K # solve (sum(AtA) + 2*rho I)^-1 (sum(Aty) + rho(w1+w2-u1-u2)) # with A = K * alpha A = [K[j].dot(alpha[j]) for j in range(n_patients)] KK = sum(A[j].dot(A[j].T) for j in range(n_patients)) yy = sum(y[j].dot(A[j].T) for j in range(n_patients)) yy += rho * (w_1 + w_2 - u_1 - u_2) coef = _solve_cholesky_kernel(KK, yy[..., None], 2 * rho).ravel() # update residuals u_1 += coef - w_1 u_2 += coef - w_2 # diagnostics, reporting, termination checks rnorm = np.sqrt(squared_norm(coef - w_1) + squared_norm(coef - w_2)) snorm = rho * np.sqrt( squared_norm(w_1 - w_1_old) + squared_norm(w_2 - w_2_old)) obj = objective_admm(K, y, alpha, lamda, beta, coef, w_1, w_2) check = convergence( obj=obj, rnorm=rnorm, snorm=snorm, e_pri=np.sqrt(2 * coef.size) * tol + rtol * max(np.sqrt(squared_norm(coef) + squared_norm(coef)), np.sqrt(squared_norm(w_1) + squared_norm(w_2))), e_dual=np.sqrt(2 * coef.size) * tol + rtol * rho * (np.sqrt(squared_norm(u_1) + squared_norm(u_2)))) w_1_old = w_1.copy() w_2_old = w_2.copy() if verbose: print("obj: %.4f, rnorm: %.4f, snorm: %.4f," "eps_pri: %.4f, eps_dual: %.4f" % check) checks.append(check) if check.rnorm <= check.e_pri and check.snorm <= check.e_dual and iteration_ > 1: break rho_new = update_rho(rho, rnorm, snorm, iteration=iteration_, **(update_rho_options or {})) # scaled dual variables should be also rescaled u_1 *= rho / rho_new u_2 *= rho / rho_new rho = rho_new else: warnings.warn("Objective did not converge.") return_list = [alpha, coef] if return_n_iter: return_list.append(iteration_) return return_list
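# Illustrative usage sketch (synthetic kernels per patient, hypothetical demo
# name): this variant splits the kernel-weight penalty into two ADMM blocks,
# w_1 for the l1 part (soft thresholding) and w_2 handled by `prox_laplacian`.
def _example_enet_kernel_learning_admm():
    import numpy as np
    rng = np.random.RandomState(0)
    n_patients, n_kernels, n_samples = 3, 4, 30
    K = [np.array([k.dot(k.T) for k in rng.randn(n_kernels, n_samples, n_samples)])
         for _ in range(n_patients)]
    y = [rng.randn(n_samples) for _ in range(n_patients)]

    alpha, coef, n_iter = enet_kernel_learning_admm(
        K, y, lamda=0.1, beta=0.1, max_iter=50)
    print(coef.shape)  # (4,)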
def infimal_convolution( S, alpha=1.0, tau=1.0, rho=1.0, max_iter=100, verbose=False, tol=1e-4, rtol=1e-2, return_history=False, return_n_iter=True, update_rho_options=None, compute_objective=True, ): r"""Latent variable graphical lasso solver. Solves the following problem via ADMM: min - log_likelihood(S, K-L) + alpha ||K||_{od,1} + tau ||L_i||_* where S is the empirical covariance of the data matrix D (training observations by features). Parameters ---------- emp_cov : array-like Empirical covariance matrix. alpha, tau : float, optional Regularisation parameters. rho : float, optional Augmented Lagrangian parameter. max_iter : int, optional Maximum number of iterations. tol : float, optional Absolute tolerance for convergence. rtol : float, optional Relative tolerance for convergence. return_history : bool, optional Return the history of computed values. return_n_iter : bool, optional Return the number of iteration before convergence. verbose : bool, default False Print info at each iteration. Returns ------- K, L : np.array, 2-dimensional, size (d x d) Solution to the problem. S : np.array, 2 dimensional Empirical covariance matrix. n_iter : int If return_n_iter, returns the number of iterations before convergence. history : list If return_history, then also a structure that contains the objective value, the primal and dual residual norms, and tolerances for the primal and dual residual norms at each iteration. """ K = np.zeros_like(S) L = np.zeros_like(S) U = np.zeros_like(S) R_old = np.zeros_like(S) checks = [] for iteration_ in range(max_iter): # update R A = K - L - U A += A.T A /= 2.0 R = prox_laplacian(S + rho * A, lamda=rho / 2.0) A = L + R + U K = soft_thresholding(A, lamda=alpha / rho) A = K - R - U A += A.T A /= 2.0 L = prox_trace_indicator(A, lamda=tau / rho) # update residuals U += R - K + L # diagnostics, reporting, termination checks obj = objective(S, R, K, L, alpha, tau) if compute_objective else np.nan rnorm = np.linalg.norm(R - K + L) snorm = rho * np.linalg.norm(R - R_old) check = convergence( obj=obj, rnorm=rnorm, snorm=snorm, e_pri=np.sqrt(R.size) * tol + rtol * max(np.linalg.norm(R), np.linalg.norm(K - L)), e_dual=np.sqrt(R.size) * tol + rtol * rho * np.linalg.norm(U), ) R_old = R.copy() if verbose: print("obj: %.4f, rnorm: %.4f, snorm: %.4f," "eps_pri: %.4f, eps_dual: %.4f" % check[:5]) checks.append(check) if check.rnorm <= check.e_pri and check.snorm <= check.e_dual: break if check.obj == np.inf: break rho_new = update_rho(rho, rnorm, snorm, iteration=iteration_, **(update_rho_options or {})) # scaled dual variables should be also rescaled U *= rho / rho_new rho = rho_new else: warnings.warn("Objective did not converge.") covariance_ = linalg.pinvh(K) return_list = [K, L, covariance_] if return_history: return_list.append(checks) if return_n_iter: return_list.append(iteration_) return return_list
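# Illustrative usage sketch (synthetic covariance, hypothetical demo name): relies
# on the module helpers `prox_laplacian`, `soft_thresholding`,
# `prox_trace_indicator`, `objective`, `convergence` and `update_rho`.
def _example_infimal_convolution():
    import numpy as np
    rng = np.random.RandomState(0)
    S = np.cov(rng.randn(100, 10), rowvar=False)

    K, L, covariance, n_iter = infimal_convolution(
        S, alpha=0.1, tau=0.5, max_iter=50)
    print(K.shape, L.shape)  # (10, 10) sparse and low-rank components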
def enet_kernel_learning(
        K, y, lamda=0.01, beta=0.01, gamma='auto', max_iter=100, verbose=0,
        tol=1e-4, return_n_iter=True):
    """Elastic Net kernel learning.

    Solve the following problem via alternating minimisation:
        min sum_{i=1}^p 1/2 || alpha_i * w * K_i - y_i ||^2
            + lamda ||w||_1 + beta ||w||_2^2
    """
    n_patients = len(K)
    n_kernels = len(K[0])
    coef = np.ones(n_kernels)
    alpha = [np.zeros(K[j].shape[2]) for j in range(n_patients)]

    if gamma == 'auto':
        # Upper bound on the Lipschitz constant of the smooth part,
        # used to pick a safe step size.
        lipschitz_constant = np.array([
            sum(np.linalg.norm(K_j[i].dot(K_j[i].T))
                for i in range(K_j.shape[0])) for K_j in K
        ])
        gamma = 1. / lipschitz_constant

    objective_new = 0
    for iteration_ in range(max_iter):
        w_old = coef.copy()
        alpha_old = [a.copy() for a in alpha]
        objective_old = objective_new

        # update w: gradient step on the smooth part, then soft-thresholding
        # for the l1 penalty
        A = [K[j].dot(alpha[j]) for j in range(n_patients)]
        alpha_coef_K = [
            alpha[j].dot(K[j].T.dot(coef)) for j in range(n_patients)
        ]
        gradient = sum(
            (alpha_coef_K[j] - y[j]).dot(A[j].T) for j in range(n_patients))
        coef = soft_thresholding(coef - gamma * gradient, lamda=lamda * gamma)

        # update alpha: gradient step on the smooth part plus the l2 penalty
        A = [K[j].T.dot(coef) for j in range(n_patients)]
        alpha_coef_K = [
            alpha[j].dot(K[j].T.dot(coef)) for j in range(n_patients)
        ]
        gradient = [(alpha_coef_K[j] - y[j]).dot(A[j].T) + 2 * beta * alpha[j]
                    for j in range(n_patients)]
        alpha = [alpha[j] - gamma * gradient[j] for j in range(n_patients)]

        objective_new = objective(K, y, alpha, lamda, beta, coef)
        objective_difference = abs(objective_new - objective_old)
        snorm = np.sqrt(
            squared_norm(coef - w_old) + sum(
                squared_norm(a - a_old)
                for a, a_old in zip(alpha, alpha_old)))

        obj = objective(K, y, alpha, lamda, beta, coef)

        if verbose and iteration_ % 10 == 0:
            print("obj: %.4f, snorm: %.4f" % (obj, snorm))

        if snorm < tol and objective_difference < tol:
            break
        if np.isnan(snorm) or np.isnan(objective_difference):
            raise ValueError(
                "The objective diverged (NaN encountered); "
                "try a smaller gamma.")
    else:
        warnings.warn("Objective did not converge.")

    return_list = [alpha, coef]
    if return_n_iter:
        return_list.append(iteration_)
    return return_list
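# The coef-update above is a single ISTA step: a gradient step on the smooth
# part followed by soft-thresholding scaled by the step size. A minimal,
# self-contained sketch of the same update on a toy lasso problem (the data
# and function name are illustrative only):
def _ista_lasso_sketch(A, b, lamda=0.1, n_iter=200):
    """Minimise 0.5 * ||A x - b||^2 + lamda * ||x||_1 with a fixed step."""
    gamma = 1. / np.linalg.norm(A, 2) ** 2  # inverse Lipschitz constant
    x = np.zeros(A.shape[1])
    for _ in range(n_iter):
        grad = A.T.dot(A.dot(x) - b)  # gradient of the smooth part
        step = x - gamma * grad
        x = np.sign(step) * np.maximum(np.abs(step) - lamda * gamma, 0)
    return x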
def fit_each_variable(
        X, ix, alpha=1e-2, gamma=1, tol=1e-3, max_iter=100, verbose=0,
        update_gamma=0.5, return_history=True, compute_objective=True,
        return_n_iter=False, adjust_gamma=False, A=None, T=0, rho=1):
    n, d = X.shape
    theta = np.zeros(d - 1)
    selector = [i for i in range(d) if i != ix]

    def gradient(X, theta, r, selector, n, A, T, rho):
        XTX = X[:, selector].T.dot(X[:, r])
        EXK = X[:, selector].T.dot(np.exp(X[:, selector].dot(theta)))
        to_add = 0
        if A is not None:
            to_add = (rho * T) * (theta - A[r, selector]) / n
        return -(1 / n) * (XTX - EXK) + to_add

    thetas = [theta]
    checks = []
    for iter_ in range(max_iter):
        theta_old = thetas[-1]
        grad = gradient(X, theta, ix, selector, n, A, T, rho)
        loss_old = loss_single_variable(X, theta_old, n, ix, selector)

        # Backtracking line search: shrink gamma until the proximal gradient
        # step taken from theta_old satisfies the quadratic upper bound.
        while True:
            theta = soft_thresholding(theta_old - gamma * grad, alpha * gamma)
            loss_new = loss_single_variable(X, theta, n, ix, selector)

            diff_theta2 = np.linalg.norm(theta_old - theta) ** 2
            grad_diff = grad.dot(theta_old - theta)
            bound = loss_old - grad_diff + diff_theta2 / (2 * gamma)
            if loss_new > bound or np.isinf(loss_new) or np.isnan(loss_new):
                gamma = update_gamma * gamma
            else:
                break

        thetas.append(theta)

        if iter_ > 0:
            check = convergence(
                iter=iter_,
                obj=objective_single_variable(X, theta, n, ix, selector,
                                              alpha),
                iter_norm=np.linalg.norm(thetas[-2] - thetas[-1]),
                iter_r_norm=(np.linalg.norm(thetas[-2] - thetas[-1]) /
                             np.linalg.norm(thetas[-2])),
            )
            checks.append(check)
            # if adjust_gamma: # TODO multiply or divide

            if verbose:
                print("Iter: %d, objective: %.4f, iter_norm %.4f,"
                      " iter_norm_normalized: %.4f" %
                      (check[0], check[1], check[2], check[3]))

            if np.abs(check[2]) < tol:
                break

    return_list = [thetas[-1]]
    if return_history:
        return_list.append(thetas)
        return_list.append(checks)
    if return_n_iter:
        return_list.append(iter_)
    return return_list
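# The while-loop in fit_each_variable is a backtracking line search for a
# proximal gradient step: the step size is shrunk until the loss at the new
# point lies below the quadratic upper bound built at theta_old. A generic,
# self-contained sketch of that acceptance test (the smooth loss `f` and the
# helper name are hypothetical):
def _backtracking_prox_step(f, grad, theta_old, gamma, alpha, shrink=0.5,
                            max_shrink=50):
    """Return (theta, gamma) after one accepted proximal gradient step."""
    f_old = f(theta_old)
    theta = theta_old
    for _ in range(max_shrink):
        step = theta_old - gamma * grad
        theta = np.sign(step) * np.maximum(np.abs(step) - alpha * gamma, 0)
        bound = (f_old + grad.dot(theta - theta_old) +
                 np.linalg.norm(theta - theta_old) ** 2 / (2 * gamma))
        if np.isfinite(f(theta)) and f(theta) <= bound:
            break
        gamma *= shrink  # shrink the step size and try again
    return theta, gamma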