import numpy as np
from numpy.linalg import norm
from scipy.linalg import solve
from scipy.sparse import identity, issparse
from scipy.sparse.linalg import cg
from sklearn.linear_model import Lasso

# NB: helpers such as init_dbeta0_new, init_dbeta0_new_p,
# get_beta_jac_iterdiff, compute_beta and get_only_jac are assumed to be
# defined elsewhere in the package.


def _init_dbeta0(mask, mask0, jac0):
    size_mat = mask.sum()
    if jac0 is not None:
        dbeta0_new = init_dbeta0_new(jac0, mask, mask0)
    else:
        dbeta0_new = np.zeros(size_mat)
    return dbeta0_new
def get_beta_jac_t_v_implicit(
        X_train, y_train, log_alpha, get_v, mask0=None, dense0=None,
        tol=1e-3, model="lasso", sk=False, max_iter=1000, sol_lin_sys=None,
        n=1, sigma=0, delta=0, epsilon=0):
    alpha = np.exp(log_alpha)
    n_samples, n_features = X_train.shape
    mask, dense, _ = get_beta_jac_iterdiff(
        X_train, y_train, log_alpha, mask0=mask0, dense0=dense0, tol=tol,
        max_iter=max_iter, compute_jac=False, model=model)

    mat_to_inv = model.get_hessian(X_train, y_train, mask, dense, log_alpha)
    size_mat = mat_to_inv.shape[0]

    v = get_v(mask, dense)
    if hasattr(model, 'dual'):
        v = model.get_dual_v(X_train, y_train, v, log_alpha)

    # TODO: to clean
    is_sparse = issparse(X_train)
    if not alpha.shape:
        alphas = np.ones(n_features) * alpha
    else:
        alphas = alpha.copy()

    if sol_lin_sys is not None and not hasattr(model, 'dual'):
        sol0 = init_dbeta0_new(sol_lin_sys, mask, mask0)
    else:
        sol0 = np.zeros(size_mat)

    try:
        sol = cg(
            mat_to_inv, - model.generalized_supp(X_train, v, log_alpha),
            x0=sol0, tol=tol)
        if sol[1] == 0:
            sol_lin_sys = sol[0]
        else:
            raise ValueError('cg did not converge.')
    except Exception:
        print("Matrix to invert was badly conditioned")
        # regularize the system with a small multiple of the identity
        # and solve again
        if is_sparse:
            reg_amount = 1e-7 * norm(
                model.reduce_X(X_train, mask).todense(), ord=2) ** 2
            mat_to_inv += reg_amount * identity(size_mat)
        else:
            reg_amount = 1e-7 * norm(
                model.reduce_X(X_train, mask), ord=2) ** 2
            mat_to_inv += reg_amount * np.eye(size_mat)
        sol = cg(
            mat_to_inv, - model.generalized_supp(X_train, v, log_alpha),
            x0=sol0, atol=1e-3)
        sol_lin_sys = sol[0]

    jac_t_v = model._get_jac_t_v(
        X_train, y_train, sol_lin_sys, mask, dense, alphas, v.copy(),
        n_samples)
    return mask, dense, jac_t_v, sol[0]
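
# A minimal, self-contained sketch of the implicit-differentiation step
# performed above, specialized to the plain Lasso (not part of the API;
# the name `_sketch_lasso_hypergrad` is illustrative only). On the support
# S, the first-order condition X_S^T (X_S beta_S - y) +
# n_samples * alpha * sign(beta_S) = 0 gives, after differentiating with
# respect to log(alpha),
#     (X_S^T X_S) dbeta = - n_samples * alpha * sign(beta_S),
# and the hypergradient of an outer criterion with gradient v is v @ dbeta.
def _sketch_lasso_hypergrad(X_S, dense, alpha, v):
    n_samples = X_S.shape[0]
    hess = X_S.T @ X_S  # Hessian of the inner objective on the support
    # derivative of beta restricted to the support, w.r.t. log(alpha)
    dbeta = np.linalg.solve(hess, - n_samples * alpha * np.sign(dense))
    return v @ dbeta  # hypergradient of the outer criterion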
def init_dbeta0_mcp(jac0, mask, mask0):
    size_mat = mask.sum()
    dbeta0_new = np.zeros((size_mat, 2))
    # warm start each of the 2 columns (one per hyperparameter) separately
    for j in range(2):
        dbeta0_new[:, j] = init_dbeta0_new(jac0[:, j], mask, mask0)
    return dbeta0_new
def get_beta_jac_t_v_implicit(
        X_train, y_train, log_alpha, X_val, y_val, mask0=None, dense0=None,
        jac0=None, tol=1e-3, model="lasso", sk=False, maxit=1000,
        sol_lin_sys=None, criterion="cv", n=1, sigma=0, delta=0, epsilon=0):
    alpha = np.exp(log_alpha)
    n_samples, n_features = X_train.shape

    if sk:
        # compute beta using the scikit-learn Lasso solver
        clf = Lasso(
            alpha=alpha, fit_intercept=False, warm_start=True, tol=tol,
            max_iter=10000)
        clf.fit(X_train, y_train)
        coef_ = clf.coef_
        mask = coef_ != 0
        dense = coef_[mask]
    else:
        # compute beta using the vanilla numba coordinate descent solver
        mask, dense = get_beta_jac_iterdiff(
            X_train, y_train, log_alpha, mask0=mask0, dense0=dense0,
            maxit=maxit, tol=tol, compute_jac=False, jac0=None)

    if criterion == "cv":
        v = 2 * X_val[:, mask].T @ (
            X_val[:, mask] @ dense - y_val) / X_val.shape[0]
    elif criterion == "sure":
        if n == 1:
            v = 2 * X_train[:, mask].T @ (
                X_train[:, mask] @ dense - y_train -
                2 * sigma ** 2 / epsilon * delta)
        elif n == 2:
            v = 2 * sigma ** 2 * X_train[:, mask].T @ delta / epsilon

    is_sparse = issparse(X_train)
    if not alpha.shape:
        alphas = np.ones(n_features) * alpha
    else:
        alphas = alpha.copy()

    if sol_lin_sys is not None:
        sol0 = init_dbeta0_new(sol_lin_sys, mask, mask0)
    else:
        sol0 = np.zeros(mask.sum())

    mat_to_inv = X_train[:, mask].T @ X_train[:, mask]
    size_mat = mask.sum()
    if is_sparse:
        try:
            sol = cg(
                mat_to_inv, - n_samples * v,
                x0=sol0, tol=1e-15, maxiter=int(1e5))
            if sol[1] == 0:
                jac = sol[0]
            else:
                raise ValueError('cg did not converge.')
        except Exception:
            print("Matrix to invert was badly conditioned")
            # regularize the system with a small multiple of the identity
            reg_amount = 1e-7 * norm(X_train[:, mask].todense(), ord=2) ** 2
            sol = cg(
                mat_to_inv + reg_amount * identity(size_mat),
                - n_samples * v, x0=sol0, atol=1e-3)
            jac = sol[0]
    else:
        try:
            jac = solve(
                X_train[:, mask].T @ X_train[:, mask],
                - n_samples * v, assume_a='pos')
        except Exception:
            print("Matrix to invert was badly conditioned")
            reg_amount = 1e-9 * norm(X_train[:, mask], ord=2) ** 2
            jac = solve(
                X_train[:, mask].T @ X_train[:, mask] +
                reg_amount * np.eye(size_mat),
                - n_samples * v, assume_a='pos')

    if model == "lasso":
        jac_t_v = alpha * np.sign(dense) @ jac
    elif model == "wlasso":
        jac_t_v = np.zeros(n_features)
        jac_t_v[mask] = alphas[mask] * np.sign(dense) * jac

    return mask, dense, jac_t_v, jac
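
# Hypothetical usage sketch for the function above (the name
# `_demo_get_beta_jac_t_v_implicit` is illustrative only): random data, the
# scikit-learn solver path (sk=True) and the "cv" criterion, so it relies
# only on numpy / scipy / scikit-learn.
def _demo_get_beta_jac_t_v_implicit():
    rng = np.random.RandomState(0)
    X_train = rng.randn(50, 100)
    y_train = X_train[:, :5] @ rng.randn(5) + 0.1 * rng.randn(50)
    X_val = rng.randn(30, 100)
    y_val = X_val[:, :5] @ rng.randn(5) + 0.1 * rng.randn(30)
    # a fraction of alpha_max, the smallest alpha yielding an empty support
    alpha_max = np.max(np.abs(X_train.T @ y_train)) / len(y_train)
    log_alpha = np.log(0.1 * alpha_max)
    mask, dense, jac_t_v, jac = get_beta_jac_t_v_implicit(
        X_train, y_train, log_alpha, X_val, y_val, sk=True)
    print("support size:", mask.sum(), "hypergradient:", jac_t_v)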
def get_beta_jac_fast_iterdiff(
        X, y, log_alpha, X_val, y_val, mask0=None, dense0=None, jac0=None,
        tol=1e-3, maxit=100, niter_jac=1000, tol_jac=1e-6, model="lasso",
        criterion="cv", sigma=1, epsilon=0.1, delta=None, n=1):
    n_samples, n_features = X.shape
    if model == "mcp":
        mask, dense = get_beta_jac_iterdiff(
            X, y, log_alpha, mask0=mask0, dense0=dense0, jac0=jac0, tol=tol,
            maxit=maxit, compute_jac=False, model="mcp")
    else:
        mask, dense = get_beta_jac_iterdiff(
            X, y, log_alpha, mask0=mask0, dense0=dense0, jac0=jac0, tol=tol,
            maxit=maxit, compute_jac=False, model="lasso")

    # TODO this is dirty, to improve and to jit
    size_mat = mask.sum()
    if model == "lasso":
        if jac0 is not None:
            dbeta0_new = init_dbeta0_new(jac0, mask, mask0)
        else:
            dbeta0_new = np.zeros(size_mat)
    elif model == "mcp":
        # TODO add warm start
        if jac0 is None:
            dbeta0_new = np.zeros((size_mat, 2))
        else:
            dbeta0_new = init_dbeta0_mcp(jac0, mask, mask0)
    else:
        if jac0 is None:
            dbeta0_new = np.zeros((size_mat, size_mat))
        else:
            dbeta0_new = init_dbeta0_new_p(jac0, mask, mask0)

    if criterion == "cv":
        v = 2 * X_val[:, mask].T @ (
            X_val[:, mask] @ dense - y_val) / X_val.shape[0]
    elif criterion == "sure":
        if n == 1:
            v = 2 * X[:, mask].T @ (
                X[:, mask] @ dense - y - 2 * sigma ** 2 / epsilon * delta)
        elif n == 2:
            v = 2 * sigma ** 2 * X[:, mask].T @ delta / epsilon

    # the Jacobian iterations are run on the support only
    jac = get_only_jac(
        X[:, mask], np.exp(log_alpha), np.sign(dense), v, dbeta=dbeta0_new,
        niter_jac=niter_jac, tol_jac=tol_jac, model=model, mask=mask,
        dense=dense)

    return mask, dense, jac
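
# `get_only_jac` is defined elsewhere in the package; below is a minimal,
# self-contained sketch (independent of the actual implementation, name
# `_sketch_jac_fixed_point` illustrative only) of the kind of fixed-point
# iteration it can run for the Lasso: differentiating the coordinate
# descent update on the support with respect to log(alpha). At convergence
# dbeta solves (X_S^T X_S) dbeta = - n_samples * alpha * sign(beta_S).
def _sketch_jac_fixed_point(X_S, alpha, sign_dense, niter=100, tol=1e-6):
    n_samples, size_supp = X_S.shape
    lipschitz = (X_S ** 2).sum(axis=0)  # per-coordinate Lipschitz constants
    dbeta = np.zeros(size_supp)
    for _ in range(niter):
        dbeta_old = dbeta.copy()
        for j in range(size_supp):
            # derivative of the soft-thresholded CD update w.r.t. log(alpha)
            dbeta[j] -= (X_S[:, j] @ (X_S @ dbeta) +
                         n_samples * alpha * sign_dense[j]) / lipschitz[j]
        if norm(dbeta - dbeta_old) < tol * norm(dbeta):
            break
    return dbeta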
def compute_beta_grad_implicit(
        X, y, log_alpha, get_grad_outer, mask0=None, dense0=None, tol=1e-3,
        model="lasso", max_iter=1000, sol_lin_sys=None, tol_lin_sys=1e-6,
        max_iter_lin_sys=100):
    """Compute beta and the hypergradient with implicit differentiation.

    The hypergradient computation is done in 3 steps:
    - 1 solve the inner optimization problem.
    - 2 solve a linear system on the support (ie the non-zeros coefficients)
    of the solution.
    - 3 use the solution of the linear system to compute the gradient.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)
        Design matrix.
    y: ndarray, shape (n_samples,)
        Observation vector.
    log_alpha: float or np.array, shape (n_features,)
        Logarithm of hyperparameter.
    mask0: ndarray, shape (n_features,)
        Boolean array marking the active features of the previous
        regression coefficients beta, used for warm start.
    dense0: ndarray, shape (mask.sum(),)
        Values of the previous regression coefficients beta on the support,
        used for warm start.
    tol: float
        Tolerance for the inner optimization problem.
    model: instance of ``sparse_ho.base.BaseModel``
        A model that follows the sparse_ho API.
    max_iter: int
        Maximum number of iterations for the inner solver.
    sol_lin_sys: ndarray
        Previous solution of the linear system, used for warm start.
    tol_lin_sys: float
        Tolerance for the resolution of the linear system.
    max_iter_lin_sys: int
        Maximum number of iterations for the resolution of the linear
        system.
    """
    # 1 compute the regression coefficients beta, stored in mask and dense
    alpha = np.exp(log_alpha)
    mask, dense, _ = compute_beta(
        X, y, log_alpha, mask0=mask0, dense0=dense0, tol=tol,
        max_iter=max_iter, compute_jac=False, model=model)
    n_features = X.shape[1]

    mat_to_inv = model.get_mat_vec(X, y, mask, dense, log_alpha)

    v = get_grad_outer(mask, dense)
    if hasattr(model, 'dual'):
        v = model.get_dual_v(mask, dense, X, y, v, log_alpha)

    # 2 solve the linear system
    # TODO I think this should be removed
    if not alpha.shape:
        alphas = np.ones(n_features) * alpha
    else:
        alphas = alpha.copy()

    if sol_lin_sys is not None and not hasattr(model, 'dual'):
        sol0 = init_dbeta0_new(sol_lin_sys, mask, mask0)
    else:
        sol0 = None  # TODO add warm start for SVM and SVR
    sol = cg(
        mat_to_inv, - model.generalized_supp(X, v, log_alpha),
        x0=sol0, tol=tol_lin_sys, maxiter=max_iter_lin_sys)
    sol_lin_sys = sol[0]

    # 3 compute the gradient
    grad = model._get_grad(X, y, sol_lin_sys, mask, dense, alphas, v)
    return mask, dense, grad, sol_lin_sys
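
# Hypothetical usage sketch for `compute_beta_grad_implicit` (the name
# `_demo_compute_beta_grad_implicit` is illustrative only). It assumes a
# sparse_ho-style model object exposing `get_mat_vec`, `generalized_supp`
# and `_get_grad` (e.g. an instance of ``sparse_ho.models.Lasso``; its
# exact constructor is not shown here), and uses a held-out least-squares
# outer criterion, whose gradient on the support matches the "cv" vector v
# computed in the functions above.
def _demo_compute_beta_grad_implicit(X, y, X_val, y_val, log_alpha, model):
    def get_grad_outer(mask, dense):
        # gradient of the validation MSE, restricted to the support
        return 2 * X_val[:, mask].T @ (
            X_val[:, mask] @ dense - y_val) / X_val.shape[0]

    mask, dense, grad, sol = compute_beta_grad_implicit(
        X, y, log_alpha, get_grad_outer, model=model)
    return grad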