def test_lasso_dual(): """ Check that the solution of the lasso signal approximator dual composite is soft-thresholding """ l1 = .1 sparsity = R.l1norm(10, lagrange=l1) x = np.arange(10) - 5 loss = R.quadratic.shift(-x, coef=0.5) pen = R.simple_problem(loss, sparsity) solver = R.FISTA(pen) pen.lipschitz = 1 solver.fit(backtrack=False) soln = solver.composite.coefs st = np.maximum(np.fabs(x)-l1,0) * np.sign(x) np.testing.assert_almost_equal(soln,st, decimal=3) pen = R.simple_problem(loss, sparsity) solver = R.FISTA(pen) solver.fit(monotonicity_restart=False) soln = solver.composite.coefs st = np.maximum(np.fabs(x)-l1,0) * np.sign(x) np.testing.assert_almost_equal(soln,st, decimal=3) pen = R.container(loss, sparsity) solver = R.FISTA(pen) solver.fit() soln = solver.composite.coefs np.testing.assert_almost_equal(soln,st, decimal=3)
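# Added illustration (not from the original test file): a standalone, runnable sketch of the
# identity that test_lasso_dual checks, namely that the proximal map of the l1 norm is
# coordinate-wise soft-thresholding. It assumes numpy and regreg.api are importable as below
# and reuses only calls that already appear in these tests.
import numpy as np
import regreg.api as rr

l1 = 0.1
x = np.arange(10) - 5.
atom = rr.l1norm(10, lagrange=l1)
# proximal map of l1 * ||.||_1 at x with unit quadratic term 0.5 * ||z - x||^2
prox = atom.proximal(rr.identity_quadratic(1., x, 0, 0))
soft = np.maximum(np.fabs(x) - l1, 0) * np.sign(x)   # closed-form soft-thresholding
np.testing.assert_allclose(prox, soft, atol=1.e-7)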
def test_simple_problem(self): tests = [] atom, q, prox_center, L = self.atom, self.q, self.prox_center, self.L loss = self.loss problem = rr.simple_problem(loss, atom) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12, FISTA=self.FISTA, coef_stop=self.coef_stop, min_its=100) tests.append((atom.proximal(q), solver.composite.coefs, 'solving prox with simple_problem with monotonicity\n %s' % str(self))) # write the loss in terms of a quadratic for the smooth loss and a smooth function... q = rr.identity_quadratic(L, prox_center, 0, 0) lossq = rr.quadratic.shift(prox_center.copy(), coef=0.6*L) lossq.quadratic = rr.identity_quadratic(0.4*L, prox_center.copy(), 0, 0) problem = rr.simple_problem(lossq, atom) tests.append((atom.proximal(q), problem.solve(coef_stop=self.coef_stop, FISTA=self.FISTA, tol=1.0e-12), 'solving prox with simple_problem ' + 'with monotonicity but loss has identity_quadratic %s\n ' % str(self))) problem = rr.simple_problem(loss, atom) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12, monotonicity_restart=False, coef_stop=self.coef_stop, FISTA=self.FISTA, min_its=100) tests.append((atom.proximal(q), solver.composite.coefs, 'solving prox with simple_problem no monotonicity_restart\n %s' % str(self))) d = atom.conjugate problem = rr.simple_problem(loss, d) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12, monotonicity_restart=False, coef_stop=self.coef_stop, FISTA=self.FISTA, min_its=100) tests.append((d.proximal(q), problem.solve(tol=1.e-12, FISTA=self.FISTA, coef_stop=self.coef_stop, monotonicity_restart=False), 'solving dual prox with simple_problem no monotonocity\n %s ' % str(self))) if not self.interactive: for test in tests: yield (all_close,) + test + (self,) else: for test in tests: yield all_close(*((test + (self,))))
def test_using_SLOPE_weights(): n, p = 500, 50 X = np.random.standard_normal((n, p)) #Y = np.random.standard_normal(n) X -= X.mean(0)[None, :] X /= (X.std(0)[None, :] * np.sqrt(n)) beta = np.zeros(p) beta[:5] = 5. Y = X.dot(beta) + np.random.standard_normal(n) output_R = fit_slope_R(X, Y, W = None, normalize = True, choice_weights = "bhq") r_beta = output_R[0] r_lambda_seq = output_R[2] W = r_lambda_seq pen = slope(W, lagrange=1.) loss = rr.squared_error(X, Y) problem = rr.simple_problem(loss, pen) soln = problem.solve(tol=1.e-14, min_its=500) # we get a better objective value nt.assert_true(problem.objective(soln) < problem.objective(np.asarray(r_beta))) nt.assert_true(np.linalg.norm(soln - r_beta) < 1.e-6 * np.linalg.norm(soln))
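# Added sketch (an assumption-laden illustration, not part of the test above): constructing a
# BH-type decreasing weight sequence by hand and solving the SLOPE problem with
# rr.simple_problem, mirroring how the test uses the R-chosen lambda sequence. `slope` is
# assumed to be the same penalty class imported by the test; q = 0.1 and the data are
# illustrative choices.
import numpy as np
from scipy.stats import norm
import regreg.api as rr

np.random.seed(0)
n, p, q = 500, 50, 0.1
X = np.random.standard_normal((n, p))
X -= X.mean(0)[None, :]
X /= (X.std(0)[None, :] * np.sqrt(n))
Y = X[:, :5].dot(np.full(5, 5.)) + np.random.standard_normal(n)
W = norm.ppf(1 - q * np.arange(1, p + 1) / (2. * p))   # decreasing BH-q weights
pen = slope(W, lagrange=1.)                            # penalty class used in the test above
problem = rr.simple_problem(rr.squared_error(X, Y), pen)
soln = problem.solve(tol=1.e-12, min_its=500)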
def __init__(self, loss, linear_randomization, quadratic_coef, randomization, penalty, solve_args={'tol':1.e-10, 'min_its':100, 'max_its':500}): (self.loss, self.linear_randomization, self.randomization, self.quadratic_coef) = (loss, linear_randomization, randomization, quadratic_coef) # initialize optimization problem self.penalty = penalty self.problem = rr.simple_problem(loss, penalty) random_term = rr.identity_quadratic( quadratic_coef, 0, self.linear_randomization, 0) self.initial_soln = self.problem.solve(random_term, **solve_args) self.initial_grad = self.loss.smooth_objective(self.initial_soln, mode='grad') self.opt_vars = self.penalty.setup_sampling( \ self.initial_grad, self.initial_soln, self.linear_randomization, self.quadratic_coef)
def test_changepoint_scaled(): p = 150 M = multiscale(p) M.minsize = 10 X = ra.adjoint(M) Y = np.random.standard_normal(p) Y[20:50] += 8 Y += 2 meanY = Y.mean() lammax = np.fabs(np.sqrt(M.sizes) * X.adjoint_map(Y) / (1 + np.sqrt(np.log(M.sizes)))).max() penalty = rr.weighted_l1norm((1 + np.sqrt(np.log(M.sizes))) / np.sqrt(M.sizes), lagrange=0.5*lammax) loss = rr.squared_error(X, Y - meanY) problem = rr.simple_problem(loss, penalty) soln = problem.solve() Yhat = X.linear_map(soln) Yhat += meanY if INTERACTIVE: plt.scatter(np.arange(p), Y) plt.plot(np.arange(p), Yhat) plt.show()
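# Added illustration (assumes numpy and regreg.api as below): the weighted l1 atom used in
# test_changepoint_scaled has a coordinate-wise proximal map, soft-thresholding coordinate i
# by lagrange * weights[i]; this mirrors the unweighted check sketched after test_lasso_dual.
import numpy as np
import regreg.api as rr

np.random.seed(0)
p = 10
weights = np.linspace(1., 2., p)
x = 3 * np.random.standard_normal(p)
atom = rr.weighted_l1norm(weights, lagrange=0.5)
prox = atom.proximal(rr.identity_quadratic(1., x, 0, 0))
manual = np.maximum(np.fabs(x) - 0.5 * weights, 0) * np.sign(x)
np.testing.assert_allclose(prox, manual, atol=1.e-7)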
def test_nesta_lasso(): n, p = 1000, 20 X = np.random.standard_normal((n, p)) beta = np.zeros(p) beta[:4] = 30 Y = np.random.standard_normal(n) + np.dot(X, beta) loss = rr.squared_error(X,Y) penalty = rr.l1norm(p, lagrange=2.) # using nesta z = rr.zero(p) primal, dual = rr.nesta(loss, z, penalty, tol=1.e-10, epsilon=2.**(-np.arange(30)), initial_dual=np.zeros(p)) # using simple problem problem = rr.simple_problem(loss, penalty) problem.solve() nt.assert_true(np.linalg.norm(primal - problem.coefs) / np.linalg.norm(problem.coefs) < 1.e-3) # test None as smooth_atom rr.nesta(None, z, penalty, tol=1.e-10, epsilon=2.**(-np.arange(30)), initial_dual=np.zeros(p)) # using coefficients to stop rr.nesta(loss, z, penalty, tol=1.e-10, epsilon=2.**(-np.arange(30)), initial_dual=np.zeros(p), coef_stop=True)
def test_simple(): Z = np.random.standard_normal(100) * 4 p = rr.l1norm(100, lagrange=0.13) L = 0.14 loss = rr.quadratic.shift(-Z, coef=L) problem = rr.simple_problem(loss, p) solver = rr.FISTA(problem) solver.fit(tol=1.0e-10, debug=True) simple_coef = solver.composite.coefs prox_coef = p.proximal(rr.identity_quadratic(L, Z, 0, 0)) p2 = rr.l1norm(100, lagrange=0.13) p2 = copy(p) p2.quadratic = rr.identity_quadratic(L, Z, 0, 0) problem = rr.simple_problem.nonsmooth(p2) solver = rr.FISTA(problem) solver.fit(tol=1.0e-14, debug=True) simple_nonsmooth_coef = solver.composite.coefs p = rr.l1norm(100, lagrange=0.13) p.quadratic = rr.identity_quadratic(L, Z, 0, 0) problem = rr.simple_problem.nonsmooth(p) simple_nonsmooth_gengrad = gengrad(problem, L, tol=1.0e-10) p = rr.l1norm(100, lagrange=0.13) problem = rr.separable_problem.singleton(p, loss) solver = rr.FISTA(problem) solver.fit(tol=1.0e-10) separable_coef = solver.composite.coefs loss2 = rr.quadratic.shift(-Z, coef=0.6*L) loss2.quadratic = rr.identity_quadratic(0.4*L, Z, 0, 0) p.coefs *= 0 problem2 = rr.simple_problem(loss2, p) loss2_coefs = problem2.solve(coef_stop=True) solver2 = rr.FISTA(problem2) solver2.fit(tol=1.0e-10, debug=True, coef_stop=True) yield ac, prox_coef, simple_nonsmooth_gengrad, 'prox to nonsmooth gengrad' yield ac, prox_coef, separable_coef, 'prox to separable' yield ac, prox_coef, simple_nonsmooth_coef, 'prox to simple_nonsmooth' yield ac, prox_coef, simple_coef, 'prox to simple' yield ac, prox_coef, loss2_coefs, 'simple where loss has quadratic 1' yield ac, prox_coef, solver2.composite.coefs, 'simple where loss has quadratic 2'
def test_path_group_lasso(): """ this test looks at the paths of three different parameterizations of the same problem """ n = 100 X = np.random.standard_normal((n, 10)) U = np.random.standard_normal((n, 2)) Y = np.random.standard_normal(100) betaX = np.array([3, 4, 5, 0, 0] + [0] * 5) betaU = np.array([10, -5]) Y += (np.dot(X, betaX) + np.dot(U, betaU)) * 5 Xn = rr.normalize( np.hstack([np.ones((100, 1)), X]), inplace=True, center=True, scale=True, intercept_column=0 ).normalized_array() lasso = rr.lasso.squared_error(Xn[:, 1:], Y, penalty_structure=[0] * 7 + [1] * 3, nstep=10) sol = lasso.main(inner_tol=1.0e-12, verbose=True) beta = np.array(sol["beta"].todense()) sols = [] sols_sep = [] for l in sol["lagrange"]: loss = rr.squared_error(Xn, Y, coef=1.0 / n) penalty = rr.mixed_lasso([rr.UNPENALIZED] + [0] * 7 + [1] * 3, lagrange=l) # matrix contains an intercept... problem = rr.simple_problem(loss, penalty) sols.append(problem.solve(tol=1.0e-12).copy()) sep = rr.separable( (11,), [rr.l2norm((7,), np.sqrt(7) * l), rr.l2norm((3,), np.sqrt(3) * l)], [np.arange(1, 8), np.arange(8, 11)], ) sep_problem = rr.simple_problem(loss, sep) sols_sep.append(sep_problem.solve(tol=1.0e-12).copy()) sols = np.array(sols).T sols_sep = np.array(sols_sep).T nt.assert_true(np.linalg.norm(beta - sols) / (1 + np.linalg.norm(beta)) <= 1.0e-4) nt.assert_true(np.linalg.norm(beta - sols_sep) / (1 + np.linalg.norm(beta)) <= 1.0e-4)
def solve_sqrt_lasso_skinny(X, Y, weights=None, initial=None, quadratic=None, solve_args={}): """ Solve the square-root LASSO optimization problem: $$ \text{minimize}_{\beta} \|y-X\beta\|_2 + D |\beta|, $$ where $D$ is the diagonal matrix with weights on its diagonal. Parameters ---------- y : np.float((n,)) The target, in the model $y = X\beta$ X : np.float((n, p)) The data, in the model $y = X\beta$ weights : np.float Coefficients of the L-1 penalty in optimization problem, note that different coordinates can have different coefficients. initial : np.float(p) Initial point for optimization. solve_args : dict Arguments passed to regreg solver. quadratic : `regreg.identity_quadratic` A quadratic term added to objective function. """ n, p = X.shape if weights is None: lam = choose_lambda(X) weights = lam * np.ones((p,)) weight_dict = dict(zip(np.arange(p), 2 * weights)) penalty = rr.mixed_lasso(list(range(p)) + [rr.NONNEGATIVE], lagrange=1., weights=weight_dict) loss = sqlasso_objective_skinny(X, Y) problem = rr.simple_problem(loss, penalty) problem.coefs[-1] = np.linalg.norm(Y) if initial is not None: problem.coefs[:-1] = initial soln = problem.solve(quadratic, **solve_args) _loss = sqlasso_objective(X, Y) return soln[:-1], _loss
def test_admm(n=100, p=10): X = np.random.standard_normal((n, p)) Y = np.random.standard_normal(n) loss = rr.squared_error(X, Y) D = np.identity(p) pen = rr.l1norm(p, lagrange=1.5) ADMM = admm_problem(loss, pen, ra.astransform(D), 0.5) ADMM.solve(niter=1000) coef1 = ADMM.atom_coefs problem2 = rr.simple_problem(loss, pen) coef2 = problem2.solve(tol=1.e-12, min_its=500) np.testing.assert_allclose(coef1, coef2, rtol=1.e-3, atol=1.e-4)
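# Added check (illustrative, not part of test_admm): any solution that test_admm's two solvers
# agree on should satisfy the lasso KKT conditions, which gives an independent way to verify a
# simple_problem solve. Assumes numpy and regreg.api; tolerances reflect solver accuracy.
import numpy as np
import regreg.api as rr

np.random.seed(0)
n, p = 100, 10
X = np.random.standard_normal((n, p))
Y = np.random.standard_normal(n)
loss = rr.squared_error(X, Y)
pen = rr.l1norm(p, lagrange=1.5)
soln = rr.simple_problem(loss, pen).solve(tol=1.e-12, min_its=200)
grad = loss.smooth_objective(soln, 'grad')
active = soln != 0
# stationarity on the active set and dual feasibility on the inactive set
assert np.allclose(grad[active], -1.5 * np.sign(soln[active]), atol=1.e-3)
assert np.all(np.fabs(grad[~active]) <= 1.5 + 1.e-3)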
def test_class(): """ runs several class methods on generic instance """ n, p = 100, 20 X = np.random.standard_normal((n, p)) Y = np.random.standard_normal(n) loss = rr.squared_error(X, Y) pen = rr.l1norm(p, lagrange=1.0) problem = rr.simple_problem(loss, pen) problem.latexify() for debug, coef_stop, max_its in product([True, False], [True, False], [5, 100]): rr.gengrad(problem, rr.power_L(X) ** 2, max_its=max_its, debug=debug, coef_stop=coef_stop)
def solve_sqrt_lasso_fat(X, Y, weights=None, initial=None, quadratic=None, solve_args={}): """ Solve the square-root LASSO optimization problem: $$ \text{minimize}_{\beta} \|y-X\beta\|_2 + D |\beta|, $$ where $D$ is the diagonal matrix with weights on its diagonal. Parameters ---------- y : np.float((n,)) The target, in the model $y = X\beta$ X : np.float((n, p)) The data, in the model $y = X\beta$ weights : np.float Coefficients of the L-1 penalty in optimization problem, note that different coordinates can have different coefficients. initial : np.float(p) Initial point for optimization. solve_args : dict Arguments passed to regreg solver. quadratic : `regreg.identity_quadratic` A quadratic term added to objective function. """ X = rr.astransform(X) n, p = X.output_shape[0], X.input_shape[0] if weights is None: lam = choose_lambda(X) weights = lam * np.ones((p,)) loss = sqlasso_objective(X, Y) penalty = rr.weighted_l1norm(weights, lagrange=1.) problem = rr.simple_problem(loss, penalty) if initial is not None: problem.coefs[:] = initial soln = problem.solve(quadratic, **solve_args) return soln, loss
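# Added usage sketch for solve_sqrt_lasso_fat above (assumes numpy as np and that the function
# and its sqlasso_objective helper are importable from this module); explicit weights are
# passed so the choose_lambda default is not exercised.
import numpy as np

np.random.seed(0)
n, p = 100, 20
X = np.random.standard_normal((n, p))
Y = X[:, :3].dot(np.ones(3)) + np.random.standard_normal(n)
weights = 0.7 * np.ones(p)
soln, loss = solve_sqrt_lasso_fat(X, Y, weights=weights,
                                  solve_args={'min_its': 200, 'tol': 1.e-10})
print(np.nonzero(soln)[0])   # coordinates estimated as nonzero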
def test_lasso_dual_with_monotonicity(): """ restarting is funny for this simple problem """ l1 = .1 sparsity = R.l1norm(10, lagrange=l1) x = np.arange(10) - 5 loss = R.quadratic.shift(-x, coef=0.5) pen = R.simple_problem(loss, sparsity) solver = R.FISTA(pen) solver.fit() soln = solver.composite.coefs st = np.maximum(np.fabs(x)-l1,0) * np.sign(x) np.testing.assert_almost_equal(soln,st, decimal=3)
def test_equivalence_sqrtlasso(n=200, p=400, s=10, sigma=3.): """ Check equivalent LASSO and sqrtLASSO solutions. """ Y = np.random.standard_normal(n) * sigma beta = np.zeros(p) beta[:s] = 8 * (2 * np.random.binomial(1, 0.5, size=(s,)) - 1) X = np.random.standard_normal((n,p)) + 0.3 * np.random.standard_normal(n)[:,None] X /= (X.std(0)[None,:] * np.sqrt(n)) Y += np.dot(X, beta) * sigma lam_theor = choose_lambda(X, quantile=0.9) weights = lam_theor*np.ones(p) weights[:3] = 0. soln1, loss1 = solve_sqrt_lasso(X, Y, weights=weights, quadratic=None, solve_args={'min_its':500, 'tol':1.e-10}) G1 = loss1.smooth_objective(soln1, 'grad') # find active set, and estimate of sigma active = (soln1 != 0) nactive = active.sum() subgrad = np.sign(soln1[active]) * weights[active] X_E = X[:,active] X_Ei = np.linalg.pinv(X_E) sigma_E= np.linalg.norm(Y - X_E.dot(X_Ei.dot(Y))) / np.sqrt(n - nactive) multiplier = sigma_E * np.sqrt((n - nactive) / (1 - np.linalg.norm(X_Ei.T.dot(subgrad))**2)) # XXX how should quadratic be changed? # multiply everything by sigma_E? loss2 = rr.glm.gaussian(X, Y) penalty = rr.weighted_l1norm(weights, lagrange=multiplier) problem = rr.simple_problem(loss2, penalty) soln2 = problem.solve(tol=1.e-12, min_its=200) G2 = loss2.smooth_objective(soln2, 'grad') / multiplier np.testing.assert_allclose(G1[3:], G2[3:]) np.testing.assert_allclose(soln1, soln2)
def test_choose_parameter(delta=2, p=60): signal = np.zeros(p) signal[(p//2):] += delta Z = np.random.standard_normal(p) + signal p = Z.shape[0] M = multiscale(p) M.scaling = np.sqrt(M.sizes) lam = choose_tuning_parameter(M) weights = (lam + np.sqrt(2 * np.log(p / M.sizes))) / np.sqrt(p) Z0 = Z - Z.mean() loss = rr.squared_error(ra.adjoint(M), Z0) penalty = rr.weighted_l1norm(weights, lagrange=1.) problem = rr.simple_problem(loss, penalty) coef = problem.solve() active = coef != 0 if active.sum(): X = M.form_matrix(M.slices[active])[0]
def fit(self, **solve_args): """ Fit the lasso using `regreg`. This sets the attributes `soln`, `onestep` and forms the constraints necessary for post-selection inference by calling `form_constraints()`. Parameters ---------- solve_args : keyword args Passed to `regreg.problems.simple_problem.solve`. Returns ------- soln : np.float Solution to lasso. """ penalty = weighted_l1norm(self.feature_weights, lagrange=1.) problem = simple_problem(self.loglike, penalty) _soln = problem.solve(**solve_args) self._soln = _soln if not np.all(_soln == 0): self.active = np.nonzero(_soln != 0)[0] self.active_signs = np.sign(_soln[self.active]) self._active_soln = _soln[self.active] H = self.loglike.hessian(self._soln)[self.active][:,self.active] Hinv = np.linalg.inv(H) G = self.loglike.gradient(self._soln)[self.active] delta = Hinv.dot(G) self._onestep = self._active_soln - delta self.active_penalized = self.feature_weights[self.active] != 0 self._constraints = constraints(-np.diag(self.active_signs)[self.active_penalized], (self.active_signs * delta)[self.active_penalized], covariance=Hinv) else: self.active = [] return self._soln
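# Added illustration of the "one-step" estimator formed in fit() above (an added sketch, not
# library code): starting from the penalized solution, take one Newton step of the unpenalized
# log-likelihood restricted to the active set. For the Gaussian likelihood this lands exactly
# on the least-squares refit over the active variables. Assumes numpy and regreg.api.
import numpy as np
import regreg.api as rr

np.random.seed(0)
n, p = 100, 20
X = np.random.standard_normal((n, p))
Y = X[:, :3].dot(np.array([3., -3., 3.])) + np.random.standard_normal(n)
loglike = rr.glm.gaussian(X, Y)
penalty = rr.weighted_l1norm(3. * np.ones(p), lagrange=1.)
soln = rr.simple_problem(loglike, penalty).solve(tol=1.e-12, min_its=100)
active = np.nonzero(soln != 0)[0]
H = loglike.hessian(soln)[active][:, active]
G = loglike.smooth_objective(soln, 'grad')[active]
onestep = soln[active] - np.linalg.inv(H).dot(G)
refit = np.linalg.pinv(X[:, active]).dot(Y)
np.testing.assert_allclose(onestep, refit, atol=1.e-6)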
def _solve_randomized_problem(self, perturb=None, solve_args={'tol': 1.e-12, 'min_its': 50}): # take a new perturbation if supplied if perturb is not None: self._initial_omega = perturb if self._initial_omega is None: self._initial_omega = self.randomizer.sample() quad = rr.identity_quadratic(self.ridge_term, 0, -self._initial_omega, 0) problem = rr.simple_problem(self.loglike, self.penalty) initial_soln = problem.solve(quad, **solve_args) initial_subgrad = -(self.loglike.smooth_objective(initial_soln, 'grad') + quad.objective(initial_soln, 'grad')) return initial_soln, initial_subgrad
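# Added standalone restatement (illustrative) of the randomized-lasso pattern implemented in
# _solve_randomized_problem above: the random perturbation omega and the ridge term enter only
# through an identity_quadratic supplied at solve time, and the penalty's subgradient at the
# solution is recovered from stationarity. Assumes numpy and regreg.api; omega here is drawn
# directly rather than through a randomizer object.
import numpy as np
import regreg.api as rr

np.random.seed(0)
n, p = 100, 20
X = np.random.standard_normal((n, p))
Y = np.random.standard_normal(n)
loglike = rr.glm.gaussian(X, Y)
penalty = rr.weighted_l1norm(np.ones(p), lagrange=1.)
ridge_term = 0.5
omega = np.random.standard_normal(p)

quad = rr.identity_quadratic(ridge_term, 0, -omega, 0)
problem = rr.simple_problem(loglike, penalty)
initial_soln = problem.solve(quad, tol=1.e-12, min_its=50)
initial_subgrad = -(loglike.smooth_objective(initial_soln, 'grad')
                    + quad.objective(initial_soln, 'grad'))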
def test_changepoint(): p = 150 M = multiscale(p) M.minsize = 10 X = ra.adjoint(M) Y = np.random.standard_normal(p) Y[20:50] += 8 Y += 2 meanY = Y.mean() lammax = np.fabs(X.adjoint_map(Y)).max() penalty = rr.l1norm(X.input_shape, lagrange=0.5*lammax) loss = rr.squared_error(X, Y - meanY) problem = rr.simple_problem(loss, penalty) soln = problem.solve() Yhat = X.linear_map(soln) Yhat += meanY plt.scatter(np.arange(p), Y) plt.plot(np.arange(p), Yhat)
def fit(self, solve_args={'min_its': 30, 'tol': 1.e-8, 'max_its': 300}): """ Fit the lasso using `regreg`. This sets the attribute `soln` and forms the constraints necessary for post-selection inference by calling `form_constraints()`. Parameters ---------- solve_args : {} Passed to `regreg.simple_problem.solve` Returns ------- soln : np.float Solution to lasso with `sklearn_alpha=self.lagrange`. """ n, p = self.X.shape loss = self.form_loss(np.arange(p)) penalty = self.form_penalty() problem = simple_problem(loss, penalty) soln = problem.solve(**solve_args) self._soln = soln if not np.all(soln == 0): self.active = np.nonzero(soln)[0] self.inactive = np.array( sorted(set(range(p)).difference(self.active))) loss_E = self.form_loss(self.active) self._beta_unpenalized = loss_E.solve(**solve_args) self.form_constraints() else: self.active = []
def test_simple(): Z = np.random.standard_normal((10, 10)) * 4 p = rr.l1_l2((10, 10), lagrange=0.13) dual = p.conjugate L = 0.23 loss = rr.quadratic.shift(-Z, coef=L) problem = rr.simple_problem(loss, p) solver = rr.FISTA(problem) solver.fit(tol=1.0e-10, debug=True) simple_coef = solver.composite.coefs q = rr.identity_quadratic(L, Z, 0, 0) prox_coef = p.proximal(q) p2 = copy(p) p2.quadratic = rr.identity_quadratic(L, Z, 0, 0) problem = rr.simple_problem.nonsmooth(p2) solver = rr.FISTA(problem) solver.fit(tol=1.0e-14, debug=True) simple_nonsmooth_coef = solver.composite.coefs p = rr.l1_l2((10, 10), lagrange=0.13) p.quadratic = rr.identity_quadratic(L, Z, 0, 0) problem = rr.simple_problem.nonsmooth(p) simple_nonsmooth_gengrad = gengrad(problem, L, tol=1.0e-10) p = rr.l1_l2((10, 10), lagrange=0.13) problem = rr.separable_problem.singleton(p, loss) solver = rr.FISTA(problem) solver.fit(tol=1.0e-10) separable_coef = solver.composite.coefs ac(prox_coef, Z - simple_coef, 'prox to simple') ac(prox_coef, simple_nonsmooth_gengrad, 'prox to nonsmooth gengrad') ac(prox_coef, separable_coef, 'prox to separable') ac(prox_coef, simple_nonsmooth_coef, 'prox to simple_nonsmooth')
def test_gengrad_blocknorms(): Z = np.random.standard_normal((10, 10)) * 4 p = rr.l1_l2((10, 10), lagrange=0.13) dual = p.conjugate L = 0.23 loss = rr.quadratic_loss.shift(Z, coef=L) problem = rr.simple_problem(loss, p) solver = rr.FISTA(problem) solver.fit(tol=1.0e-10, debug=True) simple_coef = solver.composite.coefs q = rr.identity_quadratic(L, Z, 0, 0) prox_coef = p.proximal(q) p2 = copy(p) p2.quadratic = rr.identity_quadratic(L, Z, 0, 0) problem = rr.simple_problem.nonsmooth(p2) solver = rr.FISTA(problem) solver.fit(tol=1.0e-14, debug=True) simple_nonsmooth_coef = solver.composite.coefs p = rr.l1_l2((10, 10), lagrange=0.13) p.quadratic = rr.identity_quadratic(L, Z, 0, 0) problem = rr.simple_problem.nonsmooth(p) simple_nonsmooth_gengrad = rr.gengrad(problem, L, tol=1.0e-10) p = rr.l1_l2((10, 10), lagrange=0.13) problem = rr.separable_problem.singleton(p, loss) solver = rr.FISTA(problem) solver.fit(tol=1.0e-10) separable_coef = solver.composite.coefs yield (all_close, prox_coef, simple_coef, "prox to simple", None) yield (all_close, prox_coef, simple_nonsmooth_gengrad, "prox to nonsmooth gengrad", None) yield (all_close, prox_coef, separable_coef, "prox to separable", None) yield (all_close, prox_coef, simple_nonsmooth_coef, "prox to simple_nonsmooth", None)
def fit(self, X, y): """ Fit a regularized regression estimator. Parameters ---------- X : np.ndarray((n, p)) Feature matrix. y : np.ndarray(n) Response vector. Returns ------- self """ self._loglike = loglike = self._loglike_factory(X, y) # with unpenalized parameters possible, # this may be best found by solving a problem with an atom with lagrange=np.inf # this could get expensive though null_grad = loglike.smooth_objective(np.zeros(loglike.shape), 'grad') atom_ = self._construct_atom(null_grad) if self.unpenalized: null_grad = self._fit_null_soln(loglike, atom_) atom_ = self._construct_atom(null_grad) problem = simple_problem(loglike, atom_) if self.initial is not None: problem.coefs[:] = self.initial self._coefs = problem.solve(**self.solve_args) return self
def test_simple(): Z = np.random.standard_normal((10,10)) * 4 p = rr.l1_l2((10,10), lagrange=0.13) dual = p.conjugate L = 0.23 loss = rr.quadratic.shift(-Z, coef=L) problem = rr.simple_problem(loss, p) solver = rr.FISTA(problem) solver.fit(tol=1.0e-10, debug=True) simple_coef = solver.composite.coefs q = rr.identity_quadratic(L, Z, 0, 0) prox_coef = p.proximal(q) p2 = copy(p) p2.quadratic = rr.identity_quadratic(L, Z, 0, 0) problem = rr.simple_problem.nonsmooth(p2) solver = rr.FISTA(problem) solver.fit(tol=1.0e-14, debug=True) simple_nonsmooth_coef = solver.composite.coefs p = rr.l1_l2((10,10), lagrange=0.13) p.quadratic = rr.identity_quadratic(L, Z, 0, 0) problem = rr.simple_problem.nonsmooth(p) simple_nonsmooth_gengrad = gengrad(problem, L, tol=1.0e-10) p = rr.l1_l2((10,10), lagrange=0.13) problem = rr.separable_problem.singleton(p, loss) solver = rr.FISTA(problem) solver.fit(tol=1.0e-10) separable_coef = solver.composite.coefs ac(prox_coef, Z-simple_coef, 'prox to simple') ac(prox_coef, simple_nonsmooth_gengrad, 'prox to nonsmooth gengrad') ac(prox_coef, separable_coef, 'prox to separable') ac(prox_coef, simple_nonsmooth_coef, 'prox to simple_nonsmooth')
def fit(self, tol=1.e-12, min_its=50, **solve_args): lasso.fit(self, tol=tol, min_its=min_its, **solve_args) n1 = self.loglike.get_data()[0].shape[0] n = self.loglike_full.get_data()[0].shape[0] _feature_weights = self.feature_weights.copy() _feature_weights[self.active] = 0. _feature_weights[self.inactive] = np.inf _unpenalized_problem = simple_problem(self.loglike_full, weighted_l1norm(_feature_weights, lagrange=1.)) _unpenalized = _unpenalized_problem.solve(**solve_args) _unpenalized_active = _unpenalized[self.active] s = len(self.active) H = self.loglike_full.hessian(_unpenalized) H_AA = H[self.active][:,self.active] _cov_block = np.linalg.inv(H_AA) _subsample_block = (n * 1. / n1) * _cov_block _carve_cov = np.zeros((2*s,2*s)) _carve_cov[:s][:,:s] = _cov_block _carve_cov[s:][:,:s] = _subsample_block _carve_cov[:s][:,s:] = _subsample_block _carve_cov[s:][:,s:] = _subsample_block _carve_linear_part = self._constraints.linear_part.dot(np.identity(2*s)[s:]) _carve_offset = self._constraints.offset self._carve_constraints = constraints(_carve_linear_part, _carve_offset, covariance=_carve_cov) self._carve_feasible = np.hstack([_unpenalized_active, self.onestep_estimator]) self._unpenalized_active = _unpenalized_active self._carve_invcov = H_AA
def fit(self, tol=1.e-12, min_its=50, use_full=True, **solve_args): lasso.fit(self, tol=tol, min_its=min_its, **solve_args) _feature_weights = self.feature_weights.copy() _feature_weights[self.active] = 0. _feature_weights[self.inactive] = np.inf _unpenalized_problem = simple_problem(self.loglike_inference, weighted_l1norm(_feature_weights, lagrange=1.)) _unpenalized = _unpenalized_problem.solve(**solve_args) self._unpenalized_active = _unpenalized[self.active] if use_full: H = self.loglike_full.hessian(_unpenalized) n_inference = self.loglike_inference.data[0].shape[0] n_full = self.loglike_full.data[0].shape[0] H *= (1. * n_inference / n_full) else: H = self.loglike_inference.hessian(_unpenalized) H_AA = H[self.active][:,self.active] self._cov_inference = np.linalg.inv(H_AA)
def test_changepoint(): p = 150 M = multiscale(p) M.minsize = 10 X = ra.adjoint(M) Y = np.random.standard_normal(p) Y[20:50] += 8 Y += 2 meanY = Y.mean() lammax = np.fabs(X.adjoint_map(Y)).max() penalty = rr.l1norm(X.input_shape, lagrange=0.5*lammax) loss = rr.squared_error(X, Y - meanY) problem = rr.simple_problem(loss, penalty) soln = problem.solve() Yhat = X.linear_map(soln) Yhat += meanY plt.scatter(np.arange(p), Y) plt.plot(np.arange(p), Yhat) plt.show()
def fit(self, solve_args={'min_its':30, 'tol':1.e-8, 'max_its':300}): """ Fit the lasso using `regreg`. This sets the attribute `soln` and forms the constraints necessary for post-selection inference by calling `form_constraints()`. Parameters ---------- solve_args : {} Passed to `regreg.simple_problem.solve` Returns ------- soln : np.float Solution to lasso with `sklearn_alpha=self.lagrange`. """ n, p = self.X.shape loss = self.form_loss(np.arange(p)) penalty = self.form_penalty() problem = simple_problem(loss, penalty) soln = problem.solve(**solve_args) self._soln = soln if not np.all(soln == 0): self.active = np.nonzero(soln)[0] self.inactive = np.array(sorted(set(range(p)).difference(self.active))) loss_E = self.form_loss(self.active) self._beta_unpenalized = loss_E.solve(**solve_args) self.form_constraints() else: self.active = []
def test_sqrt_highdim_lasso(n=500, p=200, signal_fac=1.5, s=5, sigma=3, full=True, rho=0.4, randomizer_scale=1., ndraw=5000, burnin=1000, ridge_term=None, compare_to_lasso=True): """ Compare to R randomized lasso """ inst, const = gaussian_instance, lasso.sqrt_lasso signal = np.sqrt(signal_fac * 2 * np.log(p)) X, Y, beta = inst(n=n, p=p, signal=signal, s=s, equicorrelated=False, rho=rho, sigma=sigma, random_signs=True)[:3] if ridge_term is None: mean_diag = np.mean((X**2).sum(0)) ridge_term = (np.sqrt(mean_diag) / np.sqrt(n)) * np.sqrt(n / (n - 1.)) W = np.ones(X.shape[1]) * choose_lambda(X) * 0.7 perturb = np.random.standard_normal(p) * randomizer_scale / np.sqrt(n) conv = const(X, Y, W, randomizer_scale=randomizer_scale / np.sqrt(n), perturb=perturb, ridge_term=ridge_term) signs = conv.fit() nonzero = signs != 0 # sanity check if compare_to_lasso: q_term = rr.identity_quadratic(ridge_term, 0, -perturb, 0) soln2, sqrt_loss = solve_sqrt_lasso(X, Y, W, solve_args={'min_its':1000}, quadratic=q_term, force_fat=True) soln = conv.initial_soln denom = np.linalg.norm(Y - X.dot(soln)) new_weights = W * denom loss = rr.glm.gaussian(X, Y) pen = rr.weighted_l1norm(new_weights, lagrange=1.) prob = rr.simple_problem(loss, pen) rescaledQ = rr.identity_quadratic(ridge_term * denom, 0, -perturb * denom, 0) soln3 = prob.solve(quadratic=rescaledQ, min_its=1000, tol=1.e-12) np.testing.assert_allclose(conv._initial_omega, perturb * denom) np.testing.assert_allclose(soln, soln2) np.testing.assert_allclose(soln, soln3) if full: (observed_target, cov_target, cov_target_score, alternatives) = full_targets(conv.loglike, conv._W, nonzero) else: (observed_target, cov_target, cov_target_score, alternatives) = selected_targets(conv.loglike, conv._W, nonzero) _, pval, intervals = conv.summary(observed_target, cov_target, cov_target_score, alternatives, ndraw=ndraw, burnin=burnin, compute_intervals=False) return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0]
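# Added explanation of the "sanity check" block above (a reading aid, not part of the original
# test): writing r = ||Y - X b||_2 at the sqrt-lasso solution b, the gradient of the first term
# ||Y - X beta||_2 at b is -X'(Y - X b) / r, so b also satisfies the stationarity conditions of
# the ordinary lasso 0.5 * ||Y - X beta||^2 + sum_j (r * W_j) |beta_j| -- provided the ridge
# term and linear randomization are rescaled by r as well. This is exactly why the test sets
# new_weights = W * denom and rescaledQ before asserting that soln, soln2 and soln3 agree.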
def test_lasso(s=5, n=200, p=20): X, y, _, nonzero, sigma = instance(n=n, p=p, random_signs=True, s=s, sigma=1., rho=0, snr=10) print('sigma', sigma) lam_frac = 1. randomization = laplace(loc=0, scale=1.) loss = randomized.gaussian_Xfixed(X, y) random_Z = randomization.rvs(p) epsilon = 1. lam = sigma * lam_frac * np.mean( np.fabs(np.dot(X.T, np.random.standard_normal((n, 10000)))).max(0)) random_Z = randomization.rvs(p) penalty = randomized.selective_l1norm_lan(p, lagrange=lam) #sampler1 = randomized.selective_sampler_MH_lan(loss, # random_Z, # epsilon, # randomization, # penalty) #loss_args = {'mean': np.zeros(n), # 'sigma': sigma, # 'linear_part':np.identity(y.shape[0]), # 'value': 0} #sampler1.setup_sampling(y, loss_args=loss_args) # data, opt_vars = sampler1.state # initial solution # rr.smooth_atom instead of loss? problem = rr.simple_problem(loss, penalty) random_term = rr.identity_quadratic(epsilon, 0, -random_Z, 0) solve_args = {'tol': 1.e-10, 'min_its': 100, 'max_its': 500} initial_soln = problem.solve(random_term, **solve_args) active = (initial_soln != 0) inactive = ~active initial_grad = -np.dot(X.T, y - np.dot(X, initial_soln)) betaE = initial_soln[active] signs = np.sign(betaE) subgradient = random_Z - initial_grad - epsilon * initial_soln cube = np.divide(subgradient[inactive], lam) #print betaE, cube #initial_grad = loss.smooth_objective(initial_soln, mode='grad') #print penalty.setup_sampling(initial_grad, # initial_soln, # random_Z, # epsilon) data0 = y.copy() #active = penalty.active_set if (np.sum(active) == 0): print('here') return [-1], [-1] nalpha = n nactive = betaE.shape[0] ninactive = cube.shape[0] alpha = np.ones(n) beta_bar = np.linalg.lstsq(X[:, active], y)[0] obs_residuals = y - np.dot(X[:, active], beta_bar) #obs_residuals -= np.mean(obs_residuals) #betaE, cube = opt_vars init_vec_state = np.zeros(n + nactive + ninactive) init_vec_state[:n] = alpha init_vec_state[n:(n + nactive)] = betaE init_vec_state[(n + nactive):] = cube def full_projection(vec_state, signs=signs, nalpha=nalpha, nactive=nactive, ninactive=ninactive): alpha = vec_state[:nalpha].copy() betaE = vec_state[nalpha:(nalpha + nactive)] cube = vec_state[(nalpha + nactive):] #signs = penalty.signs projected_alpha = alpha.copy() projected_betaE = betaE.copy() projected_cube = np.zeros_like(cube) projected_alpha = np.clip(alpha, 0, np.inf) for i in range(nactive): if (projected_betaE[i] * signs[i] < 0): projected_betaE[i] = 0 projected_cube = np.clip(cube, -1, 1) return np.concatenate( (projected_alpha, projected_betaE, projected_cube), 0) null, alt = pval(init_vec_state, full_projection, X, y, obs_residuals, signs, lam, epsilon, nonzero, active) return null, alt
def test_lasso(s=0, n=100, p=20, weights = "neutral", randomization_dist = "logistic", randomization_scale = 1, Langevin_steps = 10000, burning = 2000, X_scaled = True, covariance_estimate = "nonparametric", noise = "uniform"): """ weights: exponential, gamma, normal, gumbel randomization_dist: logistic, laplace """ step_size = 1./p X, y, true_beta, nonzero, sigma = instance(n=n, p=p, random_signs=True, s=s, sigma=1.,rho=0, scale=X_scaled, noise=noise) print('true beta', true_beta) lam_frac = 1. if randomization_dist == "laplace": randomization = laplace(loc=0, scale=1.) random_Z = randomization.rvs(p) if randomization_dist == "logistic": random_Z = np.random.logistic(loc=0, scale = 1, size = p) if randomization_dist== "normal": random_Z = np.random.standard_normal(p) print('randomization', random_Z*randomization_scale) loss = lasso_randomX.lasso_randomX(X, y) epsilon = 1./np.sqrt(n) #epsilon = 1. lam = sigma * lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 10000)))+randomization_scale*np.random.logistic(size=(p,10000))).max(0)) lam_scaled = lam.copy() random_Z_scaled = random_Z.copy() epsilon_scaled = epsilon if (X_scaled == False): random_Z_scaled *= np.sqrt(n) lam_scaled *= np.sqrt(n) epsilon_scaled *= np.sqrt(n) penalty = randomized.selective_l1norm_lan(p, lagrange=lam_scaled) # initial solution problem = rr.simple_problem(loss, penalty) random_term = rr.identity_quadratic(epsilon_scaled, 0, -randomization_scale*random_Z_scaled, 0) solve_args = {'tol': 1.e-10, 'min_its': 100, 'max_its': 500} initial_soln = problem.solve(random_term, **solve_args) print('initial solution', initial_soln) active = (initial_soln != 0) if np.sum(active)==0: return [-1], [-1] inactive = ~active betaE = initial_soln[active] signs = np.sign(betaE) initial_grad = -np.dot(X.T, y - np.dot(X, initial_soln)) if (X_scaled==False): initial_grad /= np.sqrt(n) print('initial_gradient', initial_grad) subgradient = random_Z - initial_grad - epsilon * initial_soln cube = np.divide(subgradient[inactive], lam) nactive = betaE.shape[0] ninactive = cube.shape[0] beta_unpenalized = np.linalg.lstsq(X[:, active], y)[0] print('beta_OLS onto E', beta_unpenalized) obs_residuals = y - np.dot(X[:, active], beta_unpenalized) # y-X_E\bar{\beta}^E N = np.dot(X[:, inactive].T, obs_residuals) # X_{-E}^T(y-X_E\bar{\beta}_E), null statistic full_null = np.zeros(p) full_null[nactive:] = N # parametric covariance estimate if covariance_estimate == "parametric": XE_pinv = np.linalg.pinv(X[:, active]) mat = np.zeros((nactive+ninactive, n)) mat[:nactive,:] = XE_pinv mat[nactive:,:] = X[:, inactive].T.dot(np.identity(n)-X[:, active].dot(XE_pinv)) Sigma_full = mat.dot(mat.T) else: Sigma_full = bootstrap_covariance(X,y,active, beta_unpenalized) init_vec_state = np.zeros(n+nactive+ninactive) if weights =="exponential": init_vec_state[:n] = np.ones(n) else: init_vec_state[:n] = np.zeros(n) #init_vec_state[:n] = np.random.standard_normal(n) #init_vec_state[:n] = np.ones(n) init_vec_state[n:(n+nactive)] = betaE init_vec_state[(n+nactive):] = cube def full_projection(vec_state, signs = signs, nactive=nactive, ninactive = ninactive): alpha = vec_state[:n].copy() betaE = vec_state[n:(n+nactive)].copy() cube = vec_state[(n+nactive):].copy() projected_alpha = alpha.copy() projected_betaE = betaE.copy() projected_cube = np.zeros_like(cube) if weights == "exponential": projected_alpha = np.clip(alpha, 0, np.inf) if weights == "gamma": projected_alpha = np.clip(alpha, -2+1./n, np.inf) for i in range(nactive): if (projected_betaE[i] * signs[i] < 0): projected_betaE[i] = 0 projected_cube = np.clip(cube, -1, 1) return np.concatenate((projected_alpha, projected_betaE, projected_cube), 0) Sigma = np.linalg.inv(np.dot(X[:, active].T, X[:, active])) null, alt = pval(init_vec_state, full_projection, X, obs_residuals, beta_unpenalized, full_null, signs, lam, epsilon, nonzero, active, Sigma, weights, randomization_dist, randomization_scale, Langevin_steps, step_size, burning, X_scaled) # Sigma_full[:nactive, :nactive]) return null, alt
def highdim_model_inference(X, y, truth, selection_algorithm, sampler, lam_min, dispersion, success_params=(1, 1), fit_probability=keras_fit, fit_args={ 'epochs': 10, 'sizes': [100] * 5, 'dropout': 0., 'activation': 'relu' }, alpha=0.1, B=2000, naive=True, learner_klass=mixture_learner, how_many=None): n, p = X.shape XTX = X.T.dot(X) instance_hash = hashlib.md5() instance_hash.update(X.tobytes()) instance_hash.update(y.tobytes()) instance_hash.update(truth.tobytes()) instance_id = instance_hash.hexdigest() # run selection algorithm observed_set = repeat_selection(selection_algorithm, sampler, *success_params) observed_list = sorted(observed_set) # observed debiased LASSO estimate loss = rr.squared_error(X, y) pen = rr.l1norm(p, lagrange=lam_min) problem = rr.simple_problem(loss, pen) soln = problem.solve() grad = X.T.dot(X.dot(soln) - y) # gradient at beta_hat M = pseudoinverse_debiasing_matrix(X, observed_list) observed_target = soln[observed_list] - M.dot(grad) tmp = X.dot(M.T) target_cov = tmp.T.dot(tmp) * dispersion cross_cov = np.identity(p)[:, observed_list] * dispersion if len(observed_list) > 0: if how_many is None: how_many = len(observed_list) observed_list = observed_list[:how_many] # find the target, based on the observed outcome (pivots, covered, lengths, pvalues, lower, upper) = [], [], [], [], [], [] targets = [] true_target = truth[observed_list] results = infer_set_target(selection_algorithm, observed_set, observed_list, sampler, observed_target, target_cov, cross_cov, hypothesis=true_target, fit_probability=fit_probability, fit_args=fit_args, success_params=success_params, alpha=alpha, B=B, learner_klass=learner_klass) for i, result in enumerate(results): (pivot, interval, pvalue, _) = result pvalues.append(pvalue) pivots.append(pivot) covered.append((interval[0] < true_target[i]) * (interval[1] > true_target[i])) lengths.append(interval[1] - interval[0]) lower.append(interval[0]) upper.append(interval[1]) if len(pvalues) > 0: df = pd.DataFrame({ 'pivot': pivots, 'pvalue': pvalues, 'coverage': covered, 'length': lengths, 'upper': upper, 'lower': lower, 'id': [instance_id] * len(pvalues), 'target': true_target, 'variable': observed_list, 'B': [B] * len(pvalues) }) if naive: (naive_pvalues, naive_pivots, naive_covered, naive_lengths, naive_upper, naive_lower) = [], [], [], [], [], [] for j, idx in enumerate(observed_list): true_target = truth[idx] target_sd = np.sqrt(target_cov[j, j]) observed_target_j = observed_target[j] quantile = normal_dbn.ppf(1 - 0.5 * alpha) naive_interval = (observed_target_j - quantile * target_sd, observed_target_j + quantile * target_sd) naive_upper.append(naive_interval[1]) naive_lower.append(naive_interval[0]) naive_pivot = (1 - normal_dbn.cdf( (observed_target_j - true_target) / target_sd)) naive_pivot = 2 * min(naive_pivot, 1 - naive_pivot) naive_pivots.append(naive_pivot) naive_pvalue = ( 1 - normal_dbn.cdf(observed_target_j / target_sd)) naive_pvalue = 2 * min(naive_pvalue, 1 - naive_pvalue) naive_pvalues.append(naive_pvalue) naive_covered.append((naive_interval[0] < true_target) * (naive_interval[1] > true_target)) naive_lengths.append(naive_interval[1] - naive_interval[0]) naive_df = pd.DataFrame({ 'naive_pivot': naive_pivots, 'naive_pvalue': naive_pvalues, 'naive_coverage': naive_covered, 'naive_length': naive_lengths, 'naive_upper': naive_upper, 'naive_lower': naive_lower, 'variable': observed_list, }) df = pd.merge(df, naive_df, on='variable') return df
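# Added note on the debiasing step above (a reading aid, not part of the original function):
# M from pseudoinverse_debiasing_matrix plays the role of the selected rows of an approximate
# inverse of X'X (at the scaling used here, since grad = X'(X soln - y) carries no 1/n factor),
# so observed_target = soln[observed_list] - M.dot(grad) is the usual debiased lasso estimate
# restricted to the selected coordinates, and its covariance is estimated by
# (X M')'(X M') * dispersion, which is what target_cov computes via tmp = X.dot(M.T).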
def solve_sqrt_lasso_skinny(X, Y, weights=None, initial=None, quadratic=None, solve_args={}): """ Solve the square-root LASSO optimization problem: $$ \text{minimize}_{\beta} \|y-X\beta\|_2 + D |\beta|, $$ where $D$ is the diagonal matrix with weights on its diagonal. Parameters ---------- y : np.float((n,)) The target, in the model $y = X\beta$ X : np.float((n, p)) The data, in the model $y = X\beta$ weights : np.float Coefficients of the L-1 penalty in optimization problem, note that different coordinates can have different coefficients. initial : np.float(p) Initial point for optimization. solve_args : dict Arguments passed to regreg solver. quadratic : `regreg.identity_quadratic` A quadratic term added to objective function. """ X = rr.astransform(X) n, p = X.output_shape[0], X.input_shape[0] if weights is None: lam = choose_lambda(X) weights = lam * np.ones((p, )) weight_dict = dict(zip(np.arange(p), 2 * weights)) penalty = rr.mixed_lasso(list(np.arange(p)) + [rr.NONNEGATIVE], lagrange=1., weights=weight_dict) loss = sqlasso_objective_skinny(X, Y) problem = rr.simple_problem(loss, penalty) problem.coefs[-1] = np.linalg.norm(Y) if initial is not None: problem.coefs[:-1] = initial if quadratic is not None: collapsed = quadratic.collapsed() new_linear_term = np.zeros(p + 1) new_linear_term[:p] = collapsed.linear_term new_quadratic = rr.identity_quadratic(collapsed.coef, 0., new_linear_term, collapsed.constant_term) else: new_quadratic = None soln = problem.solve(new_quadratic, **solve_args) _loss = sqlasso_objective(X, Y) return soln[:-1], _loss
def test_quadratic_for_smooth(): ''' this test is a check to ensure that the quadratic part of the smooth functions are being used in the proximal step ''' L = 0.45 W = np.random.standard_normal(40) Z = np.random.standard_normal(40) U = np.random.standard_normal(40) atomq = rr.identity_quadratic(0.4, U, W, 0) atom = rr.l1norm(40, quadratic=atomq, lagrange=0.12) # specifying in this way should be the same as if we put 0.5*L below loss = rr.quadratic.shift(Z, coef=0.6 * L) lq = rr.identity_quadratic(0.4 * L, Z, 0, 0) loss.quadratic = lq ww = np.random.standard_normal(40) # specifying in this way should be the same as if we put 0.5*L below loss2 = rr.quadratic.shift(Z, coef=L) yield all_close, loss2.objective(ww), loss.objective( ww), 'checking objective', None yield all_close, lq.objective(ww, 'func'), loss.nonsmooth_objective( ww), 'checking nonsmooth objective', None yield all_close, loss2.smooth_objective( ww, 'func'), 0.5 / 0.3 * loss.smooth_objective( ww, 'func'), 'checking smooth objective func', None yield all_close, loss2.smooth_objective( ww, 'grad'), 0.5 / 0.3 * loss.smooth_objective( ww, 'grad'), 'checking smooth objective grad', None problem = rr.container(loss, atom) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12) problem3 = rr.simple_problem(loss, atom) solver3 = rr.FISTA(problem3) solver3.fit(tol=1.0e-12, coef_stop=True) loss4 = rr.quadratic.shift(Z, coef=0.6 * L) problem4 = rr.simple_problem(loss4, atom) problem4.quadratic = lq solver4 = rr.FISTA(problem4) solver4.fit(tol=1.0e-12) gg_soln = rr.gengrad(problem, L) loss6 = rr.quadratic.shift(Z, coef=0.6 * L) loss6.quadratic = lq + atom.quadratic atomcp = copy(atom) atomcp.quadratic = rr.identity_quadratic(0, 0, 0, 0) problem6 = rr.dual_problem(loss6.conjugate, rr.identity(loss6.shape), atomcp.conjugate) problem6.lipschitz = L + atom.quadratic.coef dsoln2 = problem6.solve(coef_stop=True, tol=1.e-10, max_its=100) problem2 = rr.container(loss2, atom) solver2 = rr.FISTA(problem2) solver2.fit(tol=1.0e-12, coef_stop=True) q = rr.identity_quadratic(L, Z, 0, 0) yield all_close, problem.objective( ww), atom.nonsmooth_objective(ww) + q.objective(ww, 'func'), '', None atom = rr.l1norm(40, quadratic=atomq, lagrange=0.12) aq = atom.solve(q) for p, msg in zip([ solver3.composite.coefs, gg_soln, solver2.composite.coefs, dsoln2, solver.composite.coefs, solver4.composite.coefs ], [ 'simple_problem with loss having no quadratic', 'gen grad', 'container with loss having no quadratic', 'dual problem with loss having a quadratic', 'container with loss having a quadratic', 'simple_problem having a quadratic' ]): yield all_close, aq, p, msg, None
def test_solve_QP_lasso(): """ Check the R coordinate descent LASSO solver """ n, p = 100, 200 lam = 0.1 X = np.random.standard_normal((n, p)) Y = np.random.standard_normal(n) loss = rr.squared_error(X, Y, coef=1. / n) pen = rr.l1norm(p, lagrange=lam) problem = rr.simple_problem(loss, pen) soln = problem.solve(min_its=500, tol=1.e-12) numpy2ri.activate() rpy.r.assign('X', X) rpy.r.assign('Y', Y) rpy.r.assign('lam', lam) R_code = """ library(selectiveInference) p = ncol(X) n = nrow(X) soln_R = rep(0, p) grad = -t(X) %*% Y / n ever_active = as.integer(c(1, rep(0, p-1))) nactive = as.integer(1) kkt_tol = 1.e-12 objective_tol = 1.e-16 parameter_tol = 1.e-10 maxiter = 500 soln_R = selectiveInference:::solve_QP(t(X) %*% X / n, lam, maxiter, soln_R, 1. * grad, grad, ever_active, nactive, kkt_tol, objective_tol, parameter_tol, p, TRUE, TRUE, TRUE)$soln # test wide solver Xtheta = rep(0, n) nactive = as.integer(1) ever_active = as.integer(c(1, rep(0, p-1))) soln_R_wide = rep(0, p) grad = - t(X) %*% Y / n soln_R_wide = selectiveInference:::solve_QP_wide(X, rep(lam, p), 0, maxiter, soln_R_wide, 1. * grad, grad, Xtheta, ever_active, nactive, kkt_tol, objective_tol, parameter_tol, p, TRUE, TRUE, TRUE)$soln """ rpy.r(R_code) soln_R = np.asarray(rpy.r('soln_R')) soln_R_wide = np.asarray(rpy.r('soln_R_wide')) numpy2ri.deactivate() tol = 1.e-5 print(soln - soln_R) print(soln_R - soln_R_wide) yield np.testing.assert_allclose, soln, soln_R, tol, tol, False, 'checking coordinate QP solver for LASSO problem' yield np.testing.assert_allclose, soln, soln_R_wide, tol, tol, False, 'checking wide coordinate QP solver for LASSO problem'
def fit(self, solve_args={'tol':1.e-12, 'min_its':50}, perturb=None): """ Fit the randomized lasso using `regreg`. Parameters ---------- solve_args : keyword args Passed to `regreg.problems.simple_problem.solve`. Returns ------- signs : np.float Support and non-zero signs of randomized lasso solution. """ p = self.nfeature # take a new perturbation if supplied if perturb is not None: self._initial_omega = perturb if self._initial_omega is None: self._initial_omega = self.randomizer.sample() quad = rr.identity_quadratic(self.ridge_term, 0, -self._initial_omega, 0) quad_data = rr.identity_quadratic(0, 0, -self.X.T.dot(self.y), 0) problem = rr.simple_problem(self.loss, self.penalty) self.initial_soln = problem.solve(quad + quad_data, **solve_args) active_signs = np.sign(self.initial_soln) active = self._active = active_signs != 0 self._lagrange = self.penalty.weights unpenalized = self._lagrange == 0 active *= ~unpenalized self._overall = overall = (active + unpenalized) > 0 self._inactive = inactive = ~self._overall self._unpenalized = unpenalized _active_signs = active_signs.copy() _active_signs[unpenalized] = np.nan # don't release sign of unpenalized variables self.selection_variable = {'sign':_active_signs, 'variables':self._overall} # initial state for opt variables initial_subgrad = -(self.loss.smooth_objective(self.initial_soln, 'grad') + quad_data.objective(self.initial_soln, 'grad') + quad.objective(self.initial_soln, 'grad')) self.initial_subgrad = initial_subgrad initial_scalings = np.fabs(self.initial_soln[active]) initial_unpenalized = self.initial_soln[self._unpenalized] self.observed_opt_state = np.concatenate([initial_scalings, initial_unpenalized]) E = overall Q_E = self.Q[E][:,E] _beta_unpenalized = np.linalg.inv(Q_E).dot(self.X[:,E].T.dot(self.y)) beta_bar = np.zeros(p) beta_bar[overall] = _beta_unpenalized self._beta_full = beta_bar # observed state for score in internal coordinates self.observed_internal_state = np.hstack([_beta_unpenalized, -self.loss.smooth_objective(beta_bar, 'grad')[inactive] + quad_data.objective(beta_bar, 'grad')[inactive]]) # form linear part self.num_opt_var = self.observed_opt_state.shape[0] # (\bar{\beta}_{E \cup U}, N_{-E}, c_E, \beta_U, z_{-E}) # E for active # U for unpenalized # -E for inactive _opt_linear_term = np.zeros((p, self.num_opt_var)) _score_linear_term = np.zeros((p, self.num_opt_var)) # \bar{\beta}_{E \cup U} piece -- the unpenalized M estimator X, y = self.X, self.y _hessian_active = self.Q[:, active] _hessian_unpen = self.Q[:, unpenalized] _score_linear_term = -np.hstack([_hessian_active, _hessian_unpen]) # set the observed score (data dependent) state self.observed_score_state = _score_linear_term.dot(_beta_unpenalized) self.observed_score_state[inactive] += (self.loss.smooth_objective(beta_bar, 'grad')[inactive] + quad_data.objective(beta_bar, 'grad')[inactive]) def signed_basis_vector(p, j, s): v = np.zeros(p) v[j] = s return v active_directions = np.array([signed_basis_vector(p, j, active_signs[j]) for j in np.nonzero(active)[0]]).T scaling_slice = slice(0, active.sum()) if np.sum(active) == 0: _opt_hessian = 0 else: _opt_hessian = _hessian_active * active_signs[None, active] + self.ridge_term * active_directions _opt_linear_term[:, scaling_slice] = _opt_hessian # beta_U piece unpenalized_slice = slice(active.sum(), self.num_opt_var) unpenalized_directions = np.array([signed_basis_vector(p, j, 1) for j in np.nonzero(unpenalized)[0]]).T if unpenalized.sum(): _opt_linear_term[:, unpenalized_slice] = (_hessian_unpen + 
self.ridge_term * unpenalized_directions) # two transforms that encode score and optimization # variable roles self.opt_transform = (_opt_linear_term, self.initial_subgrad) self.score_transform = (_score_linear_term, np.zeros(_score_linear_term.shape[0])) # now store everything needed for the projections # the projection acts only on the optimization # variables self._setup = True self.scaling_slice = scaling_slice self.unpenalized_slice = unpenalized_slice self.ndim = self.loss.shape[0] # compute implied mean and covariance opt_linear, opt_offset = self.opt_transform A_scaling = -np.identity(self.num_opt_var) b_scaling = np.zeros(self.num_opt_var) self._setup_sampler(A_scaling, b_scaling, opt_linear, opt_offset) return active_signs
def test_simple_problem(self): tests = [] atom, q, prox_center, L = self.atom, self.q, self.prox_center, self.L loss = self.loss problem = rr.simple_problem(loss, atom) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12, FISTA=self.FISTA, coef_stop=self.coef_stop, min_its=100) tests.append( (atom.proximal(q), solver.composite.coefs, 'solving prox with simple_problem with monotonicity\n %s' % str(self))) # write the loss in terms of a quadratic for the smooth loss and a smooth function... q = rr.identity_quadratic(L, prox_center, 0, 0) lossq = rr.quadratic.shift(prox_center.copy(), coef=0.6 * L) lossq.quadratic = rr.identity_quadratic(0.4 * L, prox_center.copy(), 0, 0) problem = rr.simple_problem(lossq, atom) tests.append( (atom.proximal(q), problem.solve(coef_stop=self.coef_stop, FISTA=self.FISTA, tol=1.0e-12), 'solving prox with simple_problem ' + 'with monotonicity but loss has identity_quadratic %s\n ' % str(self))) problem = rr.simple_problem(loss, atom) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12, monotonicity_restart=False, coef_stop=self.coef_stop, FISTA=self.FISTA, min_its=100) tests.append( (atom.proximal(q), solver.composite.coefs, 'solving prox with simple_problem no monotonicity_restart\n %s' % str(self))) d = atom.conjugate problem = rr.simple_problem(loss, d) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12, monotonicity_restart=False, coef_stop=self.coef_stop, FISTA=self.FISTA, min_its=100) tests.append( (d.proximal(q), problem.solve(tol=1.e-12, FISTA=self.FISTA, coef_stop=self.coef_stop, monotonicity_restart=False), 'solving dual prox with simple_problem no monotonocity\n %s ' % str(self))) if not self.interactive: for test in tests: yield (all_close, ) + test + (self, ) else: for test in tests: yield all_close(*((test + (self, ))))
def test_quadratic_for_smooth2(): """ this test is a check to ensure that the quadratic part of the smooth functions are being used in the proximal step """ L = 2 W = np.arange(5) Z = 0.5 * np.arange(5)[::-1] U = 1.5 * np.arange(5) atomq = rr.identity_quadratic(0.4, U, W, 0) atom = rr.l1norm(5, quadratic=atomq, lagrange=0.1) # specifying in this way should be the same as if we put 0.5*L below loss = rr.quadratic.shift(-Z, coef=0.6 * L) lq = rr.identity_quadratic(0.4 * L, Z, 0, 0) loss.quadratic = lq ww = np.ones(5) # specifying in this way should be the same as if we put 0.5*L below loss2 = rr.quadratic.shift(-Z, coef=L) np.testing.assert_allclose(loss2.objective(ww), loss.objective(ww)) np.testing.assert_allclose(lq.objective(ww, "func"), loss.nonsmooth_objective(ww)) np.testing.assert_allclose(loss2.smooth_objective(ww, "func"), 0.5 / 0.3 * loss.smooth_objective(ww, "func")) np.testing.assert_allclose(loss2.smooth_objective(ww, "grad"), 0.5 / 0.3 * loss.smooth_objective(ww, "grad")) problem = rr.container(loss, atom) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12) problem3 = rr.simple_problem(loss, atom) solver3 = rr.FISTA(problem3) solver3.fit(tol=1.0e-12, coef_stop=True) loss4 = rr.quadratic.shift(-Z, coef=0.6 * L) problem4 = rr.simple_problem(loss4, atom) problem4.quadratic = lq solver4 = rr.FISTA(problem4) solver4.fit(tol=1.0e-12) gg_soln = rr.gengrad(problem4, L) loss6 = rr.quadratic.shift(-Z, coef=0.6 * L) loss6.quadratic = lq + atom.quadratic atomcp = copy(atom) atomcp.quadratic = rr.identity_quadratic(0, 0, 0, 0) problem6 = rr.dual_problem(loss6.conjugate, rr.identity(loss6.primal_shape), atomcp.conjugate) problem6.lipschitz = L + atom.quadratic.coef dsoln2 = problem6.solve(coef_stop=True, tol=1.0e-10, max_its=100) problem2 = rr.container(loss2, atom) solver2 = rr.FISTA(problem2) solver2.fit(tol=1.0e-12, coef_stop=True) q = rr.identity_quadratic(L, Z, 0, 0) ac(problem.objective(ww), atom.nonsmooth_objective(ww) + q.objective(ww, "func")) aq = atom.solve(q) for p, msg in zip( [ solver3.composite.coefs, gg_soln, solver2.composite.coefs, solver4.composite.coefs, dsoln2, solver.composite.coefs, ], [ "simple_problem with loss having no quadratic", "gen grad", "container with loss having no quadratic", "simple_problem container with quadratic", "dual problem with loss having a quadratic", "container with loss having a quadratic", ], ): yield ac, aq, p, msg
def solve(self, nboot=2000, solve_args={ 'min_its': 20, 'tol': 1.e-10 }, perturb=None): self.randomize(perturb=perturb) (loss, randomized_loss, epsilon, penalty, randomization) = (self.loss, self.randomized_loss, self.epsilon, self.penalty, self.randomization) # initial solution p = penalty.shape[0] problem = rr.simple_problem(randomized_loss, penalty) self.initial_soln = problem.solve(**solve_args) # find the active groups and their direction vectors # as well as unpenalized groups active_signs = np.sign(self.initial_soln) active = self._active = active_signs != 0 if isinstance(penalty, rr.l1norm): self._lagrange = penalty.lagrange * np.ones(p) unpenalized = np.zeros(p, np.bool) elif isinstance(penalty, rr.weighted_l1norm): self._lagrange = penalty.weights unpenalized = self._lagrange == 0 else: raise ValueError('penalty must be `l1norm` or `weighted_l1norm`') active *= ~unpenalized # solve the restricted problem self._overall = (active + unpenalized) > 0 self._inactive = ~self._overall self._unpenalized = unpenalized _active_signs = active_signs.copy() _active_signs[ unpenalized] = np.nan # don't release sign of unpenalized variables self.selection_variable = { 'sign': _active_signs, 'variables': self._overall } # initial state for opt variables initial_subgrad = -( self.randomized_loss.smooth_objective(self.initial_soln, 'grad') + self.randomized_loss.quadratic.objective(self.initial_soln, 'grad')) # the quadratic of a smooth_atom is not included in computing the smooth_objective self.initial_subgrad = initial_subgrad initial_scalings = np.fabs(self.initial_soln[active]) initial_unpenalized = self.initial_soln[self._unpenalized] self.observed_opt_state = np.concatenate([ initial_scalings, initial_unpenalized, self.initial_subgrad[self._inactive] ], axis=0) # set the _solved bit self._solved = True # Now setup the pieces for linear decomposition (loss, epsilon, penalty, initial_soln, overall, inactive, unpenalized) = (self.loss, self.epsilon, self.penalty, self.initial_soln, self._overall, self._inactive, self._unpenalized) # we are implicitly assuming that # loss is a pairs model _beta_unpenalized = restricted_estimator(loss, overall, solve_args=solve_args) beta_bar = np.zeros(p) beta_bar[overall] = _beta_unpenalized self._beta_full = beta_bar # observed state for score in internal coordinates self.observed_internal_state = np.hstack([ _beta_unpenalized, -loss.smooth_objective(beta_bar, 'grad')[inactive] ]) # form linear part self.num_opt_var = self.observed_opt_state.shape[0] # (\bar{\beta}_{E \cup U}, N_{-E}, c_E, \beta_U, z_{-E}) # E for active # U for unpenalized # -E for inactive _opt_linear_term = np.zeros((p, p)) _score_linear_term = np.zeros((p, p)) # \bar{\beta}_{E \cup U} piece -- the unpenalized M estimator est_slice = slice(0, overall.sum()) X, y = loss.data W = self.loss.saturated_loss.hessian(X.dot(beta_bar)) _hessian_active = np.dot(X.T, X[:, active] * W[:, None]) _hessian_unpen = np.dot(X.T, X[:, unpenalized] * W[:, None]) _score_linear_term[:, est_slice] = -np.hstack( [_hessian_active, _hessian_unpen]) # N_{-(E \cup U)} piece -- inactive coordinates of score of M estimator at unpenalized solution null_idx = np.arange(overall.sum(), p) inactive_idx = np.nonzero(inactive)[0] for _i, _n in zip(inactive_idx, null_idx): _score_linear_term[_i, _n] = -1 # c_E piece def signed_basis_vector(p, j, s): v = np.zeros(p) v[j] = s return v active_directions = np.array([ signed_basis_vector(p, j, active_signs[j]) for j in np.nonzero(active)[0] ]).T scaling_slice = slice(0, 
active.sum()) if np.sum(active) == 0: _opt_hessian = 0 else: _opt_hessian = _hessian_active * active_signs[ None, active] + epsilon * active_directions _opt_linear_term[:, scaling_slice] = _opt_hessian # beta_U piece unpenalized_slice = slice(active.sum(), active.sum() + unpenalized.sum()) unpenalized_directions = np.array([ signed_basis_vector(p, j, 1) for j in np.nonzero(unpenalized)[0] ]).T if unpenalized.sum(): _opt_linear_term[:, unpenalized_slice] = ( _hessian_unpen + epsilon * unpenalized_directions) # subgrad piece subgrad_idx = range(active.sum() + unpenalized.sum(), active.sum() + inactive.sum() + unpenalized.sum()) subgrad_slice = slice( active.sum() + unpenalized.sum(), active.sum() + inactive.sum() + unpenalized.sum()) for _i, _s in zip(inactive_idx, subgrad_idx): _opt_linear_term[_i, _s] = 1 # form affine part _opt_affine_term = np.zeros(p) idx = 0 _opt_affine_term[ active] = active_signs[active] * self._lagrange[active] # two transforms that encode score and optimization # variable roles self.opt_transform = (_opt_linear_term, _opt_affine_term) self.score_transform = (_score_linear_term, np.zeros(_score_linear_term.shape[0])) # everything now expressed in observed_score_state self.observed_score_state = _score_linear_term.dot( self.observed_internal_state) # now store everything needed for the projections # the projection acts only on the optimization # variables # we form a dual group lasso object # to do the projection self._setup = True self.subgrad_slice = subgrad_slice self.scaling_slice = scaling_slice self.unpenalized_slice = unpenalized_slice self.ndim = loss.shape[0] self.nboot = nboot
def test_cv(n=100, p=50, s=5, signal=7.5, K=5, rho=0., randomizer='gaussian', randomizer_scale=1., scale1=0.1, scale2=0.2, lam_frac=1., glmnet=True, loss='gaussian', bootstrap=False, condition_on_CVR=True, marginalize_subgrad=True, ndraw=10000, burnin=2000, nboot=nboot): print(n, p, s, condition_on_CVR, scale1, scale2) if randomizer == 'laplace': randomizer = randomization.laplace((p, ), scale=randomizer_scale) elif randomizer == 'gaussian': randomizer = randomization.isotropic_gaussian((p, ), randomizer_scale) elif randomizer == 'logistic': randomizer = randomization.logistic((p, ), scale=randomizer_scale) if loss == "gaussian": X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho, signal=signal, sigma=1) glm_loss = rr.glm.gaussian(X, y) elif loss == "logistic": X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal) glm_loss = rr.glm.logistic(X, y) epsilon = 1. / np.sqrt(n) # view 1 cv = CV_view(glm_loss, loss_label=loss, lasso_randomization=randomizer, epsilon=epsilon, scale1=scale1, scale2=scale2) if glmnet: try: cv.solve(glmnet=glmnet) except ImportError: cv.solve(glmnet=False) else: cv.solve(glmnet=False) # for the test make sure we also run the python code cv_py = CV_view(glm_loss, loss_label=loss, lasso_randomization=randomizer, epsilon=epsilon, scale1=scale1, scale2=scale2) cv_py.solve(glmnet=False) lam = cv.lam_CVR print("lam", lam) if condition_on_CVR: cv.condition_on_opt_state() lam = cv.one_SD_rule(direction="up") print("new lam", lam) # non-randomized Lasso, just looking how many vars it selects problem = rr.simple_problem(glm_loss, rr.l1norm(p, lagrange=lam)) beta_hat = problem.solve() active_hat = beta_hat != 0 print("non-randomized lasso ", active_hat.sum()) # view 2 W = lam_frac * np.ones(p) * lam penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.) 
M_est = glm_group_lasso(glm_loss, epsilon, penalty, randomizer) if nboot is not None and nboot > 0: cv.nboot = M_est.nboot = nboot mv = multiple_queries([cv, M_est]) mv.solve() active_union = M_est._overall nactive = np.sum(active_union) print("nactive", nactive) if nactive == 0: return None nonzero = np.where(beta)[0] if set(nonzero).issubset(np.nonzero(active_union)[0]): active_set = np.nonzero(active_union)[0] true_vec = beta[active_union] if marginalize_subgrad: M_est.decompose_subgradient(conditioning_groups=np.zeros(p, bool), marginalizing_groups=np.ones(p, bool)) selected_features = np.zeros(p, np.bool) selected_features[active_set] = True unpenalized_mle = restricted_Mest(M_est.loss, selected_features) form_covariances = glm_nonparametric_bootstrap(n, n) target_info, target_observed = pairs_bootstrap_glm(M_est.loss, selected_features, inactive=None) cov_info = M_est.setup_sampler() target_cov, score_cov = form_covariances(target_info, cross_terms=[cov_info], nsample=M_est.nboot) opt_sample = M_est.sampler.sample(ndraw, burnin) pvalues = M_est.sampler.coefficient_pvalues( unpenalized_mle, target_cov, score_cov, parameter=np.zeros(selected_features.sum()), sample=opt_sample) intervals = M_est.sampler.confidence_intervals(unpenalized_mle, target_cov, score_cov, sample=opt_sample) L, U = intervals.T sel_covered = np.zeros(nactive, np.bool) sel_length = np.zeros(nactive) LU_naive = naive_confidence_intervals(np.diag(target_cov), target_observed) naive_covered = np.zeros(nactive, np.bool) naive_length = np.zeros(nactive) naive_pvals = naive_pvalues(np.diag(target_cov), target_observed, true_vec) active_var = np.zeros(nactive, np.bool) for j in range(nactive): if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]): sel_covered[j] = 1 if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >= true_vec[j]): naive_covered[j] = 1 sel_length[j] = U[j] - L[j] naive_length[j] = LU_naive[j, 1] - LU_naive[j, 0] active_var[j] = active_set[j] in nonzero q = 0.2 BH_decisions = multipletests(pvalues, alpha=q, method="fdr_bh")[0] return sel_covered, sel_length, naive_pvals, naive_covered, naive_length, active_var, BH_decisions, active_var
def solve(self, scaling=1, solve_args={'min_its': 20, 'tol': 1.e-10}): self.randomize() (loss, randomized_loss, epsilon, penalty, randomization, solve_args) = (self.loss, self.randomized_loss, self.epsilon, self.penalty, self.randomization, self.solve_args) # initial solution problem = rr.simple_problem(randomized_loss, penalty) self.initial_soln = problem.solve(**solve_args) # find the active groups and their direction vectors # as well as unpenalized groups groups = np.unique(penalty.groups) active_groups = np.zeros(len(groups), np.bool) unpenalized_groups = np.zeros(len(groups), np.bool) active_directions = [] active = np.zeros(loss.shape, np.bool) unpenalized = np.zeros(loss.shape, np.bool) initial_scalings = [] for i, g in enumerate(groups): group = penalty.groups == g active_groups[i] = (np.linalg.norm(self.initial_soln[group]) > 1.e-6 * penalty.weights[g]) and ( penalty.weights[g] > 0) unpenalized_groups[i] = (penalty.weights[g] == 0) if active_groups[i]: active[group] = True z = np.zeros(active.shape, np.float) z[group] = self.initial_soln[group] / np.linalg.norm( self.initial_soln[group]) active_directions.append(z) initial_scalings.append( np.linalg.norm(self.initial_soln[group])) if unpenalized_groups[i]: unpenalized[group] = True # solve the restricted problem self._overall = active + unpenalized self._inactive = ~self._overall self._unpenalized = unpenalized self._active_directions = np.array(active_directions).T self._active_groups = np.array(active_groups, np.bool) self._unpenalized_groups = np.array(unpenalized_groups, np.bool) self.selection_variable = { 'groups': self._active_groups, 'variables': self._overall, 'directions': self._active_directions } # initial state for opt variables initial_subgrad = -( self.randomized_loss.smooth_objective(self.initial_soln, 'grad') + self.randomized_loss.quadratic.objective(self.initial_soln, 'grad')) # the quadratic of a smooth_atom is not included in computing the smooth_objective initial_subgrad = initial_subgrad[self._inactive] initial_unpenalized = self.initial_soln[self._unpenalized] self.observed_opt_state = np.concatenate( [initial_scalings, initial_unpenalized, initial_subgrad], axis=0) # set the _solved bit self._solved = True # Now setup the pieces for linear decomposition (loss, epsilon, penalty, initial_soln, overall, inactive, unpenalized, active_groups, active_directions) = (self.loss, self.epsilon, self.penalty, self.initial_soln, self._overall, self._inactive, self._unpenalized, self._active_groups, self._active_directions) # scaling should be chosen to be Lipschitz constant for gradient of Gaussian part # we are implicitly assuming that # loss is a pairs model _sqrt_scaling = np.sqrt(scaling) _beta_unpenalized = restricted_Mest(loss, overall, solve_args=solve_args) beta_full = np.zeros(overall.shape) beta_full[overall] = _beta_unpenalized _hessian = loss.hessian(beta_full) self._beta_full = beta_full # observed state for score self.observed_score_state = np.hstack([ _beta_unpenalized * _sqrt_scaling, -loss.smooth_objective(beta_full, 'grad')[inactive] / _sqrt_scaling ]) # form linear part self.num_opt_var = p = loss.shape[0] # shorthand for p # (\bar{\beta}_{E \cup U}, N_{-E}, c_E, \beta_U, z_{-E}) # E for active # U for unpenalized # -E for inactive _opt_linear_term = np.zeros( (p, self._active_groups.sum() + unpenalized.sum() + inactive.sum())) _score_linear_term = np.zeros((p, p)) # \bar{\beta}_{E \cup U} piece -- the unpenalized M estimator Mest_slice = slice(0, overall.sum()) _Mest_hessian = _hessian[:, overall] 
_score_linear_term[:, Mest_slice] = -_Mest_hessian / _sqrt_scaling # N_{-(E \cup U)} piece -- inactive coordinates of score of M estimator at unpenalized solution null_idx = range(overall.sum(), p) inactive_idx = np.nonzero(inactive)[0] for _i, _n in zip(inactive_idx, null_idx): _score_linear_term[_i, _n] = -_sqrt_scaling # c_E piece scaling_slice = slice(0, active_groups.sum()) if len(active_directions) == 0: _opt_hessian = 0 else: _opt_hessian = (_hessian + epsilon * np.identity(p)).dot(active_directions) _opt_linear_term[:, scaling_slice] = _opt_hessian / _sqrt_scaling self.observed_opt_state[scaling_slice] *= _sqrt_scaling # beta_U piece unpenalized_slice = slice(active_groups.sum(), active_groups.sum() + unpenalized.sum()) unpenalized_directions = np.identity(p)[:, unpenalized] if unpenalized.sum(): _opt_linear_term[:, unpenalized_slice] = ( _hessian + epsilon * np.identity(p)).dot(unpenalized_directions) / _sqrt_scaling self.observed_opt_state[unpenalized_slice] *= _sqrt_scaling # subgrad piece subgrad_idx = range( active_groups.sum() + unpenalized.sum(), active_groups.sum() + inactive.sum() + unpenalized.sum()) subgrad_slice = slice( active_groups.sum() + unpenalized.sum(), active_groups.sum() + inactive.sum() + unpenalized.sum()) for _i, _s in zip(inactive_idx, subgrad_idx): _opt_linear_term[_i, _s] = _sqrt_scaling self.observed_opt_state[subgrad_slice] /= _sqrt_scaling # form affine part _opt_affine_term = np.zeros(p) idx = 0 groups = np.unique(penalty.groups) for i, g in enumerate(groups): if active_groups[i]: group = penalty.groups == g _opt_affine_term[group] = active_directions[:, idx][ group] * penalty.weights[g] idx += 1 # two transforms that encode score and optimization # variable roles # later, we will modify `score_transform` # in `linear_decomposition` self.opt_transform = (_opt_linear_term, _opt_affine_term) self.score_transform = (_score_linear_term, np.zeros(_score_linear_term.shape[0])) # now store everything needed for the projections # the projection acts only on the optimization # variables self.scaling_slice = scaling_slice # weights are scaled here because the linear terms scales them by scaling new_groups = penalty.groups[inactive] new_weights = dict([(g, penalty.weights[g] / _sqrt_scaling) for g in penalty.weights.keys() if g in np.unique(new_groups)]) # we form a dual group lasso object # to do the projection self.group_lasso_dual = rr.group_lasso_dual(new_groups, weights=new_weights, bound=1.) self.subgrad_slice = subgrad_slice self._setup = True
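The `group_lasso_dual` object built at the end only has to project the inactive block of the subgradient onto a product of Euclidean balls, one per group, with radii given by the (rescaled) weights. Here is a plain-numpy sketch of that projection for reference; it is not the regreg implementation.
import numpy as np

def project_group_dual(subgrad, groups, weights):
    # pull each group's subgradient back into the ball of radius weights[g]
    proj = subgrad.copy()
    for g in np.unique(groups):
        idx = groups == g
        norm_g = np.linalg.norm(proj[idx])
        if norm_g > weights[g]:
            proj[idx] *= weights[g] / norm_g
    return proj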
def test_solve_QP(): """ Check the R coordinate descent LASSO solver """ n, p = 100, 50 lam = 0.08 X = np.random.standard_normal((n, p)) loss = rr.squared_error(X, np.zeros(n), coef=1. / n) pen = rr.l1norm(p, lagrange=lam) E = np.zeros(p) E[2] = 1 Q = rr.identity_quadratic(0, 0, E, 0) problem = rr.simple_problem(loss, pen) soln = problem.solve(Q, min_its=500, tol=1.e-12) numpy2ri.activate() rpy.r.assign('X', X) rpy.r.assign('E', E) rpy.r.assign('lam', lam) R_code = """ library(selectiveInference) p = ncol(X) n = nrow(X) soln_R = rep(0, p) grad = 1. * E ever_active = as.integer(c(1, rep(0, p-1))) nactive = as.integer(1) kkt_tol = 1.e-12 objective_tol = 1.e-16 parameter_tol = 1.e-10 maxiter = 500 soln_R = selectiveInference:::solve_QP(t(X) %*% X / n, lam, maxiter, soln_R, E, grad, ever_active, nactive, kkt_tol, objective_tol, parameter_tol, p, TRUE, TRUE, TRUE)$soln # test wide solver Xtheta = rep(0, n) nactive = as.integer(1) ever_active = as.integer(c(1, rep(0, p-1))) soln_R_wide = rep(0, p) grad = 1. * E soln_R_wide = selectiveInference:::solve_QP_wide(X, rep(lam, p), 0, maxiter, soln_R_wide, E, grad, Xtheta, ever_active, nactive, kkt_tol, objective_tol, parameter_tol, p, TRUE, TRUE, TRUE)$soln """ rpy.r(R_code) soln_R = np.asarray(rpy.r('soln_R')) soln_R_wide = np.asarray(rpy.r('soln_R_wide')) numpy2ri.deactivate() tol = 1.e-5 print(soln - soln_R) print(soln_R - soln_R_wide) G = X.T.dot(X).dot(soln) / n + E yield np.testing.assert_allclose, soln, soln_R, tol, tol, False, 'checking coordinate QP solver' yield np.testing.assert_allclose, soln, soln_R_wide, tol, tol, False, 'checking wide coordinate QP solver' yield np.testing.assert_allclose, G[soln != 0], -np.sign( soln[soln != 0] ) * lam, tol, tol, False, 'checking active coordinate KKT for QP solver' yield nt.assert_true, np.fabs( G).max() < lam * (1. + 1.e-6), 'testing linfinity norm'
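The yields at the end restate the lasso KKT conditions for this problem: at a solution, the smooth gradient X.T.dot(X).dot(soln) / n + E equals -lam * sign(soln) on the active coordinates and is bounded by lam in absolute value everywhere. A small standalone helper (hypothetical, for illustration only) performing the same check:
import numpy as np

def check_qp_kkt(X, soln, E, lam, tol=1.e-6):
    n = X.shape[0]
    G = X.T.dot(X).dot(soln) / n + E          # gradient of the smooth part at soln
    nz = soln != 0
    ok_active = np.allclose(G[nz], -np.sign(soln[nz]) * lam, atol=tol)
    ok_bound = np.fabs(G).max() <= lam * (1 + tol)
    return ok_active and ok_bound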
# IPython log file import numpy as np import regreg.smooth.mglm as M import regreg.api as rr np.random.seed(0) n, p = 2000, 4 Y = np.random.multinomial(1, [0.1, 0.4, 0.5], size=(n, )) q = Y.shape[1] X = np.random.standard_normal((n, p)) pen = rr.l1_l2((p, q), lagrange=0.4 * np.sqrt(n)) loss = M.mglm.multinomial(X, Y) problem = rr.simple_problem(loss, pen) problem.solve(debug=True, min_its=50, tol=1e-12) loss_baseline = M.mglm.multinomial(X, Y, baseline=True) pen_baseline = rr.l1_l2((p, q - 1), lagrange=0.4 * np.sqrt(n)) problem_baseline = rr.simple_problem(loss_baseline, pen_baseline) problem_baseline.solve(debug=True, min_its=50, tol=1e-12)
def fit(self, tol=1.e-12, min_its=50, **solve_args): """ Fit the lasso using `regreg`. This sets the attributes `soln`, `onestep` and forms the constraints necessary for post-selection inference by calling `form_constraints()`. Parameters ---------- solve_args : keyword args Passed to `regreg.problems.simple_problem.solve`. Returns ------- soln : np.float Solution to lasso. """ penalty = weighted_l1norm(self.feature_weights, lagrange=1.) problem = simple_problem(self.loglike, penalty) lasso_solution = problem.solve(tol=tol, min_its=min_its, **solve_args) self.lasso_solution = lasso_solution if not np.all(lasso_solution == 0): self.active = np.nonzero(lasso_solution != 0)[0] self.inactive = lasso_solution == 0 self.active_signs = np.sign(lasso_solution[self.active]) self._active_soln = lasso_solution[self.active] H = self.loglike.hessian(self.lasso_solution) H_AA = H[self.active][:,self.active] H_AAinv = np.linalg.inv(H_AA) Q = self.loglike.quadratic G_Q = Q.objective(self.lasso_solution, 'grad') G = self.loglike.gradient(self.lasso_solution) + G_Q G_A = G[self.active] G_I = self._G_I = G[self.inactive] dbeta_A = H_AAinv.dot(G_A) self.onestep_estimator = self._active_soln - dbeta_A self.active_penalized = self.feature_weights[self.active] != 0 self._constraints = constraints(-np.diag(self.active_signs)[self.active_penalized], (self.active_signs * dbeta_A)[self.active_penalized], covariance=H_AAinv) if self.inactive.sum(): # inactive constraints H_IA = H[self.inactive][:,self.active] H_II = H[self.inactive][:,self.inactive] inactive_cov = H_II - H_IA.dot(H_AAinv).dot(H_IA.T) irrepresentable = H_IA.dot(H_AAinv) inactive_mean = irrepresentable.dot(-G_A) self._inactive_constraints = constraints(np.vstack([np.identity(self.inactive.sum()), -np.identity(self.inactive.sum())]), np.hstack([self.feature_weights[self.inactive], self.feature_weights[self.inactive]]), covariance=inactive_cov, mean=inactive_mean) if not self._inactive_constraints(G_I): warnings.warn('inactive constraint of KKT conditions not satisfied -- perhaps need to solve with more accuracy') if self.covariance_estimator is not None: # make full constraints _cov_FA = self.covariance_estimator(self.onestep_estimator, self.active, self.inactive) _cov_IA = _cov_FA[len(self.active):] _cov_AA = _cov_FA[:len(self.active)] # X_{-E}^T(y - X_E \bar{\beta}_E) _inactive_score = - G_I - inactive_mean _beta_bar = self.onestep_estimator _indep_linear_part = _cov_IA.dot(np.linalg.inv(_cov_AA)) # we "fix" _nuisance, effectively conditioning on it _nuisance = _inactive_score - _indep_linear_part.dot(_beta_bar) _upper_lim = (self.feature_weights[self.inactive] - _nuisance - inactive_mean) _lower_lim = (_nuisance + self.feature_weights[self.inactive] + inactive_mean) _upper_linear = _indep_linear_part _lower_linear = -_indep_linear_part C = self._constraints _full_linear = np.vstack([C.linear_part, _upper_linear, _lower_linear]) _full_offset = np.hstack([C.offset, _upper_lim, _lower_lim]) self._constraints = constraints(_full_linear, _full_offset, covariance=_cov_AA) if not self._constraints(_beta_bar): warnings.warn('constraints of KKT conditions on one-step estimator ' + ' not satisfied -- perhaps need to solve with more' + 'accuracy') else: self._inactive_constraints = None else: self.active = [] self.inactive = np.arange(lasso_solution.shape[0]) self._constraints = None self._inactive_constraints = None return self.lasso_solution
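The core of this method is a one-step (Newton) correction of the active coefficients followed by a sign constraint on that corrected estimator. The following is a minimal numpy sketch mirroring the quantities above (H, G, active, active_signs); it is illustrative only, not the library code.
import numpy as np

def one_step_and_sign_constraint(H, G, lasso_solution, active, active_signs):
    H_AA = H[np.ix_(active, active)]
    H_AAinv = np.linalg.inv(H_AA)
    dbeta_A = H_AAinv.dot(G[active])                 # Newton step on the active block
    onestep = lasso_solution[active] - dbeta_A       # one-step (debiased) estimator
    # constraint matrix and offset as formed above: -diag(signs), signs * dbeta_A
    A = -np.diag(active_signs)
    b = active_signs * dbeta_A
    return onestep, A, b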
def solveit(atom, Z, W, U, linq, L, FISTA, coef_stop): p2 = copy(atom) p2.quadratic = rr.identity_quadratic(L, Z, 0, 0) d = atom.conjugate q = rr.identity_quadratic(1, Z, 0, 0) yield ac, Z - atom.proximal(q), d.proximal(q), "testing duality of projections starting from atom %s " % atom q = rr.identity_quadratic(L, Z, 0, 0) # use simple_problem.nonsmooth p2 = copy(atom) p2.quadratic = atom.quadratic + q problem = rr.simple_problem.nonsmooth(p2) solver = rr.FISTA(problem) solver.fit(tol=1.0e-14, FISTA=FISTA, coef_stop=coef_stop) yield ac, atom.proximal( q ), solver.composite.coefs, "solving prox with simple_problem.nonsmooth with monotonicity %s " % atom # use the solve method p2.coefs *= 0 p2.quadratic = atom.quadratic + q soln = p2.solve() yield ac, atom.proximal(q), soln, "solving prox with solve method %s " % atom loss = rr.quadratic.shift(-Z, coef=L) problem = rr.simple_problem(loss, atom) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12, FISTA=FISTA, coef_stop=coef_stop) yield ac, atom.proximal(q), solver.composite.coefs, "solving prox with simple_problem with monotonicity %s " % atom dproblem2 = rr.dual_problem(loss.conjugate, rr.identity(loss.primal_shape), atom.conjugate) dcoef2 = dproblem2.solve(coef_stop=coef_stop, tol=1.0e-14) yield ac, atom.proximal(q), dcoef2, "solving prox with dual_problem with monotonicity %s " % atom dproblem = rr.dual_problem.fromprimal(loss, atom) dcoef = dproblem.solve(coef_stop=coef_stop, tol=1.0e-14) yield ac, atom.proximal(q), dcoef, "solving prox with dual_problem.fromprimal with monotonicity %s " % atom # write the loss in terms of a quadratic for the smooth loss and a smooth function... lossq = rr.quadratic.shift(-Z, coef=0.6 * L) lossq.quadratic = rr.identity_quadratic(0.4 * L, Z, 0, 0) problem = rr.simple_problem(lossq, atom) yield ac, atom.proximal(q), problem.solve( coef_stop=coef_stop, FISTA=FISTA, tol=1.0e-12 ), "solving prox with simple_problem with monotonicity but loss has identity_quadratic %s " % atom problem = rr.simple_problem.nonsmooth(p2) solver = rr.FISTA(problem) solver.fit(tol=1.0e-14, monotonicity_restart=False, coef_stop=coef_stop, FISTA=FISTA) yield ac, atom.proximal( q ), solver.composite.coefs, "solving prox with simple_problem.nonsmooth with no monotonocity %s " % atom loss = rr.quadratic.shift(-Z, coef=L) problem = rr.simple_problem(loss, atom) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12, monotonicity_restart=False, coef_stop=coef_stop, FISTA=FISTA) yield ac, atom.proximal( q ), solver.composite.coefs, "solving prox with simple_problem %s no monotonicity_restart" % atom loss = rr.quadratic.shift(-Z, coef=L) problem = rr.separable_problem.singleton(atom, loss) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12, coef_stop=coef_stop, FISTA=FISTA) yield ac, atom.proximal(q), solver.composite.coefs, "solving atom prox with separable_atom.singleton %s " % atom loss = rr.quadratic.shift(-Z, coef=L) problem = rr.container(loss, atom) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12, coef_stop=coef_stop, FISTA=FISTA) yield ac, atom.proximal(q), solver.composite.coefs, "solving atom prox with container %s " % atom # write the loss in terms of a quadratic for the smooth loss and a smooth function... 
lossq = rr.quadratic.shift(-Z, coef=0.6 * L) lossq.quadratic = rr.identity_quadratic(0.4 * L, Z, 0, 0) problem = rr.container(lossq, atom) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12, FISTA=FISTA, coef_stop=coef_stop) yield ( ac, atom.proximal(q), problem.solve(tol=1.0e-12, FISTA=FISTA, coef_stop=coef_stop), "solving prox with container with monotonicity but loss has identity_quadratic %s " % atom, ) loss = rr.quadratic.shift(-Z, coef=L) problem = rr.simple_problem(loss, d) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12, monotonicity_restart=False, coef_stop=coef_stop, FISTA=FISTA) # ac(d.proximal(q), solver.composite.coefs, 'solving dual prox with simple_problem no monotonocity %s ' % atom) yield ( ac, d.proximal(q), problem.solve(tol=1.0e-12, FISTA=FISTA, coef_stop=coef_stop, monotonicity_restart=False), "solving dual prox with simple_problem no monotonocity %s " % atom, ) problem = rr.container(d, loss) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12, coef_stop=coef_stop, FISTA=FISTA) yield ac, d.proximal(q), solver.composite.coefs, "solving dual prox with container %s " % atom loss = rr.quadratic.shift(-Z, coef=L) problem = rr.separable_problem.singleton(d, loss) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12, coef_stop=coef_stop, FISTA=FISTA) yield ac, d.proximal(q), solver.composite.coefs, "solving atom prox with separable_atom.singleton %s " % atom
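The very first yield in `solveit` checks Moreau's decomposition, z = prox_f(z) + prox_{f*}(z). For the l1 norm this is concrete: soft-thresholding plus projection onto the l-infinity ball recovers z exactly, as the small numpy check below illustrates.
import numpy as np

lam = 0.3
z = np.random.standard_normal(20)
prox_l1 = np.sign(z) * np.maximum(np.fabs(z) - lam, 0)   # soft-thresholding
proj_linf = np.clip(z, -lam, lam)                        # prox of the conjugate (projection)
assert np.allclose(z, prox_l1 + proj_linf)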
def CV_err(self, penalty, loss=None, residual_randomization=None, scale=None, solve_args={ 'min_its': 20, 'tol': 1.e-1 }): """ Computes the non-randomized CV error and the one with added residual randomization """ if loss is None: loss = copy.copy(self.loss) X, y = loss.data n, p = X.shape CV_err = 0 CV_err_squared = 0 if residual_randomization is not None: CV_err_randomized = 0 CV_err_squared_randomized = 0 if scale is None: scale = 1. for fold in np.unique(self.folds): test = self.folds == fold train = ~test loss_train = loss.subsample(train) loss_test = loss.subsample(test) X_test, y_test = X[test], y[test] n_test = y_test.shape[0] if self.objective_randomization is not None: randomized_train_loss = self.objective_randomization.randomize( loss_train, self.epsilon)[0] # randomized train loss problem = rr.simple_problem(randomized_train_loss, penalty) else: problem = rr.simple_problem(loss_train, penalty) beta_train = problem.solve(**solve_args) _mu = lambda X, beta: loss_test.saturated_loss.mean_function( X.dot(beta)) resid = y_test - _mu(X_test, beta_train) cur = (resid**2).sum() / n_test CV_err += cur CV_err_squared += (cur**2) if residual_randomization is not None: random_noise = scale * np.random.standard_normal(n_test) cur_randomized = ((resid + random_noise)**2).sum() / n_test CV_err_randomized += cur_randomized CV_err_squared_randomized += cur_randomized**2 SD_CV = np.sqrt( (CV_err_squared - ((CV_err**2) / self.K)) / float(self.K - 1)) if residual_randomization is not None: SD_CV_randomized = np.sqrt( (CV_err_squared_randomized - (CV_err_randomized**2 / self.K)) / (self.K - 1)) return CV_err, SD_CV, CV_err_randomized, SD_CV_randomized else: return CV_err, SD_CV
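The running-sum formula used for SD_CV is just the sample standard deviation of the per-fold errors; the quick check below confirms this with hypothetical fold errors.
import numpy as np

fold_errors = np.array([0.9, 1.1, 1.3, 0.8, 1.0])   # hypothetical per-fold CV errors, K = 5
K = len(fold_errors)
cv_sum = fold_errors.sum()
cv_sq_sum = (fold_errors ** 2).sum()
sd_formula = np.sqrt((cv_sq_sum - cv_sum ** 2 / K) / (K - 1))
assert np.allclose(sd_formula, fold_errors.std(ddof=1))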
def test_sqrt_lasso(n=500, p=20, s=3, signal=10, K=5, rho=0., randomizer='gaussian', randomizer_scale=1., scale1=0.1, scale2=0.2, lam_frac=1., bootstrap=False, condition_on_CVR=False, marginalize_subgrad=True, ndraw=10000, burnin=2000): print(n, p, s) if randomizer == 'laplace': randomizer = randomization.laplace((p,), scale=randomizer_scale) elif randomizer == 'gaussian': randomizer = randomization.isotropic_gaussian((p,), randomizer_scale) elif randomizer == 'logistic': randomizer = randomization.logistic((p,), scale=randomizer_scale) X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho, signal=signal, sigma=1) lam_nonrandom = choose_lambda(X) lam_random = choose_lambda_with_randomization(X, randomizer) loss = l2norm_glm(X, y) #sqloss = rr.glm.gaussian(X, y) epsilon = 1. / n # non-randomized sqrt-Lasso, just looking how many vars it selects problem = rr.simple_problem(loss, rr.l1norm(p, lagrange=lam_nonrandom)) beta_hat = problem.solve() active_hat = beta_hat != 0 print("non-randomized sqrt-Lasso active set", np.where(beta_hat)[0]) print("non-randomized sqrt-Lasso", active_hat.sum()) # view 2 W = lam_frac * np.ones(p) * lam_random penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1. / np.sqrt(n)) M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer) mv = multiple_queries([M_est1]) mv.solve() #active = soln != 0 active_union = M_est1._overall nactive = np.sum(active_union) print("nactive", nactive) if nactive == 0: return None nonzero = np.where(beta)[0] if set(nonzero).issubset(np.nonzero(active_union)[0]): active_set = np.nonzero(active_union)[0] true_vec = beta[active_union] if marginalize_subgrad: M_est1.decompose_subgradient(conditioning_groups=np.zeros(p, dtype=bool), marginalizing_groups=np.ones(p, bool)) target_sampler, target_observed = glm_target(loss, active_union, mv, bootstrap=bootstrap) target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin) LU = target_sampler.confidence_intervals(target_observed, sample=target_sample, level=0.9) #pivots_mle = target_sampler.coefficient_pvalues(target_observed, # parameter=target_sampler.reference, # sample=target_sample) pivots_truth = target_sampler.coefficient_pvalues(target_observed, parameter=true_vec, sample=target_sample) pvalues = target_sampler.coefficient_pvalues(target_observed, parameter=np.zeros_like(true_vec), sample=target_sample) L, U = LU.T sel_covered = np.zeros(nactive, np.bool) sel_length = np.zeros(nactive) LU_naive = naive_confidence_intervals(target_sampler, target_observed) naive_covered = np.zeros(nactive, np.bool) naive_length = np.zeros(nactive) naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec) active_var = np.zeros(nactive, np.bool) for j in range(nactive): if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]): sel_covered[j] = 1 if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >= true_vec[j]): naive_covered[j] = 1 sel_length[j] = U[j] - L[j] naive_length[j] = LU_naive[j, 1] - LU_naive[j, 0] active_var[j] = active_set[j] in nonzero print("individual coverage", np.true_divide(sel_covered.sum(), nactive)) from statsmodels.sandbox.stats.multicomp import multipletests q = 0.1 BH_decisions = multipletests(pvalues, alpha=q, method="fdr_bh")[0] return pivots_truth, sel_covered, sel_length, naive_pvals, naive_covered, naive_length, active_var, BH_decisions, active_var
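`choose_lambda` picks the sqrt-lasso tuning parameter without knowing the noise scale; a common recipe for this is a Monte Carlo quantile of ||X^T eps||_inf / ||eps||_2 under standard Gaussian noise. The sketch below shows that generic recipe only; the actual `choose_lambda` used above may differ in its quantile and number of draws.
import numpy as np

def choose_lambda_mc(X, quantile=0.95, ndraw=5000):
    n, p = X.shape
    eps = np.random.standard_normal((n, ndraw))
    # scale-free statistic: max absolute correlation divided by the noise norm
    stats = np.fabs(X.T.dot(eps)).max(0) / np.sqrt((eps ** 2).sum(0))
    return np.quantile(stats, quantile)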
def solveit(atom, Z, W, U, linq, L, FISTA, coef_stop): p2 = copy(atom) p2.quadratic = rr.identity_quadratic(L, Z, 0, 0) d = atom.conjugate q = rr.identity_quadratic(1, Z, 0, 0) yield ac, Z - atom.proximal(q), d.proximal( q), 'testing duality of projections starting from atom %s ' % atom q = rr.identity_quadratic(L, Z, 0, 0) # use simple_problem.nonsmooth p2 = copy(atom) p2.quadratic = atom.quadratic + q problem = rr.simple_problem.nonsmooth(p2) solver = rr.FISTA(problem) solver.fit(tol=1.0e-14, FISTA=FISTA, coef_stop=coef_stop) yield ac, atom.proximal( q ), solver.composite.coefs, 'solving prox with simple_problem.nonsmooth with monotonicity %s ' % atom # use the solve method p2.coefs *= 0 p2.quadratic = atom.quadratic + q soln = p2.solve() yield ac, atom.proximal( q), soln, 'solving prox with solve method %s ' % atom loss = rr.quadratic.shift(-Z, coef=L) problem = rr.simple_problem(loss, atom) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12, FISTA=FISTA, coef_stop=coef_stop) yield ac, atom.proximal( q ), solver.composite.coefs, 'solving prox with simple_problem with monotonicity %s ' % atom dproblem2 = rr.dual_problem(loss.conjugate, rr.identity(loss.shape), atom.conjugate) dcoef2 = dproblem2.solve(coef_stop=coef_stop, tol=1.e-14) yield ac, atom.proximal( q ), dcoef2, 'solving prox with dual_problem with monotonicity %s ' % atom dproblem = rr.dual_problem.fromprimal(loss, atom) dcoef = dproblem.solve(coef_stop=coef_stop, tol=1.0e-14) yield ac, atom.proximal( q ), dcoef, 'solving prox with dual_problem.fromprimal with monotonicity %s ' % atom # write the loss in terms of a quadratic for the smooth loss and a smooth function... lossq = rr.quadratic.shift(-Z, coef=0.6 * L) lossq.quadratic = rr.identity_quadratic(0.4 * L, Z, 0, 0) problem = rr.simple_problem(lossq, atom) yield ac, atom.proximal(q), problem.solve( coef_stop=coef_stop, FISTA=FISTA, tol=1.0e-12 ), 'solving prox with simple_problem with monotonicity but loss has identity_quadratic %s ' % atom problem = rr.simple_problem.nonsmooth(p2) solver = rr.FISTA(problem) solver.fit(tol=1.0e-14, monotonicity_restart=False, coef_stop=coef_stop, FISTA=FISTA) yield ac, atom.proximal( q ), solver.composite.coefs, 'solving prox with simple_problem.nonsmooth with no monotonocity %s ' % atom loss = rr.quadratic.shift(-Z, coef=L) problem = rr.simple_problem(loss, atom) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12, monotonicity_restart=False, coef_stop=coef_stop, FISTA=FISTA) yield ac, atom.proximal( q ), solver.composite.coefs, 'solving prox with simple_problem %s no monotonicity_restart' % atom loss = rr.quadratic.shift(-Z, coef=L) problem = rr.separable_problem.singleton(atom, loss) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12, coef_stop=coef_stop, FISTA=FISTA) yield ac, atom.proximal( q ), solver.composite.coefs, 'solving atom prox with separable_atom.singleton %s ' % atom loss = rr.quadratic.shift(-Z, coef=L) problem = rr.container(loss, atom) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12, coef_stop=coef_stop, FISTA=FISTA) yield ac, atom.proximal( q ), solver.composite.coefs, 'solving atom prox with container %s ' % atom # write the loss in terms of a quadratic for the smooth loss and a smooth function... 
lossq = rr.quadratic.shift(-Z, coef=0.6 * L) lossq.quadratic = rr.identity_quadratic(0.4 * L, Z, 0, 0) problem = rr.container(lossq, atom) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12, FISTA=FISTA, coef_stop=coef_stop) yield ( ac, atom.proximal(q), problem.solve(tol=1.e-12, FISTA=FISTA, coef_stop=coef_stop), 'solving prox with container with monotonicity but loss has identity_quadratic %s ' % atom) loss = rr.quadratic.shift(-Z, coef=L) problem = rr.simple_problem(loss, d) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12, monotonicity_restart=False, coef_stop=coef_stop, FISTA=FISTA) # ac(d.proximal(q), solver.composite.coefs, 'solving dual prox with simple_problem no monotonocity %s ' % atom) yield (ac, d.proximal(q), problem.solve(tol=1.e-12, FISTA=FISTA, coef_stop=coef_stop, monotonicity_restart=False), 'solving dual prox with simple_problem no monotonocity %s ' % atom) problem = rr.container(d, loss) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12, coef_stop=coef_stop, FISTA=FISTA) yield ac, d.proximal( q ), solver.composite.coefs, 'solving dual prox with container %s ' % atom loss = rr.quadratic.shift(-Z, coef=L) problem = rr.separable_problem.singleton(d, loss) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12, coef_stop=coef_stop, FISTA=FISTA) yield ac, d.proximal( q ), solver.composite.coefs, 'solving atom prox with separable_atom.singleton %s ' % atom
def test_quadratic_for_smooth(): ''' this test is a check to ensure that the quadratic part of the smooth functions are being used in the proximal step ''' L = 0.45 W = np.random.standard_normal(40) Z = np.random.standard_normal(40) U = np.random.standard_normal(40) atomq = rr.identity_quadratic(0.4, U, W, 0) atom = rr.l1norm(40, quadratic=atomq, lagrange=0.12) # specifying in this way should be the same as if we put 0.5*L below loss = rr.quadratic_loss.shift(Z, coef=0.6*L) lq = rr.identity_quadratic(0.4*L, Z, 0, 0) loss.quadratic = lq ww = np.random.standard_normal(40) # specifying in this way should be the same as if we put 0.5*L below loss2 = rr.quadratic_loss.shift(Z, coef=L) yield all_close, loss2.objective(ww), loss.objective(ww), 'checking objective', None yield all_close, lq.objective(ww, 'func'), loss.nonsmooth_objective(ww), 'checking nonsmooth objective', None yield all_close, loss2.smooth_objective(ww, 'func'), 0.5 / 0.3 * loss.smooth_objective(ww, 'func'), 'checking smooth objective func', None yield all_close, loss2.smooth_objective(ww, 'grad'), 0.5 / 0.3 * loss.smooth_objective(ww, 'grad'), 'checking smooth objective grad', None problem = rr.container(loss, atom) solver = rr.FISTA(problem) solver.fit(tol=1.0e-12) problem3 = rr.simple_problem(loss, atom) solver3 = rr.FISTA(problem3) solver3.fit(tol=1.0e-12, coef_stop=True) loss4 = rr.quadratic_loss.shift(Z, coef=0.6*L) problem4 = rr.simple_problem(loss4, atom) problem4.quadratic = lq solver4 = rr.FISTA(problem4) solver4.fit(tol=1.0e-12) gg_soln = rr.gengrad(problem, L) loss6 = rr.quadratic_loss.shift(Z, coef=0.6*L) loss6.quadratic = lq + atom.quadratic atomcp = copy(atom) atomcp.quadratic = rr.identity_quadratic(0,0,0,0) problem6 = rr.dual_problem(loss6.conjugate, rr.identity(loss6.shape), atomcp.conjugate) problem6.lipschitz = L + atom.quadratic.coef dsoln2 = problem6.solve(coef_stop=True, tol=1.e-10, max_its=100) problem2 = rr.container(loss2, atom) solver2 = rr.FISTA(problem2) solver2.fit(tol=1.0e-12, coef_stop=True) q = rr.identity_quadratic(L, Z, 0, 0) yield all_close, problem.objective(ww), atom.nonsmooth_objective(ww) + q.objective(ww,'func'), '', None atom = rr.l1norm(40, quadratic=atomq, lagrange=0.12) aq = atom.solve(q) for p, msg in zip([solver3.composite.coefs, gg_soln, solver2.composite.coefs, dsoln2, solver.composite.coefs, solver4.composite.coefs], ['simple_problem with loss having no quadratic', 'gen grad', 'container with loss having no quadratic', 'dual problem with loss having a quadratic', 'container with loss having a quadratic', 'simple_problem having a quadratic']): yield all_close, aq, p, msg, None
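The test leans on the fact that splitting a quadratic's coefficient between the smooth loss (0.6 * L) and an attached identity_quadratic (0.4 * L) changes nothing, since both pieces are the same centered quadratic. A plain-numpy illustration, using the 1/2-coefficient convention for concreteness:
import numpy as np

L = 0.45
Z = np.random.standard_normal(40)
w = np.random.standard_normal(40)

full_obj = 0.5 * L * np.sum((w - Z) ** 2)
split_obj = 0.5 * 0.6 * L * np.sum((w - Z) ** 2) + 0.5 * 0.4 * L * np.sum((w - Z) ** 2)
assert np.allclose(full_obj, split_obj)

full_grad = L * (w - Z)
split_grad = 0.6 * L * (w - Z) + 0.4 * L * (w - Z)
assert np.allclose(full_grad, split_grad)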
def test_lasso(s=1, n=100, p=10): X, y, _, nonzero, sigma = instance(n=n, p=p, random_signs=True, s=s, sigma=1., rho=0) print('sigma', sigma) lam_frac = 1. randomization = laplace(loc=0, scale=1.) loss = randomized.gaussian_Xfixed(X, y) random_Z = randomization.rvs(p) epsilon = 1. lam = sigma * lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 10000)))).max(0)) random_Z = randomization.rvs(p) penalty = randomized.selective_l1norm_lan(p, lagrange=lam) #sampler1 = randomized.selective_sampler_MH_lan(loss, # random_Z, # epsilon, # randomization, # penalty) #loss_args = {'mean': np.zeros(n), # 'sigma': sigma, # 'linear_part':np.identity(y.shape[0]), # 'value': 0} #sampler1.setup_sampling(y, loss_args=loss_args) # data, opt_vars = sampler1.state # initial solution problem = rr.simple_problem(loss, penalty) random_term = rr.identity_quadratic(epsilon, 0, random_Z, 0) solve_args = {'tol': 1.e-10, 'min_its': 100, 'max_its': 500} initial_soln = problem.solve(random_term, **solve_args) initial_grad = loss.smooth_objective(initial_soln, mode='grad') betaE, cube = penalty.setup_sampling(initial_grad, initial_soln, random_Z, epsilon) data = y.copy() active = penalty.active_set if (np.sum(active) == 0): print('here') return [-1], [-1] inactive = ~active #betaE, cube = opt_vars ndata = data.shape[0]; nactive = betaE.shape[0]; ninactive = cube.shape[0] init_vec_state = np.zeros(ndata+nactive+ninactive) init_vec_state[:ndata] = data init_vec_state[ndata:(ndata+nactive)] = betaE init_vec_state[(ndata+nactive):] = cube def bootstrap_samples(y, P, R): nsample = 50 boot_samples = [] for _ in range(nsample): indices = np.random.choice(n, size=(n,), replace=True) y_star = y[indices] boot_samples.append(np.dot(P, y)+np.dot(R, y_star-y)) return boot_samples #boot_samples = bootstrap_samples(y) def move_data(vec_state, boot_samples, ndata=ndata, nactive=nactive, ninactive=ninactive, loss=loss): weights = [] betaE = vec_state[ndata:(ndata+nactive)] cube = vec_state[(ndata+nactive):] opt_vars = [betaE, cube] params, _, opt_vec = penalty.form_optimization_vector(opt_vars) # opt_vec=\epsilon(\beta 0)+u, u=\grad P(\beta), P penalty for i in range(len(boot_samples)): gradient = loss.gradient(boot_samples[i], params) weights.append(np.exp(-np.sum(np.abs(gradient + opt_vec)))) weights /= np.sum(weights) #m = max(weights) #idx = [i for i, j in enumerate(weights) if j == m][0] idx = np.nonzero(np.random.multinomial(1, weights, size=1)[0])[0][0] return boot_samples[idx] def full_projection(vec_state, penalty=penalty, ndata=ndata, nactive=nactive, ninactive=ninactive): data = vec_state[:ndata].copy() betaE = vec_state[ndata:(ndata+nactive)] cube = vec_state[(ndata+nactive):] signs = penalty.signs projected_betaE = betaE.copy() projected_cube = np.zeros_like(cube) for i in range(nactive): if (projected_betaE[i] * signs[i] < 0): projected_betaE[i] = 0 projected_cube = np.clip(cube, -1, 1) return np.concatenate((data, projected_betaE, projected_cube), 0) def full_gradient(vec_state, loss=loss, penalty=penalty, X=X, lam=lam, epsilon=epsilon, ndata=ndata, active=active, inactive=inactive): nactive = np.sum(active); ninactive = np.sum(inactive) data = vec_state[:ndata] betaE = vec_state[ndata:(ndata + nactive)] cube = vec_state[(ndata + nactive):] opt_vars = [betaE, cube] params, _, opt_vec = penalty.form_optimization_vector(opt_vars) # opt_vec=\epsilon(\beta 0)+u, u=\grad P(\beta), P penalty gradient = loss.gradient(data, params) hessian = loss.hessian() ndata = data.shape[0] nactive = betaE.shape[0] ninactive
= cube.shape[0] sign_vec = - np.sign(gradient + opt_vec) # sign(w), w=grad+\epsilon*beta+lambda*u B = hessian + epsilon * np.identity(nactive + ninactive) A = B[:, active] _gradient = np.zeros(ndata + nactive + ninactive) _gradient[:ndata] = 0 #- (data + np.dot(X, sign_vec)) _gradient[ndata:(ndata + nactive)] = np.dot(A.T, sign_vec) _gradient[(ndata + nactive):] = lam * sign_vec[inactive] return _gradient null, alt = pval(init_vec_state, full_gradient, full_projection, move_data, bootstrap_samples, X, y, nonzero, active) return null, alt
def test_lasso(s=3, n=1000, p=10, scale=True): X, y, true_beta, nonzero, sigma = instance(n=n, p=p, random_signs=True, s=s, sigma=1., rho=0, scale=scale) print('true beta', true_beta) lam_frac = 1. randomization = laplace(loc=0, scale=1.) loss = lasso_randomX.lasso_randomX(X, y) random_Z = randomization.rvs(p) epsilon = 1. lam = sigma * lam_frac * np.mean( np.fabs(np.dot(X.T, np.random.standard_normal((n, 10000)))).max(0)) if not scale: random_Z = np.sqrt(n) * random_Z lam = np.sqrt(n) * lam random_Z = randomization.rvs(p) penalty = randomized.selective_l1norm_lan(p, lagrange=lam) # initial solution problem = rr.simple_problem(loss, penalty) random_term = rr.identity_quadratic(epsilon, 0, random_Z, 0) solve_args = {'tol': 1.e-10, 'min_its': 100, 'max_its': 500} initial_soln = problem.solve(random_term, **solve_args) initial_grad = loss.smooth_objective(initial_soln, mode='grad') betaE, cube = penalty.setup_sampling(initial_grad, initial_soln, random_Z, epsilon) print(initial_soln) active = penalty.active_set inactive = ~active loss.fit_E(active) beta_unpenalized = loss._beta_unpenalized residual = y - np.dot(X[:, active], beta_unpenalized) # y-X_E\bar{\beta}^E N = np.dot(X[:, inactive].T, residual) # X_{-E}^T(y-X_E\bar{\beta}_E), null statistic data = np.concatenate((beta_unpenalized, N), axis=0) ndata = data.shape[0] nactive = betaE.shape[0] ninactive = cube.shape[0] # parametric covariance estimate XE_pinv = np.linalg.pinv(X[:, active]) mat = np.zeros((nactive + ninactive, n)) mat[:nactive, :] = XE_pinv mat[nactive:, :] = X[:, inactive].T.dot( np.identity(n) - X[:, active].dot(XE_pinv)) Sigma_full = mat.dot(mat.T) Sigma_full_inv = np.linalg.inv(Sigma_full) # non-parametric covariance estimate #Sigma_full = loss._Sigma_full #Sigma_full_inv = np.linalg.inv(Sigma_full) init_vec_state = np.zeros(ndata + nactive + ninactive) init_vec_state[:ndata] = data init_vec_state[ndata:(ndata + nactive)] = betaE init_vec_state[(ndata + nactive):] = cube def bootstrap_samples(data0, P, R, X=X): nsample = 200 boot_samples = [] X_E = X[:, active] for _ in range(nsample): indices = np.random.choice(n, size=(n, ), replace=True) data_star = np.zeros_like(data0) data_star[:nactive] = np.linalg.lstsq(X_E[indices, :], y[indices])[0] data_star[nactive:] = 0 boot_samples.append( np.dot(P, data0) + np.dot(R, data_star - data0)) return boot_samples # boot_samples = bootstrap_samples(y) def move_data(vec_state, boot_samples, ndata=ndata, nactive=nactive, ninactive=ninactive, loss=loss): weights = [] betaE = vec_state[ndata:(ndata + nactive)] cube = vec_state[(ndata + nactive):] opt_vars = [betaE, cube] params, _, opt_vec = penalty.form_optimization_vector( opt_vars ) # opt_vec=\epsilon(\beta 0)+u, u=\grad P(\beta), P penalty for i in range(len(boot_samples)): gradient = loss.gradient(boot_samples[i], params) weights.append(np.exp(-np.sum(np.abs(gradient + opt_vec)))) weights /= np.sum(weights) idx = np.nonzero(np.random.multinomial(1, weights, size=1)[0])[0][0] return boot_samples[idx] def full_projection(vec_state, penalty=penalty, ndata=ndata, nactive=nactive, ninactive=ninactive): data = vec_state[:ndata].copy() betaE = vec_state[ndata:(ndata + nactive)] cube = vec_state[(ndata + nactive):] signs = penalty.signs projected_betaE = betaE.copy() projected_cube = np.zeros_like(cube) for i in range(nactive): if (projected_betaE[i] * signs[i] < 0): projected_betaE[i] = 0 projected_cube = np.clip(cube, -1, 1) return np.concatenate((data, projected_betaE, projected_cube), 0) def full_gradient(vec_state, loss=loss,
penalty=penalty, Sigma_full_inv=Sigma_full_inv, lam=lam, epsilon=epsilon, ndata=ndata, active=active, inactive=inactive): nactive = np.sum(active) ninactive = np.sum(inactive) data = vec_state[:ndata] betaE = vec_state[ndata:(ndata + nactive)] cube = vec_state[(ndata + nactive):] opt_vars = [betaE, cube] params, _, opt_vec = penalty.form_optimization_vector( opt_vars ) # opt_vec=\epsilon(\beta 0)+u, u=\grad P(\beta), P penalty gradient = loss.gradient(data, params) hessian = loss.hessian ndata = data.shape[0] nactive = betaE.shape[0] ninactive = cube.shape[0] sign_vec = -np.sign( gradient + opt_vec) # sign(w), w=grad+\epsilon*beta+lambda*u A = hessian + epsilon * np.identity(nactive + ninactive) A_restricted = A[:, active] T = data[:nactive] _gradient = np.zeros(ndata + nactive + ninactive) # saturated model _gradient[:ndata] = -np.dot(Sigma_full_inv, data) _gradient[:nactive] -= hessian[:, active].T.dot(sign_vec) _gradient[nactive:(ndata)] -= sign_vec[inactive] # selected model #_gradient[:nactive] = - (np.dot(Sigma_T_inv, data[:nactive]) + np.dot(hessian[:, active].T, sign_vec)) _gradient[ndata:(ndata + nactive)] = np.dot(A_restricted.T, sign_vec) _gradient[(ndata + nactive):] = lam * sign_vec[inactive] return _gradient null, alt = pval(init_vec_state, full_gradient, full_projection, bootstrap_samples, move_data, Sigma_full[:nactive, :nactive], data, nonzero, active) return null, alt
def test_sqrt_lasso(n=500, p=20, s=3, signal=10, K=5, rho=0., randomizer='gaussian', randomizer_scale=1., scale1=0.1, scale2=0.2, lam_frac=1., bootstrap=False, condition_on_CVR=False, marginalize_subgrad=True, ndraw=10000, burnin=2000): print(n, p, s) if randomizer == 'laplace': randomizer = randomization.laplace((p, ), scale=randomizer_scale) elif randomizer == 'gaussian': randomizer = randomization.isotropic_gaussian((p, ), randomizer_scale) elif randomizer == 'logistic': randomizer = randomization.logistic((p, ), scale=randomizer_scale) X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho, signal=signal, sigma=1) lam_nonrandom = choose_lambda(X) lam_random = choose_lambda_with_randomization(X, randomizer) loss = l2norm_glm(X, y) #sqloss = rr.glm.gaussian(X, y) epsilon = 1. / n # non-randomized sqrt-Lasso, just looking how many vars it selects problem = rr.simple_problem(loss, rr.l1norm(p, lagrange=lam_nonrandom)) beta_hat = problem.solve() active_hat = beta_hat != 0 print("non-randomized sqrt-root Lasso active set", np.where(beta_hat)[0]) print("non-randomized sqrt-lasso", active_hat.sum()) # view 2 W = lam_frac * np.ones(p) * lam_random penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1. / np.sqrt(n)) M_est = glm_group_lasso(loss, epsilon, penalty, randomizer) mv = multiple_queries([M_est]) mv.solve() active_set = M_est._overall nactive = np.sum(active_set) if nactive == 0: return None nonzero = np.where(beta)[0] if set(nonzero).issubset(np.nonzero(active_set)[0]): active_set = np.nonzero(active_set)[0] true_vec = beta[active_set] if marginalize_subgrad == True: M_est.decompose_subgradient(conditioning_groups=np.zeros( p, dtype=bool), marginalizing_groups=np.ones(p, bool)) selected_features = np.zeros(p, np.bool) selected_features[active_set] = True unpenalized_mle = restricted_Mest(M_est.loss, selected_features) form_covariances = glm_nonparametric_bootstrap(n, n) boot_target, boot_target_observed = pairs_bootstrap_glm( M_est.loss, selected_features, inactive=None) target_info = boot_target cov_info = M_est.setup_sampler() target_cov, score_cov = form_covariances(target_info, cross_terms=[cov_info], nsample=M_est.nboot) opt_sample = M_est.sampler.sample(ndraw, burnin) pvalues = M_est.sampler.coefficient_pvalues( unpenalized_mle, target_cov, score_cov, parameter=np.zeros(selected_features.sum()), sample=opt_sample) intervals = M_est.sampler.confidence_intervals(unpenalized_mle, target_cov, score_cov, sample=opt_sample) true_vec = beta[M_est.selection_variable['variables']] L, U = intervals.T covered = np.zeros(nactive, np.bool) active_var = np.zeros(nactive, np.bool) for j in range(nactive): if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]): covered[j] = 1 active_var[j] = active_set[j] in nonzero return pvalues, covered, active_var
def solve(self): (loss, epsilon, penalty, randomization, solve_args) = (self.loss, self.epsilon, self.penalty, self.randomization, self.solve_args) # initial solution problem = rr.simple_problem(loss, penalty) self._randomZ = self.randomization.sample() self._random_term = rr.identity_quadratic(epsilon, 0, -self._randomZ, 0) self.initial_soln = problem.solve(self._random_term, **solve_args) # find the active groups and their direction vectors # as well as unpenalized groups groups = np.unique(penalty.groups) active_groups = np.zeros(len(groups), np.bool) unpenalized_groups = np.zeros(len(groups), np.bool) active_directions = [] active = np.zeros(loss.shape, np.bool) unpenalized = np.zeros(loss.shape, np.bool) initial_scalings = [] for i, g in enumerate(groups): group = penalty.groups == g active_groups[i] = (np.linalg.norm(self.initial_soln[group]) > 1.e-6 * penalty.weights[g]) and (penalty.weights[g] > 0) unpenalized_groups[i] = (penalty.weights[g] == 0) if active_groups[i]: active[group] = True z = np.zeros(active.shape, np.float) z[group] = self.initial_soln[group] / np.linalg.norm(self.initial_soln[group]) active_directions.append(z) initial_scalings.append(np.linalg.norm(self.initial_soln[group])) if unpenalized_groups[i]: unpenalized[group] = True # solve the restricted problem self.overall = active + unpenalized self.inactive = ~self.overall self.unpenalized = unpenalized self.active_directions = np.array(active_directions).T self.active_groups = np.array(active_groups, np.bool) self.unpenalized_groups = np.array(unpenalized_groups, np.bool) self.selection_variable = (self.active_groups, self.active_directions) # initial state for opt variables initial_subgrad = -(self.loss.smooth_objective(self.initial_soln, 'grad') + self._random_term.objective(self.initial_soln, 'grad') + epsilon * self.initial_soln) initial_subgrad = initial_subgrad[self.inactive] initial_unpenalized = self.initial_soln[self.unpenalized] self.observed_opt_state = np.concatenate([initial_scalings, initial_unpenalized, initial_subgrad], axis=0)
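The randomization enters only through rr.identity_quadratic(epsilon, 0, -self._randomZ, 0). Assuming regreg's convention that identity_quadratic(coef, center, linear, constant) evaluates to coef/2 * ||x - center||^2 + linear . x + constant, this term adds epsilon/2 * ||beta||^2 - omega . beta to the objective; the small check below illustrates that reading.
import numpy as np
import regreg.api as rr

epsilon, p = 0.5, 10
omega = np.random.standard_normal(p)     # stands in for the realized randomization
beta = np.random.standard_normal(p)

random_term = rr.identity_quadratic(epsilon, 0, -omega, 0)
by_hand = 0.5 * epsilon * np.sum(beta ** 2) - omega.dot(beta)
assert np.allclose(random_term.objective(beta, 'func'), by_hand)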