def test_l1prox_bound():
    '''
    this test verifies that the l1 prox in bound form can be solved
    by a primal/dual specification

    obviously, we don't want to solve the l1 prox this way,
    but it verifies that the specification is working correctly
    '''
    l1 = rr.l1norm(4, bound=2.)
    ww = np.random.standard_normal(4) * 2
    ab = l1.proximal(rr.identity_quadratic(0.5, ww, 0, 0))

    l1c = copy(l1)
    l1c.quadratic = rr.identity_quadratic(0.5, ww, None, 0.)
    a = rr.simple_problem.nonsmooth(l1c)
    solver = rr.FISTA(a)
    solver.fit(min_its=100)

    l1c = copy(l1)
    l1c.quadratic = rr.identity_quadratic(0.5, ww, None, 0.)
    a = rr.dual_problem.fromprimal(l1c)
    solver = rr.FISTA(a)
    solver.fit(min_its=100)
    ac = a.primal

    np.testing.assert_allclose(ac + 0.1, ab + 0.1, rtol=1.e-4)
def test_group_lasso_atom():
    ps = np.array([0]*5 + [3]*3)
    weights = {3: 2., 0: 2.3}
    lagrange = 1.5
    lipschitz = 0.2
    p = gl.group_lasso(ps, weights=weights, lagrange=lagrange)
    z = 30 * np.random.standard_normal(8)
    q = rr.identity_quadratic(lipschitz, z, 0, 0)
    x = p.solve(q)
    a = ml.mixed_lasso_lagrange_prox(z,
                                     lagrange,
                                     lipschitz,
                                     np.array([], np.intp),
                                     np.array([], np.intp),
                                     np.array([], np.intp),
                                     np.array([], np.intp),
                                     np.array([0, 0, 0, 0, 0, 1, 1, 1]),
                                     np.array([np.sqrt(5), 2]))

    # blockwise soft-thresholding of each group (the closed form of the group-lasso prox)
    result = np.zeros_like(a)
    result[:5] = z[:5] / np.linalg.norm(z[:5]) * max(np.linalg.norm(z[:5]) - weights[0] * lagrange / lipschitz, 0)
    result[5:] = z[5:] / np.linalg.norm(z[5:]) * max(np.linalg.norm(z[5:]) - weights[3] * lagrange / lipschitz, 0)

    lipschitz = 1.
    q = rr.identity_quadratic(lipschitz, z, 0, 0)
    x2 = p.solve(q)
    pc = p.conjugate
    a2 = pc.solve(q)
    np.testing.assert_allclose(z - a2, x2)
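# Illustrative helper (not library code, a sketch only): the group-lasso prox acts by
# block soft-thresholding each group, which is what the explicit `result` above spells out.
def _block_soft_threshold(z_group, threshold):
    norm = np.linalg.norm(z_group)
    if norm == 0:
        return np.zeros_like(z_group)
    return z_group / norm * max(norm - threshold, 0)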
def test_l1prox():
    '''
    this test verifies that the l1 prox in lagrange form can be solved
    by a primal/dual specification

    obviously, we don't want to solve the l1 prox this way,
    but it verifies that the specification is working correctly
    '''
    l1 = rr.l1norm(4, lagrange=0.3)
    ww = np.random.standard_normal(4) * 3
    ab = l1.proximal(rr.identity_quadratic(0.5, ww, 0, 0))

    l1c = copy(l1)
    l1c.quadratic = rr.identity_quadratic(0.5, ww, None, 0.)
    a = rr.simple_problem.nonsmooth(l1c)
    solver = rr.FISTA(a)
    solver.fit(tol=1.e-10)
    ad = a.coefs

    l1c = copy(l1)
    l1c.quadratic = rr.identity_quadratic(0.5, ww, None, 0.)
    a = rr.dual_problem.fromprimal(l1c)
    solver = rr.FISTA(a)
    solver.fit(tol=1.0e-14)
    ac = a.primal

    np.testing.assert_allclose(ac, ab, rtol=1.0e-4)
    np.testing.assert_allclose(ac, ad, rtol=1.0e-4)
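# A minimal illustrative sketch (not part of the test suite): under regreg's
# identity_quadratic(coef, center, linear, constant) convention of
# coef/2 * ||x - center||^2 + <linear, x> + constant, the Lagrange-form l1 prox
# above has the closed form of soft-thresholding at lagrange / coef, which is
# what the primal/dual solvers are being checked against.
def _l1_prox_closed_form(center, lagrange, coef):
    # argmin_x coef/2 * ||x - center||^2 + lagrange * ||x||_1
    return np.sign(center) * np.maximum(np.fabs(center) - lagrange / coef, 0)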
def test_simple_problem(self):
    tests = []
    atom, q, prox_center, L = self.atom, self.q, self.prox_center, self.L
    loss = self.loss

    problem = rr.simple_problem(loss, atom)
    solver = rr.FISTA(problem)
    solver.fit(tol=1.0e-12, FISTA=self.FISTA, coef_stop=self.coef_stop, min_its=100)

    tests.append((atom.proximal(q), solver.composite.coefs,
                  'solving prox with simple_problem with monotonicity\n %s' % str(self)))

    # write the loss in terms of a quadratic for the smooth loss and a smooth function...

    q = rr.identity_quadratic(L, prox_center, 0, 0)
    lossq = rr.quadratic.shift(prox_center.copy(), coef=0.6*L)
    lossq.quadratic = rr.identity_quadratic(0.4*L, prox_center.copy(), 0, 0)
    problem = rr.simple_problem(lossq, atom)

    tests.append((atom.proximal(q),
                  problem.solve(coef_stop=self.coef_stop, FISTA=self.FISTA, tol=1.0e-12),
                  'solving prox with simple_problem ' +
                  'with monotonicity but loss has identity_quadratic %s\n ' % str(self)))

    problem = rr.simple_problem(loss, atom)
    solver = rr.FISTA(problem)
    solver.fit(tol=1.0e-12, monotonicity_restart=False,
               coef_stop=self.coef_stop, FISTA=self.FISTA, min_its=100)

    tests.append((atom.proximal(q), solver.composite.coefs,
                  'solving prox with simple_problem no monotonicity_restart\n %s' % str(self)))

    d = atom.conjugate
    problem = rr.simple_problem(loss, d)
    solver = rr.FISTA(problem)
    solver.fit(tol=1.0e-12, monotonicity_restart=False,
               coef_stop=self.coef_stop, FISTA=self.FISTA, min_its=100)

    tests.append((d.proximal(q),
                  problem.solve(tol=1.e-12, FISTA=self.FISTA,
                                coef_stop=self.coef_stop, monotonicity_restart=False),
                  'solving dual prox with simple_problem no monotonicity\n %s ' % str(self)))

    if not self.interactive:
        for test in tests:
            yield (all_close,) + test + (self,)
    else:
        for test in tests:
            yield all_close(*((test + (self,))))
def test_adding_quadratic_lasso():

    X, y, beta, active, sigma = instance(n=300, p=200)
    Q = rr.identity_quadratic(0.01, 0, np.random.standard_normal(X.shape[1]), 0)

    L1 = lasso.gaussian(X, y, 20, quadratic=Q)
    beta1 = L1.fit(solve_args={'min_its': 500, 'tol': 1.e-12})
    G1 = X[:, L1.active].T.dot(X.dot(beta1) - y) + Q.objective(beta1, 'grad')[L1.active]
    np.testing.assert_allclose(G1 * np.sign(beta1[L1.active]), -20)

    lin = rr.identity_quadratic(0.0, 0, np.random.standard_normal(X.shape[1]), 0)
    L2 = lasso.gaussian(X, y, 20, quadratic=lin)
    beta2 = L2.fit(solve_args={'min_its': 500, 'tol': 1.e-12})
    G2 = X[:, L2.active].T.dot(X.dot(beta2) - y) + lin.objective(beta2, 'grad')[L2.active]
    np.testing.assert_allclose(G2 * np.sign(beta2[L2.active]), -20)
def test_conjugate_sqerror():
    """
    This verifies the conjugate class can compute the conjugate
    of a quadratic function.
    """

    ridge_coef = 0.4

    X = np.random.standard_normal((10, 4))
    Y = np.random.standard_normal(10)
    l = rr.squared_error(X, Y)

    q = rr.identity_quadratic(ridge_coef, 0, 0, 0)
    atom_conj = rr.conjugate(l, q, tol=1.e-12, min_its=100)
    w = np.random.standard_normal(4)
    u11, u12 = atom_conj.smooth_objective(w)

    # check that objective is half of squared error
    np.testing.assert_allclose(l.smooth_objective(w, mode='func'),
                               0.5 * np.linalg.norm(Y - np.dot(X, w))**2)
    np.testing.assert_allclose(atom_conj.atom.smooth_objective(w, mode='func'),
                               0.5 * np.linalg.norm(Y - np.dot(X, w))**2)

    XTX = np.dot(X.T, X)
    XTXi = np.linalg.pinv(XTX)

    quadratic_term = XTX + ridge_coef * np.identity(4)
    linear_term = np.dot(X.T, Y) + w
    b = u22 = np.linalg.solve(quadratic_term, linear_term)

    u21 = (w * u12).sum() - l.smooth_objective(u12, mode='func') - q.objective(u12, mode='func')
    np.testing.assert_allclose(u12, u22, rtol=1.0e-05)
    np.testing.assert_approx_equal(u11, u21)
def test_gaussian(n=100, p=20):

    y = np.random.standard_normal(n)
    X = np.random.standard_normal((n, p))

    lam_theor = np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000)))).max(0))
    Q = rr.identity_quadratic(0.01, 0, np.ones(p), 0)

    weights_with_zeros = 0.5 * lam_theor * np.ones(p)
    weights_with_zeros[:3] = 0.

    huge_weights = weights_with_zeros * 10000

    for q, fw in product([Q, None],
                         [0.5 * lam_theor, weights_with_zeros, huge_weights]):

        L = lasso.gaussian(X, y, fw, 1., quadratic=Q)
        L.fit()
        C = L.constraints

        sandwich = gaussian_sandwich_estimator(X, y)
        L = lasso.gaussian(X, y, fw, 1., quadratic=Q, covariance_estimator=sandwich)
        L.fit()
        C = L.constraints

        S = L.summary('onesided', compute_intervals=True)
        S = L.summary('twosided')

        nt.assert_raises(ValueError, L.summary, 'none')
        print(L.active)
        yield (np.testing.assert_array_less,
               np.dot(L.constraints.linear_part, L.onestep_estimator),
               L.constraints.offset)
def test_conjugate_l1norm():
    '''
    this test verifies that numerically computing the conjugate
    is essentially the same as using the smooth_conjugate of the atom
    '''

    q = rr.identity_quadratic(1.2, 0, 0, 0)
    l1 = rr.l1norm(4, lagrange=0.3)
    pen2 = copy(l1)
    pen2.set_quadratic(q)

    v1 = rr.smooth_conjugate(l1, q)
    v2 = rr.conjugate(l1, q, tol=1.e-12, min_its=100)
    v3 = rr.conjugate(pen2, None, tol=1.e-12, min_its=100)
    w = np.random.standard_normal(4)

    u11, u12 = v1.smooth_objective(w)
    u21, u22 = v2.smooth_objective(w)
    u31, u32 = v3.smooth_objective(w)

    np.testing.assert_approx_equal(u11, u21)
    np.testing.assert_allclose(u12, u22, rtol=1.0e-05)
    np.testing.assert_approx_equal(u11, u31)
    np.testing.assert_allclose(u12, u32, rtol=1.0e-05)

    v2.smooth_objective(w, mode='func')
    v2.smooth_objective(w, mode='grad')
    nt.assert_raises(ValueError, v2.smooth_objective, w, 'blah')
def test_sqrt_lasso(n=100, p=20):

    y = np.random.standard_normal(n)
    X = np.random.standard_normal((n, p))

    lam_theor = np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000)))).max(0)) / np.sqrt(n)
    Q = rr.identity_quadratic(0.01, 0, np.random.standard_normal(p) / 5., 0)

    weights_with_zeros = 0.5 * lam_theor * np.ones(p)
    weights_with_zeros[:3] = 0.

    huge_weights = weights_with_zeros * 10000

    for q, fw in product([None, Q],
                         [0.5 * lam_theor, weights_with_zeros, huge_weights]):

        L = lasso.sqrt_lasso(X, y, fw, quadratic=q, solve_args={'min_its': 300, 'tol': 1.e-12})
        L.fit(solve_args={'min_its': 300, 'tol': 1.e-12})
        C = L.constraints

        S = L.summary('onesided', compute_intervals=True)
        S = L.summary('twosided')

        yield (np.testing.assert_array_less,
               np.dot(L.constraints.linear_part, L.onestep_estimator),
               L.constraints.offset)
def __iter__(self):
    for offset, FISTA, coef_stop, L, q, groups in itertools.product(self.offset_choices,
                                                                    self.FISTA_choices,
                                                                    self.coef_stop_choices,
                                                                    self.L_choices,
                                                                    self.quadratic_choices,
                                                                    self.group_choices):
        self.FISTA = FISTA
        self.coef_stop = coef_stop
        self.L = L

        if self.mode == 'lagrange':
            atom = self.klass(groups, lagrange=self.lagrange)
        else:
            atom = self.klass(groups, bound=self.bound)

        if q:
            atom.quadratic = rr.identity_quadratic(0, 0, np.random.standard_normal(atom.shape) * 0.02)
        if offset:
            atom.offset = 0.02 * np.random.standard_normal(atom.shape)

        solver = Solver(atom, interactive=self.interactive,
                        coef_stop=coef_stop, FISTA=FISTA, L=L)

        yield solver
def test_proximal_maps():
    bound = 0.14
    lagrange = 0.13
    shape = 20

    Z = np.random.standard_normal(shape) * 4
    W = 0.02 * np.random.standard_normal(shape)
    U = 0.02 * np.random.standard_normal(shape)
    linq = rr.identity_quadratic(0, 0, W, 0)

    for L, atom, q, offset, FISTA, coef_stop in itertools.product(
            [0.5, 1, 0.1],
            [A.l1norm, A.supnorm, A.l2norm, A.positive_part, A.constrained_max],
            [None, linq],
            [None, U],
            [False, True],
            [True, False]):

        p = atom(shape, lagrange=lagrange, quadratic=q, offset=offset)
        d = p.conjugate
        yield (ac, p.lagrange_prox(Z, lipschitz=L),
               Z - d.bound_prox(Z * L, lipschitz=1.0 / L) / L,
               'testing lagrange_prox and bound_prox starting from atom %s ' % atom)

        # some arguments of the constructor
        nt.assert_raises(AttributeError, setattr, p, 'bound', 4.0)
        nt.assert_raises(AttributeError, setattr, d, 'lagrange', 4.0)
        nt.assert_raises(AttributeError, setattr, p, 'bound', 4.0)
        nt.assert_raises(AttributeError, setattr, d, 'lagrange', 4.0)

        for t in solveit(p, Z, W, U, linq, L, FISTA, coef_stop):
            yield t

        b = atom(shape, bound=bound, quadratic=q, offset=offset)

        for t in solveit(b, Z, W, U, linq, L, FISTA, coef_stop):
            yield t

    lagrange = 0.1

    for L, atom, q, offset, FISTA, coef_stop in itertools.product(
            [0.5, 1, 0.1],
            sorted(A.nonpaired_atoms),
            [None, linq],
            [None, U],
            [False, True],
            [False, True]):

        p = atom(shape, lagrange=lagrange, quadratic=q, offset=offset)
        d = p.conjugate
        yield (ac, p.lagrange_prox(Z, lipschitz=L),
               Z - d.bound_prox(Z * L, lipschitz=1.0 / L) / L,
               'testing lagrange_prox and bound_prox starting from atom %s ' % atom)

        # some arguments of the constructor
        nt.assert_raises(AttributeError, setattr, p, 'bound', 4.0)
        nt.assert_raises(AttributeError, setattr, d, 'lagrange', 4.0)
        nt.assert_raises(AttributeError, setattr, p, 'bound', 4.0)
        nt.assert_raises(AttributeError, setattr, d, 'lagrange', 4.0)

        for t in solveit(p, Z, W, U, linq, L, FISTA, coef_stop):
            yield t
def step_valid(self, max_trials=10):
    """
    Try and move Y_valid by accept/reject, stopping after `max_trials`.
    """
    X, L, mults = self.X, self.L, self.mults
    n, p = X.shape

    count = 0
    Q_old = self.Q_valid
    while True:
        count += 1
        self.Q_valid = self.Q_inter + identity_quadratic(0, 0,
                                                         self.randomization.rvs(size=self.X.shape[1]) * self.scale_valid,
                                                         0)
        if len(self.mults) > 0:
            proposal_value = self.choose_lambda(self.Y, shift_size=0)
            if proposal_value[0] in self.accept_values:
                break
        else:
            break
        if count >= max_trials:
            self.Q_valid = Q_old
            break
def __init__(self, loss,
             linear_randomization,
             quadratic_coef,
             randomization,
             penalty,
             solve_args={'tol': 1.e-10, 'min_its': 100, 'max_its': 500}):

    (self.loss,
     self.linear_randomization,
     self.randomization,
     self.quadratic_coef) = (loss,
                             linear_randomization,
                             randomization,
                             quadratic_coef)

    # initialize optimization problem

    self.penalty = penalty
    self.problem = rr.simple_problem(loss, penalty)

    random_term = rr.identity_quadratic(quadratic_coef, 0, self.linear_randomization, 0)

    self.initial_soln = self.problem.solve(random_term, **solve_args)
    self.initial_grad = self.loss.smooth_objective(self.initial_soln, mode='grad')
    self.opt_vars = self.penalty.setup_sampling(self.initial_grad,
                                                self.initial_soln,
                                                self.linear_randomization,
                                                self.quadratic_coef)
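# For orientation (a sketch, not library code): the identity_quadratic(c, x0, v, k)
# objects used throughout these files represent c/2 * ||beta - x0||^2 + <v, beta> + k,
# so `random_term` above adds the ridge term quadratic_coef/2 * ||beta||^2 plus the
# linear randomization <linear_randomization, beta> to the penalized problem being solved.
def _identity_quadratic_value(coef, center, linear, constant, beta):
    # assumes array-valued center and linear terms (0 and None also appear in the tests)
    return coef / 2. * np.sum((beta - center)**2) + np.sum(linear * beta) + constant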
def test_gaussian(n=100, p=20):

    y = np.random.standard_normal(n)
    X = np.random.standard_normal((n, p))

    lam_theor = np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000)))).max(0))
    Q = identity_quadratic(0.01, 0, np.ones(p), 0)

    weights_with_zeros = 0.1 * np.ones(p)
    weights_with_zeros[:3] = 0.

    for q, fw in product([Q, None],
                         [0.5 * lam_theor, weights_with_zeros]):
        L = lasso.gaussian(X, y, fw, 1., quadratic=Q)
        L.fit()
        C = L.constraints
        I = L.intervals
        S = L.summary('onesided')
        S = L.summary('twosided')

        yield (np.testing.assert_array_less,
               np.dot(L.constraints.linear_part, L._onestep),
               L.constraints.offset)
def test_lasso():
    '''
    this test verifies that the l1 prox can be solved
    by a primal/dual specification

    obviously, we don't want to solve the l1 prox this way,
    but it verifies that the specification is working correctly
    '''

    l1 = rr.l1norm(4, lagrange=2.)
    l1.quadratic = rr.identity_quadratic(0.5, 0, None, 0.)

    X = np.random.standard_normal((10, 4))
    Y = np.random.standard_normal(10) + 3
    loss = rr.quadratic.affine(X, -Y, coef=0.5)

    p2 = rr.separable_problem.singleton(l1, loss)
    solver2 = rr.FISTA(p2)
    solver2.fit(tol=1.0e-14, min_its=100)

    f = p2.objective
    ans = scipy.optimize.fmin_powell(f, np.zeros(4), ftol=1.0e-12, xtol=1.e-10)

    print(f(solver2.composite.coefs), f(ans))
    np.testing.assert_allclose(ans + 0.1, solver2.composite.coefs + 0.1, rtol=1.e-3)
def __iter__(self):
    for offset, FISTA, coef_stop, L, q, w in itertools.product(self.offset_choices,
                                                               self.FISTA_choices,
                                                               self.coef_stop_choices,
                                                               self.L_choices,
                                                               self.quadratic_choices,
                                                               self.weight_choices):
        self.FISTA = FISTA
        self.coef_stop = coef_stop
        self.L = L

        if self.mode == 'lagrange':
            atom = self.klass(w, lagrange=self.lagrange)
        else:
            atom = self.klass(w, bound=self.bound)

        atom.use_sklearn = self.use_sklearn and have_sklearn_iso  # test out both prox maps if available

        if q:
            atom.quadratic = rr.identity_quadratic(0, 0, np.random.standard_normal(atom.shape) * 0.02)
        if offset:
            atom.offset = 0.02 * np.random.standard_normal(atom.shape)

        solver = Solver(atom, interactive=self.interactive,
                        coef_stop=coef_stop, FISTA=FISTA, L=L)

        yield solver
def test_proximal_method():

    X = np.random.standard_normal((100, 50))
    X[:, :7] *= 5

    qX = identity_quadratic(1, X, 0, 0)

    P = FM.nuclear_norm(X.shape, lagrange=1)
    RP = todense(P.proximal(qX))

    B = FM.nuclear_norm(X.shape, bound=1)
    RB = todense(B.proximal(qX))

    BO = FM.operator_norm(X.shape, bound=1)
    PO = FM.operator_norm(X.shape, lagrange=1)
    RPO = todense(PO.proximal(qX))
    RBO = todense(BO.proximal(qX))

    D = np.linalg.svd(X, full_matrices=0)[1]

    lD = np.linalg.svd(RP, full_matrices=0)[1]
    lagrange_rank = (lD > 1.e-10).sum()
    all_close(lD[:lagrange_rank] + P.lagrange, D[:lagrange_rank], 'proximal method lagrange', None)

    bD = np.linalg.svd(RB, full_matrices=0)[1]
    bound_rank = (bD > 1.e-10).sum()
    all_close(bD[:bound_rank], projl1(D, B.bound)[:bound_rank], 'proximal method bound', None)

    nt.assert_true(np.linalg.norm(RPO + RB - X) / np.linalg.norm(X) < 0.01)
    nt.assert_true(np.linalg.norm(RBO + RP - X) / np.linalg.norm(X) < 0.01)
def test_sqrt_lasso_pvals(n=100, p=200, s=7, sigma=5, rho=0.3, snr=7.):

    counter = 0
    while True:
        counter += 1

        X, y, beta, active, sigma = instance(n=n, p=p, s=s, sigma=sigma, rho=rho, snr=snr)

        lam_theor = np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000)))).max(0)) / np.sqrt(n)
        Q = rr.identity_quadratic(0.01, 0, np.ones(p), 0)

        weights_with_zeros = 0.7 * lam_theor * np.ones(p)
        weights_with_zeros[:3] = 0.

        L = lasso.sqrt_lasso(X, y, weights_with_zeros)
        L.fit()

        v = {1: 'twosided', 0: 'onesided'}[counter % 2]
        if set(active).issubset(L.active):
            S = L.summary(v)
            return [p for p, v in zip(S['pval'], S['variable']) if v not in active]
def __iter__(self):
    for offset, FISTA, coef_stop, L, q in itertools.product(self.offset_choices,
                                                            self.FISTA_choices,
                                                            self.coef_stop_choices,
                                                            self.L_choices,
                                                            self.quadratic_choices):
        self.FISTA = FISTA
        self.coef_stop = coef_stop
        self.L = L

        if self.mode == 'lagrange':
            atom = self.klass(self.shape, lagrange=self.lagrange)
        else:
            atom = self.klass(self.shape, bound=self.bound)

        if q:
            atom.quadratic = rr.identity_quadratic(0, 0, np.random.standard_normal(atom.shape) * 0.02)
        if offset:
            atom.offset = 0.02 * np.random.standard_normal(atom.shape)

        solver = Solver(atom, interactive=self.interactive,
                        coef_stop=coef_stop, FISTA=FISTA, L=L)

        # make sure certain lines of code are tested
        assert(atom == atom)
        atom.latexify(), atom.dual, atom.conjugate

        yield solver
def test_proximal_maps():

    X = np.random.standard_normal((100, 50))
    X[:, :7] *= 5

    P = FM.nuclear_norm(X.shape, lagrange=1)
    RP = todense(P.lagrange_prox(X))

    B = FM.nuclear_norm(X.shape, bound=1)
    RB = todense(B.bound_prox(X))

    BO = FM.operator_norm(X.shape, bound=1)
    PO = FM.operator_norm(X.shape, lagrange=1)
    RPO = todense(PO.lagrange_prox(X))
    RBO = todense(BO.bound_prox(X))

    D = np.linalg.svd(X, full_matrices=0)[1]

    lD = np.linalg.svd(RP, full_matrices=0)[1]
    lagrange_rank = (lD > 1.e-10).sum()
    all_close(lD[:lagrange_rank] + P.lagrange, D[:lagrange_rank], 'proximal lagrange', None)

    bD = np.linalg.svd(RB, full_matrices=0)[1]
    bound_rank = (bD > 1.e-10).sum()
    all_close(bD[:bound_rank], projl1(D, B.bound)[:bound_rank], 'proximal bound', None)

    nt.assert_true(np.linalg.norm(RPO + RB - X) / np.linalg.norm(X) < 0.01)
    nt.assert_true(np.linalg.norm(RBO + RP - X) / np.linalg.norm(X) < 0.01)

    # running code to ensure it is tested
    P.conjugate
    P.quadratic = identity_quadratic(1, 0, 0, 0)
    P.conjugate

    BO.conjugate
    BO.quadratic = identity_quadratic(1, 0, 0, 0)
    BO.conjugate

    B.conjugate
    B.quadratic = identity_quadratic(1, 0, 0, 0)
    B.conjugate

    PO.conjugate
    PO.quadratic = identity_quadratic(1, 0, 0, 0)
    PO.conjugate
def test_simple():
    Z = np.random.standard_normal(100) * 4
    p = rr.l1norm(100, lagrange=0.13)
    L = 0.14

    loss = rr.quadratic.shift(-Z, coef=L)
    problem = rr.simple_problem(loss, p)
    solver = rr.FISTA(problem)
    solver.fit(tol=1.0e-10, debug=True)
    simple_coef = solver.composite.coefs
    prox_coef = p.proximal(rr.identity_quadratic(L, Z, 0, 0))

    p2 = rr.l1norm(100, lagrange=0.13)
    p2 = copy(p)
    p2.quadratic = rr.identity_quadratic(L, Z, 0, 0)
    problem = rr.simple_problem.nonsmooth(p2)
    solver = rr.FISTA(problem)
    solver.fit(tol=1.0e-14, debug=True)
    simple_nonsmooth_coef = solver.composite.coefs

    p = rr.l1norm(100, lagrange=0.13)
    p.quadratic = rr.identity_quadratic(L, Z, 0, 0)
    problem = rr.simple_problem.nonsmooth(p)
    simple_nonsmooth_gengrad = gengrad(problem, L, tol=1.0e-10)

    p = rr.l1norm(100, lagrange=0.13)
    problem = rr.separable_problem.singleton(p, loss)
    solver = rr.FISTA(problem)
    solver.fit(tol=1.0e-10)
    separable_coef = solver.composite.coefs

    loss2 = rr.quadratic.shift(-Z, coef=0.6*L)
    loss2.quadratic = rr.identity_quadratic(0.4*L, Z, 0, 0)
    p.coefs *= 0
    problem2 = rr.simple_problem(loss2, p)
    loss2_coefs = problem2.solve(coef_stop=True)
    solver2 = rr.FISTA(problem2)
    solver2.fit(tol=1.0e-10, debug=True, coef_stop=True)

    yield ac, prox_coef, simple_nonsmooth_gengrad, 'prox to nonsmooth gengrad'
    yield ac, prox_coef, separable_coef, 'prox to separable'
    yield ac, prox_coef, simple_nonsmooth_coef, 'prox to simple_nonsmooth'
    yield ac, prox_coef, simple_coef, 'prox to simple'
    yield ac, prox_coef, loss2_coefs, 'simple where loss has quadratic 1'
    yield ac, prox_coef, solver2.composite.coefs, 'simple where loss has quadratic 2'
def test_conjugate_l1norm():
    '''
    this test verifies that numerically computing the conjugate
    is essentially the same as using the smooth_conjugate of the atom
    '''

    l1 = rr.l1norm(4, lagrange=0.3)
    v1 = rr.smooth_conjugate(l1, rr.identity_quadratic(0.3, None, None, 0))
    v2 = rr.conjugate(l1, rr.identity_quadratic(0.3, None, None, 0), tol=1.e-12)
    w = np.random.standard_normal(4)
    u11, u12 = v1.smooth_objective(w)
    u21, u22 = v2.smooth_objective(w)
    np.testing.assert_approx_equal(u11, u21)
    np.testing.assert_allclose(u12, u22, rtol=1.0e-05)
def test_nonnegative_positive_part(debug=False):
    """
    This test verifies that using a nonnegative constraint with a linear term,
    with some unpenalized terms, yields the same result as using separable with
    constrained_positive_part and nonnegative.
    """
    import numpy as np
    import regreg.api as rr
    import regreg.atoms as rra

    # N - number of data points
    # P - number of columns in design == number of betas
    N, P = 40, 30
    # an arbitrary positive offset for data and design
    offset = 2

    # data
    Y = np.random.normal(size=(N,)) + offset

    # design - with ones as last column
    X = np.ones((N, P))
    X[:, :-1] = np.random.normal(size=(N, P - 1)) + offset

    # coef for loss
    coef = 0.5
    # lagrange for penalty
    lagrange = .1

    # Loss function (squared difference between fitted and actual data)
    loss = rr.quadratic.affine(X, -Y, coef=coef)

    # Penalty using nonnegative, leave the last 5 unpenalized but
    # nonnegative
    weights = np.ones(P) * lagrange
    weights[-5:] = 0
    linq = rr.identity_quadratic(0, 0, weights, 0)
    penalty = rr.nonnegative(P, quadratic=linq)

    # Solution
    composite_form = rr.separable_problem.singleton(penalty, loss)
    solver = rr.FISTA(composite_form)
    solver.debug = debug
    solver.fit(tol=1.0e-12, min_its=200)
    coefs = solver.composite.coefs

    # using the separable penalty, only penalize the first
    # 25 coefficients with constrained_positive_part
    penalties_s = [rr.constrained_positive_part(25, lagrange=lagrange),
                   rr.nonnegative(5)]
    groups_s = [slice(0, 25), slice(25, 30)]
    penalty_s = rr.separable((P,), penalties_s, groups_s)
    composite_form_s = rr.separable_problem.singleton(penalty_s, loss)
    solver_s = rr.FISTA(composite_form_s)
    solver_s.debug = debug
    solver_s.fit(tol=1.0e-12, min_its=200)
    coefs_s = solver_s.composite.coefs

    nt.assert_true(np.linalg.norm(coefs - coefs_s) / np.linalg.norm(coefs) < 1.0e-02)
def randomize(self):
    """
    Carry out the randomization, finding the value of lambda
    as well as the selected variables and signs.

    Initializes the attributes: [Y_inter, Y_valid, Y_select].
    """
    n = self.Y.shape[0]

    # intermediate between
    # CV and model selection
    # and the actual data
    self.Q_inter = identity_quadratic(0, 0,
                                      self.randomization.rvs(size=self.X.shape[1]) * self.scale_inter,
                                      0)
    self.Q_valid = self.Q_inter + identity_quadratic(0, 0,
                                                     self.randomization.rvs(size=self.X.shape[1]) * self.scale_valid,
                                                     0)
    self.Q_select = self.Q_inter + identity_quadratic(0, 0,
                                                      self.randomization.rvs(size=self.X.shape[1]) * self.scale_select,
                                                      0)
def test_lasso_separable():
    """
    This test verifies that the specification of a separable
    penalty yields the same results as having two linear_atoms
    with selector matrices.

    The penalty here is a lasso, i.e. l1 penalty.
    """

    X = np.random.standard_normal((100, 20))
    Y = np.random.standard_normal((100,)) + np.dot(X, np.random.standard_normal(20))

    penalty1 = rr.l1norm(10, lagrange=1.2)
    penalty2 = rr.l1norm(10, lagrange=1.2)
    penalty = rr.separable((20,), [penalty1, penalty2],
                           [slice(0, 10), slice(10, 20)],
                           test_for_overlap=True)

    # ensure code is tested

    print(penalty1.latexify())

    print(penalty.latexify())
    print(penalty.conjugate)
    print(penalty.dual)
    print(penalty.seminorm(np.ones(penalty.shape)))
    print(penalty.constraint(np.ones(penalty.shape), bound=2.))

    pencopy = copy(penalty)
    pencopy.set_quadratic(rr.identity_quadratic(1, 0, 0, 0))
    pencopy.conjugate

    # solve using separable

    loss = rr.quadratic.affine(X, -Y, coef=0.5)
    problem = rr.separable_problem.fromatom(penalty, loss)
    solver = rr.FISTA(problem)
    solver.fit(min_its=200, tol=1.0e-12)
    coefs = solver.composite.coefs

    # solve using the usual composite

    penalty_all = rr.l1norm(20, lagrange=1.2)
    problem_all = rr.container(loss, penalty_all)
    solver_all = rr.FISTA(problem_all)
    solver_all.fit(min_its=100, tol=1.0e-12)
    coefs_all = solver_all.composite.coefs

    # solve using the selectors

    penalty_s = [rr.linear_atom(p, rr.selector(g, (20,)))
                 for p, g in zip(penalty.atoms, penalty.groups)]
    problem_s = rr.container(loss, *penalty_s)
    solver_s = rr.FISTA(problem_s)
    solver_s.fit(min_its=500, tol=1.0e-12)
    coefs_s = solver_s.composite.coefs

    np.testing.assert_almost_equal(coefs, coefs_all)
    np.testing.assert_almost_equal(coefs, coefs_s)
def test_duality_of_projections(self):
    if self.atom.quadratic == rr.identity_quadratic(0, 0, 0, 0) or self.atom.quadratic is None:
        tests = []

        d = self.atom.conjugate
        q = rr.identity_quadratic(1, self.prox_center, 0, 0)

        tests.append((self.prox_center - self.atom.proximal(q), d.proximal(q),
                      'testing duality of projections starting from atom\n %s ' % str(self)))

        if hasattr(self.atom, 'check_subgradient') and self.atom.offset is None:
            # check subgradient condition
            v1, v2 = self.atom.check_subgradient(self.atom, self.prox_center)
            tests.append((v1, v2, 'checking subgradient condition\n %s' % str(self)))

        if not self.interactive:
            for test in tests:
                yield (all_close,) + test + (self,)
        else:
            for test in tests:
                yield all_close(*((test + (self,))))
def test_quadratic():
    l = rr.quadratic(5, coef=3., offset=np.arange(5))
    l.quadratic = rr.identity_quadratic(1, np.ones(5), 2 * np.ones(5), 3.)

    c1 = l.get_conjugate(as_quadratic=True)

    q1 = rr.identity_quadratic(3, -np.arange(5), 0, 0)
    q2 = q1 + l.quadratic
    c2 = rr.zero(5, quadratic=q2.collapsed()).conjugate

    ww = np.random.standard_normal(5)
    np.testing.assert_almost_equal(c2.smooth_objective(ww, 'grad'),
                                   c1.smooth_objective(ww, 'grad'))
    np.testing.assert_almost_equal(c2.objective(ww), c1.objective(ww))
    np.testing.assert_almost_equal(c2.smooth_objective(ww, 'func') + c2.nonsmooth_objective(ww),
                                   c1.smooth_objective(ww, 'func') + c1.nonsmooth_objective(ww))
def test_container(self):
    tests = []
    atom, q, prox_center, L = self.atom, self.q, self.prox_center, self.L
    loss = self.loss

    problem = rr.container(loss, atom)
    solver = rr.FISTA(problem)
    solver.fit(tol=1.0e-12, coef_stop=self.coef_stop, FISTA=self.FISTA)

    tests.append((atom.proximal(q), solver.composite.coefs,
                  'solving atom prox with container\n %s ' % str(self)))

    # write the loss in terms of a quadratic for the smooth loss and a smooth function...

    q = rr.identity_quadratic(L, prox_center, 0, 0)
    lossq = rr.quadratic.shift(prox_center.copy(), coef=0.6*L)
    lossq.quadratic = rr.identity_quadratic(0.4*L, prox_center.copy(), 0, 0)
    problem = rr.container(lossq, atom)
    solver = rr.FISTA(problem)
    solver.fit(tol=1.0e-12, FISTA=self.FISTA, coef_stop=self.coef_stop)

    tests.append((atom.proximal(q),
                  problem.solve(tol=1.e-12, FISTA=self.FISTA, coef_stop=self.coef_stop),
                  'solving prox with container with monotonicity ' +
                  'but loss has identity_quadratic\n %s ' % str(self)))

    d = atom.conjugate
    problem = rr.container(d, loss)
    solver = rr.FISTA(problem)
    solver.fit(tol=1.0e-12, coef_stop=self.coef_stop, FISTA=self.FISTA)

    tests.append((d.proximal(q), solver.composite.coefs,
                  'solving dual prox with container\n %s ' % str(self)))

    if not self.interactive:
        for test in tests:
            yield (all_close,) + test + (self,)
    else:
        for test in tests:
            yield all_close(*((test + (self,))))
def test_conjugate_sqerror():

    X = np.random.standard_normal((10, 4))
    Y = np.random.standard_normal(10)
    l = rr.quadratic.affine(X, -Y, coef=0.5)
    v = rr.conjugate(l, rr.identity_quadratic(0.3, None, None, 0), tol=1.e-12)
    w = np.random.standard_normal(4)
    u11, u12 = v.smooth_objective(w)

    XTX = np.dot(X.T, X)
    b = u22 = np.linalg.solve(XTX + 0.3 * np.identity(4), np.dot(X.T, Y) + w)
    u21 = (- np.dot(b.T, np.dot(XTX + 0.3 * np.identity(4), b)) / 2.
           + (w * b).sum() + (np.dot(X.T, Y) * b).sum()
           - np.linalg.norm(Y)**2 / 2.)
    np.testing.assert_approx_equal(u11, u21)
    np.testing.assert_allclose(u12, u22, rtol=1.0e-05)
def test_gengrad_blocknorms():
    Z = np.random.standard_normal((10, 10)) * 4
    p = rr.l1_l2((10, 10), lagrange=0.13)
    dual = p.conjugate
    L = 0.23

    loss = rr.quadratic_loss.shift(Z, coef=L)
    problem = rr.simple_problem(loss, p)
    solver = rr.FISTA(problem)
    solver.fit(tol=1.0e-10, debug=True)
    simple_coef = solver.composite.coefs
    q = rr.identity_quadratic(L, Z, 0, 0)
    prox_coef = p.proximal(q)

    p2 = copy(p)
    p2.quadratic = rr.identity_quadratic(L, Z, 0, 0)
    problem = rr.simple_problem.nonsmooth(p2)
    solver = rr.FISTA(problem)
    solver.fit(tol=1.0e-14, debug=True)
    simple_nonsmooth_coef = solver.composite.coefs

    p = rr.l1_l2((10, 10), lagrange=0.13)
    p.quadratic = rr.identity_quadratic(L, Z, 0, 0)
    problem = rr.simple_problem.nonsmooth(p)
    simple_nonsmooth_gengrad = rr.gengrad(problem, L, tol=1.0e-10)

    p = rr.l1_l2((10, 10), lagrange=0.13)
    problem = rr.separable_problem.singleton(p, loss)
    solver = rr.FISTA(problem)
    solver.fit(tol=1.0e-10)
    separable_coef = solver.composite.coefs

    yield (all_close, prox_coef, simple_coef, 'prox to simple', None)
    yield (all_close, prox_coef, simple_nonsmooth_gengrad, 'prox to nonsmooth gengrad', None)
    yield (all_close, prox_coef, separable_coef, 'prox to separable', None)
    yield (all_close, prox_coef, simple_nonsmooth_coef, 'prox to simple_nonsmooth', None)
def _solve_randomized_problem(self,
                              perturb=None,
                              solve_args={'tol': 1.e-12, 'min_its': 50}):

    p = self.nfeature

    # take a new perturbation if supplied
    if perturb is not None:
        self._initial_omega = perturb
    if self._initial_omega is None:
        self._initial_omega = self.randomizer.sample()

    quad = rr.identity_quadratic(self.ridge_term, 0, -self._initial_omega, 0)
    problem = rr.simple_problem(self.loglike, self.penalty)
    initial_soln = problem.solve(quad, **solve_args)
    initial_subgrad = -(self.loglike.smooth_objective(initial_soln, 'grad') +
                        quad.objective(initial_soln, 'grad'))

    return initial_soln, initial_subgrad
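# A hedged note on the identity used above (sketch, not library code): stationarity
# of the randomized problem gives
#     grad_loss(beta_hat) + ridge_term * beta_hat - omega + u = 0,
# with u a subgradient of the penalty, so
#     u = -(grad_loss(beta_hat) + grad_quad(beta_hat)),
# which is exactly `initial_subgrad`. One possible check, assuming a weighted
# l1 penalty with weight vector `lam`:
def _check_subgrad_in_l1_ball(subgrad, soln, lam, tol=1.e-5):
    # on the active set the subgradient equals lam * sign(soln);
    # off the active set it lies in [-lam, lam]
    active = soln != 0
    ok_active = np.allclose(subgrad[active], (lam * np.sign(soln))[active], atol=tol)
    ok_inactive = np.all(np.fabs(subgrad[~active]) <= np.atleast_1d(lam * np.ones_like(soln))[~active] + tol)
    return ok_active and ok_inactive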
def test_proximal_maps():
    shape = (5, 4)

    bound = 0.14
    lagrange = 0.13

    Z = np.random.standard_normal(shape) * 2
    W = 0.02 * np.random.standard_normal(shape)
    U = 0.02 * np.random.standard_normal(shape)
    linq = rr.identity_quadratic(0, 0, W, 0)

    basis = np.linalg.svd(np.random.standard_normal((4, 20)), full_matrices=0)[2]

    for L, atom, q, offset, FISTA, coef_stop in itertools.product(
            [0.5, 1, 0.1],
            sorted(S.conjugate_svd_pairs.keys()),
            [None, linq],
            [None, U],
            [False, True],
            [False, True]):

        p = atom(shape, quadratic=q, lagrange=lagrange, offset=offset)
        d = p.conjugate
        yield (ac, p.lagrange_prox(Z, lipschitz=L),
               Z - d.bound_prox(Z * L, lipschitz=1. / L) / L,
               'testing lagrange_prox and bound_prox starting from atom %s ' % atom)

        # some arguments of the constructor
        nt.assert_raises(AttributeError, setattr, p, 'bound', 4.)
        nt.assert_raises(AttributeError, setattr, d, 'lagrange', 4.)
        nt.assert_raises(AttributeError, setattr, p, 'bound', 4.)
        nt.assert_raises(AttributeError, setattr, d, 'lagrange', 4.)

        for t in solveit(p, Z, W, U, linq, L, FISTA, coef_stop):
            yield t

        b = atom(shape, bound=bound, quadratic=q, offset=offset)

        for t in solveit(b, Z, W, U, linq, L, FISTA, coef_stop):
            yield t
def test_class():
    n, p = (10, 5)
    D = np.random.standard_normal((n, p))
    v = np.random.standard_normal(n)

    pen = rr.l1norm.affine(D, v, lagrange=0.4)
    pen2 = rr.l1norm(n, lagrange=0.4, offset=np.random.standard_normal(n))
    pen2.quadratic = None

    cls = type(pen)
    pen_aff = cls(pen2, rr.affine_transform(D, v))

    for _pen in [pen, pen_aff]:
        # Run to ensure code gets executed in tests (smoke test)
        print(_pen.dual)
        print(_pen.latexify())
        print(str(_pen))
        print(repr(_pen))
        print(_pen._repr_latex_())
        _pen.nonsmooth_objective(np.random.standard_normal(p))
        q = rr.identity_quadratic(0.5, 0, 0, 0)
        smoothed_pen = _pen.smoothed(q)
def test_lasso_dual_from_primal(l1=.1, L=2.):
    """
    Check that the solution of the lasso signal approximator dual composite is
    soft-thresholding, when called from the primal composite object
    """

    sparsity = R.l1norm(500, lagrange=l1)
    x = np.random.normal(0, 1, 500)
    y = np.random.normal(0, 1, 500)

    X = np.random.standard_normal((1000, 500))
    Y = np.random.standard_normal((1000,))
    regloss = R.quadratic.affine(-X, Y)

    p = R.container(regloss, sparsity)
    z = x - y / L

    soln = p.proximal(R.identity_quadratic(L, z, 0, 0))
    st = np.maximum(np.fabs(z) - l1 / L, 0) * np.sign(z)

    print(x[range(10)])
    print(soln[range(10)])
    print(st[range(10)])
    np.testing.assert_almost_equal(soln, st, decimal=3)
def test_coxph():

    Q = rr.identity_quadratic(0.01, 0, np.ones(5), 0)

    X = np.random.standard_normal((100, 5))
    T = np.random.standard_exponential(100)
    S = np.random.binomial(1, 0.5, size=(100,))

    L = lasso.coxph(X, T, S, 0.1, quadratic=Q)
    L.fit()

    L = lasso.coxph(X, T, S, 0.1, quadratic=Q)
    L.fit()

    C = L.constraints

    np.testing.assert_array_less(
        np.dot(L.constraints.linear_part, L.onestep_estimator),
        L.constraints.offset)

    P = L.summary()['pval']

    return L, C, P
def test_gaussian(n=100, p=20):

    y = np.random.standard_normal(n)
    X = np.random.standard_normal((n, p))

    lam_theor = np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000)))).max(0))
    Q = rr.identity_quadratic(0.01, 0, np.ones(p), 0)

    weights_with_zeros = 0.5 * lam_theor * np.ones(p)
    weights_with_zeros[:3] = 0.

    huge_weights = weights_with_zeros * 10000

    for q, fw in product([Q, None],
                         [0.5 * lam_theor, weights_with_zeros, huge_weights]):

        L = lasso.gaussian(X, y, fw, 1., quadratic=Q)
        L.fit()
        C = L.constraints

        sandwich = glm_sandwich_estimator(L.loglike, B=5000)
        L = lasso.gaussian(X, y, fw, 1., quadratic=Q, covariance_estimator=sandwich)
        L.fit()
        C = L.constraints

        S = L.summary('onesided', compute_intervals=True)
        S = L.summary('twosided')

        nt.assert_raises(ValueError, L.summary, 'none')
        print(L.active)
        yield (np.testing.assert_array_less,
               np.dot(L.constraints.linear_part, L.onestep_estimator),
               L.constraints.offset)
def __init__(self,
             affine_con,
             direction_of_interest,
             offset=None,
             quadratic=None,
             initial=None):

    rr.smooth_atom.__init__(self,
                            affine_con.linear_part.shape[1] + 1,
                            offset=offset,
                            quadratic=quadratic,
                            initial=initial)

    self.affine_con = affine_con
    self.direction_of_interest = eta = direction_of_interest

    design = self.design = np.hstack([np.identity(affine_con.dim),
                                      eta.reshape((-1, 1))])

    sqrt_inv = affine_con.covariance_factors()[1]
    Si = np.dot(sqrt_inv.T, sqrt_inv)
    self.Q = np.dot(design.T, np.dot(Si, design))

    gamma = affine_con.mean

    linear_part = np.dot(affine_con.linear_part, design)
    offset = affine_con.offset - np.dot(affine_con.linear_part, affine_con.mean)

    scaling = np.sqrt((linear_part**2).sum(1))
    linear_part /= scaling[:, None]
    offset /= scaling

    self.linear_objective = 0.

    smoothing_quadratic = rr.identity_quadratic(1.e-2, 0, 0, 0)
    self.smooth_constraint = rr.nonpositive.affine(linear_part,
                                                   -offset).smoothed(smoothing_quadratic)
def test_conjugate():
    z = np.random.standard_normal(10)
    w = np.random.standard_normal(10)
    y = np.random.standard_normal(10)

    for atom_c in [R.l1norm,
                   R.l2norm,
                   R.positive_part,
                   R.supnorm,
                   R.constrained_positive_part]:
        linq = R.identity_quadratic(0, 0, w, 0)
        atom = atom_c(10, quadratic=linq, offset=y, lagrange=2.345)
        np.testing.assert_almost_equal(atom.conjugate.conjugate.nonsmooth_objective(z),
                                       atom.nonsmooth_objective(z),
                                       decimal=3)

    for atom_c in [R.nonnegative, R.nonpositive]:
        atom = atom_c(10, quadratic=linq, offset=y)
        np.testing.assert_almost_equal(atom.conjugate.conjugate.nonsmooth_objective(z),
                                       atom.nonsmooth_objective(z),
                                       decimal=3)
def selection(X, y, random_Z, randomization_scale=1, sigma=None, method="theoretical"):
    n, p = X.shape
    loss = rr.glm.gaussian(X, y)
    epsilon = 1. / np.sqrt(n)
    lam_frac = 1.2
    if sigma is None:
        sigma = 1.
    if method == "theoretical":
        lam = 1. * sigma * lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.standard_normal((n, 10000)))).max(0))

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # initial solution
    problem = rr.simple_problem(loss, penalty)
    random_term = rr.identity_quadratic(epsilon, 0, -randomization_scale * random_Z, 0)

    solve_args = {'tol': 1.e-10, 'min_its': 100, 'max_its': 500}
    initial_soln = problem.solve(random_term, **solve_args)
    active = (initial_soln != 0)
    if np.sum(active) == 0:
        return None
    initial_grad = loss.smooth_objective(initial_soln, mode='grad')
    betaE = initial_soln[active]
    subgradient = -(initial_grad + epsilon * initial_soln - randomization_scale * random_Z)
    cube = subgradient[~active] / lam
    return lam, epsilon, active, betaE, cube, initial_soln
def test_gaussian_unknown():

    n, p = 20, 5

    X = np.random.standard_normal((n, p))
    Y = np.random.standard_normal(n)

    T = X.T.dot(Y)
    N = -(Y**2).sum() / 2.
    sufficient_stat = np.hstack([T, N])

    cumulant = gaussian_cumulant(X)
    conj = gaussian_cumulant_conjugate(X)

    MLE = cumulant.regression_parameters(conj.smooth_objective(sufficient_stat, 'grad'))

    linear = rr.identity_quadratic(0, 0, -sufficient_stat, 0)
    cumulant.coefs[:] = 1.
    MLE2 = cumulant.solve(linear, tol=1.e-12, min_its=400)
    np.testing.assert_allclose(MLE2,
                               conj.smooth_objective(sufficient_stat, 'grad'),
                               rtol=1.e-4, atol=1.e-4)

    beta_hat = np.linalg.pinv(X).dot(Y)
    sigmasq_hat = np.sum(((Y - X.dot(beta_hat))**2) / n)

    np.testing.assert_allclose(beta_hat, MLE[0])
    np.testing.assert_allclose(sigmasq_hat, MLE[1])

    G = conj.smooth_objective(sufficient_stat, 'grad')
    M = cumulant.smooth_objective(G, 'grad')
    np.testing.assert_allclose(sufficient_stat, M)

    G = cumulant.smooth_objective(MLE2, 'grad')
    M = conj.smooth_objective(G, 'grad')
    np.testing.assert_allclose(MLE2, M)
def _find_row_approx_inverse(Sigma, j, delta,
                             solve_args={'min_its': 100,
                                         'tol': 1.e-6,
                                         'max_its': 500}):
    r"""
    Find an approximation of the j-th row of the inverse of Sigma.

    Solves the problem

    .. math::

        \text{min}_{\theta} \frac{1}{2} \theta^T S \theta

    subject to :math:`\|\Sigma \hat{\theta} - e_j\|_{\infty} \leq \delta`,
    with :math:`e_j` the :math:`j`-th elementary basis vector,
    :math:`S` the argument `Sigma`, and :math:`\delta` the argument `delta`.

    Described in Table 1, display (4) of https://arxiv.org/pdf/1306.3171.pdf
    """
    p = Sigma.shape[0]
    elem_basis = np.zeros(p, np.float64)
    elem_basis[j] = 1.

    loss = quadratic_loss(p, Q=Sigma)
    penalty = l1norm(p, lagrange=delta)
    iq = identity_quadratic(0, 0, elem_basis, 0)
    problem = simple_problem(loss, penalty)
    dual_soln = problem.solve(iq, **solve_args)

    soln = -dual_soln

    # check feasibility -- if it fails miserably
    # presume delta was too small

    feasibility_gap = np.fabs(Sigma.dot(soln) - elem_basis).max()
    if feasibility_gap > (1.01) * delta:
        raise ValueError('does not seem to be a feasible point -- try increasing delta')

    return soln
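# A hedged usage sketch for _find_row_approx_inverse; the matrix, row index and
# tolerance below are made up for illustration only.
def _example_row_approx_inverse():
    Z = np.random.standard_normal((50, 5))
    Sigma_example = Z.T.dot(Z) / 50 + 0.1 * np.identity(5)
    theta_row = _find_row_approx_inverse(Sigma_example, 2, 0.2)
    # the solver already enforces this sup-norm feasibility check internally
    assert np.fabs(Sigma_example.dot(theta_row) - np.identity(5)[2]).max() <= 1.01 * 0.2
    return theta_row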
def test_different_dim():
    """
    This test checks that the reshape argument of separable works properly.
    """

    X = np.random.standard_normal((100, 20))
    Y = np.random.standard_normal((100,)) + np.dot(X, np.random.standard_normal(20))

    penalty1 = rr.nuclear_norm((5, 2), lagrange=1.2)
    penalty2 = rr.l1norm(10, lagrange=1.2)
    penalty = rr.separable((20,), [penalty1, penalty2],
                           [slice(0, 10), slice(10, 20)],
                           test_for_overlap=True,
                           shapes=[(5, 2), None])

    # ensure code is tested

    print(penalty1.latexify())

    print(penalty.latexify())
    print(penalty.conjugate)
    print(penalty.dual)
    print(penalty.seminorm(np.ones(penalty.shape)))
    print(penalty.constraint(np.ones(penalty.shape), bound=2.))

    pencopy = copy(penalty)
    pencopy.set_quadratic(rr.identity_quadratic(1, 0, 0, 0))
    pencopy.conjugate

    # solve using separable

    loss = rr.quadratic_loss.affine(X, -Y, coef=0.5)
    problem = rr.separable_problem.fromatom(penalty, loss)
    solver = rr.FISTA(problem)
    solver.fit(min_its=200, tol=1.0e-12)
    coefs = solver.composite.coefs
def __iter__(self):
    for offset, FISTA, coef_stop, L, q, groups in itertools.product(self.offset_choices,
                                                                    self.FISTA_choices,
                                                                    self.coef_stop_choices,
                                                                    self.L_choices,
                                                                    self.quadratic_choices,
                                                                    self.group_choices):
        self.FISTA = FISTA
        self.coef_stop = coef_stop
        self.L = L

        atom = self.klass(groups)

        if q:
            atom.quadratic = rr.identity_quadratic(0, 0, np.random.standard_normal(atom.shape) * 0.02)
        if offset:
            atom.offset = 0.02 * np.random.standard_normal(atom.shape)

        solver = Solver(atom, interactive=self.interactive,
                        coef_stop=coef_stop, FISTA=FISTA, L=L)

        yield solver
def test_sqrt_lasso_pvals(n=100, p=200, s=7, sigma=5, rho=0.3, signal=7.):

    X, y, beta, true_active, sigma, _ = instance(n=n, p=p, s=s, sigma=sigma, rho=rho, signal=signal)

    lam_theor = np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000)))).max(0)) / np.sqrt(n)
    Q = rr.identity_quadratic(0.01, 0, np.ones(p), 0)

    weights_with_zeros = 0.7 * lam_theor * np.ones(p)
    weights_with_zeros[:3] = 0.

    lasso.sqrt_lasso(X, y, weights_with_zeros, covariance='parametric')
    L = lasso.sqrt_lasso(X, y, weights_with_zeros)
    L.fit()

    if set(true_active).issubset(L.active):
        S = L.summary('onesided')
        S = L.summary('twosided')
        return S['pval'], [v in true_active for v in S['variable']]
def _solve_conjugate_problem(self, natural_param, niter=500, tol=1.e-10):

    affine_con = self.affine_con
    loss = softmax(affine_con, sigma=self.sigma)
    L = rr.identity_quadratic(0, 0, -natural_param, 0)  # linear_term

    A = affine_con.linear_part
    b = affine_con.offset

    mean_param = self.feasible_point.copy()
    step = 1. / self.sigma

    f_cur = np.inf

    for i in range(niter):
        G = -natural_param + loss.smooth_objective(mean_param, 'grad')
        proposed = mean_param - step * G
        slack = b - A.dot(proposed)

        if i % 5 == 0:
            step *= 2.

        if np.any(slack < 0):
            step *= 0.5
        else:
            f_proposed = (-(natural_param * proposed).sum() +
                          loss.smooth_objective(proposed, 'func'))
            if f_proposed > f_cur * (1 + tol):
                step *= 0.5
            else:
                mean_param = proposed
                if np.fabs(f_cur - f_proposed) < tol * max([1, np.fabs(f_cur), np.fabs(f_proposed)]):
                    break
                f_cur = f_proposed

    return -f_proposed, mean_param
def test_lasso(s=0, n=100, p=20, weights="neutral",
               randomization_dist="logistic", randomization_scale=1,
               Langevin_steps=10000, burning=2000, X_scaled=True,
               covariance_estimate="nonparametric", noise="uniform"):
    """
    weights: exponential, gamma, normal, gumbel
    randomization_dist: logistic, laplace
    """

    step_size = 1. / p

    X, y, true_beta, nonzero, sigma = instance(n=n, p=p, random_signs=True, s=s,
                                               sigma=1., rho=0, scale=X_scaled, noise=noise)
    print('true beta', true_beta)
    lam_frac = 1.

    if randomization_dist == "laplace":
        randomization = laplace(loc=0, scale=1.)
        random_Z = randomization.rvs(p)
    if randomization_dist == "logistic":
        random_Z = np.random.logistic(loc=0, scale=1, size=p)
    if randomization_dist == "normal":
        random_Z = np.random.standard_normal(p)
    print('randomization', random_Z * randomization_scale)

    loss = lasso_randomX.lasso_randomX(X, y)

    epsilon = 1. / np.sqrt(n)
    # epsilon = 1.

    lam = sigma * lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal((n, 10000))) +
                randomization_scale * np.random.logistic(size=(p, 10000))).max(0))

    lam_scaled = lam.copy()
    random_Z_scaled = random_Z.copy()
    epsilon_scaled = epsilon

    if (X_scaled == False):
        random_Z_scaled *= np.sqrt(n)
        lam_scaled *= np.sqrt(n)
        epsilon_scaled *= np.sqrt(n)

    penalty = randomized.selective_l1norm_lan(p, lagrange=lam_scaled)

    # initial solution

    problem = rr.simple_problem(loss, penalty)

    random_term = rr.identity_quadratic(epsilon_scaled, 0,
                                        -randomization_scale * random_Z_scaled, 0)

    solve_args = {'tol': 1.e-10, 'min_its': 100, 'max_its': 500}

    initial_soln = problem.solve(random_term, **solve_args)
    print('initial solution', initial_soln)

    active = (initial_soln != 0)
    if np.sum(active) == 0:
        return [-1], [-1]
    inactive = ~active
    betaE = initial_soln[active]
    signs = np.sign(betaE)

    initial_grad = -np.dot(X.T, y - np.dot(X, initial_soln))
    if (X_scaled == False):
        initial_grad /= np.sqrt(n)
    print('initial_gradient', initial_grad)
    subgradient = random_Z - initial_grad - epsilon * initial_soln
    cube = np.divide(subgradient[inactive], lam)

    nactive = betaE.shape[0]
    ninactive = cube.shape[0]

    beta_unpenalized = np.linalg.lstsq(X[:, active], y)[0]
    print('beta_OLS onto E', beta_unpenalized)
    obs_residuals = y - np.dot(X[:, active], beta_unpenalized)   # y - X_E \bar{\beta}^E
    N = np.dot(X[:, inactive].T, obs_residuals)  # X_{-E}^T (y - X_E \bar{\beta}_E), null statistic
    full_null = np.zeros(p)
    full_null[nactive:] = N

    # parametric covariance estimate
    if covariance_estimate == "parametric":
        XE_pinv = np.linalg.pinv(X[:, active])
        mat = np.zeros((nactive + ninactive, n))
        mat[:nactive, :] = XE_pinv
        mat[nactive:, :] = X[:, inactive].T.dot(np.identity(n) - X[:, active].dot(XE_pinv))
        Sigma_full = mat.dot(mat.T)
    else:
        Sigma_full = bootstrap_covariance(X, y, active, beta_unpenalized)

    init_vec_state = np.zeros(n + nactive + ninactive)
    if weights == "exponential":
        init_vec_state[:n] = np.ones(n)
    else:
        init_vec_state[:n] = np.zeros(n)

    # init_vec_state[:n] = np.random.standard_normal(n)
    # init_vec_state[:n] = np.ones(n)
    init_vec_state[n:(n + nactive)] = betaE
    init_vec_state[(n + nactive):] = cube

    def full_projection(vec_state, signs=signs, nactive=nactive, ninactive=ninactive):

        alpha = vec_state[:n].copy()
        betaE = vec_state[n:(n + nactive)].copy()
        cube = vec_state[(n + nactive):].copy()

        projected_alpha = alpha.copy()
        projected_betaE = betaE.copy()
        projected_cube = np.zeros_like(cube)

        if weights == "exponential":
            projected_alpha = np.clip(alpha, 0, np.inf)
        if weights == "gamma":
            projected_alpha = np.clip(alpha, -2 + 1. / n, np.inf)

        for i in range(nactive):
            if (projected_betaE[i] * signs[i] < 0):
                projected_betaE[i] = 0

        projected_cube = np.clip(cube, -1, 1)

        return np.concatenate((projected_alpha, projected_betaE, projected_cube), 0)

    Sigma = np.linalg.inv(np.dot(X[:, active].T, X[:, active]))

    null, alt = pval(init_vec_state, full_projection,
                     X, obs_residuals, beta_unpenalized, full_null,
                     signs, lam, epsilon,
                     nonzero, active,
                     Sigma,
                     weights,
                     randomization_dist, randomization_scale,
                     Langevin_steps, step_size, burning,
                     X_scaled)
    # Sigma_full[:nactive, :nactive])

    return null, alt
def solveit(atom, Z, W, U, linq, L, FISTA, coef_stop):

    p2 = copy(atom)
    p2.quadratic = rr.identity_quadratic(L, Z, 0, 0)

    d = atom.conjugate

    q = rr.identity_quadratic(1, Z, 0, 0)
    yield (ac, Z - atom.proximal(q), d.proximal(q),
           'testing duality of projections starting from atom %s ' % atom)

    q = rr.identity_quadratic(L, Z, 0, 0)

    # use simple_problem.nonsmooth

    p2 = copy(atom)
    p2.quadratic = atom.quadratic + q
    problem = rr.simple_problem.nonsmooth(p2)
    solver = rr.FISTA(problem)
    solver.fit(tol=1.0e-14, FISTA=FISTA, coef_stop=coef_stop)

    yield (ac, atom.proximal(q), solver.composite.coefs,
           'solving prox with simple_problem.nonsmooth with monotonicity %s ' % atom)

    # use the solve method

    p2.coefs *= 0
    p2.quadratic = atom.quadratic + q
    soln = p2.solve()

    yield (ac, atom.proximal(q), soln,
           'solving prox with solve method %s ' % atom)

    loss = rr.quadratic.shift(-Z, coef=L)
    problem = rr.simple_problem(loss, atom)
    solver = rr.FISTA(problem)
    solver.fit(tol=1.0e-12, FISTA=FISTA, coef_stop=coef_stop)

    yield (ac, atom.proximal(q), solver.composite.coefs,
           'solving prox with simple_problem with monotonicity %s ' % atom)

    dproblem2 = rr.dual_problem(loss.conjugate,
                                rr.identity(loss.shape),
                                atom.conjugate)
    dcoef2 = dproblem2.solve(coef_stop=coef_stop, tol=1.e-14)

    yield (ac, atom.proximal(q), dcoef2,
           'solving prox with dual_problem with monotonicity %s ' % atom)

    dproblem = rr.dual_problem.fromprimal(loss, atom)
    dcoef = dproblem.solve(coef_stop=coef_stop, tol=1.0e-14)

    yield (ac, atom.proximal(q), dcoef,
           'solving prox with dual_problem.fromprimal with monotonicity %s ' % atom)

    # write the loss in terms of a quadratic for the smooth loss and a smooth function...

    lossq = rr.quadratic.shift(-Z, coef=0.6 * L)
    lossq.quadratic = rr.identity_quadratic(0.4 * L, Z, 0, 0)
    problem = rr.simple_problem(lossq, atom)

    yield (ac, atom.proximal(q),
           problem.solve(coef_stop=coef_stop, FISTA=FISTA, tol=1.0e-12),
           'solving prox with simple_problem with monotonicity but loss has identity_quadratic %s ' % atom)

    problem = rr.simple_problem.nonsmooth(p2)
    solver = rr.FISTA(problem)
    solver.fit(tol=1.0e-14, monotonicity_restart=False,
               coef_stop=coef_stop, FISTA=FISTA)

    yield (ac, atom.proximal(q), solver.composite.coefs,
           'solving prox with simple_problem.nonsmooth with no monotonicity %s ' % atom)

    loss = rr.quadratic.shift(-Z, coef=L)
    problem = rr.simple_problem(loss, atom)
    solver = rr.FISTA(problem)
    solver.fit(tol=1.0e-12, monotonicity_restart=False,
               coef_stop=coef_stop, FISTA=FISTA)

    yield (ac, atom.proximal(q), solver.composite.coefs,
           'solving prox with simple_problem %s no monotonicity_restart' % atom)

    loss = rr.quadratic.shift(-Z, coef=L)
    problem = rr.separable_problem.singleton(atom, loss)
    solver = rr.FISTA(problem)
    solver.fit(tol=1.0e-12, coef_stop=coef_stop, FISTA=FISTA)

    yield (ac, atom.proximal(q), solver.composite.coefs,
           'solving atom prox with separable_atom.singleton %s ' % atom)

    loss = rr.quadratic.shift(-Z, coef=L)
    problem = rr.container(loss, atom)
    solver = rr.FISTA(problem)
    solver.fit(tol=1.0e-12, coef_stop=coef_stop, FISTA=FISTA)

    yield (ac, atom.proximal(q), solver.composite.coefs,
           'solving atom prox with container %s ' % atom)

    # write the loss in terms of a quadratic for the smooth loss and a smooth function...

    lossq = rr.quadratic.shift(-Z, coef=0.6 * L)
    lossq.quadratic = rr.identity_quadratic(0.4 * L, Z, 0, 0)
    problem = rr.container(lossq, atom)
    solver = rr.FISTA(problem)
    solver.fit(tol=1.0e-12, FISTA=FISTA, coef_stop=coef_stop)

    yield (ac, atom.proximal(q),
           problem.solve(tol=1.e-12, FISTA=FISTA, coef_stop=coef_stop),
           'solving prox with container with monotonicity but loss has identity_quadratic %s ' % atom)

    loss = rr.quadratic.shift(-Z, coef=L)
    problem = rr.simple_problem(loss, d)
    solver = rr.FISTA(problem)
    solver.fit(tol=1.0e-12, monotonicity_restart=False,
               coef_stop=coef_stop, FISTA=FISTA)
    # ac(d.proximal(q), solver.composite.coefs, 'solving dual prox with simple_problem no monotonicity %s ' % atom)

    yield (ac, d.proximal(q),
           problem.solve(tol=1.e-12, FISTA=FISTA, coef_stop=coef_stop, monotonicity_restart=False),
           'solving dual prox with simple_problem no monotonicity %s ' % atom)

    problem = rr.container(d, loss)
    solver = rr.FISTA(problem)
    solver.fit(tol=1.0e-12, coef_stop=coef_stop, FISTA=FISTA)

    yield (ac, d.proximal(q), solver.composite.coefs,
           'solving dual prox with container %s ' % atom)

    loss = rr.quadratic.shift(-Z, coef=L)
    problem = rr.separable_problem.singleton(d, loss)
    solver = rr.FISTA(problem)
    solver.fit(tol=1.0e-12, coef_stop=coef_stop, FISTA=FISTA)

    yield (ac, d.proximal(q), solver.composite.coefs,
           'solving atom prox with separable_atom.singleton %s ' % atom)
def __init__(self,
             X,
             feasible_point,
             active,           # the active set chosen by randomized lasso
             active_sign,      # the set of signs of active coordinates chosen by lasso
             lagrange,         # in R^p
             mean_parameter,   # in R^n
             noise_variance,   # noise_level in data
             randomizer,       # specified randomization
             epsilon,          # ridge penalty for randomized lasso
             coef=1.,
             offset=None,
             quadratic=None,
             nstep=10):

    n, p = X.shape
    E = active.sum()

    self._X = X
    self.active = active
    self.noise_variance = noise_variance
    self.randomization = randomizer

    self.CGF_randomization = randomizer.CGF

    if self.CGF_randomization is None:
        raise ValueError('randomization must know its cgf -- currently only '
                         'isotropic_gaussian and laplace are implemented and '
                         'are assumed to be randomization with IID coordinates')

    self.inactive_lagrange = lagrange[~active]

    initial = feasible_point
    self.feasible_point = feasible_point

    rr.smooth_atom.__init__(self,
                            (p,),
                            offset=offset,
                            quadratic=quadratic,
                            initial=initial,
                            coef=coef)

    self.coefs[:] = feasible_point

    mean_parameter = np.squeeze(mean_parameter)

    self.active = active

    X_E = self.X_E = X[:, active]
    self.X_permute = np.hstack([self.X_E, self._X[:, ~active]])
    B = X.T.dot(X_E)

    B_E = B[active]
    B_mE = B[~active]

    self.active_slice = np.zeros_like(active, bool)
    self.active_slice[:active.sum()] = True

    self.B_active = np.hstack([(B_E + epsilon * np.identity(E)) * active_sign[None, :],
                               np.zeros((E, p - E))])
    self.B_inactive = np.hstack([B_mE * active_sign[None, :], np.identity(p - E)])
    self.B_p = np.vstack((self.B_active, self.B_inactive))

    self.B_p_inv = np.linalg.inv(self.B_p.T)

    self.offset_active = active_sign * lagrange[active]
    self.inactive_subgrad = np.zeros(p - E)

    self.cube_bool = np.zeros(p, bool)
    self.cube_bool[E:] = 1

    self.dual_arg = self.B_p_inv.dot(np.append(self.offset_active, self.inactive_subgrad))

    self._opt_selector = rr.selector(~self.cube_bool, (p,))

    self.set_parameter(mean_parameter, noise_variance)

    _barrier_star = barrier_conjugate_softmax_scaled_rr(self.cube_bool,
                                                        self.inactive_lagrange)

    self.conjugate_barrier = rr.affine_smooth(_barrier_star, np.identity(p))

    self.CGF_randomizer = rr.affine_smooth(self.CGF_randomization, -self.B_p_inv)

    self.constant = np.true_divide(mean_parameter.dot(mean_parameter), 2 * noise_variance)

    self.linear_term = rr.identity_quadratic(0, 0, self.dual_arg, -self.constant)

    self.total_loss = rr.smooth_sum([self.conjugate_barrier,
                                     self.CGF_randomizer,
                                     self.likelihood_loss])

    self.total_loss.quadratic = self.linear_term
def test_sqrt_highdim_lasso(n=500,
                            p=200,
                            signal_fac=1.5,
                            s=5,
                            sigma=3,
                            full=True,
                            rho=0.4,
                            randomizer_scale=1.,
                            ndraw=5000,
                            burnin=1000,
                            ridge_term=None,
                            compare_to_lasso=True):
    """
    Compare to R randomized lasso
    """

    inst, const = gaussian_instance, lasso.sqrt_lasso
    signal = np.sqrt(signal_fac * 2 * np.log(p))

    X, Y, beta = inst(n=n,
                      p=p,
                      signal=signal,
                      s=s,
                      equicorrelated=False,
                      rho=rho,
                      sigma=sigma,
                      random_signs=True)[:3]

    if ridge_term is None:
        mean_diag = np.mean((X**2).sum(0))
        ridge_term = (np.sqrt(mean_diag) / np.sqrt(n)) * np.sqrt(n / (n - 1.))

    W = np.ones(X.shape[1]) * choose_lambda(X) * 0.7

    perturb = np.random.standard_normal(p) * randomizer_scale / np.sqrt(n)

    conv = const(X,
                 Y,
                 W,
                 randomizer_scale=randomizer_scale / np.sqrt(n),
                 perturb=perturb,
                 ridge_term=ridge_term)

    signs = conv.fit()
    nonzero = signs != 0

    # sanity check

    if compare_to_lasso:
        q_term = rr.identity_quadratic(ridge_term, 0, -perturb, 0)

        soln2, sqrt_loss = solve_sqrt_lasso(X, Y, W,
                                            solve_args={'min_its': 1000},
                                            quadratic=q_term,
                                            force_fat=True)
        soln = conv.initial_soln

        denom = np.linalg.norm(Y - X.dot(soln))
        new_weights = W * denom

        loss = rr.glm.gaussian(X, Y)
        pen = rr.weighted_l1norm(new_weights, lagrange=1.)
        prob = rr.simple_problem(loss, pen)

        rescaledQ = rr.identity_quadratic(ridge_term * denom,
                                          0,
                                          -perturb * denom,
                                          0)

        soln3 = prob.solve(quadratic=rescaledQ, min_its=1000, tol=1.e-12)

        np.testing.assert_allclose(conv._initial_omega, perturb * denom)
        np.testing.assert_allclose(soln, soln2)
        np.testing.assert_allclose(soln, soln3)

    if full:
        (observed_target,
         cov_target,
         cov_target_score,
         alternatives) = full_targets(conv.loglike, conv._W, nonzero)
    else:
        (observed_target,
         cov_target,
         cov_target_score,
         alternatives) = selected_targets(conv.loglike, conv._W, nonzero)

    _, pval, intervals = conv.summary(observed_target,
                                      cov_target,
                                      cov_target_score,
                                      alternatives,
                                      ndraw=ndraw,
                                      burnin=burnin,
                                      compute_intervals=False)

    return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0]
def test_quadratic_for_smooth():
    '''
    this test is a check to ensure that the quadratic part of the
    smooth functions is being used in the proximal step
    '''

    L = 0.45

    W = np.random.standard_normal(40)
    Z = np.random.standard_normal(40)
    U = np.random.standard_normal(40)

    atomq = rr.identity_quadratic(0.4, U, W, 0)
    atom = rr.l1norm(40, quadratic=atomq, lagrange=0.12)

    # specifying in this way should be the same as if we put 0.5*L below
    loss = rr.quadratic.shift(Z, coef=0.6 * L)
    lq = rr.identity_quadratic(0.4 * L, Z, 0, 0)
    loss.quadratic = lq

    ww = np.random.standard_normal(40)

    # specifying in this way should be the same as if we put 0.5*L below
    loss2 = rr.quadratic.shift(Z, coef=L)

    yield all_close, loss2.objective(ww), loss.objective(ww), 'checking objective', None
    yield all_close, lq.objective(ww, 'func'), loss.nonsmooth_objective(ww), 'checking nonsmooth objective', None
    yield (all_close, loss2.smooth_objective(ww, 'func'),
           0.5 / 0.3 * loss.smooth_objective(ww, 'func'),
           'checking smooth objective func', None)
    yield (all_close, loss2.smooth_objective(ww, 'grad'),
           0.5 / 0.3 * loss.smooth_objective(ww, 'grad'),
           'checking smooth objective grad', None)

    problem = rr.container(loss, atom)
    solver = rr.FISTA(problem)
    solver.fit(tol=1.0e-12)

    problem3 = rr.simple_problem(loss, atom)
    solver3 = rr.FISTA(problem3)
    solver3.fit(tol=1.0e-12, coef_stop=True)

    loss4 = rr.quadratic.shift(Z, coef=0.6 * L)
    problem4 = rr.simple_problem(loss4, atom)
    problem4.quadratic = lq
    solver4 = rr.FISTA(problem4)
    solver4.fit(tol=1.0e-12)

    gg_soln = rr.gengrad(problem, L)

    loss6 = rr.quadratic.shift(Z, coef=0.6 * L)
    loss6.quadratic = lq + atom.quadratic
    atomcp = copy(atom)
    atomcp.quadratic = rr.identity_quadratic(0, 0, 0, 0)
    problem6 = rr.dual_problem(loss6.conjugate,
                               rr.identity(loss6.shape),
                               atomcp.conjugate)
    problem6.lipschitz = L + atom.quadratic.coef
    dsoln2 = problem6.solve(coef_stop=True, tol=1.e-10, max_its=100)

    problem2 = rr.container(loss2, atom)
    solver2 = rr.FISTA(problem2)
    solver2.fit(tol=1.0e-12, coef_stop=True)

    q = rr.identity_quadratic(L, Z, 0, 0)

    yield (all_close, problem.objective(ww),
           atom.nonsmooth_objective(ww) + q.objective(ww, 'func'), '', None)

    atom = rr.l1norm(40, quadratic=atomq, lagrange=0.12)
    aq = atom.solve(q)
    for p, msg in zip([solver3.composite.coefs,
                       gg_soln,
                       solver2.composite.coefs,
                       dsoln2,
                       solver.composite.coefs,
                       solver4.composite.coefs],
                      ['simple_problem with loss having no quadratic',
                       'gen grad',
                       'container with loss having no quadratic',
                       'dual problem with loss having a quadratic',
                       'container with loss having a quadratic',
                       'simple_problem having a quadratic']):
        yield all_close, aq, p, msg, None
def sqrt_lasso(X,
               Y,
               groups,
               weights,
               quadratic=None,
               ridge_term=None,
               randomizer_scale=None,
               solve_args={'min_its': 200},
               perturb=None):
    r"""
    Use sqrt-LASSO to choose variables.

    Objective function is (before randomization)

    .. math::

        \beta \mapsto \|Y-X\beta\|_2 + \sum_{i=1}^p \lambda_i |\beta_i|

    where $\lambda$ is determined by `weights`. After solving the problem
    treat as if `gaussian` with implied variance and choice of
    multiplier. See arxiv.org/abs/1504.08031 for details.

    Parameters
    ----------

    X : ndarray
        Shape (n,p) -- the design matrix.

    Y : ndarray
        Shape (n,) -- the response.

    groups : ndarray
        Group labels, one per feature.

    weights : dict
        Penalty weights, keyed by group label. An intercept, or other
        unpenalized features are handled by setting the corresponding
        weights to 0.

    quadratic : `regreg.identity_quadratic.identity_quadratic` (optional)
        An optional quadratic term to be added to the objective.
        Can also be a linear term by setting quadratic
        coefficient to 0.

    ridge_term : float
        How big a ridge term to add?

    randomizer_scale : float
        Scale for IID components of randomizer.

    solve_args : dict
        Arguments passed to solver.

    perturb : ndarray (optional)
        Randomization to use; if None, one is drawn from the
        isotropic Gaussian randomizer.

    Returns
    -------

    L : `selection.randomized.lasso.lasso`

    Notes
    -----

    Unlike other variants of LASSO, this
    solves the problem on construction as the active
    set is needed to find equivalent gaussian LASSO.
    Assumes parametric model is correct for inference,
    i.e. does not accept a covariance estimator.
    """

    n, p = X.shape

    # expand the group weight dict to one penalty weight per coordinate
    # for the sqrt-LASSO solve below
    feature_weights = np.asarray([weights[g] for g in groups])

    mean_diag = np.mean((X**2).sum(0))

    if ridge_term is None:
        ridge_term = np.sqrt(mean_diag) / (n - 1)

    if randomizer_scale is None:
        randomizer_scale = 0.5 * np.sqrt(mean_diag) / np.sqrt(n - 1)

    if perturb is None:
        perturb = np.random.standard_normal(p) * randomizer_scale

    randomQ = rr.identity_quadratic(ridge_term, 0, -perturb, 0)  # a ridge + linear term

    if quadratic is not None:
        totalQ = randomQ + quadratic
    else:
        totalQ = randomQ

    soln, sqrt_loss = solve_sqrt_lasso(X,
                                       Y,
                                       weights=feature_weights,
                                       quadratic=totalQ,
                                       solve_args=solve_args,
                                       force_fat=True)

    denom = np.linalg.norm(Y - X.dot(soln))
    loglike = rr.glm.gaussian(X, Y)

    randomizer = randomization.isotropic_gaussian((p,),
                                                  randomizer_scale * denom)

    # rescale the group weights by the residual norm so the sqrt-LASSO
    # solution solves the implied (randomized) gaussian LASSO
    weights = copy(weights)
    for k in weights.keys():
        weights[k] = weights[k] * denom

    obj = lasso(loglike,
                groups,
                weights,
                ridge_term * denom,
                randomizer,
                perturb=perturb * denom)
    obj._sqrt_soln = soln

    return obj
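# Why rescaling by the residual norm works (a sketch of the reasoning the
# docstring alludes to): writing r = Y - X\hat\beta for the sqrt-LASSO
# solution, the stationarity condition of
#     \|Y - X\beta\|_2 + \sum_i \lambda_i |\beta_i|
# reads  X^T r / \|r\|_2 \in \partial(\sum_i \lambda_i |\beta_i|),
# which is also the stationarity condition of the squared-error LASSO
#     (1/2) \|Y - X\beta\|_2^2 + \sum_i (\lambda_i \|r\|_2) |\beta_i|.
# This is why `denom = np.linalg.norm(Y - X.dot(soln))` multiplies the
# weights, the ridge term and the perturbation before the `lasso` object is
# constructed, and why test_sqrt_highdim_lasso above checks that the
# rescaled LASSO reproduces the sqrt-LASSO solution.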
def form_penalty(self):
    # weighted l1 penalty with the randomization absorbed as a pure
    # linear term attached to the atom
    penalty = weighted_l1norm(self.weights, lagrange=1.)
    penalty.quadratic = identity_quadratic(0, 0, self.random_linear_term, 0)
    return penalty
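# form_penalty above absorbs the randomization into the penalty as a pure
# linear term: identity_quadratic(0, 0, l, 0) contributes <l, beta> to the
# objective, so simple_problem(loss, penalty).solve() minimizes
#     loss(beta) + sum_j w_j |beta_j| + <random_linear_term, beta>.
# A minimal standalone sketch of the same construction; the weights and
# linear term below are made-up illustrative values, not from the codebase.

import numpy as np
import regreg.api as rr

def _randomized_l1(weights, random_linear_term):
    penalty = rr.weighted_l1norm(weights, lagrange=1.)
    penalty.quadratic = rr.identity_quadratic(0, 0, random_linear_term, 0)
    return penalty

# e.g. pen = _randomized_l1(np.ones(5), 0.1 * np.random.standard_normal(5))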
def test_lasso_separable():
    """
    This test verifies that the specification of a separable
    penalty yields the same results as having two linear_atoms
    with selector matrices.

    The penalty here is a lasso, i.e. l1 penalty.
    """

    X = np.random.standard_normal((100, 20))
    Y = np.random.standard_normal((100,)) + np.dot(X, np.random.standard_normal(20))

    penalty1 = rr.l1norm(10, lagrange=1.2)
    penalty2 = rr.l1norm(10, lagrange=1.2)
    penalty = rr.separable((20,), [penalty1, penalty2],
                           [slice(0, 10), slice(10, 20)],
                           test_for_overlap=True)

    # ensure code is tested
    print(penalty1.latexify())

    print(penalty.latexify())
    print(penalty.conjugate)
    print(penalty.dual)
    print(penalty.seminorm(np.ones(penalty.shape)))
    print(penalty.constraint(np.ones(penalty.shape), bound=2.))

    pencopy = copy(penalty)
    pencopy.set_quadratic(rr.identity_quadratic(1, 0, 0, 0))
    pencopy.conjugate

    # solve using separable

    loss = rr.quadratic_loss.affine(X, -Y, coef=0.5)
    problem = rr.separable_problem.fromatom(penalty, loss)
    solver = rr.FISTA(problem)
    solver.fit(min_its=200, tol=1.0e-12)
    coefs = solver.composite.coefs

    # solve using the usual composite

    penalty_all = rr.l1norm(20, lagrange=1.2)
    problem_all = rr.container(loss, penalty_all)
    solver_all = rr.FISTA(problem_all)
    solver_all.fit(min_its=100, tol=1.0e-12)
    coefs_all = solver_all.composite.coefs

    # solve using the selectors

    penalty_s = [rr.linear_atom(p, rr.selector(g, (20,)))
                 for p, g in zip(penalty.atoms, penalty.groups)]
    problem_s = rr.container(loss, *penalty_s)
    solver_s = rr.FISTA(problem_s)
    solver_s.fit(min_its=500, tol=1.0e-12)
    coefs_s = solver_s.composite.coefs

    np.testing.assert_almost_equal(coefs, coefs_all)
    np.testing.assert_almost_equal(coefs, coefs_s)
def solve_sqrt_lasso_skinny(X, Y, weights=None, initial=None, quadratic=None, solve_args={}):
    r"""
    Solve the square-root LASSO optimization problem:

    $$
    \text{minimize}_{\beta} \|Y-X\beta\|_2 + \|D\beta\|_1,
    $$

    where $D$ is the diagonal matrix with weights on its diagonal.

    Parameters
    ----------

    Y : np.float((n,))
        The target, in the model $Y = X\beta$

    X : np.float((n, p))
        The data, in the model $Y = X\beta$

    weights : np.float
        Coefficients of the L-1 penalty in
        optimization problem, note that different
        coordinates can have different coefficients.

    initial : np.float(p)
        Initial point for optimization.

    solve_args : dict
        Arguments passed to regreg solver.

    quadratic : `regreg.identity_quadratic`
        A quadratic term added to objective function.
    """
    X = rr.astransform(X)
    n, p = X.output_shape[0], X.input_shape[0]
    if weights is None:
        lam = choose_lambda(X)
        weights = lam * np.ones((p,))

    # the "skinny" problem appends one extra nonnegative variable to beta
    weight_dict = dict(zip(np.arange(p), 2 * weights))
    penalty = rr.mixed_lasso(list(np.arange(p)) + [rr.NONNEGATIVE],
                             lagrange=1.,
                             weights=weight_dict)

    loss = sqlasso_objective_skinny(X, Y)
    problem = rr.simple_problem(loss, penalty)
    problem.coefs[-1] = np.linalg.norm(Y)
    if initial is not None:
        problem.coefs[:-1] = initial

    if quadratic is not None:
        # the quadratic is specified for beta; pad its linear term with a
        # zero for the appended nonnegative variable
        collapsed = quadratic.collapsed()
        new_linear_term = np.zeros(p + 1)
        new_linear_term[:p] = collapsed.linear_term
        new_quadratic = rr.identity_quadratic(collapsed.coef,
                                              0.,
                                              new_linear_term,
                                              collapsed.constant_term)
    else:
        new_quadratic = None

    soln = problem.solve(new_quadratic, **solve_args)
    _loss = sqlasso_objective(X, Y)
    return soln[:-1], _loss
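# The "skinny" formulation above replaces the nondifferentiable term
# \|Y - X\beta\|_2 by a smooth function of (\beta, s) using the variational
# identity (valid for r != 0, with the minimum attained at s = \|r\|_2):
#
#     \|r\|_2 = \min_{s > 0} \frac{1}{2}\left(s + \frac{\|r\|_2^2}{s}\right).
#
# The appended coefficient problem.coefs[-1] plays the role of s: it is
# initialized at \|Y\|_2 and kept nonnegative through the rr.NONNEGATIVE
# group of the mixed_lasso penalty.  The factor of 2 in weight_dict is
# presumably there to match the scaling convention of
# sqlasso_objective_skinny (the exact constant depends on how that loss
# scales the smooth part), so that the returned soln[:-1] solves the
# original square-root LASSO.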
def fit(self,
        solve_args={'tol': 1.e-12, 'min_its': 50},
        perturb=None):
    """
    Fit the randomized lasso using `regreg`.

    Parameters
    ----------

    solve_args : keyword args
         Passed to `regreg.problems.simple_problem.solve`.

    Returns
    -------

    signs : np.float
         Support and non-zero signs of randomized lasso solution.
    """

    p = self.nfeature

    # take a new perturbation if supplied
    if perturb is not None:
        self._initial_omega = perturb
    if self._initial_omega is None:
        self._initial_omega = self.randomizer.sample()

    quad = rr.identity_quadratic(self.ridge_term, 0, -self._initial_omega, 0)
    quad_data = rr.identity_quadratic(0, 0, -self.X.T.dot(self.y), 0)
    problem = rr.simple_problem(self.loss, self.penalty)
    self.initial_soln = problem.solve(quad + quad_data, **solve_args)

    active_signs = np.sign(self.initial_soln)
    active = self._active = active_signs != 0

    self._lagrange = self.penalty.weights
    unpenalized = self._lagrange == 0

    active *= ~unpenalized

    self._overall = overall = (active + unpenalized) > 0
    self._inactive = inactive = ~self._overall
    self._unpenalized = unpenalized

    _active_signs = active_signs.copy()
    _active_signs[unpenalized] = np.nan  # don't release sign of unpenalized variables
    self.selection_variable = {'sign': _active_signs,
                               'variables': self._overall}

    # initial state for opt variables

    initial_subgrad = -(self.loss.smooth_objective(self.initial_soln, 'grad') +
                        quad_data.objective(self.initial_soln, 'grad') +
                        quad.objective(self.initial_soln, 'grad'))
    self.initial_subgrad = initial_subgrad

    initial_scalings = np.fabs(self.initial_soln[active])
    initial_unpenalized = self.initial_soln[self._unpenalized]

    self.observed_opt_state = np.concatenate([initial_scalings,
                                              initial_unpenalized])

    E = overall
    Q_E = self.Q[E][:, E]
    _beta_unpenalized = np.linalg.inv(Q_E).dot(self.X[:, E].T.dot(self.y))
    beta_bar = np.zeros(p)
    beta_bar[overall] = _beta_unpenalized
    self._beta_full = beta_bar

    # observed state for score in internal coordinates

    self.observed_internal_state = np.hstack(
        [_beta_unpenalized,
         -self.loss.smooth_objective(beta_bar, 'grad')[inactive] +
         quad_data.objective(beta_bar, 'grad')[inactive]])

    # form linear part

    self.num_opt_var = self.observed_opt_state.shape[0]

    # (\bar{\beta}_{E \cup U}, N_{-E}, c_E, \beta_U, z_{-E})
    # E for active
    # U for unpenalized
    # -E for inactive

    _opt_linear_term = np.zeros((p, self.num_opt_var))
    _score_linear_term = np.zeros((p, self.num_opt_var))

    # \bar{\beta}_{E \cup U} piece -- the unpenalized M estimator

    X, y = self.X, self.y
    _hessian_active = self.Q[:, active]
    _hessian_unpen = self.Q[:, unpenalized]

    _score_linear_term = -np.hstack([_hessian_active, _hessian_unpen])

    # set the observed score (data dependent) state

    self.observed_score_state = _score_linear_term.dot(_beta_unpenalized)
    self.observed_score_state[inactive] += (
        self.loss.smooth_objective(beta_bar, 'grad')[inactive] +
        quad_data.objective(beta_bar, 'grad')[inactive])

    def signed_basis_vector(p, j, s):
        v = np.zeros(p)
        v[j] = s
        return v

    active_directions = np.array([signed_basis_vector(p, j, active_signs[j])
                                  for j in np.nonzero(active)[0]]).T

    scaling_slice = slice(0, active.sum())
    if np.sum(active) == 0:
        _opt_hessian = 0
    else:
        _opt_hessian = (_hessian_active * active_signs[None, active] +
                        self.ridge_term * active_directions)
    _opt_linear_term[:, scaling_slice] = _opt_hessian

    # beta_U piece

    unpenalized_slice = slice(active.sum(), self.num_opt_var)
    unpenalized_directions = np.array([signed_basis_vector(p, j, 1)
                                       for j in np.nonzero(unpenalized)[0]]).T
    if unpenalized.sum():
        _opt_linear_term[:, unpenalized_slice] = (_hessian_unpen +
                                                  self.ridge_term * unpenalized_directions)

    # two transforms that encode score and optimization
    # variable roles

    self.opt_transform = (_opt_linear_term, self.initial_subgrad)
    self.score_transform = (_score_linear_term, np.zeros(_score_linear_term.shape[0]))

    # now store everything needed for the projections
    # the projection acts only on the optimization
    # variables

    self._setup = True
    self.scaling_slice = scaling_slice
    self.unpenalized_slice = unpenalized_slice
    self.ndim = self.loss.shape[0]

    # compute implied mean and covariance

    opt_linear, opt_offset = self.opt_transform

    A_scaling = -np.identity(self.num_opt_var)
    b_scaling = np.zeros(self.num_opt_var)

    self._setup_sampler(A_scaling,
                        b_scaling,
                        opt_linear,
                        opt_offset)

    return active_signs
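# The bookkeeping in fit() encodes the randomized KKT conditions.  At the
# solution \hat\beta of the randomized problem, stationarity reads
#
#     \omega = \nabla \ell(\hat\beta) + \epsilon \hat\beta + u,
#     u \in \partial\Big(\sum_j \lambda_j |\beta_j|\Big)\Big|_{\hat\beta},
#
# where \epsilon is the ridge term and \omega the perturbation.
# initial_subgrad is the u recovered from this identity (the negated total
# smooth gradient at the solution), and (opt_transform, score_transform)
# store the affine maps that reconstruct \omega from the optimization
# variables (absolute values of the active coefficients plus the
# unpenalized coefficients) and the data-dependent score, which is what
# _setup_sampler needs to form the implied density of the optimization
# variables.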
def setup_sampler(self,
                  scaling=1.,
                  solve_args={'min_its': 50, 'tol': 1.e-10},
                  B=2000):

    M_estimator.setup_sampler(self,
                              scaling=scaling,
                              solve_args=solve_args)

    # now we need to estimate covariance of
    # loss.grad(\beta_E^*) - 1/pi * randomized_loss.grad(\beta_E^*)

    m, n, p = self.subsample_size, self.total_size, self.loss.shape[0]  # shorthand

    from .glm import pairs_bootstrap_score  # need to correct these imports!!!

    bootstrap_score = pairs_bootstrap_score(self.loss,
                                            self._overall,
                                            beta_active=self._beta_full[self._overall],
                                            solve_args=solve_args)

    # find unpenalized MLE on subsample

    newq, oldq = rr.identity_quadratic(0, 0, 0, 0), self.randomized_loss.quadratic
    self.randomized_loss.quadratic = newq
    beta_active_subsample = restricted_Mest(self.randomized_loss,
                                            self._overall)

    bootstrap_score_split = pairs_bootstrap_score(self.loss,
                                                  self._overall,
                                                  beta_active=beta_active_subsample,
                                                  solve_args=solve_args)
    self.randomized_loss.quadratic = oldq

    inv_frac = n / m

    def subsample_diff(m, n, indices):
        subsample = np.random.choice(indices, size=m, replace=False)
        full_score = bootstrap_score(indices)  # a sum of n terms
        randomized_score = bootstrap_score_split(subsample)  # a sum of m terms
        return full_score - randomized_score * inv_frac

    first_moment = np.zeros(p)
    second_moment = np.zeros((p, p))

    _n = np.arange(n)
    for _ in range(B):
        indices = np.random.choice(_n, size=n, replace=True)
        randomized_score = subsample_diff(m, n, indices)
        first_moment += randomized_score
        second_moment += np.multiply.outer(randomized_score, randomized_score)

    first_moment /= B
    second_moment /= B

    cov = second_moment - np.multiply.outer(first_moment,
                                            first_moment)

    self.randomization.set_covariance(cov)
def test_solve_QP():
    """
    Check the R coordinate descent LASSO solver
    """

    n, p = 100, 50
    lam = 0.08

    X = np.random.standard_normal((n, p))

    loss = rr.squared_error(X, np.zeros(n), coef=1. / n)
    pen = rr.l1norm(p, lagrange=lam)
    E = np.zeros(p)
    E[2] = 1
    Q = rr.identity_quadratic(0, 0, E, 0)
    problem = rr.simple_problem(loss, pen)
    soln = problem.solve(Q, min_its=500, tol=1.e-12)

    numpy2ri.activate()

    rpy.r.assign('X', X)
    rpy.r.assign('E', E)
    rpy.r.assign('lam', lam)

    R_code = """
    library(selectiveInference)
    p = ncol(X)
    n = nrow(X)

    soln_R = rep(0, p)
    grad = 1. * E
    ever_active = as.integer(c(1, rep(0, p-1)))
    nactive = as.integer(1)
    kkt_tol = 1.e-12
    objective_tol = 1.e-16
    parameter_tol = 1.e-10
    maxiter = 500
    soln_R = selectiveInference:::solve_QP(t(X) %*% X / n,
                                           lam,
                                           maxiter,
                                           soln_R,
                                           E,
                                           grad,
                                           ever_active,
                                           nactive,
                                           kkt_tol,
                                           objective_tol,
                                           parameter_tol,
                                           p,
                                           TRUE,
                                           TRUE,
                                           TRUE)$soln

    # test wide solver
    Xtheta = rep(0, n)
    nactive = as.integer(1)
    ever_active = as.integer(c(1, rep(0, p-1)))
    soln_R_wide = rep(0, p)
    grad = 1. * E
    soln_R_wide = selectiveInference:::solve_QP_wide(X,
                                                     rep(lam, p),
                                                     0,
                                                     maxiter,
                                                     soln_R_wide,
                                                     E,
                                                     grad,
                                                     Xtheta,
                                                     ever_active,
                                                     nactive,
                                                     kkt_tol,
                                                     objective_tol,
                                                     parameter_tol,
                                                     p,
                                                     TRUE,
                                                     TRUE,
                                                     TRUE)$soln
    """

    rpy.r(R_code)

    soln_R = np.asarray(rpy.r('soln_R'))
    soln_R_wide = np.asarray(rpy.r('soln_R_wide'))
    numpy2ri.deactivate()

    tol = 1.e-5
    print(soln - soln_R)
    print(soln_R - soln_R_wide)
    G = X.T.dot(X).dot(soln) / n + E

    yield np.testing.assert_allclose, soln, soln_R, tol, tol, False, 'checking coordinate QP solver'
    yield np.testing.assert_allclose, soln, soln_R_wide, tol, tol, False, 'checking wide coordinate QP solver'
    yield np.testing.assert_allclose, G[soln != 0], -np.sign(soln[soln != 0]) * lam, tol, tol, False, 'checking active coordinate KKT for QP solver'
    yield nt.assert_true, np.fabs(G).max() < lam * (1. + 1.e-6), 'testing linfinity norm'
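# The last two assertions in test_solve_QP are the KKT conditions of the QP
# being solved, namely
#     minimize_beta  (1/(2n)) ||X beta||_2^2 + <E, beta> + lam ||beta||_1,
# whose smooth gradient is G = X^T X beta / n + E.  Stationarity requires
# G_j = -lam * sign(beta_j) on the active set and |G_j| <= lam everywhere.
# A small helper performing the same check for any solver output; the
# function name is illustrative, not part of the test suite.

import numpy as np

def _kkt_check(X, soln, E, lam, tol=1.e-6):
    n = X.shape[0]
    G = X.T.dot(X).dot(soln) / n + E
    active = soln != 0
    # active coordinates: gradient balances the subgradient of lam*||.||_1
    active_ok = np.allclose(G[active], -np.sign(soln[active]) * lam, atol=tol)
    # inactive coordinates: gradient stays within the lam-ball in sup norm
    inactive_ok = np.fabs(G).max() <= lam * (1 + tol)
    return active_ok and inactive_ok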
def test_lasso(s=5, n=200, p=20):

    X, y, _, nonzero, sigma = instance(n=n, p=p, random_signs=True, s=s,
                                       sigma=1., rho=0, snr=10)
    print('sigma', sigma)
    lam_frac = 1.

    randomization = laplace(loc=0, scale=1.)
    loss = randomized.gaussian_Xfixed(X, y)

    random_Z = randomization.rvs(p)
    epsilon = 1.
    lam = sigma * lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal((n, 10000)))).max(0))

    random_Z = randomization.rvs(p)
    penalty = randomized.selective_l1norm_lan(p, lagrange=lam)

    # sampler1 = randomized.selective_sampler_MH_lan(loss,
    #                                                random_Z,
    #                                                epsilon,
    #                                                randomization,
    #                                                penalty)

    # loss_args = {'mean': np.zeros(n),
    #              'sigma': sigma,
    #              'linear_part': np.identity(y.shape[0]),
    #              'value': 0}

    # sampler1.setup_sampling(y, loss_args=loss_args)
    # data, opt_vars = sampler1.state

    # initial solution

    # rr.smooth_atom instead of loss?
    problem = rr.simple_problem(loss, penalty)
    random_term = rr.identity_quadratic(epsilon, 0, -random_Z, 0)

    solve_args = {'tol': 1.e-10, 'min_its': 100, 'max_its': 500}
    initial_soln = problem.solve(random_term, **solve_args)

    active = (initial_soln != 0)
    inactive = ~active
    initial_grad = -np.dot(X.T, y - np.dot(X, initial_soln))
    betaE = initial_soln[active]
    signs = np.sign(betaE)
    subgradient = random_Z - initial_grad - epsilon * initial_soln
    cube = np.divide(subgradient[inactive], lam)
    # print(betaE, cube)

    # initial_grad = loss.smooth_objective(initial_soln, mode='grad')
    # print(penalty.setup_sampling(initial_grad,
    #                              initial_soln,
    #                              random_Z,
    #                              epsilon))

    data0 = y.copy()

    # active = penalty.active_set

    if np.sum(active) == 0:
        print('here')
        return [-1], [-1]

    nalpha = n
    nactive = betaE.shape[0]
    ninactive = cube.shape[0]

    alpha = np.ones(n)
    beta_bar = np.linalg.lstsq(X[:, active], y)[0]
    obs_residuals = y - np.dot(X[:, active], beta_bar)
    # obs_residuals -= np.mean(obs_residuals)
    # betaE, cube = opt_vars

    init_vec_state = np.zeros(n + nactive + ninactive)
    init_vec_state[:n] = alpha
    init_vec_state[n:(n + nactive)] = betaE
    init_vec_state[(n + nactive):] = cube

    def full_projection(vec_state, signs=signs,
                        nalpha=nalpha, nactive=nactive, ninactive=ninactive):

        alpha = vec_state[:nalpha].copy()
        betaE = vec_state[nalpha:(nalpha + nactive)]
        cube = vec_state[(nalpha + nactive):]

        # signs = penalty.signs

        projected_alpha = alpha.copy()
        projected_betaE = betaE.copy()
        projected_cube = np.zeros_like(cube)

        projected_alpha = np.clip(alpha, 0, np.inf)

        for i in range(nactive):
            if projected_betaE[i] * signs[i] < 0:
                projected_betaE[i] = 0
        projected_cube = np.clip(cube, -1, 1)

        return np.concatenate((projected_alpha, projected_betaE, projected_cube), 0)

    null, alt = pval(init_vec_state, full_projection,
                     X, y,
                     obs_residuals, signs,
                     lam, epsilon,
                     nonzero, active)

    return null, alt
def test_lasso(s=1, n=100, p=10):

    X, y, _, nonzero, sigma = instance(n=n, p=p, random_signs=True, s=s,
                                       sigma=1., rho=0)
    print('sigma', sigma)
    lam_frac = 1.

    randomization = laplace(loc=0, scale=1.)
    loss = randomized.gaussian_Xfixed(X, y)

    random_Z = randomization.rvs(p)
    epsilon = 1.
    lam = sigma * lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal((n, 10000)))).max(0))

    random_Z = randomization.rvs(p)
    penalty = randomized.selective_l1norm_lan(p, lagrange=lam)

    # sampler1 = randomized.selective_sampler_MH_lan(loss,
    #                                                random_Z,
    #                                                epsilon,
    #                                                randomization,
    #                                                penalty)

    # loss_args = {'mean': np.zeros(n),
    #              'sigma': sigma,
    #              'linear_part': np.identity(y.shape[0]),
    #              'value': 0}

    # sampler1.setup_sampling(y, loss_args=loss_args)
    # data, opt_vars = sampler1.state

    # initial solution

    problem = rr.simple_problem(loss, penalty)
    random_term = rr.identity_quadratic(epsilon, 0, random_Z, 0)

    solve_args = {'tol': 1.e-10, 'min_its': 100, 'max_its': 500}
    initial_soln = problem.solve(random_term, **solve_args)
    initial_grad = loss.smooth_objective(initial_soln, mode='grad')
    betaE, cube = penalty.setup_sampling(initial_grad,
                                         initial_soln,
                                         random_Z,
                                         epsilon)

    data = y.copy()

    active = penalty.active_set
    if np.sum(active) == 0:
        print('here')
        return [-1], [-1]
    inactive = ~active

    # betaE, cube = opt_vars

    ndata = data.shape[0]
    nactive = betaE.shape[0]
    ninactive = cube.shape[0]

    init_vec_state = np.zeros(ndata + nactive + ninactive)
    init_vec_state[:ndata] = data
    init_vec_state[ndata:(ndata + nactive)] = betaE
    init_vec_state[(ndata + nactive):] = cube

    def bootstrap_samples(y, P, R):
        nsample = 50
        boot_samples = []
        for _ in range(nsample):
            indices = np.random.choice(n, size=(n,), replace=True)
            y_star = y[indices]
            boot_samples.append(np.dot(P, y) + np.dot(R, y_star - y))

        return boot_samples

    # boot_samples = bootstrap_samples(y)

    def move_data(vec_state, boot_samples,
                  ndata=ndata, nactive=nactive, ninactive=ninactive, loss=loss):

        weights = []

        betaE = vec_state[ndata:(ndata + nactive)]
        cube = vec_state[(ndata + nactive):]

        opt_vars = [betaE, cube]
        params, _, opt_vec = penalty.form_optimization_vector(opt_vars)
        # opt_vec = \epsilon(\beta 0) + u, u = \grad P(\beta), P penalty

        for i in range(len(boot_samples)):
            gradient = loss.gradient(boot_samples[i], params)
            weights.append(np.exp(-np.sum(np.abs(gradient + opt_vec))))

        # normalize to a probability vector (needs an array, not a list)
        weights = np.array(weights)
        weights /= np.sum(weights)

        # m = max(weights)
        # idx = [i for i, j in enumerate(weights) if j == m][0]
        idx = np.nonzero(np.random.multinomial(1, weights, size=1)[0])[0][0]
        return boot_samples[idx]

    def full_projection(vec_state, penalty=penalty,
                        ndata=ndata, nactive=nactive, ninactive=ninactive):

        data = vec_state[:ndata].copy()
        betaE = vec_state[ndata:(ndata + nactive)]
        cube = vec_state[(ndata + nactive):]

        signs = penalty.signs
        projected_betaE = betaE.copy()
        projected_cube = np.zeros_like(cube)

        for i in range(nactive):
            if projected_betaE[i] * signs[i] < 0:
                projected_betaE[i] = 0
        projected_cube = np.clip(cube, -1, 1)

        return np.concatenate((data, projected_betaE, projected_cube), 0)

    def full_gradient(vec_state, loss=loss, penalty=penalty, X=X,
                      lam=lam, epsilon=epsilon, ndata=ndata,
                      active=active, inactive=inactive):

        nactive = np.sum(active)
        ninactive = np.sum(inactive)

        data = vec_state[:ndata]
        betaE = vec_state[ndata:(ndata + nactive)]
        cube = vec_state[(ndata + nactive):]

        opt_vars = [betaE, cube]
        params, _, opt_vec = penalty.form_optimization_vector(opt_vars)
        # opt_vec = \epsilon(\beta 0) + u, u = \grad P(\beta), P penalty

        gradient = loss.gradient(data, params)
        hessian = loss.hessian()

        ndata = data.shape[0]
        nactive = betaE.shape[0]
        ninactive = cube.shape[0]

        sign_vec = -np.sign(gradient + opt_vec)  # sign(w), w = grad + \epsilon * beta + lambda * u

        B = hessian + epsilon * np.identity(nactive + ninactive)
        A = B[:, active]

        _gradient = np.zeros(ndata + nactive + ninactive)
        _gradient[:ndata] = 0  # - (data + np.dot(X, sign_vec))
        _gradient[ndata:(ndata + nactive)] = np.dot(A.T, sign_vec)
        _gradient[(ndata + nactive):] = lam * sign_vec[inactive]

        return _gradient

    null, alt = pval(init_vec_state, full_gradient, full_projection,
                     move_data, bootstrap_samples,
                     X, y, nonzero, active)

    return null, alt