import numpy as np
import regreg.api as rr

# The imports below assume the module layout of the `selection` package;
# adjust the paths if these helpers live elsewhere in your checkout.
from selection.tests.instance import gaussian_instance as instance
from selection.algorithms.sqrt_lasso import (choose_lambda,
                                             sqlasso_objective,
                                             sqlasso_objective_skinny,
                                             solve_sqrt_lasso_fat,
                                             solve_sqrt_lasso_skinny)
from selection.algorithms.lasso import (lasso,
                                        data_carving,
                                        data_splitting,
                                        glm_sandwich_estimator,
                                        glm_parametric_estimator,
                                        nominal_intervals)
from selection.algorithms.debiased_lasso import debiased_lasso_inference


def test_skinny_fat():

    # The "fat" and "skinny" formulations of the sqrt-LASSO should agree.
    X, Y = instance()[:2]
    n, p = X.shape
    lam = choose_lambda(X)
    obj1 = sqlasso_objective(X, Y)          # smoke test: construct both objectives
    obj2 = sqlasso_objective_skinny(X, Y)
    soln1 = solve_sqrt_lasso_fat(X, Y, weights=np.ones(p) * lam,
                                 solve_args={'min_its': 500})[0]
    soln2 = solve_sqrt_lasso_skinny(X, Y, weights=np.ones(p) * lam,
                                    solve_args={'min_its': 500})[0]
    np.testing.assert_allclose(soln1, soln2, rtol=1.e-3)

    # Repeat at a different dimension.
    X, Y = instance(p=50)[:2]
    n, p = X.shape
    lam = choose_lambda(X)
    obj1 = sqlasso_objective(X, Y)
    obj2 = sqlasso_objective_skinny(X, Y)
    soln1 = solve_sqrt_lasso_fat(X, Y, weights=np.ones(p) * lam,
                                 solve_args={'min_its': 500})[0]
    soln2 = solve_sqrt_lasso_skinny(X, Y, weights=np.ones(p) * lam,
                                    solve_args={'min_its': 500})[0]
    np.testing.assert_allclose(soln1, soln2, rtol=1.e-3)
def test_sqrt_lasso_sandwich_pvals(n=200,
                                   p=50,
                                   s=10,
                                   sigma=10,
                                   rho=0.3,
                                   signal=6.,
                                   use_lasso_sd=False):

    X, y, beta, true_active, sigma, _ = instance(n=n,
                                                 p=p,
                                                 s=s,
                                                 sigma=sigma,
                                                 rho=rho,
                                                 signal=signal)

    heteroscedastic_error = sigma * np.random.standard_normal(n) * (np.fabs(X[:, -1]) + 0.5)**2
    heteroscedastic_error += sigma * np.random.standard_normal(n) * (np.fabs(X[:, -2]) + 0.2)**2
    heteroscedastic_error += sigma * np.random.standard_normal(n) * (np.fabs(X[:, -3]) + 0.5)**2
    y += heteroscedastic_error

    feature_weights = np.ones(p) * choose_lambda(X)
    feature_weights[10:12] = 0

    L_SQ = lasso.sqrt_lasso(X, y, feature_weights, covariance='sandwich')
    L_SQ.fit()

    if set(true_active).issubset(L_SQ.active):
        S = L_SQ.summary('twosided')
        return S['pval'], [v in true_active for v in S['variable']]
def test_logistic_pvals(n=500,
                        p=200,
                        s=3,
                        sigma=2,
                        rho=0.3,
                        signal=7.):

    X, y, beta, true_active, sigma, _ = instance(n=n,
                                                 p=p,
                                                 s=s,
                                                 sigma=sigma,
                                                 rho=rho,
                                                 signal=signal)

    z = (y > 0)
    X = np.hstack([np.ones((n, 1)), X])

    # Shift the true active indices by one for the added intercept column,
    # and include the intercept itself in the true active set.
    true_active = [0] + list(np.asarray(true_active) + 1)

    # The intercept is unpenalized.
    L = lasso.logistic(X, z, [0] * 1 + [1.2] * p)
    L.fit()
    S = L.summary('onesided')

    if set(true_active).issubset(L.active):
        return S['pval'], [v in true_active for v in S['variable']]
def test_sqrt_lasso_pvals(n=100,
                          p=200,
                          s=7,
                          sigma=5,
                          rho=0.3,
                          signal=7.):

    X, y, beta, true_active, sigma, _ = instance(n=n,
                                                 p=p,
                                                 s=s,
                                                 sigma=sigma,
                                                 rho=rho,
                                                 signal=signal)

    # Monte Carlo estimate of the theoretical choice of lambda.
    lam_theor = np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000)))).max(0)) / np.sqrt(n)
    Q = rr.identity_quadratic(0.01, 0, np.ones(p), 0)

    # Leave some columns unpenalized.
    weights_with_zeros = 0.7 * lam_theor * np.ones(p)
    weights_with_zeros[:3] = 0.

    # Smoke test: construction with a parametric covariance estimator.
    lasso.sqrt_lasso(X, y, weights_with_zeros, covariance='parametric')

    L = lasso.sqrt_lasso(X, y, weights_with_zeros)
    L.fit()

    if set(true_active).issubset(L.active):
        S = L.summary('onesided')  # smoke test for one-sided p-values
        S = L.summary('twosided')
        return S['pval'], [v in true_active for v in S['variable']]
def test_gaussian_sandwich_pvals(n=200,
                                 p=50,
                                 s=10,
                                 sigma=10,
                                 rho=0.3,
                                 signal=6.,
                                 use_lasso_sd=False):

    X, y, beta, true_active, sigma, _ = instance(n=n,
                                                 p=p,
                                                 s=s,
                                                 sigma=sigma,
                                                 rho=rho,
                                                 signal=signal)

    heteroscedastic_error = sigma * np.random.standard_normal(n) * (np.fabs(X[:, -1]) + 0.5)**2
    heteroscedastic_error += sigma * np.random.standard_normal(n) * (np.fabs(X[:, -2]) + 0.2)**2
    heteroscedastic_error += sigma * np.random.standard_normal(n) * (np.fabs(X[:, -3]) + 0.5)**2
    y += heteroscedastic_error

    # two different estimators of variance
    loss = rr.glm.gaussian(X, y)
    sandwich = glm_sandwich_estimator(loss, B=5000)

    # make sure things work with some unpenalized columns
    feature_weights = np.ones(p) * 3 * sigma
    feature_weights[10:12] = 0

    # try using RSS from LASSO to estimate sigma
    if use_lasso_sd:
        L_prelim = lasso.gaussian(X, y, feature_weights)
        L_prelim.fit()
        beta_lasso = L_prelim.lasso_solution
        sigma_hat = np.linalg.norm(y - X.dot(beta_lasso)) / np.sqrt(n - len(L_prelim.active))
        parametric = glm_parametric_estimator(loss, dispersion=sigma_hat**2)
    else:
        parametric = glm_parametric_estimator(loss, dispersion=None)

    L_P = lasso.gaussian(X, y, feature_weights, covariance_estimator=parametric)
    L_P.fit()

    if set(true_active).issubset(L_P.active):

        S = L_P.summary('twosided')
        P_P = [p for p, v in zip(S['pval'], S['variable']) if v not in true_active]

        L_S = lasso.gaussian(X, y, feature_weights, covariance_estimator=sandwich)
        L_S.fit()
        S = L_S.summary('twosided')
        P_S = [p for p, v in zip(S['pval'], S['variable']) if v not in true_active]

        return P_P, P_S, [v in true_active for v in S['variable']]
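# Hedged usage sketch (illustrative, not part of the original suite): with
# heteroscedastic errors, the sandwich estimator's null p-values should be
# better calibrated than the parametric estimator's. Assumes scipy is
# available; `n_sim` is an arbitrary illustrative choice.
def _compare_sandwich_parametric(n_sim=20):
    from scipy.stats import kstest
    P_param, P_sand = [], []
    for _ in range(n_sim):
        result = test_gaussian_sandwich_pvals()
        if result is not None:  # screening succeeded
            P_P, P_S, _ = result
            P_param.extend(P_P)
            P_sand.extend(P_S)
    # A smaller KS statistic against Uniform(0, 1) indicates better calibration.
    return kstest(P_param, 'uniform'), kstest(P_sand, 'uniform')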
def test_intervals(n=100, p=20, s=5):

    X, y, beta, true_active, sigma, _ = instance(n=n, p=p, s=s)
    las = lasso.gaussian(X, y, 4., sigma=sigma)
    las.fit()

    # smoke test
    las.soln
    las.constraints
    S = las.summary(compute_intervals=True)
    nominal_intervals(las)
def test_gaussian_pvals(n=100,
                        p=500,
                        s=7,
                        sigma=5,
                        rho=0.3,
                        signal=8.):

    X, y, beta, true_active, sigma, _ = instance(n=n,
                                                 p=p,
                                                 s=s,
                                                 sigma=sigma,
                                                 rho=rho,
                                                 signal=signal)

    L = lasso.gaussian(X, y, 20., sigma=sigma)
    L.fit()
    L.fit(L.lasso_solution)  # smoke test: warm start from the previous solution

    if set(true_active).issubset(L.active):
        S = L.summary('onesided')  # smoke test for one-sided p-values
        S = L.summary('twosided')
        return S['pval'], [v in true_active for v in S['variable']]
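# Hedged usage sketch (illustrative, not part of the original suite): when
# screening succeeds, selective p-values for the truly inactive variables
# should be roughly Uniform(0, 1). Assumes scipy is available; `n_sim` is an
# arbitrary illustrative choice.
def _null_pval_uniformity(n_sim=50):
    from scipy.stats import kstest
    null_pvals = []
    for _ in range(n_sim):
        result = test_gaussian_pvals()
        if result is not None:  # screening succeeded
            pvals, is_true = result
            null_pvals.extend(p for p, t in zip(pvals, is_true) if not t)
    return kstest(null_pvals, 'uniform')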
def test_adding_quadratic_lasso():

    X, y, beta, true_active, sigma, _ = instance(n=300, p=200)

    # A quadratic term in the objective should show up in the KKT conditions.
    Q = rr.identity_quadratic(0.01, 0, np.random.standard_normal(X.shape[1]), 0)
    L1 = lasso.gaussian(X, y, 20, quadratic=Q)
    beta1 = L1.fit(solve_args={'min_its': 500, 'tol': 1.e-12})
    G1 = X[:, L1.active].T.dot(X.dot(beta1) - y) + Q.objective(beta1, 'grad')[L1.active]
    np.testing.assert_allclose(G1 * np.sign(beta1[L1.active]), -20)

    # Same check with a purely linear term.
    lin = rr.identity_quadratic(0.0, 0, np.random.standard_normal(X.shape[1]), 0)
    L2 = lasso.gaussian(X, y, 20, quadratic=lin)
    beta2 = L2.fit(solve_args={'min_its': 500, 'tol': 1.e-12})
    G2 = X[:, L2.active].T.dot(X.dot(beta2) - y) + lin.objective(beta2, 'grad')[L2.active]
    np.testing.assert_allclose(G2 * np.sign(beta2[L2.active]), -20)
def test_gaussian(n=100, p=20):

    X, y, beta = instance(n=n, p=p, sigma=1.)[:3]

    lam_theor = np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000)))).max(0))
    weights = 1.1 * lam_theor * np.ones(p)
    weights[:3] = 0.

    L = lasso.gaussian(X, y, weights, sigma=1.)
    L.ignore_inactive_constraints = True
    L.fit()

    print(debiased_lasso_inference(L, L.active, np.sqrt(2 * np.log(p) / n)))
    print(beta)
def test_data_carving_poisson(n=500,
                              p=300,
                              s=5,
                              sigma=5,
                              rho=0.3,
                              signal=12.,
                              split_frac=0.8,
                              lam_frac=1.2,
                              ndraw=8000,
                              burnin=2000,
                              df=np.inf,
                              compute_intervals=True,
                              use_full_cov=True,
                              return_only_screening=True):

    X, y, beta, true_active, sigma, _ = instance(n=n,
                                                 p=p,
                                                 s=s,
                                                 sigma=sigma,
                                                 rho=rho,
                                                 signal=signal,
                                                 df=df)
    X = np.hstack([np.ones((n, 1)), X])

    # Replace the response with Poisson counts; only the intercept is truly active.
    y = np.random.poisson(10, size=y.shape)
    s = 1
    true_active = [0]

    idx = np.arange(n)
    np.random.shuffle(idx)
    stage_one = idx[:int(n * split_frac)]
    n1 = len(stage_one)

    lam_theor = 3. * np.ones(p + 1)
    lam_theor[0] = 0.  # do not penalize the intercept

    DC = data_carving.poisson(X, y,
                              feature_weights=lam_theor,
                              stage_one=stage_one)
    DC.fit()

    if len(DC.active) < n - int(n * split_frac):
        DS = data_splitting.poisson(X, y,
                                    feature_weights=lam_theor,
                                    stage_one=stage_one)
        DS.fit(use_full_cov=True)
        data_split = True
    else:
        print('not enough data for data splitting second stage')
        print(DC.active)
        data_split = False

    print(DC.active)
    if set(true_active).issubset(DC.active):

        carve = []
        split = []
        for var in DC.active:
            carve.append(DC.hypothesis_test(var, burnin=burnin, ndraw=ndraw))
            if data_split:
                split.append(DS.hypothesis_test(var))
            else:
                split.append(np.random.sample())  # placeholder p-value

        Xa = X[:, DC.active]

        active = np.zeros(p + 1, bool)
        active[true_active] = 1
        v = (carve, split, active)
        return v
def test_data_carving_sqrt_lasso(n=200,
                                 p=200,
                                 s=7,
                                 sigma=5,
                                 rho=0.3,
                                 signal=7.,
                                 split_frac=0.9,
                                 lam_frac=1.2,
                                 ndraw=8000,
                                 burnin=2000,
                                 df=np.inf,
                                 compute_intervals=True,
                                 return_only_screening=True):

    X, y, beta, true_active, sigma, _ = instance(n=n,
                                                 p=p,
                                                 s=s,
                                                 sigma=sigma,
                                                 rho=rho,
                                                 signal=signal,
                                                 df=df)
    mu = np.dot(X, beta)

    idx = np.arange(n)
    np.random.shuffle(idx)
    stage_one = idx[:int(n * split_frac)]
    n1 = len(stage_one)

    # Monte Carlo estimate of the theoretical lambda on the first stage.
    lam_theor = lam_frac * np.mean(np.fabs(np.dot(X[stage_one].T,
                                                  np.random.standard_normal((n1, 5000)))).max(0)) / np.sqrt(n1)

    DC = data_carving.sqrt_lasso(X, y,
                                 feature_weights=lam_theor,
                                 stage_one=stage_one)
    DC.fit()

    if len(DC.active) < n - int(n * split_frac):
        DS = data_splitting.sqrt_lasso(X, y,
                                       feature_weights=lam_theor,
                                       stage_one=stage_one)
        DS.fit(use_full_cov=True)
        data_split = True
    else:
        print('not enough data for second stage data splitting')
        print(DC.active)
        data_split = False

    if set(true_active).issubset(DC.active):

        carve = []
        split = []
        for var in DC.active:
            carve.append(DC.hypothesis_test(var, burnin=burnin, ndraw=ndraw))
            if data_split:
                split.append(DS.hypothesis_test(var))
            else:
                split.append(np.random.sample())  # placeholder p-value

        Xa = X[:, DC.active]
        truth = np.dot(np.linalg.pinv(Xa), mu)

        active = np.zeros(p, bool)
        active[true_active] = 1
        v = (carve, split, active)
        return v
def completion_index(selected, active_set):
    """
    Find the first step at which a sequence of selected
    variables contains the full active set.

    Parameters
    ----------

    selected : []
        Sequence of selected variables.

    active_set : set
        Set of active variables.

    Returns
    -------

    idx : int
        Completion index.

    >>> selected = [1,3,2,4,6,7,8,23,11,5]
    >>> active = [1,4,8]
    >>> completion_index(selected, active)
    6
    """
    active_set = set(active_set)
    for i in range(len(selected)):
        if active_set.issubset(selected[:i]):
            return i - 1
    return len(selected) - 1


if __name__ == "__main__":

    from selection.tests.instance import gaussian_instance as instance

    X, y, beta, active, sigma, _ = instance(n=100, p=40, signal=0, rho=0.3)
    R, FS = compute_pvalues(y, X, sigma=sigma, maxstep=20)
    print(R)
    print(completion_index(R['variable_selected'], active))
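# Hedged usage sketch (illustrative): averaging the completion index over
# simulated instances gauges how many forward steps are typically needed to
# capture the full active set. Assumes `compute_pvalues` from the surrounding
# forward-stepwise module is in scope; `n_sim` and the instance settings
# below are arbitrary illustrative choices.
def _mean_completion_index(n_sim=10):
    indices = []
    for _ in range(n_sim):
        X, y, beta, active, sigma, _ = instance(n=100, p=40, signal=4., rho=0.3)
        R, FS = compute_pvalues(y, X, sigma=sigma, maxstep=20)
        indices.append(completion_index(R['variable_selected'], active))
    return np.mean(indices)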
def test_data_carving_logistic(n=700,
                               p=300,
                               s=5,
                               sigma=5,
                               rho=0.05,
                               signal=4.,
                               split_frac=0.8,
                               ndraw=8000,
                               burnin=2000,
                               df=np.inf,
                               compute_intervals=True,
                               use_full_cov=False,
                               return_only_screening=True):

    X, y, beta, true_active, sigma, _ = instance(n=n,
                                                 p=p,
                                                 s=s,
                                                 sigma=sigma,
                                                 rho=rho,
                                                 signal=signal,
                                                 df=df)

    mu = X.dot(beta)
    prob = np.exp(mu) / (1 + np.exp(mu))

    X = np.hstack([np.ones((n, 1)), X])
    z = np.random.binomial(1, prob)

    # Shift the true active indices by one for the added intercept column,
    # and include the intercept itself in the true active set.
    true_active = [0] + list(np.asarray(true_active) + 1)
    s += 1

    idx = np.arange(n)
    np.random.shuffle(idx)
    stage_one = idx[:int(n * split_frac)]
    n1 = len(stage_one)

    lam_theor = 1.0 * np.ones(p + 1)
    lam_theor[0] = 0.  # do not penalize the intercept

    DC = data_carving.logistic(X, z,
                               feature_weights=lam_theor,
                               stage_one=stage_one)
    DC.fit()

    if len(DC.active) < n - int(n * split_frac):
        DS = data_splitting.logistic(X, z,
                                     feature_weights=lam_theor,
                                     stage_one=stage_one)
        DS.fit(use_full_cov=True)
        data_split = True
    else:
        print('not enough data for data splitting second stage')
        print(DC.active)
        data_split = False

    if set(true_active).issubset(DC.active):

        carve = []
        split = []
        for var in DC.active:
            carve.append(DC.hypothesis_test(var, burnin=burnin, ndraw=ndraw))
            if data_split:
                split.append(DS.hypothesis_test(var))
            else:
                split.append(np.random.sample())  # placeholder p-value

        Xa = X[:, DC.active]

        active = np.zeros(p + 1, bool)
        active[true_active] = 1
        v = (carve, split, active)
        return v
def test_data_carving_gaussian(n=200,
                               p=200,
                               s=7,
                               sigma=5,
                               rho=0.3,
                               signal=7.,
                               split_frac=0.8,
                               lam_frac=2.,
                               ndraw=8000,
                               burnin=2000,
                               df=np.inf,
                               compute_intervals=True,
                               use_full_cov=True,
                               return_only_screening=True):

    X, y, beta, true_active, sigma, _ = instance(n=n,
                                                 p=p,
                                                 s=s,
                                                 sigma=sigma,
                                                 rho=rho,
                                                 signal=signal,
                                                 df=df)
    mu = np.dot(X, beta)

    idx = np.arange(n)
    np.random.shuffle(idx)
    stage_one = idx[:int(n * split_frac)]

    lam_theor = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 5000)))).max(0)) * sigma

    DC = data_carving.gaussian(X, y,
                               feature_weights=lam_theor,
                               sigma=sigma,
                               stage_one=stage_one)
    DC.fit()

    if len(DC.active) < n - int(n * split_frac):
        DS = data_splitting.gaussian(X, y,
                                     feature_weights=lam_theor,
                                     sigma=sigma,
                                     stage_one=stage_one)
        DS.fit(use_full_cov=True)   # smoke test both covariance code paths
        DS.fit(use_full_cov=False)
        DS.fit(use_full_cov=use_full_cov)
        data_split = True
    else:
        print('not enough data for second stage data splitting')
        print(DC.active)
        data_split = False

    if set(true_active).issubset(DC.active):

        carve = []
        split = []
        for var in DC.active:
            carve.append(DC.hypothesis_test(var, burnin=burnin, ndraw=ndraw))
            if data_split:
                split.append(DS.hypothesis_test(var))
            else:
                # appropriate p-value if data splitting can't estimate 2nd stage
                split.append(np.random.sample())

        Xa = X[:, DC.active]
        truth = np.dot(np.linalg.pinv(Xa), mu)

        active = np.zeros(p, bool)
        active[true_active] = 1
        v = (carve, split, active)
        return v
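# Hedged usage sketch (illustrative, not part of the original suite): since
# carving reuses the selection-stage data for inference, its p-values on the
# screened active set are typically smaller on average than data splitting's.
# Assumes `hypothesis_test` returns a scalar p-value; `n_sim` is arbitrary.
def _carve_vs_split(n_sim=10):
    carve_all, split_all = [], []
    for _ in range(n_sim):
        result = test_data_carving_gaussian()
        if result is not None:  # screening succeeded
            carve, split, _ = result
            carve_all.extend(carve)
            split_all.extend(split)
    return np.mean(carve_all), np.mean(split_all)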