def test_group_lasso_weightedl1_lagrange():

    n, p = 100, 50
    X = np.random.standard_normal((n, p))
    Y = np.random.standard_normal(n)

    loss = rr.glm.gaussian(X, Y)
    weights = np.ones(p)
    weights[-2:] = np.inf
    weights[:2] = 0
    weight_dict = dict([(i, w) for i, w in enumerate(weights)])
    pen1 = rr.weighted_l1norm(weights, lagrange=0.5 * np.sqrt(n))
    pen2 = rr.group_lasso(np.arange(p), weights=weight_dict, lagrange=0.5 * np.sqrt(n))

    problem1 = rr.simple_problem(loss, pen1)
    problem2 = rr.simple_problem(loss, pen2)

    beta1 = problem1.solve(tol=1.e-14, min_its=500)
    beta2 = problem2.solve(tol=1.e-14, min_its=500)

    npt.assert_allclose(beta1, beta2)

    bound_val = pen1.seminorm(beta1, lagrange=1)
    bound1 = rr.weighted_l1norm(weights, bound=bound_val)
    bound2 = rr.group_lasso(np.arange(p), weights=weight_dict, bound=bound_val)

    problem3 = rr.simple_problem(loss, bound1)
    problem4 = rr.simple_problem(loss, bound2)

    beta3 = problem3.solve(tol=1.e-14, min_its=500)
    beta4 = problem4.solve(tol=1.e-14, min_its=500)

    npt.assert_allclose(beta3, beta4)
    npt.assert_allclose(beta3, beta1)
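# test_group_lasso_weightedl1_lagrange above and test_group_lasso_weightedl1_bound
# below both rely on the fact that with one feature per group the group-lasso
# penalty reduces to a weighted l1 norm, so the two atoms should already agree at
# the level of their proximal maps, not just at the solutions of a fitted problem.
# A minimal sketch of that check (hedged: it assumes both atoms expose
# `lagrange_prox`, as the group-lasso atom does in test_group_lasso_equivalent
# below, and uses finite, positive weights only):
def _singleton_group_prox_sketch():
    p = 5
    w = np.linspace(0.5, 2.5, p)
    pen_l1 = rr.weighted_l1norm(w, lagrange=0.3)
    pen_gl = rr.group_lasso(np.arange(p), weights=dict(enumerate(w)), lagrange=0.3)
    Z = np.random.standard_normal(p)
    # weighted soft-thresholding and the singleton-group prox should coincide coordinatewise
    np.testing.assert_allclose(pen_l1.lagrange_prox(Z), pen_gl.lagrange_prox(Z))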
def test_multiple_queries_individual_coeff(ndraw=10000, burnin=2000):

    s, n, p = 3, 120, 10

    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0, snr=5)

    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.)

    view = []
    nview = 5
    for i in range(nview):
        view.append(glm_group_lasso(loss, epsilon, penalty, randomizer))

    mv = multiple_queries(view)
    mv.solve()

    active_union = np.zeros(p, np.bool)
    for i in range(nview):
        active_union += view[i].selection_variable['variables']

    nactive = np.sum(active_union)
    print("nactive", nactive)

    active_set = np.nonzero(active_union)[0]

    pvalues = []
    true_beta = beta[active_union]

    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        for j in range(nactive):

            subset = np.zeros(p, np.bool)
            subset[active_set[j]] = True
            target_sampler, target_observed = glm_target(loss,
                                                         active_union * ~subset,
                                                         mv,
                                                         subset=subset,
                                                         reference=np.zeros((1,)))

            test_stat = lambda x: np.atleast_1d(x)

            pval = target_sampler.hypothesis_test(test_stat,
                                                  np.atleast_1d(target_observed - true_beta[j]),
                                                  alternative='twosided',
                                                  ndraw=ndraw,
                                                  burnin=burnin)
            pvalues.append(pval)

        active_var = np.zeros_like(pvalues, np.bool)
        _nonzero = np.array([i in nonzero for i in active_set])
        active_var[_nonzero] = True

        return pvalues, [active_set[j] in nonzero for j in range(nactive)]
def test_parametric_covariance(ndraw=10000, burnin=2000):

    s, n, p = 3, 120, 10

    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0, signal=12)

    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.)

    # first randomization
    M_est1 = glm_group_lasso_parametric(loss, epsilon, penalty, randomizer)
    # second randomization
    M_est2 = glm_group_lasso_parametric(loss, epsilon, penalty, randomizer)

    mv = multiple_queries([M_est1, M_est2])
    mv.solve()

    target = M_est1.selection_variable['variables'].copy()
    if target[-1] or M_est2.selection_variable['variables'][-1]:
        return None
    if target[-2] or M_est2.selection_variable['variables'][-2]:
        return None

    # we should check they are different sizes
    target[-2:] = 1

    if set(nonzero).issubset(np.nonzero(target)[0]):

        form_covariances = glm_parametric_covariance(loss)
        mv.setup_sampler(form_covariances)

        target_observed = restricted_Mest(loss, target)
        linear_func = np.zeros((2, target_observed.shape[0]))
        linear_func[0, -1] = 1.  # we know this one is null
        linear_func[1, -2] = 1.  # also null

        target_observed = linear_func.dot(target_observed)
        target_sampler = mv.setup_target((target, linear_func), target_observed, parametric=True)

        test_stat = lambda x: np.linalg.norm(x)
        pval = target_sampler.hypothesis_test(test_stat,
                                              test_stat(target_observed),
                                              alternative='greater',
                                              ndraw=ndraw,
                                              burnin=burnin)

        return [pval], [False]
def data_splitting_screening(frac=0.5, snr=15, s=5, n=200, p=20, rho=0.1):

    count = 0
    while True:
        count += 1
        X, y, beta, _ = generate_data(n=n, p=p, s=s, rho=rho, snr=snr)
        n2 = int(frac * n)
        X = X[:n2]
        y = y[:n2]

        nonzero = np.where(beta)[0]
        lam_frac = 1.

        loss = rr.glm.logistic(X, y)
        epsilon = 1. / np.sqrt(n2)

        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n2, 10000)))).max(0))
        W = np.ones(p) * lam
        penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.)

        problem = rr.simple_problem(loss, penalty)
        quadratic = rr.identity_quadratic(epsilon, 0, 0, 0)
        soln = problem.solve(quadratic)

        active_set = np.nonzero(soln != 0)[0]
        if set(nonzero).issubset(active_set):
            return count
def __init__(self,
             loglike,
             groups,
             weights,
             ridge_term,
             randomizer,
             use_lasso=True,  # should lasso solver be used where applicable - defaults to True
             perturb=None):

    _check_groups(groups)  # make sure groups looks sensible

    # log likelihood : quadratic loss
    self.loglike = loglike
    self.nfeature = self.loglike.shape[0]

    # ridge parameter
    self.ridge_term = ridge_term

    # group lasso penalty (from regreg)
    # use regular lasso penalty if all groups are size 1
    if use_lasso and groups.size == np.unique(groups).size:
        # need to provide weights as an np.array rather than a dictionary
        weights_np = np.array([w[1] for w in sorted(weights.items())])
        self.penalty = rr.weighted_l1norm(weights=weights_np, lagrange=1.)
    else:
        self.penalty = rr.group_lasso(groups, weights=weights, lagrange=1.)

    # store groups as a class variable since the non-group lasso doesn't
    self.groups = groups

    self._initial_omega = perturb

    # gaussian randomization
    self.randomizer = randomizer
def randomization_screening(scale=1., snr=15, s=5, n=200, p=20, rho=0.1):

    count = 0
    randomizer = randomization.laplace((p,), scale=scale)
    while True:
        count += 1
        X, y, beta, _ = generate_data(n=n, p=p, s=s, rho=rho, snr=snr)

        nonzero = np.where(beta)[0]
        lam_frac = 1.

        loss = rr.glm.logistic(X, y)
        epsilon = 1. / np.sqrt(n)

        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
        W = np.ones(p) * lam
        penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.)

        M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)
        M_est.solve()

        active_set = np.nonzero(M_est.initial_soln != 0)[0]
        if set(nonzero).issubset(active_set):
            return count
def selection_nonrandomized(X, y, sigma=None, method="theoretical"):

    n, p = X.shape
    loss = rr.glm.gaussian(X, y)
    epsilon = 1. / np.sqrt(n)
    lam_frac = 1.
    if sigma is None:
        sigma = 1.
    if method == "theoretical":
        lam = 1. * sigma * lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 10000)))).max(0))

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.)

    # initial solution
    problem = rr.simple_problem(loss, penalty)
    random_term = rr.identity_quadratic(epsilon, 0, 0, 0)

    solve_args = {'tol': 1.e-10, 'min_its': 100, 'max_its': 500}
    initial_soln = problem.solve(random_term, **solve_args)

    active = (initial_soln != 0)
    if np.sum(active) == 0:
        return None
    initial_grad = loss.smooth_objective(initial_soln, mode='grad')
    betaE = initial_soln[active]
    subgradient = -(initial_grad + epsilon * initial_soln)
    cube = subgradient[~active] / lam
    return lam, epsilon, active, betaE, cube, initial_soln
def test_reconstruction(s=3, n=200, p=50, snr=7, rho=0.1,
                        split_frac=0.8,
                        lam_frac=0.7,
                        ndraw=100, burnin=200, bootstrap=True,
                        solve_args={'min_its': 50, 'tol': 1.e-10},
                        reference_known=False):

    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)

    m = int(split_frac * n)
    nonzero = np.where(beta)[0]

    loss = rr.glm.logistic(X, y)
    epsilon = 1. / np.sqrt(n)

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 2000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.)

    M_est = split_glm_group_lasso(loss, epsilon, m, penalty)
    mv = multiple_queries([M_est])
    mv.solve()

    M_est.selection_variable['variables'] = M_est.selection_variable['variables']
    nactive = np.sum(M_est.selection_variable['variables'])

    if nactive == 0:
        return None

    if set(nonzero).issubset(np.nonzero(M_est.selection_variable['variables'])[0]):

        active_set = np.nonzero(M_est.selection_variable['variables'])[0]

        target_sampler, target_observed = glm_target(loss, M_est.selection_variable['variables'], mv)

        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin, keep_opt=True)

        reconstruction = target_sampler.reconstruction_map(target_sample)
        logdens = target_sampler.log_randomization_density(target_sample)
        return logdens.shape
def test_multiple_queries_individual_coeff_small(ndraw=10000, burnin=2000, bootstrap=True):

    s, n, p = 3, 100, 20

    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0, snr=20.)

    nonzero = np.where(beta)[0]
    lam_frac = 3.

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.)

    # randomization
    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)
    mv = multiple_queries([M_est])
    mv.solve()

    active_vars = M_est.selection_variable['variables']
    nactive = np.sum(active_vars)
    active_set = np.nonzero(active_vars)[0]

    pvalues = []
    true_beta = beta[active_vars]

    print(nonzero, active_set)
    if set(nonzero).issubset(active_set):

        for j in range(nactive):
            print(j)
            subset = np.zeros(p, np.bool)
            subset[active_set[j]] = True
            target_sampler, target_observed = glm_target(loss,
                                                         active_vars,
                                                         mv,
                                                         subset=subset,
                                                         bootstrap=bootstrap,
                                                         reference=np.zeros((1,)))

            test_stat = lambda x: x

            pval = target_sampler.hypothesis_test(test_stat,
                                                  target_observed,
                                                  alternative='twosided',
                                                  ndraw=ndraw,
                                                  burnin=burnin)
            pvalues.append(pval)

        return pvalues, [active_set[j] in nonzero for j in range(nactive)]
def test_approximate_mle(n=100, p=10, s=3, snr=5, rho=0.1, lam_frac=1.,
                         loss='gaussian', randomizer='gaussian'):

    from selection.api import randomization

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho, snr=snr, sigma=1.)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    epsilon = 1. / np.sqrt(n)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.)

    if randomizer == 'gaussian':
        randomization = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer == 'laplace':
        randomization = randomization.laplace((p,), scale=1.)

    M_est = M_estimator_approx(loss, epsilon, penalty, randomization, randomizer)
    M_est.solve_approx()

    inf = approximate_conditional_density(M_est)
    inf.solve_approx()

    active = M_est._overall
    active_set = np.asarray([i for i in range(p) if active[i]])
    true_support = np.asarray([i for i in range(p) if i < s])
    nactive = np.sum(active)
    print("active set, true_support", active_set, true_support)
    true_vec = beta[active]
    print("true coefficients", true_vec)

    if (set(active_set).intersection(set(true_support)) == set(true_support)) == True:

        mle_active = np.zeros(nactive)
        for j in range(nactive):
            mle_active[j] = inf.approx_MLE_solver(j, nstep=100)[0]

        print("mle for target", mle_active)
def test_selection():
    n = 500
    p = 100
    s = 0
    signal = 0.

    np.random.seed(3)  # ensures different y
    X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, sigma=1., rho=0, signal=signal)
    lam = 1. * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma

    n, p = X.shape

    loss = rr.glm.gaussian(X, y)
    epsilon = 1. / np.sqrt(n)
    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.)

    randomizer = randomization.isotropic_gaussian((p,), scale=1.)

    M_est = M_estimator_approx(loss, epsilon, penalty, randomizer, 'gaussian', 'parametric')
    M_est.solve_approx()
    active = M_est._overall
    active_set = np.asarray([i for i in range(p) if active[i]])
    nactive = np.sum(active)

    prior_variance = 1000.
    noise_variance = sigma ** 2

    generative_mean = np.zeros(p)
    generative_mean[:nactive] = M_est.initial_soln[active]
    sel_split = selection_probability_random_lasso(M_est, generative_mean)
    min = sel_split.minimize2(nstep=200)
    print(min[0], min[1])

    test_point = np.append(M_est.observed_score_state, np.abs(M_est.initial_soln[M_est._overall]))
    print("value of likelihood", sel_split.likelihood_loss.smooth_objective(test_point, mode="func"))

    inv_cov = np.linalg.inv(M_est.score_cov)
    lik = (M_est.observed_score_state - generative_mean).T.dot(inv_cov).dot(M_est.observed_score_state - generative_mean) / 2.
    print("value of likelihood check", lik)

    grad = inv_cov.dot(M_est.observed_score_state - generative_mean)
    print("grad at likelihood loss", grad)
def test_path_group_lasso():
    '''
    this test looks at the paths of three different parameterizations
    of the same problem
    '''
    n = 100
    X = np.random.standard_normal((n, 10))
    U = np.random.standard_normal((n, 2))
    Y = np.random.standard_normal(100)
    betaX = np.array([3, 4, 5, 0, 0] + [0] * 5)
    betaU = np.array([10, -5])
    Y += (np.dot(X, betaX) + np.dot(U, betaU)) * 5

    Xn = rr.normalize(np.hstack([np.ones((100, 1)), X]),
                      inplace=True,
                      center=True,
                      scale=True,
                      intercept_column=0).normalized_array()

    lasso = rr.lasso.squared_error(Xn[:, 1:], Y,
                                   penalty_structure=[0] * 7 + [1] * 3,
                                   nstep=10)

    sol = lasso.main(inner_tol=1.e-12, verbose=True)
    beta = np.array(sol['beta'].todense())

    sols = []
    sols_sep = []
    for l in sol['lagrange']:
        loss = rr.squared_error(Xn, Y, coef=1. / n)
        penalty = rr.group_lasso([rr.UNPENALIZED] + [0] * 7 + [1] * 3, l)  # matrix contains an intercept...
        problem = rr.simple_problem(loss, penalty)
        sols.append(problem.solve(tol=1.e-12).copy())

        sep = rr.separable((11,),
                           [rr.l2norm((7,), np.sqrt(7) * l),
                            rr.l2norm((3,), np.sqrt(3) * l)],
                           [np.arange(1, 8), np.arange(8, 11)])
        sep_problem = rr.simple_problem(loss, sep)
        sols_sep.append(sep_problem.solve(tol=1.e-12).copy())

    sols = np.array(sols).T
    sols_sep = np.array(sols_sep).T

    nt.assert_true(np.linalg.norm(beta - sols) / (1 + np.linalg.norm(beta)) <= 1.e-4)
    nt.assert_true(np.linalg.norm(beta - sols_sep) / (1 + np.linalg.norm(beta)) <= 1.e-4)
def test_multiple_views():
    s, n, p = 5, 200, 20

    randomizer = randomization.laplace((p,), scale=0.5)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0.1, snr=7)

    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.)

    # first randomization
    M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    # second randomization
    M_est2 = glm_group_lasso(loss, epsilon, penalty, randomizer)

    mv = multiple_views([M_est1, M_est2])
    mv.solve()

    active = M_est1.overall + M_est2.overall

    if set(nonzero).issubset(np.nonzero(active)[0]):

        active_set = np.nonzero(active)[0]
        inactive_selected = I = [i for i in np.arange(active_set.shape[0]) if active_set[i] not in nonzero]

        boot_target, target_observed = pairs_bootstrap_glm(loss, active)
        inactive_target = lambda indices: boot_target(indices)[inactive_selected]
        inactive_observed = target_observed[inactive_selected]

        sampler = lambda: np.random.choice(n, size=(n,), replace=True)

        mv.setup_sampler(sampler)
        target_sampler = mv.setup_target(inactive_target, inactive_observed)

        test_stat = lambda x: np.linalg.norm(x)
        pval = target_sampler.hypothesis_test(test_stat, inactive_observed, alternative='greater')
        return pval
def __init__(self,
             loglike,
             groups,
             weights,
             ridge_term,
             randomizer,
             perturb=None):
    r"""
    Create a new post-selection object for the LASSO problem

    Parameters
    ----------

    loglike : `regreg.smooth.glm.glm`
        A (negative) log-likelihood as implemented in `regreg`.

    groups : np.ndarray
        Group labels, one per feature.

    weights : dict
        Weight for the norm of each group, keyed by group label.

    ridge_term : float
        How big a ridge term to add?

    randomizer : object
        Randomizer -- contains representation of randomization density.

    perturb : np.ndarray
        Random perturbation subtracted as a linear
        term in the objective function.
    """

    self.loglike = loglike
    self.nfeature = p = self.loglike.shape[0]

    self.ridge_term = ridge_term
    self.penalty = rr.group_lasso(groups, weights=weights, lagrange=1.)
    self._initial_omega = perturb  # random perturbation

    self.randomizer = randomizer
def test_group_lasso_weightedl1_bound():

    n, p = 100, 50
    X = np.random.standard_normal((n, p))
    Y = np.random.standard_normal(n)

    loss = rr.glm.gaussian(X, Y)
    weights = np.ones(p)
    weights[-2:] = np.inf
    weights[:2] = 0
    weight_dict = dict([(i, w) for i, w in enumerate(weights)])
    bound1 = rr.weighted_l1norm(weights, bound=2)
    bound2 = rr.group_lasso(np.arange(p), weights=weight_dict, bound=2)

    problem1 = rr.simple_problem(loss, bound1)
    problem2 = rr.simple_problem(loss, bound2)

    beta1 = problem1.solve(tol=1.e-14, min_its=500)
    beta2 = problem2.solve(tol=1.e-14, min_its=500)

    npt.assert_allclose(beta1, beta2)
def test_group_lasso_equivalent():
    """
    with 0 as lasso weights should be group lasso
    """
    pen1 = sparse_group_lasso([1, 1, 2, 2, 2],
                              np.zeros(5),
                              weights={1: 0.2, 2: 0.1},
                              lagrange=0.4)
    pen2 = rr.group_lasso([1, 1, 2, 2, 2], {1: 0.2, 2: 0.1}, lagrange=0.4)

    Z = np.array([3, 2, 4, 6, 7])
    np.testing.assert_allclose(pen1.lagrange_prox(Z), pen2.lagrange_prox(Z))

    Z = np.random.standard_normal(5) * 100
    np.testing.assert_allclose(pen1.lagrange_prox(Z), pen2.lagrange_prox(Z))

    dual1 = pen1.conjugate
    dual2 = pen2.conjugate

    np.testing.assert_allclose(Z, pen1.lagrange_prox(Z) + dual1.bound_prox(Z))
    np.testing.assert_allclose(dual1.bound_prox(Z), dual2.bound_prox(Z))
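# The identity checked above, Z = pen.lagrange_prox(Z) + conjugate.bound_prox(Z),
# is the Moreau decomposition z = prox_f(z) + prox_{f*}(z). A minimal sketch of the
# same identity with a plain l1 atom (hedged: it assumes, as the test above does for
# the group-lasso atom, that the conjugate of a Lagrange-form atom is a bound-form
# atom exposing `bound_prox`):
def _moreau_decomposition_sketch():
    pen = rr.l1norm(5, lagrange=0.4)
    dual = pen.conjugate
    Z = np.random.standard_normal(5)
    # soft-thresholding plus projection onto the dual-norm ball recovers Z
    np.testing.assert_allclose(Z, pen.lagrange_prox(Z) + dual.bound_prox(Z))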
def __init__(self,
             loglike,
             feature_weights,
             candidate,
             randomizer_scale,
             active=None,
             randomizer='gaussian',
             parametric_cov_estimator=False):
    r"""
    Create a new post-selection object for the stepwise problem

    Parameters
    ----------

    loglike : `regreg.smooth.glm.glm`
        A (negative) log-likelihood as implemented in `regreg`.

    feature_weights : np.ndarray
        Feature weights for L-1 penalty. If a float,
        it is broadcast to all features.

    candidate : np.bool
        Which groups of variables are candidates
        for inclusion in this step.

    randomizer_scale : float
        Scale for IID components of randomization.

    active : np.bool (optional)
        Which groups of variables make up $E$, the set of
        variables we partially minimize over.

    randomizer : str (optional)
        One of ['laplace', 'logistic', 'gaussian']
    """

    self.active = active
    self.candidate = candidate

    self.loglike = loglike
    self.nfeature = p = loglike.shape[0]

    if np.asarray(feature_weights).shape == ():
        feature_weights = np.ones(loglike.shape) * feature_weights
    self.feature_weights = np.asarray(feature_weights)

    self.parametric_cov_estimator = parametric_cov_estimator

    nrandom = candidate.sum()
    if randomizer == 'laplace':
        self.randomizer = randomization.laplace((nrandom,), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        self.randomizer = randomization.isotropic_gaussian((nrandom,), randomizer_scale)
    elif randomizer == 'logistic':
        self.randomizer = randomization.logistic((nrandom,), scale=randomizer_scale)

    self.penalty = rr.group_lasso(np.arange(p),
                                  weights=dict(zip(np.arange(p), self.feature_weights)),
                                  lagrange=1.)
def test_without_screening(s=10, n=300, p=100, rho=0., signal=3.5,
                           lam_frac=1.,
                           ndraw=10000, burnin=2000,
                           loss='gaussian',
                           randomizer='laplace',
                           randomizer_scale=1.,
                           scalings=False,
                           subgrad=True,
                           check_screen=False):

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho, signal=signal, sigma=1, random_signs=False)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
        X_indep, y_indep, _, _, _ = gaussian_instance(n=n, p=p, s=s, rho=rho, signal=signal, sigma=1)
        loss_indep = rr.glm.gaussian(X_indep, y_indep)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
        X_indep, y_indep, _, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal, random_signs=False)
        loss_indep = rr.glm.logistic(X_indep, y_indep)

    nonzero = np.where(beta)[0]

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), scale=randomizer_scale)

    epsilon = 1. / np.sqrt(n)

    W = np.ones(p) * lam
    # W[0] = 0 # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.)

    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)
    M_est.solve()

    active_union = M_est._overall
    nactive = np.sum(active_union)
    print("nactive", nactive)

    active_set = np.nonzero(active_union)[0]
    print("active set", active_set)
    print("true nonzero", np.nonzero(beta)[0])

    views = [M_est]
    queries = multiple_queries(views)
    queries.solve()

    screened = False
    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        screened = True

    if check_screen == False or (check_screen == True and screened == True):

        # if nactive == s:
        #     return None

        if scalings:  # try condition on some scalings
            M_est.condition_on_subgradient()
            M_est.condition_on_scalings()
        if subgrad:
            M_est.decompose_subgradient(conditioning_groups=np.zeros(p, dtype=bool),
                                        marginalizing_groups=np.ones(p, bool))

        boot_target1, boot_target_observed1 = pairs_bootstrap_glm(loss, active_union, inactive=~active_union)
        boot_target2, boot_target_observed2 = pairs_bootstrap_glm(loss_indep, active_union, inactive=~active_union)
        target_observed = (boot_target_observed1 - boot_target_observed2)[:nactive]

        def _target(indices):
            return boot_target1(indices)[:nactive] - boot_target2(indices)[:nactive]

        form_covariances = glm_nonparametric_bootstrap(n, n)
        queries.setup_sampler(form_covariances)
        queries.setup_opt_state()

        target_sampler = queries.setup_target(_target, target_observed, reference=target_observed)

        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
        LU = target_sampler.confidence_intervals(target_observed, sample=target_sample, level=0.9)
        pivots = target_sampler.coefficient_pvalues(target_observed, parameter=np.zeros(nactive), sample=target_sample)

        # test_stat = lambda x: np.linalg.norm(x - beta[active_union])
        # observed_test_value = test_stat(target_observed)
        # pivots = target_sampler.hypothesis_test(test_stat,
        #                                         observed_test_value,
        #                                         alternative='twosided',
        #                                         parameter=beta[active_union],
        #                                         ndraw=ndraw,
        #                                         burnin=burnin,
        #                                         stepsize=None)

        true_vec = np.zeros(nactive)

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)
            for j in range(nactive):
                if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                    covered[j] = 1
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered, ci_length = coverage(LU)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        covered_naive, ci_length_naive = coverage(LU_naive)
        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        return pivots, covered, ci_length, naive_pvals, covered_naive, ci_length_naive
def test_cv(n=100, p=50, s=5, signal=7.5, K=5, rho=0.,
            randomizer='gaussian',
            randomizer_scale=1.,
            scale1=0.1,
            scale2=0.2,
            lam_frac=1.,
            glmnet=True,
            loss='gaussian',
            bootstrap=False,
            condition_on_CVR=True,
            marginalize_subgrad=True,
            ndraw=10000,
            burnin=2000,
            nboot=nboot):

    print(n, p, s, condition_on_CVR, scale1, scale2)

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), randomizer_scale)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p,), scale=randomizer_scale)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho, signal=signal, sigma=1)
        glm_loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        glm_loss = rr.glm.logistic(X, y)

    epsilon = 1. / np.sqrt(n)

    # view 1
    cv = CV_view(glm_loss, loss_label=loss, lasso_randomization=randomizer, epsilon=epsilon,
                 scale1=scale1, scale2=scale2)
    if glmnet:
        try:
            cv.solve(glmnet=glmnet)
        except ImportError:
            cv.solve(glmnet=False)
    else:
        cv.solve(glmnet=False)

    # for the test make sure we also run the python code
    cv_py = CV_view(glm_loss, loss_label=loss, lasso_randomization=randomizer, epsilon=epsilon,
                    scale1=scale1, scale2=scale2)
    cv_py.solve(glmnet=False)

    lam = cv.lam_CVR
    print("lam", lam)

    if condition_on_CVR:
        cv.condition_on_opt_state()
        lam = cv.one_SD_rule(direction="up")
        print("new lam", lam)

    # non-randomized Lasso, just looking how many vars it selects
    problem = rr.simple_problem(glm_loss, rr.l1norm(p, lagrange=lam))
    beta_hat = problem.solve()
    active_hat = beta_hat != 0
    print("non-randomized lasso ", active_hat.sum())

    # view 2
    W = lam_frac * np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.)
    M_est = glm_group_lasso(glm_loss, epsilon, penalty, randomizer)

    if nboot > 0:
        cv.nboot = M_est.nboot = nboot

    mv = multiple_queries([cv, M_est])
    mv.solve()

    active_union = M_est._overall
    nactive = np.sum(active_union)
    print("nactive", nactive)
    if nactive == 0:
        return None

    nonzero = np.where(beta)[0]
    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        if marginalize_subgrad == True:
            M_est.decompose_subgradient(conditioning_groups=np.zeros(p, bool),
                                        marginalizing_groups=np.ones(p, bool))

        selected_features = np.zeros(p, np.bool)
        selected_features[active_set] = True

        unpenalized_mle = restricted_Mest(M_est.loss, selected_features)

        form_covariances = glm_nonparametric_bootstrap(n, n)
        target_info, target_observed = pairs_bootstrap_glm(M_est.loss, selected_features, inactive=None)

        cov_info = M_est.setup_sampler()
        target_cov, score_cov = form_covariances(target_info, cross_terms=[cov_info], nsample=M_est.nboot)

        opt_sample = M_est.sampler.sample(ndraw, burnin)

        pvalues = M_est.sampler.coefficient_pvalues(unpenalized_mle,
                                                    target_cov,
                                                    score_cov,
                                                    parameter=np.zeros(selected_features.sum()),
                                                    sample=opt_sample)
        intervals = M_est.sampler.confidence_intervals(unpenalized_mle, target_cov, score_cov, sample=opt_sample)

        L, U = intervals.T
        sel_covered = np.zeros(nactive, np.bool)
        sel_length = np.zeros(nactive)

        LU_naive = naive_confidence_intervals(np.diag(target_cov), target_observed)
        naive_covered = np.zeros(nactive, np.bool)
        naive_length = np.zeros(nactive)
        naive_pvals = naive_pvalues(np.diag(target_cov), target_observed, true_vec)

        active_var = np.zeros(nactive, np.bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                sel_covered[j] = 1
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >= true_vec[j]):
                naive_covered[j] = 1
            sel_length[j] = U[j] - L[j]
            naive_length[j] = LU_naive[j, 1] - LU_naive[j, 0]
            active_var[j] = active_set[j] in nonzero

        q = 0.2
        BH_decisions = multipletests(pvalues, alpha=q, method="fdr_bh")[0]
        return sel_covered, sel_length, naive_pvals, naive_covered, naive_length, active_var, BH_decisions, active_var
def __init__(self,
             loglike,
             feature_weights,
             ridge_term,
             randomizer_scale,
             randomizer='gaussian',
             covariance_estimator=None):
    r"""
    Create a new post-selection object for the LASSO problem

    Parameters
    ----------

    loglike : `regreg.smooth.glm.glm`
        A (negative) log-likelihood as implemented in `regreg`.

    feature_weights : np.ndarray
        Feature weights for L-1 penalty. If a float,
        it is broadcast to all features.

    ridge_term : float
        How big a ridge term to add?

    randomizer_scale : float
        Scale for IID components of randomization.

    randomizer : str
        One of ['laplace', 'logistic', 'gaussian']

    covariance_estimator : callable (optional)
        If None, use the parametric covariance estimate
        of the selected model.

    Notes
    -----

    If not None, `covariance_estimator` should
    take arguments (beta, active, inactive)
    and return an estimate of the covariance of
    $(\bar{\beta}_E, \nabla \ell(\bar{\beta}_E)_{-E})$,
    the unpenalized estimator and the inactive
    coordinates of the gradient of the likelihood at
    the unpenalized estimator.
    """

    self.loglike = loglike
    self.nfeature = p = self.loglike.shape[0]

    if np.asarray(feature_weights).shape == ():
        feature_weights = np.ones(loglike.shape) * feature_weights
    self.feature_weights = np.asarray(feature_weights)

    self.covariance_estimator = covariance_estimator

    if randomizer == 'laplace':
        self.randomizer = randomization.laplace((p,), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        self.randomizer = randomization.isotropic_gaussian((p,), randomizer_scale)
    elif randomizer == 'logistic':
        self.randomizer = randomization.logistic((p,), scale=randomizer_scale)

    self.ridge_term = ridge_term

    self.penalty = rr.group_lasso(np.arange(p),
                                  weights=dict(zip(np.arange(p), self.feature_weights)),
                                  lagrange=1.)
def test_multiple_queries(s=3, n=300, p=20, signal=7, rho=0.1,
                          lam_frac=0.7,
                          nviews=4,
                          intervals='new',
                          ndraw=10000, burnin=2000,
                          solve_args={'min_its': 50, 'tol': 1.e-10},
                          check_screen=True):

    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)

    nonzero = np.where(beta)[0]

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.)

    view = []
    for i in range(nviews):
        view.append(glm_group_lasso(loss, epsilon, penalty, randomizer))

    mv = multiple_queries(view)
    mv.solve()

    active_union = np.zeros(p, np.bool)
    for i in range(nviews):
        active_union += view[i].selection_variable['variables']

    nactive = np.sum(active_union)
    print("nactive", nactive)
    if nactive == 0:
        return None

    screen = set(nonzero).issubset(np.nonzero(active_union)[0])

    if check_screen and not screen:
        return None

    if True:
        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        ## bootstrap
        target_sampler_boot, target_observed = glm_target(loss, active_union, mv, bootstrap=True)

        if intervals == 'old':
            target_sample_boot = target_sampler_boot.sample(ndraw=ndraw, burnin=burnin)
            LU_boot = target_sampler_boot.confidence_intervals(target_observed,
                                                               sample=target_sample_boot, level=0.9)
            pivots_boot = target_sampler_boot.coefficient_pvalues(target_observed,
                                                                  parameter=true_vec, sample=target_sample_boot)
        else:
            full_sample_boot = target_sampler_boot.sample(ndraw=ndraw, burnin=burnin, keep_opt=True)
            LU_boot = target_sampler_boot.confidence_intervals_translate(target_observed,
                                                                         sample=full_sample_boot, level=0.9)
            pivots_boot = target_sampler_boot.coefficient_pvalues_translate(target_observed,
                                                                            parameter=true_vec, sample=full_sample_boot)

        ## CLT plugin
        target_sampler, _ = glm_target(loss, active_union, mv, bootstrap=False)

        if intervals == 'old':
            target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
            LU = target_sampler.confidence_intervals(target_observed, sample=target_sample, level=0.9)
            pivots = target_sampler.coefficient_pvalues(target_observed, parameter=true_vec, sample=target_sample)
        else:
            full_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin, keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(target_observed, sample=full_sample, level=0.9)
            pivots = target_sampler.coefficient_pvalues_translate(target_observed, parameter=true_vec, sample=full_sample)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)
            for j in range(nactive):
                if check_screen:
                    if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                        covered[j] = 1
                else:
                    covered[j] = None
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered, ci_length = coverage(LU)
        covered_boot, ci_length_boot = coverage(LU_boot)
        covered_naive, ci_length_naive = coverage(LU_naive)

        active_var = np.zeros(nactive, np.bool)
        for j in range(nactive):
            active_var[j] = active_set[j] in nonzero

        return pivots, pivots_boot, covered, ci_length, covered_boot, ci_length_boot, \
            active_var, covered_naive, ci_length_naive
def test_scaling(snr=15, s=5, n=200, p=20, rho=0.1,
                 burnin=20000,
                 ndraw=30000,
                 scale=0.9,
                 nsim=None,  # needed for decorator
                 frac=0.5):
    # scale of 0.9 has roughly same screening probability as 50% data splitting, i.e. around 10%

    randomizer = randomization.laplace((p,), scale=scale)
    X, y, beta, _ = generate_data(n=n, p=p, s=s, rho=rho, snr=snr)

    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)
    epsilon = 1. / np.sqrt(n)

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.)

    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)

    mv = multiple_queries([M_est])
    mv.solve()

    active = M_est.selection_variable['variables']
    nactive = active.sum()

    if set(nonzero).issubset(np.nonzero(active)[0]):

        pvalues = []
        active_set = np.nonzero(active)[0]
        inactive_selected = I = [i for i in np.arange(active_set.shape[0]) if active_set[i] not in nonzero]
        active_selected = A = [i for i in np.arange(active_set.shape[0]) if active_set[i] in nonzero]

        if not I:
            return None

        idx = I[0]
        inactive = ~M_est.selection_variable['variables']
        boot_target, target_observed = pairs_bootstrap_glm(loss, active, inactive=inactive)

        if DEBUG:
            sampler = lambda: np.random.choice(n, size=(n,), replace=True)
            print(boot_target(sampler())[-3:], 'boot target')

        form_covariances = glm_nonparametric_bootstrap(n, n)
        mv.setup_sampler(form_covariances)

        # null saturated

        def null_target(indices):
            result = boot_target(indices)
            return result[idx]

        null_observed = np.zeros(1)
        null_observed[0] = target_observed[idx]

        target_sampler = mv.setup_target(null_target, null_observed)

        # target_scaling = 5 * np.linalg.svd(target_sampler.target_transform[0][0])[1].max()**2  # should have something to do with noise scale too

        print(target_sampler.crude_lipschitz(), 'crude')

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(test_stat,
                                              test_stat(null_observed),
                                              burnin=burnin,
                                              ndraw=ndraw,
                                              stepsize=.5 / target_sampler.crude_lipschitz())  # twosided by default
        pvalues.append(pval)

        # true saturated

        idx = A[0]

        def active_target(indices):
            result = boot_target(indices)
            return result[idx]

        active_observed = np.zeros(1)
        active_observed[0] = target_observed[idx]

        target_sampler = mv.setup_target(active_target, active_observed)
        target_scaling = 5 * np.linalg.svd(target_sampler.target_transform[0][0])[1].max()**2  # should have something to do with noise scale too

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(test_stat,
                                              test_stat(active_observed),
                                              burnin=burnin,
                                              ndraw=ndraw,
                                              stepsize=.5 / target_sampler.crude_lipschitz())  # twosided by default
        pvalues.append(pval)

        # null selected

        idx = I[0]

        def null_target(indices):
            result = boot_target(indices)
            return np.hstack([result[idx], result[nactive:]])

        null_observed = np.zeros_like(null_target(range(n)))
        null_observed[0] = target_observed[idx]
        null_observed[1:] = target_observed[nactive:]

        target_sampler = mv.setup_target(null_target, null_observed)  # , target_set=[0])
        target_scaling = 5 * np.linalg.svd(target_sampler.target_transform[0][0])[1].max()**2  # should have something to do with noise scale too
        print(target_sampler.crude_lipschitz(), 'crude')

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(test_stat,
                                              test_stat(null_observed),
                                              burnin=burnin,
                                              ndraw=ndraw,
                                              stepsize=.5 / target_sampler.crude_lipschitz())  # twosided by default
        pvalues.append(pval)

        # true selected

        idx = A[0]

        def active_target(indices):
            result = boot_target(indices)
            return np.hstack([result[idx], result[nactive:]])

        active_observed = np.zeros_like(active_target(range(n)))
        active_observed[0] = target_observed[idx]
        active_observed[1:] = target_observed[nactive:]

        target_sampler = mv.setup_target(active_target, active_observed)  # , target_set=[0])

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(test_stat,
                                              test_stat(active_observed),
                                              burnin=burnin,
                                              ndraw=ndraw,
                                              stepsize=.5 / target_sampler.crude_lipschitz())  # twosided by default
        pvalues.append(pval)

        # condition on opt variables
        ### NOT WORKING -- need to implement conditioning within M_estimator!!!
        if False:
            # null saturated

            idx = I[0]

            def null_target(indices):
                result = boot_target(indices)
                return result[idx]

            null_observed = np.zeros(1)
            null_observed[0] = target_observed[idx]

            target_sampler = mv.setup_target(null_target, null_observed)
            print(target_sampler.crude_lipschitz(), 'crude')

            test_stat = lambda x: x[0]
            pval = target_sampler.hypothesis_test(test_stat,
                                                  test_stat(null_observed),
                                                  burnin=burnin,
                                                  ndraw=ndraw,
                                                  stepsize=.5 / target_sampler.crude_lipschitz())  # twosided by default
            pvalues.append(pval)

            # true saturated

            idx = A[0]

            def active_target(indices):
                result = boot_target(indices)
                return result[idx]

            active_observed = np.zeros(1)
            active_observed[0] = target_observed[idx]

            sampler = lambda: np.random.choice(n, size=(n,), replace=True)

            target_sampler = mv.setup_target(active_target, active_observed)

            test_stat = lambda x: x[0]
            pval = target_sampler.hypothesis_test(test_stat,
                                                  test_stat(active_observed),
                                                  burnin=burnin,
                                                  ndraw=ndraw,
                                                  stepsize=.5 / target_sampler.crude_lipschitz())  # twosided by default
            pvalues.append(pval)

            # true selected

        # oracle p-value -- draws a new data set

        X, y, beta, _ = generate_data(n=n, p=p, s=s, rho=rho, snr=snr)
        X_E = X[:, active_set]

        active_var = [False, True, False, True]

        if statsmodels_available:
            try:
                model = sm.GLM(y, X_E, family=sm.families.Binomial())
                model_results = model.fit()
                pvalues.extend([model_results.pvalues[I[0]], model_results.pvalues[A[0]]])
                active_var.extend([False, True])
            except sm.tools.sm_exceptions.PerfectSeparationError:
                pass
        else:
            pass

        # data splitting-ish p-value -- draws a new data set of smaller size
        # frac is presumed to be how much data was used in stage 1, we get (1-frac)*n for stage 2
        # frac defaults to 0.5

        Xs, ys, beta, _ = generate_data(n=n, p=p, s=s, rho=rho, snr=snr)
        Xs = Xs[:int((1 - frac) * n)]
        ys = ys[:int((1 - frac) * n)]
        X_Es = Xs[:, active_set]

        if statsmodels_available:
            try:
                model = sm.GLM(ys, X_Es, family=sm.families.Binomial())
                model_results = model.fit()
                pvalues.extend([model_results.pvalues[I[0]], model_results.pvalues[A[0]]])
                active_var.extend([False, False])
            except sm.tools.sm_exceptions.PerfectSeparationError:
                pass
        else:
            pass

        return pvalues, active_var
def test_intervals(s=0, n=200, p=10, signal=7, rho=0.,
                   lam_frac=6.,
                   ndraw=10000, burnin=2000,
                   bootstrap=True,
                   loss='gaussian',
                   intervals='old',
                   randomizer='laplace',
                   solve_args={'min_its': 50, 'tol': 1.e-10}):

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=1.)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p,), scale=1.)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho, signal=signal, sigma=1)
        lam = np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000))))) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    nonzero = np.where(beta)[0]

    epsilon = 1. / np.sqrt(n)

    W = lam_frac * np.ones(p) * lam
    # W[0] = 0 # use at least some unpenalized
    groups = np.concatenate([np.arange(10) for i in range(p // 10)])
    # print(groups)
    # groups = np.arange(p)
    penalty = rr.group_lasso(groups, weights=dict(zip(np.arange(p), W)), lagrange=1.)

    # first randomization
    M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    mv = multiple_queries([M_est1])
    # second randomization
    # M_est2 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    # mv = multiple_queries([M_est1, M_est2])

    mv.solve()

    active_union = M_est1.selection_variable['variables']
    print("active set", np.nonzero(active_union)[0])
    nactive = np.sum(active_union)

    if nactive == 0:
        return None

    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        target_sampler, target_observed = glm_target(loss, active_union, mv, bootstrap=bootstrap)

        if intervals == 'old':
            target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
            LU = target_sampler.confidence_intervals(target_observed, sample=target_sample, level=0.9)
            pivots_mle = target_sampler.coefficient_pvalues(target_observed,
                                                            parameter=target_sampler.reference,
                                                            sample=target_sample)
            pivots_truth = target_sampler.coefficient_pvalues(target_observed,
                                                              parameter=true_vec,
                                                              sample=target_sample)
            pvalues = target_sampler.coefficient_pvalues(target_observed,
                                                         parameter=np.zeros_like(true_vec),
                                                         sample=target_sample)
        else:
            full_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin, keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(target_observed, sample=full_sample, level=0.9)
            pivots_mle = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                      parameter=target_sampler.reference,
                                                                      sample=full_sample)
            pivots_truth = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                        parameter=true_vec,
                                                                        sample=full_sample)
            pvalues = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                   parameter=np.zeros_like(true_vec),
                                                                   sample=full_sample)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)

        L, U = LU.T
        ci_length_sel = np.zeros(nactive)
        covered = np.zeros(nactive, np.bool)
        naive_covered = np.zeros(nactive, np.bool)
        ci_length_naive = np.zeros(nactive)
        active_var = np.zeros(nactive, np.bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                covered[j] = 1
            ci_length_sel[j] = U[j] - L[j]
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >= true_vec[j]):
                naive_covered[j] = 1
            ci_length_naive[j] = LU_naive[j, 1] - LU_naive[j, 0]
            active_var[j] = active_set[j] in nonzero

        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        return pivots_mle, pivots_truth, pvalues, covered, ci_length_sel, \
            naive_pvals, naive_covered, ci_length_naive, active_var
def test_sqrt_lasso(n=500, p=20, s=3, signal=10, K=5, rho=0.,
                    randomizer='gaussian',
                    randomizer_scale=1.,
                    scale1=0.1,
                    scale2=0.2,
                    lam_frac=1.,
                    bootstrap=False,
                    condition_on_CVR=False,
                    marginalize_subgrad=True,
                    ndraw=10000,
                    burnin=2000):

    print(n, p, s)

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), randomizer_scale)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p,), scale=randomizer_scale)

    X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho, signal=signal, sigma=1)

    lam_nonrandom = choose_lambda(X)
    lam_random = choose_lambda_with_randomization(X, randomizer)
    loss = l2norm_glm(X, y)
    # sqloss = rr.glm.gaussian(X, y)
    epsilon = 1. / n

    # non-randomized sqrt-Lasso, just looking how many vars it selects
    problem = rr.simple_problem(loss, rr.l1norm(p, lagrange=lam_nonrandom))
    beta_hat = problem.solve()
    active_hat = beta_hat != 0
    print("non-randomized sqrt-root Lasso active set", np.where(beta_hat)[0])
    print("non-randomized sqrt-lasso", active_hat.sum())

    # view 2
    W = lam_frac * np.ones(p) * lam_random
    penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1. / np.sqrt(n))
    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)

    mv = multiple_queries([M_est])
    mv.solve()

    active_set = M_est._overall
    nactive = np.sum(active_set)

    if nactive == 0:
        return None

    nonzero = np.where(beta)[0]
    if set(nonzero).issubset(np.nonzero(active_set)[0]):

        active_set = np.nonzero(active_set)[0]
        true_vec = beta[active_set]

        if marginalize_subgrad == True:
            M_est.decompose_subgradient(conditioning_groups=np.zeros(p, dtype=bool),
                                        marginalizing_groups=np.ones(p, bool))

        selected_features = np.zeros(p, np.bool)
        selected_features[active_set] = True

        unpenalized_mle = restricted_Mest(M_est.loss, selected_features)

        form_covariances = glm_nonparametric_bootstrap(n, n)
        boot_target, boot_target_observed = pairs_bootstrap_glm(M_est.loss, selected_features, inactive=None)
        target_info = boot_target

        cov_info = M_est.setup_sampler()
        target_cov, score_cov = form_covariances(target_info, cross_terms=[cov_info], nsample=M_est.nboot)

        opt_sample = M_est.sampler.sample(ndraw, burnin)

        pvalues = M_est.sampler.coefficient_pvalues(unpenalized_mle,
                                                    target_cov,
                                                    score_cov,
                                                    parameter=np.zeros(selected_features.sum()),
                                                    sample=opt_sample)
        intervals = M_est.sampler.confidence_intervals(unpenalized_mle, target_cov, score_cov, sample=opt_sample)

        true_vec = beta[M_est.selection_variable['variables']]

        L, U = intervals.T
        covered = np.zeros(nactive, np.bool)
        active_var = np.zeros(nactive, np.bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                covered[j] = 1
            active_var[j] = active_set[j] in nonzero

        return pvalues, covered, active_var
def test_nonrandomized(s=0, n=200, p=10, signal=7, rho=0,
                       lam_frac=0.8,
                       loss='gaussian',
                       solve_args={'min_its': 20, 'tol': 1.e-10}):

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho, signal=signal, sigma=1)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    nonzero = np.where(beta)[0]
    print("lam", lam)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.)

    true_vec = beta

    M_est = M_estimator(lam, loss, penalty)
    M_est.solve()

    active = M_est._overall
    nactive = np.sum(active)
    print("nactive", nactive)
    if nactive == 0:
        return None

    # score_mean = M_est.observed_internal_state.copy()
    # score_mean[nactive:] = 0
    M_est.setup_sampler(score_mean=np.zeros(p))
    # M_est.setup_sampler(score_mean=score_mean)
    # M_est.sample(ndraw=1000, burnin=1000, stepsize=1./p)

    if set(nonzero).issubset(np.nonzero(active)[0]):

        check_screen = True

        # test_stat = lambda x: np.linalg.norm(x)
        # return M_est.hypothesis_test(test_stat, test_stat(M_est.observed_internal_state), stepsize=1./p)

        ci = M_est.confidence_intervals(M_est.observed_internal_state)
        pivots = M_est.coefficient_pvalues(M_est.observed_internal_state)

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)
            for j in range(nactive):
                if check_screen:
                    if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                        covered[j] = 1
                else:
                    covered[j] = None
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered = coverage(ci)[0]
        # print(pivots)
        # print(coverage)
        return pivots, covered
def randomized_lasso_trial(X, y, beta, sigma, lam,
                           loss='logistic',
                           randomizer='gaussian',
                           estimation='parametric'):

    from selection.api import randomization

    n, p = X.shape

    if loss == "gaussian":
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        loss = rr.glm.logistic(X, y)

    epsilon = 1. / np.sqrt(n)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.)
    randomization = randomization.isotropic_gaussian((p,), scale=1.)

    M_est = M_estimator_approx_logistic(loss, epsilon, penalty, randomization, randomizer, estimation)
    M_est.solve_approx()
    active = M_est._overall
    # print("here", glm.shape)
    active_set = np.asarray([i for i in range(p) if active[i]])
    nactive = np.sum(active)

    glm = M_est.observed_score_state[:nactive]

    prior_variance = 100000.

    # generative_mean = np.zeros(p)
    # sel_split = selection_probability_random_lasso(M_est, generative_mean)
    # test_point = np.append(M_est.observed_score_state, np.abs(M_est.initial_soln[M_est._overall]))
    # print("gradient at test point", sel_split.smooth_objective(test_point, mode="grad"))

    class target_class(object):
        def __init__(self, target_cov):
            self.target_cov = target_cov
            self.shape = target_cov.shape

    target = target_class(M_est.target_cov)
    unadjusted_intervals = (naive_confidence_intervals(target, M_est.target_observed)).T

    grad_lasso = sel_inf_random_lasso(M_est, prior_variance)
    samples = grad_lasso.posterior_samples()
    adjusted_intervals = np.vstack([np.percentile(samples, 5, axis=0),
                                    np.percentile(samples, 95, axis=0)])

    selective_mean = np.mean(samples, axis=0)

    true_val = np.zeros(nactive)

    coverage_ad = np.zeros(nactive)
    coverage_unad = np.zeros(nactive)
    ad_length = np.zeros(nactive)
    unad_length = np.zeros(nactive)

    for l in range(nactive):
        if (adjusted_intervals[0, l] <= true_val[l]) and (true_val[l] <= adjusted_intervals[1, l]):
            coverage_ad[l] += 1
        ad_length[l] = adjusted_intervals[1, l] - adjusted_intervals[0, l]
        if (unadjusted_intervals[0, l] <= true_val[l]) and (true_val[l] <= unadjusted_intervals[1, l]):
            coverage_unad[l] += 1
        unad_length[l] = unadjusted_intervals[1, l] - unadjusted_intervals[0, l]

    sel_cov = coverage_ad.sum() / nactive
    naive_cov = coverage_unad.sum() / nactive
    ad_len = ad_length.sum() / nactive
    unad_len = unad_length.sum() / nactive
    bayes_risk_ad = np.power(selective_mean - true_val, 2.).sum() / nactive
    bayes_risk_unad = np.power(glm - true_val, 2.).sum() / nactive

    return np.vstack([sel_cov, naive_cov, ad_len, unad_len, bayes_risk_ad, bayes_risk_unad])
def test_intervals(s=3, n=200, p=50, snr=7, rho=0.1,
                   split_frac=0.8,
                   lam_frac=0.7,
                   ndraw=10000, burnin=2000,
                   bootstrap=True,
                   intervals='new',
                   solve_args={'min_its': 50, 'tol': 1.e-10}):

    randomizer = randomization.laplace((p,), scale=1.)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)

    nonzero = np.where(beta)[0]

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.)

    # first randomization
    M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    # second randomization
    # M_est2 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    # mv = multiple_queries([M_est1, M_est2])
    mv = multiple_queries([M_est1])
    mv.solve()

    active_union = M_est1.selection_variable['variables']
    nactive = np.sum(active_union)

    if nactive == 0:
        return None

    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        target_sampler, target_observed = glm_target(loss, active_union, mv)

        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)

        if intervals == 'old':
            LU = target_sampler.confidence_intervals(target_observed, sample=target_sample, level=0.9)
        else:
            full_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin, keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(target_observed, sample=full_sample, level=0.9)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)

        pivots_mle = target_sampler.coefficient_pvalues(target_observed,
                                                        parameter=target_sampler.reference,
                                                        sample=target_sample)
        pivots_truth = target_sampler.coefficient_pvalues(target_observed,
                                                          parameter=true_vec,
                                                          sample=target_sample)
        pvalues = target_sampler.coefficient_pvalues(target_observed,
                                                     parameter=np.zeros_like(true_vec),
                                                     sample=target_sample)

        unpenalized_mle = restricted_Mest(loss, M_est1.selection_variable['variables'], solve_args=solve_args)

        L, U = LU.T

        covered = np.zeros(nactive, np.bool)
        naive_covered = np.zeros(nactive, np.bool)
        active_var = np.zeros(nactive, np.bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                covered[j] = 1
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >= true_vec[j]):
                naive_covered[j] = 1
            active_var[j] = active_set[j] in nonzero

        return pivots_mle, pivots_truth, pvalues, covered, naive_covered, active_var
def test_fixedX(ndraw=10000, burnin=2000):  # nsim needed for decorator

    s, n, p = 5, 200, 20

    randomizer = randomization.laplace((p,), scale=1.)
    X, Y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=0.1, snr=7)

    lam_frac = 1.
    lam = lam_frac * np.mean(np.fabs(X.T.dot(np.random.standard_normal((n, 50000)))).max(0)) * sigma
    W = np.ones(p) * lam
    epsilon = 1. / np.sqrt(n)

    penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.)

    M_est = fixedX_group_lasso(X, Y, epsilon, penalty, randomizer)

    mv = multiple_queries([M_est])
    mv.solve()

    active = M_est.selection_variable['variables']
    nactive = active.sum()

    if set(nonzero).issubset(np.nonzero(active)[0]) and active.sum() > len(nonzero):

        pvalues = []
        active_set = np.nonzero(active)[0]
        inactive_selected = I = [i for i in np.arange(active_set.shape[0]) if active_set[i] not in nonzero]
        active_selected = A = [i for i in np.arange(active_set.shape[0]) if active_set[i] in nonzero]

        if not I:
            return None

        idx = I[0]
        boot_target, target_observed = resid_bootstrap(M_est.loss, active)

        X_active = X[:, active]
        beta_hat = np.linalg.pinv(X_active).dot(Y)
        resid_hat = Y - X_active.dot(beta_hat)

        form_covariances = glm_nonparametric_bootstrap(n, n)
        mv.setup_sampler(form_covariances)

        # null saturated

        def null_target(Y_star):
            result = boot_target(Y_star)
            return result[idx]

        null_observed = np.zeros(1)
        null_observed[0] = target_observed[idx]

        target_sampler = mv.setup_target(null_target, null_observed)

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(test_stat, null_observed, burnin=burnin, ndraw=ndraw)  # twosided by default
        pvalues.append(pval)

        # null selected

        def null_target(Y_star):
            result = boot_target(Y_star)
            return np.hstack([result[idx], result[nactive:]])

        null_observed = np.zeros_like(null_target(np.random.standard_normal(n)))
        null_observed[0] = target_observed[idx]
        null_observed[1:] = target_observed[nactive:]

        target_sampler = mv.setup_target(null_target, null_observed, target_set=[0])

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(test_stat, null_observed, burnin=burnin, ndraw=ndraw)  # twosided by default
        pvalues.append(pval)

        # true saturated

        idx = A[0]

        def active_target(Y_star):
            result = boot_target(Y_star)
            return result[idx]

        active_observed = np.zeros(1)
        active_observed[0] = target_observed[idx]

        sampler = lambda: np.random.choice(n, size=(n,), replace=True)

        target_sampler = mv.setup_target(active_target, active_observed)

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(test_stat, active_observed, burnin=burnin, ndraw=ndraw)  # twosided by default
        pvalues.append(pval)

        # true selected

        def active_target(Y_star):
            result = boot_target(Y_star)
            return np.hstack([result[idx], result[nactive:]])

        active_observed = np.zeros_like(active_target(np.random.standard_normal(n)))
        active_observed[0] = target_observed[idx]
        active_observed[1:] = target_observed[nactive:]

        target_sampler = mv.setup_target(active_target, active_observed, target_set=[0])

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(test_stat, active_observed, burnin=burnin, ndraw=ndraw)  # twosided by default
        pvalues.append(pval)

        return pvalues, [False, False, True, True]
ols_fit = sm.OLS(Y, X).fit()
sigma_3TC = np.linalg.norm(ols_fit.resid) / np.sqrt(n - p - 1)
OLS_3TC = ols_fit.params

lam_frac = 1.
loss = rr.glm.gaussian(X, Y)
epsilon = 1. / np.sqrt(n)
lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma_3TC
print(lam)

W = np.ones(p) * lam
penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.)

randomization = randomization.isotropic_gaussian((p,), scale=1.)

M_est = M_estimator_approx(loss, epsilon, penalty, randomization, randomizer='gaussian')
M_est.solve_approx()
active = M_est._overall
active_set = np.asarray([i for i in range(p) if active[i]])
nactive = np.sum(active)

active_set_0 = [NRTI_muts[i] for i in range(p) if active[i]]
def test_split_compare(s=3, n=200, p=20, signal=7, rho=0.1,
                       split_frac=0.8,
                       lam_frac=0.7,
                       ndraw=10000, burnin=2000,
                       solve_args={'min_its': 50, 'tol': 1.e-10},
                       check_screen=True):

    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)

    nonzero = np.where(beta)[0]

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.)

    m = int(split_frac * n)

    M_est1 = split_glm_group_lasso(loss, epsilon, m, penalty)
    mv = multiple_queries([M_est1])
    mv.solve()

    active_union = M_est1.selection_variable['variables']  # + M_est2.selection_variable['variables']
    nactive = np.sum(active_union)
    print("nactive", nactive)
    if nactive == 0:
        return None

    leftout_indices = M_est1.randomized_loss.saturated_loss.case_weights == 0

    screen = set(nonzero).issubset(np.nonzero(active_union)[0])

    if check_screen and not screen:
        return None

    if True:
        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        ## bootstrap
        target_sampler_boot, target_observed = glm_target(loss, active_union, mv, bootstrap=True)

        target_sample_boot = target_sampler_boot.sample(ndraw=ndraw, burnin=burnin)
        LU_boot = target_sampler_boot.confidence_intervals(target_observed,
                                                           sample=target_sample_boot, level=0.9)
        pivots_boot = target_sampler_boot.coefficient_pvalues(target_observed,
                                                              parameter=true_vec, sample=target_sample_boot)

        ## CLT plugin
        target_sampler, _ = glm_target(loss, active_union, mv, bootstrap=False)

        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
        LU = target_sampler.confidence_intervals(target_observed, sample=target_sample, level=0.9)
        pivots = target_sampler.coefficient_pvalues(target_observed, parameter=true_vec, sample=target_sample)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)

        if X.shape[0] - leftout_indices.sum() > nactive:
            LU_split = standard_split_ci(rr.glm.logistic, X, y, active_union, leftout_indices)
        else:
            LU_split = np.ones((nactive, 2)) * np.nan

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)
            for j in range(nactive):
                if check_screen:
                    if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                        covered[j] = 1
                else:
                    covered[j] = None
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered, ci_length = coverage(LU)
        covered_boot, ci_length_boot = coverage(LU_boot)
        covered_split, ci_length_split = coverage(LU_split)
        covered_naive, ci_length_naive = coverage(LU_naive)

        active_var = np.zeros(nactive, np.bool)
        for j in range(nactive):
            active_var[j] = active_set[j] in nonzero

        return pivots, pivots_boot, covered, ci_length, covered_boot, ci_length_boot, \
            covered_split, ci_length_split, active_var, covered_naive, ci_length_naive