def test_multiple_views():
    """Fit two randomized group-lasso views and test the selected null coefficients.

    A logistic instance is simulated, the same loss/penalty/randomizer is
    wrapped in two ``glm_group_lasso`` views, and both are solved jointly
    through ``multiple_views``.  If every truly nonzero coefficient lies in
    the union of the two active sets, a norm-based hypothesis test is run on
    the selected-but-null coordinates.

    Returns
    -------
    The value returned by ``target_sampler.hypothesis_test`` when the true
    support was screened; ``None`` (implicitly) otherwise.
    """
    s, n, p = 5, 200, 20

    randomizer = randomization.laplace((p,), scale=0.5)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0.1, snr=7)
    true_support = np.where(beta)[0]

    lam_frac = 1.
    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    # Penalty level: mean over 10000 random 0/1 response vectors of
    # max_j |X_j^T y|, scaled by lam_frac.
    coin_flips = np.random.binomial(1, 1. / 2, (n, 10000))
    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, coin_flips)).max(0))

    weights = np.ones(p) * lam
    weights[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), weights)),
                             lagrange=1.)

    # first randomization
    M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    # second randomization
    M_est2 = glm_group_lasso(loss, epsilon, penalty, randomizer)

    mv = multiple_views([M_est1, M_est2])
    mv.solve()

    active = M_est1.overall + M_est2.overall
    selected = np.nonzero(active)[0]

    # Only proceed when the selection screened the whole true support.
    if not set(true_support).issubset(selected):
        return None

    active_set = np.nonzero(active)[0]
    # Positions (within the active set) of selected variables that are truly null.
    inactive_selected = [pos for pos in np.arange(active_set.shape[0])
                         if active_set[pos] not in true_support]

    boot_target, target_observed = pairs_bootstrap_glm(loss, active)

    def inactive_target(indices):
        return boot_target(indices)[inactive_selected]

    inactive_observed = target_observed[inactive_selected]

    def sampler():
        return np.random.choice(n, size=(n,), replace=True)

    mv.setup_sampler(sampler)
    target_sampler = mv.setup_target(inactive_target, inactive_observed)

    def test_stat(x):
        return np.linalg.norm(x)

    pval = target_sampler.hypothesis_test(test_stat,
                                          inactive_observed,
                                          alternative='greater')
    return pval
def test_multiple_queries_individual_coeff(ndraw=10000, burnin=2000):
    """Compute one p-value per selected coefficient across five randomized queries.

    Five ``glm_group_lasso`` queries are built on the same logistic loss and
    solved jointly.  If the union of their active sets contains the true
    support, each selected coordinate is tested in turn with ``glm_target``,
    passing the remaining selected variables as the active set and the
    coordinate itself as ``subset``.

    Parameters
    ----------
    ndraw : int
        Passed to ``hypothesis_test`` as ``ndraw``.
    burnin : int
        Passed to ``hypothesis_test`` as ``burnin``.

    Returns
    -------
    tuple or None
        ``(pvalues, is_truly_nonzero)`` where the second element is a list of
        booleans, one per selected variable; ``None`` (implicitly) when the
        true support was not screened.
    """
    s, n, p = 3, 120, 10

    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0, snr=5)
    nonzero = np.where(beta)[0]

    lam_frac = 1.
    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    nview = 5
    view = [glm_group_lasso(loss, epsilon, penalty, randomizer)
            for _ in range(nview)]

    mv = multiple_queries(view)
    mv.solve()

    # `np.bool` was removed in NumPy 1.24; the builtin `bool` is the same dtype.
    active_union = np.zeros(p, bool)
    for query in view:
        active_union |= query.selection_variable['variables']

    nactive = np.sum(active_union)
    print("nactive", nactive)
    active_set = np.nonzero(active_union)[0]

    pvalues = []
    true_beta = beta[active_union]

    if set(nonzero).issubset(active_set):
        for j in range(nactive):
            # Indicator of the single coordinate being tested.
            subset = np.zeros(p, bool)
            subset[active_set[j]] = True

            target_sampler, target_observed = glm_target(
                loss,
                active_union * ~subset,
                mv,
                subset=subset,
                reference=np.zeros((1,)))
            test_stat = lambda x: np.atleast_1d(x)

            pval = target_sampler.hypothesis_test(
                test_stat,
                np.atleast_1d(target_observed - true_beta[j]),
                alternative='twosided',
                ndraw=ndraw,
                burnin=burnin)
            pvalues.append(pval)

        return pvalues, [active_set[j] in nonzero for j in range(nactive)]
def test_multiple_queries_individual_coeff_small(ndraw=10000, burnin=2000, bootstrap=True):
    """Compute one p-value per selected coefficient from a single randomized query.

    A single ``glm_group_lasso`` query on a logistic loss is solved through
    ``multiple_queries``.  If the true support is contained in the selected
    set, each selected coordinate is tested with ``glm_target`` using the
    full selected set as the active set and the coordinate as ``subset``.

    Parameters
    ----------
    ndraw : int
        Passed to ``hypothesis_test`` as ``ndraw``.
    burnin : int
        Passed to ``hypothesis_test`` as ``burnin``.
    bootstrap : bool
        Forwarded to ``glm_target``.

    Returns
    -------
    tuple
        ``(pvalues, is_truly_nonzero)``.  ``pvalues`` is empty when the true
        support was not screened.
    """
    s, n, p = 3, 100, 20

    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0, snr=20.)
    nonzero = np.where(beta)[0]

    lam_frac = 3.
    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # randomization
    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)

    mv = multiple_queries([M_est])
    mv.solve()

    active_vars = M_est.selection_variable['variables']
    nactive = np.sum(active_vars)
    active_set = np.nonzero(active_vars)[0]

    pvalues = []
    true_beta = beta[active_vars]

    print(nonzero, active_set)
    if set(nonzero).issubset(active_set):
        for j in range(nactive):
            print(j)

            # `np.bool` was removed in NumPy 1.24; use the builtin `bool`.
            subset = np.zeros(p, bool)
            subset[active_set[j]] = True

            target_sampler, target_observed = glm_target(
                loss,
                active_vars,
                mv,
                subset=subset,
                bootstrap=bootstrap,
                reference=np.zeros((1,)))
            test_stat = lambda x: x

            pval = target_sampler.hypothesis_test(test_stat,
                                                  target_observed,
                                                  alternative='twosided',
                                                  ndraw=ndraw,
                                                  burnin=burnin)
            pvalues.append(pval)

    return pvalues, [active_set[j] in nonzero for j in range(nactive)]
def test_intervals(s=3,
                   n=200,
                   p=50,
                   snr=7,
                   rho=0.1,
                   split_frac=0.8,
                   lam_frac=0.7,
                   ndraw=10000,
                   burnin=2000,
                   bootstrap=True,
                   intervals='new',
                   solve_args={'min_its': 50, 'tol': 1.e-10}):
    """Compare selective and naive intervals for one randomized logistic group lasso.

    Parameters
    ----------
    s, n, p, snr, rho
        Forwarded to ``logistic_instance``.
    split_frac, bootstrap
        Accepted but not used in this body.
    lam_frac : float
        Multiplier applied to the simulated penalty level.
    ndraw, burnin : int
        Forwarded to ``target_sampler.sample``.
    intervals : str
        ``'old'`` uses ``confidence_intervals``; anything else draws a second
        sample with ``keep_opt=True`` and uses ``confidence_intervals_translate``.
    solve_args : dict
        Forwarded to ``restricted_Mest``.

    Returns
    -------
    tuple or None
        ``(pivots_mle, pivots_truth, pvalues, covered, naive_covered,
        active_var)`` when the selected set is non-empty and contains the true
        support; ``None`` otherwise.
    """
    randomizer = randomization.laplace((p,), scale=1.)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)
    nonzero = np.where(beta)[0]

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # first randomization
    M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    # second randomization
    # M_est2 = glm_group_lasso(loss, epsilon, penalty, randomizer)

    # mv = multiple_queries([M_est1, M_est2])
    mv = multiple_queries([M_est1])
    mv.solve()

    active_union = M_est1.selection_variable['variables']
    nactive = np.sum(active_union)

    if nactive == 0:
        return None

    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        target_sampler, target_observed = glm_target(loss, active_union, mv)

        # This sample is always drawn: the pivots below use it in both modes.
        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)

        if intervals == 'old':
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
        else:
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(
                target_observed, sample=full_sample, level=0.9)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)

        pivots_mle = target_sampler.coefficient_pvalues(
            target_observed,
            parameter=target_sampler.reference,
            sample=target_sample)
        pivots_truth = target_sampler.coefficient_pvalues(
            target_observed, parameter=true_vec, sample=target_sample)
        pvalues = target_sampler.coefficient_pvalues(
            target_observed,
            parameter=np.zeros_like(true_vec),
            sample=target_sample)

        # Result is unused here; the call is kept in case it has side effects.
        unpenalized_mle = restricted_Mest(
            loss,
            M_est1.selection_variable['variables'],
            solve_args=solve_args)

        L, U = LU.T

        # `np.bool` was removed in NumPy 1.24; use the builtin `bool`.
        covered = np.zeros(nactive, bool)
        naive_covered = np.zeros(nactive, bool)
        active_var = np.zeros(nactive, bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                covered[j] = 1
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >= true_vec[j]):
                naive_covered[j] = 1
            active_var[j] = active_set[j] in nonzero

        return pivots_mle, pivots_truth, pvalues, covered, naive_covered, active_var
def test_without_screening(s=10,
                           n=300,
                           p=100,
                           rho=0.,
                           signal=3.5,
                           lam_frac=1.,
                           ndraw=10000,
                           burnin=2000,
                           loss='gaussian',
                           randomizer='laplace',
                           randomizer_scale=1.,
                           scalings=False,
                           subgrad=True,
                           check_screen=False):
    """Selective inference on the difference between two bootstrapped GLM targets.

    A randomized group lasso is fit on one simulated data set and a second,
    independently simulated data set provides a comparison loss.  The target
    is the difference of the two pairs-bootstrap targets restricted to the
    first ``nactive`` coordinates, and coverage is measured against a zero
    vector.

    Parameters
    ----------
    s, n, p, rho, signal
        Forwarded to ``gaussian_instance`` / ``logistic_instance``.
    lam_frac : float
        Multiplier applied to the simulated penalty level.
    ndraw, burnin : int
        Forwarded to ``target_sampler.sample``.
    loss : {'gaussian', 'logistic'}
        Which GLM loss to simulate and fit.
    randomizer : {'laplace', 'gaussian'}
        Which randomization to use, with scale ``randomizer_scale``.
    scalings, subgrad : bool
        Select which conditioning / marginalisation calls are made on the view.
    check_screen : bool
        When true, inference is only run if the true support was selected.

    Returns
    -------
    tuple or None
        ``(pivots, covered, ci_length, naive_pvals, covered_naive,
        ci_length_naive)``, or ``None`` when ``check_screen`` blocks inference.
    """
    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1,
                                                       random_signs=False)
        noise = np.random.standard_normal((n, 2000))
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, noise)).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
        X_indep, y_indep, _, _, _ = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                      signal=signal, sigma=1)
        loss_indep = rr.glm.gaussian(X_indep, y_indep)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        coin_flips = np.random.binomial(1, 1. / 2, (n, 10000))
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, coin_flips)).max(0))
        X_indep, y_indep, _, _ = logistic_instance(n=n, p=p, s=s, rho=rho,
                                                   signal=signal,
                                                   random_signs=False)
        loss_indep = rr.glm.logistic(X_indep, y_indep)

    nonzero = np.where(beta)[0]

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), scale=randomizer_scale)

    epsilon = 1. / np.sqrt(n)

    weights = np.ones(p) * lam
    #W[0] = 0 # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), weights)),
                             lagrange=1.)

    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)
    M_est.solve()

    active_union = M_est._overall
    nactive = np.sum(active_union)
    print("nactive", nactive)
    active_set = np.nonzero(active_union)[0]
    print("active set", active_set)
    print("true nonzero", np.nonzero(beta)[0])

    views = [M_est]
    queries = multiple_queries(views)
    queries.solve()

    screened = False
    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        screened = True

    # Same predicate as the original, written as a guard clause.
    proceed = check_screen == False or (check_screen == True and screened == True)
    if not proceed:
        return None

    #if nactive==s:
    #    return None

    if scalings:  # try condition on some scalings
        M_est.condition_on_subgradient()
        M_est.condition_on_scalings()
    if subgrad:
        M_est.decompose_subgradient(conditioning_groups=np.zeros(p, dtype=bool),
                                    marginalizing_groups=np.ones(p, bool))

    boot_target1, boot_target_observed1 = pairs_bootstrap_glm(
        loss, active_union, inactive=~active_union)
    boot_target2, boot_target_observed2 = pairs_bootstrap_glm(
        loss_indep, active_union, inactive=~active_union)
    target_observed = (boot_target_observed1 - boot_target_observed2)[:nactive]

    def _target(indices):
        return boot_target1(indices)[:nactive] - boot_target2(indices)[:nactive]

    form_covariances = glm_nonparametric_bootstrap(n, n)
    queries.setup_sampler(form_covariances)
    queries.setup_opt_state()

    target_sampler = queries.setup_target(_target,
                                          target_observed,
                                          reference=target_observed)

    target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
    LU = target_sampler.confidence_intervals(target_observed,
                                             sample=target_sample,
                                             level=0.9)
    pivots = target_sampler.coefficient_pvalues(target_observed,
                                                parameter=np.zeros(nactive),
                                                sample=target_sample)

    #test_stat = lambda x: np.linalg.norm(x - beta[active_union])
    #observed_test_value = test_stat(target_observed)
    #pivots = target_sampler.hypothesis_test(test_stat,
    #                                        observed_test_value,
    #                                        alternative='twosided',
    #                                        parameter = beta[active_union],
    #                                        ndraw=ndraw,
    #                                        burnin=burnin,
    #                                        stepsize=None)

    true_vec = np.zeros(nactive)

    def coverage(LU):
        """Return per-coordinate coverage indicators and interval lengths."""
        lower, upper = LU[:, 0], LU[:, 1]
        covered = np.zeros(nactive)
        ci_length = np.zeros(nactive)
        for j in range(nactive):
            if (lower[j] <= true_vec[j]) and (upper[j] >= true_vec[j]):
                covered[j] = 1
            ci_length[j] = upper[j] - lower[j]
        return covered, ci_length

    covered, ci_length = coverage(LU)
    LU_naive = naive_confidence_intervals(target_sampler, target_observed)
    covered_naive, ci_length_naive = coverage(LU_naive)
    naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

    return pivots, covered, ci_length, naive_pvals, covered_naive, ci_length_naive
def test_multiple_queries(s=3,
                          n=300,
                          p=20,
                          signal=7,
                          rho=0.1,
                          lam_frac=0.7,
                          nviews=4,
                          intervals='new',
                          ndraw=10000,
                          burnin=2000,
                          solve_args={'min_its': 50, 'tol': 1.e-10},
                          check_screen=True):
    """Compare bootstrap, CLT plug-in and naive intervals over several queries.

    ``nviews`` randomized group-lasso queries are solved jointly on one
    simulated logistic data set.  Intervals and pivots are then computed for
    the union of the selected variables, once with ``bootstrap=True`` and
    once with ``bootstrap=False`` in ``glm_target``.

    Parameters
    ----------
    s, n, p, signal, rho
        Forwarded to ``logistic_instance``.
    lam_frac : float
        Multiplier applied to the simulated penalty level.
    nviews : int
        Number of randomized queries.
    intervals : str
        ``'old'`` uses the non-translate sampler methods; anything else uses
        the ``*_translate`` variants on a sample drawn with ``keep_opt=True``.
    ndraw, burnin : int
        Forwarded to ``sample``.
    solve_args : dict
        Accepted but not used in this body.
    check_screen : bool
        When true, return ``None`` unless the true support was selected, and
        compute coverage; when false, coverage entries are set to NaN.

    Returns
    -------
    tuple or None
        ``(pivots, pivots_boot, covered, ci_length, covered_boot,
        ci_length_boot, active_var, covered_naive, ci_length_naive)``, or
        ``None`` when nothing was selected or screening failed.
    """
    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
    nonzero = np.where(beta)[0]

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    view = [glm_group_lasso(loss, epsilon, penalty, randomizer)
            for _ in range(nviews)]

    mv = multiple_queries(view)
    mv.solve()

    # `np.bool` was removed in NumPy 1.24; use the builtin `bool`.
    active_union = np.zeros(p, bool)
    for query in view:
        active_union |= query.selection_variable['variables']

    nactive = np.sum(active_union)
    print("nactive", nactive)
    if nactive == 0:
        return None

    screen = set(nonzero).issubset(np.nonzero(active_union)[0])
    if check_screen and not screen:
        return None

    active_set = np.nonzero(active_union)[0]
    true_vec = beta[active_union]

    ## bootstrap
    target_sampler_boot, target_observed = glm_target(loss,
                                                      active_union,
                                                      mv,
                                                      bootstrap=True)

    if intervals == 'old':
        target_sample_boot = target_sampler_boot.sample(ndraw=ndraw,
                                                        burnin=burnin)
        LU_boot = target_sampler_boot.confidence_intervals(
            target_observed, sample=target_sample_boot, level=0.9)
        pivots_boot = target_sampler_boot.coefficient_pvalues(
            target_observed, parameter=true_vec, sample=target_sample_boot)
    else:
        full_sample_boot = target_sampler_boot.sample(ndraw=ndraw,
                                                      burnin=burnin,
                                                      keep_opt=True)
        LU_boot = target_sampler_boot.confidence_intervals_translate(
            target_observed, sample=full_sample_boot, level=0.9)
        pivots_boot = target_sampler_boot.coefficient_pvalues_translate(
            target_observed, parameter=true_vec, sample=full_sample_boot)

    ## CLT plugin
    target_sampler, _ = glm_target(loss, active_union, mv, bootstrap=False)

    if intervals == 'old':
        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
        LU = target_sampler.confidence_intervals(target_observed,
                                                 sample=target_sample,
                                                 level=0.9)
        pivots = target_sampler.coefficient_pvalues(target_observed,
                                                    parameter=true_vec,
                                                    sample=target_sample)
    else:
        full_sample = target_sampler.sample(ndraw=ndraw,
                                            burnin=burnin,
                                            keep_opt=True)
        LU = target_sampler.confidence_intervals_translate(
            target_observed, sample=full_sample, level=0.9)
        pivots = target_sampler.coefficient_pvalues_translate(
            target_observed, parameter=true_vec, sample=full_sample)

    LU_naive = naive_confidence_intervals(target_sampler, target_observed)

    def coverage(LU):
        """Return per-coordinate coverage indicators and interval lengths."""
        L, U = LU[:, 0], LU[:, 1]
        covered = np.zeros(nactive)
        ci_length = np.zeros(nactive)

        for j in range(nactive):
            if check_screen:
                if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                    covered[j] = 1
            else:
                # Assigning None into a float array stores NaN.
                covered[j] = None
            ci_length[j] = U[j] - L[j]
        return covered, ci_length

    covered, ci_length = coverage(LU)
    covered_boot, ci_length_boot = coverage(LU_boot)
    covered_naive, ci_length_naive = coverage(LU_naive)

    active_var = np.zeros(nactive, bool)
    for j in range(nactive):
        active_var[j] = active_set[j] in nonzero

    return pivots, pivots_boot, covered, ci_length, covered_boot, ci_length_boot, \
        active_var, covered_naive, ci_length_naive
def test_intervals(s=0,
                   n=200,
                   p=10,
                   signal=7,
                   rho=0.,
                   lam_frac=6.,
                   ndraw=10000,
                   burnin=2000,
                   bootstrap=True,
                   loss='gaussian',
                   intervals='old',
                   randomizer='laplace',
                   solve_args={'min_its': 50, 'tol': 1.e-10}):
    """Selective vs. naive intervals for a randomized group lasso with grouped features.

    Parameters
    ----------
    s, n, p, signal, rho
        Forwarded to ``gaussian_instance`` / ``logistic_instance``.
    lam_frac : float
        Multiplier applied to the penalty weights.
    ndraw, burnin : int
        Forwarded to ``target_sampler.sample``.
    bootstrap : bool
        Forwarded to ``glm_target``.
    loss : {'gaussian', 'logistic'}
        Which GLM loss to simulate and fit.
    intervals : str
        ``'old'`` uses the non-translate sampler methods; anything else uses
        the ``*_translate`` variants on a sample drawn with ``keep_opt=True``.
    randomizer : {'laplace', 'gaussian', 'logistic'}
        Which randomization to use (scale 1).
    solve_args : dict
        Accepted but not used in this body.

    Returns
    -------
    tuple or None
        ``(pivots_mle, pivots_truth, pvalues, covered, ci_length_sel,
        naive_pvals, naive_covered, ci_length_naive, active_var)`` when the
        selected set is non-empty and contains the true support; ``None``
        otherwise.
    """
    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=1.)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p,), scale=1.)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1)
        lam = np.mean(
            np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000))))) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    nonzero = np.where(beta)[0]
    epsilon = 1. / np.sqrt(n)

    W = lam_frac * np.ones(p) * lam
    # W[0] = 0 # use at least some unpenalized

    # Integer division: `range(p / 10)` raises TypeError on Python 3 because
    # `/` yields a float.
    groups = np.concatenate([np.arange(10) for i in range(p // 10)])
    #print(groups)
    #groups = np.arange(p)
    penalty = rr.group_lasso(groups,
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # first randomization
    M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    mv = multiple_queries([M_est1])
    # second randomization
    #M_est2 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    #mv = multiple_queries([M_est1, M_est2])

    mv.solve()

    active_union = M_est1.selection_variable['variables']
    print("active set", np.nonzero(active_union)[0])
    nactive = np.sum(active_union)

    if nactive == 0:
        return None

    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        target_sampler, target_observed = glm_target(loss,
                                                     active_union,
                                                     mv,
                                                     bootstrap=bootstrap)

        if intervals == 'old':
            target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
            pivots_mle = target_sampler.coefficient_pvalues(
                target_observed,
                parameter=target_sampler.reference,
                sample=target_sample)
            pivots_truth = target_sampler.coefficient_pvalues(
                target_observed, parameter=true_vec, sample=target_sample)
            pvalues = target_sampler.coefficient_pvalues(
                target_observed,
                parameter=np.zeros_like(true_vec),
                sample=target_sample)
        else:
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(
                target_observed, sample=full_sample, level=0.9)
            pivots_mle = target_sampler.coefficient_pvalues_translate(
                target_observed,
                parameter=target_sampler.reference,
                sample=full_sample)
            pivots_truth = target_sampler.coefficient_pvalues_translate(
                target_observed, parameter=true_vec, sample=full_sample)
            pvalues = target_sampler.coefficient_pvalues_translate(
                target_observed,
                parameter=np.zeros_like(true_vec),
                sample=full_sample)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)

        L, U = LU.T

        # `np.bool` was removed in NumPy 1.24; use the builtin `bool`.
        ci_length_sel = np.zeros(nactive)
        covered = np.zeros(nactive, bool)
        naive_covered = np.zeros(nactive, bool)
        ci_length_naive = np.zeros(nactive)
        active_var = np.zeros(nactive, bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                covered[j] = 1
            ci_length_sel[j] = U[j] - L[j]
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >= true_vec[j]):
                naive_covered[j] = 1
            ci_length_naive[j] = LU_naive[j, 1] - LU_naive[j, 0]
            active_var[j] = active_set[j] in nonzero

        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        return pivots_mle, pivots_truth, pvalues, covered, ci_length_sel,\
            naive_pvals, naive_covered, ci_length_naive, active_var
def test_condition(s=0,
                   n=100,
                   p=200,
                   rho=0.1,
                   signal=10,
                   lam_frac=1.4,
                   ndraw=10000,
                   burnin=2000,
                   loss='logistic',
                   nviews=4,
                   scalings=True):
    """Global test after conditioning / marginalising subgradients across queries.

    ``nviews`` randomized group-lasso queries are solved jointly.  When the
    true support is selected (and the selected set is strictly larger than
    it), the views are conditioned as dictated by ``scalings`` and a
    norm-based test of the selected coefficients against ``beta`` is run.

    Parameters
    ----------
    s, n, p, rho, signal
        Forwarded to ``gaussian_instance`` / ``logistic_instance``.
    lam_frac : float
        Multiplier applied to the simulated penalty level.
    ndraw, burnin : int
        Forwarded to ``hypothesis_test``.
    loss : {'gaussian', 'logistic'}
        Which GLM loss to simulate and fit.
    nviews : int
        Number of randomized queries.
    scalings : bool
        If true, the first ``nviews // 2`` views condition on the first
        ``p // 2`` subgradient groups, marginalise the rest and condition on
        scalings; otherwise every view marginalises all groups.

    Returns
    -------
    tuple or None
        ``([pivots], [False])`` when inference is run; ``None`` otherwise.
    """
    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1)
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    randomizer = randomization.laplace((p,), scale=0.6)

    epsilon = 1. / np.sqrt(n)

    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    views = [glm_group_lasso(loss, epsilon, penalty, randomizer)
             for _ in range(nviews)]

    queries = multiple_queries(views)
    queries.solve()

    # `np.bool` was removed in NumPy 1.24; use the builtin `bool`.
    active_union = np.zeros(p, bool)
    for view in views:
        active_union |= view.selection_variable['variables']

    nactive = np.sum(active_union)
    print("nactive", nactive)

    nonzero = np.where(beta)[0]

    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        if nactive == s:
            return None

        if scalings:  # try condition on some scalings
            for i in range(nviews // 2):
                conditioning_groups = np.zeros(p, bool)
                conditioning_groups[:p // 2] = True
                marginalizing_groups = np.ones(p, bool)
                marginalizing_groups[:p // 2] = False
                views[i].decompose_subgradient(
                    conditioning_groups=conditioning_groups,
                    marginalizing_groups=marginalizing_groups)
                views[i].condition_on_scalings()
        else:
            for i in range(nviews):
                views[i].decompose_subgradient(
                    conditioning_groups=np.zeros(p, bool),
                    marginalizing_groups=np.ones(p, bool))

        target_sampler, target_observed = glm_target(loss, active_union, queries)

        test_stat = lambda x: np.linalg.norm(x - beta[active_union])
        observed_test_value = test_stat(target_observed)

        pivots = target_sampler.hypothesis_test(test_stat,
                                                observed_test_value,
                                                alternative='twosided',
                                                parameter=beta[active_union],
                                                ndraw=ndraw,
                                                burnin=burnin)

        return [pivots], [False]
def test_multiple_queries_small(ndraw=10000, burnin=2000, nsim=None):  # nsim needed for decorator
    """Bootstrapped-target tests for a single randomized logistic group-lasso query.

    Two norm-based tests are run when the true support is selected and at
    least one extra (null) variable is selected as well: one on the selected
    null coordinates and one "global null" test on all selected coordinates
    centred at the true coefficients.

    Parameters
    ----------
    ndraw, burnin : int
        Forwarded to ``hypothesis_test``.
    nsim
        Not used in this body.

    Returns
    -------
    tuple or None
        ``([pval, pval_gn], [False, False])`` when inference is run; ``None``
        otherwise.
    """
    s, n, p = 2, 100, 10

    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0, snr=3)
    true_support = np.where(beta)[0]

    lam_frac = .6
    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    coin_flips = np.random.binomial(1, 1. / 2, (n, 10000))
    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, coin_flips)).max(0))
    weights = np.ones(p) * lam
    weights[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), weights)),
                             lagrange=1.)

    # first randomization
    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)

    mv = multiple_queries([M_est])
    mv.solve()

    active_union = M_est.selection_variable['variables']
    nactive = np.sum(active_union)
    print("nactive", nactive)

    if not set(true_support).issubset(np.nonzero(active_union)[0]):
        return None
    if nactive == s:
        return None

    active_set = np.nonzero(active_union)[0]
    # Positions (within the active set) of selected variables that are truly null.
    inactive_selected = [pos for pos in np.arange(active_set.shape[0])
                         if active_set[pos] not in true_support]

    if not inactive_selected:
        return None

    # Row k picks out the k-th selected null coordinate.
    inactive_indicators_mat = np.zeros((len(inactive_selected), nactive))
    for row, col in enumerate(inactive_selected):
        inactive_indicators_mat[row, col] = 1

    form_covariances = glm_nonparametric_bootstrap(n, n)
    mv.setup_sampler(form_covariances)

    boot_target, target_observed = pairs_bootstrap_glm(loss, active_union)

    def inactive_target(indices):
        return boot_target(indices)[inactive_selected]

    inactive_observed = target_observed[inactive_selected]
    # param_cov = _parametric_cov_glm(loss, active_union)

    alpha_mat = set_alpha_matrix(loss, active_union)
    # target = target_alpha\times alpha+reference_vec
    target_alpha = np.dot(inactive_indicators_mat, alpha_mat)

    target_sampler = mv.setup_bootstrapped_target(inactive_target,
                                                  inactive_observed,
                                                  target_alpha)

    def norm_stat(x):
        return np.linalg.norm(x)

    pval = target_sampler.hypothesis_test(norm_stat,
                                          np.linalg.norm(inactive_observed),
                                          alternative='twosided',
                                          ndraw=ndraw,
                                          burnin=burnin)

    # testing the global null
    def target_gn(indices):
        return boot_target(indices)[:nactive]

    target_observed_gn = target_observed[:nactive]
    target_alpha_gn = alpha_mat

    target_sampler_gn = mv.setup_bootstrapped_target(target_gn,
                                                     target_observed_gn,
                                                     target_alpha_gn,
                                                     reference=beta[active_union])

    def norm_stat_gn(x):
        return np.linalg.norm(x)

    observed_test_value = np.linalg.norm(target_observed_gn - beta[active_union])
    pval_gn = target_sampler_gn.hypothesis_test(norm_stat_gn,
                                                observed_test_value,
                                                alternative='twosided',
                                                ndraw=ndraw,
                                                burnin=burnin)

    return [pval, pval_gn], [False, False]
def test_multiple_queries(ndraw=10000, burnin=2000, bootstrap=False, test='selected zeros'):
    """Norm-based test on the targets of five jointly solved randomized queries.

    Parameters
    ----------
    ndraw, burnin : int
        Forwarded to ``hypothesis_test`` and ``sample``.
    bootstrap : bool
        Forwarded to ``glm_target``.
    test : str
        ``'selected zeros'`` targets the selected-but-null coordinates, with
        the true support passed as the active set; any other value targets
        all selected coordinates.

    Returns
    -------
    tuple or None
        ``([pivot], [False])`` where ``pivot`` comes from
        ``hypothesis_test_translate``; ``None`` when the true support was not
        screened or nothing beyond it was selected.
    """
    s, n, p = 3, 600, 10

    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0, snr=4)
    nonzero = np.where(beta)[0]

    lam_frac = 1.
    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    nview = 5
    view = [glm_group_lasso(loss, epsilon, penalty, randomizer)
            for _ in range(nview)]

    mv = multiple_queries(view)
    mv.solve()

    # `np.bool` was removed in NumPy 1.24; use the builtin `bool`.
    active_union = np.zeros(p, bool)
    for query in view:
        active_union |= query.selection_variable['variables']

    nactive = np.sum(active_union)
    print("nactive", nactive)

    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        if nactive == s:
            return None

        if test == 'selected zeros':
            inactive_selected = np.array(
                [active_union[i] and i not in nonzero for i in range(p)])
            true_active = (beta != 0)
            reference = np.zeros(inactive_selected.sum())
            target_sampler, target_observed = glm_target(
                loss,
                true_active,
                mv,
                subset=inactive_selected,
                bootstrap=bootstrap,
                reference=reference)
        else:
            target_sampler, target_observed = glm_target(loss,
                                                         active_union,
                                                         mv,
                                                         bootstrap=bootstrap)

        test_stat = lambda x: np.linalg.norm(x)
        observed_test_value = test_stat(target_observed)

        # This first pivot is overwritten below; the call is kept so the
        # sequence of sampler calls is unchanged.
        pivot = target_sampler.hypothesis_test(test_stat,
                                               observed_test_value,
                                               alternative='twosided',
                                               ndraw=ndraw,
                                               burnin=burnin)

        full_sample = target_sampler.sample(ndraw=ndraw,
                                            burnin=burnin,
                                            keep_opt=True)

        pivot = target_sampler.hypothesis_test_translate(full_sample,
                                                         test_stat,
                                                         target_observed,
                                                         alternative='twosided')

        return [pivot], [False]
def test_cv(n=100,
            p=50,
            s=5,
            signal=7.5,
            K=5,
            rho=0.,
            randomizer='gaussian',
            randomizer_scale=1.,
            scale1=0.1,
            scale2=0.2,
            lam_frac=1.,
            glmnet=True,
            loss='gaussian',
            intervals='old',
            bootstrap=False,
            condition_on_CVR=True,
            marginalize_subgrad=True,
            ndraw=10000,
            burnin=2000,
            nboot=nboot):
    """Selective inference after choosing the penalty with a randomized CV view.

    A ``CV_view`` selects the penalty level, a randomized group lasso is fit
    at that level, and both are solved jointly through ``multiple_queries``.
    Selective and naive intervals and p-values are then computed for the
    variables selected by the group lasso, together with Benjamini-Hochberg
    decisions at level 0.2.

    Parameters
    ----------
    n, p, s, signal, rho
        Forwarded to ``gaussian_instance`` / ``logistic_instance``.
    K
        Accepted but not used in this body.
    randomizer : {'laplace', 'gaussian', 'logistic'}
        Which randomization to use, with scale ``randomizer_scale``.
    scale1, scale2
        Forwarded to ``CV_view``.
    lam_frac : float
        Multiplier applied to the CV-selected penalty level.
    glmnet : bool
        If true, try ``cv.solve(glmnet=glmnet)`` and fall back to
        ``glmnet=False`` on ``ImportError``.
    loss : {'gaussian', 'logistic'}
        Which GLM loss to simulate and fit.
    intervals : str
        ``'old'`` uses the non-translate sampler methods; anything else uses
        the ``*_translate`` variants.
    bootstrap : bool
        Forwarded to ``glm_target``.
    condition_on_CVR : bool
        If true, condition the CV view on its optimisation state and replace
        the penalty with ``cv.one_SD_rule(direction="up")``.
    marginalize_subgrad : bool
        If true, marginalise all subgradient groups of the group-lasso view.
    ndraw, burnin : int
        Forwarded to ``sample``.
    nboot : int
        When positive, assigned to ``cv.nboot`` and ``M_est1.nboot``.

    Returns
    -------
    tuple or None
        ``(pivots_truth, sel_covered, sel_length, naive_pvals, naive_covered,
        naive_length, active_var, BH_decisions, active_var)`` when the
        selected set is non-empty and contains the true support; ``None``
        otherwise.
    """
    print(n, p, s, condition_on_CVR, scale1, scale2)
    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), randomizer_scale)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p,), scale=randomizer_scale)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1)
        glm_loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        glm_loss = rr.glm.logistic(X, y)

    epsilon = 1. / np.sqrt(n)

    # view 1
    cv = CV_view(glm_loss,
                 loss_label=loss,
                 lasso_randomization=randomizer,
                 epsilon=epsilon,
                 scale1=scale1,
                 scale2=scale2)
    if glmnet:
        try:
            cv.solve(glmnet=glmnet)
        except ImportError:
            cv.solve(glmnet=False)
    else:
        cv.solve(glmnet=False)

    # for the test make sure we also run the python code
    cv_py = CV_view(glm_loss,
                    loss_label=loss,
                    lasso_randomization=randomizer,
                    epsilon=epsilon,
                    scale1=scale1,
                    scale2=scale2)
    cv_py.solve(glmnet=False)

    lam = cv.lam_CVR
    print("lam", lam)

    if condition_on_CVR:
        cv.condition_on_opt_state()
        lam = cv.one_SD_rule(direction="up")
        print("new lam", lam)

    # non-randomized Lasso, just looking how many vars it selects
    problem = rr.simple_problem(glm_loss, rr.l1norm(p, lagrange=lam))
    beta_hat = problem.solve()
    active_hat = beta_hat != 0
    print("non-randomized lasso ", active_hat.sum())

    # view 2
    W = lam_frac * np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)
    M_est1 = glm_group_lasso(glm_loss, epsilon, penalty, randomizer)

    if nboot > 0:
        cv.nboot = M_est1.nboot = nboot

    mv = multiple_queries([cv, M_est1])
    mv.solve()

    active_union = M_est1._overall
    nactive = np.sum(active_union)
    print("nactive", nactive)
    if nactive == 0:
        return None

    nonzero = np.where(beta)[0]

    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        if marginalize_subgrad == True:
            M_est1.decompose_subgradient(conditioning_groups=np.zeros(p, bool),
                                         marginalizing_groups=np.ones(p, bool))

        target_sampler, target_observed = glm_target(glm_loss,
                                                     active_union,
                                                     mv,
                                                     bootstrap=bootstrap)

        if intervals == 'old':
            target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
            pivots_truth = target_sampler.coefficient_pvalues(
                target_observed, parameter=true_vec, sample=target_sample)
            pvalues = target_sampler.coefficient_pvalues(
                target_observed,
                parameter=np.zeros_like(true_vec),
                sample=target_sample)
        else:
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(
                target_observed, sample=full_sample, level=0.9)
            pivots_truth = target_sampler.coefficient_pvalues_translate(
                target_observed, parameter=true_vec, sample=full_sample)
            pvalues = target_sampler.coefficient_pvalues_translate(
                target_observed,
                parameter=np.zeros_like(true_vec),
                sample=full_sample)

        L, U = LU.T

        # `np.bool` was removed in NumPy 1.24; use the builtin `bool`.
        sel_covered = np.zeros(nactive, bool)
        sel_length = np.zeros(nactive)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        naive_covered = np.zeros(nactive, bool)
        naive_length = np.zeros(nactive)
        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        active_var = np.zeros(nactive, bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                sel_covered[j] = 1
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >= true_vec[j]):
                naive_covered[j] = 1
            sel_length[j] = U[j] - L[j]
            naive_length[j] = LU_naive[j, 1] - LU_naive[j, 0]
            active_var[j] = active_set[j] in nonzero

        q = 0.2
        BH_decisions = multipletests(pvalues, alpha=q, method="fdr_bh")[0]

        # `active_var` appears twice in the returned tuple; kept so callers
        # that unpack nine values continue to work.
        return pivots_truth, sel_covered, sel_length, naive_pvals, naive_covered, \
            naive_length, active_var, BH_decisions, active_var
def test_condition(ndraw=10000, burnin=2000, scalings=True):
    """Coefficient p-values after conditioning three randomized Gaussian queries.

    Three randomized group-lasso queries on a Gaussian loss are solved
    jointly.  When the true support is selected and at least one extra
    variable is selected, the views are conditioned on scalings and/or
    subgradients and per-coefficient p-values are computed.

    Parameters
    ----------
    ndraw, burnin : int
        Forwarded to ``coefficient_pvalues``.
    scalings : bool
        If true, view 0 conditions on scalings and subgradient, view 1 on
        subgradient, view 2 on scalings; otherwise all three condition on the
        subgradient only.

    Returns
    -------
    tuple or None
        ``(pvalues, active_var)`` where ``active_var`` flags the selected
        variables that are truly nonzero; ``None`` when inference is not run.
    """
    s, n, p = 6, 600, 40

    X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=0.2, snr=5)
    randomizer = randomization.isotropic_gaussian((p,), scale=sigma)

    lam_frac = 1.5

    loss = rr.glm.gaussian(X, y)
    epsilon = 1. / np.sqrt(n)

    lam = lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    nview = 3
    views = [glm_group_lasso(loss, epsilon, penalty, randomizer)
             for _ in range(nview)]

    queries = multiple_queries(views)
    queries.solve()

    # `np.bool` was removed in NumPy 1.24; use the builtin `bool`.
    active_union = np.zeros(p, bool)
    for view in views:
        active_union |= view.selection_variable['variables']

    nactive = np.sum(active_union)
    print("nactive", nactive)

    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        if nactive == s:
            return None

        if scalings:  # try condition on some scalings
            views[0].condition_on_scalings()
            views[0].condition_on_subgradient()
            views[1].condition_on_subgradient()
            views[2].condition_on_scalings()
        else:
            views[0].condition_on_subgradient()
            views[1].condition_on_subgradient()
            views[2].condition_on_subgradient()

        active_set = np.nonzero(active_union)[0]
        target_sampler, target_observed = glm_target(loss, active_union, queries)

        pvalues = target_sampler.coefficient_pvalues(target_observed,
                                                     alternative='twosided',
                                                     ndraw=ndraw,
                                                     burnin=burnin)

        active_var = np.zeros_like(pvalues, bool)
        _nonzero = np.array([i in nonzero for i in active_set])
        active_var[_nonzero] = True

        return pvalues, active_var
def test_marginalize(s=4,
                     n=600,
                     p=200,
                     rho=0.,
                     signal=3.5,
                     lam_frac=2.5,
                     ndraw=10000,
                     burnin=2000,
                     loss='gaussian',
                     randomizer='gaussian',
                     randomizer_scale=1.,
                     nviews=3,
                     scalings=True,
                     subgrad=True,
                     parametric=False,
                     intervals='old'):
    """Intervals and pivots after partially marginalising subgradients.

    ``nviews`` randomized group-lasso queries are solved jointly.  When the
    true support is selected (and something beyond it is selected too), each
    view may condition on scalings and then condition on the first half of
    the subgradient groups while marginalising the second half.

    Parameters
    ----------
    s, n, p, rho, signal
        Forwarded to ``gaussian_instance`` / ``logistic_instance``.
    lam_frac : float
        Multiplier applied to the penalty weights.
    ndraw, burnin : int
        Forwarded to ``sample``.
    loss : {'gaussian', 'logistic'}
        Which GLM loss to simulate and fit.
    randomizer : {'laplace', 'gaussian', 'logistic'}
        Which randomization to use, with scale ``randomizer_scale``.
    nviews : int
        Number of randomized queries.
    scalings : bool
        If true, every view conditions on scalings.
    subgrad : bool
        If true, every view decomposes its subgradient as described above.
    parametric : bool
        Selects ``glm_group_lasso_parametric`` over ``glm_group_lasso`` and is
        forwarded to ``glm_target``.
    intervals : {'old', 'new'}
        Which family of sampler methods to use.

    Returns
    -------
    tuple or None
        ``(pivots, covered, ci_length, naive_pvals, covered_naive,
        ci_length_naive)`` when inference is run; ``None`` otherwise.
    """
    print(n, p, s)

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), randomizer_scale)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p,), scale=randomizer_scale)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1)
        lam = np.mean(
            np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000))))) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    epsilon = 1. / np.sqrt(n)

    W = lam_frac * np.ones(p) * lam
    #W[0] = 0 # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    views = []
    for i in range(nviews):
        if parametric == False:
            views.append(glm_group_lasso(loss, epsilon, penalty, randomizer))
        else:
            views.append(
                glm_group_lasso_parametric(loss, epsilon, penalty, randomizer))

    queries = multiple_queries(views)
    queries.solve()

    # `np.bool` was removed in NumPy 1.24; use the builtin `bool`.
    active_union = np.zeros(p, bool)
    for view in views:
        active_union |= view.selection_variable['variables']

    nactive = np.sum(active_union)
    print("nactive", nactive)

    nonzero = np.where(beta)[0]
    true_vec = beta[active_union]

    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        check_screen = True

        if nactive == s:
            return None

        # BUG: if this scalings code is moved after the decompose_subgradient,
        # code seems to run fine
        if scalings:  # try condition on some scalings
            for i in range(nviews):
                views[i].condition_on_scalings()
        if subgrad:
            # Integer division: float slice bounds (`p / 2`) raise TypeError
            # on Python 3.
            half = p // 2
            for i in range(nviews):
                conditioning_groups = np.zeros(p, dtype=bool)
                conditioning_groups[:half] = True
                marginalizing_groups = np.zeros(p, dtype=bool)
                marginalizing_groups[half:] = True
                views[i].decompose_subgradient(
                    conditioning_groups=conditioning_groups,
                    marginalizing_groups=marginalizing_groups)

        active_set = np.nonzero(active_union)[0]
        target_sampler, target_observed = glm_target(loss,
                                                     active_union,
                                                     queries,
                                                     bootstrap=False,
                                                     parametric=parametric)
        #reference= beta[active_union])

        if intervals == 'old':
            target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
            pivots = target_sampler.coefficient_pvalues(target_observed,
                                                        parameter=true_vec,
                                                        sample=target_sample)
        elif intervals == 'new':
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(
                target_observed, sample=full_sample, level=0.9)
            pivots = target_sampler.coefficient_pvalues_translate(
                target_observed, parameter=true_vec, sample=full_sample)

        #test_stat = lambda x: np.linalg.norm(x - beta[active_union])
        #observed_test_value = test_stat(target_observed)
        #pivots = target_sampler.hypothesis_test(test_stat,
        #                                        observed_test_value,
        #                                        alternative='twosided',
        #                                        parameter = beta[active_union],
        #                                        ndraw=ndraw,
        #                                        burnin=burnin,
        #                                        stepsize=None)

        def coverage(LU):
            """Return per-coordinate coverage indicators and interval lengths."""
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)

            for j in range(nactive):
                if check_screen:
                    if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                        covered[j] = 1
                else:
                    # Assigning None into a float array stores NaN.
                    covered[j] = None
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered, ci_length = coverage(LU)
        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        covered_naive, ci_length_naive = coverage(LU_naive)
        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        return pivots, covered, ci_length, naive_pvals, covered_naive, ci_length_naive