def test_intervals(s=3,
                   n=200,
                   p=50,
                   snr=7,
                   rho=0.1,
                   split_frac=0.8,
                   lam_frac=0.7,
                   ndraw=10000,
                   burnin=2000,
                   bootstrap=True,
                   intervals='new',
                   solve_args={'min_its': 50, 'tol': 1.e-10}):
    """Run one randomized logistic group-lasso trial and report coverage.

    A logistic instance is simulated, a randomized group lasso (Laplace
    randomization, first coefficient unpenalized) is solved, and selective
    intervals and p-values for the selected coefficients are computed from
    the target sampler returned by ``glm_target``.

    Parameters
    ----------
    s, n, p, snr, rho
        Forwarded to ``logistic_instance``.
    split_frac, bootstrap
        Accepted but not used by this function.
    lam_frac : float
        Multiplier applied to the simulated penalty level.
    ndraw, burnin : int
        Forwarded to ``target_sampler.sample``.
    intervals : str
        ``'old'`` uses ``confidence_intervals`` on the plain sample; any
        other value draws a second sample with ``keep_opt=True`` and uses
        ``confidence_intervals_translate``.
    solve_args : dict
        Forwarded (as a copy) to ``restricted_Mest``.

    Returns
    -------
    tuple or None
        ``(pivots_mle, pivots_truth, pvalues, covered, naive_covered,
        active_var)``.  ``None`` when no variable is selected or when the
        selected set does not contain every nonzero entry of ``beta``.
    """
    randomizer = randomization.laplace((p,), scale=1.)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)
    nonzero = np.where(beta)[0]

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    # Penalty level: lam_frac times the mean, over 10000 simulated
    # Bernoulli(1/2) response vectors, of max_j |X_j^T y|.
    lam = lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # Single randomized view.
    M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    mv = multiple_queries([M_est1])
    mv.solve()

    active_union = M_est1.selection_variable['variables']
    nactive = np.sum(active_union)
    if nactive == 0:
        return None

    active_set = np.nonzero(active_union)[0]
    if not set(nonzero).issubset(active_set):
        # Selection missed part of the true support: nothing is reported.
        return None

    true_vec = beta[active_union]

    target_sampler, target_observed = glm_target(loss, active_union, mv)
    target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)

    if intervals == 'old':
        LU = target_sampler.confidence_intervals(target_observed,
                                                 sample=target_sample,
                                                 level=0.9)
    else:
        full_sample = target_sampler.sample(ndraw=ndraw,
                                            burnin=burnin,
                                            keep_opt=True)
        LU = target_sampler.confidence_intervals_translate(
            target_observed, sample=full_sample, level=0.9)

    LU_naive = naive_confidence_intervals(target_sampler, target_observed)

    pivots_mle = target_sampler.coefficient_pvalues(
        target_observed,
        parameter=target_sampler.reference,
        sample=target_sample)
    pivots_truth = target_sampler.coefficient_pvalues(
        target_observed, parameter=true_vec, sample=target_sample)
    pvalues = target_sampler.coefficient_pvalues(
        target_observed,
        parameter=np.zeros_like(true_vec),
        sample=target_sample)

    # The restricted fit is still run but its result is not used below.
    # A copy is passed so the shared default dict can never be mutated.
    restricted_Mest(loss, active_union, solve_args=dict(solve_args))

    L, U = LU.T
    # Builtin ``bool`` instead of the ``np.bool`` alias, which newer NumPy
    # releases no longer provide.
    covered = np.asarray((L <= true_vec) & (U >= true_vec), dtype=bool)
    naive_covered = np.asarray(
        (LU_naive[:, 0] <= true_vec) & (LU_naive[:, 1] >= true_vec),
        dtype=bool)
    active_var = np.array([idx in nonzero for idx in active_set], dtype=bool)

    return (pivots_mle, pivots_truth, pvalues, covered, naive_covered,
            active_var)
ci_active = np.zeros((nactive, 2)) ci_length = np.zeros(nactive) mle_active = np.zeros((nactive, 1)) ci = approximate_conditional_density(M_est) ci.solve_approx() class target_class(object): def __init__(self, target_cov): self.target_cov = target_cov self.shape = target_cov.shape target = target_class(M_est.target_cov) ci_naive = naive_confidence_intervals(target, M_est.target_observed) for j in range(nactive): ci_active[j, :] = np.array(ci.approximate_ci(j)) ci_length[j] = ci_active[j, 1] - ci_active[j, 0] mle_active[j, :] = ci.approx_MLE_solver(j, nstep=100)[0] unadjusted_mle = np.zeros((nactive, 1)) for j in range(nactive): unadjusted_mle[j, :] = ci.target_observed[j] adjusted_intervals = np.hstack([mle_active, ci_active]).T unadjusted_intervals = np.hstack([unadjusted_mle, ci_naive]).T print("adjusted confidence", adjusted_intervals) print("naive confidence", unadjusted_intervals)
def test_without_screening(s=10,
                           n=300,
                           p=100,
                           rho=0.,
                           signal=3.5,
                           lam_frac=1.,
                           ndraw=10000,
                           burnin=2000,
                           loss='gaussian',
                           randomizer='laplace',
                           randomizer_scale=1.,
                           scalings=False,
                           subgrad=True,
                           check_screen=False):
    """Selective inference for a difference of two bootstrap targets.

    One data set is used for the randomized group lasso; a second,
    independently simulated data set supplies a second pairs-bootstrap
    target.  The inferential target is the difference of the two
    bootstrap targets restricted to the first ``nactive`` coordinates, and
    coverage is checked against a zero vector.

    Returns ``(pivots, covered, ci_length, naive_pvals, covered_naive,
    ci_length_naive)``, or ``None`` when ``check_screen`` is ``True`` and
    the selected set does not contain every nonzero entry of ``beta``.
    """
    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n,
                                                       p=p,
                                                       s=s,
                                                       rho=rho,
                                                       signal=signal,
                                                       sigma=1,
                                                       random_signs=False)
        gaussian_noise = np.random.standard_normal((n, 2000))
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, gaussian_noise)).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
        X_indep, y_indep, _, _, _ = gaussian_instance(n=n,
                                                      p=p,
                                                      s=s,
                                                      rho=rho,
                                                      signal=signal,
                                                      sigma=1)
        loss_indep = rr.glm.gaussian(X_indep, y_indep)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n,
                                          p=p,
                                          s=s,
                                          rho=rho,
                                          signal=signal)
        loss = rr.glm.logistic(X, y)
        coin_flips = np.random.binomial(1, 1. / 2, (n, 10000))
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, coin_flips)).max(0))
        X_indep, y_indep, _, _ = logistic_instance(n=n,
                                                   p=p,
                                                   s=s,
                                                   rho=rho,
                                                   signal=signal,
                                                   random_signs=False)
        loss_indep = rr.glm.logistic(X_indep, y_indep)

    nonzero = np.where(beta)[0]

    # Turn the randomizer label into a randomization object; any other
    # label is passed on unchanged.
    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,),
                                                      scale=randomizer_scale)

    epsilon = 1. / np.sqrt(n)
    weight_vector = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), weight_vector)),
                             lagrange=1.)

    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)
    M_est.solve()

    active_union = M_est._overall
    nactive = np.sum(active_union)
    print("nactive", nactive)
    active_set = np.nonzero(active_union)[0]
    print("active set", active_set)
    print("true nonzero", np.nonzero(beta)[0])

    queries = multiple_queries([M_est])
    queries.solve()

    screened = set(nonzero).issubset(np.nonzero(active_union)[0])

    # Proceed unless screening was requested and failed.
    if not (check_screen == False or (check_screen == True and screened)):
        return None

    if scalings:  # try condition on some scalings
        M_est.condition_on_subgradient()
        M_est.condition_on_scalings()
    if subgrad:
        M_est.decompose_subgradient(
            conditioning_groups=np.zeros(p, dtype=bool),
            marginalizing_groups=np.ones(p, bool))

    boot_target1, boot_target_observed1 = pairs_bootstrap_glm(
        loss, active_union, inactive=~active_union)
    boot_target2, boot_target_observed2 = pairs_bootstrap_glm(
        loss_indep, active_union, inactive=~active_union)
    observed_difference = boot_target_observed1 - boot_target_observed2
    target_observed = observed_difference[:nactive]

    def _target(indices):
        first = boot_target1(indices)[:nactive]
        second = boot_target2(indices)[:nactive]
        return first - second

    form_covariances = glm_nonparametric_bootstrap(n, n)
    queries.setup_sampler(form_covariances)
    queries.setup_opt_state()

    target_sampler = queries.setup_target(_target,
                                          target_observed,
                                          reference=target_observed)
    target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)

    LU = target_sampler.confidence_intervals(target_observed,
                                             sample=target_sample,
                                             level=0.9)
    pivots = target_sampler.coefficient_pvalues(target_observed,
                                                parameter=np.zeros(nactive),
                                                sample=target_sample)

    # Coverage is measured against zero.
    true_vec = np.zeros(nactive)

    def coverage(bounds):
        lower, upper = bounds[:, 0], bounds[:, 1]
        hit = np.zeros(nactive)
        width = np.zeros(nactive)
        for k in range(nactive):
            if lower[k] <= true_vec[k] <= upper[k]:
                hit[k] = 1
            width[k] = upper[k] - lower[k]
        return hit, width

    covered, ci_length = coverage(LU)
    LU_naive = naive_confidence_intervals(target_sampler, target_observed)
    covered_naive, ci_length_naive = coverage(LU_naive)
    naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

    return (pivots, covered, ci_length, naive_pvals, covered_naive,
            ci_length_naive)
def randomized_lasso_trial(X,
                           y,
                           beta,
                           sigma,
                           lam,
                           loss='logistic',
                           randomizer='gaussian',
                           estimation='parametric'):
    """Compare posterior-sample intervals with naive intervals for one fit.

    Solves the approximate randomized M-estimator, draws posterior samples
    from ``sel_inf_random_lasso`` and forms 5%-95% percentile intervals.
    Coverage is measured against a zero vector.

    Returns a column array stacking, in order: selective coverage, naive
    coverage, mean selective length, mean naive length, selective risk and
    naive risk (each averaged over the ``nactive`` selected coordinates).
    """
    from selection.api import randomization

    n, p = X.shape
    if loss == "gaussian":
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        loss = rr.glm.logistic(X, y)

    epsilon = 1. / np.sqrt(n)

    penalty_weights = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), penalty_weights)),
                             lagrange=1.)

    # Rebinding the imported name matches the original local usage.
    randomization = randomization.isotropic_gaussian((p,), scale=1.)

    M_est = M_estimator_approx_logistic(loss, epsilon, penalty,
                                        randomization, randomizer, estimation)
    M_est.solve_approx()

    active = M_est._overall
    active_set = np.asarray([idx for idx in range(p) if active[idx]])
    nactive = np.sum(active)

    glm = M_est.observed_score_state[:nactive]
    prior_variance = 100000.

    class target_class(object):
        # Minimal holder exposing ``target_cov`` and ``shape``.
        def __init__(self, target_cov):
            self.target_cov = target_cov
            self.shape = target_cov.shape

    target = target_class(M_est.target_cov)
    naive_bounds = naive_confidence_intervals(target, M_est.target_observed)
    unadjusted_intervals = naive_bounds.T

    grad_lasso = sel_inf_random_lasso(M_est, prior_variance)
    samples = grad_lasso.posterior_samples()

    lower_pct = np.percentile(samples, 5, axis=0)
    upper_pct = np.percentile(samples, 95, axis=0)
    adjusted_intervals = np.vstack([lower_pct, upper_pct])
    selective_mean = np.mean(samples, axis=0)

    true_val = np.zeros(nactive)
    coverage_ad = np.zeros(nactive)
    coverage_unad = np.zeros(nactive)
    ad_length = np.zeros(nactive)
    unad_length = np.zeros(nactive)

    for k in range(nactive):
        truth = true_val[k]

        ad_lo, ad_hi = adjusted_intervals[0, k], adjusted_intervals[1, k]
        if ad_lo <= truth <= ad_hi:
            coverage_ad[k] += 1
        ad_length[k] = ad_hi - ad_lo

        un_lo, un_hi = unadjusted_intervals[0, k], unadjusted_intervals[1, k]
        if un_lo <= truth <= un_hi:
            coverage_unad[k] += 1
        unad_length[k] = un_hi - un_lo

    sel_cov = coverage_ad.sum() / nactive
    naive_cov = coverage_unad.sum() / nactive
    ad_len = ad_length.sum() / nactive
    unad_len = unad_length.sum() / nactive
    bayes_risk_ad = np.power(selective_mean - true_val, 2.).sum() / nactive
    bayes_risk_unad = np.power(glm - true_val, 2.).sum() / nactive

    summary = [sel_cov, naive_cov, ad_len, unad_len,
               bayes_risk_ad, bayes_risk_unad]
    return np.vstack(summary)
def test_split(s=3,
               n=200,
               p=50,
               signal=7,
               rho=0.1,
               split_frac=0.8,
               lam_frac=0.7,
               ndraw=10000,
               burnin=2000,
               bootstrap=True,
               solve_args={'min_its':50, 'tol':1.e-10},
               reference_known=False):
    """Run one data-splitting logistic group-lasso trial and report coverage.

    Parameters
    ----------
    s, n, p, signal, rho
        Forwarded to ``logistic_instance``.
    split_frac : float
        ``int(split_frac * n)`` is passed to ``split_glm_group_lasso`` as
        the split size.
    lam_frac : float
        Multiplier applied to the simulated penalty level.
    ndraw, burnin : int
        Forwarded to ``target_sampler.sample``.
    bootstrap : bool
        Forwarded to ``glm_target`` as its ``bootstrap`` argument.
    solve_args : dict
        Accepted but not used by this function.
    reference_known : bool
        Currently overridden inside the function (see the note below).

    Returns
    -------
    tuple or None
        ``(pivots_mle, pivots_truth, pvalues, covered, naive_covered,
        active_var)``.  ``None`` when no variable is selected or when the
        selected set does not contain every nonzero entry of ``beta``.
    """
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
    m = int(split_frac * n)
    nonzero = np.where(beta)[0]

    loss = rr.glm.logistic(X, y)
    epsilon = 1. / np.sqrt(n)

    lam = lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 2000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    M_est = split_glm_group_lasso(loss, epsilon, m, penalty)
    mv = multiple_queries([M_est])
    mv.solve()

    active_union = M_est.selection_variable['variables']
    nactive = np.sum(active_union)
    if nactive == 0:
        return None

    active_set = np.nonzero(active_union)[0]
    if not set(nonzero).issubset(active_set):
        # Selection missed part of the true support: nothing is reported.
        return None

    # Pass the flag straight through.  Previously the two branches were
    # swapped, so ``bootstrap=True`` skipped the bootstrap and
    # ``bootstrap=False`` requested it.
    target_sampler, target_observed = glm_target(loss,
                                                 active_union,
                                                 mv,
                                                 bootstrap=bootstrap)

    true_vec = beta[active_union]

    # NOTE(review): the caller-supplied ``reference_known`` is overridden
    # here, so the sampler is always centred at the true coefficients.
    # Confirm whether the argument should be honoured instead.
    reference_known = True
    if reference_known:
        reference = true_vec
    else:
        reference = target_observed

    target_sampler.reference = reference

    target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)

    LU = target_sampler.confidence_intervals(target_observed,
                                             sample=target_sample).T
    LU_naive = naive_confidence_intervals(target_sampler, target_observed)

    pivots_mle = target_sampler.coefficient_pvalues(
        target_observed,
        parameter=target_sampler.reference,
        sample=target_sample)
    pivots_truth = target_sampler.coefficient_pvalues(
        target_observed, parameter=true_vec, sample=target_sample)
    pvalues = target_sampler.coefficient_pvalues(
        target_observed,
        parameter=np.zeros_like(true_vec),
        sample=target_sample)

    L, U = LU
    # Builtin ``bool`` instead of the ``np.bool`` alias, which newer NumPy
    # releases no longer provide.
    covered = np.asarray((L <= true_vec) & (U >= true_vec), dtype=bool)
    naive_covered = np.asarray(
        (LU_naive[:, 0] <= true_vec) & (LU_naive[:, 1] >= true_vec),
        dtype=bool)
    active_var = np.array([idx in nonzero for idx in active_set], dtype=bool)

    return (pivots_mle, pivots_truth, pvalues, covered, naive_covered,
            active_var)
def test_intervals(s=0,
                   n=200,
                   p=10,
                   signal=7,
                   rho=0.,
                   lam_frac=6.,
                   ndraw=10000,
                   burnin=2000,
                   bootstrap=True,
                   loss='gaussian',
                   intervals='old',
                   randomizer='laplace',
                   solve_args={'min_its': 50, 'tol': 1.e-10}):
    """Run one randomized group-lasso trial with groups of repeated labels.

    Parameters
    ----------
    s, n, p, signal, rho
        Forwarded to ``gaussian_instance`` / ``logistic_instance``.
    lam_frac : float
        Multiplier applied to the penalty weights (and, for the logistic
        loss, also to the simulated penalty level).
    ndraw, burnin : int
        Forwarded to ``target_sampler.sample``.
    bootstrap : bool
        Forwarded to ``glm_target``.
    loss : {'gaussian', 'logistic'}
        Which instance generator and regreg loss to use.
    intervals : str
        ``'old'`` uses the plain sampler methods; any other value uses the
        ``*_translate`` methods on a sample drawn with ``keep_opt=True``.
    randomizer : {'laplace', 'gaussian', 'logistic'}
        Randomization family; any other value is passed on unchanged.
    solve_args : dict
        Accepted but not used by this function.

    Returns
    -------
    tuple or None
        ``(pivots_mle, pivots_truth, pvalues, covered, ci_length_sel,
        naive_pvals, naive_covered, ci_length_naive, active_var)``.
        ``None`` when no variable is selected or when the selected set does
        not contain every nonzero entry of ``beta``.
    """
    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=1.)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p,), scale=1.)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n,
                                                       p=p,
                                                       s=s,
                                                       rho=rho,
                                                       signal=signal,
                                                       sigma=1)
        lam = np.mean(
            np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000))))) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n,
                                          p=p,
                                          s=s,
                                          rho=rho,
                                          signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2,
                                                   (n, 10000)))).max(0))

    nonzero = np.where(beta)[0]
    epsilon = 1. / np.sqrt(n)

    W = lam_frac * np.ones(p) * lam
    # Group labels 0..9 repeated p // 10 times.  Floor division keeps the
    # count an int under Python 3, where ``p / 10`` is a float and
    # ``range`` rejects it.
    groups = np.concatenate([np.arange(10) for _ in range(p // 10)])
    penalty = rr.group_lasso(groups,
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # Single randomized view.
    M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    mv = multiple_queries([M_est1])
    mv.solve()

    active_union = M_est1.selection_variable['variables']
    active_set = np.nonzero(active_union)[0]
    print("active set", active_set)
    nactive = np.sum(active_union)
    if nactive == 0:
        return None

    if not set(nonzero).issubset(active_set):
        # Selection missed part of the true support: nothing is reported.
        return None

    true_vec = beta[active_union]
    null_vec = np.zeros_like(true_vec)

    target_sampler, target_observed = glm_target(loss,
                                                 active_union,
                                                 mv,
                                                 bootstrap=bootstrap)

    if intervals == 'old':
        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
        LU = target_sampler.confidence_intervals(target_observed,
                                                 sample=target_sample,
                                                 level=0.9)
        pivots_mle = target_sampler.coefficient_pvalues(
            target_observed,
            parameter=target_sampler.reference,
            sample=target_sample)
        pivots_truth = target_sampler.coefficient_pvalues(
            target_observed, parameter=true_vec, sample=target_sample)
        pvalues = target_sampler.coefficient_pvalues(
            target_observed, parameter=null_vec, sample=target_sample)
    else:
        full_sample = target_sampler.sample(ndraw=ndraw,
                                            burnin=burnin,
                                            keep_opt=True)
        LU = target_sampler.confidence_intervals_translate(
            target_observed, sample=full_sample, level=0.9)
        pivots_mle = target_sampler.coefficient_pvalues_translate(
            target_observed,
            parameter=target_sampler.reference,
            sample=full_sample)
        pivots_truth = target_sampler.coefficient_pvalues_translate(
            target_observed, parameter=true_vec, sample=full_sample)
        pvalues = target_sampler.coefficient_pvalues_translate(
            target_observed, parameter=null_vec, sample=full_sample)

    LU_naive = naive_confidence_intervals(target_sampler, target_observed)

    L, U = LU.T
    # Builtin ``bool`` instead of the ``np.bool`` alias, which newer NumPy
    # releases no longer provide.
    covered = np.asarray((L <= true_vec) & (U >= true_vec), dtype=bool)
    ci_length_sel = np.asarray(U - L, dtype=float)
    naive_covered = np.asarray(
        (LU_naive[:, 0] <= true_vec) & (LU_naive[:, 1] >= true_vec),
        dtype=bool)
    ci_length_naive = np.asarray(LU_naive[:, 1] - LU_naive[:, 0],
                                 dtype=float)
    active_var = np.array([idx in nonzero for idx in active_set], dtype=bool)

    naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

    return pivots_mle, pivots_truth, pvalues, covered, ci_length_sel,\
        naive_pvals, naive_covered, ci_length_naive, active_var
def test_cv(n=100,
            p=50,
            s=5,
            signal=7.5,
            K=5,
            rho=0.,
            randomizer='gaussian',
            randomizer_scale=1.,
            scale1=0.1,
            scale2=0.2,
            lam_frac=1.,
            glmnet=True,
            loss='gaussian',
            intervals='old',
            bootstrap=False,
            condition_on_CVR=True,
            marginalize_subgrad=True,
            ndraw=10000,
            burnin=2000,
            nboot=nboot):
    """Selective inference after choosing the penalty by randomized CV.

    Two views are combined: a ``CV_view`` that picks the penalty level and
    a randomized group lasso fitted at that level.

    Parameters
    ----------
    n, p, s, signal, rho
        Forwarded to ``gaussian_instance`` / ``logistic_instance``.
    K
        Accepted but not used by this function.
    randomizer : {'laplace', 'gaussian', 'logistic'}
        Randomization family; any other value is passed on unchanged.
    randomizer_scale : float
        Scale of the randomization.
    scale1, scale2 : float
        Forwarded to ``CV_view``.
    lam_frac : float
        Multiplier applied to the group-lasso weights.
    glmnet : bool
        When true, ``cv.solve(glmnet=glmnet)`` is tried first and falls
        back to ``glmnet=False`` on ``ImportError``.
    loss : {'gaussian', 'logistic'}
        Which instance generator and regreg loss to use.
    intervals : str
        ``'old'`` uses the plain sampler methods; any other value uses the
        ``*_translate`` methods.
    bootstrap : bool
        Forwarded to ``glm_target``.
    condition_on_CVR : bool
        When true, the CV view conditions on its optimization state and
        the penalty is replaced by ``cv.one_SD_rule(direction="up")``.
    marginalize_subgrad : bool
        When true, the subgradient of the lasso view is marginalized over
        all ``p`` groups.
    ndraw, burnin : int
        Forwarded to ``target_sampler.sample``.
    nboot : int
        If positive, assigned to ``cv.nboot`` and ``M_est1.nboot``.

    Returns
    -------
    tuple or None
        ``(pivots_truth, sel_covered, sel_length, naive_pvals,
        naive_covered, naive_length, active_var, BH_desicions,
        active_var)`` -- ``active_var`` appears twice, as callers receive
        it today.  ``None`` when no variable is selected or when the
        selected set does not contain every nonzero entry of ``beta``.
    """
    print(n, p, s, condition_on_CVR, scale1, scale2)

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), randomizer_scale)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p,), scale=randomizer_scale)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n,
                                                       p=p,
                                                       s=s,
                                                       rho=rho,
                                                       signal=signal,
                                                       sigma=1)
        glm_loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n,
                                          p=p,
                                          s=s,
                                          rho=rho,
                                          signal=signal)
        glm_loss = rr.glm.logistic(X, y)

    epsilon = 1. / np.sqrt(n)

    # view 1: randomized cross-validation
    cv = CV_view(glm_loss,
                 loss_label=loss,
                 lasso_randomization=randomizer,
                 epsilon=epsilon,
                 scale1=scale1,
                 scale2=scale2)
    if glmnet:
        try:
            cv.solve(glmnet=glmnet)
        except ImportError:
            cv.solve(glmnet=False)
    else:
        cv.solve(glmnet=False)

    # for the test make sure we also run the python code
    cv_py = CV_view(glm_loss,
                    loss_label=loss,
                    lasso_randomization=randomizer,
                    epsilon=epsilon,
                    scale1=scale1,
                    scale2=scale2)
    cv_py.solve(glmnet=False)

    lam = cv.lam_CVR
    print("lam", lam)

    if condition_on_CVR:
        cv.condition_on_opt_state()
        lam = cv.one_SD_rule(direction="up")
        print("new lam", lam)

    # non-randomized Lasso, just looking how many vars it selects
    problem = rr.simple_problem(glm_loss, rr.l1norm(p, lagrange=lam))
    beta_hat = problem.solve()
    active_hat = beta_hat != 0
    print("non-randomized lasso ", active_hat.sum())

    # view 2: randomized group lasso at the chosen penalty
    W = lam_frac * np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)
    M_est1 = glm_group_lasso(glm_loss, epsilon, penalty, randomizer)

    if nboot > 0:
        cv.nboot = M_est1.nboot = nboot

    mv = multiple_queries([cv, M_est1])
    mv.solve()

    active_union = M_est1._overall
    nactive = np.sum(active_union)
    print("nactive", nactive)
    if nactive == 0:
        return None

    nonzero = np.where(beta)[0]
    active_set = np.nonzero(active_union)[0]
    if not set(nonzero).issubset(active_set):
        # Selection missed part of the true support: nothing is reported.
        return None

    true_vec = beta[active_union]
    null_vec = np.zeros_like(true_vec)

    if marginalize_subgrad:
        M_est1.decompose_subgradient(
            conditioning_groups=np.zeros(p, bool),
            marginalizing_groups=np.ones(p, bool))

    target_sampler, target_observed = glm_target(glm_loss,
                                                 active_union,
                                                 mv,
                                                 bootstrap=bootstrap)

    if intervals == 'old':
        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
        LU = target_sampler.confidence_intervals(target_observed,
                                                 sample=target_sample,
                                                 level=0.9)
        pivots_truth = target_sampler.coefficient_pvalues(
            target_observed, parameter=true_vec, sample=target_sample)
        pvalues = target_sampler.coefficient_pvalues(
            target_observed, parameter=null_vec, sample=target_sample)
    else:
        full_sample = target_sampler.sample(ndraw=ndraw,
                                            burnin=burnin,
                                            keep_opt=True)
        LU = target_sampler.confidence_intervals_translate(
            target_observed, sample=full_sample, level=0.9)
        pivots_truth = target_sampler.coefficient_pvalues_translate(
            target_observed, parameter=true_vec, sample=full_sample)
        pvalues = target_sampler.coefficient_pvalues_translate(
            target_observed, parameter=null_vec, sample=full_sample)

    L, U = LU.T
    LU_naive = naive_confidence_intervals(target_sampler, target_observed)
    naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

    # Builtin ``bool`` instead of the ``np.bool`` alias, which newer NumPy
    # releases no longer provide.
    sel_covered = np.asarray((L <= true_vec) & (U >= true_vec), dtype=bool)
    sel_length = np.asarray(U - L, dtype=float)
    naive_covered = np.asarray(
        (LU_naive[:, 0] <= true_vec) & (LU_naive[:, 1] >= true_vec),
        dtype=bool)
    naive_length = np.asarray(LU_naive[:, 1] - LU_naive[:, 0], dtype=float)
    active_var = np.array([idx in nonzero for idx in active_set], dtype=bool)

    # Benjamini-Hochberg decisions at level q on the selective p-values.
    q = 0.2
    BH_desicions = multipletests(pvalues, alpha=q, method="fdr_bh")[0]

    return pivots_truth, sel_covered, sel_length, naive_pvals, naive_covered, naive_length, active_var, BH_desicions, active_var
def test_approximate_ci(n=200,
                        p=50,
                        s=0,
                        snr=5,
                        threshold = 3.,
                        rho=0.1,
                        lam_frac = 1.,
                        loss='gaussian',
                        randomizer='gaussian'):
    """Approximate selective intervals after thresholding the score.

    Parameters
    ----------
    n, p, s, snr, rho
        Forwarded to ``gaussian_instance`` / ``logistic_instance``.
    threshold : float
        Forwarded to ``threshold_score_approx``.
    lam_frac
        Accepted but not used by this function.
    loss : {'gaussian', 'logistic'}
        Which instance generator and regreg loss to use.
    randomizer : {'gaussian', 'laplace'}
        Randomization family; also forwarded to ``threshold_score_approx``.

    Returns
    -------
    tuple or None
        ``(covered, ci_length, pivots, naive_covered, naive_pvals)``.
        ``None`` when the selected set does not contain every index
        ``0..s-1``.
    """
    from selection.api import randomization

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s,
                                                       rho=rho, snr=snr,
                                                       sigma=1.)
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)
        loss = rr.glm.logistic(X, y)

    # The imported name is rebound to the randomization instance; for any
    # other ``randomizer`` value it is left as imported.
    if randomizer == 'gaussian':
        randomization = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer == 'laplace':
        randomization = randomization.laplace((p,), scale=1.)

    # Builtin ``bool`` instead of the ``np.bool`` alias, which newer NumPy
    # releases no longer provide.
    active_bool = np.zeros(p, bool)
    inactive_bool = ~active_bool

    TS = threshold_score_approx(loss,
                                threshold,
                                randomization,
                                active_bool,
                                inactive_bool,
                                randomizer)
    TS.solve_approx()
    active = TS._overall
    print("nactive", active.sum())

    ci = approximate_conditional_density(TS)
    ci.solve_approx()

    active_set = np.asarray([i for i in range(p) if active[i]])
    # The true support is taken to be the first ``s`` indices.
    true_support = np.asarray([i for i in range(p) if i < s])
    nactive = np.sum(active)
    print("active set, true_support", active_set, true_support)
    true_vec = beta[active]
    print("true coefficients", true_vec)

    support = set(true_support)
    if set(active_set).intersection(support) != support:
        # Selection missed part of the true support: nothing is reported.
        return None

    ci_active = np.zeros((nactive, 2))
    covered = np.zeros(nactive, bool)
    ci_length = np.zeros(nactive)
    pivots = np.zeros(nactive)

    class target_class(object):
        # Minimal holder exposing ``target_cov`` and ``shape``.
        def __init__(self, target_cov):
            self.target_cov = target_cov
            self.shape = target_cov.shape

    target = target_class(TS.target_cov)
    ci_naive = naive_confidence_intervals(target, TS.target_observed)
    naive_pvals = naive_pvalues(target, TS.target_observed, true_vec)
    naive_covered = np.zeros(nactive)

    toc = time.time()

    for j in range(nactive):
        ci_active[j, :] = np.array(ci.approximate_ci(j))
        if (ci_active[j, 0] <= true_vec[j]) and (ci_active[j, 1] >= true_vec[j]):
            covered[j] = 1
        ci_length[j] = ci_active[j, 1] - ci_active[j, 0]
        print(ci_active[j, :])
        pivots[j] = ci.approximate_pvalue(j, true_vec[j])

        # naive ci
        if (ci_naive[j, 0] <= true_vec[j]) and (ci_naive[j, 1] >= true_vec[j]):
            naive_covered[j] += 1

    tic = time.time()
    print('ci time now', tic - toc)

    return covered, ci_length, pivots, naive_covered, naive_pvals
def approximate_inference(X,
                          y,
                          beta,
                          sigma,
                          seed_n = 0,
                          lam_frac = 1.,
                          loss='gaussian',
                          randomization_scale = 1.):
    """Approximate selective intervals and MLEs after a greedy score step.

    Parameters
    ----------
    X, y
        Design matrix and response handed to the regreg loss.
    beta
        True coefficient vector; used for coverage/risk and, via
        ``beta.sum()``, for the size of the assumed true support.
    sigma : float
        Noise scale multiplying the Gaussian penalty level.
    seed_n : int
        Passed to ``np.random.seed`` before anything is simulated.
    lam_frac : float
        Multiplier applied to the simulated penalty level.
    loss : {'gaussian', 'logistic'}
        Which regreg loss to use.
    randomization_scale : float
        Scale of the isotropic Gaussian randomization; also forwarded to
        ``greedy_score_map``.

    Returns
    -------
    numpy.ndarray or None
        One row per selected variable with columns: selective lower,
        selective upper, naive lower, naive upper, selective MLE, observed
        target, selective coverage, naive coverage, selective squared
        error, naive squared error.  ``None`` when nothing is selected or
        the selected set does not contain the assumed true support.
    """
    from selection.api import randomization

    n, p = X.shape
    np.random.seed(seed_n)

    if loss == "gaussian":
        loss = rr.glm.gaussian(X, y)
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
    elif loss == "logistic":
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    randomization = randomization.isotropic_gaussian((p,),
                                                     scale=randomization_scale)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    GS = greedy_score_map(loss,
                          penalty,
                          np.zeros(p, dtype=bool),
                          np.ones(p, dtype=bool),
                          randomization,
                          randomization_scale)
    GS.solve_approx()
    active = GS._overall
    nactive = np.sum(active)

    if nactive == 0:
        return None

    active_set = np.asarray([i for i in range(p) if active[i]])
    # NOTE(review): the support size is taken as ``beta.sum()``, which
    # equals the number of nonzeros only if every nonzero entry is 1 --
    # confirm against the callers.
    s = beta.sum()
    true_support = np.asarray([i for i in range(p) if i < s])
    true_vec = beta[active]

    support = set(true_support)
    if set(active_set).intersection(support) != support:
        # Selection missed part of the assumed support: nothing reported.
        return None

    ci = approximate_conditional_density(GS)
    ci.solve_approx()

    sys.stderr.write("True target to be covered" + str(true_vec) + "\n")

    class target_class(object):
        # Minimal holder exposing ``target_cov`` and ``shape``.
        def __init__(self, target_cov):
            self.target_cov = target_cov
            self.shape = target_cov.shape

    target = target_class(GS.target_cov)
    ci_naive = naive_confidence_intervals(target, GS.target_observed)

    ci_sel = np.zeros((nactive, 2))
    sel_MLE = np.zeros(nactive)
    sel_length = np.zeros(nactive)

    for j in range(nactive):
        ci_sel[j, :] = np.array(ci.approximate_ci(j))
        sel_MLE[j] = ci.approx_MLE_solver(j, step=1, nstep=150)[0]
        sel_length[j] = ci_sel[j, 1] - ci_sel[j, 0]

    observed = np.asarray(GS.target_observed)[:nactive]
    sel_risk = (sel_MLE - true_vec) ** 2.
    naive_risk = (observed - true_vec) ** 2.
    # Builtin ``bool`` instead of the ``np.bool`` alias, which newer NumPy
    # releases no longer provide.
    sel_covered = np.asarray(
        (ci_sel[:, 0] <= true_vec) & (ci_sel[:, 1] >= true_vec), dtype=bool)
    naive_covered = np.asarray(
        (ci_naive[:, 0] <= true_vec) & (ci_naive[:, 1] >= true_vec),
        dtype=float)

    print("lengths", sel_length.sum() / nactive)
    print("selective intervals", ci_sel.T)
    print("risks", sel_risk.sum() / nactive)

    return np.transpose(np.vstack((ci_sel[:, 0],
                                   ci_sel[:, 1],
                                   ci_naive[:, 0],
                                   ci_naive[:, 1],
                                   sel_MLE,
                                   GS.target_observed,
                                   sel_covered,
                                   naive_covered,
                                   sel_risk,
                                   naive_risk)))
def test_marginalize(s=4,
                     n=600,
                     p=200,
                     rho=0.,
                     signal=3.5,
                     lam_frac=2.5,
                     ndraw=10000,
                     burnin=2000,
                     loss='gaussian',
                     randomizer='gaussian',
                     randomizer_scale=1.,
                     nviews=3,
                     scalings=True,
                     subgrad=True,
                     parametric=False,
                     intervals='old'):
    """Selective inference with several views and a split subgradient.

    ``nviews`` randomized group-lasso views are solved on the same loss;
    the union of their selected variables is the target set.  Each view
    can condition on its scalings and then decompose its subgradient,
    conditioning on the first ``p // 2`` groups and marginalizing over the
    rest.

    Parameters
    ----------
    s, n, p, rho, signal
        Forwarded to ``gaussian_instance`` / ``logistic_instance``.
    lam_frac : float
        Multiplier applied to the penalty weights (and, for the logistic
        loss, also to the simulated penalty level).
    ndraw, burnin : int
        Forwarded to ``target_sampler.sample``.
    loss : {'gaussian', 'logistic'}
        Which instance generator and regreg loss to use.
    randomizer : {'laplace', 'gaussian', 'logistic'}
        Randomization family; any other value is passed on unchanged.
    randomizer_scale : float
        Scale of the randomization.
    nviews : int
        Number of views.
    scalings, subgrad : bool
        Toggle the two conditioning steps described above.
    parametric : bool
        Chooses ``glm_group_lasso_parametric`` views and is forwarded to
        ``glm_target``.
    intervals : {'old', 'new'}
        ``'old'`` uses the plain sampler methods, ``'new'`` the
        ``*_translate`` methods.

    Returns
    -------
    tuple or None
        ``(pivots, covered, ci_length, naive_pvals, covered_naive,
        ci_length_naive)``.  ``None`` when the union of selected variables
        misses part of the true support, or when exactly ``s`` variables
        are selected.

    Raises
    ------
    ValueError
        If ``intervals`` is neither ``'old'`` nor ``'new'``.
    """
    print(n, p, s)

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), randomizer_scale)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p,), scale=randomizer_scale)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n,
                                                       p=p,
                                                       s=s,
                                                       rho=rho,
                                                       signal=signal,
                                                       sigma=1)
        lam = np.mean(
            np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000))))) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n,
                                          p=p,
                                          s=s,
                                          rho=rho,
                                          signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2,
                                                   (n, 10000)))).max(0))

    epsilon = 1. / np.sqrt(n)

    W = lam_frac * np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    view_factory = glm_group_lasso_parametric if parametric else glm_group_lasso
    views = [view_factory(loss, epsilon, penalty, randomizer)
             for _ in range(nviews)]

    queries = multiple_queries(views)
    queries.solve()

    # Builtin ``bool`` instead of the ``np.bool`` alias, which newer NumPy
    # releases no longer provide.
    active_union = np.zeros(p, bool)
    for view in views:
        active_union += view.selection_variable['variables']

    nactive = np.sum(active_union)
    print("nactive", nactive)

    nonzero = np.where(beta)[0]
    true_vec = beta[active_union]
    active_set = np.nonzero(active_union)[0]

    # Explicit guard: the screening flag used to be bound only when
    # screening succeeded, so a failed screen could not be handled later.
    if not set(nonzero).issubset(active_set):
        return None

    if nactive == s:
        return None

    # BUG: if this scalings code is moved after the decompose_subgradient,
    # code seems to run fine
    if scalings:  # try condition on some scalings
        for view in views:
            view.condition_on_scalings()

    if subgrad:
        # Floor division: ``p / 2`` is a float under Python 3 and cannot
        # be used as a slice bound.
        half = p // 2
        for view in views:
            conditioning_groups = np.zeros(p, dtype=bool)
            conditioning_groups[:half] = True
            marginalizing_groups = np.zeros(p, dtype=bool)
            marginalizing_groups[half:] = True
            view.decompose_subgradient(
                conditioning_groups=conditioning_groups,
                marginalizing_groups=marginalizing_groups)

    target_sampler, target_observed = glm_target(loss,
                                                 active_union,
                                                 queries,
                                                 bootstrap=False,
                                                 parametric=parametric)

    if intervals == 'old':
        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
        LU = target_sampler.confidence_intervals(target_observed,
                                                 sample=target_sample,
                                                 level=0.9)
        pivots = target_sampler.coefficient_pvalues(target_observed,
                                                    parameter=true_vec,
                                                    sample=target_sample)
    elif intervals == 'new':
        full_sample = target_sampler.sample(ndraw=ndraw,
                                            burnin=burnin,
                                            keep_opt=True)
        LU = target_sampler.confidence_intervals_translate(
            target_observed, sample=full_sample, level=0.9)
        pivots = target_sampler.coefficient_pvalues_translate(
            target_observed, parameter=true_vec, sample=full_sample)
    else:
        raise ValueError("intervals must be 'old' or 'new'")

    def coverage(LU):
        # Screening has succeeded by this point, so coverage is always
        # measured against the true coefficients.
        L, U = LU[:, 0], LU[:, 1]
        covered = np.asarray((L <= true_vec) & (U >= true_vec), dtype=float)
        ci_length = np.asarray(U - L, dtype=float)
        return covered, ci_length

    covered, ci_length = coverage(LU)
    LU_naive = naive_confidence_intervals(target_sampler, target_observed)
    covered_naive, ci_length_naive = coverage(LU_naive)
    naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

    return (pivots, covered, ci_length, naive_pvals, covered_naive,
            ci_length_naive)
def hiv_inference_test():
    """Approximate selective inference on the NRTI (3TC) HIV resistance data.

    Reads ``NRTI_DATA.txt`` from the working directory if present,
    otherwise downloads it.  Builds one indicator column per single-letter
    mutation seen more than 10 times at positions P1..P240, regresses the
    centred log 3TC response on the standardized indicators with a
    randomized lasso, and prints and returns naive and selective
    intervals.

    Returns
    -------
    numpy.ndarray
        Six rows: naive point estimate, naive lower, naive upper,
        selective MLE, selective lower, selective upper; one column per
        selected variable.
    """
    data_url = ("http://hivdb.stanford.edu/pages/published_analysis/"
                "genophenoPNAS2006/DATA/NRTI_DATA.txt")
    if not os.path.exists("NRTI_DATA.txt"):
        NRTI = pandas.read_table(data_url, na_values="NA")
    else:
        # Same NA handling as the download path, for consistency.
        NRTI = pandas.read_table("NRTI_DATA.txt", na_values="NA")

    NRTI_specific = []
    NRTI_muts = []
    for i in range(1, 241):
        d = NRTI['P%d' % i]
        for mut in np.unique(d):
            if mut not in ['-', '.'] and len(mut) == 1:
                test = np.equal(d, mut)
                if test.sum() > 10:
                    NRTI_specific.append(np.array(test))
                    NRTI_muts.append("P%d%s" % (i, mut))

    NRTI_specific = NRTI.from_records(np.array(NRTI_specific).T,
                                      columns=NRTI_muts)

    # Builtin ``float`` / ``bool`` instead of the ``np.float`` / ``np.bool``
    # aliases, which newer NumPy releases no longer provide.
    X_NRTI = np.array(NRTI_specific, float)
    Y = NRTI['3TC']  # shorthand
    keep = ~np.isnan(Y).astype(bool)
    X_NRTI = X_NRTI[np.nonzero(keep)]
    Y = Y[keep]
    Y = np.array(np.log(Y), float)
    Y -= Y.mean()
    X_NRTI -= X_NRTI.mean(0)[None, :]
    X_NRTI /= X_NRTI.std(0)[None, :]
    X = X_NRTI  # shorthand
    n, p = X.shape
    X /= np.sqrt(n)

    # Noise level estimated from the OLS residuals.
    ols_fit = sm.OLS(Y, X).fit()
    sigma_3TC = np.linalg.norm(ols_fit.resid) / np.sqrt(n - p - 1)

    lam_frac = 1.
    loss = rr.glm.gaussian(X, Y)
    epsilon = 1. / np.sqrt(n)
    lam = lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma_3TC
    print(lam)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    from selection.api import randomization
    randomization = randomization.isotropic_gaussian((p,), scale=1.)

    # change grid for parameter for HIV data
    M_est = M_estimator_map(loss, epsilon, penalty, randomization,
                            randomization_scale=0.7)
    M_est.solve_approx()
    active = M_est._overall
    nactive = np.sum(active)

    ci_active = np.zeros((nactive, 2))
    ci_length = np.zeros(nactive)
    mle_active = np.zeros((nactive, 1))

    ci = approximate_conditional_density(M_est)
    ci.solve_approx()

    class target_class(object):
        # Minimal holder exposing ``target_cov`` and ``shape``.
        def __init__(self, target_cov):
            self.target_cov = target_cov
            self.shape = target_cov.shape

    target = target_class(M_est.target_cov)
    ci_naive = naive_confidence_intervals(target, M_est.target_observed)

    for j in range(nactive):
        ci_active[j, :] = np.array(ci.approximate_ci(j))
        ci_length[j] = ci_active[j, 1] - ci_active[j, 0]
        mle_active[j, :] = ci.approx_MLE_solver(j, nstep=100)[0]

    unadjusted_mle = np.zeros((nactive, 1))
    for j in range(nactive):
        unadjusted_mle[j, :] = ci.target_observed[j]

    adjusted_intervals = np.hstack([mle_active, ci_active]).T
    unadjusted_intervals = np.hstack([unadjusted_mle, ci_naive]).T

    print("adjusted confidence", adjusted_intervals)
    print("naive confidence", unadjusted_intervals)

    intervals = np.vstack([unadjusted_intervals, adjusted_intervals])
    return intervals
def test_split_compare(ndraw=20000,
                       burnin=10000,
                       solve_args={'min_its': 50, 'tol': 1.e-10},
                       check_screen=True):
    """Compare selective, bootstrap, data-splitting and naive intervals.

    Simulates a logistic instance (``s, n, p = 6, 300, 40``), runs a single
    split group-lasso query using ``int(0.8 * n)`` as the split size, and
    then builds confidence intervals for the selected coefficients four
    ways: the CLT plug-in sampler, the bootstrap sampler, ``standard_ci`` on
    the data flagged as left out, and ``naive_confidence_intervals``.

    Parameters
    ----------
    ndraw, burnin : int
        Passed to each target sampler's ``sample`` call.
    solve_args : dict
        Not used by this function (kept for interface compatibility; it is
        never read or mutated here).
    check_screen : bool
        If true, return ``None`` unless every truly nonzero coefficient was
        selected.  If false, coverage indicators are reported as NaN.

    Returns
    -------
    None
        When nothing is selected, or when ``check_screen`` is true and the
        selection misses a nonzero coefficient.
    tuple
        ``(pivots, pivots_boot, covered, ci_length, covered_boot,
        ci_length_boot, covered_split, ci_length_split, active_var,
        covered_naive, ci_length_naive)`` otherwise.
    """
    s, n, p = 6, 300, 40

    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0.1, snr=5)
    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    m = int(0.8 * n)

    # Single randomized (split) query; more queries could be appended to the
    # list handed to multiple_queries.
    M_est1 = split_glm_group_lasso(loss, epsilon, m, penalty)
    mv = multiple_queries([M_est1])
    mv.solve()

    active_union = M_est1.selection_variable['variables']
    nactive = np.sum(active_union)
    print("nactive", nactive)
    if nactive == 0:
        return None

    leftout_indices = (
        M_est1.randomized_loss.saturated_loss.case_weights == 0)

    active_set = np.nonzero(active_union)[0]
    truly_nonzero = set(nonzero)
    screen = truly_nonzero.issubset(active_set)
    if check_screen and not screen:
        return None

    true_vec = beta[active_union]

    # Bootstrap-based sampler.
    target_sampler_boot, target_observed = glm_target(loss,
                                                      active_union,
                                                      mv,
                                                      bootstrap=True)
    target_sample_boot = target_sampler_boot.sample(ndraw=ndraw,
                                                    burnin=burnin)
    LU_boot = target_sampler_boot.confidence_intervals(
        target_observed, sample=target_sample_boot)
    pivots_boot = target_sampler_boot.coefficient_pvalues(
        target_observed, parameter=true_vec, sample=target_sample_boot)

    # CLT plug-in sampler.  The sample is drawn once; an earlier version drew
    # it twice back to back and discarded the first draw.
    target_sampler, _ = glm_target(loss, active_union, mv)
    target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
    LU = target_sampler.confidence_intervals(target_observed,
                                             sample=target_sample)
    LU_naive = naive_confidence_intervals(target_sampler, target_observed)

    # NOTE(review): this compares the number of cases *not* flagged as left
    # out against nactive; confirm whether the left-out count was intended.
    if X.shape[0] - leftout_indices.sum() > nactive:
        LU_split = standard_ci(X, y, active_union, leftout_indices)
        # Computed alongside LU_split but not part of the returned tuple.
        LU_split_sm = standard_ci_sm(X, y, active_union, leftout_indices)
    else:
        LU_split = LU_split_sm = np.ones((nactive, 2)) * np.nan

    pivots = target_sampler.coefficient_pvalues(target_observed,
                                                parameter=true_vec,
                                                sample=target_sample)

    def coverage(LU):
        # Returns (coverage indicator, interval length) per selected
        # coefficient; coverage is NaN when screening is not enforced.
        L, U = LU[:, 0], LU[:, 1]
        ci_length = np.asarray(U - L, dtype=float)
        if check_screen:
            covered = ((L <= true_vec) & (U >= true_vec)).astype(float)
        else:
            covered = np.full(nactive, np.nan)
        return covered, ci_length

    covered, ci_length = coverage(LU)
    covered_boot, ci_length_boot = coverage(LU_boot)
    covered_split, ci_length_split = coverage(LU_split)
    covered_naive, ci_length_naive = coverage(LU_naive)

    # Flags which selected variables are truly nonzero.
    active_var = np.array([idx in truly_nonzero for idx in active_set],
                          dtype=bool)

    return pivots, pivots_boot, covered, ci_length, covered_boot, ci_length_boot, \
        covered_split, ci_length_split, active_var, covered_naive, ci_length_naive
def test_approximate_inference(X,
                               y,
                               true_mean,
                               sigma,
                               threshold=3.,
                               seed_n=0,
                               lam_frac=1.,
                               loss='gaussian',
                               randomization_scale=1.):
    """Compare approximate selective intervals with naive ones after thresholding.

    Seeds NumPy's global RNG with ``seed_n``, selects variables with a
    randomized ``threshold_score_map`` query, and for each selected variable
    computes an approximate selective interval and MLE as well as a naive
    interval.  Coverage and squared-error risk are measured against the
    least-squares projection of ``true_mean`` onto the selected columns.

    Parameters
    ----------
    X : ndarray, 2-D
        Design matrix (its shape provides ``n, p``).
    y : response passed to the loss constructor.
    true_mean : vector projected onto the selected columns to form the target.
    sigma : float
        Noise scale; only enters the (unused) Gaussian ``lam`` computation.
    threshold : passed to ``threshold_score_map``.
    seed_n : int
        Seed for ``np.random.seed``.
    lam_frac : float
        Multiplier in the (unused) ``lam`` computation.
    loss : str or object
        ``"gaussian"`` or ``"logistic"`` build the matching ``rr.glm`` loss;
        any other value is handed to ``threshold_score_map`` unchanged.
    randomization_scale : float
        Scale of the isotropic Gaussian randomization.

    Returns
    -------
    None
        If no variable is selected.
    ndarray
        One row per selected variable with columns: selective interval
        (2 columns), naive interval (2 columns), selective MLE, observed
        target, selective coverage, naive coverage, selective risk, naive risk.
    """
    from selection.api import randomization

    n, p = X.shape
    np.random.seed(seed_n)

    # `lam` is not used further down, but the draws are kept so the seeded
    # global random stream seen by later code is unchanged.
    if loss == "gaussian":
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)
        ) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
        loss = rr.glm.logistic(X, y)

    # Builtin bool replaces the np.bool alias, which no longer exists in
    # recent NumPy releases.
    active_bool = np.zeros(p, bool)
    inactive_bool = ~active_bool

    randomizer = randomization.isotropic_gaussian((p,),
                                                  scale=randomization_scale)
    TS = threshold_score_map(loss, threshold, randomizer, active_bool,
                             inactive_bool, randomization_scale)
    TS.solve_approx()

    active = TS._overall
    active_set = np.asarray([i for i in range(p) if active[i]])
    nactive = np.sum(active)

    sys.stderr.write("number of active selected by thresholding" +
                     str(nactive) + "\n")
    sys.stderr.write("Active set selected by thresholding" + str(active_set) +
                     "\n")
    sys.stderr.write("Observed target" + str(TS.target_observed) + "\n")

    if nactive == 0:
        return None

    # Target: least-squares coefficients of true_mean on the selected columns.
    X_active = X[:, active]
    true_vec = np.linalg.inv(X_active.T.dot(X_active)).dot(
        X_active.T).dot(true_mean)
    sys.stderr.write("True target to be covered" + str(true_vec) + "\n")

    class target_class(object):
        # Minimal stand-in exposing the two attributes set below, which is
        # what gets handed to naive_confidence_intervals.
        def __init__(self, target_cov):
            self.target_cov = target_cov
            self.shape = target_cov.shape

    target = target_class(TS.target_cov)
    ci_naive = naive_confidence_intervals(target, TS.target_observed)
    naive_covered = np.zeros(nactive)
    naive_risk = np.zeros(nactive)

    ci = approximate_conditional_density(TS)
    ci.solve_approx()

    ci_sel = np.zeros((nactive, 2))
    sel_MLE = np.zeros(nactive)
    for j in range(nactive):
        ci_sel[j, :] = np.array(ci.approximate_ci(j))
        sel_MLE[j] = ci.approx_MLE_solver(j, step=1, nstep=150)[0]
    sel_length = ci_sel[:, 1] - ci_sel[:, 0]

    sel_covered = np.zeros(nactive, bool)
    sel_risk = np.zeros(nactive)
    for j in range(nactive):
        sel_risk[j] = (sel_MLE[j] - true_vec[j])**2.
        naive_risk[j] = (TS.target_observed[j] - true_vec[j])**2.
        if (ci_sel[j, 0] <= true_vec[j]) and (ci_sel[j, 1] >= true_vec[j]):
            sel_covered[j] = 1
        if (ci_naive[j, 0] <= true_vec[j]) and (ci_naive[j, 1] >= true_vec[j]):
            naive_covered[j] = 1

    print("lengths", sel_length.sum() / nactive)
    print("selective intervals", ci_sel.T)
    print("risks", sel_risk.sum() / nactive)

    return np.transpose(
        np.vstack((ci_sel[:, 0], ci_sel[:, 1], ci_naive[:, 0], ci_naive[:, 1],
                   sel_MLE, TS.target_observed, sel_covered, naive_covered,
                   sel_risk, naive_risk)))