def test_MSE(signal=1, n=100, p=10, s=1):
    ninstance = 1
    total_mse = 0
    nvalid_instance = 0
    data_instance = gaussian_instance(n, p, s, signal)  # NB: unused below
    tau = 1.
    for i in range(ninstance):
        X, y, true_beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, signal=signal)
        random_Z = np.random.standard_normal(p)
        # `selection` is not defined here -- it lived in a file that was deleted;
        # a hypothetical stand-in is sketched below this function.
        lam, epsilon, active, betaE, cube, initial_soln = selection(X, y, random_Z)
        print("active set", np.where(active)[0])
        if lam < 0:
            print("no active covariates")
        else:
            est = estimation(X, y, active, betaE, cube, epsilon, lam, sigma, tau)
            est.compute_mle_all()

            mse_mle = est.mse_mle(true_beta[active])
            print("MLE", est.mle)
            total_mse += mse_mle
            nvalid_instance += np.sum(active)

    return np.true_divide(total_mse, nvalid_instance)
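# The `selection` helper above came from the deleted file, so test_MSE cannot
# run as written. The stand-in below is a guess at a randomized-lasso selection
# step, reconstructed only from how test_MSE consumes the return values
# (lam, epsilon, active, betaE, cube, initial_soln); it is an assumption, not
# the recovered original, and `estimation` is still missing.
import numpy as np
import regreg.api as rr

def selection(X, y, random_Z, lam_frac=1., epsilon=1.):
    n, p = X.shape
    # Lagrange parameter calibrated to the noise level, as elsewhere in this file
    lam = lam_frac * np.mean(np.fabs(X.T.dot(np.random.standard_normal((n, 2000)))).max(0))
    loss = rr.glm.gaussian(X, y)
    penalty = rr.l1norm(p, lagrange=lam)
    # randomized objective: ridge term epsilon/2 ||b||^2 minus <random_Z, b>
    loss.quadratic = rr.identity_quadratic(epsilon, 0, -random_Z, 0)
    problem = rr.simple_problem(loss, penalty)
    initial_soln = problem.solve()
    active = initial_soln != 0
    if not active.any():
        return -1., epsilon, active, None, None, initial_soln  # lam < 0 flags "no active"
    betaE = initial_soln[active]
    # inactive coordinates of the stationarity-condition subgradient, rescaled
    # to the cube [-1, 1]^|inactive|
    subgrad = X.T.dot(y - X.dot(initial_soln)) - epsilon * initial_soln + random_Z
    cube = subgrad[~active] / lam
    return lam, epsilon, active, betaE, cube, initial_soln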
def simulate(n=100, p=40, rho=0.3, signal=(5, 5),
             do_knockoff=False, do_AIC=True, do_BIC=True,
             full_results={},  # mutable default: intentionally shared across calls
             alpha=0.05, s=7, correlation='equicorrelated',
             maxstep=np.inf, compute_maxT_identify=True,
             ndraw=8000, burnin=2000):

    if correlation == 'equicorrelated':
        X, y, _, active, sigma = gaussian_instance(n=n, p=p, rho=rho,
                                                   signal=signal, s=s,
                                                   random_signs=False,
                                                   equicorrelated=True)
    elif correlation == 'AR':
        X, y, _, active, sigma = gaussian_instance(n=n, p=p, rho=rho,
                                                   signal=signal, s=s,
                                                   random_signs=True,
                                                   equicorrelated=False)
    else:
        raise ValueError('correlation must be one of ["equicorrelated", "AR"]')

    full_results.setdefault('n', []).append(n)
    full_results.setdefault('p', []).append(p)
    full_results.setdefault('rho', []).append(rho)
    full_results.setdefault('s', []).append(len(active))
    full_results.setdefault('signalU', []).append(max(signal))
    full_results.setdefault('signalL', []).append(min(signal))
    full_results.setdefault('correlation', []).append(correlation)

    return compute_results(y, X, sigma, active,
                           do_knockoff=do_knockoff,
                           full_results=full_results,
                           maxstep=maxstep,
                           compute_maxT_identify=compute_maxT_identify,
                           alpha=alpha,
                           ndraw=ndraw,
                           burnin=burnin,
                           do_AIC=do_AIC,   # was hard-coded True, ignoring the arguments
                           do_BIC=do_BIC)
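# Usage sketch for the accumulation pattern above: a single dict is threaded
# through repeated simulate() calls and tabulated afterwards. The pandas
# conversion is illustrative (not part of the original driver) and assumes
# compute_results keeps all value lists the same length, as the variant near
# the end of this section enforces explicitly.
def accumulate_sketch(nsim=10):
    import pandas as pd
    full_results = {}
    for _ in range(nsim):
        simulate(n=100, p=40, full_results=full_results)
    # one row per simulated dataset, provided the dict stays rectangular
    return pd.DataFrame(full_results)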
def generate_X(self):

    (n, p, s, rho) = (self.n, self.p, self.s, self.rho)

    X_equi = gaussian_instance(n=n, p=p, equicorrelated=True, rho=self.equicor_rho)[0]
    X_AR = gaussian_instance(n=n, p=p, equicorrelated=False, rho=rho)[0]
    X = np.sqrt(self.AR_weight) * X_AR + np.sqrt(1 - self.AR_weight) * X_equi
    X /= np.sqrt((X**2).mean(0))[None, :]
    return X
def test_full_pvals(n=100, p=40, rho=0.3, signal=4, ndraw=8000, burnin=2000):

    X, y, beta, active, sigma, _ = gaussian_instance(n=n, p=p, signal=signal, rho=rho)
    FS = forward_step(X, y, covariance=sigma**2 * np.identity(n))

    from scipy.stats import norm as ndist
    pval = []
    completed_yet = False
    completion_index = None  # was unset, raising NameError if the path never captures `active`
    for i in range(min(n, p)):
        FS.step()
        var_select, pval_select = FS.model_pivots(i + 1,
                                                  alternative='twosided',
                                                  which_var=[FS.variables[-1]],
                                                  saturated=False,
                                                  burnin=burnin,
                                                  ndraw=ndraw)[0]
        pval_saturated = FS.model_pivots(i + 1,
                                         alternative='twosided',
                                         which_var=[FS.variables[-1]],
                                         saturated=True)[0][1]

        # now, nominal ones
        LSfunc = np.linalg.pinv(FS.X[:, FS.variables])
        Z = np.dot(LSfunc[-1], FS.Y) / (np.linalg.norm(LSfunc[-1]) * sigma)
        pval_nominal = 2 * ndist.sf(np.fabs(Z))
        pval.append((var_select, pval_select, pval_saturated, pval_nominal))

        if set(active).issubset(np.array(pval)[:, 0]) and not completed_yet:
            completed_yet = True
            completion_index = i + 1

    return X, y, beta, active, sigma, np.array(pval), completion_index
def test_mcmc_tests(n=100, p=40, s=4, rho=0.3, signal=5, ndraw=None, burnin=2000,
                    nstep=200, method='serial'):
    # NB: the `method` argument is unused; the two calls below hard-code
    # "serial" and "parallel" to exercise both code paths.

    X, y, beta, active, sigma, _ = gaussian_instance(n=n, p=p, signal=signal, rho=rho, s=s)
    FS = forward_step(X, y, covariance=sigma**2 * np.identity(n))

    extra_steps = 4
    null_rank, alt_rank = None, None

    for i in range(min(n, p)):
        FS.step()

        if extra_steps <= 0:
            null_rank = forward_mod.mcmc_test(FS, i + 1,
                                              variable=FS.variables[i - 2],
                                              nstep=nstep,
                                              burnin=burnin,
                                              method="serial")
            alt_rank = forward_mod.mcmc_test(FS, i + 1,
                                             variable=FS.variables[0],
                                             burnin=burnin,
                                             nstep=nstep,
                                             method="parallel")
            break

        if set(active).issubset(FS.variables):
            extra_steps -= 1

    return null_rank, alt_rank
def sim2():
    X, Y, _, active, sigma = gaussian_instance(n=150, s=3)
    G = data_splitting.gaussian(X, Y, 5., split_frac=0.5, sigma=sigma)
    G.fit(use_full=True)
    if set(active).issubset(G.active) and G.active.shape[0] > len(active):
        return [G.hypothesis_test(G.active[len(active)])]
    return []
def generate_X(self):

    n, p, s, rho = self.n, self.p, self.s, self.rho
    X = gaussian_instance(n=n, p=p, equicorrelated=False, rho=rho)[0]
    X *= np.sqrt(n)
    return X
def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000):

    # description of statistical problem
    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)

    def meta_algorithm(XTX, XTXi, lam, sampler):
        p = XTX.shape[0]
        success = np.zeros(p)
        loss = rr.quadratic_loss((p,), Q=XTX)
        pen = rr.l1norm(p, lagrange=lam)
        scale = 0.5
        noisy_S = sampler(scale=scale)
        loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0)
        problem = rr.simple_problem(loss, pen)
        soln = problem.solve(max_its=100, tol=1.e-10)
        success += soln != 0
        return set(np.nonzero(success)[0])

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    lam = 4. * np.sqrt(n)
    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam)

    # run selection algorithm
    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                smooth_sampler,
                                success_params=(1, 1),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={'epochs': 20,
                                          'sizes': [100] * 5,
                                          'dropout': 0.,
                                          'activation': 'relu'})
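# A minimal driver sketch for simulate() above. It assumes -- an assumption,
# not something this file states -- that full_model_inference returns a pandas
# DataFrame of per-variable results with a 'pivot' column, or None when
# nothing is selected.
def driver_sketch(nsim=20):
    import pandas as pd
    collected = []
    for _ in range(nsim):
        df = simulate(B=500)  # smaller B than the default keeps the sketch cheap
        if df is not None:
            collected.append(df)
    if collected:
        all_results = pd.concat(collected)
        # with correct calibration, selective pivots should look roughly U(0, 1)
        print(all_results['pivot'].describe())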
def test_independence_null_mcmc(n=100, p=40, s=4, rho=0.5, signal=5, ndraw=None,
                                burnin=2000, nstep=200, method='serial'):

    X, y, beta, active, sigma, _ = gaussian_instance(n=n, p=p, signal=signal, rho=rho, s=s)
    FS = forward_step(X, y, covariance=sigma**2 * np.identity(n))

    extra_steps = 4
    completed = False
    null_ranks = []
    for i in range(min(n, p)):
        FS.step()

        if completed and extra_steps > 0:
            null_rank = forward_mod.mcmc_test(FS, i + 1,
                                              variable=FS.variables[-1],
                                              nstep=nstep,
                                              burnin=burnin,
                                              method="serial")
            null_ranks.append(int(null_rank))

        if extra_steps <= 0:
            break

        if set(active).issubset(FS.variables):
            extra_steps -= 1
            completed = True

    return tuple(null_ranks)
def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000):

    # description of statistical problem
    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)

    def meta_algorithm(XTX, XTXi, dispersion, lam, sampler):
        # NB: `lam`, `success`, `loss` and `pen` are leftovers from the lasso
        # variant of this meta-algorithm; selection here is BH on marginal Z-scores.
        p = XTX.shape[0]
        success = np.zeros(p)
        loss = rr.quadratic_loss((p,), Q=XTX)
        pen = rr.l1norm(p, lagrange=lam)
        scale = 0.
        noisy_S = sampler(scale=scale)
        soln = XTXi.dot(noisy_S)
        solnZ = soln / (np.sqrt(np.diag(XTXi)) * np.sqrt(dispersion))
        pval = ndist.cdf(solnZ)
        pval = 2 * np.minimum(pval, 1 - pval)
        return set(BHfilter(pval, q=0.2))

    lam = 4. * np.sqrt(n)
    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, dispersion, lam)

    # run selection algorithm
    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                smooth_sampler,
                                success_params=(1, 1),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={'epochs': 5,
                                          'sizes': [200] * 10,
                                          'dropout': 0.,
                                          'activation': 'relu'})
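# BHfilter above is used as "indices of p-values surviving Benjamini-Hochberg
# at level q". A self-contained sketch of that rule follows; the package's own
# BHfilter may differ in details such as tie handling.
import numpy as np

def BH_sketch(pval, q=0.2):
    pval = np.asarray(pval)
    m = pval.shape[0]
    order = np.argsort(pval)
    # largest i (1-based) with p_(i) <= q * i / m, then reject that many
    passed = np.nonzero(pval[order] <= q * np.arange(1, m + 1) / m)[0]
    if passed.shape[0] == 0:
        return np.array([], dtype=int)
    return order[:passed.max() + 1]  # indices of the rejected hypotheses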
def test_approximate_mle(n=100, p=10, s=3, snr=5, rho=0.1, lam_frac=1.,
                         loss='gaussian', randomizer='gaussian'):

    from selection.api import randomization

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       snr=snr, sigma=1.)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    epsilon = 1. / np.sqrt(n)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # keep the randomization instance distinct from the imported module
    # (the original rebound the module name)
    if randomizer == 'gaussian':
        randomizer_inst = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer == 'laplace':
        randomizer_inst = randomization.laplace((p,), scale=1.)

    M_est = M_estimator_approx(loss, epsilon, penalty, randomizer_inst, randomizer)
    M_est.solve_approx()

    inf = approximate_conditional_density(M_est)
    inf.solve_approx()

    active = M_est._overall
    active_set = np.asarray([i for i in range(p) if active[i]])
    true_support = np.asarray([i for i in range(p) if i < s])
    nactive = np.sum(active)
    print("active set, true_support", active_set, true_support)

    true_vec = beta[active]
    print("true coefficients", true_vec)

    if set(true_support).issubset(set(active_set)):  # screening: all true signals selected
        mle_active = np.zeros(nactive)
        for j in range(nactive):
            mle_active[j] = inf.approx_MLE_solver(j, nstep=100)[0]
        print("mle for target", mle_active)
def generate_X(self):

    (n, p, s, rho) = (self.n, self.p, self.s, self.rho)

    X = gaussian_instance(n=n, p=p, equicorrelated=True, rho=rho, s=0)[0]
    X /= np.sqrt((X**2).sum(0))[None, :]
    X *= np.sqrt(n)
    return X
def test_selection():
    n = 500
    p = 100
    s = 0
    signal = 0.

    np.random.seed(3)  # ensures different y
    X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, sigma=1.,
                                                   rho=0, signal=signal)
    lam = 1. * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma

    n, p = X.shape

    loss = rr.glm.gaussian(X, y)
    epsilon = 1. / np.sqrt(n)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)
    randomizer = randomization.isotropic_gaussian((p,), scale=1.)

    M_est = M_estimator_approx(loss, epsilon, penalty, randomizer, 'gaussian', 'parametric')
    M_est.solve_approx()

    active = M_est._overall
    active_set = np.asarray([i for i in range(p) if active[i]])
    nactive = np.sum(active)

    prior_variance = 1000.
    noise_variance = sigma**2

    generative_mean = np.zeros(p)
    generative_mean[:nactive] = M_est.initial_soln[active]
    sel_split = selection_probability_random_lasso(M_est, generative_mean)
    sel_min = sel_split.minimize2(nstep=200)  # renamed from `min`, which shadowed the builtin
    print(sel_min[0], sel_min[1])

    test_point = np.append(M_est.observed_score_state,
                           np.abs(M_est.initial_soln[M_est._overall]))
    print("value of likelihood",
          sel_split.likelihood_loss.smooth_objective(test_point, mode="func"))

    inv_cov = np.linalg.inv(M_est.score_cov)
    lik = (M_est.observed_score_state - generative_mean).T.dot(inv_cov) \
              .dot(M_est.observed_score_state - generative_mean) / 2.
    print("value of likelihood check", lik)

    grad = inv_cov.dot(M_est.observed_score_state - generative_mean)
    print("grad at likelihood loss", grad)
def simulate(n=200, p=50, s=5, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000):

    # description of statistical problem
    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(XTX, XTXi, dispersion, lam, sampler):
        p = XTX.shape[0]
        success = np.zeros(p)
        loss = rr.quadratic_loss((p,), Q=XTX)
        pen = rr.l1norm(p, lagrange=lam)
        scale = 0.5
        noisy_S = sampler(scale=scale)
        soln = XTXi.dot(noisy_S)
        solnZ = soln / (np.sqrt(np.diag(XTXi)) * np.sqrt(dispersion))
        return set(np.nonzero(np.fabs(solnZ) > 2.1)[0])

    lam = 4. * np.sqrt(n)
    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, dispersion, lam)

    # run selection algorithm
    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                splitting_sampler,
                                success_params=(5, 7),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={'epochs': 30,
                                          'sizes': [100, 100],
                                          'activation': 'relu'})
def sim():
    X, Y, _, active, sigma = gaussian_instance()
    print(sigma)
    G = data_carving.gaussian(X, Y, 1., split_frac=0.9, sigma=sigma)
    G.fit()
    if set(active).issubset(G.active) and G.active.shape[0] > len(active):
        return [G.hypothesis_test(G.active[len(active)],
                                  burnin=5000,
                                  ndraw=10000)]
    return []
def simulate(n=1000, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, seed=0, B=5000):

    # description of statistical problem
    np.random.seed(seed)
    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False,
                                    center=False)[:3]

    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)

    def meta_algorithm(X, XTXi, resid, sampler):

        n, p = X.shape
        rho = 0.8

        S = sampler(scale=0.)  # deterministic with scale=0
        ynew = X.dot(XTXi).dot(S) + resid  # will be ok for n > p and non-degenerate X
        Xnew = rho * X + np.sqrt(1 - rho**2) * np.random.standard_normal(X.shape)

        X_full = np.hstack([X, Xnew])
        beta_full = np.linalg.pinv(X_full).dot(ynew)
        winners = np.fabs(beta_full)[:p] > np.fabs(beta_full)[p:]
        return set(np.nonzero(winners)[0])

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid)

    # run selection algorithm
    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                smooth_sampler,
                                success_params=(8, 10),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={'epochs': 20,
                                          'sizes': [100] * 5,
                                          'dropout': 0.,
                                          'activation': 'relu'})
def generate_X(self):

    n, p, s, rho = self.n, self.p, self.s, self.rho
    X = gaussian_instance(n=n, p=p, equicorrelated=False, rho=rho)[0]

    # NB: this beta is constructed but never used or returned here
    beta = np.zeros(p)
    beta[:s] = self.signal
    np.random.shuffle(beta)
    beta = randomize_signs(beta)

    X *= np.sqrt(n)
    return X
def simulate(n=200, p=100, s=10, signal=(1.5, 2), sigma=2, alpha=0.1, B=3000):

    # description of statistical problem
    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(X, XTXi, resid, sampler):
        S = sampler(scale=0.)  # deterministic with scale=0
        ynew = X.dot(XTXi).dot(S) + resid  # will be ok for n > p and non-degenerate X
        G = lasso_glmnet(X, ynew, *[None] * 4)
        select = G.select()
        return set(list(select[0]))

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid)

    # run selection algorithm
    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                splitting_sampler,
                                success_params=(1, 1),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={'epochs': 10,
                                          'sizes': [100] * 5,
                                          'dropout': 0.,
                                          'activation': 'relu'})
def test_greedy_step(n=50, p=100, s=5, signal=5):

    X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=0.,
                                                   signal=signal, sigma=1.)
    greedy_step = approximate_inference(X, y, beta, sigma,
                                        seed_n=0,
                                        lam_frac=1.,
                                        loss='gaussian')

    if greedy_step is not None:
        print("output of selection adjusted inference", greedy_step)
        return greedy_step
def generate(self):

    n, p, s = self.n, self.p, self.s
    X = gaussian_instance(n=n, p=p, equicorrelated=True, rho=0., s=s)[0]
    X /= np.sqrt((X**2).sum(0))[None, :]

    beta = np.zeros(p)
    beta[:s] = self.signal
    beta = randomize_signs(beta)
    np.random.shuffle(beta)

    X *= np.sqrt(n)
    Y = X.dot(beta) + np.random.standard_normal(n)
    return X, Y, beta
def test_randomized_lasso(n=300, p=500, s=5, signal=7.5, rho=0.2):

    X, Y, beta, active, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                  signal=signal,
                                                  equicorrelated=False)
    L = randomized_lasso.gaussian(X, Y, 3.5 * sigma * np.ones(p))
    signs = L.fit()

    print(np.nonzero(signs != 0)[0])
    print(np.nonzero(beta != 0)[0])
    print(L.summary(signs != 0,
                    ndraw=1000,
                    burnin=200,
                    compute_intervals=False))
def test_CV(ndraw=500, sigma_known=True, burnin=100, s=7, rho=0.3,
            method=lasso_tuned, snr=5):

    # generate a null and an alternative pvalue from a particular model
    X, Y, beta, active, sigma = gaussian_instance(n=500, p=100, s=s, rho=rho, snr=snr)

    if not sigma_known:  # was a no-op `sigma = sigma` branch
        sigma = None

    method_ = method(Y, X,
                     scale_inter=0.0001,
                     scale_valid=0.0001,
                     scale_select=0.0001)

    do_null = True
    if do_null:
        which_var = method_.active_set[s]  # the first null one
        method_.setup_inference(which_var); iter(method_)

        for i in range(ndraw + burnin):
            method_.next()

        Z = np.array(method_.null_sample[which_var][burnin:])
        family = discrete_family(Z, np.ones_like(Z))
        obs = method_._gaussian_obs[which_var]
        pval0 = family.cdf(0, obs)
        pval0 = 2 * min(pval0, 1 - pval0)
    else:
        pval0 = np.random.sample()

    which_var = 0
    method_.setup_inference(which_var); iter(method_)

    for i in range(ndraw + burnin):
        method_.next()

    family = discrete_family(method_.null_sample[which_var][burnin:],
                             np.ones(ndraw))
    obs = method_._gaussian_obs[which_var]
    pvalA = family.cdf(0, obs)
    pvalA = 2 * min(pvalA, 1 - pvalA)

    return pval0, pvalA, method_
def test_BIC(do_sample=True, ndraw=8000, burnin=2000, force=False):

    X, Y, beta, active, sigma, _ = gaussian_instance()
    n, p = X.shape
    FS = info_crit_stop(Y, X, sigma, cost=np.log(n))
    final_model = len(FS.variables)

    active = set(list(active))
    if active.issubset(FS.variables) or force:
        which_var = [v for v in FS.variables if v not in active]

        if do_sample:
            return [pval[-1] for pval in FS.model_pivots(final_model,
                                                         saturated=False,
                                                         burnin=burnin,
                                                         ndraw=ndraw,
                                                         which_var=which_var)]
        else:
            saturated_pivots = FS.model_pivots(final_model, which_var=which_var)
            return [pval[-1] for pval in saturated_pivots]
    return []
def test_threshold(n, p, s, signal):

    X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=0.,
                                                   signal=signal, sigma=1.)
    true_mean = X.dot(beta)
    threshold = test_approximate_inference(X, y, true_mean, sigma,
                                           seed_n=0,
                                           lam_frac=1.,
                                           loss='gaussian')

    if threshold is not None:
        print("output of selection adjusted inference", threshold)
        return threshold
def main():
    from selection.tests.instance import gaussian_instance
    np.random.seed(1)

    n, p = 3000, 1000
    X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=30, rho=0., sigma=1)
    loss = rr.glm.gaussian(X, y)

    lam_seq = np.exp(np.linspace(np.log(1.e-6), np.log(1), 30)) * np.fabs(np.dot(X.T, y)).max()
    K = 5
    folds = np.arange(n) % K
    CV_compute = CV(loss, folds, lam_seq)
    (lam_CV, CV_val, SD_val,
     lam_CV_randomized, CV_val_randomized, SD_val_randomized) = CV_compute.choose_lambda_CVr()
    #print("CV error curve (nonrandomized):", CV_val)

    minimum_CV = np.min(CV_val)
    lam_idx = list(lam_seq).index(lam_CV)
    SD_min = SD_val[lam_idx]
    # one-standard-deviation rule: largest lambda index whose CV error is
    # within SD_min of the minimum
    lam_1SD = lam_seq[max([i for i in range(lam_seq.shape[0])
                           if CV_val[i] <= minimum_CV + SD_min])]
    #print(lam_CV, lam_1SD)

    import matplotlib.pyplot as plt
    plt.plot(np.log(lam_seq), CV_val)
    plt.show()
def test_nonrandomized(s=0, n=200, p=10, signal=7, rho=0, lam_frac=0.8,
                       loss='gaussian', solve_args={'min_its': 20, 'tol': 1.e-10}):

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    nonzero = np.where(beta)[0]
    print("lam", lam)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    true_vec = beta

    M_est = M_estimator(lam, loss, penalty)
    M_est.solve()
    active = M_est._overall
    nactive = np.sum(active)
    print("nactive", nactive)

    if nactive == 0:
        return None

    #score_mean = M_est.observed_internal_state.copy()
    #score_mean[nactive:] = 0
    M_est.setup_sampler(score_mean=np.zeros(p))
    #M_est.setup_sampler(score_mean=score_mean)
    #M_est.sample(ndraw=1000, burnin=1000, stepsize=1./p)

    if set(nonzero).issubset(np.nonzero(active)[0]):
        check_screen = True
        #test_stat = lambda x: np.linalg.norm(x)
        #return M_est.hypothesis_test(test_stat, test_stat(M_est.observed_internal_state), stepsize=1./p)

        ci = M_est.confidence_intervals(M_est.observed_internal_state)
        pivots = M_est.coefficient_pvalues(M_est.observed_internal_state)

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)
            for j in range(nactive):
                if check_screen:
                    if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                        covered[j] = 1
                else:
                    covered[j] = None
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered = coverage(ci)[0]
        #print(pivots)
        #print(coverage)
        return pivots, covered
def simulate(n=100, p=50, s=10, signal=(0, 0), sigma=2, alpha=0.1):

    # description of statistical problem
    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.0,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(X, XTXi, resid, sampler):
        S = sampler(scale=0.)  # deterministic with scale=0
        ynew = X.dot(XTXi).dot(S) + resid  # will be ok for n > p and non-degenerate X
        G = lasso_glmnet(X, ynew, *[None] * 4)
        select = G.select()
        return set(list(select[0]))

    selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid)

    # run selection algorithm
    observed_set = selection_algorithm(splitting_sampler)

    # find the target, based on the observed outcome;
    # we just take the first target
    pivots, covered, lengths = [], [], []
    naive_pivots, naive_covered, naive_lengths = [], [], []

    for idx in list(observed_set)[:1]:
        print("variable: ", idx, "total selected: ", len(observed_set))
        true_target = truth[idx]

        (pivot, interval) = infer_full_target(selection_algorithm,
                                              observed_set,
                                              idx,
                                              splitting_sampler,
                                              dispersion,
                                              hypothesis=true_target,
                                              fit_probability=probit_fit,
                                              alpha=alpha,
                                              B=500)

        pivots.append(pivot)
        covered.append((interval[0] < true_target) * (interval[1] > true_target))
        lengths.append(interval[1] - interval[0])

        target_sd = np.sqrt(dispersion * XTXi[idx, idx])
        observed_target = np.squeeze(XTXi[idx].dot(X.T.dot(y)))
        quantile = ndist.ppf(1 - 0.5 * alpha)
        naive_interval = (observed_target - quantile * target_sd,
                          observed_target + quantile * target_sd)

        naive_pivots.append((1 - ndist.cdf((observed_target - true_target) / target_sd)))  # one-sided
        naive_covered.append((naive_interval[0] < true_target) * (naive_interval[1] > true_target))
        naive_lengths.append(naive_interval[1] - naive_interval[0])

    return pivots, covered, lengths, naive_pivots, naive_covered, naive_lengths
def test_without_screening(s=10, n=300, p=100, rho=0., signal=3.5, lam_frac=1.,
                           ndraw=10000, burnin=2000, loss='gaussian',
                           randomizer='laplace', randomizer_scale=1.,
                           scalings=False, subgrad=True, check_screen=False):

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1,
                                                       random_signs=False)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
        X_indep, y_indep, _, _, _ = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                      signal=signal, sigma=1)
        loss_indep = rr.glm.gaussian(X_indep, y_indep)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
        X_indep, y_indep, _, _ = logistic_instance(n=n, p=p, s=s, rho=rho,
                                                   signal=signal, random_signs=False)
        loss_indep = rr.glm.logistic(X_indep, y_indep)

    nonzero = np.where(beta)[0]

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), scale=randomizer_scale)

    epsilon = 1. / np.sqrt(n)
    W = np.ones(p) * lam
    #W[0] = 0 # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)
    M_est.solve()

    active_union = M_est._overall
    nactive = np.sum(active_union)
    print("nactive", nactive)

    active_set = np.nonzero(active_union)[0]
    print("active set", active_set)
    print("true nonzero", np.nonzero(beta)[0])

    views = [M_est]
    queries = multiple_queries(views)
    queries.solve()

    screened = False
    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        screened = True

    if (not check_screen) or screened:  # proceed unless we require screening and it failed
        #if nactive == s:
        #    return None

        if scalings:  # try conditioning on some scalings
            M_est.condition_on_subgradient()
            M_est.condition_on_scalings()
        if subgrad:
            M_est.decompose_subgradient(conditioning_groups=np.zeros(p, dtype=bool),
                                        marginalizing_groups=np.ones(p, bool))

        boot_target1, boot_target_observed1 = pairs_bootstrap_glm(loss, active_union,
                                                                  inactive=~active_union)
        boot_target2, boot_target_observed2 = pairs_bootstrap_glm(loss_indep, active_union,
                                                                  inactive=~active_union)

        target_observed = (boot_target_observed1 - boot_target_observed2)[:nactive]

        def _target(indices):
            return boot_target1(indices)[:nactive] - boot_target2(indices)[:nactive]

        form_covariances = glm_nonparametric_bootstrap(n, n)
        queries.setup_sampler(form_covariances)
        queries.setup_opt_state()

        target_sampler = queries.setup_target(_target,
                                              target_observed,
                                              reference=target_observed)

        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
        LU = target_sampler.confidence_intervals(target_observed,
                                                 sample=target_sample,
                                                 level=0.9)
        pivots = target_sampler.coefficient_pvalues(target_observed,
                                                    parameter=np.zeros(nactive),
                                                    sample=target_sample)

        #test_stat = lambda x: np.linalg.norm(x - beta[active_union])
        #observed_test_value = test_stat(target_observed)
        #pivots = target_sampler.hypothesis_test(test_stat,
        #                                        observed_test_value,
        #                                        alternative='twosided',
        #                                        parameter=beta[active_union],
        #                                        ndraw=ndraw,
        #                                        burnin=burnin,
        #                                        stepsize=None)

        true_vec = np.zeros(nactive)

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)
            for j in range(nactive):
                if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                    covered[j] = 1
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered, ci_length = coverage(LU)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        covered_naive, ci_length_naive = coverage(LU_naive)
        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        return pivots, covered, ci_length, naive_pvals, covered_naive, ci_length_naive
def simulate(n=100, p=40, rho=0.3, signal=(5, 5),
             do_knockoff=False, do_AIC=True, do_BIC=True, do_glmnet=True,
             full_results={}, alpha=0.05, s=7,
             correlation='equicorrelated', maxstep=np.inf,
             compute_maxT_identify=True, ndraw=8000, burnin=2000):

    if correlation == 'equicorrelated':
        X, y, _, active, sigma = gaussian_instance(n=n, p=p, rho=rho,
                                                   signal=signal, s=s,
                                                   random_signs=False,
                                                   equicorrelated=True)[:5]
    elif correlation == 'AR':
        X, y, _, active, sigma = gaussian_instance(n=n, p=p, rho=rho,
                                                   signal=signal, s=s,
                                                   random_signs=True,
                                                   equicorrelated=False)[:5]
    else:
        raise ValueError('correlation must be one of ["equicorrelated", "AR"]')

    value = compute_results(y, X, sigma, active,
                            do_knockoff=do_knockoff,
                            full_results=full_results,
                            maxstep=maxstep,
                            compute_maxT_identify=compute_maxT_identify,
                            alpha=alpha,
                            ndraw=ndraw,
                            burnin=burnin,
                            do_AIC=do_AIC,
                            do_BIC=do_BIC,
                            do_glmnet=do_glmnet)

    full_results.setdefault('n', []).append(n)
    full_results.setdefault('p', []).append(p)
    full_results.setdefault('rho', []).append(rho)
    full_results.setdefault('s', []).append(len(active))
    full_results.setdefault('signalU', []).append(max(signal))
    full_results.setdefault('signalL', []).append(min(signal))
    full_results.setdefault('correlation', []).append(correlation)

    # truncate all value lists to a common length so the dict stays rectangular
    min_len = min([len(full_results[k]) for k in full_results.keys()])
    for k in full_results.keys():
        full_results[k] = full_results[k][:min_len]

    return value
def test_kfstep(k=4, s=3, n=100, p=10, Langevin_steps=10000, burning=2000):

    X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, random_signs=True,
                                                   s=s, sigma=1., rho=0, signal=10)
    epsilon = 0.

    randomization = laplace(loc=0, scale=1.)

    j_seq = np.empty(k, dtype=int)
    s_seq = np.empty(k)

    left = np.ones(p, dtype=bool)
    obs = 0

    initial_state = np.zeros(n + np.sum([i for i in range(p - k + 1, p + 1)]))
    initial_state[:n] = y.copy()

    # placeholders, overwritten in the loop below
    # (was `np.array((n, ncol))`, which builds a length-2 vector, not an n x ncol array)
    mat = [np.zeros((n, ncol)) for ncol in range(p, p - k, -1)]

    curr = n

    keep = np.zeros(p, dtype=bool)

    for i in range(k):
        X_left = X[:, left]
        X_selected = X[:, ~left]
        if (np.sum(left) < p):
            P_perp = np.identity(n) - X_selected.dot(np.linalg.pinv(X_selected))
            mat[i] = P_perp.dot(X_left)
        else:
            mat[i] = X

        mat_complete = np.zeros((n, p))
        mat_complete[:, left] = mat[i]

        T = np.dot(mat[i].T, y)
        T_complete = np.dot(mat_complete.T, y)

        obs = np.max(np.abs(T))
        keep = np.copy(~left)

        random_Z = randomization.rvs(T.shape[0])
        T_random = T + random_Z
        initial_state[curr:(curr + p - i)] = T_random  # initializing subgradients
        curr = curr + p - i

        j_seq[i] = np.argmax(np.abs(T_random))
        s_seq[i] = np.sign(T_random[j_seq[i]])

        #def find_index(v, idx1):
        #    _sumF = 0
        #    _sumT = 0
        #    idx = idx1 + 1
        #    for i in range(v.shape[0]):
        #        if (v[i] == False):
        #            _sumF = _sumF + 1
        #        else:
        #            _sumT = _sumT + 1
        #        if _sumT >= idx: break
        #    return (_sumT + _sumF - 1)

        T_complete[left] += random_Z
        left[np.argmax(np.abs(T_complete))] = False

    # conditioning
    linear_part = X[:, keep].T
    P = np.dot(linear_part.T, np.linalg.pinv(linear_part).T)
    I = np.identity(linear_part.shape[1])
    R = I - P

    def full_projection(state, n=n, p=p, k=k):
        new_state = np.empty(state.shape, float)  # np.float is removed in current numpy
        new_state[:n] = state[:n]
        curr = n
        for i in range(k):
            projection = projection_cone(p - i, j_seq[i], s_seq[i])
            new_state[curr:(curr + p - i)] = projection(state[curr:(curr + p - i)])
            curr = curr + p - i
        return new_state

    def full_gradient(state, n=n, p=p, k=k, X=X, mat=mat):
        data = state[:n]

        grad = np.empty(n + np.sum([i for i in range(p - k + 1, p + 1)]))
        grad[:n] = -data

        curr = n
        for i in range(k):
            subgrad = state[curr:(curr + p - i)]
            sign_vec = np.sign(-mat[i].T.dot(data) + subgrad)
            grad[curr:(curr + p - i)] = -sign_vec
            curr = curr + p - i
            grad[:n] += mat[i].dot(sign_vec)

        return grad

    sampler = projected_langevin(initial_state, full_gradient, full_projection, 1. / p)
    samples = []

    for i in range(Langevin_steps):
        if i > burning:
            old_state = sampler.state.copy()
            old_data = old_state[:n]
            sampler.next()
            new_state = sampler.state.copy()
            new_data = new_state[:n]
            # project the data coordinates back onto the conditioning event
            new_data = np.dot(P, old_data) + np.dot(R, new_data)
            sampler.state[:n] = new_data
            samples.append(sampler.state.copy())

    samples = np.array(samples)
    Z = samples[:, :n]

    pop = np.abs(mat[k - 1].T.dot(Z.T)).max(0)
    fam = discrete_family(pop, np.ones_like(pop))
    pval = fam.cdf(0, obs)
    pval = 2 * min(pval, 1 - pval)

    print('pvalue:', pval)
    return pval
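# For reference, a minimal sketch of the update rule that `projected_langevin`
# above presumably implements: state <- proj(state + stepsize * grad(state)
# + sqrt(2 * stepsize) * noise). The class name and call pattern (.state,
# .next()) follow the usage above; the body itself is an assumption about the
# package's implementation, not a copy of it.
import numpy as np

class projected_langevin_sketch(object):

    def __init__(self, initial_state, gradient, projection, stepsize):
        self.state = initial_state.copy()
        self.gradient = gradient      # gradient of the log-density
        self.projection = projection  # map back onto the constraint set
        self.stepsize = stepsize

    def next(self):
        noise = np.random.standard_normal(self.state.shape)
        candidate = (self.state
                     + self.stepsize * self.gradient(self.state)
                     + np.sqrt(2 * self.stepsize) * noise)
        self.state[:] = self.projection(candidate)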
def simulate(n=1000, p=100, s=20, signal=(2, 4), sigma=2, alpha=0.1, B=2000):

    # description of statistical problem
    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.1,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=True)[:3]

    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(XTX, XTXi, sampler):

        min_success = 6
        ntries = 10

        def _alpha_grid(X, y, center, XTX):
            n, p = X.shape
            alphas, coefs, _ = lasso_path(X, y, Xy=center, precompute=XTX)
            nselected = np.count_nonzero(coefs, axis=0)
            return alphas[nselected < np.sqrt(0.8 * p)]

        alpha_grid = _alpha_grid(X, y, sampler(scale=0.), XTX)
        success = np.zeros((p, alpha_grid.shape[0]))

        for _ in range(ntries):
            scale = 1.  # corresponds to sub-samples of 50%
            noisy_S = sampler(scale=scale)
            _, coefs, _ = lasso_path(X, y, Xy=noisy_S, precompute=XTX, alphas=alpha_grid)
            success += np.abs(np.sign(coefs))

        selected = np.apply_along_axis(lambda row: any(x > min_success for x in row), 1, success)
        selected_vars = set(np.nonzero(selected)[0])  # renamed from `vars`, which shadowed the builtin
        return selected_vars

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi)

    # run selection algorithm
    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                splitting_sampler,
                                success_params=(1, 1),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={'epochs': 10,
                                          'sizes': [100] * 5,
                                          'dropout': 0.,
                                          'activation': 'relu'})