def test_scaling(
        snr=15,
        s=5,
        n=200,
        p=20,
        rho=0.1,
        burnin=20000,
        ndraw=30000,
        scale=0.9,
        nsim=None,  # needed for decorator
        frac=0.5):
    """Compare selective p-values with GLM p-values computed on fresh data.

    A logistic group lasso with Laplace randomization is solved on data from
    ``generate_data``.  When the selected set contains every nonzero
    coefficient, p-values are collected in this order:

    1. saturated test of the first selected null variable,
    2. saturated test of the first selected truly nonzero variable,
    3. selected-model test of the first selected null variable,
    4. selected-model test of the first selected truly nonzero variable,
    5. (statsmodels only) GLM p-values for the same two variables fitted on
       a freshly drawn data set of size ``n`` ("oracle"),
    6. (statsmodels only) the same on the first ``int((1 - frac) * n)`` rows
       of another freshly drawn data set ("data splitting-ish").

    Parameters
    ----------
    snr, s, n, p, rho :
        Forwarded unchanged to ``generate_data``.
    burnin, ndraw :
        Forwarded to each ``hypothesis_test`` call.
    scale :
        Scale of the Laplace randomization.
    nsim :
        Unused here; present because a decorator needs it.
    frac :
        Presumed fraction of data used in stage 1; ``(1 - frac) * n`` rows
        are used for the data-splitting comparison.

    Returns
    -------
    (pvalues, active_var) or None
        ``active_var[k]`` is True when ``pvalues[k]`` belongs to a truly
        nonzero coefficient.  ``None`` is returned when the selection misses
        a nonzero coefficient, or when no null (or no nonzero) variable was
        selected.
    """
    # 0.9 has roughly same screening probability as 50% data splitting,
    # i.e. around 10%
    randomizer = randomization.laplace((p, ), scale=scale)
    X, y, beta, _ = generate_data(n=n, p=p, s=s, rho=rho, snr=snr)
    nonzero = np.where(beta)[0]

    lam_frac = 1.
    loss = rr.glm.logistic(X, y)
    epsilon = 1. / np.sqrt(n)

    lam = lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2,
                                               (n, 10000)))).max(0))
    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)
    mv = multiple_queries([M_est])
    mv.solve()

    active = M_est.selection_variable['variables']
    nactive = active.sum()
    active_set = np.nonzero(active)[0]

    truth = set(nonzero)
    if not truth.issubset(active_set):
        return None

    # Positions (within active_set) of selected nulls / selected true signals.
    I = [i for i, j in enumerate(active_set) if j not in truth]
    A = [i for i, j in enumerate(active_set) if j in truth]
    # Both a null and a true variable are needed below; the original only
    # guarded I and would raise IndexError on an empty A.
    if not I or not A:
        return None

    inactive = ~active
    boot_target, target_observed = pairs_bootstrap_glm(loss,
                                                       active,
                                                       inactive=inactive)

    if DEBUG:
        sampler = lambda: np.random.choice(n, size=(n, ), replace=True)
        print(boot_target(sampler())[-3:], 'boot target')

    form_covariances = glm_nonparametric_bootstrap(n, n)
    mv.setup_sampler(form_covariances)

    def saturated(idx):
        # Target is a single coordinate of the bootstrapped statistic.
        # idx is bound per call, so later tests cannot change this closure.
        def target(indices):
            return boot_target(indices)[idx]

        observed = np.zeros(1)
        observed[0] = target_observed[idx]
        return target, observed

    def selected(idx):
        # Target is coordinate idx stacked with everything from position
        # nactive onwards of the bootstrapped statistic.
        def target(indices):
            result = boot_target(indices)
            return np.hstack([result[idx], result[nactive:]])

        # One evaluation on range(n) is used only to size `observed`.
        observed = np.zeros_like(target(range(n)))
        observed[0] = target_observed[idx]
        observed[1:] = target_observed[nactive:]
        return target, observed

    def first_coordinate(x):
        return x[0]

    def run_test(target, observed, verbose=False):
        target_sampler = mv.setup_target(target, observed)
        lipschitz = target_sampler.crude_lipschitz()
        if verbose:
            print(lipschitz, 'crude')
        return target_sampler.hypothesis_test(
            first_coordinate,
            first_coordinate(observed),
            burnin=burnin,
            ndraw=ndraw,
            stepsize=.5 / lipschitz)  # twosided by default

    pvalues = []

    # null saturated
    target, observed = saturated(I[0])
    pvalues.append(run_test(target, observed, verbose=True))

    # true saturated
    target, observed = saturated(A[0])
    pvalues.append(run_test(target, observed))

    # null selected
    target, observed = selected(I[0])
    pvalues.append(run_test(target, observed, verbose=True))

    # true selected
    target, observed = selected(A[0])
    pvalues.append(run_test(target, observed))

    # TODO: tests that condition on the optimization variables are NOT
    # WORKING -- need to implement conditioning within M_estimator!!!

    active_var = [False, True, False, True]

    def append_glm_pvalues(design, response):
        # Unpenalized logistic fit restricted to the selected columns; adds
        # the p-values of the same null / true pair tested above.
        if not statsmodels_available:
            return
        try:
            model = sm.GLM(response, design, family=sm.families.Binomial())
            model_results = model.fit()
        except sm.tools.sm_exceptions.PerfectSeparationError:
            return
        pvalues.extend(
            [model_results.pvalues[I[0]], model_results.pvalues[A[0]]])
        # The second entry is a truly nonzero coefficient, so it is labelled
        # True (the data-splitting branch used to label it False).
        active_var.extend([False, True])

    # oracle p-value -- draws a new data set
    X_new, y_new, _, _ = generate_data(n=n, p=p, s=s, rho=rho, snr=snr)
    append_glm_pvalues(X_new[:, active_set], y_new)

    # data splitting-ish p-value -- draws a new data set of smaller size
    # frac is presumed to be how much data was used in stage 1,
    # we get (1-frac)*n for stage 2
    # frac defaults to 0.5
    Xs, ys, _, _ = generate_data(n=n, p=p, s=s, rho=rho, snr=snr)
    n_split = int((1 - frac) * n)
    append_glm_pvalues(Xs[:n_split][:, active_set], ys[:n_split])

    return pvalues, active_var
def test_logistic_saturated_active_coordinate():
    """Saturated test of one truly nonzero coordinate after a randomized
    logistic group lasso.

    Returns ``(pval, True)`` when every nonzero coefficient was selected,
    and ``None`` otherwise.
    """
    s, n, p = 5, 200, 20

    randomizer = randomization.laplace((p, ), scale=1.)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0.1, snr=14)
    truth = np.where(beta)[0]

    lam_frac = 1.
    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    # Penalty level: mean over 10000 random 0/1 response vectors of the
    # largest absolute inner product with a column of X.
    coin_flips = np.random.binomial(1, 1. / 2, (n, 10000))
    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, coin_flips)).max(0))

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    print(lam)

    # our randomization
    M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    mv = multiple_queries([M_est1])
    mv.solve()

    active = M_est1.selection_variable['variables']
    selected = np.nonzero(active)[0]

    # Nothing to test unless every true signal was picked up.
    if not set(truth).issubset(selected):
        return None

    # Positions within `selected` that hold truly nonzero coefficients.
    true_positions = [
        pos for pos, feature in enumerate(selected) if feature in truth
    ]
    idx = true_positions[0]

    inactive = ~M_est1.selection_variable['variables']
    boot_target, target_observed = pairs_bootstrap_glm(loss,
                                                       active,
                                                       inactive=inactive)

    def active_target(indices):
        return boot_target(indices)[idx]

    active_observed = np.zeros(1)
    active_observed[0] = target_observed[idx]

    form_covariances = glm_nonparametric_bootstrap(n, n)
    mv.setup_sampler(form_covariances)
    target_sampler = mv.setup_target(active_target, active_observed)

    def test_stat(x):
        return x[0]

    pval = target_sampler.hypothesis_test(
        test_stat, test_stat(active_observed), burnin=10000,
        ndraw=10000)  # twosided by default
    return pval, True
def test_fixedX(ndraw=10000, burnin=2000,
                nsim=None):  # nsim needed for decorator
    """Selective tests after a randomized fixed-X group lasso.

    Data come from ``gaussian_instance``; the problem is solved with
    ``fixedX_group_lasso`` and Laplace randomization.  When the selected set
    strictly contains the true support, four p-values are computed with a
    residual bootstrap target, in this order: null saturated, null selected,
    true saturated, true selected.

    Parameters
    ----------
    ndraw, burnin :
        Forwarded to each ``hypothesis_test`` call.
    nsim :
        Unused here; accepted because the decorator needs it (the sibling
        tests in this file declare it the same way).

    Returns
    -------
    (pvalues, [False, False, True, True]) or None
        ``None`` when the screening condition fails.
    """
    s, n, p = 5, 200, 20

    randomizer = randomization.laplace((p, ), scale=1.)
    X, Y, beta, nonzero, sigma = gaussian_instance(n=n,
                                                   p=p,
                                                   s=s,
                                                   rho=0.1,
                                                   snr=7)

    lam_frac = 1.
    lam = lam_frac * np.mean(
        np.fabs(X.T.dot(np.random.standard_normal(
            (n, 50000)))).max(0)) * sigma
    W = np.ones(p) * lam
    epsilon = 1. / np.sqrt(n)

    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    M_est = fixedX_group_lasso(X, Y, epsilon, penalty, randomizer)
    mv = multiple_queries([M_est])
    mv.solve()

    active = M_est.selection_variable['variables']
    nactive = active.sum()
    active_set = np.nonzero(active)[0]

    truth = set(nonzero)
    # Require that every true variable and at least one extra was selected.
    if not (truth.issubset(active_set) and nactive > len(nonzero)):
        return None

    # Positions (within active_set) of selected nulls / selected true signals.
    I = [i for i, j in enumerate(active_set) if j not in truth]
    A = [i for i, j in enumerate(active_set) if j in truth]
    if not I or not A:
        return None

    boot_target, target_observed = resid_bootstrap(M_est.loss, active)

    form_covariances = glm_nonparametric_bootstrap(n, n)
    mv.setup_sampler(form_covariances)

    def saturated(idx):
        # Single-coordinate target; idx is bound per call.
        def target(Y_star):
            return boot_target(Y_star)[idx]

        observed = np.zeros(1)
        observed[0] = target_observed[idx]
        return target, observed

    def selected(idx):
        # Coordinate idx stacked with everything from position nactive on.
        def target(Y_star):
            result = boot_target(Y_star)
            return np.hstack([result[idx], result[nactive:]])

        # One evaluation on a random response is used only to size
        # `observed`.
        observed = np.zeros_like(target(np.random.standard_normal(n)))
        observed[0] = target_observed[idx]
        observed[1:] = target_observed[nactive:]
        return target, observed

    def first_coordinate(x):
        return x[0]

    def run_test(target, observed, **target_args):
        target_sampler = mv.setup_target(target, observed, **target_args)
        # Pass the observed value of the statistic, as the other tests in
        # this file do, rather than the whole observed vector.
        # NOTE(review): confirm against hypothesis_test's signature.
        return target_sampler.hypothesis_test(
            first_coordinate,
            first_coordinate(observed),
            burnin=burnin,
            ndraw=ndraw)  # twosided by default

    pvalues = []

    # null saturated
    target, observed = saturated(I[0])
    pvalues.append(run_test(target, observed))

    # null selected
    target, observed = selected(I[0])
    pvalues.append(run_test(target, observed, target_set=[0]))

    # true saturated
    target, observed = saturated(A[0])
    pvalues.append(run_test(target, observed))

    # true selected
    target, observed = selected(A[0])
    pvalues.append(run_test(target, observed, target_set=[0]))

    return pvalues, [False, False, True, True]
def test_threshold_score(ndraw=10000, burnin=2000,
                         nsim=None):  # nsim needed for decorator
    """Joint test of the selected null coefficients after score thresholding.

    The first three variables are taken as already active; the randomized
    score of the remaining variables is thresholded with
    ``glm_threshold_score``.  Variables flagged in ``'boundary_set'`` are
    added to the active set.  When that set contains every nonzero
    coefficient, the Euclidean norm of the selected null coefficients is
    tested.

    Parameters
    ----------
    ndraw, burnin :
        Forwarded to ``hypothesis_test``.
    nsim :
        Unused here; present because a decorator needs it.

    Returns
    -------
    (pval, False) or None
        ``None`` when a nonzero coefficient is missing from the active set
        or when no null variable is in it.
    """
    s, n, p = 5, 200, 20
    threshold = 0.5

    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0.1, signal=7)
    nonzero = np.where(beta)[0]

    loss = rr.glm.logistic(X, y)

    # np.bool was removed from NumPy; the builtin bool is the same dtype.
    active_bool = np.zeros(p, bool)
    active_bool[:3] = True
    inactive_bool = ~active_bool
    randomizer = randomization.laplace((inactive_bool.sum(), ), scale=0.5)

    # threshold the score
    thresh = glm_threshold_score(loss, threshold, randomizer, active_bool,
                                 inactive_bool)
    mv = multiple_queries([thresh])
    mv.solve()

    boundary = thresh.selection_variable['boundary_set']
    # Map the boundary set back to feature indices.  The earlier
    # np.nonzero(np.arange(3, 20)[boundary])[0] returned positions
    # 0..k-1 instead, because every entry of arange(3, 20) is nonzero.
    # NOTE(review): assumes `boundary` indexes the inactive variables in
    # order (boolean mask or integer positions) -- confirm.
    inactive_indices = np.nonzero(inactive_bool)[0]
    new_active = inactive_indices[boundary]
    active_set = np.union1d(np.nonzero(active_bool)[0], new_active)

    truth = set(nonzero)
    if not truth.issubset(active_set):
        return None

    full_active = np.zeros(p, bool)
    full_active[active_set] = True

    # Positions (within active_set) of variables that are truly null.
    inactive_selected = [
        i for i, j in enumerate(active_set) if j not in truth
    ]
    if not inactive_selected:
        return None

    form_covariances = glm_nonparametric_bootstrap(n, n)
    mv.setup_sampler(form_covariances)

    boot_target, target_observed = pairs_bootstrap_glm(loss, full_active)

    def inactive_target(indices):
        return boot_target(indices)[inactive_selected]

    inactive_observed = target_observed[inactive_selected]

    target_sampler = mv.setup_target(inactive_target, inactive_observed)

    test_stat = np.linalg.norm
    pval = target_sampler.hypothesis_test(test_stat,
                                          test_stat(inactive_observed),
                                          alternative='twosided',
                                          ndraw=ndraw,
                                          burnin=burnin)
    print(pval)
    return pval, False