# Shared setup for the simulate() variants below. Module paths for the
# selectinf learning helpers are assumed to follow the published examples;
# adjust them to your local layout.
import functools

import numpy as np
import pandas as pd
import regreg.api as rr
from scipy.stats import norm as ndist
from sklearn.linear_model import lasso_path

# assumed selectinf imports (names used throughout these variants):
# from selectinf.tests.instance import gaussian_instance
# from selectinf.learning.core import normal_sampler, split_sampler, keras_fit
# from selectinf.learning.utils import (full_model_inference,
#                                       naive_full_model_inference, liu_inference,
#                                       lasso_glmnet, BHfilter,
#                                       logit_fit, random_forest_fit)


def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)

    def meta_algorithm(XTX, XTXi, lam, sampler):

        p = XTX.shape[0]
        success = np.zeros(p)

        loss = rr.quadratic_loss((p,), Q=XTX)
        pen = rr.l1norm(p, lagrange=lam)

        scale = 0.5  # randomized selection (scale > 0)
        noisy_S = sampler(scale=scale)
        loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0)
        problem = rr.simple_problem(loss, pen)
        soln = problem.solve(max_its=100, tol=1.e-10)
        success += soln != 0
        return set(np.nonzero(success)[0])

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    # plug-in estimate; the sampler above was built with the known sigma**2
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    lam = 4. * np.sqrt(n)
    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam)

    # run selection algorithm

    return full_model_inference(X, y, truth,
                                selection_algorithm,
                                smooth_sampler,
                                success_params=(1, 1),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={'epochs': 20, 'sizes': [100] * 5,
                                          'dropout': 0., 'activation': 'relu'})
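# Usage sketch: these simulate() variants return a per-variable results
# DataFrame from full_model_inference (or None when nothing is selected).
# A minimal driver of the kind used with these examples might look like the
# following; the 'pivot' column name is an assumption carried over from the
# selectinf learning examples, not guaranteed by this file.

def main(nsim=50):
    results = []
    for i in range(nsim):
        df = simulate()
        if df is not None:
            df['sim'] = i
            results.append(df)
    if not results:
        return None
    all_df = pd.concat(results, ignore_index=True)
    # under valid selective inference the pivots should be roughly Uniform(0, 1)
    print('mean/SD of pivots:', np.nanmean(all_df['pivot']), np.nanstd(all_df['pivot']))
    return all_df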
def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n, p=p, s=s, equicorrelated=False, rho=0.5,
                                    sigma=sigma, signal=signal,
                                    random_signs=True, scale=False)[:3]

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)

    def meta_algorithm(XTX, XTXi, dispersion, sampler):

        # selection here is BH on OLS Z-scores; no lasso is solved in this variant
        noisy_S = sampler(scale=0.)  # deterministic with scale=0
        soln = XTXi.dot(noisy_S)
        solnZ = soln / (np.sqrt(np.diag(XTXi)) * np.sqrt(dispersion))
        pval = ndist.cdf(solnZ)
        pval = 2 * np.minimum(pval, 1 - pval)  # two-sided p-values
        return set(BHfilter(pval, q=0.2))

    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, dispersion)

    # run selection algorithm

    return full_model_inference(X, y, truth,
                                selection_algorithm,
                                smooth_sampler,
                                success_params=(1, 1),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={'epochs': 5, 'sizes': [200] * 10,
                                          'dropout': 0., 'activation': 'relu'})
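# For reference, a self-contained sketch of the Benjamini-Hochberg step applied
# above via BHfilter(pval, q=0.2). The name bh_select is illustrative, not the
# library's API.

def bh_select(pval, q=0.2):
    """Indices passing the BH step-up rule at level q."""
    pval = np.asarray(pval)
    m = len(pval)
    order = np.argsort(pval)
    sorted_p = pval[order]
    # largest k (1-indexed) with p_(k) <= q * k / m; reject the k smallest
    below = sorted_p <= q * np.arange(1, m + 1) / m
    if not below.any():
        return set()
    k = np.nonzero(below)[0].max()
    return set(order[:k + 1].tolist())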
def simulate(n=200, p=50, s=5, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n, p=p, s=s, equicorrelated=False, rho=0.5,
                                    sigma=sigma, signal=signal,
                                    random_signs=True, scale=False)[:3]

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(XTX, XTXi, dispersion, sampler):

        # marginal Z-threshold selection on a randomized (scale=0.5) copy
        # of the sufficient statistic
        noisy_S = sampler(scale=0.5)
        soln = XTXi.dot(noisy_S)
        solnZ = soln / (np.sqrt(np.diag(XTXi)) * np.sqrt(dispersion))
        return set(np.nonzero(np.fabs(solnZ) > 2.1)[0])

    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, dispersion)

    # run selection algorithm

    return full_model_inference(X, y, truth,
                                selection_algorithm,
                                splitting_sampler,
                                success_params=(5, 7),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={'epochs': 30, 'sizes': [100, 100],
                                          'activation': 'relu'})
def simulate(n=1000, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, seed=0, B=5000):

    # description of statistical problem

    np.random.seed(seed)
    X, y, truth = gaussian_instance(n=n, p=p, s=s, equicorrelated=False, rho=0.5,
                                    sigma=sigma, signal=signal,
                                    random_signs=True, scale=False, center=False)[:3]

    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)

    def meta_algorithm(X, XTXi, resid, sampler):

        n, p = X.shape
        rho = 0.8

        S = sampler(scale=0.)  # deterministic with scale=0
        ynew = X.dot(XTXi).dot(S) + resid  # will be ok for n > p and non-degenerate X
        Xnew = rho * X + np.sqrt(1 - rho**2) * np.random.standard_normal(X.shape)

        X_full = np.hstack([X, Xnew])
        beta_full = np.linalg.pinv(X_full).dot(ynew)
        winners = np.fabs(beta_full)[:p] > np.fabs(beta_full)[p:]
        return set(np.nonzero(winners)[0])

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid)

    # run selection algorithm

    return full_model_inference(X, y, truth,
                                selection_algorithm,
                                smooth_sampler,
                                success_params=(8, 10),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={'epochs': 20, 'sizes': [100] * 5,
                                          'dropout': 0., 'activation': 'relu'})
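# The meta-algorithm above is a knockoff-flavored comparison: each column of X
# competes against a correlated pseudo-copy, and wins if its least-squares
# coefficient in the augmented design is larger in absolute value. A quick,
# purely illustrative sanity check of the augmentation step:

rng_check = np.random.default_rng(0)
X_check = rng_check.standard_normal((1000, 5))
rho_check = 0.8
X_copy = (rho_check * X_check
          + np.sqrt(1 - rho_check**2) * rng_check.standard_normal(X_check.shape))
for j in range(X_check.shape[1]):
    # each pseudo-column should correlate ~rho with its original column
    r = np.corrcoef(X_check[:, j], X_copy[:, j])[0, 1]
    print('column %d: corr ~ %.2f' % (j, r))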
def simulate(n=200, p=100, s=10, signal=(1.5, 2), sigma=2, alpha=0.1, B=3000):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n, p=p, s=s, equicorrelated=False, rho=0.5,
                                    sigma=sigma, signal=signal,
                                    random_signs=True, scale=False)[:3]

    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)  # constructed for reference;
                                              # the splitting sampler below drives inference
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(X, XTXi, resid, sampler):

        S = sampler(scale=0.)  # deterministic with scale=0
        ynew = X.dot(XTXi).dot(S) + resid  # will be ok for n > p and non-degenerate X
        G = lasso_glmnet(X, ynew, *[None] * 4)
        select = G.select()
        return set(list(select[0]))

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid)

    # run selection algorithm

    return full_model_inference(X, y, truth,
                                selection_algorithm,
                                splitting_sampler,
                                success_params=(1, 1),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={'epochs': 10, 'sizes': [100] * 5,
                                          'dropout': 0., 'activation': 'relu'})
def simulate(s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=5000, seed=0):

    # description of statistical problem
    # NOTE: X_full and boot_design are expected as module-level globals,
    # as in the original example script.

    n, p = X_full.shape

    if boot_design:
        idx = np.random.choice(np.arange(n), n, replace=True)
        X = X_full[idx]  # bootstrap X to make it really an IID sample,
                         # i.e. don't condition on X throughout
        X += 0.1 * np.std(X) * np.random.standard_normal(X.shape)  # to make non-degenerate
    else:
        X = X_full.copy()

    X = X - np.mean(X, 0)[None, :]
    X = X / np.std(X, 0)[None, :]

    n, p = X.shape
    truth = np.zeros(p)
    truth[:s] = np.linspace(signal[0], signal[1], s)
    np.random.shuffle(truth)
    truth /= np.sqrt(n)
    truth *= sigma

    y = X.dot(truth) + sigma * np.random.standard_normal(n)

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(XTX, XTXi, lam, sampler):

        p = XTX.shape[0]
        success = np.zeros(p)

        loss = rr.quadratic_loss((p,), Q=XTX)
        pen = rr.l1norm(p, lagrange=lam)

        scale = 0.
        noisy_S = sampler(scale=scale)
        loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0)
        problem = rr.simple_problem(loss, pen)
        soln = problem.solve(max_its=100, tol=1.e-10)
        success += soln != 0
        return set(np.nonzero(success)[0])

    lam = 4. * np.sqrt(n)
    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam)

    # run selection algorithm

    df = full_model_inference(X, y, truth,
                              selection_algorithm,
                              splitting_sampler,
                              success_params=(1, 1),
                              B=B,
                              fit_probability=keras_fit,
                              fit_args={'epochs': 10, 'sizes': [100] * 5,
                                        'dropout': 0., 'activation': 'relu'})

    if False:  # df is not None: -- the Liu et al. comparison is disabled in this variant
        liu_df = liu_inference(X, y, lam, dispersion, truth, alpha=alpha)
        return pd.merge(df, liu_df, on='variable')
    else:
        return df
def simulate(n=1000, p=100, s=20, signal=(2, 4), sigma=2, alpha=0.1, B=2000):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n, p=p, s=s, equicorrelated=False, rho=0.1,
                                    sigma=sigma, signal=signal,
                                    random_signs=True, scale=True)[:3]

    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(XTX, XTXi, sampler):

        min_success = 6
        ntries = 10

        def _alpha_grid(X, y, center, XTX):
            n, p = X.shape
            alphas, coefs, _ = lasso_path(X, y, Xy=center, precompute=XTX)
            nselected = np.count_nonzero(coefs, axis=0)
            return alphas[nselected < np.sqrt(0.8 * p)]

        alpha_grid = _alpha_grid(X, y, sampler(scale=0.), XTX)
        success = np.zeros((p, alpha_grid.shape[0]))

        for _ in range(ntries):
            scale = 1.  # corresponds to sub-samples of 50%
            noisy_S = sampler(scale=scale)
            _, coefs, _ = lasso_path(X, y, Xy=noisy_S, precompute=XTX, alphas=alpha_grid)
            success += np.abs(np.sign(coefs))

        selected = np.apply_along_axis(lambda row: any(x > min_success for x in row), 1, success)
        vars = set(np.nonzero(selected)[0])
        return vars

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi)

    # run selection algorithm

    return full_model_inference(X, y, truth,
                                selection_algorithm,
                                splitting_sampler,
                                success_params=(1, 1),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={'epochs': 10, 'sizes': [100] * 5,
                                          'dropout': 0., 'activation': 'relu'})
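# The loop above is a stability-selection flavor: the lasso path is re-solved
# ntries times on perturbed sufficient statistics, and a variable survives if
# it is active more than min_success times somewhere on a shared alpha grid.
# The same frequency idea on raw data, as a hedged standalone sketch that
# subsamples rows instead of perturbing X^T y:

def stability_frequencies(X, y, alphas, ntries=10, frac=0.5, seed=0):
    """Selection frequency of each variable along a fixed alpha grid."""
    rng = np.random.default_rng(seed)
    n, p = X.shape
    alphas = np.sort(np.asarray(alphas))[::-1]  # lasso_path expects a decreasing grid
    freq = np.zeros((p, len(alphas)))
    for _ in range(ntries):
        idx = rng.choice(n, int(frac * n), replace=False)
        _, coefs, _ = lasso_path(X[idx], y[idx], alphas=alphas)
        freq += coefs != 0
    return alphas, freq / ntries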
def simulate(s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000, seed=0):

    # description of statistical problem
    # NOTE: X_full and boot_design are expected as module-level globals,
    # as in the original example script.

    n, p = X_full.shape

    if boot_design:
        idx = np.random.choice(np.arange(n), n, replace=True)
        X = X_full[idx]  # bootstrap X to make it really an IID sample,
                         # i.e. don't condition on X throughout
        X += 0.1 * np.std(X) * np.random.standard_normal(X.shape)  # to make non-degenerate
    else:
        X = X_full.copy()

    X = X - np.mean(X, 0)[None, :]
    X = X / np.std(X, 0)[None, :]

    n, p = X.shape
    truth = np.zeros(p)
    truth[:s] = np.linspace(signal[0], signal[1], s)
    np.random.shuffle(truth)
    truth /= np.sqrt(n)
    truth *= sigma

    y = X.dot(truth) + sigma * np.random.standard_normal(n)

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    print(dispersion, sigma**2)  # diagnostic: estimated vs. true noise level
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(X, XTXi, resid, sampler):

        S = sampler(scale=0.5)  # scale=0.5 randomizes the selection;
                                # scale=0 would make this step deterministic
        ynew = X.dot(XTXi).dot(S) + resid  # will be ok for n > p and non-degenerate X
        G = lasso_glmnet(X, ynew, *[None] * 4)
        select = G.select(seed=seed)
        return set(list(select[0]))

    selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid)

    # run selection algorithm

    df = full_model_inference(X, y, truth,
                              selection_algorithm,
                              splitting_sampler,
                              success_params=(6, 10),
                              B=B,
                              fit_probability=keras_fit,
                              fit_args={'epochs': 10, 'sizes': [100] * 5,
                                        'dropout': 0., 'activation': 'relu'})

    return df
def simulate(s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000, seed=0):

    # description of statistical problem
    # NOTE: X_full and boot_design are expected as module-level globals,
    # as in the original example script.

    n, p = X_full.shape

    if boot_design:
        idx = np.random.choice(np.arange(n), n, replace=True)
        X = X_full[idx]  # bootstrap X to make it really an IID sample,
                         # i.e. don't condition on X throughout
        X += 0.1 * np.std(X) * np.random.standard_normal(X.shape)  # to make non-degenerate
    else:
        X = X_full.copy()

    X = X - np.mean(X, 0)[None, :]
    X = X / np.std(X, 0)[None, :]

    n, p = X.shape
    truth = np.zeros(p)
    truth[:s] = np.linspace(signal[0], signal[1], s)
    np.random.shuffle(truth)
    truth /= np.sqrt(n)
    truth *= sigma

    y = X.dot(truth) + sigma * np.random.standard_normal(n)

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    print(dispersion, sigma**2)  # diagnostic: estimated vs. true noise level
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(XTX, XTXi, sampler):

        min_success = 6
        ntries = 10

        def _alpha_grid(X, y, center, XTX):
            n, p = X.shape
            alphas, coefs, _ = lasso_path(X.copy(), y.copy(),
                                          Xy=center.copy(),
                                          precompute=XTX.copy())
            nselected = np.count_nonzero(coefs, axis=0)
            return alphas[nselected < 20]

        alpha_grid = _alpha_grid(X, y, sampler.center, XTX)
        success = np.zeros((p, alpha_grid.shape[0]))

        for _ in range(ntries):
            scale = 1.  # corresponds to sub-samples of 50%
            noisy_S = sampler(scale=scale)
            _, coefs, _ = lasso_path(X, y,
                                     Xy=noisy_S,
                                     precompute=XTX,
                                     alphas=alpha_grid)
            success += np.abs(np.sign(coefs))

        selected = np.apply_along_axis(lambda row: any(x > min_success for x in row), 1, success)
        vars = set(np.nonzero(selected)[0])
        return vars

    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi)

    # run selection algorithm

    df = full_model_inference(X, y, truth,
                              selection_algorithm,
                              splitting_sampler,
                              success_params=(6, 10),
                              B=B,
                              fit_probability=keras_fit,
                              fit_args={'epochs': 10, 'sizes': [100] * 5,
                                        'dropout': 0., 'activation': 'relu'})

    return df
def simulate(n=400, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, seed=0, B=2000):

    # description of statistical problem

    np.random.seed(seed)
    X, y, truth = gaussian_instance(n=n, p=p, s=s, equicorrelated=False, rho=0.5,
                                    sigma=sigma, signal=signal,
                                    random_signs=True, scale=False, center=False)[:3]

    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)

    def meta_algorithm(X, XTXi, resid, sampler):

        n, p = X.shape
        idx = np.random.choice(np.arange(n), 200, replace=False)

        S = sampler(scale=0.)  # deterministic with scale=0
        ynew = X.dot(XTXi).dot(S) + resid  # will be ok for n > p and non-degenerate X
        G = lasso_glmnet(X[idx], ynew[idx], *[None] * 4)
        select = G.select()
        return set(list(select[0]))

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid)

    # run selection algorithm

    df = full_model_inference(X, y, truth,
                              selection_algorithm,
                              smooth_sampler,
                              success_params=(1, 1),
                              B=B,
                              fit_probability=keras_fit,
                              fit_args={'epochs': 20, 'sizes': [100] * 5,
                                        'dropout': 0., 'activation': 'relu'})

    if df is not None:

        observed_set = list(df['variable'])
        true_target = truth[observed_set]

        np.random.seed(seed)
        X2, _, _ = gaussian_instance(n=n, p=p, s=s, equicorrelated=False, rho=0.5,
                                     sigma=sigma, signal=signal,
                                     random_signs=True, center=False, scale=False)[:3]

        stage_1 = np.random.choice(np.arange(n), 200, replace=False)
        stage_2 = sorted(set(range(n)).difference(stage_1))
        X2 = X2[stage_2]
        y2 = X2.dot(truth) + sigma * np.random.standard_normal(X2.shape[0])

        XTXi_2 = np.linalg.inv(X2.T.dot(X2))
        resid2 = y2 - X2.dot(XTXi_2.dot(X2.T.dot(y2)))
        dispersion_2 = np.linalg.norm(resid2)**2 / (X2.shape[0] - X2.shape[1])

        naive_df = naive_full_model_inference(X2, y2,
                                              dispersion_2,
                                              observed_set,
                                              alpha=alpha)

        df = pd.merge(df, naive_df, on='variable')

    return df
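# For comparison with naive_full_model_inference above: a hedged sketch of
# what naive (non-selective) intervals on the held-out stage-2 data look like.
# The helper name naive_intervals and the return format are illustrative, not
# the selectinf API.

def naive_intervals(X2, y2, observed_set, alpha=0.1):
    """Plain OLS z-intervals on independent second-stage data,
    ignoring that observed_set was chosen using stage-1 data."""
    n2, p2 = X2.shape
    XTXi_2 = np.linalg.inv(X2.T.dot(X2))
    beta2 = XTXi_2.dot(X2.T.dot(y2))
    resid2 = y2 - X2.dot(beta2)
    dispersion_2 = np.linalg.norm(resid2)**2 / (n2 - p2)
    se = np.sqrt(np.diag(XTXi_2) * dispersion_2)
    z = ndist.ppf(1 - alpha / 2)
    return {j: (beta2[j] - z * se[j], beta2[j] + z * se[j]) for j in observed_set}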
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    lam = 4. * np.sqrt(n)
    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam)

    # run selection algorithm

    df = full_model_inference(X, y, truth,
                              selection_algorithm,
                              splitting_sampler,
                              success_params=(1, 1),
                              B=B,
                              fit_probability=logit_fit,
                              fit_args={'df': 20})

    if df is not None:
        liu_df = liu_inference(X, y, lam, dispersion, truth, alpha=alpha)
        return pd.merge(df, liu_df, on='variable')
def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=2000):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n, p=p, s=s, equicorrelated=False, rho=0.5,
                                    sigma=sigma, signal=signal,
                                    random_signs=True, scale=False)[:3]

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    S = X.T.dot(y)
    covS = dispersion * XTX
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(XTX, XTXi, lam, sampler):

        p = XTX.shape[0]
        success = np.zeros(p)

        loss = rr.quadratic_loss((p,), Q=XTX)
        pen = rr.l1norm(p, lagrange=lam)

        scale = 0.
        noisy_S = sampler(scale=scale)
        loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0)
        problem = rr.simple_problem(loss, pen)
        soln = problem.solve(max_its=100, tol=1.e-10)
        success += soln != 0
        return set(np.nonzero(success)[0])

    lam = 4. * np.sqrt(n)
    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam)

    # run selection algorithm

    df = full_model_inference(X, y, truth,
                              selection_algorithm,
                              splitting_sampler,
                              success_params=(1, 1),
                              B=B,
                              fit_probability=random_forest_fit,
                              fit_args={'ntrees': 5000})

    if df is not None:  # guard: full_model_inference can return None when nothing is selected
        liu_df = liu_inference(X, y, lam, dispersion, truth, alpha=alpha)
        return pd.merge(df, liu_df, on='variable')
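# Diagnostic sketch for the merged results: plot empirical CDFs of the pivot
# columns against the Uniform(0, 1) line. The column names 'pivot' and
# 'liu_pivot' are assumptions about what full_model_inference and
# liu_inference produce; adjust to the actual merged columns.

import matplotlib.pyplot as plt

def ecdf_plot(df, columns=('pivot', 'liu_pivot')):
    """Valid selective pivots should track the diagonal."""
    grid = np.linspace(0, 1, 101)
    for col in columns:
        vals = np.asarray(df[col].dropna())
        plt.plot(grid, [(vals <= g).mean() for g in grid], label=col)
    plt.plot(grid, grid, 'k--', label='Uniform(0, 1)')
    plt.legend()
    plt.xlabel('pivot')
    plt.ylabel('ECDF')
    plt.show()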