def test_selected_targets(n=2000,
                          p=200,
                          signal_fac=1.,
                          s=5,
                          sigma=3,
                          rho=0.4,
                          randomizer_scale=1,
                          full_dispersion=True):
    """
    Compare to R randomized lasso

    Simulates a Gaussian instance, fits a randomized lasso, and runs
    selective MLE inference for the selected-variable targets.

    Returns (null p-values, alternative p-values, coverage indicators,
    confidence intervals) for the first draw with a non-empty selection.
    """

    # NOTE(review): gaussian_instance / lasso / selected_targets are
    # project helpers not visible here — behavior assumed from usage.
    inst, const = gaussian_instance, lasso.gaussian
    signal = np.sqrt(signal_fac * 2 * np.log(p))

    # redraw data until the randomized lasso selects at least one variable
    while True:
        X, Y, beta = inst(n=n,
                          p=p,
                          signal=signal,
                          s=s,
                          equicorrelated=False,
                          rho=rho,
                          sigma=sigma,
                          random_signs=True)[:3]

        # AR(rho) population covariance of the design, used only to report SNR
        idx = np.arange(p)
        sigmaX = rho**np.abs(np.subtract.outer(idx, idx))
        print("snr", beta.T.dot(sigmaX).dot(beta) / ((sigma**2.) * n))

        n, p = X.shape

        # theoretical lambda scaled by a crude noise estimate
        sigma_ = np.std(Y)
        W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_

        conv = const(X, Y, W, randomizer_scale=randomizer_scale * sigma_)

        signs = conv.fit()
        nonzero = signs != 0

        if nonzero.sum() > 0:
            dispersion = None
            if full_dispersion:
                # OLS-residual estimate of the noise variance (requires n > p)
                dispersion = np.linalg.norm(
                    Y - X.dot(np.linalg.pinv(X).dot(Y)))**2 / (n - p)

            (observed_target,
             cov_target,
             cov_target_score,
             alternatives) = selected_targets(conv.loglike,
                                              conv._W,
                                              nonzero,
                                              dispersion=dispersion)

            estimate, _, _, pval, intervals, _ = conv.selective_MLE(
                observed_target, cov_target, cov_target_score, alternatives)

            # projected ("selected") target: best linear approximation of the
            # mean using the selected columns
            beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))
            coverage = (beta_target > intervals[:, 0]) * (beta_target <
                                                          intervals[:, 1])

            # split p-values into nulls and alternatives for power summaries
            return pval[beta[nonzero] == 0], pval[
                beta[nonzero] != 0], coverage, intervals
def compare_methods(n=500,
                    p=100,
                    nval=500,
                    rho=0.35,
                    s=5,
                    beta_type=1,
                    snr=0.20,
                    target="selected",
                    randomizer_scale=np.sqrt(0.50),
                    full_dispersion=True,
                    tuning_rand="lambda.theory"):
    """
    Compare selective-MLE inference with the "inference_new" (uni) method
    after a randomized lasso on one simulated dataset.

    Returns a stacked column vector: MLE metrics, uni metrics, and the
    number of screening failures (empty selections).

    Fix: ``np.bool`` (alias removed in NumPy 1.24) replaced by builtin ``bool``.
    """

    X, y, _, _, Sigma, beta, sigma = sim_xy(n=n,
                                            p=p,
                                            nval=nval,
                                            rho=rho,
                                            s=s,
                                            beta_type=beta_type,
                                            snr=snr)
    print("snr", snr)

    # standardize the design and center the response
    X -= X.mean(0)[None, :]
    X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.)))
    y = y - y.mean()
    true_set = np.asarray([u for u in range(p) if beta[u] != 0])

    if full_dispersion:
        # OLS-residual estimate of noise variance (requires n > p)
        dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y)))**2 / (
            n - p)
        sigma_ = np.sqrt(dispersion)
    else:
        dispersion = None
        sigma_ = np.std(y)
    print("estimated and true sigma", sigma, sigma_)

    # theoretical lambda: E[max_j |X^T eps|] estimated by Monte Carlo
    lam_theory = sigma_ * 1. * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0))
    randomized_lasso = lasso.gaussian(X,
                                      y,
                                      feature_weights=lam_theory * np.ones(p),
                                      randomizer_scale=np.sqrt(n) *
                                      randomizer_scale * sigma_)
    signs = randomized_lasso.fit()
    nonzero = signs != 0
    sys.stderr.write("active variables selected by randomized LASSO " +
                     str(nonzero.sum()) + "\n" + "\n")

    active_set_rand = np.asarray([t for t in range(p) if nonzero[t]])
    # True where a selected variable is a true signal.
    # `bool` replaces removed alias `np.bool`.
    active_rand_bool = np.asarray(
        [(np.in1d(active_set_rand[x], true_set).sum() > 0)
         for x in range(nonzero.sum())], bool)
    nreport = 0.

    if nonzero.sum() > 0:
        if target == "full":
            target_randomized = beta[nonzero]
            (observed_target,
             cov_target,
             cov_target_score,
             alternatives) = full_targets(randomized_lasso.loglike,
                                          randomized_lasso._W,
                                          nonzero,
                                          dispersion=dispersion)
        elif target == "selected":
            # projection of the true mean onto the selected columns
            target_randomized = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))
            (observed_target,
             cov_target,
             cov_target_score,
             alternatives) = selected_targets(randomized_lasso.loglike,
                                              randomized_lasso._W,
                                              nonzero,
                                              dispersion=dispersion)
        else:
            raise ValueError('not a valid specification of target')

        # --- selective MLE inference (timed) ---
        toc = time.time()
        MLE_estimate, _, _, MLE_pval, MLE_intervals, ind_unbiased_estimator = randomized_lasso.selective_MLE(
            observed_target, cov_target, cov_target_score, alternatives)
        tic = time.time()
        time_MLE = tic - toc

        cov_MLE, selective_MLE_power = coverage(MLE_intervals, MLE_pval,
                                                target_randomized,
                                                beta[nonzero])
        length_MLE = np.mean(MLE_intervals[:, 1] - MLE_intervals[:, 0])
        # power: true signals whose interval excludes 0
        power_MLE = ((active_rand_bool) * (np.logical_or(
            (0. < MLE_intervals[:, 0]),
            (0. > MLE_intervals[:, 1])))).sum() / float((beta != 0).sum())
        MLE_discoveries = BHfilter(MLE_pval, q=0.1)
        power_MLE_BH = (MLE_discoveries * active_rand_bool).sum() / float(
            (beta != 0).sum())
        fdr_MLE_BH = (MLE_discoveries * ~active_rand_bool).sum() / float(
            max(MLE_discoveries.sum(), 1.))
        bias_MLE = np.mean(MLE_estimate - target_randomized)

        # --- "inference_new" (uni) method (timed) ---
        toc = time.time()
        intervals_uni, pvalue_uni = randomized_lasso.inference_new(
            observed_target, cov_target, cov_target_score, alternatives)
        tic = time.time()
        time_uni = tic - toc

        intervals_uni = intervals_uni.T
        cov_uni, selective_uni_power = coverage(intervals_uni, pvalue_uni,
                                                target_randomized,
                                                beta[nonzero])
        length_uni = np.mean(intervals_uni[:, 1] - intervals_uni[:, 0])
        power_uni = ((active_rand_bool) * (np.logical_or(
            (0. < intervals_uni[:, 0]),
            (0. > intervals_uni[:, 1])))).sum() / float((beta != 0).sum())
        uni_discoveries = BHfilter(pvalue_uni, q=0.1)
        power_uni_BH = (uni_discoveries * active_rand_bool).sum() / float(
            (beta != 0).sum())
        fdr_uni_BH = (uni_discoveries * ~active_rand_bool).sum() / float(
            max(uni_discoveries.sum(), 1.))
        bias_randLASSO = np.mean(randomized_lasso.initial_soln[nonzero] -
                                 target_randomized)
    else:
        # nothing selected: report zeros and count the failure
        nreport += 1
        cov_MLE, length_MLE, power_MLE, power_MLE_BH, fdr_MLE_BH, bias_MLE, selective_MLE_power, time_MLE = [
            0., 0., 0., 0., 0., 0., 0., 0.
        ]
        cov_uni, length_uni, power_uni, power_uni_BH, fdr_uni_BH, bias_randLASSO, selective_uni_power, time_uni = [
            0., 0., 0., 0., 0., 0., 0., 0.
        ]
        MLE_discoveries = np.zeros(1)
        uni_discoveries = np.zeros(1)

    MLE_inf = np.vstack(
        (cov_MLE, length_MLE, 0., nonzero.sum(), bias_MLE,
         selective_MLE_power, time_MLE, power_MLE, power_MLE_BH, fdr_MLE_BH,
         MLE_discoveries.sum()))
    uni_inf = np.vstack(
        (cov_uni, length_uni, 0., nonzero.sum(), bias_randLASSO,
         selective_uni_power, time_uni, power_uni, power_uni_BH, fdr_uni_BH,
         uni_discoveries.sum()))
    return np.vstack((MLE_inf, uni_inf, nreport))
def risk_comparison(n=500,
                    p=100,
                    nval=500,
                    rho=0.35,
                    s=5,
                    beta_type=1,
                    snr=0.20,
                    randomizer_scale=np.sqrt(0.50),
                    full_dispersion=False,
                    tuning_nonrand="lambda.min",
                    tuning_rand="lambda.1se",
                    ndraw=50):
    """
    Average relative risks of six estimators over ``ndraw`` simulations:
    selective MLE, independent unbiased estimator, randomized lasso,
    relaxed randomized lasso, relaxed (glmnet) lasso, and glmnet lasso.

    Returns the (6, 1) array of average risks.

    Fix: when ``full_dispersion=True``, ``_sigma_`` was referenced
    (``lam_theory``) but never assigned in that branch, raising NameError;
    it is now set to the dispersion-based estimate.
    """

    risks = np.zeros((6, 1))

    for i in range(ndraw):
        X, y, _, _, Sigma, beta, sigma = sim_xy(n=n,
                                                p=p,
                                                nval=nval,
                                                rho=rho,
                                                s=s,
                                                beta_type=beta_type,
                                                snr=snr)
        print("snr", snr)
        # standardize design, center response
        X -= X.mean(0)[None, :]
        X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.)))
        y = y - y.mean()

        if full_dispersion:
            print("shapes", y.shape,
                  (np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y)))**2).shape)
            dispersion = np.linalg.norm(
                y - X.dot(np.linalg.pinv(X).dot(y)))**2 / (n - p)
            sigma_ = np.sqrt(dispersion)
            # FIX: previously undefined in this branch (NameError below)
            _sigma_ = sigma_
        else:
            dispersion = None
            _sigma_ = np.std(y)

        # theoretical lambda from Monte-Carlo estimate of E[max_j |X^T eps|]
        lam_theory = _sigma_ * 1. * np.mean(
            np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0))
        glm_LASSO_theory, glm_LASSO_1se, glm_LASSO_min, lam_min, lam_1se = glmnet_lasso(
            X, y, lam_theory / float(n))

        if full_dispersion is False:
            dispersion = None
            # refine the noise estimate using the lambda.min active set
            active_min = (glm_LASSO_min != 0)
            if active_min.sum() > 0:
                sigma_ = np.sqrt(
                    np.linalg.norm(y - X[:, active_min].dot(
                        np.linalg.pinv(X[:, active_min]).dot(y)))**2 /
                    (n - active_min.sum()))
            else:
                sigma_ = _sigma_
        print("true and estimated sigma", sigma, _sigma_, sigma_)

        # choose the non-randomized lasso fit by tuning rule
        if tuning_nonrand == "lambda.min":
            lam_LASSO = lam_min
            glm_LASSO = glm_LASSO_min
        elif tuning_nonrand == "lambda.1se":
            lam_LASSO = lam_1se
            glm_LASSO = glm_LASSO_1se
        else:
            lam_LASSO = lam_theory / float(n)
            glm_LASSO = glm_LASSO_theory

        active_LASSO = (glm_LASSO != 0)
        rel_LASSO = np.zeros(p)
        if active_LASSO.sum() > 0:
            # relaxed lasso: OLS refit on the active set
            post_LASSO_OLS = np.linalg.pinv(X[:, active_LASSO]).dot(y)
            rel_LASSO[active_LASSO] = post_LASSO_OLS

        # randomized lasso with the selected tuning rule
        if tuning_rand == "lambda.min":
            randomized_lasso = lasso.gaussian(
                X,
                y,
                feature_weights=n * lam_min * np.ones(p),
                randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
        elif tuning_rand == "lambda.1se":
            randomized_lasso = lasso.gaussian(
                X,
                y,
                feature_weights=n * lam_1se * np.ones(p),
                randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
        else:
            randomized_lasso = lasso.gaussian(
                X,
                y,
                feature_weights=lam_theory * np.ones(p),
                randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)

        signs = randomized_lasso.fit()
        nonzero = signs != 0

        sel_MLE = np.zeros(p)
        ind_est = np.zeros(p)
        randomized_lasso_est = np.zeros(p)
        randomized_rel_lasso_est = np.zeros(p)

        if nonzero.sum() > 0:
            target_randomized = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))
            (observed_target,
             cov_target,
             cov_target_score,
             alternatives) = selected_targets(randomized_lasso.loglike,
                                              randomized_lasso._W,
                                              nonzero,
                                              dispersion=dispersion)
            MLE_estimate, _, _, _, _, ind_unbiased_estimator = randomized_lasso.selective_MLE(
                observed_target, cov_target, cov_target_score, alternatives)
            sel_MLE[nonzero] = MLE_estimate
            ind_est[nonzero] = ind_unbiased_estimator
            randomized_lasso_est = randomized_lasso.initial_soln
            randomized_rel_lasso_est = randomized_lasso._beta_full

        risks += np.vstack(
            (relative_risk(sel_MLE, beta, Sigma),
             relative_risk(ind_est, beta, Sigma),
             relative_risk(randomized_lasso_est, beta, Sigma),
             relative_risk(randomized_rel_lasso_est, beta, Sigma),
             relative_risk(rel_LASSO, beta, Sigma),
             relative_risk(glm_LASSO, beta, Sigma)))
        print("risks so far", risks / (i + 1))

    return risks / ndraw
def test_randomized_slope(n=500,
                          p=100,
                          signal_fac=1.3,
                          s=5,
                          sigma=3.,
                          rho=0.35,
                          randomizer_scale=np.sqrt(1.),
                          target="selected",
                          use_MLE=True):
    """
    Fit randomized SLOPE (weights from the R `SLOPE` package via slope_R)
    and run selective inference (MLE or sampler summary) on the first draw
    with a non-empty selection.

    Returns (null p-values, alternative p-values, coverage, intervals).

    Cleanup: hoisted loop-invariant `inst`/`signal` out of the retry loop
    and removed the dead ``if True:`` guard before the return.
    """

    inst = gaussian_instance
    signal = np.sqrt(signal_fac * 2. * np.log(p))

    # redraw data until SLOPE selects at least one variable
    while True:
        X, Y, beta = inst(n=n,
                          p=p,
                          signal=signal,
                          s=s,
                          equicorrelated=False,
                          rho=rho,
                          sigma=sigma,
                          random_signs=True)[:3]

        # rescale the response to (estimated) unit noise
        sigma_ = np.sqrt(
            np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y)))**2 / (n - p))
        Y /= sigma_

        r_beta, r_E, r_lambda_seq, r_sigma = slope_R(
            X,
            Y,
            W=None,
            normalize=True,
            choice_weights="gaussian",  # put gaussian
            sigma=1.)

        conv = slope.gaussian(X,
                              Y,
                              r_sigma * r_lambda_seq,
                              sigma=1.,
                              randomizer_scale=randomizer_scale * 1.)

        signs = conv.fit()
        nonzero = signs != 0
        print("dimensions", n, p, nonzero.sum())

        if nonzero.sum() > 0:
            if target == 'full':
                (observed_target,
                 cov_target,
                 cov_target_score,
                 alternatives) = full_targets(conv.loglike,
                                              conv._W,
                                              nonzero,
                                              dispersion=1.)
            elif target == 'selected':
                (observed_target,
                 cov_target,
                 cov_target_score,
                 alternatives) = selected_targets(conv.loglike,
                                                  conv._W,
                                                  nonzero,
                                                  dispersion=1.)

            # targets live on the rescaled (Y / sigma_) scale
            if target == "selected":
                beta_target = np.linalg.pinv(X[:, nonzero]).dot(
                    X.dot(beta)) / sigma_
            else:
                beta_target = beta[nonzero] / sigma_

            if use_MLE:
                estimate, _, _, pval, intervals, _ = conv.selective_MLE(
                    observed_target, cov_target, cov_target_score,
                    alternatives)
            else:
                # sampler-based summary as the alternative inference route
                _, pval, intervals = conv.summary(observed_target,
                                                  cov_target,
                                                  cov_target_score,
                                                  alternatives,
                                                  compute_intervals=True)

            coverage = (beta_target > intervals[:, 0]) * (beta_target <
                                                          intervals[:, 1])
            break

    return pval[beta_target == 0], pval[
        beta_target != 0], coverage, intervals
def comparison_cvmetrics_selected(n=500,
                                  p=100,
                                  nval=500,
                                  rho=0.35,
                                  s=5,
                                  beta_type=1,
                                  snr=0.20,
                                  randomizer_scale=np.sqrt(0.50),
                                  full_dispersion=True,
                                  tuning_nonrand="lambda.min",
                                  tuning_rand="lambda.1se"):
    """
    One-replication comparison of Lee et al. post-selection inference,
    naive OLS inference, and randomized-lasso selective MLE, for targets
    selected by cross-validated glmnet lasso fits.

    Returns risks, per-method inference summaries, partial risks, and
    failure counts stacked into one column array.

    Fixes: ``np.bool`` (alias removed in NumPy 1.24) replaced by builtin
    ``bool``; two-sided naive p-value corrected to ``2*(1 - Phi(|t|))``
    (matching `multiple_runs_lasso` / `test_marginal_slope`), previously
    ``2*Phi(|t|)`` which is >= 1.
    """

    X, y, _, _, Sigma, beta, sigma = sim_xy(n=n,
                                            p=p,
                                            nval=nval,
                                            rho=rho,
                                            s=s,
                                            beta_type=beta_type,
                                            snr=snr)
    true_mean = X.dot(beta)
    print("snr", snr)
    # standardize design, center response
    X -= X.mean(0)[None, :]
    X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.)))
    y = y - y.mean()
    true_set = np.asarray([u for u in range(p) if beta[u] != 0])

    if full_dispersion:
        dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y)))**2 / (
            n - p)
        sigma_ = np.sqrt(dispersion)
    else:
        dispersion = None
        sigma_ = np.std(y)
    print("estimated and true sigma", sigma, sigma_)

    lam_theory = sigma_ * 1. * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0))
    glm_LASSO_theory, glm_LASSO_1se, glm_LASSO_min, lam_min, lam_1se = glmnet_lasso(
        X, y, lam_theory / float(n))

    # pick the non-randomized lasso fit according to the tuning rule
    if tuning_nonrand == "lambda.min":
        lam_LASSO = lam_min
        glm_LASSO = glm_LASSO_min
    elif tuning_nonrand == "lambda.1se":
        lam_LASSO = lam_1se
        glm_LASSO = glm_LASSO_1se
    else:
        lam_LASSO = lam_theory / float(n)
        glm_LASSO = glm_LASSO_theory

    active_LASSO = (glm_LASSO != 0)
    nactive_LASSO = active_LASSO.sum()
    active_set_LASSO = np.asarray([r for r in range(p) if active_LASSO[r]])
    # `bool` replaces removed alias `np.bool`
    active_LASSO_bool = np.asarray(
        [(np.in1d(active_set_LASSO[z], true_set).sum() > 0)
         for z in range(nactive_LASSO)], bool)

    rel_LASSO = np.zeros(p)
    Lee_nreport = 0
    bias_Lee = 0.
    bias_naive = 0.

    if nactive_LASSO > 0:
        # relaxed lasso: OLS refit on the cv-selected set
        post_LASSO_OLS = np.linalg.pinv(X[:, active_LASSO]).dot(y)
        rel_LASSO[active_LASSO] = post_LASSO_OLS
        Lee_target = np.linalg.pinv(X[:, active_LASSO]).dot(X.dot(beta))
        # Lee et al. inference via the R selectiveInference package
        Lee_intervals, Lee_pval = selInf_R(X,
                                           y,
                                           glm_LASSO,
                                           n * lam_LASSO,
                                           sigma_,
                                           Type=0,
                                           alpha=0.1)

        if (Lee_pval.shape[0] == Lee_target.shape[0]):

            cov_Lee, selective_Lee_power = coverage(Lee_intervals, Lee_pval,
                                                    Lee_target,
                                                    beta[active_LASSO])
            # Lee intervals can be infinite; average length over finite ones
            inf_entries_bool = np.isinf(Lee_intervals[:, 1] -
                                        Lee_intervals[:, 0])
            inf_entries = np.mean(inf_entries_bool)
            if inf_entries == 1.:
                length_Lee = 0.
            else:
                length_Lee = np.mean(
                    (Lee_intervals[:, 1] -
                     Lee_intervals[:, 0])[~inf_entries_bool])
            power_Lee = ((active_LASSO_bool) *
                         (np.logical_or((0. < Lee_intervals[:, 0]),
                                        (0. > Lee_intervals[:, 1])))) \
                            .sum() / float((beta != 0).sum())
            Lee_discoveries = BHfilter(Lee_pval, q=0.1)
            power_Lee_BH = (Lee_discoveries *
                            active_LASSO_bool).sum() / float(
                                (beta != 0).sum())
            fdr_Lee_BH = (Lee_discoveries *
                          ~active_LASSO_bool).sum() / float(
                              max(Lee_discoveries.sum(), 1.))
            bias_Lee = np.mean(glm_LASSO[active_LASSO] - Lee_target)

            # naive (non-selective) OLS inference on the same target
            naive_sd = sigma_ * np.sqrt(
                np.diag((np.linalg.inv(X[:, active_LASSO].T.dot(
                    X[:, active_LASSO])))))
            naive_intervals = np.vstack([
                post_LASSO_OLS - 1.65 * naive_sd,
                post_LASSO_OLS + 1.65 * naive_sd
            ]).T
            # FIX: two-sided p-value is 2*(1 - Phi(|t|)); was 2*Phi(|t|) >= 1
            naive_pval = 2 * (1. - ndist.cdf(np.abs(post_LASSO_OLS) / naive_sd))
            cov_naive, selective_naive_power = coverage(
                naive_intervals, naive_pval, Lee_target, beta[active_LASSO])
            length_naive = np.mean(naive_intervals[:, 1] -
                                   naive_intervals[:, 0])
            power_naive = ((active_LASSO_bool) * (np.logical_or(
                (0. < naive_intervals[:, 0]),
                (0. > naive_intervals[:, 1])))).sum() / float(
                    (beta != 0).sum())
            naive_discoveries = BHfilter(naive_pval, q=0.1)
            power_naive_BH = (naive_discoveries *
                              active_LASSO_bool).sum() / float(
                                  (beta != 0).sum())
            fdr_naive_BH = (naive_discoveries *
                            ~active_LASSO_bool).sum() / float(
                                max(naive_discoveries.sum(), 1.))
            bias_naive = np.mean(rel_LASSO[active_LASSO] - Lee_target)

            partial_Lasso_risk = (glm_LASSO[active_LASSO] -
                                  Lee_target).T.dot(glm_LASSO[active_LASSO] -
                                                    Lee_target)
            partial_relLasso_risk = (post_LASSO_OLS -
                                     Lee_target).T.dot(post_LASSO_OLS -
                                                       Lee_target)
        else:
            # selInf_R returned an inconsistent number of p-values
            Lee_nreport = 1
            cov_Lee, length_Lee, inf_entries, power_Lee, power_Lee_BH, fdr_Lee_BH, selective_Lee_power = [
                0., 0., 0., 0., 0., 0., 0.
            ]
            cov_naive, length_naive, power_naive, power_naive_BH, fdr_naive_BH, selective_naive_power = [
                0., 0., 0., 0., 0., 0.
            ]
            naive_discoveries = np.zeros(1)
            Lee_discoveries = np.zeros(1)
            partial_Lasso_risk, partial_relLasso_risk = [0., 0.]

    elif nactive_LASSO == 0:
        # cv lasso selected nothing: report zeros
        Lee_nreport = 1
        cov_Lee, length_Lee, inf_entries, power_Lee, power_Lee_BH, fdr_Lee_BH, selective_Lee_power = [
            0., 0., 0., 0., 0., 0., 0.
        ]
        cov_naive, length_naive, power_naive, power_naive_BH, fdr_naive_BH, selective_naive_power = [
            0., 0., 0., 0., 0., 0.
        ]
        naive_discoveries = np.zeros(1)
        Lee_discoveries = np.zeros(1)
        partial_Lasso_risk, partial_relLasso_risk = [0., 0.]

    # --- randomized lasso branch ---
    if tuning_rand == "lambda.min":
        randomized_lasso = lasso.gaussian(
            X,
            y,
            feature_weights=n * lam_min * np.ones(p),
            randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
    elif tuning_rand == "lambda.1se":
        randomized_lasso = lasso.gaussian(
            X,
            y,
            feature_weights=n * lam_1se * np.ones(p),
            randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
    else:
        randomized_lasso = lasso.gaussian(
            X,
            y,
            feature_weights=lam_theory * np.ones(p),
            randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
    signs = randomized_lasso.fit()
    nonzero = signs != 0
    active_set_rand = np.asarray([t for t in range(p) if nonzero[t]])
    # `bool` replaces removed alias `np.bool`
    active_rand_bool = np.asarray(
        [(np.in1d(active_set_rand[x], true_set).sum() > 0)
         for x in range(nonzero.sum())], bool)

    sel_MLE = np.zeros(p)
    ind_est = np.zeros(p)
    randomized_lasso_est = np.zeros(p)
    randomized_rel_lasso_est = np.zeros(p)
    MLE_nreport = 0

    sys.stderr.write("active variables selected by cv LASSO " +
                     str(nactive_LASSO) + "\n")
    sys.stderr.write("active variables selected by randomized LASSO " +
                     str(nonzero.sum()) + "\n" + "\n")

    if nonzero.sum() > 0:
        target_randomized = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))
        (observed_target,
         cov_target,
         cov_target_score,
         alternatives) = selected_targets(randomized_lasso.loglike,
                                          randomized_lasso._W,
                                          nonzero,
                                          dispersion=dispersion)
        MLE_estimate, _, _, MLE_pval, MLE_intervals, ind_unbiased_estimator = randomized_lasso.selective_MLE(
            observed_target, cov_target, cov_target_score, alternatives)
        sel_MLE[nonzero] = MLE_estimate
        ind_est[nonzero] = ind_unbiased_estimator
        randomized_lasso_est = randomized_lasso.initial_soln
        randomized_rel_lasso_est = randomized_lasso._beta_full

        cov_MLE, selective_MLE_power = coverage(MLE_intervals, MLE_pval,
                                                target_randomized,
                                                beta[nonzero])
        length_MLE = np.mean(MLE_intervals[:, 1] - MLE_intervals[:, 0])
        power_MLE = ((active_rand_bool) * (np.logical_or(
            (0. < MLE_intervals[:, 0]),
            (0. > MLE_intervals[:, 1])))).sum() / float((beta != 0).sum())
        MLE_discoveries = BHfilter(MLE_pval, q=0.1)
        power_MLE_BH = (MLE_discoveries * active_rand_bool).sum() / float(
            (beta != 0).sum())
        fdr_MLE_BH = (MLE_discoveries * ~active_rand_bool).sum() / float(
            max(MLE_discoveries.sum(), 1.))
        bias_MLE = np.mean(MLE_estimate - target_randomized)

        partial_MLE_risk = (MLE_estimate -
                            target_randomized).T.dot(MLE_estimate -
                                                     target_randomized)
        partial_ind_risk = (ind_unbiased_estimator -
                            target_randomized).T.dot(ind_unbiased_estimator -
                                                     target_randomized)
        partial_randLasso_risk = (
            randomized_lasso_est[nonzero] -
            target_randomized).T.dot(randomized_lasso_est[nonzero] -
                                     target_randomized)
        partial_relrandLasso_risk = (
            randomized_rel_lasso_est[nonzero] -
            target_randomized).T.dot(randomized_rel_lasso_est[nonzero] -
                                     target_randomized)
    else:
        MLE_nreport = 1
        cov_MLE, length_MLE, power_MLE, power_MLE_BH, fdr_MLE_BH, bias_MLE, selective_MLE_power = [
            0., 0., 0., 0., 0., 0., 0.
        ]
        MLE_discoveries = np.zeros(1)
        partial_MLE_risk, partial_ind_risk, partial_randLasso_risk, partial_relrandLasso_risk = [
            0., 0., 0., 0.
        ]

    risks = np.vstack((relative_risk(sel_MLE, beta, Sigma),
                       relative_risk(ind_est, beta, Sigma),
                       relative_risk(randomized_lasso_est, beta, Sigma),
                       relative_risk(randomized_rel_lasso_est, beta, Sigma),
                       relative_risk(rel_LASSO, beta, Sigma),
                       relative_risk(glm_LASSO, beta, Sigma)))
    partial_risks = np.vstack(
        (partial_MLE_risk, partial_ind_risk, partial_randLasso_risk,
         partial_relrandLasso_risk, partial_relLasso_risk,
         partial_Lasso_risk))

    naive_inf = np.vstack(
        (cov_naive, length_naive, 0., nactive_LASSO, bias_naive,
         selective_naive_power, power_naive, power_naive_BH, fdr_naive_BH,
         naive_discoveries.sum()))
    Lee_inf = np.vstack(
        (cov_Lee, length_Lee, inf_entries, nactive_LASSO, bias_Lee,
         selective_Lee_power, power_Lee, power_Lee_BH, fdr_Lee_BH,
         Lee_discoveries.sum()))
    # placeholder so the output layout matches sibling comparison functions
    Liu_inf = np.zeros((10, 1))
    MLE_inf = np.vstack(
        (cov_MLE, length_MLE, 0., nonzero.sum(), bias_MLE,
         selective_MLE_power, power_MLE, power_MLE_BH, fdr_MLE_BH,
         MLE_discoveries.sum()))
    nreport = np.vstack((Lee_nreport, 0., MLE_nreport))

    return np.vstack(
        (risks, naive_inf, Lee_inf, Liu_inf, MLE_inf, partial_risks, nreport))
def test_marginal_slope(n=3000,
                        p=1000,
                        signal_fac=1.5,
                        s=30,
                        sigma=2.,
                        rho=0.20,
                        randomizer_scale=np.sqrt(0.5),
                        split_proportion=0.67,
                        target="selected"):
    """
    Two-stage selection (randomized marginal screening, then randomized
    SLOPE) with two-stage selective MLE inference, compared against a
    data-splitting pipeline and naive OLS intervals.

    Returns coverage/length/power summaries for the adjusted, naive and
    split procedures plus the adjusted FDR proxy.

    Fix: ``np.zeros(n, np.bool)`` used the NumPy alias removed in 1.24;
    replaced with builtin ``bool``. Dead commented-out code removed.
    """

    inst = gaussian_instance
    signal = np.sqrt(signal_fac * 2. * np.log(p))
    X, y, beta = inst(n=n,
                      p=p,
                      signal=signal,
                      s=s,
                      equicorrelated=False,
                      rho=rho,
                      sigma=sigma,
                      random_signs=True)[:3]

    # rescale response to (estimated) unit noise; targets are reported on
    # this rescaled scale and lengths are multiplied back by sigma_ on return
    sigma_ = np.sqrt(
        np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y)))**2 / (n - p))
    Y = y / sigma_

    # --- stage 1: randomized marginal screening at level 0.1 ---
    score = X.T.dot(Y)
    omega = randomization.isotropic_gaussian(
        (p,), randomizer_scale * sigma_).sample()
    W = X.T.dot(X)
    marginal_select = marginal_screening.type1(score,
                                               W,
                                               0.1,
                                               randomizer_scale,
                                               useC=True,
                                               perturb=omega)
    boundary, cond_mean_1, cond_cov_1, affine_con_1, logdens_linear_1, initial_soln_1 = marginal_select.fit()
    nonzero = boundary != 0
    first_selected = np.asarray([t for t in range(p) if nonzero[t]])
    X_tilde = X[:, nonzero]

    # --- stage 2: randomized SLOPE on the screened design ---
    r_beta, r_E, r_lambda_seq, r_sigma = slope_R(
        X_tilde,
        Y,
        W=None,
        normalize=True,
        choice_weights="gaussian",  # put gaussian
        sigma=1.)
    conv = slope.gaussian(X_tilde,
                          Y,
                          r_sigma * r_lambda_seq,
                          sigma=1.,
                          randomizer_scale=randomizer_scale * 1.)
    signs, cond_mean_2, cond_cov_2, affine_con_2, logdens_linear_2, initial_soln_2 = conv.fit()
    nonzero_slope = signs != 0
    second_selected = np.asarray(
        [s for s in range(nonzero.sum()) if nonzero_slope[s]])

    # --- data-splitting benchmark ---
    subsample_size = int(split_proportion * n)
    sel_idx = np.zeros(n, bool)  # FIX: `bool` replaces removed `np.bool`
    sel_idx[:subsample_size] = 1
    np.random.shuffle(sel_idx)
    inf_idx = ~sel_idx
    Y_inf = Y[inf_idx]
    X_inf = X[inf_idx, :]
    Y_sel = Y[sel_idx]
    X_sel = X[sel_idx, :]

    # non-randomized marginal screening on the selection half
    score_split = X_sel.T.dot(Y_sel)
    stdev_split = np.sqrt(np.diag(X_sel.T.dot(X_sel)))
    threshold_split = stdev_split * ndist.ppf(1. - 0.1 / 2.)
    boundary_split = np.fabs(score_split) >= threshold_split
    nonzero_split = boundary_split != 0
    first_selected_split = np.asarray(
        [u for u in range(p) if nonzero_split[u]])
    X_tilde_sel = X_sel[:, nonzero_split]

    # non-randomized SLOPE on the selection half
    r_beta_split, r_E_split, r_lambda_seq_split, r_sigma_split = slope_R(
        X_tilde_sel,
        Y_sel,
        W=None,
        normalize=True,
        choice_weights="gaussian",
        sigma=1.)
    nonzero_slope_split = (r_beta_split != 0)
    second_selected_split = np.asarray(
        [r for r in range(nonzero_split.sum()) if nonzero_slope_split[r]])

    print("compare dimensions- ms ", nonzero.sum(), nonzero_split.sum())
    print("compare dimensions- slope ", nonzero_slope.sum(),
          nonzero_slope_split.sum())

    # inference on the held-out half for the split pipeline
    split_cols = first_selected_split[second_selected_split]
    beta_target_split = np.linalg.pinv(X_inf[:, split_cols]).dot(
        X_inf[:, first_selected_split].dot(beta[nonzero_split])) / sigma_
    post_split_OLS = np.linalg.pinv(X_inf[:, split_cols]).dot(Y_inf)
    naive_split_sd = np.sqrt(
        np.diag((np.linalg.inv(X_inf[:, split_cols].T.dot(
            X_inf[:, split_cols])))))
    intervals_split = np.vstack([
        post_split_OLS - 1.65 * naive_split_sd,
        post_split_OLS + 1.65 * naive_split_sd
    ]).T
    coverage_split = (beta_target_split > intervals_split[:, 0]) * (
        beta_target_split < intervals_split[:, 1])
    length_split = intervals_split[:, 1] - intervals_split[:, 0]
    pval_split = 2 * (1. - ndist.cdf(np.abs(post_split_OLS) / naive_split_sd))
    pval_alt_split = (pval_split[beta[split_cols] != 0]) < 0.1
    if pval_alt_split.sum() > 0:
        power_split = np.mean(pval_alt_split)
    else:
        power_split = 0.

    # --- two-stage adjusted inference ---
    if target == "selected":
        _, _, cov_target_score_1, _ = marginal_select.multivariate_targets(
            first_selected[second_selected])
        (observed_target,
         cov_target,
         cov_target_score_2,
         alternatives) = selected_targets(conv.loglike,
                                          conv._W,
                                          nonzero_slope,
                                          dispersion=1.)
        beta_target = np.linalg.pinv(X_tilde[:, nonzero_slope]).dot(
            X_tilde.dot(beta[nonzero])) / sigma_
    elif target == "full":
        _, _, cov_target_score_1, _ = marginal_select.marginal_targets(
            first_selected[second_selected])
        (observed_target,
         cov_target,
         cov_target_score_2,
         alternatives) = full_targets(conv.loglike,
                                      conv._W,
                                      nonzero_slope,
                                      dispersion=1.)
        beta_target = beta[first_selected[second_selected]] / sigma_

    estimate, _, _, pval, intervals, _ = twostage_selective_MLE(
        observed_target, cov_target, cov_target_score_1, cov_target_score_2,
        initial_soln_1, initial_soln_2, cond_mean_1, cond_mean_2, cond_cov_1,
        cond_cov_2, logdens_linear_1, logdens_linear_2,
        affine_con_1.linear_part, affine_con_2.linear_part,
        affine_con_1.offset, affine_con_2.offset,
        solve_args={'tol': 1.e-12},
        level=0.9)

    pval_alt = (pval[beta[first_selected[second_selected]] != 0]) < 0.1
    if pval_alt.sum() > 0:
        power_adjusted = np.mean(pval_alt)
    else:
        power_adjusted = 0.
    fdr = ((pval[beta[first_selected[second_selected]] == 0]) <
           0.1).sum() / float((pval < 0.1).sum())

    coverage_adjusted = (beta_target > intervals[:, 0]) * (beta_target <
                                                           intervals[:, 1])
    length_adjusted = intervals[:, 1] - intervals[:, 0]

    # naive OLS intervals that ignore the two-stage selection
    post_sel_OLS = np.linalg.pinv(X_tilde[:, nonzero_slope]).dot(Y)
    naive_sd = np.sqrt(
        np.diag((np.linalg.inv(X_tilde[:, nonzero_slope].T.dot(
            X_tilde[:, nonzero_slope])))))
    intervals_naive = np.vstack(
        [post_sel_OLS - 1.65 * naive_sd, post_sel_OLS + 1.65 * naive_sd]).T
    coverage_naive = (beta_target > intervals_naive[:, 0]) * (
        beta_target < intervals_naive[:, 1])
    length_naive = intervals_naive[:, 1] - intervals_naive[:, 0]

    return coverage_adjusted, sigma_ * length_adjusted, power_adjusted, coverage_naive, sigma_ * length_naive, \
           coverage_split, sigma_ * length_split, power_split, fdr
def test_selected_targets(n=100,
                          p=500,
                          signal_fac=0.2,
                          s=10,
                          sigma=3.,
                          rho=0.4,
                          randomizer_scale=1.):
    """
    Compare to R randomized lasso

    Draws Gaussian instances until the randomized lasso selects at least
    one variable, then tests a single random row-contrast of the selected
    model via the selective MLE, returning (coverage, pivot).
    """

    instance, lasso_class = gaussian_instance, lasso.gaussian
    signal_size = np.sqrt(signal_fac * 2 * np.log(p))

    while True:
        X, Y, beta = instance(n=n,
                              p=p,
                              signal=signal_size,
                              s=s,
                              equicorrelated=False,
                              rho=rho,
                              sigma=sigma,
                              random_signs=True)[:3]

        # AR(rho) population covariance, used only for the SNR printout
        indices = np.arange(p)
        design_cov = rho ** np.abs(np.subtract.outer(indices, indices))
        print("snr", beta.T.dot(design_cov).dot(beta) / ((sigma ** 2.) * n))

        n, p = X.shape

        noise_scale = np.std(Y)
        feature_weights = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * noise_scale

        fitted = lasso_class(X, Y, feature_weights,
                             randomizer_scale=randomizer_scale * noise_scale)

        active = fitted.fit() != 0
        if active.sum() == 0:
            # empty selection: redraw the data
            continue

        dispersion = None

        (observed_target,
         cov_target,
         cov_target_score,
         alternatives) = selected_targets(fitted.loglike,
                                          fitted._W,
                                          active,
                                          dispersion=dispersion)

        estimate, observed_info_mean, _, _, _, _ = fitted.selective_MLE(
            observed_target, cov_target, cov_target_score, alternatives)

        # pick one observation at random and test the corresponding
        # row-contrast of the selected model
        row = np.random.permutation(n)[0]
        contrast = (X[:, active])[row, :]

        target = contrast.T.dot(np.linalg.pinv(X[:, active]).dot(X.dot(beta)))
        point = contrast.T.dot(estimate)
        variance = contrast.T.dot(observed_info_mean).dot(contrast)
        spread = ndist.ppf(1 - 0.05) * np.sqrt(variance)

        intervals = np.vstack([point - spread, point + spread]).T
        pivot = ndist.cdf((point - target) / np.sqrt(variance))
        coverage = (target > intervals[0, 0]) * (target < intervals[0, 1])
        return coverage, pivot
def compute_sampler_quantiles(n=500, p=100, signal_fac=1.2, s=5, sigma=1., rho=0., randomizer_scale=1, full_dispersion=True):
    """
    Diagnostic: sample (target, optimization) jointly from the implied
    selective law, push each target draw through selective_MLE, and compare
    empirical MLE quantiles against the Gaussian approximation.

    NOTE(review): with full_dispersion=False, ``dispersion``/``sigma_`` are
    never assigned before use — presumably only ever run with the default
    True; confirm before changing the default. ``norm``, ``plt``, ``pylab``
    and ``stats`` come from imports outside this view.
    """

    inst, const = gaussian_instance, lasso.gaussian
    signal = np.sqrt(signal_fac * 2 * np.log(p))

    while True:
        X, Y, beta = inst(n=n, p=p, signal=signal, s=s, equicorrelated=False, rho=rho, sigma=sigma, random_signs=True)[:3]

        # AR(rho) population covariance, used only for the SNR printout
        idx = np.arange(p)
        sigmaX = rho ** np.abs(np.subtract.outer(idx, idx))
        print("snr", beta.T.dot(sigmaX).dot(beta) / ((sigma ** 2.) * n))

        n, p = X.shape

        if full_dispersion:
            # OLS-residual estimate of the noise variance (requires n > p)
            dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p)
            sigma_ = np.sqrt(dispersion)

        W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_

        conv = const(X, Y, W, randomizer_scale=randomizer_scale * sigma_)

        signs = conv.fit()
        nonzero = signs != 0

        (observed_target,
         cov_target,
         cov_target_score,
         alternatives) = selected_targets(conv.loglike, conv._W, nonzero, dispersion=dispersion)

        true_mean = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))
        estimate, observed_info_mean, _, pval, intervals, _ = conv.selective_MLE(observed_target, cov_target, cov_target_score, alternatives)

        # Assemble the implied joint precision of (target, opt) under the
        # randomized selection (block form in target/opt coordinates).
        opt_linear, opt_offset = conv.opt_transform
        target_precision = np.linalg.inv(cov_target)
        randomizer_cov, randomizer_precision = conv.randomizer.cov_prec
        score_linear = np.identity(p)
        target_linear = score_linear.dot(cov_target_score.T.dot(target_precision))
        target_offset = conv.observed_score_state - target_linear.dot(observed_target)

        nopt = opt_linear.shape[1]
        ntarget = target_linear.shape[1]

        implied_precision = np.zeros((ntarget + nopt, ntarget + nopt))
        implied_precision[:ntarget, :ntarget] = target_linear.T.dot(randomizer_precision).dot(target_linear) + target_precision
        implied_precision[:ntarget, ntarget:] = target_linear.T.dot(randomizer_precision).dot(opt_linear)
        implied_precision[ntarget:, :ntarget] = opt_linear.T.dot(randomizer_precision).dot(target_linear)
        implied_precision[ntarget:, ntarget:] = opt_linear.T.dot(randomizer_precision).dot(opt_linear)

        implied_cov = np.linalg.inv(implied_precision)
        conditioned_value = target_offset + opt_offset
        implied_mean = implied_cov.dot(np.hstack((target_precision.dot(true_mean)-target_linear.T.dot(randomizer_precision).dot(conditioned_value),
                                                  -opt_linear.T.dot(randomizer_precision).dot(conditioned_value))))

        # constrain the optimization block to be non-negative: -o <= 0
        A_scaling = np.zeros((nopt, ntarget+nopt))
        A_scaling[:,ntarget:] = -np.identity(nopt)
        b_scaling = np.zeros(nopt)
        affine_con = constraints(A_scaling,
                                 b_scaling,
                                 mean=implied_mean,
                                 covariance=implied_cov)

        initial_point = np.zeros(ntarget+nopt)
        initial_point[ntarget:] = conv.observed_opt_state

        # NOTE(review): 500000 draws — this is an expensive diagnostic, not a test
        sampler = sample_from_constraints(affine_con, initial_point, ndraw=500000, burnin=1000)
        print("sampler", sampler.shape, sampler[:,:ntarget].shape)

        # push every sampled target through the selective MLE map
        mle_sample = []
        for j in range(sampler.shape[0]):
            estimate, _, _, _, _, _ = conv.selective_MLE(sampler[j,:ntarget], cov_target, cov_target_score, alternatives)
            mle_sample.append(estimate)
            print("iteration ", j)

        mle_sample = np.asarray(mle_sample)
        print("check", mle_sample.shape, np.mean(mle_sample, axis=0) - true_mean)

        # normal Q-Q plot for each selected coordinate (2x5-ish grid: 251+i)
        for i in range(nonzero.sum()):
            temp = 251 + i
            ax = plt.subplot(temp)
            stats.probplot(mle_sample[:,i], dist="norm", plot=pylab)
            plt.subplots_adjust(hspace=.5, wspace=.5)
        pylab.show()

        # empirical 5%/95% MLE quantiles vs the Gaussian approximation
        # centered at the true mean with the observed information
        sampler_quantiles = np.vstack([np.percentile(mle_sample, 5, axis=0), np.percentile(mle_sample, 95, axis=0)])
        normal_quantiles = np.vstack((norm.ppf(0.05, loc=true_mean, scale=np.sqrt(np.diag(observed_info_mean))),
                                      norm.ppf(0.95, loc=true_mean, scale=np.sqrt(np.diag(observed_info_mean)))))
        print("sampler quantiles", sampler_quantiles.T)
        print("normal quantiles", normal_quantiles.T)
        break
def multiple_runs_lasso(n=500,
                        p=100,
                        nval=500,
                        rho=0.35,
                        s=5,
                        beta_type=1,
                        snr=0.20,
                        randomizer_scale=np.sqrt(0.50),
                        full_dispersion=True):
    """
    Union-of-selections experiment: take the union of two glmnet lasso
    active sets (theory lambda and lambda.1se) for naive inference, and
    the union of two randomized lasso selections for two-stage selective
    MLE inference.

    Returns stacked (MLE metrics, naive metrics, failure counts).

    Fix: the Python-2 statement ``print MLE_inf, Naive_inf`` was a
    SyntaxError under Python 3 (the rest of the file uses ``print(...)``).
    """

    X, y, _, _, Sigma, beta, sigma = sim_xy(n=n,
                                            p=p,
                                            nval=nval,
                                            rho=rho,
                                            s=s,
                                            beta_type=beta_type,
                                            snr=snr)
    # standardize design, center response
    X -= X.mean(0)[None, :]
    X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.)))
    y = y - y.mean()

    if full_dispersion:
        dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) ** 2 / (n - p)
        sigma_ = np.sqrt(dispersion)
    else:
        dispersion = None
        sigma_ = np.std(y)
    print("estimated and true sigma", sigma, sigma_)

    lam_theory = sigma_ * 1. * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0))
    glm_LASSO_theory, glm_LASSO_1se, glm_LASSO_min, lam_min, lam_1se = glmnet_lasso(
        X, y, lam_theory / float(n))

    # union of the two non-randomized active sets
    active_LASSO_1 = (glm_LASSO_theory != 0)
    active_LASSO_2 = (glm_LASSO_1se != 0)
    active_LASSO = np.logical_or(active_LASSO_1, active_LASSO_2)
    nreport_nonrand = 0.

    if active_LASSO.sum() > 0:
        target_nonrandomized = np.linalg.pinv(X[:, active_LASSO]).dot(X.dot(beta))
        post_LASSO_OLS = np.linalg.pinv(X[:, active_LASSO]).dot(y)
        naive_sd = sigma_ * np.sqrt(
            np.diag((np.linalg.inv(X[:, active_LASSO].T.dot(X[:, active_LASSO])))))
        naive_intervals = np.vstack([post_LASSO_OLS - 1.65 * naive_sd,
                                     post_LASSO_OLS + 1.65 * naive_sd]).T
        naive_pval = 2 * (1. - ndist.cdf(np.abs(post_LASSO_OLS) / naive_sd))
        cov_naive, power_naive = coverage(naive_intervals, naive_pval,
                                          target_nonrandomized, beta[active_LASSO])
        length_naive = np.mean(naive_intervals[:, 1] - naive_intervals[:, 0])
        fdr_naive = ((naive_pval[beta[active_LASSO] == 0]) < 0.1).sum() / float(
            (naive_pval < 0.1).sum())
    else:
        nreport_nonrand += 1.
        cov_naive, power_naive, length_naive, fdr_naive = [0., 0., 0., 0.]

    # two randomized lasso fits whose selections are unioned
    randomized_lasso_1 = lasso.gaussian(X,
                                        y,
                                        feature_weights=lam_theory * np.ones(p),
                                        randomizer_scale=np.sqrt(n) *
                                        randomizer_scale * sigma_)
    signs_1 = randomized_lasso_1.fit()
    nonzero_1 = signs_1 != 0
    randomized_lasso_2 = lasso.gaussian(X,
                                        y,
                                        feature_weights=n * lam_1se * np.ones(p),
                                        randomizer_scale=np.sqrt(n) *
                                        randomizer_scale * sigma_)
    signs_2 = randomized_lasso_2.fit()
    nonzero_2 = signs_2 != 0

    signs = np.logical_or(signs_1, signs_2)
    nonzero = signs != 0
    print("check", nonzero_1.sum(), nonzero_2.sum(), nonzero.sum(),
          active_LASSO.sum())

    nreport = 0.
    if nonzero.sum() > 0:
        target_randomized = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))
        observed_target = np.linalg.pinv(X[:, nonzero]).dot(y)
        # score covariances from each randomized fit on the union set
        (_, _, cov_target_score_1,
         alternatives_1) = selected_targets(randomized_lasso_1.loglike,
                                            randomized_lasso_1._W,
                                            nonzero,
                                            dispersion=dispersion)
        (_, cov_target, cov_target_score_2,
         alternatives_2) = selected_targets(randomized_lasso_2.loglike,
                                            randomized_lasso_2._W,
                                            nonzero,
                                            dispersion=dispersion)

        estimate, _, _, pval, intervals, _ = twostage_selective_MLE(
            observed_target, cov_target, cov_target_score_1, cov_target_score_2,
            randomized_lasso_1.observed_opt_state,
            randomized_lasso_2.observed_opt_state,
            randomized_lasso_1.cond_mean, randomized_lasso_2.cond_mean,
            randomized_lasso_1.cond_cov, randomized_lasso_2.cond_cov,
            randomized_lasso_1.logdens_linear, randomized_lasso_2.logdens_linear,
            randomized_lasso_1.con_linear, randomized_lasso_2.con_linear,
            randomized_lasso_1.con_offset, randomized_lasso_2.con_offset,
            solve_args={'tol': 1.e-12},
            level=0.9)

        coverage_adjusted, power_adjusted = coverage(intervals, pval,
                                                     target_randomized,
                                                     beta[nonzero])
        length_adjusted = np.mean(intervals[:, 1] - intervals[:, 0])
        fdr_adjusted = ((pval[beta[nonzero] == 0]) < 0.1).sum() / float(
            (pval < 0.1).sum())
    else:
        nreport += 1
        coverage_adjusted, length_adjusted, power_adjusted, fdr_adjusted = [
            0., 0., 0., 0.
        ]

    MLE_inf = np.vstack((coverage_adjusted, length_adjusted, power_adjusted,
                         fdr_adjusted, nonzero.sum()))
    Naive_inf = np.vstack((cov_naive, length_naive, power_naive, fdr_naive,
                           active_LASSO.sum()))
    # FIX: Python-3 print function (was a Python-2 print statement)
    print(MLE_inf, Naive_inf)
    return np.vstack((MLE_inf, Naive_inf, nreport, nreport_nonrand))
def compare_twostage_mle(n=3000, p=1000, nval=3000, rho=0.35, s=35, beta_type=1, snr=0.20,
                         randomizer_scale=np.sqrt(0.50), full_dispersion=True):
    """Compare two-stage selective MLE (marginal screening + SLOPE, both
    randomized) against naive inference after non-randomized two-stage
    selection, on one simulated Gaussian instance.

    Parameters
    ----------
    n, p, nval : int
        Training sample size, dimension, validation size passed to ``sim_xy``.
    rho, s, beta_type, snr : simulation-design parameters for ``sim_xy``.
    randomizer_scale : float
        Scale of the isotropic Gaussian randomization (relative to sigma_).
    full_dispersion : bool
        If True, estimate dispersion from the full-model residuals;
        otherwise use np.std(y).

    Returns
    -------
    np.ndarray
        vstack of (MLE coverage/length/power/FDR + selected counts),
        (naive coverage/length/power/FDR + selected counts), and the two
        no-selection counters.
    """
    X, y, _, _, Sigma, beta, sigma = sim_xy(n=n, p=p, nval=nval, rho=rho, s=s,
                                            beta_type=beta_type, snr=snr)

    # Center and scale the design; center the response.
    X -= X.mean(0)[None, :]
    scaling = X.std(0)[None, :] * np.sqrt(n)
    X /= scaling
    y = y - y.mean()

    if full_dispersion:
        # Full-model residual variance estimate of sigma^2.
        dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) ** 2 / (n - p)
        sigma_ = np.sqrt(dispersion)
    else:
        dispersion = None
        sigma_ = np.std(y)
    print("estimated and true sigma", sigma, sigma_)

    # Work on the standardized response from here on.
    Y = y / sigma_
    score = X.T.dot(Y)
    omega = randomization.isotropic_gaussian((p,), randomizer_scale * sigma_).sample()
    W = X.T.dot(X)

    # Stage 1 (randomized): marginal screening at level 0.1.
    marginal_select = marginal_screening.type1(score, W, 0.1, randomizer_scale,
                                               useC=True, perturb=omega)
    boundary, cond_mean_1, cond_cov_1, affine_con_1, logdens_linear_1, initial_soln_1 = marginal_select.fit()
    nonzero = boundary != 0
    first_selected = np.asarray([t for t in range(p) if nonzero[t]])
    X_tilde = X[:, nonzero]

    # Stage 2 (randomized): SLOPE on the screened design; weights from R.
    r_beta, r_E, r_lambda_seq, r_sigma = slope_R(X_tilde,
                                                 Y,
                                                 W=None,
                                                 normalize=True,
                                                 choice_weights="gaussian",  # put gaussian
                                                 sigma=1.)
    conv = slope.gaussian(X_tilde,
                          Y,
                          r_sigma * r_lambda_seq,
                          sigma=1.,
                          randomizer_scale=randomizer_scale * 1.)
    signs, cond_mean_2, cond_cov_2, affine_con_2, logdens_linear_2, initial_soln_2 = conv.fit()
    nonzero_slope = signs != 0
    # NOTE: loop variable renamed from `s` to avoid shadowing the sparsity parameter.
    second_selected = np.asarray([j for j in range(nonzero.sum()) if nonzero_slope[j]])

    # Non-randomized counterpart: plain two-sided marginal z-test screening.
    stdev = np.sqrt(np.diag(X.T.dot(X)))
    boundary_nonrand = (score > stdev * ndist.ppf(1. - 0.10 / 2.))
    nonzero_nonrand = boundary_nonrand != 0
    # BUG FIX: was built from `nonzero` (the randomized mask); the
    # non-randomized path must index with `nonzero_nonrand`.
    first_selected_nonrand = np.asarray([z for z in range(p) if nonzero_nonrand[z]])
    X_tilde_nonrand = X[:, nonzero_nonrand]
    r_beta_nonrand, r_E_nonrand, _, _ = slope_R(X_tilde_nonrand,
                                                Y,
                                                W=None,
                                                normalize=True,
                                                choice_weights="gaussian",  # put gaussian
                                                sigma=1.)
    nonzero_slope_nonrand = (r_beta_nonrand != 0)
    second_selected_nonrand = np.asarray([w for w in range(nonzero_nonrand.sum())
                                          if nonzero_slope_nonrand[w]])

    print("compare dimensions- ms ", nonzero.sum(), nonzero_nonrand.sum())
    print("compare dimensions- slope ", nonzero_slope.sum(), nonzero_slope_nonrand.sum())

    nreport = 0.
    nreport_nonrand = 0.

    if nonzero_slope.sum() > 0:
        # Target score covariances from each stage; joint MLE over both.
        _, _, cov_target_score_1, _ = marginal_select.multivariate_targets(first_selected[second_selected])
        (observed_target,
         cov_target,
         cov_target_score_2,
         alternatives) = selected_targets(conv.loglike,
                                          conv._W,
                                          nonzero_slope,
                                          dispersion=1.)
        # Projection target on the standardized scale.
        beta_target = np.sqrt(n) * np.linalg.pinv(X_tilde[:, nonzero_slope]).dot(
            X_tilde.dot(beta[nonzero])) / sigma_

        estimate, _, _, pval, intervals, _ = twostage_selective_MLE(observed_target,
                                                                    cov_target,
                                                                    cov_target_score_1,
                                                                    cov_target_score_2,
                                                                    initial_soln_1,
                                                                    initial_soln_2,
                                                                    cond_mean_1,
                                                                    cond_mean_2,
                                                                    cond_cov_1,
                                                                    cond_cov_2,
                                                                    logdens_linear_1,
                                                                    logdens_linear_2,
                                                                    affine_con_1.linear_part,
                                                                    affine_con_2.linear_part,
                                                                    affine_con_1.offset,
                                                                    affine_con_2.offset,
                                                                    solve_args={'tol': 1.e-12},
                                                                    level=0.9)

        pval_alt = (pval[beta[first_selected[second_selected]] != 0]) < 0.1
        if pval_alt.sum() > 0:
            power_adjusted = np.mean(pval_alt)
        else:
            power_adjusted = 0.
        # NOTE(review): raises ZeroDivisionError if no p-value falls below 0.1;
        # kept as-is to preserve existing behavior.
        fdr_adjusted = ((pval[beta[first_selected[second_selected]] == 0]) < 0.1).sum() / float((pval < 0.1).sum())

        coverage_adjusted = np.mean((beta_target > intervals[:, 0]) * (beta_target < intervals[:, 1]))
        length_adjusted = sigma_ * np.mean(intervals[:, 1] - intervals[:, 0]) / np.sqrt(n)

        # Naive (unadjusted) OLS intervals on the randomized selection, 90% level.
        post_sel_OLS = np.linalg.pinv(X_tilde[:, nonzero_slope]).dot(Y)
        naive_sd = np.sqrt(np.diag((np.linalg.inv(X_tilde[:, nonzero_slope].T.dot(X_tilde[:, nonzero_slope])))))
        intervals_naive = np.vstack([post_sel_OLS - 1.65 * naive_sd,
                                     post_sel_OLS + 1.65 * naive_sd]).T
        coverage_naive = np.mean((beta_target > intervals_naive[:, 0]) * (beta_target < intervals_naive[:, 1]))
        length_naive = sigma_ * np.mean(intervals_naive[:, 1] - intervals_naive[:, 0]) / np.sqrt(n)
    else:
        nreport += 1
        coverage_adjusted, length_adjusted, power_adjusted, fdr_adjusted, coverage_naive, length_naive = [0., 0., 0., 0., 0., 0.]

    if nonzero_slope_nonrand.sum() > 0:
        beta_target_nonrand = np.sqrt(n) * np.linalg.pinv(X_tilde_nonrand[:, nonzero_slope_nonrand]).dot(
            X_tilde_nonrand.dot(beta[nonzero_nonrand])) / sigma_

        post_sel_OLS_nonrand = np.linalg.pinv(X_tilde_nonrand[:, nonzero_slope_nonrand]).dot(Y)
        naive_sd_nonrand = np.sqrt(np.diag((np.linalg.inv(X_tilde_nonrand[:, nonzero_slope_nonrand].T.dot(
            X_tilde_nonrand[:, nonzero_slope_nonrand])))))
        intervals_naive_nonrand = np.vstack([post_sel_OLS_nonrand - 1.65 * naive_sd_nonrand,
                                             post_sel_OLS_nonrand + 1.65 * naive_sd_nonrand]).T
        coverage_naive_nonrand = np.mean((beta_target_nonrand > intervals_naive_nonrand[:, 0]) *
                                         (beta_target_nonrand < intervals_naive_nonrand[:, 1]))
        length_naive_nonrand = sigma_ * np.mean(intervals_naive_nonrand[:, 1] - intervals_naive_nonrand[:, 0]) / np.sqrt(n)

        pval_nonrand = 2 * (1. - ndist.cdf(np.abs(post_sel_OLS_nonrand) / naive_sd_nonrand))
        pval_alt_nonrand = (pval_nonrand[beta[first_selected_nonrand[second_selected_nonrand]] != 0]) < 0.1
        if pval_alt_nonrand.sum() > 0:
            power_nonrand = np.mean(pval_alt_nonrand)
        else:
            power_nonrand = 0.
        fdr_nonrand = ((pval_nonrand[beta[first_selected_nonrand[second_selected_nonrand]] == 0]) < 0.1).sum() / float((pval_nonrand < 0.1).sum())
    else:
        nreport_nonrand += 1
        # BUG FIX: these were assigned to misspelled names (power__nonrand,
        # fdr__nonrand), causing a NameError in Naive_inf below.
        coverage_naive_nonrand, length_naive_nonrand, power_nonrand, fdr_nonrand = [0., 0., 0., 0.]

    MLE_inf = np.vstack((coverage_adjusted, length_adjusted, power_adjusted, fdr_adjusted,
                         nonzero.sum(), nonzero_slope.sum()))
    #Naive_rand_inf = np.vstack((coverage_naive, length_naive, 0., 0.))
    Naive_inf = np.vstack((coverage_naive_nonrand, length_naive_nonrand, power_nonrand, fdr_nonrand,
                           nonzero_nonrand.sum(), nonzero_slope_nonrand.sum()))
    print("inf", MLE_inf, Naive_inf)
    return np.vstack((MLE_inf, Naive_inf, nreport, nreport_nonrand))
def pivot(n=500, p=100, nval=500, rho=0., s=5, beta_type=1, snr=0.25,
          randomizer_scale=np.sqrt(1.), full_dispersion=True):
    """Time and compare MLE-based versus sampler-based selective inference
    after a randomized LASSO on one simulated Gaussian instance.

    Returns (MLE pivots, sampler pivots, MLE wall time, sampler wall time,
    mean MLE coverage, mean sampler coverage) when at least one variable is
    selected; falls through (implicit None) otherwise.
    """
    X, y, _, _, Sigma, beta, sigma = sim_xy(n=n, p=p, nval=nval, rho=rho,
                                            s=s, beta_type=beta_type, snr=snr)
    print("snr", snr)

    # Center/scale the design, center the response.
    X -= X.mean(0)[None, :]
    X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.)))
    y = y - y.mean()

    if full_dispersion:
        # Full-model residual variance estimate.
        dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) ** 2 / (n - p)
        sigma_ = np.sqrt(dispersion)
    else:
        dispersion = None
        sigma_ = np.std(y)
    print("estimated and true sigma", sigma, sigma_)

    # Theory-driven lambda: expected max |X^T noise| over Monte-Carlo draws.
    noise_draws = np.random.standard_normal((n, 2000))
    lam_theory = sigma_ * 1. * np.mean(np.fabs(np.dot(X.T, noise_draws)).max(0))

    randomized_lasso = lasso.gaussian(X,
                                      y,
                                      feature_weights=lam_theory * np.ones(p),
                                      randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
    active = randomized_lasso.fit() != 0
    sys.stderr.write("active variables selected by randomized LASSO " + str(active.sum()) + "\n" + "\n")

    if active.sum() > 0:
        # Projection target: population coefficients on the selected set.
        target = np.linalg.pinv(X[:, active]).dot(X.dot(beta))
        (observed_target,
         cov_target,
         cov_target_score,
         alternatives) = selected_targets(randomized_lasso.loglike,
                                          randomized_lasso._W,
                                          active,
                                          dispersion=dispersion)

        # --- selective MLE path (timed) ---
        t_start = time.time()
        (MLE_estimate, observed_info_mean, _, MLE_pval,
         MLE_intervals, ind_unbiased_estimator) = randomized_lasso.selective_MLE(observed_target,
                                                                                cov_target,
                                                                                cov_target_score,
                                                                                alternatives)
        t_end = time.time()
        cov_MLE, _ = coverage(MLE_intervals, MLE_pval, target, beta[active])
        # Standardized pivot for the MLE estimate.
        pivot_MLE = (MLE_estimate - target) / np.sqrt(np.diag(observed_info_mean))
        time_MLE = t_end - t_start

        # --- sampler-based path (timed) ---
        t_start = time.time()
        sampler_pivot, sampler_pval, sampler_intervals = randomized_lasso.summary(observed_target,
                                                                                  cov_target,
                                                                                  cov_target_score,
                                                                                  alternatives,
                                                                                  level=0.9,
                                                                                  compute_intervals=True,
                                                                                  ndraw=200000)
        t_end = time.time()
        cov_sampler, _ = coverage(sampler_intervals, sampler_pval, target, beta[active])
        time_sampler = t_end - t_start

        return (pivot_MLE, sampler_pivot, time_MLE, time_sampler,
                np.mean(cov_MLE), np.mean(cov_sampler))