Example #1
 def method_instance(self):
     if not hasattr(self, "_method_instance"):
         n, p = self.X.shape
         self._method_instance = random_lasso_method.gaussian(self.X,
                                                              self.Y,
                                                              self.lagrange * np.sqrt(n),
                                                              randomizer_scale=self.randomizer_scale * np.std(self.Y) * np.sqrt(n))
     return self._method_instance
Example #2
 def method_instance(self):
     if not hasattr(self, "_method_instance"):
         n, p = self.X.shape
         lagrange = np.ones(p) * choose_lambda(self.X) * self.kappa
         self._method_instance = random_lasso_method.gaussian(self.X,
                                                              self.Y,
                                                              lagrange,
                                                              randomizer_scale=self.randomizer_scale * np.std(self.Y))
     return self._method_instance
Example #3
 def method_instance(self):
     if not hasattr(self, "_method_instance"):
         n, p = self.X.shape
         mean_diag = np.mean((self.X ** 2).sum(0))
         self._method_instance = random_lasso_method.gaussian(self.X,
                                                              self.Y,
                                                              feature_weights = self.lagrange * np.sqrt(n),
                                                              ridge_term=np.std(self.Y) * np.sqrt(mean_diag) / np.sqrt(n),
                                                              randomizer_scale=self.randomizer_scale * np.std(self.Y) * np.sqrt(n))
     return self._method_instance
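All three method_instance snippets above follow the same lazy-initialization pattern: the randomized LASSO solver is built on first access, cached on the instance, and returned unchanged afterwards, with the penalty and randomization scale grown with the problem size and the spread of the response. A minimal self-contained sketch of that pattern follows; the wrapper class and the dictionary standing in for the solver are placeholders, not selectinf objects.

# Minimal sketch of the lazy-caching pattern used above; MethodWrapper and the
# dictionary standing in for the solver are placeholders, not selectinf objects.
import numpy as np

class MethodWrapper:
    def __init__(self, X, Y, lagrange, randomizer_scale=1.):
        self.X, self.Y = X, Y
        self.lagrange = lagrange
        self.randomizer_scale = randomizer_scale

    def method_instance(self):
        # Build the (expensive) solver object once and reuse it on later calls.
        if not hasattr(self, "_method_instance"):
            n, p = self.X.shape
            # Mirror the scaling above: penalty and randomization grow with
            # sqrt(n) and with the scale of the response.
            self._method_instance = {
                "lagrange": self.lagrange * np.sqrt(n),
                "randomizer_scale": self.randomizer_scale * np.std(self.Y) * np.sqrt(n),
            }
        return self._method_instance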
Example #4
def test_full_lasso(n=200, p=30, signal_fac=1.5, s=5, ndraw=5000, burnin=1000, sigma=3, full=False, rho=0.4, randomizer_scale=1):
    """
    General LASSO -- check that conditioning on every coordinate of the subgradient
    reproduces the affine constraints of the original randomized LASSO fit.
    """

    inst, const = gaussian_instance, lasso.gaussian
    signal = np.sqrt(signal_fac * np.log(p))
    X, Y, beta = inst(n=n,
                      p=p, 
                      signal=signal, 
                      s=s, 
                      equicorrelated=False, 
                      rho=rho, 
                      sigma=sigma, 
                      random_signs=True)[:3]

    n, p = X.shape

    W = np.ones(X.shape[1]) * np.sqrt(1.5 * np.log(p)) * sigma

    conv = const(X, 
                 Y, 
                 W, 
                 randomizer_scale=randomizer_scale * sigma)
    
    signs = conv.fit(solve_args={'min_its':500, 'tol':1.e-13})
    nonzero = signs != 0

    conv2 = lasso.gaussian(X, 
                           Y, 
                           W,
                           randomizer_scale=randomizer_scale * sigma)
    conv2.fit(perturb=conv._initial_omega, solve_args={'min_its':500, 'tol':1.e-13})
    conv2.decompose_subgradient(condition=np.ones(p, bool))

    np.testing.assert_allclose(conv2._view.sampler.affine_con.covariance,
                               conv.sampler.affine_con.covariance)

    np.testing.assert_allclose(conv2._view.sampler.affine_con.mean,
                               conv.sampler.affine_con.mean)

    np.testing.assert_allclose(conv2._view.sampler.affine_con.linear_part,
                               conv.sampler.affine_con.linear_part)

    np.testing.assert_allclose(conv2._view.sampler.affine_con.offset,
                               conv.sampler.affine_con.offset)

    np.testing.assert_allclose(conv2._view.initial_soln,
                               conv.initial_soln)

    np.testing.assert_allclose(conv2._view.initial_subgrad,
                               conv.initial_subgrad)
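The equality checks above work because the second solver is fed the first solver's randomization (perturb=conv._initial_omega), so both optimize the identical objective. The sketch below illustrates that same-noise, same-answer idea on a plain randomized ridge problem; randomized_ridge and omega are hypothetical names, and no selectinf machinery is involved.

# Hedged sketch of the reproducibility idea behind the test above: two runs of a
# randomized procedure agree when they share the same perturbation omega.
import numpy as np

def randomized_ridge(X, Y, omega, lam=1.0):
    # Ridge solution of a randomized objective; omega is the shared perturbation.
    n, p = X.shape
    return np.linalg.solve(X.T @ X + lam * np.eye(p), X.T @ Y + omega)

rng = np.random.default_rng(0)
X = rng.standard_normal((50, 5))
Y = rng.standard_normal(50)
omega = rng.standard_normal(5)

beta1 = randomized_ridge(X, Y, omega)
beta2 = randomized_ridge(X, Y, omega)   # same omega -> identical solution
np.testing.assert_allclose(beta1, beta2)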
Example #5
def compare_methods(n=500,
                    p=100,
                    nval=500,
                    rho=0.35,
                    s=5,
                    beta_type=1,
                    snr=0.20,
                    target="selected",
                    randomizer_scale=np.sqrt(0.50),
                    full_dispersion=True,
                    tuning_rand="lambda.theory"):

    X, y, _, _, Sigma, beta, sigma = sim_xy(n=n,
                                            p=p,
                                            nval=nval,
                                            rho=rho,
                                            s=s,
                                            beta_type=beta_type,
                                            snr=snr)
    print("snr", snr)
    X -= X.mean(0)[None, :]
    X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.)))
    y = y - y.mean()
    true_set = np.asarray([u for u in range(p) if beta[u] != 0])

    if full_dispersion:
        dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y)))**2 / (
            n - p)
        sigma_ = np.sqrt(dispersion)
    else:
        dispersion = None
        sigma_ = np.std(y)
    print("estimated and true sigma", sigma, sigma_)

    lam_theory = sigma_ * 1. * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0))
    randomized_lasso = lasso.gaussian(X,
                                      y,
                                      feature_weights=lam_theory * np.ones(p),
                                      randomizer_scale=np.sqrt(n) *
                                      randomizer_scale * sigma_)

    signs = randomized_lasso.fit()
    nonzero = signs != 0
    sys.stderr.write("active variables selected by randomized LASSO " +
                     str(nonzero.sum()) + "\n" + "\n")
    active_set_rand = np.asarray([t for t in range(p) if nonzero[t]])
    active_rand_bool = np.asarray(
        [(np.in1d(active_set_rand[x], true_set).sum() > 0)
         for x in range(nonzero.sum())], bool)
    nreport = 0.
    if nonzero.sum() > 0:
        if target == "full":
            target_randomized = beta[nonzero]
            (observed_target, cov_target, cov_target_score,
             alternatives) = full_targets(randomized_lasso.loglike,
                                          randomized_lasso._W,
                                          nonzero,
                                          dispersion=dispersion)
        elif target == "selected":
            target_randomized = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))
            (observed_target, cov_target, cov_target_score,
             alternatives) = selected_targets(randomized_lasso.loglike,
                                              randomized_lasso._W,
                                              nonzero,
                                              dispersion=dispersion)
        else:
            raise ValueError('not a valid specification of target')
        toc = time.time()
        MLE_estimate, _, _, MLE_pval, MLE_intervals, ind_unbiased_estimator = randomized_lasso.selective_MLE(
            observed_target, cov_target, cov_target_score, alternatives)
        tic = time.time()
        time_MLE = tic - toc

        cov_MLE, selective_MLE_power = coverage(MLE_intervals, MLE_pval,
                                                target_randomized,
                                                beta[nonzero])
        length_MLE = np.mean(MLE_intervals[:, 1] - MLE_intervals[:, 0])
        power_MLE = ((active_rand_bool) * (np.logical_or(
            (0. < MLE_intervals[:, 0]),
            (0. > MLE_intervals[:, 1])))).sum() / float((beta != 0).sum())
        MLE_discoveries = BHfilter(MLE_pval, q=0.1)
        power_MLE_BH = (MLE_discoveries * active_rand_bool).sum() / float(
            (beta != 0).sum())
        fdr_MLE_BH = (MLE_discoveries * ~active_rand_bool).sum() / float(
            max(MLE_discoveries.sum(), 1.))
        bias_MLE = np.mean(MLE_estimate - target_randomized)

        toc = time.time()
        intervals_uni, pvalue_uni = randomized_lasso.inference_new(
            observed_target, cov_target, cov_target_score, alternatives)

        tic = time.time()
        time_uni = tic - toc
        intervals_uni = intervals_uni.T
        cov_uni, selective_uni_power = coverage(intervals_uni, pvalue_uni,
                                                target_randomized,
                                                beta[nonzero])
        length_uni = np.mean(intervals_uni[:, 1] - intervals_uni[:, 0])
        power_uni = ((active_rand_bool) * (np.logical_or(
            (0. < intervals_uni[:, 0]),
            (0. > intervals_uni[:, 1])))).sum() / float((beta != 0).sum())
        uni_discoveries = BHfilter(pvalue_uni, q=0.1)
        power_uni_BH = (uni_discoveries * active_rand_bool).sum() / float(
            (beta != 0).sum())
        fdr_uni_BH = (uni_discoveries * ~active_rand_bool).sum() / float(
            max(uni_discoveries.sum(), 1.))
        bias_randLASSO = np.mean(randomized_lasso.initial_soln[nonzero] -
                                 target_randomized)

    else:
        nreport += 1
        cov_MLE, length_MLE, power_MLE, power_MLE_BH, fdr_MLE_BH, bias_MLE, selective_MLE_power, time_MLE = [
            0., 0., 0., 0., 0., 0., 0., 0.
        ]
        cov_uni, length_uni, power_uni, power_uni_BH, fdr_uni_BH, bias_randLASSO, selective_uni_power, time_uni = [
            0., 0., 0., 0., 0., 0., 0., 0.
        ]
        MLE_discoveries = np.zeros(1)
        uni_discoveries = np.zeros(1)

    MLE_inf = np.vstack(
        (cov_MLE, length_MLE, 0., nonzero.sum(), bias_MLE, selective_MLE_power,
         time_MLE, power_MLE, power_MLE_BH, fdr_MLE_BH, MLE_discoveries.sum()))

    uni_inf = np.vstack(
        (cov_uni, length_uni, 0., nonzero.sum(), bias_randLASSO,
         selective_uni_power, time_uni, power_uni, power_uni_BH, fdr_uni_BH,
         uni_discoveries.sum()))

    return np.vstack((MLE_inf, uni_inf, nreport))
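compare_methods relies on a coverage(intervals, pvals, target, truth) helper that is not shown here. A plausible sketch is given below under the assumption that it returns the empirical coverage of the target together with the fraction of truly nonzero coordinates detected at a 10% p-value cut; the real helper may differ.

# Hypothetical sketch of the coverage() helper used above; not the library code.
import numpy as np

def coverage(intervals, pvals, target, truth, level=0.1):
    # Fraction of targets falling inside their intervals.
    covered = (target >= intervals[:, 0]) & (target <= intervals[:, 1])
    # Power: truly nonzero coordinates detected at the given p-value cut.
    if (truth != 0).sum() > 0:
        power = ((pvals < level) & (truth != 0)).sum() / float((truth != 0).sum())
    else:
        power = 0.
    return np.mean(covered), power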
Example #6
def risk_comparison(n=500,
                    p=100,
                    nval=500,
                    rho=0.35,
                    s=5,
                    beta_type=1,
                    snr=0.20,
                    randomizer_scale=np.sqrt(0.50),
                    full_dispersion=False,
                    tuning_nonrand="lambda.min",
                    tuning_rand="lambda.1se",
                    ndraw=50):

    risks = np.zeros((6, 1))
    for i in range(ndraw):
        X, y, _, _, Sigma, beta, sigma = sim_xy(n=n,
                                                p=p,
                                                nval=nval,
                                                rho=rho,
                                                s=s,
                                                beta_type=beta_type,
                                                snr=snr)
        print("snr", snr)
        X -= X.mean(0)[None, :]
        X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.)))
        y = y - y.mean()

        if full_dispersion:
            print("shapes", y.shape,
                  (np.linalg.norm(y -
                                  X.dot(np.linalg.pinv(X).dot(y)))**2).shape)
            dispersion = np.linalg.norm(
                y - X.dot(np.linalg.pinv(X).dot(y)))**2 / (n - p)
            sigma_ = np.sqrt(dispersion)
            _sigma_ = sigma_  # keep _sigma_ defined for the lam_theory computation below
        else:
            dispersion = None
            _sigma_ = np.std(y)
        lam_theory = _sigma_ * 1. * np.mean(
            np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0))
        glm_LASSO_theory, glm_LASSO_1se, glm_LASSO_min, lam_min, lam_1se = glmnet_lasso(
            X, y, lam_theory / float(n))

        if full_dispersion is False:
            dispersion = None
            active_min = (glm_LASSO_min != 0)
            if active_min.sum() > 0:
                sigma_ = np.sqrt(
                    np.linalg.norm(y - X[:, active_min].dot(
                        np.linalg.pinv(X[:, active_min]).dot(y)))**2 /
                    (n - active_min.sum()))
            else:
                sigma_ = _sigma_

        print("true and estimated sigma", sigma, _sigma_, sigma_)

        if tuning_nonrand == "lambda.min":
            lam_LASSO = lam_min
            glm_LASSO = glm_LASSO_min
        elif tuning_nonrand == "lambda.1se":
            lam_LASSO = lam_1se
            glm_LASSO = glm_LASSO_1se
        else:
            lam_LASSO = lam_theory / float(n)
            glm_LASSO = glm_LASSO_theory
        active_LASSO = (glm_LASSO != 0)
        rel_LASSO = np.zeros(p)
        if active_LASSO.sum() > 0:
            post_LASSO_OLS = np.linalg.pinv(X[:, active_LASSO]).dot(y)
            rel_LASSO[active_LASSO] = post_LASSO_OLS

        if tuning_rand == "lambda.min":
            randomized_lasso = lasso.gaussian(
                X,
                y,
                feature_weights=n * lam_min * np.ones(p),
                randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
        elif tuning_rand == "lambda.1se":
            randomized_lasso = lasso.gaussian(
                X,
                y,
                feature_weights=n * lam_1se * np.ones(p),
                randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
        else:
            randomized_lasso = lasso.gaussian(
                X,
                y,
                feature_weights=lam_theory * np.ones(p),
                randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
        signs = randomized_lasso.fit()
        nonzero = signs != 0
        sel_MLE = np.zeros(p)
        ind_est = np.zeros(p)
        randomized_lasso_est = np.zeros(p)
        randomized_rel_lasso_est = np.zeros(p)

        if nonzero.sum() > 0:
            target_randomized = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))
            (observed_target, cov_target, cov_target_score,
             alternatives) = selected_targets(randomized_lasso.loglike,
                                              randomized_lasso._W,
                                              nonzero,
                                              dispersion=dispersion)

            MLE_estimate, _, _, _, _, ind_unbiased_estimator = randomized_lasso.selective_MLE(
                observed_target, cov_target, cov_target_score, alternatives)
            sel_MLE[nonzero] = MLE_estimate
            ind_est[nonzero] = ind_unbiased_estimator
            randomized_lasso_est = randomized_lasso.initial_soln
            randomized_rel_lasso_est = randomized_lasso._beta_full

        risks += np.vstack(
            (relative_risk(sel_MLE, beta,
                           Sigma), relative_risk(ind_est, beta, Sigma),
             relative_risk(randomized_lasso_est, beta, Sigma),
             relative_risk(randomized_rel_lasso_est, beta,
                           Sigma), relative_risk(rel_LASSO, beta, Sigma),
             relative_risk(glm_LASSO, beta, Sigma)))
        print("risks so far", risks / (i + 1))

    return risks / ndraw
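risk_comparison accumulates six relative_risk values per draw. The helper itself is not shown; the sketch below assumes the usual Sigma-weighted definition, i.e. the prediction-type error of the estimator divided by the signal strength, and should be read as a guess at its behavior rather than the library code.

# Assumed form of relative_risk(): Sigma-weighted error of the estimator relative
# to the strength of the true signal. Sketch only; the real helper may differ.
import numpy as np

def relative_risk(est, beta, Sigma):
    diff = est - beta
    return diff.dot(Sigma).dot(diff) / max(beta.dot(Sigma).dot(beta), 1e-12)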
Example #7
def comparison_cvmetrics_debiased(n=100,
                                  p=150,
                                  nval=500,
                                  rho=0.35,
                                  s=5,
                                  beta_type=1,
                                  snr=0.20,
                                  randomizer_scale=np.sqrt(0.25),
                                  full_dispersion=False,
                                  tuning_nonrand="lambda.min",
                                  tuning_rand="lambda.1se"):

    X, y, _, _, Sigma, beta, sigma = sim_xy(n=n,
                                            p=p,
                                            nval=nval,
                                            rho=rho,
                                            s=s,
                                            beta_type=beta_type,
                                            snr=snr)
    print("snr", snr)
    X -= X.mean(0)[None, :]
    X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.)))
    y = y - y.mean()
    true_set = np.asarray([u for u in range(p) if beta[u] != 0])

    if full_dispersion:
        dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y)))**2 / (
            n - p)
        sigma_ = np.sqrt(dispersion)
        _sigma_ = sigma_  # keep _sigma_ defined for the lam_theory computation below
    else:
        dispersion = None
        _sigma_ = np.std(y)

    lam_theory = _sigma_ * 1. * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0))
    glm_LASSO_theory, glm_LASSO_1se, glm_LASSO_min, lam_min, lam_1se = glmnet_lasso(
        X, y, lam_theory / float(n))

    if full_dispersion is False:
        dispersion = None
        active_min = (glm_LASSO_min != 0)
        if active_min.sum() > 0:
            sigma_ = np.sqrt(
                np.linalg.norm(y - X[:, active_min].dot(
                    np.linalg.pinv(X[:, active_min]).dot(y)))**2 /
                (n - active_min.sum()))
        else:
            sigma_ = _sigma_
    print("estimated and true sigma", sigma, _sigma_, sigma_)

    if tuning_nonrand == "lambda.min":
        lam_LASSO = lam_min
        glm_LASSO = glm_LASSO_min
    elif tuning_nonrand == "lambda.1se":
        lam_LASSO = lam_1se
        glm_LASSO = glm_LASSO_1se
    else:
        lam_LASSO = lam_theory / float(n)
        glm_LASSO = glm_LASSO_theory

    active_LASSO = (glm_LASSO != 0)
    nactive_LASSO = active_LASSO.sum()
    active_set_LASSO = np.asarray([r for r in range(p) if active_LASSO[r]])
    active_LASSO_bool = np.asarray(
        [(np.in1d(active_set_LASSO[z], true_set).sum() > 0)
         for z in range(nactive_LASSO)], bool)

    rel_LASSO = np.zeros(p)
    Lee_nreport = 0.
    bias_naive = 0.

    if nactive_LASSO > 0:
        rel_LASSO[active_LASSO] = np.linalg.pinv(X[:, active_LASSO]).dot(y)
        Lee_target = beta[active_LASSO]
        post_LASSO_OLS = np.linalg.pinv(X[:, active_LASSO]).dot(y)
        naive_sd = sigma_ * np.sqrt(
            np.diag(
                (np.linalg.inv(X[:, active_LASSO].T.dot(X[:, active_LASSO])))))
        naive_intervals = np.vstack([
            post_LASSO_OLS - 1.65 * naive_sd, post_LASSO_OLS + 1.65 * naive_sd
        ]).T
        naive_pval = 2 * (1. - ndist.cdf(np.abs(post_LASSO_OLS) / naive_sd))
        cov_naive, selective_naive_power = coverage(naive_intervals,
                                                    naive_pval, Lee_target,
                                                    beta[active_LASSO])
        length_naive = np.mean(naive_intervals[:, 1] - naive_intervals[:, 0])
        power_naive = ((active_LASSO_bool) * (np.logical_or(
            (0. < naive_intervals[:, 0]),
            (0. > naive_intervals[:, 1])))).sum() / float((beta != 0).sum())
        naive_discoveries = BHfilter(naive_pval, q=0.1)
        power_naive_BH = (naive_discoveries * active_LASSO_bool).sum() / float(
            (beta != 0).sum())
        fdr_naive_BH = (naive_discoveries * ~active_LASSO_bool).sum() / float(
            max(naive_discoveries.sum(), 1.))
        bias_naive = np.mean(rel_LASSO[active_LASSO] - Lee_target)

        partial_Lasso_risk = (glm_LASSO[active_LASSO] -
                              Lee_target).T.dot(glm_LASSO[active_LASSO] -
                                                Lee_target)
        partial_relLasso_risk = (post_LASSO_OLS -
                                 Lee_target).T.dot(post_LASSO_OLS - Lee_target)

    elif nactive_LASSO == 0:
        Lee_nreport += 1
        cov_naive, length_naive, power_naive, power_naive_BH, fdr_naive_BH, selective_naive_power = [
            0., 0., 0., 0., 0., 0.
        ]
        naive_discoveries = np.zeros(1)
        partial_Lasso_risk, partial_relLasso_risk = [0., 0.]

    if tuning_rand == "lambda.min":
        randomized_lasso = lasso.gaussian(
            X,
            y,
            feature_weights=n * lam_min * np.ones(p),
            randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
    elif tuning_rand == "lambda.1se":
        randomized_lasso = lasso.gaussian(
            X,
            y,
            feature_weights=n * lam_1se * np.ones(p),
            randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
    else:
        randomized_lasso = lasso.gaussian(
            X,
            y,
            feature_weights=lam_theory * np.ones(p),
            randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
    signs = randomized_lasso.fit()
    nonzero = signs != 0
    active_set_rand = np.asarray([t for t in range(p) if nonzero[t]])
    active_rand_bool = np.asarray(
        [(np.in1d(active_set_rand[x], true_set).sum() > 0)
         for x in range(nonzero.sum())], bool)
    sel_MLE = np.zeros(p)
    ind_est = np.zeros(p)
    randomized_lasso_est = np.zeros(p)
    randomized_rel_lasso_est = np.zeros(p)
    MLE_nreport = 0

    if nonzero.sum() > 0:
        target_randomized = beta[nonzero]
        (observed_target, cov_target, cov_target_score,
         alternatives) = debiased_targets(randomized_lasso.loglike,
                                          randomized_lasso._W,
                                          nonzero,
                                          penalty=randomized_lasso.penalty,
                                          dispersion=dispersion)
        MLE_estimate, _, _, MLE_pval, MLE_intervals, ind_unbiased_estimator = randomized_lasso.selective_MLE(
            observed_target, cov_target, cov_target_score, alternatives)
        sel_MLE[nonzero] = MLE_estimate
        ind_est[nonzero] = ind_unbiased_estimator
        randomized_lasso_est = randomized_lasso.initial_soln
        randomized_rel_lasso_est = randomized_lasso._beta_full

        cov_MLE, selective_MLE_power = coverage(MLE_intervals, MLE_pval,
                                                target_randomized,
                                                beta[nonzero])
        length_MLE = np.mean(MLE_intervals[:, 1] - MLE_intervals[:, 0])
        power_MLE = ((active_rand_bool) * (np.logical_or(
            (0. < MLE_intervals[:, 0]),
            (0. > MLE_intervals[:, 1])))).sum() / float((beta != 0).sum())
        MLE_discoveries = BHfilter(MLE_pval, q=0.1)
        power_MLE_BH = (MLE_discoveries * active_rand_bool).sum() / float(
            (beta != 0).sum())
        fdr_MLE_BH = (MLE_discoveries * ~active_rand_bool).sum() / float(
            max(MLE_discoveries.sum(), 1.))
        bias_MLE = np.mean(MLE_estimate - target_randomized)

        partial_MLE_risk = (MLE_estimate -
                            target_randomized).T.dot(MLE_estimate -
                                                     target_randomized)
        partial_ind_risk = (ind_unbiased_estimator -
                            target_randomized).T.dot(ind_unbiased_estimator -
                                                     target_randomized)
        partial_randLasso_risk = (
            randomized_lasso_est[nonzero] -
            target_randomized).T.dot(randomized_lasso_est[nonzero] -
                                     target_randomized)
        partial_relrandLasso_risk = (
            randomized_rel_lasso_est[nonzero] -
            target_randomized).T.dot(randomized_rel_lasso_est[nonzero] -
                                     target_randomized)
    else:
        MLE_nreport = 1
        cov_MLE, length_MLE, power_MLE, power_MLE_BH, fdr_MLE_BH, bias_MLE, selective_MLE_power = [
            0., 0., 0., 0., 0., 0., 0.
        ]
        MLE_discoveries = np.zeros(1)
        partial_MLE_risk, partial_ind_risk, partial_randLasso_risk, partial_relrandLasso_risk = [
            0., 0., 0., 0.
        ]

    risks = np.vstack(
        (relative_risk(sel_MLE, beta,
                       Sigma), relative_risk(ind_est, beta, Sigma),
         relative_risk(randomized_lasso_est, beta, Sigma),
         relative_risk(randomized_rel_lasso_est, beta,
                       Sigma), relative_risk(rel_LASSO, beta, Sigma),
         relative_risk(glm_LASSO, beta, Sigma)))

    partial_risks = np.vstack(
        (partial_MLE_risk, partial_ind_risk, partial_randLasso_risk,
         partial_relrandLasso_risk, partial_relLasso_risk, partial_Lasso_risk))

    naive_inf = np.vstack(
        (cov_naive, length_naive, 0., nactive_LASSO, bias_naive,
         selective_naive_power, power_naive, power_naive_BH, fdr_naive_BH,
         naive_discoveries.sum()))
    Lee_inf = np.zeros((10, 1))
    Liu_inf = np.zeros((10, 1))
    MLE_inf = np.vstack(
        (cov_MLE, length_MLE, 0., nonzero.sum(), bias_MLE, selective_MLE_power,
         power_MLE, power_MLE_BH, fdr_MLE_BH, MLE_discoveries.sum()))
    nreport = np.vstack((Lee_nreport, 0., MLE_nreport))
    return np.vstack(
        (risks, naive_inf, Lee_inf, Liu_inf, MLE_inf, partial_risks, nreport))
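Several examples treat the output of BHfilter(pvals, q=0.1) as a 0/1 vector of discoveries. The sketch below implements the standard Benjamini-Hochberg step-up rule with that interface, offered as an assumption about what the helper does rather than its actual source.

# Assumed behavior of BHfilter(): Benjamini-Hochberg step-up procedure at level q,
# returning an indicator vector of discoveries. Sketch only.
import numpy as np

def BHfilter(pvals, q=0.1):
    m = len(pvals)
    order = np.argsort(pvals)
    thresholds = q * (np.arange(1, m + 1) / m)
    below = pvals[order] <= thresholds
    discoveries = np.zeros(m)
    if below.any():
        k = np.max(np.nonzero(below)[0])   # largest index passing the BH cut
        discoveries[order[:k + 1]] = 1.
    return discoveries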
Example #8
def comparison_cvmetrics_full(n=500,
                              p=100,
                              nval=500,
                              rho=0.35,
                              s=5,
                              beta_type=1,
                              snr=0.20,
                              randomizer_scale=np.sqrt(0.25),
                              full_dispersion=True,
                              tuning_nonrand="lambda.min",
                              tuning_rand="lambda.1se"):

    X, y, _, _, Sigma, beta, sigma = sim_xy(n=n,
                                            p=p,
                                            nval=nval,
                                            rho=rho,
                                            s=s,
                                            beta_type=beta_type,
                                            snr=snr)
    print("snr", snr)
    X -= X.mean(0)[None, :]
    X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.)))
    y = y - y.mean()
    true_set = np.asarray([u for u in range(p) if beta[u] != 0])

    if full_dispersion:
        dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y)))**2 / (
            n - p)
        sigma_ = np.sqrt(dispersion)
    else:
        dispersion = None
        sigma_ = np.std(y)
    print("estimated and true sigma", sigma, sigma_)

    lam_theory = sigma_ * 1. * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0))
    glm_LASSO_theory, glm_LASSO_1se, glm_LASSO_min, lam_min, lam_1se = glmnet_lasso(
        X, y, lam_theory / float(n))
    if tuning_nonrand == "lambda.min":
        lam_LASSO = lam_min
        glm_LASSO = glm_LASSO_min
    elif tuning_nonrand == "lambda.1se":
        lam_LASSO = lam_1se
        glm_LASSO = glm_LASSO_1se
    else:
        lam_LASSO = lam_theory / float(n)
        glm_LASSO = glm_LASSO_theory

    active_LASSO = (glm_LASSO != 0)
    nactive_LASSO = active_LASSO.sum()
    active_set_LASSO = np.asarray([r for r in range(p) if active_LASSO[r]])
    active_LASSO_bool = np.asarray(
        [(np.in1d(active_set_LASSO[z], true_set).sum() > 0)
         for z in range(nactive_LASSO)], bool)

    rel_LASSO = np.zeros(p)
    Lee_nreport = 0
    bias_Lee = 0.
    bias_naive = 0.

    if nactive_LASSO > 0:
        rel_LASSO[active_LASSO] = np.linalg.pinv(X[:, active_LASSO]).dot(y)
        Lee_target = beta[active_LASSO]
        Lee_intervals, Lee_pval = selInf_R(X,
                                           y,
                                           glm_LASSO,
                                           n * lam_LASSO,
                                           sigma_,
                                           Type=1,
                                           alpha=0.1)

        if (Lee_pval.shape[0] == Lee_target.shape[0]):

            cov_Lee, selective_Lee_power = coverage(Lee_intervals, Lee_pval,
                                                    Lee_target,
                                                    beta[active_LASSO])
            inf_entries_bool = np.isinf(Lee_intervals[:, 1] -
                                        Lee_intervals[:, 0])
            inf_entries = np.mean(inf_entries_bool)
            if inf_entries == 1.:
                length_Lee = 0.
            else:
                length_Lee = np.mean((Lee_intervals[:, 1] -
                                      Lee_intervals[:, 0])[~inf_entries_bool])
            power_Lee = ((active_LASSO_bool) * (np.logical_or(
                (0. < Lee_intervals[:, 0]),
                (0. > Lee_intervals[:, 1])))).sum() / float((beta != 0).sum())
            Lee_discoveries = BHfilter(Lee_pval, q=0.1)
            power_Lee_BH = (Lee_discoveries * active_LASSO_bool).sum() / float(
                (beta != 0).sum())
            fdr_Lee_BH = (Lee_discoveries * ~active_LASSO_bool).sum() / float(
                max(Lee_discoveries.sum(), 1.))
            bias_Lee = np.mean(glm_LASSO[active_LASSO] - Lee_target)

            post_LASSO_OLS = np.linalg.pinv(X[:, active_LASSO]).dot(y)
            naive_sd = sigma_ * np.sqrt(
                np.diag((np.linalg.inv(X[:, active_LASSO].T.dot(
                    X[:, active_LASSO])))))
            naive_intervals = np.vstack([
                post_LASSO_OLS - 1.65 * naive_sd,
                post_LASSO_OLS + 1.65 * naive_sd
            ]).T
            naive_pval = 2 * (1. - ndist.cdf(np.abs(post_LASSO_OLS) / naive_sd))
            cov_naive, selective_naive_power = coverage(
                naive_intervals, naive_pval, Lee_target, beta[active_LASSO])
            length_naive = np.mean(naive_intervals[:, 1] -
                                   naive_intervals[:, 0])
            power_naive = ((active_LASSO_bool) * (np.logical_or(
                (0. < naive_intervals[:, 0]),
                (0. > naive_intervals[:, 1])))).sum() / float(
                    (beta != 0).sum())
            naive_discoveries = BHfilter(naive_pval, q=0.1)
            power_naive_BH = (naive_discoveries *
                              active_LASSO_bool).sum() / float(
                                  (beta != 0).sum())
            fdr_naive_BH = (naive_discoveries *
                            ~active_LASSO_bool).sum() / float(
                                max(naive_discoveries.sum(), 1.))
            bias_naive = np.mean(rel_LASSO[active_LASSO] - Lee_target)

            partial_Lasso_risk = (glm_LASSO[active_LASSO] -
                                  Lee_target).T.dot(glm_LASSO[active_LASSO] -
                                                    Lee_target)
            partial_relLasso_risk = (post_LASSO_OLS -
                                     Lee_target).T.dot(post_LASSO_OLS -
                                                       Lee_target)
        else:
            Lee_nreport = 1
            cov_Lee, length_Lee, inf_entries, power_Lee, power_Lee_BH, fdr_Lee_BH, selective_Lee_power = [
                0., 0., 0., 0., 0., 0., 0.
            ]
            cov_naive, length_naive, power_naive, power_naive_BH, fdr_naive_BH, selective_naive_power = [
                0., 0., 0., 0., 0., 0.
            ]
            naive_discoveries = np.zeros(1)
            Lee_discoveries = np.zeros(1)
            partial_Lasso_risk, partial_relLasso_risk = [0., 0.]

    elif nactive_LASSO == 0:
        Lee_nreport = 1
        cov_Lee, length_Lee, inf_entries, power_Lee, power_Lee_BH, fdr_Lee_BH, selective_Lee_power = [
            0., 0., 0., 0., 0., 0., 0.
        ]
        cov_naive, length_naive, power_naive, power_naive_BH, fdr_naive_BH, selective_naive_power = [
            0., 0., 0., 0., 0., 0.
        ]
        naive_discoveries = np.zeros(1)
        Lee_discoveries = np.zeros(1)
        partial_Lasso_risk, partial_relLasso_risk = [0., 0.]

    lasso_Liu = ROSI.gaussian(X, y, n * lam_LASSO)
    Lasso_soln_Liu = lasso_Liu.fit()
    active_set_Liu = np.nonzero(Lasso_soln_Liu != 0)[0]
    nactive_Liu = active_set_Liu.shape[0]
    active_Liu_bool = np.asarray(
        [(np.in1d(active_set_Liu[a], true_set).sum() > 0)
         for a in range(nactive_Liu)], bool)
    Liu_nreport = 0

    if nactive_Liu > 0:
        Liu_target = beta[Lasso_soln_Liu != 0]
        df = lasso_Liu.summary(level=0.90,
                               compute_intervals=True,
                               dispersion=dispersion)
        Liu_lower, Liu_upper, Liu_pval = np.asarray(df['lower_confidence']), \
                                         np.asarray(df['upper_confidence']), \
                                         np.asarray(df['pval'])
        Liu_intervals = np.vstack((Liu_lower, Liu_upper)).T
        cov_Liu, selective_Liu_power = coverage(Liu_intervals, Liu_pval,
                                                Liu_target,
                                                beta[Lasso_soln_Liu != 0])
        length_Liu = np.mean(Liu_intervals[:, 1] - Liu_intervals[:, 0])
        power_Liu = ((active_Liu_bool) * (np.logical_or(
            (0. < Liu_intervals[:, 0]),
            (0. > Liu_intervals[:, 1])))).sum() / float((beta != 0).sum())
        Liu_discoveries = BHfilter(Liu_pval, q=0.1)
        power_Liu_BH = (Liu_discoveries * active_Liu_bool).sum() / float(
            (beta != 0).sum())
        fdr_Liu_BH = (Liu_discoveries * ~active_Liu_bool).sum() / float(
            max(Liu_discoveries.sum(), 1.))

    else:
        Liu_nreport = 1
        cov_Liu, length_Liu, power_Liu, power_Liu_BH, fdr_Liu_BH, selective_Liu_power = [
            0., 0., 0., 0., 0., 0.
        ]
        Liu_discoveries = np.zeros(1)

    if tuning_rand == "lambda.min":
        randomized_lasso = lasso.gaussian(
            X,
            y,
            feature_weights=n * lam_min * np.ones(p),
            randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
    elif tuning_rand == "lambda.1se":
        randomized_lasso = lasso.gaussian(
            X,
            y,
            feature_weights=n * lam_1se * np.ones(p),
            randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
    else:
        randomized_lasso = lasso.gaussian(
            X,
            y,
            feature_weights=lam_theory * np.ones(p),
            randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
    signs = randomized_lasso.fit()
    nonzero = signs != 0
    active_set_rand = np.asarray([t for t in range(p) if nonzero[t]])
    active_rand_bool = np.asarray(
        [(np.in1d(active_set_rand[x], true_set).sum() > 0)
         for x in range(nonzero.sum())], bool)
    sel_MLE = np.zeros(p)
    ind_est = np.zeros(p)
    randomized_lasso_est = np.zeros(p)
    randomized_rel_lasso_est = np.zeros(p)
    MLE_nreport = 0

    if nonzero.sum() > 0:
        target_randomized = beta[nonzero]
        (observed_target, cov_target, cov_target_score,
         alternatives) = full_targets(randomized_lasso.loglike,
                                      randomized_lasso._W,
                                      nonzero,
                                      dispersion=dispersion)
        MLE_estimate, _, _, MLE_pval, MLE_intervals, ind_unbiased_estimator = randomized_lasso.selective_MLE(
            observed_target, cov_target, cov_target_score, alternatives)
        sel_MLE[nonzero] = MLE_estimate
        ind_est[nonzero] = ind_unbiased_estimator
        randomized_lasso_est = randomized_lasso.initial_soln
        randomized_rel_lasso_est = randomized_lasso._beta_full

        cov_MLE, selective_MLE_power = coverage(MLE_intervals, MLE_pval,
                                                target_randomized,
                                                beta[nonzero])
        length_MLE = np.mean(MLE_intervals[:, 1] - MLE_intervals[:, 0])
        power_MLE = ((active_rand_bool) * (np.logical_or(
            (0. < MLE_intervals[:, 0]),
            (0. > MLE_intervals[:, 1])))).sum() / float((beta != 0).sum())
        MLE_discoveries = BHfilter(MLE_pval, q=0.1)
        power_MLE_BH = (MLE_discoveries * active_rand_bool).sum() / float(
            (beta != 0).sum())
        fdr_MLE_BH = (MLE_discoveries * ~active_rand_bool).sum() / float(
            max(MLE_discoveries.sum(), 1.))
        bias_MLE = np.mean(MLE_estimate - target_randomized)

        partial_MLE_risk = (MLE_estimate -
                            target_randomized).T.dot(MLE_estimate -
                                                     target_randomized)
        partial_ind_risk = (ind_unbiased_estimator -
                            target_randomized).T.dot(ind_unbiased_estimator -
                                                     target_randomized)
        partial_randLasso_risk = (
            randomized_lasso_est[nonzero] -
            target_randomized).T.dot(randomized_lasso_est[nonzero] -
                                     target_randomized)
        partial_relrandLasso_risk = (
            randomized_rel_lasso_est[nonzero] -
            target_randomized).T.dot(randomized_rel_lasso_est[nonzero] -
                                     target_randomized)
    else:
        MLE_nreport = 1
        cov_MLE, length_MLE, power_MLE, power_MLE_BH, fdr_MLE_BH, bias_MLE, selective_MLE_power = [
            0., 0., 0., 0., 0., 0., 0.
        ]
        MLE_discoveries = np.zeros(1)
        partial_MLE_risk, partial_ind_risk, partial_randLasso_risk, partial_relrandLasso_risk = [
            0., 0., 0., 0.
        ]

    risks = np.vstack(
        (relative_risk(sel_MLE, beta,
                       Sigma), relative_risk(ind_est, beta, Sigma),
         relative_risk(randomized_lasso_est, beta, Sigma),
         relative_risk(randomized_rel_lasso_est, beta,
                       Sigma), relative_risk(rel_LASSO, beta, Sigma),
         relative_risk(glm_LASSO, beta, Sigma)))

    partial_risks = np.vstack(
        (partial_MLE_risk, partial_ind_risk, partial_randLasso_risk,
         partial_relrandLasso_risk, partial_relLasso_risk, partial_Lasso_risk))

    naive_inf = np.vstack(
        (cov_naive, length_naive, 0., nactive_LASSO, bias_naive,
         selective_naive_power, power_naive, power_naive_BH, fdr_naive_BH,
         naive_discoveries.sum()))
    Lee_inf = np.vstack(
        (cov_Lee, length_Lee, inf_entries, nactive_LASSO, bias_Lee,
         selective_Lee_power, power_Lee, power_Lee_BH, fdr_Lee_BH,
         Lee_discoveries.sum()))
    Liu_inf = np.vstack(
        (cov_Liu, length_Liu, 0., nactive_Liu, bias_Lee, selective_Liu_power,
         power_Liu, power_Liu_BH, fdr_Liu_BH, Liu_discoveries.sum()))
    MLE_inf = np.vstack(
        (cov_MLE, length_MLE, 0., nonzero.sum(), bias_MLE, selective_MLE_power,
         power_MLE, power_MLE_BH, fdr_MLE_BH, MLE_discoveries.sum()))
    nreport = np.vstack((Lee_nreport, Liu_nreport, MLE_nreport))
    return np.vstack(
        (risks, naive_inf, Lee_inf, Liu_inf, MLE_inf, partial_risks, nreport))
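The "naive" inference assembled above is ordinary post-LASSO OLS with 90% intervals (plus or minus 1.65 standard errors) and two-sided normal p-values, treating sigma_ as known. The fragment below restates that construction on its own; naive_inference is a hypothetical name, but the arithmetic mirrors the example.

# Stand-alone restatement of the naive 90% intervals built in the example above.
import numpy as np
from scipy.stats import norm as ndist

def naive_inference(X_active, y, sigma_, level_z=1.65):
    XtX_inv = np.linalg.inv(X_active.T.dot(X_active))
    ols = XtX_inv.dot(X_active.T.dot(y))          # post-LASSO OLS coefficients
    sd = sigma_ * np.sqrt(np.diag(XtX_inv))       # their standard errors
    intervals = np.vstack([ols - level_z * sd, ols + level_z * sd]).T
    pvals = 2 * (1. - ndist.cdf(np.abs(ols) / sd))
    return intervals, pvals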
Example #9
def multiple_runs_lasso(n=500, p=100, nval=500, rho=0.35, s=5, beta_type=1, snr=0.20,
                         randomizer_scale=np.sqrt(0.50), full_dispersion=True):


    X, y, _, _, Sigma, beta, sigma = sim_xy(n=n, p=p, nval=nval, rho=rho, s=s, beta_type=beta_type, snr=snr)
    X -= X.mean(0)[None, :]
    X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.)))
    y = y - y.mean()

    if full_dispersion:
        dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) ** 2 / (n - p)
        sigma_ = np.sqrt(dispersion)
    else:
        dispersion = None
        sigma_ = np.std(y)
    print("estimated and true sigma", sigma, sigma_)

    lam_theory = sigma_ * 1. * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0))
    glm_LASSO_theory, glm_LASSO_1se, glm_LASSO_min, lam_min, lam_1se = glmnet_lasso(X, y, lam_theory / float(n))

    active_LASSO_1 = (glm_LASSO_theory != 0)
    active_LASSO_2 = (glm_LASSO_1se != 0)
    active_LASSO = np.logical_or(active_LASSO_1, active_LASSO_2)
    nreport_nonrand = 0.
    if active_LASSO.sum()>0:
        target_nonrandomized = np.linalg.pinv(X[:, active_LASSO]).dot(X.dot(beta))
        post_LASSO_OLS = np.linalg.pinv(X[:, active_LASSO]).dot(y)

        naive_sd = sigma_ * np.sqrt(np.diag((np.linalg.inv(X[:, active_LASSO].T.dot(X[:, active_LASSO])))))
        naive_intervals = np.vstack([post_LASSO_OLS - 1.65 * naive_sd,
                                     post_LASSO_OLS + 1.65 * naive_sd]).T
        naive_pval = 2 * (1.-ndist.cdf(np.abs(post_LASSO_OLS)/ naive_sd))
        cov_naive, power_naive = coverage(naive_intervals, naive_pval, target_nonrandomized, beta[active_LASSO])
        length_naive = np.mean(naive_intervals[:, 1] - naive_intervals[:, 0])
        fdr_naive = ((naive_pval[beta[active_LASSO] == 0]) < 0.1).sum() / float((naive_pval < 0.1).sum())
    else:
        nreport_nonrand +=1.
        cov_naive, power_naive, length_naive, fdr_naive = [0.,0., 0.,0.]

    randomized_lasso_1 = lasso.gaussian(X,
                                        y,
                                        feature_weights=lam_theory * np.ones(p),
                                        randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)

    signs_1 = randomized_lasso_1.fit()
    nonzero_1 = signs_1 != 0

    randomized_lasso_2 = lasso.gaussian(X,
                                        y,
                                        feature_weights=n * lam_1se * np.ones(p),
                                        randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)

    signs_2 = randomized_lasso_2.fit()
    nonzero_2 = signs_2 != 0

    signs = np.logical_or(signs_1, signs_2)
    nonzero = signs!=0
    print("check", nonzero_1.sum(), nonzero_2.sum(), nonzero.sum(), active_LASSO.sum())
    nreport = 0.
    if nonzero.sum() > 0:
        target_randomized = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))
        observed_target = np.linalg.pinv(X[:, nonzero]).dot(y)
        (_,
         _,
         cov_target_score_1,
         alternatives_1) = selected_targets(randomized_lasso_1.loglike,
                                            randomized_lasso_1._W,
                                            nonzero,
                                            dispersion=dispersion)

        (_,
         cov_target,
         cov_target_score_2,
         alternatives_2) = selected_targets(randomized_lasso_2.loglike,
                                            randomized_lasso_2._W,
                                            nonzero,
                                            dispersion=dispersion)


        estimate, _, _, pval, intervals, _ = twostage_selective_MLE(observed_target,
                                                                    cov_target,
                                                                    cov_target_score_1,
                                                                    cov_target_score_2,
                                                                    randomized_lasso_1.observed_opt_state,
                                                                    randomized_lasso_2.observed_opt_state,
                                                                    randomized_lasso_1.cond_mean,
                                                                    randomized_lasso_2.cond_mean,
                                                                    randomized_lasso_1.cond_cov,
                                                                    randomized_lasso_2.cond_cov,
                                                                    randomized_lasso_1.logdens_linear,
                                                                    randomized_lasso_2.logdens_linear,
                                                                    randomized_lasso_1.con_linear,
                                                                    randomized_lasso_2.con_linear,
                                                                    randomized_lasso_1.con_offset,
                                                                    randomized_lasso_2.con_offset,
                                                                    solve_args={'tol': 1.e-12},
                                                                    level=0.9)

        coverage_adjusted, power_adjusted = coverage(intervals, pval, target_randomized, beta[nonzero])
        length_adjusted = np.mean(intervals[:, 1] - intervals[:, 0])
        fdr_adjusted = ((pval[beta[nonzero] == 0]) < 0.1).sum() / float((pval < 0.1).sum())

    else:
        nreport +=1
        coverage_adjusted, length_adjusted, power_adjusted, fdr_adjusted = [0., 0., 0., 0.]

    MLE_inf = np.vstack((coverage_adjusted, length_adjusted, power_adjusted, fdr_adjusted, nonzero.sum()))
    Naive_inf = np.vstack((cov_naive, length_naive, power_naive, fdr_naive, active_LASSO.sum()))

    print(MLE_inf, Naive_inf)
    return np.vstack((MLE_inf, Naive_inf, nreport, nreport_nonrand))
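The inline FDP expressions above divide by the number of rejections and therefore fail when nothing falls below the 0.1 cut. A guarded version of the same computation follows; empirical_fdp is a hypothetical helper name.

# Guarded counterpart of the inline FDP computation above; sketch only.
import numpy as np

def empirical_fdp(pvals, truth, level=0.1):
    rejected = pvals < level
    false_rejections = (rejected & (truth == 0)).sum()
    return false_rejections / float(max(rejected.sum(), 1))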
Example #10
def pivot(n=500,
          p=100,
          nval=500,
          rho=0.,
          s=5,
          beta_type=1,
          snr=0.25,
          randomizer_scale=np.sqrt(1.),
          full_dispersion=True):

    X, y, _, _, Sigma, beta, sigma = sim_xy(n=n,
                                            p=p,
                                            nval=nval,
                                            rho=rho,
                                            s=s,
                                            beta_type=beta_type,
                                            snr=snr)
    print("snr", snr)
    X -= X.mean(0)[None, :]
    X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.)))
    y = y - y.mean()

    if full_dispersion:
        dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y)))**2 / (
            n - p)
        sigma_ = np.sqrt(dispersion)
    else:
        dispersion = None
        sigma_ = np.std(y)
    print("estimated and true sigma", sigma, sigma_)

    lam_theory = sigma_ * 1. * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0))
    randomized_lasso = lasso.gaussian(X,
                                      y,
                                      feature_weights=lam_theory * np.ones(p),
                                      randomizer_scale=np.sqrt(n) *
                                      randomizer_scale * sigma_)

    signs = randomized_lasso.fit()
    nonzero = signs != 0
    sys.stderr.write("active variables selected by randomized LASSO " +
                     str(nonzero.sum()) + "\n" + "\n")

    if nonzero.sum() > 0:
        target_randomized = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))
        (observed_target, cov_target, cov_target_score,
         alternatives) = selected_targets(randomized_lasso.loglike,
                                          randomized_lasso._W,
                                          nonzero,
                                          dispersion=dispersion)

        toc = time.time()
        MLE_estimate, observed_info_mean, _, MLE_pval, MLE_intervals, ind_unbiased_estimator = randomized_lasso.selective_MLE(
            observed_target, cov_target, cov_target_score, alternatives)
        tic = time.time()
        cov_MLE, _ = coverage(MLE_intervals, MLE_pval, target_randomized,
                              beta[nonzero])

        pivot_MLE = np.true_divide(MLE_estimate - target_randomized,
                                   np.sqrt(np.diag(observed_info_mean)))
        time_MLE = tic - toc

        toc = time.time()
        sampler_pivot, sampler_pval, sampler_intervals = randomized_lasso.summary(
            observed_target,
            cov_target,
            cov_target_score,
            alternatives,
            level=0.9,
            compute_intervals=True,
            ndraw=200000)

        tic = time.time()
        cov_sampler, _ = coverage(sampler_intervals, sampler_pval,
                                  target_randomized, beta[nonzero])
        time_sampler = tic - toc

        return pivot_MLE, sampler_pivot, time_MLE, time_sampler, np.mean(
            cov_MLE), np.mean(cov_sampler)
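pivot_MLE above is the studentized error of the selective MLE: the estimate minus the target, divided by the square root of the diagonal of observed_info_mean. If the selective inference is calibrated, these pivots should behave like standard normal draws across replications, so a quick diagnostic is to push them through the normal CDF and test uniformity. The check below is a sketch along those lines; check_pivot_uniformity is a hypothetical name and assumes pivots collected over many runs.

# Diagnostic sketch for the MLE pivots returned above: map the studentized errors
# through the standard normal CDF and compare with the uniform distribution.
import numpy as np
from scipy.stats import norm as ndist, kstest

def check_pivot_uniformity(pivots):
    u = ndist.cdf(np.concatenate([np.atleast_1d(p) for p in pivots]))
    return kstest(u, 'uniform')   # a small p-value signals miscalibration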