Example #1

# Shared imports for the examples below. The exact module paths are
# assumptions based on typical layouts of the `selectinf` package and
# may differ between versions.
import numpy as np
import regreg.api as rr

from selectinf.tests.instance import gaussian_instance
from selectinf.randomized.lasso import lasso, split_lasso, selected_targets
from selectinf.randomized.posterior_inference import posterior_inference_lasso
from selectinf.randomized.approx_reference import (
    approximate_mle_inference, approximate_normalizer_inference)

def test_sampler(n=500,
                 p=100,
                 signal_fac=1.,
                 s=5,
                 sigma=3.,
                 rho=0.4,
                 randomizer_scale=1.):

    inst, const = gaussian_instance, lasso.gaussian
    signal = np.sqrt(signal_fac * 2 * np.log(p))

    X, Y, beta = inst(n=n,
                      p=p,
                      signal=signal,
                      s=s,
                      equicorrelated=False,
                      rho=rho,
                      sigma=sigma,
                      random_signs=True)[:3]

    n, p = X.shape

    sigma_ = np.std(Y)
    # feature weights: universal penalty sqrt(2 log p), scaled by noise level
    W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_

    conv = const(X, Y, W, randomizer_scale=randomizer_scale * sigma_)

    signs = conv.fit()
    nonzero = signs != 0

    beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))
    # dispersion: residual variance from the full-model OLS fit
    dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y)))**2 / (n - p)

    (observed_target, cov_target, cov_target_score,
     alternatives) = selected_targets(conv.loglike,
                                      conv._W,
                                      nonzero,
                                      dispersion=dispersion)

    # affine selection constraints {o : A o <= b} on the optimization variables
    A_scaling = conv.sampler.affine_con.linear_part
    b_scaling = conv.sampler.affine_con.offset
    logdens_linear = conv.sampler.logdens_transform[0]

    posterior_inf = posterior_inference_lasso(observed_target, cov_target,
                                              cov_target_score,
                                              conv.observed_opt_state,
                                              conv.cond_mean, conv.cond_cov,
                                              logdens_linear, A_scaling,
                                              b_scaling, observed_target)

    samples = posterior_inf.posterior_sampler(nsample=2000,
                                              nburnin=200,
                                              step=1.)
    # 5th/95th percentiles of the posterior samples give 90% credible intervals
    lci = np.percentile(samples, 5, axis=0)
    uci = np.percentile(samples, 95, axis=0)
    coverage = (lci < beta_target) * (uci > beta_target)
    length = uci - lci

    print("check ", coverage, length)

Example #2

def test_instance():

    n, p, s = 500, 100, 5
    X = np.random.standard_normal((n, p))
    beta = np.zeros(p)
    beta[:s] = np.sqrt(2 * np.log(p) / n)
    Y = X.dot(beta) + np.random.standard_normal(n)

    scale_ = np.std(Y)
    # uses noise of variance n * scale_ / 4 by default
    L = lasso.gaussian(X, Y, 3 * scale_ * np.sqrt(2 * np.log(p) * np.sqrt(n)))
    signs = L.fit()
    E = (signs != 0)

    # enlarge the selected set: force the last 3 features into the model
    M = E.copy()
    M[-3:] = True
    dispersion = np.linalg.norm(
        Y - X[:, M].dot(np.linalg.pinv(X[:, M]).dot(Y)))**2 / (n - M.sum())
    (observed_target, cov_target, cov_target_score,
     alternatives) = selected_targets(L.loglike,
                                      L._W,
                                      M,
                                      dispersion=dispersion)

    print("check shapes", observed_target.shape, E.sum())

    result = L.selective_MLE(observed_target, cov_target, cov_target_score)[0]
    estimate = result['MLE']
    pval = result['pvalue']
    intervals = np.asarray(result[['lower_confidence', 'upper_confidence']])

    beta_target = np.linalg.pinv(X[:, M]).dot(X.dot(beta))

    coverage = (beta_target > intervals[:, 0]) * (beta_target < intervals[:, 1])
    print("observed_opt_state ", L.observed_opt_state)
    #print("check ", np.asarray(result['MLE']), np.asarray(result['unbiased']))

    return coverage
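
The function returns one coverage indicator per coordinate of the (enlarged)
selected model. A hypothetical aggregation over runs, assuming the default
confidence level of `selective_MLE` (90% in the versions of selectinf we have
seen; treat that as an assumption):

cover = np.concatenate([test_instance() for _ in range(20)])
print("empirical coverage:", np.mean(cover))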

Example #3

def test_selected_targets(n=2000,
                          p=200,
                          signal_fac=1.,
                          s=5,
                          sigma=3,
                          rho=0.4,
                          randomizer_scale=1,
                          full_dispersion=True):
    """
    Compare to R randomized lasso
    """

    inst, const = gaussian_instance, lasso.gaussian
    signal = np.sqrt(signal_fac * 2 * np.log(p))

    while True:
        X, Y, beta = inst(n=n,
                          p=p,
                          signal=signal,
                          s=s,
                          equicorrelated=False,
                          rho=rho,
                          sigma=sigma,
                          random_signs=True)[:3]

        idx = np.arange(p)
        # AR(1) covariance, matching equicorrelated=False in the instance
        sigmaX = rho**np.abs(np.subtract.outer(idx, idx))
        print("snr", beta.T.dot(sigmaX).dot(beta) / ((sigma**2.) * n))

        n, p = X.shape

        sigma_ = np.std(Y)
        W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_

        conv = const(X, Y, W, randomizer_scale=randomizer_scale * sigma_)

        signs = conv.fit()
        nonzero = signs != 0
        print("dimensions", n, p, nonzero.sum())

        if nonzero.sum() > 0:
            dispersion = None
            if full_dispersion:
                dispersion = np.linalg.norm(
                    Y - X.dot(np.linalg.pinv(X).dot(Y)))**2 / (n - p)

            (observed_target, cov_target, cov_target_score,
             alternatives) = selected_targets(conv.loglike,
                                              conv._W,
                                              nonzero,
                                              dispersion=dispersion)

            result = conv.selective_MLE(observed_target, cov_target,
                                        cov_target_score)[0]
            estimate = result['MLE']
            pval = result['pvalue']
            intervals = np.asarray(
                result[['lower_confidence', 'upper_confidence']])

            beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))

            coverage = (beta_target > intervals[:, 0]) * (beta_target <
                                                          intervals[:, 1])

            print("observed_opt_state ", conv.observed_opt_state)
            # print("check ", np.asarray(result['MLE']), np.asarray(result['unbiased']))

            return (pval[beta[nonzero] == 0], pval[beta[nonzero] != 0],
                    coverage, intervals)
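
Under the null coordinates (zero entries of beta among the selected
variables) the selective p-values should be close to uniform on [0, 1]. A
sketch of a repetition loop (the repetition count is an arbitrary choice):

p0_all, cover_all = [], []
for _ in range(20):
    p0, p1, coverage, intervals = test_selected_targets()
    p0_all.extend(np.asarray(p0))
    cover_all.extend(coverage)
print("mean null p-value (should be near 0.5):", np.mean(p0_all))
print("empirical coverage:", np.mean(cover_all))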

Example #4

def test_standalone_inference(n=2000,
                              p=100,
                              signal_fac=1.5,
                              proportion=0.7,
                              approx=True,
                              MLE=True):
    """
    Check that standalone functions reproduce same p-values
    as methods of `selectinf.randomized.lasso`
    """

    # (unused below: the design is pure noise and inference is under the null)
    signal = np.sqrt(signal_fac * np.log(p)) / np.sqrt(n)
    X = np.random.standard_normal((n, p))
    T = np.random.exponential(1, size=(n, ))  # survival times
    S = np.random.choice([0, 1], n, p=[0.2, 0.8])  # censoring status (~80% events)

    cox_lasso = split_lasso.coxph(X, T, S, 2 * np.ones(p) * np.sqrt(n),
                                  proportion)

    signs = cox_lasso.fit()
    nonzero = signs != 0

    cox_sel = rr.glm.cox(X[:, nonzero], T, S)

    cox_full = rr.glm.cox(X, T, S)

    refit_soln = cox_sel.solve(min_its=2000)
    padded_soln = np.zeros(p)
    padded_soln[nonzero] = refit_soln
    cox_full.solve(min_its=2000)

    # Hessian of the full Cox log-likelihood, restricted to selected features
    full_hess = cox_full.hessian(padded_soln)
    selected_hess = full_hess[nonzero][:, nonzero]

    (observed_target, cov_target, cov_target_score,
     alternatives) = selected_targets(cox_lasso.loglike,
                                      None,
                                      nonzero,
                                      hessian=full_hess,
                                      dispersion=1)

    if nonzero.sum():
        if approx:
            approx_result = cox_lasso.approximate_grid_inference(
                observed_target, cov_target, cov_target_score)
            approx_pval = approx_result['pvalue']

            testval = approximate_normalizer_inference(
                proportion, cox_lasso.initial_soln[nonzero], refit_soln,
                signs[nonzero], selected_hess,
                cox_lasso.feature_weights[nonzero])

            assert np.allclose(testval['pvalue'], approx_pval)

        else:
            approx_pval = np.full(nonzero.sum(), np.nan)

        if MLE:
            MLE_result = cox_lasso.selective_MLE(observed_target, cov_target,
                                                 cov_target_score)[0]
            MLE_pval = MLE_result['pvalue']
        else:
            MLE_pval = np.full(nonzero.sum(), np.nan)

        # working under null here
        beta = np.zeros(p)

        testval = approximate_mle_inference(proportion,
                                            cox_lasso.initial_soln[nonzero],
                                            refit_soln, signs[nonzero],
                                            selected_hess,
                                            cox_lasso.feature_weights[nonzero])

        assert np.allclose(testval['pvalue'], MLE_pval)
        return (approx_pval[beta[nonzero] == 0],
                MLE_pval[beta[nonzero] == 0], testval)
    else:
        # keep the arity consistent with the branch above
        return [], [], None
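
A direct call exercises both the `approx` and `MLE` branches together with
the internal `assert np.allclose(...)` checks; the sizes below are arbitrary
choices for a quick run, not values from the original test:

approx_p, mle_p, testval = test_standalone_inference(n=1000, p=50)
print("null approx p-values:", np.asarray(approx_p)[:5])
print("null MLE p-values:", np.asarray(mle_p)[:5])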

Example #5

def test_selected_instance(seedn,
                           n=2000,
                           p=200,
                           signal_fac=1.2,
                           s=5,
                           sigma=2,
                           rho=0.7,
                           randomizer_scale=1.,
                           full_dispersion=True):
    """
    Compare to R randomized lasso
    """

    inst, const = gaussian_instance, lasso.gaussian
    signal = np.sqrt(signal_fac * 2 * np.log(p))

    while True:
        np.random.seed(seed=seedn)
        X, Y, beta = inst(n=n,
                          p=p,
                          signal=signal,
                          s=s,
                          equicorrelated=True,
                          rho=rho,
                          sigma=sigma,
                          random_signs=True)[:3]

        # equicorrelated=True here, so the instance covariance is
        # (1 - rho) * I + rho * 11^T, not the AR(1) form used above
        sigmaX = (1 - rho) * np.identity(p) + rho * np.ones((p, p))
        print("snr", beta.T.dot(sigmaX).dot(beta) / ((sigma**2.) * n))

        n, p = X.shape

        sigma_ = np.std(Y)
        W = 0.8 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_

        conv = const(X,
                     Y,
                     W,
                     ridge_term=0.,
                     randomizer_scale=randomizer_scale * sigma_)

        signs = conv.fit()
        nonzero = signs != 0
        print("dimensions", n, p, nonzero.sum())

        if nonzero.sum() > 0:
            dispersion = None
            if full_dispersion:
                dispersion = np.linalg.norm(
                    Y - X.dot(np.linalg.pinv(X).dot(Y)))**2 / (n - p)

            (observed_target, cov_target, cov_target_score,
             alternatives) = selected_targets(conv.loglike,
                                              conv._W,
                                              nonzero,
                                              dispersion=dispersion)

            result = conv.selective_MLE(observed_target, cov_target,
                                        cov_target_score)[0]

            return (result['MLE'], result['lower_confidence'],
                    result['upper_confidence'])
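
Because the instance is seeded through `seedn`, each seed gives a
reproducible selected model and interval. A hypothetical driver over a few
seeds (the seed range is an arbitrary choice):

for seed in range(5):
    mle, lower, upper = test_selected_instance(seed)
    print("seed", seed, "MLE:", np.asarray(mle)[:3],
          "CI:", np.asarray(lower)[:3], np.asarray(upper)[:3])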