def test_data_carving_IC(n=100,
                         p=200,
                         s=7,
                         sigma=5,
                         rho=0.3,
                         snr=7.,
                         split_frac=0.9,
                         ndraw=5000,
                         burnin=1000, 
                         df=np.inf,
                         coverage=0.90,
                         compute_intervals=False):
    """Data carving after information-criterion (BIC-cost) forward stepwise.

    Repeatedly draws random regression instances until forward stepwise,
    stopped by an information criterion on a first-stage subsample of size
    ``int(n * split_frac)``, screens all ``s`` truly active variables
    (indices ``0..s-1``).  Then runs ``data_carving_IC`` with splitting and
    returns a 7-tuple: carving/splitting p-values for null variables
    (selected index >= s), carving/splitting p-values for true variables
    (index < s), the number of attempts, and carving/splitting interval
    coverage indicator lists.
    """

    # number of instances drawn before screening succeeded
    counter = 0

    while True:
        counter += 1
        X, y, beta, active, sigma = instance(n=n, 
                                             p=p, 
                                             s=s, 
                                             sigma=sigma, 
                                             rho=rho, 
                                             snr=snr, 
                                             df=df)
        mu = np.dot(X, beta)
        # random first-stage (selection) subsample
        splitn = int(n*split_frac)
        indices = np.arange(n)
        np.random.shuffle(indices)
        stage_one = indices[:splitn]

        FS = info_crit_stop(y, X, sigma, cost=np.log(n), subset=stage_one)

        # proceed only if every truly active variable was screened
        if set(range(s)).issubset(FS.active):
            results, FS = data_carving_IC(y, X, sigma,
                                          stage_one=stage_one,
                                          splitting=True, 
                                          ndraw=ndraw,
                                          burnin=burnin,
                                          coverage=coverage,
                                          compute_intervals=compute_intervals,
                                          cost=np.log(n))

            # positions 1 and 3 of each result row appear to hold the
            # carving and splitting p-values — TODO confirm against
            # data_carving_IC's return convention
            carve = [r[1] for r in results]
            split = [r[3] for r in results]

            # projected "truth": least-squares coefficients of mu on the
            # selected design (the last entry of FS.variables is dropped)
            Xa = X[:,FS.variables[:-1]]
            truth = np.dot(np.linalg.pinv(Xa), mu) 

            split_coverage = []
            carve_coverage = []
            for result, t in zip(results, truth):
                # ci / si: carving and splitting confidence intervals
                _, _, ci, _, si = result
                carve_coverage.append((ci[0] < t) * (t < ci[1]))
                split_coverage.append((si[0] < t) * (t < si[1]))

            # null p-values (selected index >= s) first, then alternatives
            return ([carve[j] for j, i in enumerate(FS.active) if i >= s], 
                    [split[j] for j, i in enumerate(FS.active) if i >= s], 
                    [carve[j] for j, i in enumerate(FS.active) if i < s], 
                    [split[j] for j, i in enumerate(FS.active) if i < s], 
                    counter, carve_coverage, split_coverage)
# Example #2
def test_gaussian_pvals(n=100,
                        p=500,
                        s=7,
                        sigma=5,
                        rho=0.3,
                        snr=8.):
    """Gaussian lasso p-values for truly inactive variables.

    Draws random instances until the lasso's active set contains every
    true variable, then returns the p-values of the selected variables
    outside the true support.  Alternates two-sided / one-sided summaries
    on successive attempts.
    """

    attempt = 0

    while True:
        attempt += 1
        X, y, beta, active, sigma = instance(n=n,
                                             p=p,
                                             s=s,
                                             sigma=sigma,
                                             rho=rho,
                                             snr=snr)
        L = lasso.gaussian(X, y, 20., sigma=sigma)
        L.fit()
        # second fit warm-started from the first solution
        L.fit(L.lasso_solution)
        alternative = 'twosided' if attempt % 2 == 1 else 'onesided'
        if not set(active).issubset(L.active):
            continue
        S = L.summary(alternative)
        return [pv for pv, var in zip(S['pval'], S['variable'])
                if var not in active]
# Example #3
def test_lasso(s=5, n=100, p=50):
    """Randomized lasso p-value test.

    Builds a randomized lasso (Laplace randomization, l1 penalty at the
    theoretical lambda), draws an MH sampler over the selection event and
    returns (null, alt) p-value collections from ``pval``.

    Bug fix: the original drew ``random_Z = randomization.rvs(p)`` twice,
    discarding the first draw; the dead first draw is removed.
    """
    X, y, _, nonzero, sigma = instance(n=n, p=p, random_signs=True, s=s, sigma=1.)
    lam_frac = 1.

    randomization = laplace(loc=0, scale=1.)
    loss = randomized.gaussian_Xfixed(X, y)
    epsilon = 1.

    # theoretical lambda: expected max correlation of the design with noise
    lam = sigma * lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 10000)))).max(0))

    random_Z = randomization.rvs(p)
    penalty = randomized.selective_l1norm(p, lagrange=lam)

    sampler1 = randomized.selective_sampler_MH(loss,
                                               random_Z,
                                               epsilon,
                                               randomization,
                                               penalty)

    loss_args = {'mean':np.zeros(n), 
                 'sigma':sigma}
    null, alt = pval(sampler1, 
                     loss_args,
                     X, y,
                     nonzero)

    return null, alt
def test_mcmc_tests(n=100, p=40, s=4, rho=0.3, snr=5, ndraw=None, burnin=2000,
                    nsim=None,
                    nstep=200,
                    method='serial'):
    """Smoke test of forward stepwise MCMC rank tests.

    Runs forward stepwise until all truly active variables have entered,
    takes 4 extra steps, then computes one null rank test (a late,
    presumably-noise variable) and one alternative rank test (the first
    selected variable).  Returns ``(null_rank, alt_rank)``; both are None
    if the true support is never fully screened.

    NOTE(review): ``ndraw``, ``nsim`` and ``method`` are accepted but
    unused; the serial/parallel methods are hard-coded below.
    """

    X, y, beta, active, sigma = instance(n=n, p=p, snr=snr, rho=rho, s=s)
    FS = forward_step(X, y, covariance=sigma**2 * np.identity(n))

    # additional steps taken after the true support is screened
    extra_steps = 4

    null_rank, alt_rank = None, None

    for i in range(min(n, p)):
        FS.next()   # Python-2 style iterator step: add one variable

        if extra_steps <= 0:
            # a variable added after screening should be null
            null_rank = FS.mcmc_test(i+1, variable=FS.variables[i-2], 
                                     nstep=nstep,
                                     burnin=burnin,
                                     method="serial")
            # the first selected variable should be a signal
            alt_rank = FS.mcmc_test(i+1, variable=FS.variables[0], 
                                    burnin=burnin,
                                    nstep=nstep, 
                                    method="parallel")
            break

        if set(active).issubset(FS.variables):
            extra_steps -= 1

    return null_rank, alt_rank
# Example #5
def test_sqrt_lasso_pvals(n=100,
                          p=200,
                          s=7,
                          sigma=5,
                          rho=0.3,
                          snr=7.):
    """Square-root lasso p-values for truly inactive variables.

    Repeats until a sqrt-lasso (with the first three features
    unpenalized) screens all true variables, then returns p-values for
    the selected variables outside the true support, alternating
    two-sided / one-sided summaries between attempts.

    Bug fix: removed the unused local
    ``Q = rr.identity_quadratic(0.01, 0, np.ones(p), 0)``.
    """

    counter = 0

    while True:
        counter += 1
        X, y, beta, active, sigma = instance(n=n, 
                                             p=p, 
                                             s=s, 
                                             sigma=sigma, 
                                             rho=rho, 
                                             snr=snr)

        # theoretical lambda, scaled for the sqrt-lasso parameterization
        lam_theor = np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000)))).max(0)) / np.sqrt(n)

        # deliberately leave a few features unpenalized
        weights_with_zeros = 0.7*lam_theor * np.ones(p)
        weights_with_zeros[:3] = 0.

        L = lasso.sqrt_lasso(X, y, weights_with_zeros)
        L.fit()
        v = {1:'twosided',
             0:'onesided'}[counter % 2]
        if set(active).issubset(L.active):
            S = L.summary(v)
            return [pv for pv, var in zip(S['pval'], S['variable']) if var not in active]
# Example #6
def test_logistic_pvals(n=500,
                        p=200,
                        s=3,
                        sigma=2,
                        rho=0.3,
                        snr=7.):
    """Logistic lasso p-values for variables outside the true support.

    Draws one instance, binarizes the response, prepends an unpenalized
    intercept column (shifting active indices accordingly), fits a
    logistic lasso and — if all true variables were screened — returns
    p-values for the remaining selected variables; otherwise [].

    NOTE(review): the trailing ``return []`` means the ``while`` loop
    body executes at most once; ``counter`` never exceeds 1.  Preserved
    as-is since callers may rely on single-attempt behavior.
    """

    counter = 0

    while True:
        counter += 1

        X, y, beta, active, sigma = instance(n=n, 
                                             p=p, 
                                             s=s, 
                                             sigma=sigma, 
                                             rho=rho, 
                                             snr=snr)

        z = (y > 0)
        X = np.hstack([np.ones((n,1)), X])

        # shift true indices by one for the new intercept column, and
        # count the intercept itself as active
        active = np.array(active)
        active += 1
        active = [0] + list(active)

        # intercept unpenalized; every other feature gets weight 1.2
        L = lasso.logistic(X, z, [0]*1 + [1.2]*p)
        L.fit()
        S = L.summary('onesided')

        # bug fix: original tested `issubset(...) > 0`; the boolean
        # itself is the condition (True > 0 happened to work)
        if set(active).issubset(L.active):
            return [pv for pv, var in zip(S['pval'], S['variable']) if var not in active]
        return []
def test_independence_null_mcmc(n=100, p=40, s=4, rho=0.5, snr=5, 
                                ndraw=None, burnin=2000,
                                nsim=None,
                                nstep=200,
                                method='serial'):
    """Collect MCMC null ranks for steps taken after full screening.

    Runs forward stepwise; once the true support is contained in the
    selected variables, each of the next 4 steps contributes one MCMC
    rank test of the most recently added (presumably null) variable.
    Returns the collected ranks as a tuple (possibly empty).

    NOTE(review): ``ndraw``, ``nsim`` and ``method`` are accepted but
    unused; the test method is hard-coded to "serial" below.
    """

    X, y, beta, active, sigma = instance(n=n, p=p, snr=snr, rho=rho, s=s)
    FS = forward_step(X, y, covariance=sigma**2 * np.identity(n))

    # number of post-screening steps to test
    extra_steps = 4
    completed = False

    null_ranks = []
    for i in range(min(n, p)):
        FS.next()   # Python-2 style iterator step: add one variable

        if completed and extra_steps > 0:
            # most recently added variable should be null after screening
            null_rank = FS.mcmc_test(i+1, variable=FS.variables[-1], 
                                     nstep=nstep,
                                     burnin=burnin,
                                     method="serial")
            null_ranks.append(int(null_rank))

        if extra_steps <= 0:
            break

        if set(active).issubset(FS.variables):
            extra_steps -= 1
            completed = True

    return tuple(null_ranks)
def test_full_pvals(n=100, p=40, rho=0.3, snr=4):
    """Forward stepwise: selected, saturated, and nominal p-values per step.

    Returns ``(X, y, beta, active, sigma, pval_array, completion_index)``
    where ``pval_array`` rows are (variable, selected_pval,
    saturated_pval, nominal_pval) and ``completion_index`` is the first
    step at which the true support was fully screened.

    Bug fix: ``completion_index`` is now initialized to None so the final
    return no longer raises NameError when screening never completes.
    """
    X, y, beta, active, sigma = instance(n=n, p=p, snr=snr, rho=rho)
    FS = forward_stepwise(X, y, covariance=sigma**2 * np.identity(n))

    from scipy.stats import norm as ndist
    pval = []
    completed_yet = False
    completion_index = None  # was unbound if the active set was never screened
    for i in range(min(n, p)):
        FS.next()
        var_select, pval_select = FS.model_pivots(i+1, alternative='twosided',
                                                  which_var=[FS.variables[-1]],
                                                  saturated=False,
                                                  burnin=2000,
                                                  ndraw=8000)[0]
        pval_saturated = FS.model_pivots(i+1, alternative='twosided',
                                         which_var=[FS.variables[-1]],
                                         saturated=True)[0][1]

        # now, nominal ones: z-score of the last selected coefficient

        LSfunc = np.linalg.pinv(FS.X[:,FS.variables])
        Z = np.dot(LSfunc[-1], FS.Y) / (np.linalg.norm(LSfunc[-1]) * sigma)
        pval_nominal = 2 * ndist.sf(np.fabs(Z))
        pval.append((var_select, pval_select, pval_saturated, pval_nominal))

        # record the first step at which every true variable has entered
        if set(active).issubset(np.array(pval)[:,0]) and not completed_yet:
            completed_yet = True
            completion_index = i + 1

    return X, y, beta, active, sigma, np.array(pval), completion_index
def test_data_carving(n=100,
                      p=200,
                      s=7,
                      rho=0.3,
                      snr=7.,
                      split_frac=0.8,
                      lam_frac=1.,
                      ndraw=8000,
                      burnin=2000, 
                      df=np.inf,
                      coverage=0.90,
                      sigma=3,
                      fit_args={'min_its':120, 'tol':1.e-12},
                      compute_intervals=True):
    """Data carving after a lasso on a random data split.

    Repeats until a first-stage lasso screens all ``s`` true variables
    (indices ``0..s-1``), then runs ``data_carving`` with splitting and
    returns null/alternative carving and splitting p-values, the number
    of attempts, and interval coverage indicators.

    NOTE(review): ``fit_args`` uses a mutable dict default, shared across
    calls — safe only if ``split_model``/``data_carving`` never mutate it.
    """

    # number of instances drawn before screening succeeded
    counter = 0

    while True:
        counter += 1
        X, y, beta, active, sigma = instance(n=n, 
                                             p=p, 
                                             s=s, 
                                             sigma=sigma, 
                                             rho=rho, 
                                             snr=snr, 
                                             df=df)
        mu = np.dot(X, beta)
        # first-stage lasso on a random subsample of fraction split_frac
        L, stage_one = split_model(y, 
                                   X, 
                                   lam_frac=lam_frac,
                                   split_frac=split_frac,
                                   fit_args=fit_args)[:2]

        print L.active
        if set(range(s)).issubset(L.active):
            results, L = data_carving(y, X, lam_frac=lam_frac, 
                                      stage_one=stage_one,
                                      splitting=True, 
                                      ndraw=ndraw,
                                      burnin=burnin,
                                      coverage=coverage,
                                      fit_args=fit_args,
                                      compute_intervals=compute_intervals)

            # positions 1 and 3 appear to be carving and splitting
            # p-values — TODO confirm against data_carving's return
            carve = [r[1] for r in results]
            split = [r[3] for r in results]

            # projected "truth" on the selected design
            Xa = X[:,L.active]
            truth = np.dot(np.linalg.pinv(Xa), mu) 

            split_coverage = []
            carve_coverage = []
            for result, t in zip(results, truth):
                # ci / si: carving and splitting confidence intervals
                _, _, ci, _, si = result
                carve_coverage.append((ci[0] < t) * (t < ci[1]))
                split_coverage.append((si[0] < t) * (t < si[1]))

            # relies on the true variables 0..s-1 being the first s active
            return carve[s:], split[s:], carve[:s], split[:s], counter, carve_coverage, split_coverage
def _check_skinny_fat(X, Y):
    # Compare the 'fat' and 'skinny' sqrt-lasso solvers on one (X, Y).
    n, p = X.shape
    lam = choose_lambda(X)
    # constructor smoke tests (objects intentionally unused otherwise)
    sqlasso_objective(X, Y)
    sqlasso_objective_skinny(X, Y)
    soln1 = solve_sqrt_lasso_fat(X, Y, weights=np.ones(p) * lam, solve_args={'min_its':500})[0]
    soln2 = solve_sqrt_lasso_skinny(X, Y, weights=np.ones(p) * lam, solve_args={'min_its':500})[0]
    np.testing.assert_allclose(soln1, soln2, rtol=1.e-3)

def test_skinny_fat():
    """The fat and skinny sqrt-lasso formulations must give the same
    solution, on a default instance and on a smaller p=50 instance.

    (Refactor: the two verbatim-duplicated checks were extracted into
    ``_check_skinny_fat``.)
    """
    X, Y = instance()[:2]
    _check_skinny_fat(X, Y)

    X, Y = instance(p=50)[:2]
    _check_skinny_fat(X, Y)
def _check_skinny_fat_SQ(X, Y):
    # Compare the SQ module's fat and skinny sqrt-lasso solvers on (X, Y).
    n, p = X.shape
    lam = SQ.choose_lambda(X)
    # constructor smoke tests (objects intentionally unused otherwise)
    SQ.sqlasso_objective(X, Y)
    SQ.sqlasso_objective_skinny(X, Y)
    soln1 = SQ.solve_sqrt_lasso_fat(X, Y, min_its=500, weights=np.ones(p) * lam)
    soln2 = SQ.solve_sqrt_lasso_skinny(X, Y, min_its=500, weights=np.ones(p) * lam)
    np.testing.assert_almost_equal(soln1, soln2)

def test_skinny_fat():
    """SQ-module variant: fat and skinny sqrt-lasso solutions must agree
    on a default instance and on a p=50 instance.

    (Refactor: the two verbatim-duplicated checks were extracted into
    ``_check_skinny_fat_SQ``.)
    """
    X, Y = instance()[:2]
    _check_skinny_fat_SQ(X, Y)

    X, Y = instance(p=50)[:2]
    _check_skinny_fat_SQ(X, Y)
# Example #12
def test_intervals(n=100, p=20, s=5):
    """Smoke test: fit a Gaussian lasso and touch its post-selection API.

    No assertions — the test passes if nothing raises.

    Bug fix: removed the unused locals ``t = []`` and the unused binding
    of ``summary``'s result.
    """
    X, y, beta, active, sigma = instance(n=n, p=p, s=s)
    las = lasso.gaussian(X, y, 4., sigma=sigma)
    las.fit()

    # smoke test: these should run without raising

    las.soln
    las.constraints
    las.summary(compute_intervals=True)
    nominal_intervals(las)
# Example #13
def test_adding_quadratic_lasso():
    """Check lasso KKT conditions with an added quadratic / linear term.

    At the solution, the active-set gradient of the smooth part (data
    fit plus the extra term) must equal -lambda times the sign of the
    active coefficients; verified for a full quadratic Q and for a
    purely linear term.
    """
    X, y, beta, active, sigma = instance(n=300, p=200)

    # full quadratic term
    Q = rr.identity_quadratic(0.01, 0, np.random.standard_normal(X.shape[1]), 0)
    L1 = lasso.gaussian(X, y, 20, quadratic=Q)
    beta1 = L1.fit(solve_args={'min_its':500, 'tol':1.e-12})
    residual_grad1 = X[:,L1.active].T.dot(X.dot(beta1) - y)
    G1 = residual_grad1 + Q.objective(beta1,'grad')[L1.active]
    np.testing.assert_allclose(G1 * np.sign(beta1[L1.active]), -20)

    # purely linear term (zero quadratic coefficient)
    lin = rr.identity_quadratic(0.0, 0, np.random.standard_normal(X.shape[1]), 0)
    L2 = lasso.gaussian(X, y, 20, quadratic=lin)
    beta2 = L2.fit(solve_args={'min_its':500, 'tol':1.e-12})
    residual_grad2 = X[:,L2.active].T.dot(X.dot(beta2) - y)
    G2 = residual_grad2 + lin.objective(beta2,'grad')[L2.active]
    np.testing.assert_allclose(G2 * np.sign(beta2[L2.active]), -20)
# Example #14
def test_intervals(n=100, p=20, s=5):
    """Smoke test of the lasso interval machinery.

    Fits a Gaussian lasso, touches the post-selection attributes, and
    returns a one-element list containing (true coefficient, lower,
    upper) triples for the selected intervals.
    """
    t = []
    X, y, beta, active, sigma = instance(n=n, p=p, s=s)
    las = lasso.gaussian(X, y, 4., sigma=sigma)
    las.fit()

    # attribute access doubles as a smoke test

    las.soln
    las.constraints
    las.summary()
    intervals = las.intervals
    nominal_intervals(las)

    records = []
    for idx, lower, upper in intervals:
        records.append((beta[idx], lower, upper))
    t.append(records)
    return t
def test_intervals(n=100, p=20, s=5):
    """Smoke test of lasso intervals with an explicit noise level.

    Fits ``lasso.gaussian(X, y, 4., .25)``, touches post-selection
    attributes, and returns a one-element list of (true coefficient,
    lower, upper) triples.
    """
    X, y, beta = instance(n=n, p=p, s=s)[:3]
    las = lasso.gaussian(X, y, 4., .25)
    las.fit()

    # attribute access doubles as a smoke test

    las.soln
    las.constraints
    las.active_pvalues
    intervals = las.intervals
    nominal_intervals(las)

    records = [(beta[idx], lower, upper) for idx, lower, upper in intervals]
    return [records]
# Example #16
def test_gaussian_sandwich_pvals(n=100,
                                 p=200,
                                 s=20,
                                 sigma=10,
                                 rho=0.3,
                                 snr=6.):
    """Compare lasso p-values under parametric vs. sandwich covariance
    estimators on deliberately heteroscedastic data.

    Repeats until the parametric-covariance lasso screens all true
    variables, then returns ``(P_P, P_S)``: null-variable p-values under
    the parametric and sandwich estimators respectively.
    """

    # number of instances drawn before screening succeeded
    counter = 0

    while True:
        counter += 1
        X, y, beta, active, sigma = instance(n=n, 
                                             p=p, 
                                             s=s, 
                                             sigma=sigma, 
                                             rho=rho, 
                                             snr=snr)

        # inject noise whose scale depends on the last few features,
        # breaking the homoscedasticity the parametric estimator assumes
        heteroscedastic_error = sigma * np.random.standard_normal(n) * (np.fabs(X[:,-1]) + 0.5)**2
        heteroscedastic_error += sigma * np.random.standard_normal(n) * (np.fabs(X[:,-2]) + 0.2)**2
        heteroscedastic_error += sigma * np.random.standard_normal(n) * (np.fabs(X[:,-3]) + 0.5)**2
        y += heteroscedastic_error

        # two different estimators of variance
        sandwich = gaussian_sandwich_estimator(X, y, B=1000)
        parametric = gaussian_parametric_estimator(X, y, sigma=None)

        # make sure things work with some unpenalized columns

        feature_weights = np.ones(p) * 3 * sigma
        feature_weights[10:12] = 0
        L_P = lasso.gaussian(X, y, feature_weights, covariance_estimator=parametric)
        L_P.fit()

        if set(active).issubset(L_P.active):

            # null p-values under the parametric covariance estimator
            S = L_P.summary('twosided')
            P_P = [p for p, v in zip(S['pval'], S['variable']) if v not in active]
        
            # refit with the sandwich estimator on the same data
            L_S = lasso.gaussian(X, y, feature_weights, covariance_estimator=sandwich)
            L_S.fit()

            S = L_S.summary('twosided')
            P_S = [p for p, v in zip(S['pval'], S['variable']) if v not in active]

            return P_P, P_S
def test_BIC(do_sample=True, ndraw=8000, burnin=2000, nsim=None,
             force=False):
    """BIC-stopped forward stepwise p-values for the selected null variables.

    Returns the last entry of each model pivot for variables selected
    beyond the true support — sampled (selected-model) pivots when
    ``do_sample`` is True, saturated pivots otherwise.  Returns [] when
    screening failed and ``force`` is False.  ``nsim`` is unused.
    """
    X, Y, beta, active, sigma = instance()
    n, p = X.shape
    FS = info_crit_stop(Y, X, sigma, cost=np.log(n))
    final_model = len(FS.variables)

    true_set = set(list(active))
    if not (true_set.issubset(FS.variables) or force):
        return []

    # variables selected beyond the true support
    null_vars = [v for v in FS.variables if v not in true_set]

    if do_sample:
        pivots = FS.model_pivots(final_model, saturated=False,
                                 burnin=burnin, ndraw=ndraw,
                                 which_var=null_vars)
    else:
        pivots = FS.model_pivots(final_model, which_var=null_vars)
    return [piv[-1] for piv in pivots]
def test_tilting(nsim=100):
    """Monte Carlo check of tilted vs. untilted Gibbs tests (Python 2).

    For each simulation: under the null (pure-noise response) compare a
    tilted and an untilted Gibbs p-value — they should agree and look
    uniform — and track coverage of the equal-tailed interval at 0;
    under an alternative, track coverage of the interval at the true
    target when the right variable is selected.  Produces scatter/ECDF
    plots at the end; returns nothing.
    """

    P = []
    covered0 = 0     # null-interval coverage count
    coveredA = 0     # alternative-interval coverage count
    screen = 0       # times the truly strong variable was selected

    for i in range(nsim):
        X, Y, beta, active, sigma = instance(n=20, p=30)

        # pure-noise response: every p-value below is a null p-value
        Y0 = np.random.standard_normal(X.shape[0]) * sigma

        # null pvalues and intervals

        cone, pvalue, idx, sign = selected_covtest(X, Y0, sigma=sigma)
        eta = X[:,idx] * sign
        p1, _, _, fam = gibbs_test(cone, Y0, eta, 
                                   ndraw=50000,
                                   burnin=10000,
                                   alternative='twosided',
                                   sigma_known=True,
                                   tilt=eta,
                                   UMPU=False)

        observed_value = (Y0 * eta).sum()
        lower_lim, upper_lim = fam.equal_tailed_interval(observed_value)
        # rescale interval back to the eta' mu scale
        lower_lim_final = np.dot(eta, np.dot(cone.covariance, eta)) * lower_lim
        upper_lim_final = np.dot(eta, np.dot(cone.covariance, eta)) * upper_lim
        covered0 += (lower_lim_final < 0) * (upper_lim_final > 0)
        print covered0 / (i + 1.), 'coverage0'

        # compare to no tilting

        p2 = gibbs_test(cone, Y0, X[:,idx] * sign,
                        ndraw=50000,
                        burnin=10000,
                        alternative='twosided',
                        sigma_known=True,
                        tilt=None,
                        UMPU=False)[0]
        print p2, 'huh'
        P.append((p1, p2))
        Pa = np.array(P)

        # p1 and p2 should be very close, so have high correlation
        print np.corrcoef(Pa.T)[0,1], 'correlation'

        # they should also look uniform -- mean should be about 0.5, sd about 0.29

        print np.mean(Pa, 0), 'mean of nulls'
        print np.std(Pa, 0), 'sd of nulls'

        # alternative intervals

        # signal along the first feature
        mu = 3 * X[:,0] * sigma
        YA = np.random.standard_normal(X.shape[0]) * sigma + mu 

        cone, pvalue, idx, sign = selected_covtest(X, YA, sigma=sigma)
        # NOTE(review): tilt=eta reuses eta from the null phase above —
        # presumably intentional, but worth confirming
        _, _, _, fam = gibbs_test(cone, YA, X[:,idx] * sign,
                                  ndraw=15000,
                                  burnin=10000,
                                  alternative='greater',
                                  sigma_known=True,
                                  tilt=eta)

        if idx == 0:
            # covtest picked the truly strong variable
            screen += 1

            eta = X[:,0] * sign
            observed_value = (YA * eta).sum()
            target = (eta * mu).sum()
            lower_lim, upper_lim = fam.equal_tailed_interval(observed_value)
            lower_lim_final = np.dot(eta, np.dot(cone.covariance, eta)) * lower_lim
            upper_lim_final = np.dot(eta, np.dot(cone.covariance, eta)) * upper_lim
            print lower_lim_final, upper_lim_final, target
            coveredA += (lower_lim_final < target) * (upper_lim_final > target)
            print coveredA / (screen * 1.), 'coverageA'

        print screen / (i + 1.), 'screening'

    # diagnostic plots: p1 vs p2 scatter, then ECDFs if statsmodels exists
    plt.figure()
    plt.scatter(Pa[:,0], Pa[:,1])

    try:
        import statsmodels.api as sm
        plt.figure()
        G = np.linspace(0, 1, 101)
        plt.plot(G, sm.distributions.ECDF(Pa[:,0])(G))
        plt.plot(G, sm.distributions.ECDF(Pa[:,1])(G))
    except ImportError: # no statsmodels
        pass
def test_tilting(nsim=100, ndraw=50000, burnin=10000):
    """Monte Carlo check of tilted vs. untilted Gibbs tests (Python 3
    variant; draw counts parameterized, no plots).

    Same scheme as the other ``test_tilting``: under the null, tilted
    and untilted Gibbs p-values should agree and look uniform, and the
    equal-tailed interval should cover 0; under an alternative, the
    interval should cover the true target when the right variable is
    selected.  Prints running diagnostics; returns nothing.
    """

    P = []
    covered0 = 0     # null-interval coverage count
    coveredA = 0     # alternative-interval coverage count
    screen = 0       # times the truly strong variable was selected

    for i in range(nsim):
        X, Y, beta, active, sigma = instance(n=20, p=30)

        # pure-noise response: every p-value below is a null p-value
        Y0 = np.random.standard_normal(X.shape[0]) * sigma

        # null pvalues and intervals

        cone, pvalue, idx, sign = selected_covtest(X, Y0, sigma=sigma)
        eta = X[:,idx] * sign
        p1, _, _, fam = gibbs_test(cone, Y0, eta, 
                                   ndraw=ndraw,
                                   burnin=burnin,
                                   alternative='twosided',
                                   sigma_known=True,
                                   tilt=eta,
                                   UMPU=False)

        observed_value = (Y0 * eta).sum()
        lower_lim, upper_lim = fam.equal_tailed_interval(observed_value)
        # rescale interval back to the eta' mu scale
        lower_lim_final = np.dot(eta, np.dot(cone.covariance, eta)) * lower_lim
        upper_lim_final = np.dot(eta, np.dot(cone.covariance, eta)) * upper_lim
        covered0 += (lower_lim_final < 0) * (upper_lim_final > 0)
        print(covered0 / (i + 1.), 'coverage0')

        # compare to no tilting

        p2 = gibbs_test(cone, Y0, X[:,idx] * sign,
                        ndraw=ndraw,
                        burnin=burnin,
                        alternative='twosided',
                        sigma_known=True,
                        tilt=None,
                        UMPU=False)[0]
        print(p2, 'huh')
        P.append((p1, p2))
        Pa = np.array(P)

        # p1 and p2 should be very close, so have high correlation
        print(np.corrcoef(Pa.T)[0,1], 'correlation')

        # they should also look uniform -- mean should be about 0.5, sd about 0.29

        print(np.mean(Pa, 0), 'mean of nulls')
        print(np.std(Pa, 0), 'sd of nulls')

        # alternative intervals

        # signal along the first feature
        mu = 3 * X[:,0] * sigma
        YA = np.random.standard_normal(X.shape[0]) * sigma + mu 

        cone, pvalue, idx, sign = selected_covtest(X, YA, sigma=sigma)
        # NOTE(review): tilt=eta reuses eta from the null phase above —
        # presumably intentional, but worth confirming
        _, _, _, fam = gibbs_test(cone, YA, X[:,idx] * sign,
                                  ndraw=ndraw,
                                  burnin=burnin,
                                  alternative='greater',
                                  sigma_known=True,
                                  tilt=eta)

        if idx == 0:
            # covtest picked the truly strong variable
            screen += 1

            eta = X[:,0] * sign
            observed_value = (YA * eta).sum()
            target = (eta * mu).sum()
            lower_lim, upper_lim = fam.equal_tailed_interval(observed_value)
            lower_lim_final = np.dot(eta, np.dot(cone.covariance, eta)) * lower_lim
            upper_lim_final = np.dot(eta, np.dot(cone.covariance, eta)) * upper_lim
            print(lower_lim_final, upper_lim_final, target)
            coveredA += (lower_lim_final < target) * (upper_lim_final > target)
            print(coveredA / (screen * 1.), 'coverageA')

        print(screen / (i + 1.), 'screening')
# Example #20
def test_data_carving_poisson(n=200,
                              p=300,
                              s=5,
                              sigma=5,
                              rho=0.3,
                              snr=9.,
                              split_frac=0.8,
                              lam_frac=1.2,
                              ndraw=8000,
                              burnin=2000, 
                              df=np.inf,
                              coverage=0.90,
                              compute_intervals=True,
                              nsim=None,
                              use_full_cov=True):
    """Data carving vs. data splitting for a Poisson lasso with intercept.

    Draws one instance, replaces the response with Poisson(10) counts,
    prepends an unpenalized intercept (so effectively s=1), and compares
    carving/splitting hypothesis-test p-values on the active set.
    Returns a list with one result tuple: (null carve pvals, null split
    pvals, true carve pvals, true split pvals, counter, carve coverage,
    split coverage, TP, FP) — the p-value slots are None on screening
    failure.  ``lam_frac``, ``coverage``, ``compute_intervals`` and
    ``nsim`` are unused here.
    """
    
    counter = 0

    return_value = []

    while True:
        counter += 1
        X, y, beta, active, sigma = instance(n=n, 
                                             p=p, 
                                             s=s, 
                                             sigma=sigma, 
                                             rho=rho, 
                                             snr=snr, 
                                             df=df)
        # replace the Gaussian response with Poisson counts and add an
        # intercept column; only the intercept is "truly active" (s = 1)
        X = np.hstack([np.ones((n,1)), X])
        y = np.random.poisson(10, size=y.shape)
        s = 1

        # random first-stage subsample
        idx = np.arange(n)
        np.random.shuffle(idx)
        stage_one = idx[:int(n*split_frac)]
        n1 = len(stage_one)

        # fixed penalty; intercept unpenalized
        lam_theor = 6. * np.ones(p+1)
        lam_theor[0] = 0.
        DC = data_carving.poisson(X, y, feature_weights=lam_theor,
                                  stage_one=stage_one)

        DC.fit()

        # splitting needs the held-out stage to have more rows than
        # selected variables
        if len(DC.active) < n - int(n*split_frac):
            DS = data_splitting.poisson(X, y, feature_weights=lam_theor,
                                         stage_one=stage_one)
            DS.fit(use_full_cov=use_full_cov)
            data_split = True
        else:
            print('not enough data for data splitting second stage')
            print(DC.active)
            data_split = False

        print(DC.active)
        if set(range(s)).issubset(DC.active):
            carve = []
            split = []
            for var in DC.active:
                carve.append(DC.hypothesis_test(var, burnin=burnin, ndraw=ndraw))
                if data_split:
                    split.append(DS.hypothesis_test(var))
                else:
                    # placeholder uniform p-value when splitting is impossible
                    split.append(np.random.sample())

            Xa = X[:,DC.active]

            # coverage not computed in the Poisson case
            split_coverage = np.nan
            carve_coverage = np.nan

            TP = s
            FP = DC.active.shape[0] - TP
            v = (carve[s:], split[s:], carve[:s], split[:s], counter, carve_coverage, split_coverage, TP, FP)
            return_value.append(v)
            break
        else:
            TP = len(set(DC.active).intersection(range(s)))
            FP = DC.active.shape[0] - TP
            v = (None, None, None, None, counter, np.nan, np.nan, TP, FP)
            return_value.append(v)
        
    return return_value
# Example #21
def test_data_carving_sqrt_lasso(n=100,
                                 p=200,
                                 s=7,
                                 sigma=5,
                                 rho=0.3,
                                 snr=7.,
                                 split_frac=0.9,
                                 lam_frac=1.2,
                                 ndraw=8000,
                                 burnin=2000, 
                                 df=np.inf,
                                 coverage=0.90,
                                 compute_intervals=True,
                                 nsim=None):
    """Data carving vs. data splitting for the square-root lasso.

    Draws one instance, fits sqrt-lasso carving and splitting objects on
    a random first-stage subsample, and returns a list with one result
    tuple: (null carve pvals, null split pvals, true carve pvals, true
    split pvals, counter, carve coverage, split coverage, TP, FP) — the
    p-value slots are None on screening failure.  ``coverage``,
    ``compute_intervals`` and ``nsim`` are unused here.
    """
    
    counter = 0

    return_value = []

    while True:
        counter += 1
        X, y, beta, active, sigma = instance(n=n, 
                                             p=p, 
                                             s=s, 
                                             sigma=sigma, 
                                             rho=rho, 
                                             snr=snr, 
                                             df=df)
        mu = np.dot(X, beta)

        # random first-stage subsample
        idx = np.arange(n)
        np.random.shuffle(idx)
        stage_one = idx[:int(n*split_frac)]
        n1 = len(stage_one)

        # theoretical lambda from the first-stage design, sqrt-lasso scale
        lam_theor = lam_frac * np.mean(np.fabs(np.dot(X[stage_one].T, np.random.standard_normal((n1, 5000)))).max(0)) / np.sqrt(n1)
        DC = data_carving.sqrt_lasso(X, y, feature_weights=lam_theor,
                                     stage_one=stage_one)

        DC.fit()
        DS = data_splitting.sqrt_lasso(X, y, feature_weights=lam_theor,
                                       stage_one=stage_one)
        DS.fit()
                
        if set(range(s)).issubset(DC.active):
            carve = []
            split = []
            for var in DC.active:
                carve.append(DC.hypothesis_test(var, burnin=burnin, ndraw=ndraw))
                split.append(DS.hypothesis_test(var))

            # projected "truth" on the selected design (computed but unused)
            Xa = X[:,DC.active]
            truth = np.dot(np.linalg.pinv(Xa), mu) 

            # coverage not computed in this variant
            split_coverage = np.nan
            carve_coverage = np.nan

            TP = s
            FP = DC.active.shape[0] - TP
            v = (carve[s:], split[s:], carve[:s], split[:s], counter, carve_coverage, split_coverage, TP, FP)
            return_value.append(v)
            break
        else:
            TP = len(set(DC.active).intersection(range(s)))
            FP = DC.active.shape[0] - TP
            v = (None, None, None, None, counter, np.nan, np.nan, TP, FP)
            return_value.append(v)

    return return_value
# Example #22
def test_data_carving_gaussian(n=100,
                               p=200,
                               s=7,
                               sigma=5,
                               rho=0.3,
                               snr=7.,
                               split_frac=0.8,
                               lam_frac=2.,
                               ndraw=8000,
                               burnin=2000, 
                               df=np.inf,
                               coverage=0.90,
                               compute_intervals=True,
                               nsim=None,
                               use_full_cov=True):
    """Data carving vs. data splitting for the Gaussian lasso.

    Draws one instance, fits carving and (when feasible) splitting
    objects on a random first-stage subsample, and returns a list with
    one result tuple: (null carve pvals, null split pvals, true carve
    pvals, true split pvals, counter, carve coverage, split coverage,
    TP, FP) — p-value slots are None on screening failure.  ``coverage``,
    ``compute_intervals`` and ``nsim`` are unused here.
    """

    counter = 0

    return_value = []

    while True:
        counter += 1
        X, y, beta, active, sigma = instance(n=n, 
                                             p=p, 
                                             s=s, 
                                             sigma=sigma, 
                                             rho=rho, 
                                             snr=snr, 
                                             df=df)
        mu = np.dot(X, beta)

        # random first-stage subsample
        idx = np.arange(n)
        np.random.shuffle(idx)
        stage_one = idx[:int(n*split_frac)]

        # theoretical lambda: expected max noise correlation, scaled by sigma
        lam_theor = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 5000)))).max(0)) * sigma
        DC = data_carving.gaussian(X, y, feature_weights=lam_theor,
                                   sigma=sigma,
                                   stage_one=stage_one)
        DC.fit()

        # splitting needs the held-out stage to have more rows than
        # selected variables
        if len(DC.active) < n - int(n*split_frac):
            DS = data_splitting.gaussian(X, y, feature_weights=lam_theor,
                                         sigma=sigma,
                                         stage_one=stage_one)
            # NOTE(review): three fits in a row — the first two look like
            # smoke tests of both covariance paths before the real fit;
            # confirm this is intentional
            DS.fit(use_full_cov=True)
            DS.fit(use_full_cov=False)
            DS.fit(use_full_cov=use_full_cov)
            data_split = True
        else:
            print('not enough data for second stage data splitting')
            print(DC.active)
            data_split = False

                
        if set(range(s)).issubset(DC.active):
            carve = []
            split = []
            for var in DC.active:
                carve.append(DC.hypothesis_test(var, burnin=burnin, ndraw=ndraw))
                if data_split:
                    split.append(DS.hypothesis_test(var))
                else:
                    split.append(np.random.sample()) # appropriate p-value if data splitting can't estimate 2nd stage

            # projected "truth" on the selected design (computed but unused)
            Xa = X[:,DC.active]
            truth = np.dot(np.linalg.pinv(Xa), mu) 

            # coverage not computed in this variant
            split_coverage = np.nan
            carve_coverage = np.nan

            TP = s
            FP = DC.active.shape[0] - TP
            v = (carve[s:], split[s:], carve[:s], split[:s], counter, carve_coverage, split_coverage, TP, FP)
            return_value.append(v)
            break
        else:
            TP = len(set(DC.active).intersection(range(s)))
            FP = DC.active.shape[0] - TP
            v = (None, None, None, None, counter, np.nan, np.nan, TP, FP)
            return_value.append(v)

    return return_value
# Example #23
def test_data_carving(n=100,
                      p=200,
                      s=7,
                      sigma=5,
                      rho=0.3,
                      snr=7.,
                      split_frac=0.9,
                      lam_frac=2.,
                      ndraw=8000,
                      burnin=2000, 
                      df=np.inf,
                      coverage=0.90,
                      compute_intervals=True):
    """Simulation check of data carving vs. data splitting after the LASSO.

    Repeatedly draws a Gaussian regression instance with `s` true signal
    variables until the first-stage LASSO (fit on a `split_frac` fraction of
    the data) screens all of them; then runs `data_carving` with
    `splitting=True` to obtain both carving and splitting p-values and
    confidence intervals.

    Parameters mirror `instance` / `data_carving`: `n` observations, `p`
    features, `s` nonzero coefficients, noise level `sigma`, design
    correlation `rho`, signal strength `snr`, penalty scaling `lam_frac`,
    sampler sizes `ndraw`/`burnin`, error df `df`, nominal interval
    `coverage`, and whether to `compute_intervals`.

    Returns a list of tuples
    (null carve p-values, null split p-values, alternative carve p-values,
    alternative split p-values, attempt counter, carve coverage indicators,
    split coverage indicators, TP, FP); failed screening attempts contribute
    a (None, None, None, None, counter, nan, nan, TP, FP) entry.
    """

    counter = 0

    return_value = []

    while True:
        counter += 1
        X, y, beta, active, sigma = instance(n=n, 
                                             p=p, 
                                             s=s, 
                                             sigma=sigma, 
                                             rho=rho, 
                                             snr=snr, 
                                             df=df)
        mu = np.dot(X, beta)
        # First-stage model selection on a random `split_frac` subset.
        L, stage_one = split_model(y, X, 
                        sigma=sigma,
                        lam_frac=lam_frac,
                        split_frac=split_frac)[:2]

        if set(range(s)).issubset(L.active):
            # Screening succeeded: rerun data carving until the carving
            # stage also retains every true variable.
            while True:
                results, L = data_carving(y, X, lam_frac=lam_frac, 
                                          sigma=sigma,
                                          stage_one=stage_one,
                                          splitting=True, 
                                          ndraw=ndraw,
                                          burnin=burnin,
                                          coverage=coverage,
                                          compute_intervals=compute_intervals)
                if set(range(s)).issubset(L.active):
                    # NOTE: fixed Python 2 print statements to print() calls,
                    # consistent with the rest of this file.
                    print("succeed")
                    break
                print("failed at least once")

            # result rows: (_, carve_pval, carve_ci, split_pval, split_ci)
            carve = [r[1] for r in results]
            split = [r[3] for r in results]

            # Population targets: projection of the mean onto the selected model.
            Xa = X[:,L.active]
            truth = np.dot(np.linalg.pinv(Xa), mu) 

            split_coverage = []
            carve_coverage = []
            for result, t in zip(results, truth):
                _, _, ci, _, si = result
                carve_coverage.append((ci[0] < t) * (t < ci[1]))
                split_coverage.append((si[0] < t) * (t < si[1]))

            # All s signals were screened, so TP = s by construction.
            TP = s
            FP = L.active.shape[0] - TP
            # First s entries of `carve`/`split` are the true variables;
            # the remainder are null p-values.
            v = (carve[s:], split[s:], carve[:s], split[:s], counter, carve_coverage, split_coverage, TP, FP)
            return_value.append(v)
            break
        else:
            # Screening failed: record the attempt and try a fresh instance.
            TP = len(set(L.active).intersection(range(s)))
            FP = L.active.shape[0] - TP
            v = (None, None, None, None, counter, np.nan, np.nan, TP, FP)
            return_value.append(v)
    return return_value