Ejemplo n.º 1
0
def test_skinny_fat():
    """Compare the "fat" and "skinny" sqrt-lasso solvers on two data sets.

    The comparison runs once on the default ``instance()`` data and once on
    data generated with ``p=50``.  In both cases the first element returned
    by each solver must agree to ``rtol=1.e-3``.
    """
    for instance_kwargs in ({}, {'p': 50}):
        X, Y = instance(**instance_kwargs)[:2]
        _, n_features = X.shape
        lam = choose_lambda(X)

        # The objectives are built but not used afterwards; the calls are
        # kept so that exactly the same constructors are exercised.
        sqlasso_objective(X, Y)
        sqlasso_objective_skinny(X, Y)

        fat_soln = solve_sqrt_lasso_fat(
            X, Y,
            weights=np.ones(n_features) * lam,
            solve_args={'min_its': 500})[0]
        skinny_soln = solve_sqrt_lasso_skinny(
            X, Y,
            weights=np.ones(n_features) * lam,
            solve_args={'min_its': 500})[0]

        np.testing.assert_allclose(fat_soln, skinny_soln, rtol=1.e-3)
def test_mcmc_tests(n=100, p=40, s=4, rho=0.3, snr=5, ndraw=None,
                    burnin=2000, nsim=None, nstep=200, method='serial'):
    """Run one "serial" and one "parallel" ``mcmc_test`` on a forward-step path.

    The path is advanced with ``FS.next()``.  Each step on which every index
    in ``active`` is contained in ``FS.variables`` decrements a countdown
    that starts at 4.  On the first step that begins with the countdown at
    zero, two tests are run and the loop stops.

    ``ndraw``, ``nsim`` and ``method`` are accepted but not used; the methods
    passed to ``mcmc_test`` are hard-coded.

    Returns
    -------
    (null_rank, alt_rank) : tuple
        Results of the two ``mcmc_test`` calls, or ``(None, None)`` if the
        countdown never reached zero within ``min(n, p)`` steps.
    """
    X, y, beta, active, sigma = instance(n=n, p=p, snr=snr, rho=rho, s=s)
    noise_cov = sigma**2 * np.identity(n)
    FS = forward_step(X, y, covariance=noise_cov)

    steps_left = 4
    ranks = (None, None)

    for idx in range(min(n, p)):
        FS.next()

        if steps_left <= 0:
            step = idx + 1
            rank_null = FS.mcmc_test(step,
                                     variable=FS.variables[idx - 2],
                                     nstep=nstep,
                                     burnin=burnin,
                                     method="serial")
            rank_alt = FS.mcmc_test(step,
                                    variable=FS.variables[0],
                                    burnin=burnin,
                                    nstep=nstep,
                                    method="parallel")
            ranks = (rank_null, rank_alt)
            break

        if set(active).issubset(FS.variables):
            steps_left -= 1

    return ranks
Ejemplo n.º 3
0
def test_logistic_pvals(n=500, p=200, s=3, sigma=2, rho=0.3, snr=7.):
    """Fit a logistic lasso with an unpenalized intercept and collect p-values.

    A single data set is drawn from ``instance``; the response is
    dichotomized as ``y > 0`` and a column of ones is prepended to ``X``.
    The indices in ``active`` are shifted by one to account for that column,
    and index 0 (the intercept) is added to the set.

    Returns
    -------
    list
        The ``'pval'`` entries of the one-sided summary whose ``'variable'``
        is not in the shifted active set, or ``[]`` when the shifted active
        set is not contained in ``L.active``.
    """
    # The original wrapped this body in ``while True`` with an unconditional
    # ``return`` at the end, so the loop could never run a second time; the
    # dead loop and its unused counter are removed.
    X, y, beta, active, sigma = instance(n=n,
                                         p=p,
                                         s=s,
                                         sigma=sigma,
                                         rho=rho,
                                         snr=snr)

    z = (y > 0)
    X = np.hstack([np.ones((n, 1)), X])

    # Shift indices past the new intercept column, then include column 0.
    active = np.array(active)
    active += 1
    active = [0] + list(active)

    # Weight 0 leaves the intercept unpenalized.
    L = lasso.logistic(X, z, [0] * 1 + [1.2] * p)
    L.fit()
    S = L.summary('onesided')

    if not set(active).issubset(L.active):
        return []

    # Distinct loop names avoid shadowing the parameter ``p``.
    return [
        pvalue for pvalue, variable in zip(S['pval'], S['variable'])
        if variable not in active
    ]
Ejemplo n.º 4
0
def test_independence_null_mcmc(n=100, p=40, s=4, rho=0.5, snr=5,
                                ndraw=None, burnin=2000, nsim=None,
                                nstep=200, method='serial'):
    """Collect ``mcmc_test`` ranks for variables added after recovery.

    The forward-step path is advanced with ``FS.next()``.  Once every index
    in ``active`` is contained in ``FS.variables``, a countdown starting at 4
    is decremented on each step; while it stays positive, the most recently
    added variable (``FS.variables[-1]``) is tested with ``method="serial"``.

    ``ndraw``, ``nsim`` and ``method`` are accepted but not used.

    Returns
    -------
    tuple of int
        The ranks returned by ``mcmc_test``, converted with ``int``.
    """
    X, y, beta, active, sigma = instance(n=n, p=p, snr=snr, rho=rho, s=s)
    noise_cov = sigma**2 * np.identity(n)
    FS = forward_step(X, y, covariance=noise_cov)

    steps_left = 4
    recovered = False
    ranks = []

    for idx in range(min(n, p)):
        FS.next()

        # Only test on steps that come after the active set was recovered.
        if recovered and steps_left > 0:
            rank = FS.mcmc_test(idx + 1,
                                variable=FS.variables[-1],
                                nstep=nstep,
                                burnin=burnin,
                                method="serial")
            ranks.append(int(rank))

        if steps_left <= 0:
            break

        if set(active).issubset(FS.variables):
            steps_left -= 1
            recovered = True

    return tuple(ranks)
def test_independence_null_mcmc(n=100, p=40, s=4, rho=0.5, snr=5,
                                ndraw=None, burnin=2000,
                                nsim=None, nstep=200,
                                method='serial'):
    """Gather serial ``mcmc_test`` ranks on steps following recovery.

    After each ``FS.next()`` the function checks whether all of ``active``
    is contained in ``FS.variables``.  From the first such step onward a
    budget of 4 is decremented per step; on later steps with a positive
    budget the last entry of ``FS.variables`` is tested.

    ``ndraw``, ``nsim`` and ``method`` are accepted but ignored.

    Returns
    -------
    tuple of int
        One ``int(...)``-converted rank per ``mcmc_test`` call.
    """
    X, y, beta, active, sigma = instance(n=n, p=p, snr=snr, rho=rho, s=s)
    FS = forward_step(X, y, covariance=np.identity(n) * sigma**2)

    budget = 4
    found_all = False
    collected = []

    max_steps = min(n, p)
    for k in range(max_steps):
        FS.next()

        if found_all and budget > 0:
            collected.append(int(FS.mcmc_test(k + 1,
                                              variable=FS.variables[-1],
                                              nstep=nstep,
                                              burnin=burnin,
                                              method="serial")))

        if budget <= 0:
            break

        if not set(active).issubset(FS.variables):
            continue

        budget -= 1
        found_all = True

    return tuple(collected)
def test_full_pvals(n=100, p=40, rho=0.3, snr=4):
    """Compute three kinds of p-value at every step of forward stepwise.

    For each of ``min(n, p)`` steps the most recently added variable gets:

    * a p-value from ``model_pivots(..., saturated=False)`` (sampled with
      ``burnin=2000, ndraw=8000``),
    * a p-value from ``model_pivots(..., saturated=True)``,
    * a nominal two-sided normal p-value based on the last row of the
      pseudo-inverse of the selected columns.

    Returns
    -------
    tuple
        ``(X, y, beta, active, sigma, pvals, completion_index)`` where
        ``pvals`` is an array with one row
        ``(variable, selected, saturated, nominal)`` per step and
        ``completion_index`` is the 1-based step at which every index in
        ``active`` had first appeared in the variable column, or ``None``
        if that never happened.
    """
    X, y, beta, active, sigma = instance(n=n, p=p, snr=snr, rho=rho)
    FS = forward_stepwise(X, y, covariance=sigma**2 * np.identity(n))

    from scipy.stats import norm as ndist
    pval = []
    completed_yet = False
    # Initialized so the return below cannot raise UnboundLocalError when
    # the active set is never fully recovered.
    completion_index = None
    for i in range(min(n, p)):
        FS.next()
        var_select, pval_select = FS.model_pivots(i + 1,
                                                  alternative='twosided',
                                                  which_var=[FS.variables[-1]],
                                                  saturated=False,
                                                  burnin=2000,
                                                  ndraw=8000)[0]
        pval_saturated = FS.model_pivots(i + 1,
                                         alternative='twosided',
                                         which_var=[FS.variables[-1]],
                                         saturated=True)[0][1]

        # now, nominal ones

        LSfunc = np.linalg.pinv(FS.X[:, FS.variables])
        Z = np.dot(LSfunc[-1], FS.Y) / (np.linalg.norm(LSfunc[-1]) * sigma)
        pval_nominal = 2 * ndist.sf(np.fabs(Z))
        pval.append((var_select, pval_select, pval_saturated, pval_nominal))

        # Check the cheap flag first so the array is not rebuilt once the
        # completion step has been recorded.
        if not completed_yet and set(active).issubset(np.array(pval)[:, 0]):
            completed_yet = True
            completion_index = i + 1

    return X, y, beta, active, sigma, np.array(pval), completion_index
Ejemplo n.º 7
0
def test_lasso(s=5, n=100, p=50):
    """Build a randomized-lasso MH sampler and return ``pval``'s output.

    The Lagrange parameter is ``sigma * lam_frac`` times the mean, over
    10000 standard-normal draws, of the column-wise maximum of
    ``|X^T noise|``.

    Returns
    -------
    (null, alt) : tuple
        The two values returned by ``pval(sampler, loss_args, X, y, nonzero)``.
    """
    X, y, _, nonzero, sigma = instance(n=n, p=p, random_signs=True,
                                       s=s, sigma=1.)
    lam_frac = 1.

    randomization = laplace(loc=0, scale=1.)
    loss = randomized.gaussian_Xfixed(X, y)

    # This first draw is overwritten below; it is kept so the sequence of
    # random draws is the same as before.
    random_Z = randomization.rvs(p)
    epsilon = 1.

    noise = np.random.standard_normal((n, 10000))
    max_abs_corr = np.fabs(np.dot(X.T, noise)).max(0)
    lam = sigma * lam_frac * np.mean(max_abs_corr)

    random_Z = randomization.rvs(p)
    penalty = randomized.selective_l1norm(p, lagrange=lam)

    sampler1 = randomized.selective_sampler_MH(
        loss, random_Z, epsilon, randomization, penalty)

    loss_args = dict(mean=np.zeros(n), sigma=sigma)
    null, alt = pval(sampler1, loss_args, X, y, nonzero)
    return null, alt
def test_data_carving_IC(n=100,
                         p=200,
                         s=7,
                         sigma=5,
                         rho=0.3,
                         snr=7.,
                         split_frac=0.9,
                         ndraw=5000,
                         burnin=1000,
                         df=np.inf,
                         coverage=0.90,
                         compute_intervals=False):
    """Repeat data carving with an information-criterion stop until success.

    Fresh data are drawn on every attempt and a random subset of
    ``int(n * split_frac)`` rows is used as the first stage.  An attempt
    succeeds when ``range(s)`` is contained in the ``active`` attribute of
    the first-stage ``info_crit_stop`` fit; ``data_carving_IC`` is then run
    on that split.

    Returns
    -------
    tuple
        ``(carve_null, split_null, carve_alt, split_alt, counter,
        carve_coverage, split_coverage)``.  The first four lists hold
        element 1 ("carve") or element 3 ("split") of each result, split by
        whether the matching entry of ``FS.active`` is ``>= s`` or ``< s``.
        ``counter`` is the number of attempts.  The coverage lists record
        whether the projected truth lies strictly inside elements 2 and 4 of
        each result (presumably the carving and splitting intervals).
    """
    counter = 0

    while True:
        counter += 1
        X, y, beta, active, sigma = instance(n=n, p=p, s=s, sigma=sigma,
                                             rho=rho, snr=snr, df=df)
        mu = np.dot(X, beta)

        # Random first-stage subset of the rows.
        n_stage_one = int(n*split_frac)
        shuffled = np.arange(n)
        np.random.shuffle(shuffled)
        stage_one = shuffled[:n_stage_one]

        FS = info_crit_stop(y, X, sigma, cost=np.log(n), subset=stage_one)

        if not set(range(s)).issubset(FS.active):
            continue

        results, FS = data_carving_IC(y, X, sigma,
                                      stage_one=stage_one,
                                      splitting=True,
                                      ndraw=ndraw,
                                      burnin=burnin,
                                      coverage=coverage,
                                      compute_intervals=compute_intervals,
                                      cost=np.log(n))

        carve = [res[1] for res in results]
        split = [res[3] for res in results]

        # Target: projection of the true mean onto all but the last
        # selected variable.
        Xa = X[:, FS.variables[:-1]]
        truth = np.dot(np.linalg.pinv(Xa), mu)

        split_coverage = []
        carve_coverage = []
        for res, target in zip(results, truth):
            _, _, carve_int, _, split_int = res
            carve_coverage.append(
                (carve_int[0] < target) * (target < carve_int[1]))
            split_coverage.append(
                (split_int[0] < target) * (target < split_int[1]))

        null_pos = [j for j, var in enumerate(FS.active) if var >= s]
        alt_pos = [j for j, var in enumerate(FS.active) if var < s]

        return ([carve[j] for j in null_pos],
                [split[j] for j in null_pos],
                [carve[j] for j in alt_pos],
                [split[j] for j in alt_pos],
                counter, carve_coverage, split_coverage)
def test_full_pvals(n=100, p=40, rho=0.3, snr=4):
    """Compute selected, saturated and nominal p-values along the FS path.

    At each of ``min(n, p)`` forward-stepwise steps, the variable added last
    is given a sampled p-value (``saturated=False``, ``burnin=2000``,
    ``ndraw=8000``), a saturated-model p-value, and a nominal two-sided
    normal p-value computed from the last row of the pseudo-inverse of the
    selected design columns.

    Returns
    -------
    tuple
        ``(X, y, beta, active, sigma, pvals, completion_index)``.  ``pvals``
        has one row ``(variable, selected, saturated, nominal)`` per step.
        ``completion_index`` is the 1-based step at which all of ``active``
        had first appeared in the variable column, or ``None`` when the
        active set was never fully recovered.
    """
    X, y, beta, active, sigma = instance(n=n, p=p, snr=snr, rho=rho)
    FS = forward_stepwise(X, y, covariance=sigma**2 * np.identity(n))

    from scipy.stats import norm as ndist
    pval = []
    completed_yet = False
    # Default value: without it the return statement raised
    # UnboundLocalError whenever recovery never happened.
    completion_index = None
    for i in range(min(n, p)):
        FS.next()
        var_select, pval_select = FS.model_pivots(i+1, alternative='twosided',
                                                  which_var=[FS.variables[-1]],
                                                  saturated=False,
                                                  burnin=2000,
                                                  ndraw=8000)[0]
        pval_saturated = FS.model_pivots(i+1, alternative='twosided',
                                         which_var=[FS.variables[-1]],
                                         saturated=True)[0][1]

        # now, nominal ones

        LSfunc = np.linalg.pinv(FS.X[:,FS.variables])
        Z = np.dot(LSfunc[-1], FS.Y) / (np.linalg.norm(LSfunc[-1]) * sigma)
        pval_nominal = 2 * ndist.sf(np.fabs(Z))
        pval.append((var_select, pval_select, pval_saturated, pval_nominal))

        # Test the flag first so the array is only rebuilt while the
        # completion step is still unknown.
        if not completed_yet and set(active).issubset(np.array(pval)[:,0]):
            completed_yet = True
            completion_index = i + 1

    return X, y, beta, active, sigma, np.array(pval), completion_index
Ejemplo n.º 10
0
def test_lasso(s=5, n=100, p=50):
    """Set up a randomized lasso sampler and evaluate ``pval`` on it.

    Laplace randomization with scale 1 is used.  The Lagrange parameter is
    ``sigma * lam_frac * E[max_j |X_j^T eps|]``, with the expectation
    estimated from 10000 standard-normal draws.

    Returns
    -------
    (null, alt) : tuple
        Output of ``pval(sampler1, loss_args, X, y, nonzero)``.
    """
    data = instance(n=n, p=p, random_signs=True, s=s, sigma=1.)
    X, y, _, nonzero, sigma = data
    lam_frac = 1.

    randomization = laplace(loc=0, scale=1.)
    loss = randomized.gaussian_Xfixed(X, y)

    # Draw kept only to leave the order of random draws unchanged; its value
    # is replaced by the second draw below.
    random_Z = randomization.rvs(p)
    epsilon = 1.

    gaussian_draws = np.random.standard_normal((n, 10000))
    column_maxima = np.fabs(np.dot(X.T, gaussian_draws)).max(0)
    lam = sigma * lam_frac * np.mean(column_maxima)

    random_Z = randomization.rvs(p)
    penalty = randomized.selective_l1norm(p, lagrange=lam)

    sampler1 = randomized.selective_sampler_MH(loss, random_Z, epsilon,
                                               randomization, penalty)

    loss_args = {'mean': np.zeros(n), 'sigma': sigma}
    return pval(sampler1, loss_args, X, y, nonzero)
Ejemplo n.º 11
0
def MSE(snr=1, n=100, p=10, s=1):
    """Average the MLE mean-squared error over generated instances.

    For each of ``ninstance`` (currently 1) generated responses, selection
    is run with a fresh standard-normal randomization.  When ``lam`` is
    non-negative the MLE is computed, its MSE against the true coefficients
    on the selected set is accumulated, and the number of selected
    covariates is added to the denominator.

    Returns
    -------
    The accumulated MSE divided by the accumulated number of selected
    covariates, via ``np.true_divide``.  When nothing was selected both
    totals are 0, so the division is 0/0.
    """
    ninstance = 1
    total_mse = 0
    nvalid_instance = 0
    data_instance = instance(n, p, s, snr)
    tau = 1.
    for _ in range(ninstance):
        X, y, true_beta, nonzero, sigma = data_instance.generate_response()
        random_Z = np.random.standard_normal(p)
        lam, epsilon, active, betaE, cube, initial_soln = selection(
            X, y, random_Z)

        # %-formatting prints the same text as the former Python 2 print
        # statements while remaining valid syntax on Python 3.
        print("active set %s" % (np.where(active)[0],))
        if lam < 0:
            print("no active covariates")
        else:
            est = estimation(X, y, active, betaE, cube, epsilon, lam, sigma,
                             tau)
            est.compute_mle_all()

            mse_mle = est.mse_mle(true_beta[active])
            print("MLE %s" % (est.mle,))
            total_mse += mse_mle
            nvalid_instance += np.sum(active)

    return np.true_divide(total_mse, nvalid_instance)
Ejemplo n.º 12
0
def test_gaussian_pvals(n=100, p=500, s=7, sigma=5, rho=0.3, snr=8.):
    """Draw data until the Gaussian lasso recovers ``active``; return p-values.

    Each attempt fits ``lasso.gaussian`` with penalty 20, then refits
    starting from ``L.lasso_solution``.  Odd-numbered attempts use the
    ``'twosided'`` summary and even-numbered ones ``'onesided'``.  The loop
    only ends once ``active`` is contained in ``L.active``.

    Returns
    -------
    list
        ``'pval'`` entries of the summary whose ``'variable'`` is not in
        ``active``.
    """
    attempts = 0

    while True:
        attempts += 1
        X, y, beta, active, sigma = instance(n=n, p=p, s=s, sigma=sigma,
                                             rho=rho, snr=snr)
        L = lasso.gaussian(X, y, 20., sigma=sigma)
        L.fit()
        L.fit(L.lasso_solution)

        alternative = 'twosided' if attempts % 2 == 1 else 'onesided'

        if not set(active).issubset(L.active):
            continue

        S = L.summary(alternative)
        return [pvalue
                for pvalue, variable in zip(S['pval'], S['variable'])
                if variable not in active]
Ejemplo n.º 13
0
def test_mcmc_tests(n=100, p=40, s=4, rho=0.3, snr=5, ndraw=None,
                    burnin=2000,
                    nsim=None,
                    nstep=200,
                    method='serial'):
    """Exercise ``mcmc_test`` in serial and parallel mode on one FS path.

    A budget of 4 is decremented on every step at which all of ``active``
    is contained in ``FS.variables``.  The first step that starts with an
    exhausted budget runs a ``"serial"`` test on ``FS.variables[i-2]`` and
    a ``"parallel"`` test on ``FS.variables[0]``, then stops.

    ``ndraw``, ``nsim`` and ``method`` are part of the signature but are
    not read.

    Returns
    -------
    (null_rank, alt_rank) : tuple
        The two test results; both are ``None`` when no test was run.
    """
    X, y, beta, active, sigma = instance(n=n, p=p, snr=snr, rho=rho, s=s)
    FS = forward_step(X, y, covariance=np.identity(n) * sigma**2)

    budget = 4
    null_rank = None
    alt_rank = None

    for k in range(min(n, p)):
        FS.next()

        if budget > 0:
            if set(active).issubset(FS.variables):
                budget -= 1
            continue

        # Budget exhausted: run both tests once and stop.
        common = dict(nstep=nstep, burnin=burnin)
        null_rank = FS.mcmc_test(k + 1, variable=FS.variables[k - 2],
                                 method="serial", **common)
        alt_rank = FS.mcmc_test(k + 1, variable=FS.variables[0],
                                method="parallel", **common)
        break

    return null_rank, alt_rank
Ejemplo n.º 14
0
def test_sqrt_lasso_pvals(n=100, p=200, s=7, sigma=5, rho=0.3, snr=7.):
    """Draw data until the sqrt-lasso recovers ``active``; return p-values.

    The feature weights are ``0.7 * lam_theor``, with the first three set
    to zero.  ``lam_theor`` is the mean over 1000 standard-normal draws of
    ``max_j |X_j^T eps|``, divided by ``sqrt(n)``.  The summary alternates
    between ``'twosided'`` (odd attempts) and ``'onesided'`` (even attempts).

    Returns
    -------
    list
        ``'pval'`` entries whose ``'variable'`` is not in ``active``.
    """
    attempts = 0

    while True:
        attempts += 1
        X, y, beta, active, sigma = instance(n=n, p=p, s=s, sigma=sigma,
                                             rho=rho, snr=snr)

        gaussian_draws = np.random.standard_normal((n, 1000))
        column_maxima = np.fabs(np.dot(X.T, gaussian_draws)).max(0)
        lam_theor = np.mean(column_maxima) / np.sqrt(n)

        # Constructed but not passed to the fit below.
        Q = rr.identity_quadratic(0.01, 0, np.ones(p), 0)

        weights_with_zeros = 0.7*lam_theor * np.ones(p)
        weights_with_zeros[:3] = 0.

        L = lasso.sqrt_lasso(X, y, weights_with_zeros)
        L.fit()

        alternative = 'onesided' if attempts % 2 == 0 else 'twosided'

        if not set(active).issubset(L.active):
            continue

        S = L.summary(alternative)
        return [pvalue
                for pvalue, variable in zip(S['pval'], S['variable'])
                if variable not in active]
Ejemplo n.º 15
0
def simulate(n=100, p=40, rho=0.3,
             snr=5,
             do_knockoff=False,
             full_results={},
             alpha=0.05,
             s=7,
             random_signs=False,
             maxstep=np.inf,
             compute_maxT_identify=True):
    """Generate one data set, record its settings and delegate to ``run``.

    The values ``n``, ``p``, ``rho``, ``len(active)`` (under key ``'s'``)
    and ``snr`` are appended to the corresponding lists in ``full_results``
    before ``run`` is called.

    NOTE(review): ``full_results`` defaults to a single dict object shared
    by every call that omits the argument, so the settings accumulate
    across such calls.  Confirm whether callers rely on this.

    Returns
    -------
    Whatever ``run(y, X, sigma, active, ...)`` returns.
    """
    X, y, _, active, sigma = instance(n=n, p=p, rho=rho, snr=snr, s=s,
                                      random_signs=random_signs)

    recorded = (('n', n),
                ('p', p),
                ('rho', rho),
                ('s', len(active)),
                ('snr', snr))
    for key, value in recorded:
        full_results.setdefault(key, []).append(value)

    run_options = dict(do_knockoff=do_knockoff,
                       full_results=full_results,
                       maxstep=maxstep,
                       compute_maxT_identify=compute_maxT_identify,
                       alpha=alpha)
    return run(y, X, sigma, active, **run_options)
Ejemplo n.º 16
0
def test_data_carving(n=100,
                      p=200,
                      s=7,
                      sigma=5,
                      rho=0.3,
                      snr=7.,
                      split_frac=0.9,
                      lam_frac=2.,
                      ndraw=8000,
                      burnin=2000,
                      df=np.inf,
                      coverage=0.90,
                      compute_intervals=False):
    """Repeat lasso data carving until the first-stage model is large enough.

    Fresh data are drawn on each attempt.  An attempt succeeds when
    ``range(s)`` is contained in the ``active`` attribute of the
    ``split_model`` fit; ``data_carving`` is then run on the same first
    stage.  ``compute_intervals`` is accepted but not forwarded.

    Returns
    -------
    tuple
        ``(carve[s:], split[s:], carve[:s], split[:s], counter,
        carve_coverage, split_coverage)``, where ``carve`` and ``split``
        hold elements 1 and 3 of each result, ``counter`` is the number of
        attempts, and the coverage lists record whether the projected truth
        lies strictly inside elements 2 and 4 of each result (presumably
        the carving and splitting intervals).
    """
    counter = 0

    while True:
        counter += 1
        X, y, beta, active, sigma = instance(n=n, p=p, s=s, sigma=sigma,
                                             rho=rho, snr=snr, df=df)
        mu = np.dot(X, beta)
        L, stage_one = split_model(y, X,
                                   sigma=sigma,
                                   lam_frac=lam_frac,
                                   split_frac=split_frac)[:2]

        if not set(range(s)).issubset(L.active):
            continue

        results, L = data_carving(y, X,
                                  lam_frac=lam_frac,
                                  sigma=sigma,
                                  stage_one=stage_one,
                                  splitting=True,
                                  ndraw=ndraw,
                                  burnin=burnin,
                                  coverage=coverage)

        carve = [res[1] for res in results]
        split = [res[3] for res in results]

        # Target: projection of the true mean onto the selected columns.
        Xa = X[:, L.active]
        truth = np.dot(np.linalg.pinv(Xa), mu)

        split_coverage = []
        carve_coverage = []
        for res, target in zip(results, truth):
            _, _, carve_int, _, split_int = res
            carve_coverage.append(
                (carve_int[0] < target) * (target < carve_int[1]))
            split_coverage.append(
                (split_int[0] < target) * (target < split_int[1]))

        return (carve[s:], split[s:], carve[:s], split[:s],
                counter, carve_coverage, split_coverage)
Ejemplo n.º 17
0
def test_logistic_pvals(n=500,
                        p=200,
                        s=3,
                        sigma=2,
                        rho=0.3,
                        snr=7.):
    """Collect one-sided logistic-lasso p-values for the inactive variables.

    One data set is drawn; the response is dichotomized as ``y > 0`` and an
    intercept column of ones is prepended to ``X``.  ``active`` is shifted
    by one to match the new column positions and index 0 is added.

    Returns
    -------
    list
        ``'pval'`` entries of ``L.summary('onesided')`` whose ``'variable'``
        is not in the shifted active set.  The list is empty when that set
        is not contained in ``L.active``.
    """
    # The former ``while True`` wrapper ended in an unconditional ``return``
    # and therefore never looped; it and its unused counter are dropped.
    X, y, beta, active, sigma = instance(n=n,
                                         p=p,
                                         s=s,
                                         sigma=sigma,
                                         rho=rho,
                                         snr=snr)

    z = (y > 0)
    X = np.hstack([np.ones((n,1)), X])

    # Account for the intercept column inserted at position 0.
    active = np.array(active)
    active += 1
    active = [0] + list(active)

    # A zero weight leaves the intercept unpenalized.
    L = lasso.logistic(X, z, [0]*1 + [1.2]*p)
    L.fit()
    S = L.summary('onesided')

    if not set(active).issubset(L.active):
        return []

    # Loop names chosen so they do not shadow the parameter ``p``.
    return [pvalue for pvalue, variable in zip(S['pval'], S['variable'])
            if variable not in active]
Ejemplo n.º 18
0
def test_sqrt_lasso_pvals(n=100, p=200, s=7, sigma=5, rho=0.3, snr=7.):
    """Return sqrt-lasso p-values for inactive variables once ``active`` is found.

    New data are generated until ``active`` is contained in ``L.active``.
    Every weight equals ``0.7 * lam_theor`` except the first three, which
    are zero.  The summary type is ``'twosided'`` on odd attempts and
    ``'onesided'`` on even attempts.

    Returns
    -------
    list
        ``'pval'`` entries of the summary for variables not in ``active``.
    """
    summary_kinds = {1: 'twosided', 0: 'onesided'}
    tries = 0

    while True:
        tries += 1
        X, y, beta, active, sigma = instance(
            n=n, p=p, s=s, sigma=sigma, rho=rho, snr=snr)

        # Monte Carlo estimate from 1000 standard-normal draws.
        draws = np.random.standard_normal((n, 1000))
        lam_theor = np.mean(np.fabs(np.dot(X.T, draws)).max(0)) / np.sqrt(n)

        # Built but not used by the fit below.
        Q = rr.identity_quadratic(0.01, 0, np.ones(p), 0)

        weights_with_zeros = 0.7 * lam_theor * np.ones(p)
        weights_with_zeros[:3] = 0.

        L = lasso.sqrt_lasso(X, y, weights_with_zeros)
        L.fit()
        kind = summary_kinds[tries % 2]

        if set(active).issubset(L.active):
            S = L.summary(kind)
            kept = []
            for pvalue, variable in zip(S['pval'], S['variable']):
                if variable not in active:
                    kept.append(pvalue)
            return kept
Ejemplo n.º 19
0
def sim2():
    """Run one data-splitting test on the first variable past ``len(active)``.

    Data with ``n=150, s=3`` are split in half and fit with
    ``use_full=True``.

    Returns
    -------
    list
        A one-element list with the result of ``hypothesis_test`` on
        ``G.active[len(active)]`` when ``active`` is contained in
        ``G.active`` and ``G.active`` has more entries than ``active``;
        otherwise ``[]``.
    """
    X, Y, _, active, sigma = instance(n=150, s=3)
    G = data_splitting.gaussian(X, Y, 5., split_frac=0.5, sigma=sigma)
    G.fit(use_full=True)

    n_true = len(active)
    recovered = set(active).issubset(G.active)
    if not recovered or G.active.shape[0] <= n_true:
        return []

    return [G.hypothesis_test(G.active[n_true])]
Ejemplo n.º 20
0
def test_gaussian_sandwich_pvals(n=100,
                                 p=200,
                                 s=20,
                                 sigma=10,
                                 rho=0.3,
                                 snr=6.):
    """Compare lasso p-values under parametric and sandwich covariance.

    Heteroscedastic noise, scaled by the last three columns of ``X``, is
    added to ``y`` in place.  A lasso with two unpenalized columns (indices
    10 and 11) is fit with the parametric estimator; new data are drawn
    until ``active`` is contained in its active set.  The same lasso is
    then refit with the sandwich estimator.

    Returns
    -------
    (P_P, P_S) : tuple of lists
        Two-sided ``'pval'`` entries for variables not in ``active`` under
        the parametric and the sandwich estimator respectively.
    """
    counter = 0

    while True:
        counter += 1
        X, y, beta, active, sigma = instance(n=n, p=p, s=s, sigma=sigma,
                                             rho=rho, snr=snr)

        # One noise term per (column, offset) pair, drawn in this order.
        noise_terms = []
        for column, offset in ((-1, 0.5), (-2, 0.2), (-3, 0.5)):
            scale = (np.fabs(X[:, column]) + offset)**2
            noise_terms.append(sigma * np.random.standard_normal(n) * scale)
        heteroscedastic_error = (noise_terms[0] + noise_terms[1]
                                 + noise_terms[2])
        y += heteroscedastic_error

        # two different estimators of variance
        sandwich = gaussian_sandwich_estimator(X, y, B=1000)
        parametric = gaussian_parametric_estimator(X, y, sigma=None)

        # make sure things work with some unpenalized columns
        feature_weights = np.ones(p) * 3 * sigma
        feature_weights[10:12] = 0

        L_P = lasso.gaussian(X, y, feature_weights,
                             covariance_estimator=parametric)
        L_P.fit()

        if not set(active).issubset(L_P.active):
            continue

        S = L_P.summary('twosided')
        P_P = [pvalue
               for pvalue, variable in zip(S['pval'], S['variable'])
               if variable not in active]

        L_S = lasso.gaussian(X, y, feature_weights,
                             covariance_estimator=sandwich)
        L_S.fit()

        S = L_S.summary('twosided')
        P_S = [pvalue
               for pvalue, variable in zip(S['pval'], S['variable'])
               if variable not in active]

        return P_P, P_S
Ejemplo n.º 21
0
def test_data_carving(n=100,
                      p=200,
                      s=7,
                      rho=0.3,
                      snr=7.,
                      split_frac=0.8,
                      lam_frac=1.,
                      ndraw=8000,
                      burnin=2000, 
                      df=np.inf,
                      coverage=0.90,
                      sigma=3,
                      fit_args={'min_its':120, 'tol':1.e-12},
                      compute_intervals=True):
    """Repeat data carving until the first-stage model contains ``range(s)``.

    Fresh data are drawn on each attempt and the first-stage active set is
    printed.  On success ``data_carving`` is run on the same first stage.
    ``sigma`` is passed to ``instance`` only; neither ``split_model`` nor
    ``data_carving`` receives it here.

    NOTE(review): ``fit_args`` has a mutable default; this function only
    forwards it, but confirm the callees do not modify it.

    Returns
    -------
    tuple
        ``(carve[s:], split[s:], carve[:s], split[:s], counter,
        carve_coverage, split_coverage)``, where ``carve`` and ``split``
        hold elements 1 and 3 of each result, ``counter`` is the number of
        attempts, and the coverage lists record whether the projected truth
        lies strictly inside elements 2 and 4 of each result (presumably
        the carving and splitting intervals).
    """
    counter = 0

    while True:
        counter += 1
        X, y, beta, active, sigma = instance(n=n, 
                                             p=p, 
                                             s=s, 
                                             sigma=sigma, 
                                             rho=rho, 
                                             snr=snr, 
                                             df=df)
        mu = np.dot(X, beta)
        L, stage_one = split_model(y, 
                                   X, 
                                   lam_frac=lam_frac,
                                   split_frac=split_frac,
                                   fit_args=fit_args)[:2]

        # Single-argument call form: same output on Python 2, valid syntax
        # on Python 3 (the bare print statement was not).
        print(L.active)
        if set(range(s)).issubset(L.active):
            results, L = data_carving(y, X, lam_frac=lam_frac, 
                                      stage_one=stage_one,
                                      splitting=True, 
                                      ndraw=ndraw,
                                      burnin=burnin,
                                      coverage=coverage,
                                      fit_args=fit_args,
                                      compute_intervals=compute_intervals)

            carve = [r[1] for r in results]
            split = [r[3] for r in results]

            # Target: projection of the true mean onto the selected columns.
            Xa = X[:,L.active]
            truth = np.dot(np.linalg.pinv(Xa), mu) 

            split_coverage = []
            carve_coverage = []
            for result, t in zip(results, truth):
                _, _, ci, _, si = result
                carve_coverage.append((ci[0] < t) * (t < ci[1]))
                split_coverage.append((si[0] < t) * (t < si[1]))

            return (carve[s:], split[s:], carve[:s], split[:s],
                    counter, carve_coverage, split_coverage)
Ejemplo n.º 22
0
def sim():
    """Run one data-carving test on the first variable past ``len(active)``.

    Prints ``sigma``, then fits ``data_carving.gaussian`` with
    ``split_frac=0.9``.

    Returns
    -------
    list
        A one-element list with the result of ``hypothesis_test`` (using
        ``burnin=5000, ndraw=10000``) on ``G.active[len(active)]`` when
        ``active`` is contained in ``G.active`` and ``G.active`` has more
        entries than ``active``; otherwise ``[]``.
    """
    X, Y, _, active, sigma = instance()
    print(sigma)

    G = data_carving.gaussian(X, Y, 1., split_frac=0.9, sigma=sigma)
    G.fit()

    n_true = len(active)
    recovered = set(active).issubset(G.active)
    if not recovered or G.active.shape[0] <= n_true:
        return []

    tested_variable = G.active[n_true]
    return [G.hypothesis_test(tested_variable, burnin=5000, ndraw=10000)]
def test_skinny_fat():
    """Check that ``SQ``'s fat and skinny sqrt-lasso solvers agree.

    The check is done on the default ``instance()`` data and again on data
    generated with ``p=50``, using ``np.testing.assert_almost_equal``.
    """
    for instance_kwargs in ({}, {'p': 50}):
        X, Y = instance(**instance_kwargs)[:2]
        _, n_features = X.shape
        lam = SQ.choose_lambda(X)

        # Built but not used afterwards; kept so the same calls are made.
        SQ.sqlasso_objective(X, Y)
        SQ.sqlasso_objective_skinny(X, Y)

        fat_soln = SQ.solve_sqrt_lasso_fat(
            X, Y, min_its=500, weights=np.ones(n_features) * lam)
        skinny_soln = SQ.solve_sqrt_lasso_skinny(
            X, Y, min_its=500, weights=np.ones(n_features) * lam)

        np.testing.assert_almost_equal(fat_soln, skinny_soln)
Ejemplo n.º 24
0
def test_skinny_fat():
    """Verify the fat and skinny sqrt-lasso solvers give matching solutions.

    Two data sets are checked in turn: ``instance()`` and
    ``instance(p=50)``.  Solutions are compared with ``rtol=1.e-3``.
    """

    def _compare(X, Y):
        # Solve with both formulations and assert agreement.
        _, n_features = X.shape
        lam = choose_lambda(X)

        # Objectives are created but not used afterwards.
        sqlasso_objective(X, Y)
        sqlasso_objective_skinny(X, Y)

        fat_soln = solve_sqrt_lasso_fat(
            X, Y,
            weights=np.ones(n_features) * lam,
            solve_args={'min_its': 500})[0]
        skinny_soln = solve_sqrt_lasso_skinny(
            X, Y,
            weights=np.ones(n_features) * lam,
            solve_args={'min_its': 500})[0]

        np.testing.assert_allclose(fat_soln, skinny_soln, rtol=1.e-3)

    X, Y = instance()[:2]
    _compare(X, Y)

    X, Y = instance(p=50)[:2]
    _compare(X, Y)
Ejemplo n.º 25
0
def test_intervals(n=100, p=20, s=5):
    """Smoke test: fit a Gaussian lasso and touch its interval machinery.

    Nothing is asserted; the test passes if no call raises.
    """
    X, y, beta, active, sigma = instance(n=n, p=p, s=s)
    fitted = lasso.gaussian(X, y, 4., sigma=sigma)
    fitted.fit()

    # smoke test: attribute access only, results are discarded
    fitted.soln
    fitted.constraints
    fitted.summary(compute_intervals=True)
    nominal_intervals(fitted)
Ejemplo n.º 26
0
def test_intervals(n=100, p=20, s=5):
    """Smoke-test the lasso summary with intervals and ``nominal_intervals``.

    The function makes no assertions and returns ``None``.
    """
    data = instance(n=n, p=p, s=s)
    X, y, beta, active, sigma = data

    model = lasso.gaussian(X, y, 4., sigma=sigma)
    model.fit()

    # smoke test
    model.soln
    model.constraints
    model.summary(compute_intervals=True)
    nominal_intervals(model)
Ejemplo n.º 27
0
def test_intervals(n=100, p=20, s=5):
    """Fit a Gaussian lasso and pair each interval with the true coefficient.

    Returns
    -------
    list
        A one-element list whose entry is a list of
        ``(beta[I], L, U)`` tuples, one per ``(I, L, U)`` in
        ``las.intervals``.
    """
    X, y, beta, active, sigma = instance(n=n, p=p, s=s)
    fitted = lasso.gaussian(X, y, 4., sigma=sigma)
    fitted.fit()

    # smoke test: these accesses and calls only need to run without error
    fitted.soln
    fitted.constraints
    fitted.summary()
    intervals = fitted.intervals
    nominal_intervals(fitted)

    truth_and_bounds = [(beta[index], lower, upper)
                        for index, lower, upper in intervals]
    return [truth_and_bounds]
Ejemplo n.º 28
0
def test_adding_quadratic_lasso():
    """Check a stationarity condition of the lasso with an added quadratic.

    Two quadratics are tried in turn, with coefficient 0.01 and 0.0, each
    built with a random standard-normal vector.  For each fit, the gradient
    of the smooth part on the active set, multiplied by the sign of the
    solution, must equal ``-20``, the negative of the penalty weight.
    """
    X, y, beta, active, sigma = instance(n=300, p=200)
    n_features = X.shape[1]

    for coef in (0.01, 0.0):
        quadratic = rr.identity_quadratic(
            coef, 0, np.random.standard_normal(n_features), 0)

        model = lasso.gaussian(X, y, 20, quadratic=quadratic)
        soln = model.fit(solve_args={'min_its': 500, 'tol': 1.e-12})

        loss_grad = X[:, model.active].T.dot(X.dot(soln) - y)
        quad_grad = quadratic.objective(soln, 'grad')[model.active]
        gradient = loss_grad + quad_grad

        np.testing.assert_allclose(
            gradient * np.sign(soln[model.active]), -20)
Ejemplo n.º 29
0
def test_intervals(n=100, p=20, s=5):
    """Fit ``lasso.gaussian(X, y, 4., .25)`` and collect intervals with truth.

    Returns
    -------
    list
        A one-element list whose entry is a list of
        ``(beta[I], L, U)`` tuples, one per ``(I, L, U)`` in
        ``las.intervals``.
    """
    X, y, beta = instance(n=n, p=p, s=s)[:3]
    fitted = lasso.gaussian(X, y, 4., .25)
    fitted.fit()

    # smoke test: attribute access only
    fitted.soln
    fitted.constraints
    fitted.active_pvalues
    intervals = fitted.intervals
    nominal_intervals(fitted)

    collected = []
    collected.append([(beta[index], lower, upper)
                      for index, lower, upper in intervals])
    return collected
Ejemplo n.º 30
0
def test_gaussian_sandwich_pvals(n=100, p=200, s=20, sigma=10, rho=0.3,
                                 snr=6.):
    """Return null p-values under the parametric and sandwich estimators.

    On each attempt three heteroscedastic noise terms, scaled by the last
    three columns of ``X``, are added to ``y`` in place.  A lasso with
    columns 10 and 11 unpenalized is fit with the parametric covariance
    estimator.  When ``active`` is contained in its active set, the lasso
    is refit with the sandwich estimator and both p-value lists are
    returned; otherwise new data are drawn.

    Returns
    -------
    (P_P, P_S) : tuple of lists
        Two-sided ``'pval'`` entries for variables not in ``active``,
        parametric first, sandwich second.
    """

    def _null_pvalues(summary, truth):
        # p-values of summary rows whose variable is not in ``truth``
        return [pvalue
                for pvalue, variable in zip(summary['pval'],
                                            summary['variable'])
                if variable not in truth]

    counter = 0

    while True:
        counter += 1
        X, y, beta, active, sigma = instance(n=n, p=p, s=s, sigma=sigma,
                                             rho=rho, snr=snr)

        # Terms are drawn and summed left to right.
        heteroscedastic_error = (
            sigma * np.random.standard_normal(n) * (np.fabs(X[:,-1]) + 0.5)**2
            + sigma * np.random.standard_normal(n) * (np.fabs(X[:,-2]) + 0.2)**2
            + sigma * np.random.standard_normal(n) * (np.fabs(X[:,-3]) + 0.5)**2)
        y += heteroscedastic_error

        # two different estimators of variance
        sandwich = gaussian_sandwich_estimator(X, y, B=1000)
        parametric = gaussian_parametric_estimator(X, y, sigma=None)

        # make sure things work with some unpenalized columns
        feature_weights = np.ones(p) * 3 * sigma
        feature_weights[10:12] = 0

        L_P = lasso.gaussian(X, y, feature_weights,
                             covariance_estimator=parametric)
        L_P.fit()

        if set(active).issubset(L_P.active):
            P_P = _null_pvalues(L_P.summary('twosided'), active)

            L_S = lasso.gaussian(X, y, feature_weights,
                                 covariance_estimator=sandwich)
            L_S.fit()
            P_S = _null_pvalues(L_S.summary('twosided'), active)

            return P_P, P_S
Ejemplo n.º 31
0
def test_BIC(do_sample=True, ndraw=8000, burnin=2000, nsim=None,
             force=False):
    """
    Return pivots for the non-active variables chosen by `info_crit_stop`
    with cost ``log(n)``.

    Parameters
    ----------
    do_sample : bool
        When true, `model_pivots` is called with ``saturated=False`` and
        the `burnin` / `ndraw` settings; otherwise it is called with its
        defaults.
    ndraw, burnin : int
        Forwarded to `model_pivots` when `do_sample` is true.
    nsim :
        Accepted but not used here.
    force : bool
        Compute pivots even if some truly active variable was not selected.

    Returns
    -------
    list
        Last element of each entry returned by `model_pivots`, or ``[]``
        when screening failed and `force` is false.
    """
    X, Y, _, truth, sigma = instance()
    n_obs, _ = X.shape
    FS = info_crit_stop(Y, X, sigma, cost=np.log(n_obs))
    model_size = len(FS.variables)

    truth = set(truth)
    if not truth.issubset(FS.variables) and not force:
        return []

    nulls = [var for var in FS.variables if var not in truth]

    if do_sample:
        pivots = FS.model_pivots(model_size,
                                 saturated=False,
                                 burnin=burnin,
                                 ndraw=ndraw,
                                 which_var=nulls)
    else:
        pivots = FS.model_pivots(model_size, which_var=nulls)

    return [entry[-1] for entry in pivots]
def test_BIC(do_sample=True, ndraw=8000, burnin=2000, nsim=None,
             force=False):
    """
    Run `info_crit_stop` with a ``log(n)`` cost and report pivots for the
    selected variables that are not truly active.

    Parameters
    ----------
    do_sample : bool
        Selects between the ``saturated=False`` call to `model_pivots`
        (using `burnin` and `ndraw`) and the default call.
    ndraw, burnin : int
        Only used when `do_sample` is true.
    nsim :
        Accepted but not used here.
    force : bool
        Skip the requirement that all truly active variables be selected.

    Returns
    -------
    list
        ``pval[-1]`` for each entry returned by `model_pivots`; empty when
        the screening requirement fails and `force` is false.
    """
    X, Y, _, active, sigma = instance()
    nrow, _ = X.shape
    FS = info_crit_stop(Y, X, sigma, cost=np.log(nrow))
    final_model = len(FS.variables)

    active = set(active)
    screened = active.issubset(FS.variables)
    if not (screened or force):
        return []

    # sampling mode needs the extra sampler settings; the other mode
    # relies on the defaults of `model_pivots`
    extra = {}
    if do_sample:
        extra = {'saturated': False, 'burnin': burnin, 'ndraw': ndraw}

    inactive_selected = [v for v in FS.variables if v not in active]
    results = FS.model_pivots(final_model, which_var=inactive_selected, **extra)
    return [pval[-1] for pval in results]
Ejemplo n.º 33
0
def test_adding_quadratic_lasso():
    """
    Check the stationarity condition of `lasso.gaussian` when an extra
    `rr.identity_quadratic` term is supplied.

    Two cases are run in sequence: quadratic coefficient 0.01, then 0.0
    (leaving only the random linear term).  For each, the gradient of the
    squared-error part plus the gradient of the extra term, restricted to
    the active set and multiplied by the signs of the solution, must equal
    minus the penalty weight (20).
    """
    X, y, _, _, _ = instance(n=300, p=200)
    weight = 20

    for coef in (0.01, 0.0):
        extra = rr.identity_quadratic(
            coef, 0, np.random.standard_normal(X.shape[1]), 0)

        fit = lasso.gaussian(X, y, weight, quadratic=extra)
        soln = fit.fit(solve_args={'min_its': 500, 'tol': 1.e-12})

        support = fit.active
        loss_grad = X[:, support].T.dot(X.dot(soln) - y)
        extra_grad = extra.objective(soln, 'grad')[support]

        np.testing.assert_allclose(
            (loss_grad + extra_grad) * np.sign(soln[support]), -weight)
Ejemplo n.º 34
0
def test_gaussian_pvals(n=100, p=500, s=7, sigma=5, rho=0.3, snr=8.):
    """
    Return p-values of selected-but-inactive variables from a
    `lasso.gaussian` fit with penalty weight 20.

    Data are regenerated until the fit selects every variable in the
    `active` set returned by `instance`.  The summary alternative
    alternates with the attempt number: odd attempts use 'twosided', even
    attempts use 'onesided'.

    Parameters
    ----------
    n, p, s, sigma, rho, snr :
        Forwarded to `instance`.  `sigma` is rebound to the value that
        `instance` returns, so a retry forwards that returned value.

    Returns
    -------
    list
        'pval' entries of the summary whose 'variable' is not in `active`.
    """
    counter = 0

    while True:
        counter += 1
        X, y, beta, active, sigma = instance(n=n,
                                             p=p,
                                             s=s,
                                             sigma=sigma,
                                             rho=rho,
                                             snr=snr)
        L = lasso.gaussian(X, y, 20., sigma=sigma)
        L.fit()
        # second fit is handed the solution of the first one
        L.fit(L.lasso_solution)

        alternative = 'twosided' if counter % 2 else 'onesided'

        if set(active).issubset(L.active):
            S = L.summary(alternative)
            # loop names deliberately differ from `p` and `alternative`
            # so that neither is shadowed or rebound
            return [
                pv for pv, var in zip(S['pval'], S['variable'])
                if var not in active
            ]
Ejemplo n.º 35
0
def test_tilting(nsim=100, ndraw=50000, burnin=10000):
    """
    Print running diagnostics comparing `gibbs_test` with and without
    tilting after selection by `selected_covtest`.

    Each of the `nsim` rounds draws a design from ``instance(n=20, p=30)``
    and then:

    * with a pure-noise response, runs `gibbs_test` once with ``tilt=eta``
      and once with ``tilt=None``, tracks how often the rescaled
      equal-tailed interval contains 0, and prints the correlation, mean
      and standard deviation of all p-value pairs gathered so far;
    * with mean ``3 * X[:, 0] * sigma`` added, reruns the selection and,
      when variable 0 is the one selected, tracks how often the rescaled
      interval contains ``(eta * mu).sum()``.

    Parameters
    ----------
    nsim : int
        Number of rounds.
    ndraw, burnin : int
        Forwarded to `gibbs_test`.

    Returns
    -------
    None
        All results are printed.
    """

    def rescaled_interval(family, constraint, direction, observed):
        # equal-tailed interval multiplied by direction' * covariance * direction
        lower, upper = family.equal_tailed_interval(observed)
        scale = np.dot(direction, np.dot(constraint.covariance, direction))
        return scale * lower, scale * upper

    pairs = []
    n_cover_null = 0
    n_cover_alt = 0
    n_screen = 0

    for rep in range(nsim):
        X, _, _, _, sigma = instance(n=20, p=30)
        nobs = X.shape[0]

        # null pvalues and intervals

        Y_null = np.random.standard_normal(nobs) * sigma
        cone, _, idx, sign = selected_covtest(X, Y_null, sigma=sigma)
        eta_null = X[:, idx] * sign

        p_tilted, _, _, fam = gibbs_test(cone, Y_null, eta_null,
                                         ndraw=ndraw,
                                         burnin=burnin,
                                         alternative='twosided',
                                         sigma_known=True,
                                         tilt=eta_null,
                                         UMPU=False)

        lo, hi = rescaled_interval(fam, cone, eta_null,
                                   (Y_null * eta_null).sum())
        n_cover_null += (lo < 0) * (hi > 0)
        print(n_cover_null / (rep + 1.), 'coverage0')

        # compare to no tilting

        p_plain = gibbs_test(cone, Y_null, X[:, idx] * sign,
                             ndraw=ndraw,
                             burnin=burnin,
                             alternative='twosided',
                             sigma_known=True,
                             tilt=None,
                             UMPU=False)[0]
        print(p_plain, 'huh')

        pairs.append((p_tilted, p_plain))
        pair_array = np.array(pairs)

        # the two p-values should be very close, so have high correlation
        print(np.corrcoef(pair_array.T)[0, 1], 'correlation')

        # they should also look uniform -- mean about 0.5, sd about 0.29
        print(np.mean(pair_array, 0), 'mean of nulls')
        print(np.std(pair_array, 0), 'sd of nulls')

        # alternative intervals

        mu = 3 * X[:, 0] * sigma
        Y_alt = np.random.standard_normal(nobs) * sigma + mu

        cone, _, idx, sign = selected_covtest(X, Y_alt, sigma=sigma)
        # NOTE(review): the tilt passed here is the direction from the null
        # selection above, not the newly selected one -- confirm intended.
        _, _, _, fam = gibbs_test(cone, Y_alt, X[:, idx] * sign,
                                  ndraw=ndraw,
                                  burnin=burnin,
                                  alternative='greater',
                                  sigma_known=True,
                                  tilt=eta_null)

        if idx == 0:
            n_screen += 1

            eta_alt = X[:, 0] * sign
            target = (eta_alt * mu).sum()
            lo, hi = rescaled_interval(fam, cone, eta_alt,
                                       (Y_alt * eta_alt).sum())
            print(lo, hi, target)
            n_cover_alt += (lo < target) * (hi > target)
            print(n_cover_alt / (n_screen * 1.), 'coverageA')

        print(n_screen / (rep + 1.), 'screening')
Ejemplo n.º 36
0
    selected : []
        Sequence of selected variables.

    active_set : set
        Set of active variables.

    Returns
    -------

    idx : int
        Completion index.

    >>> selected = [1,3,2,4,6,7,8,23,11,5]
    >>> active = [1,4,8]
    >>> completion_index(selected, active)
    6
    """
    active_set = set(active_set)
    for i in range(len(selected)):
        if active_set.issubset(selected[:i]):
            return i - 1
    return len(selected) - 1


if __name__ == "__main__":
    # Script entry point: simulate one data set, run `compute_pvalues` for
    # up to 20 steps and print the completion index of the selected path.
    from selection.algorithms.lasso import instance
    X, y, beta, active, sigma = instance(n=100, p=40, snr=5, rho=0.3)
    R, FS = compute_pvalues(y, X, sigma=sigma, maxstep=20)
    # print() with a single argument behaves the same on Python 2 and 3
    print(completion_index(R['variable_selected'], active))
Ejemplo n.º 37
0
def test_lasso(s=5, n=200, p=20):
    """
    Solve a randomized LASSO problem, build the initial sampler state and
    its projection map, and hand them to `pval`.

    The sampler state is the concatenation ``(alpha, betaE, cube)``:
    ``alpha`` has length `n` and starts at all ones, ``betaE`` holds the
    non-zero coefficients of the initial solution, and ``cube`` holds the
    inactive coordinates of the subgradient divided by the penalty `lam`.

    Parameters
    ----------
    s, n, p : int
        Forwarded to `instance` (together with ``random_signs=True``,
        ``sigma=1.``, ``rho=0`` and ``snr=10``).

    Returns
    -------
    null, alt
        The pair returned by `pval`, or ``([-1], [-1])`` when the initial
        solution has no non-zero coefficient.
    """
    X, y, _, nonzero, sigma = instance(n=n,
                                       p=p,
                                       random_signs=True,
                                       s=s,
                                       sigma=1.,
                                       rho=0,
                                       snr=10)
    print('sigma', sigma)
    lam_frac = 1.

    randomization = laplace(loc=0, scale=1.)
    loss = randomized.gaussian_Xfixed(X, y)

    epsilon = 1.
    # Monte Carlo estimate of E max_j |X_j' Z|, Z standard normal
    lam = sigma * lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal((n, 10000)))).max(0))

    random_Z = randomization.rvs(p)
    penalty = randomized.selective_l1norm_lan(p, lagrange=lam)

    # initial solution of the randomized problem
    problem = rr.simple_problem(loss, penalty)
    random_term = rr.identity_quadratic(epsilon, 0, -random_Z, 0)
    solve_args = {'tol': 1.e-10, 'min_its': 100, 'max_its': 500}
    initial_soln = problem.solve(random_term, **solve_args)

    active = (initial_soln != 0)
    inactive = ~active
    initial_grad = -np.dot(X.T, y - np.dot(X, initial_soln))
    betaE = initial_soln[active]
    signs = np.sign(betaE)
    subgradient = random_Z - initial_grad - epsilon * initial_soln
    cube = np.divide(subgradient[inactive], lam)

    if np.sum(active) == 0:
        print('here')
        return [-1], [-1]

    nalpha = n
    nactive = betaE.shape[0]
    ninactive = cube.shape[0]

    alpha = np.ones(n)
    # residuals of the least-squares refit on the selected columns
    beta_bar = np.linalg.lstsq(X[:, active], y)[0]
    obs_residuals = y - np.dot(X[:, active], beta_bar)

    init_vec_state = np.zeros(n + nactive + ninactive)
    init_vec_state[:n] = alpha
    init_vec_state[n:(n + nactive)] = betaE
    init_vec_state[(n + nactive):] = cube

    def full_projection(vec_state,
                        signs=signs,
                        nalpha=nalpha,
                        nactive=nactive,
                        ninactive=ninactive):
        """
        Project a state ``(alpha, betaE, cube)`` coordinate-wise: alpha is
        clipped to be non-negative, entries of betaE whose sign disagrees
        with `signs` are set to 0, and cube is clipped to [-1, 1].
        """
        alpha = vec_state[:nalpha]
        betaE = vec_state[nalpha:(nalpha + nactive)]
        cube = vec_state[(nalpha + nactive):]

        projected_alpha = np.clip(alpha, 0, np.inf)

        projected_betaE = betaE.copy()
        projected_betaE[projected_betaE * signs < 0] = 0

        projected_cube = np.clip(cube, -1, 1)

        return np.concatenate(
            (projected_alpha, projected_betaE, projected_cube), 0)

    null, alt = pval(init_vec_state, full_projection, X, y, obs_residuals,
                     signs, lam, epsilon, nonzero, active)

    return null, alt
Ejemplo n.º 38
0
def test_data_carving_sqrt_lasso(n=100,
                                 p=200,
                                 s=7,
                                 sigma=5,
                                 rho=0.3,
                                 snr=7.,
                                 split_frac=0.9,
                                 lam_frac=1.2,
                                 ndraw=8000,
                                 burnin=2000,
                                 df=np.inf,
                                 coverage=0.90,
                                 compute_intervals=True,
                                 nsim=None):
    """
    Run `data_carving.sqrt_lasso` and `data_splitting.sqrt_lasso` on
    simulated data until the carving fit selects variables ``0..s-1``.

    Parameters
    ----------
    n, p, s, sigma, rho, snr, df :
        Forwarded to `instance`.  `sigma` is rebound to the value that
        `instance` returns, so a retry forwards that returned value.
    split_frac : float
        Fraction of the (shuffled) rows used as the first stage.
    lam_frac : float
        Multiplier of the simulated penalty level.
    ndraw, burnin : int
        Forwarded to ``DC.hypothesis_test``.
    coverage, compute_intervals, nsim :
        Accepted but not used here.

    Returns
    -------
    list of tuple
        One record per attempt, the successful attempt last.  Each record
        is ``(carve[s:], split[s:], carve[:s], split[:s], counter,
        carve_coverage, split_coverage, TP, FP)``; for failed attempts the
        first four entries are ``None``.  Both coverage entries are always
        NaN.
    """
    counter = 0
    return_value = []

    while True:
        counter += 1
        X, y, _, _, sigma = instance(n=n,
                                     p=p,
                                     s=s,
                                     sigma=sigma,
                                     rho=rho,
                                     snr=snr,
                                     df=df)

        # random first-stage subset of the rows
        idx = np.arange(n)
        np.random.shuffle(idx)
        stage_one = idx[:int(n * split_frac)]
        n1 = len(stage_one)

        # Monte Carlo estimate of E max_j |X_j' Z| on the first stage
        noise = np.random.standard_normal((n1, 5000))
        lam_theor = lam_frac * np.mean(
            np.fabs(np.dot(X[stage_one].T, noise)).max(0)) / np.sqrt(n1)

        DC = data_carving.sqrt_lasso(X,
                                     y,
                                     feature_weights=lam_theor,
                                     stage_one=stage_one)
        DC.fit()

        DS = data_splitting.sqrt_lasso(X,
                                       y,
                                       feature_weights=lam_theor,
                                       stage_one=stage_one)
        DS.fit()

        if set(range(s)).issubset(DC.active):
            carve = []
            split = []
            for var in DC.active:
                carve.append(
                    DC.hypothesis_test(var, burnin=burnin, ndraw=ndraw))
                split.append(DS.hypothesis_test(var))

            TP = s
            FP = DC.active.shape[0] - TP
            return_value.append((carve[s:], split[s:], carve[:s], split[:s],
                                 counter, np.nan, np.nan, TP, FP))
            break

        TP = len(set(DC.active).intersection(range(s)))
        FP = DC.active.shape[0] - TP
        return_value.append(
            (None, None, None, None, counter, np.nan, np.nan, TP, FP))

    return return_value
Ejemplo n.º 39
0
def test_data_carving_poisson(n=200,
                              p=300,
                              s=5,
                              sigma=5,
                              rho=0.3,
                              snr=9.,
                              split_frac=0.8,
                              lam_frac=1.2,
                              ndraw=8000,
                              burnin=2000,
                              df=np.inf,
                              coverage=0.90,
                              compute_intervals=True,
                              nsim=None,
                              use_full_cov=True):
    """
    Run `data_carving.poisson` (and, when enough second-stage rows remain,
    `data_splitting.poisson`) on a design from `instance` with a column of
    ones prepended and a response drawn as Poisson(10).

    Attempts are repeated until column 0 (the column of ones, which gets
    zero feature weight) is in the carving fit's active set.

    Parameters
    ----------
    n, p, s, sigma, rho, snr, df :
        Forwarded to `instance`.  `sigma` is rebound to the value that
        `instance` returns, so a retry forwards that returned value.
    split_frac : float
        Fraction of the (shuffled) rows used as the first stage.
    ndraw, burnin : int
        Forwarded to ``DC.hypothesis_test``.
    use_full_cov : bool
        Forwarded to ``DS.fit``.
    lam_frac, coverage, compute_intervals, nsim :
        Accepted but not used here.

    Returns
    -------
    list of tuple
        One record per attempt, the successful attempt last.  Each record
        is ``(carve[1:], split[1:], carve[:1], split[:1], counter,
        carve_coverage, split_coverage, TP, FP)``; for failed attempts the
        first four entries are ``None``.  Both coverage entries are always
        NaN.
    """
    # Number of leading columns that must be selected.  It lives in its own
    # name so that the `s` parameter forwarded to `instance` stays the same
    # on every retry.
    n_required = 1

    counter = 0
    return_value = []

    while True:
        counter += 1
        X, y, beta, active, sigma = instance(n=n,
                                             p=p,
                                             s=s,
                                             sigma=sigma,
                                             rho=rho,
                                             snr=snr,
                                             df=df)
        X = np.hstack([np.ones((n, 1)), X])
        # the simulated response is discarded and replaced by Poisson counts
        y = np.random.poisson(10, size=y.shape)

        idx = np.arange(n)
        np.random.shuffle(idx)
        stage_one = idx[:int(n * split_frac)]

        lam_theor = 6. * np.ones(p + 1)
        lam_theor[0] = 0.  # column of ones is unpenalized
        DC = data_carving.poisson(X,
                                  y,
                                  feature_weights=lam_theor,
                                  stage_one=stage_one)
        DC.fit()

        if len(DC.active) < n - int(n * split_frac):
            DS = data_splitting.poisson(X,
                                        y,
                                        feature_weights=lam_theor,
                                        stage_one=stage_one)
            DS.fit(use_full_cov=use_full_cov)
            data_split = True
        else:
            print('not enough data for data splitting second stage')
            print(DC.active)
            data_split = False

        print(DC.active)
        if set(range(n_required)).issubset(DC.active):
            carve = []
            split = []
            for var in DC.active:
                carve.append(
                    DC.hypothesis_test(var, burnin=burnin, ndraw=ndraw))
                if data_split:
                    split.append(DS.hypothesis_test(var))
                else:
                    # uniform draw stands in when splitting was not run
                    split.append(np.random.sample())

            TP = n_required
            FP = DC.active.shape[0] - TP
            return_value.append(
                (carve[n_required:], split[n_required:],
                 carve[:n_required], split[:n_required],
                 counter, np.nan, np.nan, TP, FP))
            break

        TP = len(set(DC.active).intersection(range(n_required)))
        FP = DC.active.shape[0] - TP
        return_value.append(
            (None, None, None, None, counter, np.nan, np.nan, TP, FP))

    return return_value
Ejemplo n.º 40
0
def test_data_carving_sqrt_lasso(n=100,
                                 p=200,
                                 s=7,
                                 sigma=5,
                                 rho=0.3,
                                 snr=7.,
                                 split_frac=0.9,
                                 lam_frac=1.2,
                                 ndraw=8000,
                                 burnin=2000,
                                 df=np.inf,
                                 coverage=0.90,
                                 compute_intervals=True,
                                 nsim=None):
    """
    Compare `data_carving.sqrt_lasso` with `data_splitting.sqrt_lasso` on
    simulated data, retrying until the carving fit selects variables
    ``0..s-1``.

    Parameters
    ----------
    n, p, s, sigma, rho, snr, df :
        Forwarded to `instance`.  `sigma` is rebound to the value that
        `instance` returns, so a retry forwards that returned value.
    split_frac : float
        Fraction of the (shuffled) rows used as the first stage.
    lam_frac : float
        Multiplier of the simulated penalty level.
    ndraw, burnin : int
        Forwarded to ``DC.hypothesis_test``.
    coverage, compute_intervals, nsim :
        Accepted but not used here.

    Returns
    -------
    list of tuple
        One record per attempt, the successful attempt last.  Each record
        is ``(carve[s:], split[s:], carve[:s], split[:s], counter,
        carve_coverage, split_coverage, TP, FP)``; for failed attempts the
        first four entries are ``None``.  Both coverage entries are always
        NaN.
    """
    counter = 0
    return_value = []

    while True:
        counter += 1
        X, y, _, _, sigma = instance(n=n,
                                     p=p,
                                     s=s,
                                     sigma=sigma,
                                     rho=rho,
                                     snr=snr,
                                     df=df)

        # random first-stage subset of the rows
        idx = np.arange(n)
        np.random.shuffle(idx)
        stage_one = idx[:int(n * split_frac)]
        n1 = len(stage_one)

        # Monte Carlo estimate of E max_j |X_j' Z| on the first stage
        noise = np.random.standard_normal((n1, 5000))
        lam_theor = lam_frac * np.mean(
            np.fabs(np.dot(X[stage_one].T, noise)).max(0)) / np.sqrt(n1)

        DC = data_carving.sqrt_lasso(X, y, feature_weights=lam_theor,
                                     stage_one=stage_one)
        DC.fit()

        DS = data_splitting.sqrt_lasso(X, y, feature_weights=lam_theor,
                                       stage_one=stage_one)
        DS.fit()

        if set(range(s)).issubset(DC.active):
            carve = []
            split = []
            for var in DC.active:
                carve.append(DC.hypothesis_test(var, burnin=burnin, ndraw=ndraw))
                split.append(DS.hypothesis_test(var))

            TP = s
            FP = DC.active.shape[0] - TP
            return_value.append((carve[s:], split[s:], carve[:s], split[:s],
                                 counter, np.nan, np.nan, TP, FP))
            break

        TP = len(set(DC.active).intersection(range(s)))
        FP = DC.active.shape[0] - TP
        return_value.append(
            (None, None, None, None, counter, np.nan, np.nan, TP, FP))

    return return_value
Ejemplo n.º 41
0
def test_data_carving_gaussian(n=100,
                               p=200,
                               s=7,
                               sigma=5,
                               rho=0.3,
                               snr=7.,
                               split_frac=0.8,
                               lam_frac=2.,
                               ndraw=8000,
                               burnin=2000,
                               df=np.inf,
                               coverage=0.90,
                               compute_intervals=True,
                               nsim=None,
                               use_full_cov=True):
    """
    Run `data_carving.gaussian` (and, when enough second-stage rows remain,
    `data_splitting.gaussian`) on simulated data until the carving fit
    selects variables ``0..s-1``.

    Parameters
    ----------
    n, p, s, sigma, rho, snr, df :
        Forwarded to `instance`.  `sigma` is rebound to the value that
        `instance` returns, so a retry forwards that returned value.
    split_frac : float
        Fraction of the (shuffled) rows used as the first stage.
    lam_frac : float
        Multiplier of the simulated penalty level.
    ndraw, burnin : int
        Forwarded to ``DC.hypothesis_test``.
    use_full_cov : bool
        Setting used for the final ``DS.fit`` call.
    coverage, compute_intervals, nsim :
        Accepted but not used here.

    Returns
    -------
    list of tuple
        One record per attempt, the successful attempt last.  Each record
        is ``(carve[s:], split[s:], carve[:s], split[:s], counter,
        carve_coverage, split_coverage, TP, FP)``; for failed attempts the
        first four entries are ``None``.  Both coverage entries are always
        NaN.
    """
    counter = 0
    return_value = []

    while True:
        counter += 1
        X, y, _, _, sigma = instance(n=n,
                                     p=p,
                                     s=s,
                                     sigma=sigma,
                                     rho=rho,
                                     snr=snr,
                                     df=df)

        # random first-stage subset of the rows
        idx = np.arange(n)
        np.random.shuffle(idx)
        stage_one = idx[:int(n * split_frac)]

        # Monte Carlo estimate of E max_j |X_j' Z|, scaled by sigma
        noise = np.random.standard_normal((n, 5000))
        lam_theor = lam_frac * np.mean(
            np.fabs(np.dot(X.T, noise)).max(0)) * sigma

        DC = data_carving.gaussian(X,
                                   y,
                                   feature_weights=lam_theor,
                                   sigma=sigma,
                                   stage_one=stage_one)
        DC.fit()

        if len(DC.active) < n - int(n * split_frac):
            DS = data_splitting.gaussian(X,
                                         y,
                                         feature_weights=lam_theor,
                                         sigma=sigma,
                                         stage_one=stage_one)
            # fit with both settings first (presumably to exercise both
            # code paths), then with the requested one
            DS.fit(use_full_cov=True)
            DS.fit(use_full_cov=False)
            DS.fit(use_full_cov=use_full_cov)
            data_split = True
        else:
            print('not enough data for second stage data splitting')
            print(DC.active)
            data_split = False

        if set(range(s)).issubset(DC.active):
            carve = []
            split = []
            for var in DC.active:
                carve.append(
                    DC.hypothesis_test(var, burnin=burnin, ndraw=ndraw))
                if data_split:
                    split.append(DS.hypothesis_test(var))
                else:
                    # appropriate p-value if data splitting can't
                    # estimate the 2nd stage
                    split.append(np.random.sample())

            TP = s
            FP = DC.active.shape[0] - TP
            return_value.append((carve[s:], split[s:], carve[:s], split[:s],
                                 counter, np.nan, np.nan, TP, FP))
            break

        TP = len(set(DC.active).intersection(range(s)))
        FP = DC.active.shape[0] - TP
        return_value.append(
            (None, None, None, None, counter, np.nan, np.nan, TP, FP))

    return return_value
Ejemplo n.º 42
0
def test_fstep(s=0, n=100, p=10, Langevin_steps=10000, burning=2000, condition_on_sign=True):
    """
    Compute a two-sided p-value for the first step of a randomized forward
    selection using a projected Langevin sampler.

    The selected coordinate is the arg-max of ``|X'y + Z|`` with `Z` drawn
    from a Laplace distribution.  The sampler state is ``(y, u)``: the
    first `n` coordinates are data, the last `p` are projected by
    `projection_cone` / `projection_cone_nosign`.  The reference
    distribution for ``max |X'y|`` is built from the sampled data.

    Parameters
    ----------
    s, n, p : int
        Forwarded to `instance` (together with ``random_signs=True``,
        ``sigma=1.`` and ``rho=0``).
    Langevin_steps : int
        Total number of sampler steps.
    burning : int
        Burn-in: the sampler is advanced on every step, but states are
        only recorded for step indices greater than `burning`.
    condition_on_sign : bool
        Use `projection_cone` (with the observed sign) when true,
        `projection_cone_nosign` otherwise.

    Returns
    -------
    float
        The p-value ``2 * min(c, 1 - c)`` with ``c = fam.cdf(0, obs)``.
    """
    X, y, _, nonzero, sigma = instance(n=n, p=p, random_signs=True, s=s, sigma=1., rho=0)
    randomization = laplace(loc=0, scale=1.)

    random_Z = randomization.rvs(p)
    T = np.dot(X.T, y)
    T_random = T + random_Z
    j_star = np.argmax(np.abs(T_random))
    s_star = np.sign(T_random[j_star])

    # this is the subgradient part of the projection

    if condition_on_sign:
        projection = projection_cone(p, j_star, s_star)
    else:
        projection = projection_cone_nosign(p, j_star)

    def full_projection(state, n=n, p=p):
        """
        State is (y, u) -- first n coordinates are y, last p are u.
        """
        # builtin float: the `np.float` alias no longer exists in NumPy
        new_state = np.empty(state.shape, float)
        new_state[:n] = state[:n]
        new_state[n:] = projection(state[n:])
        return new_state

    obs = np.max(np.abs(T))

    def full_gradient(state, n=n, p=p, X=X):
        """
        Gradient used by the sampler, split into data and subgradient parts.
        """
        data = state[:n]
        subgrad = state[n:]
        sign_vec = np.sign(-X.T.dot(data) + subgrad)

        grad = np.empty(state.shape, float)
        grad[n:] = - sign_vec
        grad[:n] = - (data - X.dot(sign_vec))
        return grad

    state = np.zeros(n + p)
    state[:n] = y
    state[n:] = T_random

    sampler = projected_langevin(state,
                                 full_gradient,
                                 full_projection,
                                 1. / p)
    samples = []

    for i in range(Langevin_steps):
        # Advance on every step so the burn-in steps actually move the
        # chain; only the later states are kept.
        sampler.next()
        if i > burning:
            samples.append(sampler.state.copy())

    samples = np.array(samples)
    Z = samples[:, :n]

    pop = np.abs(X.T.dot(Z.T)).max(0)
    fam = discrete_family(pop, np.ones_like(pop))
    pval = fam.cdf(0, obs)
    pval = 2 * min(pval, 1 - pval)

    print('pvalue:', pval)
    return pval
Ejemplo n.º 43
0
def test_data_carving_poisson(n=200,
                              p=300,
                              s=5,
                              sigma=5,
                              rho=0.3,
                              snr=9.,
                              split_frac=0.8,
                              lam_frac=1.2,
                              ndraw=8000,
                              burnin=2000,
                              df=np.inf,
                              coverage=0.90,
                              compute_intervals=True,
                              nsim=None,
                              use_full_cov=True):
    """
    Compare `data_carving.poisson` with `data_splitting.poisson` on a
    design from `instance` with a column of ones prepended and a response
    drawn as Poisson(10).

    Attempts are repeated until column 0 (the column of ones, which gets
    zero feature weight) is in the carving fit's active set.  Data
    splitting is only run when the carving active set is smaller than the
    number of second-stage rows.

    Parameters
    ----------
    n, p, s, sigma, rho, snr, df :
        Forwarded to `instance`.  `sigma` is rebound to the value that
        `instance` returns, so a retry forwards that returned value.
    split_frac : float
        Fraction of the (shuffled) rows used as the first stage.
    ndraw, burnin : int
        Forwarded to ``DC.hypothesis_test``.
    use_full_cov : bool
        Forwarded to ``DS.fit``.
    lam_frac, coverage, compute_intervals, nsim :
        Accepted but not used here.

    Returns
    -------
    list of tuple
        One record per attempt, the successful attempt last.  Each record
        is ``(carve[1:], split[1:], carve[:1], split[:1], counter,
        carve_coverage, split_coverage, TP, FP)``; for failed attempts the
        first four entries are ``None``.  Both coverage entries are always
        NaN.
    """
    # Number of leading columns that must be selected.  A separate name
    # keeps the `s` parameter intact, so every retry passes the same `s`
    # to `instance`.
    n_required = 1

    counter = 0
    return_value = []

    while True:
        counter += 1
        X, y, beta, active, sigma = instance(n=n,
                                             p=p,
                                             s=s,
                                             sigma=sigma,
                                             rho=rho,
                                             snr=snr,
                                             df=df)
        X = np.hstack([np.ones((n, 1)), X])
        # the simulated response is discarded and replaced by Poisson counts
        y = np.random.poisson(10, size=y.shape)

        idx = np.arange(n)
        np.random.shuffle(idx)
        stage_one = idx[:int(n * split_frac)]

        lam_theor = 6. * np.ones(p + 1)
        lam_theor[0] = 0.  # column of ones is unpenalized
        DC = data_carving.poisson(X, y, feature_weights=lam_theor,
                                  stage_one=stage_one)
        DC.fit()

        if len(DC.active) < n - int(n * split_frac):
            DS = data_splitting.poisson(X, y, feature_weights=lam_theor,
                                        stage_one=stage_one)
            DS.fit(use_full_cov=use_full_cov)
            data_split = True
        else:
            print('not enough data for data splitting second stage')
            print(DC.active)
            data_split = False

        print(DC.active)
        if set(range(n_required)).issubset(DC.active):
            carve = []
            split = []
            for var in DC.active:
                carve.append(DC.hypothesis_test(var, burnin=burnin, ndraw=ndraw))
                if data_split:
                    split.append(DS.hypothesis_test(var))
                else:
                    # uniform draw stands in when splitting was not run
                    split.append(np.random.sample())

            TP = n_required
            FP = DC.active.shape[0] - TP
            return_value.append(
                (carve[n_required:], split[n_required:],
                 carve[:n_required], split[:n_required],
                 counter, np.nan, np.nan, TP, FP))
            break

        TP = len(set(DC.active).intersection(range(n_required)))
        FP = DC.active.shape[0] - TP
        return_value.append(
            (None, None, None, None, counter, np.nan, np.nan, TP, FP))

    return return_value
Ejemplo n.º 44
0
def test_lasso(s=1, n=100, p=10):
    """
    Solve a randomized LASSO problem, build the sampler state together
    with its gradient, projection and data-move callbacks, and hand them to
    `pval`.

    The sampler state is the concatenation ``(data, betaE, cube)`` where
    ``betaE`` and ``cube`` are the two arrays returned by
    ``penalty.setup_sampling``.

    Parameters
    ----------
    s, n, p : int
        Forwarded to `instance` (together with ``random_signs=True``,
        ``sigma=1.`` and ``rho=0``).

    Returns
    -------
    null, alt
        The pair returned by `pval`, or ``([-1], [-1])`` when the penalty
        reports an empty active set.
    """
    X, y, _, nonzero, sigma = instance(n=n, p=p, random_signs=True, s=s, sigma=1., rho=0)
    print('sigma', sigma)
    lam_frac = 1.

    randomization = laplace(loc=0, scale=1.)
    loss = randomized.gaussian_Xfixed(X, y)

    epsilon = 1.
    # Monte Carlo estimate of E max_j |X_j' Z|, Z standard normal
    lam = sigma * lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal((n, 10000)))).max(0))

    random_Z = randomization.rvs(p)
    penalty = randomized.selective_l1norm_lan(p, lagrange=lam)

    # initial solution of the randomized problem
    problem = rr.simple_problem(loss, penalty)
    random_term = rr.identity_quadratic(epsilon, 0, random_Z, 0)
    solve_args = {'tol': 1.e-10, 'min_its': 100, 'max_its': 500}
    initial_soln = problem.solve(random_term, **solve_args)
    initial_grad = loss.smooth_objective(initial_soln, mode='grad')
    betaE, cube = penalty.setup_sampling(initial_grad,
                                         initial_soln,
                                         random_Z,
                                         epsilon)

    data = y.copy()
    active = penalty.active_set
    if np.sum(active) == 0:
        print('here')
        return [-1], [-1]
    inactive = ~active

    ndata = data.shape[0]
    nactive = betaE.shape[0]
    ninactive = cube.shape[0]

    init_vec_state = np.zeros(ndata + nactive + ninactive)
    init_vec_state[:ndata] = data
    init_vec_state[ndata:(ndata + nactive)] = betaE
    init_vec_state[(ndata + nactive):] = cube

    def bootstrap_samples(y, P, R):
        """
        Return 50 vectors ``P y + R (y* - y)`` where ``y*`` resamples the
        entries of `y` with replacement.
        """
        nsample = 50
        boot_samples = []
        for _ in range(nsample):
            indices = np.random.choice(n, size=(n,), replace=True)
            y_star = y[indices]
            boot_samples.append(np.dot(P, y) + np.dot(R, y_star - y))

        return boot_samples

    def move_data(vec_state, boot_samples,
                  ndata=ndata, nactive=nactive, ninactive=ninactive, loss=loss):
        """
        Pick one of `boot_samples` at random, with weights proportional to
        ``exp(-||gradient + opt_vec||_1)`` at the current optimization
        variables.
        """
        betaE = vec_state[ndata:(ndata + nactive)]
        cube = vec_state[(ndata + nactive):]
        opt_vars = [betaE, cube]
        # opt_vec = epsilon * (beta, 0) + u, with u the penalty subgradient
        params, _, opt_vec = penalty.form_optimization_vector(opt_vars)

        weights = np.array([
            np.exp(-np.sum(np.abs(loss.gradient(sample, params) + opt_vec)))
            for sample in boot_samples
        ])
        weights /= np.sum(weights)

        # single multinomial draw -> index of the chosen sample
        idx = np.nonzero(np.random.multinomial(1, weights, size=1)[0])[0][0]
        return boot_samples[idx]

    def full_projection(vec_state, penalty=penalty,
                        ndata=ndata, nactive=nactive, ninactive=ninactive):
        """
        Leave the data part untouched, zero the entries of betaE whose
        sign disagrees with ``penalty.signs`` and clip cube to [-1, 1].
        """
        data = vec_state[:ndata].copy()
        betaE = vec_state[ndata:(ndata + nactive)]
        cube = vec_state[(ndata + nactive):]

        signs = penalty.signs
        projected_betaE = betaE.copy()
        for i in range(nactive):
            if projected_betaE[i] * signs[i] < 0:
                projected_betaE[i] = 0

        projected_cube = np.clip(cube, -1, 1)

        return np.concatenate((data, projected_betaE, projected_cube), 0)

    def full_gradient(vec_state, loss=loss, penalty=penalty, X=X,
                      lam=lam, epsilon=epsilon, ndata=ndata, active=active, inactive=inactive):
        """
        Gradient of the sampler target with respect to the full state; the
        data block of the gradient is identically zero.
        """
        nactive = np.sum(active)
        ninactive = np.sum(inactive)

        data = vec_state[:ndata]
        betaE = vec_state[ndata:(ndata + nactive)]
        cube = vec_state[(ndata + nactive):]

        opt_vars = [betaE, cube]
        # opt_vec = epsilon * (beta, 0) + u, with u the penalty subgradient
        params, _, opt_vec = penalty.form_optimization_vector(opt_vars)

        gradient = loss.gradient(data, params)
        hessian = loss.hessian()

        ndata = data.shape[0]
        nactive = betaE.shape[0]
        ninactive = cube.shape[0]

        # sign(w), w = grad + epsilon * beta + lambda * u
        sign_vec = - np.sign(gradient + opt_vec)

        B = hessian + epsilon * np.identity(nactive + ninactive)
        A = B[:, active]

        _gradient = np.zeros(ndata + nactive + ninactive)
        _gradient[:ndata] = 0
        _gradient[ndata:(ndata + nactive)] = np.dot(A.T, sign_vec)
        _gradient[(ndata + nactive):] = lam * sign_vec[inactive]

        return _gradient

    null, alt = pval(init_vec_state, full_gradient, full_projection, move_data, bootstrap_samples,
                     X, y, nonzero, active)

    return null, alt
Ejemplo n.º 45
0
def test_gaussian_many_targets():
    """
    Run four bootstrap-based tests after a randomized group-lasso fit.

    A problem with ``n=200, p=20, s=5`` is generated, a randomized
    group lasso (Laplace randomization) is solved, and -- provided every
    truly nonzero variable was selected together with at least one extra
    variable -- four hypothesis tests are run on bootstrap targets:

    1. first falsely selected variable, "saturated" target (one coordinate);
    2. first falsely selected variable, "selected" target (that coordinate
       stacked with ``boot_target(...)[nactive:]``, testing coordinate 0);
    3. first truly nonzero selected variable, saturated target;
    4. first truly nonzero selected variable, selected target.

    Returns
    -------
    list or None
        The four p-values in the order above, or ``None`` when the
        screening condition fails.
    """
    s, n, p = 5, 200, 20

    randomizer = randomization.laplace((p, ), scale=1.)
    X, Y, beta, nonzero, sigma = instance(n=n, p=p, s=s, rho=0.1, snr=7)

    # lam = lam_frac * sigma * Monte Carlo mean of max_j |X_j' Z|
    lam_frac = 1.
    lam = lam_frac * np.mean(
        np.fabs(X.T.dot(np.random.standard_normal((n, 50000)))).max(0)) * sigma
    W = np.ones(p) * lam
    epsilon = 1. / np.sqrt(n)

    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    M_est = fixedX_group_lasso(X, Y, epsilon, penalty, randomizer)

    mv = multiple_views([M_est])
    mv.solve()

    active = M_est.overall
    nactive = active.sum()
    active_set = np.nonzero(active)[0]
    true_support = set(nonzero)

    # proceed only if the true support was screened AND something extra
    # was selected (so that a null variable is available to test)
    if not (true_support.issubset(active_set) and nactive > len(nonzero)):
        return None

    # positions *within the active set* of null / truly nonzero variables
    null_positions = [pos for pos, var in enumerate(active_set)
                      if var not in true_support]
    true_positions = [pos for pos, var in enumerate(active_set)
                      if var in true_support]

    boot_target, target_observed = resid_bootstrap(M_est.loss, active)

    # residual bootstrap of the response around the selected-model fit
    X_active = X[:, active]
    beta_hat = np.linalg.pinv(X_active).dot(Y)
    resid_hat = Y - X_active.dot(beta_hat)

    def sampler():
        return X_active.dot(beta_hat) + np.random.choice(
            resid_hat, size=(n, ), replace=True)

    mv.setup_sampler(sampler)

    def saturated(j):
        """Target/observed pair for coordinate j alone."""
        def target(Y_star):
            return boot_target(Y_star)[j]

        observed = np.zeros(1)
        observed[0] = target_observed[j]
        return target, observed, {}

    def selected(j):
        """Coordinate j stacked with the trailing block; test coordinate 0."""
        def target(Y_star):
            result = boot_target(Y_star)
            return np.hstack([result[j], result[nactive:]])

        # a throwaway evaluation is used only to size the observed vector
        observed = np.zeros_like(target(np.random.standard_normal(n)))
        observed[0] = target_observed[j]
        observed[1:] = target_observed[nactive:]
        return target, observed, {'target_set': [0]}

    def run_test(target, observed, setup_kwargs):
        """Set up the target sampler and return its p-value."""
        target_sampler = mv.setup_target(target, observed, **setup_kwargs)
        return target_sampler.hypothesis_test(
            lambda x: x[0], observed, burnin=10000,
            ndraw=10000)  # twosided by default

    pvalues = []
    # order: null saturated, null selected, true saturated, true selected
    for position in (null_positions[0], true_positions[0]):
        for build in (saturated, selected):
            pvalues.append(run_test(*build(position)))

    return pvalues
Ejemplo n.º 46
0
def test_fstep(s=0, n=50, p=10, weights = "gumbel", randomization_dist ="logistic",
               Langevin_steps = 10000, burning=1000):
    """
    One randomized forward-stepwise step, with a p-value obtained from
    projected Langevin samples.

    The statistic ``T = X'y`` is perturbed by noise drawn from
    ``randomization_dist``; the index and sign of the largest perturbed
    entry define the cone used for the subgradient projection.  The sampler
    state is ``(alpha, u)``: the first ``n`` coordinates are weights
    ``alpha`` and the last ``p`` the subgradient ``u``.

    Parameters
    ----------
    s, n, p : int
        Sparsity, sample size and number of features passed to ``instance``.
    weights : str
        ``"normal"`` or ``"gumbel"`` adds the corresponding term to the
        gradient of the first ``n`` coordinates; any other value adds none.
    randomization_dist : {"laplace", "logistic", "normal"}
        Distribution of the perturbation added to ``T``.
    Langevin_steps : int
        Number of sampler steps.
    burning : int
        Samples are kept only for step indices strictly greater than this.

    Returns
    -------
    pval : float
        Two-sided p-value ``2 * min(F(obs), 1 - F(obs))`` where ``F`` is the
        empirical family built from ``max_j |(X' diag(y) Z)_j|`` over the
        retained samples ``Z``.

    Raises
    ------
    ValueError
        If ``randomization_dist`` is not one of the supported names.
    """
    X, y, _, nonzero, sigma = instance(n=n, p=p, random_signs=True, s=s,
                                       sigma=1., rho=0)

    if randomization_dist == "laplace":
        random_Z = laplace(loc=0, scale=1.).rvs(p)
    elif randomization_dist == "logistic":
        random_Z = np.random.logistic(loc=0, scale=1, size=p)
    elif randomization_dist == "normal":
        # the gradient below already handles "normal"; previously this
        # choice failed with a NameError because no noise was drawn
        random_Z = np.random.standard_normal(p)
    else:
        raise ValueError(
            "unsupported randomization_dist: %r" % (randomization_dist,))

    T = np.dot(X.T, y)
    T_random = T + random_Z
    j_star = np.argmax(np.abs(T_random))
    s_star = np.sign(T_random[j_star])

    # this is the subgradient part of the projection
    projection = projection_cone(p, j_star, s_star)

    def full_projection(state, n=n, p=p):
        """
        State is (y, u) -- first n coordinates are y, last p are u.
        """
        new_state = np.empty(state.shape, float)
        new_state[:n] = state[:n]
        new_state[n:] = projection(state[n:])
        return new_state

    obs = np.max(np.abs(T))

    # X' diag(y): loop-invariant, so computed once rather than per gradient
    mat = np.dot(X.T, np.diag(y))

    def full_gradient(state, n=n, mat=mat):
        alpha = state[:n]
        subgrad = state[n:]

        omega = - mat.dot(alpha) + subgrad

        if randomization_dist == "laplace":
            randomization_derivative = np.sign(omega)
        elif randomization_dist == "logistic":
            randomization_derivative = -(np.exp(-omega) - 1) / (np.exp(-omega) + 1)
        else:  # "normal" (validated above)
            randomization_derivative = omega

        grad = np.empty(state.shape, float)
        grad[:n] = np.dot(mat.T, randomization_derivative)

        if weights == "normal":
            grad[:n] -= alpha
        if weights == "gumbel":
            gumbel_beta = np.sqrt(6) / (1.14 * np.pi)
            euler = 0.57721
            gumbel_mu = -gumbel_beta * euler
            gumbel_sigma = 1. / 1.14
            grad[:n] -= (1. - np.exp(-(alpha * gumbel_sigma - gumbel_mu) / gumbel_beta)) * gumbel_sigma / gumbel_beta

        grad[n:] = - randomization_derivative

        return grad

    # start at alpha = 0 with the subgradient equal to the perturbed statistic
    state = np.zeros(n + p)
    state[n:] = T_random

    sampler = projected_langevin(state,
                                 full_gradient,
                                 full_projection,
                                 1./p)
    samples = []

    for i in range(Langevin_steps):
        sampler.next()
        if i > burning:
            samples.append(sampler.state.copy())

    samples = np.array(samples)
    Z = samples[:, :n]
    print(Z.shape)

    pop = np.abs(np.dot(mat, Z.T)).max(0)
    fam = discrete_family(pop, np.ones_like(pop))
    pval = fam.cdf(0, obs)
    pval = 2 * min(pval, 1 - pval)

    print('pvalue: %s' % (pval,))
    return pval
Ejemplo n.º 47
0
def test_data_carving(n=100,
                      p=200,
                      s=7,
                      sigma=5,
                      rho=0.3,
                      snr=7.,
                      split_frac=0.9,
                      lam_frac=2.,
                      ndraw=8000,
                      burnin=2000, 
                      df=np.inf,
                      coverage=0.90,
                      compute_intervals=True):
    """
    Simulate data carving versus data splitting until the first-stage
    model contains the first ``s`` variables.

    Each attempt draws a fresh ``instance`` and runs ``split_model``.  A
    failed attempt (variables ``0..s-1`` not all in ``L.active``) records a
    placeholder tuple and retries.  On the first successful attempt,
    ``data_carving`` is re-run on the same ``stage_one`` split until its
    model also contains the first ``s`` variables, the results are
    summarized, and the loop stops.

    Returns
    -------
    list of tuple
        One tuple per attempt:
        ``(carve[s:], split[s:], carve[:s], split[:s], counter,
        carve_coverage, split_coverage, TP, FP)``.
        ``carve`` / ``split`` are entries 1 and 3 of each ``data_carving``
        result (presumably the carving and splitting p-values -- confirm
        against ``data_carving``).  Failed attempts carry ``None`` for the
        first four fields and ``np.nan`` for both coverage fields.
    """
    counter = 0
    return_value = []
    true_vars = set(range(s))

    while True:
        counter += 1
        X, y, beta, active, sigma = instance(n=n, 
                                             p=p, 
                                             s=s, 
                                             sigma=sigma, 
                                             rho=rho, 
                                             snr=snr, 
                                             df=df)
        mu = np.dot(X, beta)
        L, stage_one = split_model(y, X, 
                        sigma=sigma,
                        lam_frac=lam_frac,
                        split_frac=split_frac)[:2]

        if true_vars.issubset(L.active):
            # repeat the carving step on the same split until it screens too
            while True:
                results, L = data_carving(y, X, lam_frac=lam_frac, 
                                          sigma=sigma,
                                          stage_one=stage_one,
                                          splitting=True, 
                                          ndraw=ndraw,
                                          burnin=burnin,
                                          coverage=coverage,
                                          compute_intervals=compute_intervals)
                if true_vars.issubset(L.active):
                    print("succeed")
                    break
                print("failed at least once")

            carve = [r[1] for r in results]
            split = [r[3] for r in results]

            # population least-squares target within the selected model
            Xa = X[:,L.active]
            truth = np.dot(np.linalg.pinv(Xa), mu) 

            split_coverage = []
            carve_coverage = []
            for result, t in zip(results, truth):
                _, _, ci, _, si = result
                carve_coverage.append((ci[0] < t) * (t < ci[1]))
                split_coverage.append((si[0] < t) * (t < si[1]))

            TP = s
            FP = L.active.shape[0] - TP
            v = (carve[s:], split[s:], carve[:s], split[:s], counter, carve_coverage, split_coverage, TP, FP)
            return_value.append(v)
            break
        else:
            TP = len(set(L.active).intersection(range(s)))
            FP = L.active.shape[0] - TP
            v = (None, None, None, None, counter, np.nan, np.nan, TP, FP)
            return_value.append(v)
    return return_value
Ejemplo n.º 48
0
def test_data_carving(n=100,
                      p=200,
                      s=7,
                      sigma=5,
                      rho=0.3,
                      snr=7.,
                      split_frac=0.9,
                      lam_frac=2.,
                      ndraw=8000,
                      burnin=2000,
                      df=np.inf,
                      coverage=0.90,
                      compute_intervals=True):
    """
    Compare data carving with data splitting on simulated data.

    Fresh instances are drawn until ``split_model`` selects a model that
    contains variables ``0..s-1``; every unsuccessful draw appends a
    placeholder record.  On success ``data_carving`` is repeated on the same
    first-stage split until its own model also contains those variables,
    and one full record is appended before returning.

    Returns
    -------
    list of tuple
        Records of the form ``(carve[s:], split[s:], carve[:s], split[:s],
        counter, carve_coverage, split_coverage, TP, FP)`` where ``counter``
        is the 1-based attempt number.  Placeholder records have ``None``
        in the first four slots and ``np.nan`` for the two coverage slots.
    """
    counter = 0

    return_value = []

    def screened(model):
        # True when all of the first s variables are in the active set
        return set(range(s)).issubset(model.active)

    while True:
        counter += 1
        X, y, beta, active, sigma = instance(n=n,
                                             p=p,
                                             s=s,
                                             sigma=sigma,
                                             rho=rho,
                                             snr=snr,
                                             df=df)
        mu = np.dot(X, beta)
        L, stage_one = split_model(y,
                                   X,
                                   sigma=sigma,
                                   lam_frac=lam_frac,
                                   split_frac=split_frac)[:2]

        if not screened(L):
            # record the miss and draw a new instance
            TP = len(set(L.active).intersection(range(s)))
            FP = L.active.shape[0] - TP
            return_value.append(
                (None, None, None, None, counter, np.nan, np.nan, TP, FP))
            continue

        while True:
            results, L = data_carving(y,
                                      X,
                                      lam_frac=lam_frac,
                                      sigma=sigma,
                                      stage_one=stage_one,
                                      splitting=True,
                                      ndraw=ndraw,
                                      burnin=burnin,
                                      coverage=coverage,
                                      compute_intervals=compute_intervals)
            if screened(L):
                print("succeed")
                break
            print("failed at least once")

        carve = [r[1] for r in results]
        split = [r[3] for r in results]

        # projection of the true mean onto the selected columns
        Xa = X[:, L.active]
        truth = np.dot(np.linalg.pinv(Xa), mu)

        split_coverage = []
        carve_coverage = []
        for result, t in zip(results, truth):
            _, _, ci, _, si = result
            carve_coverage.append((ci[0] < t) * (t < ci[1]))
            split_coverage.append((si[0] < t) * (t < si[1]))

        TP = s
        FP = L.active.shape[0] - TP
        return_value.append(
            (carve[s:], split[s:], carve[:s], split[:s], counter,
             carve_coverage, split_coverage, TP, FP))
        break

    return return_value
Ejemplo n.º 49
0
def test_kfstep(k=4, s=3, n=100, p=10):
    """
    ``k`` steps of randomized forward stepwise, followed by a p-value for
    the last step obtained from projected Langevin samples.

    At step ``i`` the remaining columns of ``X`` are residualized against
    the already selected ones, the statistic ``T = mat[i]' y`` is perturbed
    with Laplace noise, and the largest perturbed entry (index and sign)
    is recorded and removed from the candidate set.

    The sampler state is ``(data, u_0, ..., u_{k-1})``: ``n`` data
    coordinates followed by subgradient blocks of sizes
    ``p, p-1, ..., p-k+1``.  After each sampler step the data block is
    recombined as ``P old + R new`` with ``R = I - P``, where ``P`` is built
    from the columns selected before the final step.

    Parameters
    ----------
    k : int
        Number of forward steps.
    s, n, p : int
        Sparsity, sample size and feature count passed to ``instance``.

    Returns
    -------
    pval : float
        Two-sided p-value comparing the last step's unperturbed
        ``max |T|`` with ``max_j |(mat[k-1]' Z)_j|`` over the samples ``Z``.
    """
    X, y, beta, nonzero, sigma = instance(n=n,
                                          p=p,
                                          random_signs=True,
                                          s=s,
                                          sigma=1.,
                                          rho=0,
                                          snr=10)

    randomization = laplace(loc=0, scale=1.)

    j_seq = np.empty(k, dtype=int)
    s_seq = np.empty(k)

    left = np.ones(p, dtype=bool)
    obs = 0

    # n data coordinates plus subgradient blocks of size p, p-1, ..., p-k+1
    state_size = n + sum(range(p - k + 1, p + 1))
    initial_state = np.zeros(state_size)

    # one design matrix per step, filled in below
    mat = [None] * k

    curr = n

    keep = np.zeros(p, dtype=bool)

    for i in range(k):
        X_left = X[:, left]
        X_selected = X[:, ~left]
        if np.sum(left) < p:
            # residualize the remaining columns against the selected ones
            P_perp = np.identity(n) - X_selected.dot(
                np.linalg.pinv(X_selected))
            mat[i] = P_perp.dot(X_left)
        else:
            mat[i] = X

        mat_complete = np.zeros((n, p))
        mat_complete[:, left] = mat[i]

        T = np.dot(mat[i].T, y)
        T_complete = np.dot(mat_complete.T, y)

        obs = np.max(np.abs(T))
        # columns selected *before* this step; after the loop this is the
        # set selected in the first k-1 steps
        keep = np.copy(~left)

        random_Z = randomization.rvs(T.shape[0])
        T_random = T + random_Z
        initial_state[curr:(curr + p - i)] = T_random  # initializing subgradients
        curr = curr + p - i

        # index is relative to the columns still in `left`
        j_seq[i] = np.argmax(np.abs(T_random))
        s_seq[i] = np.sign(T_random[j_seq[i]])

        # same winner expressed as an index into all p columns
        T_complete[left] += random_Z
        left[np.argmax(np.abs(T_complete))] = False

    # conditioning
    linear_part = X[:, keep].T
    P = np.dot(linear_part.T, np.linalg.pinv(linear_part).T)
    I = np.identity(linear_part.shape[1])
    R = I - P

    def full_projection(state, n=n, p=p, k=k):
        """
        Leave the data block alone and project each subgradient block onto
        the cone of its step.
        """
        new_state = np.empty(state.shape, float)
        new_state[:n] = state[:n]
        curr = n
        for i in range(k):
            projection = projection_cone(p - i, j_seq[i], s_seq[i])
            new_state[curr:(curr + p - i)] = projection(
                state[curr:(curr + p - i)])
            curr = curr + p - i
        return new_state

    def full_gradient(state, n=n, p=p, k=k, X=X, mat=mat):
        data = state[:n]

        grad = np.empty(state_size)
        grad[:n] = -data

        curr = n
        for i in range(k):
            subgrad = state[curr:(curr + p - i)]

            sign_vec = np.sign(-mat[i].T.dot(data) + subgrad)
            grad[curr:(curr + p - i)] = -sign_vec
            curr = curr + p - i
            grad[:n] += mat[i].dot(sign_vec)

        return grad

    sampler = projected_langevin(initial_state, full_gradient, full_projection,
                                 1. / p)
    samples = []

    for _ in range(5000):
        old_data = sampler.state.copy()[:n]
        sampler.next()
        new_data = sampler.state.copy()[:n]
        # keep the P-component of the data fixed across the step
        sampler.state[:n] = np.dot(P, old_data) + np.dot(R, new_data)
        samples.append(sampler.state.copy())

    samples = np.array(samples)
    Z = samples[:, :n]

    pop = np.abs(mat[k - 1].T.dot(Z.T)).max(0)
    fam = discrete_family(pop, np.ones_like(pop))
    pval = fam.cdf(0, obs)
    pval = 2 * min(pval, 1 - pval)

    print('pvalue: %s' % (pval,))
    return pval
Ejemplo n.º 50
0
def test_data_carving_gaussian(n=100,
                               p=200,
                               s=7,
                               sigma=5,
                               rho=0.3,
                               snr=7.,
                               split_frac=0.8,
                               lam_frac=2.,
                               ndraw=8000,
                               burnin=2000, 
                               df=np.inf,
                               coverage=0.90,
                               compute_intervals=True,
                               nsim=None,
                               use_full_cov=True):
    """
    Simulate Gaussian data carving against data splitting until the carved
    model contains variables ``0..s-1``.

    Every attempt draws a new instance, picks a random first-stage subset
    of ``int(n * split_frac)`` rows, fits ``data_carving.gaussian`` and --
    when the active set is smaller than the number of held-out rows -- also
    fits ``data_splitting.gaussian``.  Attempts that miss one of the first
    ``s`` variables append a placeholder record and retry.

    Returns
    -------
    list of tuple
        Records ``(carve[s:], split[s:], carve[:s], split[:s], counter,
        carve_coverage, split_coverage, TP, FP)``; both coverage slots are
        always ``np.nan`` here, and the first four slots are ``None`` for
        unsuccessful attempts.
    """
    attempt = 0
    records = []
    n_first = int(n * split_frac)

    while True:
        attempt += 1
        X, y, beta, active, sigma = instance(n=n,
                                             p=p,
                                             s=s,
                                             sigma=sigma,
                                             rho=rho,
                                             snr=snr,
                                             df=df)
        mu = np.dot(X, beta)

        # random first-stage rows
        perm = np.arange(n)
        np.random.shuffle(perm)
        stage_one = perm[:n_first]

        # theoretical lambda: scaled Monte Carlo mean of max_j |X_j' Z|
        noise = np.random.standard_normal((n, 5000))
        lam_theor = lam_frac * np.mean(np.fabs(np.dot(X.T, noise)).max(0)) * sigma

        DC = data_carving.gaussian(X, y, feature_weights=lam_theor,
                                   sigma=sigma,
                                   stage_one=stage_one)
        DC.fit()

        data_split = len(DC.active) < n - n_first
        if data_split:
            DS = data_splitting.gaussian(X, y, feature_weights=lam_theor,
                                         sigma=sigma,
                                         stage_one=stage_one)
            # fit with both covariance options, then with the requested one
            for full_cov in (True, False, use_full_cov):
                DS.fit(use_full_cov=full_cov)
        else:
            print('not enough data for second stage data splitting')
            print(DC.active)

        if not set(range(s)).issubset(DC.active):
            # screening failed: record the miss and try again
            TP = len(set(DC.active).intersection(range(s)))
            FP = DC.active.shape[0] - TP
            records.append(
                (None, None, None, None, attempt, np.nan, np.nan, TP, FP))
            continue

        carve = []
        split = []
        for var in DC.active:
            carve.append(DC.hypothesis_test(var, burnin=burnin, ndraw=ndraw))
            if data_split:
                split_pvalue = DS.hypothesis_test(var)
            else:
                # appropriate p-value if data splitting can't estimate 2nd stage
                split_pvalue = np.random.sample()
            split.append(split_pvalue)

        # computed as in the original, although not part of the record
        Xa = X[:, DC.active]
        truth = np.dot(np.linalg.pinv(Xa), mu)

        split_coverage = np.nan
        carve_coverage = np.nan

        TP = s
        FP = DC.active.shape[0] - TP
        records.append(
            (carve[s:], split[s:], carve[:s], split[:s], attempt,
             carve_coverage, split_coverage, TP, FP))
        break

    return records
Ejemplo n.º 51
0
def test_tilting(nsim=100):
    """
    Compare tilted and untilted Gibbs tests after the selected covariance
    test, printing running diagnostics and plotting the null p-values.

    For each of ``nsim`` simulated designs (``n=20, p=30``):

    * a pure-noise response is tested twice with ``gibbs_test`` (once with
      ``tilt=eta``, once with ``tilt=None``); the pair of p-values is
      stored and an equal-tailed interval from the tilted family is checked
      for coverage of 0;
    * a response with mean ``3 * sigma * X[:, 0]`` is tested, and when the
      first selected variable is column 0 the interval is checked for
      coverage of ``eta' mu``.

    After the loop the two null p-values are scatter-plotted against each
    other and, if ``statsmodels`` is importable, their ECDFs are drawn.

    Parameters
    ----------
    nsim : int
        Number of simulations.  With ``nsim <= 0`` nothing is plotted.

    Returns
    -------
    None
    """
    P = []
    covered0 = 0
    coveredA = 0
    screen = 0

    for i in range(nsim):
        X, Y, beta, active, sigma = instance(n=20, p=30)

        Y0 = np.random.standard_normal(X.shape[0]) * sigma

        # null pvalues and intervals

        cone, pvalue, idx, sign = selected_covtest(X, Y0, sigma=sigma)
        eta = X[:, idx] * sign
        p1, _, _, fam = gibbs_test(cone, Y0, eta,
                                   ndraw=50000,
                                   burnin=10000,
                                   alternative='twosided',
                                   sigma_known=True,
                                   tilt=eta,
                                   UMPU=False)

        observed_value = (Y0 * eta).sum()
        lower_lim, upper_lim = fam.equal_tailed_interval(observed_value)
        # eta' Sigma eta rescales the interval endpoints
        scale = np.dot(eta, np.dot(cone.covariance, eta))
        lower_lim_final = scale * lower_lim
        upper_lim_final = scale * upper_lim
        covered0 += (lower_lim_final < 0) * (upper_lim_final > 0)
        print('%s coverage0' % (covered0 / (i + 1.),))

        # compare to no tilting

        p2 = gibbs_test(cone, Y0, X[:, idx] * sign,
                        ndraw=50000,
                        burnin=10000,
                        alternative='twosided',
                        sigma_known=True,
                        tilt=None,
                        UMPU=False)[0]
        print('%s huh' % (p2,))
        P.append((p1, p2))
        Pa = np.array(P)

        # p1 and p2 should be very close, so have high correlation
        print('%s correlation' % (np.corrcoef(Pa.T)[0, 1],))

        # they should also look uniform -- mean should be about 0.5, sd about 0.29

        print('%s mean of nulls' % (np.mean(Pa, 0),))
        print('%s sd of nulls' % (np.std(Pa, 0),))

        # alternative intervals

        mu = 3 * X[:, 0] * sigma
        YA = np.random.standard_normal(X.shape[0]) * sigma + mu

        cone, pvalue, idx, sign = selected_covtest(X, YA, sigma=sigma)
        # NOTE(review): `tilt=eta` still refers to the direction selected
        # for the null response Y0 above, not to X[:, idx] * sign for YA.
        # Kept as in the original -- confirm whether this is intended.
        _, _, _, fam = gibbs_test(cone, YA, X[:, idx] * sign,
                                  ndraw=15000,
                                  burnin=10000,
                                  alternative='greater',
                                  sigma_known=True,
                                  tilt=eta)

        if idx == 0:
            screen += 1

            eta = X[:, 0] * sign
            observed_value = (YA * eta).sum()
            target = (eta * mu).sum()
            lower_lim, upper_lim = fam.equal_tailed_interval(observed_value)
            scale = np.dot(eta, np.dot(cone.covariance, eta))
            lower_lim_final = scale * lower_lim
            upper_lim_final = scale * upper_lim
            print('%s %s %s' % (lower_lim_final, upper_lim_final, target))
            coveredA += (lower_lim_final < target) * (upper_lim_final > target)
            print('%s coverageA' % (coveredA / (screen * 1.),))

        print('%s screening' % (screen / (i + 1.),))

    if not P:
        # no simulations were run, so there is nothing to plot
        return

    plt.figure()
    plt.scatter(Pa[:, 0], Pa[:, 1])

    try:
        import statsmodels.api as sm
        plt.figure()
        G = np.linspace(0, 1, 101)
        plt.plot(G, sm.distributions.ECDF(Pa[:, 0])(G))
        plt.plot(G, sm.distributions.ECDF(Pa[:, 1])(G))
    except ImportError:  # no statsmodels
        pass
Ejemplo n.º 52
0
def test_lasso(s=5, n=500, p=20, randomization=laplace(0, 1)):
    """ Returns null and alternative values for the lasso.

    Model chosen by lasso (non-randomized), inference done as if we randomized.

    The lasso selection event is written as ``A y < b`` with rows for the
    inactive (sub-gradient) constraints in both directions followed by the
    sign constraints on the active coefficients.  For each selected
    variable, bootstrap samples are weighted by
    ``randomization_cdf(randomization, boot_samples, A, b)`` and the
    p-value is the weighted fraction of samples flagged by ``comparison``.

    Parameters
    ----------
    s, n, p : int
        Sparsity, sample size and feature count passed to ``instance``.
    randomization : frozen distribution
        Passed through to ``randomization_cdf``.

    Returns
    -------
    null, alt : list, list
        P-values of selected variables outside / inside the true support.
    """
    X, y, _, nonzero, sigma = instance(n=n,
                                       p=p,
                                       random_signs=True,
                                       s=s,
                                       sigma=1.,
                                       rho=0)
    print('XTy %s' % (np.dot(X.T, y),))
    lam_frac = 1.

    lam = sigma * lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal((n, 10000)))).max(0))

    clf = linear_model.Lasso(
        alpha=lam /
        (2 * float(n)))  # should be alpha = lam/float(n) to be consistent
    clf.fit(X, y)
    soln = clf.coef_
    active = (soln != 0)  # boolean vector
    active_set = np.where(active)[
        0]  # column numbers of covariates chosen by lasso
    print('active_set %s' % (active_set,))
    active_size = np.sum(active)
    print('size of the active set %s' % (active_size,))

    inactive = ~active
    signs = np.sign(soln[active])

    print('true support %s' % (nonzero,))

    # LASSO region Ay < b
    X_M = X[:, active]
    X_inactive_T = X[:, inactive].T
    pseudo_X_M = np.linalg.pinv(X_M)
    pseudo_XT_M = np.linalg.pinv(X_M.T)

    # inactive constraints, in both directions
    P_M = np.dot(X_M, pseudo_X_M)
    A01 = np.dot(X_inactive_T, np.identity(n) - P_M) / lam
    A0 = np.concatenate((A01, -A01), axis=0)

    # sign constraints on the active coefficients
    A1 = -np.dot(np.diag(signs), pseudo_X_M)
    A = np.concatenate((A0, A1), axis=0)

    # shared term X_{-M}' (X_M')^+ signs, used with both signs in b0
    inactive_offset = np.dot(np.dot(X_inactive_T, pseudo_XT_M), signs)
    ones = np.ones(p - active_size)
    b0 = np.concatenate((ones - inactive_offset, ones + inactive_offset),
                        axis=0)
    mat = np.linalg.inv(np.dot(X_M.T, X_M))
    b1 = -lam * np.dot(np.dot(np.diag(signs), mat), signs)
    b = np.concatenate((b0, b1), axis=0)

    beta_bar = np.linalg.lstsq(X_M, y)[0]

    null, alt = [], []

    for i, j in enumerate(
            active_set):  # testing beta_i=0, corresponds to column X_j
        boot_samples, comparison = bootstrap(y, X, active, i, j)
        prob_selection = randomization_cdf(randomization, boot_samples, A, b)
        num = np.inner(np.asarray(comparison, dtype=int),
                       np.asarray(prob_selection))
        den = np.sum(np.asarray(prob_selection))
        p_value = num / den
        obs = beta_bar[i]
        print('observed:  %s p value:  %s' % (obs, p_value))
        if j in nonzero:
            alt.append(p_value)
        else:
            null.append(p_value)
    return null, alt