Code example #1
def test_MSE(signal=1, n=100, p=10, s=1):

    ninstance = 1
    total_mse = 0
    nvalid_instance = 0
    tau = 1.
    for i in range(ninstance):

        X, y, true_beta, nonzero, sigma = gaussian_instance(n=n,
                                                            p=p,
                                                            s=s,
                                                            signal=signal)
        random_Z = np.random.standard_normal(p)
        lam, epsilon, active, betaE, cube, initial_soln = selection(
            X, y,
            random_Z)  # selection not defined -- is in a file that was deleted
        print("active set", np.where(active)[0])
        if lam < 0:
            print("no active covariates")
        else:
            est = estimation(X, y, active, betaE, cube, epsilon, lam, sigma,
                             tau)
            est.compute_mle_all()

            mse_mle = est.mse_mle(true_beta[active])
            print("MLE", est.mle)
            total_mse += mse_mle
            nvalid_instance += np.sum(active)

    return np.true_divide(total_mse, nvalid_instance)
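
A note on these snippets: they are usage excerpts, so imports (e.g. import numpy as np, import regreg.api as rr) are omitted throughout; gaussian_instance itself lives in selection.tests.instance (see code example #25). Its return arity varies across package versions, which is why some examples unpack five values, some unpack a sixth trailing value as _, and others slice with [:3] or [:5]. A minimal sketch of the common pattern, assuming only the five leading return values:

import numpy as np
from selection.tests.instance import gaussian_instance

# design X, response y, true coefficients beta, true support, noise sd
out = gaussian_instance(n=100, p=20, s=3, signal=4., rho=0.25)
X, y, beta, nonzero, sigma = out[:5]
print(X.shape, y.shape, nonzero, sigma)
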
Code example #2
def simulate(n=100, p=40, rho=0.3, 
             signal=(5,5),
             do_knockoff=False,
             do_AIC=True,
             do_BIC=True,
             full_results={},
             alpha=0.05,
             s=7,
             correlation='equicorrelated',
             maxstep=np.inf,
             compute_maxT_identify=True,
             ndraw=8000,
             burnin=2000):

    if correlation == 'equicorrelated':
        X, y, _, active, sigma = gaussian_instance(n=n,
                                                   p=p,
                                                   rho=rho,
                                                   signal=signal,
                                                   s=s,
                                                   random_signs=False,
                                                   equicorrelated=True)
    elif correlation == 'AR':
        X, y, _, active, sigma = gaussian_instance(n=n,
                                                   p=p,
                                                   rho=rho,
                                                   signal=signal,
                                                   s=s,
                                                   random_signs=True,
                                                   equicorrelated=False)
    else:
        raise ValueError('correlation must be one of ["equicorrelated", "AR"]')

    full_results.setdefault('n', []).append(n)
    full_results.setdefault('p', []).append(p)
    full_results.setdefault('rho', []).append(rho)
    full_results.setdefault('s', []).append(len(active))
    full_results.setdefault('signalU', []).append(max(signal))
    full_results.setdefault('signalL', []).append(min(signal))
    full_results.setdefault('correlation', []).append(correlation)

    return compute_results(y, X, sigma, active, 
                           do_knockoff=do_knockoff,
                           full_results=full_results,
                           maxstep=maxstep,
                           compute_maxT_identify=compute_maxT_identify,
                           alpha=alpha,
                           ndraw=ndraw,
                           burnin=burnin,
                           do_AIC=do_AIC,
                           do_BIC=do_BIC,
                           )
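
The full_results bookkeeping above grows one list per column via dict.setdefault, and the mutable default full_results={} is shared across calls that omit the argument -- an apparently deliberate use of a well-known Python pitfall that lets repeated simulate() calls accumulate into a single table. A self-contained sketch of just that pattern:

def record(results, n, p, rho):
    # append one row's worth of values to per-column lists
    results.setdefault('n', []).append(n)
    results.setdefault('p', []).append(p)
    results.setdefault('rho', []).append(rho)

table = {}
record(table, 100, 40, 0.3)
record(table, 200, 40, 0.3)
print(table)  # {'n': [100, 200], 'p': [40, 40], 'rho': [0.3, 0.3]}
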
Code example #3
    def generate_X(self):

        (n, p, s, rho) = (self.n, self.p, self.s, self.rho)

        X_equi = gaussian_instance(n=n,
                                   p=p,
                                   equicorrelated=True,
                                   rho=self.equicor_rho)[0]
        X_AR = gaussian_instance(n=n, p=p, equicorrelated=False, rho=rho)[0]

        X = np.sqrt(
            self.AR_weight) * X_AR + np.sqrt(1 - self.AR_weight) * X_equi
        X /= np.sqrt((X**2).mean(0))[None, :]

        return X
Code example #4
def test_full_pvals(n=100, p=40, rho=0.3, signal=4, ndraw=8000, burnin=2000):

    X, y, beta, active, sigma, _ = gaussian_instance(n=n, p=p, signal=signal, rho=rho)
    FS = forward_step(X, y, covariance=sigma**2 * np.identity(n))

    from scipy.stats import norm as ndist
    pval = []
    completed_yet = False
    completion_index = None  # set once the true active set has been recovered
    for i in range(min(n, p)):
        FS.step()
        var_select, pval_select = FS.model_pivots(i+1, alternative='twosided',
                                                  which_var=[FS.variables[-1]],
                                                  saturated=False,
                                                  burnin=burnin,
                                                  ndraw=ndraw)[0]
        pval_saturated = FS.model_pivots(i+1, alternative='twosided',
                                         which_var=[FS.variables[-1]],
                                         saturated=True)[0][1]

        # now, nominal ones

        LSfunc = np.linalg.pinv(FS.X[:,FS.variables])
        Z = np.dot(LSfunc[-1], FS.Y) / (np.linalg.norm(LSfunc[-1]) * sigma)
        pval_nominal = 2 * ndist.sf(np.fabs(Z))
        pval.append((var_select, pval_select, pval_saturated, pval_nominal))
            
        if set(active).issubset(np.array(pval)[:,0]) and not completed_yet:
            completed_yet = True
            completion_index = i + 1

    return X, y, beta, active, sigma, np.array(pval), completion_index
Code example #5
def test_mcmc_tests(n=100, p=40, s=4, rho=0.3, signal=5, ndraw=None, burnin=2000,
                    nstep=200,
                    method='serial'):

    X, y, beta, active, sigma, _ = gaussian_instance(n=n, p=p, signal=signal, rho=rho, s=s)
    FS = forward_step(X, y, covariance=sigma**2 * np.identity(n))

    extra_steps = 4

    null_rank, alt_rank = None, None

    for i in range(min(n, p)):
        FS.step()

        if extra_steps <= 0:
            null_rank = forward_mod.mcmc_test(FS, 
                                              i+1, 
                                              variable=FS.variables[i-2], 
                                              nstep=nstep,
                                              burnin=burnin,
                                              method="serial")
            alt_rank = forward_mod.mcmc_test(FS, i+1,
                                             variable=FS.variables[0], 
                                             burnin=burnin,
                                             nstep=nstep, 
                                             method="parallel")
            break

        if set(active).issubset(FS.variables):
            extra_steps -= 1

    return null_rank, alt_rank
Code example #6
def sim2():
    X, Y, _, active, sigma = gaussian_instance(n=150, s=3)
    G = data_splitting.gaussian(X, Y, 5., split_frac=0.5, sigma=sigma)
    G.fit(use_full=True)
    if set(active).issubset(G.active) and G.active.shape[0] > len(active):
        return [G.hypothesis_test(G.active[len(active)])]
    return []
Code example #7
    def generate_X(self):

        n, p, s, rho = self.n, self.p, self.s, self.rho
        X = gaussian_instance(n=n, p=p, equicorrelated=False, rho=rho)[0]

        X *= np.sqrt(n)
        return X
Code example #8
def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)

    def meta_algorithm(XTX, XTXi, lam, sampler):

        p = XTX.shape[0]
        success = np.zeros(p)

        loss = rr.quadratic_loss((p, ), Q=XTX)
        pen = rr.l1norm(p, lagrange=lam)

        scale = 0.5
        noisy_S = sampler(scale=scale)
        loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0)
        problem = rr.simple_problem(loss, pen)
        soln = problem.solve(max_its=100, tol=1.e-10)
        success += soln != 0
        return set(np.nonzero(success)[0])

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)  # re-estimated; the sampler above was built with sigma**2

    lam = 4. * np.sqrt(n)
    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam)

    # run selection algorithm

    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                smooth_sampler,
                                success_params=(1, 1),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={
                                    'epochs': 20,
                                    'sizes': [100] * 5,
                                    'dropout': 0.,
                                    'activation': 'relu'
                                })
Code example #9
def test_independence_null_mcmc(n=100, p=40, s=4, rho=0.5, signal=5, 
                                ndraw=None, burnin=2000,
                                nstep=200,
                                method='serial'):

    X, y, beta, active, sigma, _ = gaussian_instance(n=n, p=p, signal=signal, rho=rho, s=s)
    FS = forward_step(X, y, covariance=sigma**2 * np.identity(n))

    extra_steps = 4
    completed = False

    null_ranks = []
    for i in range(min(n, p)):
        FS.step()

        if completed and extra_steps > 0:
            null_rank = forward_mod.mcmc_test(FS, 
                                              i+1, 
                                              variable=FS.variables[-1], 
                                              nstep=nstep,
                                              burnin=burnin,
                                              method="serial")
            null_ranks.append(int(null_rank))

        if extra_steps <= 0:
            break

        if set(active).issubset(FS.variables):
            extra_steps -= 1
            completed = True

    return tuple(null_ranks)
Code example #10
def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)

    def meta_algorithm(XTX, XTXi, dispersion, lam, sampler):

        # unlike the lasso-based variants of this meta-algorithm, selection
        # here is a marginal Z-test screen followed by a BH filter

        scale = 0.
        noisy_S = sampler(scale=scale)
        soln = XTXi.dot(noisy_S)
        solnZ = soln / (np.sqrt(np.diag(XTXi)) * np.sqrt(dispersion))
        pval = ndist.cdf(solnZ)
        pval = 2 * np.minimum(pval, 1 - pval)
        return set(BHfilter(pval, q=0.2))

    lam = 4. * np.sqrt(n)
    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi,
                                            dispersion, lam)

    # run selection algorithm

    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                smooth_sampler,
                                success_params=(1, 1),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={
                                    'epochs': 5,
                                    'sizes': [200] * 10,
                                    'dropout': 0.,
                                    'activation': 'relu'
                                })
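
BHfilter is used above without definition; as a reference, here is a minimal numpy sketch of the Benjamini-Hochberg step it presumably performs at level q (an assumption -- check the package source for the exact convention):

import numpy as np

def bh_select(pval, q=0.2):
    # reject the k smallest p-values, where k is the largest rank r
    # with p_(r) <= q * r / m
    pval = np.asarray(pval)
    m = len(pval)
    order = np.argsort(pval)
    passed = np.nonzero(pval[order] <= q * np.arange(1, m + 1) / m)[0]
    if len(passed) == 0:
        return np.array([], dtype=int)
    return np.sort(order[:passed.max() + 1])

print(bh_select([0.001, 0.02, 0.3, 0.8]))  # [0 1]
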
Code example #11
def test_approximate_mle(n=100,
                         p=10,
                         s=3,
                         snr=5,
                         rho=0.1,
                         lam_frac = 1.,
                         loss='gaussian',
                         randomizer='gaussian'):

    from selection.api import randomization

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho, snr=snr, sigma=1.)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    epsilon = 1. / np.sqrt(n)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)), lagrange=1.)
    if randomizer == 'gaussian':
        randomization_instance = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer == 'laplace':
        randomization_instance = randomization.laplace((p,), scale=1.)

    # avoid rebinding the imported `randomization` module
    M_est = M_estimator_approx(loss, epsilon, penalty, randomization_instance,
                               randomizer)
    M_est.solve_approx()

    inf = approximate_conditional_density(M_est)
    inf.solve_approx()

    active = M_est._overall
    active_set = np.asarray([i for i in range(p) if active[i]])

    true_support = np.asarray([i for i in range(p) if i < s])

    nactive = np.sum(active)

    print("active set, true_support", active_set, true_support)

    true_vec = beta[active]

    print("true coefficients", true_vec)

    if set(true_support).issubset(active_set):

        mle_active = np.zeros(nactive)

        for j in range(nactive):
            mle_active[j] = inf.approx_MLE_solver(j, nstep=100)[0]

        print("mle for target", mle_active)
Code example #12
    def generate_X(self):

        (n, p, s, rho) = (self.n, self.p, self.s, self.rho)

        X = gaussian_instance(n=n, p=p, equicorrelated=True, rho=rho, s=0)[0]
        X /= np.sqrt((X**2).sum(0))[None, :]
        X *= np.sqrt(n)

        return X
Code example #13
def test_selection():
    n = 500
    p = 100
    s = 0
    signal = 0.

    np.random.seed(3)  # fix the seed so the instance is reproducible
    X, y, beta, nonzero, sigma = gaussian_instance(n=n,
                                                   p=p,
                                                   s=s,
                                                   sigma=1.,
                                                   rho=0,
                                                   signal=signal)
    lam = 1. * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal(
            (n, 2000)))).max(0)) * sigma

    n, p = X.shape

    loss = rr.glm.gaussian(X, y)
    epsilon = 1. / np.sqrt(n)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)
    randomizer = randomization.isotropic_gaussian((p, ), scale=1.)

    M_est = M_estimator_approx(loss, epsilon, penalty, randomizer, 'gaussian',
                               'parametric')
    M_est.solve_approx()
    active = M_est._overall
    active_set = np.asarray([i for i in range(p) if active[i]])
    nactive = np.sum(active)

    prior_variance = 1000.
    noise_variance = sigma**2

    generative_mean = np.zeros(p)
    generative_mean[:nactive] = M_est.initial_soln[active]
    sel_split = selection_probability_random_lasso(M_est, generative_mean)
    soln = sel_split.minimize2(nstep=200)  # avoid shadowing builtin `min`
    print(soln[0], soln[1])

    test_point = np.append(M_est.observed_score_state,
                           np.abs(M_est.initial_soln[M_est._overall]))
    print("value of likelihood",
          sel_split.likelihood_loss.smooth_objective(test_point, mode="func"))

    inv_cov = np.linalg.inv(M_est.score_cov)
    lik = (M_est.observed_score_state -
           generative_mean).T.dot(inv_cov).dot(M_est.observed_score_state -
                                               generative_mean) / 2.
    print("value of likelihood check", lik)
    grad = inv_cov.dot(M_est.observed_score_state - generative_mean)
    print("grad at likelihood loss", grad)
Code example #14
def simulate(n=200, p=50, s=5, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(XTX, XTXi, dispersion, lam, sampler):

        # selection here is a marginal Z-statistic screen at threshold 2.1;
        # the quadratic-loss/l1 setup of the other variants is not needed

        scale = 0.5
        noisy_S = sampler(scale=scale)
        soln = XTXi.dot(noisy_S)
        solnZ = soln / (np.sqrt(np.diag(XTXi)) * np.sqrt(dispersion))
        return set(np.nonzero(np.fabs(solnZ) > 2.1)[0])

    lam = 4. * np.sqrt(n)
    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi,
                                            dispersion, lam)

    # run selection algorithm

    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                splitting_sampler,
                                success_params=(5, 7),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={
                                    'epochs': 30,
                                    'sizes': [100, 100],
                                    'activation': 'relu'
                                })
Code example #15
def sim():
    X, Y, _, active, sigma = gaussian_instance()
    print(sigma)
    G = data_carving.gaussian(X, Y, 1., split_frac=0.9, sigma=sigma)
    G.fit()
    if set(active).issubset(G.active) and G.active.shape[0] > len(active):
        return [
            G.hypothesis_test(G.active[len(active)], burnin=5000, ndraw=10000)
        ]
    return []
Code example #16
def simulate(n=1000, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, seed=0, B=5000):

    # description of statistical problem

    np.random.seed(seed)
    X, y, truth = gaussian_instance(n=n,
                                    p=p, 
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5, 
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False,
                                    center=False)[:3]

    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)

    def meta_algorithm(X, XTXi, resid, sampler):

        n, p = X.shape

        rho = 0.8
        S = sampler(scale=0.) # deterministic with scale=0
        ynew = X.dot(XTXi).dot(S) + resid # will be ok for n>p and non-degen X
        Xnew = rho * X + np.sqrt(1 - rho**2) * np.random.standard_normal(X.shape)

        X_full = np.hstack([X, Xnew])
        beta_full = np.linalg.pinv(X_full).dot(ynew)
        winners = np.fabs(beta_full)[:p] > np.fabs(beta_full)[p:]
        return set(np.nonzero(winners)[0])

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)  # re-estimated; unused below (the sampler uses sigma**2)
                         
    selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid)


    # run selection algorithm

    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                smooth_sampler,
                                success_params=(8, 10),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={'epochs':20, 'sizes':[100]*5, 'dropout':0., 'activation':'relu'})
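
The reconstruction inside meta_algorithm above deserves a sanity check: with scale=0 the sampler returns its center S = X^T y exactly, so X (X^T X)^{-1} S + resid is the projection of y onto the column space of X plus the least-squares residual, i.e. y itself -- hence the "will be ok for n>p and non-degen X" comment. A quick standalone verification:

import numpy as np

rng = np.random.default_rng(0)
n, p = 50, 10
X = rng.standard_normal((n, p))
y = rng.standard_normal(n)

XTXi = np.linalg.inv(X.T @ X)
resid = y - X @ XTXi @ X.T @ y   # residual of the full least-squares fit
S = X.T @ y                      # what sampler(scale=0.) returns here
ynew = X @ XTXi @ S + resid
print(np.allclose(ynew, y))      # True
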
Code example #17
    def generate_X(self):

        n, p, s, rho = self.n, self.p, self.s, self.rho
        X = gaussian_instance(n=n, p=p, equicorrelated=False, rho=rho)[0]

        X *= np.sqrt(n)
        return X
Code example #18
def simulate(n=200, p=100, s=10, signal=(1.5, 2), sigma=2, alpha=0.1, B=3000):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    dispersion = sigma**2

    covS = dispersion * X.T.dot(X)
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(X, XTXi, resid, sampler):

        S = sampler(scale=0.)  # deterministic with scale=0
        ynew = X.dot(XTXi).dot(S) + resid  # will be ok for n>p and non-degen X
        G = lasso_glmnet(X, ynew, *[None] * 4)
        select = G.select()
        return set(list(select[0]))

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)  # re-estimated; unused below

    selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid)

    # run selection algorithm

    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                splitting_sampler,
                                success_params=(1, 1),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={
                                    'epochs': 10,
                                    'sizes': [100] * 5,
                                    'dropout': 0.,
                                    'activation': 'relu'
                                })
Code example #19
def test_greedy_step(n=50, p=100, s=5, signal=5):
    X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=0., signal=signal, sigma=1.)
    greedy_step = approximate_inference(X,
                                        y,
                                        beta,
                                        sigma,
                                        seed_n=0,
                                        lam_frac=1.,
                                        loss='gaussian')

    if greedy_step is not None:
        print("output of selection adjusted inference", greedy_step)
        return greedy_step
Code example #20
    def generate(self):

        n, p, s = self.n, self.p, self.s
        X = gaussian_instance(n=n, p=p, equicorrelated=True, rho=0., s=s)[0]
        X /= np.sqrt((X**2).sum(0))[None, :]

        beta = np.zeros(p)
        beta[:s] = self.signal
        beta = randomize_signs(beta)
        np.random.shuffle(beta)

        X *= np.sqrt(n)
        Y = X.dot(beta) + np.random.standard_normal(n)

        return X, Y, beta
Code example #21
def test_randomized_lasso(n=300, p=500, s=5, signal=7.5, rho=0.2):

    X, Y, beta, active, sigma = gaussian_instance(n=n,
                                                  p=p,
                                                  s=s,
                                                  rho=rho,
                                                  signal=signal,
                                                  equicorrelated=False)

    L = randomized_lasso.gaussian(X, Y, 3.5 * sigma * np.ones(p))
    signs = L.fit()

    print(np.nonzero(signs != 0)[0])
    print(np.nonzero(beta != 0)[0])
    print(
        L.summary(signs != 0, ndraw=1000, burnin=200, compute_intervals=False))
Code example #22
def test_CV(ndraw=500, sigma_known=True,
            burnin=100,
            s=7,
            rho=0.3,
            method=lasso_tuned,
            snr=5):
    # generate a null and alternative pvalue
    # from a particular model

    X, Y, beta, active, sigma = gaussian_instance(n=500, p=100, s=s, rho=rho, snr=snr)
    if not sigma_known:
        sigma = None

    method_ = method(Y, X, scale_inter=0.0001, scale_valid=0.0001, scale_select=0.0001)

    do_null = True
    if do_null:
        which_var = method_.active_set[s]  # the first null one
        method_.setup_inference(which_var)
        iter(method_)

        for i in range(ndraw + burnin):
            method_.next()

        Z = np.array(method_.null_sample[which_var][burnin:])
        family = discrete_family(Z,
                                 np.ones_like(Z))
        obs = method_._gaussian_obs[which_var]

        pval0 = family.cdf(0, obs)
        pval0 = 2 * min(pval0, 1 - pval0)
    else:
        pval0 = np.random.sample()

    which_var = 0
    method_.setup_inference(which_var)
    iter(method_)
    for i in range(ndraw + burnin):
        method_.next()

    family = discrete_family(method_.null_sample[which_var][burnin:],
                             np.ones(ndraw))
    obs = method_._gaussian_obs[which_var]
    pvalA = family.cdf(0, obs)
    pvalA = 2 * min(pvalA, 1 - pvalA)
    return pval0, pvalA, method_
Code example #23
def test_BIC(do_sample=True, ndraw=8000, burnin=2000, 
             force=False):

    X, Y, beta, active, sigma, _ = gaussian_instance()
    n, p = X.shape
    FS = info_crit_stop(Y, X, sigma, cost=np.log(n))
    final_model = len(FS.variables) 

    active = set(list(active))
    if active.issubset(FS.variables) or force:
        which_var = [v for v in FS.variables if v not in active]

        if do_sample:
            return [pval[-1] for pval in FS.model_pivots(final_model,
                                                         saturated=False,
                                                         burnin=burnin,
                                                         ndraw=ndraw,
                                                         which_var=which_var)]
        else:
            saturated_pivots = FS.model_pivots(final_model, which_var=which_var)
            return [pval[-1] for pval in saturated_pivots]
    return []
Code example #24
def test_threshold(n, p, s, signal):
    X, y, beta, nonzero, sigma = gaussian_instance(n=n,
                                                   p=p,
                                                   s=s,
                                                   rho=0.,
                                                   signal=signal,
                                                   sigma=1.)
    true_mean = X.dot(beta)
    threshold = test_approximate_inference(X,
                                           y,
                                           true_mean,
                                           sigma,
                                           seed_n=0,
                                           lam_frac=1.,
                                           loss='gaussian')

    if threshold is not None:
        print("output of selection adjusted inference", threshold)
        return threshold
Code example #25
File: cv.py  Project: sophial05/selective-inference
def main():
    from selection.tests.instance import gaussian_instance
    np.random.seed(1)
    n, p = 3000, 1000
    X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=30, rho=0., sigma=1)
    loss = rr.glm.gaussian(X,y)
    lam_seq = np.exp(np.linspace(np.log(1.e-6), np.log(1), 30)) * np.fabs(np.dot(X.T,y)).max()
    K = 5
    folds = np.arange(n) % K
    CV_compute = CV(loss, folds, lam_seq)
    lam_CV, CV_val, SD_val, lam_CV_randomized, CV_val_randomized, SD_val_randomized = CV_compute.choose_lambda_CVr()
    #print("CV error curve (nonrandomized):", CV_val)

    minimum_CV = np.min(CV_val)
    lam_idx = list(lam_seq).index(lam_CV)
    SD_min = SD_val[lam_idx]
    lam_1SD = lam_seq[max([i for i in range(lam_seq.shape[0]) if CV_val[i] <= minimum_CV + SD_min])]

    #print(lam_CV, lam_1SD)
    import matplotlib.pyplot as plt
    plt.plot(np.log(lam_seq), CV_val)
    plt.show()
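
The last few lines of main() implement the usual one-standard-error rule: among all lambda values whose CV error is within one SD of the minimum, take the largest (lam_seq is increasing here, so the max index gives the sparsest acceptable fit). A toy illustration with made-up numbers:

import numpy as np

lam_seq = np.array([0.1, 0.5, 1.0, 2.0, 4.0])
CV_val = np.array([3.0, 2.0, 1.5, 1.7, 3.5])
SD_val = np.array([0.2, 0.2, 0.3, 0.3, 0.4])

lam_CV = lam_seq[np.argmin(CV_val)]              # 1.0, the CV minimizer
SD_min = SD_val[np.argmin(CV_val)]
within = CV_val <= CV_val.min() + SD_min         # within one SD of the minimum
lam_1SD = lam_seq[np.max(np.nonzero(within)[0])] # 2.0
print(lam_CV, lam_1SD)
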
Code example #26
def test_nonrandomized(s=0,
                       n=200,
                       p=10,
                       signal=7,
                       rho=0,
                       lam_frac=0.8,
                       loss='gaussian',
                       solve_args={
                           'min_its': 20,
                           'tol': 1.e-10
                       }):
    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n,
                                                       p=p,
                                                       s=s,
                                                       rho=rho,
                                                       signal=signal,
                                                       sigma=1)
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.standard_normal(
                (n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n,
                                          p=p,
                                          s=s,
                                          rho=rho,
                                          signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2,
                                                   (n, 10000)))).max(0))

    nonzero = np.where(beta)[0]
    print("lam", lam)
    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)
    true_vec = beta
    M_est = M_estimator(lam, loss, penalty)
    M_est.solve()
    active = M_est._overall
    nactive = np.sum(active)
    print("nactive", nactive)
    if nactive == 0:
        return None

    #score_mean = M_est.observed_internal_state.copy()
    #score_mean[nactive:] = 0
    M_est.setup_sampler(score_mean=np.zeros(p))
    #M_est.setup_sampler(score_mean=score_mean)
    #M_est.sample(ndraw = 1000, burnin=1000, stepsize=1./p)

    if set(nonzero).issubset(np.nonzero(active)[0]):
        check_screen = True
        #test_stat = lambda x: np.linalg.norm(x)
        #return M_est.hypothesis_test(test_stat, test_stat(M_est.observed_internal_state), stepsize=1./p)

        ci = M_est.confidence_intervals(M_est.observed_internal_state)
        pivots = M_est.coefficient_pvalues(M_est.observed_internal_state)

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)

            for j in range(nactive):
                if check_screen:
                    if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                        covered[j] = 1
                else:
                    covered[j] = np.nan  # coverage undefined without screening
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered = coverage(ci)[0]
        #print(pivots)
        #print(coverage)
        return pivots, covered
Code example #27
def simulate(n=100, p=50, s=10, signal=(0, 0), sigma=2, alpha=0.1):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.0,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    covS = dispersion * X.T.dot(X)
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(X, XTXi, resid, sampler):

        S = sampler(scale=0.)  # deterministic with scale=0
        ynew = X.dot(XTXi).dot(S) + resid  # will be ok for n>p and non-degen X
        G = lasso_glmnet(X, ynew, *[None] * 4)
        select = G.select()
        return set(list(select[0]))

    selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid)

    # run selection algorithm

    observed_set = selection_algorithm(splitting_sampler)

    # find the target, based on the observed outcome

    # we just take the first target

    pivots, covered, lengths = [], [], []
    naive_pivots, naive_covered, naive_lengths = [], [], []

    for idx in list(observed_set)[:1]:
        print("variable: ", idx, "total selected: ", len(observed_set))
        true_target = truth[idx]

        (pivot, interval) = infer_full_target(selection_algorithm,
                                              observed_set,
                                              idx,
                                              splitting_sampler,
                                              dispersion,
                                              hypothesis=true_target,
                                              fit_probability=probit_fit,
                                              alpha=alpha,
                                              B=500)

        pivots.append(pivot)
        covered.append(
            (interval[0] < true_target) * (interval[1] > true_target))
        lengths.append(interval[1] - interval[0])

        target_sd = np.sqrt(dispersion * XTXi[idx, idx])
        observed_target = np.squeeze(XTXi[idx].dot(X.T.dot(y)))
        quantile = ndist.ppf(1 - 0.5 * alpha)
        naive_interval = (observed_target - quantile * target_sd,
                          observed_target + quantile * target_sd)
        naive_pivots.append((1 - ndist.cdf(
            (observed_target - true_target) / target_sd)))  # one-sided

        naive_covered.append((naive_interval[0] < true_target) *
                             (naive_interval[1] > true_target))
        naive_lengths.append(naive_interval[1] - naive_interval[0])

    return pivots, covered, lengths, naive_pivots, naive_covered, naive_lengths
Code example #28
def test_without_screening(s=10,
                           n=300,
                           p=100,
                           rho=0.,
                           signal=3.5,
                           lam_frac=1.,
                           ndraw=10000,
                           burnin=2000,
                           loss='gaussian',
                           randomizer='laplace',
                           randomizer_scale=1.,
                           scalings=False,
                           subgrad=True,
                           check_screen=False):

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n,
                                                       p=p,
                                                       s=s,
                                                       rho=rho,
                                                       signal=signal,
                                                       sigma=1,
                                                       random_signs=False)
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.standard_normal(
                (n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
        X_indep, y_indep, _, _, _ = gaussian_instance(n=n,
                                                      p=p,
                                                      s=s,
                                                      rho=rho,
                                                      signal=signal,
                                                      sigma=1)
        loss_indep = rr.glm.gaussian(X_indep, y_indep)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n,
                                          p=p,
                                          s=s,
                                          rho=rho,
                                          signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2,
                                                   (n, 10000)))).max(0))
        X_indep, y_indep, _, _ = logistic_instance(n=n,
                                                   p=p,
                                                   s=s,
                                                   rho=rho,
                                                   signal=signal,
                                                   random_signs=False)
        loss_indep = rr.glm.logistic(X_indep, y_indep)
    nonzero = np.where(beta)[0]

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p, ), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p, ),
                                                      scale=randomizer_scale)

    epsilon = 1. / np.sqrt(n)
    W = np.ones(p) * lam
    #W[0] = 0 # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)
    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)
    M_est.solve()
    active_union = M_est._overall
    nactive = np.sum(active_union)
    print("nactive", nactive)
    active_set = np.nonzero(active_union)[0]
    print("active set", active_set)
    print("true nonzero", np.nonzero(beta)[0])

    views = [M_est]
    queries = multiple_queries(views)
    queries.solve()

    screened = False
    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        screened = True

    if not check_screen or screened:

        #if nactive==s:
        #    return None

        if scalings:  # try condition on some scalings
            M_est.condition_on_subgradient()
            M_est.condition_on_scalings()
        if subgrad:
            M_est.decompose_subgradient(
                conditioning_groups=np.zeros(p, dtype=bool),
                marginalizing_groups=np.ones(p, dtype=bool))

        boot_target1, boot_target_observed1 = pairs_bootstrap_glm(
            loss, active_union, inactive=~active_union)
        boot_target2, boot_target_observed2 = pairs_bootstrap_glm(
            loss_indep, active_union, inactive=~active_union)
        target_observed = (boot_target_observed1 -
                           boot_target_observed2)[:nactive]

        def _target(indices):
            return boot_target1(indices)[:nactive] - boot_target2(
                indices)[:nactive]

        form_covariances = glm_nonparametric_bootstrap(n, n)
        queries.setup_sampler(form_covariances)
        queries.setup_opt_state()

        target_sampler = queries.setup_target(_target,
                                              target_observed,
                                              reference=target_observed)

        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
        LU = target_sampler.confidence_intervals(target_observed,
                                                 sample=target_sample,
                                                 level=0.9)
        pivots = target_sampler.coefficient_pvalues(
            target_observed, parameter=np.zeros(nactive), sample=target_sample)

        #test_stat = lambda x: np.linalg.norm(x - beta[active_union])
        #observed_test_value = test_stat(target_observed)
        #pivots = target_sampler.hypothesis_test(test_stat,
        #                                       observed_test_value,
        #                                       alternative='twosided',
        #                                       parameter = beta[active_union],
        #                                       ndraw=ndraw,
        #                                       burnin=burnin,
        #                                       stepsize=None)

        true_vec = np.zeros(nactive)

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)
            for j in range(nactive):
                if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                    covered[j] = 1
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered, ci_length = coverage(LU)
        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        covered_naive, ci_length_naive = coverage(LU_naive)
        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)
        return pivots, covered, ci_length, naive_pvals, covered_naive, ci_length_naive
Code example #29
def simulate(n=100, p=40, rho=0.3, 
             signal=(5,5),
             do_knockoff=False,
             do_AIC=True,
             do_BIC=True,
             do_glmnet=True,
             full_results={},
             alpha=0.05,
             s=7,
             correlation='equicorrelated',
             maxstep=np.inf,
             compute_maxT_identify=True,
             ndraw=8000,
             burnin=2000):

    if correlation == 'equicorrelated':
        X, y, _, active, sigma = gaussian_instance(n=n,
                                                   p=p,
                                                   rho=rho,
                                                   signal=signal,
                                                   s=s,
                                                   random_signs=False,
                                                   equicorrelated=True)[:5]
    elif correlation == 'AR':
        X, y, _, active, sigma = gaussian_instance(n=n,
                                                   p=p,
                                                   rho=rho,
                                                   signal=signal,
                                                   s=s,
                                                   random_signs=True,
                                                   equicorrelated=False)[:5]
    else:
        raise ValueError('correlation must be one of ["equicorrelated", "AR"]')

    value = compute_results(y, X, sigma, active, 
                            do_knockoff=do_knockoff,
                            full_results=full_results,
                            maxstep=maxstep,
                            compute_maxT_identify=compute_maxT_identify,
                            alpha=alpha,
                            ndraw=ndraw,
                            burnin=burnin,
                            do_AIC=do_AIC,
                            do_BIC=do_BIC,
                            do_glmnet=do_glmnet,
                            )

    full_results.setdefault('n', []).append(n)
    full_results.setdefault('p', []).append(p)
    full_results.setdefault('rho', []).append(rho)
    full_results.setdefault('s', []).append(len(active))
    full_results.setdefault('signalU', []).append(max(signal))
    full_results.setdefault('signalL', []).append(min(signal))
    full_results.setdefault('correlation', []).append(correlation)

    min_len = min([len(full_results[k]) for k in full_results.keys()])

    for k in full_results.keys():
        full_results[k] = full_results[k][:min_len]

    return value
Code example #30
def test_kfstep(k=4, s=3, n=100, p=10, Langevin_steps=10000, burnin=2000):

    X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, random_signs=True, s=s, sigma=1.,rho=0, signal=10)
    epsilon = 0.

    randomization = laplace(loc=0, scale=1.)

    j_seq = np.empty(k, dtype=int)
    s_seq = np.empty(k)

    left = np.ones(p, dtype=bool)
    obs = 0

    initial_state = np.zeros(n + np.sum([i for i in range(p-k+1,p+1)]))
    initial_state[:n] = y.copy()

    mat = [np.zeros((n, ncol)) for ncol in range(p, p-k, -1)]  # placeholders, filled in the loop below

    curr = n

    keep = np.zeros(p, dtype=bool)

    for i in range(k):
        X_left = X[:,left]
        X_selected = X[:, ~left]
        if (np.sum(left)<p):
            P_perp = np.identity(n) - X_selected.dot(np.linalg.pinv(X_selected))
            mat[i] = P_perp.dot(X_left)
        else:
            mat[i] = X

        mat_complete = np.zeros((n,p))
        mat_complete[:, left] = mat[i]

        T = np.dot(mat[i].T, y)
        T_complete = np.dot(mat_complete.T, y)

        obs = np.max(np.abs(T))
        keep = np.copy(~left)

        random_Z = randomization.rvs(T.shape[0])
        T_random = T + random_Z
        initial_state[curr:(curr+p-i)] = T_random # initializing subgradients
        curr = curr + p-i

        j_seq[i] = np.argmax(np.abs(T_random))
        s_seq[i] = np.sign(T_random[j_seq[i]])

        #def find_index(v, idx1):
        #    _sumF = 0
        #    _sumT = 0
        #    idx = idx1+1
        #    for i in range(v.shape[0]):
        #        if (v[i] == False):
        #            _sumF = _sumF + 1
        #        else:
        #           _sumT = _sumT + 1
        #        if _sumT >= idx: break
        #    return (_sumT + _sumF-1)

        T_complete[left] += random_Z
        left[np.argmax(np.abs(T_complete))] = False


    # conditioning
    linear_part = X[:, keep].T
    P = np.dot(linear_part.T, np.linalg.pinv(linear_part).T)
    I = np.identity(linear_part.shape[1])
    R = I - P


    def full_projection(state, n=n, p=p, k=k):
        """
        """
        new_state = np.empty(state.shape, np.float)
        new_state[:n] = state[:n]
        curr = n
        for i in range(k):
            projection = projection_cone(p-i, j_seq[i], s_seq[i])
            new_state[curr:(curr+p-i)] = projection(state[curr:(curr+p-i)])
            curr = curr+p-i
        return new_state


    def full_gradient(state, n=n, p=p, k=k, X=X, mat=mat):
        data = state[:n]

        grad = np.empty(n + np.sum([i for i in range(p-k+1,p+1)]))
        grad[:n] = - data

        curr = n
        for i in range(k):
            subgrad = state[curr:(curr+p-i)]

            sign_vec = np.sign(-mat[i].T.dot(data) + subgrad)
            grad[curr:(curr + p - i)] = -sign_vec
            curr = curr+p-i
            grad[:n] += mat[i].dot(sign_vec)

        return grad



    sampler = projected_langevin(initial_state,
                                 full_gradient,
                                 full_projection,
                                 1./p)
    samples = []


    for i in range(Langevin_steps):
        if i > burnin:
            old_state = sampler.state.copy()
            old_data = old_state[:n]
            sampler.next()
            new_state = sampler.state.copy()
            new_data = new_state[:n]
            new_data = np.dot(P, old_data) + np.dot(R, new_data)
            sampler.state[:n] = new_data
            samples.append(sampler.state.copy())


    samples = np.array(samples)
    Z = samples[:,:n]

    pop = np.abs(mat[k-1].T.dot(Z.T)).max(0)
    fam = discrete_family(pop, np.ones_like(pop))
    pval = fam.cdf(0, obs)
    pval = 2 * min(pval, 1 - pval)

    print('pvalue:', pval)
    return pval
Code example #31
def simulate(n=1000, p=100, s=20, signal=(2, 4), sigma=2, alpha=0.1, B=2000):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.1,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=True)[:3]

    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(XTX, XTXi, sampler):

        min_success = 6
        ntries = 10

        def _alpha_grid(X, y, center, XTX):
            n, p = X.shape
            alphas, coefs, _ = lasso_path(X, y, Xy=center, precompute=XTX)
            nselected = np.count_nonzero(coefs, axis=0)
            return alphas[nselected < np.sqrt(0.8 * p)]

        alpha_grid = _alpha_grid(X, y, sampler(scale=0.), XTX)
        success = np.zeros((p, alpha_grid.shape[0]))

        for _ in range(ntries):
            scale = 1.  # corresponds to sub-samples of 50%
            noisy_S = sampler(scale=scale)
            _, coefs, _ = lasso_path(X, y, Xy=noisy_S, precompute=XTX,
                                     alphas=alpha_grid)
            success += np.abs(np.sign(coefs))

        selected = np.apply_along_axis(
            lambda row: any(x > min_success for x in row), 1, success)
        return set(np.nonzero(selected)[0])  # avoid shadowing builtin `vars`

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n-p)

    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi)

    # run selection algorithm


    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                splitting_sampler,
                                success_params=(1, 1),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={'epochs':10, 'sizes':[100]*5, 'dropout':0., 'activation':'relu'})