Example #1
    def choose_lambda_CVR(self, scale1=None, scale2=None, loss=None):
        """
        Minimizes CV error curve with additive randomization (CVR=CV+R1+R2=CV1+R2)
        """
        if loss is None:
            loss = copy.copy(self.loss)
        CV_curve = []
        X, _ = loss.data
        p = X.shape[1]
        for lam in self.lam_seq:
            penalty = rr.l1norm(p, lagrange=lam)
            #CV_curve.append(self.CV_err(penalty, loss) + (lam,))
            CV_curve.append(self.CV_err(penalty, loss))

        CV_curve = np.array(CV_curve)

        rv1, rv2 = np.zeros(self.lam_seq.shape[0]), np.zeros(
            self.lam_seq.shape[0])
        if scale1 is not None:
            randomization1 = randomization.isotropic_gaussian(
                (self.lam_seq.shape[0], ), scale=scale1)
            rv1 = np.asarray(randomization1._sampler(size=(1, )))
        if scale2 is not None:
            randomization2 = randomization.isotropic_gaussian(
                (self.lam_seq.shape[0], ), scale=scale2)
            rv2 = np.asarray(randomization2._sampler(size=(1, )))
        CVR_val = CV_curve[:, 0] + rv1.flatten() + rv2.flatten()
        lam_CVR = self.lam_seq[np.argmin(CVR_val)]  # lam_CVR minimizes CVR
        CV1_val = CV_curve[:, 0] + rv1.flatten()

        SD = CV_curve[:, 1]
        return lam_CVR, SD, CVR_val, CV1_val, self.lam_seq
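A minimal, self-contained sketch of the randomization idea above: add independent Gaussian noise (scales scale1 and scale2) to a cross-validation error curve and minimize the randomized curve. The lam_seq and cv_err values below are illustrative stand-ins, not part of the selection API.

import numpy as np

rng = np.random.default_rng(0)
lam_seq = np.linspace(0.1, 2.0, 20)                 # candidate tuning parameters
cv_err = (lam_seq - 0.7) ** 2 + 0.05 * rng.standard_normal(lam_seq.shape[0])  # toy CV curve

scale1, scale2 = 0.1, 0.2                           # randomization scales
rv1 = scale1 * rng.standard_normal(lam_seq.shape[0])
rv2 = scale2 * rng.standard_normal(lam_seq.shape[0])

CVR_val = cv_err + rv1 + rv2                        # CVR = CV + R1 + R2
CV1_val = cv_err + rv1                              # CV1 = CV + R1
lam_CVR = lam_seq[np.argmin(CVR_val)]               # minimizer of the randomized curve
print(lam_CVR, CVR_val.min())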
def test_approximate_mle(n=100,
                         p=10,
                         s=3,
                         snr=5,
                         rho=0.1,
                         lam_frac = 1.,
                         loss='gaussian',
                         randomizer='gaussian'):

    from selection.api import randomization

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho, snr=snr, sigma=1.)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    epsilon = 1. / np.sqrt(n)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)), lagrange=1.)
    if randomizer == 'gaussian':
        randomization = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer == 'laplace':
        randomization = randomization.laplace((p,), scale=1.)

    M_est = M_estimator_approx(loss, epsilon, penalty, randomization, randomizer)
    M_est.solve_approx()

    inf = approximate_conditional_density(M_est)
    inf.solve_approx()

    active = M_est._overall
    active_set = np.asarray([i for i in range(p) if active[i]])

    true_support = np.asarray([i for i in range(p) if i < s])

    nactive = np.sum(active)

    print("active set, true_support", active_set, true_support)

    true_vec = beta[active]

    print("true coefficients", true_vec)

    if set(true_support).issubset(active_set):

        mle_active = np.zeros(nactive)

        for j in range(nactive):
            mle_active[j] = inf.approx_MLE_solver(j, nstep=100)[0]

        print("mle for target", mle_active)
Example #3
def test_selection():
    n = 500
    p = 100
    s = 0
    signal = 0.

    np.random.seed(3)  # fix the seed so the simulated instance is reproducible
    X, y, beta, nonzero, sigma = gaussian_instance(n=n,
                                                   p=p,
                                                   s=s,
                                                   sigma=1.,
                                                   rho=0,
                                                   signal=signal)
    lam = 1. * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal(
            (n, 2000)))).max(0)) * sigma

    n, p = X.shape

    loss = rr.glm.gaussian(X, y)
    epsilon = 1. / np.sqrt(n)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)
    randomizer = randomization.isotropic_gaussian((p, ), scale=1.)

    M_est = M_estimator_approx(loss, epsilon, penalty, randomizer, 'gaussian',
                               'parametric')
    M_est.solve_approx()
    active = M_est._overall
    active_set = np.asarray([i for i in range(p) if active[i]])
    nactive = np.sum(active)

    prior_variance = 1000.
    noise_variance = sigma**2

    generative_mean = np.zeros(p)
    generative_mean[:nactive] = M_est.initial_soln[active]
    sel_split = selection_probability_random_lasso(M_est, generative_mean)
    sel_min = sel_split.minimize2(nstep=200)
    print(sel_min[0], sel_min[1])

    test_point = np.append(M_est.observed_score_state,
                           np.abs(M_est.initial_soln[M_est._overall]))
    print("value of likelihood",
          sel_split.likelihood_loss.smooth_objective(test_point, mode="func"))

    inv_cov = np.linalg.inv(M_est.score_cov)
    lik = (M_est.observed_score_state -
           generative_mean).T.dot(inv_cov).dot(M_est.observed_score_state -
                                               generative_mean) / 2.
    print("value of likelihood check", lik)
    grad = inv_cov.dot(M_est.observed_score_state - generative_mean)
    print("grad at likelihood loss", grad)
Example #4
def test_without_screening(s=10,
                           n=300,
                           p=100,
                           rho=0.,
                           signal=3.5,
                           lam_frac=1.,
                           ndraw=10000,
                           burnin=2000,
                           loss='gaussian',
                           randomizer='laplace',
                           randomizer_scale=1.,
                           scalings=False,
                           subgrad=True,
                           check_screen=False):

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n,
                                                       p=p,
                                                       s=s,
                                                       rho=rho,
                                                       signal=signal,
                                                       sigma=1,
                                                       random_signs=False)
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.standard_normal(
                (n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
        X_indep, y_indep, _, _, _ = gaussian_instance(n=n,
                                                      p=p,
                                                      s=s,
                                                      rho=rho,
                                                      signal=signal,
                                                      sigma=1)
        loss_indep = rr.glm.gaussian(X_indep, y_indep)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n,
                                          p=p,
                                          s=s,
                                          rho=rho,
                                          signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2,
                                                   (n, 10000)))).max(0))
        X_indep, y_indep, _, _ = logistic_instance(n=n,
                                                   p=p,
                                                   s=s,
                                                   rho=rho,
                                                   signal=signal,
                                                   random_signs=False)
        loss_indep = rr.glm.logistic(X_indep, y_indep)
    nonzero = np.where(beta)[0]

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p, ), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p, ),
                                                      scale=randomizer_scale)

    epsilon = 1. / np.sqrt(n)
    W = np.ones(p) * lam
    #W[0] = 0 # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)
    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)
    M_est.solve()
    active_union = M_est._overall
    nactive = np.sum(active_union)
    print("nactive", nactive)
    active_set = np.nonzero(active_union)[0]
    print("active set", active_set)
    print("true nonzero", np.nonzero(beta)[0])

    views = [M_est]
    queries = multiple_queries(views)
    queries.solve()

    screened = False
    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        screened = True

    if (not check_screen) or screened:

        #if nactive==s:
        #    return None

        if scalings:  # try condition on some scalings
            M_est.condition_on_subgradient()
            M_est.condition_on_scalings()
        if subgrad:
            M_est.decompose_subgradient(conditioning_groups=np.zeros(
                p, dtype=bool),
                                        marginalizing_groups=np.ones(p, bool))

        boot_target1, boot_target_observed1 = pairs_bootstrap_glm(
            loss, active_union, inactive=~active_union)
        boot_target2, boot_target_observed2 = pairs_bootstrap_glm(
            loss_indep, active_union, inactive=~active_union)
        target_observed = (boot_target_observed1 -
                           boot_target_observed2)[:nactive]

        def _target(indices):
            return boot_target1(indices)[:nactive] - boot_target2(
                indices)[:nactive]

        form_covariances = glm_nonparametric_bootstrap(n, n)
        queries.setup_sampler(form_covariances)
        queries.setup_opt_state()

        target_sampler = queries.setup_target(_target,
                                              target_observed,
                                              reference=target_observed)

        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
        LU = target_sampler.confidence_intervals(target_observed,
                                                 sample=target_sample,
                                                 level=0.9)
        pivots = target_sampler.coefficient_pvalues(
            target_observed, parameter=np.zeros(nactive), sample=target_sample)

        #test_stat = lambda x: np.linalg.norm(x - beta[active_union])
        #observed_test_value = test_stat(target_observed)
        #pivots = target_sampler.hypothesis_test(test_stat,
        #                                       observed_test_value,
        #                                       alternative='twosided',
        #                                       parameter = beta[active_union],
        #                                       ndraw=ndraw,
        #                                       burnin=burnin,
        #                                       stepsize=None)

        true_vec = np.zeros(nactive)

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)
            for j in range(nactive):
                if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                    covered[j] = 1
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered, ci_length = coverage(LU)
        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        covered_naive, ci_length_naive = coverage(LU_naive)
        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)
        return pivots, covered, ci_length, naive_pvals, covered_naive, ci_length_naive
Example #5
def randomized_marginal_lasso_screening(X, y, beta, sigma):

    from selection.api import randomization

    n, p = X.shape

    random_Z = np.random.standard_normal(p)
    Z_stats = X.T.dot(y)
    randomized_Z_stats = np.true_divide(Z_stats, sigma) + random_Z

    active_1 = np.zeros(p, bool)
    active_1[np.fabs(randomized_Z_stats) > 2.33] = 1
    active_signs_1 = np.sign(randomized_Z_stats[active_1])
    nactive_1 = active_1.sum()
    threshold = 2.33 * np.ones(p)

    #print("active_1", active_1, nactive_1)

    X_step2 = X[:, active_1]
    random_Z_2 = np.random.standard_normal(nactive_1)
    sel = selection(X_step2, y, random_Z_2)
    lam, epsilon, active_2, betaE, cube, initial_soln = sel
    noise_variance = 1.
    lagrange = lam * np.ones(nactive_1)
    nactive_2 = betaE.shape[0]
    #print("active_2", active_2, nactive_2)
    active_signs_2 = np.sign(betaE)

    # getting the active indices
    active = np.zeros(p, bool)
    indices_stage2 = np.where(active_1 == 1)[0]
    active[indices_stage2[active_2]] = 1
    nactive = active.sum()
    print("the active indices after two stages of screening", active.sum())

    primal_feasible_1 = np.fabs(randomized_Z_stats[active_1])
    primal_feasible_2 = np.fabs(betaE)
    feasible_point = np.append(primal_feasible_1, primal_feasible_2)

    randomizer = randomization.isotropic_gaussian((p, ), 1.)

    generative_X = X_step2[:, active_2]
    prior_variance = 1000.

    projection_active = X[:, active].dot(
        np.linalg.inv(X[:, active].T.dot(X[:, active])))
    M_1 = prior_variance * (X.dot(X.T)) + noise_variance * np.identity(n)
    M_2 = prior_variance * ((X.dot(X.T)).dot(projection_active))
    M_3 = prior_variance * (projection_active.T.dot(X.dot(
        X.T)).dot(projection_active))
    post_mean = M_2.T.dot(np.linalg.inv(M_1)).dot(y)

    #print("observed data", post_mean)

    post_var = M_3 - M_2.T.dot(np.linalg.inv(M_1)).dot(M_2)

    unadjusted_intervals = np.vstack([
        post_mean - 1.65 * (np.sqrt(post_var.diagonal())),
        post_mean + 1.65 * (np.sqrt(post_var.diagonal()))
    ])

    grad_map = sel_prob_gradient_map_ms_lasso(
        X,
        feasible_point,  # in R^{|E|_1 + |E|_2}
        active_1,  # the active set chosen by randomized marginal screening
        active_2,  # the active set chosen by randomized lasso
        active_signs_1,  # the set of signs of active coordinates chosen by ms
        active_signs_2,  # the set of signs of active coordinates chosen by lasso
        lagrange,  # in R^p
        threshold,  # in R^p
        generative_X,  # in R^{p}\times R^{n}
        noise_variance,
        randomizer,
        epsilon)

    ms = selective_map_credible_ms_lasso(y, grad_map, prior_variance)

    samples = ms.posterior_samples()

    adjusted_intervals = np.vstack([
        np.percentile(samples, 5, axis=0),
        np.percentile(samples, 95, axis=0)
    ])

    selective_mean = np.mean(samples, axis=0)

    coverage_ad = np.zeros(nactive)
    coverage_unad = np.zeros(nactive)
    ad_length = np.zeros(nactive)
    unad_length = np.zeros(nactive)

    true_val = projection_active.T.dot(X.dot(beta))

    for l in range(nactive):
        if (adjusted_intervals[0, l] <=
                true_val[l]) and (true_val[l] <= adjusted_intervals[1, l]):
            coverage_ad[l] += 1
        ad_length[l] = adjusted_intervals[1, l] - adjusted_intervals[0, l]
        if (unadjusted_intervals[0, l] <=
                true_val[l]) and (true_val[l] <= unadjusted_intervals[1, l]):
            coverage_unad[l] += 1
        unad_length[l] = unadjusted_intervals[1, l] - unadjusted_intervals[0,
                                                                           l]

    sel_cov = coverage_ad.sum() / nactive
    naive_cov = coverage_unad.sum() / nactive
    ad_len = ad_length.sum() / nactive
    unad_len = unad_length.sum() / nactive
    bayes_risk_ad = np.power(selective_mean - true_val, 2.).sum() / nactive
    bayes_risk_unad = np.power(post_mean - true_val, 2.).sum() / nactive

    return np.vstack(
        [sel_cov, naive_cov, ad_len, unad_len, bayes_risk_ad, bayes_risk_unad])
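The unadjusted intervals in this function (and in several later examples) come from one conjugate Gaussian computation: with prior variance prior_variance on the projected target and noise variance noise_variance, the matrices M_1, M_2, M_3 give posterior mean M_2^T M_1^{-1} y and posterior covariance M_3 - M_2^T M_1^{-1} M_2; the 1.65 multiplier is the 90% normal quantile used in the code. A stand-alone sketch of just that block, with simulated X and y:

import numpy as np

rng = np.random.default_rng(2)
n, p, k = 50, 10, 3
X = rng.standard_normal((n, p))
y = rng.standard_normal(n)
active = np.zeros(p, bool)
active[:k] = True

noise_variance = 1.
prior_variance = 1000.

# projection onto the selected columns: X_E (X_E^T X_E)^{-1}
projection_active = X[:, active].dot(
    np.linalg.inv(X[:, active].T.dot(X[:, active])))
M_1 = prior_variance * X.dot(X.T) + noise_variance * np.identity(n)
M_2 = prior_variance * X.dot(X.T).dot(projection_active)
M_3 = prior_variance * projection_active.T.dot(X.dot(X.T)).dot(projection_active)

post_mean = M_2.T.dot(np.linalg.inv(M_1)).dot(y)
post_var = M_3 - M_2.T.dot(np.linalg.inv(M_1)).dot(M_2)
unadjusted_intervals = np.vstack([post_mean - 1.65 * np.sqrt(post_var.diagonal()),
                                  post_mean + 1.65 * np.sqrt(post_var.diagonal())])
print(unadjusted_intervals)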
def test_intervals(s=0,
                   n=200,
                   p=10,
                   signal=7,
                   rho=0.,
                   lam_frac=6.,
                   ndraw=10000,
                   burnin=2000,
                   bootstrap=True,
                   loss='gaussian',
                   intervals='old',
                   randomizer='laplace',
                   solve_args={
                       'min_its': 50,
                       'tol': 1.e-10
                   }):

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p, ), scale=1.)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p, ), scale=1.)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p, ), scale=1.)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n,
                                                       p=p,
                                                       s=s,
                                                       rho=rho,
                                                       signal=signal,
                                                       sigma=1)
        lam = np.mean(
            np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000))))) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n,
                                          p=p,
                                          s=s,
                                          rho=rho,
                                          signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2,
                                                   (n, 10000)))).max(0))

    nonzero = np.where(beta)[0]
    epsilon = 1. / np.sqrt(n)

    W = lam_frac * np.ones(p) * lam
    # W[0] = 0 # use at least some unpenalized
    groups = np.concatenate([np.arange(10) for i in range(p // 10)])
    #print(groups)
    #groups = np.arange(p)
    penalty = rr.group_lasso(groups,
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # first randomization
    M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    mv = multiple_queries([M_est1])
    # second randomization
    #M_est2 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    #mv = multiple_queries([M_est1, M_est2])

    mv.solve()

    active_union = M_est1.selection_variable['variables']
    print("active set", np.nonzero(active_union)[0])
    nactive = np.sum(active_union)

    if nactive == 0:
        return None

    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        target_sampler, target_observed = glm_target(loss,
                                                     active_union,
                                                     mv,
                                                     bootstrap=bootstrap)

        if intervals == 'old':
            target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
            pivots_mle = target_sampler.coefficient_pvalues(
                target_observed,
                parameter=target_sampler.reference,
                sample=target_sample)
            pivots_truth = target_sampler.coefficient_pvalues(
                target_observed, parameter=true_vec, sample=target_sample)
            pvalues = target_sampler.coefficient_pvalues(
                target_observed,
                parameter=np.zeros_like(true_vec),
                sample=target_sample)
        else:
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(
                target_observed, sample=full_sample, level=0.9)
            pivots_mle = target_sampler.coefficient_pvalues_translate(
                target_observed,
                parameter=target_sampler.reference,
                sample=full_sample)
            pivots_truth = target_sampler.coefficient_pvalues_translate(
                target_observed, parameter=true_vec, sample=full_sample)
            pvalues = target_sampler.coefficient_pvalues_translate(
                target_observed,
                parameter=np.zeros_like(true_vec),
                sample=full_sample)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)

        L, U = LU.T
        ci_length_sel = np.zeros(nactive)
        covered = np.zeros(nactive, np.bool)
        naive_covered = np.zeros(nactive, np.bool)
        ci_length_naive = np.zeros(nactive)
        active_var = np.zeros(nactive, np.bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                covered[j] = 1
            ci_length_sel[j] = U[j] - L[j]
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >=
                                                    true_vec[j]):
                naive_covered[j] = 1
            ci_length_naive[j] = LU_naive[j, 1] - LU_naive[j, 0]
            active_var[j] = active_set[j] in nonzero

        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        return pivots_mle, pivots_truth, pvalues, covered, ci_length_sel,\
               naive_pvals, naive_covered, ci_length_naive, active_var
def randomized_marginal_screening(X,
                                  y,
                                  beta,
                                  sigma):

    from selection.api import randomization

    n, p = X.shape

    random_Z = np.random.standard_normal(p)
    Z_stats = X.T.dot(y)
    randomized_Z_stats = np.true_divide(Z_stats, sigma) + random_Z

    active = np.zeros(p, bool)
    active[np.fabs(randomized_Z_stats) > 2.33] = 1
    active_signs = np.sign(randomized_Z_stats[active])
    nactive = active.sum()
    threshold = 2.33 * np.ones(p)

    if nactive >= 1:

        feasible_point = np.fabs(randomized_Z_stats[active])

        noise_variance = sigma ** 2

        randomizer = randomization.isotropic_gaussian((p,), 1.)

        generative_X = X[:, active]
        prior_variance = 1000.

        grad_map = sel_prob_gradient_map_ms(X,
                                            feasible_point,
                                            active,
                                            active_signs,
                                            threshold,
                                            generative_X,
                                            noise_variance,
                                            randomizer)

        inf = selective_inf_ms(y, grad_map, prior_variance)

        samples = inf.posterior_samples()

        adjusted_intervals = np.vstack([np.percentile(samples, 5, axis=0), np.percentile(samples, 95, axis=0)])

        projection_active = X[:, active].dot(np.linalg.inv(X[:, active].T.dot(X[:, active])))
        M_1 = prior_variance * (X.dot(X.T)) + noise_variance * np.identity(n)
        M_2 = prior_variance * ((X.dot(X.T)).dot(projection_active))
        M_3 = prior_variance * (projection_active.T.dot(X.dot(X.T)).dot(projection_active))
        post_mean = M_2.T.dot(np.linalg.inv(M_1)).dot(y)

        print("observed data", post_mean)

        post_var = M_3 - M_2.T.dot(np.linalg.inv(M_1)).dot(M_2)

        unadjusted_intervals = np.vstack([post_mean - 1.65 * (np.sqrt(post_var.diagonal())),
                                          post_mean + 1.65 * (np.sqrt(post_var.diagonal()))])

        coverage_ad = np.zeros(nactive)
        coverage_unad = np.zeros(nactive)
        nerr = 0.

        true_val = projection_active.T.dot(X.dot(beta))

        active_set = [i for i in range(p) if active[i]]


        for l in range(nactive):
            if (adjusted_intervals[0, l] <= true_val[l]) and (true_val[l] <= adjusted_intervals[1, l]):
                coverage_ad[l] += 1
            if (unadjusted_intervals[0, l] <= true_val[l]) and (true_val[l] <= unadjusted_intervals[1, l]):
                coverage_unad[l] += 1


        sel_cov = coverage_ad.sum() / nactive
        naive_cov = coverage_unad.sum() / nactive

        return sel_cov, naive_cov

    else:
        return None
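Stripped of the inference machinery, randomized marginal screening is only a few lines: form the marginal Z-statistics X^T y / sigma, add independent standard Gaussian noise, and keep the coordinates whose randomized statistic exceeds the 2.33 threshold used above. A toy sketch with simulated data:

import numpy as np

rng = np.random.default_rng(3)
n, p, sigma = 100, 200, 1.
X = rng.standard_normal((n, p)) / np.sqrt(n)
y = rng.standard_normal(n)

Z_stats = X.T.dot(y) / sigma
randomized_Z_stats = Z_stats + rng.standard_normal(p)  # additive randomization

threshold = 2.33
active = np.fabs(randomized_Z_stats) > threshold       # screened coordinates
active_signs = np.sign(randomized_Z_stats[active])
print(np.nonzero(active)[0], active_signs)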
Example #8
def hiv_inference_test():
    if not os.path.exists("NRTI_DATA.txt"):
        NRTI = pandas.read_table(
            "http://hivdb.stanford.edu/pages/published_analysis/genophenoPNAS2006/DATA/NRTI_DATA.txt", na_values="NA")
    else:
        NRTI = pandas.read_table("NRTI_DATA.txt")

    NRTI_specific = []
    NRTI_muts = []
    for i in range(1, 241):
        d = NRTI['P%d' % i]
        for mut in np.unique(d):
            if mut not in ['-', '.'] and len(mut) == 1:
                test = np.equal(d, mut)
                if test.sum() > 10:
                    NRTI_specific.append(np.array(np.equal(d, mut)))
                    NRTI_muts.append("P%d%s" % (i, mut))

    NRTI_specific = NRTI.from_records(np.array(NRTI_specific).T, columns=NRTI_muts)

    X_NRTI = np.array(NRTI_specific, np.float)
    Y = NRTI['3TC']  # shorthand
    keep = ~np.isnan(Y).astype(np.bool)
    X_NRTI = X_NRTI[np.nonzero(keep)]
    Y = Y[keep]
    Y = np.array(np.log(Y), np.float)
    Y -= Y.mean()
    X_NRTI -= X_NRTI.mean(0)[None, :]
    X_NRTI /= X_NRTI.std(0)[None, :]
    X = X_NRTI  # shorthand
    n, p = X.shape
    X /= np.sqrt(n)

    ols_fit = sm.OLS(Y, X).fit()
    sigma_3TC = np.linalg.norm(ols_fit.resid) / np.sqrt(n - p - 1)

    lam_frac = 1.
    loss = rr.glm.gaussian(X, Y)
    epsilon = 1. / np.sqrt(n)
    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma_3TC
    print(lam)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.)

    from selection.api import randomization
    randomization = randomization.isotropic_gaussian((p,), scale=1.)

    #change grid for parameter for HIV data
    M_est = M_estimator_map(loss, epsilon, penalty, randomization, randomization_scale=0.7)
    M_est.solve_approx()
    active = M_est._overall
    nactive = np.sum(active)

    ci_active = np.zeros((nactive, 2))
    ci_length = np.zeros(nactive)
    mle_active = np.zeros((nactive, 1))

    ci = approximate_conditional_density(M_est)
    ci.solve_approx()

    class target_class(object):
        def __init__(self, target_cov):
            self.target_cov = target_cov
            self.shape = target_cov.shape

    target = target_class(M_est.target_cov)
    ci_naive = naive_confidence_intervals(target, M_est.target_observed)

    for j in range(nactive):
        ci_active[j, :] = np.array(ci.approximate_ci(j))
        ci_length[j] = ci_active[j, 1] - ci_active[j, 0]
        mle_active[j, :] = ci.approx_MLE_solver(j, nstep=100)[0]

    unadjusted_mle = np.zeros((nactive, 1))
    for j in range(nactive):
        unadjusted_mle[j, :] = ci.target_observed[j]

    adjusted_intervals = np.hstack([mle_active, ci_active]).T
    unadjusted_intervals = np.hstack([unadjusted_mle, ci_naive]).T

    print("adjusted confidence", adjusted_intervals)
    print("naive confidence", unadjusted_intervals)

    intervals = np.vstack([unadjusted_intervals, adjusted_intervals])

    return intervals
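The tuning parameter lam used here (and in the simulated examples) is the usual lasso choice: the Monte Carlo average of max_j |X_j^T eps| over draws of pure Gaussian noise eps, scaled by the noise level. In isolation:

import numpy as np

rng = np.random.default_rng(4)
n, p, sigma, lam_frac = 100, 30, 1., 1.
X = rng.standard_normal((n, p)) / np.sqrt(n)

# average over 2000 noise draws of the largest absolute inner product with X
noise = rng.standard_normal((n, 2000))
lam = lam_frac * np.mean(np.fabs(X.T.dot(noise)).max(0)) * sigma
print(lam)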
Example #9
def randomized_forward_step(X, y, beta, sigma):
    from selection.api import randomization

    n, p = X.shape

    random_Z = np.random.standard_normal(p)
    Z_stats = X.T.dot(y)
    random_obs = X.T.dot(y) + random_Z

    active_index = np.argmax(np.fabs(random_obs))
    active = np.zeros(p, bool)
    active[active_index] = 1
    active_sign = np.sign(random_obs[active_index])
    print("observed statistic", random_obs[active_index],
          Z_stats[active_index])
    print("first step--chosen index and sign", active_index, active_sign)

    feasible_point = np.fabs(random_obs[active_index])

    noise_variance = sigma**2

    randomizer = randomization.isotropic_gaussian((p, ), 1.)

    generative_X = X[:, active]
    prior_variance = 1000.

    grad_map = sel_prob_gradient_map_fs(X, feasible_point, active, active_sign,
                                        generative_X, noise_variance,
                                        randomizer)

    inf = selective_map_credible_fs(y, grad_map, prior_variance)

    samples = inf.posterior_samples()

    adjusted_intervals = np.vstack([
        np.percentile(samples, 5, axis=0),
        np.percentile(samples, 95, axis=0)
    ])

    selective_mean = np.mean(samples, axis=0)

    projection_active = X[:, active].dot(
        np.linalg.inv(X[:, active].T.dot(X[:, active])))
    M_1 = prior_variance * (X.dot(X.T)) + noise_variance * np.identity(n)
    M_2 = prior_variance * ((X.dot(X.T)).dot(projection_active))
    M_3 = prior_variance * (projection_active.T.dot(X.dot(
        X.T)).dot(projection_active))
    post_mean = M_2.T.dot(np.linalg.inv(M_1)).dot(y)

    print("observed data", post_mean)

    post_var = M_3 - M_2.T.dot(np.linalg.inv(M_1)).dot(M_2)

    unadjusted_intervals = np.vstack([
        post_mean - 1.65 * (np.sqrt(post_var.diagonal())),
        post_mean + 1.65 * (np.sqrt(post_var.diagonal()))
    ])

    coverage_ad = np.zeros(1)
    coverage_unad = np.zeros(1)
    ad_length = np.zeros(1)
    unad_length = np.zeros(1)

    true_val = projection_active.T.dot(X.dot(beta))

    if (adjusted_intervals[0, 0] <=
            true_val[0]) and (true_val[0] <= adjusted_intervals[1, 0]):
        coverage_ad[0] += 1

    ad_length[0] = adjusted_intervals[1, 0] - adjusted_intervals[0, 0]
    if (unadjusted_intervals[0, 0] <=
            true_val[0]) and (true_val[0] <= unadjusted_intervals[1, 0]):
        coverage_unad[0] += 1

    unad_length[0] = unadjusted_intervals[1, 0] - unadjusted_intervals[0, 0]

    sel_cov = coverage_ad.sum() / 1.
    naive_cov = coverage_unad.sum() / 1.
    ad_len = ad_length.sum() / 1.
    unad_len = unad_length.sum() / 1.
    bayes_risk_ad = np.power(selective_mean - true_val, 2.).sum() / 1.
    bayes_risk_unad = np.power(post_mean - true_val, 2.).sum() / 1.

    return np.vstack(
        [sel_cov, naive_cov, ad_len, unad_len, bayes_risk_ad, bayes_risk_unad])
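The randomized first step of forward stepwise reduces to picking the coordinate with the largest randomized inner product |X^T y + omega| and recording its sign; the rest of the function reuses the same posterior machinery as the earlier examples. A sketch of just the selection event:

import numpy as np

rng = np.random.default_rng(5)
n, p = 100, 15
X = rng.standard_normal((n, p)) / np.sqrt(n)
y = rng.standard_normal(n)

random_obs = X.T.dot(y) + rng.standard_normal(p)   # randomized score vector
active_index = np.argmax(np.fabs(random_obs))      # first variable to enter
active_sign = np.sign(random_obs[active_index])
feasible_point = np.fabs(random_obs[active_index])
print(active_index, active_sign, feasible_point)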
Example #10
def randomized_lasso_trial(X,
                           y,
                           beta,
                           sigma,
                           lam,
                           loss='gaussian',
                           randomizer='gaussian',
                           estimation='parametric'):

    from selection.api import randomization

    n, p = X.shape
    if loss == "gaussian":
        loss = rr.glm.gaussian(X, y)

    elif loss == "logistic":
        loss = rr.glm.logistic(X, y)

    epsilon = 1. / np.sqrt(n)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)
    randomization = randomization.isotropic_gaussian((p, ), scale=1.)

    M_est = M_estimator_approx(loss, epsilon, penalty, randomization,
                               randomizer, estimation)
    M_est.solve_approx()
    active = M_est._overall
    active_set = np.asarray([i for i in range(p) if active[i]])
    nactive = np.sum(active)

    prior_variance = 1000.
    noise_variance = sigma**2
    projection_active = X[:, active].dot(
        np.linalg.inv(X[:, active].T.dot(X[:, active])))
    M_1 = prior_variance * (X.dot(X.T)) + noise_variance * np.identity(n)
    M_2 = prior_variance * ((X.dot(X.T)).dot(projection_active))
    M_3 = prior_variance * (projection_active.T.dot(X.dot(
        X.T)).dot(projection_active))
    post_mean = M_2.T.dot(np.linalg.inv(M_1)).dot(y)

    print("observed data", post_mean)

    post_var = M_3 - M_2.T.dot(np.linalg.inv(M_1)).dot(M_2)

    unadjusted_intervals = np.vstack([
        post_mean - 1.65 * (np.sqrt(post_var.diagonal())),
        post_mean + 1.65 * (np.sqrt(post_var.diagonal()))
    ])

    #generative_mean = np.zeros(p)
    #sel_split = selection_probability_random_lasso(M_est, generative_mean)
    #test_point = np.append(M_est.observed_score_state, np.abs(M_est.initial_soln[M_est._overall]))

    #print("gradient at test point", sel_split.smooth_objective(test_point, mode= "grad"))

    grad_lasso = sel_inf_random_lasso(M_est, prior_variance)
    samples = grad_lasso.posterior_samples()
    adjusted_intervals = np.vstack([
        np.percentile(samples, 5, axis=0),
        np.percentile(samples, 95, axis=0)
    ])

    selective_mean = np.mean(samples, axis=0)

    coverage_ad = np.zeros(nactive)
    coverage_unad = np.zeros(nactive)
    ad_length = np.zeros(nactive)
    unad_length = np.zeros(nactive)

    true_val = projection_active.T.dot(X.dot(beta))

    for l in range(nactive):
        if (adjusted_intervals[0, l] <=
                true_val[l]) and (true_val[l] <= adjusted_intervals[1, l]):
            coverage_ad[l] += 1
        ad_length[l] = adjusted_intervals[1, l] - adjusted_intervals[0, l]
        if (unadjusted_intervals[0, l] <=
                true_val[l]) and (true_val[l] <= unadjusted_intervals[1, l]):
            coverage_unad[l] += 1
        unad_length[l] = unadjusted_intervals[1, l] - unadjusted_intervals[0,
                                                                           l]

    sel_cov = coverage_ad.sum() / nactive
    naive_cov = coverage_unad.sum() / nactive
    ad_len = ad_length.sum() / nactive
    unad_len = unad_length.sum() / nactive
    bayes_risk_ad = np.power(selective_mean - true_val, 2.).sum() / nactive
    bayes_risk_unad = np.power(post_mean - true_val, 2.).sum() / nactive

    return np.vstack(
        [sel_cov, naive_cov, ad_len, unad_len, bayes_risk_ad, bayes_risk_unad])
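The reporting block at the end of this trial (and of the other Bayesian examples) is generic: form 90% equal-tailed credible intervals from the 5th and 95th posterior percentiles, then record per-coordinate coverage of the true value, interval length, and squared-error risk of the posterior mean. A self-contained sketch with simulated draws standing in for grad_lasso.posterior_samples():

import numpy as np

rng = np.random.default_rng(6)
nactive, nsamples = 4, 2000
true_val = np.array([0., 0.5, -1., 2.])
samples = true_val + rng.standard_normal((nsamples, nactive))  # stand-in for posterior draws

adjusted_intervals = np.vstack([np.percentile(samples, 5, axis=0),
                                np.percentile(samples, 95, axis=0)])
selective_mean = samples.mean(axis=0)

covered = ((adjusted_intervals[0] <= true_val) &
           (true_val <= adjusted_intervals[1])).astype(float)
lengths = adjusted_intervals[1] - adjusted_intervals[0]
bayes_risk = np.mean((selective_mean - true_val) ** 2)
print(covered.mean(), lengths.mean(), bayes_risk)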
Example #11
def test_cv(n=100,
            p=50,
            s=5,
            signal=7.5,
            K=5,
            rho=0.,
            randomizer='gaussian',
            randomizer_scale=1.,
            scale1=0.1,
            scale2=0.2,
            lam_frac=1.,
            glmnet=True,
            loss='gaussian',
            intervals='old',
            bootstrap=False,
            condition_on_CVR=True,
            marginalize_subgrad=True,
            ndraw=10000,
            burnin=2000,
            nboot=nboot):  # assumes a module-level nboot is defined when the function is declared

    print(n, p, s, condition_on_CVR, scale1, scale2)
    if randomizer == 'laplace':
        randomizer = randomization.laplace((p, ), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p, ), randomizer_scale)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p, ), scale=randomizer_scale)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n,
                                                       p=p,
                                                       s=s,
                                                       rho=rho,
                                                       signal=signal,
                                                       sigma=1)
        glm_loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n,
                                          p=p,
                                          s=s,
                                          rho=rho,
                                          signal=signal)
        glm_loss = rr.glm.logistic(X, y)

    epsilon = 1. / np.sqrt(n)

    # view 1
    cv = CV_view(glm_loss,
                 loss_label=loss,
                 lasso_randomization=randomizer,
                 epsilon=epsilon,
                 scale1=scale1,
                 scale2=scale2)
    if glmnet:
        try:
            cv.solve(glmnet=glmnet)
        except ImportError:
            cv.solve(glmnet=False)
    else:
        cv.solve(glmnet=False)

    # for the test make sure we also run the python code

    cv_py = CV_view(glm_loss,
                    loss_label=loss,
                    lasso_randomization=randomizer,
                    epsilon=epsilon,
                    scale1=scale1,
                    scale2=scale2)
    cv_py.solve(glmnet=False)

    lam = cv.lam_CVR
    print("lam", lam)

    if condition_on_CVR:
        cv.condition_on_opt_state()
        lam = cv.one_SD_rule(direction="up")
        print("new lam", lam)

    # non-randomized Lasso, just looking how many vars it selects
    problem = rr.simple_problem(glm_loss, rr.l1norm(p, lagrange=lam))
    beta_hat = problem.solve()
    active_hat = beta_hat != 0
    print("non-randomized lasso ", active_hat.sum())

    # view 2
    W = lam_frac * np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)
    M_est1 = glm_group_lasso(glm_loss, epsilon, penalty, randomizer)

    if nboot > 0:
        cv.nboot = M_est1.nboot = nboot

    mv = multiple_queries([cv, M_est1])
    mv.solve()

    active_union = M_est1._overall
    nactive = np.sum(active_union)
    print("nactive", nactive)
    if nactive == 0:
        return None

    nonzero = np.where(beta)[0]

    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        if marginalize_subgrad:
            M_est1.decompose_subgradient(conditioning_groups=np.zeros(p, bool),
                                         marginalizing_groups=np.ones(p, bool))

        target_sampler, target_observed = glm_target(glm_loss,
                                                     active_union,
                                                     mv,
                                                     bootstrap=bootstrap)

        if intervals == 'old':
            target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)

            pivots_truth = target_sampler.coefficient_pvalues(
                target_observed, parameter=true_vec, sample=target_sample)
            pvalues = target_sampler.coefficient_pvalues(
                target_observed,
                parameter=np.zeros_like(true_vec),
                sample=target_sample)
        else:
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(
                target_observed, sample=full_sample, level=0.9)
            pivots_truth = target_sampler.coefficient_pvalues_translate(
                target_observed, parameter=true_vec, sample=full_sample)
            pvalues = target_sampler.coefficient_pvalues_translate(
                target_observed,
                parameter=np.zeros_like(true_vec),
                sample=full_sample)

        L, U = LU.T
        sel_covered = np.zeros(nactive, np.bool)
        sel_length = np.zeros(nactive)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        naive_covered = np.zeros(nactive, np.bool)
        naive_length = np.zeros(nactive)
        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        active_var = np.zeros(nactive, np.bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                sel_covered[j] = 1
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >=
                                                    true_vec[j]):
                naive_covered[j] = 1
            sel_length[j] = U[j] - L[j]
            naive_length[j] = LU_naive[j, 1] - LU_naive[j, 0]
            active_var[j] = active_set[j] in nonzero

        q = 0.2
        BH_decisions = multipletests(pvalues, alpha=q, method="fdr_bh")[0]
        return pivots_truth, sel_covered, sel_length, naive_pvals, naive_covered, naive_length, active_var, BH_decisions, active_var
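The last step applies Benjamini-Hochberg at level q = 0.2; the first return value of statsmodels' multipletests is the boolean vector of rejections. A minimal sketch (assumes statsmodels is installed):

import numpy as np
from statsmodels.stats.multitest import multipletests

pvalues = np.array([0.001, 0.04, 0.30, 0.012, 0.76])
q = 0.2
BH_decisions = multipletests(pvalues, alpha=q, method="fdr_bh")[0]
print(BH_decisions)   # True where the corresponding null hypothesis is rejected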
Example #12
def approximate_inference(X,
                          y,
                          beta,
                          sigma,
                          seed_n = 0,
                          lam_frac = 1.,
                          loss='gaussian',
                          randomization_scale = 1.):

    from selection.api import randomization
    n, p = X.shape
    np.random.seed(seed_n)
    if loss == "gaussian":
        loss = rr.glm.gaussian(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
    elif loss == "logistic":
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    randomization = randomization.isotropic_gaussian((p,), scale=randomization_scale)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)), lagrange=1.)

    GS = greedy_score_map(loss,
                          penalty,
                          np.zeros(p, dtype=bool),
                          np.ones(p, dtype=bool),
                          randomization,
                          randomization_scale)

    GS.solve_approx()
    active = GS._overall
    nactive = np.sum(active)

    if nactive == 0:
        return None
    else:
        active_set = np.asarray([i for i in range(p) if active[i]])
        s = beta.sum()
        true_support = np.asarray([i for i in range(p) if i < s])
        true_vec = beta[active]

        if set(true_support).issubset(active_set):
            ci = approximate_conditional_density(GS)
            ci.solve_approx()
            sys.stderr.write("True target to be covered" + str(true_vec) + "\n")

            class target_class(object):
                def __init__(self, target_cov):
                    self.target_cov = target_cov
                    self.shape = target_cov.shape

            target = target_class(GS.target_cov)
            ci_naive = naive_confidence_intervals(target, GS.target_observed)
            naive_covered = np.zeros(nactive)
            naive_risk = np.zeros(nactive)

            ci_sel = np.zeros((nactive, 2))
            sel_MLE = np.zeros(nactive)
            sel_length = np.zeros(nactive)

            for j in range(nactive):
                ci_sel[j, :] = np.array(ci.approximate_ci(j))
                sel_MLE[j] = ci.approx_MLE_solver(j, step=1, nstep=150)[0]
                sel_length[j] = ci_sel[j, 1] - ci_sel[j, 0]

            sel_covered = np.zeros(nactive, np.bool)
            sel_risk = np.zeros(nactive)

            for j in range(nactive):

                sel_risk[j] = (sel_MLE[j] - true_vec[j]) ** 2.
                naive_risk[j] = (GS.target_observed[j] - true_vec[j]) ** 2.

                if (ci_sel[j, 0] <= true_vec[j]) and (ci_sel[j, 1] >= true_vec[j]):
                    sel_covered[j] = 1
                if (ci_naive[j, 0] <= true_vec[j]) and (ci_naive[j, 1] >= true_vec[j]):
                    naive_covered[j] = 1

            print("lengths", sel_length.sum() / nactive)
            print("selective intervals", ci_sel.T)
            print("risks", sel_risk.sum() / nactive)

            return np.transpose(np.vstack((ci_sel[:, 0],
                                           ci_sel[:, 1],
                                           ci_naive[:, 0],
                                           ci_naive[:, 1],
                                           sel_MLE,
                                           GS.target_observed,
                                           sel_covered,
                                           naive_covered,
                                           sel_risk,
                                           naive_risk)))
Example #13
def test_condition(ndraw=10000, burnin=2000, scalings=True):
    s, n, p = 6, 600, 40

    X, y, beta, nonzero, sigma = gaussian_instance(n=n,
                                                   p=p,
                                                   s=s,
                                                   rho=0.2,
                                                   snr=5)
    randomizer = randomization.isotropic_gaussian((p, ), scale=sigma)

    lam_frac = 1.5

    loss = rr.glm.gaussian(X, y)
    epsilon = 1. / np.sqrt(n)

    lam = lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal(
            (n, 2000)))).max(0)) * sigma
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    views = []
    nview = 3
    for i in range(nview):
        views.append(glm_group_lasso(loss, epsilon, penalty, randomizer))

    queries = multiple_queries(views)
    queries.solve()

    active_union = np.zeros(p, np.bool)
    for view in views:
        active_union += view.selection_variable['variables']

    nactive = np.sum(active_union)
    print("nactive", nactive)

    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        if nactive == s:
            return None

        if scalings:  # try condition on some scalings
            views[0].condition_on_scalings()
            views[0].condition_on_subgradient()
            views[1].condition_on_subgradient()
            views[2].condition_on_scalings()
        else:
            views[0].condition_on_subgradient()
            views[1].condition_on_subgradient()
            views[2].condition_on_subgradient()

        active_set = np.nonzero(active_union)[0]
        target_sampler, target_observed = glm_target(loss, active_union,
                                                     queries)

        pvalues = target_sampler.coefficient_pvalues(target_observed,
                                                     alternative='twosided',
                                                     ndraw=ndraw,
                                                     burnin=burnin)

        active_var = np.zeros_like(pvalues, np.bool)
        _nonzero = np.array([i in nonzero for i in active_set])
        active_var[_nonzero] = True
        return pvalues, active_var
def test_marginalize(s=4,
                     n=600,
                     p=200,
                     rho=0.,
                     signal=3.5,
                     lam_frac=2.5,
                     ndraw=10000,
                     burnin=2000,
                     loss='gaussian',
                     randomizer='gaussian',
                     randomizer_scale=1.,
                     nviews=3,
                     scalings=True,
                     subgrad=True,
                     parametric=False,
                     intervals='old'):
    print(n, p, s)

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p, ), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p, ), randomizer_scale)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p, ), scale=randomizer_scale)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n,
                                                       p=p,
                                                       s=s,
                                                       rho=rho,
                                                       signal=signal,
                                                       sigma=1)
        lam = np.mean(
            np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000))))) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n,
                                          p=p,
                                          s=s,
                                          rho=rho,
                                          signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2,
                                                   (n, 10000)))).max(0))

    epsilon = 1. / np.sqrt(n)

    W = lam_frac * np.ones(p) * lam
    #W[0] = 0 # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    views = []
    for i in range(nviews):
        if not parametric:
            views.append(glm_group_lasso(loss, epsilon, penalty, randomizer))
        else:
            views.append(
                glm_group_lasso_parametric(loss, epsilon, penalty, randomizer))

    queries = multiple_queries(views)
    queries.solve()

    active_union = np.zeros(p, np.bool)
    for view in views:
        active_union += view.selection_variable['variables']

    nactive = np.sum(active_union)
    print("nactive", nactive)

    nonzero = np.where(beta)[0]
    true_vec = beta[active_union]

    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        check_screen = True

        if nactive == s:
            return None

        # BUG: if this scalings code is moved after the decompose_subgradient
        # call, the code seems to run fine

        if scalings:  # try condition on some scalings
            for i in range(nviews):
                views[i].condition_on_scalings()
        if subgrad:
            for i in range(nviews):
                conditioning_groups = np.zeros(p, dtype=bool)
                conditioning_groups[:(p // 2)] = True
                marginalizing_groups = np.zeros(p, dtype=bool)
                marginalizing_groups[(p // 2):] = True
                views[i].decompose_subgradient(
                    conditioning_groups=conditioning_groups,
                    marginalizing_groups=marginalizing_groups)

        active_set = np.nonzero(active_union)[0]
        target_sampler, target_observed = glm_target(loss,
                                                     active_union,
                                                     queries,
                                                     bootstrap=False,
                                                     parametric=parametric)
        #reference= beta[active_union])

        if intervals == 'old':
            target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
            pivots = target_sampler.coefficient_pvalues(target_observed,
                                                        parameter=true_vec,
                                                        sample=target_sample)
        elif intervals == 'new':
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(
                target_observed, sample=full_sample, level=0.9)
            pivots = target_sampler.coefficient_pvalues_translate(
                target_observed, parameter=true_vec, sample=full_sample)

        #test_stat = lambda x: np.linalg.norm(x - beta[active_union])
        #observed_test_value = test_stat(target_observed)
        #pivots = target_sampler.hypothesis_test(test_stat,
        #                                       observed_test_value,
        #                                       alternative='twosided',
        #                                       parameter = beta[active_union],
        #                                       ndraw=ndraw,
        #                                       burnin=burnin,
        #                                       stepsize=None)

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)

            for j in range(nactive):
                if check_screen:
                    if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                        covered[j] = 1
                else:
                    covered[j] = None
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered, ci_length = coverage(LU)
        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        covered_naive, ci_length_naive = coverage(LU_naive)
        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        return pivots, covered, ci_length, naive_pvals, covered_naive, ci_length_naive
Example #15
def randomized_lasso_trial(X,
                           y,
                           beta,
                           sigma,
                           ndraw=1000,
                           burnin=50):

    n, p = X.shape

    random_Z = np.random.standard_normal(p)
    sel = selection(X, y, random_Z)
    lam, epsilon, active, betaE, cube, initial_soln = sel

    if sel is not None:

        lagrange = lam * np.ones(p)
        active_sign = np.sign(betaE)
        nactive = active.sum()
        print("number of selected variables by Lasso", nactive)

        feasible_point = np.fabs(betaE)

        noise_variance = sigma ** 2

        randomizer = randomization.isotropic_gaussian((p,), 1.)

        generative_X = X[:, active]
        prior_variance = 1000.

        grad_map = sel_prob_gradient_map_lasso(X,
                                               feasible_point,
                                               active,
                                               active_sign,
                                               lagrange,
                                               generative_X,
                                               noise_variance,
                                               randomizer,
                                               epsilon)

        inf = selective_inf_lasso(y, grad_map, prior_variance)

        # for the tests, just take a few steps
        samples = inf.posterior_samples(langevin_steps=ndraw, burnin=burnin)

        adjusted_intervals = np.vstack([np.percentile(samples, 5, axis=0), np.percentile(samples, 95, axis=0)])

        selective_mean = np.mean(samples, axis=0)

        projection_active = X[:, active].dot(np.linalg.inv(X[:, active].T.dot(X[:, active])))
        M_1 = prior_variance * (X.dot(X.T)) + noise_variance * np.identity(n)
        M_2 = prior_variance * ((X.dot(X.T)).dot(projection_active))
        M_3 = prior_variance * (projection_active.T.dot(X.dot(X.T)).dot(projection_active))
        post_mean = M_2.T.dot(np.linalg.inv(M_1)).dot(y)

        print("observed data", post_mean)

        post_var = M_3 - M_2.T.dot(np.linalg.inv(M_1)).dot(M_2)

        unadjusted_intervals = np.vstack([post_mean - 1.65 * (np.sqrt(post_var.diagonal())),
                                          post_mean + 1.65 * (np.sqrt(post_var.diagonal()))])

        coverage_ad = np.zeros(nactive)
        coverage_unad = np.zeros(nactive)
        ad_length = np.zeros(nactive)
        unad_length = np.zeros(nactive)

        true_val = projection_active.T.dot(X.dot(beta))

        for l in range(nactive):
            if (adjusted_intervals[0, l] <= true_val[l]) and (true_val[l] <= adjusted_intervals[1, l]):
                coverage_ad[l] += 1
            ad_length[l] = adjusted_intervals[1, l] - adjusted_intervals[0, l]
            if (unadjusted_intervals[0, l] <= true_val[l]) and (true_val[l] <= unadjusted_intervals[1, l]):
                coverage_unad[l] += 1
            unad_length[l] = unadjusted_intervals[1, l] - unadjusted_intervals[0, l]


        sel_cov = coverage_ad.sum() / nactive
        naive_cov = coverage_unad.sum() / nactive
        ad_len = ad_length.sum() / nactive
        unad_len = unad_length.sum() / nactive
        bayes_risk_ad = np.power(selective_mean - true_val, 2.).sum() / nactive
        bayes_risk_unad = np.power(post_mean - true_val, 2.).sum() / nactive

        return np.vstack([sel_cov, naive_cov, ad_len, unad_len, bayes_risk_ad, bayes_risk_unad])

    else:
        return None
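
# A minimal usage sketch (not part of the original example), assuming gaussian_instance
# as used elsewhere in this file:
#
#   X, y, beta, nonzero, sigma = gaussian_instance(n=200, p=50, s=0, rho=0.1, snr=5., sigma=1.)
#   results = randomized_lasso_trial(X, y, beta, sigma, ndraw=1000, burnin=50)
#   if results is not None:
#       sel_cov, naive_cov, ad_len, unad_len, risk_ad, risk_unad = results.ravel()
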
def test_approximate_ci(n=200,
                        p=50,
                        s=0,
                        snr=5,
                        threshold = 3.,
                        rho=0.1,
                        lam_frac = 1.,
                        loss='gaussian',
                        randomizer='gaussian'):

    from selection.api import randomization

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho, snr=snr, sigma=1.)
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)
        loss = rr.glm.logistic(X, y)

    if randomizer=='gaussian':
        randomization = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer=='laplace':
        randomization = randomization.laplace((p,), scale=1.)

    active_bool = np.zeros(p, bool)
    #active_bool[range(3)] = 1
    inactive_bool = ~active_bool

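    # threshold_score_approx records the selection event in which the randomized score of
    # each variable is compared against `threshold`; inference below is carried out
    # conditionally on this event.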
    TS = threshold_score_approx(loss,
                                threshold,
                                randomization,
                                active_bool,
                                inactive_bool,
                                randomizer)

    TS.solve_approx()
    active = TS._overall
    print("nactive", active.sum())

    ci = approximate_conditional_density(TS)
    ci.solve_approx()

    active_set = np.asarray([i for i in range(p) if active[i]])
    true_support = np.asarray([i for i in range(p) if i < s])
    nactive = np.sum(active)
    print("active set, true_support", active_set, true_support)
    true_vec = beta[active]
    print("true coefficients", true_vec)

    if set(active_set).intersection(set(true_support)) == set(true_support):

        ci_active = np.zeros((nactive, 2))
        covered = np.zeros(nactive, bool)
        ci_length = np.zeros(nactive)
        pivots = np.zeros(nactive)

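        # lightweight stand-in for a full target sampler, exposing only the attributes
        # (target_cov, shape) that naive_confidence_intervals / naive_pvalues use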
        class target_class(object):
            def __init__(self, target_cov):
                self.target_cov = target_cov
                self.shape = target_cov.shape

        target = target_class(TS.target_cov)
        ci_naive = naive_confidence_intervals(target, TS.target_observed)
        naive_pvals = naive_pvalues(target, TS.target_observed, true_vec)
        naive_covered = np.zeros(nactive)
        tic = time.time()

        for j in range(nactive):
            ci_active[j, :] = np.array(ci.approximate_ci(j))
            if (ci_active[j, 0] <= true_vec[j]) and (ci_active[j, 1] >= true_vec[j]):
                covered[j] = 1
            ci_length[j] = ci_active[j, 1] - ci_active[j, 0]
            print(ci_active[j, :])
            pivots[j] = ci.approximate_pvalue(j, true_vec[j])

            # naive ci
            if (ci_naive[j, 0] <= true_vec[j]) and (ci_naive[j, 1] >= true_vec[j]):
                naive_covered[j] += 1

        toc = time.time()
        print('ci time now', toc - tic)

        return covered, ci_length, pivots, naive_covered, naive_pvals
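
# A possible way to aggregate this test over repetitions (a sketch, not part of the
# original): average coverage of the selective intervals across trials.
#
#   cover = []
#   for _ in range(50):
#       out = test_approximate_ci(n=200, p=50, s=0, snr=5)
#       if out is not None:
#           cover.append(np.mean(out[0]))
#   print("selective coverage", np.mean(cover))
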
Example No. 17
def randomized_lasso_trial(X,
                           y,
                           beta,
                           sigma,
                           lam,
                           loss='logistic',
                           randomizer='gaussian',
                           estimation='parametric'):

    from selection.api import randomization

    n, p = X.shape
    if loss == "gaussian":
        loss = rr.glm.gaussian(X, y)

    elif loss == "logistic":
        loss = rr.glm.logistic(X, y)

    epsilon = 1. / np.sqrt(n)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)
    randomization = randomization.isotropic_gaussian((p, ), scale=1.)

    M_est = M_estimator_approx_logistic(loss, epsilon, penalty, randomization,
                                        randomizer, estimation)
    M_est.solve_approx()
    active = M_est._overall
    #print("here",glm.shape)
    active_set = np.asarray([i for i in range(p) if active[i]])
    nactive = np.sum(active)
    glm = M_est.observed_score_state[:nactive]

    prior_variance = 100000.

    #generative_mean = np.zeros(p)
    #sel_split = selection_probability_random_lasso(M_est, generative_mean)
    #test_point = np.append(M_est.observed_score_state, np.abs(M_est.initial_soln[M_est._overall]))

    #print("gradient at test point", sel_split.smooth_objective(test_point, mode= "grad"))

    class target_class(object):
        def __init__(self, target_cov):
            self.target_cov = target_cov
            self.shape = target_cov.shape

    target = target_class(M_est.target_cov)
    unadjusted_intervals = (naive_confidence_intervals(
        target, M_est.target_observed)).T

    grad_lasso = sel_inf_random_lasso(M_est, prior_variance)
    samples = grad_lasso.posterior_samples()
    adjusted_intervals = np.vstack([
        np.percentile(samples, 5, axis=0),
        np.percentile(samples, 95, axis=0)
    ])

    selective_mean = np.mean(samples, axis=0)

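    # the target is taken to be zero, i.e. coverage and risk are evaluated under the
    # global null; presumably these trials are meant to be run with s = 0 signals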
    true_val = np.zeros(nactive)

    coverage_ad = np.zeros(nactive)
    coverage_unad = np.zeros(nactive)
    ad_length = np.zeros(nactive)
    unad_length = np.zeros(nactive)

    for l in range(nactive):
        if (adjusted_intervals[0, l] <= true_val[l]) and (true_val[l] <= adjusted_intervals[1, l]):
            coverage_ad[l] += 1
        ad_length[l] = adjusted_intervals[1, l] - adjusted_intervals[0, l]
        if (unadjusted_intervals[0, l] <= true_val[l]) and (true_val[l] <= unadjusted_intervals[1, l]):
            coverage_unad[l] += 1
        unad_length[l] = unadjusted_intervals[1, l] - unadjusted_intervals[0, l]

    sel_cov = coverage_ad.sum() / nactive
    naive_cov = coverage_unad.sum() / nactive
    ad_len = ad_length.sum() / nactive
    unad_len = unad_length.sum() / nactive
    bayes_risk_ad = np.power(selective_mean - true_val, 2.).sum() / nactive
    bayes_risk_unad = np.power(glm - true_val, 2.).sum() / nactive

    return np.vstack([sel_cov, naive_cov, ad_len, unad_len, bayes_risk_ad, bayes_risk_unad])
Example No. 18
def test_approximate_ci(n=100,
                        p=10,
                        s=0,
                        snr=5,
                        rho=0.1,
                        lam_frac = 1.,
                        loss='gaussian',
                        randomizer='gaussian'):

    from selection.api import randomization

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho, snr=snr, sigma=1.)
        loss = rr.glm.gaussian(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    if randomizer == 'gaussian':
        randomization = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer == 'laplace':
        randomization = randomization.laplace((p,), scale=1.)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)), lagrange=1.)

    # active_bool = np.zeros(p, np.bool)
    # active_bool[range(3)] = 1
    # inactive_bool = ~active_bool

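    # greedy_score_step_approx takes one randomized greedy step: starting from an empty
    # active set (first mask) with all p variables as candidates (second mask), it
    # selects, roughly, the variable with the largest randomized penalized score.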
    GS = greedy_score_step_approx(loss,
                                  penalty,
                                  np.zeros(p, dtype=bool),
                                  np.ones(p, dtype=bool),
                                  randomization,
                                  randomizer)

    GS.solve_approx()
    active = GS._overall
    print("nactive", active.sum())

    ci = approximate_conditional_density(GS)
    ci.solve_approx()

    active_set = np.asarray([i for i in range(p) if active[i]])
    true_support = np.asarray([i for i in range(p) if i < s])
    nactive = np.sum(active)
    print("active set, true_support", active_set, true_support)
    true_vec = beta[active]
    print("true coefficients", true_vec)

    if set(active_set).intersection(set(true_support)) == set(true_support):

        ci_active = np.zeros((nactive, 2))
        covered = np.zeros(nactive, bool)
        ci_length = np.zeros(nactive)
        pivots = np.zeros(nactive)

        tic = time.time()

        for j in range(nactive):
            ci_active[j, :] = np.array(ci.approximate_ci(j))
            if (ci_active[j, 0] <= true_vec[j]) and (ci_active[j, 1] >= true_vec[j]):
                covered[j] = 1
            ci_length[j] = ci_active[j, 1] - ci_active[j, 0]
            # print(ci_active[j, :])
            pivots[j] = ci.approximate_pvalue(j, true_vec[j])

        print("confidence intervals", ci_active)
        toc = time.time()
        print('ci time now', toc - tic)
Example No. 19
OLS_3TC = ols_fit.params

lam_frac = 1.
loss = rr.glm.gaussian(X, Y)
epsilon = 1. / np.sqrt(n)
lam = lam_frac * np.mean(
    np.fabs(np.dot(X.T, np.random.standard_normal(
        (n, 2000)))).max(0)) * sigma_3TC
print(lam)

W = np.ones(p) * lam
penalty = rr.group_lasso(np.arange(p),
                         weights=dict(zip(np.arange(p), W)),
                         lagrange=1.)

randomization = randomization.isotropic_gaussian((p, ), scale=1.)

M_est = M_estimator_approx(loss,
                           epsilon,
                           penalty,
                           randomization,
                           randomizer='gaussian')
M_est.solve_approx()
active = M_est._overall
active_set = np.asarray([i for i in range(p) if active[i]])
nactive = np.sum(active)

active_set_0 = [NRTI_muts[i] for i in range(p) if active[i]]

ci_active = np.zeros((nactive, 2))
ci_length = np.zeros(nactive)
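
# The example appears to be truncated here. A plausible continuation, mirroring the other
# examples in this file (a sketch, not the original code), would build the approximate
# conditional density and fill the interval arrays declared above:
#
#   ci = approximate_conditional_density(M_est)
#   ci.solve_approx()
#   for j in range(nactive):
#       ci_active[j, :] = np.array(ci.approximate_ci(j))
#       ci_length[j] = ci_active[j, 1] - ci_active[j, 0]
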
Example No. 20
def test_approximate_inference(X,
                               y,
                               true_mean,
                               sigma,
                               threshold=3.,
                               seed_n=0,
                               lam_frac=1.,
                               loss='gaussian',
                               randomization_scale=1.):

    from selection.api import randomization
    n, p = X.shape
    np.random.seed(seed_n)
    if loss == "gaussian":
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.standard_normal(
                (n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2,
                                                   (n, 10000)))).max(0))
        loss = rr.glm.logistic(X, y)

    active_bool = np.zeros(p, bool)
    inactive_bool = ~active_bool

    randomization = randomization.isotropic_gaussian((p, ),
                                                     scale=randomization_scale)
    TS = threshold_score_map(loss, threshold, randomization, active_bool,
                             inactive_bool, randomization_scale)

    TS.solve_approx()
    active = TS._overall
    active_set = np.asarray([i for i in range(p) if active[i]])
    nactive = np.sum(active)
    sys.stderr.write("number of active selected by thresholding" +
                     str(nactive) + "\n")
    sys.stderr.write("Active set selected by thresholding" + str(active_set) +
                     "\n")
    sys.stderr.write("Observed target" + str(TS.target_observed) + "\n")

    if nactive == 0:
        return None

    else:
        true_vec = np.linalg.inv(X[:, active].T.dot(X[:, active])).dot(
            X[:, active].T).dot(true_mean)

        sys.stderr.write("True target to be covered" + str(true_vec) + "\n")

        class target_class(object):
            def __init__(self, target_cov):
                self.target_cov = target_cov
                self.shape = target_cov.shape

        target = target_class(TS.target_cov)

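        # naive intervals ignore the selection step; the selective intervals computed
        # below come from the approximate conditional density given selection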
        ci_naive = naive_confidence_intervals(target, TS.target_observed)
        naive_covered = np.zeros(nactive)
        naive_risk = np.zeros(nactive)

        ci = approximate_conditional_density(TS)
        ci.solve_approx()

        ci_sel = np.zeros((nactive, 2))
        sel_MLE = np.zeros(nactive)
        sel_length = np.zeros(nactive)

        for j in range(nactive):
            ci_sel[j, :] = np.array(ci.approximate_ci(j))
            sel_MLE[j] = ci.approx_MLE_solver(j, step=1, nstep=150)[0]
            sel_length[j] = ci_sel[j, 1] - ci_sel[j, 0]

        sel_covered = np.zeros(nactive, bool)
        sel_risk = np.zeros(nactive)

        for j in range(nactive):

            sel_risk[j] = (sel_MLE[j] - true_vec[j])**2.
            naive_risk[j] = (TS.target_observed[j] - true_vec[j])**2.

            if (ci_sel[j, 0] <= true_vec[j]) and (ci_sel[j, 1] >= true_vec[j]):
                sel_covered[j] = 1
            if (ci_naive[j, 0] <= true_vec[j]) and (ci_naive[j, 1] >=
                                                    true_vec[j]):
                naive_covered[j] = 1

        print("lengths", sel_length.sum() / nactive)
        print("selective intervals", ci_sel.T)
        print("risks", sel_risk.sum() / nactive)

        return np.transpose(
            np.vstack((ci_sel[:, 0], ci_sel[:, 1], ci_naive[:, 0],
                       ci_naive[:, 1], sel_MLE, TS.target_observed,
                       sel_covered, naive_covered, sel_risk, naive_risk)))
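
# A minimal usage sketch (not part of the original example), assuming gaussian_instance
# as used elsewhere in this file and taking the true mean to be X.dot(beta):
#
#   X, y, beta, nonzero, sigma = gaussian_instance(n=100, p=20, s=3, rho=0.1, snr=5., sigma=1.)
#   out = test_approximate_inference(X, y, X.dot(beta), sigma, threshold=3.)
#   if out is not None:
#       # columns: selective CI lower/upper, naive CI lower/upper, selective MLE,
#       # observed target, selective covered, naive covered, selective risk, naive risk
#       print("selective coverage", out[:, 6].mean(), "naive coverage", out[:, 7].mean())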