Code Example #1
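All ten snippets below share a common preamble. The selectinf module paths here are a best-guess assumption based on the package's learning tools and may need adjusting for your version.

# Assumed common imports for every snippet below (module paths are a guess).
import functools

import numpy as np
import pandas as pd
from scipy.stats import norm as ndist
import regreg.api as rr

from selectinf.tests.instance import gaussian_instance
from selectinf.learning.core import (normal_sampler, split_sampler,
                                     infer_full_target, infer_general_target,
                                     repeat_selection, logit_fit, probit_fit,
                                     keras_fit, gbm_fit_sk)
from selectinf.learning.utils import (full_model_inference,
                                      naive_full_model_inference,
                                      BHfilter, lasso_glmnet)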
def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)

    def meta_algorithm(XTX, XTXi, lam, sampler):

        p = XTX.shape[0]
        success = np.zeros(p)

        loss = rr.quadratic_loss((p, ), Q=XTX)
        pen = rr.l1norm(p, lagrange=lam)

        scale = 0.5  # randomization level added to the sufficient statistic
        noisy_S = sampler(scale=scale)
        loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0)  # linear term -noisy_S'beta
        problem = rr.simple_problem(loss, pen)
        soln = problem.solve(max_its=100, tol=1.e-10)
        success += soln != 0
        return set(np.nonzero(success)[0])

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    lam = 4. * np.sqrt(n)
    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam)

    # run selection algorithm

    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                smooth_sampler,
                                success_params=(1, 1),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={
                                    'epochs': 20,
                                    'sizes': [100] * 5,
                                    'dropout': 0.,
                                    'activation': 'relu'
                                })
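A minimal driver for this snippet might look like the sketch below; treating the 'pivot' column as present is an assumption about the frame full_model_inference returns.

# Hypothetical driver: pool selective pivots over repeated simulations.
# Roughly uniform pivots on [0, 1] indicate calibrated inference.
dfs = []
for _ in range(10):
    df = simulate()
    if df is not None:  # inference can come back empty
        dfs.append(df)
if dfs:
    print(pd.concat(dfs)['pivot'].describe())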
Code Example #2
def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)

    def meta_algorithm(XTX, XTXi, dispersion, lam, sampler):

        # note: XTX and lam are unused here; kept for signature compatibility
        scale = 0.  # deterministic: use the observed sufficient statistic
        noisy_S = sampler(scale=scale)
        soln = XTXi.dot(noisy_S)
        solnZ = soln / (np.sqrt(np.diag(XTXi)) * np.sqrt(dispersion))
        pval = ndist.cdf(solnZ)
        pval = 2 * np.minimum(pval, 1 - pval)
        return set(BHfilter(pval, q=0.2))

    lam = 4. * np.sqrt(n)
    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi,
                                            dispersion, lam)

    # run selection algorithm

    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                smooth_sampler,
                                success_params=(1, 1),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={
                                    'epochs': 5,
                                    'sizes': [200] * 10,
                                    'dropout': 0.,
                                    'activation': 'relu'
                                })
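For reference, the BH step at the end of meta_algorithm can be written out directly. This is a self-contained sketch with the same interface as the BHfilter call above; the library's own implementation may differ in details.

# Benjamini-Hochberg step-up filter (sketch): indices rejected at FDR level q.
def bh_filter(pval, q=0.2):
    pval = np.asarray(pval)
    m = len(pval)
    order = np.argsort(pval)
    below = pval[order] <= q * np.arange(1, m + 1) / m
    k = np.nonzero(below)[0].max() + 1 if below.any() else 0
    return order[:k]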
Code Example #3
def simulate(n=1000, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, seed=0, B=5000):

    # description of statistical problem

    np.random.seed(seed)
    X, y, truth = gaussian_instance(n=n,
                                    p=p, 
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5, 
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False,
                                    center=False)[:3]

    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)

    def meta_algorithm(X, XTXi, resid, sampler):

        n, p = X.shape

        rho = 0.8
        S = sampler(scale=0.) # deterministic with scale=0
        ynew = X.dot(XTXi).dot(S) + resid # will be ok for n>p and non-degen X
        Xnew = rho * X + np.sqrt(1 - rho**2) * np.random.standard_normal(X.shape)

        X_full = np.hstack([X, Xnew])
        beta_full = np.linalg.pinv(X_full).dot(ynew)
        winners = np.fabs(beta_full)[:p] > np.fabs(beta_full)[p:]
        return set(np.nonzero(winners)[0])

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n-p)
                         
    selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid)


    # run selection algorithm

    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                smooth_sampler,
                                success_params=(8, 10),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={'epochs':20, 'sizes':[100]*5, 'dropout':0., 'activation':'relu'})
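The selection step above compares each coefficient against one computed from a noisy copy of the design, in the spirit of knockoff-type comparisons. The construction rho * X + sqrt(1 - rho**2) * Z preserves each column's scale for standardized columns, which a quick check illustrates:

# Illustrative check (assumed standard-normal columns) that the noisy copy
# keeps column scales comparable to the original.
Z = np.random.standard_normal((1000, 5))
rho = 0.8
Znew = rho * Z + np.sqrt(1 - rho**2) * np.random.standard_normal(Z.shape)
print(Z.std(0).round(2), Znew.std(0).round(2))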
Code Example #4
def simulate(n=100):

    # description of statistical problem

    truth = np.array([2., -2.]) / np.sqrt(n)

    dispersion = 2
    data = np.sqrt(dispersion) * np.random.standard_normal(
        (n, 2)) + np.multiply.outer(np.ones(n), truth)
    S = np.sum(data, 0)
    observed_sampler = normal_sampler(S, dispersion * n * np.identity(2))

    def selection_algorithm(sampler):
        # set-valued outcome: select both coordinates if the noisy sum
        # clears the threshold often enough, otherwise only coordinate 1
        min_success = 1
        ntries = 3
        success = 0
        for _ in range(ntries):
            noisyS = sampler(scale=0.5)
            success += noisyS.sum() > 0.2 * np.sqrt(n) * np.sqrt(dispersion)
        if success >= min_success:
            return set([1, 0])
        return set([1])

    # run selection algorithm

    observed_set = selection_algorithm(observed_sampler)

    # find the target, based on the observed outcome

    # loop over each selected coordinate as a target

    pivots, covered, lengths = [], [], []
    for idx in observed_set:
        true_target = truth[idx]

        pivot, interval = infer_full_target(selection_algorithm,
                                            observed_set, [idx],
                                            observed_sampler,
                                            dispersion,
                                            hypothesis=[true_target],
                                            fit_probability=probit_fit)[0][:2]

        pivots.append(pivot)
        covered.append(
            (interval[0] < true_target) * (interval[1] > true_target))
        lengths.append(interval[1] - interval[0])

    return pivots, covered, lengths
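A hypothetical driver that pools pivots, coverage, and interval lengths over repetitions:

# Aggregate results across repeated simulations (sketch).
all_pivots, all_covered, all_lengths = [], [], []
for _ in range(20):
    pivots, covered, lengths = simulate()
    all_pivots.extend(pivots)
    all_covered.extend(covered)
    all_lengths.extend(lengths)
print(np.mean(all_covered), np.median(all_lengths))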
Code Example #5
def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n,
                                    p=p, 
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5, 
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)

    def meta_algorithm(X, XTXi, resid, sampler):

        S = sampler(scale=0.5) # randomized; scale=0 would be deterministic
        ynew = X.dot(XTXi).dot(S) + resid # will be ok for n>p and non-degen X
        G = lasso_glmnet(X, ynew, *[None]*4)
        select = G.select()
        return set(list(select[0]))

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n-p)
                         
    selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid)

    # run selection algorithm

    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                smooth_sampler,
                                success_params=(1, 1),
                                B=B,
                                fit_probability=gbm_fit_sk,
                                fit_args={'n_estimators':2000})
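lasso_glmnet appears to wrap R's glmnet for cross-validated lasso selection. If that dependency is unavailable, a rough scikit-learn stand-in (an assumption, not the library's method) for the same selection step might be:

# Cross-validated lasso selection via scikit-learn (stand-in sketch).
from sklearn.linear_model import LassoCV

def lasso_cv_select(X, y):
    model = LassoCV(cv=5).fit(X, y)
    return set(np.nonzero(model.coef_ != 0)[0])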
Code Example #6
def simulate(n=100):

    # description of statistical problem

    truth = np.array([2., -2.]) / np.sqrt(n)

    data = np.random.standard_normal(
        (n, 2)) + np.multiply.outer(np.ones(n), truth)
    S = np.mean(data, 0)
    observed_sampler = normal_sampler(S, 1 / n * np.identity(2))

    def selection_algorithm(sampler):
        # boolean outcome: True if the noisy sum clears the threshold in
        # at least min_success of ntries draws
        min_success = 1
        ntries = 3
        success = 0
        for _ in range(ntries):
            noisyS = sampler(scale=0.5)
            success += noisyS.sum() > 0.2 / np.sqrt(n)
        return success >= min_success

    # run selection algorithm

    observed_outcome = selection_algorithm(observed_sampler)

    # find the target, based on the observed outcome

    if observed_outcome:  # target is truth[0]
        (true_target, observed_target, target_cov,
         cross_cov) = (truth[0], S[0], 1. / n * np.identity(1),
                       np.array([1., 0.]).reshape((2, 1)) / n)
    else:
        (true_target, observed_target, target_cov,
         cross_cov) = (truth[1], S[1], 1. / n * np.identity(1),
                       np.array([0., 1.]).reshape((2, 1)) / n)

    pivot, interval = infer_general_target(selection_algorithm,
                                           observed_outcome,
                                           observed_sampler,
                                           observed_target,
                                           cross_cov,
                                           target_cov,
                                           hypothesis=true_target,
                                           fit_probability=probit_fit)[:2]

    return pivot, (interval[0] < true_target) * (
        interval[1] > true_target), interval[1] - interval[0]
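As with the earlier two-coordinate example, a short driver can check calibration:

# Hypothetical driver: empirical coverage and average interval length.
out = [simulate() for _ in range(50)]
coverage = np.mean([o[1] for o in out])
avg_length = np.mean([o[2] for o in out])
print(coverage, avg_length)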
Code Example #7
def simulate(n=400,
             p=100,
             s=10,
             signal=(0.5, 1),
             sigma=2,
             alpha=0.1,
             seed=0,
             B=2000):

    # description of statistical problem

    np.random.seed(seed)
    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False,
                                    center=False)[:3]

    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)

    def meta_algorithm(X, XTXi, resid, sampler):

        n, p = X.shape
        idx = np.random.choice(np.arange(n), 200, replace=False)  # random half of the n=400 samples

        S = sampler(scale=0.)  # deterministic with scale=0
        ynew = X.dot(XTXi).dot(S) + resid  # will be ok for n>p and non-degen X

        G = lasso_glmnet(X[idx], ynew[idx], *[None] * 4)
        select = G.select()
        return set(list(select[0]))

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid)

    # run selection algorithm

    df = full_model_inference(X,
                              y,
                              truth,
                              selection_algorithm,
                              smooth_sampler,
                              success_params=(1, 1),
                              B=B,
                              fit_probability=keras_fit,
                              fit_args={
                                  'epochs': 20,
                                  'sizes': [100] * 5,
                                  'dropout': 0.,
                                  'activation': 'relu'
                              })

    if df is not None:

        observed_set = list(df['variable'])
        true_target = truth[observed_set]

        np.random.seed(seed)
        X2, _, _ = gaussian_instance(n=n,
                                     p=p,
                                     s=s,
                                     equicorrelated=False,
                                     rho=0.5,
                                     sigma=sigma,
                                     signal=signal,
                                     random_signs=True,
                                     center=False,
                                     scale=False)[:3]
        stage_1 = np.random.choice(np.arange(n), 200, replace=False)
        stage_2 = sorted(set(range(n)).difference(stage_1))
        X2 = X2[stage_2]
        y2 = X2.dot(truth) + sigma * np.random.standard_normal(X2.shape[0])

        XTXi_2 = np.linalg.inv(X2.T.dot(X2))
        resid2 = y2 - X2.dot(XTXi_2.dot(X2.T.dot(y2)))
        dispersion_2 = np.linalg.norm(resid2)**2 / (X2.shape[0] - X2.shape[1])

        naive_df = naive_full_model_inference(X2,
                                              y2,
                                              dispersion_2,
                                              observed_set,
                                              alpha=alpha)

        df = pd.merge(df, naive_df, on='variable')
        return df
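The returned frame merges the selective results with naive inference computed on held-out stage-two data, so the two can be compared per variable. A usage sketch (the length column names are assumptions about the merged frame):

# Hypothetical usage: compare selective and naive interval lengths.
df = simulate()
if df is not None:
    print(df[['variable', 'length', 'naive_length']].head())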
Code Example #8
def simulate(n=1000, p=60, s=15, signal=3, sigma=2, alpha=0.1):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n,
                                    p=p, 
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5, 
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True)[:3]

    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)
    splitting_sampler = split_sampler(X * y[:, None], covS / n)

    def meta_algorithm(XTX, XTXi, dispersion, sampler):

        min_success = 3
        ntries = 7
        p = XTX.shape[0]
        success = np.zeros(p)
        for _ in range(ntries):
            scale = 0.5
            frac = 1. / (scale**2 + 1.)
            noisy_S = sampler(scale=scale)
            noisy_beta = XTXi.dot(noisy_S)
            noisy_Z = noisy_beta / np.sqrt(dispersion * np.diag(XTXi) * frac)
            success += np.fabs(noisy_Z) > 2
        return set(np.nonzero(success >= min_success)[0])

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n-p)
                         
    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, dispersion)

    # run selection algorithm

    observed_set = selection_algorithm(splitting_sampler)

    # find the target, based on the observed outcome

    # loop over each selected coordinate as a target

    pivots, covered, lengths, naive_lengths = [], [], [], []
    for idx in observed_set:
        print("variable: ", idx, "total selected: ", len(observed_set))
        true_target = truth[idx]

        (pivot, 
         interval) = infer_full_target(selection_algorithm,
                                       observed_set,
                                       [idx],
                                       splitting_sampler,
                                       dispersion,
                                       hypothesis=[true_target],
                                       fit_probability=probit_fit,
                                       success_params=(1, 1),
                                       alpha=alpha,
                                       B=1000)[0][:2]

        pivots.append(pivot)
        covered.append((interval[0] < true_target) * (interval[1] > true_target))
        lengths.append(interval[1] - interval[0])

        target_sd = np.sqrt(dispersion * XTXi[idx, idx])
        naive_lengths.append(2 * ndist.ppf(1 - 0.5 * alpha) * target_sd)

    return pivots, covered, lengths, naive_lengths
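A driver sketch comparing the selective interval lengths against the naive baseline returned alongside them:

# Hypothetical driver: coverage and selective-to-naive length ratio.
pivots, covered, lengths, naive_lengths = simulate()
if lengths:
    print(np.mean(covered), np.mean(lengths) / np.mean(naive_lengths))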
Code Example #9
def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=2000):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(X, XTXi, resid, sampler):

        S = sampler(scale=0.)  # deterministic with scale=0
        ynew = X.dot(XTXi).dot(S) + resid  # will be ok for n>p and non-degen X
        G = lasso_glmnet(X, ynew, *[None] * 4)
        select = G.select()
        return set(list(select[0]))

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid)

    # run selection algorithm

    success_params = (1, 1)

    observed_set = repeat_selection(selection_algorithm, smooth_sampler,
                                    *success_params)

    # find the target, based on the observed outcome

    # we test a single randomly chosen selected variable

    pivots, covered, lengths, pvalues = [], [], [], []
    lower, upper = [], []
    naive_pvalues, naive_pivots, naive_covered, naive_lengths = [], [], [], []

    targets = []

    observed_list = sorted(observed_set)
    np.random.shuffle(observed_list)
    for idx in observed_list[:1]:
        print("variable: ", idx, "total selected: ", len(observed_set))
        true_target = [truth[idx]]
        targets.extend(true_target)

        (pivot, interval,
         pvalue) = infer_full_target(selection_algorithm,
                                     observed_set, [idx],
                                     splitting_sampler,
                                     dispersion,
                                     hypothesis=true_target,
                                     fit_probability=probit_fit,
                                     success_params=success_params,
                                     alpha=alpha,
                                     B=B,
                                     single=True)[0][:3]

        pvalues.append(pvalue)
        pivots.append(pivot)
        covered.append(
            (interval[0] < true_target[0]) * (interval[1] > true_target[0]))
        lengths.append(interval[1] - interval[0])

        target_sd = np.sqrt(dispersion * XTXi[idx, idx])
        observed_target = np.squeeze(XTXi[idx].dot(X.T.dot(y)))
        quantile = ndist.ppf(1 - 0.5 * alpha)
        naive_interval = (observed_target - quantile * target_sd,
                          observed_target + quantile * target_sd)

        naive_pivot = (1 - ndist.cdf(
            (observed_target - true_target[0]) / target_sd))
        naive_pivot = 2 * min(naive_pivot, 1 - naive_pivot)
        naive_pivots.append(naive_pivot)

        naive_pvalue = (1 - ndist.cdf(observed_target / target_sd))
        naive_pvalue = 2 * min(naive_pvalue, 1 - naive_pvalue)
        naive_pvalues.append(naive_pvalue)

        naive_covered.append((naive_interval[0] < true_target[0]) *
                             (naive_interval[1] > true_target[0]))
        naive_lengths.append(naive_interval[1] - naive_interval[0])
        lower.append(interval[0])
        upper.append(interval[1])

    if len(pvalues) > 0:
        return pd.DataFrame({
            'pivot': pivots,
            'target': targets,
            'pvalue': pvalues,
            'coverage': covered,
            'length': lengths,
            'naive_pivot': naive_pivots,
            'naive_coverage': naive_covered,
            'naive_length': naive_lengths,
            'upper': upper,
            'lower': lower
        })
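The naive pivot computed inline above is the usual two-sided normal pivot; factored out for clarity (a sketch, not part of the library):

# Two-sided normal pivot, matching the inline computation above.
def two_sided_pivot(observed, hypothesized, sd):
    tail = 1 - ndist.cdf((observed - hypothesized) / sd)
    return 2 * min(tail, 1 - tail)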
Code Example #10
def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n,
                                    p=p, 
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5, 
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n-p)
                         
    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(XTX, XTXi, dispersion, sampler):

        p = XTX.shape[0]
        success = np.zeros(p)

        scale = 0.
        noisy_S = sampler(scale=scale)
        soln = XTXi.dot(noisy_S)
        solnZ = soln / (np.sqrt(np.diag(XTXi)) * np.sqrt(dispersion))
        pval = ndist.cdf(solnZ)
        pval = 2 * np.minimum(pval, 1 - pval)
        return set(BHfilter(pval, q=0.2))

    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, dispersion)

    # run selection algorithm

    success_params = (1, 1)

    observed_set = repeat_selection(selection_algorithm, smooth_sampler, *success_params)

    # find the target, based on the observed outcome

    # we test a single randomly chosen selected variable

    targets = []
    idx = sorted(observed_set)
    np.random.shuffle(idx)
    idx = idx[:1]
    if len(idx) > 0:
        print("variable: ", idx, "total selected: ", len(observed_set))
        true_target = truth[idx]

        results = infer_full_target(selection_algorithm,
                                    observed_set,
                                    idx,
                                    splitting_sampler,
                                    dispersion,
                                    hypothesis=true_target,
                                    fit_probability=logit_fit,
                                    fit_args={'df':20},
                                    success_params=success_params,
                                    alpha=alpha,
                                    B=B,
                                    single=True)

        pvalues = [r[2] for r in results]
        covered = [(r[1][0] < t) * (r[1][1] > t) for r, t in zip(results, true_target)]
        pivots = [r[0] for r in results]

        target_sd = np.sqrt(np.diag(dispersion * XTXi)[idx])
        observed_target = XTXi[idx].dot(X.T.dot(y))
        quantile = ndist.ppf(1 - 0.5 * alpha)
        naive_interval = np.vstack([observed_target - quantile * target_sd, observed_target + quantile * target_sd])

        naive_pivots = (1 - ndist.cdf((observed_target - true_target) / target_sd))
        naive_pivots = 2 * np.minimum(naive_pivots, 1 - naive_pivots)

        naive_pvalues = (1 - ndist.cdf(observed_target / target_sd))
        naive_pvalues = 2 * np.minimum(naive_pvalues, 1 - naive_pvalues)

        naive_covered = (naive_interval[0] < true_target) * (naive_interval[1] > true_target)
        naive_lengths = naive_interval[1] - naive_interval[0]
        lower = [r[1][0] for r in results]
        upper = [r[1][1] for r in results]
        lengths = np.array(upper) - np.array(lower)

        return pd.DataFrame({'pivot':pivots,
                             'pvalue':pvalues,
                             'coverage':covered,
                             'length':lengths,
                             'naive_pivot':naive_pivots,
                             'naive_coverage':naive_covered,
                             'naive_length':naive_lengths,
                             'upper':upper,
                             'lower':lower,
                             'targets':true_target,
                             'batch_size': B * np.ones(len(idx), int)})
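Finally, a hypothetical driver that pools the per-run frames and summarizes selective versus naive behavior:

# Concatenate results across runs and summarize (sketch).
frames = [simulate() for _ in range(20)]
frames = [f for f in frames if f is not None]
if frames:
    results = pd.concat(frames)
    print(results[['coverage', 'naive_coverage',
                   'length', 'naive_length']].mean())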