# Example #1
# 0
def test_bayesian_bootstrap_vs_bootstrap_geometric_quantiles(
        spark_context_or_none):
    """Bayesian vs. percentile bootstrap: quantile stats on geometric data.

    Both bootstrap flavours are run on the same fixed-seed geometric sample
    and every summary statistic must agree to within 0.5% relative tolerance.
    """
    sample_size = 20000

    rng = np.random.RandomState(42)
    observations = rng.geometric(p=0.1, size=sample_size)

    probs = [0.3, 0.5, 0.9]

    def percentile_stats(values):
        # Map each requested quantile level to its empirical quantile.
        return {p: q for p, q in zip(probs, np.quantile(values, probs))}

    bb_res = mabsbb.bootstrap_one_branch(
        observations,
        stat_fn=mabsbb.make_bb_quantile_closure(probs),
        sc=spark_context_or_none)
    pboot_res = mafsb.bootstrap_one_branch(observations,
                                           stat_fn=percentile_stats,
                                           sc=spark_context_or_none)

    # Compare every summary column for every quantile row.
    for quantile in bb_res.index:
        for label in bb_res.columns:
            assert bb_res.loc[quantile, label] == pytest.approx(
                pboot_res.loc[quantile, label], rel=5e-3
            ), (quantile, label, bb_res, pboot_res)
# Example #2
# 0
def test_bayesian_bootstrap_vs_beta(spark_context_or_none):
    """Bayesian bootstrap vs. Beta summary on binary data.

    For 0/1 data the two posteriors should be mathematically identical;
    differences could emerge from
    1. implementation errors
    2. not taking enough bootstrap replications to suppress variance
    """
    n = 10000
    fake_data = pd.Series(np.zeros(n))
    fake_data[:300] = 1

    boot_res = mabsbb.bootstrap_one_branch(fake_data, sc=spark_context_or_none)
    beta_res = mabsbin.summarize_one_branch_from_agg(
        pd.Series({
            # Subtract 1 to simulate a Beta(0, 0) improper prior, which is
            # closer to the bootstrap for quantiles (i think?)
            "num_enrollments": len(fake_data) - 1,
            "num_conversions": fake_data.sum() - 1,
        }))

    for label in boot_res.index:
        # abs=1.9 is usually good enough with a percentile bootstrap;
        # use abs=2.9 because there are lots of tests.
        assert boot_res.loc[label] == pytest.approx(
            beta_res.loc[label],
            abs=2.9 / n,
        ), (label, boot_res, beta_res)
def test_bootstrap_one_branch(spark_context):
    """Bootstrap of a half-zeros/half-ones sample should centre near 0.5."""
    data = np.concatenate([np.zeros(10000), np.ones(10000)])
    res = mabsbb.bootstrap_one_branch(
        spark_context, data, num_samples=100, summary_quantiles=(0.5, 0.61)
    )

    # Mean and both summary quantiles should all sit near the true mean 0.5.
    for stat in ('mean', '0.5', '0.61'):
        assert res[stat] == pytest.approx(0.5, rel=1e-1)
# Example #4
# 0
def test_bayesian_bootstrap_vs_bootstrap_geometric(spark_context_or_none):
    """Bayesian vs. percentile bootstrap of the mean on geometric data."""
    sample_size = 20000

    rng = np.random.RandomState(42)
    observations = rng.geometric(p=0.1, size=sample_size)

    bb_res = mabsbb.bootstrap_one_branch(observations, sc=spark_context_or_none)
    pboot_res = mafsb.bootstrap_one_branch(observations,
                                           sc=spark_context_or_none)

    # Geometric(p=0.1) has mean 10; sanity-check the location of the
    # mean and the 0.5 summary statistic first.
    assert bb_res['mean'] == pytest.approx(10, rel=1e-2)
    assert bb_res['0.5'] == pytest.approx(10, rel=1e-2)

    # Then require close agreement between the two bootstrap flavours.
    for label in bb_res.index:
        assert bb_res.loc[label] == pytest.approx(
            pboot_res.loc[label], rel=5e-3), (label, bb_res, pboot_res)
def test_bayesian_bootstrap_vs_bootstrap_poisson(spark_context):
    """Bayesian vs. percentile bootstrap of the mean on Poisson data."""
    sample_size = 10001

    rng = np.random.RandomState(42)
    observations = rng.poisson(lam=10, size=sample_size)

    bb_res = mabsbb.bootstrap_one_branch(spark_context, observations)
    pboot_res = mafsb.bootstrap_one_branch(spark_context, observations)

    # Poisson(lam=10) has mean 10; sanity-check the location of the
    # mean and the 0.5 summary statistic first.
    assert bb_res['mean'] == pytest.approx(10, rel=1e-2)
    assert bb_res['0.5'] == pytest.approx(10, rel=1e-2)

    # Then require close agreement between the two bootstrap flavours.
    for label in bb_res.index:
        assert bb_res.loc[label] == pytest.approx(
            pboot_res.loc[label], rel=5e-3), (label, bb_res, pboot_res)
def test_bootstrap_one_branch_multistat(spark_context):
    """Bootstrap with a dict-returning stat_fn yields one row per statistic.

    The extreme outlier (1e20) is presumably clipped by threshold_quantile
    before resampling, so 'max' should come out as exactly 1.
    """
    data = np.concatenate([np.zeros(10000), np.ones(10000), [1e20]])
    res = mabsbb.bootstrap_one_branch(
        spark_context, data,
        stat_fn=lambda x, y: {
            'max': np.max(x),
            'mean': np.dot(x, y),
        },
        num_samples=5,
        summary_quantiles=(0.5, 0.61),
        threshold_quantile=0.9999
    )

    # Two statistics x three summary columns (mean plus two quantiles).
    assert res.shape == (2, 3)

    for col in ('mean', '0.5', '0.61'):
        assert res.loc['max', col] == 1
    for col in ('mean', '0.5', '0.61'):
        assert res.loc['mean', col] == pytest.approx(0.5, rel=1e-1)
def test_bayesian_bootstrap_vs_bootstrap_poisson_quantiles(spark_context):
    """Bayesian vs. percentile bootstrap: quantile stats on Poisson data.

    Both bootstrap flavours are run on the same fixed-seed Poisson sample
    and every summary statistic must agree to within 0.5% relative tolerance.
    """
    sample_size = 10001

    rng = np.random.RandomState(42)
    observations = rng.poisson(lam=10, size=sample_size)

    probs = [0.1, 0.5, 0.95]

    def percentile_stats(values):
        # Map each requested quantile level to its empirical quantile.
        return {p: q for p, q in zip(probs, np.quantile(values, probs))}

    bb_res = mabsbb.bootstrap_one_branch(
        spark_context,
        observations,
        stat_fn=mabsbb.make_bb_quantile_closure(probs))
    pboot_res = mafsb.bootstrap_one_branch(spark_context,
                                           observations,
                                           stat_fn=percentile_stats)

    # Compare every summary column for every quantile row.
    for quantile in bb_res.index:
        for label in bb_res.columns:
            assert bb_res.loc[quantile, label] == pytest.approx(
                pboot_res.loc[quantile, label], rel=5e-3
            ), (quantile, label, bb_res, pboot_res)