def test_bayesian_bootstrap_vs_bootstrap_geometric_quantiles(
    spark_context_or_none
):
    """Quantile estimates from the Bayesian bootstrap should closely
    match the classical percentile bootstrap on geometric data."""
    num_enrollments = 20000
    rs = np.random.RandomState(42)
    data = rs.geometric(p=0.1, size=num_enrollments)

    quantiles = [0.3, 0.5, 0.9]

    def calc_quantiles(x):
        # Plain empirical quantiles, used by the classical bootstrap.
        return {q: v for q, v in zip(quantiles, np.quantile(x, quantiles))}

    bb_res = mabsbb.bootstrap_one_branch(
        data,
        stat_fn=mabsbb.make_bb_quantile_closure(quantiles),
        sc=spark_context_or_none,
    )
    pboot_res = mafsb.bootstrap_one_branch(
        data, stat_fn=calc_quantiles, sc=spark_context_or_none
    )

    # Compare every summary column for every requested quantile.
    for q in bb_res.index:
        for stat_label in bb_res.columns:
            assert bb_res.loc[q, stat_label] == pytest.approx(
                pboot_res.loc[q, stat_label], rel=5e-3
            ), (q, stat_label, bb_res, pboot_res)
def test_bayesian_bootstrap_vs_beta(spark_context_or_none):
    """For binary data like this, the Bayesian bootstrap posterior and
    the Beta posterior should be mathematically identical; differences
    could emerge from
    1. implementation errors
    2. not taking enough bootstrap replications to suppress variance
    """
    num_enrollments = 10000

    fake_data = pd.Series(np.zeros(num_enrollments))
    fake_data[:300] = 1

    boot_res = mabsbb.bootstrap_one_branch(
        fake_data, sc=spark_context_or_none
    )
    beta_res = mabsbin.summarize_one_branch_from_agg(
        pd.Series({
            # `-1` to simulate Beta(0, 0) improper prior, closer to
            # bootstrap for quantiles (i think?)
            "num_enrollments": len(fake_data) - 1,
            "num_conversions": fake_data.sum() - 1,
        })
    )

    for stat_label in boot_res.index:
        assert boot_res.loc[stat_label] == pytest.approx(
            beta_res.loc[stat_label],
            # abs=1.9 is usually good enough with a percentile bootstrap;
            # set abs=2.9 because there are lots of tests
            abs=2.9 / num_enrollments,
        ), (stat_label, boot_res, beta_res)
def test_bootstrap_one_branch(spark_context):
    """Sanity check: on a balanced 0/1 dataset the bootstrapped mean and
    both summary quantiles should all land near 0.5."""
    data = np.concatenate([np.zeros(10000), np.ones(10000)])

    res = mabsbb.bootstrap_one_branch(
        spark_context, data, num_samples=100, summary_quantiles=(0.5, 0.61)
    )

    for key in ('mean', '0.5', '0.61'):
        assert res[key] == pytest.approx(0.5, rel=1e-1)
def test_bayesian_bootstrap_vs_bootstrap_geometric(spark_context_or_none):
    """The Bayesian bootstrap's summary stats should agree closely with
    the classical bootstrap's on geometric data."""
    num_enrollments = 20000
    rs = np.random.RandomState(42)
    data = rs.geometric(p=0.1, size=num_enrollments)

    bb_res = mabsbb.bootstrap_one_branch(data, sc=spark_context_or_none)
    pboot_res = mafsb.bootstrap_one_branch(data, sc=spark_context_or_none)

    # Geometric(p=0.1) has mean 1/p = 10.
    assert bb_res['mean'] == pytest.approx(10, rel=1e-2)
    assert bb_res['0.5'] == pytest.approx(10, rel=1e-2)

    for stat_label in bb_res.index:
        assert bb_res.loc[stat_label] == pytest.approx(
            pboot_res.loc[stat_label], rel=5e-3
        ), (stat_label, bb_res, pboot_res)
def test_bayesian_bootstrap_vs_bootstrap_poisson(spark_context):
    """The Bayesian bootstrap's summary stats should agree closely with
    the classical bootstrap's on Poisson data."""
    num_enrollments = 10001
    rs = np.random.RandomState(42)
    data = rs.poisson(lam=10, size=num_enrollments)

    bb_res = mabsbb.bootstrap_one_branch(spark_context, data)
    pboot_res = mafsb.bootstrap_one_branch(spark_context, data)

    # Poisson(lam=10) has mean 10.
    assert bb_res['mean'] == pytest.approx(10, rel=1e-2)
    assert bb_res['0.5'] == pytest.approx(10, rel=1e-2)

    for stat_label in bb_res.index:
        assert bb_res.loc[stat_label] == pytest.approx(
            pboot_res.loc[stat_label], rel=5e-3
        ), (stat_label, bb_res, pboot_res)
def test_bootstrap_one_branch_multistat(spark_context):
    """A dict-returning stat_fn should yield a (stat x summary) frame;
    the huge 1e20 outlier should be clipped away by threshold_quantile
    so that 'max' comes out as exactly 1."""
    data = np.concatenate([np.zeros(10000), np.ones(10000), [1e20]])

    res = mabsbb.bootstrap_one_branch(
        spark_context, data,
        # NOTE(review): second arg `y` is presumably the bootstrap
        # weights (np.dot(x, y) = weighted mean) — confirm against
        # mabsbb.bootstrap_one_branch's stat_fn contract.
        stat_fn=lambda x, y: {
            'max': np.max(x),
            'mean': np.dot(x, y),
        },
        num_samples=5,
        summary_quantiles=(0.5, 0.61),
        threshold_quantile=0.9999
    )

    assert res.shape == (2, 3)

    for col in ('mean', '0.5', '0.61'):
        assert res.loc['max', col] == 1
    for col in ('mean', '0.5', '0.61'):
        assert res.loc['mean', col] == pytest.approx(0.5, rel=1e-1)
def test_bayesian_bootstrap_vs_bootstrap_poisson_quantiles(spark_context):
    """Quantile estimates from the Bayesian bootstrap should closely
    match the classical percentile bootstrap on Poisson data."""
    num_enrollments = 10001
    rs = np.random.RandomState(42)
    data = rs.poisson(lam=10, size=num_enrollments)

    quantiles = [0.1, 0.5, 0.95]

    def calc_quantiles(x):
        # Plain empirical quantiles, used by the classical bootstrap.
        return {q: v for q, v in zip(quantiles, np.quantile(x, quantiles))}

    bb_res = mabsbb.bootstrap_one_branch(
        spark_context, data,
        stat_fn=mabsbb.make_bb_quantile_closure(quantiles)
    )
    pboot_res = mafsb.bootstrap_one_branch(
        spark_context, data, stat_fn=calc_quantiles
    )

    # Compare every summary column for every requested quantile.
    for q in bb_res.index:
        for stat_label in bb_res.columns:
            assert bb_res.loc[q, stat_label] == pytest.approx(
                pboot_res.loc[q, stat_label], rel=5e-3
            ), (q, stat_label, bb_res, pboot_res)