Example #1
0
def test_stratified_sampling_fit_and_sample_n_samples_approx_limit(
        df_treatment, df_pool, col_name):
    """Check that the sample size stays within 1 of ``n_samples_approx``.

    The fixture arguments are accepted but immediately replaced by data built
    inside the test: a treatment frame assembled from four ``np.arange``
    ranges with different step sizes and a pool frame covering 0 to 20 in
    steps of 0.01, both keyed on the column ``"col1"``.
    """
    col_name = "col1"

    def _as_frame(values):
        # One record per value; the id embeds the value's string form.
        return pd.DataFrame([{"id": f"id_{v}", col_name: v} for v in values])

    # Accumulate the raw numpy scalars range by range so that each element
    # keeps the dtype of the range it came from.
    treatment_values = []
    for start, stop, step in ((0, 2, 0.1), (2, 4, 0.5), (4, 6, 1),
                              (6, 10, 0.2)):
        treatment_values.extend(np.arange(start, stop, step))

    df_treatment = _as_frame(treatment_values)
    df_pool = _as_frame(np.arange(0, 20, 0.01))

    sampler = StratifiedSampling()
    sampler.add_column(col_name)

    n_samples_approx = 40
    fit_options = {
        "n_samples_approx": n_samples_approx,
        "random_seed": 1,
        "min_n_sampled_to_n_treatment_ratio": None,
    }
    sampler.fit_and_sample(df_treatment, df_pool, **fit_options)

    sampled = sampler.data_sample.df
    assert sampled["_bin_label"].nunique() == 2
    # Result is not inspected; the call only has to succeed.
    bins_df = sampler.diagnostics().count_bins()
    assert abs(len(sampled) - n_samples_approx) <= 1
def diagnostics_obj(df_treatment, df_pool, col_name):
    """Return the diagnostics of a sampler fitted on ``col_name`` with 4 bins.

    The sampler is asked for approximately as many samples as there are
    treatment rows, using a fixed random seed of 1.
    """
    sampler = StratifiedSampling()
    sampler.add_column(col_name, n_bins=4)
    fit_options = {"n_samples_approx": len(df_treatment), "random_seed": 1}
    sampler.fit_and_sample(df_treatment, df_pool, **fit_options)
    return sampler.diagnostics()
Example #3
0
def test_stratified_sampling_fit_and_sample_n_samples_approx_variations(
        df_treatment, df_pool, col_name):
    """Run ``fit_and_sample`` under several bin / sample-size configurations.

    Each scenario uses a fresh ``StratifiedSampling`` instance, a single
    column (``col_name``) and ``random_seed=1``.
    """

    def _sample(column_options, fit_options):
        # Build a new sampler, fit it, and return (sampled frame, bin counts).
        sampler = StratifiedSampling()
        sampler.add_column(col_name, **column_options)
        sampler.fit_and_sample(df_treatment,
                               df_pool,
                               random_seed=1,
                               **fit_options)
        return sampler.data_sample.df, sampler.diagnostics().count_bins()

    # Neither n_bins nor n_samples_approx given: both are left to the sampler.
    _, bin_counts = _sample({}, {})
    assert len(bin_counts) == 3

    # Single bin requested.
    # NOTE(review): this scenario has no assertion; it only checks that the
    # calls complete without raising.
    _sample({"n_bins": 1}, {})

    # Four bins requested.
    _, bin_counts = _sample({"n_bins": 4}, {})
    assert len(bin_counts) == 4

    # Explicit n_samples_approx=40: the sample size must be within 1 of it.
    sampled, _ = _sample({}, {
        "n_samples_approx": 40,
        "min_n_sampled_to_n_treatment_ratio": None,
    })
    assert abs(len(sampled) - 40) <= 1
Example #4
0
def test_stratified_sampling_fit_and_sample_n_samples_approx_limit(
        df_treatment, df_pool, col_name):
    """Check the bin counts are consistent with the requested sample size.

    Fits a sampler on ``col_name`` with ``n_samples_approx=40`` and
    ``random_seed=1``, expects exactly two distinct ``_bin_label`` values in
    the sampled frame, and then checks that for every row of the bin-count
    table ``n_sampled / n_pct_sampled`` rounds to ``n_samples_approx``.
    """
    stratified_sampling_obj = StratifiedSampling()
    stratified_sampling_obj.add_column(col_name)

    n_samples_approx = 40
    stratified_sampling_obj.fit_and_sample(df_treatment,
                                           df_pool,
                                           n_samples_approx=n_samples_approx,
                                           random_seed=1)
    output = stratified_sampling_obj.data_sample.df
    assert output["_bin_label"].nunique() == 2
    bins_df = stratified_sampling_obj.diagnostics().count_bins()
    # presumably n_pct_sampled is each bin's fraction of the total sample, so
    # this ratio recovers the total sample size per row -- TODO confirm.
    implied_totals = (bins_df["n_sampled"] / bins_df["n_pct_sampled"]).round()
    # The comparison is element-wise; reduce it explicitly, because asserting
    # on a multi-row boolean Series raises ValueError (ambiguous truth value).
    assert (implied_totals == n_samples_approx).all()
Example #5
0
def test_stratified_sampling_fit_and_sample_dont_require_equivalence(
        df_treatment, df_pool, col_name):
    """Sample on three columns, one of them added with equivalence disabled.

    ``col2`` is the integer cast of ``col_name`` in both frames. ``col3`` is
    that integer doubled in the treatment frame but halved in the pool frame,
    and is added with ``auto_bin_require_equivalence=False``. The test passes
    when ``fit_and_sample`` yields a non-empty sample.
    """
    treatment_ints = df_treatment[col_name].astype(int)
    pool_ints = df_pool[col_name].astype(int)
    # Derive new frames instead of writing columns into the fixture objects,
    # so the extra columns cannot leak into other tests sharing the fixtures.
    df_treatment = df_treatment.assign(col2=treatment_ints,
                                       col3=treatment_ints * 2)
    df_pool = df_pool.assign(col2=pool_ints, col3=pool_ints / 2)

    stratified_sampling_obj = StratifiedSampling()
    stratified_sampling_obj.add_column(col_name)
    stratified_sampling_obj.add_column("col2")
    stratified_sampling_obj.add_column("col3",
                                       auto_bin_require_equivalence=False)
    # Neither n_bins nor n_samples_approx given: both are left to the sampler.
    stratified_sampling_obj.fit_and_sample(df_treatment,
                                           df_pool,
                                           random_seed=1)
    output = stratified_sampling_obj.data_sample.df
    # Result is not inspected; the call only has to succeed.
    stratified_sampling_obj.diagnostics().count_bins()
    assert not output.empty
Example #6
0
def test_stratified_sampling_fit_and_sample_upper_limit_n_samples_approx(
        df_treatment, df_pool, col_name):
    """Check how ``n_samples_approx=1000`` is handled with and without relaxing.

    Without ``relax_n_samples_approx_constraint`` the call is expected to
    raise ``ModelSamplingException``; with it set to ``True`` the same
    request on the same sampler must produce a non-empty sample.
    """
    sampler = StratifiedSampling()
    sampler.add_column(col_name)
    oversized_request = {"random_seed": 1, "n_samples_approx": 1000}

    # Strict mode: the request must be rejected.
    with pytest.raises(ModelSamplingException):
        sampler.fit_and_sample(df_treatment, df_pool, **oversized_request)

    # Relaxed mode: the same sampler instance is reused after the failure.
    sampler.fit_and_sample(df_treatment,
                           df_pool,
                           relax_n_samples_approx_constraint=True,
                           **oversized_request)
    sampled = sampler.data_sample.df
    # Result is not inspected; the call only has to succeed.
    bins_df = sampler.diagnostics().count_bins()
    assert not sampled.empty
def test_stratified_sampling_fit_and_sample_records_equivalence(
        df_treatment, df_pool, col_name, equivalence_feature_ids,
        equivalence_feature_matrix):
    """Exercise ``StratifiedSamplingBinSelector`` with chi-square equivalence.

    Duplicates ``col_name`` into ``col2`` on both frames, registers both
    columns on a sampler, and hands the sampler to a bin selector limited to
    4-6 bins together with the equivalence feature ids and matrix.

    NOTE(review): no assertion is visible in this block; the test may
    continue beyond the lines shown here.
    """
    stratified_sampling_obj = StratifiedSampling()
    # These assignments write "col2" into the fixture frames in place.
    df_pool["col2"] = df_pool[col_name]
    df_treatment["col2"] = df_treatment[col_name]
    stratified_sampling_obj.add_column(col_name)
    stratified_sampling_obj.add_column("col2")
    ## attempting to estimate both n_bins and n_samples
    # The selector instance is discarded; presumably constructing it fits the
    # sampler as a side effect, since data_sample is read right after -- TODO
    # confirm.
    StratifiedSamplingBinSelector(
        stratified_sampling_obj,
        df_treatment,
        df_pool,
        min_n_bins=4,
        max_n_bins=6,
        random_seed=1,
        equivalence_method='chisquare',
        equivalence_feature_ids=equivalence_feature_ids,
        equivalence_feature_matrix=equivalence_feature_matrix)
    output = stratified_sampling_obj.data_sample.df
    bins_df = stratified_sampling_obj.diagnostics().count_bins()