Exemple #1
0
def test_one_sample_hypothesis():
    """Validate a one-sample t-test check on a full column and on one group.

    Two schemas are built around ``Hypothesis.one_sample_ttest`` with the
    same ``popmean``, ``relationship`` and ``alpha`` settings: one applies
    the check to the whole ``height_in_feet`` column, the other restricts it
    to sample ``"A"`` of the ``group`` column. The test passes when neither
    ``validate`` call raises.
    """
    heights = pd.DataFrame({
        "height_in_feet": [8.1, 7, 6.5, 6.7, 5.1],
        "group": ["A", "A", "B", "B", "A"],
    })

    # Settings shared by both variants of the check.
    ttest_settings = {
        "popmean": 5,
        "relationship": "greater_than",
        "alpha": 0.1,
    }
    whole_column_check = Hypothesis.one_sample_ttest(**ttest_settings)
    group_a_check = Hypothesis.one_sample_ttest(
        sample="A", groupby="group", **ttest_settings
    )

    whole_column_schema = DataFrameSchema({
        "height_in_feet": Column(Float, [whole_column_check]),
    })
    group_a_schema = DataFrameSchema({
        "group": Column(String),
        "height_in_feet": Column(Float, [group_a_check]),
    })

    # Same validation order as before: ungrouped schema first.
    for candidate in (whole_column_schema, group_a_schema):
        candidate.validate(heights)
Exemple #2
0
def test_two_sample_ttest_hypothesis_relationships():
    """Check which ``relationship`` values the two-sample t-test accepts.

    Every entry of ``Hypothesis.RELATIONSHIPS`` must yield a
    ``DataFrameSchema``; a handful of other values (strings, ints and
    ``None``) must raise ``errors.SchemaInitError`` while the schema is
    being constructed.
    """

    def build_schema(relationship):
        # One schema shape is used for both the accepted and rejected cases;
        # only the relationship argument varies.
        ttest = Hypothesis.two_sample_ttest(
            sample1="M",
            sample2="F",
            groupby="sex",
            relationship=relationship,
            alpha=0.5,
        )
        return DataFrameSchema({
            "height_in_feet": Column(Float, [ttest]),
            "sex": Column(String),
        })

    for supported in Hypothesis.RELATIONSHIPS:
        assert isinstance(build_schema(supported), DataFrameSchema)

    for unsupported in ("foo", "bar", 1, 2, 3, None):
        with pytest.raises(errors.SchemaInitError):
            build_schema(unsupported)
Exemple #3
0
def test_dataframe_hypothesis_checks():
    """Exercise dataframe-level ``Hypothesis`` checks.

    A schema carrying a two-sample and a one-sample hypothesis over
    ``col1``/``col2`` must validate the frame. A second schema whose
    hypothesis groups by ``col3`` -- a column the frame does not contain --
    must raise ``errors.SchemaDefinitionError`` on validation.
    """

    def positive_and_significant(stat, pvalue, alpha=0.01):
        # Holds when the statistic is positive and half the p-value is
        # below alpha.
        return stat > 0 and pvalue / 2 < alpha

    def int_columns():
        # Fresh Column objects for every schema, as in a literal dict.
        return {
            "col1": Column(Int),
            "col2": Column(Int),
        }

    frame = pd.DataFrame({
        "col1": range(100, 201),
        "col2": range(0, 101),
    })

    two_sample = Hypothesis(
        test=stats.ttest_ind,
        samples=["col1", "col2"],
        relationship=positive_and_significant,
        relationship_kwargs={"alpha": 0.5},
    )
    one_sample = Hypothesis(
        test=stats.ttest_1samp,
        samples=["col1"],
        relationship=positive_and_significant,
        test_kwargs={"popmean": 50},
        relationship_kwargs={"alpha": 0.01},
    )
    passing_schema = DataFrameSchema(
        columns=int_columns(),
        checks=[two_sample, one_sample],
    )
    passing_schema.validate(frame)

    # Grouping by a column that is absent from the frame must be rejected.
    grouped_two_sample = Hypothesis(
        test=stats.ttest_ind,
        samples=["col1", "col2"],
        groupby="col3",
        relationship=positive_and_significant,
        relationship_kwargs={"alpha": 0.5},
    )
    missing_groupby_schema = DataFrameSchema(
        columns=int_columns(),
        checks=[grouped_two_sample],
    )
    with pytest.raises(errors.SchemaDefinitionError):
        missing_groupby_schema.validate(frame)
Exemple #4
0
def test_dataframe_hypothesis_checks():
    """Run two dataframe-level hypotheses, then one with a bad ``groupby``.

    The first schema combines a two-sample and a one-sample hypothesis and
    must validate the frame. The second adds ``groupby="col3"``, a column
    that is not in the frame, and must raise
    ``errors.SchemaDefinitionError`` when validating.
    """

    def half_pvalue_below_alpha(stat, pvalue, alpha=0.01):
        # True only for a positive statistic with pvalue / 2 under alpha.
        return stat > 0 and pvalue / 2 < alpha

    def schema_for(*hypotheses):
        # Both schemas declare the same two integer columns.
        return DataFrameSchema(
            columns={
                "col1": Column(Int),
                "col2": Column(Int),
            },
            checks=list(hypotheses),
        )

    data = pd.DataFrame({
        "col1": range(100, 201),
        "col2": range(0, 101),
    })

    # Arguments common to the two-sample hypothesis in both schemas.
    two_sample_kwargs = {
        "test": stats.ttest_ind,
        "samples": ["col1", "col2"],
        "relationship": half_pvalue_below_alpha,
        "relationship_kwargs": {"alpha": 0.5},
    }

    valid_schema = schema_for(
        Hypothesis(**two_sample_kwargs),
        Hypothesis(
            test=stats.ttest_1samp,
            samples=["col1"],
            relationship=half_pvalue_below_alpha,
            test_kwargs={"popmean": 50},
            relationship_kwargs={"alpha": 0.01},
        ),
    )
    valid_schema.validate(data)

    # "col3" is not a column of the frame, so validation has to fail.
    groupby_schema = schema_for(
        Hypothesis(groupby="col3", **two_sample_kwargs),
    )
    with pytest.raises(errors.SchemaDefinitionError):
        groupby_schema.validate(data)
Exemple #5
0
def hypothesis_accident_probability(feature, increases=True):
    """Build column definitions for a feature and its ``_shap`` companion.

    Args:
        feature: Name of the feature column; also used as the ``groupby``
            key of the t-test and as the prefix of the ``_shap`` column.
        increases: When true the t-test uses the ``"greater_than"``
            relationship, otherwise ``"less_than"``.

    Returns:
        A dict with two entries: ``feature`` mapped to a ``Column`` whose
        values must be in ``[1, 0]``, and ``f"{feature}_shap"`` mapped to a
        ``pa.Float`` ``Column`` checked with a two-sample t-test between
        the groups ``1`` and ``0`` at ``alpha=0.01``.
    """
    if increases:
        direction = "greater_than"
    else:
        direction = "less_than"

    indicator_column = Column(checks=Check.isin([1, 0]))
    shap_ttest = Hypothesis.two_sample_ttest(
        sample1=1,
        sample2=0,
        groupby=feature,
        relationship=direction,
        alpha=0.01,
    )
    shap_column = Column(pa.Float, checks=shap_ttest)

    return {feature: indicator_column, f"{feature}_shap": shap_column}
Exemple #6
0
def test_hypothesis():
    """Exercise the different ways of declaring a two-sample t-test.

    The same comparison (``"M"`` versus ``"F"`` heights, grouped by
    ``sex``) is declared through ``Hypothesis.two_sample_ttest`` and
    through the ``Hypothesis`` constructor, with a named relationship and
    with a custom callable. With ``alpha=0.5`` all four schemas must
    validate the frame; with ``alpha=0.05`` all three schemas must raise
    ``errors.SchemaError``.
    """
    # Example frame shared by every schema below.
    heights = pd.DataFrame({
        "height_in_feet": [6.5, 7, 6.1, 5.1, 4],
        "sex": ["M", "M", "F", "F", "F"],
    })

    def schema_with(check):
        # Wrap a single hypothesis check in the common two-column schema.
        return DataFrameSchema({
            "height_in_feet": Column(Float, [check]),
            "sex": Column(String),
        })

    def via_classmethod(alpha):
        # Declaration through the two_sample_ttest convenience constructor.
        return Hypothesis.two_sample_ttest(
            sample1="M",
            sample2="F",
            groupby="sex",
            relationship="greater_than",
            alpha=alpha,
        )

    def via_constructor(relationship, alpha):
        # Declaration through Hypothesis(...) with an explicit scipy test.
        return Hypothesis(
            test=stats.ttest_ind,
            samples=["M", "F"],
            groupby="sex",
            relationship=relationship,
            relationship_kwargs={"alpha": alpha},
        )

    # Four happy paths; the first and third are declared identically.
    passing_schemas = [
        schema_with(via_classmethod(0.5)),
        schema_with(via_constructor("greater_than", 0.5)),
        schema_with(via_classmethod(0.5)),
        schema_with(
            via_constructor(
                lambda stat, pvalue, alpha=0.01: (
                    stat > 0 and pvalue / 2 < alpha
                ),
                0.5,
            )
        ),
    ]
    for passing in passing_schemas:
        passing.validate(heights)

    # Same declarations with a stricter alpha, which the frame must fail.
    failing_schemas = [
        schema_with(via_classmethod(0.05)),
        schema_with(via_constructor("greater_than", 0.05)),
        schema_with(via_classmethod(0.05)),
    ]
    for failing in failing_schemas:
        with pytest.raises(errors.SchemaError):
            failing.validate(heights)
Exemple #7
0
# Mean of the `disposition_accidental` column, rendered as a percentage with
# two decimal places. `fatal_encounters_clean` is not defined in this part of
# the file; presumably a DataFrame built in an earlier cell -- TODO confirm.
percent_accidental = fatal_encounters_clean.disposition_accidental.mean()
display(Markdown(f"{percent_accidental * 100:0.02f}%"))

# %% [markdown] slideshow={"slide_type": "subslide"}
# **Hypothesis**: "the `disposition_accidental` target has a
# class balance of ~2.75%"

# %%
from pandera import Hypothesis

# use the Column object as a stand-alone schema object
# The one-sample t-test is given popmean=0.0275 (the ~2.75% stated in the
# hypothesis cell), the "equal" relationship and alpha=0.01.
target_schema = Column(
    pa.Bool,
    name="disposition_accidental",
    checks=Hypothesis.one_sample_ttest(
        popmean=0.0275, relationship="equal", alpha=0.01
    )
)

# Apply the column schema to the dataframe by calling it directly.
# NOTE(review): the trailing semicolon presumably suppresses the notebook
# cell's output -- confirm before removing it.
target_schema(fatal_encounters_clean);

# %% [markdown] slideshow={"slide_type": "skip"}
# What's the age distribution in the dataset?

# %% tags=["hide_input"] slideshow={"slide_type": "skip"}
# Histogram of the `age` column drawn with figsize=(8, 5); the x-axis is
# labelled "age".
fatal_encounters_clean.age.plot.hist(figsize=(8, 5)).set_xlabel("age");

# %% [markdown] slideshow={"slide_type": "skip"}
# **Hypothesis** check: "the `age` column is right-skewed"

# %% slideshow={"slide_type": "skip"}