Example 1
def _multi_check_schema() -> DataFrameSchema:
    """Schema with multiple positivity checks on column `a`"""
    return DataFrameSchema(
        {
            "a": Column(
                int,
                [
                    Check.isin([0, 1]),
                    Check(lambda x: x >= 0),
                ],
            ),
        }
    )
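A minimal usage sketch for the schema above (the imports and the two sample frames are assumptions added here, not part of the original snippet): a value outside {0, 1} in column `a` trips the first check, and `lazy=True` collects every failure into a single `SchemaErrors`.

import pandas as pd
from pandera import Check, Column, DataFrameSchema
from pandera.errors import SchemaErrors

schema = _multi_check_schema()
schema.validate(pd.DataFrame({"a": [0, 1, 1]}))  # passes

try:
    # 2 satisfies `x >= 0` but violates Check.isin([0, 1])
    schema.validate(pd.DataFrame({"a": [0, 1, 2]}), lazy=True)
except SchemaErrors as exc:
    print(exc.failure_cases)  # one row per failing element and check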
Example 2
def hypothesis_accident_probability(feature, increases=True):
    """Column checks: `feature` must be binary, and its SHAP values must be
    significantly higher (or lower, if ``increases=False``) when the feature
    is 1 than when it is 0."""
    relationship = "greater_than" if increases else "less_than"
    return {
        feature: Column(checks=Check.isin([1, 0])),
        f"{feature}_shap": Column(
            pa.Float,
            checks=Hypothesis.two_sample_ttest(
                sample1=1,
                sample2=0,
                groupby=feature,
                relationship=relationship,
                alpha=0.01,
            )
        ),
    }
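The mapping returned above can be dropped straight into a `DataFrameSchema`; a hedged sketch, where the `cause_of_death_gunshot` feature name and the `shap_df` frame are illustrative assumptions:

shap_schema = pa.DataFrameSchema(
    hypothesis_accident_probability("cause_of_death_gunshot", increases=True)
)
# Fails if the SHAP values for rows where the feature is 1 are not
# significantly greater than those where it is 0 (alpha=0.01).
shap_schema.validate(shap_df)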
Example 3
    def test_unpickling(self, int_dataframe: pd.DataFrame, n_tile: int):
        """Tests content validity of unpickled SchemaError."""
        df = pd.DataFrame(
            {"a": np.tile(int_dataframe["a"].to_numpy(), n_tile)}
        )
        schema = DataFrameSchema({"a": Column(int, Check.isin([0, 1]))})
        loaded = None
        try:
            # fails for element -1
            schema.validate(df)
        except SchemaError as exc:
            loaded = cast(SchemaError, pickle.loads(pickle.dumps(exc)))
        else:
            pytest.fail("SchemaError not raised")
        assert loaded is not None
        self._validate_error(df, n_tile, loaded)
Example 4
class TestSchemaErrors:
    """Tests pickling behavior of errors.SchemaErrors."""

    @staticmethod
    @pytest.mark.parametrize(
        "schema",
        [
            DataFrameSchema(
                {
                    "a": Column(
                        int,
                        [
                            Check.isin([0, 1]),
                            Check(lambda x: x >= 0),
                        ],
                    ),
                }
            ),
            DataFrameSchema(
                {
                    "a": Column(int, Check.isin([0, 1])),
                }
            ),
        ],
    )
    def test_pickling(int_dataframe: pd.DataFrame, schema: DataFrameSchema):
        """Test for a non-empty pickled object."""
        try:
            schema.validate(int_dataframe, lazy=True)
        except SchemaErrors as exc:
            # expect non-empty bytes
            assert pickle.dumps(exc)
        else:
            pytest.fail("SchemaErrors not raised")

    def test_unpickling(
        self, int_dataframe: pd.DataFrame, multi_check_schema: DataFrameSchema
    ):
        """Tests content validity of unpickled SchemaErrors."""
        try:
            multi_check_schema.validate(int_dataframe, lazy=True)
        except SchemaErrors as exc:
            loaded = pickle.loads(pickle.dumps(exc))
            assert loaded is not None
            self._compare_exception_with_unpickled(exc, loaded)
        else:
            pytest.fail("SchemaErrors not raised")

    @staticmethod
    def _compare_exception_with_unpickled(
        exc_native: SchemaErrors, exc_unpickled: SchemaErrors
    ):
        """Compare content of native SchemaErrors with unpickled one."""
        assert isinstance(exc_native, SchemaErrors)
        assert isinstance(exc_unpickled, SchemaErrors)
        # compare message
        assert str(exc_unpickled) == str(exc_native)
        # compare schema_errors as string, as it is a nested container with
        # elements that compare by identity
        assert str(exc_unpickled.schema_errors) == str(
            exc_native.schema_errors
        )
        assert exc_unpickled.error_counts == exc_native.error_counts
        assert exc_unpickled.failure_cases == str(exc_native.failure_cases)
        assert exc_unpickled.data == str(exc_native.data)
Example 5
class TestSchemaError:
    """Tests pickling behavior of errors.SchemaError."""

    @staticmethod
    @pytest.mark.parametrize(
        "check_obj", [Check.isin([0, 1]), Check(lambda x: x >= 0)]
    )
    def test_pickling(int_dataframe: pd.DataFrame, check_obj: Check):
        """Test for a non-empty pickled object."""
        schema = DataFrameSchema({"a": Column(int, check_obj)})
        try:
            # fails for element -1
            schema.validate(int_dataframe)
        except SchemaError as exc:
            # must be non-empty byte-array
            assert pickle.dumps(exc)
        else:
            pytest.fail("SchemaError not raised")

    @pytest.mark.parametrize("n_tile", [1, 10000])
    def test_unpickling(self, int_dataframe: pd.DataFrame, n_tile: int):
        """Tests content validity of unpickled SchemaError."""
        df = pd.DataFrame(
            {"a": np.tile(int_dataframe["a"].to_numpy(), n_tile)}
        )
        schema = DataFrameSchema({"a": Column(int, Check.isin([0, 1]))})
        loaded = None
        try:
            # fails for element -1
            schema.validate(df)
        except SchemaError as exc:
            loaded = cast(SchemaError, pickle.loads(pickle.dumps(exc)))
        else:
            pytest.fail("SchemaError not raised")
        assert loaded is not None
        self._validate_error(df, n_tile, loaded)

    @staticmethod
    def _validate_error(df: pd.DataFrame, n_tile: int, exc: SchemaError):
        """General validation of Exception content."""
        assert exc is not None
        assert (
            "Schema Column(name=a, type=DataType(int64))> "
            "failed element-wise validator 0" in str(exc)
        )
        assert exc.schema == "<Schema Column(name=a, type=DataType(int64))>"
        assert exc.data == str(df)
        # `failure_cases` is limited to 10 by `n_failure_cases` of `Check`
        assert exc.failure_cases == str(
            pd.DataFrame(
                {
                    "index": np.arange(n_tile) * 3,
                    "failure_case": np.full(n_tile, fill_value=-1, dtype=int),
                }
            ).head(10)
        )
        assert exc.check == "<Check isin: isin({0, 1})>"
        assert exc.check_index == 0
        assert exc.check_output == str(
            pd.Series(np.tile([False, True, True], n_tile), name="a")
        )
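The pickling tests above rely on two pytest fixtures that are not shown in these excerpts. From the expected check output `[False, True, True]` and the failing value -1, `int_dataframe` is presumably a three-row frame with `a = [-1, 0, 1]`, and `multi_check_schema` the fixture counterpart of the schema in Example 1; a hedged reconstruction:

import pytest

@pytest.fixture
def int_dataframe() -> pd.DataFrame:
    """Single-column frame whose first element (-1) fails Check.isin([0, 1])."""
    return pd.DataFrame({"a": [-1, 0, 1]})

@pytest.fixture
def multi_check_schema() -> DataFrameSchema:
    """Fixture form of the schema built in Example 1."""
    return _multi_check_schema()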
Example 6
def test_lazy_dataframe_validation_error():
    """Test exceptions on lazy dataframe validation."""
    schema = DataFrameSchema(
        columns={
            "int_col": Column(Int, Check.greater_than(5)),
            "int_col2": Column(Int),
            "float_col": Column(Float, Check.less_than(0)),
            "str_col": Column(String, Check.isin(["foo", "bar"])),
            "not_in_dataframe": Column(Int),
        },
        checks=Check(lambda df: df != 1,
                     error="dataframe_not_equal_1",
                     ignore_na=False),
        index=Index(String, name="str_index"),
        strict=True,
    )

    dataframe = pd.DataFrame(
        data={
            "int_col": [1, 2, 6],
            "int_col2": ["a", "b", "c"],
            "float_col": [1.0, -2.0, 3.0],
            "str_col": ["foo", "b", "c"],
            "unknown_col": [None, None, None],
        },
        index=pd.Index(["index0", "index1", "index2"], name="str_index"),
    )

    expectation = {
        # schema object context -> check failure cases
        "DataFrameSchema": {
            # check name -> failure cases
            "column_in_schema": ["unknown_col"],
            "dataframe_not_equal_1": [1],
            "column_in_dataframe": ["not_in_dataframe"],
        },
        "Column": {
            "greater_than(5)": [1, 2],
            "pandas_dtype('int64')": ["object"],
            "less_than(0)": [1, 3],
        },
    }

    with pytest.raises(errors.SchemaErrors,
                       match="^A total of .+ schema errors were found"):
        schema.validate(dataframe, lazy=True)

    try:
        schema.validate(dataframe, lazy=True)
    except errors.SchemaErrors as err:

        # data in the caught exception should be equal to the dataframe
        # passed into validate
        assert err.data.equals(dataframe)

        # make sure all expected check errors are in schema errors
        for schema_context, check_failure_cases in expectation.items():
            err_df = err.failure_cases.loc[err.failure_cases.schema_context ==
                                           schema_context]
            for check, failure_cases in check_failure_cases.items():
                assert check in err_df.check.values
                assert (err_df.loc[err_df.check == check].failure_case.isin(
                    failure_cases).all())
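For reference, `err.failure_cases` in the test above is an ordinary pandas DataFrame (the assertions touch its `schema_context`, `check`, and `failure_case` columns); a short sketch of summarising it outside the assertions, reusing the `schema` and `dataframe` names from the test:

try:
    schema.validate(dataframe, lazy=True)
except errors.SchemaErrors as err:
    # one row per failing element or schema-level failure
    print(err.failure_cases[["schema_context", "check", "failure_case"]])
    # number of failure cases recorded for each check
    print(err.failure_cases.groupby("check").size())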
Example 7
                 Check.less_than(3)],
         name="column",
     ),
     pd.DataFrame({"column": [1, 2, 3]}),
     {
         "data": pd.DataFrame({"column": [1, 2, 3]}),
         "schema_errors": {
             "Column": {
                 "greater_than(1)": [1],
                 "less_than(3)": [3]
             },
         },
     },
 ],
 [
     Index(String, checks=Check.isin(["a", "b", "c"])),
     pd.DataFrame({"col": [1, 2, 3]}, index=["a", "b", "d"]),
     {
         # expect that the data in the SchemaError is the pd.Index cast
         # into a Series
         "data": pd.Series(["a", "b", "d"]),
         "schema_errors": {
             "Index": {
                 f"isin({set(['a', 'b', 'c'])})": ["d"]
             },
         },
     },
 ],
 [
     MultiIndex(indexes=[
         Index(Int, checks=Check.greater_than(0), name="index0"),
Example 8
    "native_american_alaskan", "race_unspecified",
]
causes_of_death = [
    'asphyxiated_restrained', 'beaten_bludgeoned_with_instrument',
    'burned_smoke_inhalation', 'chemical_agent_pepper_spray',
    'drowned', 'drug_overdose', 'fell_from_a_height', 'gunshot',
    'medical_emergency', 'other', 'stabbed', 'tasered', 'undetermined',
    'unknown', 'vehicle'
]

# %%
training_data_schema = pa.DataFrameSchema(
    {
        # feature columns
        "age": Column(pa.Float, Check.in_range(0, 120), nullable=True),
        "gender": Column(pa.String, Check.isin(genders), nullable=True),
        "race": Column(pa.String, Check.isin(races), nullable=True),
        "cause_of_death": Column(pa.String, Check.isin(causes_of_death), nullable=True),
        "symptoms_of_mental_illness": Column(pa.Bool, nullable=True),
        # target column
        "disposition_accidental": Column(pa.Bool, nullable=False),
    },
    coerce=True  # <- coerce columns to the specified type
)

# %% [markdown] slideshow={"slide_type": "subslide"}
# #### Serialize schema to YAML format:

# %%
print(training_data_schema.to_yaml())
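A round trip back from YAML should also work; a sketch assuming `pandera.io.from_yaml` is available (it accepts a YAML string or a file path):

# %%
import pandera.io

yaml_schema = training_data_schema.to_yaml()
reloaded_schema = pandera.io.from_yaml(yaml_schema)
print(reloaded_schema)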