def _multi_check_schema() -> DataFrameSchema:
    """Build a schema whose column ``a`` carries two positivity checks."""
    positivity_checks = [
        Check.isin([0, 1]),
        Check(lambda x: x >= 0),
    ]
    return DataFrameSchema({"a": Column(int, positivity_checks)})
def hypothesis_accident_probability(feature, increases=True):
    """Column pair: a binary feature and its hypothesis-checked SHAP column.

    The SHAP column is validated with a two-sample t-test grouped by the
    feature; `increases` selects the direction of the expected relationship.
    """
    if increases:
        relationship = "greater_than"
    else:
        relationship = "less_than"
    shap_hypothesis = Hypothesis.two_sample_ttest(
        sample1=1,
        sample2=0,
        groupby=feature,
        relationship=relationship,
        alpha=0.01,
    )
    columns = {feature: Column(checks=Check.isin([1, 0]))}
    columns[f"{feature}_shap"] = Column(pa.Float, checks=shap_hypothesis)
    return columns
def test_unpickling(self, int_dataframe: pd.DataFrame, n_tile: int):
    """Tests content validity of unpickled SchemaError."""
    tiled_values = np.tile(int_dataframe["a"].to_numpy(), n_tile)
    df = pd.DataFrame({"a": tiled_values})
    schema = DataFrameSchema({"a": Column(int, Check.isin([0, 1]))})
    restored = None
    try:
        # fails for element -1
        schema.validate(df)
    except SchemaError as caught:
        restored = cast(SchemaError, pickle.loads(pickle.dumps(caught)))
    else:
        pytest.fail("SchemaError not raised")
    assert restored is not None
    self._validate_error(df, n_tile, restored)
class TestSchemaErrors:
    """Tests pickling behavior of errors.SchemaErrors."""

    @staticmethod
    @pytest.mark.parametrize(
        "schema",
        [
            # column with two checks
            DataFrameSchema(
                {
                    "a": Column(
                        int,
                        [Check.isin([0, 1]), Check(lambda x: x >= 0)],
                    ),
                }
            ),
            # column with a single check
            DataFrameSchema({"a": Column(int, Check.isin([0, 1]))}),
        ],
    )
    def test_pickling(int_dataframe: pd.DataFrame, schema: DataFrameSchema):
        """Test for a non-empty pickled object."""
        try:
            schema.validate(int_dataframe, lazy=True)
        except SchemaErrors as caught:
            # a non-empty bytes object proves the exception is picklable
            assert pickle.dumps(caught)
        else:
            pytest.fail("SchemaErrors not raised")

    def test_unpickling(
        self, int_dataframe: pd.DataFrame, multi_check_schema: DataFrameSchema
    ):
        """Tests content validity of unpickled SchemaErrors."""
        try:
            multi_check_schema.validate(int_dataframe, lazy=True)
        except SchemaErrors as caught:
            roundtripped = pickle.loads(pickle.dumps(caught))
            assert roundtripped is not None
            self._compare_exception_with_unpickled(caught, roundtripped)
        else:
            pytest.fail("SchemaErrors not raised")

    @staticmethod
    def _compare_exception_with_unpickled(
        exc_native: SchemaErrors, exc_unpickled: SchemaErrors
    ):
        """Compare content of native SchemaErrors with unpickled one."""
        assert isinstance(exc_native, SchemaErrors)
        assert isinstance(exc_unpickled, SchemaErrors)

        # messages must survive the round trip
        assert str(exc_unpickled) == str(exc_native)

        # schema_errors is a nested container whose elements compare by
        # identity, so compare string representations instead
        assert str(exc_unpickled.schema_errors) == str(
            exc_native.schema_errors
        )
        assert exc_unpickled.error_counts == exc_native.error_counts

        # unpickling stringifies the dataframe-valued attributes
        assert exc_unpickled.failure_cases == str(exc_native.failure_cases)
        assert exc_unpickled.data == str(exc_native.data)
class TestSchemaError:
    """Tests pickling behavior of errors.SchemaError."""

    @staticmethod
    @pytest.mark.parametrize(
        "check_obj", [Check.isin([0, 1]), Check(lambda x: x >= 0)]
    )
    def test_pickling(int_dataframe: pd.DataFrame, check_obj: Check):
        """Test for a non-empty pickled object."""
        schema = DataFrameSchema({"a": Column(int, check_obj)})
        try:
            # fails for element -1
            schema.validate(int_dataframe)
        except SchemaError as caught:
            # must be non-empty byte-array
            assert pickle.dumps(caught)
        else:
            pytest.fail("SchemaError not raised")

    @pytest.mark.parametrize("n_tile", [1, 10000])
    def test_unpickling(self, int_dataframe: pd.DataFrame, n_tile: int):
        """Tests content validity of unpickled SchemaError."""
        tiled_values = np.tile(int_dataframe["a"].to_numpy(), n_tile)
        df = pd.DataFrame({"a": tiled_values})
        schema = DataFrameSchema({"a": Column(int, Check.isin([0, 1]))})
        restored = None
        try:
            # fails for element -1
            schema.validate(df)
        except SchemaError as caught:
            restored = cast(SchemaError, pickle.loads(pickle.dumps(caught)))
        else:
            pytest.fail("SchemaError not raised")
        assert restored is not None
        self._validate_error(df, n_tile, restored)

    @staticmethod
    def _validate_error(df: pd.DataFrame, n_tile: int, exc: SchemaError):
        """General validation of Exception content."""
        assert exc is not None
        expected_fragment = (
            "Schema Column(name=a, type=DataType(int64))> "
            "failed element-wise validator 0"
        )
        assert expected_fragment in str(exc)
        assert exc.schema == "<Schema Column(name=a, type=DataType(int64))>"
        # unpickling stringifies dataframe-valued attributes
        assert exc.data == str(df)

        # `failure_cases` is limited to 10 by `n_failure_cases` of `Check`
        expected_failures = pd.DataFrame(
            {
                "index": np.arange(n_tile) * 3,
                "failure_case": np.full(n_tile, fill_value=-1, dtype=int),
            }
        )
        assert exc.failure_cases == str(expected_failures.head(10))

        assert exc.check == "<Check isin: isin({0, 1})>"
        assert exc.check_index == 0
        expected_output = pd.Series(
            np.tile([False, True, True], n_tile), name="a"
        )
        assert exc.check_output == str(expected_output)
def test_lazy_dataframe_validation_error():
    """Test exceptions on lazy dataframe validation."""
    schema = DataFrameSchema(
        columns={
            "int_col": Column(Int, Check.greater_than(5)),
            "int_col2": Column(Int),
            "float_col": Column(Float, Check.less_than(0)),
            "str_col": Column(String, Check.isin(["foo", "bar"])),
            "not_in_dataframe": Column(Int),
        },
        checks=Check(
            lambda df: df != 1,
            error="dataframe_not_equal_1",
            ignore_na=False,
        ),
        index=Index(String, name="str_index"),
        strict=True,
    )

    dataframe = pd.DataFrame(
        data={
            "int_col": [1, 2, 6],
            "int_col2": ["a", "b", "c"],
            "float_col": [1.0, -2.0, 3.0],
            "str_col": ["foo", "b", "c"],
            "unknown_col": [None, None, None],
        },
        index=pd.Index(["index0", "index1", "index2"], name="str_index"),
    )

    # schema object context -> {check name -> expected failure cases}
    expectation = {
        "DataFrameSchema": {
            "column_in_schema": ["unknown_col"],
            "dataframe_not_equal_1": [1],
            "column_in_dataframe": ["not_in_dataframe"],
        },
        "Column": {
            "greater_than(5)": [1, 2],
            "pandas_dtype('int64')": ["object"],
            "less_than(0)": [1, 3],
        },
    }

    with pytest.raises(
        errors.SchemaErrors, match="^A total of .+ schema errors were found"
    ):
        schema.validate(dataframe, lazy=True)

    try:
        schema.validate(dataframe, lazy=True)
    except errors.SchemaErrors as err:
        # data in the caught exception should be equal to the dataframe
        # passed into validate
        assert err.data.equals(dataframe)

        # make sure all expected check errors are in schema errors
        for schema_context, check_failure_cases in expectation.items():
            context_mask = err.failure_cases.schema_context == schema_context
            err_df = err.failure_cases.loc[context_mask]
            for check, failure_cases in check_failure_cases.items():
                assert check in err_df.check.values
                matched = err_df.loc[err_df.check == check].failure_case
                assert matched.isin(failure_cases).all()
Check.less_than(3)], name="column", ), pd.DataFrame({"column": [1, 2, 3]}), { "data": pd.DataFrame({"column": [1, 2, 3]}), "schema_errors": { "Column": { "greater_than(1)": [1], "less_than(3)": [3] }, }, }, ], [ Index(String, checks=Check.isin(["a", "b", "c"])), pd.DataFrame({"col": [1, 2, 3]}, index=["a", "b", "d"]), { # expect that the data in the SchemaError is the pd.Index cast # into a Series "data": pd.Series(["a", "b", "d"]), "schema_errors": { "Index": { f"isin({set(['a', 'b', 'c'])})": ["d"] }, }, }, ], [ MultiIndex(indexes=[ Index(Int, checks=Check.greater_than(0), name="index0"),
"native_american_alaskan",
    "race_unspecified",
]

# Allowed values for the `cause_of_death` column validated below.
causes_of_death = [
    'asphyxiated_restrained',
    'beaten_bludgeoned_with_instrument',
    'burned_smoke_inhalation',
    'chemical_agent_pepper_spray',
    'drowned',
    'drug_overdose',
    'fell_from_a_height',
    'gunshot',
    'medical_emergency',
    'other',
    'stabbed',
    'tasered',
    'undetermined',
    'unknown',
    'vehicle'
]

# %%
# Schema for the training data: nullable feature columns whose categorical
# values are restricted to the lists above, plus a non-nullable boolean
# target column.
training_data_schema = pa.DataFrameSchema(
    {
        # feature columns
        "age": Column(pa.Float, Check.in_range(0, 120), nullable=True),
        "gender": Column(pa.String, Check.isin(genders), nullable=True),
        "race": Column(pa.String, Check.isin(races), nullable=True),
        "cause_of_death": Column(pa.String, Check.isin(causes_of_death), nullable=True),
        "symptoms_of_mental_illness": Column(pa.Bool, nullable=True),
        # target column
        "disposition_accidental": Column(pa.Bool, nullable=False),
    },
    coerce=True  # <- coerce columns to the specified type
)

# %% [markdown] slideshow={"slide_type": "subslide"}
# #### Serialize schema to yaml format:

# %%
print(training_data_schema.to_yaml())