def test_column_regex_non_str_types() -> None:
    """Check that column name regex matching excludes non-string types."""
    # Frame with a mix of non-string column labels (int, float, Timestamp)
    # and regular string columns that the regex patterns should match.
    frame = pd.DataFrame(
        {
            1: [1, 2, 3],
            2.2: [1, 2, 3],
            pd.Timestamp("2018/01/01"): [1, 2, 3],
            "foo_1": [1, 2, 3],
            "foo_2": [1, 2, 3],
            "foo_3": [1, 2, 3],
        }
    )
    regex_columns = {
        "foo_": Column(Int, Check.gt(0), regex=True),
        r"\d+": Column(Int, Check.gt(0), regex=True),
        r"\d+\.\d+": Column(Int, Check.gt(0), regex=True),
        "2018-01-01": Column(Int, Check.gt(0), regex=True),
    }
    schema = DataFrameSchema(columns=regex_columns)
    assert isinstance(schema.validate(frame), pd.DataFrame)

    # test MultiIndex column case
    multiindex_frame = pd.DataFrame(
        {
            (1, 1): [1, 2, 3],
            (2.2, 4.5): [1, 2, 3],
            ("foo", "bar"): [1, 2, 3],
        }
    )
    multiindex_schema = DataFrameSchema(
        columns={("foo_*", "bar_*"): Column(Int, regex=True)},
    )
    multiindex_schema.validate(multiindex_frame)
def test_series_schema_checks():
    """Test SeriesSchema check property."""
    # Pair each schema with the number of checks it was constructed with:
    # no checks, a single (non-list) check, and a list of two checks.
    schemas_with_counts = [
        (SeriesSchema(), 0),
        (SeriesSchema(checks=Check.eq(0)), 1),
        (SeriesSchema(checks=[Check.gt(0), Check.lt(100)]), 2),
    ]
    # The checks property must always normalize to a list, regardless of
    # whether checks were given as None, a scalar, or a list.
    for schema, expected_count in schemas_with_counts:
        assert isinstance(schema.checks, list)
        assert len(schema.checks) == expected_count
def test_check_io():
    # pylint: disable=too-many-locals
    """Test that check_io correctly validates/invalidates data.

    Exercises the ``check_io`` decorator across input-only validation,
    single and multiple outputs (tuple/dict with positional, key, and
    callable object getters), and the head/tail/sample/lazy/inplace
    validation options.
    """
    schema = DataFrameSchema({"col": Column(Int, Check.gt(0))})

    @check_io(df1=schema, df2=schema, out=schema)
    def simple_func(df1, df2):
        return df1.assign(col=df1["col"] + df2["col"])

    @check_io(df1=schema, df2=schema)
    def simple_func_no_out(df1, df2):
        return df1.assign(col=df1["col"] + df2["col"])

    @check_io(out=(1, schema))
    def output_with_obj_getter(df):
        return None, df

    @check_io(out=[(0, schema), (1, schema)])
    def multiple_outputs_tuple(df):
        return df, df

    @check_io(
        out=[(0, schema), ("foo", schema), (lambda x: x[2]["bar"], schema)]
    )
    def multiple_outputs_dict(df):
        return {0: df, "foo": df, 2: {"bar": df}}

    @check_io(df=schema, out=schema, head=1)
    def validate_head(df):
        return df

    @check_io(df=schema, out=schema, tail=1)
    def validate_tail(df):
        return df

    @check_io(df=schema, out=schema, sample=1, random_state=100)
    def validate_sample(df):
        return df

    @check_io(df=schema, out=schema, lazy=True)
    def validate_lazy(df):
        return df

    @check_io(df=schema, out=schema, inplace=True)
    def validate_inplace(df):
        return df

    df1 = pd.DataFrame({"col": [1, 1, 1]})
    df2 = pd.DataFrame({"col": [2, 2, 2]})
    invalid_df = pd.DataFrame({"col": [-1, -1, -1]})
    expected = pd.DataFrame({"col": [3, 3, 3]})

    for fn, valid, invalid, out in [
        (simple_func, [df1, df2], [invalid_df, invalid_df], expected),
        (simple_func_no_out, [df1, df2], [invalid_df, invalid_df], expected),
        (output_with_obj_getter, [df1], [invalid_df], (None, df1)),
        (multiple_outputs_tuple, [df1], [invalid_df], (df1, df1)),
        (
            multiple_outputs_dict,
            [df1],
            [invalid_df],
            {0: df1, "foo": df1, 2: {"bar": df1}},
        ),
        (validate_head, [df1], [invalid_df], df1),
        (validate_tail, [df1], [invalid_df], df1),
        (validate_sample, [df1], [invalid_df], df1),
        (validate_lazy, [df1], [invalid_df], df1),
        (validate_inplace, [df1], [invalid_df], df1),
    ]:
        result = fn(*valid)
        # BUGFIX: the second branch was a bare `if`, so a Series result would
        # also fall into the final `else` and `assert result == out` would
        # raise ValueError (ambiguous Series truth value). Use `elif` so each
        # result type takes exactly one comparison path.
        if isinstance(result, pd.Series):
            assert (result == out).all()
        elif isinstance(result, pd.DataFrame):
            assert (result == out).all(axis=None)
        else:
            # Tuples/dicts of the same DataFrame objects compare True via
            # the identity fast path in container equality.
            assert result == out

        # lazy validation collects all failures into SchemaErrors (plural).
        expected_error = (
            errors.SchemaErrors if fn is validate_lazy else errors.SchemaError
        )
        with pytest.raises(expected_error):
            fn(*invalid)

    # invalid out schema types
    for out_schema in [1, 5.0, "foo", {"foo": "bar"}, ["foo"]]:

        @check_io(out=out_schema)
        def invalid_out_schema_type(df):
            return df

        with pytest.raises((TypeError, ValueError)):
            invalid_out_schema_type(df1)