def test_multi_index_columns() -> None:
    """Validate a DataFrame whose columns are keyed by two-level tuples.

    Each schema key is a ``(level0, level1)`` tuple; the frame built below
    uses the same keys, and validation is expected to return a DataFrame.
    """
    frame = pd.DataFrame(
        {
            ("zero", "foo"): [0.1, 0.2, 0.7, 0.3],
            ("zero", "bar"): ["a", "b", "c", "d"],
            ("one", "foo"): [1, 6, 4, 7],
            ("one", "bar"): pd.to_datetime(["2019/01/01"] * 4),
        }
    )

    # One Column per (level0, level1) pair, each with its own dtype/check.
    zero_foo = Column(Float, Check(lambda s: (s > 0) & (s < 1)))
    zero_bar = Column(String, Check(lambda s: s.isin(["a", "b", "c", "d"])))
    one_foo = Column(Int, Check(lambda s: (s > 0) & (s < 10)))
    one_bar = Column(DateTime, Check(lambda s: s == pd.Timestamp(2019, 1, 1)))
    schema = DataFrameSchema(
        {
            ("zero", "foo"): zero_foo,
            ("zero", "bar"): zero_bar,
            ("one", "foo"): one_foo,
            ("one", "bar"): one_bar,
        }
    )

    assert isinstance(schema.validate(frame), pd.DataFrame)
def test_column_regex_multiindex():
    """Test that column regex works on multi-index columns.

    A tuple regex name is validated against a two-level column index, and
    then against two column layouts for which ``IndexError`` is expected.
    """
    regex_name = ("foo_*", "baz_*")
    column_schema = Column(
        Int, Check(lambda s: s >= 0), name=regex_name, regex=True,
    )
    dataframe_schema = DataFrameSchema(
        {regex_name: Column(Int, Check(lambda s: s >= 0), regex=True)}
    )
    schemas = (column_schema, dataframe_schema)

    data = pd.DataFrame({
        ("foo_1", "biz_1"): range(10),
        ("foo_2", "baz_1"): range(10, 20),
        ("foo_3", "baz_2"): range(20, 30),
        ("bar_1", "biz_2"): range(10),
        ("bar_2", "biz_3"): range(10, 20),
        ("bar_3", "biz_3"): range(20, 30),
    })
    for schema in schemas:
        assert isinstance(schema.validate(data), pd.DataFrame)

    # A two-element tuple name cannot be applied to a flat column index or
    # to a three-level MultiIndex; both layouts are expected to raise.
    flat_columns = ["foo_%s" % i for i in range(6)]
    three_level_columns = pd.MultiIndex.from_tuples([
        ("foo_%s" % i, "bar_%s" % i, "baz_%s" % i) for i in range(6)
    ])
    for columns in (flat_columns, three_level_columns):
        data.columns = columns
        for schema in schemas:
            with pytest.raises(IndexError):
                schema.validate(data)
def test_multi_index_index():
    """Validate a DataFrame against a schema with a two-level MultiIndex.

    The passing frame has ``index0`` values 0..4 and ``index1`` values drawn
    from ``{"foo", "bar"}``; the failing frame swaps in ``-1`` for the first
    ``index0`` value and is expected to raise ``SchemaError``.
    """
    level1_labels = ["foo", "bar", "foo", "bar", "foo"]
    values = [0.1, 0.5, 123.1, 10.6, 22.31]

    index_schema = MultiIndex(indexes=[
        Index(Int, Check(lambda s: (s < 5) & (s >= 0)), name="index0"),
        Index(String, Check(lambda s: s.isin(["foo", "bar"])), name="index1"),
    ])
    schema = DataFrameSchema(
        columns={
            "column1": Column(Float, Check(lambda s: s > 0)),
            "column2": Column(Float, Check(lambda s: s > 0)),
        },
        index=index_schema,
    )

    df = pd.DataFrame(
        data={"column1": list(values), "column2": list(values)},
        index=pd.MultiIndex.from_arrays(
            [[0, 1, 2, 3, 4], level1_labels],
            names=["index0", "index1"],
        ),
    )
    assert isinstance(schema.validate(df), pd.DataFrame)

    # failure case: -1 violates the ``s >= 0`` check on index0
    df_fail = df.copy()
    df_fail.index = pd.MultiIndex.from_arrays(
        [[-1, 1, 2, 3, 4], level1_labels],
        names=["index0", "index1"],
    )
    with pytest.raises(errors.SchemaError):
        schema.validate(df_fail)
def test_column_regex():
    """Test that column regex works on a single-level column index.

    The same string pattern is used through a standalone ``Column`` and
    through a ``DataFrameSchema``; afterwards the frame's columns are
    replaced by a MultiIndex, for which ``IndexError`` is expected.
    """
    pattern = "foo_*"
    column_schema = Column(
        Int, Check(lambda s: s >= 0), name=pattern, regex=True)
    dataframe_schema = DataFrameSchema(
        {pattern: Column(Int, Check(lambda s: s >= 0), regex=True)}
    )
    schemas = (column_schema, dataframe_schema)

    data = pd.DataFrame({
        "foo_1": range(10),
        "foo_2": range(10, 20),
        "foo_3": range(20, 30),
        "bar_1": range(10),
        "bar_2": range(10, 20),
        "bar_3": range(20, 30),
    })
    for schema in schemas:
        assert isinstance(schema.validate(data), pd.DataFrame)

    # Raise an error on multi-index column case
    data.columns = pd.MultiIndex.from_tuples(
        (
            ("foo_1", "biz_1"),
            ("foo_2", "baz_1"),
            ("foo_3", "baz_2"),
            ("bar_1", "biz_2"),
            ("bar_2", "biz_3"),
            ("bar_3", "biz_3"),
        )
    )
    for schema in schemas:
        with pytest.raises(IndexError):
            schema.validate(data)
def test_check_equality_operators():
    """Test ``==`` / ``!=`` between a Check, its deep copy and another Check."""
    groupby_check = Check(
        lambda g: g["foo"]["col1"].iat[0] == 1, groupby="col3")
    unrelated_check = Check(lambda x: x.isna().sum() == 0)

    # A deep copy must compare equal to the check it was copied from ...
    duplicate = copy.deepcopy(groupby_check)
    assert groupby_check == duplicate
    # ... while a check built from a different function must not.
    assert groupby_check != unrelated_check
def test_coerce_dtype_in_dataframe():
    """Tests coercions of datatypes, especially regarding nullable integers.

    ``coerce`` is exercised both per column and at the schema level; both
    schemas must yield the same coerced dtypes. A float column containing
    a null that is coerced to ``Int`` is expected to raise ``ValueError``.
    """
    df = pd.DataFrame({
        "column1": [10.0, 20.0, 30.0],
        "column2": ["2018-01-01", "2018-02-01", "2018-03-01"],
        "column3": [1, 2, None],
        "column4": [1., 1., np.nan],
    })
    # specify `coerce` at the Column level
    schema1 = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0), coerce=True),
        "column2": Column(DateTime, coerce=True),
        "column3": Column(String, coerce=True, nullable=True),
    })
    # specify `coerce` at the DataFrameSchema level
    schema2 = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0)),
        "column2": Column(DateTime),
        "column3": Column(String, nullable=True),
    }, coerce=True)

    for schema in [schema1, schema2]:
        result = schema.validate(df)
        assert result.column1.dtype == Int.value
        assert result.column2.dtype == DateTime.value
        # ``Series.iteritems`` no longer exists in pandas >= 2.0 and the
        # index labels were never used here, so iterate the values directly.
        for value in result.column3:
            assert pd.isna(value) or isinstance(value, str)

    # make sure that correct error is raised when null values are present
    # in a float column that's coerced to an int
    schema = DataFrameSchema({"column4": Column(Int, coerce=True)})
    with pytest.raises(ValueError):
        schema.validate(df)
def test_check_groupby_multiple_columns():
    """Test a Check on ``col1`` that is grouped by both ``col2`` and ``col3``.

    The check function indexes the groups with the ``("bar", True)`` key
    and sums the ``col1`` values belonging to that group.
    """
    grouped_sum_check = Check(
        lambda s: s[("bar", True)].sum() == 16,  # 7 + 9
        groupby=["col2", "col3"])
    schema = DataFrameSchema({
        "col1": Column(Int, [grouped_sum_check]),
        "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
        "col3": Column(Bool),
    })

    df_pass = pd.DataFrame({
        "col1": [7, 8, 9, 11, 12, 13],
        "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
        "col3": [True, False, True, False, True, False],
    })

    validated = schema.validate(df_pass)
    assert isinstance(validated, pd.DataFrame)
    assert len(validated.columns) == 3
    assert set(validated.columns) == {"col1", "col2", "col3"}
def test_equality_operators_functional_equivalence():
    """Test ``==`` for Checks whose callables share the same implementation.

    The two lambdas differ only in the name of their parameter.
    """
    first = Check(lambda g: g["foo"]["col1"].iat[0] == 1, groupby="col3")
    second = Check(lambda h: h["foo"]["col1"].iat[0] == 1, groupby="col3")
    assert first == second
def test_check_groupby_multiple_columns():
    """Tests groupby with several grouping columns.

    ``col1`` is checked per ``(col2, col3)`` group: the values belonging to
    the ``("bar", True)`` group must sum to 16.
    """
    frame = pd.DataFrame({
        "col1": [7, 8, 9, 11, 12, 13],
        "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
        "col3": [True, False, True, False, True, False],
    })

    col1 = Column(
        Int,
        [
            Check(
                lambda s: s[("bar", True)].sum() == 16,  # 7 + 9
                groupby=["col2", "col3"]),
        ])
    col2 = Column(String, Check(lambda s: s.isin(["foo", "bar"])))
    col3 = Column(Bool)
    schema = DataFrameSchema({"col1": col1, "col2": col2, "col3": col3})

    result = schema.validate(frame)
    assert isinstance(result, pd.DataFrame)
    assert len(result.columns) == 3
    assert set(result.columns) == {"col1", "col2", "col3"}
def test_check_groupby():
    """Tests groupby-based Checks that tie one column to a single other column.

    ``groupby`` is given as a column name, as a one-element list, and as a
    callable returning a groupby object. Frames that break a check, or that
    lack the groupby column, are expected to raise ``SchemaError``.
    """

    def make_frame(col1_values, with_col2=True):
        # Every frame in this test shares the same "data_id" index and, when
        # present, the same col2 labels; only col1 varies.
        data = {"col1": col1_values}
        if with_col2:
            data["col2"] = ["bar", "bar", "bar", "foo", "foo", "foo"]
        return pd.DataFrame(
            data=data,
            index=pd.Series([1, 2, 3, 4, 5, 6], name="data_id"),
        )

    schema = DataFrameSchema(
        columns={
            "col1": Column(Int, [
                Check(lambda s: s["foo"] > 10, groupby="col2"),
                Check(lambda s: s["bar"] < 10, groupby=["col2"]),
                Check(lambda s: s["foo"] > 10,
                      groupby=lambda df: df.groupby("col2")),
                Check(lambda s: s["bar"] < 10,
                      groupby=lambda df: df.groupby("col2"))
            ]),
            "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
        },
        index=Index(Int, name="data_id"),
    )

    validated = schema.validate(make_frame([7, 8, 9, 11, 12, 13]))
    assert isinstance(validated, pd.DataFrame)
    assert len(validated.columns) == 2
    assert set(validated.columns) == {"col1", "col2"}

    failing_frames = [
        # a "bar" value (20) breaks the ``< 10`` checks
        make_frame([7, 8, 20, 11, 12, 13]),
        # a "foo" value (1) breaks the ``> 10`` checks
        make_frame([7, 8, 9, 11, 1, 13]),
        # the groupby column itself is missing
        make_frame([7, 8, 20, 11, 12, 13], with_col2=False),
    ]
    for frame in failing_frames:
        with pytest.raises(errors.SchemaError):
            schema.validate(frame)
def test_series_schema():
    """Tests that a SeriesSchema Check behaves as expected for integers and
    strings.

    Also tests the error cases: values failing the check or dtype,
    non-Series input, duplicates, a mismatching series name, and the
    handling of nulls for float and integer schemas.
    """
    int_schema = SeriesSchema(
        Int, Check(lambda x: 0 <= x <= 100, element_wise=True))
    assert isinstance(
        int_schema.validate(pd.Series([0, 30, 50, 100])), pd.Series)

    str_schema = SeriesSchema(
        String, Check(lambda s: s.isin(["foo", "bar", "baz"])),
        nullable=True, coerce=True)
    assert isinstance(
        str_schema.validate(pd.Series(["foo", "bar", "baz", None])),
        pd.Series)
    assert isinstance(
        str_schema.validate(pd.Series(["foo", "bar", "baz", np.nan])),
        pd.Series)

    # error cases
    for data in [-1, 101, 50.1, "foo"]:
        with pytest.raises(errors.SchemaError):
            int_schema.validate(pd.Series([data]))

    # Objects that are not a pd.Series at all. The loop variable is what
    # must be validated here (previously the TypeError class itself was
    # passed, so the listed values were never exercised).
    for data in [-1, {"a": 1}, -1.0]:
        with pytest.raises(TypeError):
            int_schema.validate(data)

    non_duplicate_schema = SeriesSchema(Int, allow_duplicates=False)
    with pytest.raises(errors.SchemaError):
        non_duplicate_schema.validate(pd.Series([0, 1, 2, 3, 4, 1]))

    # when series name doesn't match schema
    named_schema = SeriesSchema(Int, name="my_series")
    with pytest.raises(
            errors.SchemaError, match=r"^Expected .+ to have name"):
        named_schema.validate(pd.Series(range(5), name="your_series"))

    # when series floats are declared to be integer
    with pytest.raises(
            errors.SchemaError,
            match=r"^after dropping null values, expected values in series"):
        SeriesSchema(Int, nullable=True).validate(
            pd.Series([1.1, 2.3, 5.5, np.nan]))

    # when series contains null values when schema is not nullable
    with pytest.raises(
            errors.SchemaError,
            match=r"^non-nullable series .+ contains null values"):
        SeriesSchema(Float, nullable=False).validate(
            pd.Series([1.1, 2.3, 5.5, np.nan]))

    # when series contains null values when schema is not nullable in addition
    # to having the wrong data type
    with pytest.raises(
            errors.SchemaError,
            match=(r"^expected series '.+' to have type .+, got .+ and "
                   "non-nullable series contains null values")):
        SeriesSchema(Int, nullable=False).validate(
            pd.Series([1.1, 2.3, 5.5, np.nan]))
def init_schema_element_wise():
    """Build a schema whose ``col1`` Check combines element_wise and groupby.

    The schema object is constructed and then discarded; nothing is
    returned.
    NOTE(review): presumably the caller asserts that this construction
    raises -- confirm at the call site.
    """
    element_wise_groupby_check = Check(
        lambda s: s["foo"] > 10, element_wise=True, groupby=["col2"])
    DataFrameSchema({
        "col1": Column(Int, [element_wise_groupby_check]),
        "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
    })
def __init__(self, config):
    """Copy the required settings from ``config`` and create the Checks.

    ``config`` is indexed with ``raw_path``, ``raw_data``, ``random_state``,
    ``sample_size`` and ``columns`` (in that order); each value is stored
    on the instance under the same name.
    """
    for key in ('raw_path', 'raw_data', 'random_state',
                'sample_size', 'columns'):
        setattr(self, key, config[key])

    # Define checks
    self.check_ge_min = Check(lambda s: s >= 0)
    # NOTE(review): the upper bound is max(s) of the very input being
    # checked -- confirm this is the intended limit.
    self.check_le_max = Check(lambda s: s <= max(s))
def test_add_and_remove_columns():
    """Check that ``add_columns`` / ``remove_columns`` return new schemas.

    Neither call may modify the schema it is invoked on, and each result
    must equal an independently constructed schema with the expected
    columns.
    """

    # Factories so that every schema below is built from fresh Column and
    # Check objects rather than sharing instances.
    def col1():
        return Column(Int, Check(lambda s: s >= 0))

    def col2():
        return Column(String, Check(lambda x: x <= 0))

    def col3():
        return Column(Object, Check(lambda x: x == 0))

    schema1 = DataFrameSchema({"col1": col1()}, strict=True)
    schema1_snapshot = copy.deepcopy(schema1)

    # add_columns must leave schema1 untouched ...
    schema2 = schema1.add_columns({"col2": col2(), "col3": col3()})
    schema2_snapshot = copy.deepcopy(schema2)
    assert schema1 == schema1_snapshot

    # ... and return a schema that contains all three columns.
    expected_schema_2 = DataFrameSchema(
        {"col1": col1(), "col2": col2(), "col3": col3()},
        strict=True,
    )
    assert schema2 == expected_schema_2

    # remove_columns must leave schema2 untouched ...
    schema3 = schema2.remove_columns(["col2"])
    assert schema2 == schema2_snapshot

    # ... and return a schema without the removed column.
    expected_schema_3 = DataFrameSchema(
        {"col1": col1(), "col3": col3()},
        strict=True,
    )
    assert schema3 == expected_schema_3

    # removing two columns at once leads back to the starting schema
    schema4 = schema2.remove_columns(["col2", "col3"])
    expected_schema_4 = DataFrameSchema({"col1": col1()}, strict=True)
    assert schema4 == expected_schema_4
    assert expected_schema_4 == schema1
def test_series_schema_multiple_validators():
    """Test a SeriesSchema that carries more than one Check.

    The first Check is applied per element, the second to the whole
    series; validation must fail as soon as either one fails.
    """
    schema = SeriesSchema(PandasDtype.Int, [
        # ``0 <= x <= 50`` is a scalar comparison, so state element_wise
        # explicitly instead of depending on the default of the keyword.
        Check(lambda x: 0 <= x <= 50, element_wise=True),
        Check(lambda s: (s == 21).any(), element_wise=False)
    ])
    validated_series = schema.validate(pd.Series([1, 5, 21, 50]))
    assert isinstance(validated_series, pd.Series)

    # raise error if any of the validators fails
    with pytest.raises(SchemaError):
        schema.validate(pd.Series([1, 5, 20, 50]))
def test_raise_warning_series():
    """Test that a Check with ``raise_warning=True`` warns instead of raising.

    The same failing data is passed to two schemas that differ only in the
    ``raise_warning`` flag of their Check.
    """
    negatives = pd.Series([-1, -2, -3])

    strict_check = Check(lambda s: s > 0)
    lenient_check = Check(lambda s: s > 0, raise_warning=True)
    error_schema = SeriesSchema(checks=strict_check)
    warning_schema = SeriesSchema(checks=lenient_check)

    with pytest.raises(errors.SchemaError):
        error_schema(negatives)

    with pytest.warns(UserWarning):
        warning_schema(negatives)
def test_index_schema():
    """Test index validation for a schema that declares no columns.

    The index must consist of integers in ``[1, 11]`` with a mean above 1;
    an index running up to 19 is expected to raise ``SchemaError``.
    """
    index_checks = [
        Check(lambda x: 1 <= x <= 11, element_wise=True),
        Check(lambda index: index.mean() > 1)
    ]
    schema = DataFrameSchema(columns={}, index=Index(Int, index_checks))

    valid_frame = pd.DataFrame(index=range(1, 11), dtype="int64")
    assert isinstance(schema.validate(valid_frame), pd.DataFrame)

    with pytest.raises(errors.SchemaError):
        schema.validate(pd.DataFrame(index=range(1, 20)))
def test_index_schema():
    """Tests that a DataFrameSchema with only an Index validates and errors
    appropriately.

    A frame indexed 1..10 must pass; a frame indexed 1..19 must raise
    ``SchemaError``.
    """
    bounded_elements = Check(lambda x: 1 <= x <= 11, element_wise=True)
    mean_above_one = Check(lambda index: index.mean() > 1)
    schema = DataFrameSchema(
        index=Index(Int, [bounded_elements, mean_above_one]))

    in_range = pd.DataFrame(index=range(1, 11), dtype="int64")
    assert isinstance(schema.validate(in_range), pd.DataFrame)

    with pytest.raises(errors.SchemaError):
        schema.validate(pd.DataFrame(index=range(1, 20)))
def test_check_groups():
    """Tests uses of groupby together with groups (values within columns).

    ``groups`` is passed both as a list and as a bare string. Three
    further schemas reference a group key that is unavailable in one way
    or another; each is expected to raise ``KeyError``.
    """

    def build_schema(*col1_checks):
        # Only the checks on col1 vary between the schemas of this test.
        return DataFrameSchema({
            "col1": Column(Int, list(col1_checks)),
            "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
        })

    df = pd.DataFrame({
        "col1": [7, 8, 9, 11, 12, 13],
        "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
    })

    schema = build_schema(
        Check(lambda s: s["foo"] > 10, groupby="col2", groups=["foo"]),
        Check(lambda s: s["foo"] > 10, groupby="col2", groups="foo"),
    )
    validated_df = schema.validate(df)
    assert isinstance(validated_df, pd.DataFrame)
    assert len(validated_df.columns) == 2
    assert set(validated_df.columns) == {"col1", "col2"}

    # the check function asks for "bar" although groups only selects "foo"
    schema_fail_key_error = build_schema(
        Check(lambda s: s["bar"] > 10, groupby="col2", groups="foo"),
    )
    with pytest.raises(KeyError, match="^'bar'"):
        schema_fail_key_error.validate(df)

    # the check function asks for "baz", which is not a value of col2
    schema_fail_nonexistent_key_in_fn = build_schema(
        Check(lambda s: s["baz"] > 10, groupby="col2", groups=["foo"]),
    )
    with pytest.raises(KeyError, match="^'baz'"):
        schema_fail_nonexistent_key_in_fn.validate(df)

    # the groups argument itself names "baz", which is not a value of col2
    schema_fail_nonexistent_key_in_groups = build_schema(
        Check(lambda s: s["foo"] > 10, groupby="col2", groups=["baz"]),
    )
    with pytest.raises(KeyError):
        schema_fail_nonexistent_key_in_groups.validate(df)
def test_series_schema_multiple_validators():
    """Tests a SeriesSchema with several Checks, for success and failure.

    One Check bounds every element to ``[0, 50]``; the other requires that
    the value 21 occurs somewhere in the series.
    """
    within_bounds = Check(lambda x: 0 <= x <= 50, element_wise=True)
    contains_21 = Check(lambda s: (s == 21).any())
    schema = SeriesSchema(Int, [within_bounds, contains_21])

    result = schema.validate(pd.Series([1, 5, 21, 50]))
    assert isinstance(result, pd.Series)

    # without the value 21 the second Check fails, so validation must raise
    with pytest.raises(errors.SchemaError):
        schema.validate(pd.Series([1, 5, 20, 50]))
def test_dataframe_schema_check_function_types(check_function, should_fail):
    """Apply ``check_function`` to an Int and a Float column.

    When ``should_fail`` is truthy, validation must raise ``SchemaError``;
    otherwise it must complete without raising.
    """
    int_column = Column(Int, Check(fn=check_function, element_wise=False))
    float_column = Column(Float, Check(fn=check_function, element_wise=False))
    schema = DataFrameSchema({"a": int_column, "b": float_column})

    frame = pd.DataFrame({"a": [1, 2, 3], "b": [1.1, 2.5, 9.9]})

    if not should_fail:
        schema.validate(frame)
        return
    with pytest.raises(errors.SchemaError):
        schema.validate(frame)
def setup(self):
    """Create the schema and the series stored on this instance.

    ``self.schema`` is a non-nullable String SeriesSchema named
    ``"my_series"`` with three Checks (prefix, suffix, per-element
    length); ``self.series`` is a series of the same name.
    """
    string_checks = [
        Check(lambda s: s.str.startswith("foo")),
        Check(lambda s: s.str.endswith("bar")),
        Check(lambda x: len(x) > 3, element_wise=True)
    ]
    self.schema = SeriesSchema(
        String,
        checks=string_checks,
        nullable=False,
        allow_duplicates=True,
        name="my_series")
    self.series = pd.Series(["foobar"] * 3, name="my_series")
def test_dataframe_schema():
    """Tests a DataFrameSchema covering a wide variety of column types.

    Covered cases: the frame validates, a required column is dropped, a
    column's values violate its Check, and a column's values change its
    dtype.
    """
    column_specs = {
        "a": Column(Int, Check(lambda x: x > 0, element_wise=True)),
        "b": Column(Float, Check(lambda x: 0 <= x <= 10, element_wise=True)),
        "c": Column(String, Check(lambda x: set(x) == {"x", "y", "z"})),
        "d": Column(Bool, Check(lambda x: x.mean() > 0.5)),
        "e": Column(Category, Check(lambda x: set(x) == {"c1", "c2", "c3"})),
        "f": Column(Object, Check(lambda x: x.isin([(1, ), (2, ), (3, )]))),
        "g": Column(
            DateTime,
            Check(lambda x: x >= pd.Timestamp("2015-01-01"),
                  element_wise=True)),
        "i": Column(
            Timedelta,
            Check(lambda x: x < pd.Timedelta(10, unit="D"),
                  element_wise=True))
    }
    schema = DataFrameSchema(column_specs)

    df = pd.DataFrame({
        "a": [1, 2, 3],
        "b": [1.1, 2.5, 9.9],
        "c": ["z", "y", "x"],
        "d": [True, True, False],
        "e": pd.Series(["c2", "c1", "c3"], dtype="category"),
        "f": [(3, ), (2, ), (1, )],
        "g": [
            pd.Timestamp("2015-02-01"),
            pd.Timestamp("2015-02-02"),
            pd.Timestamp("2015-02-03")
        ],
        "i": [
            pd.Timedelta(1, unit="D"),
            pd.Timedelta(5, unit="D"),
            pd.Timedelta(9, unit="D")
        ]
    })
    assert isinstance(schema.validate(df), pd.DataFrame)

    invalid_frames = [
        # column "a" is missing
        df.drop("a", axis=1),
        # column "a" violates its ``x > 0`` Check
        df.assign(a=[-1, -2, -1]),
        # column "a" holds floats although the schema declares Int
        df.assign(a=[1.7, 2.3, 3.1]),
    ]
    for invalid in invalid_frames:
        with pytest.raises(errors.SchemaError):
            schema.validate(invalid)
def test_dataframe_schema_check():
    """Test that DataFrameSchema-level Checks work properly.

    Three check functions with differently shaped results are each wrapped
    in their own schema and run against the same 10x10 frame.
    """
    data = pd.DataFrame([range(10) for _ in range(10)])

    check_functions = (
        lambda df: (df < 10).all(),  # frame comparison reduced with .all()
        lambda df: df[0] < 10,       # comparison on the single column 0
        lambda df: df < 10,          # comparison on the whole frame
    )
    for check_fn in check_functions:
        schema = DataFrameSchema(checks=Check(check_fn))
        assert isinstance(schema.validate(data), pd.DataFrame)
def test_dataframe_schema_check_function_types(check_function, should_fail):
    """Tests a DataFrameSchema against a variety of Check conditions.

    ``check_function`` is used for both columns; ``should_fail`` tells
    whether validation is expected to raise ``SchemaError``.
    """
    df = pd.DataFrame({"a": [1, 2, 3], "b": [1.1, 2.5, 9.9]})
    schema = DataFrameSchema({
        name: Column(dtype, Check(check_function, element_wise=False))
        for name, dtype in (("a", Int), ("b", Float))
    })

    if not should_fail:
        schema.validate(df)
        return
    with pytest.raises(errors.SchemaError):
        schema.validate(df)
def _validate_score_table(variant_information_table: DataFrame, score_table: DataFrame):
    """
    Validate the results of the prioritization method.

    Two required columns of ``score_table`` are checked:

    * ``UID``: an integer column that is compared, in both directions,
      against the ``UID`` column of ``variant_information_table``
    * ``SCORE``: coerced to float

    Parameters
    ----------
    variant_information_table :
        The variant information table
    score_table :
        The scoring results from the prioritization method

    Raises
    ------
    :class:`~pandera.errors.SchemaErrors`
        If the validation of the data fails
    """
    variants_uid = variant_information_table["UID"]

    uid_column = Column(
        Int,
        Check(lambda x: variants_uid.isin(x) & x.isin(variants_uid)),
        required=True)
    score_column = Column(Float, coerce=True, required=True)

    # lazy=True is passed so that validation runs in pandera's lazy mode.
    DataFrameSchema({"UID": uid_column, "SCORE": score_column}).validate(
        score_table, lazy=True)
def init_schema_no_groupby_column():
    """Build a schema whose Check groups by a column the schema lacks.

    ``col1`` is grouped by ``col2``, but only ``col1`` is declared. The
    schema object is constructed and discarded; nothing is returned.
    NOTE(review): presumably the caller asserts that this construction
    raises -- confirm at the call site.
    """
    groupby_missing_column = Check(lambda s: s["foo"] > 10, groupby=["col2"])
    DataFrameSchema({"col1": Column(Int, [groupby_missing_column])})
def test_column_regex_matching(column_name_regex, expected_matches, error):
    """
    Column regex pattern matching should yield correct matches and raise
    expected errors.

    When ``error`` is None the matched columns are compared with
    ``expected_matches``; otherwise ``get_regex_columns`` must raise
    ``error``.
    """
    column_schema = Column(
        Int, Check(lambda s: s >= 0), name=column_name_regex, regex=True,
    )
    columns = pd.MultiIndex.from_tuples((
        ("foo_1", "biz_1"),
        ("foo_2", "baz_1"),
        ("foo_3", "baz_2"),
        ("bar_1", "biz_2"),
        ("bar_2", "biz_3"),
        ("bar_3", "biz_3"),
    ))

    if error is None:
        matched_columns = column_schema.get_regex_columns(columns)
        assert expected_matches == matched_columns.tolist()
        return
    with pytest.raises(error):
        column_schema.get_regex_columns(columns)
def test_column_in_dataframe_schema():
    """Test that validating with a single-Column schema returns a dataframe."""
    positive_ints = Column(Int, Check(lambda x: x > 0, element_wise=True))
    schema = DataFrameSchema({"a": positive_ints})

    validated = schema.validate(pd.DataFrame({"a": [1, 2, 3]}))
    assert isinstance(validated, pd.DataFrame)
def test_datetime():
    """Test datetime types can be validated properly by schema.validate.

    The Check requires the earliest timestamp of the column to lie after
    ``pd.Timestamp("2015")``; a frame holding a 2010 date must raise
    ``SchemaError``.
    """
    after_2015 = Check(lambda s: s.min() > pd.Timestamp("2015"))
    schema = DataFrameSchema(
        columns={"col": Column(pa.DateTime, checks=after_2015)}
    )

    recent = pd.DataFrame(
        {"col": pd.to_datetime(["2019/01/01", "2018/05/21", "2016/03/10"])}
    )
    assert isinstance(schema.validate(recent), pd.DataFrame)

    too_old = pd.DataFrame({"col": pd.to_datetime(["2010/01/01"])})
    with pytest.raises(SchemaError):
        schema.validate(too_old)