def test_column_regex():
    """Check regex-named columns against a single-level column index.

    A standalone ``Column`` and a ``DataFrameSchema`` are both declared with
    the ``"foo_*"`` pattern and ``regex=True``. Each must validate a frame
    with flat column labels, and each must raise ``IndexError`` once the
    frame's columns are replaced by a two-level ``MultiIndex``.
    """
    regex_column = Column(
        Int, Check(lambda s: s >= 0), name="foo_*", regex=True
    )
    regex_schema = DataFrameSchema(
        {"foo_*": Column(Int, Check(lambda s: s >= 0), regex=True)}
    )
    schemas = (regex_column, regex_schema)

    frame = pd.DataFrame(
        {
            "foo_1": range(10),
            "foo_2": range(10, 20),
            "foo_3": range(20, 30),
            "bar_1": range(10),
            "bar_2": range(10, 20),
            "bar_3": range(20, 30),
        }
    )

    # Flat column index: both schema flavours hand back a DataFrame.
    for schema in schemas:
        assert isinstance(schema.validate(frame), pd.DataFrame)

    # Multi-level column index: a plain string pattern must be rejected.
    two_level_labels = (
        ("foo_1", "biz_1"),
        ("foo_2", "baz_1"),
        ("foo_3", "baz_2"),
        ("bar_1", "biz_2"),
        ("bar_2", "biz_3"),
        ("bar_3", "biz_3"),
    )
    frame.columns = pd.MultiIndex.from_tuples(two_level_labels)
    for schema in schemas:
        with pytest.raises(IndexError):
            schema.validate(frame)
def test_column_regex_multiindex():
    """Check tuple-of-regex column names against a multi-index column.

    A standalone ``Column`` and a ``DataFrameSchema`` are both declared with
    the tuple pattern ``("foo_*", "baz_*")`` and ``regex=True``. Each must
    validate a frame whose columns are two-level tuples, and each must raise
    ``IndexError`` when the frame's columns are swapped for a flat list of
    labels or for a three-level ``MultiIndex``.
    """
    regex_column = Column(
        Int,
        Check(lambda s: s >= 0),
        name=("foo_*", "baz_*"),
        regex=True,
    )
    regex_schema = DataFrameSchema(
        {
            ("foo_*", "baz_*"): Column(
                Int, Check(lambda s: s >= 0), regex=True
            ),
        }
    )
    schemas = (regex_column, regex_schema)

    frame = pd.DataFrame(
        {
            ("foo_1", "biz_1"): range(10),
            ("foo_2", "baz_1"): range(10, 20),
            ("foo_3", "baz_2"): range(20, 30),
            ("bar_1", "biz_2"): range(10),
            ("bar_2", "biz_3"): range(10, 20),
            ("bar_3", "biz_3"): range(20, 30),
        }
    )

    # Two-level column index: both schema flavours hand back a DataFrame.
    for schema in schemas:
        assert isinstance(schema.validate(frame), pd.DataFrame)

    # A two-element tuple pattern must be rejected when the column index is
    # flat, and also when it has three levels instead of two.
    flat_labels = [f"foo_{i}" for i in range(6)]
    three_level_labels = pd.MultiIndex.from_tuples(
        [(f"foo_{i}", f"bar_{i}", f"baz_{i}") for i in range(6)]
    )
    for bad_columns in (flat_labels, three_level_labels):
        frame.columns = bad_columns
        for schema in schemas:
            with pytest.raises(IndexError):
                schema.validate(frame)
def test_coerce_nullable_object_column():
    """Test that Object dtype coercing preserves object types.

    The input column mixes an int, a float, a list and a dict, followed by
    two null entries (``np.nan`` and ``None``). After validating with a
    nullable, coercing ``Object`` column, the two trailing entries must still
    be null and each of the first four values must still be an instance of
    its original type.
    """
    df_objects_with_na = pd.DataFrame(
        {"col": [1, 2.0, [1, 2, 3], {"a": 1}, np.nan, None]}
    )

    column_schema = Column(Object, name="col", coerce=True, nullable=True)

    validated_df = column_schema.validate(df_objects_with_na)
    assert isinstance(validated_df, pd.DataFrame)

    # The last two rows hold the null values (None and np.nan respectively).
    assert pd.isna(validated_df["col"].iloc[-1])
    assert pd.isna(validated_df["col"].iloc[-2])

    # The first four rows hold the non-null objects. The isinstance result
    # must be asserted; as a bare expression it would be discarded and the
    # type-preservation check would never run.
    for i in range(4):
        assert isinstance(
            validated_df["col"].iloc[i],
            type(df_objects_with_na["col"].iloc[i]),
        )