def _create_schema_python_types():
    return pa.DataFrameSchema(
        {
            "int_column": pa.Column(int),
            "float_column": pa.Column(float),
            "str_column": pa.Column(str),
            "object_column": pa.Column(object),
        }
    )

def test_unique():
    """Test uniqueness checks on modin dataframes."""
    schema = pa.DataFrameSchema({"field": pa.Column(int)}, unique=["field"])
    column_schema = pa.Column(int, unique=True, name="field")
    series_schema = pa.SeriesSchema(int, unique=True, name="field")

    data_unique = mpd.DataFrame({"field": [1, 2, 3]})
    data_non_unique = mpd.DataFrame({"field": [1, 1, 1]})

    assert isinstance(schema(data_unique), mpd.DataFrame)
    assert isinstance(column_schema(data_unique), mpd.DataFrame)
    assert isinstance(series_schema(data_unique["field"]), mpd.Series)

    with pytest.raises(pa.errors.SchemaError, match="columns .+ not unique"):
        schema(data_non_unique)
    with pytest.raises(
        pa.errors.SchemaError, match="series .+ contains duplicate values"
    ):
        column_schema(data_non_unique)
    with pytest.raises(
        pa.errors.SchemaError, match="series .+ contains duplicate values"
    ):
        series_schema(data_non_unique["field"])

    schema.unique = None
    column_schema.unique = False
    series_schema.unique = False

    assert isinstance(schema(data_non_unique), mpd.DataFrame)
    assert isinstance(column_schema(data_non_unique), mpd.DataFrame)
    assert isinstance(series_schema(data_non_unique["field"]), mpd.Series)

def create_invoice_stats_schema(
    coerce: bool = True, strict: bool = True, nullable: bool = True
):
    """Create the invoice stats schema, which also performs value checks at
    runtime.

    Args:
        coerce (bool): Whether to coerce series to the specified type
        strict (bool): Whether to reject columns in the dataframe that are
            not declared in the schema
        nullable (bool): Whether columns should be nullable

    Returns:
        A pandas DataFrame schema that validates that the types are correct
    """
    return pa.DataFrameSchema(
        {
            INVOICE_STATS_COLUMN_NAMES.get("invoice_median"): pa.Column(
                pa.Float64, nullable=nullable
            ),
            INVOICE_STATS_COLUMN_NAMES.get("invoice_mean"): pa.Column(
                pa.Float64, nullable=nullable
            ),
        },
        index=pa.Index(pa.Int),
        strict=strict,
        coerce=coerce,
    )

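# Hedged usage sketch (not from the source repo): validating a stats frame
# against the schema above. Assumes INVOICE_STATS_COLUMN_NAMES maps
# "invoice_median"/"invoice_mean" to the real column names and that pandas
# is imported as pd; the sample values are hypothetical.
def _example_validate_invoice_stats() -> pd.DataFrame:
    schema = create_invoice_stats_schema(coerce=True, strict=True)
    stats_df = pd.DataFrame(
        {
            INVOICE_STATS_COLUMN_NAMES.get("invoice_median"): [10.0],
            INVOICE_STATS_COLUMN_NAMES.get("invoice_mean"): [12.5],
        }
    )
    # validate() raises pa.errors.SchemaError on a type mismatch or, with
    # strict=True, on any column not declared in the schema.
    return schema.validate(stats_df)
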
def sample_dataframe_schema(**kwargs):
    return pa.DataFrameSchema(
        {
            "a": pa.Column(int, checks=pa.Check.le(10), description="a desc"),
            "b": pa.Column(float, checks=pa.Check.lt(-1.2), description="b desc"),
            "c": pa.Column(
                str,
                description="c desc",
                checks=[
                    pa.Check.str_startswith("value_"),
                    pa.Check(
                        lambda s: s.str.split("_", expand=True).shape[1] == 2,
                        description="Two words separated by underscore",
                    ),
                ],
            ),
        },
        checks=[
            pa.Check(
                lambda df: df["a"].sum() > df["b"].sum(),
                description="sum(a) > sum(b)",
            ),
        ],
        **kwargs,
    )

def create_event_schema(
    coerce: bool = True,
    strict: bool = True,
    nullable: bool = False,
):
    """Create the event schema, which also performs value checks at runtime.
    If validation fails, write the event to the dead letter queue.

    Args:
        coerce (bool): Whether to coerce series to the specified type
        strict (bool): Whether to reject columns in the dataframe that are
            not declared in the schema
        nullable (bool): Whether columns should be nullable

    Returns:
        A pandas DataFrame schema that validates that the types and the
        inserted values are correct.
    """
    return pa.DataFrameSchema(
        {
            "id": pa.Column(pa.String, nullable=nullable),
            "timestamp": pa.Column(pa.DateTime, nullable=nullable),
            "version": pa.Column(pa.String, nullable=nullable),
        },
        index=pa.Index(pa.Int),
        strict=strict,
        coerce=coerce,
    )

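# Hedged sketch (not from the source): one way the event schema could gate a
# dead letter queue, per the docstring above. `send_to_dead_letter_queue` is
# a hypothetical helper.
def _example_handle_event(event_df: pd.DataFrame) -> None:
    schema = create_event_schema()
    try:
        schema.validate(event_df)
    except pa.errors.SchemaError:
        # invalid events go to the DLQ instead of crashing the pipeline
        send_to_dead_letter_queue(event_df)
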
def test_to_script_lambda_check():
    """Test writing DataFrameSchema to a script with lambda check."""
    schema1 = pandera.DataFrameSchema(
        {
            "a": pandera.Column(
                pandera.Int,
                checks=pandera.Check(lambda s: s.mean() > 5, element_wise=False),
            ),
        }
    )

    with pytest.warns(UserWarning):
        pandera.io.to_script(schema1)

    schema2 = pandera.DataFrameSchema(
        {
            "a": pandera.Column(pandera.Int),
        },
        checks=pandera.Check(lambda s: s.mean() > 5, element_wise=False),
    )

    with pytest.warns(UserWarning, match=".*registered checks.*"):
        pandera.io.to_script(schema2)

def test_serialize_deserialize_custom_datetime_checks():
    """
    Test that custom checks for datetime columns can be serialized and
    deserialized.
    """

    # pylint: disable=unused-variable,unused-argument
    @pandera.extensions.register_check_method(statistics=["stat"])
    def datetime_check(pandas_obj, *, stat):
        ...

    schema = pandera.DataFrameSchema(
        {
            "dt_col": pandera.Column(
                pandera.DateTime,
                checks=pandera.Check.datetime_check("foobar"),
            ),
            "td_col": pandera.Column(
                pandera.Timedelta,
                checks=pandera.Check.datetime_check("foobar"),
            ),
        }
    )

    yaml_schema = schema.to_yaml()
    schema_from_yaml = schema.from_yaml(yaml_schema)
    assert schema_from_yaml == schema

def test_register_custom_groupby_check(custom_check_teardown: None) -> None:
    """Test registering a custom groupby check."""

    @extensions.register_check_method(
        statistics=["group_a", "group_b"],
        supported_types=(pd.Series, pd.DataFrame),
        check_type="groupby",
    )
    def custom_check(dict_groups, *, group_a, group_b):
        """
        Test that the mean values in group A are larger than those in group B.

        Note that this function can handle groups of both dataframes and
        series.
        """
        return (
            dict_groups[group_a].values.mean()
            > dict_groups[group_b].values.mean()
        )

    # column groupby check
    data_column_check = pd.DataFrame(
        {
            "col1": [20, 20, 10, 10],
            "col2": list("aabb"),
        }
    )

    schema_column_check = pa.DataFrameSchema(
        {
            "col1": pa.Column(
                int,
                Check.custom_check(group_a="a", group_b="b", groupby="col2"),
            ),
            "col2": pa.Column(str),
        }
    )
    assert isinstance(schema_column_check(data_column_check), pd.DataFrame)

    # dataframe groupby check
    data_df_check = pd.DataFrame(
        {
            "col1": [20, 20, 10, 10],
            "col2": [30, 30, 5, 5],
            "col3": [10, 10, 1, 1],
        },
        index=pd.Index(list("aabb"), name="my_index"),
    )
    schema_df_check = pa.DataFrameSchema(
        columns={
            "col1": pa.Column(int),
            "col2": pa.Column(int),
            "col3": pa.Column(int),
        },
        index=pa.Index(str, name="my_index"),
        checks=Check.custom_check(group_a="a", group_b="b", groupby="my_index"),
    )
    assert isinstance(schema_df_check(data_df_check), pd.DataFrame)

    for kwargs in [{"element_wise": True}, {"element_wise": False}]:
        with pytest.warns(UserWarning):
            Check.custom_check(val=10, **kwargs)

def _create_schema(index="single"):
    if index == "multi":
        index = pa.MultiIndex(
            [
                pa.Index(pa.Int, name="int_index0"),
                pa.Index(pa.Int, name="int_index1"),
                pa.Index(pa.Int, name="int_index2"),
            ]
        )
    elif index == "single":
        index = pa.Index(pa.Int, name="int_index")
    else:
        index = None

    return pa.DataFrameSchema(
        columns={
            "int_column": pa.Column(
                pa.Int,
                checks=[
                    pa.Check.greater_than(0),
                    pa.Check.less_than(10),
                    pa.Check.in_range(0, 10),
                ],
            ),
            "float_column": pa.Column(
                pa.Float,
                checks=[
                    pa.Check.greater_than(-10),
                    pa.Check.less_than(20),
                    pa.Check.in_range(-10, 20),
                ],
            ),
            "str_column": pa.Column(
                pa.String,
                checks=[
                    pa.Check.isin(["foo", "bar", "x", "xy"]),
                    pa.Check.str_length(1, 3),
                ],
            ),
            "datetime_column": pa.Column(
                pa.DateTime,
                checks=[
                    pa.Check.greater_than(pd.Timestamp("20100101")),
                    pa.Check.less_than(pd.Timestamp("20200101")),
                ],
            ),
            "timedelta_column": pa.Column(
                pa.Timedelta,
                checks=[
                    pa.Check.greater_than(pd.Timedelta(1000, unit="ns")),
                    pa.Check.less_than(pd.Timedelta(10000, unit="ns")),
                ],
            ),
        },
        index=index,
        coerce=False,
        strict=True,
    )

def test_schema_selector(df, attrs, expected):
    schema = pa.DataFrameSchema(
        {
            "a": pa.Column(int, regex=True, nullable=False),
            "b": pa.Column(int, required=False, nullable=True),
        }
    )
    df = schema.validate(df)
    selector = SchemaSelector(**attrs)
    assert_col_indexer(df, selector, expected)

def validate_movement_annotations(mvmt_tbl: pd.DataFrame) -> pd.DataFrame:
    mvmt_tbl_schema = pa.DataFrameSchema(
        {
            "animal": pa.Column(pa.Int, pa.Check.greater_than_or_equal_to(0)),
            "timepoint": pa.Column(pa.Int, pa.Check.greater_than_or_equal_to(0)),
            "pair": pa.Column(pa.Int, pa.Check.greater_than_or_equal_to(0)),
            "mvmt-*": pa.Column(pa.Int, regex=True),
        },
        strict=True,
    )
    return mvmt_tbl_schema.validate(mvmt_tbl)

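# Hedged sketch (column names and values are hypothetical): the regex column
# "mvmt-*" lets the schema above accept any number of movement columns, e.g.
# "mvmt-head" and "mvmt-tail", while strict=True still rejects any column
# that matches no schema entry.
def _example_movement_table() -> pd.DataFrame:
    mvmt_tbl = pd.DataFrame(
        {
            "animal": [0, 1],
            "timepoint": [0, 0],
            "pair": [0, 1],
            "mvmt-head": [1, 0],
            "mvmt-tail": [0, 1],
        }
    )
    return validate_movement_annotations(mvmt_tbl)
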
def test_dataframe_unique(size, data) -> None:
    """Test that DataFrameSchemas with unique columns are actually unique."""
    schema = pa.DataFrameSchema(
        {
            "col1": pa.Column(int),
            "col2": pa.Column(float),
            "col3": pa.Column(str),
            "col4": pa.Column(int),
        },
        unique=["col1", "col2", "col3"],
    )
    df_sample = data.draw(schema.strategy(size=size))
    schema(df_sample)

def test_csv_download(self):
    df = data_import.download_csv()
    schema_csv_download = pa.DataFrameSchema(
        {
            'name': pa.Column(pa.String),
            'energy,calculated (kJ)': pa.Column(
                pa.Int,
                pa.Check(
                    lambda x: 0 <= x <= 4000,
                    element_wise=True,
                    error="kJ range checker [0, 4000]",
                ),
            ),
            'fat, total (g)': pa.Column(pa.String),
            'carbohydrate, available (g)': pa.Column(pa.String),
            'protein, total (g)': pa.Column(pa.String),
            # 'fibre, total (g)': pa.Column(),  # can have NaN values
            'sugars, total (g)': pa.Column(pa.String),
            'alcohol (g)': pa.Column(pa.String),
            # 'sodium (mg)': pa.Column(),  # can have NaN values
            'salt (mg)': pa.Column(pa.String),
        }
    )
    df_valid = schema_csv_download.validate(df)
    self.assertTrue(1000 in df_valid.index)

def test_schema_selector_multi_index(df_mi, attrs, expected):
    schema = pa.DataFrameSchema(
        {
            ("int", "number"): pa.Column(int, nullable=True),
            ("float", "number"): pa.Column(float, nullable=True),
            ("category", "nominal"): pa.Column(str, required=False),
            ("string", "nominal"): pa.Column(str, required=False, nullable=True),
        }
    )
    df = schema.validate(df_mi)
    selector = SchemaSelector(**attrs)
    assert_col_indexer(df, selector, expected)

def test_required_column():
    """Test that a missing required column raises an error."""
    required_schema = pa.DataFrameSchema({"field": pa.Column(int, required=True)})
    schema = pa.DataFrameSchema({"field_": pa.Column(int, required=False)})

    data = mpd.DataFrame({"field": [1, 2, 3]})
    assert isinstance(required_schema(data), mpd.DataFrame)
    assert isinstance(schema(data), mpd.DataFrame)

    with pytest.raises(pa.errors.SchemaError):
        required_schema(mpd.DataFrame({"another_field": [1, 2, 3]}))
    schema(mpd.DataFrame({"another_field": [1, 2, 3]}))

def test_empty_dtype() -> None:
    expected = pa.DataFrameSchema({"empty_column": pa.Column()})

    class EmptyDtypeSchema(pa.SchemaModel):
        empty_column: pa.typing.Series

    assert EmptyDtypeSchema.to_schema() == expected

def test_dataframe_schema_case(coerce):
    """Test a simple schema case."""
    schema = pa.DataFrameSchema(
        {
            "int_column": pa.Column(int, pa.Check.ge(0)),
            "float_column": pa.Column(float, pa.Check.le(0)),
            "str_column": pa.Column(str, pa.Check.isin(list("abcde"))),
        },
        coerce=coerce,
    )
    mdf = mpd.DataFrame(
        {
            "int_column": range(10),
            "float_column": [float(-x) for x in range(10)],
            "str_column": list("aabbcceedd"),
        }
    )
    assert isinstance(schema.validate(mdf), mpd.DataFrame)

def test_strict_schema():
    """Test schema strictness."""
    strict_schema = pa.DataFrameSchema({"field": pa.Column()}, strict=True)
    non_strict_schema = pa.DataFrameSchema({"field": pa.Column()})

    strict_df = mpd.DataFrame({"field": [1]})
    non_strict_df = mpd.DataFrame({"field": [1], "foo": [2]})

    strict_schema(strict_df)
    non_strict_schema(strict_df)

    with pytest.raises(
        pa.errors.SchemaError, match="column 'foo' not in DataFrameSchema"
    ):
        strict_schema(non_strict_df)

    non_strict_schema(non_strict_df)

def test_infer_dataframe_schema(multi_index):
    """Test dataframe schema is correctly inferred."""
    dataframe = _create_dataframe(multi_index=multi_index)
    schema = schema_inference.infer_dataframe_schema(dataframe)
    assert isinstance(schema, pa.DataFrameSchema)

    if multi_index:
        assert isinstance(schema.index, pa.MultiIndex)
    else:
        assert isinstance(schema.index, pa.Index)

    with pytest.warns(
        UserWarning,
        match="^This .+ is an inferred schema that hasn't been modified",
    ):
        schema.validate(dataframe)

    # modifying an inferred schema should set _is_inferred to False
    schema_with_added_cols = schema.add_columns({"foo": pa.Column(pa.String)})
    assert schema._is_inferred
    assert not schema_with_added_cols._is_inferred
    assert isinstance(
        schema_with_added_cols.validate(dataframe.assign(foo="a")), pd.DataFrame
    )

    schema_with_removed_cols = schema.remove_columns(["int"])
    assert schema._is_inferred
    assert not schema_with_removed_cols._is_inferred
    assert isinstance(
        schema_with_removed_cols.validate(dataframe.drop("int", axis=1)),
        pd.DataFrame,
    )

def __init__(self, raw_data: pd.DataFrame):
    # Limit each word's usage to its specific pos.
    schema = pa.DataFrameSchema(
        {
            "word": pa.Column(pa.String),
            "pos": pa.Column(pa.String),
        }
    )
    schema.validate(raw_data)

    self.raw_data = raw_data
    # self.words = words
    # self.pos = pos

    # Use WordNet to map a word to its synonym set.
    self.synset_map = self.get_synset_map(raw_data=raw_data)
    self.synset_map = self.get_synonyms(synset_map=self.synset_map)

    # self.word_df = pd.DataFrame()  # maps word to synonym, def, pos
    self.syn_to_word = defaultdict(list)  # maps a synonym back to its words

def test_dataframe_schema():
    """Test that DataFrameSchema works on GeoDataFrames."""
    geo_df = gpd.GeoDataFrame(
        {
            "geometry": [
                Polygon(((0, 0), (0, 1), (1, 1), (1, 0))),
                Polygon(((0, 0), (0, -1), (-1, -1), (-1, 0))),
            ],
        }
    )

    for geo_schema in [
        pa.DataFrameSchema({"geometry": pa.Column("geometry")}),
        pa.DataFrameSchema({"geometry": pa.Column(gpd.array.GeometryDtype)}),
        pa.DataFrameSchema({"geometry": pa.Column(gpd.array.GeometryDtype())}),
    ]:
        assert isinstance(geo_schema.validate(geo_df), gpd.GeoDataFrame)

def _test_literal_pandas_dtype(
    model: Type[SchemaModel], pandas_dtype: PandasDtype
):
    schema = model.to_schema()
    assert (
        schema.columns["col"].dtype == pa.Column(pandas_dtype, name="col").dtype
    )

def test_config() -> None:
    """Test that Config can be inherited and translate into DataFrameSchema
    options."""

    class Base(pa.SchemaModel):
        a: Series[int]
        idx_1: Index[str]
        idx_2: Index[str]

        class Config:
            name = "Base schema"
            coerce = True
            ordered = True
            multiindex_coerce = True
            multiindex_strict = True
            multiindex_name: Optional[str] = "mi"

    class Child(Base):
        b: Series[int]

        class Config:
            name = "Child schema"
            strict = True
            multiindex_strict = False
            description = "foo"
            title = "bar"

    expected = pa.DataFrameSchema(
        columns={"a": pa.Column(int), "b": pa.Column(int)},
        index=pa.MultiIndex(
            [pa.Index(str, name="idx_1"), pa.Index(str, name="idx_2")],
            coerce=True,
            strict=False,
            name="mi",
        ),
        name="Child schema",
        coerce=True,
        strict=True,
        ordered=True,
        description="foo",
        title="bar",
    )

    assert expected == Child.to_schema()

def test_nullable(
    dtype: pandas_engine.DataType,
    data: st.DataObject,
):
    """Test nullable checks on koalas dataframes."""
    checks = None
    if dtypes.is_datetime(type(dtype)) and MIN_TIMESTAMP is not None:
        checks = [pa.Check.gt(MIN_TIMESTAMP)]
    nullable_schema = pa.DataFrameSchema(
        {"field": pa.Column(dtype, checks=checks, nullable=True)}
    )
    nonnullable_schema = pa.DataFrameSchema(
        {"field": pa.Column(dtype, checks=checks, nullable=False)}
    )
    null_sample = data.draw(nullable_schema.strategy(size=5))
    nonnull_sample = data.draw(nonnullable_schema.strategy(size=5))

    # for some reason values less than MIN_TIMESTAMP are still sampled.
    if dtype is pandas_engine.DateTime or isinstance(dtype, pandas_engine.DateTime):
        if MIN_TIMESTAMP is not None and (null_sample < MIN_TIMESTAMP).any(
            axis=None
        ):
            with pytest.raises(
                OverflowError, match="mktime argument out of range"
            ):
                ks.DataFrame(null_sample)
            return
        if MIN_TIMESTAMP is not None and (nonnull_sample < MIN_TIMESTAMP).any(
            axis=None
        ):
            with pytest.raises(
                OverflowError, match="mktime argument out of range"
            ):
                ks.DataFrame(nonnull_sample)
            return
        # neither sample underflows MIN_TIMESTAMP, so the conversion is safe
        ks_null_sample = ks.DataFrame(null_sample)
    else:
        try:
            ks_null_sample = ks.DataFrame(null_sample)
        except TypeError as exc:
            if "can not accept object <NA> in type" not in exc.args[0]:
                raise
            pytest.skip(
                "koalas cannot handle native pd.NA type with dtype "
                f"{dtype.type}"
            )

    ks_nonnull_sample = ks.DataFrame(nonnull_sample)
    n_nulls = ks_null_sample.isna().sum().item()
    assert ks_nonnull_sample.notna().all().item()
    assert n_nulls >= 0
    if n_nulls > 0:
        with pytest.raises(pa.errors.SchemaError):
            nonnullable_schema(ks_null_sample)

def _create_schema_null_index():
    return pa.DataFrameSchema(
        columns={
            "float_column": pa.Column(
                pa.Float,
                checks=[
                    pa.Check.greater_than(-10),
                    pa.Check.less_than(20),
                    pa.Check.in_range(-10, 20),
                ],
            ),
            "str_column": pa.Column(
                pa.String,
                checks=[
                    pa.Check.isin(["foo", "bar", "x", "xy"]),
                    pa.Check.str_length(1, 3),
                ],
            ),
        },
        index=None,
    )

def test_unsatisfiable_checks():
    """Test that unsatisfiable checks raise an exception."""
    schema = pa.DataFrameSchema(
        columns={"col1": pa.Column(int, checks=[pa.Check.gt(0), pa.Check.lt(0)])}
    )
    for _ in range(5):
        with pytest.raises(hypothesis.errors.Unsatisfiable):
            schema.example(size=10)

def helper_type_validation(dataframe_type, schema_type, debugging=False):
    """Helper for validating a dataframe dtype against a schema dtype, which
    may be the same as or different from the dataframe's."""
    df = pd.DataFrame({"column1": [dataframe_type(1)]})
    if debugging:
        print(dataframe_type, df.column1)
    schema = pa.DataFrameSchema({"column1": pa.Column(schema_type)})
    if debugging:
        print(schema)
    schema(df)

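# Hedged usage sketch (not from the source): matching dtypes validate
# silently, while mismatched dtypes raise a SchemaError.
helper_type_validation(int, int)      # passes
helper_type_validation(float, float)  # passes
# helper_type_validation(int, str)    # would raise pa.errors.SchemaError
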
def test_dtype_coercion(from_dtype, to_dtype, data):
    """Test that datatype coercion provides informative errors."""
    from_schema = pa.DataFrameSchema({"field": pa.Column(from_dtype)})
    to_schema = pa.DataFrameSchema({"field": pa.Column(to_dtype, coerce=True)})

    pd_sample = data.draw(from_schema.strategy(size=3))
    sample = ks.DataFrame(pd_sample)

    if from_dtype is to_dtype:
        assert isinstance(to_schema(sample), ks.DataFrame)
        return

    # strings that can't be interpreted as numbers are converted to NA
    if from_dtype is str and to_dtype in {int, float}:
        with pytest.raises(pa.errors.SchemaError, match="non-nullable series"):
            to_schema(sample)
        return

    assert isinstance(to_schema(sample), ks.DataFrame)

def create_invoice_schema(
    max_invoice_value: Decimal,
    min_invoice_value: Decimal,
    coerce: bool = True,
    strict: bool = True,
    nullable: bool = False,
):
    """Create the invoice schema, which also performs value checks at runtime.

    Args:
        max_invoice_value (Decimal): Given max invoice value
        min_invoice_value (Decimal): Given min invoice value
        coerce (bool): Whether to coerce series to the specified type
        strict (bool): Whether to reject columns in the dataframe that are
            not declared in the schema
        nullable (bool): Whether columns should be nullable

    Returns:
        A pandas DataFrame schema that validates that the types and the
        inserted values are correct. If a row is inserted that does not
        satisfy:

            min_invoice_value <= invoice_value <= max_invoice_value

        (e.g. bounds of 0 and 200000000.00), an error is raised at runtime.
    """
    return pa.DataFrameSchema(
        {
            INVOICE_COLUMN_NAMES.get("invoice_name"): pa.Column(
                pa.String, nullable=nullable
            ),
            INVOICE_COLUMN_NAMES.get("invoice_value"): pa.Column(
                pa.Float64,
                checks=[
                    pa.Check.less_than_or_equal_to(max_invoice_value),
                    pa.Check.greater_than_or_equal_to(min_invoice_value),
                ],
                nullable=nullable,
            ),
        },
        index=pa.Index(pa.Int),
        strict=strict,
        coerce=coerce,
    )

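# Hedged sketch (not from the source; assumes INVOICE_COLUMN_NAMES maps
# "invoice_name"/"invoice_value" to the real column names and that pandas is
# imported as pd; the sample rows are hypothetical).
def _example_invoice_validation() -> None:
    schema = create_invoice_schema(
        max_invoice_value=Decimal("200000000.00"),
        min_invoice_value=Decimal("0"),
    )
    good = pd.DataFrame(
        {
            INVOICE_COLUMN_NAMES.get("invoice_name"): ["acme-001"],
            INVOICE_COLUMN_NAMES.get("invoice_value"): [99.95],
        }
    )
    schema.validate(good)  # passes: types and value range are satisfied

    bad = good.copy()
    bad[INVOICE_COLUMN_NAMES.get("invoice_value")] = -1.0
    # schema.validate(bad)  # would raise pa.errors.SchemaError (below min)
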
def test_inherit_schemamodel_fields_alias():
    """Test that columns and index aliases are inherited."""

    class Base(pa.SchemaModel):
        a: Series[int]
        idx: Index[str]

    class Mid(Base):
        b: Series[str] = pa.Field(alias="_b")
        idx: Index[str]

    class ChildOverrideAttr(Mid):
        b: Series[int]

    class ChildOverrideAlias(Mid):
        b: Series[str] = pa.Field(alias="new_b")

    class ChildNewAttr(Mid):
        c: Series[int]

    class ChildEmpty(Mid):
        pass

    expected_mid = pa.DataFrameSchema(
        columns={"a": pa.Column(int), "_b": pa.Column(str)},
        index=pa.Index(str),
    )
    expected_child_override_attr = expected_mid.rename_columns(
        {"_b": "b"}
    ).update_column("b", pandas_dtype=int)
    expected_child_override_alias = expected_mid.rename_columns({"_b": "new_b"})
    expected_child_new_attr = expected_mid.add_columns({"c": pa.Column(int)})

    assert expected_mid == Mid.to_schema()
    assert expected_child_override_attr == ChildOverrideAttr.to_schema()
    assert expected_child_override_alias == ChildOverrideAlias.to_schema()
    assert expected_child_new_attr == ChildNewAttr.to_schema()
    assert expected_mid == ChildEmpty.to_schema()