def sample_dataframe_schema(**kwargs):
    """Build a sample DataFrameSchema with column- and dataframe-level checks.

    Any extra keyword arguments are forwarded verbatim to the
    ``DataFrameSchema`` constructor.
    """
    # Custom element check for column "c": value must split into exactly
    # two underscore-separated words.
    two_word_check = pa.Check(
        lambda s: s.str.split("_", expand=True).shape[1] == 2,
        description="Two words separated by underscore",
    )

    columns = {
        "a": pa.Column(int, checks=pa.Check.le(10), description="a desc"),
        "b": pa.Column(float, checks=pa.Check.lt(-1.2), description="b desc"),
        "c": pa.Column(
            str,
            description="c desc",
            checks=[pa.Check.str_startswith("value_"), two_word_check],
        ),
    }

    # Dataframe-wide invariant relating columns "a" and "b".
    dataframe_checks = [
        pa.Check(
            lambda df: df["a"].sum() > df["b"].sum(),
            description="sum(a) > sum(b)",
        ),
    ]

    return pa.DataFrameSchema(columns, checks=dataframe_checks, **kwargs)
def test_to_script_lambda_check():
    """Test writing DataFrameSchema to a script with lambda check."""
    # Case 1: lambda check attached to a column — serialization should warn.
    column_schema = pandera.DataFrameSchema(
        {
            "a": pandera.Column(
                pandera.Int,
                checks=pandera.Check(
                    lambda s: s.mean() > 5, element_wise=False
                ),
            ),
        }
    )
    with pytest.warns(UserWarning):
        pandera.io.to_script(column_schema)

    # Case 2: lambda check attached at the dataframe level — same
    # expectation, but assert on the warning message as well.
    dataframe_schema = pandera.DataFrameSchema(
        {
            "a": pandera.Column(pandera.Int),
        },
        checks=pandera.Check(lambda s: s.mean() > 5, element_wise=False),
    )
    with pytest.warns(UserWarning, match=".*registered checks.*"):
        pandera.io.to_script(dataframe_schema)
def test_csv_download(self):
    """Download the source CSV and validate it against the expected schema.

    Row index 1000 must survive validation (i.e. the dataset is non-trivially
    large and that row is schema-conformant).
    """
    df = data_import.download_csv()
    schema_csv_download = pa.DataFrameSchema({
        'name': pa.Column(pa.String),
        'energy,calculated (kJ)': pa.Column(
            pa.Int,
            # Element-wise bound check. The error message previously said
            # "[0, 2000]" while the check enforces [0, 4000]; the message
            # now matches the actual bounds.
            pa.Check(lambda x: 0 <= x <= 4000,
                     element_wise=True,
                     error="kJ range checker [0, 4000]")),
        'fat, total (g)': pa.Column(pa.String),
        'carbohydrate, available (g)': pa.Column(pa.String),
        'protein, total (g)': pa.Column(pa.String),
        # 'fibre, total (g)': pa.Column(),  # can have NaN values
        'sugars, total (g)': pa.Column(pa.String),
        'alcohol (g)': pa.Column(pa.String),
        # 'sodium (mg)': pa.Column(),  # can have NaN values
        'salt (mg)': pa.Column(pa.String),
    })
    df_valid = schema_csv_download.validate(df)
    self.assertTrue(1000 in df_valid.index)
def test_to_yaml_lambda_check():
    """Test writing DataFrameSchema to a yaml with lambda check."""
    # An unregistered lambda check cannot be serialized, so to_yaml
    # is expected to emit a UserWarning rather than round-trip it.
    lambda_check = pa.Check(lambda s: s.mean() > 5, element_wise=False)
    schema = pa.DataFrameSchema(
        {"a": pa.Column(pa.Int, checks=lambda_check)}
    )
    with pytest.warns(UserWarning):
        pa.io.to_yaml(schema)
def test_to_yaml_custom_dataframe_check():
    """Test that serializing a schema with an unregistered check warns."""
    # Dataframe-level lambda check that is not a registered check method.
    unregistered_check = pa.Check(lambda obj: len(obj.index) > 1)
    schema = pa.DataFrameSchema(
        {"a": pa.Column(pa.Int)},
        checks=[unregistered_check],
    )
    with pytest.warns(UserWarning, match=".*registered checks.*"):
        pa.io.to_yaml(schema)
def test_infer_series_schema(series):
    """Test series schema is correctly inferred."""
    inferred = schema_inference.infer_series_schema(series)
    assert isinstance(inferred, pa.SeriesSchema)

    # Validating an as-yet-unmodified inferred schema emits a warning.
    with pytest.warns(
            UserWarning,
            match="^This .+ is an inferred schema that hasn't been modified"):
        inferred.validate(series)

    # Modifying an inferred schema should set _is_inferred to False on the
    # returned copy while leaving the original flagged as inferred.
    modified = inferred.set_checks([pa.Check(lambda x: x is not None)])
    assert inferred._is_inferred
    assert not modified._is_inferred
    assert isinstance(modified.validate(series), pd.Series)
def prioritized_values_check(named_priorities: Dict[str, int], separator: str, name: str) -> pa.checks.Check:
    """
    Construct check for e.g. data source and scale columns.

    Both have fixed values and fixed order of the values.
    """
    def _check_value(value):
        # Delegate per-value validation to the shared schema check,
        # closing over the configured priorities and separator.
        return schema_checks.named_priority_check(
            value,
            named_priorities=named_priorities,
            separator=separator,
        )

    return pa.Check(_check_value, element_wise=True, name=name)
def test_custom_checks(custom_check_teardown):
    """Test that custom checks can be executed."""

    @extensions.register_check_method(statistics=["value"])
    def modin_eq(modin_obj, *, value):
        return modin_obj == value

    # One schema using an inline lambda check, one using the freshly
    # registered check method — both must behave the same.
    inline_schema = pa.DataFrameSchema(
        {"field": pa.Column(checks=pa.Check(lambda s: s == 0, name="custom"))}
    )
    registered_schema = pa.DataFrameSchema(
        {"field": pa.Column(checks=pa.Check.modin_eq(0))}
    )

    for schema in (inline_schema, registered_schema):
        # All-zero data satisfies the check.
        schema(mpd.DataFrame({"field": [0] * 100}))
        try:
            schema(mpd.DataFrame({"field": [-1] * 100}))
        except pa.errors.SchemaError as err:
            # Every failure case should be the offending value -1.
            assert (err.failure_cases["failure_case"] == -1).all()
) example = data.draw(strat) if nullable: assert example.isna().any(axis=None) else: assert example.notna().all(axis=None) @pytest.mark.parametrize( "schema, warning", [ [ pa.SeriesSchema( pa.Int, checks=[ pa.Check(lambda x: x > 0, element_wise=True), pa.Check(lambda x: x > -10, element_wise=True), ], ), "Element-wise", ], [ pa.SeriesSchema( pa.Int, checks=[ pa.Check(lambda s: s > -10000), pa.Check(lambda s: s > -9999), ], ), "Vectorized", ],
import titanic.datalake as datalake from titanic.config import Config, parse_args schema = pa.DataFrameSchema( { "PassengerId": pa.Column(int), "Survived": pa.Column(int, checks=pa.Check.isin([0, 1])), "Pclass": pa.Column(int, checks=pa.Check.isin([0, 1, 2, 3])), "Name": pa.Column(str), "Sex": pa.Column(str, checks=pa.Check(lambda s: s.isin(["male", "female"]))), "Age": pa.Column(float, checks=pa.Check.less_than(100, ignore_na=True), nullable=True), "SibSp": pa.Column(int), "Parch": pa.Column(int), "Ticket": pa.Column(str), "Fare": pa.Column(float), "Cabin": pa.Column(str, nullable=True), "Embarked":
def traces_schema(metadata: rules.Metadata) -> pa.DataFrameSchema:
    """
    Get pandera schema for traces GeoDataFrame.
    """
    # Column specs keyed by the shared *_COLUMN constants. Most columns use
    # default_non_required_kwargs(); validation-critical ones override
    # nullable=False so missing values fail validation.
    trace_columns: Dict[str, pa.Column] = {
        VALIDATION_ERROR_COLUMN: pa.Column(
            pa.String, **default_non_required_kwargs()),
        # Dip angle constrained to [0, 90] degrees.
        DIP_COLUMN: pa.Column(
            pa.Float,
            **default_non_required_kwargs(),
            checks=[pa.checks.Check.in_range(min_value=0.0, max_value=90.0)],
        ),
        # Dip direction constrained to [0, 360] degrees.
        DIP_DIR_COLUMN: pa.Column(
            pa.Float,
            **default_non_required_kwargs(),
            checks=[pa.checks.Check.in_range(min_value=0.0, max_value=360.0)],
        ),
        # Data source values must match the configured value/priority order.
        DATA_SOURCE_COLUMN: pa.Column(
            pa.String,
            **default_non_required_kwargs(nullable=False),
            checks=[
                prioritized_values_check(
                    named_priorities=metadata.data_source.order,
                    separator=metadata.data_source.separator,
                    name=f"Value and priority order check for {DATA_SOURCE_COLUMN}.",
                )
            ],
        ),
        # Dates validated element-wise by the shared datetime check.
        DATE_COLUMN: pa.Column(
            pa.DateTime,
            **default_non_required_kwargs(nullable=False),
            checks=[
                pa.Check(schema_checks.date_datetime_check, element_wise=True)
            ],
        ),
        # Operator must be one of the operators declared in metadata.
        OPERATOR_COLUMN: pa.Column(
            pa.String,
            **default_non_required_kwargs(nullable=False),
            checks=[pa.Check.isin(metadata.operators)],
        ),
        # Scale values share the same value/priority-order validation as
        # the data source column, with scale-specific configuration.
        SCALE_COLUMN: pa.Column(
            pa.String,
            **default_non_required_kwargs(nullable=False),
            checks=[
                prioritized_values_check(
                    named_priorities=metadata.scale.order,
                    separator=metadata.scale.separator,
                    name=f"Value and priority order check for {SCALE_COLUMN}.",
                )
            ],
        ),
        # Certainty restricted to the allowed set from metadata.
        CERTAINTY_COLUMN: pa.Column(
            pa.String,
            **default_non_required_kwargs(nullable=False),
            checks=[pa.Check.isin(metadata.certainty)],
        ),
        # Lineament ids: element-wise prefix check, and must be unique
        # (allow_duplicates=False).
        LINEAMENT_ID_COLUMN: pa.Column(
            pa.String,
            **default_non_required_kwargs(nullable=False),
            checks=[
                pa.Check(
                    lambda raw_value: schema_checks.lineament_id_check(
                        raw_value=raw_value,
                        lineament_id_prefixes=metadata.lineament_id_prefixes,
                    ),
                    element_wise=True,
                    name=f"{LINEAMENT_ID_COLUMN} check.",
                )
            ],
            allow_duplicates=False,
        ),
    }
    # The geometry column is required but otherwise unconstrained here.
    return pa.DataFrameSchema(
        index=pa.Index(pa.Int),
        columns={
            "geometry": pa.Column(required=True, ),
            **trace_columns,
        },
    )
import pandera as pa
import pandas as pd

# Load the raw user records from the adjacent JSON file.
data = pd.read_json('../user.json')
print(data)

# Defining the schema.
schema = pa.DataFrameSchema({
    # NOTE(fix): the original was missing the comma after this entry,
    # which was a SyntaxError.
    "email": pa.Column(pa.String, nullable=False),
    "books": pa.Column(pa.String, nullable=True),
    # NOTE(fix): the original used `pa.Check(len<=120)`, which compares
    # the builtin `len` function to an int (TypeError). The intent —
    # titles of at most 120 characters — is expressed element-wise.
    "title": pa.Column(
        pa.String,
        pa.Check(lambda title: len(title) <= 120, element_wise=True),
        nullable=True,
    ),
    "isbn": pa.Column(pa.String, nullable=False),
})

# Validating the data.
# NOTE(fix): the original called schema.validate(data_sample), but no
# `data_sample` was ever defined — the loaded frame is named `data`.
schema.validate(data)
def test_add_necessary_columns(self):
    """add_necessary_columns must derive category and kcal fields that
    satisfy the expected schema and values."""
    raw = {
        'name': [
            'Tämä on Ruuan kategoria, ja tämä tuotemerkki Beef, ja tää on detail'
        ],
        'energy,calculated (kJ)': [123],
        'fat, total (g)': ['3.4'],
        'carbohydrate, available (g)': ['58.8'],
        'protein, total (g)': ['<0.1'],
        'sugars, total (g)': ['1.3'],
        'fibre, total (g)': ['11.5'],
        'alcohol (g)': ['0.0'],
        'sodium (mg)': ['87.8'],
        'salt (mg)': ['470.1'],
        'lactose (g)': ['2.1'],
    }
    df_test = pd.DataFrame(raw)
    df = data_import.add_necessary_columns(df_test)

    # All derived numeric columns share the same non-negative Float contract,
    # so build them in a loop instead of repeating the spec twelve times.
    columns = {
        'category': pa.Column(pa.String),
        'extra_category': pa.Column(pa.String),
    }
    for numeric_column in (
            'kcal', 'fat_kcal', 'carb_kcal', 'protein_kcal', 'alc_kcal',
            'sugar', 'fibre', 'alc', 'sodium', 'salt', 'kcal_ratio',
            'lactose'):
        columns[numeric_column] = pa.Column(
            pa.Float, pa.Check(lambda s: s >= 0))
    schema_added_columns = pa.DataFrameSchema(columns)

    df_valid = schema_added_columns.validate(df)
    self.assertEqual(df_valid['kcal'][0],
                     df_test['energy,calculated (kJ)'][0] / 4.184)
    self.assertEqual(df_valid['salt'][0], 470.1)
    self.assertEqual(df_valid['category'][0], 'Tämä on Ruuan kategoria')
    self.assertEqual(df_valid['extra_category'][0], 'beef')