def test_non_str_column_name_regex(column_key):
    """A regex column must reject a non-str name, however the name is supplied."""
    regex_col_kwargs = dict(
        checks=Check.greater_than_or_equal_to(0),
        regex=True,
    )

    # Naming the regex column through the schema's column-key must fail.
    with pytest.raises(ValueError):
        DataFrameSchema({column_key: Column(Float, **regex_col_kwargs)})

    # Passing the non-str name directly to the Column must fail the same way.
    with pytest.raises(ValueError):
        Column(Float, name=column_key, **regex_col_kwargs)
def validate_dataframe(dataframe: DataFrame) -> bool:
    """Validate dataframe against schema.

    Returns True when the dataframe conforms; a failed validation is
    logged as a warning and reported as False rather than raised.
    """
    # Year must be plausible: no earlier than 2000, no later than this year.
    year_checks = [
        Check.less_than_or_equal_to(dt.now().year),
        Check.greater_than_or_equal_to(2000),
    ]

    columns = {
        "source": Column(pa.String),
        "topic": Column(pa.String, nullable=True),
        "concept": Column(pa.String, nullable=True),
        "variable": Column(pa.String),
        "label": Column(pa.String),
        "value": Column(pa.Float),
        "denominator_variable": Column(pa.String, nullable=True),
        "denominator_label": Column(pa.String, nullable=True),
        "denominator": Column(pa.Float, nullable=True),
        "year": Column(pa.Int, checks=year_checks),
        "year_date": Column(pa.String, checks=[Check(validate_year_date, element_wise=True)]),
        "geo_id": Column(pa.String),
        "geo_name": Column(pa.String),
        "geo_type": Column(pa.String),
        "location": Column(pa.String, checks=[Check(validate_location, element_wise=True)]),
        "row_id": Column(pa.String, allow_duplicates=False),
    }

    dataframe_checks = [
        # year_date must start with the same four digits as year
        # (element_wise=True: the callable receives one row at a time).
        Check(lambda row: row["year_date"][:4] == row["year"].astype(str), element_wise=True),
        # row_id must concatenate the other identifying fields as expected.
        Check(lambda frame: frame["row_id"] == frame.apply(make_row_id, axis=1)),
    ]

    schema = DataFrameSchema(
        columns,
        strict=True,
        coerce=True,
        checks=dataframe_checks,
    )

    # Validate dataframe against schema
    try:
        schema.validate(dataframe)
    except SchemaError as error:
        logger.warning(f"Failed to validate dataframe: {error.args[0]}")
        return False
    else:
        return True
describe_df_schema = DataFrameSchema( columns={ "geometry": Column( pandas_dtype=PandasDtype.String, checks=None, nullable=False, allow_duplicates=True, coerce=False, required=True, regex=False, ), "X": Column( pandas_dtype=PandasDtype.Int64, checks=[ Check.greater_than_or_equal_to(min_value=0.0), ], nullable=False, allow_duplicates=True, coerce=True, required=True, regex=False, ), "Y": Column( pandas_dtype=PandasDtype.Int64, checks=[ Check.greater_than_or_equal_to(min_value=0.0), ], nullable=False, allow_duplicates=True, coerce=True,
# %% [markdown] slideshow={"slide_type": "slide"} # ### You find yourself at a familiar function, but it looks a little different from when you left it... # %% slideshow={"slide_type": "skip"} # This needs to be here, but skipped for story-telling effect in the slides import pandera as pa from pandera import Column, Check in_schema = pa.DataFrameSchema({ "hours_worked": Column(pa.Float, coerce=True, nullable=True), "wage_per_hour": Column(pa.Float, coerce=True, nullable=True), }) out_schema = ( in_schema .update_column("hours_worked", checks=Check.greater_than_or_equal_to(0)) .add_columns({"weekly_income": Column(pa.Float, nullable=True)}) ) # %% @pa.check_input(in_schema) @pa.check_output(out_schema) def process_data(df): return ( df.assign( hours_worked=lambda x: x.hours_worked.where( # <- replace negative values with nans x.hours_worked >= 0, np.nan ) ) .assign(