# Example 1
def test_non_str_column_name_regex(column_key):
    """Check that Columns with non-str names cannot have regex=True."""
    # A regex column declared inside a schema must raise when keyed by a
    # non-string name.
    with pytest.raises(ValueError):
        DataFrameSchema(
            {
                column_key: Column(
                    Float,
                    checks=Check.greater_than_or_equal_to(0),
                    regex=True,
                ),
            }
        )

    # The same restriction applies when the column is built stand-alone
    # with an explicit non-string name.
    with pytest.raises(ValueError):
        Column(
            Float,
            name=column_key,
            regex=True,
            checks=Check.greater_than_or_equal_to(0),
        )
# Example 2
def validate_dataframe(dataframe: DataFrame) -> bool:
    """Validate ``dataframe`` against the expected output schema.

    Args:
        dataframe: Table to validate. Columns are coerced to the declared
            dtypes before the checks run (``coerce=True``).

    Returns:
        True when the dataframe conforms to the schema; False otherwise,
        with the first schema error logged as a warning.
    """
    schema = DataFrameSchema(
        {
            "source": Column(pa.String),
            "topic": Column(pa.String, nullable=True),
            "concept": Column(pa.String, nullable=True),
            "variable": Column(pa.String),
            "label": Column(pa.String),
            "value": Column(pa.Float),
            "denominator_variable": Column(pa.String, nullable=True),
            "denominator_label": Column(pa.String, nullable=True),
            "denominator": Column(pa.Float, nullable=True),
            "year": Column(
                pa.Int,
                checks=[
                    # Year must fall between 2000 and the current year.
                    Check.less_than_or_equal_to(dt.now().year),
                    Check.greater_than_or_equal_to(2000),
                ],
            ),
            "year_date": Column(pa.String, checks=[Check(validate_year_date, element_wise=True)]),
            "geo_id": Column(pa.String),
            "geo_name": Column(pa.String),
            "geo_type": Column(pa.String),
            "location": Column(pa.String, checks=[Check(validate_location, element_wise=True)]),
            "row_id": Column(pa.String, allow_duplicates=False),
        },
        strict=True,  # reject any column not listed above
        coerce=True,  # cast columns to the declared dtypes before checking
        checks=[
            # Check that year_date and year fields are aligned.
            # NOTE(review): with element_wise=True each row is checked
            # individually, so df["year_date"][:4] slices the string prefix of
            # a single row's value — confirm this matches pandera's row-wise
            # contract for dataframe-level checks.
            Check(lambda df: df["year_date"][:4] == df["year"].astype(str), element_wise=True),
            # Check that row_id field concatenates other identifying fields as expected
            Check(lambda df: df["row_id"] == df.apply(make_row_id, axis=1)),
        ],
    )

    # Validate dataframe against schema
    try:
        schema.validate(dataframe)
    except SchemaError as error:
        # Lazy %-style args: the message is only formatted if the warning
        # is actually emitted (avoids eager f-string work on the hot path).
        logger.warning("Failed to validate dataframe: %s", error.args[0])
        return False
    else:
        return True
# Example 3
describe_df_schema = DataFrameSchema(
    columns={
        "geometry": Column(
            pandas_dtype=PandasDtype.String,
            checks=None,
            nullable=False,
            allow_duplicates=True,
            coerce=False,
            required=True,
            regex=False,
        ),
        "X": Column(
            pandas_dtype=PandasDtype.Int64,
            checks=[
                Check.greater_than_or_equal_to(min_value=0.0),
            ],
            nullable=False,
            allow_duplicates=True,
            coerce=True,
            required=True,
            regex=False,
        ),
        "Y": Column(
            pandas_dtype=PandasDtype.Int64,
            checks=[
                Check.greater_than_or_equal_to(min_value=0.0),
            ],
            nullable=False,
            allow_duplicates=True,
            coerce=True,
# Example 4
# %% [markdown] slideshow={"slide_type": "slide"}
# ### You find yourself at a familiar function, but it looks a little different from when you left it...

# %% slideshow={"slide_type": "skip"}
# This needs to be here, but skipped for story-telling effect in the slides
import pandera as pa
from pandera import Column, Check

# Input contract: both pay columns are floats, may contain nulls, and any
# other dtype is coerced to float during validation.
in_schema = pa.DataFrameSchema({
    "hours_worked": Column(pa.Float, coerce=True, nullable=True),
    "wage_per_hour": Column(pa.Float, coerce=True, nullable=True),
})

# Output contract: the input schema, tightened so hours_worked must be
# non-negative, plus a derived (nullable) weekly_income column.
out_schema = (
    in_schema
    .update_column("hours_worked", checks=Check.greater_than_or_equal_to(0))
    .add_columns({"weekly_income": Column(pa.Float, nullable=True)})
)


# %%
@pa.check_input(in_schema)
@pa.check_output(out_schema)
def process_data(df):
    return (
        df.assign(
            hours_worked=lambda x: x.hours_worked.where(  # <- replace negative values with nans
                x.hours_worked >= 0, np.nan
            )
        )
        .assign(