Ejemplo n.º 1
0
def validate_dataframe(dataframe: DataFrame) -> bool:
    """Validate a dataframe against the expected column schema.

    The schema is rebuilt on every call, so the upper bound on ``year`` is
    the calendar year at the time of the call.

    Args:
        dataframe: The dataframe to check.

    Returns:
        ``True`` if the dataframe passes every column-level and
        dataframe-level check; ``False`` if validation raises a
        ``SchemaError`` (the first error argument is logged as a warning).
    """
    schema = DataFrameSchema(
        {
            "source": Column(pa.String),
            "topic": Column(pa.String, nullable=True),
            "concept": Column(pa.String, nullable=True),
            "variable": Column(pa.String),
            "label": Column(pa.String),
            "value": Column(pa.Float),
            "denominator_variable": Column(pa.String, nullable=True),
            "denominator_label": Column(pa.String, nullable=True),
            "denominator": Column(pa.Float, nullable=True),
            "year": Column(
                pa.Int,
                checks=[
                    # Years must lie in [2000, current year].
                    Check.less_than_or_equal_to(dt.now().year),
                    Check.greater_than_or_equal_to(2000),
                ],
            ),
            "year_date": Column(pa.String, checks=[Check(validate_year_date, element_wise=True)]),
            "geo_id": Column(pa.String),
            "geo_name": Column(pa.String),
            "geo_type": Column(pa.String),
            "location": Column(pa.String, checks=[Check(validate_location, element_wise=True)]),
            "row_id": Column(pa.String, allow_duplicates=False),
        },
        # Reject dataframes carrying columns that are not declared above.
        strict=True,
        # Cast columns to the declared dtypes before running the checks.
        coerce=True,
        checks=[
            # Check that year_date and year fields are aligned.
            # Vectorized on purpose: ``.str[:4]`` takes the first four
            # characters of every string, whereas a plain ``[:4]`` on the
            # column would select the first four rows, and a row-wise lambda
            # would receive scalars that may not support ``.astype``.
            Check(lambda df: df["year_date"].str[:4] == df["year"].astype(str)),
            # Check that row_id field concatenates other identifying fields as expected
            Check(lambda df: df["row_id"] == df.apply(make_row_id, axis=1)),
        ],
    )

    # Validate dataframe against schema; only schema violations are treated
    # as a "False" result, any other exception propagates to the caller.
    try:
        schema.validate(dataframe)
    except SchemaError as error:
        logger.warning(f"Failed to validate dataframe: {error.args[0]}")
        return False
    return True
Ejemplo n.º 2
0
 "Connections per Trace": Column(
     pandas_dtype=PandasDtype.Float64,
     checks=[
         Check.greater_than_or_equal_to(min_value=0.0),
     ],
     nullable=False,
     allow_duplicates=True,
     coerce=True,
     required=True,
     regex=False,
 ),
 "Connections per Branch": Column(
     pandas_dtype=PandasDtype.Float64,
     checks=[
         Check.greater_than_or_equal_to(min_value=0.0),
         Check.less_than_or_equal_to(max_value=2.0),
     ],
     nullable=False,
     allow_duplicates=True,
     coerce=True,
     required=True,
     regex=False,
 ),
 "Fracture Intensity (Mauldon)": Column(
     pandas_dtype=PandasDtype.Float64,
     checks=[
         Check.greater_than_or_equal_to(min_value=0.0),
     ],
     nullable=False,
     allow_duplicates=True,
     coerce=True,
Ejemplo n.º 3
0
# %% [markdown] slideshow={"slide_type": "skip"}
# ## Pandera Basics
#
# ### Step 1: Define a `DataFrameSchema`

# %% slideshow={"slide_type": "skip"}
import pandera as pa
from pandera import Column, Check

schema = pa.DataFrameSchema(
    columns={
        # Hours worked must lie in the closed interval [0, 60];
        # missing values are accepted.
        "hours_worked": Column(
            pa.Float,
            checks=[
                Check.greater_than_or_equal_to(0),
                Check.less_than_or_equal_to(60),
            ],
            nullable=True,
        ),
        # Hourly wage must be at least 15; missing values are accepted.
        "wage_per_hour": Column(
            pa.Float,
            checks=Check.greater_than_or_equal_to(15),
            nullable=True,
        ),
    },
    # Cast each column to its declared dtype before the checks run.
    coerce=True,
)

# %% [markdown] slideshow={"slide_type": "skip"}
# ### Step 2: Call the `schema` on some data

# %% slideshow={"slide_type": "skip"}
import pandas as pd