class SampleDataSchema(pa.SchemaModel):
    """pandera schema of the parquet test dataset"""

    registration_dttm: pa.typing.Series[pa.typing.DateTime]
    # Ids use the pandas nullable Int64 extension dtype; nulls are allowed
    # and coercion is requested for this column.
    id: pa.typing.Series[pd.Int64Dtype] = pa.Field(nullable=True, coerce=True)
    first_name: pa.typing.Series[pa.typing.String]
    last_name: pa.typing.Series[pa.typing.String]
    email: pa.typing.Series[pa.typing.String]
    # Coercion is requested for gender only among the string columns.
    gender: pa.typing.Series[pa.typing.String] = pa.Field(coerce=True)
    ip_address: pa.typing.Series[pa.typing.String]
    cc: pa.typing.Series[pa.typing.String]
    country: pa.typing.Series[pa.typing.String]
    # NOTE: birthdate is declared as a string column, not a datetime.
    birthdate: pa.typing.Series[pa.typing.String]
    salary: pa.typing.Series[pa.typing.Float64] = pa.Field(nullable=True)
    title: pa.typing.Series[pa.typing.String]
    comments: pa.typing.Series[pa.typing.String] = pa.Field(nullable=True)

    # The two helpers below return hard-coded facts about the fixture file;
    # they do not inspect any dataframe.
    @staticmethod
    def length() -> int:
        """Known length of the data"""
        return 5000

    @staticmethod
    def n_salary_over_150000() -> int:
        """Number of rows with salary > 150000"""
        return 2384
class Base(pa.SchemaModel):
    # Base model covering each way of declaring a field: bare annotation,
    # empty Field(), string alias, and a non-string (integer) alias.
    a: Series[int]
    b: Series[int] = pa.Field()
    c: Series[int] = pa.Field(alias="_c")
    d: Series[int] = pa.Field(alias=123)
    # Index components, declared with and without an explicit Field().
    i1: Index[int]
    i2: Index[int] = pa.Field()
class BaseSchema(pa.SchemaModel):
    # Integer columns, each constrained to a closed numeric range via the
    # in_range check arguments given below.
    square_footage: Series[int] = pa.Field(in_range={"min_value": 0, "max_value": 3000})
    n_bedrooms: Series[int] = pa.Field(in_range={"min_value": 0, "max_value": 10})
    price: Series[int] = pa.Field(in_range={"min_value": 0, "max_value": 1000000})

    class Config:
        # Schema-wide option: request dtype coercion for all columns.
        coerce = True
class BollingerBands(pa.SchemaModel):
    """Bollinger bands for a set of stock prices."""

    name: Series[str] = pa.Field(description="Ticker symbol of stock")
    date: Series[pd.Timestamp] = pa.Field(description="Date of prices")
    # Only the upper band carries a non-negativity constraint (ge=0);
    # the lower band is left unconstrained.
    upper: Series[float] = pa.Field(ge=0, description="Upper band")
    lower: Series[float] = pa.Field(description="Lower band")
class StockPrices(pa.SchemaModel):
    """Open/close prices for one or more stocks by day."""

    name: Series[str] = pa.Field(description="Ticker symbol of stock")
    # NOTE: in this schema the date column is typed as str, not a timestamp.
    date: Series[str] = pa.Field(description="Date of prices")
    # Prices must be non-negative (ge=0).
    open: Series[float] = pa.Field(ge=0, description="Price at market open")
    close: Series[float] = pa.Field(ge=0, description="Price at market close")
class StatsBombGameSchema(GameSchema):
    """Definition of a dataframe containing a list of games."""

    # Columns declared here are in addition to whatever GameSchema
    # (defined outside this view) declares.
    competition_stage: Series[str]
    home_score: Series[int]
    away_score: Series[int]
    venue: Series[str] = pa.Field(nullable=True)
    # NOTE(review): nullable=True on a plain int column; a NumPy int column
    # cannot hold NaN, so confirm how missing referee ids are represented.
    referee_id: Series[int] = pa.Field(nullable=True)
class Schema(pa.SchemaModel):
    # Column whose actual name is the integer 2020 (non-string alias).
    col_2020: Series[int] = pa.Field(alias=2020)
    # Index aliased to "_idx"; check_name=True is passed for the index name.
    idx: Index[int] = pa.Field(alias="_idx", check_name=True)

    # The check targets the column through its integer alias. @pa.check is
    # stacked on top of an explicit @classmethod, so it receives the
    # classmethod object rather than the plain function.
    @pa.check(2020)
    @classmethod
    def int_column_lt_100(cls, series: pd.Series) -> Iterable[bool]:
        return series < 100
class AnomalousEvents(pa.SchemaModel):
    """Anomalous price events, defined by a day on which a stock's closing price strayed above or below its Bollinger bands."""

    date: Series[pd.Timestamp] = pa.Field(description="Date of price event")
    name: Series[str] = pa.Field(description="Ticker symbol of stock")
    # Categorical column; the description names the two event labels.
    # The closing quote around 'low' was unbalanced in the original text.
    event: Series[pd.CategoricalDtype] = pa.Field(
        description="Type of event: 'high' or 'low'")
class Child(Base):
    # Subclass that redeclares some inherited names with str dtypes and
    # adds new columns/indexes. Presumably extends the Base model that
    # declares a-d, i1 and i2 (confirm which Base is in scope).
    b: Series[str] = pa.Field(alias="_b")
    c: Series[str]
    d: Series[str] = pa.Field()
    # New columns: bare annotation, empty Field(), and aliased Field.
    extra1: Series[int]
    extra2: Series[int] = pa.Field()
    extra3: Series[int] = pa.Field(alias="_extra3")
    # i1 is redeclared with str dtype; i3 is a new aliased index.
    i1: Index[str]
    i3: Index[int] = pa.Field(alias="_i3")
class CovidPatientDataframe(pa.SchemaModel):
    # Per-patient records. Both date columns are coerced and must be on or
    # after 2020-01-01 (ge=datetime(2020, 1, 1)).
    txn_date: Series[DateTime] = pa.Field(ge=datetime(2020, 1, 1), coerce=True)
    gender: Series[Category]
    age_number: Series[Int] = pa.Field(coerce=True)
    age_range: Series[String]
    job: Series[String]
    risk: Series[String]
    patient_type: Series[Category]
    province: Series[String]
    update_date: Series[DateTime] = pa.Field(ge=datetime(2020, 1, 1), coerce=True)
class InSchema(pa.SchemaModel):
    # Field-level lower bounds: hourly_pay >= 7 and hours_worked >= 10.
    hourly_pay: Series[float] = pa.Field(ge=7)
    hours_worked: Series[float] = pa.Field(ge=10)

    # One check function registered against both columns by name; it is
    # called with a single column's series at a time.
    @pa.check("hourly_pay", "hours_worked")
    def check_numbers_are_positive(cls, series: Series) -> Series[bool]:
        """Defines a column-level custom check."""
        return series > 0

    class Config:
        # Schema-wide option: request dtype coercion for all columns.
        coerce = True
class Schema(pa.SchemaModel):
    """Schema that uses registered checks in Field."""

    # custom_gt / custom_in_range are not built-in Field keywords visible
    # here; per the docstring they are checks registered elsewhere.
    col1: pa.typing.Series[int] = pa.Field(custom_gt=100)
    col2: pa.typing.Series[float] = pa.Field(custom_in_range={
        "min_value": -10,
        "max_value": 10
    })

    class Config:
        # Schema-wide option: request dtype coercion for all columns.
        coerce = True
class EventSchema(pa.SchemaModel):
    """Definition of a dataframe containing event stream data of a game."""

    game_id: Series[int]
    event_id: Series[int]
    period_id: Series[int]
    # NOTE(review): nullable=True on plain int columns; a NumPy int column
    # cannot hold NaN, so confirm how missing team/player ids are stored.
    team_id: Series[int] = pa.Field(nullable=True)
    player_id: Series[int] = pa.Field(nullable=True)
    type_id: Series[int]
    type_name: Series[str]

    class Config:  # noqa: D106
        # strict mode is enabled for this schema.
        strict = True
class Schema(pa.SchemaModel):
    # Column "a" carries a Field-level check (eq=1) in addition to the
    # method-based check registered below.
    a: Series[int] = pa.Field(eq=1)

    # @pa.check is stacked on top of an explicit @classmethod, so it
    # receives the classmethod object rather than the plain function.
    @pa.check("a")
    @classmethod
    def int_column_lt_100(cls, series: pd.Series) -> Iterable[bool]:
        return series < 100
def test_field_to_index():
    """Check that ``Field.to_index`` carries each boolean option onto the Index."""
    for option in ("nullable", "allow_duplicates"):
        for setting in (True, False):
            # Build a Field with exactly one option set, then convert it.
            field = pa.Field(**{option: setting})
            built = field.to_index(pa.DateTime)
            assert isinstance(built, pa.Index)
            assert built.dtype == pa.DateTime.value
            assert getattr(built, option) == setting
class Schema(pa.SchemaModel):
    # Column whose actual name is the integer 2020, with a gt=50 check.
    a: Series[int] = pa.Field(alias=2020, gt=50)

    # The check is registered by passing the field object `a` itself
    # rather than a column name. There is no explicit @classmethod here;
    # the assert below verifies the first argument is still the class.
    @pa.check(a)
    def int_column_lt_100(cls, series: pd.Series) -> Iterable[bool]:  # pylint:disable=no-self-argument
        assert cls is Schema
        return series < 100
class SampleSchemaModel(pa.SchemaModel):
    # Three described columns, each with one built-in Field check.
    a: pa.typing.Series[int] = pa.Field(le=10, description="a desc")
    b: pa.typing.Series[float] = pa.Field(lt=-1.2, description="b desc")
    c: pa.typing.Series[str] = pa.Field(str_startswith="value_",
                                        description="c desc")

    @pa.check("c")
    def c_check(  # pylint: disable=no-self-argument
            cls, series: pa.typing.Series[str]) -> pa.typing.Series[bool]:
        """Two words separated by underscore"""
        # Splitting with expand=True yields one column per piece, so the
        # column count equals the maximum number of pieces in any value.
        pieces = series.str.split("_", expand=True)
        n_pieces = pieces.shape[1]
        return n_pieces == 2

    @pa.dataframe_check
    def a_gt_b(cls, df):
        """sum(a) > sum(b)"""
        total_a = df["a"].sum()
        total_b = df["b"].sum()
        return total_a > total_b

    # Config is built from attributes supplied by the enclosing scope.
    Config = make_schema_model_config(**config_attrs)
def test_field_to_index() -> None:
    """Check that ``Field.to_index`` carries each boolean option onto the Index."""
    for option in ("nullable", "unique"):
        for setting in (True, False):
            # Build a Field with exactly one option set, then convert it.
            field = pa.Field(**{option: setting})  # type: ignore[arg-type]
            built = field.to_index(pa.DateTime)
            assert isinstance(built, pa.Index)
            assert built.dtype == Engine.dtype(pa.DateTime)
            assert getattr(built, option) == setting
def test_field_to_column():
    """Check that ``Field.to_column`` carries each boolean option onto the Column."""
    for option in ("nullable", "allow_duplicates", "coerce", "regex"):
        for setting in (True, False):
            # The same boolean is reused for `required`, so both values
            # of that argument are exercised as well.
            field = pa.Field(**{option: setting})
            built = field.to_column(pa.DateTime, required=setting)
            assert isinstance(built, pa.Column)
            assert built.dtype == pa.DateTime.value
            assert built.properties[option] == setting
            assert built.required == setting
def test_field_to_column() -> None:
    """Check that ``Field.to_column`` carries each boolean option onto the Column."""
    for option in ("nullable", "unique", "coerce", "regex"):
        for setting in (True, False):
            # The same boolean is reused for `required`, so both values
            # of that argument are exercised as well.
            field = pa.Field(**{option: setting})  # type: ignore[arg-type]
            built = field.to_column(pa.DateTime, required=setting)
            assert isinstance(built, pa.Column)
            assert built.dtype == Engine.dtype(pa.DateTime)
            assert built.properties[option] == setting
            assert built.required == setting
class StockPrices(pa.SchemaModel):
    """Open/high/low/close prices for a set of stocks by day."""

    name: Series[str] = pa.Field(description="Ticker symbol of stock")
    date: Series[pd.Timestamp] = pa.Field(description="Date of prices")
    # All price columns and the volume must be non-negative (ge=0).
    open: Series[float] = pa.Field(ge=0, description="Price at market open")
    high: Series[float] = pa.Field(ge=0, description="Highest price of the day")
    low: Series[float] = pa.Field(ge=0, description="Lowest price of the day")
    close: Series[float] = pa.Field(ge=0, description="Price at market close")
    volume: Series[int] = pa.Field(
        ge=0, description="Number of shares traded for day")
class SampleDataSchema(pa.SchemaModel):
    """pandera schema of the test dataset"""

    # One column per pandera.typing dtype; every column allows nulls.
    col_timedelta: pa.typing.Series[pa.typing.Timedelta] = pa.Field(nullable=True)
    col_datetime: pa.typing.Series[pa.typing.DateTime] = pa.Field(nullable=True)
    col_bool: pa.typing.Series[pa.typing.Bool] = pa.Field(nullable=True)
    col_int: pa.typing.Series[pa.typing.Int] = pa.Field(nullable=True)
    col_string: pa.typing.Series[pa.typing.String] = pa.Field(nullable=True)
    col_float: pa.typing.Series[pa.typing.Float64] = pa.Field(nullable=True)
class CovidSitRepDataFrame(pa.SchemaModel):
    # Situation-report rows. Columns annotated Optional[...] are declared
    # optional (pandera treats such columns as not required).
    # Date columns are coerced and must be on or after 2020-01-01.
    txn_date: Series[DateTime] = pa.Field(ge=datetime(2020, 1, 1), coerce=True)
    province: Optional[Series[String]]
    # Case and death counters are coerced and must be non-negative.
    new_case: Series[Int] = pa.Field(ge=0, coerce=True)
    total_case: Series[Int] = pa.Field(ge=0, coerce=True)
    # The *_excludeabroad counters carry no range check or coercion.
    new_case_excludeabroad: Series[Int]
    total_case_excludeabroad: Series[Int]
    new_death: Series[Int] = pa.Field(ge=0, coerce=True)
    total_death: Series[Int] = pa.Field(ge=0, coerce=True)
    new_recovered: Optional[Series[Int]]
    total_recovered: Optional[Series[Int]]
    update_date: Optional[Series[DateTime]] = pa.Field(ge=datetime(2020, 1, 1), coerce=True)
class StatsBombEventSchema(EventSchema):
    """Definition of a dataframe containing event stream data of a game."""

    # event_id is declared here with object dtype; the remaining columns
    # are additions to whatever the parent EventSchema declares.
    event_id: Series[object]
    index: Series[int]
    timestamp: Series[DateTime]
    minute: Series[int]
    # Seconds are bounded to the closed range 0..59.
    second: Series[int] = pa.Field(ge=0, le=59)
    possession: Series[int]
    possession_team_id: Series[int]
    possession_team_name: Series[str]
    play_pattern_id: Series[int]
    play_pattern_name: Series[str]
    team_name: Series[str]
    duration: Series[float] = pa.Field(nullable=True)
    extra: Series[object]
    related_events: Series[object]
    player_name: Series[str] = pa.Field(nullable=True)
    # NOTE(review): nullable=True on plain int/bool columns below; NumPy
    # int/bool columns cannot hold NaN, so confirm how missing values
    # are represented.
    position_id: Series[int] = pa.Field(nullable=True)
    position_name: Series[str] = pa.Field(nullable=True)
    location: Series[object] = pa.Field(nullable=True)
    under_pressure: Series[bool] = pa.Field(nullable=True)
    counterpress: Series[bool] = pa.Field(nullable=True)
class ChildField(Base):
    # Redeclares a, b and c as str columns, each with an empty Field()
    # (no alias and no checks supplied here).
    a: Series[str] = pa.Field()
    b: Series[str] = pa.Field()
    c: Series[str] = pa.Field()
class MISchema(pa.SchemaModel):
    # Two Index fields (presumably forming a MultiIndex, per the class
    # name), whose actual level names are "index0" and "index1".
    idx1: Index[int] = pa.Field(alias="index0")
    idx2: Index[int] = pa.Field(alias="index1")
class Schema(pa.SchemaModel):
    # All three columns are declared Optional[...]: one bare, one with a
    # Field check (eq="b"), and one using the pandera.typing String alias.
    a: Optional[Series[str]]
    b: Optional[Series[str]] = pa.Field(eq="b")
    c: Optional[Series[String]]  # test pandera.typing alias
class Schema(pa.SchemaModel):
    # With regex=True the alias is given as a pattern rather than a
    # literal column name: "column_" followed by one or more digits.
    col_regex: Series[str] = pa.Field(alias="column_([0-9])+", regex=True)
class Base(pa.SchemaModel):
    # Base model with three int columns: a bare annotation, an empty
    # Field(), and a Field with a string alias.
    a: Series[int]
    b: Series[int] = pa.Field()
    c: Series[int] = pa.Field(alias="_c")
class ChildAlias(Base):
    # Redeclares a, b and c as str columns, each with an underscore-
    # prefixed alias ("_a", "_b", "_c").
    a: Series[str] = pa.Field(alias="_a")
    b: Series[str] = pa.Field(alias="_b")
    c: Series[str] = pa.Field(alias="_c")