Beispiel #1
0
class SampleDataSchema(pa.SchemaModel):
    """pandera schema of the parquet test dataset"""

    registration_dttm: pa.typing.Series[pa.typing.DateTime]
    # Nullable integer column: missing ids are allowed and values are coerced
    # to the pandas Int64 extension dtype.
    id: pa.typing.Series[pd.Int64Dtype] = pa.Field(nullable=True, coerce=True)
    first_name: pa.typing.Series[pa.typing.String]
    last_name: pa.typing.Series[pa.typing.String]
    email: pa.typing.Series[pa.typing.String]
    # coerce=True: the column is cast to the string dtype during validation.
    gender: pa.typing.Series[pa.typing.String] = pa.Field(coerce=True)
    ip_address: pa.typing.Series[pa.typing.String]
    cc: pa.typing.Series[pa.typing.String]
    country: pa.typing.Series[pa.typing.String]
    # Declared as a string column; no date dtype is enforced for it here.
    birthdate: pa.typing.Series[pa.typing.String]
    # Missing salaries are allowed.
    salary: pa.typing.Series[pa.typing.Float64] = pa.Field(nullable=True)
    title: pa.typing.Series[pa.typing.String]
    # Missing comments are allowed.
    comments: pa.typing.Series[pa.typing.String] = pa.Field(nullable=True)

    # Hard-coded facts about the dataset, exposed as static helpers
    # (presumably so tests can assert against them -- confirm with callers).
    @staticmethod
    def length() -> int:
        """Known length of the data"""
        return 5000

    @staticmethod
    def n_salary_over_150000() -> int:
        """Number of rows with salary > 150000"""
        return 2384
Beispiel #2
0
 class Base(pa.SchemaModel):
     # Columns, covering the declaration variants: bare annotation, empty
     # Field, string alias and integer alias.
     a: Series[int]
     b: Series[int] = pa.Field()
     c: Series[int] = pa.Field(alias="_c")
     d: Series[int] = pa.Field(alias=123)
     # Index components: bare annotation and empty Field.
     i1: Index[int]
     i2: Index[int] = pa.Field()
class BaseSchema(pa.SchemaModel):
    # Each integer column carries an in_range check with the min/max bounds
    # given in its Field.
    square_footage: Series[int] = pa.Field(in_range={"min_value": 0, "max_value": 3000})
    n_bedrooms: Series[int] = pa.Field(in_range={"min_value": 0, "max_value": 10})
    price: Series[int] = pa.Field(in_range={"min_value": 0, "max_value": 1000000})

    class Config:
        # Schema-wide coercion: columns are cast to their declared dtypes.
        coerce = True
Beispiel #4
0
class BollingerBands(pa.SchemaModel):
    """Bollinger bands for a set of stock prices."""

    name: Series[str] = pa.Field(description="Ticker symbol of stock")
    date: Series[pd.Timestamp] = pa.Field(description="Date of prices")
    # Only the upper band is constrained (>= 0); the lower band has no bound.
    upper: Series[float] = pa.Field(ge=0, description="Upper band")
    lower: Series[float] = pa.Field(description="Lower band")
Beispiel #5
0
class StockPrices(pa.SchemaModel):
    """Open/close prices for one or more stocks by day."""

    name: Series[str] = pa.Field(description="Ticker symbol of stock")
    # The date is declared as a string column in this schema, not a timestamp.
    date: Series[str] = pa.Field(description="Date of prices")
    # Both prices must be non-negative (ge=0).
    open: Series[float] = pa.Field(ge=0, description="Price at market open")
    close: Series[float] = pa.Field(ge=0, description="Price at market close")
Beispiel #6
0
class StatsBombGameSchema(GameSchema):
    """Definition of a dataframe containing a list of games."""

    # Columns declared here are in addition to whatever GameSchema (defined
    # outside this view) declares.
    competition_stage: Series[str]
    home_score: Series[int]
    away_score: Series[int]
    # Missing venues are allowed.
    venue: Series[str] = pa.Field(nullable=True)
    # NOTE(review): a plain int64 pandas column cannot hold NaN; confirm that
    # missing referees are represented in a way this dtype accepts.
    referee_id: Series[int] = pa.Field(nullable=True)
Beispiel #7
0
    class Schema(pa.SchemaModel):
        # The column's real name is the integer 2020; col_2020 is only the
        # Python attribute name.
        col_2020: Series[int] = pa.Field(alias=2020)
        # Index aliased to "_idx", with index-name checking enabled.
        idx: Index[int] = pa.Field(alias="_idx", check_name=True)

        # The check is registered against the alias (2020), not the attribute
        # name.
        @pa.check(2020)
        @classmethod
        def int_column_lt_100(cls, series: pd.Series) -> Iterable[bool]:
            return series < 100
Beispiel #8
0
class AnomalousEvents(pa.SchemaModel):
    """Anomalous price events, defined by a day on which a stock's closing price strayed above or
    below its Bollinger bands."""

    date: Series[pd.Timestamp] = pa.Field(description="Date of price event")
    name: Series[str] = pa.Field(description="Ticker symbol of stock")
    # Categorical column; the description names the two event labels.
    # (The original text had an unbalanced quote: "'high' or low'".)
    event: Series[pd.CategoricalDtype] = pa.Field(
        description="Type of event: 'high' or 'low'")
Beispiel #9
0
 class Child(Base):
     # b, c and d are redeclared with str dtype and different alias settings
     # (presumably overriding same-named fields inherited from Base).
     b: Series[str] = pa.Field(alias="_b")
     c: Series[str]
     d: Series[str] = pa.Field()
     # Additional columns declared by this subclass.
     extra1: Series[int]
     extra2: Series[int] = pa.Field()
     extra3: Series[int] = pa.Field(alias="_extra3")
     # i1 is redeclared with str dtype; i3 is an extra index aliased to "_i3".
     i1: Index[str]
     i3: Index[int] = pa.Field(alias="_i3")
class CovidPatientDataframe(pa.SchemaModel):
    # Coerced to datetime; values must be on or after 2020-01-01.
    txn_date: Series[DateTime] = pa.Field(ge=datetime(2020, 1, 1), coerce=True)
    gender: Series[Category]
    # Coerced to the integer dtype.
    age_number: Series[Int] = pa.Field(coerce=True)
    age_range: Series[String]
    job: Series[String]
    risk: Series[String]
    patient_type: Series[Category]
    province: Series[String]
    # Same constraint as txn_date: coerced datetime, >= 2020-01-01.
    update_date: Series[DateTime] = pa.Field(ge=datetime(2020, 1, 1),
                                             coerce=True)
Beispiel #11
0
class InSchema(pa.SchemaModel):
    # Lower bounds: hourly_pay >= 7 and hours_worked >= 10.
    hourly_pay: Series[float] = pa.Field(ge=7)
    hours_worked: Series[float] = pa.Field(ge=10)

    # One check function registered for both columns. The Field bounds above
    # already imply positivity, so as written this check cannot fail on its own.
    @pa.check("hourly_pay", "hours_worked")
    def check_numbers_are_positive(cls, series: Series) -> Series[bool]:
        """Defines a column-level custom check."""
        return series > 0

    class Config:
        # Schema-wide coercion: columns are cast to their declared dtypes.
        coerce = True
Beispiel #12
0
    class Schema(pa.SchemaModel):
        """Schema that uses registered checks in Field."""

        # custom_gt and custom_in_range are not defined in this block; per the
        # docstring they are checks registered elsewhere, passed by keyword.
        col1: pa.typing.Series[int] = pa.Field(custom_gt=100)
        col2: pa.typing.Series[float] = pa.Field(custom_in_range={
            "min_value": -10,
            "max_value": 10
        })

        class Config:
            # Schema-wide coercion: columns are cast to their declared dtypes.
            coerce = True
Beispiel #13
0
class EventSchema(pa.SchemaModel):
    """Definition of a dataframe containing event stream data of a game."""

    game_id: Series[int]
    event_id: Series[int]
    period_id: Series[int]
    # team_id and player_id are allowed to contain missing values.
    # NOTE(review): a plain int64 pandas column cannot hold NaN; confirm how
    # missing ids are represented in the validated frames.
    team_id: Series[int] = pa.Field(nullable=True)
    player_id: Series[int] = pa.Field(nullable=True)
    type_id: Series[int]
    type_name: Series[str]

    class Config:  # noqa: D106
        # strict=True: columns not declared on the schema are rejected.
        strict = True
Beispiel #14
0
    class Schema(pa.SchemaModel):
        # Built-in equality check: every value of "a" must equal 1.
        a: Series[int] = pa.Field(eq=1)

        # Additional custom check on "a", registered by column name.
        @pa.check("a")
        @classmethod
        def int_column_lt_100(cls, series: pd.Series) -> Iterable[bool]:
            return series < 100
def test_field_to_index():
    """Verify that ``Field.to_index`` carries each boolean option onto the Index."""
    # Every (option name, option value) pair, in the same order the nested
    # loops would visit them.
    cases = [
        (flag, value)
        for flag in ("nullable", "allow_duplicates")
        for value in (True, False)
    ]
    for flag, value in cases:
        field = pa.Field(**{flag: value})
        index = field.to_index(pa.DateTime)
        assert isinstance(index, pa.Index)
        assert index.dtype == pa.DateTime.value
        assert getattr(index, flag) == value
Beispiel #16
0
    class Schema(pa.SchemaModel):
        # Column whose real name is the integer 2020, with a built-in gt=50
        # check.
        a: Series[int] = pa.Field(alias=2020, gt=50)

        # The check is bound by passing the field object `a` itself rather
        # than a column name, and without an explicit @classmethod.
        @pa.check(a)
        def int_column_lt_100(cls, series: pd.Series) -> Iterable[bool]:
            # pylint:disable=no-self-argument
            # The first argument is expected to be the model class itself.
            assert cls is Schema
            return series < 100
    class SampleSchemaModel(pa.SchemaModel):

        # Three columns, each with one built-in check and a description.
        a: pa.typing.Series[int] = pa.Field(le=10, description="a desc")
        b: pa.typing.Series[float] = pa.Field(lt=-1.2, description="b desc")
        c: pa.typing.Series[str] = pa.Field(str_startswith="value_",
                                            description="c desc")

        # Column-level check on "c". Note it returns a single bool: whether
        # splitting on "_" yields exactly two columns for the whole series.
        @pa.check("c")
        def c_check(  # pylint: disable=no-self-argument
                cls, series: pa.typing.Series[str]) -> pa.typing.Series[bool]:
            """Two words separated by underscore"""
            return series.str.split("_", expand=True).shape[1] == 2

        # Dataframe-level check comparing the column sums.
        @pa.dataframe_check
        def a_gt_b(cls, df):
            """sum(a) > sum(b)"""
            return df["a"].sum() > df["b"].sum()

        # Config is built from config_attrs, a name supplied by the enclosing
        # scope (not visible here).
        Config = make_schema_model_config(**config_attrs)
Beispiel #18
0
def test_field_to_index() -> None:
    """Verify that ``Field.to_index`` carries each boolean option onto the Index."""
    # Every (option name, option value) pair, in the same order the nested
    # loops would visit them.
    cases = [
        (flag, value)
        for flag in ("nullable", "unique")
        for value in (True, False)
    ]
    expected_dtype_source = pa.DateTime
    for flag, value in cases:
        index = pa.Field(**{flag: value}).to_index(  # type: ignore[arg-type]
            pa.DateTime
        )
        assert isinstance(index, pa.Index)
        assert index.dtype == Engine.dtype(expected_dtype_source)
        assert getattr(index, flag) == value
def test_field_to_column():
    """Verify that ``Field.to_column`` carries each boolean option onto the Column."""
    # Every (option name, option value) pair, in the same order the nested
    # loops would visit them.
    cases = [
        (flag, value)
        for flag in ("nullable", "allow_duplicates", "coerce", "regex")
        for value in (True, False)
    ]
    for flag, value in cases:
        field = pa.Field(**{flag: value})
        # `required` is driven by the same boolean so it is exercised too.
        col = field.to_column(pa.DateTime, required=value)
        assert isinstance(col, pa.Column)
        assert col.dtype == pa.DateTime.value
        assert col.properties[flag] == value
        assert col.required == value
Beispiel #20
0
def test_field_to_column() -> None:
    """Verify that ``Field.to_column`` carries each boolean option onto the Column."""
    # Every (option name, option value) pair, in the same order the nested
    # loops would visit them.
    cases = [
        (flag, value)
        for flag in ("nullable", "unique", "coerce", "regex")
        for value in (True, False)
    ]
    for flag, value in cases:
        # `required` is driven by the same boolean so it is exercised too.
        col = pa.Field(**{flag: value}).to_column(  # type: ignore[arg-type]
            pa.DateTime, required=value
        )
        assert isinstance(col, pa.Column)
        assert col.dtype == Engine.dtype(pa.DateTime)
        assert col.properties[flag] == value
        assert col.required == value
Beispiel #21
0
class StockPrices(pa.SchemaModel):
    """Open/high/low/close prices for a set of stocks by day."""

    name: Series[str] = pa.Field(description="Ticker symbol of stock")
    date: Series[pd.Timestamp] = pa.Field(description="Date of prices")
    # All price columns and the volume must be non-negative (ge=0).
    open: Series[float] = pa.Field(ge=0, description="Price at market open")
    high: Series[float] = pa.Field(ge=0,
                                   description="Highest price of the day")
    low: Series[float] = pa.Field(ge=0, description="Lowest price of the day")
    close: Series[float] = pa.Field(ge=0, description="Price at market close")
    volume: Series[int] = pa.Field(
        ge=0, description="Number of shares traded for day")
Beispiel #22
0
class SampleDataSchema(pa.SchemaModel):
    """pandera schema of the test dataset"""

    # One column per dtype; every column is declared nullable.
    # NOTE(review): col_bool and col_int use non-extension dtypes; confirm how
    # missing values are represented for them in the test data.
    col_timedelta: pa.typing.Series[pa.typing.Timedelta] = pa.Field(nullable=True)
    col_datetime: pa.typing.Series[pa.typing.DateTime] = pa.Field(nullable=True)
    col_bool: pa.typing.Series[pa.typing.Bool] = pa.Field(nullable=True)
    col_int: pa.typing.Series[pa.typing.Int] = pa.Field(nullable=True)
    col_string: pa.typing.Series[pa.typing.String] = pa.Field(nullable=True)
    col_float: pa.typing.Series[pa.typing.Float64] = pa.Field(nullable=True)
class CovidSitRepDataFrame(pa.SchemaModel):
    # Coerced to datetime; values must be on or after 2020-01-01.
    txn_date: Series[DateTime] = pa.Field(ge=datetime(2020, 1, 1), coerce=True)
    # Optional[...] marks a column that is not required to be present.
    province: Optional[Series[String]]
    # Counters below with a Field are coerced to int and must be >= 0.
    new_case: Series[Int] = pa.Field(ge=0, coerce=True)
    total_case: Series[Int] = pa.Field(ge=0, coerce=True)
    # The *_excludeabroad counters have no bound and no coercion declared.
    new_case_excludeabroad: Series[Int]
    total_case_excludeabroad: Series[Int]
    new_death: Series[Int] = pa.Field(ge=0, coerce=True)
    total_death: Series[Int] = pa.Field(ge=0, coerce=True)
    new_recovered: Optional[Series[Int]]
    total_recovered: Optional[Series[Int]]
    # Optional column; when present it is coerced to datetime and must be on
    # or after 2020-01-01.
    update_date: Optional[Series[DateTime]] = pa.Field(ge=datetime(2020, 1, 1),
                                                       coerce=True)
Beispiel #24
0
class StatsBombEventSchema(EventSchema):
    """Definition of a dataframe containing event stream data of a game."""

    # event_id is redeclared here with object dtype (the EventSchema visible
    # in this file declares it as int).
    event_id: Series[object]
    index: Series[int]
    timestamp: Series[DateTime]
    minute: Series[int]
    # Seconds are bounded to the range 0..59 (ge=0, le=59).
    second: Series[int] = pa.Field(ge=0, le=59)
    possession: Series[int]
    possession_team_id: Series[int]
    possession_team_name: Series[str]
    play_pattern_id: Series[int]
    play_pattern_name: Series[str]
    team_name: Series[str]
    duration: Series[float] = pa.Field(nullable=True)
    # Object-typed columns: no element type is enforced for them here.
    extra: Series[object]
    related_events: Series[object]
    # The remaining columns are allowed to contain missing values.
    # NOTE(review): position_id (int) and the bool columns use dtypes that
    # cannot hold NaN in plain pandas; confirm how missing values are stored.
    player_name: Series[str] = pa.Field(nullable=True)
    position_id: Series[int] = pa.Field(nullable=True)
    position_name: Series[str] = pa.Field(nullable=True)
    location: Series[object] = pa.Field(nullable=True)
    under_pressure: Series[bool] = pa.Field(nullable=True)
    counterpress: Series[bool] = pa.Field(nullable=True)
Beispiel #25
0
 class ChildField(Base):
     # a, b and c are all redeclared as str columns with an empty Field, i.e.
     # without any alias (presumably overriding same-named fields of Base).
     a: Series[str] = pa.Field()
     b: Series[str] = pa.Field()
     c: Series[str] = pa.Field()
Beispiel #26
0
 class MISchema(pa.SchemaModel):
     # Two integer index components, aliased to "index0" and "index1".
     idx1: Index[int] = pa.Field(alias="index0")
     idx2: Index[int] = pa.Field(alias="index1")
Beispiel #27
0
 class Schema(pa.SchemaModel):
     # All three columns are wrapped in Optional[...], marking them as not
     # required; the variants are bare annotation, Field with a check, and
     # the pandera.typing String alias.
     a: Optional[Series[str]]
     b: Optional[Series[str]] = pa.Field(eq="b")
     c: Optional[Series[String]]  # test pandera.typing alias
Beispiel #28
0
 class Schema(pa.SchemaModel):
     # regex=True: the alias is treated as a pattern for column names rather
     # than a literal name.
     col_regex: Series[str] = pa.Field(alias="column_([0-9])+", regex=True)
Beispiel #29
0
 class Base(pa.SchemaModel):
     # Three int columns covering the declaration variants: bare annotation,
     # empty Field, and Field with a string alias.
     a: Series[int]
     b: Series[int] = pa.Field()
     c: Series[int] = pa.Field(alias="_c")
Beispiel #30
0
 class ChildAlias(Base):
     # a, b and c are all redeclared as str columns, each with an explicit
     # underscore-prefixed alias (presumably overriding same-named fields of
     # Base).
     a: Series[str] = pa.Field(alias="_a")
     b: Series[str] = pa.Field(alias="_b")
     c: Series[str] = pa.Field(alias="_c")