def test_get_series_schema_statistics():
    """Test that series schema statistics logic is correct."""
    # Schema with two bound checks; statistics should mirror every option.
    bounds = [
        pa.Check.greater_than_or_equal_to(0),
        pa.Check.less_than_or_equal_to(100),
    ]
    schema = pa.SeriesSchema(int, nullable=False, checks=bounds)

    expected = {
        "dtype": pandas_engine.Engine.dtype(int),
        "nullable": False,
        "checks": {
            "greater_than_or_equal_to": {"min_value": 0},
            "less_than_or_equal_to": {"max_value": 100},
        },
        "name": None,
        "coerce": False,
    }
    assert schema_statistics.get_series_schema_statistics(schema) == expected
# Example 2 (score: 0)
def test_unique():
    """Test uniqueness checks on modin dataframes."""
    schema = pa.DataFrameSchema({"field": pa.Column(int)}, unique=["field"])
    column_schema = pa.Column(int, unique=True, name="field")
    series_schema = pa.SeriesSchema(int, unique=True, name="field")

    unique_df = mpd.DataFrame({"field": [1, 2, 3]})
    duplicated_df = mpd.DataFrame({"field": [1, 1, 1]})

    # Unique data passes all three schema flavors.
    assert isinstance(schema(unique_df), mpd.DataFrame)
    assert isinstance(column_schema(unique_df), mpd.DataFrame)
    assert isinstance(series_schema(unique_df["field"]), mpd.Series)

    # Duplicated data raises, with the flavor-specific error message.
    with pytest.raises(pa.errors.SchemaError, match="columns .+ not unique"):
        schema(duplicated_df)
    duplicate_msg = "series .+ contains duplicate values"
    with pytest.raises(pa.errors.SchemaError, match=duplicate_msg):
        column_schema(duplicated_df)
    with pytest.raises(pa.errors.SchemaError, match=duplicate_msg):
        series_schema(duplicated_df["field"])

    # Disabling uniqueness makes the duplicated data pass again.
    schema.unique = None
    column_schema.unique = False
    series_schema.unique = False

    assert isinstance(schema(duplicated_df), mpd.DataFrame)
    assert isinstance(column_schema(duplicated_df), mpd.DataFrame)
    assert isinstance(series_schema(duplicated_df["field"]), mpd.Series)
# Example 3 (score: 0)
def test_seriesschema():
    """Test that SeriesSchemaBase is compatible with pydantic."""
    # Constructing the pydantic model with each SeriesSchemaBase subtype
    # must succeed and yield an instance of the model class.
    model = SeriesSchemaPydantic(
        pa_series_schema=pa.SeriesSchema(),
        pa_column=pa.Column(),
        pa_index=pa.Index(),
    )
    assert isinstance(model, SeriesSchemaPydantic)
# Example 4 (score: 0)
def test_series_schema() -> None:
    """
    Test that SeriesSchema based pandera validation works with Dask Series.
    """
    integer_schema = pa.SeriesSchema(int)
    string_schema = pa.SeriesSchema(str)

    pandas_series = pd.Series(["1"])
    dask_series = dd.from_pandas(pandas_series, npartitions=1)

    # Matching schema: validation is lazy but compute() succeeds unchanged.
    dask_series = string_schema.validate(dask_series)
    pd.testing.assert_series_equal(pandas_series, dask_series.compute())

    # Mismatched schema: the error only surfaces when the graph is computed.
    dask_series = integer_schema.validate(dask_series)
    with pytest.raises(pa.errors.SchemaError):
        dask_series.compute()

    # In-place validation is equally lazy and raises on compute().
    integer_schema.validate(dask_series, inplace=True)
    with pytest.raises(pa.errors.SchemaError):
        dask_series.compute()
# Example 5 (score: 0)
def test_series_example():
    """Test SeriesSchema example method generate examples that pass."""
    schema = pa.SeriesSchema(pa.Int, pa.Check.gt(0))
    # Each generated example must itself validate against the schema.
    for _ in range(10):
        schema(schema.example())
# Example 6 (score: 0)
def test_series_strategy(data):
    """Test SeriesSchema strategy."""
    schema = pa.SeriesSchema(pa.Int, pa.Check.gt(0))
    # A drawn sample from the schema's strategy must validate cleanly.
    sample = data.draw(schema.strategy())
    schema(sample)
# Example 7 (score: 0)
    )
    example = data.draw(strat)
    if nullable:
        assert example.isna().any(axis=None)
    else:
        assert example.notna().all(axis=None)


@pytest.mark.parametrize(
    "schema, warning",
    [
        [
            pa.SeriesSchema(
                pa.Int,
                checks=[
                    pa.Check(lambda x: x > 0, element_wise=True),
                    pa.Check(lambda x: x > -10, element_wise=True),
                ],
            ),
            "Element-wise",
        ],
        [
            pa.SeriesSchema(
                pa.Int,
                checks=[
                    pa.Check(lambda s: s > -10000),
                    pa.Check(lambda s: s > -9999),
                ],
            ),
            "Vectorized",
        ],
# Example 8 (score: 0)
import pandas as pd
import pytest

import pandera as pa


@pytest.mark.parametrize(
    "schema1, schema2, data",
    [
        [
            pa.DataFrameSchema({"col": pa.Column(int)}, coerce=True),
            pa.DataFrameSchema({"col": pa.Column(float)}, coerce=True),
            pd.DataFrame({"col": [1, 2, 3]}),
        ],
        [
            pa.SeriesSchema(int, coerce=True),
            pa.SeriesSchema(float, coerce=True),
            pd.Series([1, 2, 3]),
        ],
    ],
)
@pytest.mark.parametrize("inplace", [False, True])
def test_dataframe_series_add_schema(schema1, schema2, data, inplace):
    """
    Test that pandas object contains schema metadata after pandera validation.
    """
    # NOTE(review): `schema2` is never used in the body — presumably intended
    # for a follow-up coercion assertion; confirm against the original test.
    # Validate for its side effect on `data`; the returned copy is unused.
    schema1(data, inplace=inplace)
    if inplace:
        # In-place validation attaches the schema to the original object.
        assert data.pandera.schema == schema1
    else:
        # Copy-based validation leaves the original object untouched.
        assert data.pandera.schema is None
# Example 9 (score: 0)
import pandera as pa

# Per-metric series schemas: each is a float/int series indexed by datetime.
weight_series = pa.SeriesSchema(
    pa.Float64, index=pa.Index(pa.DateTime), name="weight"
)
consumption_series = pa.SeriesSchema(
    pa.Float64, index=pa.Index(pa.DateTime), name="consumption"
)
servings_series = pa.SeriesSchema(
    pa.Int, index=pa.Index(pa.DateTime), name="servings"
)
# Example 10 (score: 0)
    probabilistic_mental_illness_schema(fatal_encounters_clean);

# %% [markdown] slideshow={"slide_type": "slide"}
# ### Prepare Training and Test Sets
#
# For functions that have tuple/list-like output, specify an integer
# index `pa.check_output(schema, <int>)` to apply the schema to a
# specific element in the output.

# %%
from sklearn.model_selection import train_test_split

# Target: boolean series whose mean is statistically equal to 2.75%
# (one-sample t-test at alpha=0.01).
target_schema = pa.SeriesSchema(
    pa.Bool,
    name="disposition_accidental",
    checks=Hypothesis.one_sample_ttest(
        popmean=0.0275, relationship="equal", alpha=0.01
    ),
)
# Features: everything in the training schema except the target column.
feature_schema = training_data_schema.remove_columns([target_schema.name])


@pa.check_input(training_data_schema)
@pa.check_output(feature_schema, 0)
@pa.check_output(feature_schema, 1)
@pa.check_output(target_schema, 2)
@pa.check_output(target_schema, 3)
def split_training_data(fatal_encounters_clean):
    return train_test_split(
        fatal_encounters_clean[list(feature_schema.columns)],
        fatal_encounters_clean[target_schema.name],