def test_multi_index_index():
    """Validate a DataFrame whose index is a two-level MultiIndex."""
    multi_index = MultiIndex(indexes=[
        Index(Int, Check(lambda s: (s < 5) & (s >= 0)), name="index0"),
        Index(String,
              Check(lambda s: s.isin(["foo", "bar"])),
              name="index1"),
    ])
    schema = DataFrameSchema(
        columns={
            "column1": Column(Float, Check(lambda s: s > 0)),
            "column2": Column(Float, Check(lambda s: s > 0)),
        },
        index=multi_index,
    )

    df = pd.DataFrame(
        data={
            "column1": [0.1, 0.5, 123.1, 10.6, 22.31],
            "column2": [0.1, 0.5, 123.1, 10.6, 22.31],
        },
        index=pd.MultiIndex.from_arrays(
            [[0, 1, 2, 3, 4], ["foo", "bar", "foo", "bar", "foo"]],
            names=["index0", "index1"],
        ))

    assert isinstance(schema.validate(df), pd.DataFrame)

    # failure case: -1 violates the index0 check (0 <= s < 5)
    df_fail = df.copy()
    df_fail.index = pd.MultiIndex.from_arrays(
        [[-1, 1, 2, 3, 4], ["foo", "bar", "foo", "bar", "foo"]],
        names=["index0", "index1"],
    )
    with pytest.raises(errors.SchemaError):
        schema.validate(df_fail)
Example #2
0
def test_one_sample_hypothesis():
    """Check one sample ttest."""
    # hypothesis applied to the whole column
    whole_column_ttest = Hypothesis.one_sample_ttest(
        popmean=5,
        relationship="greater_than",
        alpha=0.1,
    )
    schema = DataFrameSchema({
        "height_in_feet": Column(Float, [whole_column_ttest]),
    })

    # hypothesis applied only to the "A" group of the "group" column
    group_a_ttest = Hypothesis.one_sample_ttest(
        sample="A",
        groupby="group",
        popmean=5,
        relationship="greater_than",
        alpha=0.1,
    )
    subset_schema = DataFrameSchema({
        "group": Column(String),
        "height_in_feet": Column(Float, [group_a_ttest]),
    })

    df = pd.DataFrame({
        "height_in_feet": [8.1, 7, 6.5, 6.7, 5.1],
        "group": ["A", "A", "B", "B", "A"],
    })
    schema.validate(df)
    subset_schema.validate(df)
def test_column_regex_non_str_types() -> None:
    """Check that column name regex matching excludes non-string types."""
    data = pd.DataFrame({
        1: [1, 2, 3],
        2.2: [1, 2, 3],
        pd.Timestamp("2018/01/01"): [1, 2, 3],
        "foo_1": [1, 2, 3],
        "foo_2": [1, 2, 3],
        "foo_3": [1, 2, 3],
    })
    # each pattern gets its own regex Column with the same positivity check
    patterns = ["foo_", r"\d+", r"\d+\.\d+", "2018-01-01"]
    schema = DataFrameSchema(
        columns={
            pattern: Column(Int, Check.gt(0), regex=True)
            for pattern in patterns
        },
    )
    assert isinstance(schema.validate(data), pd.DataFrame)

    # test MultiIndex column case
    data = pd.DataFrame({
        (1, 1): [1, 2, 3],
        (2.2, 4.5): [1, 2, 3],
        ("foo", "bar"): [1, 2, 3],
    })
    schema = DataFrameSchema(
        columns={("foo_*", "bar_*"): Column(Int, regex=True)},
    )
    schema.validate(data)
Example #4
0
def test_lazy_dataframe_validation_nullable():
    """
    Test that non-nullable column failure cases are correctly processed during
    lazy validation.
    """
    schema = DataFrameSchema(
        columns={
            "int_column": Column(Int, nullable=False),
            "float_column": Column(Float, nullable=False),
            "str_column": Column(String, nullable=False),
        },
        strict=True,
    )

    # one null per column, each at a different row position
    df = pd.DataFrame({
        "int_column": [1, None, 3],
        "float_column": [0.1, 1.2, None],
        "str_column": [None, "foo", "bar"],
    })

    try:
        schema.validate(df, lazy=True)
    except errors.SchemaErrors as err:
        # every failure case must be a null value
        assert err.failure_cases.failure_case.isna().all()
        # each column's failure should point at the row holding its null
        for col, index in [
            ("int_column", 1),
            ("float_column", 2),
            ("str_column", 0),
        ]:
            # pylint: disable=cell-var-from-loop
            assert (err.failure_cases.loc[lambda df: df.column == col,
                                          "index"].iloc[0] == index)
    else:
        # previously the test passed vacuously if validation succeeded
        pytest.fail("SchemaErrors not raised")
Example #5
0
def test_coerce_dtype_in_dataframe():
    """Tests coercions of datatypes, especially regarding nullable integers."""
    df = pd.DataFrame({
        "column1": [10.0, 20.0, 30.0],
        "column2": ["2018-01-01", "2018-02-01", "2018-03-01"],
        "column3": [1, 2, None],
        "column4": [1., 1., np.nan],
    })
    # specify `coerce` at the Column level
    schema1 = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0), coerce=True),
        "column2": Column(DateTime, coerce=True),
        "column3": Column(String, coerce=True, nullable=True),
    })
    # specify `coerce` at the DataFrameSchema level
    schema2 = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0)),
        "column2": Column(DateTime),
        "column3": Column(String, nullable=True),
    }, coerce=True)

    # build once instead of re-creating (and shadowing `schema`) per iteration
    null_int_schema = DataFrameSchema({"column4": Column(Int, coerce=True)})

    for schema in [schema1, schema2]:
        result = schema.validate(df)
        assert result.column1.dtype == Int.value
        assert result.column2.dtype == DateTime.value
        # `Series.iteritems` was removed in pandas 2.0; `items` is equivalent
        for _, x in result.column3.items():
            assert pd.isna(x) or isinstance(x, str)

        # make sure that correct error is raised when null values are present
        # in a float column that's coerced to an int
        with pytest.raises(ValueError):
            null_int_schema.validate(df)
Example #6
0
def test_column_regex_multiindex():
    """Text that column regex works on multi-index column."""
    tuple_pattern = ("foo_*", "baz_*")
    column_schema = Column(
        Int, Check(lambda s: s >= 0), name=tuple_pattern, regex=True,
    )
    dataframe_schema = DataFrameSchema({
        tuple_pattern: Column(Int, Check(lambda s: s >= 0), regex=True),
    })

    data = pd.DataFrame({
        ("foo_1", "biz_1"): range(10),
        ("foo_2", "baz_1"): range(10, 20),
        ("foo_3", "baz_2"): range(20, 30),
        ("bar_1", "biz_2"): range(10),
        ("bar_2", "biz_3"): range(10, 20),
        ("bar_3", "biz_3"): range(20, 30),
    })
    for valid_schema in (column_schema, dataframe_schema):
        assert isinstance(valid_schema.validate(data), pd.DataFrame)

    # Raise an error if tuple column name is applied to a dataframe with a
    # flat pd.Index object.
    flat_columns = ["foo_%s" % i for i in range(6)]
    three_level_columns = pd.MultiIndex.from_tuples([
        ("foo_%s" % i, "bar_%s" % i, "baz_%s" % i) for i in range(6)
    ])
    for columns in (flat_columns, three_level_columns):
        data.columns = columns
        for failing_schema in (column_schema, dataframe_schema):
            with pytest.raises(IndexError):
                failing_schema.validate(data)
Example #7
0
    def _validate_score_table(variant_information_table: DataFrame,
                              score_table: DataFrame):
        """ Validate the results of the prioritization method.

        The following constraints are checked:

            * Each UID from the variant_information_table is also in the score_table
            * Each SCORE in the score_table is a numerical value

        Parameters
        ----------
        variant_information_table :
            The variant information table

        score_table :
            The scoring results from the prioritization method

        Raises
        ------
        :class:`~pandera.errors.SchemaErrors`
            If the validation of the data fails
        """
        # UIDs that are allowed (and required) to appear in the score table
        variants_uid = variant_information_table["UID"]
        schema = DataFrameSchema({
            "UID":
            Column(
                Int,
                # checks both directions: every known UID is scored and every
                # scored UID is known
                Check(lambda x: variants_uid.isin(x) & x.isin(variants_uid)),
                required=True),
            "SCORE":
            # coerce=True converts the column to float before validating
            Column(Float, coerce=True, required=True)
        })
        # lazy=True collects all failures before raising SchemaErrors
        schema.validate(score_table, lazy=True)
Example #8
0
def test_required():
    """Tests how a Required Column is handled when it's not included, included
    and then not specified and a second column which is implicitly required
    isn't available."""
    schema = DataFrameSchema({
        "col1": Column(Int, required=False),
        "col2": Column(String),
    })

    # col1 is optional, so a frame with only col2 validates
    validated = schema.validate(pd.DataFrame({"col2": ['hello', 'world']}))
    assert isinstance(validated, pd.DataFrame)
    assert len(validated.columns) == 1
    assert set(validated.columns) == {"col2"}

    # providing both columns also validates
    validated = schema.validate(
        pd.DataFrame({"col1": [1, 2], "col2": ['hello', 'world']}))
    assert isinstance(validated, pd.DataFrame)
    assert len(validated.columns) == 2
    assert set(validated.columns) == {"col1", "col2"}

    # col2 is implicitly required, so omitting it must fail
    with pytest.raises(Exception):
        schema.validate(pd.DataFrame({"col1": [1, 2]}))
Example #9
0
def test_dataframe_schema_strict():
    """With strict=True a schema error is raised because column 'a' is not
    present in the dataframe."""
    schema = DataFrameSchema({"a": Column(Int, nullable=True)}, strict=True)
    with pytest.raises(errors.SchemaError):
        schema.validate(pd.DataFrame({"b": [1, 2, 3]}))
Example #10
0
def test_datetime():
    """Test datetime types can be validated properly by schema.validate"""
    schema = DataFrameSchema(
        columns={
            "col": Column(
                pa.DateTime,
                # every timestamp must be later than the start of 2015
                checks=Check(lambda s: s.min() > pd.Timestamp("2015")),
            )
        }
    )

    ok_df = pd.DataFrame(
        {"col": pd.to_datetime(["2019/01/01", "2018/05/21", "2016/03/10"])}
    )
    assert isinstance(schema.validate(ok_df), pd.DataFrame)

    # a timestamp before 2015 violates the min-date check
    bad_df = pd.DataFrame({"col": pd.to_datetime(["2010/01/01"])})
    with pytest.raises(SchemaError):
        schema.validate(bad_df)
def test_column_regex_strict() -> None:
    """Test that Column regex patterns correctly parsed in DataFrameSchema."""
    data = pd.DataFrame({
        "foo_1": [1, 2, 3],
        "foo_2": [1, 2, 3],
        "foo_3": [1, 2, 3],
    })
    schema = DataFrameSchema(
        columns={"foo_*": Column(Int, regex=True)}, strict=True
    )
    assert isinstance(schema.validate(data), pd.DataFrame)

    # an extra column that matches no pattern violates strictness
    data = data.assign(bar=[1, 2, 3])
    with pytest.raises(errors.SchemaError):
        schema.validate(data)

    # an extra regex column in the schema satisfies the strictness test
    extended_schema = schema.add_columns({"bar_*": Column(Int, regex=True)})
    validated_data = extended_schema.validate(data.assign(bar_1=[1, 2, 3]))
    assert isinstance(validated_data, pd.DataFrame)
Example #12
0
def test_column_regex():
    """Test that column regex work on single-level column index."""
    column_schema = Column(
        Int, Check(lambda s: s >= 0), name="foo_*", regex=True)

    dataframe_schema = DataFrameSchema({
        "foo_*": Column(Int, Check(lambda s: s >= 0), regex=True),
    })

    data = pd.DataFrame({
        "foo_1": range(10),
        "foo_2": range(10, 20),
        "foo_3": range(20, 30),
        "bar_1": range(10),
        "bar_2": range(10, 20),
        "bar_3": range(20, 30),
    })
    for valid_schema in (column_schema, dataframe_schema):
        assert isinstance(valid_schema.validate(data), pd.DataFrame)

    # a flat string pattern cannot be applied to multi-index columns
    data.columns = pd.MultiIndex.from_tuples(
        (
            ("foo_1", "biz_1"),
            ("foo_2", "baz_1"),
            ("foo_3", "baz_2"),
            ("bar_1", "biz_2"),
            ("bar_2", "biz_3"),
            ("bar_3", "biz_3"),
        )
    )
    for failing_schema in (column_schema, dataframe_schema):
        with pytest.raises(IndexError):
            failing_schema.validate(data)
Example #13
0
def test_dataframe_checks():
    """Tests that dataframe checks validate, error when a DataFrame doesn't
    comply with the schema, simple tests of the groupby checks which are
    covered in more detail above."""
    schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Float),
            "col3": Column(String),
            "col4": Column(String),
        },
        # dataframe-level checks receive the whole DataFrame
        checks=[
            Check(lambda df: df["col1"] < df["col2"]),
            Check(lambda df: df["col3"] == df["col4"]),
        ],
    )
    df = pd.DataFrame({
        "col1": [1, 2, 3],
        "col2": [2.0, 3.0, 4.0],
        "col3": ["foo", "bar", "baz"],
        "col4": ["foo", "bar", "baz"],
    })

    assert isinstance(schema.validate(df), pd.DataFrame)

    # test invalid schema error raising
    # tripling col1 makes some values exceed col2, breaking the first check
    invalid_df = df.copy()
    invalid_df["col1"] = invalid_df["col1"] * 3

    with pytest.raises(errors.SchemaError):
        schema.validate(invalid_df)

    # test groupby checks
    # with groupby, each check receives a dict mapping group keys to
    # sub-DataFrames; tuple keys are used when grouping by multiple columns
    groupby_check_schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col3": Column(String),
        },
        checks=[
            Check(lambda g: g["foo"]["col1"].iat[0] == 1, groupby="col3"),
            Check(lambda g: g["foo"]["col2"].iat[0] == 2.0, groupby="col3"),
            Check(lambda g: g["foo"]["col3"].iat[0] == "foo", groupby="col3"),
            Check(
                lambda g: g[("foo", "foo")]["col1"].iat[0] == 1,
                groupby=["col3", "col4"],
            ),
        ],
    )
    assert isinstance(groupby_check_schema.validate(df), pd.DataFrame)

    # test element-wise checks
    # element_wise=True applies the check to each row instead of the frame
    element_wise_check_schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Float),
        },
        checks=Check(lambda row: row["col1"] < row["col2"], element_wise=True),
    )
    assert isinstance(element_wise_check_schema.validate(df), pd.DataFrame)
Example #14
0
def test_check_groupby():
    """Tests uses of groupby to specify dependencies between one column and a
    single other column, including error handling."""
    schema = DataFrameSchema(
        columns={
            "col1":
            Column(Int, [
                # with groupby, each check receives a mapping of group key
                # to the col1 values of that group
                Check(lambda s: s["foo"] > 10, groupby="col2"),
                # groupby also accepts a list of column names...
                Check(lambda s: s["bar"] < 10, groupby=["col2"]),
                # ...or a callable that groups the whole DataFrame
                Check(lambda s: s["foo"] > 10,
                      groupby=lambda df: df.groupby("col2")),
                Check(lambda s: s["bar"] < 10,
                      groupby=lambda df: df.groupby("col2"))
            ]),
            "col2":
            Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
        },
        index=Index(Int, name="data_id"),
    )

    # "bar" values are all < 10 and "foo" values all > 10, so checks pass
    df_pass = pd.DataFrame(
        data={
            "col1": [7, 8, 9, 11, 12, 13],
            "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
        },
        index=pd.Series([1, 2, 3, 4, 5, 6], name="data_id"),
    )

    df = schema.validate(df_pass)
    assert isinstance(df, pd.DataFrame)
    assert len(df.columns) == 2
    assert set(df.columns) == {"col1", "col2"}

    # raise errors.SchemaError when Check fails
    # 20 in the "bar" group violates the `s["bar"] < 10` checks
    df_fail_on_bar = pd.DataFrame(
        data={
            "col1": [7, 8, 20, 11, 12, 13],
            "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
        },
        index=pd.Series([1, 2, 3, 4, 5, 6], name="data_id"),
    )
    # 1 in the "foo" group violates the `s["foo"] > 10` checks
    df_fail_on_foo = pd.DataFrame(
        data={
            "col1": [7, 8, 9, 11, 1, 13],
            "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
        },
        index=pd.Series([1, 2, 3, 4, 5, 6], name="data_id"),
    )
    # raise errors.SchemaError when groupby column doesn't exist
    df_fail_no_column = pd.DataFrame(
        data={
            "col1": [7, 8, 20, 11, 12, 13],
        },
        index=pd.Series([1, 2, 3, 4, 5, 6], name="data_id"),
    )

    for df in [df_fail_on_bar, df_fail_on_foo, df_fail_no_column]:
        with pytest.raises(errors.SchemaError):
            schema.validate(df)
Example #15
0
def test_dataframe_hypothesis_checks():
    """
    Test that two specific implementations of a Hypothesis work as expected
    and that using a Column that wasn't defined will error.
    """
    df = pd.DataFrame({
        "col1": range(100, 201),
        "col2": range(0, 101),
    })

    hypothesis_check_schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Int),
        },
        checks=[
            # two-sample test: one-sided t-test that col1 mean > col2 mean
            Hypothesis(
                test=stats.ttest_ind,
                samples=["col1", "col2"],
                relationship=lambda stat, pvalue, alpha=0.01:
                (stat > 0 and pvalue / 2 < alpha),
                # relationship_kwargs overrides the lambda's default alpha
                relationship_kwargs={"alpha": 0.5},
            ),
            # one-sample test: col1 mean significantly greater than popmean
            Hypothesis(
                test=stats.ttest_1samp,
                samples=["col1"],
                relationship=lambda stat, pvalue, alpha=0.01:
                (stat > 0 and pvalue / 2 < alpha),
                test_kwargs={"popmean": 50},
                relationship_kwargs={"alpha": 0.01},
            ),
        ],
    )

    hypothesis_check_schema.validate(df)

    # raise error when using groupby for a column that doesn't exist
    hypothesis_check_schema_groupby = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Int),
        },
        checks=[
            # two-sample test
            Hypothesis(
                test=stats.ttest_ind,
                samples=["col1", "col2"],
                # "col3" is not defined in the schema columns above
                groupby="col3",
                relationship=lambda stat, pvalue, alpha=0.01:
                (stat > 0 and pvalue / 2 < alpha),
                relationship_kwargs={"alpha": 0.5},
            ),
        ],
    )
    with pytest.raises(errors.SchemaDefinitionError):
        hypothesis_check_schema_groupby.validate(df)
Example #16
0
def test_dataframe_hypothesis_checks():
    """Test two-sample and one-sample Hypothesis checks at the DataFrame
    level, and that a groupby column absent from the schema errors."""

    df = pd.DataFrame({
        "col1": range(100, 201),
        "col2": range(0, 101),
    })

    hypothesis_check_schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Int),
        },
        checks=[
            # two-sample test: one-sided t-test that col1 mean > col2 mean
            Hypothesis(
                test=stats.ttest_ind,
                samples=["col1", "col2"],
                relationship=lambda stat, pvalue, alpha=0.01: (
                    stat > 0 and pvalue / 2 < alpha
                ),
                # relationship_kwargs overrides the lambda's default alpha
                relationship_kwargs={"alpha": 0.5},
            ),
            # one-sample test: col1 mean significantly greater than popmean
            Hypothesis(
                test=stats.ttest_1samp,
                samples=["col1"],
                relationship=lambda stat, pvalue, alpha=0.01: (
                    stat > 0 and pvalue / 2 < alpha
                ),
                test_kwargs={"popmean": 50},
                relationship_kwargs={"alpha": 0.01},
            ),
        ]
    )

    hypothesis_check_schema.validate(df)

    # raise error when using groupby
    hypothesis_check_schema_groupby = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Int),
        },
        checks=[
            # two-sample test
            Hypothesis(
                test=stats.ttest_ind,
                samples=["col1", "col2"],
                # "col3" is not defined in the schema columns above
                groupby="col3",
                relationship=lambda stat, pvalue, alpha=0.01: (
                    stat > 0 and pvalue / 2 < alpha
                ),
                relationship_kwargs={"alpha": 0.5},
            ),
        ]
    )
    with pytest.raises(errors.SchemaDefinitionError):
        hypothesis_check_schema_groupby.validate(df)
Example #17
0
 def test_pickling(int_dataframe: pd.DataFrame, schema: DataFrameSchema):
     """Test for a non-empty pickled object."""
     try:
         # lazy=True collects all failures into a single SchemaErrors
         schema.validate(int_dataframe, lazy=True)
     except SchemaErrors as exc:
         # expect non-empty bytes
         assert pickle.dumps(exc)
     else:
         # the fixture dataframe is expected to fail validation
         pytest.fail("SchemaErrors not raised")
Example #18
0
def test_nullable_int_in_dataframe():
    """Check that a nullable Int column validates with NaN values present."""
    null_schema = DataFrameSchema(
        {"column1": Column(Int, Check(lambda x: x > 0), nullable=True)})

    df = pd.DataFrame({"column1": [5, 1, np.nan]})
    assert isinstance(null_schema.validate(df), pd.DataFrame)

    # test case where column is an object
    df = df.astype({"column1": "object"})
    assert isinstance(null_schema.validate(df), pd.DataFrame)
Example #19
0
    def validate(self):
        """ Check if the evaluation data is valid.

        The following constraints are checked:

            * CHROM has to be in ``{"1",...,"22","X","Y"}``
            * POS has to be ``>= 1``
            * REF has to match with ``re.compile("^[ACGT]+$")``
            * ALT has to match with ``re.compile("^[ACGT]+$")``
            * RG has to be of type :class:`vpmbench.enums.ReferenceGenome`
            * CLASS has to be of type :class:`vpmbench.enums.PathogencityClass`
            * TYPE has to be of type :class:`vpmbench.enums.VariationType`
            * UID has to be ``>= 0``

        Raises
        ------
        :class:`~pandera.errors.SchemaErrors`
            If the validation of the data fails
        """
        # valid chromosome labels: "1".."22" plus the sex chromosomes
        chroms = set([str(x) for x in range(1, 23)] + ["X", "Y"])
        ref_validator = re.compile("^[ACGT]+$")
        # same pattern as ref_validator; kept separate per-column for clarity
        alt_validator = re.compile("^[ACGT]+$")
        schema = DataFrameSchema({
            "CHROM":
            Column(String,
                   Check(lambda chrom: chrom in chroms, element_wise=True),
                   required=True),
            "POS":
            Column(Int, Check(lambda pos: pos >= 1), required=True),
            "REF":
            Column(String,
                   Check(lambda ref: ref_validator.match(ref) is not None,
                         element_wise=True),
                   required=True),
            "ALT":
            Column(String,
                   Check(lambda alt: alt_validator.match(alt) is not None,
                         element_wise=True),
                   required=True),
            # no dtype given: only the isinstance check applies
            "CLASS":
            Column(checks=Check(lambda cl: isinstance(cl, PathogencityClass),
                                element_wise=True),
                   required=True),
            "UID":
            Column(Int, Check(lambda x: x >= 0), required=True),
            "TYPE":
            Column(checks=Check(lambda cl: isinstance(cl, VariationType),
                                element_wise=True),
                   required=True),
            "RG":
            Column(checks=Check(lambda cl: isinstance(cl, ReferenceGenome),
                                element_wise=True),
                   required=True)
        })
        # lazy=True collects all failures before raising SchemaErrors
        schema.validate(self.table, lazy=True)
Example #20
0
def test_dataframe_schema():
    """Exercise one column of each supported PandasDtype in a single schema."""
    schema = DataFrameSchema({
        "a": Column(PandasDtype.Int, Check(lambda x: x > 0)),
        "b": Column(PandasDtype.Float, Check(lambda x: 0 <= x <= 10)),
        "c": Column(
            PandasDtype.String,
            Check(lambda x: set(x) == {"x", "y", "z"}, element_wise=False)),
        "d": Column(
            PandasDtype.Bool,
            Check(lambda x: x.mean() > 0.5, element_wise=False)),
        "e": Column(
            PandasDtype.Category,
            Check(lambda x: set(x) == {"c1", "c2", "c3"}, element_wise=False)),
        "f": Column(
            PandasDtype.Object,
            Check(lambda x: x.isin([(1, ), (2, ), (3, )]),
                  element_wise=False)),
        "g": Column(
            PandasDtype.DateTime,
            Check(lambda x: x >= pd.Timestamp("2015-01-01"))),
        "i": Column(
            PandasDtype.Timedelta,
            Check(lambda x: x < pd.Timedelta(10, unit="D"))),
    })
    df = pd.DataFrame({
        "a": [1, 2, 3],
        "b": [1.1, 2.5, 9.9],
        "c": ["z", "y", "x"],
        "d": [True, True, False],
        "e": pd.Series(["c2", "c1", "c3"], dtype="category"),
        "f": [(3, ), (2, ), (1, )],
        "g": [pd.Timestamp("2015-02-0%d" % day) for day in (1, 2, 3)],
        "i": [pd.Timedelta(n, unit="D") for n in (1, 5, 9)],
    })
    assert isinstance(schema.validate(df), pd.DataFrame)

    # dropping a required column is an error
    with pytest.raises(SchemaError):
        schema.validate(df.drop("a", axis=1))

    # negative values in "a" violate its positivity check
    with pytest.raises(SchemaError):
        schema.validate(df.assign(a=[-1, -2, -1]))
Example #21
0
def test_tail_dataframe_schema():
    """Checks that validating the tail of a dataframe validates correctly."""
    # 100 non-negative values followed by 1000 negative ones; the identity
    # comprehensions `[i for i in range(...)]` are replaced with list(range(...))
    df = pd.DataFrame(
        {"col1": list(range(100)) + list(range(-1, -1001, -1))})

    schema = DataFrameSchema(
        columns={"col1": Column(Int, Check(lambda s: s < 0))})

    # Validating with tail of 1000 should pass
    assert schema.validate(df, tail=1000).equals(df)
    # the full frame includes the non-negative head and fails
    with pytest.raises(errors.SchemaError):
        schema.validate(df)
Example #22
0
 def test_pickling(int_dataframe: pd.DataFrame, check_obj: Check):
     """Test for a non-empty pickled object."""
     schema = DataFrameSchema({"a": Column(int, check_obj)})
     try:
         # fails for element -1
         schema.validate(int_dataframe)
     except SchemaError as exc:
         # must be non-empty byte-array
         assert pickle.dumps(exc)
     else:
         # the fixture dataframe is expected to fail validation
         pytest.fail("SchemaError not raised")
def test_index_schema():
    """Tests that when specifying a DataFrameSchema Index pandera validates
    and errors appropriately."""
    index_checks = [
        Check(lambda x: 1 <= x <= 11, element_wise=True),
        Check(lambda index: index.mean() > 1),
    ]
    schema = DataFrameSchema(index=Index(Int, index_checks))

    df = pd.DataFrame(index=range(1, 11), dtype="int64")
    assert isinstance(schema.validate(df), pd.DataFrame)

    # index values 12..19 violate the element-wise bound check
    with pytest.raises(errors.SchemaError):
        schema.validate(pd.DataFrame(index=range(1, 20)))
Example #24
0
def test_no_dtype_dataframe():
    """Validate Columns that specify nullability but no dtype."""
    non_nullable_schema = DataFrameSchema({"col": Column(nullable=False)})
    result = non_nullable_schema.validate(
        pd.DataFrame({"col": [-123.1, -76.3, 1.0]}))
    assert isinstance(result, pd.DataFrame)

    nullable_schema = DataFrameSchema({"col": Column(nullable=True)})
    result = nullable_schema.validate(
        pd.DataFrame({"col": [-123.1, None, 1.0]}))
    assert isinstance(result, pd.DataFrame)

    # a null value in a non-nullable column is an error
    with pytest.raises(errors.SchemaError):
        non_nullable_schema.validate(
            pd.DataFrame({"col": [-123.1, None, 1.0]}))
Example #25
0
def test_index_schema():
    """Validate a schema that constrains only the index, not any columns."""
    schema = DataFrameSchema(
        columns={},
        index=Index(Int, [
            Check(lambda x: 1 <= x <= 11, element_wise=True),
            Check(lambda index: index.mean() > 1),
        ]))

    valid_df = pd.DataFrame(index=range(1, 11), dtype="int64")
    assert isinstance(schema.validate(valid_df), pd.DataFrame)

    # index values past 11 fail the element-wise bound check
    with pytest.raises(errors.SchemaError):
        schema.validate(pd.DataFrame(index=range(1, 20)))
Example #26
0
 def test_unpickling(
     self, int_dataframe: pd.DataFrame, multi_check_schema: DataFrameSchema
 ):
     """Tests content validity of unpickled SchemaErrors."""
     try:
         # lazy=True collects all failures into a single SchemaErrors
         multi_check_schema.validate(int_dataframe, lazy=True)
     except SchemaErrors as exc:
         # round-trip through pickle and compare with the original
         loaded = pickle.loads(pickle.dumps(exc))
         assert loaded is not None
         self._compare_exception_with_unpickled(exc, loaded)
     else:
         # the fixture dataframe is expected to fail validation
         pytest.fail("SchemaErrors not raised")
Example #27
0
def test_tail_dataframe_schema():
    """Checks that validating the tail of a dataframe validates correctly."""
    # 100 non-negative values followed by 1000 negative ones
    negatives = list(range(-1, -1001, -1))
    df = pd.DataFrame({"col1": list(range(0, 100)) + negatives})

    schema = DataFrameSchema(
        columns={"col1": Column(Int, Check(lambda s: s < 0))})

    # Validating with tail of 1000 should pass
    assert schema.validate(df, tail=1000).equals(df)
    # the full frame includes the non-negative head and fails
    with pytest.raises(errors.SchemaError):
        schema.validate(df)
Example #28
0
def test_dataframe_schema_check_function_types(check_function, should_fail):
    """Validate a two-column schema built from a parametrized check function."""
    schema = DataFrameSchema({
        "a": Column(Int, Check(fn=check_function, element_wise=False)),
        "b": Column(Float, Check(fn=check_function, element_wise=False)),
    })
    df = pd.DataFrame({"a": [1, 2, 3], "b": [1.1, 2.5, 9.9]})
    if should_fail:
        with pytest.raises(errors.SchemaError):
            schema.validate(df)
    else:
        schema.validate(df)
def test_single_index_multi_index_mismatch() -> None:
    """Tests the failure case that attempting to validate a MultiIndex DataFrame
    against a single index schema raises a SchemaError with a constructive error
    message."""
    schema = DataFrameSchema(index=Index(name="key"))
    multi_index = pd.MultiIndex.from_tuples(
        [("a", "b"), ("c", "d"), ("e", "f")],
        names=("one", "two"),
    )
    with pytest.raises(errors.SchemaError):
        schema.validate(pd.DataFrame(index=multi_index))
Example #30
0
def test_dataframe_schema_check():
    """Test that DataFrameSchema-level Checks work properly."""
    data = pd.DataFrame([range(10) for _ in range(10)])

    # a dataframe-level check may return a single bool...
    bool_check_schema = DataFrameSchema(
        checks=Check(lambda df: (df < 10).all()))
    assert isinstance(bool_check_schema.validate(data), pd.DataFrame)

    # ...a boolean Series...
    series_check_schema = DataFrameSchema(
        checks=Check(lambda df: df[0] < 10))
    assert isinstance(series_check_schema.validate(data), pd.DataFrame)

    # ...or a boolean DataFrame
    df_check_schema = DataFrameSchema(checks=Check(lambda df: df < 10))
    assert isinstance(df_check_schema.validate(data), pd.DataFrame)