Ejemplo n.º 1
0
def test_column_regex_non_str_types() -> None:
    """Regex column matching should exclude non-string column labels.

    First validates a frame whose labels mix an int, a float, a timestamp
    and plain strings against regex-enabled columns, then repeats the
    exercise with tuple labels (the MultiIndex column case).
    """
    mixed_labels = (
        1,
        2.2,
        pd.Timestamp("2018/01/01"),
        "foo_1",
        "foo_2",
        "foo_3",
    )
    mixed_frame = pd.DataFrame({label: [1, 2, 3] for label in mixed_labels})

    patterns = ("foo_", r"\d+", r"\d+\.\d+", "2018-01-01")
    regex_schema = DataFrameSchema(
        columns={
            pattern: Column(Int, Check.gt(0), regex=True)
            for pattern in patterns
        },
    )
    validated = regex_schema.validate(mixed_frame)
    assert isinstance(validated, pd.DataFrame)

    # test MultiIndex column case
    tuple_labels = ((1, 1), (2.2, 4.5), ("foo", "bar"))
    multi_frame = pd.DataFrame({label: [1, 2, 3] for label in tuple_labels})
    multi_schema = DataFrameSchema(
        columns={("foo_*", "bar_*"): Column(Int, regex=True)},
    )
    # No assertion here: the call only has to complete without raising.
    multi_schema.validate(multi_frame)
Ejemplo n.º 2
0
def test_series_schema_checks():
    """Verify the ``checks`` property of SeriesSchema.

    Whether the schema is built with no checks, a single check, or a list
    of checks, ``checks`` must be a list holding the expected number of
    entries.
    """
    no_checks = SeriesSchema()
    one_check = SeriesSchema(checks=Check.eq(0))
    two_checks = SeriesSchema(checks=[Check.gt(0), Check.lt(100)])

    schemas = (no_checks, one_check, two_checks)

    # The property is always a list, however the checks were supplied.
    for candidate in schemas:
        assert isinstance(candidate.checks, list)

    # ... and its length mirrors the number of checks passed in.
    for candidate, expected_count in zip(schemas, (0, 1, 2)):
        assert len(candidate.checks) == expected_count
Ejemplo n.º 3
0
def test_check_io():
    # pylint: disable=too-many-locals
    """Test that check_io correctly validates/invalidates data.

    A set of small functions is decorated with ``check_io`` using different
    input/output configurations. Each one is called with valid arguments
    (the result is compared against the expected output) and with invalid
    arguments (a schema error must be raised). Finally, objects that are
    not valid ``out`` specifications must raise ``TypeError`` or
    ``ValueError`` when the decorated function is called.
    """

    schema = DataFrameSchema({"col": Column(Int, Check.gt(0))})

    # Inputs and the whole return value are validated.
    @check_io(df1=schema, df2=schema, out=schema)
    def simple_func(df1, df2):
        return df1.assign(col=df1["col"] + df2["col"])

    # Only the inputs are validated.
    @check_io(df1=schema, df2=schema)
    def simple_func_no_out(df1, df2):
        return df1.assign(col=df1["col"] + df2["col"])

    # ``out`` as a single (getter, schema) pair: here the dataframe sits at
    # position 1 of the returned tuple.
    @check_io(out=(1, schema))
    def output_with_obj_getter(df):
        return None, df

    # ``out`` as a list of (getter, schema) pairs over a tuple result.
    @check_io(out=[(0, schema), (1, schema)])
    def multiple_outputs_tuple(df):
        return df, df

    # Getters may be integer keys, string keys or callables that dig the
    # dataframe out of the returned object.
    @check_io(out=[(0, schema), ("foo", schema),
                   (lambda x: x[2]["bar"], schema)])
    def multiple_outputs_dict(df):
        return {0: df, "foo": df, 2: {"bar": df}}

    # The remaining helpers forward validation options through check_io.
    @check_io(df=schema, out=schema, head=1)
    def validate_head(df):
        return df

    @check_io(df=schema, out=schema, tail=1)
    def validate_tail(df):
        return df

    @check_io(df=schema, out=schema, sample=1, random_state=100)
    def validate_sample(df):
        return df

    @check_io(df=schema, out=schema, lazy=True)
    def validate_lazy(df):
        return df

    @check_io(df=schema, out=schema, inplace=True)
    def validate_inplace(df):
        return df

    df1 = pd.DataFrame({"col": [1, 1, 1]})
    df2 = pd.DataFrame({"col": [2, 2, 2]})
    invalid_df = pd.DataFrame({"col": [-1, -1, -1]})
    expected = pd.DataFrame({"col": [3, 3, 3]})

    # Each case: (function, valid args, invalid args, expected output).
    for fn, valid, invalid, out in [
        (simple_func, [df1, df2], [invalid_df, invalid_df], expected),
        (simple_func_no_out, [df1, df2], [invalid_df, invalid_df], expected),
        (output_with_obj_getter, [df1], [invalid_df], (None, df1)),
        (multiple_outputs_tuple, [df1], [invalid_df], (df1, df1)),
        (
            multiple_outputs_dict,
            [df1],
            [invalid_df],
            {
                0: df1,
                "foo": df1,
                2: {
                    "bar": df1
                }
            },
        ),
        (validate_head, [df1], [invalid_df], df1),
        (validate_tail, [df1], [invalid_df], df1),
        (validate_sample, [df1], [invalid_df], df1),
        (validate_lazy, [df1], [invalid_df], df1),
        (validate_inplace, [df1], [invalid_df], df1),
    ]:
        result = fn(*valid)
        # The branches must be mutually exclusive: a Series handled by the
        # first branch must not fall through to the plain ``==`` assertion,
        # where the element-wise comparison has an ambiguous truth value.
        if isinstance(result, pd.Series):
            assert (result == out).all()
        elif isinstance(result, pd.DataFrame):
            assert (result == out).all(axis=None)
        else:
            assert result == out

        # Lazy validation reports failures as SchemaErrors rather than
        # SchemaError.
        expected_error = (errors.SchemaErrors
                          if fn is validate_lazy else errors.SchemaError)
        with pytest.raises(expected_error):
            fn(*invalid)

    # invalid out schema types
    for out_schema in [1, 5.0, "foo", {"foo": "bar"}, ["foo"]]:

        @check_io(out=out_schema)
        def invalid_out_schema_type(df):
            return df

        with pytest.raises((TypeError, ValueError)):
            invalid_out_schema_type(df1)