Example #1
def test_column_regex_multiindex():
    """Text that column regex works on multi-index column."""
    column_schema = Column(
        Int, Check(lambda s: s >= 0), name=("foo_*", "baz_*"), regex=True,
    )
    dataframe_schema = DataFrameSchema({
        ("foo_*", "baz_*"): Column(Int, Check(lambda s: s >= 0), regex=True),
    })

    data = pd.DataFrame({
        ("foo_1", "biz_1"): range(10),
        ("foo_2", "baz_1"): range(10, 20),
        ("foo_3", "baz_2"): range(20, 30),
        ("bar_1", "biz_2"): range(10),
        ("bar_2", "biz_3"): range(10, 20),
        ("bar_3", "biz_3"): range(20, 30),
    })
    assert isinstance(column_schema.validate(data), pd.DataFrame)
    assert isinstance(dataframe_schema.validate(data), pd.DataFrame)

    # Raise an error if tuple column name is applied to a dataframe with a
    # flat pd.Index object.
    failure_column_cases = (
        ["foo_%s" % i for i in range(6)],
        pd.MultiIndex.from_tuples([
            ("foo_%s" % i, "bar_%s" % i, "baz_%s" % i) for i in range(6)
        ])
    )
    for columns in failure_column_cases:
        data.columns = columns
        with pytest.raises(IndexError):
            column_schema.validate(data)
        with pytest.raises(IndexError):
            dataframe_schema.validate(data)
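
For intuition, a minimal sketch (an addition, assuming the same pandera API the tests use and the `get_regex_columns` method exercised in Example #30) of how the regex column name above resolves against a MultiIndex:

import pandas as pd
from pandera import Check, Column

mi_columns = pd.MultiIndex.from_tuples([
    ("foo_1", "baz_1"), ("foo_2", "baz_2"), ("bar_1", "biz_1"),
])
regex_column = Column(
    int, Check(lambda s: s >= 0), name=("foo_*", "baz_*"), regex=True,
)
# only columns whose first level matches "foo_*" and second level matches
# "baz_*" are returned, i.e. ("foo_1", "baz_1") and ("foo_2", "baz_2")
print(regex_column.get_regex_columns(mi_columns).tolist())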
Example #2
def test_multi_index_index():
    schema = DataFrameSchema(
        columns={
            "column1": Column(Float, Check(lambda s: s > 0)),
            "column2": Column(Float, Check(lambda s: s > 0)),
        },
        index=MultiIndex(indexes=[
            Index(Int, Check(lambda s: (s < 5) & (s >= 0)), name="index0"),
            Index(String,
                  Check(lambda s: s.isin(["foo", "bar"])),
                  name="index1"),
        ]))

    df = pd.DataFrame(
        data={
            "column1": [0.1, 0.5, 123.1, 10.6, 22.31],
            "column2": [0.1, 0.5, 123.1, 10.6, 22.31],
        },
        index=pd.MultiIndex.from_arrays(
            [[0, 1, 2, 3, 4], ["foo", "bar", "foo", "bar", "foo"]],
            names=["index0", "index1"],
        ))

    validated_df = schema.validate(df)
    assert isinstance(validated_df, pd.DataFrame)

    # failure case
    df_fail = df.copy()
    df_fail.index = pd.MultiIndex.from_arrays(
        [[-1, 1, 2, 3, 4], ["foo", "bar", "foo", "bar", "foo"]],
        names=["index0", "index1"],
    )
    with pytest.raises(errors.SchemaError):
        schema.validate(df_fail)
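
One further failure case that could be added here, as a sketch (an assumption not in the original test: pandera also checks the index level names declared via `name=`), indented to fit inside the test body above:

    # hypothetical extra failure case: mismatched index level names
    df_bad_names = df.copy()
    df_bad_names.index = df_bad_names.index.set_names(["idx0", "index1"])
    with pytest.raises(errors.SchemaError):
        schema.validate(df_bad_names)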
Example #3
def test_column_regex():
    """Test that column regex work on single-level column index."""
    column_schema = Column(
        Int, Check(lambda s: s >= 0), name="foo_*", regex=True)

    dataframe_schema = DataFrameSchema({
        "foo_*": Column(Int, Check(lambda s: s >= 0), regex=True),
    })

    data = pd.DataFrame({
        "foo_1": range(10),
        "foo_2": range(10, 20),
        "foo_3": range(20, 30),
        "bar_1": range(10),
        "bar_2": range(10, 20),
        "bar_3": range(20, 30),
    })
    assert isinstance(column_schema.validate(data), pd.DataFrame)
    assert isinstance(dataframe_schema.validate(data), pd.DataFrame)

    # Raise an error on multi-index column case
    data.columns = pd.MultiIndex.from_tuples(
        (
            ("foo_1", "biz_1"),
            ("foo_2", "baz_1"),
            ("foo_3", "baz_2"),
            ("bar_1", "biz_2"),
            ("bar_2", "biz_3"),
            ("bar_3", "biz_3"),
        )
    )
    with pytest.raises(IndexError):
        column_schema.validate(data)
    with pytest.raises(IndexError):
        dataframe_schema.validate(data)
Example #4
def test_coerce_dtype_in_dataframe():
    """Tests coercions of datatypes, especially regarding nullable integers."""
    df = pd.DataFrame({
        "column1": [10.0, 20.0, 30.0],
        "column2": ["2018-01-01", "2018-02-01", "2018-03-01"],
        "column3": [1, 2, None],
        "column4": [1., 1., np.nan],
    })
    # specify `coerce` at the Column level
    schema1 = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0), coerce=True),
        "column2": Column(DateTime, coerce=True),
        "column3": Column(String, coerce=True, nullable=True),
    })
    # specify `coerce` at the DataFrameSchema level
    schema2 = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0)),
        "column2": Column(DateTime),
        "column3": Column(String, nullable=True),
    }, coerce=True)

    for schema in [schema1, schema2]:
        result = schema.validate(df)
        assert result.column1.dtype == Int.value
        assert result.column2.dtype == DateTime.value
        for _, x in result.column3.items():
            assert pd.isna(x) or isinstance(x, str)

        # make sure that correct error is raised when null values are present
        # in a float column that's coerced to an int
        schema = DataFrameSchema({"column4": Column(Int, coerce=True)})
        with pytest.raises(ValueError):
            schema.validate(df)
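
As a side note, a plain-pandas sketch (an addition, not taken from the test) of why `column4` raises: casting `NaN` to a plain int is impossible, while pandas' nullable `Int64` extension dtype can hold the missing value:

import numpy as np
import pandas as pd

column4 = pd.Series([1., 1., np.nan])
# column4.astype(int) would raise a ValueError: NaN cannot become a plain int
coerced = column4.astype("Int64")  # nullable integer dtype; NaN becomes <NA>
print(coerced.dtype, coerced.isna().sum())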
Example #5
def test_multi_index_columns() -> None:
    """Tests that multi-index Columns within DataFrames validate correctly."""
    schema = DataFrameSchema(
        {
            ("zero", "foo"): Column(Float, Check(lambda s: (s > 0) & (s < 1))),
            ("zero", "bar"): Column(
                String, Check(lambda s: s.isin(["a", "b", "c", "d"]))
            ),
            ("one", "foo"): Column(Int, Check(lambda s: (s > 0) & (s < 10))),
            ("one", "bar"): Column(
                DateTime, Check(lambda s: s == pd.Timestamp(2019, 1, 1))
            ),
        }
    )
    validated_df = schema.validate(
        pd.DataFrame(
            {
                ("zero", "foo"): [0.1, 0.2, 0.7, 0.3],
                ("zero", "bar"): ["a", "b", "c", "d"],
                ("one", "foo"): [1, 6, 4, 7],
                ("one", "bar"): pd.to_datetime(["2019/01/01"] * 4),
            }
        )
    )
    assert isinstance(validated_df, pd.DataFrame)
Example #6
def test_check_groupby_multiple_columns():
    schema = DataFrameSchema({
        "col1":
        Column(
            Int,
            [
                Check(
                    lambda s: s[("bar", True)].sum() == 16,  # 7 + 9
                    groupby=["col2", "col3"]),
            ]),
        "col2":
        Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
        "col3":
        Column(Bool),
    })

    df_pass = pd.DataFrame({
        "col1": [7, 8, 9, 11, 12, 13],
        "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
        "col3": [True, False, True, False, True, False],
    })

    df = schema.validate(df_pass)
    assert isinstance(df, pd.DataFrame)
    assert len(df.columns) == 3
    assert set(df.columns) == {"col1", "col2", "col3"}
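
For intuition, a plain-pandas sketch (an assumption about the equivalent grouping, not pandera internals) of the grouped object the check above indexes with `("bar", True)`:

import pandas as pd

df = pd.DataFrame({
    "col1": [7, 8, 9, 11, 12, 13],
    "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
    "col3": [True, False, True, False, True, False],
})
grouped = dict(list(df.groupby(["col2", "col3"])["col1"]))
print(grouped[("bar", True)].sum())  # 16, i.e. 7 + 9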
Example #7
def test_check_equality_operators():
    """Test the usage of == between a Check and an entirely different Check."""
    check = Check(lambda g: g["foo"]["col1"].iat[0] == 1, groupby="col3")

    not_equal_check = Check(lambda x: x.isna().sum() == 0)
    assert check == copy.deepcopy(check)
    assert check != not_equal_check
Example #8
def test_equality_operators_functional_equivalence():
    """Test the usage of == for Checks where the Check callable object has
    the same implementation."""
    main_check = Check(lambda g: g["foo"]["col1"].iat[0] == 1, groupby="col3")
    same_check = Check(lambda h: h["foo"]["col1"].iat[0] == 1, groupby="col3")

    assert main_check == same_check
Example #9
def test_column_regex_non_str_types() -> None:
    """Check that column name regex matching excludes non-string types."""
    data = pd.DataFrame(
        {
            1: [1, 2, 3],
            2.2: [1, 2, 3],
            pd.Timestamp("2018/01/01"): [1, 2, 3],
            "foo_1": [1, 2, 3],
            "foo_2": [1, 2, 3],
            "foo_3": [1, 2, 3],
        }
    )
    schema = DataFrameSchema(
        columns={
            "foo_": Column(Int, Check.gt(0), regex=True),
            r"\d+": Column(Int, Check.gt(0), regex=True),
            r"\d+\.\d+": Column(Int, Check.gt(0), regex=True),
            "2018-01-01": Column(Int, Check.gt(0), regex=True),
        },
    )
    assert isinstance(schema.validate(data), pd.DataFrame)

    # test MultiIndex column case
    data = pd.DataFrame(
        {
            (1, 1): [1, 2, 3],
            (2.2, 4.5): [1, 2, 3],
            ("foo", "bar"): [1, 2, 3],
        }
    )
    schema = DataFrameSchema(
        columns={("foo_*", "bar_*"): Column(Int, regex=True)},
    )
    schema.validate(data)
Example #10
def test_check_groupby_multiple_columns():
    """Tests uses of groupby to specify dependencies between one column and a
    number of other columns, including error handling."""
    schema = DataFrameSchema({
        "col1":
        Column(
            Int,
            [
                Check(
                    lambda s: s[("bar", True)].sum() == 16,  # 7 + 9
                    groupby=["col2", "col3"]),
            ]),
        "col2":
        Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
        "col3":
        Column(Bool),
    })

    df_pass = pd.DataFrame({
        "col1": [7, 8, 9, 11, 12, 13],
        "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
        "col3": [True, False, True, False, True, False],
    })

    df = schema.validate(df_pass)
    assert isinstance(df, pd.DataFrame)
    assert len(df.columns) == 3
    assert set(df.columns) == {"col1", "col2", "col3"}
Example #11
def test_series_schema():
    """Tests that a SeriesSchema Check behaves as expected for integers and
    strings. Tests error cases for types, duplicates, name errors, and issues
    around float and integer handling of nulls"""
    int_schema = SeriesSchema(
        Int, Check(lambda x: 0 <= x <= 100, element_wise=True))
    assert isinstance(int_schema.validate(pd.Series([0, 30, 50, 100])),
                      pd.Series)

    str_schema = SeriesSchema(String,
                              Check(lambda s: s.isin(["foo", "bar", "baz"])),
                              nullable=True,
                              coerce=True)
    assert isinstance(
        str_schema.validate(pd.Series(["foo", "bar", "baz", None])), pd.Series)
    assert isinstance(
        str_schema.validate(pd.Series(["foo", "bar", "baz", np.nan])),
        pd.Series)

    # error cases
    for data in [-1, 101, 50.1, "foo"]:
        with pytest.raises(errors.SchemaError):
            int_schema.validate(pd.Series([data]))

    for data in [-1, {"a": 1}, -1.0]:
        with pytest.raises(TypeError):
            int_schema.validate(TypeError)

    non_duplicate_schema = SeriesSchema(Int, allow_duplicates=False)
    with pytest.raises(errors.SchemaError):
        non_duplicate_schema.validate(pd.Series([0, 1, 2, 3, 4, 1]))

    # when series name doesn't match schema
    named_schema = SeriesSchema(Int, name="my_series")
    with pytest.raises(errors.SchemaError, match=r"^Expected .+ to have name"):
        named_schema.validate(pd.Series(range(5), name="your_series"))

    # when series floats are declared to be integer
    with pytest.raises(
            errors.SchemaError,
            match=r"^after dropping null values, expected values in series"):
        SeriesSchema(Int,
                     nullable=True).validate(pd.Series([1.1, 2.3, 5.5,
                                                        np.nan]))

    # when series contains null values when schema is not nullable
    with pytest.raises(errors.SchemaError,
                       match=r"^non-nullable series .+ contains null values"):
        SeriesSchema(Float, nullable=False).validate(
            pd.Series([1.1, 2.3, 5.5, np.nan]))

    # when series contains null values when schema is not nullable in addition
    # to having the wrong data type
    with pytest.raises(
            errors.SchemaError,
            match=(r"^expected series '.+' to have type .+, got .+ and "
                   "non-nullable series contains null values")):
        SeriesSchema(Int, nullable=False).validate(
            pd.Series([1.1, 2.3, 5.5, np.nan]))
Example #12
def test_check_groupby():
    """Tests uses of groupby to specify dependencies between one column and a
    single other column, including error handling."""
    schema = DataFrameSchema(
        columns={
            "col1":
            Column(Int, [
                Check(lambda s: s["foo"] > 10, groupby="col2"),
                Check(lambda s: s["bar"] < 10, groupby=["col2"]),
                Check(lambda s: s["foo"] > 10,
                      groupby=lambda df: df.groupby("col2")),
                Check(lambda s: s["bar"] < 10,
                      groupby=lambda df: df.groupby("col2"))
            ]),
            "col2":
            Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
        },
        index=Index(Int, name="data_id"),
    )

    df_pass = pd.DataFrame(
        data={
            "col1": [7, 8, 9, 11, 12, 13],
            "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
        },
        index=pd.Series([1, 2, 3, 4, 5, 6], name="data_id"),
    )

    df = schema.validate(df_pass)
    assert isinstance(df, pd.DataFrame)
    assert len(df.columns) == 2
    assert set(df.columns) == {"col1", "col2"}

    # raise errors.SchemaError when Check fails
    df_fail_on_bar = pd.DataFrame(
        data={
            "col1": [7, 8, 20, 11, 12, 13],
            "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
        },
        index=pd.Series([1, 2, 3, 4, 5, 6], name="data_id"),
    )
    df_fail_on_foo = pd.DataFrame(
        data={
            "col1": [7, 8, 9, 11, 1, 13],
            "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
        },
        index=pd.Series([1, 2, 3, 4, 5, 6], name="data_id"),
    )
    # raise errors.SchemaError when groupby column doesn't exist
    df_fail_no_column = pd.DataFrame(
        data={
            "col1": [7, 8, 20, 11, 12, 13],
        },
        index=pd.Series([1, 2, 3, 4, 5, 6], name="data_id"),
    )

    for df in [df_fail_on_bar, df_fail_on_foo, df_fail_no_column]:
        with pytest.raises(errors.SchemaError):
            schema.validate(df)
Example #13
 def init_schema_element_wise():
     DataFrameSchema({
         "col1": Column(Int, [
             Check(lambda s: s["foo"] > 10,
                   element_wise=True,
                   groupby=["col2"]),
         ]),
         "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
     })
Example #14
def plot_roc_auc(y_true, y_pred, label, ax=None):
    """Compute ROC curve points, validate them with a pandera schema, and plot the curve."""
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    roc_curve_df = pd.DataFrame({"fpr": fpr, "tpr": tpr}).pipe(
        pa.DataFrameSchema({
            "fpr": Column(pa.Float, Check.in_range(0, 1)),
            "tpr": Column(pa.Float, Check.in_range(0, 1)),
        })
    )
    return roc_curve_df.plot.line(x="fpr", y="tpr", label=label, ax=ax)
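
A hypothetical usage sketch (the arrays below are illustrative, not from the source); it assumes `sklearn.metrics.roc_curve`, `pandera as pa`, and a matplotlib backend are importable as in the function above:

import numpy as np

y_true = np.array([0, 0, 1, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8])
ax = plot_roc_auc(y_true, y_score, label="example model")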
Example #15
    def __init__(self, config):
        self.raw_path = config['raw_path']
        self.raw_data = config['raw_data']
        self.random_state = config['random_state']
        self.sample_size = config['sample_size']
        self.columns = config['columns']

        # Define checks
        self.check_ge_min = Check(lambda s: s >= 0)
        self.check_le_max = Check(lambda s: s <= max(s))
Example #16
def test_add_and_remove_columns():
    """Check that adding and removing columns works as expected and doesn't
    modify the original underlying DataFrameSchema."""
    schema1 = DataFrameSchema(
        {
            "col1": Column(Int, Check(lambda s: s >= 0)),
        },
        strict=True,
    )

    schema1_exact_copy = copy.deepcopy(schema1)

    # test that add_columns doesn't modify schema1 after add_columns:
    schema2 = schema1.add_columns(
        {
            "col2": Column(String, Check(lambda x: x <= 0)),
            "col3": Column(Object, Check(lambda x: x == 0)),
        }
    )

    schema2_exact_copy = copy.deepcopy(schema2)

    assert schema1 == schema1_exact_copy

    # test that add_columns changed schema1 into schema2:
    expected_schema_2 = DataFrameSchema(
        {
            "col1": Column(Int, Check(lambda s: s >= 0)),
            "col2": Column(String, Check(lambda x: x <= 0)),
            "col3": Column(Object, Check(lambda x: x == 0)),
        },
        strict=True,
    )

    assert schema2 == expected_schema_2

    # test that remove_columns doesn't modify schema2:
    schema3 = schema2.remove_columns(["col2"])

    assert schema2 == schema2_exact_copy

    # test that remove_columns has removed the changes as expected:
    expected_schema_3 = DataFrameSchema(
        {
            "col1": Column(Int, Check(lambda s: s >= 0)),
            "col3": Column(Object, Check(lambda x: x == 0)),
        },
        strict=True,
    )

    assert schema3 == expected_schema_3

    # test that remove_columns can remove two columns:
    schema4 = schema2.remove_columns(["col2", "col3"])

    expected_schema_4 = DataFrameSchema(
        {"col1": Column(Int, Check(lambda s: s >= 0))}, strict=True
    )

    assert schema4 == expected_schema_4 == schema1
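
A small follow-on sketch using only the methods exercised above (the column names and dtypes here are illustrative): adding and then removing the same column should round-trip to an equal schema.

from pandera import Column, DataFrameSchema

schema = DataFrameSchema({"col1": Column(int)}, strict=True)
assert schema.add_columns({"colX": Column(float)}).remove_columns(["colX"]) == schema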
Example #17
def test_series_schema_multiple_validators():
    schema = SeriesSchema(PandasDtype.Int, [
        Check(lambda x: 0 <= x <= 50, element_wise=True),
        Check(lambda s: (s == 21).any(), element_wise=False)
    ])
    validated_series = schema.validate(pd.Series([1, 5, 21, 50]))
    assert isinstance(validated_series, pd.Series)

    # raise error if any of the validators fails
    with pytest.raises(SchemaError):
        schema.validate(pd.Series([1, 5, 20, 50]))
Example #18
def test_index_schema():
    schema = DataFrameSchema(
        columns={},
        index=Index(Int, [
            Check(lambda x: 1 <= x <= 11, element_wise=True),
            Check(lambda index: index.mean() > 1)
        ]))
    df = pd.DataFrame(index=range(1, 11), dtype="int64")
    assert isinstance(schema.validate(df), pd.DataFrame)

    with pytest.raises(errors.SchemaError):
        schema.validate(pd.DataFrame(index=range(1, 20)))
Example #19
def test_index_schema():
    """Tests that when specifying a DataFrameSchema Index pandera validates
    and errors appropriately."""
    schema = DataFrameSchema(index=Index(Int, [
        Check(lambda x: 1 <= x <= 11, element_wise=True),
        Check(lambda index: index.mean() > 1)
    ]))
    df = pd.DataFrame(index=range(1, 11), dtype="int64")
    assert isinstance(schema.validate(df), pd.DataFrame)

    with pytest.raises(errors.SchemaError):
        schema.validate(pd.DataFrame(index=range(1, 20)))
Example #20
def test_check_groups():
    """Tests uses of groupby and groups (for values within columns)."""
    schema = DataFrameSchema({
        "col1":
        Column(Int, [
            Check(lambda s: s["foo"] > 10, groupby="col2", groups=["foo"]),
            Check(lambda s: s["foo"] > 10, groupby="col2", groups="foo"),
        ]),
        "col2":
        Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
    })

    df = pd.DataFrame({
        "col1": [7, 8, 9, 11, 12, 13],
        "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
    })

    validated_df = schema.validate(df)
    assert isinstance(validated_df, pd.DataFrame)
    assert len(validated_df.columns) == 2
    assert set(validated_df.columns) == {"col1", "col2"}

    # raise KeyError when groups does not include a particular group name
    schema_fail_key_error = DataFrameSchema({
        "col1":
        Column(Int, [
            Check(lambda s: s["bar"] > 10, groupby="col2", groups="foo"),
        ]),
        "col2":
        Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
    })
    with pytest.raises(KeyError, match="^'bar'"):
        schema_fail_key_error.validate(df)

    # raise KeyError when the group does not exist in the groupby column when
    # referenced in the Check function
    schema_fail_nonexistent_key_in_fn = DataFrameSchema({
        "col1":
        Column(Int, [
            Check(lambda s: s["baz"] > 10, groupby="col2", groups=["foo"]),
        ]),
        "col2":
        Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
    })
    with pytest.raises(KeyError, match="^'baz'"):
        schema_fail_nonexistent_key_in_fn.validate(df)

    # raise KeyError when the group does not exist in the groups argument.
    schema_fail_nonexistent_key_in_groups = DataFrameSchema({
        "col1":
        Column(Int, [
            Check(lambda s: s["foo"] > 10, groupby="col2", groups=["baz"]),
        ]),
        "col2":
        Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
    })
    with pytest.raises(KeyError):
        schema_fail_nonexistent_key_in_groups.validate(df)
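
For intuition, a plain-pandas sketch (an assumption about what `groups="foo"` exposes to the check, not pandera internals): the check only sees the `col1` values where `col2 == "foo"`.

import pandas as pd

df = pd.DataFrame({
    "col1": [7, 8, 9, 11, 12, 13],
    "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
})
foo_group = df.loc[df["col2"] == "foo", "col1"]
assert (foo_group > 10).all()  # 11, 12, 13 all exceed 10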
Example #21
def test_raise_warning_series():
    """Test that checks with raise_warning=True raise a warning."""
    data = pd.Series([-1, -2, -3])
    error_schema = SeriesSchema(checks=Check(lambda s: s > 0))
    warning_schema = SeriesSchema(
        checks=Check(lambda s: s > 0, raise_warning=True))

    with pytest.raises(errors.SchemaError):
        error_schema(data)

    with pytest.warns(UserWarning):
        warning_schema(data)
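
A minimal sketch (an addition, not part of the test) of inspecting the same warning with the standard library instead of `pytest.warns`, reusing `data` and `warning_schema` from above:

import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    warning_schema(data)
assert any(issubclass(w.category, UserWarning) for w in caught)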
Example #22
 def setup(self):
     self.schema = SeriesSchema(
             String,
             checks=[
                 Check(lambda s: s.str.startswith("foo")),
                 Check(lambda s: s.str.endswith("bar")),
                 Check(lambda x: len(x) > 3, element_wise=True)
                 ],
             nullable=False,
             allow_duplicates=True,
             name="my_series")
     self.series = pd.Series(["foobar", "foobar", "foobar"],
                             name="my_series")
Example #23
def _multi_check_schema() -> DataFrameSchema:
    """Schema with multiple positivity checks on column `a`"""
    return DataFrameSchema(
        {
            "a": Column(
                int,
                [
                    Check.isin([0, 1]),
                    Check(lambda x: x >= 0),
                ],
            ),
        }
    )
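
An illustrative usage sketch for the fixture above (the example frames are assumptions, not from the source):

import pandas as pd

schema = _multi_check_schema()
schema.validate(pd.DataFrame({"a": [0, 1, 1]}))  # satisfies both checks
# pd.DataFrame({"a": [2, 3]}) would pass `x >= 0` but fail Check.isin([0, 1])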
Example #24
def test_dataframe_schema_check_function_types(check_function, should_fail):
    schema = DataFrameSchema({
        "a":
        Column(Int, Check(fn=check_function, element_wise=False)),
        "b":
        Column(Float, Check(fn=check_function, element_wise=False))
    })
    df = pd.DataFrame({"a": [1, 2, 3], "b": [1.1, 2.5, 9.9]})
    if should_fail:
        with pytest.raises(errors.SchemaError):
            schema.validate(df)
    else:
        schema.validate(df)
Example #25
def test_series_schema_multiple_validators():
    """Tests how multiple Checks on a Series Schema are handled both
    successfully and when errors are expected."""
    schema = SeriesSchema(
        Int, [
            Check(lambda x: 0 <= x <= 50, element_wise=True),
            Check(lambda s: (s == 21).any())])
    validated_series = schema.validate(pd.Series([1, 5, 21, 50]))
    assert isinstance(validated_series, pd.Series)

    # raise error if any of the validators fails
    with pytest.raises(errors.SchemaError):
        schema.validate(pd.Series([1, 5, 20, 50]))
Example #26
def test_dataframe_schema_check_function_types(check_function, should_fail):
    """Tests a DataFrameSchema against a variety of Check conditions."""
    schema = DataFrameSchema({
        "a":
        Column(Int, Check(check_function, element_wise=False)),
        "b":
        Column(Float, Check(check_function, element_wise=False)),
    })
    df = pd.DataFrame({"a": [1, 2, 3], "b": [1.1, 2.5, 9.9]})
    if should_fail:
        with pytest.raises(errors.SchemaError):
            schema.validate(df)
    else:
        schema.validate(df)
Example #27
def test_dataframe_schema_check():
    """Test that DataFrameSchema-level Checks work properly."""
    data = pd.DataFrame([range(10) for _ in range(10)])

    schema_check_return_bool = DataFrameSchema(
        checks=Check(lambda df: (df < 10).all()))
    assert isinstance(schema_check_return_bool.validate(data), pd.DataFrame)

    schema_check_return_series = DataFrameSchema(
        checks=Check(lambda df: df[0] < 10))
    assert isinstance(schema_check_return_series.validate(data), pd.DataFrame)

    schema_check_return_df = DataFrameSchema(checks=Check(lambda df: df < 10))
    assert isinstance(schema_check_return_df.validate(data), pd.DataFrame)
Example #28
def test_dataframe_schema():
    """Tests the Checking of a DataFrame that has a wide variety of types and
    conditions. Tests include: when the Schema works, when a column is dropped,
    and when a column's values change type.
    """
    schema = DataFrameSchema({
        "a":
        Column(Int, Check(lambda x: x > 0, element_wise=True)),
        "b":
        Column(Float, Check(lambda x: 0 <= x <= 10, element_wise=True)),
        "c":
        Column(String, Check(lambda x: set(x) == {"x", "y", "z"})),
        "d":
        Column(Bool, Check(lambda x: x.mean() > 0.5)),
        "e":
        Column(Category, Check(lambda x: set(x) == {"c1", "c2", "c3"})),
        "f":
        Column(Object, Check(lambda x: x.isin([(1, ), (2, ), (3, )]))),
        "g":
        Column(
            DateTime,
            Check(lambda x: x >= pd.Timestamp("2015-01-01"),
                  element_wise=True)),
        "i":
        Column(
            Timedelta,
            Check(lambda x: x < pd.Timedelta(10, unit="D"), element_wise=True))
    })
    df = pd.DataFrame({
        "a": [1, 2, 3],
        "b": [1.1, 2.5, 9.9],
        "c": ["z", "y", "x"],
        "d": [True, True, False],
        "e":
        pd.Series(["c2", "c1", "c3"], dtype="category"),
        "f": [(3, ), (2, ), (1, )],
        "g": [
            pd.Timestamp("2015-02-01"),
            pd.Timestamp("2015-02-02"),
            pd.Timestamp("2015-02-03")
        ],
        "i": [
            pd.Timedelta(1, unit="D"),
            pd.Timedelta(5, unit="D"),
            pd.Timedelta(9, unit="D")
        ]
    })
    assert isinstance(schema.validate(df), pd.DataFrame)

    # error case
    with pytest.raises(errors.SchemaError):
        schema.validate(df.drop("a", axis=1))

    with pytest.raises(errors.SchemaError):
        schema.validate(df.assign(a=[-1, -2, -1]))

    # check that a SchemaError is raised if column 'a' is converted to float
    # while the schema expects an int
    with pytest.raises(errors.SchemaError):
        schema.validate(df.assign(a=[1.7, 2.3, 3.1]))
Example #29
 def init_schema_no_groupby_column():
     DataFrameSchema({
         "col1":
         Column(Int, [
             Check(lambda s: s["foo"] > 10, groupby=["col2"]),
         ]),
     })
Example #30
def test_column_regex_matching(column_name_regex, expected_matches, error):
    """
    Column regex pattern matching should yield correct matches and raise
    expected errors.
    """
    columns = pd.MultiIndex.from_tuples((
        ("foo_1", "biz_1"),
        ("foo_2", "baz_1"),
        ("foo_3", "baz_2"),
        ("bar_1", "biz_2"),
        ("bar_2", "biz_3"),
        ("bar_3", "biz_3"),
    ))

    column_schema = Column(
        Int,
        Check(lambda s: s >= 0),
        name=column_name_regex,
        regex=True,
    )
    if error is not None:
        with pytest.raises(error):
            column_schema.get_regex_columns(columns)
    else:
        matched_columns = column_schema.get_regex_columns(columns)
        assert expected_matches == matched_columns.tolist()
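
The `pytest.mark.parametrize` decorator for this test is not shown above; as a purely hypothetical illustration of the kind of parameters it expects (not the actual values from the source):

# @pytest.mark.parametrize("column_name_regex, expected_matches, error", [
#     # a two-level regex that should match two of the columns above:
#     (("foo_*", "baz_*"), [("foo_2", "baz_1"), ("foo_3", "baz_2")], None),
# ])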