Beispiel #1
0
def tests_multi_index_subindex_coerce():
    """MultiIndex component should override sub indexes.

    A four-level integer MultiIndex is validated against String sub-indexes
    whose ``coerce`` flags alternate, first with ``coerce=False`` on the
    DataFrameSchema and then with ``coerce=True``.
    """
    indexes = [
        Index(String, coerce=True),
        Index(String, coerce=False),
        Index(String, coerce=True),
        Index(String, coerce=False),
    ]

    data = pd.DataFrame(index=pd.MultiIndex.from_arrays([[1, 2, 3, 4]] * 4))

    schema = DataFrameSchema(index=MultiIndex(indexes), coerce=False)
    validated_df = schema(data)
    for level_i in range(validated_df.index.nlevels):
        level_dtype = validated_df.index.get_level_values(level_i).dtype
        if indexes[level_i].coerce:
            assert level_dtype == indexes[level_i].dtype
        else:
            # dtype should be string representation of pandas strings
            assert level_dtype == "object"

    # coerce=True in MultiIndex should override subindex coerce setting
    schema_override = DataFrameSchema(index=MultiIndex(indexes), coerce=True)
    validated_df_override = schema_override(data)
    # Iterate over the levels of the frame actually being asserted on
    # (the original looped over ``validated_df`` from the previous section).
    for level_i in range(validated_df_override.index.nlevels):
        override_dtype = (
            validated_df_override.index.get_level_values(level_i).dtype)
        assert override_dtype == indexes[level_i].dtype
def tests_multi_index_subindex_coerce():
    """MultiIndex component should override sub indexes.

    Covers two situations: ``coerce=True`` set on either the MultiIndex or
    the DataFrameSchema, and no coercion at the MultiIndex/schema level.
    """
    indexes = [
        Index(String, coerce=flag) for flag in (True, False, True, False)
    ]

    data = pd.DataFrame(index=pd.MultiIndex.from_arrays([[1, 2, 3, 4]] * 4))

    # coerce=True in MultiIndex and DataFrameSchema should override subindex
    # coerce setting
    overriding_schemas = [
        DataFrameSchema(index=MultiIndex(indexes, coerce=True)),
        DataFrameSchema(index=MultiIndex(indexes), coerce=True),
    ]
    for overriding_schema in overriding_schemas:
        coerced_df = overriding_schema(data)
        for level_i in range(coerced_df.index.nlevels):
            level_dtype = coerced_df.index.get_level_values(level_i).dtype
            assert level_dtype == "object"

    # coerce=False at the MultiIndex level should result in two type errors
    non_coercing_schema = DataFrameSchema(index=MultiIndex(indexes))
    expected_message = "A total of 2 schema errors were found"
    with pytest.raises(errors.SchemaErrors, match=expected_message):
        non_coercing_schema(data, lazy=True)
Beispiel #3
0
def _deserialize_schema(serialized_schema):
    """Rebuild a ``DataFrameSchema`` from its serialized mapping.

    :param serialized_schema: mapping that must provide the keys
        ``"columns"``, ``"index"``, ``"coerce"`` and ``"strict"``.
        ``"columns"`` is either None or a mapping of column name to column
        statistics; ``"index"`` is either None or an iterable of index
        component statistics.
    :returns: the reconstructed ``DataFrameSchema``.
    :raises KeyError: if one of the required keys is missing.
    """
    # pylint: disable-all
    from pandera import DataFrameSchema, Column, Index, MultiIndex

    columns, index = None, None
    if serialized_schema["columns"] is not None:
        columns = {
            col_name: Column(**_deserialize_component_stats(column_stats))
            for col_name, column_stats in serialized_schema["columns"].items()
        }

    if serialized_schema["index"] is not None:
        index = [
            _deserialize_component_stats(index_component)
            for index_component in serialized_schema["index"]
        ]

    if index is None:
        pass
    elif len(index) == 1:
        # a single component maps to a plain Index
        index = Index(**index[0])
    else:
        index = MultiIndex(
            indexes=[Index(**index_properties) for index_properties in index])

    # Pass ``columns`` through as-is: re-iterating it with ``.items()``
    # crashed with AttributeError whenever the serialized schema had no
    # columns (``columns`` is None in that case).
    return DataFrameSchema(
        columns=columns,
        index=index,
        coerce=serialized_schema["coerce"],
        strict=serialized_schema["strict"],
    )
def test_multi_index_index():
    """Validate a DataFrame against a schema with a two-level MultiIndex.

    A frame whose index satisfies both level checks must validate; a frame
    with a negative value in ``index0`` must raise ``SchemaError``.
    """
    level_names = ["index0", "index1"]
    index_schema = MultiIndex(indexes=[
        Index(Int, Check(lambda s: (s < 5) & (s >= 0)), name="index0"),
        Index(String, Check(lambda s: s.isin(["foo", "bar"])), name="index1"),
    ])
    schema = DataFrameSchema(
        columns={
            "column1": Column(Float, Check(lambda s: s > 0)),
            "column2": Column(Float, Check(lambda s: s > 0)),
        },
        index=index_schema,
    )

    valid_index = pd.MultiIndex.from_arrays(
        [[0, 1, 2, 3, 4], ["foo", "bar", "foo", "bar", "foo"]],
        names=level_names,
    )
    df = pd.DataFrame(
        data={
            "column1": [0.1, 0.5, 123.1, 10.6, 22.31],
            "column2": [0.1, 0.5, 123.1, 10.6, 22.31],
        },
        index=valid_index,
    )

    # success case
    assert isinstance(schema.validate(df), pd.DataFrame)

    # failure case: -1 violates the ``s >= 0`` check on index0
    df_fail = df.copy()
    df_fail.index = pd.MultiIndex.from_arrays(
        [[-1, 1, 2, 3, 4], ["foo", "bar", "foo", "bar", "foo"]],
        names=level_names,
    )
    with pytest.raises(errors.SchemaError):
        schema.validate(df_fail)
Beispiel #5
0
def test_series_schema_with_index(coerce):
    """Test SeriesSchema with Index and MultiIndex components.

    :param coerce: coerce flag forwarded to every Index component.
    """
    # single-level index
    flat_schema = SeriesSchema(pandas_dtype=Int, index=Index(Int, coerce=coerce))
    flat_result = flat_schema(pd.Series([1, 2, 3], index=[1, 2, 3]))
    assert isinstance(flat_result, pd.Series)

    # two-level index
    level_schemas = [
        Index(Int, coerce=coerce),
        Index(String, coerce=coerce),
    ]
    nested_schema = SeriesSchema(
        pandas_dtype=Int,
        index=MultiIndex(level_schemas),
    )
    multi_index = pd.MultiIndex.from_arrays(
        [[0, 1, 2], ["foo", "bar", "foo"]],
    )
    nested_result = nested_schema(pd.Series([1, 2, 3], index=multi_index))
    assert isinstance(nested_result, pd.Series)
    # validation must leave the index entries unchanged
    assert (nested_result.index == multi_index).all()
Beispiel #6
0
def schema_multiindex():
    """Fixture for schema with MultiIndex.

    :returns: a DataFrameSchema with an Int column ``col1``, a Float column
        ``col2`` and a two-level String MultiIndex (``ind0``, ``ind1``).
    """
    index_levels = [
        Index(pandas_dtype=String, name="ind0"),
        Index(pandas_dtype=String, name="ind1"),
    ]
    column_schemas = {
        "col1": Column(pandas_dtype=Int),
        "col2": Column(pandas_dtype=Float),
    }
    return DataFrameSchema(
        columns=column_schemas,
        index=MultiIndex(index_levels),
    )
Beispiel #7
0
def _deserialize_schema(serialized_schema):
    """Rebuild a ``DataFrameSchema`` from its serialized mapping.

    :param serialized_schema: mapping with the optional keys ``"columns"``,
        ``"index"``, ``"checks"``, ``"coerce"`` and ``"strict"``. Any falsy
        value is treated as an empty mapping.
    :returns: the reconstructed ``DataFrameSchema``.
    :raises pandera.errors.SchemaDefinitionError: if a truthy
        ``serialized_schema`` is not a mapping.
    """
    # pylint: disable=import-outside-toplevel
    from pandera import Index, MultiIndex

    # GH#475
    if not serialized_schema:
        serialized_schema = {}

    if not isinstance(serialized_schema, Mapping):
        raise pandera.errors.SchemaDefinitionError(
            "Schema representation must be a mapping.")

    raw_columns = serialized_schema.get("columns")
    raw_index = serialized_schema.get("index")
    raw_checks = serialized_schema.get("checks")

    columns = None
    if raw_columns is not None:
        columns = {}
        for col_name, column_stats in raw_columns.items():
            columns[col_name] = Column(
                **_deserialize_component_stats(column_stats))

    index_stats = None
    if raw_index is not None:
        index_stats = [
            _deserialize_component_stats(component) for component in raw_index
        ]

    checks = None
    if raw_checks is not None:
        # handles unregistered checks by raising AttributeErrors from getattr
        checks = []
        for check_name, check_stats in raw_checks.items():
            check_fn = getattr(Check, check_name)
            checks.append(_deserialize_check_stats(check_fn, check_stats))

    # exactly one component becomes an Index, any other count a MultiIndex
    if index_stats is None:
        index = None
    elif len(index_stats) == 1:
        index = Index(**index_stats[0])
    else:
        index = MultiIndex(
            indexes=[Index(**properties) for properties in index_stats])

    return DataFrameSchema(
        columns=columns,
        checks=checks,
        index=index,
        coerce=serialized_schema.get("coerce", False),
        strict=serialized_schema.get("strict", False),
    )
def test_schema_component_equality_operators():
    """Test the usage of == for Column, Index and MultiIndex.

    Each component must equal a deep copy of itself and differ from an
    unrelated DataFrameSchema.
    """
    column = Column(Int, Check(lambda s: s >= 0))
    index = Index(Int, [Check(lambda x: 1 <= x <= 11, element_wise=True)])
    multi_index = MultiIndex(indexes=[
        Index(Int, Check(lambda s: (s < 5) & (s >= 0)), name="index0"),
        Index(String, Check(lambda s: s.isin(["foo", "bar"])), name="index1"),
    ])
    not_equal_schema = DataFrameSchema(
        {"col1": Column(Int, Check(lambda s: s >= 0))})

    for component in (column, index, multi_index):
        assert component == copy.deepcopy(component)
        assert component != not_equal_schema
def test_multi_index_schema_coerce():
    """Test that multi index can be type-coerced.

    Checks that every level of the validated index reports the dtype
    declared by the matching Index component.
    """
    indexes = [Index(Float), Index(Int), Index(String)]
    schema = DataFrameSchema(index=MultiIndex(indexes=indexes))

    level_values = [
        [1.0, 2.1, 3.5, 4.8],
        [5, 6, 7, 8],
        ["9", "10", "11", "12"],
    ]
    df = pd.DataFrame(index=pd.MultiIndex.from_arrays(level_values))

    validated_index = schema(df).index
    for level_i in range(validated_index.nlevels):
        actual_dtype = validated_index.get_level_values(level_i).dtype
        assert actual_dtype == indexes[level_i].dtype
def test_multi_index_index():
    """Tests that multi-index Indexes within DataFrames validate correctly.

    Also checks the ``names`` attribute and the repr of the MultiIndex
    schema component, and that an out-of-range index value is rejected.
    """
    level_names = ["index0", "index1"]
    index_schema = MultiIndex(indexes=[
        Index(Int, Check(lambda s: (s < 5) & (s >= 0)), name="index0"),
        Index(String, Check(lambda s: s.isin(["foo", "bar"])), name="index1"),
    ])
    schema = DataFrameSchema(
        columns={
            "column1": Column(Float, Check(lambda s: s > 0)),
            "column2": Column(Float, Check(lambda s: s > 0)),
        },
        index=index_schema,
    )

    valid_index = pd.MultiIndex.from_arrays(
        [[0, 1, 2, 3, 4], ["foo", "bar", "foo", "bar", "foo"]],
        names=level_names,
    )
    df = pd.DataFrame(
        data={
            "column1": [0.1, 0.5, 123.1, 10.6, 22.31],
            "column2": [0.1, 0.5, 123.1, 10.6, 22.31],
        },
        index=valid_index,
    )

    # success case
    assert isinstance(schema.validate(df), pd.DataFrame)
    assert schema.index.names == ["index0", "index1"]
    expected_repr = f"<Schema MultiIndex: '{schema.index.names}'>"
    assert repr(schema.index) == expected_repr

    # failure case: -1 violates the ``s >= 0`` check on index0
    df_fail = df.copy()
    df_fail.index = pd.MultiIndex.from_arrays(
        [[-1, 1, 2, 3, 4], ["foo", "bar", "foo", "bar", "foo"]],
        names=level_names,
    )
    with pytest.raises(errors.SchemaError):
        schema.validate(df_fail)
Beispiel #11
0
def _deserialize_schema(serialized_schema):
    """Rebuild a ``DataFrameSchema`` from its serialized mapping.

    :param serialized_schema: mapping that must provide the keys
        ``"columns"``, ``"index"``, ``"checks"``, ``"coerce"`` and
        ``"strict"``; the first three may be None.
    :returns: the reconstructed ``DataFrameSchema``.
    """
    # pylint: disable=import-outside-toplevel
    from pandera import Check, Column, DataFrameSchema, Index, MultiIndex

    columns = None
    raw_columns = serialized_schema["columns"]
    if raw_columns is not None:
        columns = {}
        for col_name, column_stats in raw_columns.items():
            columns[col_name] = Column(
                **_deserialize_component_stats(column_stats))

    index_stats = None
    raw_index = serialized_schema["index"]
    if raw_index is not None:
        index_stats = [
            _deserialize_component_stats(component) for component in raw_index
        ]

    checks = None
    raw_checks = serialized_schema["checks"]
    if raw_checks is not None:
        # handles unregistered checks by raising AttributeErrors from getattr
        checks = []
        for check_name, check_stats in raw_checks.items():
            check_fn = getattr(Check, check_name)
            checks.append(_deserialize_check_stats(check_fn, check_stats))

    # exactly one component becomes an Index, any other count a MultiIndex
    if index_stats is None:
        index = None
    elif len(index_stats) == 1:
        index = Index(**index_stats[0])
    else:
        index = MultiIndex(
            indexes=[Index(**properties) for properties in index_stats])

    return DataFrameSchema(
        columns=columns,
        checks=checks,
        index=index,
        coerce=serialized_schema["coerce"],
        strict=serialized_schema["strict"],
    )
Beispiel #12
0
# Source-code template for a ``Column(...)`` constructor call. Each ``{name}``
# placeholder stands for one keyword argument value.
# NOTE(review): presumably filled in with ``str.format`` by code outside this
# view -- confirm against the caller.
COLUMN_TEMPLATE = """
Column(
    dtype={dtype},
    checks={checks},
    nullable={nullable},
    allow_duplicates={allow_duplicates},
    coerce={coerce},
    required={required},
    regex={regex},
)
"""

# Single-line template for an ``Index(...)`` constructor call; the two
# adjacent string literals are concatenated into one string.
INDEX_TEMPLATE = ("Index(dtype={dtype},checks={checks},"
                  "nullable={nullable},coerce={coerce},name={name})")

# Template for a ``MultiIndex(...)`` call; ``{indexes}`` is placed inside the
# list brackets of the ``indexes`` argument.
MULTIINDEX_TEMPLATE = """
MultiIndex(indexes=[{indexes}])
"""


def _format_checks(checks_dict):
    if checks_dict is None:
        return "None"

    checks = []
    for check_name, check_kwargs in checks_dict.items():
        if check_kwargs is None:
            warnings.warn(f"Check {check_name} cannot be serialized. "
                          "This check will be ignored")
        else:
            args = ", ".join(f"{k}={v.__repr__()}"
def test_multiindex_incorrect_input(indexes):
    """Passing in non-Index object raises SchemaInitError.

    A ``TypeError`` is accepted as well.
    """
    accepted_errors = (errors.SchemaInitError, TypeError)
    with pytest.raises(accepted_errors):
        MultiIndex(indexes)
def test_multiindex_unordered_init_exception(indexes):
    """Un-named indexes in unordered MultiIndex raises an exception.

    :param indexes: index components passed to ``MultiIndex``.
    """
    expected_error = errors.SchemaInitError
    with pytest.raises(expected_error):
        MultiIndex(indexes, ordered=False)
        [
            pd.MultiIndex.from_arrays([[1, 2, 3], [1, 2, 3]], names=["a", "a"
                                                                     ]),
            False,
        ],
        [
            pd.MultiIndex.from_arrays([[1, 2, 3], ["a", "b", "c"]],
                                      names=["a", "a"]),
            True,
        ],
    ],
)
@pytest.mark.parametrize(
    "schema",
    [
        MultiIndex([Index(int, name="a"),
                    Index(int, name="a")]),
        MultiIndex(
            [Index(int, name="a"), Index(int, name="a")], coerce=True),
    ],
)
def test_multiindex_duplicate_index_names(multiindex, error, schema):
    """Test MultiIndex schema component can handle duplicate index names.

    :param multiindex: pandas MultiIndex used as the index of the frame
        under validation.
    :param error: whether validation is expected to fail.
    :param schema: MultiIndex schema component to validate with.
    """
    if not error:
        # success case: validation hands back a DataFrame
        validated = schema(pd.DataFrame(index=multiindex))
        assert isinstance(validated, pd.DataFrame)
        return

    # eager validation raises a single SchemaError
    with pytest.raises(errors.SchemaError):
        schema(pd.DataFrame(index=multiindex))
    # lazy validation raises the aggregated SchemaErrors
    with pytest.raises(errors.SchemaErrors):
        schema(pd.DataFrame(index=multiindex), lazy=True)

Beispiel #16
0
        DataFrameSchema(columns={"new_name": Column(name="old_name")})


@pytest.mark.parametrize(
    "columns,index",
    [
        (
            {
                "a": Column(Int, required=False),
                "b": Column(Int, required=False),
            },
            None,
        ),
        (
            None,
            MultiIndex(indexes=[Index(Int, name="a"), Index(Int, name="b")]),
        ),
    ],
)
def test_ordered_dataframe(columns: Dict[str, Column], index: MultiIndex):
    """Test that columns are ordered.

    The frame repeats the name ``"a"`` in both its columns and its index
    levels and must still validate against the ordered schema.
    """
    schema = DataFrameSchema(columns=columns, index=index, ordered=True)

    labels = ["a", "a", "b"]
    frame_index = pd.MultiIndex.from_arrays([[1], [2], [3]], names=labels)
    df = pd.DataFrame(
        data=[[1, 2, 3]],
        columns=["a", "a", "b"],
        index=frame_index,
    )

    validated = schema.validate(df)
    assert isinstance(validated, pd.DataFrame)