Example #1
def test_get_invalid_schema_message_dtype_mismatch(sample_df):
    schema_df = sample_df.copy()
    schema_df.ww.init(logical_types={
        "age": "Categorical",
        "full_name": "PersonFullName"
    })
    schema = schema_df.ww.schema

    incorrect_int_dtype_df = schema_df.ww.astype({"id": "Int64"})
    incorrect_bool_dtype_df = schema_df.ww.astype({"is_registered": "Int64"})

    assert (
        get_invalid_schema_message(incorrect_int_dtype_df, schema) ==
        "dtype mismatch for column id between DataFrame dtype, Int64, and Integer dtype, int64"
    )
    assert (
        get_invalid_schema_message(incorrect_bool_dtype_df, schema) ==
        "dtype mismatch for column is_registered between DataFrame dtype, Int64, and BooleanNullable dtype, boolean"
    )

    # Spark backup dtypes make these checks not relevant
    if not _is_spark_dataframe(sample_df):
        incorrect_str_dtype_df = schema_df.ww.astype({"full_name": "object"
                                                      })  # wont work for spark
        incorrect_categorical_dtype_df = schema_df.ww.astype(
            {"age": "string"})  # wont work for spark
        assert (
            get_invalid_schema_message(incorrect_str_dtype_df, schema) ==
            "dtype mismatch for column full_name between DataFrame dtype, object, and PersonFullName dtype, string[pyarrow]"
        )
        assert (
            get_invalid_schema_message(incorrect_categorical_dtype_df, schema) ==
            "dtype mismatch for column age between DataFrame dtype, string, and Categorical dtype, category"
        )
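
The sample_df fixture used throughout these examples is not shown in the excerpts. A minimal pandas stand-in consistent with the columns referenced above could look like the following sketch (an assumption for illustration only, not the library's actual fixture, which is also parametrized for Dask and Spark):

import pandas as pd
import pytest


@pytest.fixture
def sample_df():
    # Hypothetical stand-in: columns mirror those used in the tests above
    # (id, full_name, age, is_registered).
    return pd.DataFrame({
        "id": [0, 1, 2, 3],
        "full_name": ["Mr. John Doe", "Doe, Mrs. Jane", "James Brown", "Ms. Paige Turner"],
        "age": [33, 25, 33, 57],
        "is_registered": [True, False, True, None],
    })
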
Example #2
            def wrapper(*args, **kwargs):
                # Make DataFrame call and intercept the result
                result = dataframe_attr(*args, **kwargs)

                # Try to initialize Woodwork with the existing schema
                if _is_dataframe(result):
                    invalid_schema_message = get_invalid_schema_message(
                        result, self._schema)
                    if invalid_schema_message:
                        warnings.warn(
                            TypingInfoMismatchWarning().get_warning_message(
                                attr, invalid_schema_message, 'DataFrame'),
                            TypingInfoMismatchWarning)
                    else:
                        copied_schema = self.schema
                        result.ww.init(schema=copied_schema, validate=False)
                        result.ww.make_index = self.make_index
                else:
                    # Confirm that the schema is still valid on original DataFrame
                    # Important for inplace operations
                    invalid_schema_message = get_invalid_schema_message(
                        self._dataframe, self._schema)

                    if invalid_schema_message:
                        warnings.warn(
                            TypingInfoMismatchWarning().get_warning_message(
                                attr, invalid_schema_message, 'DataFrame'),
                            TypingInfoMismatchWarning)
                        self._schema = None

                # Always return the results of the DataFrame operation whether or not Woodwork is initialized
                return result
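
Example #2 is an excerpt from inside an accessor method, so names such as dataframe_attr, attr, self._schema, and self._dataframe come from the enclosing scope. The sketch below shows one way such a wrapper could be attached to a DataFrame-forwarding accessor; the SchemaPreservingAccessor class is hypothetical, not Woodwork's actual implementation.

class SchemaPreservingAccessor:
    """Hypothetical accessor that forwards attribute access to an underlying
    DataFrame and re-validates typing information after each call."""

    def __init__(self, dataframe, schema):
        self._dataframe = dataframe
        self._schema = schema

    def __getattr__(self, attr):
        dataframe_attr = getattr(self._dataframe, attr)
        if not callable(dataframe_attr):
            # Plain attributes (e.g. dtypes, shape) pass through unchanged
            return dataframe_attr

        def wrapper(*args, **kwargs):
            result = dataframe_attr(*args, **kwargs)
            # ... validate result against self._schema as in Example #2,
            # warning and skipping re-initialization on a mismatch ...
            return result

        return wrapper
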
Example #3
def test_get_invalid_schema_message(sample_df):
    schema_df = sample_df.copy()
    schema_df.ww.init(
        name="test_schema",
        index="id",
        logical_types={
            "id": "Double",
            "full_name": "PersonFullName"
        },
    )
    schema = schema_df.ww.schema

    assert get_invalid_schema_message(schema_df, schema) is None
    assert (
        get_invalid_schema_message(sample_df, schema) ==
        "dtype mismatch for column id between DataFrame dtype, int64, and Double dtype, float64"
    )

    sampled_df = schema_df.sample(frac=0.3)
    assert get_invalid_schema_message(sampled_df, schema) is None

    dropped_df = schema_df.drop("id", axis=1)
    assert (
        get_invalid_schema_message(dropped_df, schema) ==
        "The following columns in the typing information were missing from the DataFrame: {'id'}"
    )

    renamed_df = schema_df.rename(columns={"id": "new_col"})
    assert (
        get_invalid_schema_message(renamed_df, schema) ==
        "The following columns in the DataFrame were missing from the typing information: {'new_col'}"
    )
Example #4
def test_get_invalid_schema_message_index_checks(sample_df):
    if not isinstance(sample_df, pd.DataFrame):
        pytest.xfail('Index validation not performed for Dask or Koalas DataFrames')

    schema_df = sample_df.copy()
    schema_df.ww.init(name='test_schema', index='id', logical_types={'id': 'Double', 'full_name': 'PersonFullName'})
    schema = schema_df.ww.schema

    different_underlying_index_df = schema_df.copy()
    different_underlying_index_df['id'] = pd.Series([9, 8, 7, 6], dtype='float64')
    assert (get_invalid_schema_message(different_underlying_index_df, schema) ==
            "Index mismatch between DataFrame and typing information")

    not_unique_df = schema_df.replace({3: 1})
    not_unique_df.index = not_unique_df['id']
    not_unique_df.index.name = None
    assert get_invalid_schema_message(not_unique_df, schema) == 'Index column is not unique'
Example #5
def _check_schema(dataframe, schema):
    if not isinstance(schema, TableSchema):
        raise TypeError(
            'Provided schema must be a Woodwork.TableSchema object.')
    invalid_schema_message = get_invalid_schema_message(dataframe, schema)
    if invalid_schema_message:
        raise ValueError(
            f'Woodwork typing information is not valid for this DataFrame: {invalid_schema_message}'
        )
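
A hedged usage sketch for a helper like _check_schema is shown below. It assumes _check_schema from Example #5 (and the get_invalid_schema_message function it calls) is already in scope; the DataFrame contents are illustrative only.

import pandas as pd
import woodwork as ww  # noqa: F401 -- importing woodwork registers the .ww accessor


def demo_check_schema():
    df = pd.DataFrame({"id": [0, 1, 2], "age": [33, 25, 57]})
    df.ww.init(index="id")
    schema = df.ww.schema

    _check_schema(df, schema)  # typing information matches: returns silently

    # Plain pandas astype returns a new DataFrame whose age dtype (float64)
    # no longer matches the dtype recorded for that column in the schema.
    mismatched_df = df.astype({"age": "float64"})
    try:
        _check_schema(mismatched_df, schema)
    except ValueError as err:
        print(err)  # reports the dtype mismatch for column age
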
Example #6
def test_get_invalid_schema_message(sample_df):
    schema_df = sample_df.copy()
    schema_df.ww.init(name='test_schema', index='id', logical_types={'id': 'Double', 'full_name': 'PersonFullName'})
    schema = schema_df.ww.schema

    assert get_invalid_schema_message(schema_df, schema) is None
    assert (get_invalid_schema_message(sample_df, schema) ==
            'dtype mismatch for column id between DataFrame dtype, int64, and Double dtype, float64')

    sampled_df = schema_df.sample(frac=0.3)
    assert get_invalid_schema_message(sampled_df, schema) is None

    dropped_df = schema_df.drop('id', axis=1)
    assert (get_invalid_schema_message(dropped_df, schema) ==
            "The following columns in the typing information were missing from the DataFrame: {'id'}")

    renamed_df = schema_df.rename(columns={'id': 'new_col'})
    assert (get_invalid_schema_message(renamed_df, schema) ==
            "The following columns in the DataFrame were missing from the typing information: {'new_col'}")
Example #7
def test_get_invalid_schema_message_dtype_mismatch(sample_df):
    schema_df = sample_df.copy()
    schema_df.ww.init(logical_types={'age': 'Categorical'})
    schema = schema_df.ww.schema

    incorrect_int_dtype_df = schema_df.ww.astype({'id': 'Int64'})
    incorrect_bool_dtype_df = schema_df.ww.astype({'is_registered': 'Int64'})
    incorrect_str_dtype_df = schema_df.ww.astype({'full_name': 'object'})  # won't work for Koalas
    incorrect_categorical_dtype_df = schema_df.ww.astype({'age': 'string'})  # won't work for Koalas

    assert (get_invalid_schema_message(incorrect_int_dtype_df, schema) ==
            'dtype mismatch for column id between DataFrame dtype, Int64, and Integer dtype, int64')
    assert (get_invalid_schema_message(incorrect_bool_dtype_df, schema) ==
            'dtype mismatch for column is_registered between DataFrame dtype, Int64, and BooleanNullable dtype, boolean')
    # Koalas backup dtypes make these checks not relevant
    if ks and not isinstance(sample_df, ks.DataFrame):
        assert (get_invalid_schema_message(incorrect_str_dtype_df, schema) ==
                'dtype mismatch for column full_name between DataFrame dtype, object, and NaturalLanguage dtype, string')
        assert (get_invalid_schema_message(incorrect_categorical_dtype_df, schema) ==
                'dtype mismatch for column age between DataFrame dtype, string, and Categorical dtype, category')
Example #8
def test_get_invalid_schema_message_index_checks(sample_df):
    if not isinstance(sample_df, pd.DataFrame):
        pytest.xfail("Index validation not performed for Dask or Spark DataFrames")

    schema_df = sample_df.copy()
    schema_df.ww.init(
        name="test_schema",
        index="id",
        logical_types={
            "id": "Double",
            "full_name": "PersonFullName"
        },
    )
    schema = schema_df.ww.schema

    different_underlying_index_df = schema_df.copy()
    different_underlying_index_df["id"] = pd.Series([9, 8, 7, 6],
                                                    dtype="float64")
    assert (get_invalid_schema_message(
        different_underlying_index_df,
        schema) == "Index mismatch between DataFrame and typing information")

    not_unique_df = schema_df.replace({3: 1})
    not_unique_df.index = not_unique_df["id"]
    not_unique_df.index.name = None
    assert (
        get_invalid_schema_message(not_unique_df, schema) ==
        "Index column is not unique"
    )

    df = pd.DataFrame({
        "id": pd.Series([5, 4, 3, 2], dtype="float64"),
        "col": pd.Series(["b", "b", "b", "d"], dtype="category"),
    })
    df.ww.init(index="id")
    df_schema = df.ww.schema

    nan_df = df.replace({3: None})
    nan_df["id"] = nan_df["id"].astype("float64")
    nan_df = nan_df.set_index("id", drop=False)
    actual = get_invalid_schema_message(nan_df, df_schema)
    assert actual == "Index contains null values"