Example #1
def test_email_inference_failure(bad_emails):
    dtypes = ["object", "string"]
    if _is_spark_series(bad_emails[0]):
        dtypes = get_spark_dtypes(dtypes)

    for series in bad_emails:
        if _is_spark_series(series) and isinstance(series.iloc[0], ps.series.Row):
            continue

        for dtype in dtypes:
            inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
            assert not isinstance(inferred_type, EmailAddress)
Example #2
def col_is_datetime(col, datetime_format=None):
    """Determine if a dataframe column contains datetime values or not. Returns True if column
    contains datetimes, False if not. Optionally specify the datetime format string for the column."""
    if _is_spark_series(col):
        col = col.to_pandas()

    if pd.api.types.is_datetime64_any_dtype(col):
        return True

    col = col.dropna()
    if len(col) == 0:
        return False

    col = col.astype(str)

    try:
        pd.to_datetime(
            col,
            errors="raise",
            format=datetime_format,
            infer_datetime_format=True,
        )
        return True

    except (ParserError, ValueError, OverflowError, TypeError):
        return False
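A minimal usage sketch (hypothetical, assuming col_is_datetime is in scope and a pandas version where the infer_datetime_format keyword is still accepted):

import pandas as pd

# Parseable date strings are detected as datetimes
assert col_is_datetime(pd.Series(["2020-01-01", "2020-02-15"]))
# Unparseable strings make pd.to_datetime raise, which is caught and returns False
assert not col_is_datetime(pd.Series(["cat", "dog"]))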
Example #3
def test_init_series_all_parameters(sample_series):
    if _is_spark_series(sample_series):
        sample_series = sample_series.astype("str")
    else:
        sample_series = sample_series.astype("object")

    metadata = {"meta_key": "meta_value"}
    description = "custom description"
    origin = "base"
    series = init_series(
        sample_series,
        logical_type="categorical",
        semantic_tags=["custom_tag"],
        metadata=metadata,
        description=description,
        origin=origin,
        use_standard_tags=False,
    )
    assert series is not sample_series
    correct_dtype = Categorical._get_valid_dtype(type(sample_series))
    assert series.dtype == correct_dtype
    assert isinstance(series.ww.logical_type, Categorical)
    assert series.ww.semantic_tags == {"custom_tag"}
    assert series.ww.metadata == metadata
    assert series.ww.description == description
    assert series.ww.origin == origin
Example #4
def _is_numeric_series(series, logical_type):
    """Determines whether a series will be considered numeric
    for the purposes of determining if it can be a time_index."""
    if _is_spark_series(series):
        series = series.to_pandas()
    if _is_dask_series(series):
        series = series.get_partition(0).compute()

    # If column can't be made to be numeric, don't bother checking Logical Type
    try:
        pd.to_numeric(series, errors="raise")
    except (ValueError, TypeError):
        return False

    if logical_type is not None:
        if isinstance(logical_type, str):
            logical_type = ww.type_system.str_to_logical_type(logical_type)

        # Allow numeric columns to be interpreted as Datetimes - doesn't allow
        # strings even if they could be numeric
        ltype_is_datetime = _get_ltype_class(logical_type) == ww.logical_types.Datetime
        if ltype_is_datetime and pd.api.types.is_numeric_dtype(series):
            return True
    else:
        logical_type = ww.type_system.infer_logical_type(series)

    return "numeric" in logical_type.standard_tags
Example #5
def test_accessor_init_with_schema_errors(sample_series):
    sample_series.ww.init(semantic_tags={"test_tag"},
                          description="this is a column",
                          origin="base")
    schema = sample_series.ww.schema

    head_series = sample_series.head(2)
    assert head_series.ww.schema is None

    error = "Provided schema must be a Woodwork.ColumnSchema object."
    with pytest.raises(TypeError, match=error):
        head_series.ww.init(schema=int)

    if _is_spark_series(sample_series):
        ltype_dtype = "string"
        new_dtype = "<U0"
    else:
        ltype_dtype = "category"
        new_dtype = "object"

    error = re.escape(
        f"dtype mismatch between Series dtype {new_dtype}, and Categorical dtype, {ltype_dtype}"
    )
    diff_dtype_series = sample_series.astype(new_dtype)
    with pytest.raises(ValueError, match=error):
        diff_dtype_series.ww.init(schema=schema)
Example #6
def test_categorical_inference(categories):
    dtypes = ["object", "string", "category"]
    if _is_spark_series(categories[0]):
        dtypes = get_spark_dtypes(dtypes)
    for series in categories:
        for dtype in dtypes:
            inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
            assert isinstance(inferred_type, Categorical)
Example #7
def test_natural_language_inference(natural_language):
    dtypes = ["object", "string"]
    if _is_spark_series(natural_language[0]):
        dtypes = get_spark_dtypes(dtypes)
    for series in natural_language:
        for dtype in dtypes:
            inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
            assert isinstance(inferred_type, NaturalLanguage)
Example #8
def test_get_valid_dtype(sample_series):
    valid_dtype = Categorical._get_valid_dtype(type(sample_series))
    if _is_spark_series(sample_series):
        assert valid_dtype == "string[pyarrow]"
    else:
        assert valid_dtype == "category"

    valid_dtype = Boolean._get_valid_dtype(type(sample_series))
    assert valid_dtype == "bool"
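The mapping exercised above, sketched as standalone usage on a plain pandas Series (sample data is illustrative):

import pandas as pd
from woodwork.logical_types import Boolean, Categorical

s = pd.Series(["a", "b", "a"])
# For the pandas backend, Categorical maps to the "category" physical dtype
assert Categorical._get_valid_dtype(type(s)) == "category"
# Boolean maps to "bool" for both backends checked in the test
assert Boolean._get_valid_dtype(type(s)) == "bool"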
Example #9
def test_datetime_inference(datetimes):
    dtypes = ["object", "string", "datetime64[ns]"]
    if _is_spark_series(datetimes[0]):
        dtypes = get_spark_dtypes(dtypes)

    for series in datetimes:
        for dtype in dtypes:
            inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
            assert isinstance(inferred_type, Datetime)
Example #10
def test_integer_inference(integers):
    dtypes = ["int8", "int16", "int32", "int64", "intp", "int", "Int64"]
    if _is_spark_series(integers[0]):
        dtypes = get_spark_dtypes(dtypes)

    for series in integers:
        for dtype in dtypes:
            inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
            assert isinstance(inferred_type, Integer)
Example #11
def test_unknown_inference(strings):
    dtypes = ["object", "string"]
    if _is_spark_series(strings[0]):
        dtypes = get_spark_dtypes(dtypes)

    for series in strings:
        for dtype in dtypes:
            inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
            assert isinstance(inferred_type, Unknown)
Example #12
def test_categorical_double_inference(doubles):
    with ww.config.with_options(numeric_categorical_threshold=0.5):
        dtypes = ["float", "float32", "float64", "float_"]
        if _is_spark_series(doubles[0]):
            dtypes = get_spark_dtypes(dtypes)
        for series in doubles:
            for dtype in dtypes:
                inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
                assert isinstance(inferred_type, Categorical)
Example #13
def test_categorical_integers_inference(integers):
    with ww.config.with_options(numeric_categorical_threshold=0.5):
        dtypes = ["int8", "int16", "int32", "int64", "intp", "int", "Int64"]
        if _is_spark_series(integers[0]):
            dtypes = get_spark_dtypes(dtypes)
        for series in integers:
            for dtype in dtypes:
                inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
                assert isinstance(inferred_type, Categorical)
Example #14
def test_double_inference(doubles):
    dtypes = ["float", "float32", "float64", "float_"]
    if _is_spark_series(doubles[0]):
        dtypes = get_spark_dtypes(dtypes)

    for series in doubles:
        for dtype in dtypes:
            inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
            assert isinstance(inferred_type, Double)
Example #15
def test_email_inference(emails):
    dtypes = ["object", "string"]
    if _is_spark_series(emails[0]):
        dtypes = get_spark_dtypes(dtypes)

    for series in emails:
        for dtype in dtypes:
            inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
            assert isinstance(inferred_type, EmailAddress)
Example #16
def test_unknown_inference_all_null(nulls):
    dtypes = ["object", "string", "category", "datetime64[ns]"]
    if _is_spark_series(nulls[0]):
        dtypes = get_spark_dtypes(dtypes)

    for series in nulls:
        for dtype in dtypes:
            inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
            inferred_type.transform(series)
            assert isinstance(inferred_type, Unknown)
Example #17
def test_accessor_repr(sample_series):
    sample_series.ww.init(use_standard_tags=False)
    # Spark doesn't support categorical
    if _is_spark_series(sample_series):
        dtype = "string"
    else:
        dtype = "category"
    assert (sample_series.ww.__repr__() ==
            f"<Series: sample_series (Physical Type = {dtype}) "
            "(Logical Type = Categorical) (Semantic Tags = set())>")
Example #18
def test_init_series_valid_conversion_inferred_ltype(sample_series):
    if _is_spark_series(sample_series):
        sample_series = sample_series.astype("str")
    else:
        sample_series = sample_series.astype("object")

    series = init_series(sample_series)
    assert series is not sample_series
    correct_dtype = Categorical._get_valid_dtype(type(sample_series))
    assert series.dtype == correct_dtype
    assert isinstance(series.ww.logical_type, Categorical)
    assert series.ww.semantic_tags == {"category"}
Example #19
def test_ordinal_with_incomplete_ranking(sample_series):
    if _is_spark_series(sample_series) or _is_dask_series(sample_series):
        pytest.xfail(
            "Fails with Dask and Spark - ordinal data validation not supported"
        )

    ordinal_incomplete_order = Ordinal(order=["a", "b"])
    error_msg = re.escape(
        "Ordinal column sample_series contains values that are not "
        "present in the order values provided: ['c']")
    with pytest.raises(ValueError, match=error_msg):
        sample_series.ww.init(logical_type=ordinal_incomplete_order)
Example #20
def test_accessor_init_with_invalid_logical_type(sample_series):
    if _is_spark_series(sample_series):
        series_dtype = "<U0"
    else:
        series_dtype = "object"
    series = sample_series.astype(series_dtype)
    correct_dtype = "string[pyarrow]"
    error_message = re.escape(
        f"Cannot initialize Woodwork. Series dtype '{series_dtype}' is incompatible with "
        f"NaturalLanguage dtype. Try converting series dtype to '{correct_dtype}' before "
        "initializing or use the woodwork.init_series function to initialize.")
    with pytest.raises(TypeValidationError, match=error_message):
        series.ww.init(logical_type=NaturalLanguage)
Example #21
def test_series_methods_on_accessor_returning_series_valid_schema(
        sample_series):
    if _is_spark_series(sample_series):
        pytest.xfail(
            "Running replace on Spark series changes series dtype to object, invalidating schema"
        )
    sample_series.ww.init()

    replace_series = sample_series.ww.replace("a", "d")
    assert replace_series.ww._schema == sample_series.ww._schema
    assert replace_series.ww._schema is not sample_series.ww._schema
    pd.testing.assert_series_equal(to_pandas(replace_series),
                                   to_pandas(sample_series.replace("a", "d")))
Example #22
def test_latlong_formatting_with_init_series(latlongs):
    expected_series = pd.Series([(1.0, 2.0), (3.0, 4.0)])
    if _is_dask_series(latlongs[0]):
        expected_series = dd.from_pandas(expected_series, npartitions=2)
    elif _is_spark_series(latlongs[0]):
        expected_series = ps.Series([[1.0, 2.0], [3.0, 4.0]])

    expected_series.ww.init(logical_type=LatLong)
    for series in latlongs:
        new_series = init_series(series, logical_type=LatLong)
        assert isinstance(new_series.ww.logical_type, LatLong)
        pd.testing.assert_series_equal(to_pandas(new_series),
                                       to_pandas(expected_series))
        assert expected_series.ww._schema == new_series.ww._schema
Example #23
def test_set_logical_type_invalid_dtype_change(sample_series):
    if _is_dask_series(sample_series):
        pytest.xfail(
            "Dask type conversion with astype does not fail until compute is called"
        )
    if _is_spark_series(sample_series):
        pytest.xfail(
            "Spark allows this conversion, filling values it cannot convert with NaN "
            "and converting dtype to float.")
    sample_series.ww.init(logical_type="Categorical")
    error_message = (
        "Error converting datatype for sample_series from type category to "
        "type int64. Please confirm the underlying data is consistent with logical type Integer."
    )
    with pytest.raises(TypeConversionError, match=error_message):
        sample_series.ww.set_logical_type("Integer")
Example #24
def test_set_logical_type_valid_dtype_change(sample_series):
    sample_series.ww.init(logical_type="Categorical")

    new_series = sample_series.ww.set_logical_type("NaturalLanguage")

    if _is_spark_series(sample_series):
        # Spark uses string dtype for Categorical
        original_dtype = "string"
    else:
        original_dtype = "category"
    new_dtype = "string"

    assert isinstance(sample_series.ww.logical_type, Categorical)
    assert sample_series.dtype == original_dtype
    assert isinstance(new_series.ww.logical_type, NaturalLanguage)
    assert new_series.dtype == new_dtype
Example #25
def test_init_series_error_on_invalid_conversion(sample_series):
    if _is_dask_series(sample_series):
        pytest.xfail(
            "Dask type conversion with astype does not fail until compute is called"
        )
    if _is_spark_series(sample_series):
        pytest.xfail(
            "Spark allows this conversion, filling values it cannot convert with NaN "
            "and converting dtype to float.")

    error_message = (
        "Error converting datatype for sample_series from type category to type Int64. "
        "Please confirm the underlying data is consistent with logical type IntegerNullable."
    )
    with pytest.raises(TypeConversionError, match=error_message):
        init_series(sample_series, logical_type="integer_nullable")
Example #26
def test_ordinal_with_order(sample_series):
    if _is_spark_series(sample_series) or _is_dask_series(sample_series):
        pytest.xfail(
            "Fails with Dask and Spark - ordinal data validation not compatible"
        )

    series = sample_series.copy()
    ordinal_with_order = Ordinal(order=["a", "b", "c"])
    series.ww.init(logical_type=ordinal_with_order)
    assert isinstance(series.ww.logical_type, Ordinal)
    assert series.ww.logical_type.order == ["a", "b", "c"]

    series = sample_series.copy()
    series.ww.init(logical_type="Categorical")
    new_series = series.ww.set_logical_type(ordinal_with_order)
    assert isinstance(new_series.ww.logical_type, Ordinal)
    assert new_series.ww.logical_type.order == ["a", "b", "c"]
Example #27
def test_updated_ltype_inference(integers, type_sys):
    inference_fn = type_sys.inference_functions[ww.logical_types.Integer]
    type_sys.remove_type(ww.logical_types.Integer)

    class Integer(LogicalType):
        primary_dtype = "string"

    type_sys.add_type(Integer, inference_function=inference_fn)

    dtypes = ["int8", "int16", "int32", "int64", "intp", "int", "Int64"]
    if _is_spark_series(integers[0]):
        dtypes = get_spark_dtypes(dtypes)

    for series in integers:
        for dtype in dtypes:
            inferred_type = type_sys.infer_logical_type(series.astype(dtype))
            assert isinstance(inferred_type, Integer)
            assert inferred_type.primary_dtype == "string"
Example #28
def test_init_series_valid_conversion_specified_ltype(sample_series):
    if _is_spark_series(sample_series):
        sample_series = sample_series.astype("str")
    else:
        sample_series = sample_series.astype("object")

    series = init_series(sample_series, logical_type="categorical")
    assert series is not sample_series
    correct_dtype = Categorical._get_valid_dtype(type(sample_series))
    assert series.dtype == correct_dtype
    assert isinstance(series.ww.logical_type, Categorical)
    assert series.ww.semantic_tags == {"category"}

    series = init_series(sample_series, logical_type="natural_language")
    assert series is not sample_series
    correct_dtype = NaturalLanguage._get_valid_dtype(type(sample_series))
    assert series.dtype == correct_dtype
    assert isinstance(series.ww.logical_type, NaturalLanguage)
    assert series.ww.semantic_tags == set()
Example #29
def test_series_methods_on_accessor_returning_series_invalid_schema(
        sample_series):
    sample_series.ww.init()

    if _is_spark_series(sample_series):
        # Spark uses `string` for Categorical, so must try a different conversion
        original_type = r"string\[pyarrow\]"
        new_type = "Int64"
    else:
        original_type = "category"
        new_type = "string"

    warning = (
        "Operation performed by astype has invalidated the Woodwork typing information:\n "
        f"dtype mismatch between original dtype, {original_type}, and returned dtype, {new_type}.\n "
        "Please initialize Woodwork with Series.ww.init")
    with pytest.warns(TypingInfoMismatchWarning, match=warning):
        new_series = sample_series.ww.astype(new_type)

    assert new_series.ww._schema is None
Example #30
def test_url_validate(sample_df):
    logical_type = URL()
    dtype = logical_type.primary_dtype
    series = sample_df["url"].astype(dtype)
    invalid_row = pd.Series({4: "bad_url"}, name="url").astype(dtype)
    if _is_spark_series(series):
        invalid_row = ps.from_pandas(invalid_row)

    assert logical_type.validate(series) is None

    series = series.append(invalid_row).astype(dtype)
    match = "Series url contains invalid url values. "
    match += "The url_inference_regex can be changed in the config if needed."

    with pytest.raises(TypeValidationError, match=match):
        logical_type.validate(series)

    actual = logical_type.validate(series, return_invalid_values=True)
    expected = pd.Series({4: "bad_url"}, name="url").astype(dtype)
    assert to_pandas(actual).equals(expected)