def test_email_inference_failure(bad_emails):
    dtypes = ["object", "string"]
    if _is_spark_series(bad_emails[0]):
        dtypes = get_spark_dtypes(dtypes)

    for series in bad_emails:
        if _is_spark_series(series) and isinstance(series.iloc[0], ps.series.Row):
            continue
        for dtype in dtypes:
            inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
            assert not isinstance(inferred_type, EmailAddress)

def col_is_datetime(col, datetime_format=None):
    """Determine if a dataframe column contains datetime values or not. Returns True if column
    contains datetimes, False if not. Optionally specify the datetime format string for the column."""
    if _is_spark_series(col):
        col = col.to_pandas()

    if pd.api.types.is_datetime64_any_dtype(col):
        return True

    col = col.dropna()
    if len(col) == 0:
        return False

    col = col.astype(str)
    try:
        pd.to_datetime(
            col,
            errors="raise",
            format=datetime_format,
            infer_datetime_format=True,
        )
        return True
    except (ParserError, ValueError, OverflowError, TypeError):
        return False

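# Illustrative usage of col_is_datetime (a minimal sketch, not part of the original
# module; assumes pandas is imported as pd and this helper is in scope):
#
#     col_is_datetime(pd.Series(["2020-01-01", "2020-02-15"]))  # parseable strings -> True
#     col_is_datetime(pd.Series(["not a date", "also bad"]))    # unparseable -> False
#     col_is_datetime(pd.Series([], dtype="object"))            # empty after dropna -> False
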
def test_init_series_all_parameters(sample_series):
    if _is_spark_series(sample_series):
        sample_series = sample_series.astype("str")
    else:
        sample_series = sample_series.astype("object")

    metadata = {"meta_key": "meta_value"}
    description = "custom description"
    origin = "base"
    series = init_series(
        sample_series,
        logical_type="categorical",
        semantic_tags=["custom_tag"],
        metadata=metadata,
        description=description,
        origin=origin,
        use_standard_tags=False,
    )
    assert series is not sample_series

    correct_dtype = Categorical._get_valid_dtype(type(sample_series))
    assert series.dtype == correct_dtype
    assert isinstance(series.ww.logical_type, Categorical)
    assert series.ww.semantic_tags == {"custom_tag"}
    assert series.ww.metadata == metadata
    assert series.ww.description == description
    assert series.ww.origin == origin

def _is_numeric_series(series, logical_type):
    """Determines whether a series will be considered numeric
    for the purposes of determining if it can be a time_index."""
    if _is_spark_series(series):
        series = series.to_pandas()
    if _is_dask_series(series):
        series = series.get_partition(0).compute()

    # If column can't be made to be numeric, don't bother checking Logical Type
    try:
        pd.to_numeric(series, errors="raise")
    except (ValueError, TypeError):
        return False

    if logical_type is not None:
        if isinstance(logical_type, str):
            logical_type = ww.type_system.str_to_logical_type(logical_type)

        # Allow numeric columns to be interpreted as Datetimes - doesn't allow
        # strings even if they could be numeric
        if _get_ltype_class(
            logical_type
        ) == ww.logical_types.Datetime and pd.api.types.is_numeric_dtype(series):
            return True
    else:
        logical_type = ww.type_system.infer_logical_type(series)

    return "numeric" in logical_type.standard_tags

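# Illustrative behavior of _is_numeric_series (a minimal sketch, not part of the
# original module; exact results depend on the registered type system):
#
#     _is_numeric_series(pd.Series([1, 2, 3]), logical_type=None)        # inferred Integer -> True
#     _is_numeric_series(pd.Series(["a", "b"]), logical_type=None)       # not coercible to numeric -> False
#     _is_numeric_series(pd.Series([1, 2, 3]), logical_type="Datetime")  # numeric dtype as Datetime -> True
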
def test_accessor_init_with_schema_errors(sample_series):
    sample_series.ww.init(
        semantic_tags={"test_tag"}, description="this is a column", origin="base"
    )
    schema = sample_series.ww.schema

    head_series = sample_series.head(2)
    assert head_series.ww.schema is None

    error = "Provided schema must be a Woodwork.ColumnSchema object."
    with pytest.raises(TypeError, match=error):
        head_series.ww.init(schema=int)

    if _is_spark_series(sample_series):
        ltype_dtype = "string"
        new_dtype = "<U0"
    else:
        ltype_dtype = "category"
        new_dtype = "object"

    error = re.escape(
        f"dtype mismatch between Series dtype {new_dtype}, and Categorical dtype, {ltype_dtype}"
    )
    diff_dtype_series = sample_series.astype(new_dtype)
    with pytest.raises(ValueError, match=error):
        diff_dtype_series.ww.init(schema=schema)

def test_categorical_inference(categories):
    dtypes = ["object", "string", "category"]
    if _is_spark_series(categories[0]):
        dtypes = get_spark_dtypes(dtypes)

    for series in categories:
        for dtype in dtypes:
            inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
            assert isinstance(inferred_type, Categorical)

def test_natural_language_inference(natural_language):
    dtypes = ["object", "string"]
    if _is_spark_series(natural_language[0]):
        dtypes = get_spark_dtypes(dtypes)

    for series in natural_language:
        for dtype in dtypes:
            inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
            assert isinstance(inferred_type, NaturalLanguage)

def test_get_valid_dtype(sample_series):
    valid_dtype = Categorical._get_valid_dtype(type(sample_series))
    if _is_spark_series(sample_series):
        assert valid_dtype == "string[pyarrow]"
    else:
        assert valid_dtype == "category"

    valid_dtype = Boolean._get_valid_dtype(type(sample_series))
    assert valid_dtype == "bool"

def test_datetime_inference(datetimes):
    dtypes = ["object", "string", "datetime64[ns]"]
    if _is_spark_series(datetimes[0]):
        dtypes = get_spark_dtypes(dtypes)

    for series in datetimes:
        for dtype in dtypes:
            inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
            assert isinstance(inferred_type, Datetime)

def test_integer_inference(integers):
    dtypes = ["int8", "int16", "int32", "int64", "intp", "int", "Int64"]
    if _is_spark_series(integers[0]):
        dtypes = get_spark_dtypes(dtypes)

    for series in integers:
        for dtype in dtypes:
            inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
            assert isinstance(inferred_type, Integer)

def test_unknown_inference(strings):
    dtypes = ["object", "string"]
    if _is_spark_series(strings[0]):
        dtypes = get_spark_dtypes(dtypes)

    for series in strings:
        for dtype in dtypes:
            inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
            assert isinstance(inferred_type, Unknown)

def test_categorical_double_inference(doubles):
    with ww.config.with_options(numeric_categorical_threshold=0.5):
        dtypes = ["float", "float32", "float64", "float_"]
        if _is_spark_series(doubles[0]):
            dtypes = get_spark_dtypes(dtypes)

        for series in doubles:
            for dtype in dtypes:
                inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
                assert isinstance(inferred_type, Categorical)

def test_categorical_integers_inference(integers):
    with ww.config.with_options(numeric_categorical_threshold=0.5):
        dtypes = ["int8", "int16", "int32", "int64", "intp", "int", "Int64"]
        if _is_spark_series(integers[0]):
            dtypes = get_spark_dtypes(dtypes)

        for series in integers:
            for dtype in dtypes:
                inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
                assert isinstance(inferred_type, Categorical)

def test_double_inference(doubles):
    dtypes = ["float", "float32", "float64", "float_"]
    if _is_spark_series(doubles[0]):
        dtypes = get_spark_dtypes(dtypes)

    for series in doubles:
        for dtype in dtypes:
            inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
            assert isinstance(inferred_type, Double)

def test_email_inference(emails):
    dtypes = ["object", "string"]
    if _is_spark_series(emails[0]):
        dtypes = get_spark_dtypes(dtypes)

    for series in emails:
        for dtype in dtypes:
            inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
            assert isinstance(inferred_type, EmailAddress)

def test_unknown_inference_all_null(nulls):
    dtypes = ["object", "string", "category", "datetime64[ns]"]
    if _is_spark_series(nulls[0]):
        dtypes = get_spark_dtypes(dtypes)

    for series in nulls:
        for dtype in dtypes:
            inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
            inferred_type.transform(series)
            assert isinstance(inferred_type, Unknown)

def test_accessor_repr(sample_series):
    sample_series.ww.init(use_standard_tags=False)
    # Spark doesn't support categorical
    if _is_spark_series(sample_series):
        dtype = "string"
    else:
        dtype = "category"

    assert (
        sample_series.ww.__repr__()
        == f"<Series: sample_series (Physical Type = {dtype}) "
        "(Logical Type = Categorical) (Semantic Tags = set())>"
    )

def test_init_series_valid_conversion_inferred_ltype(sample_series):
    if _is_spark_series(sample_series):
        sample_series = sample_series.astype("str")
    else:
        sample_series = sample_series.astype("object")

    series = init_series(sample_series)
    assert series is not sample_series

    correct_dtype = Categorical._get_valid_dtype(type(sample_series))
    assert series.dtype == correct_dtype
    assert isinstance(series.ww.logical_type, Categorical)
    assert series.ww.semantic_tags == {"category"}

def test_ordinal_with_incomplete_ranking(sample_series):
    if _is_spark_series(sample_series) or _is_dask_series(sample_series):
        pytest.xfail(
            "Fails with Dask and Spark - ordinal data validation not supported"
        )

    ordinal_incomplete_order = Ordinal(order=["a", "b"])
    error_msg = re.escape(
        "Ordinal column sample_series contains values that are not "
        "present in the order values provided: ['c']"
    )
    with pytest.raises(ValueError, match=error_msg):
        sample_series.ww.init(logical_type=ordinal_incomplete_order)

def test_accessor_init_with_invalid_logical_type(sample_series):
    if _is_spark_series(sample_series):
        series_dtype = "<U0"
    else:
        series_dtype = "object"
    series = sample_series.astype(series_dtype)

    correct_dtype = "string[pyarrow]"
    error_message = re.escape(
        f"Cannot initialize Woodwork. Series dtype '{series_dtype}' is incompatible with "
        f"NaturalLanguage dtype. Try converting series dtype to '{correct_dtype}' before "
        "initializing or use the woodwork.init_series function to initialize."
    )
    with pytest.raises(TypeValidationError, match=error_message):
        series.ww.init(logical_type=NaturalLanguage)

def test_series_methods_on_accessor_returning_series_valid_schema(sample_series):
    if _is_spark_series(sample_series):
        pytest.xfail(
            "Running replace on Spark series changes series dtype to object, invalidating schema"
        )
    sample_series.ww.init()

    replace_series = sample_series.ww.replace("a", "d")
    assert replace_series.ww._schema == sample_series.ww._schema
    assert replace_series.ww._schema is not sample_series.ww._schema
    pd.testing.assert_series_equal(
        to_pandas(replace_series), to_pandas(sample_series.replace("a", "d"))
    )

def test_latlong_formatting_with_init_series(latlongs):
    expected_series = pd.Series([(1.0, 2.0), (3.0, 4.0)])
    if _is_dask_series(latlongs[0]):
        expected_series = dd.from_pandas(expected_series, npartitions=2)
    elif _is_spark_series(latlongs[0]):
        expected_series = ps.Series([[1.0, 2.0], [3.0, 4.0]])
    expected_series.ww.init(logical_type=LatLong)

    for series in latlongs:
        new_series = init_series(series, logical_type=LatLong)
        assert isinstance(new_series.ww.logical_type, LatLong)
        pd.testing.assert_series_equal(
            to_pandas(new_series), to_pandas(expected_series)
        )
        assert expected_series.ww._schema == new_series.ww._schema

def test_set_logical_type_invalid_dtype_change(sample_series):
    if _is_dask_series(sample_series):
        pytest.xfail(
            "Dask type conversion with astype does not fail until compute is called"
        )
    if _is_spark_series(sample_series):
        pytest.xfail(
            "Spark allows this conversion, filling values it cannot convert with NaN "
            "and converting dtype to float."
        )
    sample_series.ww.init(logical_type="Categorical")

    error_message = (
        "Error converting datatype for sample_series from type category to "
        "type int64. Please confirm the underlying data is consistent with logical type Integer."
    )
    with pytest.raises(TypeConversionError, match=error_message):
        sample_series.ww.set_logical_type("Integer")

def test_set_logical_type_valid_dtype_change(sample_series):
    sample_series.ww.init(logical_type="Categorical")
    new_series = sample_series.ww.set_logical_type("NaturalLanguage")

    if _is_spark_series(sample_series):
        # Spark uses string dtype for Categorical
        original_dtype = "string"
    else:
        original_dtype = "category"
    new_dtype = "string"

    assert isinstance(sample_series.ww.logical_type, Categorical)
    assert sample_series.dtype == original_dtype
    assert isinstance(new_series.ww.logical_type, NaturalLanguage)
    assert new_series.dtype == new_dtype

def test_init_series_error_on_invalid_conversion(sample_series):
    if _is_dask_series(sample_series):
        pytest.xfail(
            "Dask type conversion with astype does not fail until compute is called"
        )
    if _is_spark_series(sample_series):
        pytest.xfail(
            "Spark allows this conversion, filling values it cannot convert with NaN "
            "and converting dtype to float."
        )

    error_message = (
        "Error converting datatype for sample_series from type category to type Int64. "
        "Please confirm the underlying data is consistent with logical type IntegerNullable."
    )
    with pytest.raises(TypeConversionError, match=error_message):
        init_series(sample_series, logical_type="integer_nullable")

def test_ordinal_with_order(sample_series):
    if _is_spark_series(sample_series) or _is_dask_series(sample_series):
        pytest.xfail(
            "Fails with Dask and Spark - ordinal data validation not compatible"
        )

    series = sample_series.copy()
    ordinal_with_order = Ordinal(order=["a", "b", "c"])
    series.ww.init(logical_type=ordinal_with_order)
    assert isinstance(series.ww.logical_type, Ordinal)
    assert series.ww.logical_type.order == ["a", "b", "c"]

    series = sample_series.copy()
    series.ww.init(logical_type="Categorical")
    new_series = series.ww.set_logical_type(ordinal_with_order)
    assert isinstance(new_series.ww.logical_type, Ordinal)
    assert new_series.ww.logical_type.order == ["a", "b", "c"]

def test_updated_ltype_inference(integers, type_sys):
    inference_fn = type_sys.inference_functions[ww.logical_types.Integer]
    type_sys.remove_type(ww.logical_types.Integer)

    class Integer(LogicalType):
        primary_dtype = "string"

    type_sys.add_type(Integer, inference_function=inference_fn)

    dtypes = ["int8", "int16", "int32", "int64", "intp", "int", "Int64"]
    if _is_spark_series(integers[0]):
        dtypes = get_spark_dtypes(dtypes)

    for series in integers:
        for dtype in dtypes:
            inferred_type = type_sys.infer_logical_type(series.astype(dtype))
            assert isinstance(inferred_type, Integer)
            assert inferred_type.primary_dtype == "string"

def test_init_series_valid_conversion_specified_ltype(sample_series):
    if _is_spark_series(sample_series):
        sample_series = sample_series.astype("str")
    else:
        sample_series = sample_series.astype("object")

    series = init_series(sample_series, logical_type="categorical")
    assert series is not sample_series
    correct_dtype = Categorical._get_valid_dtype(type(sample_series))
    assert series.dtype == correct_dtype
    assert isinstance(series.ww.logical_type, Categorical)
    assert series.ww.semantic_tags == {"category"}

    series = init_series(sample_series, logical_type="natural_language")
    assert series is not sample_series
    correct_dtype = NaturalLanguage._get_valid_dtype(type(sample_series))
    assert series.dtype == correct_dtype
    assert isinstance(series.ww.logical_type, NaturalLanguage)
    assert series.ww.semantic_tags == set()

def test_series_methods_on_accessor_returning_series_invalid_schema(sample_series):
    sample_series.ww.init()

    if _is_spark_series(sample_series):
        # Spark uses `string` for Categorical, so must try a different conversion
        original_type = r"string\[pyarrow\]"
        new_type = "Int64"
    else:
        original_type = "category"
        new_type = "string"

    warning = (
        "Operation performed by astype has invalidated the Woodwork typing information:\n "
        f"dtype mismatch between original dtype, {original_type}, and returned dtype, {new_type}.\n "
        "Please initialize Woodwork with Series.ww.init"
    )
    with pytest.warns(TypingInfoMismatchWarning, match=warning):
        new_series = sample_series.ww.astype(new_type)

    assert new_series.ww._schema is None

def test_url_validate(sample_df):
    logical_type = URL()
    dtype = logical_type.primary_dtype
    series = sample_df["url"].astype(dtype)
    invalid_row = pd.Series({4: "bad_url"}, name="url").astype(dtype)
    if _is_spark_series(series):
        invalid_row = ps.from_pandas(invalid_row)

    assert logical_type.validate(series) is None

    series = series.append(invalid_row).astype(dtype)
    match = "Series url contains invalid url values. "
    match += "The url_inference_regex can be changed in the config if needed."
    with pytest.raises(TypeValidationError, match=match):
        logical_type.validate(series)

    actual = logical_type.validate(series, return_invalid_values=True)
    expected = pd.Series({4: "bad_url"}, name="url").astype(dtype)
    assert to_pandas(actual).equals(expected)