def test_accessor_shallow_equality(sample_series):
    """Compare Woodwork accessors using ``deep=False`` and ``deep=True``.

    Series that differ only in metadata are expected to be equal shallowly
    but not deeply. A series with different data but the same schema is
    expected to be equal shallowly; the deep result depends on whether the
    fixture is a pandas series.
    """
    base = init_series(
        sample_series.copy(),
        logical_type="NaturalLanguage",
        metadata={"interesting_values": ["a", "b"]},
    )
    other_metadata = init_series(
        sample_series.copy(),
        logical_type="NaturalLanguage",
        metadata={"interesting_values": ["c"]},
    )

    # Same data, different metadata: equal shallowly, unequal deeply.
    assert base.ww.__eq__(other_metadata.ww, deep=False)
    assert not base.ww.__eq__(other_metadata.ww, deep=True)

    shared_schema = base.ww.schema
    # replace() changes the dtype to object, so cast back before reusing the schema.
    changed_data = base.replace(to_replace="a", value="1").astype("string[pyarrow]")
    changed_data.ww.init(schema=shared_schema)
    identical = base.ww.copy()

    # Both series carry a schema that is deeply equal to the base schema.
    for candidate in (changed_data, identical):
        assert candidate.ww.schema.__eq__(base.ww.schema, deep=True)

    # A shallow accessor comparison succeeds for both.
    for candidate in (changed_data, identical):
        assert candidate.ww.__eq__(base.ww, deep=False)

    assert identical.ww.__eq__(base.ww, deep=True)

    if not isinstance(sample_series, pd.Series):
        assert changed_data.ww.__eq__(base.ww, deep=True)
    else:
        # We only check underlying data for equality with pandas dataframes
        assert not changed_data.ww.__eq__(base.ww, deep=True)
def test_init_series_with_np_array(sample_series_pandas):
    """A series built from a NumPy array must match one built from the pandas series."""
    from_array = init_series(sample_series_pandas.to_numpy())
    # The sample pandas series contains ['a', 'b', 'c', 'a'].
    from_series = init_series(sample_series_pandas)

    assert from_array.equals(from_series)
    assert from_array.ww.logical_type == from_series.ww.logical_type
    assert from_array.ww.semantic_tags == from_series.ww.semantic_tags
def test_init_series_with_invalid_type(sample_df):
    """``init_series`` must raise ``TypeError`` for every non-series input."""
    for bad_input in (sample_df, 1, "string", None):
        expected = (
            "Input must be of series type. "
            f"The current input is of type {type(bad_input)}"
        )
        with pytest.raises(TypeError, match=expected):
            init_series(bad_input)
def test_init_series_error_on_invalid_conversion(sample_series):
    """Converting the sample series to ``integer_nullable`` must raise ``TypeConversionError``.

    The test is marked as an expected failure for Dask and Koalas series,
    for the reasons given in the xfail messages.
    """
    is_dask = dd and isinstance(sample_series, dd.Series)
    if is_dask:
        pytest.xfail(
            "Dask type conversion with astype does not fail until compute is called"
        )

    is_koalas = ks and isinstance(sample_series, ks.Series)
    if is_koalas:
        pytest.xfail(
            "Koalas allows this conversion, filling values it cannot convert with NaN "
            "and converting dtype to float."
        )

    expected = (
        "Error converting datatype for sample_series from type category to type Int64. "
        "Please confirm the underlying data is consistent with logical type IntegerNullable."
    )
    with pytest.raises(TypeConversionError, match=expected):
        init_series(sample_series, logical_type="integer_nullable")
def test_init_series_with_pd_extension_array():
    """Pandas extension arrays must initialize like the equivalent pandas series."""
    # Each case pairs an extension array with the plain series it should match.
    cases = (
        (
            pd.Categorical([1, 2, 3]),
            pd.Series([1, 2, 3], dtype="category"),
        ),
        (
            pd.array(np.array([1, 2, 3, 4], dtype="int64")),
            pd.Series([1, 2, 3, 4], dtype="Int64"),
        ),
    )

    for extension_array, plain_series in cases:
        actual = init_series(extension_array)
        reference = init_series(plain_series)

        assert actual.equals(reference)
        assert actual.ww.logical_type == reference.ww.logical_type
        assert actual.ww.semantic_tags == reference.ww.semantic_tags
def test_init_series_error_on_invalid_conversion(sample_series):
    """Converting the sample series to ``integer_nullable`` must raise ``TypeConversionError``.

    The test is marked as an expected failure for Dask and Spark series,
    for the reasons given in the xfail messages.
    """
    if _is_dask_series(sample_series):
        dask_reason = (
            "Dask type conversion with astype does not fail until compute is called"
        )
        pytest.xfail(dask_reason)

    if _is_spark_series(sample_series):
        spark_reason = (
            "Spark allows this conversion, filling values it cannot convert with NaN "
            "and converting dtype to float."
        )
        pytest.xfail(spark_reason)

    expected = (
        "Error converting datatype for sample_series from type category to type Int64. "
        "Please confirm the underlying data is consistent with logical type IntegerNullable."
    )
    with pytest.raises(TypeConversionError, match=expected):
        init_series(sample_series, logical_type="integer_nullable")
def test_init_series_all_parameters(sample_series):
    """Every keyword passed to ``init_series`` must be reflected on the accessor."""
    # Spark series are cast to "str"; all other series types to "object".
    source_dtype = "str" if _is_spark_series(sample_series) else "object"
    sample_series = sample_series.astype(source_dtype)

    typing_info = {
        "metadata": {"meta_key": "meta_value"},
        "description": "custom description",
        "origin": "base",
    }

    series = init_series(
        sample_series,
        logical_type="categorical",
        semantic_tags=["custom_tag"],
        use_standard_tags=False,
        **typing_info,
    )

    # init_series is expected to hand back a new object, not the input.
    assert series is not sample_series
    assert series.dtype == Categorical._get_valid_dtype(type(sample_series))
    assert isinstance(series.ww.logical_type, Categorical)
    assert series.ww.semantic_tags == {"custom_tag"}
    assert series.ww.metadata == typing_info["metadata"]
    assert series.ww.description == typing_info["description"]
    assert series.ww.origin == typing_info["origin"]
def test_init_series_valid_conversion_specified_ltype(sample_series):
    """An explicitly requested logical type must set dtype, type and tags accordingly."""
    # Spark series are cast to "str"; all other series types to "object".
    source_dtype = "str" if _is_spark_series(sample_series) else "object"
    sample_series = sample_series.astype(source_dtype)

    # (requested logical type, expected logical type class, expected semantic tags)
    expectations = (
        ("categorical", Categorical, {"category"}),
        ("natural_language", NaturalLanguage, set()),
    )

    for requested, ltype_cls, expected_tags in expectations:
        series = init_series(sample_series, logical_type=requested)

        assert series is not sample_series
        assert series.dtype == ltype_cls._get_valid_dtype(type(sample_series))
        assert isinstance(series.ww.logical_type, ltype_cls)
        assert series.ww.semantic_tags == expected_tags
def test_init_series_valid_conversion_specified_ltype(sample_series):
    """An explicitly requested logical type must set dtype, type and tags accordingly."""
    # Koalas series are cast to "str"; all other series types to "object".
    if ks and isinstance(sample_series, ks.Series):
        source_dtype = "str"
    else:
        source_dtype = "object"
    sample_series = sample_series.astype(source_dtype)

    # (requested logical type, expected logical type, expected semantic tags)
    expectations = (
        ("categorical", Categorical, {"category"}),
        ("natural_language", NaturalLanguage, set()),
    )

    for requested, ltype, expected_tags in expectations:
        series = init_series(sample_series, logical_type=requested)

        assert series is not sample_series
        assert series.dtype == _get_valid_dtype(type(sample_series), ltype)
        assert series.ww.logical_type == ltype
        assert series.ww.semantic_tags == expected_tags
def test_latlong_validate(latlong_df):
    """``LatLong.validate`` accepts an ``init_series`` result and rejects the raw column."""
    raw = latlong_df["tuple_ints"]
    formatted = init_series(raw, logical_type=LatLong)
    validator = LatLong()

    # The series returned by init_series must validate without raising.
    validator.validate(formatted)

    expected = re.escape(
        "Cannot initialize Woodwork. Series does not contain properly formatted "
        "LatLong data. Try reformatting before initializing or use the "
        "woodwork.init_series function to initialize."
    )
    with pytest.raises(TypeValidationError, match=expected):
        validator.validate(raw)
def test_validate_logical_type(sample_df):
    """``validate_logical_type`` returns None for valid data and reports invalid values."""
    emails = init_series(sample_df["email"], logical_type="EmailAddress")
    assert emails.ww.validate_logical_type() is None

    bad_row = pd.Series({4: "bad_email"}, name="email", dtype="string")
    if _is_spark_series(emails):
        bad_row = ps.from_pandas(bad_row)

    # NOTE(review): Series.append was removed in pandas 2.0 — confirm the
    # pandas version in use still provides it.
    emails = init_series(emails.append(bad_row), logical_type="EmailAddress")

    expected_message = (
        "Series email contains invalid email address values. "
        "The email_inference_regex can be changed in the config if needed."
    )
    with pytest.raises(TypeValidationError, match=expected_message):
        emails.ww.validate_logical_type()

    # With return_invalid_values=True the invalid entries are returned instead.
    invalid_values = emails.ww.validate_logical_type(return_invalid_values=True)
    expected_values = pd.Series({4: "bad_email"}, dtype="string[pyarrow]")
    assert to_pandas(invalid_values).equals(expected_values)
def test_init_series_valid_conversion_inferred_ltype(sample_series):
    """Without an explicit logical type, the sample series is expected to be Categorical."""
    # Spark series are cast to "str"; all other series types to "object".
    source_dtype = "str" if _is_spark_series(sample_series) else "object"
    sample_series = sample_series.astype(source_dtype)

    inferred = init_series(sample_series)

    assert inferred is not sample_series
    assert inferred.dtype == Categorical._get_valid_dtype(type(sample_series))
    assert isinstance(inferred.ww.logical_type, Categorical)
    assert inferred.ww.semantic_tags == {"category"}
def test_init_series_valid_conversion_inferred_ltype(sample_series):
    """Without an explicit logical type, the sample series is expected to be Categorical."""
    # Koalas series are cast to "str"; all other series types to "object".
    if ks and isinstance(sample_series, ks.Series):
        source_dtype = "str"
    else:
        source_dtype = "object"
    sample_series = sample_series.astype(source_dtype)

    inferred = init_series(sample_series)

    assert inferred is not sample_series
    assert inferred.dtype == _get_valid_dtype(type(sample_series), Categorical)
    assert inferred.ww.logical_type == Categorical
    assert inferred.ww.semantic_tags == {"category"}
def test_latlong_formatting_with_init_series(latlongs):
    """Every series in ``latlongs`` must initialize to the same LatLong series."""
    first = latlongs[0]

    # Build the expected series in the same library as the fixture data.
    expected = pd.Series([(1.0, 2.0), (3.0, 4.0)])
    if dd and isinstance(first, dd.Series):
        expected = dd.from_pandas(expected, npartitions=2)
    elif ks and isinstance(first, ks.Series):
        # The Koalas expectation is built from lists rather than tuples.
        expected = ks.Series([[1.0, 2.0], [3.0, 4.0]])
    expected.ww.init(logical_type=LatLong)

    for candidate in latlongs:
        formatted = init_series(candidate, logical_type=LatLong)

        assert formatted.ww.logical_type == LatLong
        pd.testing.assert_series_equal(to_pandas(formatted), to_pandas(expected))
        assert expected.ww._schema == formatted.ww._schema
def __setitem__(self, col_name, column):
    """Assign a series to ``col_name`` on both the dataframe and its schema.

    A column without a Woodwork schema is first initialized through
    ``init_series`` with ``use_standard_tags=True``.

    Args:
        col_name: Name of the column to set.
        column (Series): Series to store under ``col_name``.

    Raises:
        ValueError: If ``column`` is not a Series of an available library.
        KeyError: If ``col_name`` is the current index or time index.
    """
    # Only libraries that are truthy (i.e. available) contribute a Series type.
    series_types = tuple(lib.Series for lib in (pd, dd, ks) if lib)
    if not isinstance(column, series_types):
        raise ValueError("New column must be of Series type")

    # The index and time index may not be reassigned through item assignment.
    if self.index == col_name:
        raise KeyError(
            "Cannot reassign index. Change column name and then use df.ww.set_index to reassign index."
        )
    if self.time_index == col_name:
        raise KeyError(
            "Cannot reassign time index. Change column name and then use df.ww.set_time_index to reassign time index."
        )

    if column.ww._schema is None:
        column = init_series(column, use_standard_tags=True)

    # Keep the data and the typing information in step.
    self._dataframe[col_name] = column
    self._schema.columns[col_name] = column.ww._schema
def test_init_series_all_parameters(sample_series):
    """Every keyword passed to ``init_series`` must be reflected on the accessor."""
    # Koalas series are cast to "str"; all other series types to "object".
    if ks and isinstance(sample_series, ks.Series):
        source_dtype = "str"
    else:
        source_dtype = "object"
    sample_series = sample_series.astype(source_dtype)

    typing_info = {
        "metadata": {"meta_key": "meta_value"},
        "description": "custom description",
    }

    series = init_series(
        sample_series,
        logical_type="categorical",
        semantic_tags=["custom_tag"],
        use_standard_tags=False,
        **typing_info,
    )

    # init_series is expected to hand back a new object, not the input.
    assert series is not sample_series
    assert series.dtype == _get_valid_dtype(type(sample_series), Categorical)
    assert series.ww.logical_type == Categorical
    assert series.ww.semantic_tags == {"custom_tag"}
    assert series.ww.metadata == typing_info["metadata"]
    assert series.ww.description == typing_info["description"]
def set_logical_type(self, logical_type):
    """Return a new series initialized with the given logical type.

    Semantic tags set previously are cleared. The description, a deep copy
    of the metadata and the ``use_standard_tags`` setting are carried over
    from the current series.

    Args:
        logical_type (LogicalType, str): The new logical type to set for the series.

    Returns:
        Series: A new series with the updated logical type.
    """
    if self._schema is None:
        _raise_init_error()

    # Use a schema-less copy so the new series does not share a schema
    # object with the current one.
    detached = self._series.copy()
    detached._schema = None

    return init_series(
        detached,
        logical_type=logical_type,
        semantic_tags=None,
        use_standard_tags=self._schema.use_standard_tags,
        description=self.description,
        metadata=copy.deepcopy(self.metadata),
    )
def test_init_series_with_latlong(latlong_df):
    """Every column of ``latlong_df`` must initialize with the LatLong logical type."""
    for column_name in latlong_df:
        initialized = init_series(latlong_df[column_name], logical_type="LatLong")
        assert isinstance(initialized.ww.logical_type, LatLong)
def test_init_series_with_datetime(sample_datetime_series):
    """Requesting the ``datetime`` logical type must yield a ``datetime64[ns]`` series."""
    initialized = init_series(sample_datetime_series, logical_type="datetime")

    assert initialized.dtype == "datetime64[ns]"
    assert initialized.ww.logical_type == Datetime
def test_init_series_with_multidimensional_np_array():
    """``init_series`` must raise ``ValueError`` for a two-dimensional NumPy array."""
    two_dimensional = np.array([["a", "b"], ["a", "b"]])
    expected = (
        "np.ndarray input must be 1 dimensional. "
        f"Current np.ndarray is {two_dimensional.ndim} dimensional"
    )

    with pytest.raises(ValueError, match=expected):
        init_series(two_dimensional)
def test_init_series_with_datetime(sample_datetime_series):
    """Requesting the ``datetime`` logical type must yield a ``datetime64[ns]`` series."""
    initialized = init_series(sample_datetime_series, logical_type="datetime")

    assert initialized.dtype == "datetime64[ns]"
    assert isinstance(initialized.ww.logical_type, Datetime)