Ejemplo n.º 1
0
def test_accessor_shallow_equality(sample_series):
    """Compare Woodwork series accessors with shallow and deep equality.

    Two columns that differ only in metadata must be equal under
    ``deep=False`` and unequal under ``deep=True``. A column whose data
    differs but whose schema is shared must be shallow-equal, and is only
    expected to be deep-unequal when the input is a pandas series.
    """
    base_col = init_series(
        sample_series.copy(),
        logical_type="NaturalLanguage",
        metadata={"interesting_values": ["a", "b"]},
    )
    other_metadata_col = init_series(
        sample_series.copy(),
        logical_type="NaturalLanguage",
        metadata={"interesting_values": ["c"]},
    )

    # Metadata-only difference: invisible to a shallow check, visible to a deep one.
    assert base_col.ww.__eq__(other_metadata_col.ww, deep=False)
    assert not base_col.ww.__eq__(other_metadata_col.ww, deep=True)

    shared_schema = base_col.ww.schema
    changed_data_col = base_col.replace(to_replace="a", value="1")
    # dtype gets changed to object in replace, so cast it back
    changed_data_col = changed_data_col.astype("string[pyarrow]")

    changed_data_col.ww.init(schema=shared_schema)
    identical_col = base_col.ww.copy()

    candidates = (changed_data_col, identical_col)
    for candidate in candidates:
        assert candidate.ww.schema.__eq__(base_col.ww.schema, deep=True)
    for candidate in candidates:
        assert candidate.ww.__eq__(base_col.ww, deep=False)
    assert identical_col.ww.__eq__(base_col.ww, deep=True)

    # We only check underlying data for equality with pandas dataframes
    data_is_compared = isinstance(sample_series, pd.Series)
    deep_result = changed_data_col.ww.__eq__(base_col.ww, deep=True)
    if data_is_compared:
        assert not deep_result
    else:
        assert deep_result
Ejemplo n.º 2
0
def test_init_series_with_np_array(sample_series_pandas):
    """init_series on a numpy array must match init_series on the source pandas series."""
    from_array = init_series(sample_series_pandas.to_numpy())
    # Sample series panda contains ['a','b','c','a']
    from_series = init_series(sample_series_pandas)

    assert from_array.equals(from_series)
    for typing_attr in ("logical_type", "semantic_tags"):
        assert getattr(from_array.ww, typing_attr) == getattr(from_series.ww, typing_attr)
Ejemplo n.º 3
0
def test_init_series_with_invalid_type(sample_df):
    """Verify init_series raises TypeError for inputs that are not series.

    Args:
        sample_df: Fixture object used as one of the invalid (non-series) inputs,
            alongside an int, a str and None.
    """
    import re

    inputs = [sample_df, 1, "string", None]
    for input_ in inputs:
        # ``match`` is applied as a regex search, so escape the message to make
        # the dots in the sentence and in the type repr match literally.
        error_message = re.escape(
            f"Input must be of series type. The current input is of type {type(input_)}"
        )
        with pytest.raises(TypeError, match=error_message):
            init_series(input_)
Ejemplo n.º 4
0
def test_init_series_error_on_invalid_conversion(sample_series):
    """Verify init_series raises TypeConversionError for an impossible dtype conversion.

    The test is marked xfail for Dask and Koalas series, for the reasons
    given in the respective xfail messages.
    """
    import re

    if dd and isinstance(sample_series, dd.Series):
        pytest.xfail('Dask type conversion with astype does not fail until compute is called')
    if ks and isinstance(sample_series, ks.Series):
        pytest.xfail('Koalas allows this conversion, filling values it cannot convert with NaN '
                     'and converting dtype to float.')

    # ``match`` is a regex search; escape so the periods only match literal periods.
    error_message = re.escape(
        "Error converting datatype for sample_series from type category to type Int64. "
        "Please confirm the underlying data is consistent with logical type IntegerNullable."
    )
    with pytest.raises(TypeConversionError, match=error_message):
        init_series(sample_series, logical_type='integer_nullable')
Ejemplo n.º 5
0
def test_init_series_with_pd_extension_array():
    """init_series on pandas extension arrays must match the equivalent pandas series."""

    def _assert_matches(actual, reference):
        # Same values and same Woodwork typing information.
        assert actual.equals(reference)
        assert actual.ww.logical_type == reference.ww.logical_type
        assert actual.ww.semantic_tags == reference.ww.semantic_tags

    categorical_array = pd.Categorical([1, 2, 3])
    _assert_matches(
        init_series(categorical_array),
        init_series(pd.Series([1, 2, 3], dtype="category")),
    )

    integer_array = pd.array(np.array([1, 2, 3, 4], dtype="int64"))
    _assert_matches(
        init_series(integer_array),
        init_series(pd.Series([1, 2, 3, 4], dtype="Int64")),
    )
Ejemplo n.º 6
0
def test_init_series_error_on_invalid_conversion(sample_series):
    """Verify init_series raises TypeConversionError for an impossible dtype conversion.

    The test is marked xfail for Dask and Spark series, for the reasons
    given in the respective xfail messages.
    """
    import re

    if _is_dask_series(sample_series):
        pytest.xfail(
            "Dask type conversion with astype does not fail until compute is called"
        )
    if _is_spark_series(sample_series):
        pytest.xfail(
            "Spark allows this conversion, filling values it cannot convert with NaN "
            "and converting dtype to float."
        )

    # ``match`` is a regex search; escape so the periods only match literal periods.
    error_message = re.escape(
        "Error converting datatype for sample_series from type category to type Int64. "
        "Please confirm the underlying data is consistent with logical type IntegerNullable."
    )
    with pytest.raises(TypeConversionError, match=error_message):
        init_series(sample_series, logical_type="integer_nullable")
Ejemplo n.º 7
0
def test_init_series_all_parameters(sample_series):
    """Pass every init_series option and check each one on the returned series."""
    # Spark series are cast to "str", every other series type to "object".
    source_dtype = "str" if _is_spark_series(sample_series) else "object"
    untyped = sample_series.astype(source_dtype)

    typing_options = {
        "logical_type": "categorical",
        "semantic_tags": ["custom_tag"],
        "metadata": {"meta_key": "meta_value"},
        "description": "custom description",
        "origin": "base",
        "use_standard_tags": False,
    }
    result = init_series(untyped, **typing_options)

    assert result is not untyped
    assert result.dtype == Categorical._get_valid_dtype(type(untyped))
    assert isinstance(result.ww.logical_type, Categorical)
    assert result.ww.semantic_tags == {"custom_tag"}
    assert result.ww.metadata == typing_options["metadata"]
    assert result.ww.description == typing_options["description"]
    assert result.ww.origin == typing_options["origin"]
Ejemplo n.º 8
0
def test_init_series_valid_conversion_specified_ltype(sample_series):
    """init_series with an explicit logical type converts dtype and sets standard tags."""
    # Spark series are cast to "str", every other series type to "object".
    source_dtype = "str" if _is_spark_series(sample_series) else "object"
    untyped = sample_series.astype(source_dtype)

    # (logical type name, logical type class, expected semantic tags)
    cases = (
        ("categorical", Categorical, {"category"}),
        ("natural_language", NaturalLanguage, set()),
    )
    for ltype_name, ltype_class, expected_tags in cases:
        converted = init_series(untyped, logical_type=ltype_name)
        assert converted is not untyped
        expected_dtype = ltype_class._get_valid_dtype(type(untyped))
        assert converted.dtype == expected_dtype
        assert isinstance(converted.ww.logical_type, ltype_class)
        assert converted.ww.semantic_tags == expected_tags
Ejemplo n.º 9
0
def test_init_series_valid_conversion_specified_ltype(sample_series):
    """init_series with an explicit logical type converts dtype and sets standard tags."""
    # Koalas series are cast to 'str', every other series type to 'object'.
    source_dtype = 'str' if (ks and isinstance(sample_series, ks.Series)) else 'object'
    untyped = sample_series.astype(source_dtype)

    # (logical type name, logical type class, expected semantic tags)
    cases = (
        ('categorical', Categorical, {'category'}),
        ('natural_language', NaturalLanguage, set()),
    )
    for ltype_name, ltype_class, expected_tags in cases:
        converted = init_series(untyped, logical_type=ltype_name)
        assert converted is not untyped
        expected_dtype = _get_valid_dtype(type(untyped), ltype_class)
        assert converted.dtype == expected_dtype
        assert converted.ww.logical_type == ltype_class
        assert converted.ww.semantic_tags == expected_tags
Ejemplo n.º 10
0
def test_latlong_validate(latlong_df):
    """LatLong.validate accepts an init_series result and rejects the raw column."""
    expected_error = re.escape(
        "Cannot initialize Woodwork. Series does not contain properly formatted "
        "LatLong data. Try reformatting before initializing or use the "
        "woodwork.init_series function to initialize."
    )
    validator = LatLong()
    raw_series = latlong_df["tuple_ints"]
    formatted_series = init_series(raw_series, logical_type=LatLong)

    # The series returned by init_series must validate without raising ...
    validator.validate(formatted_series)
    # ... while the untouched column must be rejected.
    with pytest.raises(TypeValidationError, match=expected_error):
        validator.validate(raw_series)
Ejemplo n.º 11
0
def test_validate_logical_type(sample_df):
    """Check EmailAddress validation on valid data and after adding an invalid row.

    A valid email column must validate to None. After appending a bad value,
    validation must raise TypeValidationError, and with
    ``return_invalid_values=True`` must return the offending row.
    """
    import re

    series = sample_df["email"]
    series = init_series(series, logical_type="EmailAddress")
    assert series.ww.validate_logical_type() is None

    invalid_row = pd.Series({4: "bad_email"}, name="email", dtype="string")

    if _is_spark_series(series):
        invalid_row = ps.from_pandas(invalid_row)

    # NOTE(review): Series.append was removed in pandas 2.0 — confirm the pinned
    # versions, or migrate to a concat-based helper for each backend.
    series = series.append(invalid_row)
    series = init_series(series, logical_type="EmailAddress")

    # ``match`` is a regex search; escape so the periods only match literal periods.
    match = re.escape(
        "Series email contains invalid email address values. "
        "The email_inference_regex can be changed in the config if needed."
    )

    with pytest.raises(TypeValidationError, match=match):
        series.ww.validate_logical_type()

    actual = series.ww.validate_logical_type(return_invalid_values=True)
    expected = pd.Series({4: "bad_email"}, dtype="string[pyarrow]")
    assert to_pandas(actual).equals(expected)
Ejemplo n.º 12
0
def test_init_series_valid_conversion_inferred_ltype(sample_series):
    """With no logical type given, the series must be inferred as Categorical."""
    # Spark series are cast to "str", every other series type to "object".
    source_dtype = "str" if _is_spark_series(sample_series) else "object"
    untyped = sample_series.astype(source_dtype)

    inferred = init_series(untyped)

    assert inferred is not untyped
    assert inferred.dtype == Categorical._get_valid_dtype(type(untyped))
    assert isinstance(inferred.ww.logical_type, Categorical)
    assert inferred.ww.semantic_tags == {"category"}
Ejemplo n.º 13
0
def test_init_series_valid_conversion_inferred_ltype(sample_series):
    """With no logical type given, the series must be inferred as Categorical."""
    # Koalas series are cast to 'str', every other series type to 'object'.
    source_dtype = 'str' if (ks and isinstance(sample_series, ks.Series)) else 'object'
    untyped = sample_series.astype(source_dtype)

    inferred = init_series(untyped)

    assert inferred is not untyped
    assert inferred.dtype == _get_valid_dtype(type(untyped), Categorical)
    assert inferred.ww.logical_type == Categorical
    assert inferred.ww.semantic_tags == {'category'}
Ejemplo n.º 14
0
def test_latlong_formatting_with_init_series(latlongs):
    """Every series in ``latlongs`` must format to the same expected LatLong series."""
    reference = pd.Series([(1.0, 2.0), (3.0, 4.0)])
    # Build the reference with the same library as the inputs under test.
    if dd and isinstance(latlongs[0], dd.Series):
        reference = dd.from_pandas(reference, npartitions=2)
    elif ks and isinstance(latlongs[0], ks.Series):
        reference = ks.Series([[1.0, 2.0], [3.0, 4.0]])
    reference.ww.init(logical_type=LatLong)

    for candidate in latlongs:
        formatted = init_series(candidate, logical_type=LatLong)
        assert formatted.ww.logical_type == LatLong
        pd.testing.assert_series_equal(
            to_pandas(formatted),
            to_pandas(reference),
        )
        assert reference.ww._schema == formatted.ww._schema
Ejemplo n.º 15
0
    def __setitem__(self, col_name, column):
        """Store ``column`` in the dataframe under ``col_name`` and record its schema.

        Args:
            col_name: Name of the column to set.
            column: Series to assign. If it has no Woodwork schema yet, it is
                initialized with ``init_series`` using standard tags.

        Raises:
            ValueError: If ``column`` is not a Series of an available library.
            KeyError: If ``col_name`` is the current index or time index.
        """
        # Only the libraries that are actually available contribute a Series type.
        accepted_types = tuple(lib.Series for lib in (pd, dd, ks) if lib)
        if not isinstance(column, accepted_types):
            raise ValueError('New column must be of Series type')

        # Don't allow reassigning of index or time index with setitem
        protected_columns = (
            ('index',
             'Cannot reassign index. Change column name and then use df.ww.set_index to reassign index.'),
            ('time_index',
             'Cannot reassign time index. Change column name and then use df.ww.set_time_index to reassign time index.'),
        )
        for attr_name, message in protected_columns:
            # Looked up lazily so the time index is only read once the index check passes.
            if getattr(self, attr_name) == col_name:
                raise KeyError(message)

        typed_column = column
        if typed_column.ww._schema is None:
            typed_column = init_series(typed_column, use_standard_tags=True)

        self._dataframe[col_name] = typed_column
        self._schema.columns[col_name] = typed_column.ww._schema
Ejemplo n.º 16
0
def test_init_series_all_parameters(sample_series):
    """Pass every init_series option and check each one on the returned series."""
    # Koalas series are cast to 'str', every other series type to 'object'.
    source_dtype = 'str' if (ks and isinstance(sample_series, ks.Series)) else 'object'
    untyped = sample_series.astype(source_dtype)

    typing_options = {
        'logical_type': 'categorical',
        'semantic_tags': ['custom_tag'],
        'metadata': {'meta_key': 'meta_value'},
        'description': 'custom description',
        'use_standard_tags': False,
    }
    result = init_series(untyped, **typing_options)

    assert result is not untyped
    assert result.dtype == _get_valid_dtype(type(untyped), Categorical)
    assert result.ww.logical_type == Categorical
    assert result.ww.semantic_tags == {'custom_tag'}
    assert result.ww.metadata == typing_options['metadata']
    assert result.ww.description == typing_options['description']
Ejemplo n.º 17
0
    def set_logical_type(self, logical_type):
        """Return a copy of the series initialized with a new logical type.

        Existing semantic tags are not carried over (``semantic_tags=None`` is
        passed). The description, a deep copy of the metadata and the current
        ``use_standard_tags`` setting are forwarded to the new series.

        Args:
            logical_type (LogicalType, str): The new logical type to set for the series.

        Returns:
            Series: A new series with the updated logical type.
        """
        if self._schema is None:
            _raise_init_error()

        # Work on a copy with no schema so the new series does not share a
        # schema object with the current series.
        # NOTE(review): ``_schema`` is set on the series object itself, not on
        # its ``ww`` accessor — confirm this is the intended attribute.
        detached_copy = self._series.copy()
        detached_copy._schema = None

        init_options = {
            'logical_type': logical_type,
            'semantic_tags': None,
            'use_standard_tags': self._schema.use_standard_tags,
            'description': self.description,
            'metadata': copy.deepcopy(self.metadata),
        }
        return init_series(detached_copy, **init_options)
Ejemplo n.º 18
0
def test_init_series_with_latlong(latlong_df):
    """Every column of ``latlong_df`` must initialize with the LatLong logical type."""
    for column_name in latlong_df:
        initialized = init_series(latlong_df[column_name], logical_type="LatLong")
        assert isinstance(initialized.ww.logical_type, LatLong)
Ejemplo n.º 19
0
def test_init_series_with_datetime(sample_datetime_series):
    """init_series with the 'datetime' logical type yields a datetime64[ns] Datetime series."""
    converted = init_series(
        sample_datetime_series,
        logical_type='datetime',
    )
    expected_dtype = 'datetime64[ns]'
    assert converted.dtype == expected_dtype
    assert converted.ww.logical_type == Datetime
Ejemplo n.º 20
0
def test_init_series_with_multidimensional_np_array():
    """Verify init_series raises ValueError for a numpy array that is not 1 dimensional."""
    import re

    input_ = np.array([["a", "b"], ["a", "b"]])
    # ``match`` is a regex search; escape so the dots in "np.ndarray" and the
    # sentence-ending period only match literal periods.
    error_message = re.escape(
        f"np.ndarray input must be 1 dimensional. Current np.ndarray is {input_.ndim} dimensional"
    )
    with pytest.raises(ValueError, match=error_message):
        init_series(input_)
Ejemplo n.º 21
0
def test_init_series_with_datetime(sample_datetime_series):
    """init_series with the "datetime" logical type yields a datetime64[ns] Datetime series."""
    converted = init_series(
        sample_datetime_series,
        logical_type="datetime",
    )
    expected_dtype = "datetime64[ns]"
    assert converted.dtype == expected_dtype
    assert isinstance(converted.ww.logical_type, Datetime)