def test_categorical_encoder_unfitted_fail():
    """Check that a never-fitted categorical encoder refuses to transform.

    The test expects ``is_fitted()`` to be falsy on a freshly constructed
    ``CategoricalEncoder`` and ``transform`` to raise ``NotFittedError``.
    """
    encoder = column_encoders.CategoricalEncoder(["col_1"])
    assert not encoder.is_fitted()

    # Build the input up front so that only the transform call sits inside
    # the ``pytest.raises`` context.
    frame = pd.DataFrame({"col_1": ['a', 'b']})
    with pytest.raises(column_encoders.NotFittedError):
        encoder.transform(frame)
def test_categorical_encoder_numeric_nan():
    """Check that fitting on an integer column containing a null works.

    A ``TypeError`` raised while constructing or fitting the encoder is
    turned into an explicit test failure with a descriptive message.
    """
    frame = pd.DataFrame({'brand': [1, 2, 3, None]})
    failure_message = (
        "fitting categorical encoder on integers with nulls should not fail"
    )

    try:
        encoder = column_encoders.CategoricalEncoder("brand")
        encoder.fit(frame)
    except TypeError:
        pytest.fail(failure_message)
def test_categorical_encoder_max_token():
    """Check ``max_tokens`` after fitting with a very large requested limit.

    The encoder is fitted on the ``labels`` column of the module-level
    ``df`` with ``max_tokens=1e4``; afterwards ``max_tokens`` is expected
    to equal 2.
    """
    # NOTE(review): presumably the limit is reduced to the number of distinct
    # labels seen during fit -- confirm against CategoricalEncoder.fit.
    requested_limit = 1e4
    unfitted = column_encoders.CategoricalEncoder(
        ['labels'], max_tokens=requested_limit)
    fitted = unfitted.fit(df)
    assert fitted.max_tokens == 2
def test_categorical_encoder_numeric_transform():
    """Check the codes produced when transforming a numeric column.

    The encoder is fitted and applied to the same frame. The values 1, 2
    and 3 are expected to encode to 1, 2 and 3, and both missing entries
    (``np.nan`` and ``None``) are expected to encode to 0, one row per
    input value.
    """
    frame = pd.DataFrame({'brand': [1, 2, 3, 1, 2, 1, np.nan, None]})
    encoder = column_encoders.CategoricalEncoder("brand").fit(frame)

    encoded = encoder.transform(frame)
    expected = np.array([[1], [2], [3], [1], [2], [1], [0], [0]])
    assert np.array_equal(encoded, expected)
import numpy as np
import pandas as pd
import pytest

from datawig import column_encoders

# Shared fixture frame: a whitespace-separated token column ('features')
# and a categorical column ('labels') with two distinct values.
df = pd.DataFrame({
    'features': [
        'xwcxG pQldP Cel0n 5LaWO 2cjTu',
        '2cjTu YizDY u1aEa Cel0n SntTK',
        '2cjTu YizDY u1aEa Cel0n SntTK',
    ],
    'labels': ['xwcxG', 'SntTK', 'SntTK'],
})

# Encoders fitted once at import time on the shared frame.
categorical_encoder = column_encoders.CategoricalEncoder(
    ['labels'], max_tokens=3).fit(df)
sequential_encoder = column_encoders.SequentialEncoder(
    ['features'], max_tokens=50, seq_len=3).fit(df)


# CategoricalEncoder Tests


def test_categorical_encoder_unfitted_fail():
    """Check that a never-fitted categorical encoder refuses to transform.

    The test expects ``is_fitted()`` to be falsy on a freshly constructed
    ``CategoricalEncoder`` and ``transform`` to raise ``NotFittedError``.
    """
    encoder = column_encoders.CategoricalEncoder(["col_1"])
    assert not encoder.is_fitted()

    # Build the input up front so that only the transform call sits inside
    # the ``pytest.raises`` context.
    frame = pd.DataFrame({"col_1": ['a', 'b']})
    with pytest.raises(column_encoders.NotFittedError):
        encoder.transform(frame)