def test_error_handling(self):
    """Every encoder must reject bad input states consistently."""
    for encoder_name in encoders.__all__:
        with self.subTest(encoder_name=encoder_name):
            # Build train/score frames without the columns that would
            # trip up the encoders for unrelated reasons.
            train = tu.create_dataset(n_rows=100).drop(['unique_str', 'none'], axis=1)
            score = tu.create_dataset(n_rows=50, extras=True).drop(['unique_str', 'none'], axis=1)

            # Transforming before fitting is an illegal state.
            encoder = getattr(encoders, encoder_name)()
            with self.assertRaises(ValueError):
                encoder.transform(train)

            # Scoring data with a different attribute count must be rejected.
            encoder = getattr(encoders, encoder_name)()
            encoder.fit(train, y)
            with self.assertRaises(ValueError):
                encoder.transform(score.iloc[:, 0:3])

            # An empty column list means "encode nothing": identity transform.
            encoder = getattr(encoders, encoder_name)(cols=[])
            encoder.fit(train, y)
            self.assertTrue(encoder.transform(score).equals(score))
def test_one_hot(self):
    """Exercise OneHotEncoder option combinations and the inverse transform.

    Covers return_df, impute_missing, handle_unknown in
    {default, 'ignore', 'error'} and use_cat_names, then verifies that
    inverse_transform(transform(X)) round-trips the data.
    """
    # Unseen categories at scoring time must not change the column count.
    enc = encoders.OneHotEncoder(verbose=1, return_df=False)
    enc.fit(X)
    self.assertEqual(
        enc.transform(X_t).shape[1],
        enc.transform(X_t[X_t['extra'] != 'A']).shape[1],
        'We have to get the same count of columns')

    # With imputation, unknown values land in the '-1' indicator column.
    enc = encoders.OneHotEncoder(verbose=1, return_df=True, impute_missing=True)
    enc.fit(X)
    out = enc.transform(X_t)
    self.assertIn('extra_-1', out.columns.values)

    # handle_unknown='ignore' keeps only the indicator columns seen at fit time.
    enc = encoders.OneHotEncoder(verbose=1, return_df=True, impute_missing=True, handle_unknown='ignore')
    enc.fit(X)
    out = enc.transform(X_t)
    self.assertEqual(
        len([x for x in out.columns.values if str(x).startswith('extra_')]), 3)

    # handle_unknown='error' must raise on unseen categories.
    enc = encoders.OneHotEncoder(verbose=1, return_df=True, impute_missing=True, handle_unknown='error')
    enc.fit(X)
    with self.assertRaises(ValueError):
        enc.transform(X_t)

    # use_cat_names embeds the category value in the column name.
    enc = encoders.OneHotEncoder(verbose=1, return_df=True, handle_unknown='ignore', use_cat_names=True)
    enc.fit(X)
    out = enc.transform(X_t)
    self.assertIn('extra_A', out.columns.values)

    enc = encoders.OneHotEncoder(verbose=1, return_df=True, use_cat_names=True)
    enc.fit(X)
    out = enc.transform(X_t)
    self.assertIn('extra_-1', out.columns.values)

    # test inverse_transform; None values are excluded because they cannot
    # be round-tripped.
    X_i = tu.create_dataset(n_rows=100, has_none=False)
    X_i_t = tu.create_dataset(n_rows=50, has_none=False)
    cols = ['underscore', 'none', 'extra', 321]
    enc = encoders.OneHotEncoder(verbose=1, use_cat_names=True, cols=cols)
    enc.fit(X_i)
    obtained = enc.inverse_transform(enc.transform(X_i_t))
    obtained[321] = obtained[321].astype(
        'int64')  # numeric columns are incorrectly typed as object...
    tu.verify_inverse_transform(X_i_t, obtained)
def test_one_hot(self):
    """Exercise OneHotEncoder option combinations and the inverse transform.

    Covers return_df, impute_missing, handle_unknown in
    {default, 'ignore', 'error'} and use_cat_names, then verifies that
    inverse_transform(transform(X)) round-trips the data.
    """
    # Unseen categories at scoring time must not change the column count.
    enc = encoders.OneHotEncoder(verbose=1, return_df=False)
    enc.fit(X)
    self.assertEqual(
        enc.transform(X_t).shape[1],
        enc.transform(X_t[X_t['extra'] != 'A']).shape[1],
        'We have to get the same count of columns')

    # With imputation, unknown values land in the '-1' indicator column.
    enc = encoders.OneHotEncoder(verbose=1, return_df=True, impute_missing=True)
    enc.fit(X)
    out = enc.transform(X_t)
    self.assertIn('extra_-1', out.columns.values)

    # handle_unknown='ignore' keeps only the indicator columns seen at fit time.
    enc = encoders.OneHotEncoder(verbose=1, return_df=True, impute_missing=True, handle_unknown='ignore')
    enc.fit(X)
    out = enc.transform(X_t)
    self.assertEqual(
        len([x for x in out.columns.values if str(x).startswith('extra_')]), 3)

    enc = encoders.OneHotEncoder(verbose=1, return_df=True, impute_missing=True, handle_unknown='error')
    # The exception is already raised in fit() because transform() is called there to get
    # feature_names right.
    with self.assertRaises(ValueError):
        enc.fit(X_t)

    # use_cat_names embeds the category value in the column name.
    enc = encoders.OneHotEncoder(verbose=1, return_df=True, handle_unknown='ignore', use_cat_names=True)
    enc.fit(X)
    out = enc.transform(X_t)
    self.assertIn('extra_A', out.columns.values)

    enc = encoders.OneHotEncoder(verbose=1, return_df=True, use_cat_names=True)
    enc.fit(X)
    out = enc.transform(X_t)
    self.assertIn('extra_-1', out.columns.values)

    # test inverse_transform; None values are excluded because they cannot
    # be round-tripped.
    X_i = tu.create_dataset(n_rows=100, has_none=False)
    X_i_t = tu.create_dataset(n_rows=50, has_none=False)
    cols = ['underscore', 'none', 'extra', 321, 'categorical']
    enc = encoders.OneHotEncoder(verbose=1, use_cat_names=True, cols=cols)
    enc.fit(X_i)
    obtained = enc.inverse_transform(enc.transform(X_i_t))
    tu.verify_inverse_transform(X_i_t, obtained)
def test_handle_unknown_error(self):
    """handle_unknown='error' must raise ValueError on unseen categories."""
    # BaseN has problems with None -> ignore None
    train = tu.create_dataset(n_rows=100, has_none=False)
    score = tu.create_dataset(n_rows=50, extras=True, has_none=False)

    # HashingEncoder supports new values by design -> excluded
    for encoder_name in set(encoders.__all__) - {'HashingEncoder'}:
        with self.subTest(encoder_name=encoder_name):
            # new value during scoring
            encoder = getattr(encoders, encoder_name)(handle_unknown='error')
            encoder.fit(train, y)
            with self.assertRaises(ValueError):
                encoder.transform(score)
def test_inverse_transform(self):
    """inverse_transform(transform(X)) must reproduce X for invertible encoders."""
    # we do not allow None in these data (but "none" column without any None is ok)
    X = tu.create_dataset(n_rows=100, has_none=False)
    X_t = tu.create_dataset(n_rows=50, has_none=False)
    cols = ['underscore', 'none', 'extra', 321, 'categorical']
    # Only the invertible encoders are exercised here.
    for encoder_name in ['BaseNEncoder', 'BinaryEncoder', 'OneHotEncoder', 'OrdinalEncoder']:
        with self.subTest(encoder_name=encoder_name):
            # simple run
            enc = getattr(encoders, encoder_name)(verbose=1, cols=cols)
            enc.fit(X)
            tu.verify_inverse_transform(X_t, enc.inverse_transform(enc.transform(X_t)))
import pandas as pd
from unittest2 import TestCase  # or `from unittest import ...` if on Python 3.4+

import category_encoders.tests.test_utils as tu
import numpy as np
import category_encoders as encoders

# Shared fixtures: a training split (100 rows) and a scoring split (50 rows);
# the scoring split is built with extras=True, i.e. it contains category
# values never seen at fit time. Targets are random booleans.
np_X = tu.create_array(n_rows=100)
np_X_t = tu.create_array(n_rows=50, extras=True)
np_y = np.random.randn(np_X.shape[0]) > 0.5
np_y_t = np.random.randn(np_X_t.shape[0]) > 0.5
X = tu.create_dataset(n_rows=100)
X_t = tu.create_dataset(n_rows=50, extras=True)
y = pd.DataFrame(np_y)
y_t = pd.DataFrame(np_y_t)


class TestLeaveOneOutEncoder(TestCase):
    """Tests for encoders.LeaveOneOutEncoder."""

    def test_leave_one_out(self):
        # Randomized encoding (sigma noise) must still yield purely numeric
        # output, both with and without the target supplied at transform time.
        enc = encoders.LeaveOneOutEncoder(verbose=1, randomized=True, sigma=0.1)
        enc.fit(X, y)
        tu.verify_numeric(enc.transform(X_t))
        tu.verify_numeric(enc.transform(X_t, y_t))

    def test_leave_one_out_values(self):
        # NOTE(review): this method appears truncated in this view — only the
        # fixture frame is built; the assertions presumably follow elsewhere.
        df = pd.DataFrame({
            'color': ["a", "a", "a", "b", "b", "b"],
            'outcome': [1, 0, 0, 1, 0, 1]
        })