Esempio n. 1
0
 def test_leave_one_out(self):
     """Fit a randomized LeaveOneOutEncoder and verify both transform modes are numeric."""
     settings = {'verbose': 1, 'randomized': True, 'sigma': 0.1}
     encoder = encoders.LeaveOneOutEncoder(**settings)
     encoder.fit(X, y)

     # Transform first without the target, then with it.
     without_target = encoder.transform(X_t)
     tu.verify_numeric(without_target)
     with_target = encoder.transform(X_t, y_t)
     tu.verify_numeric(with_target)
Esempio n. 2
0
    def test_target_encoder(self):
        """Fit a TargetEncoder with custom smoothing and verify both transform modes are numeric."""
        settings = {'verbose': 1, 'smoothing': 2, 'min_samples_leaf': 2}
        encoder = encoders.TargetEncoder(**settings)
        encoder.fit(X, y)

        # Transform first without the target, then with it.
        without_target = encoder.transform(X_t)
        tu.verify_numeric(without_target)
        with_target = encoder.transform(X_t, y_t)
        tu.verify_numeric(with_target)
Esempio n. 3
0
    def test_np(self):
        """Each encoder named in ``encoders.__all__`` must fit and transform numpy arrays."""
        for name in encoders.__all__:
            with self.subTest(encoder_name=name):
                # Instantiate with default arguments and encode a numpy array.
                encoder_cls = getattr(encoders, name)
                encoder = encoder_cls()
                encoder.fit(np_X, np_y)
                transformed = encoder.transform(np_X_t)
                tu.verify_numeric(transformed)
Esempio n. 4
0
    def test_pandas_categorical(self):
        """Check OrdinalEncoder codes for an ordered pandas Categorical column."""
        ordered_column = pd.Categorical(list('bbea'),
                                        categories=['e', 'a', 'b'],
                                        ordered=True)
        X = pd.DataFrame({
            'Str': ['a', 'c', 'c', 'd'],
            'Categorical': ordered_column,
        })

        out = encoders.OrdinalEncoder().fit_transform(X)
        tu.verify_numeric(out)

        # Expected codes for the values 'b', 'b', 'e', 'a', row by row.
        encoded = out['Categorical']
        for row, expected in enumerate([3, 3, 1, 2]):
            self.assertEqual(expected, encoded[row])
Esempio n. 5
0
    def test_impact_encoders(self):
        """Check that the target-based encoders accept ``transform(X, y)``.

        For each of LeaveOneOutEncoder, TargetEncoder and WOEEncoder this
        verifies that transforming together with the target yields numeric
        output for both numpy and DataFrame inputs, and that a ValueError is
        raised when ``handle_unknown='error'`` and the data to transform
        contains a value that was not seen during fit.
        """
        for encoder_name in ['LeaveOneOutEncoder', 'TargetEncoder', 'WOEEncoder']:
            with self.subTest(encoder_name=encoder_name):
                encoder_cls = getattr(encoders, encoder_name)

                # encode a numpy array and transform with the help of the target
                enc = encoder_cls()
                enc.fit(np_X, np_y)
                tu.verify_numeric(enc.transform(np_X_t, np_y_t))

                # target is a DataFrame
                enc = encoder_cls()
                enc.fit(X, y)
                tu.verify_numeric(enc.transform(X_t, y_t))

                # when we run transform(X, y) and there is a new value in X, something is wrong and we raise an error
                enc = encoder_cls(impute_missing=True, handle_unknown='error', cols=['extra'])
                enc.fit(X, y)
                # X_t and y_t must be passed as two separate positional
                # arguments; wrapping them in a tuple would call
                # enc.transform((X_t, y_t)) with a single malformed argument
                # and the ValueError would be raised for the wrong reason.
                self.assertRaises(ValueError, enc.transform, X_t, y_t)
Esempio n. 6
0
    def test_classification(self):
        """Run every encoder in ``encoders.__all__`` through common configurations.

        Each encoder is fitted on ``X`` with the numpy target and used to
        transform ``X_t`` with: an explicit column list, ``verbose=1``,
        ``drop_invariant=True`` and ``return_df=False``. The first three must
        produce numeric output; the last must return a numpy array with the
        same number of rows as the input.
        """
        for encoder_name in encoders.__all__:
            with self.subTest(encoder_name=encoder_name):
                encoder_cls = getattr(encoders, encoder_name)
                cols = [
                    'unique_str', 'underscore', 'extra', 'none', 'invariant',
                    321, 'categorical'
                ]

                enc = encoder_cls(cols=cols)
                enc.fit(X, np_y)
                tu.verify_numeric(enc.transform(X_t))

                enc = encoder_cls(verbose=1)
                enc.fit(X, np_y)
                tu.verify_numeric(enc.transform(X_t))

                enc = encoder_cls(drop_invariant=True)
                enc.fit(X, np_y)
                tu.verify_numeric(enc.transform(X_t))

                enc = encoder_cls(return_df=False)
                enc.fit(X, np_y)
                # Transform once and reuse the result for both checks.
                transformed = enc.transform(X_t)
                # assertIsInstance reports the actual type on failure.
                self.assertIsInstance(transformed, np.ndarray)
                self.assertEqual(
                    transformed.shape[0], X_t.shape[0],
                    'Row count must not change')
Esempio n. 7
0
    def test_woe(self):
        """Exercise WOEEncoder across its main usage scenarios.

        Covers, in order: a balanced label with balanced features, explicit
        column selection (transform with and without the target, and
        fit_transform), default column selection, seeded randomization,
        an invariant target, an empty column list, missing target values,
        and ``impute_missing=False``.
        """
        cols = ['unique_str', 'underscore', 'extra', 'none', 'invariant', 321]

        # balanced label with balanced features
        X_balanced = pd.DataFrame(data=['1', '1', '1', '2', '2', '2'],
                                  columns=['col1'])
        y_balanced = [True, False, True, False, True, False]
        enc = encoders.WOEEncoder()
        enc.fit(X_balanced, y_balanced)
        X1 = enc.transform(X_balanced)
        # Compare the absolute value: "sums to 0" must also reject large
        # negative sums, which a one-sided `< 0.001` check would let through.
        self.assertTrue(
            all(X1.sum().abs() < 0.001),
            "When the class label is balanced, WoE should sum to 0 in each transformed column"
        )

        # explicit column list: transform the test set without the target
        enc = encoders.WOEEncoder(cols=cols)
        enc.fit(X, np_y)
        X1 = enc.transform(X_t)
        tu.verify_numeric(X1[cols])
        self.assertTrue(
            np.isfinite(X1[cols].values).all(),
            'There must not be any NaN, inf or -inf in the transformed columns'
        )
        self.assertEqual(len(list(X_t)), len(list(X1)),
                         'The count of attributes must not change')
        self.assertEqual(len(X_t), len(X1),
                         'The count of rows must not change')

        # explicit column list: transform the test set with the target
        X2 = enc.transform(X_t, np_y_t)
        tu.verify_numeric(X2)
        self.assertTrue(
            np.isfinite(X2[cols].values).all(),
            'There must not be any NaN, inf or -inf in the transformed columns'
        )
        self.assertEqual(len(list(X_t)), len(list(X2)),
                         'The count of attributes must not change')
        self.assertEqual(len(X_t), len(X2),
                         'The count of rows must not change')

        # explicit column list: transform the training set with the target
        X3 = enc.transform(X, np_y)
        tu.verify_numeric(X3)
        self.assertTrue(
            np.isfinite(X3[cols].values).all(),
            'There must not be any NaN, inf or -inf in the transformed columns'
        )
        self.assertEqual(len(list(X)), len(list(X3)),
                         'The count of attributes must not change')
        self.assertEqual(len(X), len(X3), 'The count of rows must not change')
        self.assertTrue(
            X3['unique_str'].var() < 0.001,
            'The unique string column must not be predictive of the label')

        # explicit column list: fit_transform on the training set
        X4 = enc.fit_transform(X, np_y)
        tu.verify_numeric(X4)
        self.assertTrue(
            np.isfinite(X4[cols].values).all(),
            'There must not be any NaN, inf or -inf in the transformed columns'
        )
        self.assertEqual(len(list(X)), len(list(X4)),
                         'The count of attributes must not change')
        self.assertEqual(len(X), len(X4), 'The count of rows must not change')
        self.assertTrue(
            X4['unique_str'].var() < 0.001,
            'The unique string column must not be predictive of the label')

        # default column selection
        enc = encoders.WOEEncoder()
        enc.fit(X, np_y)
        X1 = enc.transform(X_t)
        self.assertEqual(len(list(X_t)), len(list(X1)),
                         'The count of attributes must not change')
        self.assertEqual(len(X_t), len(X1),
                         'The count of rows must not change')
        tu.verify_numeric(X1)
        X2 = enc.transform(X_t, np_y_t)
        tu.verify_numeric(X2)
        self.assertEqual(len(list(X_t)), len(list(X2)),
                         'The count of attributes must not change')
        self.assertEqual(len(X_t), len(X2),
                         'The count of rows must not change')

        # seed
        enc = encoders.WOEEncoder(cols=cols,
                                  random_state=2001,
                                  randomized=True)
        enc.fit(X, np_y)
        X1 = enc.transform(X_t, np_y_t)
        X2 = enc.transform(X_t, np_y_t)
        self.assertTrue(
            X1.equals(X2),
            "When the seed is given, the results must be identical")
        tu.verify_numeric(X1)
        tu.verify_numeric(X2)

        # invariant target
        y_invariant = [True, True, True, True, True, True]
        enc = encoders.WOEEncoder()
        with self.assertRaises(ValueError):
            enc.fit(X_balanced, y_invariant)

        # branch coverage unit tests - no cols
        enc = encoders.WOEEncoder(cols=[])
        enc.fit(X, np_y)
        self.assertTrue(enc.transform(X_t).equals(X_t))

        # missing values in the target
        y_missing = [True, True, None, True, True, True]
        enc = encoders.WOEEncoder()
        with self.assertRaises(ValueError):
            enc.fit(X_balanced, y_missing)

        # impute missing
        enc = encoders.WOEEncoder(impute_missing=False)
        enc.fit(X, np_y)
        X1 = enc.transform(X_t)
        tu.verify_numeric(X1)
        self.assertTrue(X1.isnull().values.any())
        self.assertEqual(len(list(X_t)), len(list(X1)),
                         'The count of attributes must not change')
        self.assertEqual(len(X_t), len(X1),
                         'The count of rows must not change')

        X2 = enc.transform(X_t, np_y_t)
        tu.verify_numeric(X2)
        # Check the frame produced by this transform (X2), not the
        # earlier X1 result, for remaining missing values.
        self.assertTrue(X2.isnull().values.any())
        self.assertEqual(len(list(X_t)), len(list(X2)),
                         'The count of attributes must not change')
        self.assertEqual(len(X_t), len(X2),
                         'The count of rows must not change')