Ejemplo n.º 1
0
    def test_invariance_to_data_types(self):
        """MultiClassWrapper must give the same output for NumPy and pandas input.

        The same six rows are encoded twice: once as a NumPy array with an
        integer target, and once as a DataFrame with named columns and a
        string target that groups the rows identically.
        """
        rows = [
            ['a', 'b', 'c'],
            ['a', 'b', 'c'],
            ['b', 'b', 'c'],
            ['b', 'b', 'b'],
            ['b', 'b', 'b'],
            ['a', 'b', 'a'],
        ]

        # NumPy features, integer class labels.
        numpy_features = np.array(rows)
        numpy_target = [1, 2, 3, 3, 3, 3]
        numpy_wrapper = MultiClassWrapper(encoders.TargetEncoder())
        result = numpy_wrapper.fit_transform(numpy_features, numpy_target)
        th.verify_numeric(result)

        # pandas features, string class labels.
        pandas_features = pd.DataFrame(rows, columns=['f1', 'f2', 'f3'])
        pandas_target = ['bee', 'cat', 'dog', 'dog', 'dog', 'dog']
        pandas_wrapper = MultiClassWrapper(encoders.TargetEncoder())
        result2 = pandas_wrapper.fit_transform(pandas_features, pandas_target)

        outputs_match = (result.values == result2.values).all()
        self.assertTrue(
            outputs_match,
            'The content should be the same regardless whether we pass Numpy or Pandas data type.',
        )
Ejemplo n.º 2
0
    def test_np(self):
        """Fit each encoder named in ``encoders.__all__`` on NumPy data; the transform must be numeric."""
        for encoder_name in encoders.__all__:
            with self.subTest(encoder_name=encoder_name):
                # Instantiate the encoder with its default arguments.
                encoder_cls = getattr(encoders, encoder_name)
                encoder = encoder_cls()

                encoder.fit(np_X, np_y)
                transformed = encoder.transform(np_X_t)
                th.verify_numeric(transformed)
Ejemplo n.º 3
0
    def test_target_encoder(self):
        """TargetEncoder with explicit smoothing settings must yield numeric output.

        The transform is checked both without and with a target argument.
        """
        options = {'verbose': 1, 'smoothing': 2, 'min_samples_leaf': 2}
        encoder = encoders.TargetEncoder(**options)
        encoder.fit(X, y)

        without_target = encoder.transform(X_t)
        th.verify_numeric(without_target)

        with_target = encoder.transform(X_t, y_t)
        th.verify_numeric(with_target)
Ejemplo n.º 4
0
    def test_pandas_categorical(self):
        """OrdinalEncoder on a frame holding an ordered pandas Categorical column.

        The output must be numeric, and the 'Categorical' column must carry
        the expected code for each of its four rows.
        """
        frame = pd.DataFrame({
            'Str': ['a', 'c', 'c', 'd'],
            'Categorical': pd.Categorical(list('bbea'), categories=['e', 'a', 'b'], ordered=True)
        })

        out = encoders.OrdinalEncoder().fit_transform(frame)
        th.verify_numeric(out)

        # Expected codes for the row values 'b', 'b', 'e', 'a' respectively.
        for row, expected in enumerate((3, 3, 1, 2)):
            self.assertEqual(expected, out['Categorical'][row])
    def test_impact_encoders(self):
        """Exercise the target-based encoders when ``transform`` receives a target.

        For each listed encoder this checks that:

        * fitting on the NumPy fixtures and transforming with a target gives
          numeric output;
        * the same holds for the ``X`` / ``y`` fixtures;
        * with ``handle_unknown='error'`` restricted to the 'extra' column,
          ``transform(X_t, y_t)`` raises ``ValueError``.
        """
        for encoder_name in ['LeaveOneOutEncoder', 'TargetEncoder', 'WOEEncoder', 'MEstimateEncoder', 'JamesSteinEncoder', 'CatBoostEncoder']:
            with self.subTest(encoder_name=encoder_name):

                # encode a numpy array and transform with the help of the target
                enc = getattr(encoders, encoder_name)()
                enc.fit(np_X, np_y)
                th.verify_numeric(enc.transform(np_X_t, np_y_t))

                # same check with the X / y fixtures (original note: the target is a DataFrame)
                enc = getattr(encoders, encoder_name)()
                enc.fit(X, y)
                th.verify_numeric(enc.transform(X_t, y_t))

                # when we run transform(X, y) and there is a new value in X, something is wrong and we raise an error
                enc = getattr(encoders, encoder_name)(handle_unknown='error', cols=['extra'])
                enc.fit(X, y)
                # X_t and y_t must be passed as two separate arguments. Handing
                # assertRaises a single (X_t, y_t) tuple would call
                # transform((X_t, y_t)) and never reach the unknown-value check.
                with self.assertRaises(ValueError):
                    enc.transform(X_t, y_t)
Ejemplo n.º 6
0
    def test_is_numeric_numpy(self):
        """verify_numeric must accept frames built from integer and float NumPy arrays."""
        # Whole numbers (regardless of the byte length) first, then floats;
        # none of them should raise AssertionError.
        for dtype in ('int32', 'int64', 'float32', 'float64'):
            ones = np.ones([5, 5], dtype=dtype)
            verify_numeric(pd.DataFrame(ones))
Ejemplo n.º 7
0
    def test_classification(self):
        """Run each encoder named in ``encoders.__all__`` under several constructor settings.

        With ``cols``, ``verbose`` or ``drop_invariant`` set, the transformed
        test data must be numeric. With ``return_df=False`` the transform must
        be a NumPy array with the same number of rows as the input.
        """
        for encoder_name in encoders.__all__:
            with self.subTest(encoder_name=encoder_name):
                encoder_cls = getattr(encoders, encoder_name)
                cols = [
                    'unique_str', 'underscore', 'extra', 'none', 'invariant',
                    321, 'categorical', 'na_categorical'
                ]

                # Each of these configurations must produce numeric output.
                numeric_configs = (
                    {'cols': cols},
                    {'verbose': 1},
                    {'drop_invariant': True},
                )
                for kwargs in numeric_configs:
                    enc = encoder_cls(**kwargs)
                    enc.fit(X, np_y)
                    th.verify_numeric(enc.transform(X_t))

                # Array output mode: check the type, then the row count.
                enc = encoder_cls(return_df=False)
                enc.fit(X, np_y)
                is_ndarray = isinstance(enc.transform(X_t), np.ndarray)
                self.assertTrue(is_ndarray)
                row_count = enc.transform(X_t).shape[0]
                self.assertEqual(row_count, X_t.shape[0],
                                 'Row count must not change')
Ejemplo n.º 8
0
    def test_is_numeric_pandas(self):
        """verify_numeric must accept integer DataFrames and reject string DataFrames.

        The rejection is expected to be an ``AssertionError`` specifically, so
        an unrelated failure (for example a ``NameError`` or ``TypeError``)
        cannot make this test pass by accident.
        """
        # Whole numbers, regardless of the byte length, should not raise AssertionError
        for dtype in ('int32', 'int64'):
            X = pd.DataFrame(np.ones([5, 5]), dtype=dtype)
            verify_numeric(X)

        # Strings should raise AssertionError
        X = pd.DataFrame([['a', 'b', 'c'], ['d', 'e', 'f']])
        with self.assertRaises(AssertionError):
            verify_numeric(X)
Ejemplo n.º 9
0
 def test_verify_raises_AssertionError_on_categories(self):
     """verify_numeric must raise ``AssertionError`` for a category-dtype DataFrame.

     The expected exception is pinned to ``AssertionError`` (as the test name
     states) rather than the catch-all ``Exception``, so unrelated errors are
     not mistaken for a pass.
     """
     # Categories should raise AssertionError
     X = pd.DataFrame([['a', 'b', 'c'], ['d', 'e', 'f']], dtype='category')
     with self.assertRaises(AssertionError):
         verify_numeric(X)
Ejemplo n.º 10
0
    def test_woe(self):
        """Exercise WOEEncoder across its main usage paths.

        Sections, in order:

        * balanced label with balanced features: each encoded column sums to ~0;
        * explicit ``cols``: ``transform`` (with and without a target) and
          ``fit_transform`` give numeric, finite output of unchanged shape;
        * default ``cols``: numeric output of unchanged shape;
        * ``random_state`` with ``randomized=True``: repeated transforms match;
        * invariant target and target with a missing value: ``fit`` raises
          ``ValueError``;
        * ``cols=[]``: the transform returns the input unchanged;
        * ``handle_missing='return_nan'``: the output contains nulls.
        """
        cols = [
            'unique_str', 'underscore', 'extra', 'none', 'invariant', 321,
            'categorical', 'na_categorical', 'categorical_int'
        ]

        def check_shape(original, encoded):
            # Encoding must keep both the column count and the row count.
            self.assertEqual(len(list(original)), len(list(encoded)),
                             'The count of attributes must not change')
            self.assertEqual(len(original), len(encoded),
                             'The count of rows must not change')

        def check_finite(encoded):
            # Only the explicitly encoded columns are inspected.
            self.assertTrue(
                np.isfinite(encoded[cols].values).all(),
                'There must not be any NaN, inf or -inf in the transformed columns'
            )

        def check_unique_str_not_predictive(encoded):
            self.assertTrue(
                encoded['unique_str'].var() < 0.001,
                'The unique string column must not be predictive of the label')

        # balanced label with balanced features
        X_balanced = pd.DataFrame(data=['1', '1', '1', '2', '2', '2'],
                                  columns=['col1'])
        y_balanced = [True, False, True, False, True, False]
        enc = encoders.WOEEncoder()
        enc.fit(X_balanced, y_balanced)
        X1 = enc.transform(X_balanced)
        # Compare the absolute value: "sums to 0" must also reject a large
        # negative sum, which a plain `< 0.001` would let through.
        self.assertTrue(
            (X1.sum().abs() < 0.001).all(),
            "When the class label is balanced, WoE should sum to 0 in each transformed column"
        )

        # explicit list of columns
        enc = encoders.WOEEncoder(cols=cols)
        enc.fit(X, np_y)
        X1 = enc.transform(X_t)
        th.verify_numeric(X1[cols])
        check_finite(X1)
        check_shape(X_t, X1)

        X2 = enc.transform(X_t, np_y_t)
        th.verify_numeric(X2)
        check_finite(X2)
        check_shape(X_t, X2)

        X3 = enc.transform(X, np_y)
        th.verify_numeric(X3)
        check_finite(X3)
        check_shape(X, X3)
        check_unique_str_not_predictive(X3)

        X4 = enc.fit_transform(X, np_y)
        th.verify_numeric(X4)
        check_finite(X4)
        check_shape(X, X4)
        check_unique_str_not_predictive(X4)

        # default columns
        enc = encoders.WOEEncoder()
        enc.fit(X, np_y)
        X1 = enc.transform(X_t)
        check_shape(X_t, X1)
        th.verify_numeric(X1)
        X2 = enc.transform(X_t, np_y_t)
        th.verify_numeric(X2)
        check_shape(X_t, X2)

        # seed
        enc = encoders.WOEEncoder(cols=cols,
                                  random_state=2001,
                                  randomized=True)
        enc.fit(X, np_y)
        X1 = enc.transform(X_t, np_y_t)
        X2 = enc.transform(X_t, np_y_t)
        self.assertTrue(
            X1.equals(X2),
            "When the seed is given, the results must be identical")
        th.verify_numeric(X1)
        th.verify_numeric(X2)

        # invariant target
        y_invariant = [True, True, True, True, True, True]
        enc = encoders.WOEEncoder()
        with self.assertRaises(ValueError):
            enc.fit(X_balanced, y_invariant)

        # branch coverage unit tests - no cols
        enc = encoders.WOEEncoder(cols=[])
        enc.fit(X, np_y)
        self.assertTrue(enc.transform(X_t).equals(X_t))

        # missing values in the target
        y_missing = [True, True, None, True, True, True]
        enc = encoders.WOEEncoder()
        with self.assertRaises(ValueError):
            enc.fit(X_balanced, y_missing)

        # impute missing
        enc = encoders.WOEEncoder(handle_missing='return_nan')
        enc.fit(X, np_y)
        X1 = enc.transform(X_t)
        th.verify_numeric(X1)
        self.assertTrue(X1.isnull().values.any())
        check_shape(X_t, X1)

        X2 = enc.transform(X_t, np_y_t)
        th.verify_numeric(X2)
        # Check the result that was just computed (X2), not X1 a second time.
        self.assertTrue(X2.isnull().values.any())
        check_shape(X_t, X2)
Ejemplo n.º 11
0
    # Benchmark loop: builds one result row (`rsl`) per value of `index` in
    # range(num), appends it to `results` and prints it.
    for index in range(num):
        # Row header: encoder name, 1-based run number, shape of the training data.
        rsl = [encoder_name, index + 1, X.shape]

        # HashingEncoder is additionally given max_process=index+1; every other
        # encoder is constructed with `cols` only.
        if encoder_name == 'HashingEncoder':
            enc = encoders.HashingEncoder(max_process=index+1, cols=cols)
        else:
            enc = getattr(encoders, encoder_name)(cols=cols)

        t = []  # elapsed time.time() difference for each repetition
        c = []  # mean of the values drained from `cpu_utilization` for each repetition
        for _ in range(benchmark_repeat):
            start = time.time()
            # Run get_cpu_utilization in a separate process while fit/transform execute.
            # NOTE(review): presumably it pushes samples onto `cpu_utilization` - confirm.
            proc = multiprocessing.Process(target=get_cpu_utilization, args=())
            proc.start()
            enc.fit(X, np_y)
            th.verify_numeric(enc.transform(X_t))
            end = time.time()
            proc.terminate()
            proc.join()
            # Drain everything currently held by `cpu_utilization`.
            cost = []
            while not cpu_utilization.empty():
                cost.append(cpu_utilization.get())
            t.append(end - start)
            # NOTE(review): if nothing was drained, np.mean([]) evaluates to nan - confirm this is acceptable.
            c.append(np.mean(cost))
        # Summary columns: minimum time, mean time, maximum of the per-repetition
        # means in `c`, and the mean of `c`.
        rsl.append(min(t))
        rsl.append(np.mean(t))
        rsl.append(max(c))
        rsl.append(np.mean(c))

        results.append(rsl)
        print(rsl)
 def test_leave_one_out(self):
     """LeaveOneOutEncoder (verbose, sigma=0.1) must yield numeric output with and without a target."""
     encoder = encoders.LeaveOneOutEncoder(verbose=1, sigma=0.1)
     encoder.fit(X, y)

     # First transform without a target, then with one.
     for transform_args in ((X_t,), (X_t, y_t)):
         th.verify_numeric(encoder.transform(*transform_args))