def test_big_dataset_encoding(self):
    """
    Validate a real OpenML dataframe mixing categorical and numerical columns.

    The dataset with ``data_id=2`` is fetched as a dataframe. Validation is
    first expected to fail because of missing values, then to succeed once
    the columns containing NaN are dropped and a single all-NaN column is
    appended.

    NOTE: fetch_openml needs the dataset to be downloadable (or already
    cached locally).
    """
    x, y = sklearn.datasets.fetch_openml(data_id=2, return_X_y=True, as_frame=True)
    validator = InputValidator()

    # The raw frame is expected to be rejected with this message
    with self.assertRaisesRegex(
        ValueError,
        'Categorical features in a dataframe cannot contain missing/NaN'
    ):
        validator.validate(x, y, is_classification=True)

    # Make sure translation works apart from NaN.
    # NaN is not supported in categories, so drop columns with them.
    # Also, do a proof of concept that an all-NaN column is preserved,
    # so that the pipeline can deal with it.
    # axis/how are passed by keyword: pandas >= 2.0 no longer accepts
    # them positionally.
    x = x.dropna(axis='columns', how='any')
    x.insert(len(x.columns), 'NaNColumn', np.nan, allow_duplicates=True)
    x_t, y_t = validator.validate(x, y, is_classification=True)
    self.assertTupleEqual(np.shape(x), np.shape(x_t))

    # The appended all-NaN column must still be the last one after validation
    self.assertTrue(np.all(pd.isnull(x_t[:, -1])))

    # Leave columns that are complete NaN
    # The sklearn pipeline will handle that
    self.assertTrue(np.isnan(x_t).any())
    np.testing.assert_array_equal(
        pd.isnull(x.dropna(axis='columns', how='all')),
        pd.isnull(x.dropna(axis='columns', how='any'))
    )

    # Make sure everything was encoded to number
    self.assertTrue(np.issubdtype(x_t.dtype, np.number))

    # No change to numerical columns
    np.testing.assert_array_equal(x['carbon'].to_numpy(), x_t[:, 3])

    # Categorical columns are sorted to the beginning
    self.assertEqual(
        validator.feature_types,
        (['categorical'] * 3) + (['numerical'] * 7)
    )
    self.assertEqual(x.iloc[0, 6], 610)
    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0
    np.testing.assert_array_equal(x_t[0], [0, 0, 0, 8, 0, 0, 0.7, 610, 0, np.nan])
def test_NaN(self):
    """
    Check how NaN is handled in features and in targets.

    The assertions below expect NaN inside numerical numpy features to be
    preserved by validation, and NaN inside targets to raise a ValueError,
    for both numpy and pandas inputs.
    """
    # numpy - numerical features containing NaN - classification
    # The NaN entry has to survive validation and the output stays numeric
    x = np.array([1, 2, 3, 4, 5.0, np.nan]).reshape(-1, 1)
    y = np.array([1, 2, 3, 4, 5.0, 6.0]).reshape(-1, 1)
    validator = InputValidator()
    x_t, y_t = validator.validate(x, y, is_classification=True)
    self.assertTrue(np.issubdtype(x_t.dtype, np.number))
    self.assertTrue(np.issubdtype(y_t.dtype, np.number))
    self.assertTrue(np.isnan(x_t).any())  # Preserve NaN in features
    self.assertEqual(type_of_target(y_t), 'multiclass')
    self.assertTupleEqual(np.shape(x), np.shape(x_t))

    # numpy - continuous target containing NaN
    # NaN in target should raise an error
    y = np.random.random_sample((6, 1))
    y[1] = np.nan
    with self.assertRaisesRegex(ValueError, 'Target values cannot contain missing/NaN'):
        InputValidator().validate_target(y)

    # numpy - numerical features with a NaN row, binary-like target with NaN
    # Numerical numpy features should continue without encoding;
    # categorical encoding of NaN for the targets is not supported
    x = np.random.random_sample((4, 4))
    x[3] = np.nan
    y = np.random.choice([0.0, 1.0], 4)
    y[1] = np.nan
    x_t = InputValidator().validate_features(x)
    self.assertTrue(np.issubdtype(x_t.dtype, np.number))
    self.assertTrue(np.isnan(x_t).any())
    self.assertTupleEqual(np.shape(x), np.shape(x_t))
    # No assertion on a transformed target here: the target of this
    # section contains NaN and is only expected to be rejected below.
    with self.assertRaisesRegex(ValueError, 'Target values cannot contain missing/NaN'):
        InputValidator().validate_target(y, is_classification=True)
    with self.assertRaisesRegex(ValueError, 'Target values cannot contain missing/NaN'):
        InputValidator().validate_target(y, is_classification=False)

    # pandas - dataframe features and target, both containing NaN
    x = np.random.random_sample(4)
    x[3] = np.nan
    x = pd.DataFrame(data={'A': x, 'B': x*2})
    y = np.random.choice([0.0, 1.0], 4)
    y[1] = np.nan
    y = pd.DataFrame(y)
    # NOTE(review): both columns are float here, yet the expected message
    # talks about categorical features -- presumably the validator rejects
    # NaN in any dataframe column; confirm against InputValidator.
    with self.assertRaisesRegex(ValueError, 'Categorical features in a dataframe cannot'):
        InputValidator().validate_features(x)
    with self.assertRaisesRegex(ValueError, 'Target values cannot contain missing/NaN'):
        InputValidator().validate_target(y, is_classification=True)
    with self.assertRaisesRegex(ValueError, 'Target values cannot contain missing/NaN'):
        InputValidator().validate_target(y, is_classification=False)
def test_list_input(self):
    """
    Check that validating the fixture inputs yields numpy arrays.

    ``self.X`` and ``self.y`` are prepared outside of this method
    (presumably plain python lists built in the test set-up).
    """
    features, targets = InputValidator().validate(self.X, self.y)
    for converted in (features, targets):
        self.assertIsInstance(converted, np.ndarray)
def test_no_new_category_after_fit(self):
    """
    Check that categories unseen during the first validation are rejected.

    Numerical dataframes may change their values freely between calls,
    whereas categorical features and categorical targets are expected to
    raise a ValueError when a previously unseen category shows up.
    """
    def categorical_frame(b_values):
        # Two-column frame with every column stored as a category
        return pd.DataFrame(
            {'A': [1, 2, 3, 4], 'B': b_values},
            dtype='category',
        )

    # Numerical features only: new values must not be a problem
    features = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]})
    target = pd.DataFrame([1, 2, 3, 4])
    validator = InputValidator()
    validator.validate(features, target, is_classification=True)
    validator.validate_features(features)
    features['A'] = features['A'].apply(lambda value: value * value)
    validator.validate_features(features)

    # Categorical features: squaring column A introduces unseen categories
    features = categorical_frame([5, 6, 7, 8])
    target = pd.DataFrame([1, 2, 3, 4])
    validator = InputValidator()
    validator.validate(features, target, is_classification=True)
    validator.validate_features(features)
    features['A'] = features['A'].apply(lambda value: value * value)
    with self.assertRaisesRegex(
        ValueError,
        'During fit, the input features contained categorical values'
    ):
        validator.validate_features(features)

    # Single-column target (label encoder case): the value 5 was never seen
    unseen_target = pd.DataFrame([1, 2, 5, 4])
    with self.assertRaisesRegex(
        ValueError,
        'During fit, the target array contained the categorical'
    ):
        validator.validate_target(unseen_target)

    # Multi-column categorical target (ordinal encoder case):
    # the same frame is used both as features and as target
    features = categorical_frame([5, 6, 7, 8])
    validator = InputValidator()
    validator.validate(features, features, is_classification=True)
    validator.validate_target(categorical_frame([5, 6, 7, 8]))
    with self.assertRaisesRegex(
        ValueError,
        'During fit, the target array contained the categorical'
    ):
        # 9 in column B was not part of the fitted categories
        validator.validate_target(categorical_frame([5, 9, 7, 8]))
def test_noNaN(self):
    """
    Check that validation does not corrupt clean data.

    Classification and regression inputs are covered for numpy arrays and
    pandas dataframes. None of the inputs contains NaN and none is sparse.
    """
    def assert_numeric(*arrays):
        # Every given array must carry a numerical dtype
        for array in arrays:
            self.assertTrue(np.issubdtype(array.dtype, np.number))

    # numpy - categorical - classification and regression
    # String arrays are expected to be rejected in both modes
    letters = np.array(['a', 'b', 'c', 'a', 'b', 'c']).reshape(-1, 1)
    validator = InputValidator()
    for classification in (True, False):
        with self.assertRaisesRegex(ValueError,
                                    'the only valid dtypes are numerical ones'):
            validator.validate(letters, np.copy(letters),
                               is_classification=classification)

    # numpy - numerical - classification
    features = np.random.random_sample((4, 4))
    targets = np.random.choice([0, 1], 4)
    validator = InputValidator()
    features_out, targets_out = validator.validate(
        features, targets, is_classification=True)
    assert_numeric(features_out, targets_out)
    self.assertEqual(type_of_target(targets_out), 'binary')
    self.assertTupleEqual(np.shape(features), np.shape(features_out))
    self.assertTupleEqual(np.shape(targets), np.shape(targets_out))

    # numpy - numerical - regression
    features = np.random.random_sample((4, 4))
    targets = np.random.random_sample(4)
    validator = InputValidator()
    features_out, targets_out = validator.validate(
        features, targets, is_classification=False)
    # Valid data must come back unchanged
    np.testing.assert_array_equal(features, features_out)
    np.testing.assert_array_equal(targets, targets_out)
    self.assertEqual(type_of_target(targets_out), 'continuous')

    # pandas - categorical - classification
    features = pd.DataFrame(
        {
            'A': np.random.choice(['a', 'b'], 4),
            'B': np.random.choice(['a', 'b'], 4),
        },
        dtype='category',
    )
    targets = pd.DataFrame(np.random.choice(['c', 'd'], 4), dtype='category')
    validator = InputValidator()
    features_out, targets_out = validator.validate(
        features, targets, is_classification=True)
    assert_numeric(features_out, targets_out)
    self.assertEqual(type_of_target(targets_out), 'binary')
    self.assertTupleEqual(np.shape(features), np.shape(features_out))
    # The transformed target is compared against the flattened frame
    flat_targets = targets.to_numpy().reshape(-1)
    self.assertTupleEqual(np.shape(flat_targets), np.shape(targets_out))

    # pandas - categorical - regression
    features = pd.DataFrame(
        {
            'A': np.random.choice(['a', 'b'], 4),
            'B': np.random.choice(['a', 'b'], 4),
        },
        dtype='category',
    )
    targets = pd.DataFrame(np.random.random_sample(4))
    validator = InputValidator()
    features_out, targets_out = validator.validate(
        features, targets, is_classification=False)
    assert_numeric(features_out, targets_out)
    self.assertEqual(type_of_target(targets_out), 'continuous')
    self.assertTupleEqual(np.shape(features), np.shape(features_out))
    flat_targets = targets.to_numpy().reshape(-1)
    np.testing.assert_array_equal(flat_targets, targets_out)
    self.assertTupleEqual(np.shape(flat_targets), np.shape(targets_out))

    # pandas - numerical - classification
    features = pd.DataFrame({
        'A': np.random.random_sample(4),
        'B': np.random.choice([2.5, 1.2], 4),
    })
    targets = pd.DataFrame([1.0, 2.2, 3.2, 2.2])
    validator = InputValidator()
    features_out, targets_out = validator.validate(
        features, targets, is_classification=True)
    assert_numeric(features_out, targets_out)
    self.assertEqual(type_of_target(targets_out), 'multiclass')
    self.assertTupleEqual(np.shape(features), np.shape(features_out))
    # Float labels are expected to be mapped to consecutive integers
    np.testing.assert_array_equal(np.array([0, 1, 2, 1]), targets_out)
    flat_targets = targets.to_numpy().reshape(-1)
    self.assertTupleEqual(np.shape(flat_targets), np.shape(targets_out))

    # pandas - numerical - regression
    features = pd.DataFrame({
        'A': np.random.choice([1.5, 3.6], 4),
        'B': np.random.choice([2.5, 1.2], 4),
    })
    targets = pd.DataFrame(np.random.random_sample(4))
    validator = InputValidator()
    features_out, targets_out = validator.validate(
        features, targets, is_classification=False)
    assert_numeric(features_out, targets_out)
    self.assertEqual(type_of_target(targets_out), 'continuous')
    self.assertTupleEqual(np.shape(features), np.shape(features_out))
    flat_targets = targets.to_numpy().reshape(-1)
    self.assertTupleEqual(np.shape(flat_targets), np.shape(targets_out))
    np.testing.assert_array_equal(flat_targets, targets_out)