def test_dataframe_input_unsupported(self):
    """Ensure a clear error is raised for data inputs we do not support."""
    validator = InputValidator()

    # Datetime columns are rejected with a dedicated message.
    with self.assertRaisesRegex(ValueError, "Auto-sklearn does not support time"):
        validator.validate_features(
            pd.DataFrame({'datetime': [pd.Timestamp('20180310')]})
        )

    # Plain object (string) columns are likewise rejected.
    with self.assertRaisesRegex(ValueError, "has invalid type object"):
        validator.validate_features(
            pd.DataFrame({'string': ['foo']})
        )

    # A bare dict is not a valid 2D feature array.
    validator = InputValidator()
    with self.assertRaisesRegex(ValueError, "Expected 2D array, got"):
        validator.validate_features({'input1': 1, 'input2': 2})

    # Neither is an arbitrary object.
    validator = InputValidator()
    with self.assertRaisesRegex(ValueError, "Expected 2D array, got"):
        validator.validate_features(InputValidator())

    # If the categorical-column checker were to miss a column, the later
    # float conversion must still fail loudly instead of silently.
    validator = InputValidator()
    frame = pd.DataFrame(data=['a', 'b', 'c'], dtype='category')
    with unittest.mock.patch('autosklearn.data.validation.InputValidator._check_and_get_columns_to_encode') as mock_checker:  # noqa E501
        # Mock that all columns are ok. There should be a
        # checker to catch for bugs
        mock_checker.return_value = ([], [])
        with self.assertRaisesRegex(ValueError, 'Failed to convert the input'):
            validator.validate_features(frame)
def test_sparse_numpy_input(self):
    """Sparse float features need no encoder; sparse targets are rejected.

    Verifies that a CSR matrix of floats passes feature validation
    untouched (no feature/target encoder is fitted), and that handing a
    sparse matrix as a *target* raises with a hint to densify it.
    """
    validator = InputValidator()

    # Build a tiny 3x3 CSR matrix of ones.
    row_ind = np.array([0, 1, 2])
    col_ind = np.array([1, 2, 1])
    X_sparse = sparse.csr_matrix((np.ones(3), (row_ind, col_ind)))

    X = validator.validate_features(X_sparse)
    y = validator.validate_target(np.array(self.y))
    self.assertIsInstance(X, sparse.csr.csr_matrix)
    self.assertIsInstance(y, np.ndarray)
    self.assertIsNone(validator.target_encoder)
    self.assertIsNone(validator.feature_encoder)

    # Sparse targets should not be supported
    data = np.array([1, 2, 3, 4, 5, 6])
    col = np.array([0, 0, 0, 0, 0, 0])
    row = np.array([0, 2, 3, 6, 7, 10])
    y = sparse.csr_matrix((data, (row, col)), shape=(11, 1))
    with self.assertRaisesRegex(ValueError, 'scipy.sparse.csr_matrix.todense'):
        # Bug fix: the original assigned the (never-produced) return value
        # to `validator`, which was misleading dead code — the call raises.
        InputValidator().validate_target(y)
def test_no_new_category_after_fit(self):
    """Categories unseen during fit must be rejected at transform time.

    Covers: numerical columns (value changes are fine), categorical
    feature columns, label-encoded targets, and ordinal-encoded targets.
    """
    # First make sure no problem if no categorical
    x = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]})
    y = pd.DataFrame([1, 2, 3, 4])
    validator = InputValidator()
    validator.validate(x, y, is_classification=True)
    validator.validate_features(x)
    # Bug fix: lambda parameter renamed from `x` to `v` so it no longer
    # shadows the DataFrame variable `x` (same computation).
    x['A'] = x['A'].apply(lambda v: v * v)
    validator.validate_features(x)

    # Then make sure we catch categorical extra categories
    x = pd.DataFrame({
        'A': [1, 2, 3, 4],
        'B': [5, 6, 7, 8],
    }, dtype='category')
    y = pd.DataFrame([1, 2, 3, 4])
    validator = InputValidator()
    validator.validate(x, y, is_classification=True)
    validator.validate_features(x)
    x['A'] = x['A'].apply(lambda v: v * v)
    with self.assertRaisesRegex(
            ValueError,
            'During fit, the input features contained categorical values'):
        validator.validate_features(x)

    # For label encoder of targets
    with self.assertRaisesRegex(
            ValueError,
            'During fit, the target array contained the categorical'):
        validator.validate_target(pd.DataFrame([1, 2, 5, 4]))

    # For ordinal encoder of targets
    x = pd.DataFrame({
        'A': [1, 2, 3, 4],
        'B': [5, 6, 7, 8],
    }, dtype='category')
    validator = InputValidator()
    validator.validate(x, x, is_classification=True)
    validator.validate_target(
        pd.DataFrame({
            'A': [1, 2, 3, 4],
            'B': [5, 6, 7, 8],
        }, dtype='category'))
    with self.assertRaisesRegex(
            ValueError,
            'During fit, the target array contained the categorical'):
        validator.validate_target(
            pd.DataFrame({
                'A': [1, 2, 3, 4],
                'B': [5, 9, 7, 8],
            }, dtype='category'))
    # Redundant trailing `return` removed (a test method returns None anyway).
def test_all_posible_dtype_changes(self):
    """We do not allow a change in dtype once inputvalidator is fitted"""
    data = [[1, 0, 1], [1, 1, 1]]
    representations = [data, np.array(data), pd.DataFrame(data)]
    # Every ordered pair of distinct input representations must be rejected
    # on the second call, for both targets and features.
    for first, second in itertools.permutations(representations, r=2):
        validator = InputValidator()

        validator.validate_target(first)
        with self.assertRaisesRegex(
                ValueError,
                "Auto-sklearn previously received targets of type"):
            validator.validate_target(second)

        validator.validate_features(first)
        with self.assertRaisesRegex(
                ValueError,
                "Auto-sklearn previously received features of type"):
            validator.validate_features(second)
def test_numpy_input(self):
    """Float numpy input requires no encoding.

    Also exercises the feature/target validation entry points directly.
    """
    validator = InputValidator()

    features = validator.validate_features(np.array(self.X))
    targets = validator.validate_target(np.array(self.y))

    self.assertIsInstance(features, np.ndarray)
    self.assertIsInstance(targets, np.ndarray)
    self.assertIsNone(validator.target_encoder)
    self.assertIsNone(validator.feature_encoder)
def test_dataframe_input_numerical(self):
    """Numerical dataframes must pass through without any encoding."""
    for dtype in ('int64', 'float64', 'int8'):
        validator = InputValidator()
        features = validator.validate_features(
            pd.DataFrame(data=self.X, dtype=dtype),
        )
        targets = validator.validate_target(
            pd.DataFrame(data=self.y, dtype=dtype),
        )
        # Output is plain ndarrays and no encoder was fitted.
        self.assertIsInstance(features, np.ndarray)
        self.assertIsInstance(targets, np.ndarray)
        self.assertIsNone(validator.target_encoder)
        self.assertIsNone(validator.feature_encoder)
def test_dataframe_input_categorical(self):
    """Categorical/bool dataframes must be encoded automatically."""
    for dtype in ('bool', 'category'):
        validator = InputValidator()
        features = validator.validate_features(
            pd.DataFrame(data=self.X, dtype=dtype),
        )
        targets = validator.validate_target(
            pd.DataFrame(data=self.y, dtype=dtype),
            is_classification=True,
        )
        # Output is plain ndarrays and encoders were fitted for both sides.
        self.assertIsInstance(features, np.ndarray)
        self.assertIsInstance(targets, np.ndarray)
        self.assertIsNotNone(validator.target_encoder)
        self.assertIsNotNone(validator.feature_encoder)