Beispiel #1
0
    def test_dataframe_input_unsupported(self):
        """
        Check that feature inputs which are not supported are
        rejected with a ValueError carrying the expected message
        """
        # Dataframes holding a timestamp column or a plain string column.
        # Both are fed to the same validator instance.
        rejected_frames = (
            ("Auto-sklearn does not support time",
             pd.DataFrame({'datetime': [pd.Timestamp('20180310')]})),
            ("has invalid type object",
             pd.DataFrame({'string': ['foo']})),
        )
        shared_validator = InputValidator()
        for expected_message, frame in rejected_frames:
            with self.assertRaisesRegex(ValueError, expected_message):
                shared_validator.validate_features(frame)

        # A dict and an arbitrary object are not 2D arrays; each one is
        # checked against a brand new validator.
        for not_an_array in ({'input1': 1, 'input2': 2}, InputValidator()):
            with self.assertRaisesRegex(ValueError, "Expected 2D array, got"):
                InputValidator().validate_features(not_an_array)

        # Patch the column check so it reports two empty lists (the original
        # author's intent: pretend every column is fine). Validation of the
        # categorical frame is still expected to fail.
        categorical_frame = pd.DataFrame(data=['a', 'b', 'c'], dtype='category')
        patch_target = (
            'autosklearn.data.validation.InputValidator._check_and_get_columns_to_encode'
        )
        fresh_validator = InputValidator()
        with unittest.mock.patch(patch_target) as columns_check_mock:
            columns_check_mock.return_value = ([], [])
            with self.assertRaisesRegex(ValueError, 'Failed to convert the input'):
                fresh_validator.validate_features(categorical_frame)
Beispiel #2
0
    def test_sparse_numpy_input(self):
        """
        Makes sure that no encoder is needed when
        working with sparse float data

        A CSR feature matrix must come back as a CSR matrix and the
        numpy target as a numpy array, with both encoders left as None.
        A sparse target must be rejected with a ValueError.
        """
        validator = InputValidator()

        # Sparse data
        row_ind = np.array([0, 1, 2])
        col_ind = np.array([1, 2, 1])
        X_sparse = sparse.csr_matrix((np.ones(3), (row_ind, col_ind)))
        X = validator.validate_features(X_sparse)
        y = validator.validate_target(np.array(self.y))

        # Check against the public class: the ``sparse.csr`` submodule is a
        # private namespace that SciPy has deprecated.
        self.assertIsInstance(X, sparse.csr_matrix)
        self.assertIsInstance(y, np.ndarray)
        self.assertIsNone(validator.target_encoder)
        self.assertIsNone(validator.feature_encoder)

        # Sparse targets should not be supported
        data = np.array([1, 2, 3, 4, 5, 6])
        col = np.array([0, 0, 0, 0, 0, 0])
        row = np.array([0, 2, 3, 6, 7, 10])
        y = sparse.csr_matrix((data, (row, col)), shape=(11, 1))
        with self.assertRaisesRegex(ValueError, 'scipy.sparse.csr_matrix.todense'):
            # The call is expected to raise, so its result is not kept
            InputValidator().validate_target(y)
Beispiel #3
0
    def test_no_new_category_after_fit(self):
        """
        After a validator has seen data, categorical values that were
        not present at that time must be rejected, for the features
        as well as for the targets
        """
        def categorical_frame(b_column):
            # Two-column dataframe where every column has dtype category
            return pd.DataFrame(
                {'A': [1, 2, 3, 4], 'B': b_column},
                dtype='category',
            )

        # Numerical columns only: new values after fit raise nothing
        features = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]})
        targets = pd.DataFrame([1, 2, 3, 4])
        validator = InputValidator()
        validator.validate(features, targets, is_classification=True)
        validator.validate_features(features)
        features['A'] = features['A'].apply(lambda value: value * value)
        validator.validate_features(features)

        # Categorical features: squaring column A introduces unseen values
        features = categorical_frame([5, 6, 7, 8])
        targets = pd.DataFrame([1, 2, 3, 4])
        validator = InputValidator()
        validator.validate(features, targets, is_classification=True)
        validator.validate_features(features)
        features['A'] = features['A'].apply(lambda value: value * value)
        with self.assertRaisesRegex(
            ValueError,
            'During fit, the input features contained categorical values',
        ):
            validator.validate_features(features)

        # Single-column target (label encoder case): 5 was never seen
        unseen_label_target = pd.DataFrame([1, 2, 5, 4])
        with self.assertRaisesRegex(
            ValueError,
            'During fit, the target array contained the categorical',
        ):
            validator.validate_target(unseen_label_target)

        # Multi-column categorical target (ordinal encoder case): the same
        # frame serves as features and as target during fit
        fit_frame = categorical_frame([5, 6, 7, 8])
        validator = InputValidator()
        validator.validate(fit_frame, fit_frame, is_classification=True)
        validator.validate_target(categorical_frame([5, 6, 7, 8]))
        with self.assertRaisesRegex(
            ValueError,
            'During fit, the target array contained the categorical',
        ):
            # 9 in column B was not part of the fitted target
            validator.validate_target(categorical_frame([5, 9, 7, 8]))
Beispiel #4
0
    def test_all_posible_dtype_changes(self):
        """We do not allow a change in dtype once inputvalidator
        is fitted

        Every ordered pair of distinct containers (list, numpy array,
        pandas dataframe) is tried: the first is validated, then the
        second must be rejected with a ValueError, both for targets
        and for features.
        """
        data = [[1, 0, 1], [1, 1, 1]]
        containers = [data, np.array(data), pd.DataFrame(data)]

        for first, second in itertools.permutations(containers, r=2):
            # One sub-test per pair so a failure is reported together with
            # the container types that triggered it
            with self.subTest(
                first=type(first).__name__,
                second=type(second).__name__,
            ):
                validator = InputValidator()
                validator.validate_target(first)
                with self.assertRaisesRegex(
                    ValueError,
                    "Auto-sklearn previously received targets of type",
                ):
                    validator.validate_target(second)
                validator.validate_features(first)
                with self.assertRaisesRegex(
                    ValueError,
                    "Auto-sklearn previously received features of type",
                ):
                    validator.validate_features(second)
Beispiel #5
0
    def test_numpy_input(self):
        """
        Runs the features and target validation methods on numpy
        arrays and checks that both results are numpy arrays and
        that neither encoder has been set
        """
        features = np.array(self.X)
        targets = np.array(self.y)

        validator = InputValidator()
        X = validator.validate_features(features)
        y = validator.validate_target(targets)

        for validated in (X, y):
            self.assertIsInstance(validated, np.ndarray)
        # Target encoder first, then feature encoder
        for encoder in (validator.target_encoder, validator.feature_encoder):
            self.assertIsNone(encoder)
Beispiel #6
0
    def test_dataframe_input_numerical(self):
        """
        Makes sure that we don't encode numerical data

        For each numerical dtype, dataframe features and targets must
        come back as numpy arrays with both encoders left as None.
        """
        for test_type in ['int64', 'float64', 'int8']:
            # One sub-test per dtype so a failure is reported together
            # with the dtype that triggered it
            with self.subTest(dtype=test_type):
                validator = InputValidator()
                X = validator.validate_features(
                    pd.DataFrame(data=self.X, dtype=test_type))
                y = validator.validate_target(
                    pd.DataFrame(data=self.y, dtype=test_type))

                self.assertIsInstance(X, np.ndarray)
                self.assertIsInstance(y, np.ndarray)
                self.assertIsNone(validator.target_encoder)
                self.assertIsNone(validator.feature_encoder)
Beispiel #7
0
    def test_dataframe_input_categorical(self):
        """
        Makes sure we automatically encode categorical data

        For the bool and category dtypes, dataframe features and
        targets must come back as numpy arrays and both encoders
        must be set.
        """
        for test_type in ['bool', 'category']:
            # One sub-test per dtype so a failure is reported together
            # with the dtype that triggered it
            with self.subTest(dtype=test_type):
                validator = InputValidator()
                X = validator.validate_features(
                    pd.DataFrame(data=self.X, dtype=test_type))
                y = validator.validate_target(
                    pd.DataFrame(data=self.y, dtype=test_type),
                    is_classification=True,
                )

                self.assertIsInstance(X, np.ndarray)
                self.assertIsInstance(y, np.ndarray)
                self.assertIsNotNone(validator.target_encoder)
                self.assertIsNotNone(validator.feature_encoder)