Example #1
    def test_big_dataset_encoding2(self):
        """
        Makes sure that when there are multiple classes and the
        test/train targets differ, the train and test targets are
        proactively encoded together
        """
        X, y = sklearn.datasets.fetch_openml(data_id=183, return_X_y=True, as_frame=True)
        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
            X,
            y,
            random_state=1
        )

        # Sanity-check the setup: y_test must contain classes
        # that are not present in y_train
        all_classes = set(np.unique(y_test)).union(set(np.unique(y_train)))
        elements_in_test_only = np.setdiff1d(np.unique(y_test), np.unique(y_train))
        self.assertGreater(len(elements_in_test_only), 0)

        validator = InputValidator()
        common = validator.join_and_check(
            pd.DataFrame(y),
            pd.DataFrame(y_test)
        )

        validator.validate_target(common, is_classification=True)

        encoded_classes = validator.target_encoder.classes_
        missing = all_classes - set(encoded_classes)
        self.assertEqual(len(missing), 0)
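
The same join-then-encode workflow can be sketched outside of a test case. This is a rough illustration that only uses calls exercised in these snippets; the import path (assumed here to be autosklearn.data.validation) and the toy targets are not taken from the excerpts:

import pandas as pd
from autosklearn.data.validation import InputValidator  # assumed import path

# Hypothetical targets where one class ('bird') only appears in the test split
y_train = pd.DataFrame(['cat', 'dog', 'cat', 'dog'])
y_test = pd.DataFrame(['dog', 'bird'])

validator = InputValidator()
# Join the train and test targets so the encoder is fitted on every class
common = validator.join_and_check(y_train, y_test)
validator.validate_target(common, is_classification=True)

# Both splits are now encoded with the same fitted encoder
y_train_enc = validator.validate_target(y_train)
y_test_enc = validator.validate_target(y_test)
print(validator.target_encoder.classes_)  # includes 'bird', 'cat' and 'dog'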
Example #2
    def test_binary_conversion(self):
        """
        Makes sure that an encoded target for classification
        properly retains the binary target type
        """
        validator = InputValidator()

        # Just 2 classes, 1 and 2
        y_train = validator.validate_target(
            np.array([1.0, 2.0, 2.0, 1.0], dtype=np.float64),
            is_classification=True,
        )
        self.assertEqual('binary', type_of_target(y_train))

        # Also make sure that re-using the fitted validator still yields a binary target
        y_valid = validator.validate_target(
            np.array([2.0, 2.0, 2.0, 2.0], dtype=np.float64),
            is_classification=True,
        )
        self.assertEqual('binary', type_of_target(y_valid))

        # Make sure binary targets also work with pandas DataFrames
        validator = InputValidator()

        # Just 2 classes, 1 and 2
        y_train = validator.validate_target(
            pd.DataFrame([1.0, 2.0, 2.0, 1.0], dtype='category'),
            is_classification=True,
        )
        self.assertEqual('binary', type_of_target(y_train))
Example #3
    def test_dataframe_econding_1D(self):
        """
        Test that the encoding/decoding works in 1D
        """
        validator = InputValidator()
        y = validator.validate_target(
            pd.DataFrame(data=self.y, dtype=bool),
            is_classification=True,
        )
        np.testing.assert_array_almost_equal(np.array([0, 1, 0]), y)

        # The result should not change on a repeated call
        y = validator.validate_target(pd.DataFrame(data=self.y, dtype=bool))
        np.testing.assert_array_almost_equal(np.array([0, 1, 0]), y)

        y_decoded = validator.decode_target(y)
        np.testing.assert_array_almost_equal(np.array(self.y, dtype=bool), y_decoded)

        # Now go with categorical data
        validator = InputValidator()
        y = validator.validate_target(
            pd.DataFrame(data=['a', 'a', 'b', 'c', 'a'], dtype='category'),
            is_classification=True,
        )
        np.testing.assert_array_almost_equal(np.array([0, 0, 1, 2, 0]), y)

        y_decoded = validator.decode_target(y)
        self.assertListEqual(['a', 'a', 'b', 'c', 'a'], y_decoded.tolist())
Example #4
    def test_join_and_check(self):
        validator = InputValidator()

        # Numpy Testing
        y = np.array([2, 2, 3, 4, 5])
        y_test = np.array([3, 4, 5, 6, 1])

        joined = validator.join_and_check(y, y_test)
        np.testing.assert_array_equal(
            joined,
            np.array([2, 2, 3, 4, 5, 3, 4, 5, 6, 1])
        )

        validator.validate_target(joined, is_classification=True)
        y_encoded = validator.validate_target(y)
        y_test_encoded = validator.validate_target(y_test)

        # If a joint encoding happened, shared elements should
        # map to the same encoded value
        self.assertEqual(y_encoded[2], y_test_encoded[0])

        # Pandas Testing
        validator = InputValidator()
        joined = validator.join_and_check(
            pd.DataFrame(y),
            pd.DataFrame(y_test)
        )
        np.testing.assert_array_equal(
            joined,
            pd.DataFrame([2, 2, 3, 4, 5, 3, 4, 5, 6, 1])
        )

        # List Testing
        validator = InputValidator()
        joined = validator.join_and_check(
            [2, 2, 3, 4, 5],
            [3, 4, 5, 6, 1]
        )
        np.testing.assert_array_equal(
            joined,
            [2, 2, 3, 4, 5, 3, 4, 5, 6, 1]
        )

        # Make sure the expected error messages are triggered
        y = np.array([[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]])
        y_test = np.array([3, 4, 5, 6, 1])
        with self.assertRaisesRegex(
            ValueError,
            'Train and test targets must have the same dimensionality'
        ):
            joined = validator.join_and_check(y, y_test)
        with self.assertRaisesRegex(
            ValueError,
            'Train and test targets must be of the same type'
        ):
            joined = validator.join_and_check(y, pd.DataFrame(y_test))
Example #5
    def test_sparse_numpy_input(self):
        """
        Makes sure that no encoder is needed when
        working with sparse float data
        """
        validator = InputValidator()

        # Sparse data
        row_ind = np.array([0, 1, 2])
        col_ind = np.array([1, 2, 1])
        X_sparse = sparse.csr_matrix((np.ones(3), (row_ind, col_ind)))
        X = validator.validate_features(
            X_sparse,
        )
        y = validator.validate_target(
            np.array(self.y)
        )

        self.assertIsInstance(X, sparse.csr.csr_matrix)
        self.assertIsInstance(y, np.ndarray)
        self.assertIsNone(validator.target_encoder)
        self.assertIsNone(validator.feature_encoder)

        # Sparse targets should not be supported
        data = np.array([1, 2, 3, 4, 5, 6])
        col = np.array([0, 0, 0, 0, 0, 0])
        row = np.array([0,  2,  3,  6,  7, 10])
        y = sparse.csr_matrix((data, (row, col)), shape=(11, 1))
        with self.assertRaisesRegex(ValueError, 'scipy.sparse.csr_matrix.todense'):
            validator = InputValidator().validate_target(y)
Example #6
    def test_dataframe_econding_2D(self):
        """
        Test that the encoding/decoding works in 2D
        """
        validator = InputValidator()
        multi_label = pd.DataFrame(np.array([[1, 0, 0, 1], [0, 0, 1, 1],
                                             [0, 0, 0, 0]]),
                                   dtype=bool)
        y = validator.validate_target(multi_label, is_classification=True)

        # The result should not change on a repeated call
        y_new = validator.validate_target(multi_label)
        np.testing.assert_array_almost_equal(y_new, y)

        y_decoded = validator.decode_target(y)
        np.testing.assert_array_almost_equal(y, y_decoded)
Example #7
    def test_all_posible_dtype_changes(self):
        """We do not allow a change in dtype once inputvalidator
        is fitted"""
        data = [[1, 0, 1], [1, 1, 1]]
        type_perms = list(itertools.permutations([
            data,
            np.array(data),
            pd.DataFrame(data)
        ], r=2))

        for first, second in type_perms:
            validator = InputValidator()
            validator.validate_target(first)
            with self.assertRaisesRegex(ValueError,
                                        "Auto-sklearn previously received targets of type"):
                validator.validate_target(second)
            validator.validate_features(first)
            with self.assertRaisesRegex(ValueError,
                                        "Auto-sklearn previously received features of type"):
                validator.validate_features(second)
Example #8
    def test_numpy_input(self):
        """
        Makes sure that no encoding is needed for a
        numpy float array. Also tests the features/target
        validation methods
        """
        validator = InputValidator()
        X = validator.validate_features(np.array(self.X))
        y = validator.validate_target(np.array(self.y))

        self.assertIsInstance(X, np.ndarray)
        self.assertIsInstance(y, np.ndarray)
        self.assertIsNone(validator.target_encoder)
        self.assertIsNone(validator.feature_encoder)
Example #9
    def test_dataframe_input_numerical(self):
        """
        Makes sure that we don't encode numerical data
        """
        for test_type in ['int64', 'float64', 'int8']:
            validator = InputValidator()
            X = validator.validate_features(
                pd.DataFrame(data=self.X, dtype=test_type)
            )
            y = validator.validate_target(
                pd.DataFrame(data=self.y, dtype=test_type)
            )

            self.assertIsInstance(X, np.ndarray)
            self.assertIsInstance(y, np.ndarray)
            self.assertIsNone(validator.target_encoder)
            self.assertIsNone(validator.feature_encoder)
Example #10
    def test_no_new_category_after_fit(self):
        # First make sure there is no problem when nothing is categorical
        x = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]})
        y = pd.DataFrame([1, 2, 3, 4])
        validator = InputValidator()
        validator.validate(x, y, is_classification=True)
        validator.validate_features(x)
        x['A'] = x['A'].apply(lambda x: x * x)
        validator.validate_features(x)

        # Then make sure we catch extra categories in categorical features
        x = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]},
                         dtype='category')
        y = pd.DataFrame([1, 2, 3, 4])
        validator = InputValidator()
        validator.validate(x, y, is_classification=True)
        validator.validate_features(x)
        x['A'] = x['A'].apply(lambda x: x * x)
        with self.assertRaisesRegex(
                ValueError,
                'During fit, the input features contained categorical values'):
            validator.validate_features(x)

        # For label encoder of targets
        with self.assertRaisesRegex(
                ValueError,
                'During fit, the target array contained the categorical'):
            validator.validate_target(pd.DataFrame([1, 2, 5, 4]))

        # For ordinal encoder of targets
        x = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]},
                         dtype='category')
        validator = InputValidator()
        validator.validate(x, x, is_classification=True)
        validator.validate_target(
            pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]},
                         dtype='category'))
        with self.assertRaisesRegex(
                ValueError,
                'During fit, the target array contained the categorical'):
            validator.validate_target(
                pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 9, 7, 8]},
                             dtype='category'))
        return
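
The guard exercised here can also be sketched as plain usage. The sketch below mirrors the calls in this test (fit on categorical features, then feed a value the fit never saw); the import path is an assumption, since the excerpts do not show it:

import pandas as pd
from autosklearn.data.validation import InputValidator  # assumed import path

x = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, dtype='category')
y = pd.DataFrame([1, 2, 3, 4])

validator = InputValidator()
validator.validate(x, y, is_classification=True)  # fit on the training data

# Squaring 'A' introduces the values 9 and 16, which were not seen at fit time
x_new = x.copy()
x_new['A'] = x_new['A'].apply(lambda v: v * v)
try:
    validator.validate_features(x_new)
except ValueError as err:
    print(err)  # complains about categorical values unseen during fit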
Example #11
    def test_regression_conversion(self):
        """
        Makes sure that a regression input
        properly retains the continuous target type
        """
        for input_object in [
            [1.0, 76.9, 123, 4.0, 81.1],
            np.array([1.0, 76.9, 123, 4.0, 81.1]),
            pd.DataFrame([1.0, 76.9, 123, 4.0, 81.1]),
        ]:
            validator = InputValidator()
            y_train = validator.validate_target(
                input_object,
                is_classification=False,
            )
            self.assertEqual('continuous', type_of_target(y_train))
Example #12
    def test_continuous_multioutput_conversion(self):
        """
        Makes sure that an input for regression
        properly retains the continuous multi-output target type
        """
        # Regression multi-output conversion for different datatypes
        for input_object in [
            [[31.4, 94], [40.5, 109], [25.0, 30]],
            np.array([[31.4, 94], [40.5, 109], [25.0, 30]]),
            pd.DataFrame([[31.4, 94], [40.5, 109], [25.0, 30]]),
        ]:
            validator = InputValidator()
            y_train = validator.validate_target(
                input_object,
                is_classification=False,
            )
            self.assertEqual('continuous-multioutput', type_of_target(y_train))
Example #13
    def test_multilabel_conversion(self):
        """
        Makes sure that an encoded target for classification
        properly retains the multilabel target type
        """
        # Multi-label conversion for different datatypes
        for input_object in [
            [[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]],
            np.array([[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]]),
            pd.DataFrame([[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]], dtype='category'),
        ]:
            validator = InputValidator()
            y_train = validator.validate_target(
                input_object,
                is_classification=True,
            )
            self.assertEqual('multilabel-indicator', type_of_target(y_train))
Example #14
    def test_multiclass_conversion(self):
        """
        Makes sure that an encoded target for classification
        properly retains the multiclass target type
        """
        # Multiclass conversion for different datatypes
        for input_object in [
            [1.0, 2.0, 2.0, 4.0, 3],
            np.array([1.0, 2.0, 2.0, 4.0, 3], dtype=np.float64),
            pd.DataFrame([1.0, 2.0, 2.0, 4.0, 3], dtype='category'),
        ]:
            validator = InputValidator()
            y_train = validator.validate_target(
                input_object,
                is_classification=True,
            )
            self.assertEqual('multiclass', type_of_target(y_train))
Example #15
    def test_dataframe_input_categorical(self):
        """
        Makes sure we automatically encode categorical data
        """
        for test_type in ['bool', 'category']:
            validator = InputValidator()
            X = validator.validate_features(
                pd.DataFrame(data=self.X, dtype=test_type)
            )
            y = validator.validate_target(
                pd.DataFrame(data=self.y, dtype=test_type),
                is_classification=True,
            )

            self.assertIsInstance(X, np.ndarray)
            self.assertIsInstance(y, np.ndarray)
            self.assertIsNotNone(validator.target_encoder)
            self.assertIsNotNone(validator.feature_encoder)
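
Several of these snippets rely on self.X and self.y fixtures defined in the suite's setUp, which is not part of the excerpts. As a self-contained sketch with made-up data (import path again assumed), the categorical round-trip from Example #3 and Example #15 amounts to:

import pandas as pd
from autosklearn.data.validation import InputValidator  # assumed import path

# Hypothetical stand-ins for the suite's self.X / self.y fixtures
X = pd.DataFrame([['a', 'b'], ['b', 'a'], ['a', 'a']], dtype='category')
y = pd.DataFrame(['yes', 'no', 'yes'], dtype='category')

validator = InputValidator()
X_enc = validator.validate_features(X)
y_enc = validator.validate_target(y, is_classification=True)

# Both inputs were categorical, so both encoders were fitted
assert validator.feature_encoder is not None
assert validator.target_encoder is not None
print(validator.decode_target(y_enc))  # back to the 'yes' / 'no' labels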