Exemple #1
0
    def test_join_and_check(self):
        validator = InputValidator()

        # Numpy Testing
        y = np.array([2, 2, 3, 4, 5])
        y_test = np.array([3, 4, 5, 6, 1])

        joined = validator.join_and_check(y, y_test)
        np.testing.assert_array_equal(
            joined,
            np.array([2, 2, 3, 4, 5, 3, 4, 5, 6, 1])
        )

        validator.validate_target(joined, is_classification=True)
        y_encoded = validator.validate_target(y)
        y_test_encoded = validator.validate_target(y_test)

        # If a common encoding happened, then common elements
        # should have a common encoding
        self.assertEqual(y_encoded[2], y_test_encoded[0])

        # Pandas Testing
        validator = InputValidator()
        joined = validator.join_and_check(
            pd.DataFrame(y),
            pd.DataFrame(y_test)
        )
        np.testing.assert_array_equal(
            joined,
            pd.DataFrame([2, 2, 3, 4, 5, 3, 4, 5, 6, 1])
        )

        # List Testing
        validator = InputValidator()
        joined = validator.join_and_check(
            [2, 2, 3, 4, 5],
            [3, 4, 5, 6, 1]
        )
        np.testing.assert_array_equal(
            joined,
            [2, 2, 3, 4, 5, 3, 4, 5, 6, 1]
        )

        # Make sure some messages are triggered
        y = np.array([[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]])
        y_test = np.array([3, 4, 5, 6, 1])
        with self.assertRaisesRegex(
            ValueError,
            'Train and test targets must have the same dimensionality'
        ):
            joined = validator.join_and_check(y, y_test)
        with self.assertRaisesRegex(
            ValueError,
            'Train and test targets must be of the same type'
        ):
            joined = validator.join_and_check(y, pd.DataFrame(y_test))
Exemple #2
0
    def test_big_dataset_encoding2(self):
        """
        Makes sure that when there are multiple classes,
        and test/train targets differ, we proactively encode together
        the data between test and train
        """
        X, y = sklearn.datasets.fetch_openml(data_id=183, return_X_y=True, as_frame=True)
        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
            X,
            y,
            random_state=1
        )

        # Make sure this test makes sense, so that y_test
        # and y_train have different classes
        all_classes = set(np.unique(y_test)).union(set(np.unique(y_train)))
        elements_in_test_only = np.setdiff1d(np.unique(y_test), np.unique(y_train))
        self.assertGreater(len(elements_in_test_only), 0)

        validator = InputValidator()
        common = validator.join_and_check(
            pd.DataFrame(y),
            pd.DataFrame(y_test)
        )

        validator.validate_target(common, is_classification=True)

        encoded_classes = validator.target_encoder.classes_
        missing = all_classes - set(encoded_classes)
        self.assertEqual(len(missing), 0)