コード例 #1
0
    def test_classification_workflow(self):
        """Run a shift -> impute -> sparse one-hot -> tree pipeline end to end.

        Data comes from OpenML task 254 and is split 50/50 with a fixed
        random state. Both halves are converted to CSC sparse matrices
        before being fed to the pipeline. The test expects every prediction
        on both the training and the held-out half to match its label.
        """
        features, labels = openml.tasks.get_task(254).get_X_and_y()

        split = sklearn.model_selection.train_test_split(
            features, labels, random_state=3, train_size=0.5, test_size=0.5,
        )
        train_features, test_features, train_labels, test_labels = split

        # The pipeline is exercised on sparse (CSC) input.
        train_features = scipy.sparse.csc_matrix(train_features)
        test_features = scipy.sparse.csc_matrix(test_features)

        steps = (
            ('shift', CategoryShift()),
            ('imput', SimpleImputer(strategy='constant', fill_value=2)),
            ('ohe', SparseOneHotEncoder()),
            ('tree', DecisionTreeClassifier(random_state=1)),
        )
        model = sklearn.pipeline.Pipeline(steps)
        model.fit(train_features, train_labels)

        train_hits = model.predict(train_features) == train_labels
        self.assertTrue(train_hits.all())

        # Regression guard: a faulty copy inside the OneHotEncoder used to
        # rearrange the data, which pushed the accuracy down to 66%.
        test_hits = model.predict(test_features) == test_labels
        self.assertTrue(test_hits.all())
コード例 #2
0
    def test_classification_workflow(self):
        """Run a shift -> impute -> sparse one-hot -> tree pipeline end to end.

        Data is fetched with ``sklearn.datasets.fetch_openml`` (data id 24,
        returned as arrays rather than a DataFrame) and split 50/50 with a
        fixed random state. Both halves are converted to CSC sparse matrices
        before being fed to the pipeline. The test expects every prediction
        on both the training and the held-out half to match its label.
        """
        X, y = sklearn.datasets.fetch_openml(
            data_id=24, as_frame=False, return_X_y=True)

        X_train, X_test, y_train, y_test = \
            sklearn.model_selection.train_test_split(X, y, random_state=3,
                                                     train_size=0.5,
                                                     test_size=0.5)

        # The pipeline is exercised on sparse (CSC) input.
        X_train = scipy.sparse.csc_matrix(X_train)
        X_test = scipy.sparse.csc_matrix(X_test)

        pipeline = sklearn.pipeline.Pipeline((
            ('shift', CategoryShift()),
            ('imput', SimpleImputer(strategy='constant', fill_value=2)),
            ('ohe', SparseOneHotEncoder()),
            ('tree', DecisionTreeClassifier(random_state=1)),
        ))

        pipeline.fit(X_train, y_train)
        pred_train = pipeline.predict(X_train)
        self.assertTrue((pred_train == y_train).all())
        # With an incorrect copy operation the OneHotEncoder would rearrange
        # the data in such a way that the accuracy would drop to 66%
        pred_test = pipeline.predict(X_test)
        self.assertTrue((pred_test == y_test).all())
コード例 #3
0
 def test_transform_with_unknown_value(self):
     """Check that categories unseen during fit are encoded as all zeros."""
     # Training data: two identical features with values 0..5. Because the
     # matrix is sparse, the value 0 does not count as a category.
     seen_values = (0, 1, 2, 3, 4, 5)
     train_dense = np.array((seen_values, seen_values)).transpose()
     encoder = SparseOneHotEncoder()
     encoder.fit(scipy.sparse.csr_matrix(train_dense))

     # Data for transform: it contains values (6 and 7) that were never
     # presented to fit. Such unseen categories are ignored, i.e. they are
     # encoded with zeros only.
     query_dense = np.array(((0, 1, 2, 6), (0, 1, 6, 7))).transpose()
     query_sparse = scipy.sparse.csr_matrix(query_dense)
     output = encoder.transform(query_sparse).todense()

     # Only three entries of the query were seen during fit (1 and 2 in the
     # first feature, 1 in the second feature), so exactly three ones are
     # expected in the encoded output.
     self.assertEqual(3, np.sum(output))
コード例 #4
0
 def fit(self, X, y=None):
     """Choose an encoder matching the sparsity of ``X`` and fit it.

     A ``SparseOneHotEncoder`` is used when ``scipy.sparse.issparse(X)`` is
     true; otherwise a ``DenseOneHotEncoder`` is created. The chosen encoder
     is stored on ``self.preprocessor`` before it is fitted.

     Returns:
         ``self``, so calls can be chained.
     """
     self.preprocessor = (
         SparseOneHotEncoder()
         if scipy.sparse.issparse(X)
         else DenseOneHotEncoder(
             sparse=False, categories='auto', handle_unknown='ignore')
     )
     self.preprocessor.fit(X, y)
     return self
コード例 #5
0
 def fit(self,
         X: PIPELINE_DATA_DTYPE,
         y: Optional[PIPELINE_DATA_DTYPE] = None) -> 'OneHotEncoder':
     """Choose an encoder matching the sparsity of ``X`` and fit it.

     A ``DenseOneHotEncoder`` is created unless ``scipy.sparse.issparse(X)``
     is true, in which case a ``SparseOneHotEncoder`` is used. The chosen
     encoder is stored on ``self.preprocessor`` before it is fitted.

     Returns:
         ``self``, so calls can be chained.
     """
     if not scipy.sparse.issparse(X):
         encoder = DenseOneHotEncoder(sparse=False,
                                      categories='auto',
                                      handle_unknown='ignore')
     else:
         encoder = SparseOneHotEncoder()

     # Store the encoder before fitting, as the attribute is set first.
     self.preprocessor = encoder
     self.preprocessor.fit(X, y)
     return self
コード例 #6
0
    def _fit_then_transform(self, expected, input):
        """Encode ``input`` via both code paths and compare with ``expected``.

        The encoder is exercised twice: once through ``fit_transform`` and
        once through ``fit`` followed by ``transform``. Each result must be a
        ``scipy.sparse.csr_matrix`` whose dense form is almost equal to
        ``expected``. After each path, ``input`` and a copy taken beforehand
        are passed to ``self._check_arrays_equal`` (presumably to verify the
        encoder left its input unmodified -- confirm against that helper).
        """
        pristine = input.copy()

        # Path 1: a single fit_transform call.
        one_shot = SparseOneHotEncoder().fit_transform(input)
        self.assertIsInstance(one_shot, scipy.sparse.csr_matrix)
        np.testing.assert_array_almost_equal(
            expected.astype(float), one_shot.todense())
        self._check_arrays_equal(input, pristine)

        # Path 2: fit first, then transform with the fitted encoder.
        staged_encoder = SparseOneHotEncoder()
        staged_encoder.fit(input)
        two_step = staged_encoder.transform(input)
        self.assertIsInstance(two_step, scipy.sparse.csr_matrix)
        np.testing.assert_array_almost_equal(expected, two_step.todense())
        self._check_arrays_equal(input, pristine)