def test_classification_workflow(self):
    """Run a sparse one-hot-encoding pipeline end to end on OpenML task 254.

    The task data is split 50/50 with a fixed seed, converted to CSC
    matrices, and fed through shift -> impute -> one-hot encode -> decision
    tree. Both the training and the held-out predictions must match their
    labels exactly.
    """
    openml_task = openml.tasks.get_task(254)
    features, labels = openml_task.get_X_and_y()

    split = sklearn.model_selection.train_test_split(
        features, labels, random_state=3, train_size=0.5, test_size=0.5)
    train_features, test_features, train_labels, test_labels = split
    train_features = scipy.sparse.csc_matrix(train_features)
    test_features = scipy.sparse.csc_matrix(test_features)

    steps = (
        ('shift', CategoryShift()),
        ('imput', SimpleImputer(strategy='constant', fill_value=2)),
        ('ohe', SparseOneHotEncoder()),
        ('tree', DecisionTreeClassifier(random_state=1)),
    )
    model = sklearn.pipeline.Pipeline(steps)
    model.fit(train_features, train_labels)

    train_hits = model.predict(train_features) == train_labels
    self.assertTrue(train_hits.all())

    # With an incorrect copy operation the OneHotEncoder would rearrange
    # the data in such a way that the accuracy would drop to 66%
    test_hits = model.predict(test_features) == test_labels
    self.assertTrue(test_hits.all())
def test_classification_workflow(self):
    """Run a sparse one-hot-encoding pipeline end to end on OpenML data.

    Dataset 24 is fetched through ``sklearn.datasets.fetch_openml`` as plain
    arrays, split 50/50 with a fixed seed, converted to CSC matrices, and fed
    through shift -> impute -> one-hot encode -> decision tree. Both the
    training and the held-out predictions must match their labels exactly.
    """
    X, y = sklearn.datasets.fetch_openml(data_id=24, as_frame=False, return_X_y=True)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, random_state=3, train_size=0.5, test_size=0.5)

    X_train = scipy.sparse.csc_matrix(X_train)
    X_test = scipy.sparse.csc_matrix(X_test)

    # ``steps`` is documented as a list of (name, estimator) tuples.
    pipeline = sklearn.pipeline.Pipeline([
        ('shift', CategoryShift()),
        ('imput', SimpleImputer(strategy='constant', fill_value=2)),
        ('ohe', SparseOneHotEncoder()),
        ('tree', DecisionTreeClassifier(random_state=1)),
    ])

    pipeline.fit(X_train, y_train)
    pred_train = pipeline.predict(X_train)
    self.assertTrue((pred_train == y_train).all())

    # With an incorrect copy operation the OneHotEncoder would rearrange
    # the data in such a way that the accuracy would drop to 66%
    pred_test = pipeline.predict(X_test)
    self.assertTrue((pred_test == y_test).all())
def test_transform_with_unknown_value(self):
    """Check that categories unseen during fit are encoded as all zeros."""
    # Data used for fitting: two identical feature columns holding 0..5.
    # Because the input is sparse, 0 does not count as a category.
    seen_values = np.array(((0, 1, 2, 3, 4, 5), (0, 1, 2, 3, 4, 5))).transpose()
    encoder = SparseOneHotEncoder()
    encoder.fit(scipy.sparse.csr_matrix(seen_values))

    # Data used for transforming: it contains categories (6 and 7) that were
    # not present at fit time. Those are ignored, i.e. encoded with zeros only.
    new_values = np.array(((0, 1, 2, 6), (0, 1, 6, 7))).transpose()
    encoded = encoder.transform(scipy.sparse.csr_matrix(new_values)).todense()

    # Only 3 entries belong to known categories (1 and 2 in the first
    # feature, 1 in the second), so exactly three ones are expected.
    self.assertEqual(3, np.sum(encoded))
def fit(self, X, y=None):
    """Pick an encoder that matches the sparsity of ``X`` and fit it.

    Sparse input (as reported by ``scipy.sparse.issparse``) is handled by
    ``SparseOneHotEncoder``; anything else goes to ``DenseOneHotEncoder``
    configured with ``sparse=False``, ``categories='auto'`` and
    ``handle_unknown='ignore'``. The chosen encoder is stored on
    ``self.preprocessor`` before being fitted.

    Parameters
    ----------
    X : training data, sparse or dense.
    y : optional targets, forwarded unchanged to the encoder's ``fit``.

    Returns
    -------
    self
    """
    if not scipy.sparse.issparse(X):
        encoder = DenseOneHotEncoder(
            sparse=False, categories='auto', handle_unknown='ignore')
    else:
        encoder = SparseOneHotEncoder()

    # Store the encoder before fitting so it is set even if fit raises.
    self.preprocessor = encoder
    self.preprocessor.fit(X, y)
    return self
def fit(self, X: PIPELINE_DATA_DTYPE,
        y: Optional[PIPELINE_DATA_DTYPE] = None) -> 'OneHotEncoder':
    """Select and fit the one-hot encoder appropriate for ``X``.

    A ``SparseOneHotEncoder`` is used when ``scipy.sparse.issparse(X)`` is
    true; otherwise a ``DenseOneHotEncoder`` is created with
    ``sparse=False``, ``categories='auto'`` and ``handle_unknown='ignore'``.
    The selected encoder is kept on ``self.preprocessor``.

    Args:
        X: Training data, sparse or dense.
        y: Optional targets, passed straight through to the encoder's ``fit``.

    Returns:
        This instance, to allow call chaining.
    """
    self.preprocessor = (
        SparseOneHotEncoder()
        if scipy.sparse.issparse(X)
        else DenseOneHotEncoder(sparse=False, categories='auto',
                                handle_unknown='ignore')
    )
    self.preprocessor.fit(X, y)
    return self
def _fit_then_transform(self, expected, input):
    """Assert that both encoding paths yield ``expected`` and leave ``input`` intact.

    Two paths are exercised on fresh ``SparseOneHotEncoder`` instances:
    a single ``fit_transform`` call, and ``fit`` followed by ``transform``.
    Each result must be a ``scipy.sparse.csr_matrix`` whose dense form is
    almost equal to ``expected``, and ``input`` must still equal the copy
    taken before any encoding happened.
    """
    pristine = input.copy()

    # Path 1: fit and transform in a single call.
    one_shot = SparseOneHotEncoder().fit_transform(input)
    self.assertIsInstance(one_shot, scipy.sparse.csr_matrix)
    np.testing.assert_array_almost_equal(
        expected.astype(float), one_shot.todense())
    self._check_arrays_equal(input, pristine)

    # Path 2: fit first, then transform with the fitted encoder.
    two_step_encoder = SparseOneHotEncoder()
    two_step_encoder.fit(input)
    two_step = two_step_encoder.transform(input)
    self.assertIsInstance(two_step, scipy.sparse.csr_matrix)
    np.testing.assert_array_almost_equal(expected, two_step.todense())
    self._check_arrays_equal(input, pristine)