def test_pickle_index():
    """A named GenericIndex survives a pickle round-trip unchanged."""
    count = 10
    index = GenericIndex(np.arange(count), name="a")
    restored = pickle.loads(pickle.dumps(index))
    # TODO: Once operations like `all` are supported on Index objects, we can
    # just use that without calling values first.
    assert (index == restored).values.all()
def test_onehot_generic_index():
    """One-hot encoding works on a frame whose Series uses a GenericIndex."""
    np.random.seed(0)
    n_rows = 33
    row_labels = np.random.randint(low=0, high=100, size=n_rows)
    frame = DataFrame()
    codes = np.random.randint(low=0, high=4, size=n_rows)
    frame["fo"] = Series(codes, index=GenericIndex(row_labels))
    encoded = frame.one_hot_encoding(
        "fo", cats=frame.fo.unique(), prefix="fo", dtype=np.int32
    )
    # Original column plus one indicator column per category 0..3.
    expected_cols = {"fo"} | {"fo_%d" % cat for cat in range(4)}
    assert set(encoded.columns) == expected_cols
    for cat in range(4):
        indicator = getattr(encoded, "fo_%d" % cat)
        np.testing.assert_array_equal(codes == cat, indicator.to_array())
def test_pickle_index():
    """A named GenericIndex compares equal after a pickle round-trip."""
    count = 10
    index = GenericIndex(np.arange(count), name="a")
    payload = pickle.dumps(index)
    restored = pickle.loads(payload)
    assert index == restored
def inverse_transform(self, X):
    """
    Convert the data back to the original representation.
    In case unknown categories are encountered (all zeros in the
    one-hot encoding), ``None`` is used to represent this category.

    The return type is the same as the type of the input used by the first
    call to fit on this estimator instance.

    Parameters
    ----------
    X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
        The transformed data.

    Returns
    -------
    X_tr : cudf.DataFrame or cupy.ndarray
        Inverse transformed array.
    """
    self._check_is_fitted()
    if cp.sparse.issparse(X):
        # cupy.sparse 7.x does not support argmax, when we upgrade cupy to
        # 8.x, we should add a condition in the
        # if close: `and not cp.sparse.issparsecsc(X)`
        # and change the following line by `X = X.tocsc()`
        X = X.toarray()
    result = DataFrame(columns=self._encoders.keys())
    # j tracks the first encoded column belonging to the current feature;
    # each feature owns a contiguous window of `enc_size` columns in X.
    j = 0
    for feature in self._encoders.keys():
        feature_enc = self._encoders[feature]
        cats = feature_enc.classes_

        if self.drop is not None:
            # Remove dropped categories
            dropped_class_idx = Series(self.drop_idx_[feature])
            dropped_class_mask = Series(cats).isin(cats[dropped_class_idx])
            if len(cats) == 1:
                # A single-category feature with drop active contributes no
                # encoded columns: reconstruct it as a constant column and
                # skip the window bookkeeping (`j` must not advance).
                inv = Series(GenericIndex(cats[0]).repeat(X.shape[0]))
                result[feature] = inv
                continue
            cats = cats[~dropped_class_mask]

        enc_size = len(cats)
        x_feature = X[:, j:j + enc_size]
        # Row-wise argmax picks the "hot" column; for a valid one-hot row
        # that is the encoded category's position.
        idx = cp.argmax(x_feature, axis=1)
        inv = Series(cats.iloc[idx]).reset_index(drop=True)

        if self.handle_unknown == 'ignore':
            # All-zero rows are unknown categories: argmax gave a bogus 0,
            # so overwrite those positions with None.
            not_null_idx = x_feature.any(axis=1)
            inv.iloc[~not_null_idx] = None
        elif self.drop is not None:
            # drop will either be None or handle_unknown will be error. If
            # self.drop is not None, then we can safely assume that all of
            # the nulls in each column are the dropped value
            dropped_mask = cp.asarray(x_feature.sum(axis=1) == 0).flatten()
            if dropped_mask.any():
                inv[dropped_mask] = feature_enc.inverse_transform(
                    Series(self.drop_idx_[feature]))[0]

        result[feature] = inv
        j += enc_size
    if self.input_type == 'array':
        try:
            result = cp.asarray(result.as_gpu_matrix())
        except ValueError:
            # Null values (unknown categories) cannot be represented in a
            # dense array, so fall back to returning the DataFrame.
            warnings.warn("The input one hot encoding contains rows with "
                          "unknown categories. Arrays do not support null "
                          "values. Returning output as a DataFrame "
                          "instead.")
    return result
def test_pickle_index():
    """A GenericIndex backed by device memory survives pickling."""
    count = 10
    device_values = rmm.to_device(np.arange(count))
    index = GenericIndex(device_values)
    restored = pickle.loads(pickle.dumps(index))
    assert index == restored