def test_fit(self):
    """Fit the RASL one-hot encoder per target and compare it to scikit-learn.

    The scikit-learn pipeline is fit once on the ``"pandas"`` training data.
    The RASL pipeline is then fit on the training data of every entry in
    ``self.tgt2creditg`` and compared through ``self._check_last_trained``.
    """
    (pandas_train_X, _), (_, _) = self.tgt2creditg["pandas"]

    # Both pipelines share a projection onto the categorical columns.
    categorical_names = categorical()(pandas_train_X)
    prefix = Map(columns={name: it[name] for name in categorical_names})
    rasl_trainable = prefix >> RaslOneHotEncoder()
    sk_trainable = prefix >> SkOneHotEncoder()

    sk_trained = sk_trainable.fit(pandas_train_X)
    for tgt, ((train_X, _train_y), (_test_X, _test_y)) in self.tgt2creditg.items():
        rasl_trained = rasl_trainable.fit(train_X)
        self._check_last_trained(sk_trained, rasl_trained, tgt)
def test_onehot_vs_skonehot(client):
    """Check the distributed encoder's dense output against scikit-learn.

    ``client`` is not used in the body; presumably it is a fixture that
    provides the Dask client for the duration of the test.
    """
    source_df = DataFrame(
        {'gender': ['Male', 'Female', 'Female'], 'int': [1, 3, 2]}
    )
    sk_input = from_df_to_numpy(source_df)
    # Spread the frame over two partitions for the distributed encoder.
    distributed_df = dask_cudf.from_cudf(source_df, npartitions=2)

    encoder = OneHotEncoder(sparse=False)
    sk_encoder = SkOneHotEncoder(sparse=False)

    encoded = encoder.fit_transform(distributed_df)
    expected = sk_encoder.fit_transform(sk_input)

    cp.testing.assert_array_equal(encoded.compute(), expected)
def test_onehot_drop_idx_first(client):
    """Check ``drop='first'`` on a two-partition frame against scikit-learn.

    Also checks that ``inverse_transform`` reproduces the original frame.
    ``client`` is not used in the body; presumably it is a fixture that
    provides the Dask client.
    """
    sk_rows = [['c', 2, 'a'], ['b', 2, 'b']]
    source_df = DataFrame(
        {'chars': ['c', 'b'], 'int': [2, 2], 'letters': ['a', 'b']}
    )
    distributed_df = dask_cudf.from_cudf(source_df, npartitions=2)

    encoder = OneHotEncoder(sparse=False, drop='first')
    sk_encoder = SkOneHotEncoder(sparse=False, drop='first')

    encoded = encoder.fit_transform(distributed_df)
    expected = sk_encoder.fit_transform(sk_rows)
    cp.testing.assert_array_equal(encoded.compute(), expected)

    # Round trip: decoding the encoded data must give back the input.
    decoded = encoder.inverse_transform(encoded)
    assert_frame_equal(decoded.compute().to_pandas(), source_df.to_pandas())
def test_onehot_vs_skonehot(as_array):
    """Check the encoder's sparse output against scikit-learn.

    When ``as_array`` is truthy the input frame is converted with
    ``_from_df_to_cupy`` and scikit-learn receives the ``cp.asnumpy`` copy
    of that array; otherwise scikit-learn receives ``from_df_to_array(X)``.
    """
    data = DataFrame({'gender': ['M', 'F', 'F'], 'int': [1, 3, 2]})
    sk_data = from_df_to_array(data)
    if as_array:
        data = _from_df_to_cupy(data)
        sk_data = cp.asnumpy(data)

    encoder = OneHotEncoder(sparse=True)
    sk_encoder = SkOneHotEncoder(sparse=True)

    encoded = encoder.fit_transform(data)
    expected = sk_encoder.fit_transform(sk_data)

    # Both results are sparse; compare their dense forms.
    cp.testing.assert_array_equal(encoded.toarray(), expected.toarray())
def test_onehot_drop_idx_first(as_array):
    """Check ``drop='first'`` dense encoding and its inverse against scikit-learn.

    When ``as_array`` is truthy the frame is converted with
    ``_from_df_to_cupy`` and scikit-learn receives the ``cp.asnumpy`` copy.
    """
    sk_data = [['c', 2, 'a'], ['b', 2, 'b']]
    data = DataFrame(
        {'chars': ['c', 'b'], 'int': [2, 2], 'letters': ['a', 'b']}
    )
    if as_array:
        data = _from_df_to_cupy(data)
        sk_data = cp.asnumpy(data)

    encoder = OneHotEncoder(sparse=False, drop='first', categories='auto')
    sk_encoder = SkOneHotEncoder(sparse=False, drop='first', categories='auto')

    encoded = encoder.fit_transform(data)
    expected = sk_encoder.fit_transform(sk_data)
    cp.testing.assert_array_equal(encoded, expected)

    # Round trip: decoding the encoded data must give back the input.
    decoded = encoder.inverse_transform(encoded)
    assert_inverse_equal(decoded, data)
def test_onehot_random_inputs(drop, sparse, n_samples, as_array):
    """Compare the encoder with scikit-learn on generated inputs.

    Inputs come from ``generate_inputs_from_categories``. The encoded output
    is compared to scikit-learn's (through ``toarray()`` when ``sparse`` is
    truthy), and ``inverse_transform`` must reproduce the input.
    """
    data, sk_data = generate_inputs_from_categories(
        n_samples=n_samples, as_array=as_array
    )

    encoder = OneHotEncoder(sparse=sparse, drop=drop, categories='auto')
    sk_encoder = SkOneHotEncoder(sparse=sparse, drop=drop, categories='auto')

    encoded = encoder.fit_transform(data)
    expected = sk_encoder.fit_transform(sk_data)

    # Sparse results are compared through their dense forms.
    if sparse:
        actual_dense, expected_dense = encoded.toarray(), expected.toarray()
    else:
        actual_dense, expected_dense = encoded, expected
    cp.testing.assert_array_equal(actual_dense, expected_dense)

    decoded = encoder.inverse_transform(encoded)
    assert_inverse_equal(decoded, data)
def test_onehot_drop_one_of_each(cluster):
    """Check per-column ``drop`` values on a two-partition frame.

    One category per column is dropped: a mapping for the encoder under
    test, the equivalent list for scikit-learn. The encoded output must
    match scikit-learn and ``inverse_transform`` must reproduce the input.

    Args:
        cluster: Passed to ``Client``; the client is closed when the test
            finishes, whether it passes or fails.
    """
    client = Client(cluster)
    try:
        X_ary = [['c', 2, 'a'], ['b', 2, 'b']]
        X = DataFrame(
            {'chars': ['c', 'b'], 'int': [2, 2], 'letters': ['a', 'b']}
        )
        ddf = dask_cudf.from_cudf(X, npartitions=2)

        drop = {'chars': 'b', 'int': 2, 'letters': 'b'}
        enc = OneHotEncoder(sparse=False, drop=drop)
        sk_enc = SkOneHotEncoder(sparse=False, drop=['b', 2, 'b'])

        ohe = enc.fit_transform(ddf)
        ref = sk_enc.fit_transform(X_ary)
        cp.testing.assert_array_equal(ohe.compute(), ref)

        inv = enc.inverse_transform(ohe)
        assert_frame_equal(inv.compute().to_pandas(), X.to_pandas())
    finally:
        # Close the client even when an assertion fails, so a failing test
        # does not leave a connected client behind for later tests.
        client.close()
def test_onehot_sparse_drop(as_array):
    """Check sparse encoding with per-column ``drop`` values against scikit-learn.

    In frame mode the encoder under test gets a column-to-value mapping and
    scikit-learn gets the matching list. When ``as_array`` is truthy the
    data is converted with ``_from_df_to_cupy`` and both encoders get the
    result of ``_convert_drop`` instead.
    """
    data = DataFrame({'g': ['M', 'F', 'F'], 'i': [1, 3, 2], 'l': [5, 5, 6]})
    drop = {'g': 'F', 'i': 3, 'l': 6}
    sk_data = from_df_to_array(data)
    sk_drop = ['F', 3, 6]

    if as_array:
        data = _from_df_to_cupy(data)
        sk_data = cp.asnumpy(data)
        drop = sk_drop = _convert_drop(drop)

    encoder = OneHotEncoder(sparse=True, drop=drop, categories='auto')
    sk_encoder = SkOneHotEncoder(sparse=True, drop=sk_drop, categories='auto')

    encoded = encoder.fit_transform(data)
    expected = sk_encoder.fit_transform(sk_data)

    # Both results are sparse; compare their dense forms.
    cp.testing.assert_array_equal(encoded.toarray(), expected.toarray())
def test_onehot_drop_one_of_each(as_array):
    """Check dense encoding with one dropped category per column.

    In frame mode the encoder under test gets a column-to-value mapping and
    scikit-learn gets the matching list. When ``as_array`` is truthy the
    data is converted with ``_from_df_to_cupy`` and both encoders get the
    result of ``_convert_drop`` instead. The encoded output must match
    scikit-learn and ``inverse_transform`` must reproduce the input.
    """
    X = DataFrame({'chars': ['c', 'b'], 'int': [2, 2], 'letters': ['a', 'b']})
    drop = {'chars': 'b', 'int': 2, 'letters': 'b'}
    X_ary = from_df_to_array(X)
    drop_ary = ['b', 2, 'b']

    if as_array:
        X = _from_df_to_cupy(X)
        X_ary = cp.asnumpy(X)
        drop = drop_ary = _convert_drop(drop)

    enc = OneHotEncoder(sparse=False, drop=drop, categories='auto')
    ohe = enc.fit_transform(X)
    ref = SkOneHotEncoder(
        sparse=False, drop=drop_ary, categories='auto'
    ).fit_transform(X_ary)
    cp.testing.assert_array_equal(ohe, ref)

    inv = enc.inverse_transform(ohe)
    assert_inverse_equal(inv, X)
def test_onehot_random_inputs(client, drop, as_array, sparse, n_samples):
    """Compare the distributed encoder with scikit-learn on generated inputs.

    The generated input is wrapped with ``da.from_array`` when ``as_array``
    is truthy, otherwise with ``dask_cudf.from_cudf`` using one partition.
    The computed output must match scikit-learn, and ``inverse_transform``
    must reproduce the distributed input. ``client`` is not used in the
    body; presumably it is a fixture that provides the Dask client.
    """
    local_data, sk_data = generate_inputs_from_categories(
        n_samples=n_samples, as_array=as_array
    )
    distributed = (
        da.from_array(local_data)
        if as_array
        else dask_cudf.from_cudf(local_data, npartitions=1)
    )

    encoder = OneHotEncoder(sparse=sparse, drop=drop, categories='auto')
    sk_encoder = SkOneHotEncoder(sparse=sparse, drop=drop, categories='auto')

    encoded = encoder.fit_transform(distributed)
    expected = sk_encoder.fit_transform(sk_data)

    # Sparse results are compared through their dense forms.
    if sparse:
        cp.testing.assert_array_equal(
            encoded.compute().toarray(), expected.toarray()
        )
    else:
        cp.testing.assert_array_equal(encoded.compute(), expected)

    decoded = encoder.inverse_transform(encoded)
    assert_inverse_equal(decoded.compute(), distributed.compute())
def test_predict(self):
    """Check that RASL-encoded pipelines predict the same as scikit-learn.

    The reference pipeline (scikit-learn encoder plus logistic regression)
    is fit and evaluated once on the ``"pandas"`` split. For every entry in
    ``self.tgt2creditg`` the RASL pipeline is fit and its predictions must
    match the reference in shape and in values.
    """
    (pd_train_X, pd_train_y), (pd_test_X, _pd_test_y) = self.tgt2creditg["pandas"]

    categorical_names = categorical()(pd_train_X)
    prefix = Map(columns={name: it[name] for name in categorical_names})
    # Non-pandas encoder output is converted with toPandas() before the
    # classifier sees it.
    to_pd = FunctionTransformer(
        func=lambda X: X if isinstance(X, pd.DataFrame) else X.toPandas()
    )
    lr = LogisticRegression()

    sk_trainable = prefix >> SkOneHotEncoder(sparse=False) >> lr
    sk_trained = sk_trainable.fit(pd_train_X, pd_train_y)
    sk_predicted = sk_trained.predict(pd_test_X)

    rasl_trainable = prefix >> RaslOneHotEncoder(sparse=False) >> to_pd >> lr
    for tgt, ((train_X, train_y), (test_X, _test_y)) in self.tgt2creditg.items():
        rasl_trained = rasl_trainable.fit(train_X, train_y)
        rasl_predicted = rasl_trained.predict(test_X)
        self.assertEqual(sk_predicted.shape, rasl_predicted.shape, tgt)
        self.assertEqual(sk_predicted.tolist(), rasl_predicted.tolist(), tgt)
def test_transform(self):
    """Check RASL one-hot output cell by cell against scikit-learn.

    The reference pipeline is fit on the ``"pandas"`` training data and
    applied to its test data. For every entry in ``self.tgt2creditg`` the
    RASL pipeline is fit, checked with ``self._check_last_trained``, and its
    transformed test data must match the reference in shape and in every cell.
    """
    (pd_train_X, _pd_train_y), (pd_test_X, _pd_test_y) = self.tgt2creditg["pandas"]

    categorical_names = categorical()(pd_train_X)
    prefix = Map(columns={name: it[name] for name in categorical_names})
    rasl_trainable = prefix >> RaslOneHotEncoder(sparse=False)
    sk_trainable = prefix >> SkOneHotEncoder(sparse=False)

    sk_trained = sk_trainable.fit(pd_train_X)
    sk_transformed = sk_trained.transform(pd_test_X)
    expected_shape = sk_transformed.shape

    for tgt, ((train_X, _train_y), (test_X, _test_y)) in self.tgt2creditg.items():
        rasl_trained = rasl_trainable.fit(train_X)
        self._check_last_trained(sk_trained, rasl_trained, tgt)

        rasl_transformed = rasl_trained.transform(test_X)
        if tgt == "spark":
            # Convert the Spark result so it can be indexed with .iloc below.
            rasl_transformed = rasl_transformed.toPandas()

        self.assertEqual(expected_shape, rasl_transformed.shape, tgt)
        for row_idx in range(expected_shape[0]):
            for col_idx in range(expected_shape[1]):
                expected_cell = sk_transformed[row_idx, col_idx]
                actual_cell = rasl_transformed.iloc[row_idx, col_idx]
                self.assertEqual(
                    expected_cell, actual_cell, (row_idx, col_idx, tgt)
                )