def test_onehot_inverse_transform(client, drop):
    """Round-trip a small frame through the encoder and back.

    The encoder is built with the supplied ``drop`` argument. The result of
    ``inverse_transform`` applied to the ``fit_transform`` output must equal
    the original frame once both are converted to pandas.
    """
    source = DataFrame({'g': ['M', 'F', 'F'], 'i': [1, 3, 2]})
    distributed = dask_cudf.from_cudf(source, npartitions=2)

    encoder = OneHotEncoder(drop=drop)
    encoded = encoder.fit_transform(distributed)
    decoded = encoder.inverse_transform(encoded)

    actual = decoded.compute().to_pandas()
    expected = source.to_pandas()
    assert_frame_equal(actual, expected)
def test_onehot_categories(client):
    """Encode with explicitly supplied categories and check the dense output.

    The categories frame lists values ('c' and 1) that do not occur in the
    input data; the expected array has six columns.
    """
    frame = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
    dask_frame = dask_cudf.from_cudf(frame, npartitions=2)

    categories = DataFrame({'chars': ['a', 'b', 'c'], 'int': [0, 1, 2]})
    encoder = OneHotEncoder(categories=categories, sparse=False)

    expected = cp.array([
        [1., 0., 0., 1., 0., 0.],
        [0., 1., 0., 0., 0., 1.],
    ])
    encoded = encoder.fit_transform(dask_frame)
    cp.testing.assert_array_equal(encoded.compute(), expected)
def test_onehot_inverse_transform_handle_unknown(client):
    """Decode a hand-built one-hot array with ``handle_unknown='ignore'``.

    The encoder is fitted on a two-row frame and then asked to invert a dask
    array that was not produced by the encoder itself. The expected frame has
    ``None`` as the 'chars' value of its first row.
    """
    train = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
    train_ddf = dask_cudf.from_cudf(train, npartitions=2)

    one_hot = cp.array([[0., 0., 1., 0.], [0., 1., 0., 1.]])
    one_hot_dask = da.from_array(one_hot)

    encoder = OneHotEncoder(handle_unknown='ignore').fit(train_ddf)
    decoded = encoder.inverse_transform(one_hot_dask)

    expected = DataFrame({'chars': [None, 'b'], 'int': [0, 2]})
    assert_frame_equal(decoded.compute().to_pandas(), expected.to_pandas())
def test_onehot_vs_skonehot(client):
    """Compare dense encoder output with ``SkOneHotEncoder`` on the same data.

    The frame is converted with ``from_df_to_numpy`` for the reference
    encoder and wrapped with ``dask_cudf.from_cudf`` for the encoder under
    test.
    """
    frame = DataFrame({'gender': ['Male', 'Female', 'Female'],
                       'int': [1, 3, 2]})
    reference_input = from_df_to_numpy(frame)
    dask_frame = dask_cudf.from_cudf(frame, npartitions=2)

    encoder = OneHotEncoder(sparse=False)
    reference_encoder = SkOneHotEncoder(sparse=False)

    actual = encoder.fit_transform(dask_frame)
    expected = reference_encoder.fit_transform(reference_input)

    cp.testing.assert_array_equal(actual.compute(), expected)
def test_onehot_drop_idx_first(client):
    """Check ``drop='first'`` against ``SkOneHotEncoder`` and round-trip it.

    The encoded output must match the reference encoder's output on the
    equivalent nested-list data, and ``inverse_transform`` must reproduce the
    original frame.
    """
    rows = [['c', 2, 'a'], ['b', 2, 'b']]
    frame = DataFrame({'chars': ['c', 'b'],
                       'int': [2, 2],
                       'letters': ['a', 'b']})
    dask_frame = dask_cudf.from_cudf(frame, npartitions=2)

    encoder = OneHotEncoder(sparse=False, drop='first')
    reference_encoder = SkOneHotEncoder(sparse=False, drop='first')

    encoded = encoder.fit_transform(dask_frame)
    expected = reference_encoder.fit_transform(rows)
    cp.testing.assert_array_equal(encoded.compute(), expected)

    # Round trip: decoding the encoded data should give back the input.
    decoded = encoder.inverse_transform(encoded)
    assert_frame_equal(decoded.compute().to_pandas(), frame.to_pandas())
def test_onehot_drop_one_of_each(cluster):
    """Check a per-column ``drop`` mapping against ``SkOneHotEncoder``.

    One category per column is dropped via a dict for the encoder under test
    and via the equivalent list for the reference encoder. The encoded output
    must match the reference and ``inverse_transform`` must reproduce the
    original frame.

    The test opens its own ``Client`` on ``cluster``; the client is closed in
    a ``finally`` clause so that a failing assertion does not leave it open.
    """
    client = Client(cluster)
    try:
        X_ary = [['c', 2, 'a'], ['b', 2, 'b']]
        X = DataFrame({'chars': ['c', 'b'],
                       'int': [2, 2],
                       'letters': ['a', 'b']})
        ddf = dask_cudf.from_cudf(X, npartitions=2)

        drop = {'chars': 'b', 'int': 2, 'letters': 'b'}
        enc = OneHotEncoder(sparse=False, drop=drop)
        sk_enc = SkOneHotEncoder(sparse=False, drop=['b', 2, 'b'])

        ohe = enc.fit_transform(ddf)
        ref = sk_enc.fit_transform(X_ary)
        cp.testing.assert_array_equal(ohe.compute(), ref)

        inv = enc.inverse_transform(ohe)
        assert_frame_equal(inv.compute().to_pandas(), X.to_pandas())
    finally:
        # Always release the client, even when an assertion above fails.
        client.close()
def test_onehot_drop_exceptions(cluster, drop, pattern):
    """Check that fitting with an invalid ``drop`` raises ``ValueError``.

    ``drop`` is passed to the encoder and ``pattern`` is the regular
    expression the error message must match. Both are supplied by the caller
    (presumably a pytest parametrization that is not visible here).

    The test opens its own ``Client`` on ``cluster``; the client is closed in
    a ``finally`` clause so that a failed expectation does not leave it open.
    """
    client = Client(cluster)
    try:
        X = DataFrame({'chars': ['c', 'b', 'd'], 'int': [2, 1, 0]})
        X = dask_cudf.from_cudf(X, npartitions=2)

        with pytest.raises(ValueError, match=pattern):
            OneHotEncoder(sparse=False, drop=drop).fit(X)
    finally:
        # Always release the client, even when the expected error is missing.
        client.close()
def test_onehot_fit_handle_unknown(client):
    """Fit with supplied categories that do not cover the training data.

    The categories frame lacks the value 'a' that appears in the data. With
    ``handle_unknown='error'`` the fit must raise ``KeyError``; with
    ``handle_unknown='ignore'`` the fit must complete without raising.
    """
    data = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
    categories = DataFrame({'chars': ['c', 'b'], 'int': [0, 2]})
    dask_data = dask_cudf.from_cudf(data, npartitions=2)

    strict = OneHotEncoder(handle_unknown='error', categories=categories)
    with pytest.raises(KeyError):
        strict.fit(dask_data)

    lenient = OneHotEncoder(handle_unknown='ignore', categories=categories)
    lenient.fit(dask_data)
def test_onehot_get_categories(client):
    """Check ``categories_`` of a fitted encoder against fixed arrays.

    Each entry of ``categories_`` is converted with ``to_numpy()`` and
    compared with the expected array at the same position.
    """
    frame = DataFrame({'chars': ['c', 'b', 'd'], 'ints': [2, 1, 0]})
    dask_frame = dask_cudf.from_cudf(frame, npartitions=2)

    expected_per_column = [np.array(['b', 'c', 'd']), np.array([0, 1, 2])]

    fitted = OneHotEncoder().fit(dask_frame)
    found = fitted.categories_

    for position, expected in enumerate(expected_per_column):
        np.testing.assert_array_equal(expected, found[position].to_numpy())
def test_onehot_random_inputs(client, drop, as_array, sparse, n_samples):
    """Compare against ``SkOneHotEncoder`` on generated inputs and round-trip.

    Inputs come from ``generate_inputs_from_categories``. Depending on
    ``as_array`` the data is wrapped as a dask array or as a single-partition
    dask_cudf frame. When ``sparse`` is set, both results are densified with
    ``toarray()`` before comparison. Finally the inverse transform is checked
    against the computed input using ``assert_inverse_equal``.
    """
    generated, reference_input = generate_inputs_from_categories(
        n_samples=n_samples, as_array=as_array)

    dask_input = (da.from_array(generated) if as_array
                  else dask_cudf.from_cudf(generated, npartitions=1))

    encoder = OneHotEncoder(sparse=sparse, drop=drop, categories='auto')
    reference_encoder = SkOneHotEncoder(sparse=sparse, drop=drop,
                                        categories='auto')

    encoded = encoder.fit_transform(dask_input)
    expected = reference_encoder.fit_transform(reference_input)

    actual = encoded.compute()
    if sparse:
        # Densify both sides so they can be compared element-wise.
        actual, expected = actual.toarray(), expected.toarray()
    cp.testing.assert_array_equal(actual, expected)

    decoded = encoder.inverse_transform(encoded)
    assert_inverse_equal(decoded.compute(), dask_input.compute())
def test_onehot_transform_handle_unknown(client):
    """Transform data containing a value that was not present at fit time.

    The second frame contains 'c', which is absent from the fitted frame.
    With ``handle_unknown='error'`` computing the transform must raise
    ``KeyError``; with ``handle_unknown='ignore'`` the output must equal the
    fixed expected array.
    """
    train = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
    unseen = DataFrame({'chars': ['c', 'b'], 'int': [0, 2]})
    train_ddf = dask_cudf.from_cudf(train, npartitions=2)
    unseen_ddf = dask_cudf.from_cudf(unseen, npartitions=2)

    strict = OneHotEncoder(handle_unknown='error', sparse=False).fit(train_ddf)
    with pytest.raises(KeyError):
        strict.transform(unseen_ddf).compute()

    lenient = OneHotEncoder(handle_unknown='ignore', sparse=False)
    lenient = lenient.fit(train_ddf)
    encoded = lenient.transform(unseen_ddf)

    expected = cp.array([[0., 0., 1., 0.], [0., 1., 0., 1.]])
    cp.testing.assert_array_equal(encoded.compute(), expected)