def test_onehot_fit_handle_unknown(client):
    """Check how ``fit`` reacts to data not covered by explicit categories.

    The training frame contains the value ``'a'`` in ``chars``, which is
    absent from the frame passed as ``categories``. With
    ``handle_unknown='error'`` the fit is expected to raise ``KeyError``;
    with ``handle_unknown='ignore'`` the fit is expected to complete.

    ``client`` is not used in the body; presumably it is a fixture that
    provides the Dask cluster/client -- confirm against conftest.
    """
    training_frame = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
    # Categories are supplied as a plain (non-distributed) frame.
    known_categories = DataFrame({'chars': ['c', 'b'], 'int': [0, 2]})
    distributed_training = dask_cudf.from_cudf(training_frame, npartitions=2)

    strict_encoder = OneHotEncoder(
        handle_unknown='error',
        categories=known_categories,
    )
    with pytest.raises(KeyError):
        strict_encoder.fit(distributed_training)

    # Same inputs, but unknown values are tolerated: no exception expected.
    lenient_encoder = OneHotEncoder(
        handle_unknown='ignore',
        categories=known_categories,
    )
    lenient_encoder.fit(distributed_training)
def test_onehot_transform_handle_unknown(client):
    """Check how ``transform`` reacts to values unseen during ``fit``.

    The encoder is fitted on a frame whose ``chars`` column holds
    ``'a'``/``'b'`` and then asked to transform a frame containing
    ``'c'``. With ``handle_unknown='error'`` computing the transform is
    expected to raise ``KeyError``; with ``handle_unknown='ignore'`` the
    dense result is compared against a fixed reference array.

    ``client`` is not used in the body; presumably it is a fixture that
    provides the Dask cluster/client -- confirm against conftest.
    """
    fit_frame = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
    unseen_frame = DataFrame({'chars': ['c', 'b'], 'int': [0, 2]})
    fit_ddf = dask_cudf.from_cudf(fit_frame, npartitions=2)
    unseen_ddf = dask_cudf.from_cudf(unseen_frame, npartitions=2)

    strict_encoder = OneHotEncoder(
        handle_unknown='error', sparse=False
    ).fit(fit_ddf)
    with pytest.raises(KeyError):
        # The result is lazy, so compute() is what forces evaluation here.
        strict_encoder.transform(unseen_ddf).compute()

    lenient_encoder = OneHotEncoder(
        handle_unknown='ignore', sparse=False
    ).fit(fit_ddf)
    encoded = lenient_encoder.transform(unseen_ddf)

    # Presumably columns are [chars=a, chars=b, int=0, int=2]; the unknown
    # 'c' in the first row then yields all zeros in the chars columns.
    expected = cp.array([
        [0., 0., 1., 0.],
        [0., 1., 0., 1.],
    ])
    cp.testing.assert_array_equal(encoded.compute(), expected)
def test_onehot_inverse_transform_handle_unknown(client):
    """Check ``inverse_transform`` on a row with no active category.

    An encoder fitted with ``handle_unknown='ignore'`` is given a one-hot
    array whose first row has no ``chars`` column set. The decoded frame
    is expected to hold ``None`` for that entry and the original values
    everywhere else.

    ``client`` is not used in the body; presumably it is a fixture that
    provides the Dask cluster/client -- confirm against conftest.
    """
    fit_frame = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
    fit_ddf = dask_cudf.from_cudf(fit_frame, npartitions=2)

    # First row: both leading (chars) columns are zero.
    onehot_device = cp.array([
        [0., 0., 1., 0.],
        [0., 1., 0., 1.],
    ])
    onehot_dask = da.from_array(onehot_device)

    encoder = OneHotEncoder(handle_unknown='ignore').fit(fit_ddf)
    decoded = encoder.inverse_transform(onehot_dask)

    expected = DataFrame({'chars': [None, 'b'], 'int': [0, 2]})
    # Compare on the host side as pandas frames.
    decoded_pdf = decoded.compute().to_pandas()
    assert_frame_equal(decoded_pdf, expected.to_pandas())