Esempio n. 1
0
def test_onehot_inverse_transform(client, drop):
    """Check that ``inverse_transform`` undoes ``fit_transform``.

    A three-row frame is split into two partitions, encoded with the
    given ``drop`` setting, decoded again and compared with the original.
    ``client`` and ``drop`` are supplied by the test harness (their
    fixture / parametrization is not visible in this block).
    """
    source = DataFrame({'g': ['M', 'F', 'F'], 'i': [1, 3, 2]})
    distributed = dask_cudf.from_cudf(source, npartitions=2)

    encoder = OneHotEncoder(drop=drop)
    encoded = encoder.fit_transform(distributed)
    decoded = encoder.inverse_transform(encoded)

    # Materialize the distributed result before comparing on the host.
    recovered = decoded.compute().to_pandas()
    assert_frame_equal(recovered, source.to_pandas())
Esempio n. 2
0
def test_onehot_categories(client):
    """Encode with user-supplied categories and compare to a fixed array.

    The category frame lists three values per column while the data only
    holds two, so the expected dense output has six columns.
    """
    local_frame = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
    ddf = dask_cudf.from_cudf(local_frame, npartitions=2)
    categories = DataFrame({'chars': ['a', 'b', 'c'], 'int': [0, 1, 2]})
    encoder = OneHotEncoder(categories=categories, sparse=False)
    expected = cp.array([[1., 0., 0., 1., 0., 0.],
                         [0., 1., 0., 0., 0., 1.]])
    encoded = encoder.fit_transform(ddf)
    cp.testing.assert_array_equal(encoded.compute(), expected)
Esempio n. 3
0
def test_onehot_inverse_transform_handle_unknown(client):
    """Decode a hand-written one-hot array with ``handle_unknown='ignore'``.

    The encoder is fitted on a two-row frame, then asked to invert an
    array that was not produced by it. The expected frame has ``None``
    in ``chars`` for the first row.
    """
    train = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
    train_ddf = dask_cudf.from_cudf(train, npartitions=2)
    encoded_host = cp.array([[0., 0., 1., 0.], [0., 1., 0., 1.]])
    encoded = da.from_array(encoded_host)

    encoder = OneHotEncoder(handle_unknown='ignore').fit(train_ddf)
    decoded = encoder.inverse_transform(encoded)

    expected = DataFrame({'chars': [None, 'b'], 'int': [0, 2]})
    assert_frame_equal(decoded.compute().to_pandas(), expected.to_pandas())
Esempio n. 4
0
def test_onehot_vs_skonehot(client):
    """Compare the dense output of ``OneHotEncoder`` and ``SkOneHotEncoder``.

    The same three-row frame is fed to both encoders: as a two-partition
    dask_cudf frame to one, and through ``from_df_to_numpy`` to the other.
    """
    frame = DataFrame(
        {'gender': ['Male', 'Female', 'Female'], 'int': [1, 3, 2]}
    )
    # Build the reference input from the local frame before distributing it.
    reference_input = from_df_to_numpy(frame)
    ddf = dask_cudf.from_cudf(frame, npartitions=2)

    encoder = OneHotEncoder(sparse=False)
    reference_encoder = SkOneHotEncoder(sparse=False)

    encoded = encoder.fit_transform(ddf)
    expected = reference_encoder.fit_transform(reference_input)

    cp.testing.assert_array_equal(encoded.compute(), expected)
Esempio n. 5
0
def test_onehot_drop_idx_first(client):
    """Check ``drop='first'`` against ``SkOneHotEncoder`` and round-trip.

    The encoded output must equal the reference encoder's output for the
    same rows, and ``inverse_transform`` must give back the input frame.
    """
    rows = [['c', 2, 'a'], ['b', 2, 'b']]
    frame = DataFrame(
        {'chars': ['c', 'b'], 'int': [2, 2], 'letters': ['a', 'b']}
    )
    distributed = dask_cudf.from_cudf(frame, npartitions=2)

    encoder = OneHotEncoder(sparse=False, drop='first')
    reference_encoder = SkOneHotEncoder(sparse=False, drop='first')

    encoded = encoder.fit_transform(distributed)
    expected = reference_encoder.fit_transform(rows)
    cp.testing.assert_array_equal(encoded.compute(), expected)

    decoded = encoder.inverse_transform(encoded)
    assert_frame_equal(decoded.compute().to_pandas(), frame.to_pandas())
Esempio n. 6
0
def test_onehot_drop_one_of_each(cluster):
    """Check per-column ``drop`` values against ``SkOneHotEncoder``.

    One category per column is dropped (given as a column-name mapping to
    ``OneHotEncoder`` and as a positional list to ``SkOneHotEncoder``).
    The encoded output must match the reference, and ``inverse_transform``
    must give back the input frame.

    The test opens its own ``Client`` on ``cluster`` and closes it in a
    ``finally`` block so a failing assertion cannot leak the connection.
    """
    client = Client(cluster)
    try:
        X_ary = [['c', 2, 'a'], ['b', 2, 'b']]
        X = DataFrame(
            {'chars': ['c', 'b'], 'int': [2, 2], 'letters': ['a', 'b']}
        )
        ddf = dask_cudf.from_cudf(X, npartitions=2)

        drop = {'chars': 'b', 'int': 2, 'letters': 'b'}
        enc = OneHotEncoder(sparse=False, drop=drop)
        sk_enc = SkOneHotEncoder(sparse=False, drop=['b', 2, 'b'])
        ohe = enc.fit_transform(ddf)
        ref = sk_enc.fit_transform(X_ary)
        cp.testing.assert_array_equal(ohe.compute(), ref)
        inv = enc.inverse_transform(ohe)
        assert_frame_equal(inv.compute().to_pandas(), X.to_pandas())
    finally:
        # Always release the client, even when an assertion above fails.
        client.close()
Esempio n. 7
0
def test_onehot_drop_exceptions(cluster, drop, pattern):
    """Check that fitting with an invalid ``drop`` raises ``ValueError``.

    ``drop`` is forwarded to the encoder and ``pattern`` is the regular
    expression the error message must match; both come from the test
    harness (parametrization not visible in this block).

    The test opens its own ``Client`` on ``cluster`` and closes it in a
    ``finally`` block so a failed expectation cannot leak the connection.
    """
    client = Client(cluster)
    try:
        X = DataFrame({'chars': ['c', 'b', 'd'], 'int': [2, 1, 0]})
        X = dask_cudf.from_cudf(X, npartitions=2)

        with pytest.raises(ValueError, match=pattern):
            OneHotEncoder(sparse=False, drop=drop).fit(X)
    finally:
        # Always release the client, even if the expected error is missing.
        client.close()
Esempio n. 8
0
def test_onehot_fit_handle_unknown(client):
    """Fit with categories that do not cover the data under both policies.

    The supplied categories lack the value ``'a'`` that appears in the
    data. With ``handle_unknown='error'`` fitting must raise ``KeyError``;
    with ``handle_unknown='ignore'`` fitting must complete.
    """
    data = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
    cats = DataFrame({'chars': ['c', 'b'], 'int': [0, 2]})
    ddf = dask_cudf.from_cudf(data, npartitions=2)

    strict = OneHotEncoder(handle_unknown='error', categories=cats)
    with pytest.raises(KeyError):
        strict.fit(ddf)

    # Same input, lenient policy: no exception is expected here.
    lenient = OneHotEncoder(handle_unknown='ignore', categories=cats)
    lenient.fit(ddf)
Esempio n. 9
0
def test_onehot_get_categories(client):
    """Check ``categories_`` after fitting on a two-column frame.

    Each expected array is compared with the entry at the same position
    in ``categories_`` after converting that entry with ``to_numpy()``.
    """
    frame = DataFrame({'chars': ['c', 'b', 'd'], 'ints': [2, 1, 0]})
    ddf = dask_cudf.from_cudf(frame, npartitions=2)

    expected_per_column = [np.array(['b', 'c', 'd']), np.array([0, 1, 2])]
    fitted = OneHotEncoder().fit(ddf)
    found = fitted.categories_

    for position, expected in enumerate(expected_per_column):
        np.testing.assert_array_equal(expected, found[position].to_numpy())
Esempio n. 10
0
def test_onehot_random_inputs(client, drop, as_array, sparse, n_samples):
    """Compare encoders on generated inputs, then check the round-trip.

    ``generate_inputs_from_categories`` provides the input for
    ``OneHotEncoder`` and a second object that is fed to
    ``SkOneHotEncoder``. The first is wrapped as a dask array when
    ``as_array`` is true, otherwise as a single-partition dask_cudf
    frame. Outputs are compared densely (via ``toarray()`` when ``sparse``
    is true), and the inverse transform is compared with the input.
    """
    generated, reference_input = generate_inputs_from_categories(
        n_samples=n_samples, as_array=as_array)
    distributed = (da.from_array(generated) if as_array
                   else dask_cudf.from_cudf(generated, npartitions=1))

    encoder = OneHotEncoder(sparse=sparse, drop=drop, categories='auto')
    reference_encoder = SkOneHotEncoder(sparse=sparse, drop=drop,
                                        categories='auto')
    encoded = encoder.fit_transform(distributed)
    expected = reference_encoder.fit_transform(reference_input)

    computed = encoded.compute()
    if sparse:
        # Sparse results are densified on both sides before comparing.
        cp.testing.assert_array_equal(computed.toarray(), expected.toarray())
    else:
        cp.testing.assert_array_equal(computed, expected)

    decoded = encoder.inverse_transform(encoded)
    assert_inverse_equal(decoded.compute(), distributed.compute())
Esempio n. 11
0
def test_onehot_transform_handle_unknown(client):
    """Transform data holding a value not seen at fit time, both policies.

    The encoder is fitted on one frame and applied to another whose
    ``chars`` column contains ``'c'``, which is absent from the fit data.
    With ``handle_unknown='error'`` computing the transform must raise
    ``KeyError``; with ``handle_unknown='ignore'`` the dense result must
    equal a fixed array.
    """
    seen = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
    unseen = DataFrame({'chars': ['c', 'b'], 'int': [0, 2]})
    seen_ddf = dask_cudf.from_cudf(seen, npartitions=2)
    unseen_ddf = dask_cudf.from_cudf(unseen, npartitions=2)

    strict = OneHotEncoder(handle_unknown='error', sparse=False).fit(seen_ddf)
    with pytest.raises(KeyError):
        # The call chain, including compute(), sits inside the raises block.
        strict.transform(unseen_ddf).compute()

    lenient = OneHotEncoder(handle_unknown='ignore', sparse=False)
    lenient = lenient.fit(seen_ddf)
    encoded = lenient.transform(unseen_ddf)
    expected = cp.array([[0., 0., 1., 0.], [0., 1., 0., 1.]])
    cp.testing.assert_array_equal(encoded.compute(), expected)