def test_factorize_series_index():
    """Compare ``col1.factorize()`` with pandas, before and after ``set_index``.

    The same pair of checks (codes, then uniques) is run first on the frame
    with its default index and then on the frame indexed by ``col2``.
    """
    frame = DataFrame()
    frame["col1"] = ["C", "H", "C", "W", "W", "W", "W", "W", "C", "W"]
    frame["col2"] = [
        2992443.0,
        2992447.0,
        2992466.0,
        2992440.0,
        2992441.0,
        2992442.0,
        2992444.0,
        2992445.0,
        2992446.0,
        2992448.0,
    ]

    def _check_against_pandas(gdf):
        # Element 0 of the factorize result holds the codes.
        got_codes = gdf.col1.factorize()[0].get()
        want_codes = gdf.to_pandas().col1.factorize()[0]
        assert_eq(got_codes, want_codes)

        # Element 1 holds the unique values.
        got_uniques = gdf.col1.factorize()[1].to_pandas().values
        want_uniques = gdf.to_pandas().col1.factorize()[1].values
        assert_eq(got_uniques, want_uniques)

    _check_against_pandas(frame)

    frame = frame.set_index("col2")
    _check_against_pandas(frame)
def test_onehot_inverse_transform(client, drop):
    """Encode then decode a two-column frame and expect the original back.

    ``client`` and ``drop`` are supplied from outside this function
    (presumably pytest fixtures/parametrization -- confirm in the test module).
    ``drop`` is forwarded to ``OneHotEncoder``.
    """
    source = DataFrame({'g': ['M', 'F', 'F'], 'i': [1, 3, 2]})
    distributed = dask_cudf.from_cudf(source, npartitions=2)

    encoder = OneHotEncoder(drop=drop)
    encoded = encoder.fit_transform(distributed)
    decoded = encoder.inverse_transform(encoded)

    assert_frame_equal(decoded.compute().to_pandas(), source.to_pandas())
def test_dataframe_nlargest_nsmallest(nelem, n, op, columns):
    """Call the method named by ``op`` on a cudf frame and its pandas copy.

    Both frames receive ``(n, columns)`` as positional arguments and the two
    results are compared with ``assert_eq``.
    """
    np.random.seed(0)
    # Dict literals evaluate left to right, so "a" is drawn before "b".
    gdf = DataFrame(
        {"a": np.random.random(nelem), "b": np.random.random(nelem)}
    )
    pdf = gdf.to_pandas()

    got = getattr(gdf, op)(n, columns)
    expected = getattr(pdf, op)(n, columns)
    assert_eq(got, expected)
def test_onehot_inverse_transform_handle_unknown(client):
    """Invert a hand-built one-hot array with ``handle_unknown='ignore'``.

    The first row of the one-hot input has no active 'chars' category; the
    reference frame expects ``None`` in that position.
    """
    train = dask_cudf.from_cudf(
        DataFrame({'chars': ['a', 'b'], 'int': [0, 2]}), npartitions=2
    )
    onehot = da.from_array(cp.array([[0., 0., 1., 0.], [0., 1., 0., 1.]]))

    encoder = OneHotEncoder(handle_unknown='ignore').fit(train)
    decoded = encoder.inverse_transform(onehot)

    expected = DataFrame({'chars': [None, 'b'], 'int': [0, 2]})
    assert_frame_equal(decoded.compute().to_pandas(), expected.to_pandas())
def test_onehot_drop_idx_first(client):
    """Compare ``drop='first'`` encoding against ``SkOneHotEncoder``.

    Also checks that ``inverse_transform`` returns the original frame.
    """
    rows = [['c', 2, 'a'], ['b', 2, 'b']]
    frame = DataFrame(
        {'chars': ['c', 'b'], 'int': [2, 2], 'letters': ['a', 'b']}
    )
    distributed = dask_cudf.from_cudf(frame, npartitions=2)

    encoder = OneHotEncoder(sparse=False, drop='first')
    reference_encoder = SkOneHotEncoder(sparse=False, drop='first')

    encoded = encoder.fit_transform(distributed)
    expected = reference_encoder.fit_transform(rows)
    cp.testing.assert_array_equal(encoded.compute(), expected)

    decoded = encoder.inverse_transform(encoded)
    assert_frame_equal(decoded.compute().to_pandas(), frame.to_pandas())
def test_dataframe_sort_values_ignore_index(index, ignore_index):
    """Sort by every remaining column and compare cudf with pandas.

    The frame is first re-indexed with ``index``; ``ignore_index`` is passed
    through to ``sort_values`` on both sides.
    """
    columns = {
        "a": [1, 3, 5, 2, 4],
        "b": [1, 1, 2, 2, 3],
        "c": [9, 7, 7, 7, 1],
    }
    gdf = DataFrame(columns).set_index(index)
    pdf = gdf.to_pandas()

    # pandas gets a plain list of column names, cudf gets the columns object.
    expect = pdf.sort_values(list(pdf.columns), ignore_index=ignore_index)
    got = gdf.sort_values(gdf.columns, ignore_index=ignore_index)

    assert_eq(expect, got)
def test_categorical_basic(data):
    """Exercise categorical Series/DataFrame support through ``dgd``.

    First half: wrap a copy of ``data`` in pandas and cudf Series, push the
    cudf one through ``dgd.from_cudf`` and compare codes, dtype, ``cat``
    attributes and the leading tokens of the string form.
    Second half: cast a string column to "category" in both ``dgd`` and ``dd``
    frames and compare metadata and categories before/after ``categorize()``.
    """
    categorical = data.copy()
    pd_series = pd.Series(categorical)
    gpu_series = Series(categorical)
    dask_series = dgd.from_cudf(gpu_series, npartitions=2)
    computed = dask_series.compute()

    np.testing.assert_array_equal(categorical.codes, computed.to_array())
    assert dask_series.dtype.to_pandas() == pd_series.dtype

    # Categorical accessor attributes must agree with pandas.
    assert pd_series.cat.ordered == dask_series.cat.ordered
    assert tuple(pd_series.cat.categories) == tuple(dask_series.cat.categories)

    np.testing.assert_array_equal(
        pd_series.cat.codes.data, computed.to_array()
    )
    np.testing.assert_array_equal(
        pd_series.cat.codes.dtype, dask_series.cat.codes.dtype
    )

    # Only whitespace-separated tokens are compared, and zip() stops at the
    # shorter sequence, so any extra trailing text on either side is ignored.
    string = str(computed)
    expect_str = """ 0 a 1 a 2 b 3 c 4 a """
    token_pairs = zip(string.split(), expect_str.split())
    assert all(got == want for got, want in token_pairs)

    from cudf.tests.utils import assert_eq

    gdf = DataFrame()
    gdf["a"] = ["xyz", "abc", "def"] * 10
    pdf = gdf.to_pandas()

    dask_gdf = dgd.from_cudf(gdf, 1)
    dask_gdf["b"] = dask_gdf["a"].astype("category")

    dask_pdf = dd.from_pandas(pdf, 1)
    dask_pdf["b"] = dask_pdf["a"].astype("category")

    assert_eq(dask_pdf._meta_nonempty["b"], dask_gdf._meta_nonempty["b"])

    # Before categorize(), reading the categories is expected to raise on
    # both backends.
    with pytest.raises(NotImplementedError):
        dask_gdf["b"].cat.categories

    with pytest.raises(NotImplementedError):
        dask_pdf["b"].cat.categories

    dask_gdf = dask_gdf.categorize()
    dask_pdf = dask_pdf.categorize()

    assert_eq(dask_pdf["b"].cat.categories, dask_gdf["b"].cat.categories)
    assert_eq(dask_pdf["b"].cat.ordered, dask_gdf["b"].cat.ordered)
def test_dataframe_masked_slicing(nelem, slice_start, slice_end):
    """Slice a frame whose columns carry random bitmasks; compare with pandas.

    The slice is applied once after converting to pandas and once before,
    and the two results are compared without checking dtypes.
    """
    frame = DataFrame()
    frame["a"] = list(range(nelem))
    frame["b"] = list(range(nelem, 2 * nelem))
    # Masks are drawn for "a" first, then "b".
    frame["a"] = frame["a"].set_mask(utils.random_bitmask(nelem))
    frame["b"] = frame["b"].set_mask(utils.random_bitmask(nelem))

    window = slice(slice_start, slice_end)

    expect = frame.to_pandas()[window]
    got = frame[window].to_pandas()

    assert_eq(expect, got, check_dtype=False)
def test_str_slice():
    """Compare ``str.split(",", expand=True, n=...)`` for ``n`` of 1 and 2.

    The pandas result is the reference; the ``dgd`` result is checked against
    it with ``dd.assert_eq``.
    """
    gdf = DataFrame({"a": ["abc,def,123", "xyz,hi,bye"]})
    dask_gdf = dgd.from_cudf(gdf, 1)
    pdf = gdf.to_pandas()

    for max_splits in (1, 2):
        dd.assert_eq(
            pdf.a.str.split(",", expand=True, n=max_splits),
            dask_gdf.a.str.split(",", expand=True, n=max_splits),
        )
def test_onehot_drop_one_of_each(cluster):
    """Check ``OneHotEncoder`` with one explicitly dropped category per column.

    The encoder under test receives the dropped categories as a
    column-name -> value mapping, while ``SkOneHotEncoder`` receives the same
    values as a positional list. The encoded output must match the reference,
    and ``inverse_transform`` must return the original frame.

    Parameters
    ----------
    cluster
        Passed straight to ``Client``; supplied from outside this function
        (presumably a pytest fixture -- confirm in the test module).
    """
    client = Client(cluster)
    try:
        X_ary = [['c', 2, 'a'], ['b', 2, 'b']]
        X = DataFrame(
            {'chars': ['c', 'b'], 'int': [2, 2], 'letters': ['a', 'b']}
        )
        ddf = dask_cudf.from_cudf(X, npartitions=2)

        drop = {'chars': 'b', 'int': 2, 'letters': 'b'}
        enc = OneHotEncoder(sparse=False, drop=drop)
        sk_enc = SkOneHotEncoder(sparse=False, drop=['b', 2, 'b'])

        ohe = enc.fit_transform(ddf)
        ref = sk_enc.fit_transform(X_ary)
        cp.testing.assert_array_equal(ohe.compute(), ref)

        inv = enc.inverse_transform(ohe)
        assert_frame_equal(inv.compute().to_pandas(), X.to_pandas())
    finally:
        # Close the client even when an assertion above fails, so a failing
        # test does not leave an open client behind for later tests.
        client.close()
def test_dataframe_take(ntake):
    """Take ``ntake`` random row positions from a 123-row frame.

    The cudf result must contain no nulls in either column and must equal
    the result of the same ``take`` on the pandas copy.
    """
    np.random.seed(0)

    nelem = 123
    frame = DataFrame()
    frame["ii"] = np.random.randint(0, 20, nelem)
    frame["ff"] = np.random.random(nelem)

    positions = np.random.randint(0, len(frame), ntake)

    actual = frame.take(positions)
    expected = frame.to_pandas().take(positions)

    assert actual.ii.null_count == 0
    assert actual.ff.null_count == 0
    assert_eq(actual, expected)
def test_categorical_categories():
    """Compare ``cat.categories`` between ``dgd`` and ``dd`` collections.

    Column "a" is cast to "category" before both collections are built with
    two partitions each; indexes are ignored in the comparison.
    """
    gdf = DataFrame(
        {"a": ["a", "b", "c", "d", "e", "e", "a", "d"], "b": range(8)}
    )
    gdf["a"] = gdf["a"].astype("category")
    pdf = gdf.to_pandas(nullable_pd_dtype=False)

    dask_gdf = dgd.from_cudf(gdf, 2)
    dask_pdf = dd.from_pandas(pdf, 2)

    got = dask_gdf.a.cat.categories.to_series().to_pandas(
        nullable_pd_dtype=False
    )
    expected = dask_pdf.a.cat.categories.to_series()

    dd.assert_eq(got, expected, check_index=False)
def test_dataframe_take_with_multiIndex(ntake):
    """Take ``ntake`` random row positions from a MultiIndex-ed frame.

    The frame has nine rows indexed by a 3x3 ``cudf.MultiIndex``; the result
    is compared with the same ``take`` on the pandas copy.
    """
    np.random.seed(0)

    row_index = cudf.MultiIndex(
        levels=[["lama", "cow", "falcon"], ["speed", "weight", "length"]],
        codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
    )
    frame = DataFrame(index=row_index)

    nelem = 9
    frame["ii"] = np.random.randint(0, 20, nelem)
    frame["ff"] = np.random.random(nelem)

    positions = np.random.randint(0, len(frame), ntake)

    actual = frame.take(positions)
    expected = frame.to_pandas().take(positions)

    assert_eq(actual, expected)
def test_to_pandas():
    """Check column names, dtypes and lengths after ``DataFrame.to_pandas()``.

    Columns "a" (int32) and "b" (float64) must keep their dtype; column "c"
    is built from booleans plus a ``None`` and is asserted to be ``bool`` on
    the cudf side but ``object`` on the pandas side.
    """
    gdf = DataFrame()
    gdf["a"] = np.arange(5, dtype=np.int32)
    gdf["b"] = np.arange(10, 15, dtype=np.float64)
    gdf["c"] = np.array([True, False, None, True, True])

    pdf = gdf.to_pandas()

    assert tuple(gdf.columns) == tuple(pdf.columns)

    for name in ("a", "b"):
        assert gdf[name].dtype == pdf[name].dtype

    # A boolean column containing None/NaN ends up with different dtypes in
    # cudf and pandas, so each side is checked against its own expectation.
    assert gdf["c"].dtype == np.bool_
    assert pdf["c"].dtype == np.object_

    for name in ("a", "b", "c"):
        assert len(gdf[name]) == len(pdf[name])