def test_masked_encode(client):
    n_workers = len(client.has_what())
    df = cudf.DataFrame({
        "filter_col": [1, 1, 2, 3, 1, 1, 1, 1, 6, 5],
        "cat_col": ['a', 'b', 'c', 'd', 'a', 'a', 'a', 'c', 'b', 'c'],
    })
    ddf = dask_cudf.from_cudf(df, npartitions=n_workers)

    ddf_filter = ddf[ddf["filter_col"] == 1]

    filter_encoded = LabelEncoder().fit_transform(ddf_filter["cat_col"])
    ddf_filter = ddf_filter.assign(filter_encoded=filter_encoded.values)

    encoded_filter = LabelEncoder().fit_transform(ddf["cat_col"])
    ddf = ddf.assign(encoded_filter=encoded_filter.values)
    ddf = ddf[ddf.filter_col == 1]

    assert (ddf.encoded_filter == ddf_filter.filter_encoded).compute().all()
def _prep_training_data(c, X_train, partitions_per_worker, reverse_order=False):
    workers = c.has_what().keys()
    if reverse_order:
        workers = list(workers)[::-1]

    n_partitions = partitions_per_worker * len(workers)
    X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
    X_train_df = dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)

    X_train_df, = dask_utils.persist_across_workers(c, [X_train_df],
                                                    workers=list(workers))
    return X_train_df
def test_mixing_series_frame_error():
    nelem = 20

    df = gd.DataFrame()
    df["x"] = np.arange(nelem)
    df["y"] = np.random.randint(nelem, size=nelem)

    ddf = dgd.from_cudf(df, npartitions=5)

    delay_frame = ddf.to_delayed()
    delay_series = ddf.x.to_delayed()
    combined = dgd.from_delayed(delay_frame + delay_series)

    with pytest.raises(ValueError) as raises:
        combined.compute()

    raises.match(r"^Metadata mismatch found in `from_delayed`.")
    raises.match(r"Expected partition of type `DataFrame` but got `Series`")
def test_groupby_std(func):
    pdf = pd.DataFrame({
        "x": np.random.randint(0, 5, size=10000),
        "y": np.random.normal(size=10000),
    })
    gdf = cudf.DataFrame.from_pandas(pdf)
    ddf = dask_cudf.from_cudf(gdf, npartitions=5)

    a = func(gdf.to_pandas())
    b = func(ddf).compute().to_pandas()

    a.index.name = None
    a.name = None
    b.index.name = None

    dd.assert_eq(a, b)
def test_dataframe_assign_col():
    df = cudf.DataFrame(list(range(100)))
    pdf = pd.DataFrame(list(range(100)))

    ddf = dgd.from_cudf(df, npartitions=4)
    ddf["fold"] = 0
    ddf["fold"] = ddf["fold"].map_partitions(
        lambda cudf_df: cp.random.randint(0, 4, len(cudf_df))
    )

    pddf = dd.from_pandas(pdf, npartitions=4)
    pddf["fold"] = 0
    pddf["fold"] = pddf["fold"].map_partitions(
        lambda p_df: np.random.randint(0, 4, len(p_df))
    )

    dd.assert_eq(ddf[0], pddf[0])
    dd.assert_eq(len(ddf["fold"]), len(pddf["fold"]))
def test_query():
    np.random.seed(0)

    df = pd.DataFrame({
        "x": np.random.randint(0, 5, size=10),
        "y": np.random.normal(size=10),
    })
    gdf = gd.DataFrame.from_pandas(df)
    expr = "x > 2"

    assert_frame_equal(gdf.query(expr).to_pandas(), df.query(expr))

    queried = dgd.from_cudf(gdf, npartitions=2).query(expr)

    got = queried.compute().to_pandas()
    expect = gdf.query(expr).to_pandas()

    assert_frame_equal(got, expect)
def test_set_index_w_series():
    with dask.config.set(scheduler="single-threaded"):
        nelem = 20
        np.random.seed(0)

        df = pd.DataFrame({
            "x": 100 + np.random.randint(0, nelem // 2, size=nelem),
            "y": np.random.normal(size=nelem),
        })
        expect = df.set_index(df.x).sort_index()

        dgf = dgd.from_cudf(gd.DataFrame.from_pandas(df), npartitions=4)
        res = dgf.set_index(dgf.x)  # sort by default
        got = res.compute().to_pandas()

        assert set(expect.columns) == set(got.columns)
        assert_frame_equal_by_index_group(expect, got)
def test_target_encode_multi(tmpdir, npartitions):
    cat_1 = np.asarray(["baaaa"] * 12)
    cat_2 = np.asarray(["baaaa"] * 6 + ["bbaaa"] * 3 + ["bcaaa"] * 3)
    num_1 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4])
    num_2 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4]) * 2
    df = cudf.DataFrame({
        "cat": cat_1,
        "cat2": cat_2,
        "num": num_1,
        "num_2": num_2,
    })
    df = dask_cudf.from_cudf(df, npartitions=npartitions)

    cat_groups = ["cat", "cat2", ["cat", "cat2"]]
    te_features = cat_groups >> ops.TargetEncoding(
        ["num", "num_2"],
        out_path=str(tmpdir),
        kfold=1,
        p_smooth=5,
        out_dtype="float32",
    )

    workflow = nvt.Workflow(te_features)
    df_out = workflow.fit_transform(
        nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    assert "TE_cat_cat2_num" in df_out.columns
    assert "TE_cat_num" in df_out.columns
    assert "TE_cat2_num" in df_out.columns
    assert "TE_cat_cat2_num_2" in df_out.columns
    assert "TE_cat_num_2" in df_out.columns
    assert "TE_cat2_num_2" in df_out.columns

    assert_eq(df_out["TE_cat2_num"].values, df_out["TE_cat_cat2_num"].values)
    assert_eq(df_out["TE_cat2_num_2"].values, df_out["TE_cat_cat2_num_2"].values)
    assert df_out["TE_cat_num"].iloc[0] != df_out["TE_cat2_num"].iloc[0]
    assert df_out["TE_cat_num_2"].iloc[0] != df_out["TE_cat2_num_2"].iloc[0]
    assert math.isclose(df_out["TE_cat_num"].iloc[0], num_1.mean(), abs_tol=1e-4)
    assert math.isclose(df_out["TE_cat_num_2"].iloc[0], num_2.mean(), abs_tol=1e-3)
def create_tables(bc, dir_data_lc, fileSchemaType, **kwargs):
    ext = get_extension(fileSchemaType)
    tables = kwargs.get('tables', tpchTables)
    bool_orders_index = kwargs.get('bool_orders_index', -1)
    dir_data_lc = dir_data_lc + "tpch/"

    for i, table in enumerate(tables):
        # using wildcard, note the _ after the table name (it will avoid collisions)
        table_files = ("%s/%s_[0-9]*.%s") % (dir_data_lc, table, ext)
        t = None
        if fileSchemaType == DataType.CSV:
            bool_orders_flag = False
            if i == bool_orders_index:
                bool_orders_flag = True
            dtypes = get_dtypes(table, bool_orders_flag)
            col_names = get_column_names(table, bool_orders_flag)
            t = bc.create_table(table,
                                table_files,
                                delimiter='|',
                                dtype=dtypes,
                                names=col_names)
        elif fileSchemaType == DataType.CUDF:
            bool_column = bool_orders_index != -1
            gdf = read_data(table, dir_data_lc, bool_column)
            t = bc.create_table(table, gdf)
        elif fileSchemaType == DataType.DASK_CUDF:
            bool_column = bool_orders_index != -1
            gdf = read_data(table, dir_data_lc, bool_column)
            nRals = Settings.data['RunSettings']['nRals']
            num_partitions = nRals
            ds = dask_cudf.from_cudf(gdf, npartitions=num_partitions)
            t = bc.create_table(table, ds)
        # elif fileSchemaType == DataType.DASK_CUDF:
        #     bool_column = bool_orders_index != -1
        #     table_files = ("%s/%s_[0-9]*.%s") % (dir_data_lc, table, 'parquet')
        #     dask_df = dask_cudf.read_parquet(table_files)
        #     dask_df = bc.unify_partitions(dask_df)
        #     t = bc.create_table(table, dask_df)
        else:
            t = bc.create_table(table, table_files)
def test_target_encode_multi(tmpdir, npartitions):
    cat_1 = np.asarray(["baaaa"] * 12)
    cat_2 = np.asarray(["baaaa"] * 6 + ["bbaaa"] * 3 + ["bcaaa"] * 3)
    num_1 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4])
    num_2 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4]) * 2
    df = cudf.DataFrame({"cat": cat_1, "cat2": cat_2, "num": num_1, "num_2": num_2})
    df = dask_cudf.from_cudf(df, npartitions=npartitions)

    cat_names = ["cat", "cat2"]
    cont_names = ["num", "num_2"]
    label_name = []
    processor = nvt.Workflow(
        cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )

    cat_groups = ["cat", "cat2", ["cat", "cat2"]]
    processor.add_preprocess(
        ops.TargetEncoding(
            cat_groups,
            ["num", "num_2"],  # cont_target
            out_path=str(tmpdir),
            kfold=1,
            p_smooth=5,
            out_dtype="float32",
        )
    )
    processor.finalize()
    processor.apply(nvt.Dataset(df), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    assert "TE_cat_cat2_num" in df_out.columns
    assert "TE_cat_num" in df_out.columns
    assert "TE_cat2_num" in df_out.columns
    assert "TE_cat_cat2_num_2" in df_out.columns
    assert "TE_cat_num_2" in df_out.columns
    assert "TE_cat2_num_2" in df_out.columns

    assert_eq(df_out["TE_cat2_num"].values, df_out["TE_cat_cat2_num"].values)
    assert_eq(df_out["TE_cat2_num_2"].values, df_out["TE_cat_cat2_num_2"].values)
    assert df_out["TE_cat_num"].iloc[0] != df_out["TE_cat2_num"].iloc[0]
    assert df_out["TE_cat_num_2"].iloc[0] != df_out["TE_cat2_num_2"].iloc[0]
    assert math.isclose(df_out["TE_cat_num"].iloc[0], num_1.mean(), abs_tol=1e-4)
    assert math.isclose(df_out["TE_cat_num_2"].iloc[0], num_2.mean(), abs_tol=1e-3)
def __init__(self, input, fileType, files=None, datasource=[],
             calcite_to_file_indices=None, num_row_groups=None, args={},
             convert_gdf_to_dask=False, convert_gdf_to_dask_partitions=1,
             client=None, uri_values=[], in_file=[], force_conversion=False):
    self.fileType = fileType
    if fileType == DataType.ARROW:
        if force_conversion:
            # converts to cudf for querying
            self.input = cudf.DataFrame.from_arrow(input)
            self.fileType = DataType.CUDF
        else:
            self.input = cudf.DataFrame.from_arrow(input.schema.empty_table())
            self.arrow_table = input
    else:
        self.input = input

    self.calcite_to_file_indices = calcite_to_file_indices
    self.files = files
    self.datasource = datasource
    self.num_row_groups = num_row_groups
    self.args = args
    # Note: the original check was `fileType == DataType.CUDF or DataType.DASK_CUDF`,
    # which is always truthy; compare against both types explicitly instead.
    if fileType in (DataType.CUDF, DataType.DASK_CUDF):
        if convert_gdf_to_dask and isinstance(self.input, cudf.DataFrame):
            self.input = dask_cudf.from_cudf(
                self.input, npartitions=convert_gdf_to_dask_partitions)
        if isinstance(self.input, dask_cudf.core.DataFrame):
            self.dask_mapping = getNodePartitions(self.input, client)
    self.uri_values = uri_values
    self.in_file = in_file
def test_groupby_multiindex_reset_index(npartitions):
    df = cudf.DataFrame({
        "a": [1, 1, 2, 3, 4],
        "b": [5, 2, 1, 2, 5],
        "c": [1, 2, 2, 3, 5],
    })
    ddf = dask_cudf.from_cudf(df, npartitions=npartitions)
    pddf = dd.from_pandas(df.to_pandas(), npartitions=npartitions)

    gr = ddf.groupby(["a", "c"]).agg({"b": ["count"]}).reset_index()
    pr = pddf.groupby(["a", "c"]).agg({"b": ["count"]}).reset_index()

    # cuDF uses "int32" for count. Pandas uses "int64"
    gr_out = gr.compute().sort_values(by=["a", "c"]).reset_index(drop=True)
    gr_out[("b", "count")] = gr_out[("b", "count")].astype("int64")

    dd.assert_eq(
        gr_out,
        pr.compute().sort_values(by=["a", "c"]).reset_index(drop=True),
    )
def test_multicolumn_groupby():
    import cudf
    import dask_cudf

    tmp_df = cudf.DataFrame()
    tmp_df['id'] = [0, 0, 1, 2, 2, 2]
    tmp_df['val1'] = [0, 1, 0, 0, 1, 2]
    tmp_df['val2'] = [9, 9, 9, 9, 9, 9]
    ddf = dask_cudf.from_cudf(tmp_df, npartitions=2)

    actual = ddf.groupby(['id', 'val1']).count().compute()

    # FIXME: this is not idiomatic cudf!
    expectedVals = [1, 1, 1, 1, 1, 1]
    expected = cudf.DataFrame()
    expected['val'] = expectedVals

    assert False not in (
        expected.to_pandas().values == actual.to_pandas().values)
def get_clusters(client, ml_input_df):
    import dask_cudf

    ml_tasks = [
        delayed(train_clustering_model)(df, N_CLUSTERS, CLUSTER_ITERATIONS, N_ITER)
        for df in ml_input_df.to_delayed()
    ]
    results_dict = client.compute(*ml_tasks, sync=True)

    output = ml_input_df.index.to_frame().reset_index(drop=True)

    labels_final = dask_cudf.from_cudf(results_dict["cid_labels"],
                                       npartitions=output.npartitions)
    output["label"] = labels_final.reset_index()[0]

    # Based on CDH6.1 q25-result formatting
    results_dict["cid_labels"] = output
    return results_dict
def test_get_dummies_large():
    gdf = cudf.datasets.randomdata(
        nrows=200000,
        dtypes={
            "C": int,
            "first": "category",
            "b": float,
            "second": "category",
        },
    )
    df = gdf.to_pandas()
    ddf = dd.from_pandas(df, npartitions=25)
    dd.assert_eq(dd.get_dummies(ddf).compute(), pd.get_dummies(df))

    gddf = dask_cudf.from_cudf(gdf, npartitions=25)
    dd.assert_eq(
        dd.get_dummies(ddf).compute(),
        dd.get_dummies(gddf).compute(),
        check_dtype=False,
    )
def test_onehot_random_inputs(client, drop, as_array, sparse, n_samples):
    X, ary = generate_inputs_from_categories(n_samples=n_samples,
                                             as_array=as_array)
    if as_array:
        dX = da.from_array(X)
    else:
        dX = dask_cudf.from_cudf(X, npartitions=1)

    enc = OneHotEncoder(sparse=sparse, drop=drop, categories='auto')
    sk_enc = SkOneHotEncoder(sparse=sparse, drop=drop, categories='auto')

    ohe = enc.fit_transform(dX)
    ref = sk_enc.fit_transform(ary)
    if sparse:
        cp.testing.assert_array_equal(ohe.compute().toarray(), ref.toarray())
    else:
        cp.testing.assert_array_equal(ohe.compute(), ref)

    inv_ohe = enc.inverse_transform(ohe)
    assert_inverse_equal(inv_ohe.compute(), dX.compute())
def test_groupby_reset_index_names():
    df = cudf.datasets.randomdata(
        nrows=10, dtypes={"a": str, "b": int, "c": int}
    )
    pdf = df.to_pandas()

    gddf = dask_cudf.from_cudf(df, 2)
    pddf = dd.from_pandas(pdf, 2)

    g_res = gddf.groupby("a", sort=True).sum()
    p_res = pddf.groupby("a", sort=True).sum()

    got = g_res.reset_index().compute().sort_values(["a", "b", "c"])
    expect = p_res.reset_index().compute().sort_values(["a", "b", "c"])

    dd.assert_eq(got, expect)
def test_target_encode(tmpdir, cat_groups, kfold, fold_seed, cpu):
    df = cudf.DataFrame({
        "Author": list(string.ascii_uppercase),
        "Engaging-User": list(string.ascii_lowercase),
        "Cost": range(26),
        "Post": [0, 1] * 13,
    })
    if cpu:
        df = dd.from_pandas(df.to_pandas(), npartitions=3)
    else:
        df = dask_cudf.from_cudf(df, npartitions=3)

    cont_names = ["Cost"]
    te_features = cat_groups >> ops.TargetEncoding(
        cont_names,
        out_path=str(tmpdir),
        kfold=kfold,
        out_dtype="float32",
        fold_seed=fold_seed,
        drop_folds=False,  # Keep folds to validate
    )

    cont_features = (
        cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp()
    )
    workflow = nvt.Workflow(te_features + cont_features + ["Author", "Engaging-User"])
    df_out = workflow.fit_transform(
        nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    if kfold > 1:
        # Cat columns are unique.
        # Make sure __fold__ mapping is correct
        if cat_groups == "Author":
            name = "__fold___Author"
            cols = ["__fold__", "Author"]
        else:
            name = "__fold___Author_Engaging-User"
            cols = ["__fold__", "Author", "Engaging-User"]

        check = cudf.io.read_parquet(te_features.op.stats[name])
        check = check[cols].sort_values(cols).reset_index(drop=True)
        df_out_check = df_out[cols].sort_values(cols).reset_index(drop=True)
        assert_eq(check, df_out_check)
def test_groupby_split_out(split_out, column):
    df = pd.DataFrame({
        "a": np.arange(8),
        "b": [1, 0, 0, 2, 1, 1, 2, 0],
        "c": [0, 1] * 4,
        "d": ["dog", "cat", "cat", "dog", "dog", "dog", "cat", "bird"],
    })
    df["e"] = df["d"].astype("category")
    gdf = cudf.from_pandas(df)

    ddf = dd.from_pandas(df, npartitions=3)
    gddf = dask_cudf.from_cudf(gdf, npartitions=3)

    ddf_result = (
        ddf.groupby(column)
        .a.mean(split_out=split_out)
        .compute()
        .sort_values()
        .dropna()
    )
    gddf_result = (
        gddf.groupby(column)
        .a.mean(split_out=split_out)
        .compute()
        .sort_values()
    )

    dd.assert_eq(gddf_result, ddf_result, check_index=False)
def test_sort_values_binned():
    np.random.seed(43)
    nelem = 100
    nparts = 5
    by = 'a'

    df = gd.DataFrame()
    df['a'] = np.random.randint(1, 5, nelem)
    ddf = dgd.from_cudf(df, npartitions=nparts)

    parts = ddf.sort_values_binned(by=by).to_delayed()
    part_uniques = []
    for i, p in enumerate(parts):
        part = dask.compute(p)[0]
        part_uniques.append(set(part.a.unique()))

    # Partitions do not have intersecting keys
    for i in range(len(part_uniques)):
        for j in range(i + 1, len(part_uniques)):
            assert not (part_uniques[i] & part_uniques[j]), \
                "should have empty intersection"
def test_workflow_transform_ddf_dtypes():
    # Initial Dataset
    df = cudf.datasets.timeseries().reset_index()
    ddf = dask_cudf.from_cudf(df, npartitions=2)
    dataset = Dataset(ddf)

    # Create and execute Workflow
    cols = ["name", "x", "y", "timestamp"]
    cat_cols = ["id"] >> ops.Normalize()
    workflow = Workflow(cols + cat_cols)
    workflow.fit(dataset)
    transformed_ddf = workflow.transform(dataset).to_ddf()

    # No transforms on the pass-through cols, so they should keep their original dtypes
    for col in cols:
        assert_eq(ddf.dtypes[col], transformed_ddf.dtypes[col])

    # Follow-up dask-cudf sorting used to throw an exception because of dtype
    # issues; check that it works now
    transformed_ddf.sort_values(["id", "timestamp"]).compute()
def test_groupby_reset_index_string_name():
    df = cudf.DataFrame({"value": range(5), "key": ["a", "a", "b", "a", "c"]})
    pdf = df.to_pandas()

    gddf = dask_cudf.from_cudf(df, npartitions=1)
    pddf = dd.from_pandas(pdf, npartitions=1)

    g_res = gddf.groupby(["key"]).agg({"value": "mean"}).reset_index(drop=False)
    p_res = pddf.groupby(["key"]).agg({"value": "mean"}).reset_index(drop=False)

    got = g_res.compute().sort_values(["key", "value"]).reset_index(drop=True)
    expect = p_res.compute().sort_values(["key", "value"]).reset_index(drop=True)

    dd.assert_eq(got, expect)
    assert len(g_res) == len(p_res)
def test_to_sp_dask_array(input_type, nrows, ncols, cluster):
    c = Client(cluster)

    try:
        from cuml.dask.common import to_sp_dask_array

        a = cp.sparse.random(nrows, ncols, format='csr', dtype=cp.float32)

        if input_type == "dask_dataframe":
            df = cudf.DataFrame.from_gpu_matrix(a.todense())
            inp = dask_cudf.from_cudf(df, npartitions=2)
        elif input_type == "dask_array":
            inp = dask.array.from_array(a.todense().get())
        elif input_type == "dataframe":
            inp = cudf.DataFrame.from_gpu_matrix(a.todense())
        elif input_type == "scipysparse":
            inp = a.get()
        elif input_type == "cupysparse":
            inp = a
        elif input_type == "numpy":
            inp = a.get().todense()
        elif input_type == "cupy":
            inp = a.todense()

        arr = to_sp_dask_array(inp, c)
        arr.compute_chunk_sizes()

        assert arr.shape == (nrows, ncols)

        # We can't call compute directly on this array yet when it has
        # multiple partitions, so we manually concatenate the pieces.
        parts = c.sync(extract_arr_partitions, arr)
        local_parts = cp.vstack(
            [part[1].result().todense() for part in parts]
        ).get()

        assert array_equal(a.todense().get(), local_parts)

    finally:
        c.close()
def test_roundtrip_from_dask_partitioned(tmpdir, parts, daskcudf, metadata):
    tmpdir = str(tmpdir)

    df = pd.DataFrame()
    df["year"] = [2018, 2019, 2019, 2019, 2020, 2021]
    df["month"] = [1, 2, 3, 3, 3, 2]
    df["day"] = [1, 1, 1, 2, 2, 1]
    df["data"] = [0, 0, 0, 0, 0, 0]
    df.index.name = "index"

    if daskcudf:
        ddf2 = dask_cudf.from_cudf(cudf.from_pandas(df), npartitions=2)
        ddf2.to_parquet(tmpdir, write_metadata_file=metadata,
                        partition_on=parts)
    else:
        ddf2 = dd.from_pandas(df, npartitions=2)
        ddf2.to_parquet(
            tmpdir,
            engine="pyarrow",
            write_metadata_file=metadata,
            partition_on=parts,
        )

    df_read = dd.read_parquet(tmpdir, engine="pyarrow")
    gdf_read = dask_cudf.read_parquet(tmpdir)

    # TODO: Avoid column selection after `CudfEngine`
    # can be aligned with dask/dask#6534
    columns = list(df_read.columns)
    assert set(df_read.columns) == set(gdf_read.columns)

    dd.assert_eq(
        df_read.compute(scheduler=dask.get)[columns],
        gdf_read.compute(scheduler=dask.get)[columns],
    )

    assert gdf_read.index.name == "index"

    # Check that we don't have uuid4 file names
    for _, _, files in os.walk(tmpdir):
        for fn in files:
            if not fn.startswith("_"):
                assert "part" in fn
def test_mnmg():
    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)
    n_workers = len(client.scheduler_info()['workers'])

    # Create and populate a GPU DataFrame
    df_float = cudf.DataFrame()
    df_float['0'] = [1.0, 2.0, 5.0]
    df_float['1'] = [4.0, 2.0, 1.0]
    df_float['2'] = [4., 2, 1]

    ddf_float = dask_cudf.from_cudf(df_float, npartitions=2 * n_workers)

    X = ddf_float[ddf_float.columns.difference(['2'])]
    y = ddf_float['2']

    mod = LinearRegression()
    mod = mod.fit(X, y)

    actual_output = str(mod.predict(X).compute().values)
    expected_output = '[4. 2. 1.]'
    assert actual_output == expected_output
def test_append():
    np.random.seed(0)
    n = 1000

    df = pd.DataFrame({
        "x": np.random.randint(0, 5, size=n),
        "y": np.random.normal(size=n),
    })
    gdf = gd.DataFrame.from_pandas(df)
    frags = _fragmented_gdf(gdf, nsplit=13)

    # Combine with .append
    head = frags[0]
    tail = frags[1:]

    appended = dgd.from_cudf(head, npartitions=1)
    for each in tail:
        appended = appended.append(each)

    assert_frame_equal(df, appended.compute().to_pandas())
def test_series_append():
    np.random.seed(0)
    n = 1000

    df = pd.DataFrame({
        "x": np.random.randint(0, 5, size=n),
        "y": np.random.normal(size=n),
    })
    gdf = gd.DataFrame.from_pandas(df)
    frags = _fragmented_gdf(gdf, nsplit=13)
    frags = [df.x for df in frags]

    appending = dgd.from_cudf(frags[0], npartitions=1)
    for frag in frags[1:]:
        appending = appending.append(frag)

    appended = appending.compute().to_pandas()
    assert isinstance(appended, pd.Series)
    np.testing.assert_array_equal(appended, df.x)
def test_make_meta_backends(index):
    dtypes = ["int8", "int32", "int64", "float64"]
    df = cudf.DataFrame(
        {dt: np.arange(start=0, stop=3, dtype=dt) for dt in dtypes}
    )
    df["strings"] = ["cat", "dog", "fish"]
    df["cats"] = df["strings"].astype("category")
    df["time_s"] = np.array(
        ["2018-10-07", "2018-10-08", "2018-10-09"], dtype="datetime64[s]"
    )
    df["time_ms"] = df["time_s"].astype("datetime64[ms]")
    df["time_ns"] = df["time_s"].astype("datetime64[ns]")
    df = df.set_index(index)
    ddf = dgd.from_cudf(df, npartitions=1)

    # Check "empty" metadata types
    dd.assert_eq(ddf._meta.dtypes, df.dtypes)

    # Check "non-empty" metadata types
    dd.assert_eq(ddf._meta.dtypes, ddf._meta_nonempty.dtypes)
def test_groupby_apply():
    np.random.seed(0)
    nelem = 100

    xs = _gen_uniform_keys(nelem)
    ys = _gen_uniform_keys(nelem)
    df = pd.DataFrame({
        'x': xs,
        'y': ys,
        'idx': np.arange(nelem),
        'v1': np.random.normal(size=nelem),
        'v2': np.random.normal(size=nelem),
    })
    gdf = gd.DataFrame.from_pandas(df)
    dgf = dgd.from_cudf(gdf, npartitions=2)

    def transform(df):
        df['out1'] = df.y * (df.v1 + df.v2)
        return df

    grouped = dgf.groupby(by=['x', 'y']).apply(transform)

    # Compute with dask
    dgd_grouped = grouped.compute().to_pandas()
    binning = {}
    for _, row in dgd_grouped.iterrows():
        binning[row.idx] = row

    # Emulate the operation with pandas
    pd_groupby = df.groupby(by=['x', 'y'], sort=True, as_index=True).apply(transform)

    # Check the result
    for _, expect in pd_groupby.iterrows():
        got = binning[expect.idx]
        attrs = ['x', 'y', 'v1', 'v2', 'out1']
        for a in attrs:
            np.testing.assert_equal(getattr(got, a), getattr(expect, a))
def test_groupby_basic_aggs(agg):
    pdf = pd.DataFrame({
        "x": np.random.randint(0, 5, size=10000),
        "y": np.random.normal(size=10000),
    })
    gdf = cudf.DataFrame.from_pandas(pdf)
    ddf = dask_cudf.from_cudf(gdf, npartitions=5)

    a = getattr(gdf.groupby("x"), agg)().to_pandas()
    b = getattr(ddf.groupby("x"), agg)().compute().to_pandas()

    a.index.name = None
    a.name = None
    b.index.name = None
    b.name = None

    if agg == "count":
        a["y"] = a["y"].astype(np.int64)

    dd.assert_eq(a, b)