def test_get_dummies_large():
    """get_dummies on a large mixed-dtype frame agrees with pandas."""
    gdf = cudf.datasets.randomdata(
        nrows=200000,
        dtypes={
            "C": int,
            "first": "category",
            "b": float,
            "second": "category",
        },
    )
    pdf = gdf.to_pandas()
    ddf = dd.from_pandas(pdf, npartitions=25)
    # CPU dask result must match plain pandas.
    dd.assert_eq(dd.get_dummies(ddf).compute(), pd.get_dummies(pdf))
    # GPU dask result must match the CPU dask result (dtype may differ).
    gddf = dask_cudf.from_cudf(gdf, npartitions=25)
    dd.assert_eq(
        dd.get_dummies(ddf).compute(),
        dd.get_dummies(gddf).compute(),
        check_dtype=False,
    )
def test_from_cudf():
    """Round-trip a cudf DataFrame through dask ingestion and back."""
    np.random.seed(0)
    pdf = pd.DataFrame(
        {
            "x": np.random.randint(0, 5, size=10000),
            "y": np.random.normal(size=10000),
        }
    )
    gdf = cudf.DataFrame.from_pandas(pdf)

    # Ingest the GPU frame into dask and compare against the pandas source.
    ingested = dd.from_pandas(gdf, npartitions=2)
    dd.assert_eq(ingested, pdf)

    # Conversion back to a plain dask.dataframe must also round-trip.
    ddf = ingested.to_dask_dataframe()
    dd.assert_eq(ddf, pdf)
def test_groupby_multiindex_reset_index(npartitions):
    """Multi-key groupby agg + reset_index matches the dask/pandas result."""
    df = cudf.DataFrame(
        {
            "a": [1, 1, 2, 3, 4],
            "b": [5, 2, 1, 2, 5],
            "c": [1, 2, 2, 3, 5],
        }
    )
    ddf = dask_cudf.from_cudf(df, npartitions=npartitions)
    pddf = dd.from_pandas(df.to_pandas(), npartitions=npartitions)

    gr = ddf.groupby(["a", "c"]).agg({"b": ["count"]}).reset_index()
    pr = pddf.groupby(["a", "c"]).agg({"b": ["count"]}).reset_index()

    gr_out = gr.compute().sort_values(by=["a", "c"]).reset_index(drop=True)
    # cuDF produces "int32" counts where pandas uses "int64"; align them.
    gr_out[("b", "count")] = gr_out[("b", "count")].astype("int64")

    dd.assert_eq(
        gr_out,
        pr.compute().sort_values(by=["a", "c"]).reset_index(drop=True),
    )
def test_indexed_join(how):
    """Index-aligned merge on dask_cudf matches the in-memory cudf merge."""
    p_left = pd.DataFrame({"x": np.arange(10)}, index=np.arange(10) * 2)
    p_right = pd.DataFrame({"y": 1}, index=np.arange(15))
    g_left = cudf.from_pandas(p_left)
    g_right = cudf.from_pandas(p_right)
    dg_left = dd.from_pandas(g_left, npartitions=4)
    dg_right = dd.from_pandas(g_right, npartitions=5)

    d = g_left.merge(g_right, left_index=True, right_index=True, how=how)
    dg = dg_left.merge(dg_right, left_index=True, right_index=True, how=how)

    # Occasionally row order differs (possibly due to hashing in the merge),
    # so sort both sides before comparing.
    d = d.sort_values("x")  # index is preserved
    dg = dg.sort_values("x")  # index is reset -- sort_values will slow test down

    dd.assert_eq(d, dg, check_index=False)
def test_dataset_shuffle_on_keys(tmpdir, cpu, partition_on, keys, npartitions):
    """shuffle_by_keys groups key values per partition without losing rows."""
    # Build a small random dataset.
    size = 60
    df1 = pd.DataFrame(
        {
            "name": np.random.choice(["Dave", "Zelda"], size=size),
            "id": np.random.choice([0, 1], size=size),
            "x": np.random.uniform(low=0.0, high=10.0, size=size),
            "y": np.random.uniform(low=0.0, high=10.0, size=size),
        }
    )
    ddf1 = dd.from_pandas(df1, npartitions=3)

    # Persist the dataset to disk.
    path = str(tmpdir)
    ddf1.to_parquet(str(tmpdir), partition_on=partition_on)

    # Construct an NVT Dataset and shuffle it by `keys`.
    ds = nvt.Dataset(path, engine="parquet")
    ds2 = ds.shuffle_by_keys(keys, npartitions=npartitions)

    # Inspect the result.
    ddf2 = ds2.to_ddf()
    if npartitions:
        assert ddf2.npartitions == npartitions

    # A successful shuffle yields the same unique-value count for the
    # full dask algorithm and for a partition-wise sum.
    n1 = sum(len(p[keys].drop_duplicates()) for p in ddf2.partitions)
    n2 = len(ddf2[keys].drop_duplicates())
    assert n1 == n2

    # No row should have changed — compare sorted copies column by column.
    df1 = df1.sort_values(["id", "x", "y"]).reset_index(drop=True)
    df2 = ddf2.compute().sort_values(["id", "x", "y"]).reset_index(drop=True)
    if partition_on:
        # Dask converts partitioned columns to Categorical on read.
        df2["name"] = df2["name"].astype("object")
        df2["id"] = df2["id"].astype("int64")
    for col in df1:
        # Column order can change after the round-trip partitioning.
        assert_eq(df1[col], df2[col], check_index=False)
def test_on(how, on):
    """Merge on explicit key column(s) matches the in-memory cudf merge."""
    left = cudf.DataFrame(
        {"id_1": [1, 2, 3, 4, 5], "id_2": [1.0, 2.0, 3.0, 4.0, 0.0]}
    )
    right = cudf.DataFrame(
        {"id_1": [2, 3, None, 2], "id_2": [2.0, 3.0, 4.0, 20]}
    )
    dleft = dd.from_pandas(left, npartitions=2)
    dright = dd.from_pandas(right, npartitions=3)

    expected = left.merge(right, how=how, on=on)
    result = dleft.merge(dright, how=how, on=on)

    # Distributed merges do not guarantee row order; sort before comparing.
    dd.assert_eq(
        result.compute().to_pandas().sort_values(on),
        expected.to_pandas().sort_values(on),
        check_index=False,
    )
def test_groupby_agg(func):
    """A parametrized groupby aggregation matches between cudf and dask_cudf."""
    pdf = pd.DataFrame(
        {
            "x": np.random.randint(0, 5, size=10000),
            "y": np.random.normal(size=10000),
        }
    )
    gdf = cudf.DataFrame.from_pandas(pdf)
    ddf = dask_cudf.from_cudf(gdf, npartitions=5)

    a = func(gdf).to_pandas()
    b = func(ddf).compute().to_pandas()

    # Normalize index/series names before the comparison.
    a.index.name = None
    a.name = None
    b.index.name = None
    b.name = None

    dd.assert_eq(a, b)
def test_groupby_reset_index_names():
    """reset_index after a sorted groupby-sum matches dask on pandas."""
    df = cudf.datasets.randomdata(
        nrows=10, dtypes={"a": str, "b": int, "c": int}
    )
    pdf = df.to_pandas()

    gddf = dask_cudf.from_cudf(df, 2)
    pddf = dd.from_pandas(pdf, 2)

    g_res = gddf.groupby("a", sort=True).sum()
    p_res = pddf.groupby("a", sort=True).sum()

    got = g_res.reset_index().compute().sort_values(["a", "b", "c"])
    expect = p_res.reset_index().compute().sort_values(["a", "b", "c"])

    dd.assert_eq(got, expect)
def test_groupby_split_out(split_out, column):
    """Groupby mean with split_out agrees between dask and dask_cudf."""
    df = pd.DataFrame(
        {
            "a": np.arange(8),
            "b": [1, 0, 0, 2, 1, 1, 2, 0],
            "c": [0, 1] * 4,
            "d": ["dog", "cat", "cat", "dog", "dog", "dog", "cat", "bird"],
        }
    )
    df["e"] = df["d"].astype("category")
    gdf = cudf.from_pandas(df)
    ddf = dd.from_pandas(df, npartitions=3)
    gddf = dask_cudf.from_cudf(gdf, npartitions=3)

    ddf_result = (
        ddf.groupby(column)
        .a.mean(split_out=split_out)
        .compute()
        .sort_values()
        .dropna()
    )
    gddf_result = (
        gddf.groupby(column)
        .a.mean(split_out=split_out)
        .compute()
        .sort_values()
    )

    dd.assert_eq(gddf_result, ddf_result, check_index=False)
def test_groupby_reset_index_string_name():
    """reset_index(drop=False) after a dict-agg groupby matches dask."""
    df = cudf.DataFrame({"value": range(5), "key": ["a", "a", "b", "a", "c"]})
    pdf = df.to_pandas()

    gddf = dask_cudf.from_cudf(df, npartitions=1)
    pddf = dd.from_pandas(pdf, npartitions=1)

    g_res = gddf.groupby(["key"]).agg({"value": "mean"}).reset_index(drop=False)
    p_res = pddf.groupby(["key"]).agg({"value": "mean"}).reset_index(drop=False)

    got = g_res.compute().sort_values(["key", "value"]).reset_index(drop=True)
    expect = (
        p_res.compute().sort_values(["key", "value"]).reset_index(drop=True)
    )

    dd.assert_eq(got, expect)
    assert len(g_res) == len(p_res)
def test_conditional_join_with_limit(c):
    """A filtered cross join with LIMIT returns the expected first rows."""
    df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
    ddf = dd.from_pandas(df, 5)
    c.create_table("many_partitions", ddf)

    # Emulate the cross join in pandas via a constant join key.
    df = df.assign(common=1)
    expected_df = df.merge(df, on="common", suffixes=("", "0")).drop(
        columns="common"
    )
    expected_df = expected_df[expected_df["a"] >= 2][:4]

    actual_df = c.sql(
        """
    SELECT * FROM many_partitions as df1, many_partitions as df2
    WHERE df1."a" >= 2
    LIMIT 4
    """
    )

    dd.assert_eq(actual_df, expected_df, check_index=False)
def test_groupby_nested_dict(func):
    """A parametrized nested-dict groupby agg matches between CPU and GPU."""
    pdf = pd.DataFrame(
        {
            "x": np.random.randint(0, 5, size=10000),
            "y": np.random.normal(size=10000),
        }
    )
    ddf = dd.from_pandas(pdf, npartitions=5)
    # Build the GPU-backed collection by converting each partition.
    c_ddf = ddf.map_partitions(cudf.from_pandas)

    a = func(ddf).compute()
    b = func(c_ddf).compute().to_pandas()

    # Normalize index/series names before the comparison.
    a.index.name = None
    a.name = None
    b.index.name = None
    b.name = None

    dd.assert_eq(a, b)
def test_single_dataframe_merge(daskify):
    """Merging with a single-partition (or raw cudf) right side stays cheap."""
    right = cudf.DataFrame({"x": [1, 2, 1, 2], "y": [1, 2, 3, 4]})
    left = cudf.DataFrame({"x": np.arange(100) % 10, "z": np.arange(100)})

    dleft = dd.from_pandas(left, npartitions=10)
    dright = dd.from_pandas(right, npartitions=1) if daskify else right

    expected = left.merge(right, how="inner")
    result = dd.merge(dleft, dright, how="inner")
    # A broadcast-style merge should keep the task graph small.
    assert len(result.dask) < 25

    dd.assert_eq(
        result.compute().to_pandas().sort_values(["z", "y"]),
        expected.to_pandas().sort_values(["z", "y"]),
        check_index=False,
    )
def test_merge_left(
    left_nrows, right_nrows, left_nkeys, right_nkeys, how="left"
):
    """Two-key left merge on dask_cudf matches the in-memory cudf merge."""
    chunksize = 3
    np.random.seed(0)

    # Build the cuDF inputs.
    left = cudf.DataFrame(
        {
            "x": np.random.randint(0, left_nkeys, size=left_nrows),
            "y": np.random.randint(0, left_nkeys, size=left_nrows),
            "a": np.arange(left_nrows, dtype=np.float64),
        }
    )
    right = cudf.DataFrame(
        {
            "x": np.random.randint(0, right_nkeys, size=right_nrows),
            "y": np.random.randint(0, right_nkeys, size=right_nrows),
            "a": 1000 * np.arange(right_nrows, dtype=np.float64),
        }
    )

    expect = left.merge(right, on=("x", "y"), how=how)

    def normalize(df):
        # Row order is not deterministic; sort on all columns and re-index.
        return (
            df.to_pandas()
            .sort_values(["x", "y", "a_x", "a_y"])
            .reset_index(drop=True)
        )

    # Repeat the merge through dask_cudf.
    left = dgd.from_cudf(left, chunksize=chunksize)
    right = dgd.from_cudf(right, chunksize=chunksize)
    result = left.merge(right, on=("x", "y"), how=how).compute(
        scheduler="single-threaded"
    )

    dd.assert_eq(normalize(expect), normalize(result))
def test_dask_dataset(datasets, engine, num_files, cpu):
    """Dataset.to_ddf matches a direct dask_cudf read; cpu/gpu moves work."""
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    paths = paths[:num_files]

    if engine == "parquet":
        ddf0 = dask_cudf.read_parquet(paths)[mycols_pq]
        dataset = nvtabular.io.Dataset(paths, cpu=cpu)
        result = dataset.to_ddf(columns=mycols_pq)
    else:
        ddf0 = dask_cudf.read_csv(paths, header=None, names=allcols_csv)[mycols_csv]
        dataset = nvtabular.io.Dataset(paths, cpu=cpu, header=None, names=allcols_csv)
        result = dataset.to_ddf(columns=mycols_csv)

    # NVTabular does not preserve the index for parquet.
    if engine == "parquet":
        assert_eq(ddf0, result, check_index=False)
    else:
        assert_eq(ddf0, result)

    # Verify the cpu kwarg, including behavior after repeated conversion.
    if cpu:
        assert isinstance(result.compute(), pd.DataFrame)

        # Should still work after moving to the GPU.
        dataset.to_gpu()
        dataset.to_cpu()
        dataset.to_cpu()
        dataset.to_gpu()
        result = dataset.to_ddf()
        assert isinstance(result.compute(), cudf.DataFrame)
    else:
        assert isinstance(result.compute(), cudf.DataFrame)

        # Should still work after moving to the CPU.
        dataset.to_cpu()
        dataset.to_gpu()
        dataset.to_gpu()
        dataset.to_cpu()
        result = dataset.to_ddf()
        assert isinstance(result.compute(), pd.DataFrame)
def test_roundtrip_from_dask_partitioned(tmpdir, parts, daskcudf, metadata):
    """Partitioned parquet round-trips identically via dask and dask_cudf."""
    tmpdir = str(tmpdir)

    df = pd.DataFrame()
    df["year"] = [2018, 2019, 2019, 2019, 2020, 2021]
    df["month"] = [1, 2, 3, 3, 3, 2]
    df["day"] = [1, 1, 1, 2, 2, 1]
    df["data"] = [0, 0, 0, 0, 0, 0]
    df.index.name = "index"

    if daskcudf:
        ddf2 = dask_cudf.from_cudf(cudf.from_pandas(df), npartitions=2)
        ddf2.to_parquet(tmpdir, write_metadata_file=metadata, partition_on=parts)
    else:
        ddf2 = dd.from_pandas(df, npartitions=2)
        ddf2.to_parquet(
            tmpdir,
            engine="pyarrow",
            write_metadata_file=metadata,
            partition_on=parts,
        )

    df_read = dd.read_parquet(tmpdir, engine="pyarrow")
    gdf_read = dask_cudf.read_parquet(tmpdir)

    # TODO: Avoid column selection after `CudfEngine`
    # can be aligned with dask/dask#6534
    columns = list(df_read.columns)
    assert set(df_read.columns) == set(gdf_read.columns)
    dd.assert_eq(
        df_read.compute(scheduler=dask.get)[columns],
        gdf_read.compute(scheduler=dask.get)[columns],
    )

    assert gdf_read.index.name == "index"

    # Output files should carry "part" names, not uuid4 names.
    for _, _, files in os.walk(tmpdir):
        for fn in files:
            if not fn.startswith("_"):
                assert "part" in fn
def test_append():
    """Re-assembling fragments with .append reproduces the original frame."""
    np.random.seed(0)
    n = 1000
    df = pd.DataFrame(
        {
            "x": np.random.randint(0, 5, size=n),
            "y": np.random.normal(size=n),
        }
    )
    gdf = cudf.DataFrame.from_pandas(df)
    frags = _fragmented_gdf(gdf, nsplit=13)

    # Re-combine the fragments one .append at a time.
    head, tail = frags[0], frags[1:]
    appended = dd.from_pandas(head, npartitions=1)
    for piece in tail:
        appended = appended.append(piece)

    dd.assert_eq(df, appended)
def test_make_meta_backends(index):
    """Metadata dtypes match the real frame for empty and non-empty meta."""
    dtypes = ["int8", "int32", "int64", "float64"]
    df = cudf.DataFrame(
        {dt: np.arange(start=0, stop=3, dtype=dt) for dt in dtypes}
    )
    df["strings"] = ["cat", "dog", "fish"]
    df["cats"] = df["strings"].astype("category")
    df["time_s"] = np.array(
        ["2018-10-07", "2018-10-08", "2018-10-09"], dtype="datetime64[s]"
    )
    df["time_ms"] = df["time_s"].astype("datetime64[ms]")
    df["time_ns"] = df["time_s"].astype("datetime64[ns]")
    df = df.set_index(index)
    ddf = dgd.from_cudf(df, npartitions=1)

    # "Empty" metadata types must match the source frame ...
    dd.assert_eq(ddf._meta.dtypes, df.dtypes)
    # ... and the "non-empty" metadata must agree with the empty metadata.
    dd.assert_eq(ddf._meta.dtypes, ddf._meta_nonempty.dtypes)
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):
    """JoinGroupby stats columns are produced and numerically correct."""
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    features = cat_names >> ops.JoinGroupby(
        cont_cols=cont_names,
        stats=["count", "sum", "std", "min"],
        out_path=str(tmpdir),
    )

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    workflow = Workflow(
        features + cat_names + cont_names + label_name, client=client
    )
    result = workflow.fit_transform(dataset).to_ddf().compute(
        scheduler="synchronous"
    )

    # Validate that exactly the requested stats columns exist.
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check values. Sort both sides so rows line up for a direct comparison.
    expect = (
        df0.sort_values(["label", "x", "y", "id"])
        .reset_index(drop=True)
        .reset_index()
    )
    got = (
        result.sort_values(["label", "x", "y", "id"])
        .reset_index(drop=True)
        .reset_index()
    )
    gb_e = expect.groupby("name-cat").aggregate(
        {"name-cat": "count", "x": ["sum", "min", "std"]}
    )
    gb_e.columns = ["count", "sum", "min", "std"]
    df_check = got.merge(gb_e, left_on="name-cat", right_index=True, how="left")
    assert_eq(
        df_check["name-cat_count"],
        df_check["count"].astype("int64"),
        check_names=False,
    )
    assert_eq(df_check["name-cat_x_sum"], df_check["sum"], check_names=False)
    assert_eq(df_check["name-cat_x_min"], df_check["min"], check_names=False)
    assert_eq(df_check["name-cat_x_std"], df_check["std"], check_names=False)
def test_hash_object_dispatch(index):
    """dask's hash dispatch routes cudf objects to hash_object_cudf."""
    obj = cudf.DataFrame(
        {"x": ["a", "b", "c"], "y": [1, 2, 3], "z": [1, 1, 0]},
        index=[2, 4, 6],
    )

    # DataFrame
    result = dd.utils.hash_object_dispatch(obj, index=index)
    expected = dgd.backends.hash_object_cudf(obj, index=index)
    assert isinstance(result, cudf.Series)
    dd.assert_eq(result, expected)

    # Series
    result = dd.utils.hash_object_dispatch(obj["x"], index=index)
    expected = dgd.backends.hash_object_cudf(obj["x"], index=index)
    assert isinstance(result, cudf.Series)
    dd.assert_eq(result, expected)

    # DataFrame with MultiIndex
    obj_multi = obj.set_index(["x", "z"], drop=True)
    result = dd.utils.hash_object_dispatch(obj_multi, index=index)
    expected = dgd.backends.hash_object_cudf(obj_multi, index=index)
    assert isinstance(result, cudf.Series)
    dd.assert_eq(result, expected)
def test_dask_preproc_cpu(client, tmpdir, datasets, engine, shuffle, cpu):
    """A normalize workflow round-trips through parquet without losing rows."""
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    elif engine == "csv":
        df1 = cudf.read_csv(paths[0], header=0)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=0)[mycols_csv]
    else:
        df1 = cudf.read_csv(paths[0], names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], names=allcols_csv)[mycols_csv]
    df0 = cudf.concat([df1, df2], axis=0)

    if engine in ("parquet", "csv"):
        dataset = Dataset(paths, part_size="1MB", cpu=cpu)
    else:
        dataset = Dataset(paths, names=allcols_csv, part_size="1MB", cpu=cpu)

    # Simple transform (normalize).
    cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]
    conts = cont_names >> ops.FillMissing() >> ops.Normalize()
    workflow = Workflow(conts + cat_names + label_name, client=client)
    transformed = workflow.fit_transform(dataset)

    # Write out the transformed dataset.
    output_path = os.path.join(tmpdir, "processed")
    transformed.to_parquet(
        output_path=output_path, shuffle=shuffle, out_files_per_proc=4
    )

    # The untouched columns must survive the round-trip unchanged.
    df_disk = dd_read_parquet(output_path, engine="pyarrow").compute()
    assert_eq(
        df0.sort_values(["id", "x"])[["name-string", "label"]],
        df_disk.sort_values(["id", "x"])[["name-string", "label"]],
        check_index=False,
    )
def test_groupby_agg_empty_partition(tmpdir, split_out):
    """Groupby agg tolerates an empty partition in the collection."""
    # Write one random and one empty cudf DataFrame to distinct files.
    df = cudf.datasets.randomdata()
    df.to_parquet(str(tmpdir.join("f0.parquet")))
    cudf.DataFrame(
        columns=["id", "x", "y"],
        dtype={"id": "int64", "x": "float64", "y": "float64"},
    ).to_parquet(str(tmpdir.join("f1.parquet")))

    # Read both partitions back as one dask_cudf DataFrame
    # (one partition is now empty).
    ddf = dask_cudf.read_parquet(str(tmpdir))
    gb = ddf.groupby(["id"]).agg({"x": ["sum"]}, split_out=split_out)

    expect = df.groupby(["id"]).agg({"x": ["sum"]}).sort_index()
    dd.assert_eq(gb.compute().sort_index(), expect)
def test_read_csv_compression(tmp_path):
    """Chunked reads of compressed CSV warn and collapse to one partition."""
    df = pd.DataFrame(dict(x=np.arange(20), y=np.arange(20)))
    df.to_csv(tmp_path / "data.csv.gz", index=False, compression="gzip")

    # chunksize + compression cannot be honored together: expect a warning
    # mentioning the codec and a single output partition.
    with pytest.warns(UserWarning) as w:
        df2 = dask_cudf.read_csv(
            tmp_path / "*.csv.gz", chunksize="50 B", compression="gzip"
        )
    assert len(w) == 1
    msg = str(w[0].message)
    assert "gzip" in msg
    assert df2.npartitions == 1
    dd.assert_eq(df2, df, check_index=False)

    # Without a chunksize there is nothing to warn about.
    with warnings.catch_warnings(record=True) as record:
        df2 = dask_cudf.read_csv(
            tmp_path / "*.csv.gz", chunksize=None, compression="gzip"
        )
    assert not record
def test_multifile_parquet(tmpdir, dataset, df, engine, num_io_threads, nfiles, shuffle):
    """Multi-file parquet output holds exactly the input data."""
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]
    columns = cat_names + cont_names + label_names

    workflow = nvt.Workflow(nvt.ColumnGroup(columns))
    outdir = str(tmpdir.mkdir("out"))
    transformed = workflow.transform(nvt.Dataset(df))
    transformed.to_parquet(
        output_path=outdir,
        num_threads=num_io_threads,
        shuffle=shuffle,
        out_files_per_proc=nfiles,
    )

    # Reading every output file back must reproduce the input exactly.
    out_paths = glob.glob(os.path.join(outdir, "*.parquet"))
    df_check = cudf.read_parquet(out_paths)
    assert_eq(
        df_check[columns].sort_values(["x", "y"]),
        df[columns].sort_values(["x", "y"]),
        check_index=False,
    )
def test_merge_1col_left(
    left_nrows, right_nrows, left_nkeys, right_nkeys, how="left"
):
    """Single-key left merge on dask_cudf matches the in-memory cudf merge."""
    chunksize = 3
    np.random.seed(0)

    # Build the cuDF inputs.
    left = cudf.DataFrame(
        {
            "x": np.random.randint(0, left_nkeys, size=left_nrows),
            "a": np.arange(left_nrows, dtype=np.float64),
        }
    )
    right = cudf.DataFrame(
        {
            "x": np.random.randint(0, right_nkeys, size=right_nrows),
            "a": 1000 * np.arange(right_nrows, dtype=np.float64),
        }
    )

    expect = left.merge(right, on=["x"], how=how)
    expect = (
        expect.to_pandas()
        .sort_values(["x", "a_x", "a_y"])
        .reset_index(drop=True)
    )

    # Repeat the merge through dask_cudf and normalize row order.
    left = dgd.from_cudf(left, chunksize=chunksize)
    right = dgd.from_cudf(right, chunksize=chunksize)
    joined = left.merge(right, on=["x"], how=how)
    got = joined.compute().to_pandas()
    got = got.sort_values(["x", "a_x", "a_y"]).reset_index(drop=True)

    dd.assert_eq(expect, got)
def test_create_metadata_file_inconsistent_schema(tmpdir):
    # NOTE: This test demonstrates that the CudfEngine
    # can generate a global `_metadata` file even when the
    # dataset's per-file schemas are inconsistent.

    # File 0: column "a" is all-null.
    df0 = pd.DataFrame({"a": [None] * 10, "b": range(10)})
    p0 = os.path.join(tmpdir, "part.0.parquet")
    df0.to_parquet(p0, engine="pyarrow")

    # File 1: column "b" contains a null.
    b = list(range(10))
    b[1] = None
    df1 = pd.DataFrame({"a": range(10), "b": b})
    p1 = os.path.join(tmpdir, "part.1.parquet")
    df1.to_parquet(p1, engine="pyarrow")

    # The pyarrow-dataset base can handle an inconsistent schema even
    # without a _metadata file, but compute/dtype validation may fail.
    ddf1 = dask_cudf.read_parquet(str(tmpdir), gather_statistics=True)

    # Add a global metadata file — Dask-CuDF can do this without
    # requiring schema consistency.
    dask_cudf.io.parquet.create_metadata_file([p0, p1])

    # Reading must still work with the _metadata file present.
    ddf2 = dask_cudf.read_parquet(str(tmpdir), gather_statistics=True)

    # Results must match with and without the _metadata file. `ddf1`
    # must be computed here because the dtype of the inconsistent
    # column ("a") may be "object" before computing and "int" after.
    dd.assert_eq(ddf1.compute(), ddf2)
    dd.assert_eq(ddf1.compute(), ddf2.compute())
def test_groupby_basic_aggs(agg):
    """A basic named groupby aggregation matches between cudf and dask_cudf."""
    pdf = pd.DataFrame(
        {
            "x": np.random.randint(0, 5, size=10000),
            "y": np.random.normal(size=10000),
        }
    )
    gdf = cudf.DataFrame.from_pandas(pdf)
    ddf = dask_cudf.from_cudf(gdf, npartitions=5)

    a = getattr(gdf.groupby("x"), agg)().to_pandas()
    b = getattr(ddf.groupby("x"), agg)().compute().to_pandas()

    # Normalize index/series names before the comparison.
    a.index.name = None
    a.name = None
    b.index.name = None
    b.name = None

    if agg == "count":
        # Align cuDF's count dtype with pandas' int64.
        a["y"] = a["y"].astype(np.int64)

    dd.assert_eq(a, b)
def test_row_groups_per_part(tmpdir, row_groups, index):
    """row_groups_per_part yields the expected partition count and data."""
    nparts = 2
    df_size = 100
    row_group_size = 5
    file_row_groups = 10  # Known apriori
    npartitions_expected = math.ceil(file_row_groups / row_groups) * 2

    df = pd.DataFrame(
        {
            "a": np.random.choice(["apple", "banana", "carrot"], size=df_size),
            "b": np.random.random(size=df_size),
            "c": np.random.randint(1, 5, size=df_size),
            "index": np.arange(0, df_size),
        }
    )
    if index:
        df = df.set_index("index")

    ddf1 = dd.from_pandas(df, npartitions=nparts)
    ddf1.to_parquet(
        str(tmpdir),
        engine="pyarrow",
        row_group_size=row_group_size,
        write_metadata_file=True,
        write_index=index,
    )

    ddf2 = dask_cudf.read_parquet(
        str(tmpdir),
        row_groups_per_part=row_groups,
        index="index" if index else False,
    )

    dd.assert_eq(ddf1, ddf2, check_divisions=False, check_index=index)
    assert ddf2.npartitions == npartitions_expected
def test_reset_index_multiindex():
    """Groupby-sum + reset_index + merge matches between GPU and CPU dask."""
    df = cudf.DataFrame()
    df["id_1"] = ["a", "a", "b"]
    df["id_2"] = [0, 0, 1]
    df["val"] = [1, 2, 3]

    df_lookup = cudf.DataFrame()
    df_lookup["id_1"] = ["a", "b"]
    df_lookup["metadata"] = [0, 1]

    gddf = dask_cudf.from_cudf(df, npartitions=2)
    gddf_lookup = dask_cudf.from_cudf(df_lookup, npartitions=2)
    ddf = dd.from_pandas(df.to_pandas(), npartitions=2)
    ddf_lookup = dd.from_pandas(df_lookup.to_pandas(), npartitions=2)

    # Note: 'id_2' has wrong type (object) until after compute
    dd.assert_eq(
        gddf.groupby(by=["id_1", "id_2"])
        .val.sum()
        .reset_index()
        .merge(gddf_lookup, on="id_1")
        .compute(),
        ddf.groupby(by=["id_1", "id_2"])
        .val.sum()
        .reset_index()
        .merge(ddf_lookup, on="id_1"),
    )
def test_create_metadata_file(tmpdir, partition_on):
    """create_metadata_file makes a dataset readable with statistics."""
    tmpdir = str(tmpdir)

    # Write the collection without a _metadata file.
    df1 = cudf.DataFrame({"b": range(100), "a": ["A", "B", "C", "D"] * 25})
    df1.index.name = "myindex"
    ddf1 = dask_cudf.from_cudf(df1, npartitions=10)
    ddf1.to_parquet(
        tmpdir,
        write_metadata_file=False,
        partition_on=partition_on,
    )

    # Add a global _metadata file over the written parts.
    if partition_on:
        fns = glob.glob(os.path.join(tmpdir, partition_on + "=*/*.parquet"))
    else:
        fns = glob.glob(os.path.join(tmpdir, "*.parquet"))
    dask_cudf.io.parquet.create_metadata_file(
        fns,
        split_every=3,  # Force tree reduction
    )

    # The dataset must now be readable with the _metadata file present.
    ddf2 = dask_cudf.read_parquet(
        tmpdir,
        gather_statistics=True,
        split_row_groups=False,
        index="myindex",
    )
    if partition_on:
        # Partitioning reorders rows and categorizes the partition column.
        ddf1 = df1.sort_values("b")
        ddf2 = ddf2.compute().sort_values("b")
        ddf2.a = ddf2.a.astype("object")
    dd.assert_eq(ddf1, ddf2)