def read(self, table, relevant_cols=None, **kwargs):
    import dask_cudf

    filepath = self.table_path_mapping[table]
    # we ignore split_row_groups if gather_statistics=False
    if self.split_row_groups:
        df = dask_cudf.read_parquet(
            filepath,
            columns=relevant_cols,
            split_row_groups=self.split_row_groups,
            gather_statistics=True,
            **kwargs,
        )
    else:
        df = dask_cudf.read_parquet(
            filepath,
            columns=relevant_cols,
            split_row_groups=self.split_row_groups,
            gather_statistics=False,
            **kwargs,
        )

    ## Repartition small tables to a single partition to prevent
    ## distributed merges when possible
    ## Only matters when partition size < 3 GB
    if (table in SMALL_TABLES) or (table in SUPER_SMALL_TABLES):
        df = df.repartition(npartitions=1)
    return df
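# Hedged usage sketch for `read` above. The enclosing class is not part of this
# snippet, so `_TableReader` is a hypothetical stand-in that supplies the two
# attributes `read` relies on; SMALL_TABLES / SUPER_SMALL_TABLES are assumed to
# be module-level sets of table names (the values below are illustrative only).
SMALL_TABLES = {"date_dim", "time_dim"}
SUPER_SMALL_TABLES = {"ship_mode"}


class _TableReader:
    read = read  # bind the module-level function above as a method

    def __init__(self, table_path_mapping, split_row_groups=False):
        self.table_path_mapping = table_path_mapping
        self.split_row_groups = split_row_groups


# reader = _TableReader({"date_dim": "/data/date_dim/*.parquet"})
# date_dim = reader.read("date_dim", relevant_cols=["d_date_sk", "d_year"])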
def test_roundtrip_from_dask(tmpdir):
    tmpdir = str(tmpdir)
    ddf.to_parquet(tmpdir, engine='pyarrow')
    files = sorted(
        [
            os.path.join(tmpdir, f)
            for f in os.listdir(tmpdir)
            if not f.endswith('_metadata')
        ],
        key=natural_sort_key,
    )

    # Read list of parquet files
    ddf2 = dask_cudf.read_parquet(files)
    assert_eq(ddf, ddf2, check_divisions=False)

    # Specify columns=['x']
    ddf2 = dask_cudf.read_parquet(files, columns=['x'])
    assert_eq(ddf[['x']], ddf2, check_divisions=False)

    # Specify columns='y'
    ddf2 = dask_cudf.read_parquet(files, columns='y')
    assert_eq(ddf[['y']], ddf2, check_divisions=False)

    # Read parquet-dataset directory
    # dask_cudf.read_parquet will ignore *_metadata files
    ddf2 = dask_cudf.read_parquet(os.path.join(tmpdir, '*'))
    assert_eq(ddf, ddf2, check_divisions=False)
def test_roundtrip_from_dask_partitioned(tmpdir, parts, daskcudf, metadata):
    tmpdir = str(tmpdir)

    df = pd.DataFrame()
    df["year"] = [2018, 2019, 2019, 2019, 2020, 2021]
    df["month"] = [1, 2, 3, 3, 3, 2]
    df["day"] = [1, 1, 1, 2, 2, 1]
    df["data"] = [0, 0, 0, 0, 0, 0]
    df.index.name = "index"
    if daskcudf:
        ddf2 = dask_cudf.from_cudf(cudf.from_pandas(df), npartitions=2)
        ddf2.to_parquet(tmpdir, write_metadata_file=metadata, partition_on=parts)
    else:
        ddf2 = dd.from_pandas(df, npartitions=2)
        ddf2.to_parquet(
            tmpdir,
            engine="pyarrow",
            write_metadata_file=metadata,
            partition_on=parts,
        )
    df_read = dd.read_parquet(tmpdir, engine="pyarrow")
    gdf_read = dask_cudf.read_parquet(tmpdir)

    # TODO: Avoid column selection after `CudfEngine`
    # can be aligned with dask/dask#6534
    columns = list(df_read.columns)
    assert set(df_read.columns) == set(gdf_read.columns)
    dd.assert_eq(
        df_read.compute(scheduler=dask.get)[columns],
        gdf_read.compute(scheduler=dask.get)[columns],
    )

    assert gdf_read.index.name == "index"

    # Check that we don't have uuid4 file names
    for _, _, files in os.walk(tmpdir):
        for fn in files:
            if not fn.startswith("_"):
                assert "part" in fn

    if parse_version(dask.__version__) > parse_version("2021.07.0"):
        # This version of Dask supports `aggregate_files=True`.
        # Check that we can aggregate by a partition name.
        df_read = dd.read_parquet(tmpdir, engine="pyarrow", aggregate_files="year")
        gdf_read = dask_cudf.read_parquet(tmpdir, aggregate_files="year")
        dd.assert_eq(df_read, gdf_read)
def test_roundtrip_from_pandas(tmpdir):
    fn = str(tmpdir.join("test.parquet"))

    # First without specifying an index
    dfp = df.copy()
    dfp.to_parquet(fn, engine="pyarrow", index=False)
    dfp = dfp.reset_index(drop=True)
    ddf2 = dask_cudf.read_parquet(fn)
    dd.assert_eq(dfp, ddf2, check_index=True)

    # Now, specifying an index
    dfp = df.copy()
    dfp.to_parquet(fn, engine="pyarrow", index=True)
    ddf2 = dask_cudf.read_parquet(fn, index=["index"])
    dd.assert_eq(dfp, ddf2, check_index=True)
def test_roundtrip_from_dask_cudf(tmpdir, write_meta):
    tmpdir = str(tmpdir)
    gddf = dask_cudf.from_dask_dataframe(ddf)
    gddf.to_parquet(tmpdir, write_metadata_file=write_meta)

    gddf2 = dask_cudf.read_parquet(tmpdir)
    dd.assert_eq(gddf, gddf2, check_divisions=write_meta)
def to_ddf(self, columns=None, cpu=None):
    # Check if we are using cpu
    cpu = self.cpu if cpu is None else cpu
    if cpu:
        # Return a Dask-DataFrame in CPU memory
        for try_engine in ["pyarrow-dataset", "pyarrow"]:
            # Try to use the "pyarrow-dataset" engine, if
            # available, but fall back on vanilla "pyarrow"
            # for older Dask versions.
            try:
                return dd.read_parquet(
                    self.paths,
                    engine=try_engine,
                    columns=columns,
                    index=None if columns is None else False,
                    gather_statistics=False,
                    split_row_groups=self.row_groups_per_part,
                    storage_options=self.storage_options,
                )
            except ValueError:
                pass
        raise RuntimeError("dask.dataframe.read_parquet failed.")
    return dask_cudf.read_parquet(
        self.paths,
        columns=columns,
        # Can't skip reading the index when no column selection is given
        index=None if columns is None else False,
        gather_statistics=False,
        split_row_groups=self.row_groups_per_part,
        storage_options=self.storage_options,
    )
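# A minimal standalone sketch of the engine-fallback pattern used in `to_ddf`
# above (not part of the original source): prefer the newer "pyarrow-dataset"
# engine and fall back to plain "pyarrow" when the installed Dask rejects the
# engine string with a ValueError.
def read_parquet_cpu(paths, **kwargs):
    import dask.dataframe as dd

    for try_engine in ["pyarrow-dataset", "pyarrow"]:
        try:
            # Forward any reader options (columns, storage_options, ...)
            return dd.read_parquet(paths, engine=try_engine, **kwargs)
        except ValueError:
            continue  # engine name not recognized by this Dask version
    raise RuntimeError("dask.dataframe.read_parquet failed.")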
def test_roundtrip_from_dask_partitioned(tmpdir, parts, daskcudf, metadata):
    tmpdir = str(tmpdir)

    df = pd.DataFrame()
    df["year"] = [2018, 2019, 2019, 2019, 2020, 2021]
    df["month"] = [1, 2, 3, 3, 3, 2]
    df["day"] = [1, 1, 1, 2, 2, 1]
    df["data"] = [0, 0, 0, 0, 0, 0]
    df.index.name = "index"
    if daskcudf:
        ddf2 = dask_cudf.from_cudf(cudf.from_pandas(df), npartitions=2)
        ddf2.to_parquet(tmpdir, write_metadata_file=metadata, partition_on=parts)
    else:
        ddf2 = dd.from_pandas(df, npartitions=2)
        ddf2.to_parquet(
            tmpdir,
            engine="pyarrow",
            write_metadata_file=metadata,
            partition_on=parts,
        )

    df_read = dd.read_parquet(tmpdir, engine="pyarrow", index="index")
    gdf_read = dask_cudf.read_parquet(tmpdir, index="index")

    assert_eq(
        df_read.compute(scheduler=dask.get),
        gdf_read.compute(scheduler=dask.get),
    )
def dask_gpu_parquet_ingest(self, target_files, columns=None):
    if self.rapids_version < 15:
        # rapids 0.14 has a known issue with read_parquet
        # https://github.com/rapidsai/cudf/issues/5579
        return dask_cudf.from_dask_dataframe(
            self.dask_cpu_parquet_ingest(target_files, columns=columns)
        )
    else:
        return dask_cudf.read_parquet(target_files, columns=columns)
def fetch_data(self):
    """Fetch data using dask based on provided config object"""
    df = None
    input_format = self.config["input_format"].lower()
    filepath = self.config["input_path"]

    kwargs = self.config.copy()
    del kwargs["type"]
    del kwargs["input_format"]
    del kwargs["input_path"]

    if "csv" == input_format:
        df = dask_cudf.read_csv(filepath, **kwargs)
    elif "parquet" == input_format:
        df = dask_cudf.read_parquet(filepath, **kwargs)
    elif "orc" == input_format:
        df = dask_cudf.read_orc(filepath, engine="cudf")
    elif "json" == input_format:
        df = dask_cudf.read_json(filepath, **kwargs)
    else:
        raise NotImplementedError(
            "%s is not a supported input_format" % (input_format)
        )

    self.has_data = False
    return df
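# Hedged example of the config shape `fetch_data` above expects. The keys
# "type", "input_format", and "input_path" are consumed by the method itself;
# everything else is forwarded verbatim to the dask_cudf reader. All values
# here are illustrative, not taken from the source.
example_config = {
    "type": "fs",
    "input_format": "parquet",              # selects dask_cudf.read_parquet
    "input_path": "/data/events/*.parquet",
    "columns": ["ts", "user_id"],           # forwarded as a read_parquet kwarg
}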
def test_dask_timeseries_from_dask(tmpdir):
    fn = str(tmpdir)
    ddf2 = dask.datasets.timeseries(freq="D")
    ddf2.to_parquet(fn, engine="pyarrow")
    read_df = dask_cudf.read_parquet(fn)
    # Note: Losing the index name here
    assert_eq(ddf2, read_df.compute().to_pandas(), check_index=False)
def test_dask_timeseries_from_pandas(tmpdir):
    fn = str(tmpdir.join("test.parquet"))
    ddf2 = dask.datasets.timeseries(freq="D")
    pdf = ddf2.compute()
    pdf.to_parquet(fn, engine="pyarrow")
    read_df = dask_cudf.read_parquet(fn)
    dd.assert_eq(ddf2, read_df.compute())
def test_roundtrip_from_dask_cudf(tmpdir):
    tmpdir = str(tmpdir)
    gddf = dask_cudf.from_dask_dataframe(ddf)
    gddf.to_parquet(tmpdir)

    # NOTE: Need `.compute()` to resolve correct index
    # name after `from_dask_dataframe`
    gddf2 = dask_cudf.read_parquet(tmpdir)
    assert_eq(gddf.compute(), gddf2)
def test_roundtrip_from_dask_none_index_false(tmpdir):
    tmpdir = str(tmpdir)
    path = os.path.join(tmpdir, "test.parquet")

    df2 = ddf.reset_index(drop=True).compute()
    df2.to_parquet(path, engine="pyarrow")

    ddf3 = dask_cudf.read_parquet(path, index=False)
    dd.assert_eq(df2, ddf3)
def test_cudf_dtypes_from_pandas(tmpdir, data):
    # Simple test that we can read in list and struct types
    fn = str(tmpdir.join("test.parquet"))
    dfp = pd.DataFrame({"data": data})
    dfp.to_parquet(fn, engine="pyarrow", index=True)
    # Use `split_row_groups=True` to avoid the "fast path", where
    # the schema is not passed through in older Dask versions
    ddf2 = dask_cudf.read_parquet(fn, split_row_groups=True)
    dd.assert_eq(cudf.from_pandas(dfp), ddf2)
def test_filters_at_row_group_level(tmpdir):
    tmp_path = str(tmpdir)
    df = pd.DataFrame({"x": range(10), "y": list("aabbccddee")})
    ddf = dd.from_pandas(df, npartitions=5)
    assert ddf.npartitions == 5

    # Two rows per row group
    ddf.to_parquet(tmp_path, engine="pyarrow", row_group_size=10 // 5)

    a = dask_cudf.read_parquet(tmp_path, filters=[("x", "==", 1)])
    assert a.npartitions == 1
    assert (a.shape[0] == 2).compute()

    # One row per row group
    ddf.to_parquet(tmp_path, engine="pyarrow", row_group_size=1)

    b = dask_cudf.read_parquet(tmp_path, filters=[("x", "==", 1)])
    assert b.npartitions == 1
    assert (b.shape[0] == 1).compute()
def test_read_parquet():
    pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2.1, 2.2, 2.3, 2.4]})
    buffer = BytesIO()
    pdf.to_parquet(buffer)
    buffer.seek(0)
    with s3_context("daskparquet", {"file.parq": buffer}):
        df = dask_cudf.read_parquet("s3://daskparquet/*.parq")
        assert df.a.sum().compute() == 10
        assert df.b.sum().compute() == 9
def test_strings(tmpdir):
    fn = str(tmpdir)
    dfp = pd.DataFrame(
        {"a": ["aa", "bbb", "cccc"], "b": ["hello", "dog", "man"]}
    )
    dfp.set_index("a", inplace=True, drop=True)
    ddf2 = dd.from_pandas(dfp, npartitions=2)
    ddf2.to_parquet(fn, engine="pyarrow")
    read_df = dask_cudf.read_parquet(fn, index=["a"])
    dd.assert_eq(ddf2, read_df.compute().to_pandas())

    read_df_cats = dask_cudf.read_parquet(
        fn, index=["a"], strings_to_categorical=True
    )
    dd.assert_eq(read_df_cats.dtypes, read_df_cats.compute().dtypes)
    dd.assert_eq(read_df_cats.dtypes[0], "int32")
def test_parquet_concat_within_workers(client_connection):
    if not os.path.exists("test_files_parquet"):
        print("Generate data... ")
        os.mkdir("test_files_parquet")
    for x in range(10):
        if not os.path.exists("test_files_parquet/df" + str(x)):
            df = utils.random_edgelist(
                e=100,
                ef=16,
                dtypes={"src": np.int32, "dst": np.int32},
                seed=x,
            )
            df.to_parquet("test_files_parquet/df" + str(x), index=False)

    n_gpu = get_n_workers()

    print("Read_parquet... ")
    t1 = time.time()
    ddf = dask_cudf.read_parquet(
        "test_files_parquet/*", dtype=["int32", "int32"]
    )
    ddf = ddf.persist()
    futures_of(ddf)
    wait(ddf)
    t1 = time.time() - t1
    print("*** Read Time: ", t1, "s")
    print(ddf)

    assert ddf.npartitions > n_gpu

    print("Drop_duplicates... ")
    t2 = time.time()
    # Dask collections are immutable, so drop_duplicates cannot be in-place
    ddf = ddf.drop_duplicates()
    ddf = ddf.persist()
    futures_of(ddf)
    wait(ddf)
    t2 = time.time() - t2
    print("*** Drop duplicate time: ", t2, "s")

    assert t2 < t1

    print("Repartition... ")
    t3 = time.time()
    # Notice that ideally we would use:
    #     ddf = ddf.repartition(npartitions=n_gpu)
    # However, this is slower than reading and requires more memory.
    # Using custom concat instead.
    client = default_client()
    ddf = concat_within_workers(client, ddf)
    ddf = ddf.persist()
    futures_of(ddf)
    wait(ddf)
    t3 = time.time() - t3
    print("*** repartition Time: ", t3, "s")
    print(ddf)

    assert t3 < t1
def test_dask_timeseries_from_dask(tmpdir, index, stats):
    fn = str(tmpdir)
    ddf2 = dask.datasets.timeseries(freq="D")
    ddf2.to_parquet(fn, engine="pyarrow", write_index=index)
    read_df = dask_cudf.read_parquet(fn, index=index, gather_statistics=stats)
    dd.assert_eq(
        ddf2, read_df, check_divisions=(stats and index), check_index=index
    )
def to_ddf(self, columns=None):
    return dask_cudf.read_parquet(
        self.paths,
        columns=columns,
        # Can't skip reading the index when no column selection is given
        index=None if columns is None else False,
        gather_statistics=False,
        split_row_groups=self.row_groups_per_part,
        storage_options=self.storage_options,
    )
def test_empty(tmpdir, index):
    fn = str(tmpdir)
    dfp = pd.DataFrame({"a": [11.0, 12.0, 12.0], "b": [4, 5, 6]})[:0]
    if index:
        dfp.set_index("a", inplace=True, drop=True)
    ddf2 = dd.from_pandas(dfp, npartitions=2)
    ddf2.to_parquet(fn, write_index=index, engine="pyarrow")
    read_df = dask_cudf.read_parquet(fn)
    dd.assert_eq(ddf2, read_df.compute())
def test_dask_dataset_from_dataframe(tmpdir, origin, cpu):
    # Generate a DataFrame-based input
    if origin in ("pd", "dd"):
        df = pd.DataFrame({"a": range(100)})
        if origin == "dd":
            df = dask.dataframe.from_pandas(df, npartitions=4)
    elif origin in ("cudf", "dask_cudf"):
        df = cudf.DataFrame({"a": range(100)})
        if origin == "dask_cudf":
            df = dask_cudf.from_cudf(df, npartitions=4)

    # Convert to an NVTabular Dataset and back to a ddf
    dataset = nvtabular.io.Dataset(df, cpu=cpu)
    result = dataset.to_ddf()

    # Check resulting data
    assert_eq(df, result)

    # Check that the cpu kwarg is working correctly
    if cpu:
        assert isinstance(result.compute(), pd.DataFrame)

        # Should still work if we move to the GPU
        # (test behavior after repetitive conversion)
        dataset.to_gpu()
        dataset.to_cpu()
        dataset.to_cpu()
        dataset.to_gpu()
        result = dataset.to_ddf()
        assert isinstance(result.compute(), cudf.DataFrame)
        dataset.to_cpu()
    else:
        assert isinstance(result.compute(), cudf.DataFrame)

        # Should still work if we move to the CPU
        # (test behavior after repetitive conversion)
        dataset.to_cpu()
        dataset.to_gpu()
        dataset.to_gpu()
        dataset.to_cpu()
        result = dataset.to_ddf()
        assert isinstance(result.compute(), pd.DataFrame)
        dataset.to_gpu()

    # Write to disk and read back
    path = str(tmpdir)
    dataset.to_parquet(path, out_files_per_proc=1, shuffle=None)
    ddf_check = dask_cudf.read_parquet(path).compute()
    if origin in ("dd", "dask_cudf"):
        # Multiple partitions are not guaranteed the same
        # order in the output file
        ddf_check = ddf_check.sort_values("a")
    assert_eq(df, ddf_check, check_index=False)
def test_roundtrip_none_rangeindex(tmpdir):
    fn = str(tmpdir.join("test.parquet"))
    gdf = cudf.DataFrame(
        {"id": [0, 1, 2, 3], "val": [None, None, 0, 1]},
        index=pd.RangeIndex(start=5, stop=9),
    )
    dask_cudf.from_cudf(gdf, npartitions=2).to_parquet(fn)
    ddf2 = dask_cudf.read_parquet(fn)
    dd.assert_eq(gdf, ddf2, check_index=True)
def test_chunksize(tmpdir, chunksize, metadata):
    nparts = 2
    df_size = 100
    row_group_size = 5
    row_group_byte_size = 451  # Empirically measured

    df = pd.DataFrame(
        {
            "a": np.random.choice(["apple", "banana", "carrot"], size=df_size),
            "b": np.random.random(size=df_size),
            "c": np.random.randint(1, 5, size=df_size),
            "index": np.arange(0, df_size),
        }
    ).set_index("index")

    ddf1 = dd.from_pandas(df, npartitions=nparts)
    ddf1.to_parquet(
        str(tmpdir),
        engine="pyarrow",
        row_group_size=row_group_size,
        write_metadata_file=metadata,
    )

    if metadata:
        path = str(tmpdir)
    else:
        dirname = str(tmpdir)
        files = os.listdir(dirname)
        assert "_metadata" not in files
        path = os.path.join(dirname, "*.parquet")

    ddf2 = dask_cudf.read_parquet(
        path,
        chunksize=chunksize,
        split_row_groups=True,
        gather_statistics=True,
        index="index",
    )

    assert_eq(ddf1, ddf2, check_divisions=False)

    num_row_groups = df_size // row_group_size
    if not chunksize:
        assert ddf2.npartitions == num_row_groups
    else:
        # Check that we are really aggregating
        df_byte_size = row_group_byte_size * num_row_groups
        expected = df_byte_size // parse_bytes(chunksize)
        remainder = (df_byte_size % parse_bytes(chunksize)) > 0
        expected += int(remainder) * nparts
        assert ddf2.npartitions == max(nparts, expected)
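# A worked instance of the partition-count arithmetic checked at the end of
# test_chunksize above, reusing the test's constants with an illustrative
# chunksize of "4 kiB" (4096 bytes):
from dask.utils import parse_bytes

nparts = 2
num_row_groups = 100 // 5                              # 20 row groups
df_byte_size = 451 * num_row_groups                    # 9020 bytes total
expected = df_byte_size // parse_bytes("4 kiB")        # 2 full chunks
remainder = (df_byte_size % parse_bytes("4 kiB")) > 0  # 828 bytes left over
expected += int(remainder) * nparts                    # +1 partition per part
assert max(nparts, expected) == 4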
def get_wcs_minima(config):
    import dask_cudf

    wcs_df = dask_cudf.read_parquet(
        config["data_dir"] + "web_clickstreams/*.parquet",
        columns=["wcs_click_date_sk", "wcs_click_time_sk"],
    )
    wcs_df["tstamp"] = (
        wcs_df["wcs_click_date_sk"] * 86400 + wcs_df["wcs_click_time_sk"]
    )
    wcs_tstamp_min = wcs_df["tstamp"].min().compute()
    return wcs_tstamp_min
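# Scalar check of the timestamp encoding used in get_wcs_minima above: the day
# index and the intra-day second count are packed into a single integer, since
# one day is 86400 seconds (the values below are illustrative only):
day_sk, time_sk = 10, 3600                 # day 10, one hour into the day
assert day_sk * 86400 + time_sk == 867600  # 10 * 86400 + 3600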
def test_dask_timeseries_from_daskcudf(tmpdir, index, stats):
    fn = str(tmpdir)
    ddf2 = dask_cudf.from_cudf(
        cudf.datasets.timeseries(freq="D"), npartitions=4
    )
    ddf2.name = ddf2.name.astype("object")
    ddf2.to_parquet(fn, write_index=index)
    read_df = dask_cudf.read_parquet(fn, index=index, gather_statistics=stats)
    dd.assert_eq(
        ddf2, read_df, check_divisions=(stats and index), check_index=index
    )
def test_roundtrip_from_dask(tmpdir, stats):
    tmpdir = str(tmpdir)
    ddf.to_parquet(tmpdir, engine="pyarrow")
    files = sorted(
        (os.path.join(tmpdir, f) for f in os.listdir(tmpdir)),
        key=natural_sort_key,
    )

    # Read list of parquet files
    ddf2 = dask_cudf.read_parquet(files, gather_statistics=stats)
    dd.assert_eq(ddf, ddf2, check_divisions=stats)

    # Specify columns=['x']
    ddf2 = dask_cudf.read_parquet(files, columns=["x"], gather_statistics=stats)
    dd.assert_eq(ddf[["x"]], ddf2, check_divisions=stats)

    # Specify columns='y'
    ddf2 = dask_cudf.read_parquet(files, columns="y", gather_statistics=stats)
    dd.assert_eq(ddf[["y"]], ddf2, check_divisions=stats)

    # Now include metadata
    ddf2 = dask_cudf.read_parquet(tmpdir, gather_statistics=stats)
    dd.assert_eq(ddf, ddf2, check_divisions=stats)

    # Specify columns=['x'] (with metadata)
    ddf2 = dask_cudf.read_parquet(tmpdir, columns=["x"], gather_statistics=stats)
    dd.assert_eq(ddf[["x"]], ddf2, check_divisions=stats)

    # Specify columns='y' (with metadata)
    ddf2 = dask_cudf.read_parquet(tmpdir, columns="y", gather_statistics=stats)
    dd.assert_eq(ddf[["y"]], ddf2, check_divisions=stats)
def test_roundtrip_from_dask(tmpdir):
    tmpdir = str(tmpdir)
    ddf.to_parquet(tmpdir, engine="pyarrow")
    files = sorted(
        [
            os.path.join(tmpdir, f)
            for f in os.listdir(tmpdir)
            if not f.endswith("_metadata")
        ],
        key=natural_sort_key,
    )

    # Read list of parquet files
    ddf2 = dask_cudf.read_parquet(files, gather_statistics=True)
    assert_eq(ddf, ddf2)

    # Specify columns=['x']
    ddf2 = dask_cudf.read_parquet(files, columns=["x"], gather_statistics=True)
    assert_eq(ddf[["x"]], ddf2)

    # Specify columns='y'
    ddf2 = dask_cudf.read_parquet(files, columns="y", gather_statistics=True)
    assert_eq(ddf[["y"]], ddf2)

    # Now include metadata; gather_statistics is True by default
    # Read list of parquet files
    ddf2 = dask_cudf.read_parquet(tmpdir)
    assert_eq(ddf, ddf2)

    # Specify columns=['x']
    ddf2 = dask_cudf.read_parquet(tmpdir, columns=["x"])
    assert_eq(ddf[["x"]], ddf2)

    # Specify columns='y'
    ddf2 = dask_cudf.read_parquet(tmpdir, columns="y")
    assert_eq(ddf[["y"]], ddf2)
def test_dask_dataset(datasets, engine, num_files):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    paths = paths[:num_files]
    if engine == "parquet":
        ddf0 = dask_cudf.read_parquet(paths)[mycols_pq]
        dataset = nvtabular.io.Dataset(paths)
        result = dataset.to_ddf(columns=mycols_pq)
    else:
        ddf0 = dask_cudf.read_csv(paths, header=False, names=allcols_csv)[
            mycols_csv
        ]
        dataset = nvtabular.io.Dataset(paths, header=False, names=allcols_csv)
        result = dataset.to_ddf(columns=mycols_csv)

    assert_eq(ddf0, result)
def test_ddf_dataset_itr(tmpdir, datasets, inp_format):
    paths = glob.glob(str(datasets["parquet"]) + "/*." + "parquet".split("-")[0])
    ddf1 = dask_cudf.read_parquet(paths)[mycols_pq]
    df1 = ddf1.compute()
    if inp_format == "dask":
        ds = nvtabular.io.Dataset(ddf1.to_dask_dataframe())
    elif inp_format == "dask_cudf":
        ds = nvtabular.io.Dataset(ddf1)
    elif inp_format == "cudf":
        ds = nvtabular.io.Dataset(df1)
    elif inp_format == "pandas":
        ds = nvtabular.io.Dataset(df1.to_pandas())
    assert_eq(df1, cudf.concat(list(ds.to_iter(columns=mycols_pq))))