def test_s3_dataset(s3, paths, engine, df):
    # create a mocked-out bucket here
    bucket = "testbucket"
    s3.create_bucket(Bucket=bucket)

    s3_paths = []
    for path in paths:
        s3_path = f"s3://{bucket}/{path}"
        with fsspec.open(s3_path, "wb") as f:
            f.write(open(path, "rb").read())
        s3_paths.append(s3_path)

    # create a basic s3 dataset
    dataset = nvt.Dataset(s3_paths)

    # make sure the iteration API works
    columns = mycols_pq if engine == "parquet" else mycols_csv
    gdf = cudf.concat(list(dataset.to_iter()))[columns]
    assert_eq(gdf.reset_index(drop=True), df.reset_index(drop=True))

    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_feature([ops.ZeroFill(), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify(cat_cache="host"))
    processor.finalize()
    processor.update_stats(dataset)

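# NOTE: `s3`, `paths`, `engine`, and `df` above are pytest fixtures defined elsewhere in
# the test suite. The sketch below is a purely hypothetical example of how a `paths`
# fixture could be produced; the fixture body, column values, and file count are
# assumptions, not the project's actual fixtures.
import cudf
import numpy as np
import pytest


@pytest.fixture(scope="session")
def paths(tmpdir_factory):
    # Write a couple of small parquet files that the S3 test can upload to the
    # mocked bucket. Column names mirror those referenced by the tests.
    outdir = tmpdir_factory.mktemp("source_data")
    files = []
    for i in range(2):
        gdf = cudf.DataFrame(
            {
                "name-string": ["apple", "pear", "orange", "apple"],
                "x": np.random.uniform(-1, 1, 4),
                "y": np.random.uniform(-1, 1, 4),
                "id": np.arange(4, dtype="int64"),
                "label": np.random.randint(0, 2, 4),
            }
        )
        path = str(outdir.join(f"part_{i}.parquet"))
        gdf.to_parquet(path)
        files.append(path)
    return files
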
def test_hugectr(tmpdir, df, dataset, output_format, engine, op_columns):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]

    # set variables
    nfiles = 10
    ext = ""
    outdir = tmpdir + "/hugectr"
    os.mkdir(outdir)

    # process data
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_names)
    processor.add_feature([ops.ZeroFill(columns=op_columns), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    # Need to collect statistics first (for now)
    processor.update_stats(dataset)

    # Second "online" pass to write HugeCTR output
    processor.apply(
        dataset,
        apply_offline=False,
        record_stats=False,
        output_path=outdir,
        out_files_per_proc=nfiles,
        output_format=output_format,
        shuffle=False,
    )

    # Check files
    if output_format == "parquet":
        ext = "parquet"
        assert os.path.isfile(outdir + "/metadata.json")
    elif output_format == "hugectr":
        ext = "data"
        assert os.path.isfile(outdir + "/file_list.txt")
    for n in range(nfiles):
        assert os.path.isfile(os.path.join(outdir, str(n) + "." + ext))

def test_hugectr(tmpdir, df, dataset, output_format, engine, op_columns, num_io_threads):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]

    # set variables
    nfiles = 10
    ext = ""
    outdir = tmpdir + "/hugectr"
    os.mkdir(outdir)

    # process data
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_names)
    processor.add_feature([ops.ZeroFill(columns=op_columns), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    # Need to collect statistics first (for now)
    processor.update_stats(dataset)

    # Second "online" pass to write HugeCTR output
    processor.apply(
        dataset,
        apply_offline=False,
        record_stats=False,
        output_path=outdir,
        out_files_per_proc=nfiles,
        output_format=output_format,
        shuffle=False,
        num_io_threads=num_io_threads,
    )

    # Check for _file_list.txt
    assert os.path.isfile(outdir + "/_file_list.txt")

    # Check for _metadata.json
    assert os.path.isfile(outdir + "/_metadata.json")

    # Check contents of _metadata.json
    data = {}
    col_summary = {}
    with open(outdir + "/_metadata.json", "r") as fil:
        for k, v in json.load(fil).items():
            data[k] = v
    assert "cats" in data
    assert "conts" in data
    assert "labels" in data
    assert "file_stats" in data
    assert len(data["file_stats"]) == nfiles
    for cdata in data["cats"] + data["conts"] + data["labels"]:
        col_summary[cdata["index"]] = cdata["col_name"]

    # Check that data files exist
    if output_format == "parquet":
        ext = "parquet"
    elif output_format == "hugectr":
        ext = "data"
    for n in range(nfiles):
        assert os.path.isfile(os.path.join(outdir, str(n) + "." + ext))

    # Make sure the columns in "_metadata.json" make sense
    if output_format == "parquet":
        df_check = cudf.read_parquet(os.path.join(outdir, "0.parquet"))
        for i, name in enumerate(df_check.columns):
            if i in col_summary:
                assert col_summary[i] == name

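# NOTE: the pytest parametrization for the HugeCTR tests is not shown in this excerpt.
# A plausible, purely illustrative set of decorators is sketched below; the concrete
# values and the stub name are assumptions, not the project's real ones.
import pytest


@pytest.mark.parametrize("engine", ["parquet"])
@pytest.mark.parametrize("output_format", ["hugectr", "parquet"])
@pytest.mark.parametrize("op_columns", [["x"], None])
@pytest.mark.parametrize("num_io_threads", [0, 2])
def test_hugectr_parametrized_stub(tmpdir, output_format, engine, op_columns, num_io_threads):
    # Illustrative stub only: shows how the parameters above feed the test signature.
    ...
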
def test_dask_workflow_api_dlrm(
    dask_cluster, tmpdir, datasets, freq_threshold, part_mem_fraction, engine
):
    # Connect a distributed client to the test cluster fixture
    client = Client(dask_cluster)

    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    elif engine == "csv":
        df1 = cudf.read_csv(paths[0], header=0)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=0)[mycols_csv]
    else:
        df1 = cudf.read_csv(paths[0], names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], names=allcols_csv)[mycols_csv]
    df0 = cudf.concat([df1, df2], axis=0)

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_feature([ops.ZeroFill(), ops.LogOp()])
    processor.add_preprocess(
        ops.Categorify(freq_threshold=freq_threshold, out_path=str(tmpdir), split_out=2)
    )
    processor.finalize()

    if engine in ("parquet", "csv"):
        dataset = DaskDataset(paths, part_mem_fraction=part_mem_fraction)
    else:
        dataset = DaskDataset(paths, names=allcols_csv, part_mem_fraction=part_mem_fraction)
    processor.apply(dataset, output_path=str(tmpdir))
    result = processor.get_ddf().compute()

    assert len(df0) == len(result)
    assert result["x"].min() == 0.0
    assert result["x"].isna().sum() == 0
    assert result["y"].min() == 0.0
    assert result["y"].isna().sum() == 0

    # Check category counts
    cat_expect = df0.groupby("name-string").agg({"name-string": "count"}).reset_index(drop=True)
    cat_result = result.groupby("name-string").agg({"name-string": "count"}).reset_index(drop=True)
    if freq_threshold:
        cat_expect = cat_expect[cat_expect["name-string"] >= freq_threshold]
        # Note that we may need to skip the 0th element in result (null mapping)
        assert_eq(
            cat_expect,
            cat_result.iloc[1:] if len(cat_result) > len(cat_expect) else cat_result,
            check_index=False,
        )
    else:
        assert_eq(cat_expect, cat_result)

    # Read back from disk
    df_disk = dask_cudf.read_parquet("/".join([str(tmpdir), "processed"]), index=False).compute()
    for col in df_disk:
        assert_eq(result[col], df_disk[col])

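# NOTE: the `dask_cluster` fixture is defined outside this excerpt. A minimal sketch of
# what such a fixture might look like is given below; the worker count and fixture scope
# are assumptions, using dask_cuda.LocalCUDACluster.
import pytest
from dask_cuda import LocalCUDACluster


@pytest.fixture(scope="module")
def dask_cluster():
    # Spin up a small local CUDA cluster for the dask-based workflow tests
    # and tear it down once the module's tests have finished.
    cluster = LocalCUDACluster(n_workers=2)
    yield cluster
    cluster.close()
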
def main(args):
    # Input
    data_path = args.data_path
    out_path = args.out_path
    freq_limit = args.freq_limit
    out_files_per_proc = args.splits
    if args.protocol == "ucx":
        os.environ["UCX_TLS"] = "tcp,cuda_copy,cuda_ipc,sockcm"

    # Use Criteo dataset by default (for now)
    cont_names = (
        args.cont_names.split(",") if args.cont_names else ["I" + str(x) for x in range(1, 14)]
    )
    cat_names = (
        args.cat_names.split(",") if args.cat_names else ["C" + str(x) for x in range(1, 27)]
    )
    label_name = ["label"]

    if args.cat_splits:
        tree_width = {name: int(s) for name, s in zip(cat_names, args.cat_splits.split(","))}
    else:
        tree_width = {col: 1 for col in cat_names}
        if args.cat_names is None:
            # Using Criteo... Use more hash partitions for
            # known high-cardinality columns
            tree_width["C20"] = 8
            tree_width["C1"] = 8
            tree_width["C22"] = 4
            tree_width["C10"] = 4
            tree_width["C21"] = 2
            tree_width["C11"] = 2
            tree_width["C23"] = 2
            tree_width["C12"] = 2

    # Specify categorical caching location
    cat_cache = None
    if args.cat_cache:
        cat_cache = args.cat_cache.split(",")
        if len(cat_cache) == 1:
            cat_cache = cat_cache[0]
        else:
            # If the user is specifying a list of options,
            # they must specify an option for every cat column
            assert len(cat_names) == len(cat_cache)
    if isinstance(cat_cache, str):
        cat_cache = {col: cat_cache for col in cat_names}
    elif isinstance(cat_cache, list):
        cat_cache = {name: c for name, c in zip(cat_names, cat_cache)}
    else:
        # Criteo/DLRM defaults
        cat_cache = {col: "device" for col in cat_names}
        if args.cat_names is None:
            cat_cache["C20"] = "host"
            cat_cache["C1"] = "host"
            # Only need to cache the largest two on a dgx-2
            if args.n_workers < 16:
                cat_cache["C22"] = "host"
                cat_cache["C10"] = "host"

    # Use total device size to calculate args.device_limit_frac
    device_size = device_mem_size(kind="total")
    device_limit = int(args.device_limit_frac * device_size)
    device_pool_size = int(args.device_pool_frac * device_size)
    part_size = int(args.part_mem_frac * device_size)

    # Setup LocalCUDACluster
    if args.protocol == "tcp":
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devs,
            device_memory_limit=device_limit,
            local_directory=args.dask_workspace,
            dashboard_address=":3787",
        )
    else:
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devs,
            enable_nvlink=True,
            device_memory_limit=device_limit,
            local_directory=args.dask_workspace,
            dashboard_address=":3787",
        )
    client = Client(cluster)

    # Setup RMM pool
    if not args.no_rmm_pool:
        setup_rmm_pool(client, device_pool_size)

    # Define Dask NVTabular "Workflow"
    processor = Workflow(
        cat_names=cat_names, cont_names=cont_names, label_name=label_name, client=client
    )
    processor.add_feature([ops.ZeroFill(), ops.LogOp()])
    processor.add_preprocess(
        ops.Categorify(
            out_path=out_path,
            tree_width=tree_width,
            cat_cache=cat_cache,
            freq_threshold=freq_limit,
            on_host=args.cat_on_host,
        )
    )
    processor.finalize()

    dataset = Dataset(data_path, "parquet", part_size=part_size)

    # Execute the dask graph
    runtime = time.time()
    if args.profile is not None:
        with performance_report(filename=args.profile):
            processor.apply(
                dataset,
                shuffle="full" if args.worker_shuffle else "partial",
                out_files_per_proc=out_files_per_proc,
                output_path=out_path,
            )
    else:
        processor.apply(
            dataset,
            shuffle="full" if args.worker_shuffle else "partial",
            out_files_per_proc=out_files_per_proc,
            output_path=out_path,
        )
    runtime = time.time() - runtime

    print("\nDask-NVTabular DLRM/Criteo benchmark")
    print("--------------------------------------")
    print(f"partition size     | {part_size}")
    print(f"protocol           | {args.protocol}")
    print(f"device(s)          | {args.devs}")
    print(f"rmm-pool           | {(not args.no_rmm_pool)}")
    print(f"out_files_per_proc | {args.splits}")
    print(f"worker-shuffle     | {args.worker_shuffle}")
    print("======================================")
    print(f"Runtime[s]         | {runtime}")
    print("======================================\n")

    client.close()

def test_gpu_workflow_api(
    tmpdir, client, df, dataset, gpu_memory_frac, engine, dump, op_columns, use_client
):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        client=client if use_client else None,
    )
    processor.add_feature([ops.ZeroFill(columns=op_columns), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify(cat_cache="host"))
    processor.finalize()

    processor.update_stats(dataset)

    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std - No good right now we have to add all other changes; ZeroFill, Log
    if not op_columns:
        assert math.isclose(get_norms(df.y).mean(), processor.stats["means"]["y"], rel_tol=1e-1)
        assert math.isclose(get_norms(df.y).std(), processor.stats["stds"]["y"], rel_tol=1e-1)
    assert math.isclose(get_norms(df.x).mean(), processor.stats["means"]["x"], rel_tol=1e-1)
    assert math.isclose(get_norms(df.x).std(), processor.stats["stds"]["x"], rel_tol=1e-1)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(processor, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(processor, "name-string")
    # adding the None entry as a string because of move from gpu
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(
        tmpdir, dataset, out_files_per_proc=10, shuffle="partial", apply_ops=True
    )

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)

def test_gpu_workflow(tmpdir, client, df, dataset, gpu_memory_frac, engine, dump):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    config = nvt.workflow.get_new_config()
    config["FE"]["continuous"] = [ops.ZeroFill()]
    config["PP"]["continuous"] = [[ops.ZeroFill(), ops.Normalize()]]
    config["PP"]["categorical"] = [ops.Categorify()]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
        client=client,
    )

    processor.update_stats(dataset)

    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        return gdf

    assert math.isclose(get_norms(df.x).mean(), processor.stats["means"]["x"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.y).mean(), processor.stats["means"]["y"], rel_tol=1e-4)
    # assert math.isclose(get_norms(df.id).mean(),
    #                     processor.stats["means"]["id_ZeroFill_LogOp"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.x).std(), processor.stats["stds"]["x"], rel_tol=1e-3)
    assert math.isclose(get_norms(df.y).std(), processor.stats["stds"]["y"], rel_tol=1e-3)
    # assert math.isclose(get_norms(df.id).std(),
    #                     processor.stats["stds"]["id_ZeroFill_LogOp"], rel_tol=1e-3)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(processor, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(processor, "name-string")
    # adding the None entry as a string because of move from gpu
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(
        tmpdir, dataset, out_files_per_proc=10, shuffle="partial", apply_ops=True
    )

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)

def test_gpu_workflow_api(tmpdir, datasets, dump, gpu_memory_frac, engine, op_columns):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    else:
        df1 = cudf.read_csv(paths[0], header=False, names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=False, names=allcols_csv)[mycols_csv]
    df = cudf.concat([df1, df2], axis=0)
    df["id"] = df["id"].astype("int64")

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
        columns = mycols_pq
    else:
        cat_names = ["name-string"]
        columns = mycols_csv
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        to_cpu=False,
    )
    processor.add_feature([ops.ZeroFill(columns=op_columns), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    data_itr = nvtabular.io.GPUDatasetIterator(
        paths,
        columns=columns,
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
        names=allcols_csv,
    )

    processor.update_stats(data_itr)

    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std - No good right now we have to add all other changes; ZeroFill, Log
    if not op_columns:
        assert math.isclose(get_norms(df.y).mean(), processor.stats["means"]["y"], rel_tol=1e-1)
        assert math.isclose(get_norms(df.y).std(), processor.stats["stds"]["y"], rel_tol=1e-1)
    assert math.isclose(get_norms(df.x).mean(), processor.stats["means"]["x"], rel_tol=1e-1)
    assert math.isclose(get_norms(df.x).std(), processor.stats["stds"]["x"], rel_tol=1e-1)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_to_string()
        cats0 = processor.stats["encoders"]["name-cat"].get_cats().values_to_string()
        # adding the None entry as a string because of move from gpu
        assert cats0 == ["None"] + cats_expected0
    cats_expected1 = df["name-string"].unique().values_to_string()
    cats1 = processor.stats["encoders"]["name-string"].get_cats().values_to_string()
    # adding the None entry as a string because of move from gpu
    assert cats1 == ["None"] + cats_expected1

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(tmpdir, data_itr, nfiles=10, shuffle=True, apply_ops=True)

    data_itr_2 = nvtabular.io.GPUDatasetIterator(
        glob.glob(str(tmpdir) + "/ds_part.*.parquet"),
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
    )

    df_pp = None
    for chunk in data_itr_2:
        # Check for None explicitly to avoid the ambiguous truth value of a DataFrame
        df_pp = chunk if df_pp is None else cudf.concat([df_pp, chunk], axis=0)

    if engine == "parquet":
        assert df_pp["name-cat"].dtype == "int64"
    assert df_pp["name-string"].dtype == "int64"

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
    return processor.ds_exports

def test_gpu_workflow(tmpdir, df, dataset, gpu_memory_frac, engine, dump):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    config = nvt.workflow.get_new_config()
    config["FE"]["continuous"] = [ops.ZeroFill()]
    config["PP"]["continuous"] = [[ops.ZeroFill(), ops.Normalize()]]
    config["PP"]["categorical"] = [ops.Categorify()]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
    )

    processor.update_stats(dataset)

    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        return gdf

    assert math.isclose(get_norms(df.x).mean(), processor.stats["means"]["x"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.y).mean(), processor.stats["means"]["y"], rel_tol=1e-4)
    # assert math.isclose(get_norms(df.id).mean(),
    #                     processor.stats["means"]["id_ZeroFill_LogOp"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.x).std(), processor.stats["stds"]["x"], rel_tol=1e-3)
    assert math.isclose(get_norms(df.y).std(), processor.stats["stds"]["y"], rel_tol=1e-3)
    # assert math.isclose(get_norms(df.id).std(),
    #                     processor.stats["stds"]["id_ZeroFill_LogOp"], rel_tol=1e-3)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_to_string()
        cats0 = processor.stats["encoders"]["name-cat"].get_cats().values_to_string()
        # adding the None entry as a string because of move from gpu
        assert cats0 == ["None"] + cats_expected0
    cats_expected1 = df["name-string"].unique().values_to_string()
    cats1 = processor.stats["encoders"]["name-string"].get_cats().values_to_string()
    # adding the None entry as a string because of move from gpu
    assert cats1 == ["None"] + cats_expected1

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(tmpdir, dataset, nfiles=10, shuffle=True, apply_ops=True)

    data_itr_2 = nvtabular.io.GPUDatasetIterator(
        glob.glob(str(tmpdir) + "/ds_part.*.parquet"),
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
    )

    df_pp = None
    for chunk in data_itr_2:
        # Check for None explicitly to avoid the ambiguous truth value of a DataFrame
        df_pp = chunk if df_pp is None else cudf.concat([df_pp, chunk], axis=0)

    if engine == "parquet":
        assert df_pp["name-cat"].dtype == "int64"
    assert df_pp["name-string"].dtype == "int64"

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
    return processor.ds_exports

def test_hugectr(
    tmpdir, client, df, dataset, output_format, engine, op_columns, num_io_threads, use_client
):
    client = client if use_client else None
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]

    # set variables
    nfiles = 10
    ext = ""
    outdir = tmpdir + "/hugectr"
    os.mkdir(outdir)

    # process data
    processor = nvt.Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_names
    )
    processor.add_feature([ops.ZeroFill(columns=op_columns), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    # apply the workflow and write out the dataset
    processor.apply(
        dataset,
        output_path=outdir,
        out_files_per_proc=nfiles,
        output_format=output_format,
        shuffle=None,
        num_io_threads=num_io_threads,
    )

    # Check for _file_list.txt
    assert os.path.isfile(outdir + "/_file_list.txt")

    # Check for _metadata.json
    assert os.path.isfile(outdir + "/_metadata.json")

    # Check contents of _metadata.json
    data = {}
    col_summary = {}
    with open(outdir + "/_metadata.json", "r") as fil:
        for k, v in json.load(fil).items():
            data[k] = v
    assert "cats" in data
    assert "conts" in data
    assert "labels" in data
    assert "file_stats" in data
    # With a distributed client, each worker writes its own set of output files
    assert len(data["file_stats"]) == (
        nfiles if not client else nfiles * len(client.cluster.workers)
    )
    for cdata in data["cats"] + data["conts"] + data["labels"]:
        col_summary[cdata["index"]] = cdata["col_name"]

    # Check that data files exist
    ext = ""
    if output_format == "parquet":
        ext = "parquet"
    elif output_format == "hugectr":
        ext = "data"
    data_files = [
        os.path.join(outdir, filename) for filename in os.listdir(outdir) if filename.endswith(ext)
    ]

    # Make sure the columns in "_metadata.json" make sense
    if output_format == "parquet":
        # data_files entries are already full paths
        df_check = cudf.read_parquet(data_files[0])
        for i, name in enumerate(df_check.columns):
            if i in col_summary:
                assert col_summary[i] == name
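# NOTE: the tests above only assert that "_file_list.txt" exists. Assuming the
# conventional HugeCTR file-list layout (the first line holds the file count, followed
# by one data-file path per line), which this excerpt does not confirm, its contents
# could be sanity-checked with a hypothetical helper like this:
import os


def check_hugectr_file_list(outdir: str) -> None:
    # Parse the file list and verify that the count matches the listed paths
    # and that every listed data file actually exists on disk.
    with open(os.path.join(outdir, "_file_list.txt"), "r") as fl:
        lines = [line.strip() for line in fl if line.strip()]
    num_files = int(lines[0])
    assert num_files == len(lines) - 1
    for path in lines[1:]:
        assert os.path.isfile(path)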