def test_chaining_2():
    """Chain LambdaOp -> Rename over all columns next to a Categorify branch.

    Verifies that an ``*_isnull`` column is produced for every input column and
    that the transformed ``C`` column keeps at least as many distinct values as
    there are distinct non-null input categories.
    """
    gdf = cudf.DataFrame(
        {
            "A": [1, 2, 2, 9, 6, np.nan, 3],
            "B": [2, np.nan, 4, 7, 7, 2, 5],
            "C": ["a", "b", "c", np.nan, np.nan, "g", "k"],
        }
    )

    cat_names = ["C"]
    cont_names = ["A", "B"]
    label_name = []

    all_features = (
        cat_names + cont_names
        >> ops.LambdaOp(f=lambda col: col.isnull())
        >> ops.Rename(postfix="_isnull")
    )
    cat_features = cat_names >> ops.Categorify()

    workflow = Workflow(all_features + cat_features + label_name)

    dataset = nvt.Dataset(gdf, engine="parquet")
    workflow.fit(dataset)
    result = workflow.transform(dataset).to_ddf().compute()

    assert all(x in list(result.columns) for x in ["A_isnull", "B_isnull", "C_isnull"])

    # This check used to be `assert (<generator expression>)`, which is always
    # truthy and therefore verified nothing. Cardinalities are compared instead
    # of raw values because the output column is expected to hold Categorify's
    # encoded values rather than the original strings.
    # NOTE(review): confirm the exact encoding if a stricter check is wanted.
    assert result["C"].nunique() >= gdf["C"].dropna().nunique()
def test_chaining_3():
    """JoinGroupby output feeding a two-argument LambdaOp yields the expected columns."""
    gdf_test = cudf.DataFrame(
        {
            "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
            "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
            "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
        }
    )

    # Three branches: a Dropna pass over "platform", groupby statistics keyed
    # on "ad_id", and a derived ratio computed from those statistics.
    platform_features = ["platform"] >> ops.Dropna()
    groupby_stats = ops.JoinGroupby(cont_cols=["clicked"], stats=["sum", "count"])
    joined = ["ad_id"] >> groupby_stats
    per_count = joined >> ops.LambdaOp(f=lambda col, gdf: col / gdf["ad_id_count"])
    joined_lambda = per_count >> ops.Rename(postfix="_ctr")

    graph = platform_features + joined + joined_lambda
    workflow = Workflow(graph)

    dataset = nvt.Dataset(gdf_test, engine="parquet")
    workflow.fit(dataset)
    result = workflow.transform(dataset).to_ddf().compute()

    expected_columns = ["ad_id_count", "ad_id_clicked_sum_ctr", "ad_id_clicked_sum"]
    missing = [name for name in expected_columns if name not in result.columns]
    assert not missing
def test_workflow_generate_columns(tmpdir, use_parquet):
    """Fit and transform a workflow whose Categorify input includes a derived column.

    Only checks that fitting and writing the output complete without raising,
    for either a parquet-backed dataset or an in-memory one.
    """
    out_path = str(tmpdir.mkdir("processed"))
    path = str(tmpdir.join("simple.parquet"))

    # Minimal frame holding geo_location codes in the style of the Outbrain data.
    df = nvt.dispatch._make_df({"geo_location": ["US>CA", "CA>BC", "US>TN>659"]})

    # Derive a new 'geo_location_country' column from the first two characters
    # of each geo_location code.
    take_country_code = ops.LambdaOp(f=lambda col: col.str.slice(0, 2))
    country = ["geo_location"] >> take_country_code >> ops.Rename(postfix="_country")

    cat_features = (["geo_location"] + country) >> ops.Categorify()
    workflow = Workflow(cat_features)

    if use_parquet:
        df.to_parquet(path)
        source = path
    else:
        source = df
    dataset = nvt.Dataset(source)

    # Smoke test: succeeding without an exception is the whole assertion.
    workflow.fit(dataset)
    workflow.transform(dataset).to_parquet(out_path)
def test_schema_write_read_dataset(tmpdir, dataset, engine):
    """Schema written next to the parquet output matches the workflow's output schema."""
    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    cat_features = cat_names >> ops.Categorify(cat_cache="host")

    # Continuous pipeline, built one stage at a time.
    filled = cont_names >> ops.FillMissing()
    clipped = filled >> ops.Clip(min_value=0)
    logged = clipped >> ops.LogOp
    cont_features = logged >> norms

    workflow = Workflow(cat_features + cont_features + label_name)
    workflow.fit(dataset)
    workflow.transform(dataset).to_parquet(tmpdir, out_files_per_proc=10)

    # Read back both the schema file and the written parquet files.
    proto_schema = PbTxt_SchemaWriter._read(Path(tmpdir) / "schema.pbtxt")
    written_files = glob.glob(str(tmpdir) + "/*.parquet")
    new_dataset = Dataset(written_files)

    schema_text = str(proto_schema)
    assert """name: "name-cat"\n min: 0\n max: 27\n""" in schema_text
    assert new_dataset.schema == workflow.output_schema
def test_workflow_node_select():
    """Selecting column subsets from a WorkflowNode routes each subset to its own op."""
    df = dispatch._make_df(
        {
            "a": [1, 4, 9, 16, 25],
            "b": [0, 1, 2, 3, 4],
            "c": [25, 16, 9, 4, 1],
        }
    )
    dataset = Dataset(df)

    input_features = WorkflowNode(ColumnSelector(["a", "b", "c"]))
    # pylint: disable=unnecessary-lambda
    sqrt_features = input_features[["a", "c"]] >> (lambda col: np.sqrt(col))
    plus_one_features = input_features["b"] >> (lambda col: col + 1)

    workflow = Workflow(sqrt_features + plus_one_features)
    workflow.fit(dataset)
    transformed = workflow.transform(dataset)
    df_out = transformed.to_ddf().compute(scheduler="synchronous")

    # Build the reference frame column by column, in output order: a, c, b.
    expected = dispatch._make_df()
    for name in ("a", "c"):
        expected[name] = np.sqrt(df[name])
    expected["b"] = df["b"] + 1

    assert_eq(expected, df_out)
def test_workflow_apply(client, use_client, tmpdir, shuffle, apply_offline):
    """Output parquet files carry the dtypes requested through ``dtypes=``.

    ``apply_offline`` is accepted as a parameter but is not forwarded to
    ``to_parquet``.
    """
    out_files_per_proc = 2
    out_path = str(tmpdir.mkdir("processed"))
    path = str(tmpdir.join("simple.parquet"))

    size = 25
    row_group_size = 5

    cont_names = ["cont1", "cont2"]
    cat_names = ["cat1", "cat2"]
    label_name = ["label"]

    # Every column is a simple 0..size-1 ramp; only the dtype differs.
    columns = {name: np.arange(size, dtype=np.float64) for name in cont_names}
    columns.update({name: np.arange(size, dtype=np.int32) for name in cat_names})
    columns.update({name: np.arange(size, dtype=np.float64) for name in label_name})
    df = pd.DataFrame(columns)
    df.to_parquet(path, row_group_size=row_group_size, engine="pyarrow")

    dataset = nvt.Dataset(path, engine="parquet", row_groups_per_part=1)

    cat_features = cat_names >> ops.Categorify()
    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp

    workflow = Workflow(
        cat_features + cont_features + label_name,
        client=client if use_client else None,
    )
    workflow.fit(dataset)

    # Force dtypes on write: float32 for features, int64 for the label.
    dict_dtypes = {
        **dict.fromkeys(cont_names, np.float32),
        **dict.fromkeys(cat_names, np.float32),
        **dict.fromkeys(label_name, np.int64),
    }

    workflow.transform(dataset).to_parquet(
        output_path=out_path,
        shuffle=shuffle,
        out_files_per_proc=out_files_per_proc,
        dtypes=dict_dtypes,
    )

    # Every written file must carry exactly the requested dtypes.
    written = glob.glob(os.path.join(out_path, "*.parquet"))
    for filename in written:
        gdf = cudf.io.read_parquet(filename)
        assert dict(gdf.dtypes) == dict_dtypes
def test_workflow_input_output_dtypes():
    """A fitted workflow reports dtypes only for the columns it reads and emits."""
    frame = cudf.DataFrame(
        {
            "genre": ["drama", "comedy"],
            "user": ["a", "b"],
            "unneeded": [1, 2],
        }
    )

    # One combined (genre, user) group plus "genre" on its own.
    column_groups = [["genre", "user"], "genre"]
    features = column_groups >> ops.Categorify(encode_type="combo")

    workflow = Workflow(features)
    workflow.fit(Dataset(frame))

    input_dtypes = workflow.input_dtypes
    assert "unneeded" not in input_dtypes
    assert set(input_dtypes.keys()) == {"genre", "user"}
    assert set(workflow.output_dtypes.keys()) == {"genre_user", "genre"}
def test_fit_simple():
    """FillMedian followed by squaring gives the expected values on a cudf frame.

    NOTE(review): a second ``test_fit_simple`` is defined further down in this
    file; the later definition replaces this one at import time, so this
    cudf-specific variant is likely never collected. Confirm, then rename or
    remove one of them.
    """
    raw = cudf.DataFrame(
        {
            "x": [0, 1, 2, None, 0, 1, 2],
            "y": [None, 3, 4, 5, 3, 4, 5],
        }
    )
    dataset = Dataset(raw)

    filled = ["x", "y"] >> ops.FillMedian()
    squared = filled >> (lambda x: x * x)
    workflow = Workflow(squared)
    workflow.fit(dataset)

    transformed = workflow.transform(dataset).to_ddf().compute()

    expected = cudf.DataFrame(
        {
            "x": [0, 1, 4, 1, 0, 1, 4],
            "y": [16, 9, 16, 25, 9, 16, 25],
        }
    )
    assert_eq(expected, transformed)
def test_workflow_transform_ddf_dtypes():
    """Pass-through columns keep their dtypes and the transformed ddf stays sortable."""
    # Source data: the cudf timeseries sample split over two partitions.
    source_df = cudf.datasets.timeseries().reset_index()
    source_ddf = dask_cudf.from_cudf(source_df, npartitions=2)
    dataset = Dataset(source_ddf)

    # Only "id" goes through an operator; the rest are passed through untouched.
    passthrough_cols = ["name", "x", "y", "timestamp"]
    normalized_id = ["id"] >> ops.Normalize()
    workflow = Workflow(passthrough_cols + normalized_id)
    workflow.fit(dataset)
    transformed_ddf = workflow.transform(dataset).to_ddf()

    # Untransformed columns must retain their original dtypes.
    original_dtypes = source_ddf.dtypes
    transformed_dtypes = transformed_ddf.dtypes
    for name in passthrough_cols:
        assert_eq(original_dtypes[name], transformed_dtypes[name])

    # A follow-up dask-cudf sort used to raise because of dtype issues; it
    # must now run to completion.
    transformed_ddf.sort_values(["id", "timestamp"]).compute()
def test_fit_simple():
    """FillMedian followed by squaring gives the expected values on a dispatch-built frame."""
    raw = nvt.dispatch._make_df(
        {
            "x": [0, 1, 2, None, 0, 1, 2],
            "y": [None, 3, 4, 5, 3, 4, 5],
        }
    )
    dataset = Dataset(raw)

    filled = ["x", "y"] >> ops.FillMedian()
    workflow = Workflow(filled >> (lambda x: x * x))
    workflow.fit(dataset)
    transformed = workflow.transform(dataset).to_ddf().compute()

    expected = nvt.dispatch._make_df(
        {
            "x": [0, 1, 4, 1, 0, 1, 4],
            "y": [16, 9, 16, 25, 9, 16, 25],
        }
    )

    # Without a GPU, cast the result to the reference dtypes before comparing.
    if not HAS_GPU:
        for name in ("x", "y"):
            transformed[name] = transformed[name].astype(expected[name].dtype)

    assert_eq(expected, transformed)
def test_gpu_workflow_api(
    tmpdir, client, df, dataset, gpu_memory_frac, engine, dump, use_client
):
    """End-to-end workflow check: fitted statistics, categories and written output.

    Fits Categorify plus a FillMissing -> Clip -> LogOp -> Normalize chain,
    optionally round-trips the workflow through save/load, and then validates
    the Normalize statistics, the fitted categories and the parquet output.
    """
    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]
    active_client = client if use_client else None

    norms = ops.Normalize()
    cat_features = cat_names >> ops.Categorify(cat_cache="host")
    cont_features = (
        cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp >> norms
    )

    workflow = Workflow(cat_features + cont_features + label_name, client=active_client)
    workflow.fit(dataset)

    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        # Drop the fitted instance before loading the saved copy.
        workflow = None
        workflow = Workflow.load(workflow_dir, client=active_client)

    def get_norms(tar: cudf.Series):
        # Reference preprocessing: fill nulls with 0, zero out negatives, log1p.
        filled = tar.fillna(0)
        non_negative = filled * (filled >= 0).astype("int")
        return np.log(non_negative + 1)

    # Compare mean and std from Normalize against the reference pipeline,
    # within a loose relative tolerance.
    for col in ("y", "x"):
        reference = get_norms(df[col])
        assert math.isclose(reference.mean(), norms.means[col], rel_tol=1e-1)
        assert math.isclose(reference.std(), norms.stds[col], rel_tol=1e-1)

    # Fitted categories must be the unique input values preceded by a None entry.
    for name in cat_names:
        expected_cats = df[name].unique().values_host
        fitted_cats = get_cats(workflow, name)
        assert fitted_cats.tolist() == [None] + expected_cats.tolist()

    # Write a new "shuffled" and "processed" dataset.
    workflow.transform(dataset).to_parquet(
        tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
    )

    written_files = glob.glob(str(tmpdir) + "/*.parquet")
    dataset_2 = Dataset(written_files, part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    # Categorical columns must come back integer-typed.
    for name in cat_names:
        assert is_integer_dtype(df_pp[name].dtype)

    num_rows, _num_row_groups, _col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata"
    )
    assert num_rows == len(df_pp)
def main(args):
    """Multi-GPU Criteo/DLRM Preprocessing Benchmark

    This benchmark is designed to measure the time required to preprocess
    the Criteo (1TB) dataset for Facebook’s DLRM model.  The user must specify
    the path of the raw dataset (using the `--data-path` flag), as well as the
    output directory for all temporary/final data (using the `--out-path` flag)

    Example Usage
    -------------

    python dask-nvtabular-criteo-benchmark.py
                        --data-path /path/to/criteo_parquet --out-path /out/dir/

    Dataset Requirements (Parquet)
    ------------------------------

    This benchmark is designed with a parquet-formatted dataset in mind.
    While a CSV-formatted dataset can be processed by NVTabular, converting
    to parquet will yield significantly better performance.  To convert your
    dataset, try using the `optimize_criteo.ipynb` notebook (also located
    in `NVTabular/examples/`)

    For a detailed parameter overview see `NVTabular/examples/MultiGPUBench.md`

    Parameters
    ----------
    args
        Object exposing the parsed options as attributes. The function reads
        ``data_path``, ``out_path``, ``freq_limit``, ``out_files_per_proc``,
        ``high_cards``, ``dashboard_port``, ``protocol``, ``cont_names``,
        ``cat_names``, ``tree_width``, ``cat_cache_high``, ``cat_cache_low``,
        ``device_limit_frac``, ``device_pool_frac``, ``part_mem_frac``,
        ``shuffle``, ``devices``, ``n_workers``, ``normalize``,
        ``cats_on_device``, ``profile`` and ``num_io_threads``.

    Notes
    -----
    The Dask client is closed in a ``finally`` clause, so it is released even
    when fitting or writing the output raises.
    """

    # Input
    data_path = args.data_path[:-1] if args.data_path[-1] == "/" else args.data_path
    freq_limit = args.freq_limit
    out_files_per_proc = args.out_files_per_proc
    high_card_columns = args.high_cards.split(",")
    dashboard_port = args.dashboard_port
    if args.protocol == "ucx":
        # Keep a user-provided UCX_TLS; otherwise install the default list.
        os.environ.setdefault("UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")

    # Cleanup output directory
    base_dir = args.out_path[:-1] if args.out_path[-1] == "/" else args.out_path
    dask_workdir = os.path.join(base_dir, "workdir")
    output_path = os.path.join(base_dir, "output")
    stats_path = os.path.join(base_dir, "stats")
    setup_dirs(base_dir, dask_workdir, output_path, stats_path)

    # Use Criteo dataset by default (for now)
    cont_names = (
        args.cont_names.split(",") if args.cont_names else ["I" + str(x) for x in range(1, 14)]
    )
    cat_names = (
        args.cat_names.split(",") if args.cat_names else ["C" + str(x) for x in range(1, 27)]
    )
    label_name = ["label"]

    # Specify Categorify/GroupbyStatistics options: columns listed in
    # --high-cards get the configured tree width and "high" cache setting,
    # every other column gets a tree width of 1 and the "low" cache setting.
    tree_width = {}
    cat_cache = {}
    for col in cat_names:
        if col in high_card_columns:
            tree_width[col] = args.tree_width
            cat_cache[col] = args.cat_cache_high
        else:
            tree_width[col] = 1
            cat_cache[col] = args.cat_cache_low

    # Use total device size to calculate args.device_limit_frac
    device_size = device_mem_size(kind="total")
    device_limit = int(args.device_limit_frac * device_size)
    device_pool_size = int(args.device_pool_frac * device_size)
    part_size = int(args.part_mem_frac * device_size)

    # Parse shuffle option
    shuffle = None
    if args.shuffle == "PER_WORKER":
        shuffle = nvt_io.Shuffle.PER_WORKER
    elif args.shuffle == "PER_PARTITION":
        shuffle = nvt_io.Shuffle.PER_PARTITION

    # Check if any device memory is already occupied
    for dev in args.devices.split(","):
        fmem = _pynvml_mem_size(kind="free", index=int(dev))
        used = (device_size - fmem) / 1e9
        if used > 1.0:
            warnings.warn(f"BEWARE - {used} GB is already occupied on device {int(dev)}!")

    # Setup LocalCUDACluster. Both protocols share every option except
    # enable_nvlink, which is only passed for non-TCP protocols.
    cluster_kwargs = {
        "protocol": args.protocol,
        "n_workers": args.n_workers,
        "CUDA_VISIBLE_DEVICES": args.devices,
        "device_memory_limit": device_limit,
        "local_directory": dask_workdir,
        "dashboard_address": ":" + dashboard_port,
    }
    if args.protocol != "tcp":
        cluster_kwargs["enable_nvlink"] = True
    cluster = LocalCUDACluster(**cluster_kwargs)
    client = Client(cluster)

    try:
        # Setup RMM pool
        if args.device_pool_frac > 0.01:
            setup_rmm_pool(client, device_pool_size)

        # Define Dask NVTabular "Workflow"
        if args.normalize:
            cont_features = cont_names >> ops.FillMissing() >> ops.Normalize()
        else:
            cont_features = (
                cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp()
            )

        cat_features = cat_names >> ops.Categorify(
            out_path=stats_path,
            tree_width=tree_width,
            cat_cache=cat_cache,
            freq_threshold=freq_limit,
            search_sorted=not freq_limit,
            on_host=not args.cats_on_device,
        )
        processor = Workflow(cat_features + cont_features + label_name, client=client)

        dataset = Dataset(data_path, "parquet", part_size=part_size)

        def _transform_and_write():
            # Single definition of the write step, shared by the profiled
            # and unprofiled paths below.
            processor.transform(dataset).to_parquet(
                output_path=output_path,
                num_threads=args.num_io_threads,
                shuffle=shuffle,
                out_files_per_proc=out_files_per_proc,
            )

        # Execute the dask graph
        start = time.time()

        processor.fit(dataset)

        if args.profile is not None:
            with performance_report(filename=args.profile):
                _transform_and_write()
        else:
            _transform_and_write()
        runtime = time.time() - start

        print("\nDask-NVTabular DLRM/Criteo benchmark")
        print("--------------------------------------")
        print(f"partition size | {part_size}")
        print(f"protocol | {args.protocol}")
        print(f"device(s) | {args.devices}")
        print(f"rmm-pool-frac | {(args.device_pool_frac)}")
        print(f"out-files-per-proc | {args.out_files_per_proc}")
        print(f"num_io_threads | {args.num_io_threads}")
        print(f"shuffle | {args.shuffle}")
        print(f"cats-on-device | {args.cats_on_device}")
        print("======================================")
        print(f"Runtime[s] | {runtime}")
        print("======================================\n")
    finally:
        # Release the client even if the benchmark fails part-way through.
        client.close()
def test_gpu_workflow_config(
    tmpdir, client, df, dataset, gpu_memory_frac, engine, dump, replace
):
    """End-to-end workflow check with either in-place or renamed continuous outputs.

    With ``replace`` the FillMissing -> LogOp result is normalized under the
    original column names; otherwise it is renamed with a postfix and
    normalized together with the untouched continuous columns.
    """
    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    # Postfix carried by the normalized columns ("" when replacing in place).
    concat_ops = "" if replace else "_FillMissing_1_LogOp_1"

    norms = ops.Normalize()
    cat_features = cat_names >> ops.Categorify()

    logged = cont_names >> ops.FillMissing() >> ops.LogOp
    if replace:
        cont_features = logged >> norms
    else:
        fillmissing_logop = logged >> ops.Rename(postfix=concat_ops)
        cont_features = (cont_names + fillmissing_logop) >> norms

    workflow = Workflow(cat_features + cont_features + label_name, client=client)
    workflow.fit(dataset)

    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        # Drop the fitted instance before loading the saved copy.
        workflow = None
        workflow = Workflow.load(workflow_dir, client=client)

    def get_norms(tar):
        # Reference preprocessing: fill nulls with the median, then log1p.
        median = tar.dropna().quantile(0.5, interpolation="linear")
        return np.log(tar.fillna(median) + 1)

    # Compare Normalize's statistics against the reference within a loose
    # relative tolerance: means first, then standard deviations.
    for col in ("x", "y"):
        assert math.isclose(
            get_norms(df[col]).mean(), norms.means[col + concat_ops], rel_tol=1e-1
        )
    for col in ("x", "y"):
        assert math.isclose(
            get_norms(df[col]).std(), norms.stds[col + concat_ops], rel_tol=1e-1
        )

    # Fitted categories must be drawn from the input uniques plus a None
    # entry, and have exactly that many entries.
    for name in cat_names:
        uniques = df[name].unique()
        cats_expected = uniques.values_host if HAS_GPU else uniques
        cats_fitted = get_cats(workflow, name)
        allowed = [None] + sorted(cats_expected.tolist())
        assert all(cat in allowed for cat in cats_fitted.tolist())
        assert len(cats_fitted.tolist()) == len(cats_expected.tolist() + [None])

    # Write a new "shuffled" and "processed" dataset.
    workflow.transform(dataset).to_parquet(
        tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
    )

    written_files = glob.glob(str(tmpdir) + "/*.parquet")
    dataset_2 = Dataset(written_files, part_mem_fraction=gpu_memory_frac)
    df_pp = nvt.dispatch._concat(list(dataset_2.to_iter()), axis=0)

    # Categorical columns must come back integer-typed.
    for name in cat_names:
        assert is_integer_dtype(df_pp[name].dtype)

    num_rows, _num_row_groups, _col_names = nvt.dispatch._read_parquet_metadata(
        str(tmpdir) + "/_metadata"
    )
    assert num_rows == len(df_pp)
def _recreate_empty_dir(path):
    """Remove ``path`` if it is an existing directory, then create it empty."""
    if os.path.isdir(path):
        shutil.rmtree(path)
    os.makedirs(path)


def nvt_etl(
    data_path,
    out_path,
    devices,
    protocol,
    device_limit_frac,
    device_pool_frac,
    part_mem_frac,
    cats,
    conts,
    labels,
    out_files_per_proc,
):
    """Run the NVTabular ETL over a directory of parquet files.

    The files directly inside ``data_path`` are split 90/10 into train and
    validation sets, a Categorify + FillMissing/Clip/LogOp workflow is fitted
    on the train set, and both sets are written under ``<out_path>/output``.

    Parameters
    ----------
    data_path : str
        Directory containing the input files; one trailing "/" is ignored.
    out_path : str
        Base directory for the ``workdir``, ``stats`` and ``output``
        sub-directories. Each of those is deleted and recreated on every call.
    devices : str
        Comma-separated device indices.
    protocol
        Forwarded to ``managed_client``.
    device_limit_frac, device_pool_frac, part_mem_frac : float
        Fractions of the total device memory used for the device memory
        limit, the RMM pool size and the dataset partition size.
    cats, conts, labels : list of str
        Categorical, continuous and label column names.
    out_files_per_proc : int
        Forwarded to ``to_parquet``.

    Returns
    -------
    Workflow
        The fitted workflow, also saved to ``<out_path>/output/workflow``.
    """
    # Set up data paths
    input_path = data_path[:-1] if data_path[-1] == "/" else data_path
    base_dir = out_path[:-1] if out_path[-1] == "/" else out_path
    dask_workdir = os.path.join(base_dir, "workdir")
    output_path = os.path.join(base_dir, "output")
    stats_path = os.path.join(base_dir, "stats")
    output_train_dir = os.path.join(output_path, "train/")
    output_valid_dir = os.path.join(output_path, "valid/")

    # Start from clean Dask worker, stats and output directories
    for directory in (dask_workdir, stats_path, output_path):
        _recreate_empty_dir(directory)
    os.mkdir(output_train_dir)
    os.mkdir(output_valid_dir)

    # Get train/valid files. os.listdir returns entries in arbitrary order,
    # so sort them to make the 90/10 split reproducible across runs.
    input_files = sorted(
        os.path.join(input_path, f)
        for f in os.listdir(input_path)
        if os.path.isfile(os.path.join(input_path, f))
    )
    n_files = int(len(input_files) * 0.9)
    train_paths = input_files[:n_files]
    valid_paths = input_files[n_files:]

    # Force dtypes for HugeCTR usage. On a name collision the later group
    # wins, the same as assigning into the dict group by group.
    dict_dtypes = {
        **dict.fromkeys(cats, np.int64),
        **dict.fromkeys(conts, np.float32),
        **dict.fromkeys(labels, np.float32),
    }

    # Use total device size to calculate args.device_limit_frac
    device_size = device_mem_size(kind="total")
    device_limit = int(device_limit_frac * device_size)
    device_pool_size = int(device_pool_frac * device_size)
    part_size = int(part_mem_frac * device_size)

    # Check if any device memory is already occupied
    for dev in devices.split(","):
        fmem = _pynvml_mem_size(kind="free", index=int(dev))
        used = (device_size - fmem) / 1e9
        if used > 1.0:
            warnings.warn(f"BEWARE - {used} GB is already occupied on device {int(dev)}!")

    # Setup dask cluster and perform ETL
    with managed_client(dask_workdir, devices, device_limit, protocol) as client:
        # Setup RMM pool
        if device_pool_frac > 0.01:
            setup_rmm_pool(client, device_pool_size)

        # Define Dask NVTabular "Workflow"
        cont_features = conts >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp()
        cat_features = cats >> ops.Categorify(out_path=stats_path, max_size=10000000)
        workflow = Workflow(cat_features + cont_features + labels, client=client)

        train_dataset = Dataset(train_paths, engine="parquet", part_size=part_size)
        valid_dataset = Dataset(valid_paths, engine="parquet", part_size=part_size)

        # Statistics come from the training split only.
        workflow.fit(train_dataset)

        # Both splits are written with identical options, train first.
        for split_dataset, split_dir in (
            (train_dataset, output_train_dir),
            (valid_dataset, output_valid_dir),
        ):
            workflow.transform(split_dataset).to_parquet(
                output_path=split_dir,
                shuffle=nvt_io.Shuffle.PER_WORKER,
                dtypes=dict_dtypes,
                cats=cats,
                conts=conts,
                labels=labels,
                out_files_per_proc=out_files_per_proc,
            )

        workflow.save(os.path.join(output_path, "workflow"))

        return workflow