def main(args):
    """Multi-GPU Criteo/DLRM Preprocessing Benchmark

    This benchmark is designed to measure the time required to preprocess
    the Criteo (1TB) dataset for Facebook's DLRM model.  The user must specify
    the path of the raw dataset (using the `--data-path` flag), as well as the
    output directory for all temporary/final data (using the `--out-path` flag)

    Example Usage
    -------------

    python dask-nvtabular-criteo-benchmark.py
        --data-path /path/to/criteo_parquet --out-path /out/dir/

    Dataset Requirements (Parquet)
    ------------------------------

    This benchmark is designed with a parquet-formatted dataset in mind.
    While a CSV-formatted dataset can be processed by NVTabular, converting
    to parquet will yield significantly better performance.  To convert your
    dataset, try using the `optimize_criteo.ipynb` notebook (also located
    in `NVTabular/examples/`)

    For a detailed parameter overview see `NVTabular/examples/MultiGPUBench.md`

    Parameters
    ----------
    args
        Parsed options object.  The attributes read by this function are:
        ``data_path``, ``out_path``, ``freq_limit``, ``out_files_per_proc``,
        ``high_cards`` (comma-separated column names), ``dashboard_port``,
        ``protocol``, ``cont_names`` / ``cat_names`` (optional
        comma-separated column names), ``tree_width``, ``cat_cache_high``,
        ``cat_cache_low``, ``device_limit_frac``, ``device_pool_frac``,
        ``part_mem_frac``, ``shuffle``, ``devices`` (comma-separated device
        indices), ``n_workers``, ``normalize``, ``cats_on_device``,
        ``profile`` and ``num_io_threads``.

    Notes
    -----
    The ``workdir``, ``output`` and ``stats`` sub-directories of
    ``args.out_path`` are deleted and re-created on every call.  The Dask
    client and the local CUDA cluster are always shut down before this
    function returns, including when preprocessing raises.
    """
    # Input
    data_path = args.data_path
    freq_limit = args.freq_limit
    out_files_per_proc = args.out_files_per_proc
    high_card_columns = set(args.high_cards.split(","))
    dashboard_port = args.dashboard_port
    if args.protocol == "ucx":
        # Keep a user-provided UCX_TLS; otherwise fall back to this default.
        os.environ.setdefault("UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")

    # Cleanup output directory
    base_dir = args.out_path
    dask_workdir = os.path.join(base_dir, "workdir")
    output_path = os.path.join(base_dir, "output")
    stats_path = os.path.join(base_dir, "stats")
    # makedirs also creates missing parents, which a bare mkdir would not.
    os.makedirs(base_dir, exist_ok=True)
    for dir_path in (dask_workdir, output_path, stats_path):
        if os.path.isdir(dir_path):
            shutil.rmtree(dir_path)
        os.mkdir(dir_path)

    # Use Criteo dataset by default (for now)
    if args.cont_names:
        cont_names = args.cont_names.split(",")
    else:
        cont_names = ["I" + str(x) for x in range(1, 14)]
    if args.cat_names:
        cat_names = args.cat_names.split(",")
    else:
        cat_names = ["C" + str(x) for x in range(1, 27)]
    label_name = ["label"]

    # Specify Categorify/GroupbyStatistics options
    tree_width = {}
    cat_cache = {}
    for col in cat_names:
        if col in high_card_columns:
            tree_width[col] = args.tree_width
            cat_cache[col] = args.cat_cache_high
        else:
            tree_width[col] = 1
            cat_cache[col] = args.cat_cache_low

    # Convert the fractional settings into byte counts using the total
    # device memory size.
    device_size = device_mem_size(kind="total")
    device_limit = int(args.device_limit_frac * device_size)
    device_pool_size = int(args.device_pool_frac * device_size)
    part_size = int(args.part_mem_frac * device_size)

    # Parse shuffle option (any other value means "no shuffle")
    shuffle = None
    if args.shuffle == "PER_WORKER":
        shuffle = nvt_io.Shuffle.PER_WORKER
    elif args.shuffle == "PER_PARTITION":
        shuffle = nvt_io.Shuffle.PER_PARTITION

    # Check if any device memory is already occupied
    for dev in args.devices.split(","):
        fmem = _pynvml_mem_size(kind="free", index=int(dev))
        used = (device_size - fmem) / 1e9
        if used > 1.0:
            warnings.warn(
                f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
            )

    # Setup LocalCUDACluster
    cluster_kwargs = dict(
        protocol=args.protocol,
        n_workers=args.n_workers,
        CUDA_VISIBLE_DEVICES=args.devices,
        device_memory_limit=device_limit,
        local_directory=dask_workdir,
        # f-string so that both "8787" and 8787 are accepted.
        dashboard_address=f":{dashboard_port}",
    )
    if args.protocol != "tcp":
        cluster_kwargs["enable_nvlink"] = True
    cluster = LocalCUDACluster(**cluster_kwargs)

    client = None
    try:
        client = Client(cluster)

        # Setup RMM pool
        if args.device_pool_frac > 0.01:
            setup_rmm_pool(client, device_pool_size)

        # Define Dask NVTabular "Workflow"
        processor = Workflow(
            cat_names=cat_names,
            cont_names=cont_names,
            label_name=label_name,
            client=client,
        )
        if args.normalize:
            processor.add_feature([ops.FillMissing(), ops.Normalize()])
        else:
            processor.add_feature(
                [ops.FillMissing(), ops.Clip(min_value=0), ops.LogOp()]
            )
        processor.add_preprocess(
            ops.Categorify(
                out_path=stats_path,
                tree_width=tree_width,
                cat_cache=cat_cache,
                freq_threshold=freq_limit,
                search_sorted=not freq_limit,
                on_host=not args.cats_on_device,
            )
        )
        processor.finalize()

        dataset = Dataset(data_path, "parquet", part_size=part_size)

        # Execute the dask graph (optionally under a Dask performance report)
        apply_kwargs = dict(
            shuffle=shuffle,
            out_files_per_proc=out_files_per_proc,
            output_path=output_path,
            num_io_threads=args.num_io_threads,
        )
        runtime = time.time()
        if args.profile is not None:
            with performance_report(filename=args.profile):
                processor.apply(dataset, **apply_kwargs)
        else:
            processor.apply(dataset, **apply_kwargs)
        runtime = time.time() - runtime

        print("\nDask-NVTabular DLRM/Criteo benchmark")
        print("--------------------------------------")
        print(f"partition size | {part_size}")
        print(f"protocol | {args.protocol}")
        print(f"device(s) | {args.devices}")
        print(f"rmm-pool-frac | {(args.device_pool_frac)}")
        print(f"out-files-per-proc | {args.out_files_per_proc}")
        print(f"num_io_threads | {args.num_io_threads}")
        print(f"shuffle | {args.shuffle}")
        print(f"cats-on-device | {args.cats_on_device}")
        print("======================================")
        print(f"Runtime[s] | {runtime}")
        print("======================================\n")
    finally:
        # Always release the client and the worker processes, even when the
        # benchmark fails part-way through.
        try:
            if client is not None:
                client.close()
        finally:
            cluster.close()
def run_preprocessing(input_path, base_dir, num_train_days, num_val_days, num_gpus):
    """Fit an NVTabular workflow on Criteo day files and write train/valid data.

    The input directory must contain files named ``day_<N>.parquet``.  Days
    ``0 .. num_train_days - 1`` are used for training and the following
    ``num_val_days`` days for validation.

    Parameters
    ----------
    input_path : str
        Directory holding the ``day_<N>.parquet`` inputs.
    base_dir : str
        Directory under which ``test_dask/workdir``, ``test_dask/output`` and
        ``test_dask/stats`` are (re-)created.  Existing contents of those
        three directories are deleted.
    num_train_days : int
        Number of leading day files used for fitting/training output.
    num_val_days : int
        Number of day files, following the training days, used for validation.
    num_gpus : sequence
        Device ids to use (one Dask worker per entry).  A Dask-CUDA cluster is
        only deployed when more than one id is given.

    Raises
    ------
    FileNotFoundError
        If any of the required day files is missing.  This is checked before
        any existing output is removed.
    """
    # Define paths to save artifacts
    dask_workdir = os.path.join(base_dir, "test_dask/workdir")
    output_path = os.path.join(base_dir, "test_dask/output")
    stats_path = os.path.join(base_dir, "test_dask/stats")

    # This requires the data to be in this specific format eg. day_0.parquet,
    # day_1.parquet etc.
    fname = 'day_{}.parquet'
    train_paths = [
        os.path.join(input_path, fname.format(day))
        for day in range(num_train_days)
    ]
    valid_paths = [
        os.path.join(input_path, fname.format(day))
        for day in range(num_train_days, num_train_days + num_val_days)
    ]

    # Fail fast, before wiping previous outputs or starting a cluster, when a
    # requested day file is not there.
    missing = [
        path for path in train_paths + valid_paths if not os.path.exists(path)
    ]
    if missing:
        raise FileNotFoundError(
            f"Missing input day files under {input_path!r}: {missing}"
        )

    logging.info(f"Dask Workdir: {dask_workdir}")
    logging.info(f"Output Path: {output_path}")

    # Make sure we have a clean worker, stats and output space for Dask
    for dir_path in (dask_workdir, stats_path, output_path):
        if os.path.isdir(dir_path):
            shutil.rmtree(dir_path)
        os.makedirs(dir_path)
    logging.info("Created output directories..")

    logging.info(f"Training data: {train_paths}")
    logging.info(f"Validation data: {valid_paths}")

    # Deploy a Dask Distributed Cluster
    # Single-Machine Multi-GPU Cluster
    protocol = "tcp"  # "tcp" or "ucx"
    # Select devices to place workers
    visible_devices = ",".join([str(n) for n in num_gpus])
    device_limit_frac = 0.4  # Spill GPU-Worker memory to host at this limit.
    device_pool_frac = 0.5
    # Desired maximum size of each partition as a fraction of total GPU memory.
    part_mem_frac = 0.05

    # Convert the fractional settings into byte counts using the total
    # device memory size.
    device_size = device_mem_size(kind="total")
    part_size = int(part_mem_frac * device_size)
    logging.info(f"Partition size: {part_size}")

    # Deploy Dask Distributed cluster only if asked for multiple GPUs
    multi_gpu = len(num_gpus) > 1
    cluster = None
    client = None
    try:
        if multi_gpu:
            device_limit = int(device_limit_frac * device_size)
            device_pool_size = int(device_pool_frac * device_size)

            logging.info("Checking if any device memory is already occupied..")
            # Check if any device memory is already occupied
            device_ids = visible_devices.split(",")
            for dev in device_ids:
                fmem = _pynvml_mem_size(kind="free", index=int(dev))
                used = (device_size - fmem) / 1e9
                if used > 1.0:
                    warnings.warn(
                        f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
                    )

            cluster = LocalCUDACluster(
                protocol=protocol,
                n_workers=len(device_ids),
                CUDA_VISIBLE_DEVICES=visible_devices,
                device_memory_limit=device_limit,
                local_directory=dask_workdir,
            )

            logging.info("Create the distributed client..")
            # Create the distributed client
            client = Client(cluster)

            logging.info("Initialize memory pools..")

            # Initialize RMM pool on ALL workers
            def _rmm_pool():
                rmm.reinitialize(
                    pool_allocator=True,
                    # RMM may require the pool size to be a multiple of 256.
                    initial_pool_size=(device_pool_size // 256) * 256,
                )

            client.run(_rmm_pool)

        # Preprocessing
        CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1, 14)]
        CATEGORICAL_COLUMNS = ['C' + str(x) for x in range(1, 27)]
        LABEL_COLUMNS = ['label']

        cat_features = CATEGORICAL_COLUMNS >> Categorify(out_path=stats_path)
        cont_features = (
            CONTINUOUS_COLUMNS >> FillMissing() >> Clip(min_value=0) >> Normalize()
        )
        features = cat_features + cont_features + LABEL_COLUMNS

        logging.info("Defining a workflow object..")
        if multi_gpu:
            workflow = nvt.Workflow(features, client=client)
        else:
            workflow = nvt.Workflow(features)

        # Output dtypes: int64 categoricals, float32 continuous and labels.
        dict_dtypes = {}
        for col in CATEGORICAL_COLUMNS:
            dict_dtypes[col] = np.int64
        for col in CONTINUOUS_COLUMNS:
            dict_dtypes[col] = np.float32
        for col in LABEL_COLUMNS:
            dict_dtypes[col] = np.float32

        train_dataset = nvt.Dataset(
            train_paths, engine='parquet', part_size=part_size
        )
        valid_dataset = nvt.Dataset(
            valid_paths, engine='parquet', part_size=part_size
        )

        output_train_dir = os.path.join(output_path, 'train/')
        logging.info(f"Creating train/ directory at: {output_train_dir}")
        if not os.path.exists(output_train_dir):
            os.makedirs(output_train_dir)

        output_valid_dir = os.path.join(output_path, 'valid/')
        logging.info(f"Creating valid/ directory at: {output_valid_dir}")
        if not os.path.exists(output_valid_dir):
            os.makedirs(output_valid_dir)

        logging.info("Workflow Fit..")
        workflow.fit(train_dataset)

        logging.info("Transform Training data..")
        workflow.transform(train_dataset).to_parquet(
            output_path=output_train_dir,
            shuffle=nvt.io.Shuffle.PER_PARTITION,
            dtypes=dict_dtypes,
            cats=CATEGORICAL_COLUMNS,
            conts=CONTINUOUS_COLUMNS,
            labels=LABEL_COLUMNS,
        )

        logging.info("Transform Validation data..")
        workflow.transform(valid_dataset).to_parquet(
            output_path=output_valid_dir,
            dtypes=dict_dtypes,
            cats=CATEGORICAL_COLUMNS,
            conts=CONTINUOUS_COLUMNS,
            labels=LABEL_COLUMNS,
        )

        # use these printed out cardinalities list in the "slot_size_array"
        # in the HugeCTR training "dcn_parquet.json"
        # The embedding sizes are looked up once rather than once per column.
        embedding_sizes = nvt.ops.get_embedding_sizes(workflow)
        cardinalities = [embedding_sizes[col][0] for col in CATEGORICAL_COLUMNS]

        logging.info(
            f"Cardinalities for configuring slot_size_array: {cardinalities}")

        logging.info(f"Saving workflow object at: {output_path + '/workflow'}")
        workflow.save(output_path + '/workflow')

        logging.info("Done!")
    finally:
        # Release the client and worker processes even if preprocessing fails.
        try:
            if client is not None:
                client.close()
        finally:
            if cluster is not None:
                cluster.close()
def run_preprocessing(input_train_path, workflow_path, output_path, dask_workdir, num_gpus):
    """Apply a previously saved NVTabular workflow to a directory of parquet files.

    Parameters
    ----------
    input_train_path : str
        Directory whose ``*.parquet`` entries are transformed.
    workflow_path : str
        Location of the saved workflow, loaded with ``nvt.Workflow.load``.
    output_path : str
        Directory under which the transformed data is written, in a
        ``train/`` sub-directory.
    dask_workdir : str
        Local directory handed to the Dask-CUDA cluster (multi-GPU only).
    num_gpus : sequence
        Device ids to use (one Dask worker per entry).  A Dask-CUDA cluster is
        only deployed when more than one id is given.

    Raises
    ------
    FileNotFoundError
        If ``input_train_path`` contains no entry ending in ``.parquet``.
    """
    # Select by suffix: a prefix regex such as '.*.parquet' would also accept
    # names like 'x.parquet.bak'.  Sorting makes the processing order
    # independent of the arbitrary os.listdir() order.
    train_paths = sorted(
        os.path.join(input_train_path, filename)
        for filename in os.listdir(input_train_path)
        if filename.endswith('.parquet')
    )
    if not train_paths:
        raise FileNotFoundError(
            f"No .parquet files found in {input_train_path!r}"
        )

    # Deploy a Dask Distributed Cluster
    # Single-Machine Multi-GPU Cluster
    protocol = "tcp"  # "tcp" or "ucx"
    # Select devices to place workers
    visible_devices = ",".join([str(n) for n in num_gpus])
    device_limit_frac = 0.4  # Spill GPU-Worker memory to host at this limit.
    device_pool_frac = 0.5
    part_mem_frac = 0.05

    # Convert the fractional settings into byte counts using the total
    # device memory size.
    device_size = device_mem_size(kind="total")
    part_size = int(part_mem_frac * device_size)
    logging.info(f"Partition size: {part_size}")

    cluster = None
    client = None
    try:
        # Deploy Dask Distributed cluster only if asked for multiple GPUs
        if len(num_gpus) > 1:
            logging.info("Deploy Dask Distributed cluster...")

            device_limit = int(device_limit_frac * device_size)
            device_pool_size = int(device_pool_frac * device_size)

            logging.info("Checking if any device memory is already occupied...")
            # Check if any device memory is already occupied
            device_ids = visible_devices.split(",")
            for dev in device_ids:
                fmem = _pynvml_mem_size(kind="free", index=int(dev))
                used = (device_size - fmem) / 1e9
                if used > 1.0:
                    warnings.warn(
                        f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
                    )

            cluster = LocalCUDACluster(
                protocol=protocol,
                n_workers=len(device_ids),
                CUDA_VISIBLE_DEVICES=visible_devices,
                device_memory_limit=device_limit,
                local_directory=dask_workdir,
            )

            logging.info("Create the distributed client...")
            # Create the distributed client
            client = Client(cluster)

            logging.info("Initialize memory pools...")

            # Initialize RMM pool on ALL workers
            def _rmm_pool():
                rmm.reinitialize(
                    pool_allocator=True,
                    # RMM may require the pool size to be a multiple of 256.
                    initial_pool_size=(device_pool_size // 256) * 256,
                )

            client.run(_rmm_pool)

        # Import the test .parquet
        logging.info("Importing Data...")
        test_dataset = nvt.Dataset(
            train_paths, engine='parquet', part_size=part_size
        )

        logging.info("Loading workflow object...")
        workflow = nvt.Workflow.load(workflow_path)

        # Specify the column IDs: these should exactly match the columns used
        # while preprocessing the train/valid datasets.
        CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1, 14)]
        CATEGORICAL_COLUMNS = ['C' + str(x) for x in range(1, 27)]
        LABEL_COLUMNS = ['label']

        # Output dtypes: int64 categoricals, float32 continuous and labels.
        dict_dtypes = {}
        for col in CATEGORICAL_COLUMNS:
            dict_dtypes[col] = np.int64
        for col in CONTINUOUS_COLUMNS:
            dict_dtypes[col] = np.float32
        for col in LABEL_COLUMNS:
            dict_dtypes[col] = np.float32

        # Create output directory for test data
        # NOTE(review): the test output is written to a 'train/' directory;
        # presumably intentional for the downstream consumer - confirm.
        output_test_dir = os.path.join(output_path, 'train/')
        if not os.path.exists(output_test_dir):
            logging.info(f"Creating train/ directory at: {output_test_dir}")
            os.makedirs(output_test_dir)

        logging.info("Preprocessing Data...")
        workflow.transform(test_dataset).to_parquet(
            output_path=output_test_dir,
            dtypes=dict_dtypes,
            cats=CATEGORICAL_COLUMNS,
            conts=CONTINUOUS_COLUMNS,
            labels=LABEL_COLUMNS,
        )

        logging.info("Done!")
    finally:
        # Release the client and worker processes even if preprocessing fails.
        try:
            if client is not None:
                client.close()
        finally:
            if cluster is not None:
                cluster.close()
def nvt_etl(
    data_path,
    out_path,
    devices,
    protocol,
    device_limit_frac,
    device_pool_frac,
    part_mem_frac,
    cats,
    conts,
    labels,
    out_files_per_proc,
):
    """Run the NVTabular ETL over a directory of parquet files.

    The files found directly in ``data_path`` are split 90% / 10% (by file
    count, in sorted name order) into training and validation sets.  A
    workflow is fitted on the training set, both sets are transformed and
    written below ``<out_path>/output``, and the fitted workflow is saved to
    ``<out_path>/output/workflow``.

    Parameters
    ----------
    data_path : str
        Directory containing the input files (one trailing "/" is ignored).
    out_path : str
        Base directory for the ``workdir``, ``output`` and ``stats``
        sub-directories; these three are deleted and re-created.
    devices : str
        Comma-separated device indices.
    protocol : str
        Communication protocol passed to ``managed_client``.
    device_limit_frac, device_pool_frac, part_mem_frac : float
        Fractions of the total device memory used for the worker memory
        limit, the RMM pool size and the dataset partition size.
    cats, conts, labels : list of str
        Categorical, continuous and label column names.
    out_files_per_proc : int
        Forwarded to ``to_parquet``.

    Returns
    -------
    The fitted ``Workflow``.

    Raises
    ------
    ValueError
        If ``data_path`` holds fewer than two files, which would leave the
        training split empty.  Raised before any output directory is touched.
    """
    # Set up data paths (endswith() also copes with an empty string)
    input_path = data_path[:-1] if data_path.endswith("/") else data_path
    base_dir = out_path[:-1] if out_path.endswith("/") else out_path
    dask_workdir = os.path.join(base_dir, "workdir")
    output_path = os.path.join(base_dir, "output")
    stats_path = os.path.join(base_dir, "stats")
    output_train_dir = os.path.join(output_path, "train/")
    output_valid_dir = os.path.join(output_path, "valid/")

    # Get train/valid files.  Sorting makes the split reproducible, since
    # os.listdir() returns entries in arbitrary order.
    all_paths = sorted(
        os.path.join(input_path, f)
        for f in os.listdir(input_path)
        if os.path.isfile(os.path.join(input_path, f))
    )
    n_files = int(len(all_paths) * 0.9)
    train_paths = all_paths[:n_files]
    valid_paths = all_paths[n_files:]
    if not train_paths:
        raise ValueError(
            f"Need at least 2 input files in {input_path!r} for a train/valid "
            f"split, found {len(all_paths)}"
        )

    # Make sure we have a clean worker, stats and output space for Dask
    for dir_path in (dask_workdir, stats_path, output_path):
        if os.path.isdir(dir_path):
            shutil.rmtree(dir_path)
        os.makedirs(dir_path)
    os.mkdir(output_train_dir)
    os.mkdir(output_valid_dir)

    # Force dtypes for HugeCTR usage
    dict_dtypes = {col: np.int64 for col in cats}
    dict_dtypes.update({col: np.float32 for col in conts})
    dict_dtypes.update({col: np.float32 for col in labels})

    # Convert the fractional settings into byte counts using the total
    # device memory size.
    device_size = device_mem_size(kind="total")
    device_limit = int(device_limit_frac * device_size)
    device_pool_size = int(device_pool_frac * device_size)
    part_size = int(part_mem_frac * device_size)

    # Check if any device memory is already occupied
    for dev in devices.split(","):
        fmem = _pynvml_mem_size(kind="free", index=int(dev))
        used = (device_size - fmem) / 1e9
        if used > 1.0:
            warnings.warn(
                f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
            )

    # Setup dask cluster and perform ETL
    with managed_client(dask_workdir, devices, device_limit, protocol) as client:
        # Setup RMM pool
        if device_pool_frac > 0.01:
            setup_rmm_pool(client, device_pool_size)

        # Define Dask NVTabular "Workflow"
        cont_features = (
            conts >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp()
        )
        cat_features = cats >> ops.Categorify(
            out_path=stats_path, max_size=10000000
        )
        workflow = Workflow(cat_features + cont_features + labels, client=client)

        train_dataset = Dataset(train_paths, engine="parquet", part_size=part_size)
        valid_dataset = Dataset(valid_paths, engine="parquet", part_size=part_size)

        workflow.fit(train_dataset)

        # Both splits are written with identical options.
        for dataset, target_dir in (
            (train_dataset, output_train_dir),
            (valid_dataset, output_valid_dir),
        ):
            workflow.transform(dataset).to_parquet(
                output_path=target_dir,
                shuffle=nvt_io.Shuffle.PER_WORKER,
                dtypes=dict_dtypes,
                cats=cats,
                conts=conts,
                labels=labels,
                out_files_per_proc=out_files_per_proc,
            )

        workflow.save(os.path.join(output_path, "workflow"))

        return workflow