Example #1
0
def main(args):
    """Multi-GPU Criteo/DLRM Preprocessing Benchmark

    This benchmark is designed to measure the time required to preprocess
    the Criteo (1TB) dataset for Facebook's DLRM model.  The user must specify
    the path of the raw dataset (using the `--data-path` flag), as well as the
    output directory for all temporary/final data (using the `--out-path` flag)

    Example Usage
    -------------

    python dask-nvtabular-criteo-benchmark.py
                        --data-path /path/to/criteo_parquet --out-path /out/dir/


    Dataset Requirements (Parquet)
    ------------------------------

    This benchmark is designed with a parquet-formatted dataset in mind.
    While a CSV-formatted dataset can be processed by NVTabular, converting
    to parquet will yield significantly better performance.  To convert your
    dataset, try using the `optimize_criteo.ipynb` notebook (also located
    in `NVTabular/examples/`)

    For a detailed parameter overview see `NVTabular/examples/MultiGPUBench.md`

    Parameters
    ----------
    args
        Parsed command-line namespace.  The attributes read here are:
        ``data_path``, ``out_path``, ``freq_limit``, ``out_files_per_proc``,
        ``high_cards``, ``dashboard_port``, ``protocol``, ``cont_names``,
        ``cat_names``, ``tree_width``, ``cat_cache_high``, ``cat_cache_low``,
        ``device_limit_frac``, ``device_pool_frac``, ``part_mem_frac``,
        ``shuffle``, ``devices``, ``n_workers``, ``normalize``,
        ``cats_on_device``, ``profile`` and ``num_io_threads``.

    Notes
    -----
    The ``workdir``, ``output`` and ``stats`` sub-directories of
    ``args.out_path`` are deleted and re-created on every run.
    """

    # Input
    data_path = args.data_path
    freq_limit = args.freq_limit
    out_files_per_proc = args.out_files_per_proc
    high_card_columns = set(args.high_cards.split(","))
    dashboard_port = args.dashboard_port
    if args.protocol == "ucx":
        # Keep a user-provided UCX_TLS; otherwise fall back to this default.
        os.environ.setdefault("UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")

    # Cleanup output directory
    BASE_DIR = args.out_path
    dask_workdir = os.path.join(BASE_DIR, "workdir")
    output_path = os.path.join(BASE_DIR, "output")
    stats_path = os.path.join(BASE_DIR, "stats")
    # makedirs (not mkdir) so that a nested --out-path can be created.
    os.makedirs(BASE_DIR, exist_ok=True)
    for dir_path in (dask_workdir, output_path, stats_path):
        if os.path.isdir(dir_path):
            shutil.rmtree(dir_path)
        os.mkdir(dir_path)

    # Use Criteo dataset by default (for now)
    cont_names = (args.cont_names.split(",")
                  if args.cont_names else ["I" + str(x) for x in range(1, 14)])
    cat_names = (args.cat_names.split(",")
                 if args.cat_names else ["C" + str(x) for x in range(1, 27)])
    label_name = ["label"]

    # Specify Categorify/GroupbyStatistics options:
    # high-cardinality columns get the user-specified tree width and cache
    # setting, every other column uses tree_width=1 and the "low" cache.
    tree_width = {}
    cat_cache = {}
    for col in cat_names:
        is_high_card = col in high_card_columns
        tree_width[col] = args.tree_width if is_high_card else 1
        cat_cache[col] = (args.cat_cache_high
                          if is_high_card else args.cat_cache_low)

    # Convert the user-specified memory fractions into byte counts, using the
    # total device size as the reference.
    device_size = device_mem_size(kind="total")
    device_limit = int(args.device_limit_frac * device_size)
    device_pool_size = int(args.device_pool_frac * device_size)
    part_size = int(args.part_mem_frac * device_size)

    # Parse shuffle option (anything else means "no shuffle")
    shuffle = None
    if args.shuffle == "PER_WORKER":
        shuffle = nvt_io.Shuffle.PER_WORKER
    elif args.shuffle == "PER_PARTITION":
        shuffle = nvt_io.Shuffle.PER_PARTITION

    # Check if any device memory is already occupied
    for dev in args.devices.split(","):
        fmem = _pynvml_mem_size(kind="free", index=int(dev))
        used = (device_size - fmem) / 1e9
        if used > 1.0:
            warnings.warn(
                f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
            )

    # Setup LocalCUDACluster (NVLink is only requested for non-TCP protocols)
    cluster_kwargs = dict(
        protocol=args.protocol,
        n_workers=args.n_workers,
        CUDA_VISIBLE_DEVICES=args.devices,
        device_memory_limit=device_limit,
        local_directory=dask_workdir,
        # f-string so that both str and int ports are accepted
        dashboard_address=f":{dashboard_port}",
    )
    if args.protocol != "tcp":
        cluster_kwargs["enable_nvlink"] = True
    cluster = LocalCUDACluster(**cluster_kwargs)

    client = None
    try:
        client = Client(cluster)

        # Setup RMM pool
        if args.device_pool_frac > 0.01:
            setup_rmm_pool(client, device_pool_size)

        # Define Dask NVTabular "Workflow"
        processor = Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name,
                             client=client)
        if args.normalize:
            processor.add_feature([ops.FillMissing(), ops.Normalize()])
        else:
            processor.add_feature(
                [ops.FillMissing(),
                 ops.Clip(min_value=0),
                 ops.LogOp()])
        processor.add_preprocess(
            ops.Categorify(
                out_path=stats_path,
                tree_width=tree_width,
                cat_cache=cat_cache,
                freq_threshold=freq_limit,
                search_sorted=not freq_limit,
                on_host=not args.cats_on_device,
            ))
        processor.finalize()

        dataset = Dataset(data_path, "parquet", part_size=part_size)

        # Execute the dask graph (optionally wrapped in a performance report)
        apply_kwargs = dict(
            shuffle=shuffle,
            out_files_per_proc=out_files_per_proc,
            output_path=output_path,
            num_io_threads=args.num_io_threads,
        )
        runtime = time.time()
        if args.profile is not None:
            with performance_report(filename=args.profile):
                processor.apply(dataset, **apply_kwargs)
        else:
            processor.apply(dataset, **apply_kwargs)
        runtime = time.time() - runtime

        print("\nDask-NVTabular DLRM/Criteo benchmark")
        print("--------------------------------------")
        print(f"partition size     | {part_size}")
        print(f"protocol           | {args.protocol}")
        print(f"device(s)          | {args.devices}")
        print(f"rmm-pool-frac      | {(args.device_pool_frac)}")
        print(f"out-files-per-proc | {args.out_files_per_proc}")
        print(f"num_io_threads     | {args.num_io_threads}")
        print(f"shuffle            | {args.shuffle}")
        print(f"cats-on-device     | {args.cats_on_device}")
        print("======================================")
        print(f"Runtime[s]         | {runtime}")
        print("======================================\n")
    finally:
        # Always release the client and the worker processes, even when the
        # benchmark fails part-way through.
        if client is not None:
            client.close()
        cluster.close()
Example #2
0
def run_preprocessing(input_path, base_dir, num_train_days, num_val_days,
                      num_gpus):
    """Fit an NVTabular workflow on Criteo day files and write parquet output.

    The first ``num_train_days`` files (``day_0.parquet`` ...) are used to fit
    the workflow and are written to ``<base_dir>/test_dask/output/train/``;
    the following ``num_val_days`` files are transformed and written to
    ``<base_dir>/test_dask/output/valid/``.  The fitted workflow is saved to
    ``<base_dir>/test_dask/output/workflow``.

    Parameters
    ----------
    input_path
        Directory containing the ``day_<N>.parquet`` input files.
    base_dir
        Directory under which ``test_dask/{workdir,output,stats}`` are
        (re-)created.  Existing contents of those directories are deleted.
    num_train_days
        Number of leading day files used for training.
    num_val_days
        Number of day files, following the training days, used for validation.
    num_gpus
        Sequence of GPU device ids.  A Dask-CUDA cluster is only deployed when
        it contains more than one id.
    """

    # Define paths to save artifacts
    dask_workdir = os.path.join(base_dir, "test_dask/workdir")
    output_path = os.path.join(base_dir, "test_dask/output")
    stats_path = os.path.join(base_dir, "test_dask/stats")

    logging.info(f"Dask Workdir: {dask_workdir}")
    logging.info(f"Output Path: {output_path}")

    # Make sure we have a clean worker, stats and output space
    for dir_path in (dask_workdir, stats_path, output_path):
        if os.path.isdir(dir_path):
            shutil.rmtree(dir_path)
        os.makedirs(dir_path)

    logging.info("Created output directories..")

    # This requires the data to be in this specific format eg. day_0.parquet, day_1.parquet etc.
    fname = 'day_{}.parquet'
    num_days = sum(
        1 for entry in os.listdir(input_path)
        if re.fullmatch(r'day_[0-9]{1,2}\.parquet', entry) is not None)
    if num_train_days + num_val_days > num_days:
        # Not fatal here: reading a missing file will fail later, but this
        # message points at the actual cause.
        logging.warning(
            f"Requested {num_train_days + num_val_days} day files but only "
            f"{num_days} were found in {input_path}")
    train_paths = [
        os.path.join(input_path, fname.format(day))
        for day in range(num_train_days)
    ]
    valid_paths = [
        os.path.join(input_path, fname.format(day))
        for day in range(num_train_days, num_train_days + num_val_days)
    ]

    logging.info(f"Training data: {train_paths}")
    logging.info(f"Validation data: {valid_paths}")

    # Deploy a Dask Distributed Cluster
    # Single-Machine Multi-GPU Cluster
    protocol = "tcp"  # "tcp" or "ucx"
    visible_devices = ",".join([str(n) for n in num_gpus
                                ])  # Select devices to place workers
    device_limit_frac = 0.4  # Spill GPU-Worker memory to host at this limit.
    device_pool_frac = 0.5
    part_mem_frac = 0.05  # Desired maximum size of each partition as a fraction of total GPU memory.

    # Convert the memory fractions into byte counts using the total device size
    device_size = device_mem_size(kind="total")
    part_size = int(part_mem_frac * device_size)
    logging.info(f"Partition size: {part_size}")

    cluster = None
    client = None
    try:
        # Deploy Dask Distributed cluster only if asked for multiple GPUs
        if len(num_gpus) > 1:

            device_limit = int(device_limit_frac * device_size)
            device_pool_size = int(device_pool_frac * device_size)

            logging.info("Checking if any device memory is already occupied..")
            # Check if any device memory is already occupied
            for dev in visible_devices.split(","):
                fmem = _pynvml_mem_size(kind="free", index=int(dev))
                used = (device_size - fmem) / 1e9
                if used > 1.0:
                    warnings.warn(
                        f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
                    )

            cluster = LocalCUDACluster(protocol=protocol,
                                       n_workers=len(
                                           visible_devices.split(",")),
                                       CUDA_VISIBLE_DEVICES=visible_devices,
                                       device_memory_limit=device_limit,
                                       local_directory=dask_workdir)

            logging.info("Create the distributed client..")
            # Create the distributed client
            client = Client(cluster)

            logging.info("Initialize memory pools..")

            # Initialize RMM pool on ALL workers
            def _rmm_pool():
                rmm.reinitialize(
                    # RMM may require the pool size to be a multiple of 256.
                    pool_allocator=True,
                    initial_pool_size=(device_pool_size // 256) * 256,
                )

            client.run(_rmm_pool)

        # Preprocessing
        CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1, 14)]
        CATEGORICAL_COLUMNS = ['C' + str(x) for x in range(1, 27)]
        LABEL_COLUMNS = ['label']

        cat_features = CATEGORICAL_COLUMNS >> Categorify(out_path=stats_path)
        cont_features = CONTINUOUS_COLUMNS >> FillMissing() >> Clip(
            min_value=0) >> Normalize()
        features = cat_features + cont_features + LABEL_COLUMNS

        logging.info("Defining a workflow object..")
        if client is not None:
            workflow = nvt.Workflow(features, client=client)
        else:
            workflow = nvt.Workflow(features)

        # Output dtypes: int64 categoricals, float32 continuous and labels
        dict_dtypes = {col: np.int64 for col in CATEGORICAL_COLUMNS}
        dict_dtypes.update(
            {col: np.float32 for col in CONTINUOUS_COLUMNS + LABEL_COLUMNS})

        train_dataset = nvt.Dataset(train_paths,
                                    engine='parquet',
                                    part_size=part_size)
        valid_dataset = nvt.Dataset(valid_paths,
                                    engine='parquet',
                                    part_size=part_size)

        output_train_dir = os.path.join(output_path, 'train/')
        logging.info(f"Creating train/ directory at: {output_train_dir}")
        os.makedirs(output_train_dir, exist_ok=True)

        output_valid_dir = os.path.join(output_path, 'valid/')
        logging.info(f"Creating valid/ directory at: {output_valid_dir}")
        os.makedirs(output_valid_dir, exist_ok=True)

        logging.info("Workflow Fit..")
        workflow.fit(train_dataset)

        logging.info("Transform Training data..")
        workflow.transform(train_dataset).to_parquet(
            output_path=output_train_dir,
            shuffle=nvt.io.Shuffle.PER_PARTITION,
            dtypes=dict_dtypes,
            cats=CATEGORICAL_COLUMNS,
            conts=CONTINUOUS_COLUMNS,
            labels=LABEL_COLUMNS)

        logging.info("Transform Validation data..")
        workflow.transform(valid_dataset).to_parquet(
            output_path=output_valid_dir,
            dtypes=dict_dtypes,
            cats=CATEGORICAL_COLUMNS,
            conts=CONTINUOUS_COLUMNS,
            labels=LABEL_COLUMNS)

        # use these printed out cardinalities list in the "slot_size_array" in the HugeCTR training "dcn_parquet.json"
        # The embedding sizes are looked up once and then indexed per column.
        embedding_sizes = nvt.ops.get_embedding_sizes(workflow)
        cardinalities = [
            embedding_sizes[col][0] for col in CATEGORICAL_COLUMNS
        ]

        logging.info(
            f"Cardinalities for configuring slot_size_array: {cardinalities}")

        workflow_dir = os.path.join(output_path, 'workflow')
        logging.info(f"Saving workflow object at: {workflow_dir}")
        workflow.save(workflow_dir)

        logging.info("Done!")
    finally:
        # Release the client and the worker processes (and with them the RMM
        # pools) whether or not preprocessing succeeded.
        if client is not None:
            client.close()
        if cluster is not None:
            cluster.close()
Example #3
0
def run_preprocessing(input_train_path, workflow_path, output_path,
                      dask_workdir, num_gpus):
    """Apply a previously saved NVTabular workflow to a directory of parquet files.

    Parameters
    ----------
    input_train_path
        Directory whose ``*.parquet`` entries are preprocessed.
    workflow_path
        Location of the saved workflow, loaded with ``nvt.Workflow.load``.
    output_path
        Directory under which the ``train/`` output directory is created
        (if missing) and the transformed parquet files are written.
    dask_workdir
        Local directory handed to the Dask-CUDA cluster.
    num_gpus
        Sequence of GPU device ids.  A Dask-CUDA cluster is only deployed when
        it contains more than one id.
    """
    # Select entries by extension; sorted so the input order is reproducible
    # (os.listdir returns entries in arbitrary order).
    train_files = sorted(entry for entry in os.listdir(input_train_path)
                         if entry.endswith('.parquet'))
    train_paths = [
        os.path.join(input_train_path, filename) for filename in train_files
    ]

    # Deploy a Dask Distributed Cluster
    # Single-Machine Multi-GPU Cluster
    protocol = "tcp"  # "tcp" or "ucx"
    visible_devices = ",".join([str(n) for n in num_gpus
                                ])  # Select devices to place workers
    device_limit_frac = 0.4  # Spill GPU-Worker memory to host at this limit.
    device_pool_frac = 0.5
    part_mem_frac = 0.05

    # Convert the memory fractions into byte counts using the total device size
    device_size = device_mem_size(kind="total")
    part_size = int(part_mem_frac * device_size)
    logging.info(f"Partition size: {part_size}")

    cluster = None
    client = None
    try:
        # Deploy Dask Distributed cluster only if asked for multiple GPUs
        if len(num_gpus) > 1:
            logging.info("Deploy Dask Distributed cluster...")

            device_limit = int(device_limit_frac * device_size)
            device_pool_size = int(device_pool_frac * device_size)

            logging.info(
                "Checking if any device memory is already occupied...")
            # Check if any device memory is already occupied
            for dev in visible_devices.split(","):
                fmem = _pynvml_mem_size(kind="free", index=int(dev))
                used = (device_size - fmem) / 1e9
                if used > 1.0:
                    warnings.warn(
                        f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
                    )

            cluster = LocalCUDACluster(protocol=protocol,
                                       n_workers=len(
                                           visible_devices.split(",")),
                                       CUDA_VISIBLE_DEVICES=visible_devices,
                                       device_memory_limit=device_limit,
                                       local_directory=dask_workdir)

            logging.info("Create the distributed client...")
            # Create the distributed client
            client = Client(cluster)

            logging.info("Initialize memory pools...")

            # Initialize RMM pool on ALL workers
            def _rmm_pool():
                rmm.reinitialize(
                    # RMM may require the pool size to be a multiple of 256.
                    pool_allocator=True,
                    initial_pool_size=(device_pool_size // 256) * 256,
                )

            client.run(_rmm_pool)

        # Import the input .parquet files
        logging.info("Importing Data...")
        test_dataset = nvt.Dataset(train_paths,
                                   engine='parquet',
                                   part_size=part_size)

        logging.info("Loading workflow object...")
        workflow = nvt.Workflow.load(workflow_path)

        # Specify the column IDs: these should exactly match the columns used
        # while preprocessing the train/valid datasets.
        CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1, 14)]
        CATEGORICAL_COLUMNS = ['C' + str(x) for x in range(1, 27)]
        LABEL_COLUMNS = ['label']

        # Output dtypes: int64 categoricals, float32 continuous and labels
        dict_dtypes = {col: np.int64 for col in CATEGORICAL_COLUMNS}
        dict_dtypes.update(
            {col: np.float32 for col in CONTINUOUS_COLUMNS + LABEL_COLUMNS})

        # Create output directory for the transformed data.
        # NOTE(review): the directory is named 'train/' although the local
        # variables call this the test set - confirm this is intended.
        output_test_dir = os.path.join(output_path, 'train/')

        if not os.path.exists(output_test_dir):
            logging.info(f"Creating train/ directory at: {output_test_dir}")
            os.makedirs(output_test_dir)

        logging.info("Preprocessing Data...")
        workflow.transform(test_dataset).to_parquet(
            output_path=output_test_dir,
            dtypes=dict_dtypes,
            cats=CATEGORICAL_COLUMNS,
            conts=CONTINUOUS_COLUMNS,
            labels=LABEL_COLUMNS)

        logging.info("Done!")
    finally:
        # Release the client and the worker processes (and with them the RMM
        # pools) whether or not preprocessing succeeded.
        if client is not None:
            client.close()
        if cluster is not None:
            cluster.close()
Example #4
0
def nvt_etl(
    data_path,
    out_path,
    devices,
    protocol,
    device_limit_frac,
    device_pool_frac,
    part_mem_frac,
    cats,
    conts,
    labels,
    out_files_per_proc,
):
    """Run the NVTabular ETL on a directory of parquet files.

    The files directly inside ``data_path`` are split 90% / 10% (by file
    count, in sorted name order) into a training and a validation set.  The
    workflow is fitted on the training set, both sets are transformed and
    written to ``<out_path>/output/train/`` and ``<out_path>/output/valid/``,
    and the workflow is saved to ``<out_path>/output/workflow``.

    Parameters
    ----------
    data_path
        Directory containing the input files.
    out_path
        Base output directory.  Its ``workdir``, ``stats`` and ``output``
        sub-directories are deleted and re-created.
    devices
        Comma-separated string of GPU device indices.
    protocol
        Communication protocol forwarded to ``managed_client``.
    device_limit_frac, device_pool_frac, part_mem_frac
        Fractions of the total device memory used for the worker memory
        limit, the RMM pool size and the dataset partition size.  The RMM
        pool is only set up when ``device_pool_frac > 0.01``.
    cats, conts, labels
        Categorical, continuous and label column names.
    out_files_per_proc
        Forwarded to ``to_parquet`` for both output datasets.

    Returns
    -------
    The fitted ``Workflow``.

    Raises
    ------
    ValueError
        If ``data_path`` or ``out_path`` is empty, or if ``data_path`` holds
        fewer than two files (which would leave the training set empty).
    """
    if not data_path or not out_path:
        raise ValueError("data_path and out_path must be non-empty paths")

    # Set up data paths (drop trailing slashes, but keep a bare "/" intact)
    input_path = data_path.rstrip("/") or "/"
    base_dir = out_path.rstrip("/") or "/"
    dask_workdir = os.path.join(base_dir, "workdir")
    output_path = os.path.join(base_dir, "output")
    stats_path = os.path.join(base_dir, "stats")
    output_train_dir = os.path.join(output_path, "train/")
    output_valid_dir = os.path.join(output_path, "valid/")

    # Get train/valid files.  Sorted so that the split is reproducible
    # (os.listdir returns entries in arbitrary order); done before any
    # directory is wiped so that bad input leaves earlier results untouched.
    all_paths = sorted(
        os.path.join(input_path, f) for f in os.listdir(input_path)
        if os.path.isfile(os.path.join(input_path, f)))
    n_files = int(len(all_paths) * 0.9)
    train_paths = all_paths[:n_files]
    valid_paths = all_paths[n_files:]
    if not train_paths:
        raise ValueError(
            f"Need at least 2 files in {input_path} for a train/valid split, "
            f"found {len(all_paths)}")

    # Make sure we have a clean worker and stats space for Dask
    for dir_path in (dask_workdir, stats_path):
        if os.path.isdir(dir_path):
            shutil.rmtree(dir_path)
        os.makedirs(dir_path)

    # Make sure we have a clean output path
    if os.path.isdir(output_path):
        shutil.rmtree(output_path)
    os.makedirs(output_train_dir)
    os.makedirs(output_valid_dir)

    # Force dtypes for HugeCTR usage
    dict_dtypes = {}
    dict_dtypes.update({col: np.int64 for col in cats})
    dict_dtypes.update({col: np.float32 for col in conts})
    dict_dtypes.update({col: np.float32 for col in labels})

    # Convert the memory fractions into byte counts using the total device size
    device_size = device_mem_size(kind="total")
    device_limit = int(device_limit_frac * device_size)
    device_pool_size = int(device_pool_frac * device_size)
    part_size = int(part_mem_frac * device_size)

    # Check if any device memory is already occupied
    for dev in devices.split(","):
        fmem = _pynvml_mem_size(kind="free", index=int(dev))
        used = (device_size - fmem) / 1e9
        if used > 1.0:
            warnings.warn(
                f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
            )

    # Setup dask cluster and perform ETL
    with managed_client(dask_workdir, devices, device_limit,
                        protocol) as client:
        # Setup RMM pool
        if device_pool_frac > 0.01:
            setup_rmm_pool(client, device_pool_size)

        # Define Dask NVTabular "Workflow"
        cont_features = conts >> ops.FillMissing() >> ops.Clip(
            min_value=0) >> ops.LogOp()

        cat_features = cats >> ops.Categorify(out_path=stats_path,
                                              max_size=10000000)

        workflow = Workflow(cat_features + cont_features + labels,
                            client=client)

        train_dataset = Dataset(train_paths,
                                engine="parquet",
                                part_size=part_size)
        valid_dataset = Dataset(valid_paths,
                                engine="parquet",
                                part_size=part_size)

        workflow.fit(train_dataset)

        # Both datasets are written with identical options
        write_kwargs = dict(
            shuffle=nvt_io.Shuffle.PER_WORKER,
            dtypes=dict_dtypes,
            cats=cats,
            conts=conts,
            labels=labels,
            out_files_per_proc=out_files_per_proc,
        )
        workflow.transform(train_dataset).to_parquet(
            output_path=output_train_dir, **write_kwargs)
        workflow.transform(valid_dataset).to_parquet(
            output_path=output_valid_dir, **write_kwargs)

        workflow.save(os.path.join(output_path, "workflow"))

        return workflow