def _pool(frac=0.8):
    """Re-initialise RMM with a pool allocator sized from device memory.

    Parameters
    ----------
    frac : float, default 0.8
        Fraction of the value returned by ``device_mem_size()`` to use as the
        initial pool size.

    Notes
    -----
    The requested size is truncated to a whole number of bytes and, when it
    is not already a multiple of 256, rounded down to one; a message is
    printed whenever that rounding changes the value.
    """
    # ``frac * device_mem_size()`` is a float.  A byte count has to be an
    # integer, so truncate first; otherwise the alignment arithmetic below
    # also yields a float that is handed to ``rmm.reinitialize``.
    initial_pool_size = int(frac * device_mem_size())
    if initial_pool_size % 256 != 0:
        new_initial_pool_size = initial_pool_size // 256 * 256
        print(
            f"Initial pool size for rmm has to be a multiply of 256. Got {initial_pool_size}, reducing to {new_initial_pool_size}"
        )
        initial_pool_size = new_initial_pool_size

    rmm.reinitialize(
        pool_allocator=True,
        initial_pool_size=initial_pool_size,
    )
Example #2
0
def create_client(devices, local_directory):
    """Create a Dask client backed by a local CUDA cluster, if warranted.

    Parameters
    ----------
    devices : sequence
        Device identifiers; each is converted with ``str`` and the results
        are comma-joined into ``CUDA_VISIBLE_DEVICES``.  One worker is
        started per entry.
    local_directory : str
        Forwarded to ``LocalCUDACluster`` as ``local_directory``.

    Returns
    -------
    The connected ``Client`` when more than one device is given, otherwise
    ``None`` (no cluster is started for zero or one device).
    """
    # Nothing to distribute over with fewer than two devices.
    if len(devices) <= 1:
        return None

    total_mem = device_mem_size(kind="total")
    # Both the spill threshold and the RMM pool use 80% of total memory.
    spill_limit = int(0.8 * total_mem)
    pool_bytes = int(0.8 * total_mem)

    visible = ",".join(map(str, devices))
    cuda_cluster = LocalCUDACluster(
        n_workers=len(devices),
        CUDA_VISIBLE_DEVICES=visible,
        device_memory_limit=spill_limit,
        local_directory=local_directory,
    )
    dask_client = Client(cuda_cluster)
    setup_rmm_pool(dask_client, pool_bytes)
    return dask_client
Example #3
0
def main(args):
    """Inspect a dataset and write the inspection result.

    Reads from ``args``: ``device_limit_frac``, ``device_pool_frac`` and
    ``part_mem_frac`` (each multiplied by the total device memory),
    ``config_file`` (JSON, opened through ``fsspec``), ``data_path``,
    ``format``, ``devices``, ``protocol`` and ``output_file``.
    """
    # Derive byte budgets from the total device memory.
    total_mem = device_mem_size(kind="total")
    spill_limit = int(args.device_limit_frac * total_mem)
    pool_bytes = int(args.device_pool_frac * total_mem)
    partition_bytes = int(args.part_mem_frac * total_mem)

    # Load the JSON configuration handed to the inspector.
    with fsspec.open(args.config_file) as config_fh:
        columns_config = json.load(config_fh)

    dataset = Dataset(
        args.data_path, engine=args.format, part_size=partition_bytes
    )

    # Run the inspection inside a managed client with an RMM pool set up.
    with managed_client(args.devices, spill_limit, args.protocol) as client:
        setup_rmm_pool(client, pool_bytes)
        inspector = datains.DatasetInspector(client)
        inspector.inspect(dataset, columns_config, args.output_file)
Example #4
0
def set_cluster_client(n_gpus=-1, device_spill_frac=0.8):
    """Start a single-machine multi-GPU Dask cluster and return its client.

    n_gpus: number of devices to expose (indices ``0 .. n_gpus - 1``);
        ``-1`` uses the device count reported by NVML.
    device_spill_frac: Spill GPU-Worker memory to host at this limit,
        expressed as a fraction of total device memory. Reduce if spilling
        fails to prevent device memory errors.

    Returns the client, or ``None`` when a ``MemoryError`` is raised while
    creating the client or initialising the RMM pools.
    """
    # TODO: Check for any solution. If the user calls this function a second
    # time, re-creating the cluster fails; a new cluster can only be created
    # after a 'kernel restart'.
    stale_worker_dir = "dask-worker-space"
    if os.path.isdir(stale_worker_dir):
        shutil.rmtree(stale_worker_dir, ignore_errors=True)

    # Default to every device NVML can see.
    if n_gpus == -1:
        nvidia_smi.nvmlInit()
        n_gpus_avail = nvidia_smi.nvmlDeviceGetCount()
        print('\n n_gpus_avail: {}'.format(n_gpus_avail))
        n_gpus = n_gpus_avail

    # Select devices to place workers on, e.g. "0, 1, 2".
    visible_devices = ", ".join(str(idx) for idx in range(n_gpus))

    spill_limit = device_spill_frac * device_mem_size(kind="total")

    # TODO: how to re-initialise the cluster
    cluster = LocalCUDACluster(
        protocol="tcp",  # "tcp" or "ucx"
        CUDA_VISIBLE_DEVICES=visible_devices,
        device_memory_limit=spill_limit,
    )

    def _rmm_pool():
        # Executed on each worker: switch RMM to a pool allocator with the
        # default initial size.
        rmm.reinitialize(
            pool_allocator=True,
            initial_pool_size=None,
        )

    try:
        client = Client(cluster)
        display(client)
        print('\n Dashboard avail: http://localhost:8888/proxy/8787/status')

        # Initialize the RMM pool on ALL workers.
        client.run(_rmm_pool)
        return client
    except MemoryError:
        print('\n The client is already initialized')
Example #5
0
def process(args):
    """Fit and apply an NVTabular workflow to the DIN train/valid parquet data.

    Recreates ``../din_data/train`` and ``../din_data/valid`` as empty output
    directories, starts a local CUDA cluster, fits the workflow on
    ``../din_data/train_temp.parquet`` and writes the transformed train and
    validation sets as parquet.  The elapsed time is printed at the end.

    Reads from ``args``: ``protocol``, ``devices`` (comma-separated),
    ``device_limit_frac``, ``device_pool_frac``, ``part_mem_frac``,
    ``dashboard_port``, ``shuffle``, ``out_files_per_proc`` and
    ``num_io_threads``.
    """
    train_path = os.path.abspath("../din_data/train")
    test_path = os.path.abspath("../din_data/valid")

    # Start from empty output directories: remove both first, then recreate.
    for out_dir in (train_path, test_path):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
    for out_dir in (train_path, test_path):
        os.mkdir(out_dir)

    # Temporary parquet inputs.
    train_temp = "../din_data/train_temp.parquet"
    valid_temp = "../din_data/test_temp.parquet"

    # Final parquet outputs.
    train_output = train_path
    valid_output = test_path

    # Deploy a single-machine multi-GPU cluster.
    device_size = device_mem_size(kind="total")
    use_ucx = args.protocol == "ucx"
    if use_ucx:
        # Keep a user-provided UCX_TLS, otherwise install the default.
        os.environ.setdefault("UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")

    cluster_options = {
        "protocol": args.protocol,
        "n_workers": len(args.devices.split(",")),
        "CUDA_VISIBLE_DEVICES": args.devices,
        "device_memory_limit": int(device_size * args.device_limit_frac),
        "dashboard_address": ":" + args.dashboard_port,
    }
    if use_ucx:
        cluster_options["enable_nvlink"] = True
    cluster = LocalCUDACluster(**cluster_options)

    # Create the distributed client and, if requested, the RMM pools.
    client = Client(cluster)
    if args.device_pool_frac > 0.01:
        setup_rmm_pool(client, int(args.device_pool_frac * device_size))

    runtime = time.time()

    # The actual work starts here.
    features = LABEL + ColumnGroup(CAT_COLUMNS)
    workflow = nvt.Workflow(features, client=client)

    part_size = int(args.part_mem_frac * device_size)
    train_ds_iterator = nvt.Dataset(train_temp, engine='parquet', part_size=part_size)
    valid_ds_iterator = nvt.Dataset(valid_temp, engine='parquet', part_size=part_size)

    # Map the command-line shuffle name onto the NVTabular enum (or None).
    shuffle = {
        "PER_WORKER": nvt.io.Shuffle.PER_WORKER,
        "PER_PARTITION": nvt.io.Shuffle.PER_PARTITION,
    }.get(args.shuffle)

    # Output dtypes: categorical columns as int64, label columns as float32.
    dict_dtypes = {col: np.int64 for col in CAT_COLUMNS}
    dict_dtypes.update({col: np.float32 for col in LABEL})

    writer_options = dict(
        dtypes=dict_dtypes,
        cats=CAT_COLUMNS,
        labels=LABEL,
        shuffle=shuffle,
        out_files_per_proc=args.out_files_per_proc,
        num_threads=args.num_io_threads,
    )

    workflow.fit(train_ds_iterator)
    workflow.transform(train_ds_iterator).to_parquet(
        output_path=train_output, **writer_options)
    workflow.transform(valid_ds_iterator).to_parquet(
        output_path=valid_output, **writer_options)

    client.close()

    print("Time:", time.time() - runtime)
Example #6
0
def main(args):
    """Run the multi-GPU Criteo/DLRM preprocessing benchmark.

    Measures the wall-clock time needed to preprocess the Criteo (1TB)
    dataset for Facebook's DLRM model with a Dask-NVTabular workflow.  The
    location of the raw dataset is given with the `--data-path` flag, and the
    directory for all temporary/final data with the `--out-path` flag.

    Example Usage
    -------------

    python dask-nvtabular-criteo-benchmark.py
                        --data-path /path/to/criteo_parquet --out-path /out/dir/


    Dataset Requirements (Parquet)
    ------------------------------

    The benchmark expects a parquet-formatted dataset.  NVTabular can process
    CSV input as well, but parquet yields significantly better performance.
    The `optimize_criteo.ipynb` notebook (in `NVTabular/examples/`) can be
    used for the conversion.

    See `NVTabular/examples/MultiGPUBench.md` for a detailed parameter
    overview.
    """

    # Input
    data_path = args.data_path
    freq_limit = args.freq_limit
    out_files_per_proc = args.out_files_per_proc
    high_card_columns = args.high_cards.split(",")
    dashboard_port = args.dashboard_port
    if args.protocol == "ucx":
        # Keep a user-provided UCX_TLS, otherwise install the default.
        os.environ.setdefault("UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")

    # Recreate the working directories below the output root.
    BASE_DIR = args.out_path
    dask_workdir = os.path.join(BASE_DIR, "workdir")
    output_path = os.path.join(BASE_DIR, "output")
    stats_path = os.path.join(BASE_DIR, "stats")
    if not os.path.isdir(BASE_DIR):
        os.mkdir(BASE_DIR)
    for scratch_dir in (dask_workdir, output_path, stats_path):
        if os.path.isdir(scratch_dir):
            shutil.rmtree(scratch_dir)
        os.mkdir(scratch_dir)

    # Column names default to the Criteo layout (for now).
    if args.cont_names:
        cont_names = args.cont_names.split(",")
    else:
        cont_names = ["I" + str(idx) for idx in range(1, 14)]
    if args.cat_names:
        cat_names = args.cat_names.split(",")
    else:
        cat_names = ["C" + str(idx) for idx in range(1, 27)]
    label_name = ["label"]

    # Per-column Categorify/GroupbyStatistics options: high-cardinality
    # columns get the user-supplied settings, all others the low defaults.
    tree_width = {
        col: (args.tree_width if col in high_card_columns else 1)
        for col in cat_names
    }
    cat_cache = {
        col: (args.cat_cache_high
              if col in high_card_columns else args.cat_cache_low)
        for col in cat_names
    }

    # Byte budgets are fractions of the total device memory.
    device_size = device_mem_size(kind="total")
    device_limit = int(args.device_limit_frac * device_size)
    device_pool_size = int(args.device_pool_frac * device_size)
    part_size = int(args.part_mem_frac * device_size)

    # Parse shuffle option
    if args.shuffle == "PER_WORKER":
        shuffle = nvt_io.Shuffle.PER_WORKER
    elif args.shuffle == "PER_PARTITION":
        shuffle = nvt_io.Shuffle.PER_PARTITION
    else:
        shuffle = None

    # Warn about devices that already have more than 1 GB in use.
    for dev in args.devices.split(","):
        free_bytes = _pynvml_mem_size(kind="free", index=int(dev))
        used = (device_size - free_bytes) / 1e9
        if used > 1.0:
            warnings.warn(
                f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
            )

    # Setup LocalCUDACluster; NVLink is enabled for every non-TCP protocol.
    cluster_options = dict(
        protocol=args.protocol,
        n_workers=args.n_workers,
        CUDA_VISIBLE_DEVICES=args.devices,
        device_memory_limit=device_limit,
        local_directory=dask_workdir,
        dashboard_address=":" + dashboard_port,
    )
    if args.protocol != "tcp":
        cluster_options["enable_nvlink"] = True
    cluster = LocalCUDACluster(**cluster_options)
    client = Client(cluster)

    # Setup RMM pool
    if args.device_pool_frac > 0.01:
        setup_rmm_pool(client, device_pool_size)

    # Define Dask NVTabular "Workflow"
    processor = Workflow(cat_names=cat_names,
                         cont_names=cont_names,
                         label_name=label_name,
                         client=client)
    if args.normalize:
        continuous_ops = [ops.FillMissing(), ops.Normalize()]
    else:
        continuous_ops = [
            ops.FillMissing(),
            ops.Clip(min_value=0),
            ops.LogOp(),
        ]
    processor.add_feature(continuous_ops)
    categorify = ops.Categorify(
        out_path=stats_path,
        tree_width=tree_width,
        cat_cache=cat_cache,
        freq_threshold=freq_limit,
        search_sorted=not freq_limit,
        on_host=not args.cats_on_device,
    )
    processor.add_preprocess(categorify)
    processor.finalize()

    dataset = Dataset(data_path, "parquet", part_size=part_size)

    # Execute the dask graph, optionally under a Dask performance report.
    apply_options = dict(
        shuffle=shuffle,
        out_files_per_proc=out_files_per_proc,
        output_path=output_path,
        num_io_threads=args.num_io_threads,
    )
    runtime = time.time()
    if args.profile is not None:
        with performance_report(filename=args.profile):
            processor.apply(dataset, **apply_options)
    else:
        processor.apply(dataset, **apply_options)
    runtime = time.time() - runtime

    print("\nDask-NVTabular DLRM/Criteo benchmark")
    print("--------------------------------------")
    print(f"partition size     | {part_size}")
    print(f"protocol           | {args.protocol}")
    print(f"device(s)          | {args.devices}")
    print(f"rmm-pool-frac      | {(args.device_pool_frac)}")
    print(f"out-files-per-proc | {args.out_files_per_proc}")
    print(f"num_io_threads     | {args.num_io_threads}")
    print(f"shuffle            | {args.shuffle}")
    print(f"cats-on-device     | {args.cats_on_device}")
    print("======================================")
    print(f"Runtime[s]         | {runtime}")
    print("======================================\n")

    client.close()
Example #7
0
def run_preprocessing(input_train_path, workflow_path, output_path,
                      dask_workdir, num_gpus):
    """Transform parquet files with a previously saved NVTabular workflow.

    Parameters
    ----------
    input_train_path : str
        Directory whose ``*.parquet`` entries are used as input.
    workflow_path : str
        Location of the saved workflow, loaded with ``nvt.Workflow.load``.
    output_path : str
        Root output directory; results are written to ``<output_path>/train/``.
    dask_workdir : str
        Passed to ``LocalCUDACluster`` as ``local_directory``.
    num_gpus : sequence
        Device identifiers.  They are comma-joined into
        ``CUDA_VISIBLE_DEVICES``; a Dask cluster is deployed only when more
        than one is given.
    """
    # Select input files by suffix.  The previous regex ('.*.parquet' with
    # an unescaped dot and no end anchor) also accepted names such as
    # 'part.parquet.crc'.
    train_paths = [
        os.path.join(input_train_path, filename)
        for filename in os.listdir(input_train_path)
        if filename.endswith('.parquet')
    ]

    # Deploy a Dask Distributed Cluster
    # Single-Machine Multi-GPU Cluster
    protocol = "tcp"  # "tcp" or "ucx"
    # Select devices to place workers on.
    visible_devices = ",".join([str(n) for n in num_gpus])
    device_limit_frac = 0.4  # Spill GPU-Worker memory to host at this limit.
    device_pool_frac = 0.5
    part_mem_frac = 0.05

    # All byte budgets are fractions of the total device memory.
    device_size = device_mem_size(kind="total")
    part_size = int(part_mem_frac * device_size)
    logging.info(f"Partition size: {part_size}")

    # Deploy Dask Distributed cluster only if asked for multiple GPUs
    if len(num_gpus) > 1:
        logging.info("Deploy Dask Distributed cluster...")

        device_limit = int(device_limit_frac * device_size)
        device_pool_size = int(device_pool_frac * device_size)

        logging.info("Checking if any device memory is already occupied...")
        # Warn about devices that already have more than 1 GB in use.
        for dev in visible_devices.split(","):
            fmem = _pynvml_mem_size(kind="free", index=int(dev))
            used = (device_size - fmem) / 1e9
            if used > 1.0:
                warnings.warn(
                    f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
                )

        cluster = LocalCUDACluster(protocol=protocol,
                                   n_workers=len(
                                       visible_devices.split(",")),
                                   CUDA_VISIBLE_DEVICES=visible_devices,
                                   device_memory_limit=device_limit,
                                   local_directory=dask_workdir)

        logging.info("Create the distributed client...")
        # NOTE(review): the client is never passed on explicitly below;
        # presumably it is picked up as the default Dask client - confirm.
        client = Client(cluster)

        logging.info("Initialize memory pools...")

        # Initialize RMM pool on ALL workers
        def _rmm_pool():
            rmm.reinitialize(
                pool_allocator=True,
                # RMM may require the pool size to be a multiple of 256,
                # so round the requested size down accordingly.
                initial_pool_size=(device_pool_size // 256) * 256,
            )

        client.run(_rmm_pool)

    # Import the input .parquet files
    logging.info("Importing Data...")
    test_dataset = nvt.Dataset(train_paths,
                               engine='parquet',
                               part_size=part_size)

    logging.info("Loading workflow object...")
    workflow = nvt.Workflow.load(workflow_path)

    # Column IDs: these should be exactly the columns used when the train and
    # valid datasets were preprocessed.
    CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1, 14)]
    CATEGORICAL_COLUMNS = ['C' + str(x) for x in range(1, 27)]
    LABEL_COLUMNS = ['label']

    # Output dtypes: categoricals as int64, continuous and label as float32.
    dict_dtypes = {col: np.int64 for col in CATEGORICAL_COLUMNS}
    dict_dtypes.update(
        {col: np.float32 for col in CONTINUOUS_COLUMNS + LABEL_COLUMNS})

    # Create the output directory
    output_test_dir = os.path.join(output_path, 'train/')

    if not os.path.exists(output_test_dir):
        logging.info(f"Creating train/ directory at: {output_test_dir}")
        os.makedirs(output_test_dir)

    logging.info("Preprocessing Data...")
    workflow.transform(test_dataset).to_parquet(output_path=output_test_dir,
                                                dtypes=dict_dtypes,
                                                cats=CATEGORICAL_COLUMNS,
                                                conts=CONTINUOUS_COLUMNS,
                                                labels=LABEL_COLUMNS)

    logging.info("Done!")
Example #8
0
def _make_cross_fn(other_col):
    """Return a two-argument callable adding ``gdf[other_col]`` to a column."""
    # A factory gives every returned lambda its own ``other_col`` binding
    # while keeping the (col, gdf) signature.
    return lambda col, gdf: col + gdf[other_col]


def process_NVT(args):
    """Convert the Criteo text data to parquet and preprocess it with NVTabular.

    Reads ``<data_path>/train/train.txt`` and ``<data_path>/val/test.txt``
    (tab-separated), converts both to temporary parquet datasets below
    ``<out_path>``, builds a workflow (fill-missing, optional feature
    crosses and normalisation when ``args.criteo_mode == 0``, then
    Categorify), applies it to the train and validation sets and prints the
    embedding slot sizes and a run summary.

    Reads from ``args``: ``feature_cross_list``, ``data_path``, ``out_path``,
    ``protocol``, ``devices``, ``device_limit_frac``, ``device_pool_frac``,
    ``dashboard_port``, ``dataset_type``, ``parquet_format``,
    ``criteo_mode``, ``freq_limit``, ``part_mem_frac``, ``shuffle``,
    ``out_files_per_proc`` and ``num_io_threads``.

    Side effects on module globals: appends to ``CROSS_COLUMNS`` and, for
    ``dataset_type == 'test'``, rebinds ``LABEL_COLUMNS`` to ``[]``.
    """
    global LABEL_COLUMNS

    # Parse "A_B,C_D" into [["A", "B"], ["C", "D"]] once; reused below.
    feature_pairs = []
    if args.feature_cross_list:
        feature_pairs = [
            pair.split("_") for pair in args.feature_cross_list.split(",")
        ]
        # NOTE: CROSS_COLUMNS is a module-level list mutated in place, so
        # calling this function twice appends the names twice.
        for pair in feature_pairs:
            CROSS_COLUMNS.append(pair[0] + '_' + pair[1])

    logging.info('NVTabular processing')
    train_input = os.path.join(args.data_path, "train/train.txt")
    val_input = os.path.join(args.data_path, "val/test.txt")
    PREPROCESS_DIR_temp_train = os.path.join(
        args.out_path, 'train/temp-parquet-after-conversion')
    PREPROCESS_DIR_temp_val = os.path.join(
        args.out_path, 'val/temp-parquet-after-conversion')
    PREPROCESS_DIR_temp = [PREPROCESS_DIR_temp_train, PREPROCESS_DIR_temp_val]
    train_output = os.path.join(args.out_path, "train")
    val_output = os.path.join(args.out_path, "val")

    # Make sure we have a clean parquet space for cudf conversion.
    # makedirs (not mkdir) because the temp directories are nested below
    # <out_path>/train and <out_path>/val, which may not exist yet.
    for one_path in PREPROCESS_DIR_temp:
        if os.path.exists(one_path):
            shutil.rmtree(one_path)
        os.makedirs(one_path)

    ## Get Dask Client

    # Deploy a Single-Machine Multi-GPU Cluster
    device_size = device_mem_size(kind="total")
    use_ucx = args.protocol == "ucx"
    if use_ucx:
        UCX_TLS = os.environ.get("UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")
        os.environ["UCX_TLS"] = UCX_TLS
    cluster_options = dict(
        protocol=args.protocol,
        n_workers=len(args.devices.split(",")),
        CUDA_VISIBLE_DEVICES=args.devices,
        device_memory_limit=int(device_size * args.device_limit_frac),
        dashboard_address=":" + args.dashboard_port,
    )
    if use_ucx:
        cluster_options["enable_nvlink"] = True
    cluster = LocalCUDACluster(**cluster_options)

    # Create the distributed client
    client = Client(cluster)
    if args.device_pool_frac > 0.01:
        setup_rmm_pool(client, int(args.device_pool_frac * device_size))

    # Measure the total processing time
    runtime = time.time()

    # The test dataset comes without the label feature
    if args.dataset_type == 'test':
        LABEL_COLUMNS = []

    ##-----------------------------------##
    # Dask rapids converts txt to parquet
    # Dask cudf dataframe = ddf

    ## train/valid txt to parquet
    train_valid_paths = [(train_input, PREPROCESS_DIR_temp_train),
                         (val_input, PREPROCESS_DIR_temp_val)]

    for txt_path, temp_output in train_valid_paths:

        ddf = dask_cudf.read_csv(txt_path,
                                 sep='\t',
                                 names=LABEL_COLUMNS + CONTINUOUS_COLUMNS +
                                 CATEGORICAL_COLUMNS)

        ## Convert label col to FP32
        if args.parquet_format and args.dataset_type == 'train':
            ddf["label"] = ddf['label'].astype('float32')

        # Save it as parquet format for better memory usage
        ddf.to_parquet(temp_output, header=True)
        ##-----------------------------------##

    train_paths = glob.glob(
        os.path.join(PREPROCESS_DIR_temp_train, "*.parquet"))
    valid_paths = glob.glob(os.path.join(PREPROCESS_DIR_temp_val, "*.parquet"))
    if args.criteo_mode == 0:
        proc = nvt.Workflow(cat_names=CROSS_COLUMNS + CATEGORICAL_COLUMNS,
                            cont_names=CONTINUOUS_COLUMNS,
                            label_name=LABEL_COLUMNS,
                            client=client)
        logging.info('Fillmissing processing')
        proc.add_cont_feature(nvt.ops.FillMissing())
        # Feature crosses
        if args.feature_cross_list:
            logging.info('Feature Crossing')
            for col0, col1 in ((pair[0], pair[1]) for pair in feature_pairs):
                ## LambdaOp will automatically add new column with the name of col_name + "_" + op_name for differentiation
                # The callable comes from a factory so each op keeps its own
                # ``col1``; a bare lambda here would late-bind to the last
                # pair's column by the time the workflow runs.
                proc.add_cat_feature(
                    nvt.ops.LambdaOp(op_name=col1,
                                     f=_make_cross_fn(col1),
                                     columns=[col0],
                                     replace=False))
        logging.info('Nomalization processing')
        proc.add_cont_preprocess(nvt.ops.Normalize())
    else:
        proc = nvt.Workflow(cat_names=CROSS_COLUMNS + CATEGORICAL_COLUMNS,
                            cont_names=[],
                            label_name=LABEL_COLUMNS,
                            client=client)
    logging.info('Categorify processing')
    proc.add_cat_preprocess(
        nvt.ops.Categorify(freq_threshold=args.freq_limit,
                           columns=CROSS_COLUMNS + CATEGORICAL_COLUMNS))
    proc.finalize()  # prepare to load the config

    ## Define the output format ##
    output_format = 'parquet' if args.parquet_format else 'hugectr'
    ##--------------------##

    # just for /samples/criteo model
    part_size = int(args.part_mem_frac * device_size)
    train_ds_iterator = nvt.Dataset(train_paths,
                                    engine='parquet',
                                    part_size=part_size)
    valid_ds_iterator = nvt.Dataset(valid_paths,
                                    engine='parquet',
                                    part_size=part_size)

    shuffle = None
    if args.shuffle == "PER_WORKER":
        shuffle = nvt.io.Shuffle.PER_WORKER
    elif args.shuffle == "PER_PARTITION":
        shuffle = nvt.io.Shuffle.PER_PARTITION

    logging.info('Train Datasets Preprocessing.....')

    proc.apply(
        train_ds_iterator,
        output_path=train_output,
        out_files_per_proc=args.out_files_per_proc,
        output_format=output_format,
        shuffle=shuffle,
        num_io_threads=args.num_io_threads,
    )

    ##--------------------##
    # Output slot_size for each categorical feature
    embeddings = nvt.ops.get_embedding_sizes(proc)
    print(embeddings)
    slot_size = [
        embeddings[item][0] for item in CROSS_COLUMNS + CATEGORICAL_COLUMNS
    ]
    print(slot_size)
    ##--------------------##

    logging.info('Valid Datasets Preprocessing.....')
    proc.apply(
        valid_ds_iterator,
        record_stats=False,
        output_path=val_output,
        out_files_per_proc=args.out_files_per_proc,
        output_format=output_format,
        shuffle=shuffle,
        num_io_threads=args.num_io_threads,
    )

    # Output slot_size for each categorical feature (again, after validation)
    embeddings = nvt.ops.get_embedding_sizes(proc)
    print(embeddings)
    slot_size = [
        embeddings[item][0] for item in CROSS_COLUMNS + CATEGORICAL_COLUMNS
    ]
    print(slot_size)
    ##--------------------##

    ## Shutdown clusters
    client.close()
    logging.info('NVTabular processing done')

    runtime = time.time() - runtime

    print("\nDask-NVTabular Criteo Preprocessing")
    print("--------------------------------------")
    print(f"data_path          | {args.data_path}")
    print(f"output_path        | {args.out_path}")
    print(
        f"partition size     | {'%.2f GB'%bytesto(int(args.part_mem_frac * device_size),'g')}"
    )
    print(f"protocol           | {args.protocol}")
    print(f"device(s)          | {args.devices}")
    print(f"rmm-pool-frac      | {(args.device_pool_frac)}")
    print(f"out-files-per-proc | {args.out_files_per_proc}")
    print(f"num_io_threads     | {args.num_io_threads}")
    print(f"shuffle            | {args.shuffle}")
    print("======================================")
    print(f"Runtime[s]         | {runtime}")
    print("======================================\n")
Example #9
0
def run_preprocessing(input_path, base_dir, num_train_days, num_val_days,
                      num_gpus):
    """Fit an NVTabular workflow on Criteo day files and write train/valid sets.

    Parameters
    ----------
    input_path : str
        Directory containing the input files, named ``day_<N>.parquet``.
    base_dir : str
        Root for artifacts; ``test_dask/workdir``, ``test_dask/output`` and
        ``test_dask/stats`` below it are deleted and recreated.
    num_train_days : int
        Days ``0 .. num_train_days - 1`` form the training set.
    num_val_days : int
        The following ``num_val_days`` days form the validation set.
    num_gpus : sequence
        Device identifiers.  They are comma-joined into
        ``CUDA_VISIBLE_DEVICES``; a Dask cluster is deployed only when more
        than one is given.

    The fitted workflow is saved to ``<output>/workflow`` and the categorical
    cardinalities are logged.
    """
    # Define paths to save artifacts
    dask_workdir = os.path.join(base_dir, "test_dask/workdir")
    output_path = os.path.join(base_dir, "test_dask/output")
    stats_path = os.path.join(base_dir, "test_dask/stats")

    logging.info(f"Dask Workdir: {dask_workdir}")
    logging.info(f"Output Path: {output_path}")

    # Make sure we have clean worker, stats and output directories.
    for clean_dir in (dask_workdir, stats_path, output_path):
        if os.path.isdir(clean_dir):
            shutil.rmtree(clean_dir)
        os.makedirs(clean_dir)

    logging.info("Created output directories..")

    # This requires the data to be in this specific format eg. day_0.parquet, day_2.parquet etc.
    fname = 'day_{}.parquet'
    train_paths = [
        os.path.join(input_path, fname.format(day))
        for day in range(num_train_days)
    ]
    valid_paths = [
        os.path.join(input_path, fname.format(day))
        for day in range(num_train_days, num_train_days + num_val_days)
    ]

    logging.info(f"Training data: {train_paths}")
    logging.info(f"Validation data: {valid_paths}")

    # Deploy a Dask Distributed Cluster
    # Single-Machine Multi-GPU Cluster
    protocol = "tcp"  # "tcp" or "ucx"
    # Select devices to place workers on.
    visible_devices = ",".join([str(n) for n in num_gpus])
    device_limit_frac = 0.4  # Spill GPU-Worker memory to host at this limit.
    device_pool_frac = 0.5
    part_mem_frac = 0.05  # Desired maximum size of each partition as a fraction of total GPU memory.

    # All byte budgets are fractions of the total device memory.
    device_size = device_mem_size(kind="total")
    part_size = int(part_mem_frac * device_size)
    logging.info(f"Partition size: {part_size}")

    # Deploy Dask Distributed cluster only if asked for multiple GPUs
    multi_gpu = len(num_gpus) > 1
    if multi_gpu:

        device_limit = int(device_limit_frac * device_size)
        device_pool_size = int(device_pool_frac * device_size)

        logging.info("Checking if any device memory is already occupied..")
        # Warn about devices that already have more than 1 GB in use.
        for dev in visible_devices.split(","):
            fmem = _pynvml_mem_size(kind="free", index=int(dev))
            used = (device_size - fmem) / 1e9
            if used > 1.0:
                warnings.warn(
                    f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
                )

        cluster = LocalCUDACluster(protocol=protocol,
                                   n_workers=len(
                                       visible_devices.split(",")),
                                   CUDA_VISIBLE_DEVICES=visible_devices,
                                   device_memory_limit=device_limit,
                                   local_directory=dask_workdir)

        logging.info("Create the distributed client..")
        # Create the distributed client
        client = Client(cluster)

        logging.info("Initialize memory pools..")

        # Initialize RMM pool on ALL workers
        def _rmm_pool():
            rmm.reinitialize(
                pool_allocator=True,
                # RMM may require the pool size to be a multiple of 256,
                # so round the requested size down accordingly.
                initial_pool_size=(device_pool_size // 256) * 256,
            )

        client.run(_rmm_pool)

    # Preprocessing
    CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1, 14)]
    CATEGORICAL_COLUMNS = ['C' + str(x) for x in range(1, 27)]
    LABEL_COLUMNS = ['label']

    cat_features = CATEGORICAL_COLUMNS >> Categorify(out_path=stats_path)
    cont_features = CONTINUOUS_COLUMNS >> FillMissing() >> Clip(
        min_value=0) >> Normalize()
    features = cat_features + cont_features + LABEL_COLUMNS

    logging.info("Defining a workflow object..")
    if multi_gpu:
        workflow = nvt.Workflow(features, client=client)
    else:
        workflow = nvt.Workflow(features)

    # Output dtypes: categoricals as int64, continuous and label as float32.
    dict_dtypes = {col: np.int64 for col in CATEGORICAL_COLUMNS}
    dict_dtypes.update(
        {col: np.float32 for col in CONTINUOUS_COLUMNS + LABEL_COLUMNS})

    train_dataset = nvt.Dataset(train_paths,
                                engine='parquet',
                                part_size=part_size)
    valid_dataset = nvt.Dataset(valid_paths,
                                engine='parquet',
                                part_size=part_size)

    output_train_dir = os.path.join(output_path, 'train/')
    logging.info(f"Creating train/ directory at: {output_train_dir}")
    os.makedirs(output_train_dir, exist_ok=True)

    output_valid_dir = os.path.join(output_path, 'valid/')
    logging.info(f"Creating valid/ directory at: {output_valid_dir}")
    os.makedirs(output_valid_dir, exist_ok=True)

    logging.info("Workflow Fit..")
    workflow.fit(train_dataset)

    logging.info("Transform Training data..")
    workflow.transform(train_dataset).to_parquet(
        output_path=output_train_dir,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        dtypes=dict_dtypes,
        cats=CATEGORICAL_COLUMNS,
        conts=CONTINUOUS_COLUMNS,
        labels=LABEL_COLUMNS)

    logging.info("Transform Validation data..")
    workflow.transform(valid_dataset).to_parquet(output_path=output_valid_dir,
                                                 dtypes=dict_dtypes,
                                                 cats=CATEGORICAL_COLUMNS,
                                                 conts=CONTINUOUS_COLUMNS,
                                                 labels=LABEL_COLUMNS)

    # Use this cardinalities list as the "slot_size_array" in the HugeCTR
    # training config "dcn_parquet.json".  The embedding sizes are looked up
    # once rather than once per column.
    embedding_sizes = nvt.ops.get_embedding_sizes(workflow)
    cardinalities = [embedding_sizes[col][0] for col in CATEGORICAL_COLUMNS]

    logging.info(
        f"Cardinalities for configuring slot_size_array: {cardinalities}")

    logging.info(f"Saving workflow object at: {output_path + '/workflow'}")
    workflow.save(output_path + '/workflow')

    logging.info("Done!")
Example #10
0
def _nvt_start_cluster(args, device_size):
    """Start the single-machine multi-GPU Dask cluster requested by ``args``.

    Reads ``args.protocol``, ``args.devices`` (comma-separated device ids),
    ``args.device_limit_frac`` and ``args.dashboard_port``.  The number of
    workers equals the number of comma-separated entries in ``args.devices``
    and each worker gets ``int(device_size * args.device_limit_frac)`` as its
    device memory limit.

    For the ``"ucx"`` protocol the ``UCX_TLS`` environment variable is set
    (keeping any value the user already exported) and NVLink is enabled.
    """
    cluster_kwargs = dict(
        protocol=args.protocol,
        CUDA_VISIBLE_DEVICES=args.devices,
        n_workers=len(args.devices.split(",")),
        device_memory_limit=int(device_size * args.device_limit_frac),
        dashboard_address=":" + args.dashboard_port,
    )
    if args.protocol == "ucx":
        # Keep a user-provided transport list, otherwise use the default one.
        os.environ["UCX_TLS"] = os.environ.get(
            "UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")
        cluster_kwargs["enable_nvlink"] = True
    return LocalCUDACluster(**cluster_kwargs)


def _nvt_txt_to_parquet(txt_path, parquet_dir, cast_label):
    """Convert one tab-separated text file into parquet under ``parquet_dir``.

    Columns are named ``LABEL_COLUMNS + CONTINUOUS_COLUMNS +
    CATEGORICAL_COLUMNS``; when ``cast_label`` is truthy the ``label`` column
    is cast to float32 before writing.
    """
    ddf = dask_cudf.read_csv(txt_path,
                             sep='\t',
                             names=LABEL_COLUMNS + CONTINUOUS_COLUMNS +
                             CATEGORICAL_COLUMNS)
    if cast_label:
        ddf["label"] = ddf['label'].astype('float32')

    # Save it as parquet format for better memory usage
    ddf.to_parquet(parquet_dir, header=True)


def _nvt_build_workflow(args, client, feature_pairs):
    """Assemble the NVTabular workflow graph.

    Returns ``(workflow, categorify_op, cross_cat_op)``; the two Categorify
    ops are returned so the caller can query their embedding sizes later.
    Continuous and cross features are only added when
    ``args.criteo_mode == 0``.
    """
    categorify_op = Categorify(freq_threshold=args.freq_limit)
    cat_features = CATEGORICAL_COLUMNS >> categorify_op
    cont_features = CONTINUOUS_COLUMNS >> FillMissing() >> Clip(
        min_value=0) >> Normalize()
    cross_cat_op = Categorify(freq_threshold=args.freq_limit)

    # NOTE: keep the in-place ``+=`` form; it is exactly what the original
    # pipeline relied on for combining the label list with the feature groups.
    features = LABEL_COLUMNS

    if args.criteo_mode == 0:
        features += cont_features
        for pair in feature_pairs:
            col0 = pair[0]
            col1 = pair[1]
            features += col0 >> FeatureCross(col1) >> Rename(
                postfix="_" + col1) >> cross_cat_op

    features += cat_features

    workflow = nvt.Workflow(features, client=client)
    return workflow, categorify_op, cross_cat_op


def _nvt_write_dataset(workflow, dataset, output_path, output_format, conts,
                       dict_dtypes, shuffle, args):
    """Transform ``dataset`` with ``workflow`` and write it to ``output_path``.

    ``output_format == 'hugectr'`` selects ``to_hugectr``; anything else
    selects ``to_parquet`` (which additionally receives ``dict_dtypes``).
    """
    transformed = workflow.transform(dataset)
    common_kwargs = dict(
        cats=CATEGORICAL_COLUMNS + CROSS_COLUMNS,
        conts=conts,
        labels=LABEL_COLUMNS,
        output_path=output_path,
        shuffle=shuffle,
        out_files_per_proc=args.out_files_per_proc,
        num_threads=args.num_io_threads,
    )
    if output_format == 'hugectr':
        transformed.to_hugectr(**common_kwargs)
    else:
        transformed.to_parquet(dtypes=dict_dtypes, **common_kwargs)


def _nvt_print_embedding_sizes(categorify_op, cross_cat_op):
    """Print the slot sizes: cross columns first, then categorical columns."""
    embeddings_dict_cat = categorify_op.get_embedding_sizes(
        CATEGORICAL_COLUMNS)
    embeddings_dict_cross = cross_cat_op.get_embedding_sizes(CROSS_COLUMNS)
    embeddings = [embeddings_dict_cross[c][0] for c in CROSS_COLUMNS
                  ] + [embeddings_dict_cat[c][0] for c in CATEGORICAL_COLUMNS]

    print(embeddings)


def _nvt_print_summary(args, part_size, runtime):
    """Print the end-of-run configuration and runtime table."""
    print("\nDask-NVTabular Criteo Preprocessing")
    print("--------------------------------------")
    print(f"data_path          | {args.data_path}")
    print(f"output_path        | {args.out_path}")
    print(
        f"partition size     | {'%.2f GB' % bytesto(part_size, 'g')}"
    )
    print(f"protocol           | {args.protocol}")
    print(f"device(s)          | {args.devices}")
    print(f"rmm-pool-frac      | {(args.device_pool_frac)}")
    print(f"out-files-per-proc | {args.out_files_per_proc}")
    print(f"num_io_threads     | {args.num_io_threads}")
    print(f"shuffle            | {args.shuffle}")
    print("======================================")
    print(f"Runtime[s]         | {runtime}")
    print("======================================\n")


def process_NVT(args):
    """Preprocess the Criteo text dataset with NVTabular on a Dask-CUDA cluster.

    Steps, in order:

    1. Register cross-feature column names from ``args.feature_cross_list``
       (comma-separated ``A_B`` pairs) in the module-level ``CROSS_COLUMNS``.
    2. Recreate the temporary parquet directories under ``args.out_path``.
    3. Start a ``LocalCUDACluster`` and a ``Client``; optionally set up an RMM
       pool when ``args.device_pool_frac > 0.01``.
    4. Convert ``<data_path>/train/train.txt`` and ``<data_path>/val/test.txt``
       (tab-separated) to parquet.
    5. Fit the workflow on the training data, then write train and validation
       outputs to ``<out_path>/train`` and ``<out_path>/val`` in HugeCTR or
       parquet format (``args.parquet_format``), printing the embedding sizes
       after each export.
    6. Shut the client and cluster down and print a run summary.

    Args:
        args: namespace-like object providing ``feature_cross_list``,
            ``data_path``, ``out_path``, ``protocol``, ``devices``,
            ``device_limit_frac``, ``device_pool_frac``, ``dashboard_port``,
            ``dataset_type``, ``parquet_format``, ``freq_limit``,
            ``criteo_mode``, ``part_mem_frac``, ``shuffle``,
            ``out_files_per_proc`` and ``num_io_threads``.

    Side effects:
        Appends to the module-level ``CROSS_COLUMNS`` (so repeated calls in one
        process accumulate entries) and rebinds ``LABEL_COLUMNS`` to ``[]`` when
        ``args.dataset_type == 'test'``.
    """
    # Declared up front because LABEL_COLUMNS is rebound for label-less data.
    global LABEL_COLUMNS

    # Parse the cross list once; it is reused when building the workflow.
    feature_pairs = []
    if args.feature_cross_list:
        feature_pairs = [
            pair.split("_") for pair in args.feature_cross_list.split(",")
        ]
        for pair in feature_pairs:
            CROSS_COLUMNS.append(pair[0] + '_' + pair[1])

    logging.info('NVTabular processing')
    train_input = os.path.join(args.data_path, "train/train.txt")
    val_input = os.path.join(args.data_path, "val/test.txt")
    temp_train_dir = os.path.join(args.out_path,
                                  'train/temp-parquet-after-conversion')
    temp_val_dir = os.path.join(args.out_path,
                                'val/temp-parquet-after-conversion')
    train_output = os.path.join(args.out_path, "train")
    val_output = os.path.join(args.out_path, "val")

    # Make sure we have a clean parquet space for cudf conversion
    for temp_dir in (temp_train_dir, temp_val_dir):
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
        os.mkdir(temp_dir)

    # Deploy a Single-Machine Multi-GPU Cluster
    device_size = device_mem_size(kind="total")
    cluster = _nvt_start_cluster(args, device_size)
    client = None
    try:
        # Create the distributed client
        client = Client(cluster)
        if args.device_pool_frac > 0.01:
            setup_rmm_pool(client, int(args.device_pool_frac * device_size))

        # Start of the measured processing time
        runtime = time.time()

        # Test dataset comes without the label feature
        if args.dataset_type == 'test':
            LABEL_COLUMNS = []

        # Train/valid txt -> parquet; label is cast to FP32 only for parquet
        # output of a training dataset.
        cast_label = args.parquet_format and args.dataset_type == 'train'
        for txt_path, parquet_dir in ((train_input, temp_train_dir),
                                      (val_input, temp_val_dir)):
            _nvt_txt_to_parquet(txt_path, parquet_dir, cast_label)

        train_paths = glob.glob(os.path.join(temp_train_dir, "*.parquet"))
        valid_paths = glob.glob(os.path.join(temp_val_dir, "*.parquet"))

        workflow, categorify_op, cross_cat_op = _nvt_build_workflow(
            args, client, feature_pairs)

        logging.info("Preprocessing")

        output_format = 'parquet' if args.parquet_format else 'hugectr'

        # just for /samples/criteo model
        part_size = int(args.part_mem_frac * device_size)
        train_ds_iterator = nvt.Dataset(train_paths,
                                        engine='parquet',
                                        part_size=part_size)
        valid_ds_iterator = nvt.Dataset(valid_paths,
                                        engine='parquet',
                                        part_size=part_size)

        shuffle = None
        if args.shuffle == "PER_WORKER":
            shuffle = nvt.io.Shuffle.PER_WORKER
        elif args.shuffle == "PER_PARTITION":
            shuffle = nvt.io.Shuffle.PER_PARTITION

        logging.info('Train Datasets Preprocessing.....')

        # Output dtypes; later groups override earlier ones on a name clash.
        dict_dtypes = dict.fromkeys(CATEGORICAL_COLUMNS, np.int64)
        if not args.criteo_mode:
            dict_dtypes.update(dict.fromkeys(CONTINUOUS_COLUMNS, np.float32))
        dict_dtypes.update(dict.fromkeys(CROSS_COLUMNS, np.int64))
        dict_dtypes.update(dict.fromkeys(LABEL_COLUMNS, np.float32))

        conts = CONTINUOUS_COLUMNS if not args.criteo_mode else []

        workflow.fit(train_ds_iterator)

        _nvt_write_dataset(workflow, train_ds_iterator, train_output,
                           output_format, conts, dict_dtypes, shuffle, args)

        # Getting slot size
        _nvt_print_embedding_sizes(categorify_op, cross_cat_op)

        logging.info('Valid Datasets Preprocessing.....')

        _nvt_write_dataset(workflow, valid_ds_iterator, val_output,
                           output_format, conts, dict_dtypes, shuffle, args)

        _nvt_print_embedding_sizes(categorify_op, cross_cat_op)
    finally:
        # Shutdown client and cluster even when preprocessing fails, so GPU
        # workers and the dashboard port are not left behind.
        if client is not None:
            client.close()
        cluster.close()

    logging.info('NVTabular processing done')

    runtime = time.time() - runtime

    _nvt_print_summary(args, part_size, runtime)
Example #11
0
def nvt_etl(
    data_path,
    out_path,
    devices,
    protocol,
    device_limit_frac,
    device_pool_frac,
    part_mem_frac,
    cats,
    conts,
    labels,
    out_files_per_proc,
):
    """Run an NVTabular ETL over the files in ``data_path`` and write parquet.

    The regular files directly inside ``data_path`` are listed in sorted order;
    the first 90% (by count, rounded down) form the training set and the rest
    the validation set.  The workflow (Categorify on ``cats``; FillMissing,
    Clip(min_value=0) and LogOp on ``conts``; ``labels`` passed through) is
    fit on the training set and applied to both sets.

    Directory layout created under ``out_path`` (existing ``workdir``,
    ``stats`` and ``output`` sub-directories are deleted first):

    * ``workdir/``          - Dask worker space
    * ``stats/``            - Categorify statistics
    * ``output/train/``     - transformed training parquet
    * ``output/valid/``     - transformed validation parquet
    * ``output/workflow``   - saved workflow

    Args:
        data_path: input directory (one trailing ``/`` is tolerated).
        out_path: base output directory (one trailing ``/`` is tolerated).
        devices: comma-separated device indices, e.g. ``"0,1"``.
        protocol: communication protocol forwarded to ``managed_client``.
        device_limit_frac: fraction of total device memory used as the
            device memory limit passed to ``managed_client``.
        device_pool_frac: fraction of total device memory for the RMM pool;
            the pool is only set up when this is greater than 0.01.
        part_mem_frac: fraction of total device memory used as the dataset
            partition size.
        cats: categorical column names (written as int64).
        conts: continuous column names (written as float32).
        labels: label column names (written as float32).
        out_files_per_proc: forwarded to ``to_parquet``.

    Returns:
        The fitted workflow.
    """
    # Set up data paths; endswith() also copes with an empty string, where
    # indexing [-1] would raise IndexError.
    input_path = data_path[:-1] if data_path.endswith("/") else data_path
    base_dir = out_path[:-1] if out_path.endswith("/") else out_path
    dask_workdir = os.path.join(base_dir, "workdir")
    output_path = os.path.join(base_dir, "output")
    stats_path = os.path.join(base_dir, "stats")
    output_train_dir = os.path.join(output_path, "train/")
    output_valid_dir = os.path.join(output_path, "valid/")

    # Make sure we have a clean worker space for Dask
    if os.path.isdir(dask_workdir):
        shutil.rmtree(dask_workdir)
    os.makedirs(dask_workdir)

    # Make sure we have a clean stats space for Dask
    if os.path.isdir(stats_path):
        shutil.rmtree(stats_path)
    os.mkdir(stats_path)

    # Make sure we have a clean output path
    if os.path.isdir(output_path):
        shutil.rmtree(output_path)
    os.mkdir(output_path)
    os.mkdir(output_train_dir)
    os.mkdir(output_valid_dir)

    # Get train/valid files.  os.listdir() returns entries in arbitrary order,
    # so sort them to make the 90/10 split reproducible between runs.
    input_files = sorted(
        os.path.join(input_path, f) for f in os.listdir(input_path)
        if os.path.isfile(os.path.join(input_path, f)))
    n_files = int(len(input_files) * 0.9)
    train_paths = input_files[:n_files]
    valid_paths = input_files[n_files:]

    # Force dtypes for HugeCTR usage; later groups win on a name clash.
    dict_dtypes = dict.fromkeys(cats, np.int64)
    dict_dtypes.update(dict.fromkeys(conts, np.float32))
    dict_dtypes.update(dict.fromkeys(labels, np.float32))

    # Derive the absolute limits from the total device memory size
    device_size = device_mem_size(kind="total")
    device_limit = int(device_limit_frac * device_size)
    device_pool_size = int(device_pool_frac * device_size)
    part_size = int(part_mem_frac * device_size)

    # Check if any device memory is already occupied
    for dev in devices.split(","):
        fmem = _pynvml_mem_size(kind="free", index=int(dev))
        used = (device_size - fmem) / 1e9
        if used > 1.0:
            warnings.warn(
                f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
            )

    # Setup dask cluster and perform ETL
    with managed_client(dask_workdir, devices, device_limit,
                        protocol) as client:
        # Setup RMM pool
        if device_pool_frac > 0.01:
            setup_rmm_pool(client, device_pool_size)

        # Define Dask NVTabular "Workflow"
        cont_features = conts >> ops.FillMissing() >> ops.Clip(
            min_value=0) >> ops.LogOp()

        cat_features = cats >> ops.Categorify(out_path=stats_path,
                                              max_size=10000000)

        workflow = Workflow(cat_features + cont_features + labels,
                            client=client)

        train_dataset = Dataset(train_paths,
                                engine="parquet",
                                part_size=part_size)
        valid_dataset = Dataset(valid_paths,
                                engine="parquet",
                                part_size=part_size)

        # Statistics come from the training split only
        workflow.fit(train_dataset)

        # Both splits are written with identical settings
        for dataset, target_dir in ((train_dataset, output_train_dir),
                                    (valid_dataset, output_valid_dir)):
            workflow.transform(dataset).to_parquet(
                output_path=target_dir,
                shuffle=nvt_io.Shuffle.PER_WORKER,
                dtypes=dict_dtypes,
                cats=cats,
                conts=conts,
                labels=labels,
                out_files_per_proc=out_files_per_proc,
            )

        workflow.save(os.path.join(output_path, "workflow"))

        return workflow