# Assumed imports for these NVTabular (0.x) snippets; the tests also rely on
# the suite's pytest fixtures (client, tmpdir, datasets, engine, ...) and
# helpers such as mycols_pq, mycols_csv, allcols_csv, assert_eq, and
# _dummy_op_logic. The script examples further down additionally use shutil,
# time, warnings, dask.distributed, and dask_cuda utilities.
import glob
import math
import os

import cudf
import dask_cudf

from nvtabular import Dataset, Workflow, ops


def test_dask_normalize(client, tmpdir, datasets, engine):

    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(ops.Normalize())
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    # Make sure we collected accurate statistics
    means = df0[cont_names].mean()
    stds = df0[cont_names].std()
    counts = df0[cont_names].count()
    for name in cont_names:
        assert math.isclose(means[name], processor.stats["means"][name], rel_tol=1e-3)
        assert math.isclose(stds[name], processor.stats["stds"][name], rel_tol=1e-3)
        assert math.isclose(counts[name], processor.stats["counts"][name], rel_tol=1e-3)

    # New (normalized) means should all be close to zero
    new_means = result[cont_names].mean()
    for name in cont_names:
        assert abs(new_means[name]) < 1e-3
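
# A self-contained sketch (pandas, so it runs without a GPU) of the
# standard-score normalization that ops.Normalize applies to each continuous
# column, assuming the usual (x - mean) / std definition:
import pandas as pd

df = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0]})
normalized = (df["x"] - df["x"].mean()) / df["x"].std()
assert abs(normalized.mean()) < 1e-12  # normalized mean is ~0
assert abs(normalized.std() - 1.0) < 1e-12  # normalized std is ~1
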
def test_cats_and_groupby_stats(client, tmpdir, datasets, part_mem_fraction,
                                use_client):

    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client if use_client else None,
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
    )

    processor.add_preprocess(
        ops.Categorify(out_path=str(tmpdir), freq_threshold=10, on_host=True))

    processor.add_cat_feature(
        ops.JoinGroupby(cont_names=cont_names,
                        stats=["count", "sum"],
                        out_path=str(tmpdir)))

    processor.finalize()
    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)

    processor.apply(dataset, output_path=str(tmpdir))
    result = processor.get_ddf().compute()

    assert "name-cat_x_sum" in result.columns
    assert "name-string_x_sum" in result.columns
def test_dask_median_dummyop(client, tmpdir, datasets, engine):

    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    class DummyOp(ops.DFOperator):

        default_in, default_out = "continuous", "continuous"

        @property
        def req_stats(self):
            return [ops.Median()]

        def op_logic(self, *args, **kwargs):
            return _dummy_op_logic(*args, _id=self._id, **kwargs)

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(DummyOp())
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    # TODO: Improve the accuracy! "t-digest" via crick could help, but the
    #       current version seems to have cupy/numpy problems here
    medians = result[cont_names].quantile(q=0.5)
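    # abs_tol is used for "x" and "y", whose medians are expected to sit near
    # zero (a relative tolerance is meaningless there); rel_tol is used for
    # the large-valued "id" column.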
    assert math.isclose(medians["x"], processor.stats["medians"]["x"], abs_tol=1e-1)
    assert math.isclose(medians["y"], processor.stats["medians"]["y"], abs_tol=1e-1)
    assert math.isclose(medians["id"], processor.stats["medians"]["id"], rel_tol=1e-2)
def test_dask_minmax_dummyop(client, tmpdir, datasets, engine):

    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    class DummyOp(ops.DFOperator):

        default_in, default_out = "continuous", "continuous"

        @property
        def req_stats(self):
            return [ops.MinMax()]

        def op_logic(self, *args, **kwargs):
            return _dummy_op_logic(*args, _id=self._id, **kwargs)

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(DummyOp())
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    assert math.isclose(result.x.min(), processor.stats["mins"]["x"], rel_tol=1e-3)
    assert math.isclose(result.y.min(), processor.stats["mins"]["y"], rel_tol=1e-3)
    assert math.isclose(result.id.min(), processor.stats["mins"]["id"], rel_tol=1e-3)
    assert math.isclose(result.x.max(), processor.stats["maxs"]["x"], rel_tol=1e-3)
    assert math.isclose(result.y.max(), processor.stats["maxs"]["y"], rel_tol=1e-3)
    assert math.isclose(result.id.max(), processor.stats["maxs"]["id"], rel_tol=1e-3)
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):

    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )

    processor.add_preprocess(
        ops.GroupBy(cont_names=cont_names, stats=["count", "sum", "std"], out_path=str(tmpdir))
    )
    processor.finalize()

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    processor.apply(dataset)
    result = processor.get_ddf().compute(scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check "count"
    assert_eq(
        result[["name-cat", "name-cat_count"]]
        .drop_duplicates()
        .sort_values("name-cat")["name-cat_count"],
        df0.groupby("name-cat").agg({"x": "count"})["x"],
        check_index=False,
        check_dtype=False,  # May get int64 vs int32
        check_names=False,
    )

    # Check "std"
    assert_eq(
        result[["name-string", "name-string_x_std"]]
        .drop_duplicates()
        .sort_values("name-string")["name-string_x_std"],
        df0.groupby("name-string").agg({"x": "std"})["x"],
        check_index=False,
        check_names=False,
    )
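
# The drop_duplicates() trick above, in self-contained pandas terms: a joined
# per-group statistic is constant within each group, so deduplicating the
# (key, stat) pairs recovers the groupby table.
import pandas as pd

df = pd.DataFrame({"g": ["a", "a", "b", "b"], "x": [1.0, 3.0, 5.0, 9.0]})
df["g_x_std"] = df.groupby("g")["x"].transform("std")
per_group = df[["g", "g_x_std"]].drop_duplicates().set_index("g")["g_x_std"]
pd.testing.assert_series_equal(per_group, df.groupby("g")["x"].std(), check_names=False)
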
def main(args):
    """Multi-GPU Criteo/DLRM Preprocessing Benchmark

    This benchmark is designed to measure the time required to preprocess
    the Criteo (1TB) dataset for Facebook’s DLRM model.  The user must specify
    the path of the raw dataset (using the `--data-path` flag), as well as the
    output directory for all temporary/final data (using the `--out-path` flag).

    Example Usage
    -------------

    python dask-nvtabular-criteo-benchmark.py
                        --data-path /path/to/criteo_parquet --out-path /out/dir/


    Dataset Requirements (Parquet)
    ------------------------------

    This benchmark is designed with a parquet-formatted dataset in mind.
    While a CSV-formatted dataset can be processed by NVTabular, converting
    to parquet will yield significantly better performance.  To convert your
    dataset, try using the `optimize_criteo.ipynb` notebook (also located
    in `NVTabular/examples/`).

    For a detailed parameter overview, see `NVTabular/examples/MultiGPUBench.md`.
    """

    # Input
    data_path = args.data_path
    freq_limit = args.freq_limit
    out_files_per_proc = args.out_files_per_proc
    high_card_columns = args.high_cards.split(",")
    dashboard_port = args.dashboard_port
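    # For UCX, request explicit transports (TCP fallback, intra-device CUDA
    # copies, CUDA IPC between GPUs, and the sockcm connection manager),
    # deferring to any UCX_TLS value already set in the environment.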
    if args.protocol == "ucx":
        UCX_TLS = os.environ.get("UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")
        os.environ["UCX_TLS"] = UCX_TLS

    # Cleanup output directory
    BASE_DIR = args.out_path
    dask_workdir = os.path.join(BASE_DIR, "workdir")
    output_path = os.path.join(BASE_DIR, "output")
    stats_path = os.path.join(BASE_DIR, "stats")
    if not os.path.isdir(BASE_DIR):
        os.mkdir(BASE_DIR)
    for dir_path in (dask_workdir, output_path, stats_path):
        if os.path.isdir(dir_path):
            shutil.rmtree(dir_path)
        os.mkdir(dir_path)

    # Use Criteo dataset by default (for now)
    cont_names = (args.cont_names.split(",")
                  if args.cont_names else ["I" + str(x) for x in range(1, 14)])
    cat_names = (args.cat_names.split(",")
                 if args.cat_names else ["C" + str(x) for x in range(1, 27)])
    label_name = ["label"]

    # Specify Categorify/GroupbyStatistics options
    tree_width = {}
    cat_cache = {}
    for col in cat_names:
        if col in high_card_columns:
            tree_width[col] = args.tree_width
            cat_cache[col] = args.cat_cache_high
        else:
            tree_width[col] = 1
            cat_cache[col] = args.cat_cache_low

    # Use total device size to calculate args.device_limit_frac
    device_size = device_mem_size(kind="total")
    device_limit = int(args.device_limit_frac * device_size)
    device_pool_size = int(args.device_pool_frac * device_size)
    part_size = int(args.part_mem_frac * device_size)
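    # Worked example: on a 32 GB GPU with --device-limit-frac 0.8,
    # --device-pool-frac 0.9, and --part-mem-frac 0.125, this yields a
    # 25.6 GB spill threshold, a 28.8 GB RMM pool, and 4 GB partitions.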

    # Parse shuffle option
    shuffle = None
    if args.shuffle == "PER_WORKER":
        shuffle = nvt_io.Shuffle.PER_WORKER
    elif args.shuffle == "PER_PARTITION":
        shuffle = nvt_io.Shuffle.PER_PARTITION

    # Check if any device memory is already occupied
    for dev in args.devices.split(","):
        fmem = _pynvml_mem_size(kind="free", index=int(dev))
        used = (device_size - fmem) / 1e9
        if used > 1.0:
            warnings.warn(
                f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
            )

    # Setup LocalCUDACluster
    if args.protocol == "tcp":
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devices,
            device_memory_limit=device_limit,
            local_directory=dask_workdir,
            dashboard_address=":" + dashboard_port,
        )
    else:
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devices,
            enable_nvlink=True,
            device_memory_limit=device_limit,
            local_directory=dask_workdir,
            dashboard_address=":" + dashboard_port,
        )
    client = Client(cluster)

    # Setup RMM pool
    if args.device_pool_frac > 0.01:
        setup_rmm_pool(client, device_pool_size)

    # Define Dask NVTabular "Workflow"
    processor = Workflow(cat_names=cat_names,
                         cont_names=cont_names,
                         label_name=label_name,
                         client=client)
    if args.normalize:
        processor.add_feature([ops.FillMissing(), ops.Normalize()])
    else:
        processor.add_feature(
            [ops.FillMissing(),
             ops.Clip(min_value=0),
             ops.LogOp()])
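    # Categorify's search_sorted encoding path is only valid when no
    # frequency threshold filters the categories, hence `not freq_limit`.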
    processor.add_preprocess(
        ops.Categorify(
            out_path=stats_path,
            tree_width=tree_width,
            cat_cache=cat_cache,
            freq_threshold=freq_limit,
            search_sorted=not freq_limit,
            on_host=not args.cats_on_device,
        ))
    processor.finalize()

    dataset = Dataset(data_path, "parquet", part_size=part_size)

    # Execute the dask graph
    runtime = time.time()
    if args.profile is not None:
        with performance_report(filename=args.profile):
            processor.apply(
                dataset,
                shuffle=shuffle,
                out_files_per_proc=out_files_per_proc,
                output_path=output_path,
                num_io_threads=args.num_io_threads,
            )
    else:
        processor.apply(
            dataset,
            num_io_threads=args.num_io_threads,
            shuffle=shuffle,
            out_files_per_proc=out_files_per_proc,
            output_path=output_path,
        )
    runtime = time.time() - runtime

    print("\nDask-NVTabular DLRM/Criteo benchmark")
    print("--------------------------------------")
    print(f"partition size     | {part_size}")
    print(f"protocol           | {args.protocol}")
    print(f"device(s)          | {args.devices}")
    print(f"rmm-pool-frac      | {(args.device_pool_frac)}")
    print(f"out-files-per-proc | {args.out_files_per_proc}")
    print(f"num_io_threads     | {args.num_io_threads}")
    print(f"shuffle            | {args.shuffle}")
    print(f"cats-on-device     | {args.cats_on_device}")
    print("======================================")
    print(f"Runtime[s]         | {runtime}")
    print("======================================\n")

    client.close()
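
# A minimal, hypothetical entry point for the benchmark above; the real
# script's parser covers more options (see NVTabular/examples/MultiGPUBench.md),
# and the defaults here are illustrative only. Flag names mirror the args.*
# attributes that main() reads.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(
        description="Multi-GPU Criteo/DLRM Preprocessing Benchmark")
    parser.add_argument("--data-path", required=True)
    parser.add_argument("--out-path", required=True)
    parser.add_argument("--protocol", choices=["tcp", "ucx"], default="tcp")
    parser.add_argument("--devices", default="0")
    parser.add_argument("--n-workers", type=int, default=1)
    parser.add_argument("--device-limit-frac", type=float, default=0.8)
    parser.add_argument("--device-pool-frac", type=float, default=0.9)
    parser.add_argument("--part-mem-frac", type=float, default=0.125)
    parser.add_argument("--freq-limit", type=int, default=0)
    parser.add_argument("--out-files-per-proc", type=int, default=8)
    parser.add_argument("--high-cards", default="")
    parser.add_argument("--tree-width", type=int, default=8)
    parser.add_argument("--cat-cache-high", default="host")
    parser.add_argument("--cat-cache-low", default="device")
    parser.add_argument("--cont-names", default=None)
    parser.add_argument("--cat-names", default=None)
    parser.add_argument("--shuffle", default="PER_PARTITION")
    parser.add_argument("--normalize", action="store_true")
    parser.add_argument("--cats-on-device", action="store_true")
    parser.add_argument("--num-io-threads", type=int, default=0)
    parser.add_argument("--dashboard-port", default="8787")
    parser.add_argument("--profile", default=None)
    return parser.parse_args()

if __name__ == "__main__":
    main(parse_args())
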
def test_dask_workflow_api_dlrm(client, tmpdir, datasets, freq_threshold,
                                part_mem_fraction, engine, cat_cache, on_host,
                                shuffle):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    elif engine == "csv":
        df1 = cudf.read_csv(paths[0], header=0)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=0)[mycols_csv]
    else:
        df1 = cudf.read_csv(paths[0], names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], names=allcols_csv)[mycols_csv]
    df0 = cudf.concat([df1, df2], axis=0)

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(client=client,
                         cat_names=cat_names,
                         cont_names=cont_names,
                         label_name=label_name)

    processor.add_feature(
        [ops.FillMissing(),
         ops.Clip(min_value=0),
         ops.LogOp()])
    processor.add_preprocess(
        ops.Categorify(
            freq_threshold=freq_threshold,
            out_path=str(tmpdir),
            cat_cache=cat_cache,
            on_host=on_host,
        ))
    processor.finalize()

    if engine in ("parquet", "csv"):
        dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    else:
        dataset = Dataset(paths,
                          names=allcols_csv,
                          part_mem_fraction=part_mem_fraction)
    output_path = os.path.join(tmpdir, "processed")
    processor.apply(dataset, output_path=output_path, shuffle=shuffle)

    # Can still access the final ddf if we didn't shuffle
    if not shuffle:
        result = processor.get_ddf().compute()
        assert len(df0) == len(result)
        assert result["x"].min() == 0.0
        assert result["x"].isna().sum() == 0
        assert result["y"].min() == 0.0
        assert result["y"].isna().sum() == 0

        # Check category counts
        cat_expect = df0.groupby("name-string").agg({
            "name-string": "count"
        }).reset_index(drop=True)
        cat_result = (result.groupby("name-string").agg({
            "name-string": "count"
        }).reset_index(drop=True))
        if freq_threshold:
            cat_expect = cat_expect[
                cat_expect["name-string"] >= freq_threshold]
            # Note that we may need to skip the 0th element in result (null mapping)
            assert_eq(
                cat_expect,
                cat_result.iloc[1:]
                if len(cat_result) > len(cat_expect) else cat_result,
                check_index=False,
            )
        else:
            assert_eq(cat_expect, cat_result)

        # Read back from disk
        df_disk = dask_cudf.read_parquet(output_path, index=False).compute()
        for col in df_disk:
            assert_eq(result[col], df_disk[col])

    else:
        # Read back from disk
        df_disk = dask_cudf.read_parquet(output_path, index=False).compute()
        assert len(df0) == len(df_disk)
def main(args):

    # Input
    data_path = args.data_path
    out_path = args.out_path
    freq_limit = args.freq_limit
    out_files_per_proc = args.splits
    if args.protocol == "ucx":
        os.environ["UCX_TLS"] = "tcp,cuda_copy,cuda_ipc,sockcm"

    # Use Criteo dataset by default (for now)
    cont_names = (args.cont_names.split(",")
                  if args.cont_names else ["I" + str(x) for x in range(1, 14)])
    cat_names = (args.cat_names.split(",")
                 if args.cat_names else ["C" + str(x) for x in range(1, 27)])
    label_name = ["label"]

    if args.cat_splits:
        tree_width = {
            name: int(s)
            for name, s in zip(cat_names, args.cat_splits.split(","))
        }
    else:
        tree_width = {col: 1 for col in cat_names}
        if args.cat_names is None:
            # Using Criteo... Use more hash partitions for
            # known high-cardinality columns
            tree_width["C20"] = 8
            tree_width["C1"] = 8
            tree_width["C22"] = 4
            tree_width["C10"] = 4
            tree_width["C21"] = 2
            tree_width["C11"] = 2
            tree_width["C23"] = 2
            tree_width["C12"] = 2

    # Specify categorical caching location
    cat_cache = None
    if args.cat_cache:
        cat_cache = args.cat_cache.split(",")
        if len(cat_cache) == 1:
            cat_cache = cat_cache[0]
        else:
            # If user is specifying a list of options,
            # they must specify an option for every cat column
            assert len(cat_names) == len(cat_cache)
    if isinstance(cat_cache, str):
        cat_cache = {col: cat_cache for col in cat_names}
    elif isinstance(cat_cache, list):
        cat_cache = {name: c for name, c in zip(cat_names, cat_cache)}
    else:
        # Criteo/DLRM Defaults
        cat_cache = {col: "device" for col in cat_names}
        if args.cat_names is None:
            cat_cache["C20"] = "host"
            cat_cache["C1"] = "host"
            # Only need to cache the largest two on a dgx-2
            if args.n_workers < 16:
                cat_cache["C22"] = "host"
                cat_cache["C10"] = "host"

    # Use total device size to calculate args.device_limit_frac
    device_size = device_mem_size(kind="total")
    device_limit = int(args.device_limit_frac * device_size)
    device_pool_size = int(args.device_pool_frac * device_size)
    part_size = int(args.part_mem_frac * device_size)

    # Setup LocalCUDACluster
    if args.protocol == "tcp":
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devs,
            device_memory_limit=device_limit,
            local_directory=args.dask_workspace,
            dashboard_address=":3787",
        )
    else:
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devs,
            enable_nvlink=True,
            device_memory_limit=device_limit,
            local_directory=args.dask_workspace,
            dashboard_address=":3787",
        )
    client = Client(cluster)

    # Setup RMM pool
    if not args.no_rmm_pool:
        setup_rmm_pool(client, device_pool_size)

    # Define Dask NVTabular "Workflow"
    processor = Workflow(cat_names=cat_names,
                         cont_names=cont_names,
                         label_name=label_name,
                         client=client)
    processor.add_feature([ops.ZeroFill(), ops.LogOp()])
    processor.add_preprocess(
        ops.Categorify(
            out_path=out_path,
            tree_width=tree_width,
            cat_cache=cat_cache,
            freq_threshold=freq_limit,
            on_host=args.cat_on_host,
        ))
    processor.finalize()

    dataset = Dataset(data_path, "parquet", part_size=part_size)

    # Execute the dask graph
    runtime = time.time()
    if args.profile is not None:
        with performance_report(filename=args.profile):
            processor.apply(
                dataset,
                shuffle="full" if args.worker_shuffle else "partial",
                out_files_per_proc=out_files_per_proc,
                output_path=out_path,
            )
    else:
        processor.apply(
            dataset,
            shuffle="full" if args.worker_shuffle else "partial",
            out_files_per_proc=out_files_per_proc,
            output_path=out_path,
        )
    runtime = time.time() - runtime

    print("\nDask-NVTabular DLRM/Criteo benchmark")
    print("--------------------------------------")
    print(f"partition size     | {part_size}")
    print(f"protocol           | {args.protocol}")
    print(f"device(s)          | {args.devs}")
    print(f"rmm-pool           | {(not args.no_rmm_pool)}")
    print(f"out_files_per_proc | {args.splits}")
    print(f"worker-shuffle     | {args.worker_shuffle}")
    print("======================================")
    print(f"Runtime[s]         | {runtime}")
    print("======================================\n")

    client.close()
def preprocess_criteo_parquet(
    input_path: str,
    output_path: str,
    client,
    frequency_threshold: int,
):
    train_days = [str(x) for x in CRITEO_TRAIN_DAYS]
    train_files = [
        os.path.join(input_path, x) for x in os.listdir(input_path)
        if x.startswith("day") and x.split(".")[0].split("_")[-1] in train_days
    ]
    valid_file = os.path.join(input_path, "day_23.part2.parquet")
    test_file = os.path.join(input_path, "day_23.part1.parquet")

    all_set = train_files + [valid_file] + [test_file]

    print(all_set, train_files, valid_file, test_file)
    print("Creating Workflow Object")

    workflow = Workflow(cat_names=CRITEO_CATEGORICAL_COLUMNS,
                        cont_names=CRITEO_CONTINUOUS_COLUMNS,
                        label_name=CRITEO_CLICK_COLUMNS)

    # We want missing values to come out as 0 and present values x as
    # log(x + 3). Filling missing values with -2 and then adding 2 maps
    # them to 0, so LogOp (which computes log(1 + x)) yields log(1) = 0
    # for missing values and log(x + 3) for present ones.
    workflow.add_cont_feature([
        FillMissing(fill_val=-2.0),
        LambdaOp(op_name='Add3ButMinusOneCauseLogAddsOne',
                 f=lambda col, _: col.add(2.0)),
        LogOp(),  # Log(1+x)
    ])
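    # Worked example: a present value x = 7 becomes 7 + 2 = 9, and LogOp
    # yields log(1 + 9) = log(10); a missing value becomes -2 + 2 = 0,
    # yielding log(1 + 0) = 0.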

    workflow.add_cat_preprocess(
        Categorify(freq_threshold=frequency_threshold, out_path=output_path))

    workflow.finalize()

    print("Creating Dataset Iterator")
    all_ds = Dataset(all_set,
                     engine="parquet",
                     part_mem_fraction=ALL_DS_MEM_FRAC)
    trains_ds = Dataset(train_files,
                        engine="parquet",
                        part_mem_fraction=TRAIN_DS_MEM_FRAC)
    valid_ds = Dataset(valid_file,
                       engine="parquet",
                       part_mem_fraction=VALID_DS_MEM_FRAC)
    test_ds = Dataset(test_file,
                      engine="parquet",
                      part_mem_fraction=TEST_DS_MEM_FRAC)

    print("Running apply")
    out_train = os.path.join(output_path, "train")
    out_valid = os.path.join(output_path, "validation")
    out_test = os.path.join(output_path, "test")

    start = time()
    workflow.update_stats(all_ds)
    print(f"Gathering statistics time: {time() - start}")

    start = time()
    workflow.apply(trains_ds, record_stats=False, output_path=out_train)
    print(f"train preprocess time: {time() - start}")

    start = time()
    workflow.apply(valid_ds, record_stats=False, output_path=out_valid)
    print(f"valid preprocess time: {time() - start}")

    start = time()
    workflow.apply(test_ds, record_stats=False, output_path=out_test)
    print(f"test preprocess time: {time() - start}")

    save_model_size_config(workflow, output_path)
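
# A minimal, hypothetical driver for preprocess_criteo_parquet, assuming the
# CRITEO_* constants and the *_MEM_FRAC fractions are defined elsewhere in
# this module; the paths below are placeholders.
if __name__ == "__main__":
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster

    with LocalCUDACluster() as cluster, Client(cluster) as client:
        preprocess_criteo_parquet(
            input_path="/data/criteo_parquet",        # placeholder
            output_path="/data/criteo_preprocessed",  # placeholder
            client=client,
            frequency_threshold=15,
        )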