Python Workflow.fitの例、nvtabular.Workflow.fit Pythonの例

コード例 #1

0

ファイルを表示

ファイル: test_workflow.py プロジェクト: miguelusque/NVTabular

def test_chaining_2():
    gdf = cudf.DataFrame({
        "A": [1, 2, 2, 9, 6, np.nan, 3],
        "B": [2, np.nan, 4, 7, 7, 2, 5],
        "C": ["a", "b", "c", np.nan, np.nan, "g", "k"],
    })

    cat_names = ["C"]
    cont_names = ["A", "B"]
    label_name = []

    all_features = (cat_names + cont_names >> ops.LambdaOp(
        f=lambda col: col.isnull()) >> ops.Rename(postfix="_isnull"))
    cat_features = cat_names >> ops.Categorify()

    workflow = Workflow(all_features + cat_features + label_name)

    dataset = nvt.Dataset(gdf, engine="parquet")

    workflow.fit(dataset)

    result = workflow.transform(dataset).to_ddf().compute()

    assert all(x in list(result.columns)
               for x in ["A_isnull", "B_isnull", "C_isnull"])
    assert (x in result["C"].unique()
            for x in set(gdf["C"].dropna().to_arrow()))

コード例 #2

0

ファイルを表示

ファイル: test_workflow.py プロジェクト: miguelusque/NVTabular

def test_chaining_3():
    gdf_test = cudf.DataFrame({
        "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
        "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
        "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
        "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
    })

    platform_features = ["platform"] >> ops.Dropna()
    joined = ["ad_id"] >> ops.JoinGroupby(cont_cols=["clicked"],
                                          stats=["sum", "count"])
    joined_lambda = (
        joined >> ops.LambdaOp(f=lambda col, gdf: col / gdf["ad_id_count"]) >>
        ops.Rename(postfix="_ctr"))

    workflow = Workflow(platform_features + joined + joined_lambda)

    dataset = nvt.Dataset(gdf_test, engine="parquet")

    workflow.fit(dataset)

    result = workflow.transform(dataset).to_ddf().compute()

    assert all(
        x in result.columns
        for x in ["ad_id_count", "ad_id_clicked_sum_ctr", "ad_id_clicked_sum"])

コード例 #3

0

ファイルを表示

ファイル: test_workflow.py プロジェクト: thibaultcharrin/NVTabular

def test_workflow_generate_columns(tmpdir, use_parquet):
    out_path = str(tmpdir.mkdir("processed"))
    path = str(tmpdir.join("simple.parquet"))

    # Stripped down dataset with geo_locaiton codes like in outbrains
    df = nvt.dispatch._make_df(
        {"geo_location": ["US>CA", "CA>BC", "US>TN>659"]})

    # defining a simple workflow that strips out the country code from the first two digits of the
    # geo_location code and sticks in a new 'geo_location_country' field
    country = (["geo_location"] >> ops.LambdaOp(
        f=lambda col: col.str.slice(0, 2), ) >> ops.Rename(postfix="_country"))
    cat_features = ["geo_location"] + country >> ops.Categorify()

    workflow = Workflow(cat_features)

    if use_parquet:
        df.to_parquet(path)
        dataset = nvt.Dataset(path)
    else:
        dataset = nvt.Dataset(df)

    # just make sure this works without errors
    workflow.fit(dataset)
    workflow.transform(dataset).to_parquet(out_path)

コード例 #4

0

ファイルを表示

ファイル: test_workflow_schemas.py プロジェクト: thibaultcharrin/NVTabular

def test_schema_write_read_dataset(tmpdir, dataset, engine):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    cat_features = cat_names >> ops.Categorify(cat_cache="host")
    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(
        min_value=0) >> ops.LogOp >> norms

    workflow = Workflow(cat_features + cont_features + label_name)

    workflow.fit(dataset)
    workflow.transform(dataset).to_parquet(
        tmpdir,
        out_files_per_proc=10,
    )

    schema_path = Path(tmpdir)
    proto_schema = PbTxt_SchemaWriter._read(schema_path / "schema.pbtxt")
    new_dataset = Dataset(glob.glob(str(tmpdir) + "/*.parquet"))
    assert """name: "name-cat"\n    min: 0\n    max: 27\n""" in str(
        proto_schema)
    assert new_dataset.schema == workflow.output_schema

コード例 #5

0

ファイルを表示

def test_workflow_node_select():
    df = dispatch._make_df({
        "a": [1, 4, 9, 16, 25],
        "b": [0, 1, 2, 3, 4],
        "c": [25, 16, 9, 4, 1]
    })
    dataset = Dataset(df)

    input_features = WorkflowNode(ColumnSelector(["a", "b", "c"]))
    # pylint: disable=unnecessary-lambda
    sqrt_features = input_features[["a", "c"]] >> (lambda col: np.sqrt(col))
    plus_one_features = input_features["b"] >> (lambda col: col + 1)
    features = sqrt_features + plus_one_features

    workflow = Workflow(features)
    workflow.fit(dataset)

    df_out = workflow.transform(dataset).to_ddf().compute(
        scheduler="synchronous")

    expected = dispatch._make_df()
    expected["a"] = np.sqrt(df["a"])
    expected["c"] = np.sqrt(df["c"])
    expected["b"] = df["b"] + 1

    assert_eq(expected, df_out)

コード例 #6

0

ファイルを表示

ファイル: test_workflow.py プロジェクト: miguelusque/NVTabular

def test_workflow_apply(client, use_client, tmpdir, shuffle, apply_offline):
    out_files_per_proc = 2
    out_path = str(tmpdir.mkdir("processed"))
    path = str(tmpdir.join("simple.parquet"))

    size = 25
    row_group_size = 5

    cont_names = ["cont1", "cont2"]
    cat_names = ["cat1", "cat2"]
    label_name = ["label"]

    df = pd.DataFrame({
        "cont1": np.arange(size, dtype=np.float64),
        "cont2": np.arange(size, dtype=np.float64),
        "cat1": np.arange(size, dtype=np.int32),
        "cat2": np.arange(size, dtype=np.int32),
        "label": np.arange(size, dtype=np.float64),
    })
    df.to_parquet(path, row_group_size=row_group_size, engine="pyarrow")

    dataset = nvt.Dataset(path, engine="parquet", row_groups_per_part=1)

    cat_features = cat_names >> ops.Categorify()
    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(
        min_value=0) >> ops.LogOp

    workflow = Workflow(cat_features + cont_features + label_name,
                        client=client if use_client else None)

    workflow.fit(dataset)

    # Force dtypes
    dict_dtypes = {}
    for col in cont_names:
        dict_dtypes[col] = np.float32
    for col in cat_names:
        dict_dtypes[col] = np.float32
    for col in label_name:
        dict_dtypes[col] = np.int64

    workflow.transform(dataset).to_parquet(
        # apply_offline=apply_offline, Not any more?
        # record_stats=apply_offline, Not any more?
        output_path=out_path,
        shuffle=shuffle,
        out_files_per_proc=out_files_per_proc,
        dtypes=dict_dtypes,
    )

    # Check dtypes
    for filename in glob.glob(os.path.join(out_path, "*.parquet")):
        gdf = cudf.io.read_parquet(filename)
        assert dict(gdf.dtypes) == dict_dtypes

コード例 #7

0

ファイルを表示

ファイル: test_workflow.py プロジェクト: miguelusque/NVTabular

def test_workflow_input_output_dtypes():
    df = cudf.DataFrame({
        "genre": ["drama", "comedy"],
        "user": ["a", "b"],
        "unneeded": [1, 2]
    })
    features = [["genre", "user"], "genre"
                ] >> ops.Categorify(encode_type="combo")
    workflow = Workflow(features)
    workflow.fit(Dataset(df))

    assert "unneeded" not in workflow.input_dtypes
    assert set(workflow.input_dtypes.keys()) == {"genre", "user"}
    assert set(workflow.output_dtypes.keys()) == {"genre_user", "genre"}

コード例 #8

0

ファイルを表示

ファイル: test_workflow.py プロジェクト: miguelusque/NVTabular

def test_fit_simple():
    data = cudf.DataFrame({
        "x": [0, 1, 2, None, 0, 1, 2],
        "y": [None, 3, 4, 5, 3, 4, 5]
    })
    dataset = Dataset(data)

    workflow = Workflow(["x", "y"] >> ops.FillMedian() >> (lambda x: x * x))

    workflow.fit(dataset)
    transformed = workflow.transform(dataset).to_ddf().compute()

    expected = cudf.DataFrame({
        "x": [0, 1, 4, 1, 0, 1, 4],
        "y": [16, 9, 16, 25, 9, 16, 25]
    })
    assert_eq(expected, transformed)

コード例 #9

0

ファイルを表示

ファイル: test_workflow.py プロジェクト: thibaultcharrin/NVTabular

def test_workflow_transform_ddf_dtypes():
    # Initial Dataset
    df = cudf.datasets.timeseries().reset_index()
    ddf = dask_cudf.from_cudf(df, npartitions=2)
    dataset = Dataset(ddf)

    # Create and Execute Workflow
    cols = ["name", "x", "y", "timestamp"]
    cat_cols = ["id"] >> ops.Normalize()
    workflow = Workflow(cols + cat_cols)
    workflow.fit(dataset)
    transformed_ddf = workflow.transform(dataset).to_ddf()

    # no transforms on the pass through cols, should have original dtypes
    for col in cols:
        assert_eq(ddf.dtypes[col], transformed_ddf.dtypes[col])

    # Followup dask-cudf sorting used to throw an exception because of dtype issues,
    # check that it works now
    transformed_ddf.sort_values(["id", "timestamp"]).compute()

コード例 #10

0

ファイルを表示

ファイル: test_workflow.py プロジェクト: thibaultcharrin/NVTabular

def test_fit_simple():
    data = nvt.dispatch._make_df({
        "x": [0, 1, 2, None, 0, 1, 2],
        "y": [None, 3, 4, 5, 3, 4, 5]
    })
    dataset = Dataset(data)

    workflow = Workflow(["x", "y"] >> ops.FillMedian() >> (lambda x: x * x))

    workflow.fit(dataset)
    transformed = workflow.transform(dataset).to_ddf().compute()

    expected = nvt.dispatch._make_df({
        "x": [0, 1, 4, 1, 0, 1, 4],
        "y": [16, 9, 16, 25, 9, 16, 25]
    })
    if not HAS_GPU:
        transformed["x"] = transformed["x"].astype(expected["x"].dtype)
        transformed["y"] = transformed["y"].astype(expected["y"].dtype)
    assert_eq(expected, transformed)

コード例 #11

0

ファイルを表示

ファイル: test_workflow.py プロジェクト: miguelusque/NVTabular

def test_gpu_workflow_api(tmpdir, client, df, dataset, gpu_memory_frac, engine,
                          dump, use_client):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    cat_features = cat_names >> ops.Categorify(cat_cache="host")
    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(
        min_value=0) >> ops.LogOp >> norms

    workflow = Workflow(cat_features + cont_features + label_name,
                        client=client if use_client else None)

    workflow.fit(dataset)

    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        workflow = None

        workflow = Workflow.load(workflow_dir,
                                 client=client if use_client else None)

    def get_norms(tar: cudf.Series):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std - No good right now we have to add all other changes; Clip, Log
    assert math.isclose(get_norms(df.y).mean(), norms.means["y"], rel_tol=1e-1)
    assert math.isclose(get_norms(df.y).std(), norms.stds["y"], rel_tol=1e-1)
    assert math.isclose(get_norms(df.x).mean(), norms.means["x"], rel_tol=1e-1)
    assert math.isclose(get_norms(df.x).std(), norms.stds["x"], rel_tol=1e-1)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(workflow, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(workflow, "name-string")
    # adding the None entry as a string because of move from gpu
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    workflow.transform(dataset).to_parquet(
        tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
    )

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"),
                        part_mem_fraction=gpu_memory_frac)

    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)

コード例 #12

0

ファイルを表示

ファイル: dask-nvtabular-criteo-benchmark.py プロジェクト: bschifferer/NVTabular

def main(args):
    """Multi-GPU Criteo/DLRM Preprocessing Benchmark

    This benchmark is designed to measure the time required to preprocess
    the Criteo (1TB) dataset for Facebook’s DLRM model.  The user must specify
    the path of the raw dataset (using the `--data-path` flag), as well as the
    output directory for all temporary/final data (using the `--out-path` flag)

    Example Usage
    -------------

    python dask-nvtabular-criteo-benchmark.py
                        --data-path /path/to/criteo_parquet --out-path /out/dir/`


    Dataset Requirements (Parquet)
    ------------------------------

    This benchmark is designed with a parquet-formatted dataset in mind.
    While a CSV-formatted dataset can be processed by NVTabular, converting
    to parquet will yield significantly better performance.  To convert your
    dataset, try using the `optimize_criteo.ipynb` notebook (also located
    in `NVTabular/examples/`)

    For a detailed parameter overview see `NVTabular/examples/MultiGPUBench.md`
    """

    # Input
    data_path = args.data_path[:-1] if args.data_path[
        -1] == "/" else args.data_path
    freq_limit = args.freq_limit
    out_files_per_proc = args.out_files_per_proc
    high_card_columns = args.high_cards.split(",")
    dashboard_port = args.dashboard_port
    if args.protocol == "ucx":
        UCX_TLS = os.environ.get("UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")
        os.environ["UCX_TLS"] = UCX_TLS

    # Cleanup output directory
    base_dir = args.out_path[:-1] if args.out_path[-1] == "/" else args.out_path
    dask_workdir = os.path.join(base_dir, "workdir")
    output_path = os.path.join(base_dir, "output")
    stats_path = os.path.join(base_dir, "stats")
    setup_dirs(base_dir, dask_workdir, output_path, stats_path)

    # Use Criteo dataset by default (for now)
    cont_names = (args.cont_names.split(",")
                  if args.cont_names else ["I" + str(x) for x in range(1, 14)])
    cat_names = (args.cat_names.split(",")
                 if args.cat_names else ["C" + str(x) for x in range(1, 27)])
    label_name = ["label"]

    # Specify Categorify/GroupbyStatistics options
    tree_width = {}
    cat_cache = {}
    for col in cat_names:
        if col in high_card_columns:
            tree_width[col] = args.tree_width
            cat_cache[col] = args.cat_cache_high
        else:
            tree_width[col] = 1
            cat_cache[col] = args.cat_cache_low

    # Use total device size to calculate args.device_limit_frac
    device_size = device_mem_size(kind="total")
    device_limit = int(args.device_limit_frac * device_size)
    device_pool_size = int(args.device_pool_frac * device_size)
    part_size = int(args.part_mem_frac * device_size)

    # Parse shuffle option
    shuffle = None
    if args.shuffle == "PER_WORKER":
        shuffle = nvt_io.Shuffle.PER_WORKER
    elif args.shuffle == "PER_PARTITION":
        shuffle = nvt_io.Shuffle.PER_PARTITION

    # Check if any device memory is already occupied
    for dev in args.devices.split(","):
        fmem = _pynvml_mem_size(kind="free", index=int(dev))
        used = (device_size - fmem) / 1e9
        if used > 1.0:
            warnings.warn(
                f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
            )

    # Setup LocalCUDACluster
    if args.protocol == "tcp":
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devices,
            device_memory_limit=device_limit,
            local_directory=dask_workdir,
            dashboard_address=":" + dashboard_port,
        )
    else:
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devices,
            enable_nvlink=True,
            device_memory_limit=device_limit,
            local_directory=dask_workdir,
            dashboard_address=":" + dashboard_port,
        )
    client = Client(cluster)

    # Setup RMM pool
    if args.device_pool_frac > 0.01:
        setup_rmm_pool(client, device_pool_size)

    # Define Dask NVTabular "Workflow"
    if args.normalize:
        cont_features = cont_names >> ops.FillMissing() >> ops.Normalize()
    else:
        cont_features = cont_names >> ops.FillMissing() >> ops.Clip(
            min_value=0) >> ops.LogOp()

    cat_features = cat_names >> ops.Categorify(
        out_path=stats_path,
        tree_width=tree_width,
        cat_cache=cat_cache,
        freq_threshold=freq_limit,
        search_sorted=not freq_limit,
        on_host=not args.cats_on_device,
    )
    processor = Workflow(cat_features + cont_features + label_name,
                         client=client)

    dataset = Dataset(data_path, "parquet", part_size=part_size)

    # Execute the dask graph
    runtime = time.time()

    processor.fit(dataset)

    if args.profile is not None:
        with performance_report(filename=args.profile):
            processor.transform(dataset).to_parquet(
                output_path=output_path,
                num_threads=args.num_io_threads,
                shuffle=shuffle,
                out_files_per_proc=out_files_per_proc,
            )
    else:
        processor.transform(dataset).to_parquet(
            output_path=output_path,
            num_threads=args.num_io_threads,
            shuffle=shuffle,
            out_files_per_proc=out_files_per_proc,
        )
    runtime = time.time() - runtime

    print("\nDask-NVTabular DLRM/Criteo benchmark")
    print("--------------------------------------")
    print(f"partition size     | {part_size}")
    print(f"protocol           | {args.protocol}")
    print(f"device(s)          | {args.devices}")
    print(f"rmm-pool-frac      | {(args.device_pool_frac)}")
    print(f"out-files-per-proc | {args.out_files_per_proc}")
    print(f"num_io_threads     | {args.num_io_threads}")
    print(f"shuffle            | {args.shuffle}")
    print(f"cats-on-device     | {args.cats_on_device}")
    print("======================================")
    print(f"Runtime[s]         | {runtime}")
    print("======================================\n")

    client.close()

コード例 #13

0

ファイルを表示

ファイル: test_workflow.py プロジェクト: thibaultcharrin/NVTabular

def test_gpu_workflow_config(tmpdir, client, df, dataset, gpu_memory_frac,
                             engine, dump, replace):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    cat_features = cat_names >> ops.Categorify()
    if replace:
        cont_features = cont_names >> ops.FillMissing() >> ops.LogOp >> norms
    else:
        fillmissing_logop = (cont_names >> ops.FillMissing() >> ops.LogOp >>
                             ops.Rename(postfix="_FillMissing_1_LogOp_1"))
        cont_features = cont_names + fillmissing_logop >> norms

    workflow = Workflow(cat_features + cont_features + label_name,
                        client=client)

    workflow.fit(dataset)

    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        workflow = None

        workflow = Workflow.load(workflow_dir, client=client)

    def get_norms(tar):
        ser_median = tar.dropna().quantile(0.5, interpolation="linear")
        gdf = tar.fillna(ser_median)
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std - No good right now we have to add all other changes; Clip, Log

    concat_ops = "_FillMissing_1_LogOp_1"
    if replace:
        concat_ops = ""
    assert math.isclose(get_norms(df.x).mean(),
                        norms.means["x" + concat_ops],
                        rel_tol=1e-1)
    assert math.isclose(get_norms(df.y).mean(),
                        norms.means["y" + concat_ops],
                        rel_tol=1e-1)

    assert math.isclose(get_norms(df.x).std(),
                        norms.stds["x" + concat_ops],
                        rel_tol=1e-1)
    assert math.isclose(get_norms(df.y).std(),
                        norms.stds["y" + concat_ops],
                        rel_tol=1e-1)
    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique(
        ).values_host if HAS_GPU else df["name-cat"].unique()
        cats0 = get_cats(workflow, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert all(cat in [None] + sorted(cats_expected0.tolist())
                   for cat in cats0.tolist())
        assert len(cats0.tolist()) == len(cats_expected0.tolist() + [None])
    cats_expected1 = (df["name-string"].unique().values_host
                      if HAS_GPU else df["name-string"].unique())
    cats1 = get_cats(workflow, "name-string")
    # adding the None entry as a string because of move from gpu
    assert all(cat in [None] + sorted(cats_expected1.tolist())
               for cat in cats1.tolist())
    assert len(cats1.tolist()) == len(cats_expected1.tolist() + [None])

    # Write to new "shuffled" and "processed" dataset
    workflow.transform(dataset).to_parquet(
        tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
    )

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"),
                        part_mem_fraction=gpu_memory_frac)

    df_pp = nvt.dispatch._concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = nvt.dispatch._read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)

コード例 #14

0

ファイルを表示

ファイル: nvt_etl.py プロジェクト: bschifferer/NVTabular

def nvt_etl(
    data_path,
    out_path,
    devices,
    protocol,
    device_limit_frac,
    device_pool_frac,
    part_mem_frac,
    cats,
    conts,
    labels,
    out_files_per_proc,
):
    # Set up data paths
    input_path = data_path[:-1] if data_path[-1] == "/" else data_path
    base_dir = out_path[:-1] if out_path[-1] == "/" else out_path
    dask_workdir = os.path.join(base_dir, "workdir")
    output_path = os.path.join(base_dir, "output")
    stats_path = os.path.join(base_dir, "stats")
    output_train_dir = os.path.join(output_path, "train/")
    output_valid_dir = os.path.join(output_path, "valid/")

    # Make sure we have a clean worker space for Dask
    if os.path.isdir(dask_workdir):
        shutil.rmtree(dask_workdir)
    os.makedirs(dask_workdir)

    # Make sure we have a clean stats space for Dask
    if os.path.isdir(stats_path):
        shutil.rmtree(stats_path)
    os.mkdir(stats_path)

    # Make sure we have a clean output path
    if os.path.isdir(output_path):
        shutil.rmtree(output_path)
    os.mkdir(output_path)
    os.mkdir(output_train_dir)
    os.mkdir(output_valid_dir)

    # Get train/valid files
    train_paths = [
        os.path.join(input_path, f) for f in os.listdir(input_path)
        if os.path.isfile(os.path.join(input_path, f))
    ]
    n_files = int(len(train_paths) * 0.9)
    valid_paths = train_paths[n_files:]
    train_paths = train_paths[:n_files]

    # Force dtypes for HugeCTR usage
    dict_dtypes = {}
    for col in cats:
        dict_dtypes[col] = np.int64
    for col in conts:
        dict_dtypes[col] = np.float32
    for col in labels:
        dict_dtypes[col] = np.float32

    # Use total device size to calculate args.device_limit_frac
    device_size = device_mem_size(kind="total")
    device_limit = int(device_limit_frac * device_size)
    device_pool_size = int(device_pool_frac * device_size)
    part_size = int(part_mem_frac * device_size)

    # Check if any device memory is already occupied
    for dev in devices.split(","):
        fmem = _pynvml_mem_size(kind="free", index=int(dev))
        used = (device_size - fmem) / 1e9
        if used > 1.0:
            warnings.warn(
                f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
            )

    # Setup dask cluster and perform ETL
    with managed_client(dask_workdir, devices, device_limit,
                        protocol) as client:
        # Setup RMM pool
        if device_pool_frac > 0.01:
            setup_rmm_pool(client, device_pool_size)

        # Define Dask NVTabular "Workflow"
        cont_features = conts >> ops.FillMissing() >> ops.Clip(
            min_value=0) >> ops.LogOp()

        cat_features = cats >> ops.Categorify(out_path=stats_path,
                                              max_size=10000000)

        workflow = Workflow(cat_features + cont_features + labels,
                            client=client)

        train_dataset = Dataset(train_paths,
                                engine="parquet",
                                part_size=part_size)
        valid_dataset = Dataset(valid_paths,
                                engine="parquet",
                                part_size=part_size)

        workflow.fit(train_dataset)

        workflow.transform(train_dataset).to_parquet(
            output_path=output_train_dir,
            shuffle=nvt_io.Shuffle.PER_WORKER,
            dtypes=dict_dtypes,
            cats=cats,
            conts=conts,
            labels=labels,
            out_files_per_proc=out_files_per_proc,
        )
        workflow.transform(valid_dataset).to_parquet(
            output_path=output_valid_dir,
            shuffle=nvt_io.Shuffle.PER_WORKER,
            dtypes=dict_dtypes,
            cats=cats,
            conts=conts,
            labels=labels,
            out_files_per_proc=out_files_per_proc,
        )

        workflow.save(os.path.join(output_path, "workflow"))

        return workflow