Example no. 1: test_gpu_workflow
def test_gpu_workflow(tmpdir, df, dataset, gpu_memory_frac, engine, dump):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

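    # Build the op graph: ">>" chains operators onto the selected columns and
    # "+" merges the resulting column groups into a single workflow graph.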
    norms = ops.Normalize()
    conts = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> norms
    cats = cat_names >> ops.Categorify()
    workflow = nvt.Workflow(conts + cats + label_name)

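    # fit() scans the dataset and computes the statistics the ops need
    # (means/stds for Normalize, category mappings for Categorify).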
    workflow.fit(dataset)
    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        workflow = None

        workflow = Workflow.load(workflow_dir)

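    # Reference implementation of FillMissing + Clip(min_value=0) on the raw
    # column, used to check the means/stds computed by Normalize.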
    def get_norms(tar: cudf.Series):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        return gdf

    assert math.isclose(get_norms(df.x).mean(), norms.means["x"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.y).mean(), norms.means["y"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.x).std(), norms.stds["x"], rel_tol=1e-3)
    assert math.isclose(get_norms(df.y).std(), norms.stds["y"], rel_tol=1e-3)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(workflow, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(workflow, "name-string")
    # adding the None entry as a string because of move from gpu
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    workflow.transform(dataset).to_parquet(
        output_path=tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION)

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"),
                        part_mem_fraction=gpu_memory_frac)

    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
Example no. 2: test_encoder
def test_encoder(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

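    # Register the category-statistics op (optionally restricted to op_columns)
    # and collect the category stats from the dataset.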
    encoder = ops.CategoryStatistics(columns=op_columns)
    config = nvt.workflow.get_new_config()
    config["PP"]["categorical"] = [encoder]

    processor = nvt.Workflow(
        cat_names=cat_names, cont_names=cont_names, label_name=label_name, config=config
    )
    processor.update_stats(dataset)

    if engine == "parquet" and not op_columns:
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(processor, "name-cat")
        assert cats0.tolist() == [None] + cats_expected0.tolist()

    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(processor, "name-string")
    assert cats1.tolist() == [None] + cats_expected1.tolist()
Example no. 3: test_gpu_workflow_config
def test_gpu_workflow_config(tmpdir, client, df, dataset, gpu_memory_frac,
                             engine, dump, replace):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    cat_features = cat_names >> ops.Categorify()
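    # With replace=True the continuous ops overwrite their input columns;
    # otherwise the transformed copies are renamed and kept alongside the originals.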
    if replace:
        cont_features = cont_names >> ops.FillMissing() >> ops.LogOp >> norms
    else:
        fillmissing_logop = (cont_names >> ops.FillMissing() >> ops.LogOp >>
                             ops.Rename(postfix="_FillMissing_1_LogOp_1"))
        cont_features = cont_names + fillmissing_logop >> norms

    workflow = Workflow(cat_features + cont_features + label_name,
                        client=client)

    workflow.fit(dataset)

    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        workflow = None

        workflow = Workflow.load(workflow_dir, client=client)

    def get_norms(tar: cudf.Series):
        ser_median = tar.dropna().quantile(0.5, interpolation="linear")
        gdf = tar.fillna(ser_median)
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std. Rough check only: get_norms approximates the
    # FillMissing/LogOp chain rather than reproducing it exactly.

    concat_ops = "_FillMissing_1_LogOp_1"
    if replace:
        concat_ops = ""
    assert math.isclose(get_norms(df.x).mean(),
                        norms.means["x" + concat_ops],
                        rel_tol=1e-1)
    assert math.isclose(get_norms(df.y).mean(),
                        norms.means["y" + concat_ops],
                        rel_tol=1e-1)

    assert math.isclose(get_norms(df.x).std(),
                        norms.stds["x" + concat_ops],
                        rel_tol=1e-1)
    assert math.isclose(get_norms(df.y).std(),
                        norms.stds["y" + concat_ops],
                        rel_tol=1e-1)
    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(workflow, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(workflow, "name-string")
    # adding the None entry as a string because of move from gpu
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    workflow.transform(dataset).to_parquet(
        tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
    )

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"),
                        part_mem_fraction=gpu_memory_frac)

    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
Example no. 4: test_cpu_workflow
def test_cpu_workflow(tmpdir, df, dataset, cpu, engine, dump):
    # Make sure we are in cpu formats
    if cudf and isinstance(df, cudf.DataFrame):
        df = df.to_pandas()

    if cpu:
        dataset.to_cpu()

    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    conts = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> norms
    cats = cat_names >> ops.Categorify()
    workflow = nvt.Workflow(conts + cats + label_name)

    workflow.fit(dataset)
    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        workflow = None

        workflow = Workflow.load(workflow_dir)

    def get_norms(tar: pd.Series):
        df = tar.fillna(0)
        df = df * (df >= 0).astype("int")
        return df

    assert math.isclose(get_norms(df.x).mean(), norms.means["x"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.y).mean(), norms.means["y"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.x).std(), norms.stds["x"], rel_tol=1e-3)
    assert math.isclose(get_norms(df.y).std(), norms.stds["y"], rel_tol=1e-3)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique()
        cats0 = get_cats(workflow, "name-cat", cpu=True)
        # adding the None entry as a string because of move from gpu
        assert cats0.tolist() == [None] + sorted(cats_expected0.tolist())
    cats_expected1 = df["name-string"].unique()
    cats1 = get_cats(workflow, "name-string", cpu=True)
    # adding the None entry as a string because of move from gpu
    assert cats1.tolist() == [None] + sorted(cats_expected1.tolist())

    # Write to new "shuffled" and "processed" dataset
    workflow.transform(dataset).to_parquet(
        output_path=tmpdir, out_files_per_proc=10, shuffle=nvt.io.Shuffle.PER_PARTITION
    )

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"), cpu=cpu)

    df_pp = pd.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    metadata = pq.read_metadata(str(tmpdir) + "/_metadata")
    assert metadata.num_rows == len(df_pp)
Example no. 5: test_gpu_preproc
def test_gpu_preproc(tmpdir, df, dataset, dump, gpu_memory_frac, engine,
                     preprocessing):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)

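    # Legacy Workflow API: feature and preprocessing ops are registered one by
    # one and the graph is finalized before statistics are computed.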
    processor.add_feature(
        [ops.FillMedian(),
         ops.LogOp(preprocessing=preprocessing)])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    processor.update_stats(dataset)

    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        ser_median = tar.dropna().quantile(0.5, interpolation="linear")
        gdf = tar.fillna(ser_median)
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std. Rough check only: get_norms approximates the
    # FillMedian/LogOp chain rather than reproducing it exactly.
    x_col = "x" if preprocessing else "x_LogOp"
    y_col = "y" if preprocessing else "y_LogOp"
    assert math.isclose(get_norms(df.x).mean(),
                        processor.stats["means"][x_col],
                        rel_tol=1e-2)
    assert math.isclose(get_norms(df.y).mean(),
                        processor.stats["means"][y_col],
                        rel_tol=1e-2)
    assert math.isclose(get_norms(df.x).std(),
                        processor.stats["stds"][x_col],
                        rel_tol=1e-2)
    assert math.isclose(get_norms(df.y).std(),
                        processor.stats["stds"][y_col],
                        rel_tol=1e-2)

    # Check median (TODO: Improve the accuracy)
    x_median = df.x.dropna().quantile(0.5, interpolation="linear")
    y_median = df.y.dropna().quantile(0.5, interpolation="linear")
    id_median = df.id.dropna().quantile(0.5, interpolation="linear")
    assert math.isclose(x_median, processor.stats["medians"]["x"], rel_tol=1e1)
    assert math.isclose(y_median, processor.stats["medians"]["y"], rel_tol=1e1)
    assert math.isclose(id_median,
                        processor.stats["medians"]["id"],
                        rel_tol=1e1)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(processor, "name-cat")
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(processor, "name-string")
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(tmpdir,
                               dataset,
                               nfiles=10,
                               shuffle=True,
                               apply_ops=True)

    processor.create_final_cols()

    # When preprocessing=False, LogOp emits separate "_LogOp" columns that
    # should appear among the final continuous columns
    if not preprocessing:
        for col in cont_names:
            assert f"{col}_LogOp" in processor.columns_ctx["final"]["cols"][
                "continuous"]

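    # Read the written parquet files back through the torch dataloader helpers
    # and count rows, to compare against the tensor conversion below.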
    dlc = torch_dataloader.DLCollator(preproc=processor, apply_ops=False)
    data_files = [
        torch_dataloader.FileItrDataset(x,
                                        use_row_groups=True,
                                        gpu_memory_frac=gpu_memory_frac,
                                        names=allcols_csv)
        for x in glob.glob(str(tmpdir) + "/*.parquet")
    ]

    data_itr = torch.utils.data.ChainDataset(data_files)
    dl = torch_dataloader.DLDataLoader(data_itr,
                                       collate_fn=dlc.gdf_col,
                                       pin_memory=False,
                                       num_workers=0)

    len_df_pp = 0
    for chunk in dl:
        len_df_pp += len(chunk[0][0])

    dataset = Dataset(glob.glob(str(tmpdir) + "/*.parquet"),
                      part_mem_fraction=gpu_memory_frac)
    x = processor.ds_to_tensors(dataset.to_iter(), apply_ops=False)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert len(x[0]) == len_df_pp

    itr_ds = torch_dataloader.TensorItrDataset([x[0], x[1], x[2]],
                                               batch_size=512000)
    count_tens_itr = 0
    for data_gd in itr_ds:
        count_tens_itr += len(data_gd[1])
        assert data_gd[0].shape[1] > 0
        assert data_gd[1].shape[1] > 0

    assert len_df_pp == count_tens_itr
Example no. 6: test_gpu_workflow_api
def test_gpu_workflow_api(tmpdir, client, df, dataset, gpu_memory_frac, engine,
                          dump, op_columns, use_client):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        client=client if use_client else None,
    )

    processor.add_feature([ops.ZeroFill(columns=op_columns), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify(cat_cache="host"))
    processor.finalize()

    processor.update_stats(dataset)

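    # Round-trip the computed statistics through a YAML file to exercise
    # save_stats/clear_stats/load_stats.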
    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std. Rough check only: get_norms approximates the
    # ZeroFill/LogOp chain rather than reproducing it exactly.

    if not op_columns:
        assert math.isclose(get_norms(df.y).mean(),
                            processor.stats["means"]["y"],
                            rel_tol=1e-1)
        assert math.isclose(get_norms(df.y).std(),
                            processor.stats["stds"]["y"],
                            rel_tol=1e-1)
    assert math.isclose(get_norms(df.x).mean(),
                        processor.stats["means"]["x"],
                        rel_tol=1e-1)
    assert math.isclose(get_norms(df.x).std(),
                        processor.stats["stds"]["x"],
                        rel_tol=1e-1)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(processor, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(processor, "name-string")
    # adding the None entry as a string because of move from gpu
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(tmpdir,
                               dataset,
                               out_files_per_proc=10,
                               shuffle="partial",
                               apply_ops=True)

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"),
                        part_mem_fraction=gpu_memory_frac)

    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
Example no. 7: test_gpu_workflow_config
def test_gpu_workflow_config(tmpdir, client, df, dataset, gpu_memory_frac,
                             engine, dump, replace):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    config = nvt.workflow.get_new_config()
    # add operators with dependencies
    config["FE"]["continuous"] = [[
        ops.FillMissing(replace=replace),
        ops.LogOp(replace=replace)
    ]]
    config["PP"]["continuous"] = [[
        ops.LogOp(replace=replace),
        ops.Normalize()
    ]]
    config["PP"]["categorical"] = [ops.Categorify()]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
        client=client,
    )

    processor.update_stats(dataset)

    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        ser_median = tar.dropna().quantile(0.5, interpolation="linear")
        gdf = tar.fillna(ser_median)
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std. Rough check only: get_norms approximates the
    # FillMissing/LogOp chain rather than reproducing it exactly.

    concat_ops = "_FillMissing_LogOp"
    if replace:
        concat_ops = ""
    assert math.isclose(get_norms(df.x).mean(),
                        processor.stats["means"]["x" + concat_ops],
                        rel_tol=1e-1)
    assert math.isclose(get_norms(df.y).mean(),
                        processor.stats["means"]["y" + concat_ops],
                        rel_tol=1e-1)

    assert math.isclose(get_norms(df.x).std(),
                        processor.stats["stds"]["x" + concat_ops],
                        rel_tol=1e-1)
    assert math.isclose(get_norms(df.y).std(),
                        processor.stats["stds"]["y" + concat_ops],
                        rel_tol=1e-1)
    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(processor, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(processor, "name-string")
    # adding the None entry as a string because of move from gpu
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(tmpdir,
                               dataset,
                               out_files_per_proc=10,
                               shuffle="partial",
                               apply_ops=True)

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"),
                        part_mem_fraction=gpu_memory_frac)

    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
Example no. 8: test_gpu_workflow
def test_gpu_workflow(tmpdir, client, df, dataset, gpu_memory_frac, engine,
                      dump):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

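    # Legacy config-driven API: "FE" holds feature-engineering ops and "PP"
    # preprocessing ops; a nested list chains the ops together.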
    config = nvt.workflow.get_new_config()
    config["FE"]["continuous"] = [ops.ZeroFill()]
    config["PP"]["continuous"] = [[ops.ZeroFill(), ops.Normalize()]]
    config["PP"]["categorical"] = [ops.Categorify()]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
        client=client,
    )

    processor.update_stats(dataset)
    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        return gdf

    assert math.isclose(get_norms(df.x).mean(),
                        processor.stats["means"]["x"],
                        rel_tol=1e-4)
    assert math.isclose(get_norms(df.y).mean(),
                        processor.stats["means"]["y"],
                        rel_tol=1e-4)
    #     assert math.isclose(get_norms(df.id).mean(),
    #                         processor.stats["means"]["id_ZeroFill_LogOp"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.x).std(),
                        processor.stats["stds"]["x"],
                        rel_tol=1e-3)
    assert math.isclose(get_norms(df.y).std(),
                        processor.stats["stds"]["y"],
                        rel_tol=1e-3)
    #     assert math.isclose(get_norms(df.id).std(),
    #                         processor.stats["stds"]["id_ZeroFill_LogOp"], rel_tol=1e-3)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(processor, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(processor, "name-string")
    # adding the None entry as a string because of move from gpu
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(tmpdir,
                               dataset,
                               out_files_per_proc=10,
                               shuffle="partial",
                               apply_ops=True)

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"),
                        part_mem_fraction=gpu_memory_frac)

    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
Example no. 9: test_gpu_workflow_api
def test_gpu_workflow_api(tmpdir, client, df, dataset, gpu_memory_frac, engine,
                          dump, use_client):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

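    # Keep a handle on the Normalize op so its fitted means/stds can be
    # checked after workflow.fit().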
    norms = ops.Normalize()
    cat_features = cat_names >> ops.Categorify(cat_cache="host")
    cont_features = (cont_names >> ops.FillMissing() >> ops.Clip(min_value=0)
                     >> ops.LogOp >> norms)

    workflow = Workflow(cat_features + cont_features + label_name,
                        client=client if use_client else None)

    workflow.fit(dataset)

    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        workflow = None

        workflow = Workflow.load(workflow_dir,
                                 client=client if use_client else None)

    def get_norms(tar):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std. Rough check only: get_norms approximates the
    # FillMissing/Clip/LogOp chain rather than reproducing it exactly.
    assert math.isclose(get_norms(df.y).mean(), norms.means["y"], rel_tol=1e-1)
    assert math.isclose(get_norms(df.y).std(), norms.stds["y"], rel_tol=1e-1)
    assert math.isclose(get_norms(df.x).mean(), norms.means["x"], rel_tol=1e-1)
    assert math.isclose(get_norms(df.x).std(), norms.stds["x"], rel_tol=1e-1)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique(
        ).values_host if HAS_GPU else df["name-cat"].unique()
        cats0 = get_cats(workflow, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert all(cat in [None] + sorted(cats_expected0.tolist())
                   for cat in cats0.tolist())
        assert len(cats0.tolist()) == len(cats_expected0.tolist() + [None])
    cats_expected1 = (df["name-string"].unique().values_host
                      if HAS_GPU else df["name-string"].unique())
    cats1 = get_cats(workflow, "name-string")
    # adding the None entry as a string because of move from gpu
    assert all(cat in [None] + sorted(cats_expected1.tolist())
               for cat in cats1.tolist())
    assert len(cats1.tolist()) == len(cats_expected1.tolist() + [None])

    # Write to new "shuffled" and "processed" dataset
    workflow.transform(dataset).to_parquet(
        tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
    )

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"),
                        part_mem_fraction=gpu_memory_frac)

    df_pp = nvt.dispatch._concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = nvt.dispatch._read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)