def test_multicolumn_cats(tmpdir, df, dataset, engine, groups, concat_groups):
    """Check that GroupbyStatistics writes one parquet file per requested
    group (or concatenated group), under the expected name prefix."""
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    # When concat_groups is set, stats/cont_names must be omitted — the op
    # only produces the unique-value files in that mode.
    encoder = ops.GroupbyStatistics(
        columns=groups,
        cont_names=None if concat_groups else ["x"],
        stats=None if concat_groups else ["count", "mean"],
        out_path=str(tmpdir),
        concat_groups=concat_groups,
    )
    config = nvt.workflow.get_new_config()
    config["PP"]["categorical"] = [encoder]

    processor = nvt.Workflow(
        cat_names=cat_names, cont_names=cont_names, label_name=label_name, config=config
    )
    processor.update_stats(dataset)

    # Normalize to a list of groups, each itself a list of column names.
    if isinstance(groups, str):
        groups = [groups]
    for grp in groups:
        cols = [grp] if isinstance(grp, str) else grp
        prefix = "unique." if concat_groups else "cat_stats."
        fn = prefix + "_".join(cols) + ".parquet"
        # Reading the file back verifies that it was written and is valid.
        cudf.read_parquet(os.path.join(tmpdir, "categories", fn))
def test_encoder(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    """Run GroupbyStatistics through a Workflow and, for the parquet engine
    with default columns, verify the collected categories match the data."""
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    encoder = ops.GroupbyStatistics(columns=op_columns)
    config = nvt.workflow.get_new_config()
    config["PP"]["categorical"] = [encoder]

    processor = nvt.Workflow(
        cat_names=cat_names, cont_names=cont_names, label_name=label_name, config=config
    )
    processor.update_stats(dataset)

    # Only the parquet engine with default (all) columns has both cat
    # columns available for comparison against the source dataframe.
    if engine == "parquet" and not op_columns:
        for col in ("name-cat", "name-string"):
            expected = df[col].unique().values_host
            observed = get_cats(processor, col)
            # A leading null slot is always prepended to the category list.
            assert observed.tolist() == [None] + expected.tolist()
def test_groupby_folds(tmpdir, df, dataset, engine, groups, kfold):
    """Verify that k-fold GroupbyStatistics writes a ``__fold__`` column
    into every per-group statistics file.

    Fixes over the original: the loop no longer rebinds the ``df`` fixture
    parameter (shadowing made later use of the fixture impossible and was
    confusing), and it iterates ``.values()`` since the group key was unused.
    """
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    gb_stats = ops.GroupbyStatistics(
        columns=None,
        out_path=str(tmpdir),
        kfold=kfold,
        fold_groups=groups,
        stats=["count", "sum"],
        cont_names=["y"],
    )
    config = nvt.workflow.get_new_config()
    config["PP"]["categorical"] = [gb_stats]

    processor = nvt.Workflow(
        cat_names=cat_names, cont_names=cont_names, label_name=label_name, config=config
    )
    processor.update_stats(dataset)

    # Every stats file produced with kfold enabled must carry a fold id.
    for path in processor.stats["categories"].values():
        gdf = cudf.read_parquet(path)
        assert "__fold__" in gdf.columns