Example 1
0
def test_multicolumn_cats(tmpdir, df, dataset, engine, groups, concat_groups):
    """GroupbyStatistics over (possibly multi-column) group keys writes one
    parquet file per group under <tmpdir>/categories.

    When ``concat_groups`` is set, the op only produces unique-value files
    ("unique.<cols>.parquet"); otherwise it computes count/mean stats on "x"
    and writes "cat_stats.<cols>.parquet".
    """
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    # Stats/cont_names must be omitted when groups are concatenated.
    encoder = ops.GroupbyStatistics(
        columns=groups,
        cont_names=None if concat_groups else ["x"],
        stats=None if concat_groups else ["count", "mean"],
        out_path=str(tmpdir),
        concat_groups=concat_groups,
    )
    config = nvt.workflow.get_new_config()
    config["PP"]["categorical"] = [encoder]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
    )
    processor.update_stats(dataset)

    # Normalize to a list of groups, each itself a list of column names.
    group_list = [groups] if isinstance(groups, str) else groups
    file_prefix = "unique." if concat_groups else "cat_stats."
    for grp in group_list:
        cols = [grp] if isinstance(grp, str) else grp
        out_file = file_prefix + "_".join(cols) + ".parquet"
        # Reading the file is the assertion: it must exist and be valid parquet.
        cudf.read_parquet(os.path.join(tmpdir, "categories", out_file))
Example 2
0
def test_encoder(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    """Default GroupbyStatistics should collect per-column category lists
    that match the unique values in the source dataframe (with a leading
    null category).
    """
    # The csv engines lack the "name-cat" column; only parquet has it.
    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    encoder = ops.GroupbyStatistics(columns=op_columns)
    config = nvt.workflow.get_new_config()
    config["PP"]["categorical"] = [encoder]

    processor = nvt.Workflow(
        cat_names=cat_names, cont_names=cont_names, label_name=label_name, config=config
    )
    processor.update_stats(dataset)

    # "name-cat" is only encoded when op_columns did not restrict the op.
    if engine == "parquet" and not op_columns:
        expected_cat = df["name-cat"].unique().values_host
        observed_cat = get_cats(processor, "name-cat")
        assert observed_cat.tolist() == [None] + expected_cat.tolist()

    expected_str = df["name-string"].unique().values_host
    observed_str = get_cats(processor, "name-string")
    assert observed_str.tolist() == [None] + expected_str.tolist()
Example 3
0
def test_groupby_folds(tmpdir, df, dataset, engine, groups, kfold):
    """GroupbyStatistics with kfold/fold_groups must add a "__fold__"
    column to every emitted category-stats parquet file.
    """
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    gb_stats = ops.GroupbyStatistics(
        columns=None,
        out_path=str(tmpdir),
        kfold=kfold,
        fold_groups=groups,
        stats=["count", "sum"],
        cont_names=["y"],
    )
    config = nvt.workflow.get_new_config()
    config["PP"]["categorical"] = [gb_stats]

    processor = nvt.Workflow(
        cat_names=cat_names, cont_names=cont_names, label_name=label_name, config=config
    )
    processor.update_stats(dataset)
    # Iterate only the paths (the group keys were unused), and read each
    # stats file into a fresh local instead of rebinding the `df` fixture
    # parameter, which shadowed the test's input dataframe.
    for path in processor.stats["categories"].values():
        stats_df = cudf.read_parquet(path)
        assert "__fold__" in stats_df.columns