Esempio n. 1
0
def test_hashed_cross(tmpdir, df, dataset, gpu_memory_frac, engine, use_dict):
    """Check that ``ops.HashedCross`` yields in-range, repeatable values.

    The op is constructed either from a ``{columns: num_buckets}`` dict or from
    parallel lists (selected by ``use_dict``) and applied to every chunk
    returned by ``dataset.to_iter()``. For each chunk the crossed column must
    lie in ``[0, num_buckets - 1]``; a second pass over the dataset must
    reproduce the per-chunk column sums recorded during the first pass.

    All other parameters are pytest fixtures that this test requests but does
    not read directly.
    """
    # TODO: add tests for > 2 features, multiple crosses, etc.
    cat_names = ("name-string", "id")
    num_buckets = 10
    # Name of the crossed output column; it does not depend on the chunk being
    # processed, so compute it once up front rather than inside the loop.
    new_column_name = "_X_".join(cat_names)

    if use_dict:
        hashed_cross_op = ops.HashedCross({cat_names: num_buckets})
    else:
        hashed_cross_op = ops.HashedCross([cat_names], [num_buckets])

    columns_ctx = {"categorical": {"base": list(cat_names)}}

    # First pass: validate the value range and record one checksum per chunk.
    checksums = []
    for gdf in dataset.to_iter():
        new_gdf = hashed_cross_op.apply_op(gdf, columns_ctx, "categorical")
        assert np.all(new_gdf[new_column_name].values >= 0)
        # Upper bound is tied to num_buckets instead of a hard-coded literal.
        assert np.all(new_gdf[new_column_name].values <= num_buckets - 1)
        checksums.append(new_gdf[new_column_name].sum())

    # Second pass: check sums for determinism against the first pass.
    for checksum, gdf in zip(checksums, dataset.to_iter()):
        new_gdf = hashed_cross_op.apply_op(gdf, columns_ctx, "categorical")
        assert new_gdf[new_column_name].sum() == checksum
Esempio n. 2
0
def test_hashed_cross(tmpdir, df, dataset, gpu_memory_frac, engine):
    """Check that ``ops.HashedCross`` in a workflow is in-range and repeatable.

    A workflow crossing ``"name-string"`` with ``"id"`` is fitted on a dataset
    built from the ``df`` fixture. The crossed column of the transformed
    output must lie in ``[0, num_buckets - 1]``, and transforming the same
    dataset a second time must give the same column sum.

    The ``dataset`` fixture (like ``tmpdir``, ``gpu_memory_frac`` and
    ``engine``) is requested but its value is not used: the data under test is
    rebuilt from ``df``.
    """
    # TODO: add tests for > 2 features, multiple crosses, etc.
    cat_names = [["name-string", "id"]]
    num_buckets = 10

    hashed_cross = cat_names >> ops.HashedCross(num_buckets)
    # Use a distinct local name so the ``dataset`` fixture is not shadowed.
    df_dataset = nvt.Dataset(df)
    processor = nvtabular.Workflow(hashed_cross)
    processor.fit(df_dataset)

    def run_transform():
        # Materialize the transformed dataset as a single in-memory frame.
        return processor.transform(df_dataset).to_ddf().compute()

    new_column_name = "_X_".join(cat_names[0])

    first_pass = run_transform()
    assert np.all(first_pass[new_column_name].values >= 0)
    # Upper bound is tied to num_buckets instead of a hard-coded literal.
    assert np.all(first_pass[new_column_name].values <= num_buckets - 1)
    checksum = first_pass[new_column_name].sum()

    # Check sums for determinism: a second transform must match the first.
    second_pass = run_transform()
    assert second_pass[new_column_name].sum() == checksum
# initial column selector works with tags
# filter within the workflow by tags
# test tags correct at output
@pytest.mark.parametrize(
    "op",
    [
        ops.Bucketize([1]),
        ops.Rename(postfix="_trim"),
        ops.Categorify(),
        ops.Categorify(encode_type="combo"),
        ops.Clip(0),
        ops.DifferenceLag("col1"),
        ops.FillMissing(),
        ops.Groupby("col1"),
        ops.HashBucket(1),
        ops.HashedCross(1),
        ops.JoinGroupby("col1"),
        ops.ListSlice(0),
        ops.LogOp(),
        ops.Normalize(),
        ops.TargetEncoding("col1"),
    ],
)
def test_workflow_select_by_tags(op):
    schema1 = ColumnSchema("col1", tags=["b", "c", "d"])
    schema2 = ColumnSchema("col2", tags=["c", "d"])
    schema3 = ColumnSchema("col3", tags=["d"])
    schema = Schema([schema1, schema2, schema3])

    cont_features = ColumnSelector(tags=["c"]) >> op
    workflow = Workflow(cont_features)