def test_hashed_cross(tmpdir, df, dataset, gpu_memory_frac, engine, use_dict):
    """Check that ``ops.HashedCross`` output is in range and repeatable.

    The op is applied to every partition yielded by ``dataset.to_iter()``.
    For each partition the crossed column (named by joining the crossed
    column names with ``"_X_"``) must only contain values in
    ``[0, num_buckets - 1]``. The dataset is then iterated a second time and
    the per-partition column sums must match those of the first pass.

    Args:
        dataset: object exposing ``to_iter()``; iterated twice.
        use_dict: when truthy the op is built from a ``{columns: buckets}``
            dict, otherwise from parallel lists of column groups and bucket
            counts.
        tmpdir, df, gpu_memory_frac, engine: not used by this body;
            presumably pytest fixtures/parameters -- kept so the signature
            stays unchanged.
    """
    # NOTE(review): a second function named ``test_hashed_cross`` follows in
    # this file; if both live in the same module the later definition
    # replaces this one -- confirm which is intended.
    # TODO: add tests for > 2 features, multiple crosses, etc.
    cat_names = ("name-string", "id")
    num_buckets = 10

    if use_dict:
        hashed_cross_op = ops.HashedCross({cat_names: num_buckets})
    else:
        hashed_cross_op = ops.HashedCross([cat_names], [num_buckets])

    columns_ctx = {"categorical": {"base": list(cat_names)}}

    # Loop-invariant, and needed by both passes below, so build it once.
    new_column_name = "_X_".join(cat_names)

    # check sums for determinacy
    checksums = []
    for gdf in dataset.to_iter():
        new_gdf = hashed_cross_op.apply_op(gdf, columns_ctx, "categorical")
        crossed_values = new_gdf[new_column_name].values
        assert np.all(crossed_values >= 0)
        # Upper bound is derived from num_buckets so the two cannot drift.
        assert np.all(crossed_values <= num_buckets - 1)
        checksums.append(new_gdf[new_column_name].sum())

    # Second pass over the same data must reproduce the first-pass sums.
    for checksum, gdf in zip(checksums, dataset.to_iter()):
        new_gdf = hashed_cross_op.apply_op(gdf, columns_ctx, "categorical")
        assert new_gdf[new_column_name].sum() == checksum
def test_hashed_cross(tmpdir, df, dataset, gpu_memory_frac, engine):
    """Check that ``ops.HashedCross`` run through a workflow is repeatable.

    A workflow is built from ``cat_names >> ops.HashedCross(num_buckets)``,
    fitted on a dataset created from ``df``, and used to transform that
    dataset twice. The crossed column (named by joining the crossed column
    names with ``"_X_"``) must only contain values in
    ``[0, num_buckets - 1]``, and its sum must be the same for both
    transforms.

    Args:
        df: input frame wrapped in ``nvt.Dataset``.
        dataset: rebound inside the body before it is read, so the incoming
            value is not used.
        tmpdir, gpu_memory_frac, engine: not used by this body; presumably
            pytest fixtures/parameters -- kept so the signature stays
            unchanged.
    """
    # TODO: add tests for > 2 features, multiple crosses, etc.
    cat_names = [["name-string", "id"]]
    num_buckets = 10

    hashed_cross = cat_names >> ops.HashedCross(num_buckets)

    # NOTE(review): both ``nvt`` and ``nvtabular`` are referenced below;
    # confirm both names are imported (they look like aliases of one package).
    dataset = nvt.Dataset(df)
    processor = nvtabular.Workflow(hashed_cross)
    processor.fit(dataset)

    new_gdf = processor.transform(dataset).to_ddf().compute()

    # check sums for determinacy
    new_column_name = "_X_".join(cat_names[0])
    assert np.all(new_gdf[new_column_name].values >= 0)
    # Upper bound is derived from num_buckets so the two cannot drift.
    assert np.all(new_gdf[new_column_name].values <= num_buckets - 1)
    checksum = new_gdf[new_column_name].sum()

    # A second transform of the same data must reproduce the same sum.
    new_gdf = processor.transform(dataset).to_ddf().compute()
    assert new_gdf[new_column_name].sum() == checksum
# initial column selector works with tags # filter within the workflow by tags # test tags correct at output @pytest.mark.parametrize( "op", [ ops.Bucketize([1]), ops.Rename(postfix="_trim"), ops.Categorify(), ops.Categorify(encode_type="combo"), ops.Clip(0), ops.DifferenceLag("col1"), ops.FillMissing(), ops.Groupby("col1"), ops.HashBucket(1), ops.HashedCross(1), ops.JoinGroupby("col1"), ops.ListSlice(0), ops.LogOp(), ops.Normalize(), ops.TargetEncoding("col1"), ], ) def test_workflow_select_by_tags(op): schema1 = ColumnSchema("col1", tags=["b", "c", "d"]) schema2 = ColumnSchema("col2", tags=["c", "d"]) schema3 = ColumnSchema("col3", tags=["d"]) schema = Schema([schema1, schema2, schema3]) cont_features = ColumnSelector(tags=["c"]) >> op workflow = Workflow(cont_features)