def test_joingroupby_multi(tmpdir, groups):
    df = pd.DataFrame(
        {
            "Author": ["User_A", "User_A", "User_A", "User_B"],
            "Engaging-User": ["User_B", "User_B", "User_C", "User_C"],
            "Cost": [100.0, 200.0, 300.0, 400.0],
            "Post": [1, 2, 3, 4],
        }
    )

    groupby_features = groups >> ops.JoinGroupby(
        out_path=str(tmpdir), stats=["sum"], cont_cols=["Cost"]
    )
    workflow = nvt.Workflow(groupby_features + "Post")
    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()

    if isinstance(groups, list):
        # Join on ["Author", "Engaging-User"]
        assert df_out["Author_Engaging-User_Cost_sum"].to_arrow().to_pylist() == [
            300.0,
            300.0,
            300.0,
            400.0,
        ]
    else:
        # Join on ["Author"]
        assert df_out["Author_Cost_sum"].to_arrow().to_pylist() == [600.0, 600.0, 600.0, 400.0]

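# The expected values above mirror a plain groupby-sum broadcast back onto the
# original rows. A minimal pandas-only reference sketch of that computation
# (the helper name is illustrative, not part of the NVTabular API):
def _joingroupby_sum_reference():
    ref = pd.DataFrame(
        {
            "Author": ["User_A", "User_A", "User_A", "User_B"],
            "Cost": [100.0, 200.0, 300.0, 400.0],
        }
    )
    # Sum "Cost" per "Author" and align the result to each row, matching the
    # "Author_Cost_sum" column that JoinGroupby(stats=["sum"]) produces.
    ref["Author_Cost_sum"] = ref.groupby("Author")["Cost"].transform("sum")
    assert ref["Author_Cost_sum"].to_list() == [600.0, 600.0, 600.0, 400.0]
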
def test_chaining_3():
    gdf_test = cudf.DataFrame(
        {
            "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
            "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
            "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
        }
    )

    platform_features = ["platform"] >> ops.Dropna()
    joined = ["ad_id"] >> ops.JoinGroupby(cont_cols=["clicked"], stats=["sum", "count"])
    # Per-ad click-through rate: the per-ad_id clicked sum divided by the
    # per-ad_id row count, renamed with a "_ctr" suffix.
    joined_lambda = (
        joined
        >> ops.LambdaOp(f=lambda col, gdf: col / gdf["ad_id_count"])
        >> ops.Rename(postfix="_ctr")
    )

    workflow = Workflow(platform_features + joined + joined_lambda)

    dataset = nvt.Dataset(gdf_test, engine="parquet")
    workflow.fit(dataset)
    result = workflow.transform(dataset).to_ddf().compute()

    assert all(
        x in result.columns for x in ["ad_id_count", "ad_id_clicked_sum_ctr", "ad_id_clicked_sum"]
    )

def test_cats_and_groupby_stats(client, tmpdir, datasets, part_mem_fraction, use_client):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client if use_client else None,
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
    )

    processor.add_preprocess(ops.Categorify(out_path=str(tmpdir), freq_threshold=10, on_host=True))
    processor.add_cat_feature(
        ops.JoinGroupby(cont_names=cont_names, stats=["count", "sum"], out_path=str(tmpdir))
    )
    processor.finalize()

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    processor.apply(dataset, output_path=str(tmpdir))
    result = processor.get_ddf().compute()

    assert "name-cat_x_sum" in result.columns
    assert "name-string_x_sum" in result.columns

def test_joingroupby_dependency(tmpdir, cpu):
    df = pd.DataFrame(
        {
            "Author": ["User_A", "User_A", "User_A", "User_B", "User_B"],
            "Cost": [100.0, 200.0, 300.0, 400.0, 400.0],
        }
    )

    normalized_cost = ["Cost"] >> nvt.ops.NormalizeMinMax() >> nvt.ops.Rename(
        postfix="_normalized"
    )
    groupby_features = ["Author"] >> ops.JoinGroupby(
        out_path=str(tmpdir), stats=["sum"], cont_cols=normalized_cost
    )
    workflow = nvt.Workflow(groupby_features)
    df_out = workflow.fit_transform(nvt.Dataset(df, cpu=cpu)).to_ddf().compute()

    if cpu:
        assert df_out["Author_Cost_normalized_sum"].to_list() == [1.0, 1.0, 1.0, 2.0, 2.0]
    else:
        assert df_out["Author_Cost_normalized_sum"].to_arrow().to_pylist() == [
            1.0,
            1.0,
            1.0,
            2.0,
            2.0,
        ]

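# Worked check for the expected values above (a pandas-only sketch; the helper
# name is illustrative): min-max scaling maps Cost [100, 200, 300, 400, 400]
# to [0, 1/3, 2/3, 1, 1], so the per-Author sums are 0 + 1/3 + 2/3 = 1.0 for
# User_A and 1 + 1 = 2.0 for User_B.
def _normalized_sum_reference():
    ref = pd.DataFrame(
        {
            "Author": ["User_A", "User_A", "User_A", "User_B", "User_B"],
            "Cost": [100.0, 200.0, 300.0, 400.0, 400.0],
        }
    )
    # Min-max normalize, then broadcast the per-Author sum back to each row.
    scaled = (ref["Cost"] - ref["Cost"].min()) / (ref["Cost"].max() - ref["Cost"].min())
    sums = scaled.groupby(ref["Author"]).transform("sum")
    assert np.allclose(sums, [1.0, 1.0, 1.0, 2.0, 2.0])
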
def test_cats_and_groupby_stats(client, tmpdir, datasets, part_mem_fraction, use_client):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]

    cats = ColumnSelector(cat_names)
    cat_features = cats >> ops.Categorify(out_path=str(tmpdir), freq_threshold=10, on_host=True)
    groupby_features = cats >> ops.JoinGroupby(
        cont_cols=cont_names, stats=["count", "sum"], out_path=str(tmpdir)
    )

    # We have a global dask client defined in this context, so NVTabular
    # should warn us if we initialize a `Workflow` with `client=None`
    workflow = run_in_context(
        Workflow,
        cat_features + groupby_features,
        context=None if use_client else pytest.warns(UserWarning),
        client=client if use_client else None,
    )

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    result = workflow.fit_transform(dataset).to_ddf().compute()

    assert "name-cat_x_sum" in result.columns
    assert "name-string_x_sum" in result.columns

def test_joingroupby_multi(tmpdir, groups):
    df = pd.DataFrame(
        {
            "Author": ["User_A", "User_A", "User_A", "User_B"],
            "Engaging-User": ["User_B", "User_B", "User_C", "User_C"],
            "Cost": [100.0, 200.0, 300.0, 400.0],
            "Post": [1, 2, 3, 4],
        }
    )

    cat_names = ["Author", "Engaging-User"]
    cont_names = ["Cost"]
    label_name = ["Post"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    processor.add_preprocess(
        ops.JoinGroupby(columns=groups, out_path=str(tmpdir), stats=["sum"], cont_names=["Cost"])
    )
    processor.finalize()

    processor.apply(nvt.Dataset(df), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    if isinstance(groups, list):
        # Join on ["Author", "Engaging-User"]
        assert df_out["Author_Engaging-User_Cost_sum"].to_arrow().to_pylist() == [
            300.0,
            300.0,
            300.0,
            400.0,
        ]
    else:
        # Join on ["Author"]
        assert df_out["Author_Cost_sum"].to_arrow().to_pylist() == [600.0, 600.0, 600.0, 400.0]

def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    features = cat_names >> ops.JoinGroupby(
        cont_names=cont_names, stats=["count", "sum", "std", "min"], out_path=str(tmpdir)
    )

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    workflow = Workflow(features + cat_names + cont_names + label_name, client=client)
    result = workflow.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check "count"
    assert_eq(
        result[["name-cat", "name-cat_count"]]
        .drop_duplicates()
        .sort_values("name-cat")["name-cat_count"],
        df0.groupby("name-cat").agg({"x": "count"})["x"].astype(np.int64),
        check_index=False,
        check_dtype=False,  # May get int64 vs int32
        check_names=False,
    )

    # Check "min"
    assert_eq(
        result[["name-string", "name-string_x_min"]]
        .drop_duplicates()
        .sort_values("name-string")["name-string_x_min"],
        df0.groupby("name-string").agg({"x": "min"})["x"],
        check_index=False,
        check_names=False,
    )

    # Check "std"
    assert_eq(
        result[["name-string", "name-string_x_std"]]
        .drop_duplicates()
        .sort_values("name-string")["name-string_x_std"],
        df0.groupby("name-string").agg({"x": "std"})["x"],
        check_index=False,
        check_names=False,
    )

def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    features = cat_names >> ops.JoinGroupby(
        cont_cols=cont_names, stats=["count", "sum", "std", "min"], out_path=str(tmpdir)
    )

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    workflow = Workflow(features + cat_names + cont_names + label_name, client=client)
    result = workflow.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check results. Need to sort for direct comparison
    expect = df0.sort_values(["label", "x", "y", "id"]).reset_index(drop=True).reset_index()
    got = result.sort_values(["label", "x", "y", "id"]).reset_index(drop=True).reset_index()
    gb_e = expect.groupby("name-cat").aggregate({"name-cat": "count", "x": ["sum", "min", "std"]})
    gb_e.columns = ["count", "sum", "min", "std"]
    df_check = got.merge(gb_e, left_on="name-cat", right_index=True, how="left")
    assert_eq(df_check["name-cat_count"], df_check["count"].astype("int64"), check_names=False)
    assert_eq(df_check["name-cat_x_sum"], df_check["sum"], check_names=False)
    assert_eq(df_check["name-cat_x_min"], df_check["min"], check_names=False)
    assert_eq(df_check["name-cat_x_std"], df_check["std"], check_names=False)

def test_cats_and_groupby_stats(client, tmpdir, datasets, part_mem_fraction, use_client):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]

    cats = ColumnGroup(cat_names)
    cat_features = cats >> ops.Categorify(out_path=str(tmpdir), freq_threshold=10, on_host=True)
    groupby_features = cats >> ops.JoinGroupby(
        cont_names=cont_names, stats=["count", "sum"], out_path=str(tmpdir)
    )

    workflow = Workflow(cat_features + groupby_features, client=client)
    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    result = workflow.fit_transform(dataset).to_ddf().compute()

    assert "name-cat_x_sum" in result.columns
    assert "name-string_x_sum" in result.columns

# filter within the workflow by tags
# test tags correct at output
@pytest.mark.parametrize(
    "op",
    [
        ops.Bucketize([1]),
        ops.Rename(postfix="_trim"),
        ops.Categorify(),
        ops.Categorify(encode_type="combo"),
        ops.Clip(0),
        ops.DifferenceLag("col1"),
        ops.FillMissing(),
        ops.Groupby("col1"),
        ops.HashBucket(1),
        ops.HashedCross(1),
        ops.JoinGroupby("col1"),
        ops.ListSlice(0),
        ops.LogOp(),
        ops.Normalize(),
        ops.TargetEncoding("col1"),
    ],
)
def test_workflow_select_by_tags(op):
    schema1 = ColumnSchema("col1", tags=["b", "c", "d"])
    schema2 = ColumnSchema("col2", tags=["c", "d"])
    schema3 = ColumnSchema("col3", tags=["d"])
    schema = Schema([schema1, schema2, schema3])

    # Selecting on tag "c" should resolve to "col1" and "col2" only
    cont_features = ColumnSelector(tags=["c"]) >> op
    workflow = Workflow(cont_features)
    workflow.fit_schema(schema)

@pytest.mark.parametrize("properties", [{}, {"p1": "1"}]) @pytest.mark.parametrize("tags", [[], ["TAG1", "TAG2"]]) @pytest.mark.parametrize( "op", [ ops.Bucketize([1]), ops.Rename(postfix="_trim"), ops.Categorify(), ops.Categorify(encode_type="combo"), ops.Clip(0), ops.DifferenceLag("1"), ops.FillMissing(), ops.Groupby(["1"]), ops.HashBucket(1), ops.HashedCross(1), ops.JoinGroupby(["1"]), ops.ListSlice(0), ops.LogOp(), ops.Normalize(), ops.TargetEncoding(["1"]), ops.AddMetadata(tags=["excellent"], properties={"domain": {"min": 0, "max": 20}}), ops.ValueCount(), ], ) @pytest.mark.parametrize("selection", [["1"], ["2", "3"], ["1", "2", "3", "4"]]) def test_schema_out(tags, properties, selection, op): # Create columnSchemas column_schemas = [] all_cols = [] for x in range(5): all_cols.append(str(x))