def test_fill_missing(tmpdir, df, dataset, engine):
    cont_names = ["x", "y"]
    cont_features = cont_names >> nvt.ops.FillMissing(fill_val=42)

    for col in cont_names:
        idx = np.random.choice(df.shape[0] - 1, int(df.shape[0] * 0.2))
        df[col].iloc[idx] = None

    df = df.reset_index()
    dataset = nvt.Dataset(df)
    processor = nvt.Workflow(cont_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    for col in cont_names:
        assert np.all((df[col].fillna(42) - new_gdf[col]).abs().values <= 1e-2)
        assert new_gdf[col].isna().sum() == 0

def test_joingroupby_dependency(tmpdir):
    df = pd.DataFrame(
        {
            "Author": ["User_A", "User_A", "User_A", "User_B", "User_B"],
            "Cost": [100.0, 200.0, 300.0, 400.0, 400.0],
        }
    )

    normalized_cost = ["Cost"] >> nvt.ops.NormalizeMinMax() >> nvt.ops.Rename(postfix="_normalized")
    groupby_features = ["Author"] >> ops.JoinGroupby(
        out_path=str(tmpdir), stats=["sum"], cont_cols=normalized_cost
    )
    workflow = nvt.Workflow(groupby_features)

    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()
    assert df_out["Author_Cost_normalized_sum"].to_arrow().to_pylist() == [1.0, 1.0, 1.0, 2.0, 2.0]

def test_workflow_fit_op_rename(tmpdir, dataset, engine):
    # tag every column in the dataset schema as a USER column
    schema = dataset.schema
    for name in schema.column_names:
        dataset.schema.column_schemas[name] = dataset.schema.column_schemas[name].with_tags(
            [nvt.graph.tags.Tags.USER]
        )

    selector = nvt.ColumnSelector(tags=[nvt.graph.tags.Tags.USER])

    workflow_ops_1 = selector >> nvt.ops.Rename(postfix="_1")
    workflow_1 = nvt.Workflow(workflow_ops_1)
    workflow_1.fit(dataset)
    workflow_1.save(str(tmpdir / "one"))

    new_dataset = workflow_1.transform(dataset).to_ddf().compute()
    assert len(new_dataset.columns) > 0
    assert all("_1" in col for col in new_dataset.columns)

def test_target_encode_multi(tmpdir, npartitions, cpu):
    cat_1 = np.asarray(["baaaa"] * 12)
    cat_2 = np.asarray(["baaaa"] * 6 + ["bbaaa"] * 3 + ["bcaaa"] * 3)
    num_1 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4])
    num_2 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4]) * 2
    df = cudf.DataFrame({"cat": cat_1, "cat2": cat_2, "num": num_1, "num_2": num_2})
    if cpu:
        df = dd.from_pandas(df.to_pandas(), npartitions=npartitions)
    else:
        df = dask_cudf.from_cudf(df, npartitions=npartitions)

    cat_groups = ["cat", "cat2", ["cat", "cat2"]]
    te_features = cat_groups >> ops.TargetEncoding(
        ["num", "num_2"], out_path=str(tmpdir), kfold=1, p_smooth=5, out_dtype="float32"
    )

    workflow = nvt.Workflow(te_features)
    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    assert "TE_cat_cat2_num" in df_out.columns
    assert "TE_cat_num" in df_out.columns
    assert "TE_cat2_num" in df_out.columns
    assert "TE_cat_cat2_num_2" in df_out.columns
    assert "TE_cat_num_2" in df_out.columns
    assert "TE_cat2_num_2" in df_out.columns

    assert_eq(df_out["TE_cat2_num"].values, df_out["TE_cat_cat2_num"].values)
    assert_eq(df_out["TE_cat2_num_2"].values, df_out["TE_cat_cat2_num_2"].values)
    assert df_out["TE_cat_num"].iloc[0] != df_out["TE_cat2_num"].iloc[0]
    assert df_out["TE_cat_num_2"].iloc[0] != df_out["TE_cat2_num_2"].iloc[0]
    assert math.isclose(df_out["TE_cat_num"].iloc[0], num_1.mean(), abs_tol=1e-4)
    assert math.isclose(df_out["TE_cat_num_2"].iloc[0], num_2.mean(), abs_tol=1e-3)

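# A minimal sketch (not part of the original tests) of the count-weighted
# smoothing that the p_smooth argument above refers to: each category's encoding
# blends the category's target statistics with the global target mean. For a
# column with a single category (like "cat" in this test), the group statistics
# equal the global ones, so the encoding collapses to the plain mean, which is
# what the math.isclose assertions check. The helper name is hypothetical.
def _smoothed_target_encoding(group_sum, group_count, global_mean, p_smooth=5):
    # (sum + p_smooth * global_mean) / (count + p_smooth)
    return (group_sum + p_smooth * global_mean) / (group_count + p_smooth)
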
def test_target_encode(tmpdir, cat_groups, kfold, fold_seed, cpu):
    df = dispatch._make_df(
        {
            "Author": list(string.ascii_uppercase),
            "Engaging-User": list(string.ascii_lowercase),
            "Cost": range(26),
            "Post": [0, 1] * 13,
        }
    )
    if cpu:
        df = dd.from_pandas(df if isinstance(df, pd.DataFrame) else df.to_pandas(), npartitions=3)
    else:
        df = dask_cudf.from_cudf(df, npartitions=3)

    cont_names = ["Cost"]
    te_features = cat_groups >> ops.TargetEncoding(
        cont_names,
        out_path=str(tmpdir),
        kfold=kfold,
        out_dtype="float32",
        fold_seed=fold_seed,
        drop_folds=False,  # Keep folds to validate
    )

    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp()
    workflow = nvt.Workflow(te_features + cont_features + ["Author", "Engaging-User"])
    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    df_lib = dispatch.get_lib()
    if kfold > 1:
        # Cat columns are unique.
        # Make sure __fold__ mapping is correct
        if cat_groups == "Author":
            name = "__fold___Author"
            cols = ["__fold__", "Author"]
        else:
            name = "__fold___Author_Engaging-User"
            cols = ["__fold__", "Author", "Engaging-User"]

        check = df_lib.read_parquet(te_features.op.stats[name])
        check = check[cols].sort_values(cols).reset_index(drop=True)
        df_out_check = df_out[cols].sort_values(cols).reset_index(drop=True)
        assert_eq(check, df_out_check, check_dtype=False)

def test_large_strings(tmpdir, output_model):
    strings = ["a" * (2**exp) for exp in range(1, 17)]
    df = _make_df({"description": strings})
    features = ["description"] >> ops.Categorify()
    workflow = nvt.Workflow(features)

    if output_model == "pytorch":
        model_info = {"description": {"columns": ["description"], "dtype": "int64"}}
    else:
        model_info = None

    _verify_workflow_on_tritonserver(
        tmpdir, workflow, df, "test_large_string", output_model, model_info
    )

def test_mh_support(tmpdir):
    df = nvt.dispatch._make_df(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Reviewers": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Authors", "Reviewers"]  # , "Engaging User"]
    cont_names = []
    label_name = ["Post"]
    if HAS_GPU:
        cats = cat_names >> ops.HashBucket(num_buckets=10)
    else:
        cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(cats + label_name)
    df_out = processor.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    # check to make sure that the same strings are hashed the same
    if HAS_GPU:
        authors = df_out["Authors"].to_arrow().to_pylist()
    else:
        authors = df_out["Authors"]
    assert authors[0][0] == authors[1][0]  # 'User_A'
    assert authors[2][1] == authors[3][0]  # 'User_C'

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name
    )
    idx = 0
    for batch in data_itr:
        idx = idx + 1
        cats_conts, labels = batch
        assert "Reviewers" in cats_conts
        # check it is multihot: the batch dict maps each multi-hot column name
        # to a (values, offsets) tuple
        assert isinstance(cats_conts["Reviewers"], tuple)
        assert "Authors" in cats_conts
        assert isinstance(cats_conts["Authors"], tuple)
    assert idx > 0

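# A hypothetical helper (not part of the original tests) sketching how a
# multi-hot (values, offsets) pair, as asserted on above, can be unpacked back
# into per-row lists. Plain Python sequences are assumed, and `offsets` is
# assumed to hold row start positions; a trailing boundary is appended when it
# is missing, since the exact convention is not guaranteed here.
def _unpack_multihot(values, offsets):
    bounds = list(offsets)
    if not bounds or bounds[-1] != len(values):
        bounds.append(len(values))
    return [list(values[start:end]) for start, end in zip(bounds[:-1], bounds[1:])]
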
def test_target_encode_multi(tmpdir, npartitions):
    cat_1 = np.asarray(["baaaa"] * 12)
    cat_2 = np.asarray(["baaaa"] * 6 + ["bbaaa"] * 3 + ["bcaaa"] * 3)
    num_1 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4])
    num_2 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4]) * 2
    df = cudf.DataFrame({"cat": cat_1, "cat2": cat_2, "num": num_1, "num_2": num_2})
    df = dask_cudf.from_cudf(df, npartitions=npartitions)

    cat_names = ["cat", "cat2"]
    cont_names = ["num", "num_2"]
    label_name = []
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    cat_groups = ["cat", "cat2", ["cat", "cat2"]]
    processor.add_preprocess(
        ops.TargetEncoding(
            cat_groups,
            ["num", "num_2"],  # cont_target
            out_path=str(tmpdir),
            kfold=1,
            p_smooth=5,
            out_dtype="float32",
        )
    )
    processor.finalize()
    processor.apply(nvt.Dataset(df), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    assert "TE_cat_cat2_num" in df_out.columns
    assert "TE_cat_num" in df_out.columns
    assert "TE_cat2_num" in df_out.columns
    assert "TE_cat_cat2_num_2" in df_out.columns
    assert "TE_cat_num_2" in df_out.columns
    assert "TE_cat2_num_2" in df_out.columns

    assert_eq(df_out["TE_cat2_num"].values, df_out["TE_cat_cat2_num"].values)
    assert_eq(df_out["TE_cat2_num_2"].values, df_out["TE_cat_cat2_num_2"].values)
    assert df_out["TE_cat_num"].iloc[0] != df_out["TE_cat2_num"].iloc[0]
    assert df_out["TE_cat_num_2"].iloc[0] != df_out["TE_cat2_num_2"].iloc[0]
    assert math.isclose(df_out["TE_cat_num"].iloc[0], num_1.mean(), abs_tol=1e-4)
    assert math.isclose(df_out["TE_cat_num_2"].iloc[0], num_2.mean(), abs_tol=1e-3)

def test_categorify_size(tmpdir, cpu, include_nulls):
    num_rows = 50
    num_distinct = 10

    possible_session_ids = list(range(num_distinct))
    if include_nulls:
        possible_session_ids.append(None)

    df = dispatch._make_df(
        {"session_id": [random.choice(possible_session_ids) for _ in range(num_rows)]},
        device="cpu" if cpu else None,
    )

    cat_features = ["session_id"] >> nvt.ops.Categorify(out_path=str(tmpdir))
    workflow = nvt.Workflow(cat_features)
    workflow.fit_transform(nvt.Dataset(df, cpu=cpu)).to_ddf().compute()

    vals = df["session_id"].value_counts()
    vocab = dispatch._read_dispatch(cpu=cpu)(
        os.path.join(tmpdir, "categories", "unique.session_id.parquet")
    )

    if cpu:
        expected = dict(zip(vals.index, vals))
        computed = {
            session: size
            for session, size in zip(vocab["session_id"], vocab["session_id_size"])
            if size
        }
    else:
        expected = dict(zip(vals.index.values_host, vals.values_host))
        computed = {
            session: size
            for session, size in zip(
                vocab["session_id"].values_host, vocab["session_id_size"].values_host
            )
            if size
        }
    first_key = list(computed.keys())[0]
    if pd.isna(first_key):
        computed.pop(first_key)
    assert computed == expected

def test_join_external_workflow(tmpdir, df, dataset, engine):
    # Define "external" table
    how = "left"
    drop_duplicates = True
    cache = "device"
    shift = 100
    df_ext = df[["id"]].copy().sort_values("id")
    df_ext["new_col"] = df_ext["id"] + shift
    df_ext["new_col_2"] = "keep"
    df_ext["new_col_3"] = "ignore"
    df_ext_check = df_ext.copy()

    # Define Op
    on = "id"
    columns_left = list(df.columns)
    columns_ext = ["id", "new_col", "new_col_2"]
    df_ext_check = df_ext_check[columns_ext]
    if drop_duplicates:
        df_ext_check.drop_duplicates(ignore_index=True, inplace=True)
    joined = nvt.ColumnGroup(columns_left) >> nvt.ops.JoinExternal(
        df_ext,
        on,
        how=how,
        columns_ext=columns_ext,
        cache=cache,
        drop_duplicates_ext=drop_duplicates,
    )

    # Define Workflow
    gdf = df.reset_index()
    dataset = nvt.Dataset(gdf)
    processor = nvt.Workflow(joined)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute().reset_index()

    # Validate
    check_gdf = gdf.merge(df_ext_check, how=how, on=on)
    assert len(check_gdf) == len(new_gdf)
    assert ((new_gdf["id"] + shift) == new_gdf["new_col"]).all()
    assert gdf["id"].all() == new_gdf["id"].all()
    assert "new_col_2" in new_gdf.columns
    assert "new_col_3" not in new_gdf.columns

def test_generate_triton_model(tmpdir, engine, df):
    conts = ["x", "y", "id"] >> ops.FillMissing() >> ops.Normalize()
    cats = ["name-cat", "name-string"] >> ops.Categorify(cat_cache="host")
    workflow = nvt.Workflow(conts + cats)
    workflow.fit(nvt.Dataset(df))
    expected = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    # save workflow to triton / verify we see some expected output
    repo = os.path.join(tmpdir, "models")
    triton.generate_nvtabular_model(workflow, "model", repo)
    workflow = None

    assert os.path.exists(os.path.join(repo, "config.pbtxt"))

    workflow = nvt.Workflow.load(os.path.join(repo, "1", "workflow"))
    transformed = workflow.transform(nvt.Dataset(df)).to_ddf().compute()
    assert_eq(expected, transformed)

def test_hash_bucket(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    cat_names = ["name-string"]

    if op_columns is None:
        num_buckets = 10
    else:
        num_buckets = {column: 10 for column in op_columns}

    hash_features = cat_names >> ops.HashBucket(num_buckets)
    processor = nvt.Workflow(hash_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # check that hashed values fall inside the bucket range and that
    # re-running the transform gives deterministic sums
    assert np.all(new_gdf[cat_names].values >= 0)
    assert np.all(new_gdf[cat_names].values <= 9)
    checksum = new_gdf[cat_names].sum().values

    new_gdf = processor.transform(dataset).to_ddf().compute()
    assert np.all(new_gdf[cat_names].sum().values == checksum)

def test_hugectr(tmpdir, df, dataset, output_format, engine, op_columns):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]

    # set variables
    nfiles = 10
    ext = ""
    outdir = tmpdir + "/hugectr"
    os.mkdir(outdir)

    # process data
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_names)
    processor.add_feature([ops.ZeroFill(columns=op_columns), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    # Need to collect statistics first (for now)
    processor.update_stats(dataset)

    # Second "online" pass to write HugeCTR output
    processor.apply(
        dataset,
        apply_offline=False,
        record_stats=False,
        output_path=outdir,
        out_files_per_proc=nfiles,
        output_format=output_format,
        shuffle=False,
    )

    # Check files
    if output_format == "parquet":
        ext = "parquet"
        assert os.path.isfile(outdir + "/metadata.json")
    elif output_format == "hugectr":
        ext = "data"
        assert os.path.isfile(outdir + "/file_list.txt")

    for n in range(nfiles):
        assert os.path.isfile(os.path.join(outdir, str(n) + "." + ext))

def test_normalize_lists(tmpdir, cpu):
    df = dispatch._make_df(device="cpu" if cpu else "gpu")
    df["vals"] = [
        [0.0, 1.0, 2.0],
        [3.0, 4.0],
        [5.0],
    ]

    features = ["vals"] >> nvt.ops.Normalize()
    workflow = nvt.Workflow(features)
    transformed = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()

    expected = _flatten_list_column_values(df["vals"]).astype("float32")
    expected = (expected - expected.mean()) / expected.std()
    expected_df = type(transformed)({"vals": expected})

    assert_eq(expected_df, _flatten_list_column(transformed["vals"]))

def test_categorify_single_table():
    df = dispatch._make_df(
        {
            "Authors": [None, "User_A", "User_A", "User_E", "User_B", "User_C"],
            "Engaging_User": [None, "User_B", "User_B", "User_A", "User_D", "User_D"],
            "Post": [1, 2, 3, 4, None, 5],
        }
    )
    cat_names = ["Authors", "Engaging_User"]
    dataset = nvt.Dataset(df)
    features = cat_names >> ops.Categorify(single_table=True)
    processor = nvt.Workflow(features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # with single_table=True the columns share one id space, so each column's
    # minimum encoding should start at or beyond the previous column's maximum
    old_max = 0
    for name in cat_names:
        curr_min = new_gdf[name].min()
        assert old_max <= curr_min
        curr_max = new_gdf[name].max()
        old_max += curr_max

def test_categorify_multi_combo(tmpdir):
    cat_names = [["Author", "Engaging User"], ["Author"], "Engaging User"]
    kind = "combo"
    df = pd.DataFrame(
        {
            "Author": ["User_A", "User_E", "User_B", "User_C"],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    label_name = ["Post"]
    cats = cat_names >> ops.Categorify(out_path=str(tmpdir), encode_type=kind)
    workflow = nvt.Workflow(cats + label_name)
    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    # Column combinations are encoded
    assert df_out["Author"].to_arrow().to_pylist() == [1, 4, 2, 3]
    assert df_out["Engaging User"].to_arrow().to_pylist() == [2, 2, 1, 3]
    assert df_out["Author_Engaging User"].to_arrow().to_pylist() == [1, 4, 2, 3]

def test_categorify_multi(tmpdir, cat_names, kind, cpu):
    df = pd.DataFrame(
        {
            "Author": ["User_A", "User_E", "User_B", "User_C"],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    label_name = ["Post"]
    cats = cat_names >> ops.Categorify(out_path=str(tmpdir), encode_type=kind)
    workflow = nvt.Workflow(cats + label_name)
    df_out = workflow.fit_transform(nvt.Dataset(df, cpu=cpu)).to_ddf().compute(
        scheduler="synchronous"
    )

    if len(cat_names) == 1:
        if kind == "joint":
            # Columns are encoded jointly
            compare_authors = (
                df_out["Author"].to_list() if cpu else df_out["Author"].to_arrow().to_pylist()
            )
            compare_engaging = (
                df_out["Engaging User"].to_list()
                if cpu
                else df_out["Engaging User"].to_arrow().to_pylist()
            )
            # User_B has the highest frequency, so it gets the lowest encoding
            assert compare_authors == [2, 5, 1, 3]
            assert compare_engaging == [1, 1, 2, 4]
        else:
            # Column combinations are encoded
            compare_engaging = (
                df_out["Author_Engaging User"].to_list()
                if cpu
                else df_out["Author_Engaging User"].to_arrow().to_pylist()
            )
            assert compare_engaging == [1, 4, 2, 3]
    else:
        # Columns are encoded independently
        compare_authors = (
            df_out["Author"].to_list() if cpu else df_out["Author"].to_arrow().to_pylist()
        )
        compare_engaging = (
            df_out["Engaging User"].to_list()
            if cpu
            else df_out["Engaging User"].to_arrow().to_pylist()
        )
        assert compare_authors == [1, 4, 2, 3]
        # User_B comes first in the frequency-based ordering
        assert compare_engaging == [1, 1, 2, 3]

def test_moments(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    config = nvt.workflow.get_new_config()
    config["PP"]["continuous"] = [ops.Moments(columns=op_columns)]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
    )
    processor.update_stats(dataset)

    assert df.x.count() == processor.stats["counts"]["x"]
    assert df.x.count() == 4321

    # Check mean and std
    assert math.isclose(df.x.mean(), processor.stats["means"]["x"], rel_tol=1e-4)
    assert math.isclose(df.x.std(), processor.stats["stds"]["x"], rel_tol=1e-3)
    if not op_columns:
        assert math.isclose(df.y.mean(), processor.stats["means"]["y"], rel_tol=1e-4)
        assert math.isclose(df.id.mean(), processor.stats["means"]["id"], rel_tol=1e-4)
        assert math.isclose(df.y.std(), processor.stats["stds"]["y"], rel_tol=1e-3)
        assert math.isclose(df.id.std(), processor.stats["stds"]["id"], rel_tol=1e-3)

    return processor.ds_exports

def test_na_value_count(tmpdir):
    gdf = dispatch._make_df(
        {
            "productID": ["B00406YHLI"] * 5
            + ["B002YXS8E6"] * 5
            + ["B00011KM38"] * 2
            + [np.nan] * 3,
            "brand": ["Coby"] * 5 + [np.nan] * 5 + ["Cooler Master"] * 2 + ["Asus"] * 3,
        }
    )

    cat_features = ["brand", "productID"] >> nvt.ops.Categorify()
    workflow = nvt.Workflow(cat_features)
    train_dataset = nvt.Dataset(gdf, engine="parquet")

    workflow.fit(train_dataset)
    workflow.transform(train_dataset).to_ddf().compute()

    single_cat = dispatch._read_dispatch("./categories/unique.brand.parquet")(
        "./categories/unique.brand.parquet"
    )
    second_cat = dispatch._read_dispatch("./categories/unique.productID.parquet")(
        "./categories/unique.productID.parquet"
    )
    assert single_cat["brand_size"][0] == 5
    assert second_cat["productID_size"][0] == 3

def test_categorify_lists(tmpdir, freq_threshold):
    df = cudf.DataFrame(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Authors", "Engaging User"]
    label_name = ["Post"]

    cat_features = cat_names >> ops.Categorify(out_path=str(tmpdir), freq_threshold=freq_threshold)
    workflow = nvt.Workflow(cat_features + label_name)
    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()

    # Columns are encoded independently
    if freq_threshold < 2:
        assert df_out["Authors"].to_arrow().to_pylist() == [[1], [1, 4], [2, 3], [3]]
    else:
        assert df_out["Authors"].to_arrow().to_pylist() == [[1], [1, 0], [0, 2], [2]]

def test_chaining_2():
    gdf = cudf.DataFrame(
        {
            "A": [1, 2, 2, 9, 6, np.nan, 3],
            "B": [2, np.nan, 4, 7, 7, 2, 5],
            "C": ["a", "b", "c", np.nan, np.nan, "g", "k"],
        }
    )
    proc = nvt.Workflow(cat_names=["C"], cont_names=["A", "B"], label_name=[])

    proc.add_feature(
        nvt.ops.LambdaOp(op_name="isnull", f=lambda col, gdf: col.isnull(), replace=False)
    )
    proc.add_cat_preprocess(nvt.ops.Categorify())

    train_dataset = nvt.Dataset(gdf, engine="parquet")
    proc.apply(train_dataset, apply_offline=True, record_stats=True, output_path=None)
    result = proc.get_ddf().compute()

    assert all(x in list(result.columns) for x in ["A_isnull", "B_isnull", "C_isnull"])
    assert (x in result["C"].unique() for x in set(gdf["C"].dropna().to_arrow()))

def test_categorify_hash_bucket(cpu):
    df = dispatch._make_df(
        {
            "Authors": ["User_A", "User_A", "User_E", "User_B", "User_C"],
            "Engaging_User": ["User_B", "User_B", "User_A", "User_D", "User_D"],
            "Post": [1, 2, 3, 4, 5],
        }
    )
    cat_names = ["Authors", "Engaging_User"]
    buckets = 10
    dataset = nvt.Dataset(df, cpu=cpu)
    hash_features = cat_names >> ops.Categorify(num_buckets=buckets)
    processor = nvt.Workflow(hash_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # check hashed values
    assert new_gdf["Authors"].max() <= (buckets - 1)
    assert new_gdf["Engaging_User"].max() <= (buckets - 1)
    # check embedding size is equal to the num_buckets after hashing
    assert nvt.ops.get_embedding_sizes(processor)["Authors"][0] == buckets
    assert nvt.ops.get_embedding_sizes(processor)["Engaging_User"][0] == buckets

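# A minimal sketch (not part of the test suite) of the idea the hash-bucket
# tests above exercise: hash bucketing maps arbitrary category values into the
# fixed range [0, num_buckets), so the embedding table only needs num_buckets
# rows. Python's built-in hash() stands in for the library's hash function here
# (note that built-in str hashing is salted per process, unlike the op), and
# the helper name is hypothetical.
def _toy_hash_bucket(value, num_buckets=10):
    return hash(value) % num_buckets
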
def test_hash_bucket_lists(tmpdir):
    df = cudf.DataFrame(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Authors"]  # , "Engaging User"]
    dataset = nvt.Dataset(df)

    hash_features = cat_names >> ops.HashBucket(num_buckets=10)
    processor = nvt.Workflow(hash_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # check to make sure that the same strings are hashed the same
    authors = new_gdf["Authors"].to_arrow().to_pylist()
    assert authors[0][0] == authors[1][0]  # 'User_A'
    assert authors[2][1] == authors[3][0]  # 'User_C'

    assert nvt.ops.get_embedding_sizes(processor)["Authors"][0] == 10

def test_empty_cols(tmpdir, df, dataset, engine, cat_names, cont_names, label_name):
    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    if cont_names:
        processor.add_feature([ops.FillMedian()])
        processor.add_feature(ops.Normalize())
    if cat_names:
        processor.add_feature(ops.Categorify())

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_format=None,
    )
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out),
        cats=cat_names,
        conts=cont_names,
        labels=label_name,
        batch_size=1,
    )

    for nvt_batch in data_itr:
        cats, conts, labels = nvt_batch
        if cat_names:
            assert cats.shape[-1] == len(cat_names)
        if cont_names:
            assert conts.shape[-1] == len(cont_names)
        if label_name:
            assert labels.shape[-1] == len(label_name)

def test_joingroupby_multi(tmpdir, groups):
    df = pd.DataFrame(
        {
            "Author": ["User_A", "User_A", "User_A", "User_B"],
            "Engaging-User": ["User_B", "User_B", "User_C", "User_C"],
            "Cost": [100.0, 200.0, 300.0, 400.0],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Author", "Engaging-User"]
    cont_names = ["Cost"]
    label_name = ["Post"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    processor.add_preprocess(
        ops.JoinGroupby(columns=groups, out_path=str(tmpdir), stats=["sum"], cont_names=["Cost"])
    )
    processor.finalize()
    processor.apply(nvt.Dataset(df), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    if isinstance(groups, list):
        # Join on ["Author", "Engaging-User"]
        assert df_out["Author_Engaging-User_Cost_sum"].to_arrow().to_pylist() == [
            300.0,
            300.0,
            300.0,
            400.0,
        ]
    else:
        # Join on ["Author"]
        assert df_out["Author_Cost_sum"].to_arrow().to_pylist() == [600.0, 600.0, 600.0, 400.0]

def test_chaining_3():
    gdf_test = cudf.DataFrame(
        {
            "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
            "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
            "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
        }
    )
    proc = nvt.Workflow(
        cat_names=["ad_id", "source_id", "platform"], cont_names=[], label_name=["clicked"]
    )

    # apply dropna
    proc.add_feature(
        [
            nvt.ops.Dropna(["platform"]),
            nvt.ops.JoinGroupby(columns=["ad_id"], cont_names=["clicked"], stats=["sum", "count"]),
            nvt.ops.LambdaOp(
                op_name="ctr",
                f=lambda col, gdf: col / gdf["ad_id_count"],
                columns=["ad_id_clicked_sum"],
                replace=False,
            ),
        ]
    )

    proc.finalize()
    assert len(proc.phases) == 2

    GPU_MEMORY_FRAC = 0.2
    train_dataset = nvt.Dataset(gdf_test, engine="parquet", part_mem_fraction=GPU_MEMORY_FRAC)
    proc.apply(
        train_dataset, apply_offline=True, record_stats=True, output_path=None, shuffle=False
    )
    result = proc.get_ddf().compute()
    assert all(
        x in result.columns for x in ["ad_id_count", "ad_id_clicked_sum_ctr", "ad_id_clicked_sum"]
    )

def test_groupby_model(tmpdir, output_model):
    size = 20
    df = _make_df(
        {
            "id": np.random.choice([0, 1], size=size),
            "ts": np.linspace(0.0, 10.0, num=size),
            "x": np.arange(size),
            "y": np.linspace(0.0, 10.0, num=size),
        }
    )

    groupby_features = ColumnSelector(["id", "ts", "x", "y"]) >> ops.Groupby(
        groupby_cols=["id"],
        sort_cols=["ts"],
        aggs={
            "x": ["sum"],
            "y": ["first"],
        },
        name_sep="-",
    )
    workflow = nvt.Workflow(groupby_features)

    if output_model == "pytorch":
        model_info = {
            "x-sum": {"columns": ["x-sum"], "dtype": "int64"},
            "y-first": {"columns": ["y-first"], "dtype": "float64"},
            "id": {"columns": ["id"], "dtype": "int64"},
        }
    else:
        model_info = None

    _verify_workflow_on_tritonserver(tmpdir, workflow, df, "groupby", output_model, model_info)

def test_encoder(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    encoder = ops.CategoryStatistics(columns=op_columns)
    config = nvt.workflow.get_new_config()
    config["PP"]["categorical"] = [encoder]

    processor = nvt.Workflow(
        cat_names=cat_names, cont_names=cont_names, label_name=label_name, config=config
    )
    processor.update_stats(dataset)

    if engine == "parquet" and not op_columns:
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(processor, "name-cat")
        assert cats0.tolist() == [None] + cats_expected0.tolist()

        cats_expected1 = df["name-string"].unique().values_host
        cats1 = get_cats(processor, "name-string")
        assert cats1.tolist() == [None] + cats_expected1.tolist()

def test_median(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    config = nvt.workflow.get_new_config()
    config["PP"]["continuous"] = [ops.Median(columns=op_columns)]

    processor = nvt.Workflow(
        cat_names=cat_names, cont_names=cont_names, label_name=label_name, config=config
    )
    processor.update_stats(dataset)

    # Check median (TODO: Improve the accuracy)
    x_median = df.x.dropna().quantile(0.5, interpolation="linear")
    assert math.isclose(x_median, processor.stats["medians"]["x"], rel_tol=1e1)
    if not op_columns:
        y_median = df.y.dropna().quantile(0.5, interpolation="linear")
        id_median = df.id.dropna().quantile(0.5, interpolation="linear")
        assert math.isclose(y_median, processor.stats["medians"]["y"], rel_tol=1e1)
        assert math.isclose(id_median, processor.stats["medians"]["id"], rel_tol=1e1)

def test_numeric_dtypes(tmpdir, output_model):
    if output_model == "pytorch":
        model_info = dict()
    else:
        model_info = None

    dtypes = []
    for width in [8, 16, 32, 64]:
        dtype = f"int{width}"
        dtypes.append((dtype, np.iinfo(dtype)))
        if output_model == "pytorch":
            model_info[dtype] = {"columns": [dtype], "dtype": dtype}

        dtype = f"uint{width}"
        dtypes.append((dtype, np.iinfo(dtype)))
        if output_model == "pytorch":
            model_info[dtype] = {"columns": [dtype], "dtype": dtype}

    for width in [32, 64]:
        dtype = f"float{width}"
        dtypes.append((dtype, np.finfo(dtype)))
        if output_model == "pytorch":
            model_info[dtype] = {"columns": [dtype], "dtype": dtype}

    def check_dtypes(col):
        assert str(col.dtype) == col.name
        return col

    # simple transform to make sure we can round-trip the min/max values for each dtype
    # through triton, with the 'transform' here just checking that the dtypes are correct
    df = _make_df(
        {dtype: np.array([limits.max, 0, limits.min], dtype=dtype) for dtype, limits in dtypes}
    )
    features = nvt.ColumnSelector(df.columns) >> check_dtypes
    workflow = nvt.Workflow(features)
    _verify_workflow_on_tritonserver(
        tmpdir, workflow, df, "test_numeric_dtypes", output_model, model_info
    )