def test_target_encode(tmpdir, cat_groups, kfold, fold_seed, cpu):
    df = dispatch._make_df(
        {
            "Author": list(string.ascii_uppercase),
            "Engaging-User": list(string.ascii_lowercase),
            "Cost": range(26),
            "Post": [0, 1] * 13,
        }
    )
    if cpu:
        df = dd.from_pandas(
            df if isinstance(df, pd.DataFrame) else df.to_pandas(), npartitions=3
        )
    else:
        df = dask_cudf.from_cudf(df, npartitions=3)

    cont_names = ["Cost"]
    te_features = cat_groups >> ops.TargetEncoding(
        cont_names,
        out_path=str(tmpdir),
        kfold=kfold,
        out_dtype="float32",
        fold_seed=fold_seed,
        drop_folds=False,  # Keep folds to validate
    )

    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp()
    workflow = nvt.Workflow(te_features + cont_features + ["Author", "Engaging-User"])
    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    df_lib = dispatch.get_lib()
    if kfold > 1:
        # Cat columns are unique.
        # Make sure __fold__ mapping is correct
        if cat_groups == "Author":
            name = "__fold___Author"
            cols = ["__fold__", "Author"]
        else:
            name = "__fold___Author_Engaging-User"
            cols = ["__fold__", "Author", "Engaging-User"]

        check = df_lib.read_parquet(te_features.op.stats[name])
        check = check[cols].sort_values(cols).reset_index(drop=True)
        df_out_check = df_out[cols].sort_values(cols).reset_index(drop=True)
        assert_eq(check, df_out_check, check_dtype=False)
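# Hedged sketch (not part of the original suite): the smallest TargetEncoding
# round trip I'd expect to work, without the dask/parametrize machinery above.
# Assumes pandas is available and `nvt`/`ops` are imported as in this module;
# the column names are illustrative only.
def _example_target_encoding():
    import pandas as pd

    df = pd.DataFrame({"Author": ["A", "B", "A", "B"], "Cost": [1.0, 2.0, 3.0, 4.0]})
    # encode "Author" by (smoothed) means of the continuous target "Cost"
    te = ["Author"] >> ops.TargetEncoding(["Cost"], kfold=1, out_dtype="float32")
    workflow = nvt.Workflow(te)
    return workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()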
def test_nested_workflow_node():
    df = dispatch._make_df(
        {
            "geo": ["US>CA", "US>NY", "CA>BC", "CA>ON"],
            "user": ["User_A", "User_A", "User_A", "User_B"],
        }
    )
    dataset = Dataset(df)

    geo_selector = ColumnSelector(["geo"])
    country = geo_selector >> LambdaOp(lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country")
    # country1 = geo_selector >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country1")
    # country2 = geo_selector >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country2")
    user = "user"
    # user2 = "user2"

    # make sure we can do a 'combo' categorify (cross based) of country+user
    # as well as categorifying the country and user columns on their own
    cats = country + user + [country + user] >> Categorify(encode_type="combo")

    workflow = Workflow(cats)
    workflow.fit_schema(dataset.infer_schema())

    df_out = workflow.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    geo_country = df_out["geo_country"]
    assert geo_country[0] == geo_country[1]  # rows 0,1 are both 'US'
    assert geo_country[2] == geo_country[3]  # rows 2,3 are both 'CA'

    user = df_out["user"]
    assert user[0] == user[1] == user[2]
    assert user[3] != user[2]

    geo_country_user = df_out["geo_country_user"]
    assert geo_country_user[0] == geo_country_user[1]  # US / userA
    assert geo_country_user[2] != geo_country_user[0]  # same user but in canada

    # make sure we get an exception if we nest too deeply (can't handle arbitrarily deep
    # nested column groups - and the exceptions we would get in operators like Categorify
    # are super confusing for users)
    with pytest.raises(ValueError):
        cats = [[country + "user"] + country + "user"] >> Categorify(encode_type="combo")
def test_categorify_size(tmpdir, cpu, include_nulls):
    num_rows = 50
    num_distinct = 10

    possible_session_ids = list(range(num_distinct))
    if include_nulls:
        possible_session_ids.append(None)

    df = dispatch._make_df(
        {"session_id": [random.choice(possible_session_ids) for _ in range(num_rows)]},
        device="cpu" if cpu else None,
    )

    cat_features = ["session_id"] >> nvt.ops.Categorify(out_path=str(tmpdir))
    workflow = nvt.Workflow(cat_features)
    workflow.fit_transform(nvt.Dataset(df, cpu=cpu)).to_ddf().compute()

    vals = df["session_id"].value_counts()
    vocab = dispatch._read_dispatch(cpu=cpu)(
        os.path.join(tmpdir, "categories", "unique.session_id.parquet")
    )

    if cpu:
        expected = dict(zip(vals.index, vals))
        computed = {
            session: size
            for session, size in zip(vocab["session_id"], vocab["session_id_size"])
            if size
        }
    else:
        expected = dict(zip(vals.index.values_host, vals.values_host))
        computed = {
            session: size
            for session, size in zip(
                vocab["session_id"].values_host, vocab["session_id_size"].values_host
            )
            if size
        }
    first_key = list(computed.keys())[0]
    if pd.isna(first_key):
        computed.pop(first_key)
    assert computed == expected
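# Hedged note (not in the original): the vocabulary check above relies on the
# on-disk layout Categorify writes under <out_path>/categories/ - one
# unique.<column>.parquet per encoded column, holding the category values
# (e.g. "session_id") alongside their observed counts (e.g. "session_id_size").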
def test_categorify_single_table():
    df = dispatch._make_df(
        {
            "Authors": [None, "User_A", "User_A", "User_E", "User_B", "User_C"],
            "Engaging_User": [None, "User_B", "User_B", "User_A", "User_D", "User_D"],
            "Post": [1, 2, 3, 4, None, 5],
        }
    )
    cat_names = ["Authors", "Engaging_User"]
    dataset = nvt.Dataset(df)
    features = cat_names >> ops.Categorify(single_table=True)
    processor = nvt.Workflow(features)

    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # with single_table=True all columns share one id space, so each column's
    # encodings must start at or above the previous column's maximum
    old_max = 0
    for name in cat_names:
        curr_min = new_gdf[name].min()
        assert old_max <= curr_min
        curr_max = new_gdf[name].max()
        old_max += curr_max
def test_normalize_lists(tmpdir, cpu):
    df = dispatch._make_df(device="cpu" if cpu else "gpu")
    df["vals"] = [
        [0.0, 1.0, 2.0],
        [3.0, 4.0],
        [5.0],
    ]

    features = ["vals"] >> nvt.ops.Normalize()
    workflow = nvt.Workflow(features)
    transformed = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()

    expected = _flatten_list_column_values(df["vals"]).astype("float32")
    expected = (expected - expected.mean()) / expected.std()
    expected_df = type(transformed)({"vals": expected})

    assert_eq(expected_df, _flatten_list_column(transformed["vals"]))
def _create_tensors(self, gdf):
    """
    Breaks a dataframe down into the relevant
    categorical, continuous, and label tensors.
    Can be overridden
    """
    column_groups = (self.cat_names, self.cont_names, self.label_names)
    dtypes = (self._LONG_DTYPE, self._FLOAT32_DTYPE, self._FLOAT32_DTYPE)
    tensors = []
    offsets = _make_df(device=self.device)
    for column_names, dtype in zip(column_groups, dtypes):
        if len(column_names) == 0:
            tensors.append(None)
            continue

        gdf_i = gdf[column_names]
        gdf.drop(columns=column_names, inplace=True)

        scalars, lists = self._separate_list_columns(gdf_i)

        x = None
        if scalars:
            # should always return dict column_name: values, offsets (optional)
            x = self._to_tensor(gdf_i[scalars], dtype)
        if lists:
            list_tensors = OrderedDict()
            for column_name in lists:
                column = gdf_i.pop(column_name)
                leaves, offsets[column_name] = _pull_apart_list(column)
                list_tensors[column_name] = self._to_tensor(leaves, dtype)
            x = x, list_tensors
        tensors.append(x)

    if not offsets.empty:
        offsets_tensor = self._to_tensor(offsets, self._LONG_DTYPE)
        if len(offsets_tensor.shape) == 1:
            offsets_tensor = offsets_tensor[:, None]
        tensors.append(offsets_tensor)
    del gdf, offsets

    return tensors
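# Hedged sketch (not in the original): the contract the list branch above
# relies on - _pull_apart_list splits a list column into a flat "leaves"
# series plus per-row offsets. Assumes pandas for illustration.
def _example_pull_apart_list():
    import pandas as pd

    col = pd.Series([[0, 1, 2], [3], [4, 5]])
    leaves, offsets = _pull_apart_list(col)
    # leaves holds 0..5 in order; offsets mark where each row's values begin
    return leaves, offsets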
def test_categorify_hash_bucket(cpu):
    df = dispatch._make_df(
        {
            "Authors": ["User_A", "User_A", "User_E", "User_B", "User_C"],
            "Engaging_User": ["User_B", "User_B", "User_A", "User_D", "User_D"],
            "Post": [1, 2, 3, 4, 5],
        }
    )
    cat_names = ["Authors", "Engaging_User"]
    buckets = 10
    dataset = nvt.Dataset(df, cpu=cpu)
    hash_features = cat_names >> ops.Categorify(num_buckets=buckets)
    processor = nvt.Workflow(hash_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # check hashed values
    assert new_gdf["Authors"].max() <= (buckets - 1)
    assert new_gdf["Engaging_User"].max() <= (buckets - 1)
    # check embedding size is equal to the num_buckets after hashing
    assert nvt.ops.get_embedding_sizes(processor)["Authors"][0] == buckets
    assert nvt.ops.get_embedding_sizes(processor)["Engaging_User"][0] == buckets
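# Hedged sketch (not part of the original suite): Categorify(num_buckets=n)
# hashes raw values into [0, n), so equal inputs always land in the same
# bucket. Assumes pandas; the names are illustrative.
def _example_hash_bucket_stability():
    import pandas as pd

    df = pd.DataFrame({"Authors": ["User_A", "User_A", "User_B"]})
    wf = nvt.Workflow(["Authors"] >> ops.Categorify(num_buckets=10))
    res = wf.fit_transform(nvt.Dataset(df)).to_ddf().compute()
    assert res["Authors"][0] == res["Authors"][1]  # same string, same bucket
    return res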
def test_na_value_count(tmpdir):
    gdf = dispatch._make_df(
        {
            "productID": ["B00406YHLI"] * 5 + ["B002YXS8E6"] * 5 + ["B00011KM38"] * 2 + [np.nan] * 3,
            "brand": ["Coby"] * 5 + [np.nan] * 5 + ["Cooler Master"] * 2 + ["Asus"] * 3,
        }
    )

    cat_features = ["brand", "productID"] >> nvt.ops.Categorify()
    workflow = nvt.Workflow(cat_features)
    train_dataset = nvt.Dataset(gdf, engine="parquet")

    workflow.fit(train_dataset)
    workflow.transform(train_dataset).to_ddf().compute()

    single_cat = dispatch._read_dispatch("./categories/unique.brand.parquet")(
        "./categories/unique.brand.parquet"
    )
    second_cat = dispatch._read_dispatch("./categories/unique.productID.parquet")(
        "./categories/unique.productID.parquet"
    )

    # nulls are tracked as their own category: 5 null brands and 3 null productIDs
    assert single_cat["brand_size"][0] == 5
    assert second_cat["productID_size"][0] == 3
def test_groupby_model(tmpdir, output_model):
    size = 20
    df = _make_df(
        {
            "id": np.random.choice([0, 1], size=size),
            "ts": np.linspace(0.0, 10.0, num=size),
            "x": np.arange(size),
            "y": np.linspace(0.0, 10.0, num=size),
        }
    )

    groupby_features = ColumnSelector(["id", "ts", "x", "y"]) >> ops.Groupby(
        groupby_cols=["id"],
        sort_cols=["ts"],
        aggs={
            "x": ["sum"],
            "y": ["first"],
        },
        name_sep="-",
    )
    workflow = nvt.Workflow(groupby_features)

    if output_model == "pytorch":
        model_info = {
            "x-sum": {"columns": ["x-sum"], "dtype": "int64"},
            "y-first": {"columns": ["y-first"], "dtype": "float64"},
            "id": {"columns": ["id"], "dtype": "int64"},
        }
    else:
        model_info = None

    _verify_workflow_on_tritonserver(tmpdir, workflow, df, "groupby", output_model, model_info)
def test_ops_list_vc(properties, tags, op_routine):
    column_schemas = []
    all_cols = []
    for x in range(5):
        all_cols.append(str(x))
        column_schemas.append(ColumnSchema(str(x), tags=tags, properties=properties))

    # Turn to Schema
    schema = Schema(column_schemas)

    df_dict = {}
    num_rows = 10000
    for column_name in schema.column_names:
        df_dict[column_name] = np.random.randint(1, 1000, num_rows)
        df_dict[column_name] = [[x] * np.random.randint(1, 10) for x in df_dict[column_name]]

    df = dispatch._make_df(df_dict)
    dataset = nvt.Dataset(df)
    test_node = ColumnSelector(schema.column_names) >> op_routine[0]
    for op in op_routine[1:]:
        test_node = test_node >> op
    processor = nvt.Workflow(test_node)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()
    workflow_schema_out = processor.output_node.output_schema
    for column_name in workflow_schema_out.column_names:
        schema1 = workflow_schema_out.column_schemas[column_name]
        assert "domain" in schema1.properties
        embeddings_info = schema1.properties["domain"]
        # should always exist, represents unknown
        assert embeddings_info["min"] == 0
        if HAS_GPU:
            assert embeddings_info["max"] == new_gdf[column_name]._column.elements.max() + 1
        else:
            list_vals = nvt.dispatch._pull_apart_list(new_gdf[column_name])[0]
            assert embeddings_info["max"] == list_vals.max() + 1
        assert "value_count" in schema1.properties
        val_c = schema1.properties["value_count"]
        assert val_c["min"] == op_routine[-1].stats[column_name]["value_count"]["min"]
        assert val_c["max"] == op_routine[-1].stats[column_name]["value_count"]["max"]
def test_numeric_dtypes(tmpdir, output_model):
    if output_model == "pytorch":
        model_info = dict()
    else:
        model_info = None

    dtypes = []
    for width in [8, 16, 32, 64]:
        dtype = f"int{width}"
        dtypes.append((dtype, np.iinfo(dtype)))
        if output_model == "pytorch":
            model_info[dtype] = {"columns": [dtype], "dtype": dtype}

        dtype = f"uint{width}"
        dtypes.append((dtype, np.iinfo(dtype)))
        if output_model == "pytorch":
            model_info[dtype] = {"columns": [dtype], "dtype": dtype}

    for width in [32, 64]:
        dtype = f"float{width}"
        dtypes.append((dtype, np.finfo(dtype)))
        if output_model == "pytorch":
            model_info[dtype] = {"columns": [dtype], "dtype": dtype}

    def check_dtypes(col):
        assert str(col.dtype) == col.name
        return col

    # simple transform to make sure we can round-trip the min/max values for each dtype,
    # through triton, with the 'transform' here just checking that the dtypes are correct
    df = _make_df(
        {dtype: np.array([limits.max, 0, limits.min], dtype=dtype) for dtype, limits in dtypes}
    )
    features = nvt.ColumnSelector(df.columns) >> check_dtypes
    workflow = nvt.Workflow(features)
    _verify_workflow_on_tritonserver(
        tmpdir, workflow, df, "test_numeric_dtypes", output_model, model_info
    )
def test_hash_bucket_lists(tmpdir):
    df = dispatch._make_df(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Authors"]  # , "Engaging User"]

    dataset = nvt.Dataset(df)
    hash_features = cat_names >> ops.HashBucket(num_buckets=10)
    processor = nvt.Workflow(hash_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # check to make sure that the same strings are hashed the same
    authors = new_gdf["Authors"].to_arrow().to_pylist()
    assert authors[0][0] == authors[1][0]  # 'User_A'
    assert authors[2][1] == authors[3][0]  # 'User_C'

    assert nvt.ops.get_embedding_sizes(processor)[1]["Authors"][0] == 10
def test_categorify_lists_with_start_index(tmpdir, cpu, start_index):
    df = dispatch._make_df(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Authors", "Engaging User"]
    label_name = ["Post"]
    dataset = nvt.Dataset(df, cpu=cpu)
    cat_features = cat_names >> ops.Categorify(out_path=str(tmpdir), start_index=start_index)
    processor = nvt.Workflow(cat_features + label_name)
    processor.fit(dataset)
    df_out = processor.transform(dataset).to_ddf().compute()

    if cpu:
        compare = [list(row) for row in df_out["Authors"].tolist()]
    else:
        compare = df_out["Authors"].to_arrow().to_pylist()

    # Note that start_index is the start_index of the range of encoding, which
    # includes both an initial value for the encoding for out-of-vocabulary items,
    # as well as the values for the rest of the in-vocabulary items.
    # In this group of tests below, there are no out-of-vocabulary items, so our start index
    # value does not appear in the expected comparison object.
    if start_index == 0:
        assert compare == [[1], [1, 4], [3, 2], [2]]
    elif start_index == 1:
        assert compare == [[2], [2, 5], [4, 3], [3]]
    elif start_index == 16:
        assert compare == [[17], [17, 20], [19, 18], [18]]

    # We expect five entries in the embedding size, one for each author,
    # plus start_index many additional entries for our offset start_index.
    embeddings = nvt.ops.get_embedding_sizes(processor)
    assert embeddings[1]["Authors"][0] == (5 + start_index)
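# Hedged note (not in the original): the encoding range begins at start_index -
# the first id in the range encodes nulls/out-of-vocabulary values, with
# in-vocabulary items following - which is why the expected lists above are
# simply the start_index == 0 encodings shifted up by start_index.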
def test_generate_triton_multihot(tmpdir):
    df = _make_df(
        {
            "userId": ["a", "a", "b"],
            "movieId": ["1", "2", "2"],
            "genres": [["action", "adventure"], ["action", "comedy"], ["comedy"]],
        }
    )

    cats = ["userId", "movieId", "genres"] >> nvt.ops.Categorify()
    workflow = nvt.Workflow(cats)
    workflow.fit(nvt.Dataset(df))
    expected = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    # save workflow to triton / verify we see some expected output
    repo = os.path.join(tmpdir, "models")
    triton.generate_nvtabular_model(workflow, "model", repo)
    workflow = None

    assert os.path.exists(os.path.join(repo, "config.pbtxt"))

    workflow = nvt.Workflow.load(os.path.join(repo, "1", "workflow"))
    transformed = workflow.transform(nvt.Dataset(df)).to_ddf().compute()
    assert_eq(expected, transformed)
def create_df(
    self,
    size,
    cols,
    entries=False,
):
    conts_rep = cols["conts"] if "conts" in cols else None
    cats_rep = cols["cats"] if "cats" in cols else None
    labs_rep = cols["labels"] if "labels" in cols else None

    df = _make_df()
    if conts_rep:
        df = _concat([df, self.create_conts(size, conts_rep)], axis=1)
    if cats_rep:
        df = _concat(
            [df, self.create_cats(size, cats_rep=cats_rep, entries=entries)],
            axis=1,
        )
    if labs_rep:
        df = _concat([df, self.create_labels(size, labs_rep)], axis=1)
    return df
def test_normalize_upcastfloat64(tmpdir, dataset, gpu_memory_frac, engine, op_columns):
    df = dispatch._make_df(
        {"x": [1.9e10, 2.3e16, 3.4e18, 1.6e19], "label": [1.0, 0.0, 1.0, 0.0]}
    )

    cont_features = op_columns >> ops.Normalize()
    processor = nvtabular.Workflow(cont_features)
    dataset = nvt.Dataset(df)

    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    for col in op_columns:
        assert math.isclose(df[col].mean(), processor.output_node.op.means[col], rel_tol=1e-4)
        assert math.isclose(df[col].std(), processor.output_node.op.stds[col], rel_tol=1e-4)
        df[col] = (df[col] - processor.output_node.op.means[col]) / processor.output_node.op.stds[col]
        assert np.all((df[col] - new_gdf[col]).abs().values <= 1e-2)
def test_cat_rep(num_rows, distro):
    json_sample["num_rows"] = num_rows
    cats = list(json_sample["cats"].keys())
    cols = datagen._get_cols_from_schema(json_sample, distros=distro)

    df_gen = datagen.DatasetGen(datagen.UniformDistro())
    df_uni = df_gen.create_df(num_rows, cols, entries=True)
    df_cats = df_uni[cats]
    assert df_cats.shape[1] == len(cats)
    assert df_cats.shape[0] == num_rows
    cats_rep = cols["cats"]
    for idx, cat in enumerate(cats[1:]):
        assert df_uni[cat].nunique() == cats_rep[idx + 1].cardinality
        assert df_uni[cat].str.len().min() == cats_rep[idx + 1].min_entry_size
        assert df_uni[cat].str.len().max() == cats_rep[idx + 1].max_entry_size
    if HAS_GPU:
        check_ser = _make_df(list(df_uni[cats[0]]._column.elements.values_host))[0]
    else:
        check_ser = df_uni[cats[0]]
    if isinstance(check_ser[0], (list, np.ndarray)):
        check_ser = _pull_apart_list(check_ser)[0]
    assert check_ser.nunique() == cats_rep[0].cardinality
    assert check_ser.str.len().min() == cats_rep[0].min_entry_size
    assert check_ser.str.len().max() == cats_rep[0].max_entry_size
def test_target_encode_group():
    df = dispatch._make_df(
        {
            "Cost": range(15),
            "Post": [1, 2, 3, 4, 5] * 3,
            "Author": ["A"] * 5 + ["B"] * 5 + ["C"] * 2 + ["D"] * 3,
            "Engaging_User": ["A"] * 5 + ["B"] * 3 + ["E"] * 2 + ["D"] * 3 + ["G"] * 2,
        }
    )

    cat_groups = ["Author", "Engaging_User"]
    labels = ColumnSelector(["Post"]) >> ops.LambdaOp(lambda col: (col > 3).astype("int8"))
    te_features = cat_groups >> ops.TargetEncoding(
        labels,
        out_path="./",
        kfold=1,
        out_dtype="float32",
        drop_folds=False,  # Keep folds to validate
    )

    workflow = nvt.Workflow(te_features + ["Author", "Engaging_User"])
    workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")
def convert_triton_output_to_df(columns, response):
    return _make_df({col: response.as_numpy(col) for col in columns})
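# Hedged usage sketch (not in the original): given a tritonclient InferResult
# `response` whose requested outputs were, say, ["x-sum", "y-first"], this
# reassembles them into a dataframe:
#
#     df = convert_triton_output_to_df(["x-sum", "y-first"], response)
#
# The output names are illustrative; any outputs present on the response work.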
def test_categorify_freq_limit(tmpdir, freq_limit, buckets, search_sort, cpu):
    if search_sort and cpu:
        # invalid combination - don't test
        return

    df = dispatch._make_df(
        {
            "Author": [
                "User_A",
                "User_E",
                "User_B",
                "User_C",
                "User_A",
                "User_E",
                "User_B",
                "User_C",
                "User_B",
                "User_C",
            ],
            "Engaging User": [
                "User_B",
                "User_B",
                "User_A",
                "User_D",
                "User_B",
                "User_c",
                "User_A",
                "User_D",
                "User_D",
                "User_D",
            ],
        }
    )

    isfreqthr = freq_limit > 0 if isinstance(freq_limit, int) else isinstance(freq_limit, dict)

    if (not search_sort and isfreqthr) or (search_sort and not isfreqthr):
        cat_names = ["Author", "Engaging User"]

        cats = cat_names >> ops.Categorify(
            freq_threshold=freq_limit,
            out_path=str(tmpdir),
            search_sorted=search_sort,
            num_buckets=buckets,
        )

        workflow = nvt.Workflow(cats)
        df_out = (
            workflow.fit_transform(nvt.Dataset(df, cpu=cpu))
            .to_ddf()
            .compute(scheduler="synchronous")
        )

        if freq_limit and not buckets:
            # with a frequency threshold and no buckets, encoded ids are contiguous,
            # so the max id reflects how many categories clear the threshold
            if isinstance(freq_limit, dict):
                assert df_out["Author"].max() == 2
                assert df_out["Engaging User"].max() == 1
            else:
                assert len(df["Author"].unique()) == df_out["Author"].max()
                assert len(df["Engaging User"].unique()) == df_out["Engaging User"].max()
        elif not freq_limit and buckets:
            if isinstance(buckets, dict):
                assert df_out["Author"].max() <= 9
                assert df_out["Engaging User"].max() <= 19
            else:
                assert df_out["Author"].max() <= 9
                assert df_out["Engaging User"].max() <= 9
        elif freq_limit and buckets:
            if isinstance(buckets, dict) and not isinstance(df, pd.DataFrame):
                assert (
                    df_out["Author"].max()
                    <= (df["Author"].hash_values() % buckets["Author"]).max() + 2 + 1
                )
                assert (
                    df_out["Engaging User"].max()
                    <= (df["Engaging User"].hash_values() % buckets["Engaging User"]).max() + 1 + 1
                )
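# Hedged sketch (not part of the original suite): the freq_threshold behavior
# the assertions above depend on - categories occurring fewer than `threshold`
# times map to the out-of-vocabulary id 0. Assumes pandas; names illustrative.
def _example_freq_threshold():
    import pandas as pd

    df = pd.DataFrame({"Author": ["A", "A", "B"]})
    wf = nvt.Workflow(["Author"] >> ops.Categorify(freq_threshold=2))
    res = wf.fit_transform(nvt.Dataset(df)).to_ddf().compute()
    # "B" occurs once, falls below the threshold, and encodes to 0
    assert res["Author"][2] == 0
    return res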
def create_col(self, num_rows, dtype=np.float32, min_val=0, max_val=1):
    ser = _make_df(np.random.uniform(min_val, max_val, size=num_rows))[0]
    ser = ser.astype(dtype)
    return ser
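# Hedged usage sketch (not in the original): drawing a uniform float32 column
# for synthetic continuous features, e.g.
#
#     col = gen.create_col(10_000, dtype=np.float32, min_val=0, max_val=10)
#
# where `gen` is an instance of the enclosing generator class (name assumed).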