def test_hash_bucket_lists(tmpdir):
    df = cudf.DataFrame(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Authors"]  # , "Engaging User"]
    cont_names = []
    label_name = ["Post"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_preprocess(ops.HashBucket(num_buckets=10))
    processor.finalize()
    processor.apply(nvt.Dataset(df), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    # check to make sure that the same strings are hashed the same
    authors = df_out["Authors"].to_arrow().to_pylist()
    assert authors[0][0] == authors[1][0]  # 'User_A'
    assert authors[2][1] == authors[3][0]  # 'User_C'

    # make sure we get the embedding sizes
    assert nvt.ops.get_embedding_sizes(processor)["Authors"][0] == 10
def test_mh_support(tmpdir):
    df = cudf.DataFrame(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Reviewers": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Authors", "Reviewers"]  # , "Engaging User"]
    cont_names = []
    label_name = ["Post"]

    cats = cat_names >> ops.HashBucket(num_buckets=10)
    processor = nvt.Workflow(cats + label_name)
    df_out = processor.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    # check to make sure that the same strings are hashed the same
    authors = df_out["Authors"].to_arrow().to_pylist()
    assert authors[0][0] == authors[1][0]  # 'User_A'
    assert authors[2][1] == authors[3][0]  # 'User_C'

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name
    )
    idx = 0
    for batch in data_itr:
        idx = idx + 1
        cats, conts, labels = batch
        cats, mh = cats
        # mh is a dictionary of {column name: (values, offsets)}
        assert len(mh) == len(cat_names)
        assert not cats

    assert idx > 0
def test_mh_support(tmpdir):
    df = nvt.dispatch._make_df(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Reviewers": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Authors", "Reviewers"]  # , "Engaging User"]
    cont_names = []
    label_name = ["Post"]

    if HAS_GPU:
        cats = cat_names >> ops.HashBucket(num_buckets=10)
    else:
        cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(cats + label_name)
    df_out = processor.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    # check to make sure that the same strings are encoded to the same value
    if HAS_GPU:
        authors = df_out["Authors"].to_arrow().to_pylist()
    else:
        authors = df_out["Authors"]
    assert authors[0][0] == authors[1][0]  # 'User_A'
    assert authors[2][1] == authors[3][0]  # 'User_C'

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name
    )
    idx = 0
    for batch in data_itr:
        idx = idx + 1
        cats_conts, labels = batch
        assert "Reviewers" in cats_conts
        # check it is multi-hot: each multi-hot column comes back as a (values, offsets) tuple
        assert isinstance(cats_conts["Reviewers"], tuple)
        assert "Authors" in cats_conts
        assert isinstance(cats_conts["Authors"], tuple)

    assert idx > 0
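# Hedged usage sketch (an assumption, not part of the tests above): once the
# torch dataloader hands back a multi-hot column as a (values, offsets) pair,
# it maps directly onto torch.nn.EmbeddingBag, which consumes exactly that
# flat-values-plus-row-offsets layout. If a given loader version yields per-row
# counts (nnzs) instead of offsets, a cumulative sum converts them first.
# The tensor contents below are made up for illustration.
import torch

embedding = torch.nn.EmbeddingBag(num_embeddings=10, embedding_dim=8, mode="sum")

values = torch.tensor([3, 3, 7, 1, 4, 4])    # hashed ids for all rows, concatenated
offsets = torch.tensor([0, 1, 3, 5])         # start index of each row within `values`
row_embeddings = embedding(values, offsets)  # shape: (4, 8), one vector per row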
def test_hash_bucket(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    cat_names = ["name-string"]

    if op_columns is None:
        num_buckets = 10
    else:
        num_buckets = {column: 10 for column in op_columns}

    hash_features = cat_names >> ops.HashBucket(num_buckets)
    processor = nvt.Workflow(hash_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # check column sums across two transforms to verify the hashing is deterministic
    assert np.all(new_gdf[cat_names].values >= 0)
    assert np.all(new_gdf[cat_names].values <= 9)
    checksum = new_gdf[cat_names].sum().values

    new_gdf = processor.transform(dataset).to_ddf().compute()
    assert np.all(new_gdf[cat_names].sum().values == checksum)
def test_hash_bucket_lists(tmpdir):
    df = cudf.DataFrame(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Authors"]  # , "Engaging User"]

    dataset = nvt.Dataset(df)
    hash_features = cat_names >> ops.HashBucket(num_buckets=10)
    processor = nvt.Workflow(hash_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # check to make sure that the same strings are hashed the same
    authors = new_gdf["Authors"].to_arrow().to_pylist()
    assert authors[0][0] == authors[1][0]  # 'User_A'
    assert authors[2][1] == authors[3][0]  # 'User_C'

    assert nvt.ops.get_embedding_sizes(processor)["Authors"][0] == 10
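# Hedged usage sketch (an assumption, not part of the test above): the
# (cardinality, dimension) pairs returned by get_embedding_sizes can size an
# embedding table directly. `processor` refers to the fitted workflow from the
# test above, and the torch usage is illustrative only.
import torch

cardinality, dim = nvt.ops.get_embedding_sizes(processor)["Authors"]
authors_table = torch.nn.EmbeddingBag(cardinality, dim, mode="sum")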
def test_transform_geolocation():
    raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split()
    data = cudf.DataFrame({"geo_location": raw})

    geo_location = ColumnGroup(["geo_location"])
    state = geo_location >> (lambda col: col.str.slice(0, 5)) >> ops.Rename(postfix="_state")
    country = geo_location >> (lambda col: col.str.slice(0, 2)) >> ops.Rename(postfix="_country")
    geo_features = state + country + geo_location >> ops.HashBucket(num_buckets=100)

    # this workflow doesn't have any stat operators, so we can get away without fitting
    workflow = Workflow(geo_features)
    transformed = workflow.transform(Dataset(data)).to_ddf().compute()

    expected = cudf.DataFrame()
    expected["geo_location_state"] = data["geo_location"].str.slice(0, 5).hash_values() % 100
    expected["geo_location_country"] = data["geo_location"].str.slice(0, 2).hash_values() % 100
    expected["geo_location"] = data["geo_location"].hash_values() % 100
    assert_eq(expected, transformed)
def test_hash_bucket(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    cat_names = ["name-string"]

    if op_columns is None:
        num_buckets = 10
    else:
        num_buckets = {column: 10 for column in op_columns}
    hash_bucket_op = ops.HashBucket(num_buckets)

    columns_ctx = {}
    columns_ctx["categorical"] = {}
    columns_ctx["categorical"]["base"] = cat_names

    # check per-partition sums to verify the hashing is deterministic
    checksums = []
    for gdf in dataset.to_iter():
        new_gdf = hash_bucket_op.apply_op(gdf, columns_ctx, "categorical")
        assert np.all(new_gdf[cat_names].values >= 0)
        assert np.all(new_gdf[cat_names].values <= 9)
        checksums.append(new_gdf[cat_names].sum().values)

    for checksum, gdf in zip(checksums, dataset.to_iter()):
        new_gdf = hash_bucket_op.apply_op(gdf, columns_ctx, "categorical")
        assert np.all(new_gdf[cat_names].sum().values == checksum)
def test_mh_support(tmpdir, batch_size):
    data = {
        "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
        "Reviewers": [
            ["User_A"],
            ["User_A", "User_E"],
            ["User_B", "User_C"],
            ["User_C"],
        ],
        "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
        "Embedding": [
            [0.1, 0.2, 0.3],
            [0.3, 0.4, 0.5],
            [0.6, 0.7, 0.8],
            [0.8, 0.4, 0.2],
        ],
        "Post": [1, 2, 3, 4],
    }
    df = cudf.DataFrame(data)
    cat_names = ["Authors", "Reviewers", "Engaging User"]
    cont_names = ["Embedding"]
    label_name = ["Post"]

    cats = cat_names >> ops.HashBucket(num_buckets=10)
    workflow = nvt.Workflow(cats + cont_names + label_name)

    data_itr = tf_dataloader.KerasSequenceLoader(
        workflow.transform(nvt.Dataset(df)),
        cat_names=cat_names,
        cont_names=cont_names,
        label_names=label_name,
        batch_size=batch_size,
        shuffle=False,
    )

    idx = 0
    for X, y in data_itr:
        assert len(X) == 7
        n_samples = y.shape[0]

        for mh_name in ["Authors", "Reviewers", "Embedding"]:
            for postfix in ["__nnzs", "__values"]:
                assert (mh_name + postfix) in X
                array = X[mh_name + postfix].numpy()[:, 0]

                if postfix == "__nnzs":
                    if mh_name == "Embedding":
                        assert (array == 3).all()
                    else:
                        lens = [
                            len(x)
                            for x in data[mh_name][idx * batch_size : idx * batch_size + n_samples]
                        ]
                        assert (array == np.array(lens)).all()
                else:
                    if mh_name == "Embedding":
                        assert len(array) == (n_samples * 3)
                    else:
                        assert len(array) == sum(lens)
        idx += 1
    assert idx == (3 // batch_size + 1)
# initial column selector works with tags
# filter within the workflow by tags
# test tags correct at output
@pytest.mark.parametrize(
    "op",
    [
        ops.Bucketize([1]),
        ops.Rename(postfix="_trim"),
        ops.Categorify(),
        ops.Categorify(encode_type="combo"),
        ops.Clip(0),
        ops.DifferenceLag("col1"),
        ops.FillMissing(),
        ops.Groupby("col1"),
        ops.HashBucket(1),
        ops.HashedCross(1),
        ops.JoinGroupby("col1"),
        ops.ListSlice(0),
        ops.LogOp(),
        ops.Normalize(),
        ops.TargetEncoding("col1"),
    ],
)
def test_workflow_select_by_tags(op):
    schema1 = ColumnSchema("col1", tags=["b", "c", "d"])
    schema2 = ColumnSchema("col2", tags=["c", "d"])
    schema3 = ColumnSchema("col3", tags=["d"])
    schema = Schema([schema1, schema2, schema3])

    cont_features = ColumnSelector(tags=["c"]) >> op
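    # NOTE: the snippet above ends after building `cont_features`; the lines
    # below are a hedged sketch (an assumption, not taken from the source) of
    # how such a test typically finishes: fit the workflow against the schema
    # and confirm that the tag-selected columns ("col1", "col2") produce output.
    workflow = nvt.Workflow(cont_features)
    workflow.fit_schema(schema)
    assert len(workflow.output_schema.column_names) > 0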
def test_mh_support(tmpdir, batch_size):
    data = {
        "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
        "Reviewers": [
            ["User_A"],
            ["User_A", "User_E"],
            ["User_B", "User_C"],
            ["User_C"],
        ],
        "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
        "Embedding": [
            [0.1, 0.2, 0.3],
            [0.3, 0.4, 0.5],
            [0.6, 0.7, 0.8],
            [0.8, 0.4, 0.2],
        ],
        "Post": [1, 2, 3, 4],
    }
    df = nvt.dispatch._make_df(data)
    cat_names = ["Authors", "Reviewers", "Engaging User"]
    cont_names = ["Embedding"]
    label_name = ["Post"]

    if HAS_GPU:
        cats = cat_names >> ops.HashBucket(num_buckets=10)
    else:
        cats = cat_names >> ops.Categorify()
    workflow = nvt.Workflow(cats + cont_names + label_name)

    data_itr = tf_dataloader.KerasSequenceLoader(
        workflow.fit_transform(nvt.Dataset(df)),
        cat_names=cat_names,
        cont_names=cont_names,
        label_names=label_name,
        batch_size=batch_size,
        shuffle=False,
    )
    nnzs = None
    idx = 0
    for X, y in data_itr:
        assert len(X) == 4
        n_samples = y.shape[0]

        for mh_name in ["Authors", "Reviewers", "Embedding"]:
            # assert (mh_name) in X
            array, nnzs = X[mh_name]
            nnzs = nnzs.numpy()[:, 0]
            array = array.numpy()[:, 0]

            if mh_name == "Embedding":
                assert (nnzs == 3).all()
            else:
                lens = [
                    len(x)
                    for x in data[mh_name][idx * batch_size : idx * batch_size + n_samples]
                ]
                assert (nnzs == np.array(lens)).all()

            if mh_name == "Embedding":
                assert len(array) == (n_samples * 3)
            else:
                assert len(array) == sum(lens)
        idx += 1
    assert idx == (3 // batch_size + 1)