def test_mh_support(tmpdir):
    df = nvt.dispatch._make_df(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Reviewers": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Authors", "Reviewers"]  # , "Engaging User"]
    cont_names = []
    label_name = ["Post"]
    if HAS_GPU:
        cats = cat_names >> ops.HashBucket(num_buckets=10)
    else:
        cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(cats + label_name)
    df_out = processor.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    # check to make sure that the same strings are hashed the same
    if HAS_GPU:
        authors = df_out["Authors"].to_arrow().to_pylist()
    else:
        authors = df_out["Authors"]
    assert authors[0][0] == authors[1][0]  # 'User_A'
    assert authors[2][1] == authors[3][0]  # 'User_C'

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name
    )
    idx = 0
    for batch in data_itr:
        idx = idx + 1
        cats_conts, labels = batch
        assert "Reviewers" in cats_conts
        # check it is multihot
        assert isinstance(cats_conts["Reviewers"], tuple)
        # mh is a tuple of dictionaries {Column name: (values, offsets)}
        assert "Authors" in cats_conts
        assert isinstance(cats_conts["Authors"], tuple)
    assert idx > 0
def test_cats_and_groupby_stats(client, tmpdir, datasets, part_mem_fraction, use_client):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]

    cats = ColumnGroup(cat_names)
    cat_features = cats >> ops.Categorify(out_path=str(tmpdir), freq_threshold=10, on_host=True)
    groupby_features = cats >> ops.JoinGroupby(
        cont_names=cont_names, stats=["count", "sum"], out_path=str(tmpdir)
    )

    workflow = Workflow(cat_features + groupby_features, client=client)
    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    result = workflow.fit_transform(dataset).to_ddf().compute()

    assert "name-cat_x_sum" in result.columns
    assert "name-string_x_sum" in result.columns
def test_generate_triton_model(tmpdir, engine, df):
    conts = ["x", "y", "id"] >> ops.FillMissing() >> ops.Normalize()
    cats = ["name-cat", "name-string"] >> ops.Categorify(cat_cache="host")
    workflow = nvt.Workflow(conts + cats)
    workflow.fit(nvt.Dataset(df))
    expected = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    # save workflow to triton / verify we see some expected output
    repo = os.path.join(tmpdir, "models")
    triton.generate_nvtabular_model(workflow, "model", repo)
    workflow = None

    assert os.path.exists(os.path.join(repo, "config.pbtxt"))

    workflow = nvt.Workflow.load(os.path.join(repo, "1", "workflow"))
    transformed = workflow.transform(nvt.Dataset(df)).to_ddf().compute()
    assert_eq(expected, transformed)
def test_hugectr(tmpdir, df, dataset, output_format, engine, op_columns):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]

    # set variables
    nfiles = 10
    ext = ""
    outdir = tmpdir + "/hugectr"
    os.mkdir(outdir)

    # process data
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_names)
    processor.add_feature([ops.ZeroFill(columns=op_columns), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    # Need to collect statistics first (for now)
    processor.update_stats(dataset)

    # Second "online" pass to write HugeCTR output
    processor.apply(
        dataset,
        apply_offline=False,
        record_stats=False,
        output_path=outdir,
        out_files_per_proc=nfiles,
        output_format=output_format,
        shuffle=False,
    )

    # Check files
    if output_format == "parquet":
        ext = "parquet"
        assert os.path.isfile(outdir + "/metadata.json")
    elif output_format == "hugectr":
        ext = "data"
        assert os.path.isfile(outdir + "/file_list.txt")
    for n in range(nfiles):
        assert os.path.isfile(os.path.join(outdir, str(n) + "." + ext))
def test_categorify_single_table():
    df = dispatch._make_df(
        {
            "Authors": [None, "User_A", "User_A", "User_E", "User_B", "User_C"],
            "Engaging_User": [None, "User_B", "User_B", "User_A", "User_D", "User_D"],
            "Post": [1, 2, 3, 4, None, 5],
        }
    )
    cat_names = ["Authors", "Engaging_User"]
    dataset = nvt.Dataset(df)
    features = cat_names >> ops.Categorify(single_table=True)
    processor = nvt.Workflow(features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # check that the encoded ranges of the columns do not overlap
    old_max = 0
    for name in cat_names:
        curr_min = new_gdf[name].min()
        assert old_max <= curr_min
        curr_max = new_gdf[name].max()
        old_max += curr_max
def test_categorify_multi_combo(tmpdir):
    cat_names = [["Author", "Engaging User"], ["Author"], "Engaging User"]
    kind = "combo"
    df = pd.DataFrame(
        {
            "Author": ["User_A", "User_E", "User_B", "User_C"],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    label_name = ["Post"]

    cats = cat_names >> ops.Categorify(out_path=str(tmpdir), encode_type=kind)
    workflow = nvt.Workflow(cats + label_name)
    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    # Column combinations are encoded
    assert df_out["Author"].to_arrow().to_pylist() == [1, 4, 2, 3]
    assert df_out["Engaging User"].to_arrow().to_pylist() == [2, 2, 1, 3]
    assert df_out["Author_Engaging User"].to_arrow().to_pylist() == [1, 4, 2, 3]
def test_categorify_multi(tmpdir, cat_names, kind, cpu):
    df = pd.DataFrame(
        {
            "Author": ["User_A", "User_E", "User_B", "User_C"],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    label_name = ["Post"]

    cats = cat_names >> ops.Categorify(out_path=str(tmpdir), encode_type=kind)
    workflow = nvt.Workflow(cats + label_name)
    df_out = (
        workflow.fit_transform(nvt.Dataset(df, cpu=cpu)).to_ddf().compute(scheduler="synchronous")
    )

    if len(cat_names) == 1:
        if kind == "joint":
            # Columns are encoded jointly
            compare_authors = (
                df_out["Author"].to_list() if cpu else df_out["Author"].to_arrow().to_pylist()
            )
            compare_engaging = (
                df_out["Engaging User"].to_list()
                if cpu
                else df_out["Engaging User"].to_arrow().to_pylist()
            )
            # again, User_B has the highest frequency and so gets the lowest encoding
            assert compare_authors == [2, 5, 1, 3]
            assert compare_engaging == [1, 1, 2, 4]
        else:
            # Column combinations are encoded
            compare_engaging = (
                df_out["Author_Engaging User"].to_list()
                if cpu
                else df_out["Author_Engaging User"].to_arrow().to_pylist()
            )
            assert compare_engaging == [1, 4, 2, 3]
    else:
        # Columns are encoded independently
        compare_authors = (
            df_out["Author"].to_list() if cpu else df_out["Author"].to_arrow().to_pylist()
        )
        compare_engaging = (
            df_out["Engaging User"].to_list()
            if cpu
            else df_out["Engaging User"].to_arrow().to_pylist()
        )
        assert compare_authors == [1, 4, 2, 3]
        # User_B is first in the frequency-based ordering
        assert compare_engaging == [1, 1, 2, 3]
def test_categorify_lists(tmpdir, freq_threshold):
    df = cudf.DataFrame(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Authors", "Engaging User"]
    label_name = ["Post"]

    cat_features = cat_names >> ops.Categorify(out_path=str(tmpdir), freq_threshold=freq_threshold)
    workflow = nvt.Workflow(cat_features + label_name)
    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()

    # Columns are encoded independently
    if freq_threshold < 2:
        assert df_out["Authors"].to_arrow().to_pylist() == [[1], [1, 4], [2, 3], [3]]
    else:
        assert df_out["Authors"].to_arrow().to_pylist() == [[1], [1, 0], [0, 2], [2]]
def test_categorify_hash_bucket(cpu):
    df = dispatch._make_df(
        {
            "Authors": ["User_A", "User_A", "User_E", "User_B", "User_C"],
            "Engaging_User": ["User_B", "User_B", "User_A", "User_D", "User_D"],
            "Post": [1, 2, 3, 4, 5],
        }
    )
    cat_names = ["Authors", "Engaging_User"]
    buckets = 10
    dataset = nvt.Dataset(df, cpu=cpu)
    hash_features = cat_names >> ops.Categorify(num_buckets=buckets)
    processor = nvt.Workflow(hash_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # check hashed values
    assert new_gdf["Authors"].max() <= (buckets - 1)
    assert new_gdf["Engaging_User"].max() <= (buckets - 1)
    # check embedding size is equal to the num_buckets after hashing
    assert nvt.ops.get_embedding_sizes(processor)["Authors"][0] == buckets
    assert nvt.ops.get_embedding_sizes(processor)["Engaging_User"][0] == buckets
def test_empty_cols(tmpdir, df, dataset, engine, cat_names, cont_names, label_name):
    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    if cont_names:
        processor.add_feature([ops.FillMedian()])
        processor.add_feature(ops.Normalize())
    if cat_names:
        processor.add_feature(ops.Categorify())

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_format=None,
    )
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out),
        cats=cat_names,
        conts=cont_names,
        labels=label_name,
        batch_size=1,
    )

    for nvt_batch in data_itr:
        cats, conts, labels = nvt_batch
        if cat_names:
            assert cats.shape[-1] == len(cat_names)
        if cont_names:
            assert conts.shape[-1] == len(cont_names)
        if label_name:
            assert labels.shape[-1] == len(label_name)
def test_dataloader_schema(tmpdir, df, dataset, batch_size, engine, device):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(conts + cats + label_name)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths, engine="parquet")

    data_loader = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        shuffle=False,
        labels=label_name,
    )

    batch = next(iter(data_loader))
    assert all(name in batch[0] for name in cat_names)
    assert all(name in batch[0] for name in cont_names)

    num_label_cols = batch[1].shape[1] if len(batch[1].shape) > 1 else 1
    assert num_label_cols == len(label_name)
def test_categorify_lists_with_start_index(tmpdir, cpu, start_index):
    df = dispatch._make_df(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Authors", "Engaging User"]
    label_name = ["Post"]
    dataset = nvt.Dataset(df, cpu=cpu)
    cat_features = cat_names >> ops.Categorify(out_path=str(tmpdir), start_index=start_index)
    processor = nvt.Workflow(cat_features + label_name)
    processor.fit(dataset)
    df_out = processor.transform(dataset).to_ddf().compute()

    if cpu:
        compare = [list(row) for row in df_out["Authors"].tolist()]
    else:
        compare = df_out["Authors"].to_arrow().to_pylist()

    # Note that start_index is the start of the encoding range, which includes both an
    # initial value for out-of-vocabulary items and the values for the in-vocabulary items.
    # In the cases below there are no out-of-vocabulary items, so the start_index value
    # itself does not appear in the expected comparison object.
    if start_index == 0:
        assert compare == [[1], [1, 4], [3, 2], [2]]
    elif start_index == 1:
        assert compare == [[2], [2, 5], [4, 3], [3]]
    elif start_index == 16:
        assert compare == [[17], [17, 20], [19, 18], [18]]

    # We expect five entries in the embedding size: one for each author,
    # plus start_index additional entries for the offset.
    embeddings = nvt.ops.get_embedding_sizes(processor)
    assert embeddings[1]["Authors"][0] == (5 + start_index)
def test_categorify_multi(tmpdir, groups, kind):
    df = pd.DataFrame(
        {
            "Author": ["User_A", "User_E", "User_B", "User_C"],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Author", "Engaging User"]
    cont_names = []
    label_name = ["Post"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_preprocess(ops.Categorify(columns=groups, out_path=str(tmpdir), encode_type=kind))
    processor.finalize()
    processor.apply(nvt.Dataset(df), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    if groups:
        if kind == "joint":
            # Columns are encoded jointly
            assert df_out["Author"].to_arrow().to_pylist() == [1, 5, 2, 3]
            assert df_out["Engaging User"].to_arrow().to_pylist() == [2, 2, 1, 4]
        else:
            # Column combinations are encoded
            assert df_out["Author_Engaging User"].to_arrow().to_pylist() == [1, 4, 2, 3]
    else:
        # Columns are encoded independently
        assert df_out["Author"].to_arrow().to_pylist() == [1, 4, 2, 3]
        assert df_out["Engaging User"].to_arrow().to_pylist() == [2, 2, 1, 3]
def test_s3_dataset(s3, paths, engine, df):
    # create a mocked out bucket here
    bucket = "testbucket"
    s3.create_bucket(Bucket=bucket)

    s3_paths = []
    for path in paths:
        s3_path = f"s3://{bucket}/{path}"
        with fsspec.open(s3_path, "wb") as f:
            f.write(open(path, "rb").read())
        s3_paths.append(s3_path)

    # create a basic s3 dataset
    dataset = nvt.Dataset(s3_paths)

    # make sure the iteration API works
    columns = mycols_pq if engine == "parquet" else mycols_csv
    gdf = cudf.concat(list(dataset.to_iter()))[columns]
    assert_eq(gdf.reset_index(drop=True), df.reset_index(drop=True))

    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_feature([ops.FillMissing(), ops.Clip(min_value=0), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify(cat_cache="host"))
    processor.finalize()

    processor.update_stats(dataset)
def test_empty_cols(tmpdir, df, dataset, engine, cat_names, cont_names, label_name):
    features = []
    if cont_names:
        features.append(cont_names >> ops.FillMedian() >> ops.Normalize())
    if cat_names:
        features.append(cat_names >> ops.Categorify())

    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    graph = sum(features, nvt.ColumnGroup(label_name))
    if not graph.columns:
        # if we don't have conts/cats/labels we're done
        return

    processor = nvt.Workflow(graph)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    df_out = processor.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out),
        cats=cat_names,
        conts=cont_names,
        labels=label_name,
        batch_size=1,
    )

    for nvt_batch in data_itr:
        cats, conts, labels = nvt_batch
        if cat_names:
            assert cats.shape[-1] == len(cat_names)
        if cont_names:
            assert conts.shape[-1] == len(cont_names)
        if label_name:
            assert labels.shape[-1] == len(label_name)
def test_categorify_lists(tmpdir, freq_threshold):
    df = cudf.DataFrame(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Authors", "Engaging User"]
    cont_names = []
    label_name = ["Post"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_preprocess(ops.Categorify(out_path=str(tmpdir), freq_threshold=freq_threshold))
    processor.finalize()
    processor.apply(nvt.Dataset(df), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    # Columns are encoded independently
    if freq_threshold < 2:
        assert df_out["Authors"].to_arrow().to_pylist() == [[1], [1, 4], [2, 3], [3]]
    else:
        assert df_out["Authors"].to_arrow().to_pylist() == [[1], [1, 0], [0, 2], [2]]
def test_parquet_lists(tmpdir, freq_threshold, shuffle, out_files_per_proc):
    # the cudf 0.17 dev container returns a '0+untagged.1.ga6296e3' version for cudf
    # (which is tough to parse correctly with LooseVersion et al). That version also
    # frequently fails to run this test, whereas it works with later versions of cudf.
    # skip if we are running this specific version of cudf (and let's remove this
    # check entirely after we've upgraded the CI container)
    if cudf.__version__.startswith("0+untagged"):
        pytest.skip("parquet lists support is flakey here without cudf0.18")

    df = cudf.DataFrame(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )

    input_dir = str(tmpdir.mkdir("input"))
    output_dir = str(tmpdir.mkdir("output"))
    filename = os.path.join(input_dir, "test.parquet")
    df.to_parquet(filename)

    cat_names = ["Authors", "Engaging User"]
    cats = cat_names >> ops.Categorify(out_path=str(output_dir))
    workflow = nvt.Workflow(cats + "Post")

    transformed = workflow.fit_transform(nvt.Dataset(filename))
    transformed.to_parquet(
        output_path=output_dir,
        shuffle=shuffle,
        out_files_per_proc=out_files_per_proc,
    )

    out_paths = glob.glob(os.path.join(output_dir, "*.parquet"))
    df_out = cudf.read_parquet(out_paths)
    df_out = df_out.sort_values(by="Post", ascending=True)
    assert df_out["Authors"].to_arrow().to_pylist() == [[1], [1, 4], [2, 3], [3]]
def test_workflow_generate_columns(tmpdir, use_parquet):
    out_path = str(tmpdir.mkdir("processed"))
    path = str(tmpdir.join("simple.parquet"))

    # Stripped down dataset with geo_location codes like in Outbrain
    df = cudf.DataFrame({"geo_location": ["US>CA", "CA>BC", "US>TN>659"]})

    # defining a simple workflow that strips out the country code from the first two digits of the
    # geo_location code and sticks it in a new 'geo_location_country' field
    country = (
        ["geo_location"]
        >> ops.LambdaOp(f=lambda col: col.str.slice(0, 2))
        >> ops.Rename(postfix="_country")
    )
    cat_features = ["geo_location"] + country >> ops.Categorify()

    workflow = Workflow(cat_features)

    if use_parquet:
        df.to_parquet(path)
        dataset = nvt.Dataset(path)
    else:
        dataset = nvt.Dataset(df)

    # just make sure this works without errors
    workflow.fit(dataset)
    workflow.transform(dataset).to_parquet(out_path)
def test_categorify_freq_limit(tmpdir, freq_limit, buckets, search_sort, cpu):
    if search_sort and cpu:
        # invalid combination - don't test
        return

    df = dispatch._make_df(
        {
            "Author": [
                "User_A",
                "User_E",
                "User_B",
                "User_C",
                "User_A",
                "User_E",
                "User_B",
                "User_C",
                "User_B",
                "User_C",
            ],
            "Engaging User": [
                "User_B",
                "User_B",
                "User_A",
                "User_D",
                "User_B",
                "User_c",
                "User_A",
                "User_D",
                "User_D",
                "User_D",
            ],
        }
    )

    isfreqthr = freq_limit > 0 if isinstance(freq_limit, int) else isinstance(freq_limit, dict)

    if (not search_sort and isfreqthr) or (search_sort and not isfreqthr):
        cat_names = ["Author", "Engaging User"]

        cats = cat_names >> ops.Categorify(
            freq_threshold=freq_limit,
            out_path=str(tmpdir),
            search_sorted=search_sort,
            num_buckets=buckets,
        )

        workflow = nvt.Workflow(cats)
        df_out = (
            workflow.fit_transform(nvt.Dataset(df, cpu=cpu))
            .to_ddf()
            .compute(scheduler="synchronous")
        )

        if freq_limit and not buckets:
            # encoding is capped by the frequency threshold
            if isinstance(freq_limit, dict):
                assert df_out["Author"].max() == 2
                assert df_out["Engaging User"].max() == 1
            else:
                assert len(df["Author"].unique()) == df_out["Author"].max()
                assert len(df["Engaging User"].unique()) == df_out["Engaging User"].max()
        elif not freq_limit and buckets:
            if isinstance(buckets, dict):
                assert df_out["Author"].max() <= 9
                assert df_out["Engaging User"].max() <= 19
            else:
                assert df_out["Author"].max() <= 9
                assert df_out["Engaging User"].max() <= 9
        elif freq_limit and buckets:
            if isinstance(buckets, dict) and not isinstance(df, pd.DataFrame):
                assert (
                    df_out["Author"].max()
                    <= (df["Author"].hash_values() % buckets["Author"]).max() + 2 + 1
                )
                assert (
                    df_out["Engaging User"].max()
                    <= (df["Engaging User"].hash_values() % buckets["Engaging User"]).max() + 1 + 1
                )
def test_lambdaop(tmpdir, df, dataset, gpu_memory_frac, engine, client):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y"]
    label_name = ["label"]
    columns = mycols_pq if engine == "parquet" else mycols_csv
    df_copy = df.copy()

    config = nvt.workflow.get_new_config()

    processor = nvtabular.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
        client=client,
    )

    columns_ctx = {}
    columns_ctx["continuous"] = {}
    columns_ctx["continuous"]["base"] = cont_names
    columns_ctx["all"] = {}
    columns_ctx["all"]["base"] = columns

    # Substring
    # Replacement
    op = ops.LambdaOp(
        op_name="slice",
        f=lambda col, gdf: col.str.slice(1, 3),
        columns=["name-cat", "name-string"],
        replace=True,
    )
    new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None)
    assert new_gdf["name-cat"].equals(df_copy["name-cat"].str.slice(1, 3))
    assert new_gdf["name-string"].equals(df_copy["name-string"].str.slice(1, 3))

    # No Replacement
    df = df_copy.copy()
    op = ops.LambdaOp(
        op_name="slice",
        f=lambda col, gdf: col.str.slice(1, 3),
        columns=["name-cat", "name-string"],
        replace=False,
    )
    new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None)
    assert new_gdf["name-cat_slice"].equals(df_copy["name-cat"].str.slice(1, 3))
    assert new_gdf["name-string_slice"].equals(df_copy["name-string"].str.slice(1, 3))
    assert new_gdf["name-cat"].equals(df_copy["name-cat"])
    assert new_gdf["name-string"].equals(df_copy["name-string"])

    # Replace
    # Replacement
    df = df_copy.copy()
    op = ops.LambdaOp(
        op_name="replace",
        f=lambda col, gdf: col.str.replace("e", "XX"),
        columns=["name-cat", "name-string"],
        replace=True,
    )
    new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None)
    assert new_gdf["name-cat"].equals(df_copy["name-cat"].str.replace("e", "XX"))
    assert new_gdf["name-string"].equals(df_copy["name-string"].str.replace("e", "XX"))

    # No Replacement
    df = df_copy.copy()
    op = ops.LambdaOp(
        op_name="replace",
        f=lambda col, gdf: col.str.replace("e", "XX"),
        columns=["name-cat", "name-string"],
        replace=False,
    )
    new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None)
    assert new_gdf["name-cat_replace"].equals(df_copy["name-cat"].str.replace("e", "XX"))
    assert new_gdf["name-string_replace"].equals(df_copy["name-string"].str.replace("e", "XX"))
    assert new_gdf["name-cat"].equals(df_copy["name-cat"])
    assert new_gdf["name-string"].equals(df_copy["name-string"])

    # astype
    # Replacement
    df = df_copy.copy()
    op = ops.LambdaOp(
        op_name="astype", f=lambda col, gdf: col.astype(float), columns=["id"], replace=True
    )
    new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None)
    assert new_gdf["id"].dtype == "float64"

    # Workflow
    # Replacement
    import glob

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_preprocess(
        [
            ops.LambdaOp(
                op_name="slice",
                f=lambda col, gdf: col.astype(str).str.slice(0, 1),
                columns=["name-cat"],
                replace=True,
            ),
            ops.Categorify(),
        ]
    )
    processor.finalize()
    processor.update_stats(dataset)
    outdir = tmpdir.mkdir("out1")
    processor.write_to_dataset(
        outdir, dataset, out_files_per_proc=10, shuffle=nvt.io.Shuffle.PER_PARTITION, apply_ops=True
    )

    dataset_2 = nvtabular.io.Dataset(
        glob.glob(str(outdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac
    )
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)
    assert is_integer_dtype(df_pp["name-cat"].dtype)

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_preprocess(
        [
            ops.Categorify(),
            ops.LambdaOp(op_name="add100", f=lambda col, gdf: col + 100, replace=True),
        ]
    )
    processor.finalize()
    processor.update_stats(dataset)
    outdir = tmpdir.mkdir("out2")
    processor.write_to_dataset(
        outdir, dataset, out_files_per_proc=10, shuffle=nvt.io.Shuffle.PER_PARTITION, apply_ops=True
    )

    dataset_2 = nvtabular.io.Dataset(
        glob.glob(str(outdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac
    )
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)
    assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert np.sum(df_pp["name-cat"] < 100) == 0

    # Workflow
    # No Replacement
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_preprocess(
        [
            ops.LambdaOp(
                op_name="slice",
                f=lambda col, gdf: col.astype(str).str.slice(0, 1),
                columns=["name-cat"],
                replace=False,
            ),
            ops.Categorify(),
        ]
    )
    processor.finalize()
    processor.update_stats(dataset)
    outdir = tmpdir.mkdir("out3")
    processor.write_to_dataset(
        outdir, dataset, out_files_per_proc=10, shuffle=nvt.io.Shuffle.PER_PARTITION, apply_ops=True
    )

    dataset_2 = nvtabular.io.Dataset(
        glob.glob(str(outdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac
    )
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    assert df_pp["name-cat"].dtype == "O"
    print(df_pp)
    assert is_integer_dtype(df_pp["name-cat_slice"].dtype)
    assert np.sum(df_pp["name-cat_slice"] == 0) == 0

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_preprocess(
        [
            ops.Categorify(),
            ops.LambdaOp(op_name="add100", f=lambda col, gdf: col + 100, replace=False),
        ]
    )
    processor.finalize()
    processor.update_stats(dataset)
    outdir = tmpdir.mkdir("out4")
    processor.write_to_dataset(
        outdir, dataset, out_files_per_proc=10, shuffle=nvt.io.Shuffle.PER_PARTITION, apply_ops=True
    )

    dataset_2 = nvtabular.io.Dataset(
        glob.glob(str(outdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac
    )
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)
    assert is_integer_dtype(df_pp["name-cat_add100"].dtype)
    assert np.sum(df_pp["name-cat_add100"] < 100) == 0

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_preprocess(
        [
            ops.LambdaOp(op_name="mul0", f=lambda col, gdf: col * 0, columns=["x"], replace=False),
            ops.LambdaOp(op_name="add100", f=lambda col, gdf: col + 100, replace=False),
        ]
    )
    processor.finalize()
    processor.update_stats(dataset)
    outdir = tmpdir.mkdir("out5")
    processor.write_to_dataset(
        outdir, dataset, out_files_per_proc=10, shuffle=nvt.io.Shuffle.PER_PARTITION, apply_ops=True
    )

    dataset_2 = nvtabular.io.Dataset(
        glob.glob(str(outdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac
    )
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)
    assert np.sum(df_pp["x_mul0_add100"] < 100) == 0
def test_gpu_workflow_api(tmpdir, client, df, dataset, gpu_memory_frac, engine, dump, use_client):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    cat_features = cat_names >> ops.Categorify(cat_cache="host")
    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp >> norms

    workflow = Workflow(
        cat_features + cont_features + label_name, client=client if use_client else None
    )

    workflow.fit(dataset)

    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        workflow = None
        workflow = Workflow.load(workflow_dir, client=client if use_client else None)

    def get_norms(tar: cudf.Series):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std - No good right now we have to add all other changes; Clip, Log
    assert math.isclose(get_norms(df.y).mean(), norms.means["y"], rel_tol=1e-1)
    assert math.isclose(get_norms(df.y).std(), norms.stds["y"], rel_tol=1e-1)
    assert math.isclose(get_norms(df.x).mean(), norms.means["x"], rel_tol=1e-1)
    assert math.isclose(get_norms(df.x).std(), norms.stds["x"], rel_tol=1e-1)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(workflow, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(workflow, "name-string")
    # adding the None entry as a string because of move from gpu
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    workflow.transform(dataset).to_parquet(
        tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
    )

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
def main(args):
    """Multi-GPU Criteo/DLRM Preprocessing Benchmark

    This benchmark is designed to measure the time required to preprocess
    the Criteo (1TB) dataset for Facebook’s DLRM model. The user must specify
    the path of the raw dataset (using the `--data-path` flag), as well as the
    output directory for all temporary/final data (using the `--out-path` flag).

    Example Usage
    -------------

    python dask-nvtabular-criteo-benchmark.py
        --data-path /path/to/criteo_parquet --out-path /out/dir/

    Dataset Requirements (Parquet)
    ------------------------------

    This benchmark is designed with a parquet-formatted dataset in mind.
    While a CSV-formatted dataset can be processed by NVTabular, converting
    to parquet will yield significantly better performance. To convert your
    dataset, try using the `optimize_criteo.ipynb` notebook (also located
    in `NVTabular/examples/`).

    For a detailed parameter overview see `NVTabular/examples/MultiGPUBench.md`
    """

    # Input
    data_path = args.data_path
    freq_limit = args.freq_limit
    out_files_per_proc = args.out_files_per_proc
    high_card_columns = args.high_cards.split(",")
    dashboard_port = args.dashboard_port
    if args.protocol == "ucx":
        UCX_TLS = os.environ.get("UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")
        os.environ["UCX_TLS"] = UCX_TLS

    # Cleanup output directory
    BASE_DIR = args.out_path
    dask_workdir = os.path.join(BASE_DIR, "workdir")
    output_path = os.path.join(BASE_DIR, "output")
    stats_path = os.path.join(BASE_DIR, "stats")
    if not os.path.isdir(BASE_DIR):
        os.mkdir(BASE_DIR)
    for dir_path in (dask_workdir, output_path, stats_path):
        if os.path.isdir(dir_path):
            shutil.rmtree(dir_path)
        os.mkdir(dir_path)

    # Use Criteo dataset by default (for now)
    cont_names = (
        args.cont_names.split(",") if args.cont_names else ["I" + str(x) for x in range(1, 14)]
    )
    cat_names = (
        args.cat_names.split(",") if args.cat_names else ["C" + str(x) for x in range(1, 27)]
    )
    label_name = ["label"]

    # Specify Categorify/GroupbyStatistics options
    tree_width = {}
    cat_cache = {}
    for col in cat_names:
        if col in high_card_columns:
            tree_width[col] = args.tree_width
            cat_cache[col] = args.cat_cache_high
        else:
            tree_width[col] = 1
            cat_cache[col] = args.cat_cache_low

    # Use total device size to calculate args.device_limit_frac
    device_size = device_mem_size(kind="total")
    device_limit = int(args.device_limit_frac * device_size)
    device_pool_size = int(args.device_pool_frac * device_size)
    part_size = int(args.part_mem_frac * device_size)

    # Parse shuffle option
    shuffle = None
    if args.shuffle == "PER_WORKER":
        shuffle = nvt_io.Shuffle.PER_WORKER
    elif args.shuffle == "PER_PARTITION":
        shuffle = nvt_io.Shuffle.PER_PARTITION

    # Check if any device memory is already occupied
    for dev in args.devices.split(","):
        fmem = _pynvml_mem_size(kind="free", index=int(dev))
        used = (device_size - fmem) / 1e9
        if used > 1.0:
            warnings.warn(f"BEWARE - {used} GB is already occupied on device {int(dev)}!")

    # Setup LocalCUDACluster
    if args.protocol == "tcp":
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devices,
            device_memory_limit=device_limit,
            local_directory=dask_workdir,
            dashboard_address=":" + dashboard_port,
        )
    else:
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devices,
            enable_nvlink=True,
            device_memory_limit=device_limit,
            local_directory=dask_workdir,
            dashboard_address=":" + dashboard_port,
        )
    client = Client(cluster)

    # Setup RMM pool
    if args.device_pool_frac > 0.01:
        setup_rmm_pool(client, device_pool_size)

    # Define Dask NVTabular "Workflow"
    processor = Workflow(
        cat_names=cat_names, cont_names=cont_names, label_name=label_name, client=client
    )
    if args.normalize:
        processor.add_feature([ops.FillMissing(), ops.Normalize()])
    else:
        processor.add_feature([ops.FillMissing(), ops.Clip(min_value=0), ops.LogOp()])
    processor.add_preprocess(
        ops.Categorify(
            out_path=stats_path,
            tree_width=tree_width,
            cat_cache=cat_cache,
            freq_threshold=freq_limit,
            search_sorted=not freq_limit,
            on_host=not args.cats_on_device,
        )
    )
    processor.finalize()

    dataset = Dataset(data_path, "parquet", part_size=part_size)

    # Execute the dask graph
    runtime = time.time()
    if args.profile is not None:
        with performance_report(filename=args.profile):
            processor.apply(
                dataset,
                shuffle=shuffle,
                out_files_per_proc=out_files_per_proc,
                output_path=output_path,
                num_io_threads=args.num_io_threads,
            )
    else:
        processor.apply(
            dataset,
            num_io_threads=args.num_io_threads,
            shuffle=shuffle,
            out_files_per_proc=out_files_per_proc,
            output_path=output_path,
        )
    runtime = time.time() - runtime

    print("\nDask-NVTabular DLRM/Criteo benchmark")
    print("--------------------------------------")
    print(f"partition size     | {part_size}")
    print(f"protocol           | {args.protocol}")
    print(f"device(s)          | {args.devices}")
    print(f"rmm-pool-frac      | {(args.device_pool_frac)}")
    print(f"out-files-per-proc | {args.out_files_per_proc}")
    print(f"num_io_threads     | {args.num_io_threads}")
    print(f"shuffle            | {args.shuffle}")
    print(f"cats-on-device     | {args.cats_on_device}")
    print("======================================")
    print(f"Runtime[s]         | {runtime}")
    print("======================================\n")

    client.close()
def test_tf_gpu_dl(tmpdir, paths, use_paths, dataset, batch_size, gpu_memory_frac, engine):
    cont_names = ["x", "y", "id"]
    cat_names = ["name-string"]
    label_name = ["label"]
    if engine == "parquet":
        cat_names.append("name-cat")

    columns = cont_names + cat_names

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    workflow = nvt.Workflow(conts + cats + label_name)
    workflow.fit(dataset)
    workflow.transform(dataset).to_parquet(tmpdir + "/processed")

    data_itr = tf_dataloader.KerasSequenceLoader(
        str(tmpdir + "/processed"),  # workflow.transform(dataset),
        cat_names=cat_names,
        cont_names=cont_names,
        batch_size=batch_size,
        buffer_size=gpu_memory_frac,
        label_names=label_name,
        engine=engine,
        shuffle=False,
    )
    _ = tf.random.uniform((1,))

    rows = 0
    for idx in range(len(data_itr)):
        X, y = next(data_itr)

        # first elements to check epoch-to-epoch consistency
        if idx == 0:
            X0, y0 = X, y

        # check that we have at most batch_size elements
        num_samples = y.shape[0]
        if num_samples != batch_size:
            try:
                next(data_itr)
            except StopIteration:
                rows += num_samples
                continue
            else:
                raise ValueError("Batch size too small at idx {}".format(idx))

        # check that all the features in X have the
        # appropriate length and that the set of
        # their names is exactly the set of names in
        # `columns`
        these_cols = columns.copy()
        for column, x in X.items():
            try:
                these_cols.remove(column)
            except ValueError:
                raise AssertionError
            assert x.shape[0] == num_samples
        assert len(these_cols) == 0

        rows += num_samples

    assert (idx + 1) * batch_size >= rows
    assert rows == (60 * 24 * 3 + 1)

    # if num_samples is equal to batch size,
    # we didn't exhaust the iterator and do
    # cleanup. Try that now
    if num_samples == batch_size:
        try:
            next(data_itr)
        except StopIteration:
            pass
        else:
            raise ValueError
    assert not data_itr._working
    assert data_itr._batch_itr is None

    # check start of next epoch to ensure consistency
    X, y = next(data_itr)
    assert (y.numpy() == y0.numpy()).all()

    for column, x in X.items():
        x0 = X0.pop(column)
        assert (x.numpy() == x0.numpy()).all()
    assert len(X0) == 0

    data_itr.stop()
    assert not data_itr._working
    assert data_itr._batch_itr is None
def test_gpu_dl(tmpdir, df, dataset, batch_size, part_mem_fraction, engine, devices):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(conts + cats + label_name)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths[0], engine="parquet", part_mem_fraction=part_mem_fraction)
    data_itr = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        devices=devices,
    )

    columns = mycols_pq
    df_test = cudf.read_parquet(tar_paths[0])[columns]
    df_test.columns = [x for x in range(0, len(columns))]
    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(tar_paths[0])
    rows = 0

    # works with iterator alone, needs to test inside torch dataloader
    for idx, chunk in enumerate(data_itr):
        if devices is None:
            assert float(df_test.iloc[rows][0]) == float(chunk[0][0][0])
        rows += len(chunk[0])
        del chunk

    # accounts for incomplete batches at the end of chunks
    # that don't necessarily have the full batch_size
    assert rows == num_rows

    def gen_col(batch):
        batch = batch[0]
        return batch[0], batch[1], batch[2]

    t_dl = torch_dataloader.DLDataLoader(
        data_itr, collate_fn=gen_col, pin_memory=False, num_workers=0
    )
    rows = 0
    for idx, chunk in enumerate(t_dl):
        if devices is None:
            assert float(df_test.iloc[rows][0]) == float(chunk[0][0][0])
        rows += len(chunk[0])

    if os.path.exists(output_train):
        shutil.rmtree(output_train)
def test_mh_model_support(tmpdir):
    df = cudf.DataFrame(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Reviewers": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Null User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
            "Cont1": [0.3, 0.4, 0.5, 0.6],
            "Cont2": [0.3, 0.4, 0.5, 0.6],
            "Cat1": ["A", "B", "A", "C"],
        }
    )
    cat_names = ["Cat1", "Null User", "Authors", "Reviewers"]  # , "Engaging User"]
    cont_names = ["Cont1", "Cont2"]
    label_name = ["Post"]
    out_path = os.path.join(tmpdir, "train/")
    os.mkdir(out_path)

    cats = cat_names >> ops.Categorify()
    conts = cont_names >> ops.Normalize()

    processor = nvt.Workflow(cats + conts + label_name)
    df_out = processor.fit_transform(nvt.Dataset(df)).to_ddf().compute()
    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out),
        cats=cat_names,
        conts=cont_names,
        labels=label_name,
        batch_size=2,
    )
    emb_sizes = nvt.ops.get_embedding_sizes(processor)

    EMBEDDING_DROPOUT_RATE = 0.04
    DROPOUT_RATES = [0.001, 0.01]
    HIDDEN_DIMS = [1000, 500]
    LEARNING_RATE = 0.001
    model = Model(
        embedding_table_shapes=emb_sizes,
        num_continuous=len(cont_names),
        emb_dropout=EMBEDDING_DROPOUT_RATE,
        layer_hidden_dims=HIDDEN_DIMS,
        layer_dropout_rates=DROPOUT_RATES,
    ).cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    def rmspe_func(y_pred, y):
        "Return y_pred and y to non-log space and compute RMSPE"
        y_pred, y = torch.exp(y_pred) - 1, torch.exp(y) - 1
        pct_var = (y_pred - y) / y
        return (pct_var ** 2).mean().pow(0.5)

    train_loss, y_pred, y = process_epoch(
        data_itr,
        model,
        train=True,
        optimizer=optimizer,
        # transform=batch_transform,
        amp=False,
    )
    train_rmspe = None
    train_rmspe = rmspe_func(y_pred, y)
    assert train_rmspe is not None
    assert len(y_pred) > 0
    assert len(y) > 0
def test_kill_dl(tmpdir, df, dataset, part_mem_fraction, engine):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(conts + cats + label_name)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths[0], engine="parquet", part_mem_fraction=part_mem_fraction)

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt_data, cats=cat_names, conts=cont_names, labels=["label"]
    )

    results = {}

    for batch_size in [2 ** i for i in range(9, 25, 1)]:
        print("Checking batch size: ", batch_size)
        num_iter = max(10 * 1000 * 1000 // batch_size, 100)  # load 10e7 samples

        data_itr.batch_size = batch_size
        start = time.time()
        for i, data in enumerate(data_itr):
            if i >= num_iter:
                break
            del data

        stop = time.time()

        throughput = i * batch_size / (stop - start)
        results[batch_size] = throughput
        print(
            "batch size: ",
            batch_size,
            ", throughput: ",
            throughput,
            "items",
            i * batch_size,
            "time",
            stop - start,
        )
def test_tf_gpu_dl(tmpdir, datasets, batch_size, gpu_memory_frac, engine):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    cont_names = ["x", "y", "id"]
    cat_names = ["name-string"]
    label_name = ["label"]
    if engine == "parquet":
        cat_names.append("name-cat")

    columns = cont_names + cat_names

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        to_cpu=True,
    )
    processor.add_feature([ops.FillMedian()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    data_itr = tf_dataloader.KerasSequenceDataset(
        paths,
        columns=columns,
        batch_size=batch_size,
        buffer_size=gpu_memory_frac,
        label_name=label_name[0],
        engine=engine,
        shuffle=False,
    )
    processor.update_stats(data_itr.nvt_dataset, record_stats=True)
    data_itr.map(processor)

    rows = 0
    for idx in range(len(data_itr)):
        X, y = next(data_itr)

        # first elements to check epoch-to-epoch consistency
        if idx == 0:
            X0, y0 = X, y

        # check that we have at most batch_size elements
        num_samples = y.shape[0]
        assert num_samples <= batch_size

        # check that all the features in X have the
        # appropriate length and that the set of
        # their names is exactly the set of names in
        # `columns`
        these_cols = columns.copy()
        for column, x in X.items():
            try:
                these_cols.remove(column)
            except ValueError:
                raise AssertionError
            assert x.shape[0] == num_samples
        assert len(these_cols) == 0

        rows += num_samples

    # check start of next epoch to ensure consistency
    X, y = next(data_itr)
    assert (y.numpy() == y0.numpy()).all()

    for column, x in X.items():
        x0 = X0.pop(column)
        assert (x.numpy() == x0.numpy()).all()
    assert len(X0) == 0

    # accounts for incomplete batches at the end of chunks
    # that don't necessarily have the full batch_size
    assert (idx + 1) * batch_size >= rows
    assert rows == (60 * 24 * 3 + 1)
def test_gpu_preproc(tmpdir, datasets, dump, gpu_memory_frac, engine, preprocessing):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    else:
        df1 = cudf.read_csv(paths[0], header=False, names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=False, names=allcols_csv)[mycols_csv]
    df = cudf.concat([df1, df2], axis=0)
    df["id"] = df["id"].astype("int64")

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
        columns = mycols_pq
    else:
        cat_names = ["name-string"]
        columns = mycols_csv
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        to_cpu=True,
    )

    processor.add_feature([ops.FillMissing(), ops.LogOp(preprocessing=preprocessing)])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    data_itr = nvtabular.io.GPUDatasetIterator(
        paths,
        columns=columns,
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
        names=allcols_csv,
    )

    processor.update_stats(data_itr)

    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        ser_median = tar.dropna().quantile(0.5, interpolation="linear")
        gdf = tar.fillna(ser_median)
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std - No good right now we have to add all other changes; Zerofill, Log
    x_col = "x" if preprocessing else "x_LogOp"
    y_col = "y" if preprocessing else "y_LogOp"
    assert math.isclose(get_norms(df.x).mean(), processor.stats["means"][x_col], rel_tol=1e-2)
    assert math.isclose(get_norms(df.y).mean(), processor.stats["means"][y_col], rel_tol=1e-2)
    assert math.isclose(get_norms(df.x).std(), processor.stats["stds"][x_col], rel_tol=1e-2)
    assert math.isclose(get_norms(df.y).std(), processor.stats["stds"][y_col], rel_tol=1e-2)

    # Check median (TODO: Improve the accuracy)
    x_median = df.x.dropna().quantile(0.5, interpolation="linear")
    y_median = df.y.dropna().quantile(0.5, interpolation="linear")
    id_median = df.id.dropna().quantile(0.5, interpolation="linear")
    assert math.isclose(x_median, processor.stats["medians"]["x"], rel_tol=1e1)
    assert math.isclose(y_median, processor.stats["medians"]["y"], rel_tol=1e1)
    assert math.isclose(id_median, processor.stats["medians"]["id"], rel_tol=1e1)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_to_string()
        cats0 = processor.stats["encoders"]["name-cat"].get_cats().values_to_string()
        assert cats0 == ["None"] + cats_expected0
    cats_expected1 = df["name-string"].unique().values_to_string()
    cats1 = processor.stats["encoders"]["name-string"].get_cats().values_to_string()
    print(cats1)
    assert cats1 == ["None"] + cats_expected1

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(tmpdir, data_itr, nfiles=10, shuffle=True, apply_ops=True)

    processor.create_final_cols()

    # if preprocessing
    if not preprocessing:
        for col in cont_names:
            assert f"{col}_LogOp" in processor.columns_ctx["final"]["cols"]["continuous"]

    dlc = nvtabular.torch_dataloader.DLCollator(preproc=processor, apply_ops=False)
    data_files = [
        nvtabular.torch_dataloader.FileItrDataset(
            x,
            use_row_groups=True,
            gpu_memory_frac=gpu_memory_frac,
            names=allcols_csv,
        )
        for x in glob.glob(str(tmpdir) + "/ds_part.*.parquet")
    ]

    data_itr = torch.utils.data.ChainDataset(data_files)
    dl = nvtabular.torch_dataloader.DLDataLoader(
        data_itr, collate_fn=dlc.gdf_col, pin_memory=False, num_workers=0
    )

    len_df_pp = 0
    for chunk in dl:
        len_df_pp += len(chunk[0][0])

    data_itr = nvtabular.io.GPUDatasetIterator(
        glob.glob(str(tmpdir) + "/ds_part.*.parquet"),
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
        names=allcols_csv,
    )

    x = processor.ds_to_tensors(data_itr, apply_ops=False)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert len(x[0]) == len_df_pp

    itr_ds = nvtabular.torch_dataloader.TensorItrDataset([x[0], x[1], x[2]], batch_size=512000)
    count_tens_itr = 0
    for data_gd in itr_ds:
        count_tens_itr += len(data_gd[1])
        assert data_gd[0][0].shape[1] > 0
        assert data_gd[0][1].shape[1] > 0
    assert len_df_pp == count_tens_itr

    if os.path.exists(processor.ds_exports):
        shutil.rmtree(processor.ds_exports)
def test_gpu_dl(tmpdir, datasets, batch_size, gpu_memory_frac, engine):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    else:
        df1 = cudf.read_csv(paths[0], header=False, names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=False, names=allcols_csv)[mycols_csv]
    df = cudf.concat([df1, df2], axis=0)
    df["id"] = df["id"].astype("int64")

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
        columns = mycols_pq
    else:
        cat_names = ["name-string"]
        columns = mycols_csv
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        to_cpu=True,
    )
    processor.add_feature([ops.FillMissing()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())

    data_itr = nvtabular.io.GPUDatasetIterator(
        paths,
        columns=columns,
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
        names=allcols_csv,
    )

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        data_itr,
        apply_offline=True,
        record_stats=True,
        shuffle=True,
        output_path=output_train,
        num_out_files=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    data_itr = nvt.torch_dataloader.TorchTensorBatchDatasetItr(
        tar_paths[0],
        engine="parquet",
        sub_batch_size=batch_size,
        gpu_memory_frac=gpu_memory_frac,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        names=mycols_csv,
        sep="\t",
    )

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(tar_paths[0])
    rows = 0
    for idx, chunk in enumerate(data_itr):
        rows += len(chunk[0])
        del chunk

    # accounts for incomplete batches at the end of chunks
    # that don't necessarily have the full batch_size
    assert (idx + 1) * batch_size >= rows
    assert rows == num_rows
    if os.path.exists(output_train):
        shutil.rmtree(output_train)
def test_empty_cols(tmpdir, engine, cat_names, mh_names, cont_names, label_name, num_rows):
    json_sample["num_rows"] = num_rows

    cols = datagen._get_cols_from_schema(json_sample)

    df_gen = datagen.DatasetGen(datagen.PowerLawDistro(0.1))
    dataset = df_gen.create_df(num_rows, cols)
    dataset = nvt.Dataset(dataset)

    features = []
    if cont_names:
        features.append(cont_names >> ops.FillMedian() >> ops.Normalize())
    if cat_names or mh_names:
        features.append(cat_names + mh_names >> ops.Categorify())

    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    graph = sum(features, nvt.WorkflowNode(label_name))
    processor = nvt.Workflow(graph)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    df_out = processor.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    if processor.output_node.output_schema.apply_inverse(ColumnSelector("lab_1")):
        # if we don't have conts/cats/labels we're done
        return

    data_itr = None

    with pytest.raises(ValueError) as exc_info:
        data_itr = torch_dataloader.TorchAsyncItr(
            nvt.Dataset(df_out),
            cats=cat_names + mh_names,
            conts=cont_names,
            labels=label_name,
            batch_size=2,
        )
    assert "Neither Categorical or Continuous columns were found by the dataloader. " in str(
        exc_info.value
    )

    if data_itr:
        for nvt_batch in data_itr:
            cats_conts, labels = nvt_batch
            if cat_names:
                assert set(cat_names).issubset(set(list(cats_conts.keys())))
            if cont_names:
                assert set(cont_names).issubset(set(list(cats_conts.keys())))

        if cat_names or cont_names or mh_names:
            emb_sizes = nvt.ops.get_embedding_sizes(processor)

            EMBEDDING_DROPOUT_RATE = 0.04
            DROPOUT_RATES = [0.001, 0.01]
            HIDDEN_DIMS = [1000, 500]
            LEARNING_RATE = 0.001
            model = Model(
                embedding_table_shapes=emb_sizes,
                num_continuous=len(cont_names),
                emb_dropout=EMBEDDING_DROPOUT_RATE,
                layer_hidden_dims=HIDDEN_DIMS,
                layer_dropout_rates=DROPOUT_RATES,
            ).cuda()
            optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

            def rmspe_func(y_pred, y):
                "Return y_pred and y to non-log space and compute RMSPE"
                y_pred, y = torch.exp(y_pred) - 1, torch.exp(y) - 1
                pct_var = (y_pred - y) / y
                return (pct_var ** 2).mean().pow(0.5)

            train_loss, y_pred, y = process_epoch(
                data_itr,
                model,
                train=True,
                optimizer=optimizer,
                amp=False,
            )
            train_rmspe = None
            train_rmspe = rmspe_func(y_pred, y)
            assert train_rmspe is not None
            assert len(y_pred) > 0
            assert len(y) > 0