def test_spec_set(tmpdir, client):
    gdf_test = cudf.DataFrame(
        {
            "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
            "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
            "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "cont": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
        }
    )

    p = nvt.Workflow(
        cat_names=["ad_id", "source_id", "platform"],
        cont_names=["cont"],
        label_name=["clicked"],
        client=client,
    )
    p.add_feature(ops.FillMissing())
    p.add_feature(ops.Normalize())
    p.add_feature(ops.Categorify())
    p.add_feature(
        ops.TargetEncoding(
            cat_groups=["ad_id", "source_id", "platform"],
            cont_target="clicked",
            kfold=5,
            fold_seed=42,
            p_smooth=20,
        )
    )

    p.apply(nvt.Dataset(gdf_test), record_stats=True)
    assert p.stats
def test_normalize(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    cont_features = op_columns >> ops.Normalize()
    processor = nvtabular.Workflow(cont_features)
    processor.fit(dataset)

    new_gdf = processor.transform(dataset).to_ddf().compute()
    new_gdf.index = df.index  # Make sure index is aligned for checks
    for col in op_columns:
        assert math.isclose(df[col].mean(), processor.column_group.op.means[col], rel_tol=1e-4)
        assert math.isclose(df[col].std(), processor.column_group.op.stds[col], rel_tol=1e-4)
        df[col] = (df[col] - processor.column_group.op.means[col]) / processor.column_group.op.stds[
            col
        ]
        assert np.all((df[col] - new_gdf[col]).abs().values <= 1e-2)

    # our normalize op also works on dicts of cupy/numpy tensors. make sure this works like we'd
    # expect
    df = dataset.compute()
    cupy_inputs = {col: df[col].values for col in op_columns}
    cupy_outputs = cont_features.op.transform(op_columns, cupy_inputs)
    for col in op_columns:
        assert np.allclose(cupy_outputs[col], new_gdf[col].values)
def test_normalize(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_name = ["label"]

    config = nvt.workflow.get_new_config()
    config["PP"]["continuous"] = [ops.Moments(columns=op_columns)]

    processor = nvtabular.Workflow(
        cat_names=cat_names, cont_names=cont_names, label_name=label_name, config=config
    )

    processor.update_stats(dataset)

    op = ops.Normalize()

    columns_ctx = {}
    columns_ctx["continuous"] = {}
    columns_ctx["continuous"]["base"] = op_columns or cont_names

    new_gdf = op.apply_op(df, columns_ctx, "continuous", stats_context=processor.stats)
    df["x"] = (df["x"] - processor.stats["means"]["x"]) / processor.stats["stds"]["x"]
    assert new_gdf["x"].equals(df["x"])
def test_dask_normalize(client, tmpdir, datasets, engine):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    normalize = ops.Normalize()
    conts = cont_names >> ops.FillMissing() >> normalize
    workflow = Workflow(conts + cat_names + label_name, client=client)

    dataset = Dataset(paths, engine)
    result = workflow.fit_transform(dataset).to_ddf().compute()

    # Make sure we collected accurate statistics
    means = df0[cont_names].mean()
    stds = df0[cont_names].std()
    for name in cont_names:
        assert math.isclose(means[name], normalize.means[name], rel_tol=1e-3)
        assert math.isclose(stds[name], normalize.stds[name], rel_tol=1e-3)

    # New (normalized) means should all be close to zero
    new_means = result[cont_names].mean()
    for name in cont_names:
        assert new_means[name] < 1e-3
def test_s3_dataset(s3, paths, engine, df):
    # create a mocked out bucket here
    bucket = "testbucket"
    s3.create_bucket(Bucket=bucket)

    s3_paths = []
    for path in paths:
        s3_path = f"s3://{bucket}/{path}"
        with fsspec.open(s3_path, "wb") as f:
            f.write(open(path, "rb").read())
        s3_paths.append(s3_path)

    # create a basic s3 dataset
    dataset = nvt.Dataset(s3_paths)

    # make sure the iteration API works
    columns = mycols_pq if engine == "parquet" else mycols_csv
    gdf = cudf.concat(list(dataset.to_iter()))[columns]
    assert_eq(gdf.reset_index(drop=True), df.reset_index(drop=True))

    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_feature([ops.FillMissing(), ops.Clip(min_value=0), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify(cat_cache="host"))
    processor.finalize()

    processor.update_stats(dataset)
def test_parquet_output(client, use_client, tmpdir, shuffle):
    out_files_per_proc = 2
    n_workers = len(client.cluster.workers) if use_client else 1
    out_path = str(tmpdir.mkdir("processed"))
    path = str(tmpdir.join("simple.parquet"))

    size = 25
    row_group_size = 5
    df = pd.DataFrame({"a": np.arange(size)})
    df.to_parquet(path, row_group_size=row_group_size, engine="pyarrow")

    columns = ["a"]
    dataset = nvt.Dataset(path, engine="parquet", row_groups_per_part=1)
    workflow = nvt.Workflow(columns >> ops.Normalize(), client=client if use_client else None)
    workflow.fit_transform(dataset).to_parquet(
        output_path=out_path, shuffle=shuffle, out_files_per_proc=out_files_per_proc
    )

    # Check that the number of output files is correct
    result = glob.glob(os.path.join(out_path, "*.parquet"))
    assert len(result) == out_files_per_proc * n_workers

    # Make sure _metadata exists
    meta_path = os.path.join(out_path, "_metadata")
    assert os.path.exists(meta_path)

    # Make sure _metadata makes sense
    _metadata = cudf.io.read_parquet_metadata(meta_path)
    assert _metadata[0] == size
    assert _metadata[2] == columns
def test_empty_cols(tmpdir, df, dataset, engine, cat_names, cont_names, label_name):
    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    # first with no continuous columns
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_feature([ops.FillMedian()])
    processor.add_feature(ops.Normalize())
    processor.add_feature(ops.Categorify())

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_format=None,
    )
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name, batch_size=1
    )

    for nvt_batch in data_itr:
        cats, conts, labels = nvt_batch
        if cat_names:
            assert cats.shape[-1] == len(cat_names)
        if cont_names:
            assert conts.shape[-1] == len(cont_names)
        if label_name:
            assert labels.shape[-1] == len(label_name)
def test_normalize_upcastfloat64(tmpdir, dataset, gpu_memory_frac, engine, op_columns):
    df = cudf.DataFrame(
        {"x": [1.9e10, 2.3e16, 3.4e18, 1.6e19], "label": [1, 0, 1, 0]}, dtype="float32"
    )

    cont_features = op_columns >> ops.Normalize()
    processor = nvtabular.Workflow(cont_features)
    dataset = nvt.Dataset(df)
    processor.fit(dataset)

    new_gdf = processor.transform(dataset).to_ddf().compute()

    for col in op_columns:
        assert math.isclose(df[col].mean(), processor.column_group.op.means[col], rel_tol=1e-4)
        assert math.isclose(df[col].std(), processor.column_group.op.stds[col], rel_tol=1e-4)
        df[col] = (df[col] - processor.column_group.op.means[col]) / processor.column_group.op.stds[
            col
        ]
        assert np.all((df[col] - new_gdf[col]).abs().values <= 1e-2)
def test_error_handling(tmpdir):
    df = _make_df({"x": np.arange(10), "y": np.arange(10)})

    def custom_transform(col):
        if len(col) == 2:
            raise ValueError("Lets cause some problems")
        return col

    features = ["x", "y"] >> ops.FillMissing() >> ops.Normalize() >> custom_transform
    workflow = nvt.Workflow(features)
    workflow.fit(nvt.Dataset(df))

    model_name = "test_error_handling"
    triton.generate_nvtabular_model(
        workflow, model_name, tmpdir + f"/{model_name}", backend=BACKEND
    )

    with run_triton_server(tmpdir) as client:
        inputs = triton.convert_df_to_triton_input(["x", "y"], df[:2])
        with pytest.raises(tritonclient.utils.InferenceServerException) as exception_info:
            client.infer(model_name, inputs)

    assert "ValueError: Lets cause some problems" in str(exception_info.value)
def test_dask_normalize(client, tmpdir, datasets, engine):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(ops.Normalize())
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    # Make sure we collected accurate statistics
    means = df0[cont_names].mean()
    stds = df0[cont_names].std()
    counts = df0[cont_names].count()
    for name in cont_names:
        assert math.isclose(means[name], processor.stats["means"][name], rel_tol=1e-3)
        assert math.isclose(stds[name], processor.stats["stds"][name], rel_tol=1e-3)
        assert math.isclose(counts[name], processor.stats["counts"][name], rel_tol=1e-3)

    # New (normalized) means should all be close to zero
    new_means = result[cont_names].mean()
    for name in cont_names:
        assert new_means[name] < 1e-3
def test_schema_write_read_dataset(tmpdir, dataset, engine):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    cat_features = cat_names >> ops.Categorify(cat_cache="host")
    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp >> norms
    workflow = Workflow(cat_features + cont_features + label_name)

    workflow.fit(dataset)
    workflow.transform(dataset).to_parquet(
        tmpdir,
        out_files_per_proc=10,
    )

    schema_path = Path(tmpdir)
    proto_schema = PbTxt_SchemaWriter._read(schema_path / "schema.pbtxt")
    new_dataset = Dataset(glob.glob(str(tmpdir) + "/*.parquet"))
    assert """name: "name-cat"\n min: 0\n max: 27\n""" in str(proto_schema)
    assert new_dataset.schema == workflow.output_schema
def test_empty_cols(tmpdir, df, dataset, engine, cat_names, cont_names, label_name):
    features = []
    if cont_names:
        features.append(cont_names >> ops.FillMedian() >> ops.Normalize())
    if cat_names:
        features.append(cat_names >> ops.Categorify())

    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    graph = sum(features, nvt.ColumnGroup(label_name))
    if not graph.columns:
        # if we don't have conts/cats/labels we're done
        return

    processor = nvt.Workflow(sum(features, nvt.ColumnGroup(label_name)))

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    df_out = processor.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name, batch_size=1
    )

    for nvt_batch in data_itr:
        cats, conts, labels = nvt_batch
        if cat_names:
            assert cats.shape[-1] == len(cat_names)
        if cont_names:
            assert conts.shape[-1] == len(cont_names)
        if label_name:
            assert labels.shape[-1] == len(label_name)
def test_normalize_upcastfloat64(tmpdir, dataset, gpu_memory_frac, engine, op_columns):
    df = cudf.DataFrame(
        {"x": [1.9e10, 2.3e16, 3.4e18, 1.6e19], "label": [1, 0, 1, 0]}, dtype="float32"
    )

    cat_names = []
    cont_names = ["x"]
    label_name = ["label"]

    config = nvt.workflow.get_new_config()
    config["PP"]["continuous"] = [ops.Moments(columns=op_columns)]

    processor = nvtabular.Workflow(
        cat_names=cat_names, cont_names=cont_names, label_name=label_name, config=config
    )

    processor.update_stats(dataset)

    op = ops.Normalize()

    columns_ctx = {}
    columns_ctx["continuous"] = {}
    columns_ctx["continuous"]["base"] = op_columns or cont_names

    new_gdf = op.apply_op(df, columns_ctx, "continuous", stats_context=processor.stats)
    df["x"] = (df["x"] - processor.stats["means"]["x"]) / processor.stats["stds"]["x"]
    assert new_gdf["x"].equals(df["x"])
def test_concatenate_dataframe(tmpdir, output_model):
    # we were seeing an issue in the rossmann workflow where we dropped certain columns,
    # https://github.com/NVIDIA/NVTabular/issues/961
    df = _make_df(
        {
            "cat": ["aaaa", "bbbb", "cccc", "aaaa", "bbbb", "aaaa"],
            "cont": [0.0, 1.0, 2.0, 3.0, 4.0, 5],
        }
    )
    # this bug only happened with a dataframe representation: force this by using a lambda
    cats = ["cat"] >> ops.LambdaOp(lambda col: _hash_series(col) % 1000)
    conts = ["cont"] >> ops.Normalize() >> ops.FillMissing() >> ops.LogOp()

    dataset = Dataset(df)
    workflow = nvt.Workflow(cats + conts).fit_schema(dataset.infer_schema())

    if output_model == "pytorch":
        model_info = {
            "cat": {"columns": ["cat"], "dtype": "int32"},
            "cont": {"columns": ["cont"], "dtype": "float32"},
        }
    else:
        model_info = None

    _verify_workflow_on_tritonserver(
        tmpdir, workflow, df, "test_concatenate_dataframe", output_model, model_info
    )
def test_normalize_std_zero(cpu):
    df = pd.DataFrame({"a": 7 * [10]})
    dataset = nvt.Dataset(df, cpu=cpu)
    processor = nvtabular.Workflow(["a"] >> ops.Normalize())
    processor.fit(dataset)
    result = processor.transform(dataset).compute()["a"]

    assert (result == 0).all()
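# Note: the helper below is a minimal standalone sketch (not NVTabular code) of the
# standardization that the test_normalize variants and test_normalize_std_zero above verify:
# the op's output per column is (x - mean) / std, and a zero-variance column maps to all
# zeros rather than dividing by zero. The function name and numpy-only implementation are
# illustrative assumptions, not part of the library API.
def _normalize_sketch(values):
    import numpy as np

    values = np.asarray(values, dtype="float64")
    mean = values.mean()
    std = values.std(ddof=1)  # sample std, matching DataFrame.std() used in the checks above
    if std == 0 or np.isnan(std):
        # mirrors the behavior asserted in test_normalize_std_zero
        return np.zeros_like(values)
    return (values - mean) / std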
def test_kill_dl(tmpdir, df, dataset, part_mem_fraction, engine):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_feature([ops.FillMedian()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths[0], engine="parquet", part_mem_fraction=part_mem_fraction)

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt_data, cats=cat_names, conts=cont_names, labels=["label"]
    )

    results = {}

    for batch_size in [2 ** i for i in range(9, 25, 1)]:
        print("Checking batch size: ", batch_size)
        num_iter = max(10 * 1000 * 1000 // batch_size, 100)  # load 10e7 samples
        # import pdb; pdb.set_trace()
        data_itr.batch_size = batch_size
        start = time.time()
        for i, data in enumerate(data_itr):
            if i >= num_iter:
                break
            del data

        stop = time.time()

        throughput = i * batch_size / (stop - start)
        results[batch_size] = throughput
        print(
            "batch size: ",
            batch_size,
            ", throughput: ",
            throughput,
            "items",
            i * batch_size,
            "time",
            stop - start,
        )
def test_gpu_dl_break(tmpdir, df, dataset, batch_size, part_mem_fraction, engine, device):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(conts + cats + label_name)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths[0], engine="parquet", part_mem_fraction=part_mem_fraction)
    data_itr = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        device=device,
    )
    len_dl = len(data_itr) - 1

    first_chunk = 0
    idx = 0
    for idx, chunk in enumerate(data_itr):
        if idx == 0:
            first_chunk = len(chunk[0])
        last_chk = len(chunk[0])
        print(last_chk)
        if idx == 1:
            break
        del chunk

    assert idx < len_dl

    first_chunk_2 = 0
    for idx, chunk in enumerate(data_itr):
        if idx == 0:
            first_chunk_2 = len(chunk[0])
        del chunk
    assert idx == len_dl

    assert first_chunk == first_chunk_2
def test_gpu_workflow(tmpdir, df, dataset, gpu_memory_frac, engine, dump):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    conts = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> norms
    cats = cat_names >> ops.Categorify()
    workflow = nvt.Workflow(conts + cats + label_name)

    workflow.fit(dataset)
    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        workflow = None

        workflow = Workflow.load(workflow_dir)

    def get_norms(tar: cudf.Series):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        return gdf

    assert math.isclose(get_norms(df.x).mean(), norms.means["x"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.y).mean(), norms.means["y"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.x).std(), norms.stds["x"], rel_tol=1e-3)
    assert math.isclose(get_norms(df.y).std(), norms.stds["y"], rel_tol=1e-3)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(workflow, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(workflow, "name-string")
    # adding the None entry as a string because of move from gpu
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    workflow.transform(dataset).to_parquet(
        output_path=tmpdir, out_files_per_proc=10, shuffle=nvt.io.Shuffle.PER_PARTITION
    )

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac)

    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
def test_fit_schema_works_when_subtracting_column_names():
    schema = Schema(["x", "y", "id"])

    cont_features = (
        ColumnSelector(["x", "y"])
        >> ops.FillMissing()
        >> ops.Clip(min_value=0)
        >> ops.LogOp
        >> ops.Normalize()
        >> ops.Rename(postfix="_renamed")
    )

    workflow1 = Workflow(cont_features - "y_renamed")
    workflow1.fit_schema(schema)

    assert workflow1.output_schema.column_names == ["x_renamed"]
def test_fit_schema():
    schema = Schema(["x", "y", "id"])

    cont_features = (
        ColumnSelector(schema.column_names)
        >> ops.FillMissing()
        >> ops.Clip(min_value=0)
        >> ops.LogOp
        >> ops.Normalize()
        >> ops.Rename(postfix="_renamed")
    )

    workflow = Workflow(cont_features)
    workflow.fit_schema(schema)

    assert workflow.output_schema.column_names == ["x_renamed", "y_renamed", "id_renamed"]
def test_gpu_dl(tmpdir, df, dataset, batch_size, gpu_memory_frac, engine):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_feature([ops.FillMedian()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=True,
        output_path=output_train,
        num_out_files=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    data_itr = nvt.torch_dataloader.TorchTensorBatchDatasetItr(
        tar_paths[0],
        engine="parquet",
        sub_batch_size=batch_size,
        gpu_memory_frac=gpu_memory_frac,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        names=mycols_csv,
        sep="\t",
    )

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(tar_paths[0])

    rows = 0
    for idx, chunk in enumerate(data_itr):
        rows += len(chunk)
        del chunk

    # accounts for incomplete batches at the end of chunks
    # that don't necessarily have the full batch_size
    assert (idx + 1) * batch_size >= rows
    assert rows == num_rows
    if os.path.exists(output_train):
        shutil.rmtree(output_train)
def test_generate_triton_model(tmpdir, engine, output_model, df):
    tmpdir = "./tmp"
    conts = ["x", "y", "id"] >> ops.FillMissing() >> ops.Normalize()
    cats = ["name-cat", "name-string"] >> ops.Categorify(cat_cache="host")
    workflow = nvt.Workflow(conts + cats)
    workflow.fit(nvt.Dataset(df))
    expected = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    # save workflow to triton / verify we see some expected output
    if output_model == "pytorch":
        model_info = {
            "name-cat": {"columns": ["name-cat"], "dtype": "int64"},
            "name-string": {"columns": ["name-string"], "dtype": "int64"},
            "id": {"columns": ["id"], "dtype": "float32"},
            "x": {"columns": ["x"], "dtype": "float32"},
            "y": {"columns": ["y"], "dtype": "float32"},
        }
    else:
        model_info = None

    repo = os.path.join(tmpdir, "models")
    triton.generate_nvtabular_model(
        workflow=workflow,
        name="model",
        output_path=repo,
        version=1,
        output_model=output_model,
        output_info=model_info,
    )
    workflow = None

    assert os.path.exists(os.path.join(repo, "config.pbtxt"))

    workflow = nvt.Workflow.load(os.path.join(repo, "1", "workflow"))
    transformed = workflow.transform(nvt.Dataset(df)).to_ddf().compute()
    assert_eq(expected, transformed)
def test_remove_columns():
    # _remove_columns was failing to export the criteo example, because
    # the label column was getting inserted into the subgroups of the output node
    # https://github.com/NVIDIA-Merlin/NVTabular/issues/1198
    label_columns = ["label"]
    cats = ["a"] >> ops.Categorify()
    conts = ["b"] >> ops.Normalize()
    workflow = nvt.Workflow(cats + conts + label_columns)

    df = pd.DataFrame({"a": ["a", "b"], "b": [1.0, 2.0], "label": [0, 1]})
    workflow.fit(nvt.Dataset(df))

    removed = ensemble._remove_columns(workflow, label_columns)
    assert set(removed.output_dtypes.keys()) == {"a", "b"}
def test_normalize(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    cont_features = op_columns >> ops.Normalize()
    processor = nvtabular.Workflow(cont_features)
    processor.fit(dataset)

    new_gdf = processor.transform(dataset).to_ddf().compute()
    new_gdf.index = df.index  # Make sure index is aligned for checks
    for col in op_columns:
        assert math.isclose(df[col].mean(), processor.column_group.op.means[col], rel_tol=1e-4)
        assert math.isclose(df[col].std(), processor.column_group.op.stds[col], rel_tol=1e-4)
        df[col] = (df[col] - processor.column_group.op.means[col]) / processor.column_group.op.stds[
            col
        ]
        assert np.all((df[col] - new_gdf[col]).abs().values <= 1e-2)
def test_hugectr(tmpdir, df, dataset, output_format, engine, op_columns):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]

    # set variables
    nfiles = 10
    ext = ""
    outdir = tmpdir + "/hugectr"
    os.mkdir(outdir)

    # process data
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_names)
    processor.add_feature([ops.ZeroFill(columns=op_columns), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    # Need to collect statistics first (for now)
    processor.update_stats(dataset)

    # Second "online" pass to write HugeCTR output
    processor.apply(
        dataset,
        apply_offline=False,
        record_stats=False,
        output_path=outdir,
        out_files_per_proc=nfiles,
        output_format=output_format,
        shuffle=False,
    )

    # Check files
    ext = ""
    if output_format == "parquet":
        ext = "parquet"
        assert os.path.isfile(outdir + "/metadata.json")
    elif output_format == "hugectr":
        ext = "data"
        assert os.path.isfile(outdir + "/file_list.txt")

    for n in range(nfiles):
        assert os.path.isfile(os.path.join(outdir, str(n) + "." + ext))
def test_generate_triton_model(tmpdir, engine, df):
    tmpdir = "./tmp"
    conts = ["x", "y", "id"] >> ops.FillMissing() >> ops.Normalize()
    cats = ["name-cat", "name-string"] >> ops.Categorify(cat_cache="host")
    workflow = nvt.Workflow(conts + cats)
    workflow.fit(nvt.Dataset(df))
    expected = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    # save workflow to triton / verify we see some expected output
    repo = os.path.join(tmpdir, "models")
    triton.generate_nvtabular_model(workflow, "model", repo)
    workflow = None

    assert os.path.exists(os.path.join(repo, "config.pbtxt"))

    workflow = nvt.Workflow.load(os.path.join(repo, "1", "workflow"))
    transformed = workflow.transform(nvt.Dataset(df)).to_ddf().compute()
    assert_eq(expected, transformed)
def test_workflow_transform_ddf_dtypes():
    # Initial Dataset
    df = cudf.datasets.timeseries().reset_index()
    ddf = dask_cudf.from_cudf(df, npartitions=2)
    dataset = Dataset(ddf)

    # Create and Execute Workflow
    cols = ["name", "x", "y", "timestamp"]
    cat_cols = ["id"] >> ops.Normalize()
    workflow = Workflow(cols + cat_cols)
    workflow.fit(dataset)
    transformed_ddf = workflow.transform(dataset).to_ddf()

    # no transforms on the pass through cols, should have original dtypes
    for col in cols:
        assert_eq(ddf.dtypes[col], transformed_ddf.dtypes[col])

    # Followup dask-cudf sorting used to throw an exception because of dtype issues,
    # check that it works now
    transformed_ddf.sort_values(["id", "timestamp"]).compute()
def test_dask_preproc_cpu(client, tmpdir, datasets, engine, shuffle, cpu):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    elif engine == "csv":
        df1 = cudf.read_csv(paths[0], header=0)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=0)[mycols_csv]
    else:
        df1 = cudf.read_csv(paths[0], names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], names=allcols_csv)[mycols_csv]
    df0 = cudf.concat([df1, df2], axis=0)

    if engine in ("parquet", "csv"):
        dataset = Dataset(paths, part_size="1MB", cpu=cpu)
    else:
        dataset = Dataset(paths, names=allcols_csv, part_size="1MB", cpu=cpu)

    # Simple transform (normalize)
    cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]
    conts = cont_names >> ops.FillMissing() >> ops.Normalize()
    workflow = Workflow(conts + cat_names + label_name, client=client)
    transformed = workflow.fit_transform(dataset)

    # Write out dataset
    output_path = os.path.join(tmpdir, "processed")
    transformed.to_parquet(output_path=output_path, shuffle=shuffle, out_files_per_proc=4)

    # Check the final result
    df_disk = dd_read_parquet(output_path, engine="pyarrow").compute()
    assert_eq(
        df0.sort_values(["id", "x"])[["name-string", "label"]],
        df_disk.sort_values(["id", "x"])[["name-string", "label"]],
        check_index=False,
    )
def test_dataloader_schema(tmpdir, df, dataset, batch_size, engine, device):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(conts + cats + label_name)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths, engine="parquet")

    data_loader = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        shuffle=False,
        labels=label_name,
    )

    batch = next(iter(data_loader))
    assert all(name in batch[0] for name in cat_names)
    assert all(name in batch[0] for name in cont_names)

    num_label_cols = batch[1].shape[1] if len(batch[1].shape) > 1 else 1
    assert num_label_cols == len(label_name)
def test_gpu_workflow_config(tmpdir, client, df, dataset, gpu_memory_frac, engine, dump, replace):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    cat_features = cat_names >> ops.Categorify()
    if replace:
        cont_features = cont_names >> ops.FillMissing() >> ops.LogOp >> norms
    else:
        fillmissing_logop = (
            cont_names
            >> ops.FillMissing()
            >> ops.LogOp
            >> ops.Rename(postfix="_FillMissing_1_LogOp_1")
        )
        cont_features = cont_names + fillmissing_logop >> norms

    workflow = Workflow(cat_features + cont_features + label_name, client=client)

    workflow.fit(dataset)

    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        workflow = None

        workflow = Workflow.load(workflow_dir, client=client)

    def get_norms(tar: cudf.Series):
        ser_median = tar.dropna().quantile(0.5, interpolation="linear")
        gdf = tar.fillna(ser_median)
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std - No good right now we have to add all other changes; Clip, Log
    concat_ops = "_FillMissing_1_LogOp_1"
    if replace:
        concat_ops = ""
    assert math.isclose(get_norms(df.x).mean(), norms.means["x" + concat_ops], rel_tol=1e-1)
    assert math.isclose(get_norms(df.y).mean(), norms.means["y" + concat_ops], rel_tol=1e-1)

    assert math.isclose(get_norms(df.x).std(), norms.stds["x" + concat_ops], rel_tol=1e-1)
    assert math.isclose(get_norms(df.y).std(), norms.stds["y" + concat_ops], rel_tol=1e-1)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(workflow, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(workflow, "name-string")
    # adding the None entry as a string because of move from gpu
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    workflow.transform(dataset).to_parquet(
        tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
    )

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac)

    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)