def test_dataloader_epochs(datasets, engine, batch_size, epochs, on_ddf):
    dataset = Dataset(str(datasets["parquet"]), engine=engine)

    if on_ddf:
        dataset = dataset.to_ddf()

    cont_names = ["x", "y", "id"]
    cat_names = ["name-string", "name-cat"]
    label_name = ["label"]

    data_loader = DataLoader(
        dataset,
        cat_names=cat_names,
        cont_names=cont_names,
        batch_size=batch_size,
        label_names=label_name,
        shuffle=False,
    )

    # Convert to iterators and then to DataFrames
    df1 = _concat(list(data_loader._buff.itr))
    df2 = _concat(list(data_loader.epochs(epochs)._buff.itr))

    # Check that the DataFrame sizes and rows make sense
    assert len(df2) == epochs * len(df1)
    assert_eq(
        _concat([df1 for i in range(epochs)]).reset_index(drop=True),
        df2.reset_index(drop=True),
    )
def test_workflow_node_select():
    df = dispatch._make_df({"a": [1, 4, 9, 16, 25], "b": [0, 1, 2, 3, 4], "c": [25, 16, 9, 4, 1]})
    dataset = Dataset(df)

    input_features = WorkflowNode(ColumnSelector(["a", "b", "c"]))
    # pylint: disable=unnecessary-lambda
    sqrt_features = input_features[["a", "c"]] >> (lambda col: np.sqrt(col))
    plus_one_features = input_features["b"] >> (lambda col: col + 1)
    features = sqrt_features + plus_one_features

    workflow = Workflow(features)
    workflow.fit(dataset)

    df_out = workflow.transform(dataset).to_ddf().compute(scheduler="synchronous")

    expected = dispatch._make_df()
    expected["a"] = np.sqrt(df["a"])
    expected["c"] = np.sqrt(df["c"])
    expected["b"] = df["b"] + 1

    assert_eq(expected, df_out)
def test_gpu_file_iterator_ds(df, dataset, batch, engine):
    # Start from None so the `is not None` check below is meaningful; the original
    # initialized to an empty DataFrame, which made the else-branch dead code.
    df_itr = None
    for data_gd in dataset.to_iter(columns=mycols_csv):
        df_itr = nvt.dispatch._concat([df_itr, data_gd], axis=0) if df_itr is not None else data_gd

    assert_eq(df_itr.reset_index(drop=True), df.reset_index(drop=True))
def test_workflow_move_saved(tmpdir):
    raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split()
    data = nvt.dispatch._make_df({"geo": raw})

    geo_location = ColumnSelector(["geo"])
    state = (
        geo_location
        >> ops.LambdaOp(lambda col: col.str.slice(0, 5))
        >> ops.Rename(postfix="_state")
    )
    country = (
        geo_location
        >> ops.LambdaOp(lambda col: col.str.slice(0, 2))
        >> ops.Rename(postfix="_country")
    )
    geo_features = state + country + geo_location >> ops.Categorify()

    # create the workflow and transform the input
    workflow = Workflow(geo_features)
    expected = workflow.fit_transform(Dataset(data)).to_ddf().compute()

    # save the workflow (including categorical mapping parquet files)
    # and then verify we can load the saved workflow after moving the directory
    out_path = os.path.join(tmpdir, "output", "workflow")
    workflow.save(out_path)

    moved_path = os.path.join(tmpdir, "output", "workflow2")
    shutil.move(out_path, moved_path)
    workflow2 = Workflow.load(moved_path)

    # also check that when transforming our input we get the same results after loading
    transformed = workflow2.transform(Dataset(data)).to_ddf().compute()
    assert_eq(expected, transformed)
def test_log(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns, cpu):
    cont_features = op_columns >> nvt.ops.LogOp()
    processor = nvt.Workflow(cont_features)
    processor.fit(dataset)
    new_df = processor.transform(dataset).to_ddf().compute()

    for col in op_columns:
        values = dispatch._array(new_df[col])
        original = dispatch._array(df[col])
        assert_eq(values, np.log(original.astype(np.float32) + 1))
def test_logop_lists(tmpdir, cpu):
    df = dispatch._make_df(device="cpu" if cpu else "gpu")
    df["vals"] = [[np.exp(0) - 1, np.exp(1) - 1], [np.exp(2) - 1], []]

    features = ["vals"] >> nvt.ops.LogOp()
    workflow = nvt.Workflow(features)
    new_df = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()

    expected = dispatch._make_df(device="cpu" if cpu else "gpu")
    expected["vals"] = [[0.0, 1.0], [2.0], []]

    assert_eq(expected, new_df)
def _verify_workflow_on_tritonserver(
    tmpdir,
    workflow,
    df,
    model_name,
    output_model="tensorflow",
    model_info=None,
    sparse_max=None,
):
    """tests that the nvtabular workflow produces the same results when run locally
    in the process, and when run in tritonserver"""
    # fit the workflow and test on the input
    dataset = nvt.Dataset(df)
    workflow.fit(dataset)

    local_df = workflow.transform(dataset).to_ddf().compute(scheduler="synchronous")

    for col in workflow.output_node.output_columns.names:
        if sparse_max and col in sparse_max.keys():
            workflow.output_dtypes[col] = workflow.output_dtypes.get(col).element_type

    triton.generate_nvtabular_model(
        workflow=workflow,
        name=model_name,
        output_path=tmpdir + f"/{model_name}",
        version=1,
        output_model=output_model,
        output_info=model_info,
        sparse_max=sparse_max,
        backend=BACKEND,
    )

    inputs = triton.convert_df_to_triton_input(df.columns, df)
    outputs = [grpcclient.InferRequestedOutput(col) for col in workflow.output_dtypes.keys()]
    with run_triton_server(tmpdir) as client:
        response = client.infer(model_name, inputs, outputs=outputs)

        for col in workflow.output_dtypes.keys():
            features = response.as_numpy(col)
            if sparse_max and col in sparse_max:
                features = features.tolist()
                triton_df = _make_df()
                triton_df[col] = features
            else:
                triton_df = _make_df({col: features.reshape(features.shape[0])})
            assert_eq(triton_df, local_df[[col]])
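# A minimal, hypothetical usage sketch of the helper above (the test name and
# input frame are illustrative, not part of the suite): fit a tiny Normalize
# workflow and check that it produces identical results through tritonserver.
def test_workflow_on_triton_example(tmpdir):
    df = _make_df({"x": [1.0, 2.0, 3.0], "y": [4.0, 5.0, 6.0]})
    # the helper fits the workflow itself, so we pass it unfitted
    workflow = nvt.Workflow(["x", "y"] >> nvt.ops.Normalize())
    _verify_workflow_on_tritonserver(tmpdir, workflow, df, "test_normalize")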
def test_generate_triton_model(tmpdir, engine, output_model, df):
    conts = ["x", "y", "id"] >> ops.FillMissing() >> ops.Normalize()
    cats = ["name-cat", "name-string"] >> ops.Categorify(cat_cache="host")
    workflow = nvt.Workflow(conts + cats)
    workflow.fit(nvt.Dataset(df))
    expected = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    # save workflow to triton / verify we see some expected output
    if output_model == "pytorch":
        model_info = {
            "name-cat": {"columns": ["name-cat"], "dtype": "int64"},
            "name-string": {"columns": ["name-string"], "dtype": "int64"},
            "id": {"columns": ["id"], "dtype": "float32"},
            "x": {"columns": ["x"], "dtype": "float32"},
            "y": {"columns": ["y"], "dtype": "float32"},
        }
    else:
        model_info = None

    repo = os.path.join(tmpdir, "models")
    triton.generate_nvtabular_model(
        workflow=workflow,
        name="model",
        output_path=repo,
        version=1,
        output_model=output_model,
        output_info=model_info,
    )
    workflow = None

    assert os.path.exists(os.path.join(repo, "config.pbtxt"))

    workflow = nvt.Workflow.load(os.path.join(repo, "1", "workflow"))
    transformed = workflow.transform(nvt.Dataset(df)).to_ddf().compute()
    assert_eq(expected, transformed)
def test_rename(cpu):
    DataFrame = pd.DataFrame if cpu else cudf.DataFrame
    df = DataFrame({"x": [1, 2, 3, 4, 5], "y": [6, 7, 8, 9, 10]})

    selector = ColumnSelector(["x", "y"])

    op = ops.Rename(f=lambda name: name.upper())
    transformed = op.transform(selector, df)
    expected = DataFrame({"X": [1, 2, 3, 4, 5], "Y": [6, 7, 8, 9, 10]})
    assert_eq(transformed, expected)

    op = ops.Rename(postfix="_lower")
    transformed = op.transform(selector, df)
    expected = DataFrame({"x_lower": [1, 2, 3, 4, 5], "y_lower": [6, 7, 8, 9, 10]})
    assert_eq(transformed, expected)

    selector = ColumnSelector(["x"])

    op = ops.Rename(name="z")
    transformed = op.transform(selector, df)
    expected = DataFrame({"z": [1, 2, 3, 4, 5]})
    assert_eq(transformed, expected)

    op = nvt.ops.Rename(f=lambda name: name.upper())
    transformed = op.transform(selector, df)
    expected = DataFrame({"X": [1, 2, 3, 4, 5]})
    assert_eq(transformed, expected)
def test_s3_dataset(s3_base, s3so, paths, datasets, engine, df):
    # Copy files to mock s3 bucket
    files = {}
    for i, path in enumerate(paths):
        with open(path, "rb") as f:
            fbytes = f.read()
        fn = path.split(os.path.sep)[-1]
        files[fn] = BytesIO()
        files[fn].write(fbytes)
        files[fn].seek(0)

    if engine == "parquet":
        # Workaround for nvt#539. In order to avoid the
        # bug in Dask's `create_metadata_file`, we need
        # to manually generate a "_metadata" file here.
        # This can be removed after dask#7295 is merged
        # (see https://github.com/dask/dask/pull/7295)
        fn = "_metadata"
        files[fn] = BytesIO()
        meta = create_metadata_file(
            paths,
            engine="pyarrow",
            out_dir=False,
        )
        meta.write_metadata_file(files[fn])
        files[fn].seek(0)

    with s3_context(s3_base=s3_base, bucket=engine, files=files):
        # Create nvt.Dataset from mock s3 paths
        url = f"s3://{engine}" if engine == "parquet" else f"s3://{engine}/*"
        dataset = nvt.Dataset(url, engine=engine, storage_options=s3so)

        # Check that the iteration API works
        columns = mycols_pq if engine == "parquet" else mycols_csv
        gdf = cudf.concat(list(dataset.to_iter()))[columns]
        assert_eq(gdf.reset_index(drop=True), df.reset_index(drop=True))

        cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
        cont_names = ["x", "y", "id"]
        label_name = ["label"]

        conts = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp()
        cats = cat_names >> ops.Categorify(cat_cache="host")

        processor = nvt.Workflow(conts + cats + label_name)
        processor.fit(dataset)
def test_target_encode_multi(tmpdir, npartitions, cpu):
    cat_1 = np.asarray(["baaaa"] * 12)
    cat_2 = np.asarray(["baaaa"] * 6 + ["bbaaa"] * 3 + ["bcaaa"] * 3)
    num_1 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4])
    num_2 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4]) * 2
    df = dispatch._make_df({"cat": cat_1, "cat2": cat_2, "num": num_1, "num_2": num_2})
    if cpu:
        df = dd.from_pandas(
            df if isinstance(df, pd.DataFrame) else df.to_pandas(), npartitions=npartitions
        )
    else:
        df = dask_cudf.from_cudf(df, npartitions=npartitions)

    cat_groups = ["cat", "cat2", ["cat", "cat2"]]
    te_features = cat_groups >> ops.TargetEncoding(
        ["num", "num_2"], out_path=str(tmpdir), kfold=1, p_smooth=5, out_dtype="float32"
    )

    workflow = nvt.Workflow(te_features)
    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    assert "TE_cat_cat2_num" in df_out.columns
    assert "TE_cat_num" in df_out.columns
    assert "TE_cat2_num" in df_out.columns
    assert "TE_cat_cat2_num_2" in df_out.columns
    assert "TE_cat_num_2" in df_out.columns
    assert "TE_cat2_num_2" in df_out.columns

    assert_eq(df_out["TE_cat2_num"].values, df_out["TE_cat_cat2_num"].values)
    assert_eq(df_out["TE_cat2_num_2"].values, df_out["TE_cat_cat2_num_2"].values)
    assert df_out["TE_cat_num"].iloc[0] != df_out["TE_cat2_num"].iloc[0]
    assert df_out["TE_cat_num_2"].iloc[0] != df_out["TE_cat2_num_2"].iloc[0]
    assert math.isclose(df_out["TE_cat_num"].iloc[0], num_1.mean(), abs_tol=1e-4)
    assert math.isclose(df_out["TE_cat_num_2"].iloc[0], num_2.mean(), abs_tol=1e-3)
def test_list_slice_pad(cpu):
    DataFrame = pd.DataFrame if cpu else cudf.DataFrame
    df = DataFrame({"y": [[0, 1, 2, 2, 767], [1, 2, 2, 3], [1, 223, 4]]})
    selector = ColumnSelector(["y"])

    # 0 pad to 5 elements
    op = ops.ListSlice(5, pad=True)
    transformed = op.transform(selector, df)
    expected = DataFrame({"y": [[0, 1, 2, 2, 767], [1, 2, 2, 3, 0], [1, 223, 4, 0, 0]]})
    assert_eq(transformed, expected)

    # make sure we can also pad when start != 0, and when pad_value is set
    op = ops.ListSlice(1, 6, pad=True, pad_value=123)
    transformed = op.transform(selector, df)
    expected = DataFrame(
        {"y": [[1, 2, 2, 767, 123], [2, 2, 3, 123, 123], [223, 4, 123, 123, 123]]}
    )
    assert_eq(transformed, expected)
def test_fit_simple():
    data = cudf.DataFrame({"x": [0, 1, 2, None, 0, 1, 2], "y": [None, 3, 4, 5, 3, 4, 5]})
    dataset = Dataset(data)

    workflow = Workflow(["x", "y"] >> ops.FillMedian() >> (lambda x: x * x))
    workflow.fit(dataset)
    transformed = workflow.transform(dataset).to_ddf().compute()

    expected = cudf.DataFrame({"x": [0, 1, 4, 1, 0, 1, 4], "y": [16, 9, 16, 25, 9, 16, 25]})
    assert_eq(expected, transformed)
def test_target_encode(tmpdir, cat_groups, kfold, fold_seed, cpu):
    df = dispatch._make_df(
        {
            "Author": list(string.ascii_uppercase),
            "Engaging-User": list(string.ascii_lowercase),
            "Cost": range(26),
            "Post": [0, 1] * 13,
        }
    )
    if cpu:
        df = dd.from_pandas(df if isinstance(df, pd.DataFrame) else df.to_pandas(), npartitions=3)
    else:
        df = dask_cudf.from_cudf(df, npartitions=3)

    cont_names = ["Cost"]
    te_features = cat_groups >> ops.TargetEncoding(
        cont_names,
        out_path=str(tmpdir),
        kfold=kfold,
        out_dtype="float32",
        fold_seed=fold_seed,
        drop_folds=False,  # Keep folds to validate
    )

    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp()
    workflow = nvt.Workflow(te_features + cont_features + ["Author", "Engaging-User"])
    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    df_lib = dispatch.get_lib()
    if kfold > 1:
        # Cat columns are unique.
        # Make sure __fold__ mapping is correct
        if cat_groups == "Author":
            name = "__fold___Author"
            cols = ["__fold__", "Author"]
        else:
            name = "__fold___Author_Engaging-User"
            cols = ["__fold__", "Author", "Engaging-User"]

        check = df_lib.read_parquet(te_features.op.stats[name])
        check = check[cols].sort_values(cols).reset_index(drop=True)
        df_out_check = df_out[cols].sort_values(cols).reset_index(drop=True)
        assert_eq(check, df_out_check, check_dtype=False)
def test_generate_triton_model(tmpdir, engine, df):
    conts = ["x", "y", "id"] >> ops.FillMissing() >> ops.Normalize()
    cats = ["name-cat", "name-string"] >> ops.Categorify(cat_cache="host")
    workflow = nvt.Workflow(conts + cats)
    workflow.fit(nvt.Dataset(df))
    expected = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    # save workflow to triton / verify we see some expected output
    repo = os.path.join(tmpdir, "models")
    triton.generate_nvtabular_model(workflow, "model", repo)
    workflow = None

    assert os.path.exists(os.path.join(repo, "config.pbtxt"))

    workflow = nvt.Workflow.load(os.path.join(repo, "1", "workflow"))
    transformed = workflow.transform(nvt.Dataset(df)).to_ddf().compute()
    assert_eq(expected, transformed)
def test_fit_simple():
    data = nvt.dispatch._make_df({"x": [0, 1, 2, None, 0, 1, 2], "y": [None, 3, 4, 5, 3, 4, 5]})
    dataset = Dataset(data)

    workflow = Workflow(["x", "y"] >> ops.FillMedian() >> (lambda x: x * x))
    workflow.fit(dataset)
    transformed = workflow.transform(dataset).to_ddf().compute()

    expected = nvt.dispatch._make_df({"x": [0, 1, 4, 1, 0, 1, 4], "y": [16, 9, 16, 25, 9, 16, 25]})
    if not HAS_GPU:
        transformed["x"] = transformed["x"].astype(expected["x"].dtype)
        transformed["y"] = transformed["y"].astype(expected["y"].dtype)
    assert_eq(expected, transformed)
def test_normalize_lists(tmpdir, cpu):
    df = dispatch._make_df(device="cpu" if cpu else "gpu")
    df["vals"] = [[0.0, 1.0, 2.0], [3.0, 4.0], [5.0]]

    features = ["vals"] >> nvt.ops.Normalize()
    workflow = nvt.Workflow(features)
    transformed = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()

    expected = _flatten_list_column_values(df["vals"]).astype("float32")
    expected = (expected - expected.mean()) / expected.std()
    expected_df = type(transformed)({"vals": expected})

    assert_eq(expected_df, _flatten_list_column(transformed["vals"]))
def test_workflow_transform_ddf_dtypes():
    # Initial Dataset
    df = cudf.datasets.timeseries().reset_index()
    ddf = dask_cudf.from_cudf(df, npartitions=2)
    dataset = Dataset(ddf)

    # Create and Execute Workflow
    cols = ["name", "x", "y", "timestamp"]
    cat_cols = ["id"] >> ops.Normalize()
    workflow = Workflow(cols + cat_cols)
    workflow.fit(dataset)
    transformed_ddf = workflow.transform(dataset).to_ddf()

    # no transforms on the pass through cols, should have original dtypes
    for col in cols:
        assert_eq(ddf.dtypes[col], transformed_ddf.dtypes[col])

    # Followup dask-cudf sorting used to throw an exception because of dtype issues,
    # check that it works now
    transformed_ddf.sort_values(["id", "timestamp"]).compute()
def test_column_group_select():
    df = cudf.DataFrame({"a": [1, 4, 9, 16, 25], "b": [0, 1, 2, 3, 4], "c": [25, 16, 9, 4, 1]})

    input_features = ColumnGroup(["a", "b", "c"])
    sqrt_features = input_features[["a", "c"]] >> cudf.sqrt
    plus_one_features = input_features["b"] >> (lambda col: col + 1)
    features = sqrt_features + plus_one_features

    workflow = Workflow(features)
    df_out = workflow.fit_transform(Dataset(df)).to_ddf().compute(scheduler="synchronous")

    expected = cudf.DataFrame()
    expected["a"] = cudf.sqrt(df["a"])
    expected["c"] = cudf.sqrt(df["c"])
    expected["b"] = df["b"] + 1

    assert_eq(expected, df_out)
def test_convert_format(_from, _to):
    convert_format = data_conversions.convert_format

    # we want to test conversion from '_from' to '_to' but this requires us roundtripping
    # from a known format. I'm picking pd -> _from -> _to -> pandas somewhat arbitrarily
    df = pd.DataFrame(
        {"float": [0.0, 1.0, 2.0], "int": [10, 11, 12], "multihot": [[0, 1, 2, 3], [3, 4], [5]]}
    )

    if _from != Supports.GPU_DICT_ARRAY and _to != Supports.GPU_DICT_ARRAY:
        df["string"] = ["aa", "bb", "cc"]
        df["multihot_string"] = [["aaaa", "bb", "cc"], ["dd", "ee"], ["fffffff"]]

    start, kind = convert_format(df, Supports.CPU_DATAFRAME, _from)
    assert kind == _from
    mid, kind = convert_format(start, kind, _to)
    assert kind == _to
    final, kind = convert_format(mid, kind, Supports.CPU_DATAFRAME)
    assert kind == Supports.CPU_DATAFRAME
    assert_eq(df, final)
def test_generate_triton_multihot(tmpdir):
    df = _make_df(
        {
            "userId": ["a", "a", "b"],
            "movieId": ["1", "2", "2"],
            "genres": [["action", "adventure"], ["action", "comedy"], ["comedy"]],
        }
    )

    cats = ["userId", "movieId", "genres"] >> nvt.ops.Categorify()
    workflow = nvt.Workflow(cats)
    workflow.fit(nvt.Dataset(df))
    expected = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    # save workflow to triton / verify we see some expected output
    repo = os.path.join(tmpdir, "models")
    triton.generate_nvtabular_model(workflow, "model", repo)
    workflow = None

    assert os.path.exists(os.path.join(repo, "config.pbtxt"))

    workflow = nvt.Workflow.load(os.path.join(repo, "1", "workflow"))
    transformed = workflow.transform(nvt.Dataset(df)).to_ddf().compute()
    assert_eq(expected, transformed)
def _verify_workflow_on_tritonserver(tmpdir, workflow, df, model_name):
    """tests that the nvtabular workflow produces the same results when run locally
    in the process, and when run in tritonserver"""
    # fit the workflow and test on the input
    dataset = nvt.Dataset(df)
    workflow.fit(dataset)

    local_df = workflow.transform(dataset).to_ddf().compute(scheduler="synchronous")
    triton.generate_nvtabular_model(
        workflow, model_name, tmpdir + f"/{model_name}", backend=BACKEND
    )

    inputs = triton.convert_df_to_triton_input(df.columns, df)
    with run_triton_server(tmpdir) as client:
        response = client.infer(model_name, inputs)

        for col in df.columns:
            features = response.as_numpy(col)
            triton_df = cudf.DataFrame({col: features.reshape(features.shape[0])})
            assert_eq(triton_df, local_df[[col]])
def test_transform_geolocation():
    raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split()
    data = nvt.dispatch._make_df({"geo_location": raw})

    geo_location = ColumnSelector(["geo_location"])
    state = (
        geo_location
        >> ops.LambdaOp(lambda col: col.str.slice(0, 5))
        >> ops.Rename(postfix="_state")
    )
    country = (
        geo_location
        >> ops.LambdaOp(lambda col: col.str.slice(0, 2))
        >> ops.Rename(postfix="_country")
    )
    geo_features = state + country + geo_location >> ops.HashBucket(num_buckets=100)

    # for this workflow we don't have any statoperators, so we can get away without fitting
    workflow = Workflow(geo_features)
    transformed = workflow.transform(Dataset(data)).to_ddf().compute()

    expected = nvt.dispatch._make_df()
    expected["geo_location_state"] = data["geo_location"].str.slice(0, 5).hash_values() % 100
    expected["geo_location_country"] = data["geo_location"].str.slice(0, 2).hash_values() % 100
    expected["geo_location"] = data["geo_location"].hash_values() % 100

    assert_eq(expected, transformed)
def test_list_slice(cpu):
    DataFrame = pd.DataFrame if cpu else cudf.DataFrame
    df = DataFrame({"y": [[0, 1, 2, 2, 767], [1, 2, 2, 3], [1, 223, 4]]})
    # pass a ColumnSelector rather than a bare list, matching the other op tests
    selector = ColumnSelector(["y"])

    op = ops.ListSlice(0, 2)
    transformed = op.transform(selector, df)
    expected = DataFrame({"y": [[0, 1], [1, 2], [1, 223]]})
    assert_eq(transformed, expected)

    op = ops.ListSlice(3, 5)
    transformed = op.transform(selector, df)
    expected = DataFrame({"y": [[2, 767], [3], []]})
    assert_eq(transformed, expected)

    op = ops.ListSlice(4, 10)
    transformed = op.transform(selector, df)
    expected = DataFrame({"y": [[767], [], []]})
    assert_eq(transformed, expected)

    op = ops.ListSlice(100, 20000)
    transformed = op.transform(selector, df)
    expected = DataFrame({"y": [[], [], []]})
    assert_eq(transformed, expected)

    op = ops.ListSlice(-4)
    transformed = op.transform(selector, df)
    expected = DataFrame({"y": [[1, 2, 2, 767], [1, 2, 2, 3], [1, 223, 4]]})
    assert_eq(transformed, expected)

    op = ops.ListSlice(-3, -1)
    transformed = op.transform(selector, df)
    expected = DataFrame({"y": [[2, 2], [2, 2], [1, 223]]})
    assert_eq(transformed, expected)
def test_s3_dataset(s3_base, s3so, paths, datasets, engine, df, patch_aiobotocore):
    # Copy files to mock s3 bucket
    files = {}
    for i, path in enumerate(paths):
        with open(path, "rb") as f:
            fbytes = f.read()
        fn = path.split(os.path.sep)[-1]
        files[fn] = BytesIO()
        files[fn].write(fbytes)
        files[fn].seek(0)

    if engine == "parquet":
        # Workaround for nvt#539. In order to avoid the
        # bug in Dask's `create_metadata_file`, we need
        # to manually generate a "_metadata" file here.
        # This can be removed after dask#7295 is merged
        # (see https://github.com/dask/dask/pull/7295)
        fn = "_metadata"
        files[fn] = BytesIO()
        meta = create_metadata_file(
            paths,
            engine="pyarrow",
            out_dir=False,
        )
        meta.write_metadata_file(files[fn])
        files[fn].seek(0)

    with s3_context(s3_base=s3_base, bucket=engine, files=files) as s3fs:
        # Create nvt.Dataset from mock s3 paths
        url = f"s3://{engine}" if engine == "parquet" else f"s3://{engine}/*"
        dataset = nvt.Dataset(url, engine=engine, storage_options=s3so)

        # Check that the iteration API works
        columns = mycols_pq if engine == "parquet" else mycols_csv
        gdf = nvt.dispatch._concat(list(dataset.to_iter()))[columns]
        assert_eq(gdf.reset_index(drop=True), df.reset_index(drop=True))

        cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
        cont_names = ["x", "y", "id"]
        label_name = ["label"]

        conts = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp()
        cats = cat_names >> ops.Categorify(cat_cache="host")

        processor = nvt.Workflow(conts + cats + label_name)
        processor.fit(dataset)

        # make sure we can write out the dataset back to S3
        # (https://github.com/NVIDIA-Merlin/NVTabular/issues/1214)
        processor.transform(dataset).to_parquet(f"s3://{engine}/output")
        expected = processor.transform(dataset).to_ddf().compute()

        # make sure we can write out the workflow to s3
        processor.save(f"s3://{engine}/saved_workflow/")

        # make sure the workflow got saved to the right spot in S3
        workflow_files = s3fs.ls(f"/{engine}/saved_workflow/")
        assert workflow_files

        # finally make sure we can read in the workflow from S3, and use it
        # to transform values and get the same result as on the local fs
        reloaded = nvt.Workflow.load(f"s3://{engine}/saved_workflow/")
        from_s3 = reloaded.transform(dataset).to_ddf().compute()
        assert_eq(expected, from_s3)
def test_gpu_dataset_iterator_csv(df, dataset, engine):
    df_itr = nvt.dispatch._concat(list(dataset.to_iter(columns=mycols_csv)), axis=0)
    assert_eq(df_itr.reset_index(drop=True), df.reset_index(drop=True))