def test_normalize_minmax(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    """NormalizeMinMax must rescale "x" to (x - min) / (max - min) using workflow stats."""
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_name = ["label"]

    # Collect min/max statistics for the continuous columns.
    # FIX(review): original called `nvt.workflow.get_new_config()` while the very
    # next statements (and the sibling tests) use the `nvtabular` name — unified
    # on `nvtabular`, which is demonstrably in scope in this block.
    config = nvtabular.workflow.get_new_config()
    config["PP"]["continuous"] = [ops.MinMax()]
    processor = nvtabular.Workflow(
        cat_names=cat_names, cont_names=cont_names, label_name=label_name, config=config
    )
    processor.update_stats(dataset)

    # Apply the transform op against the gathered statistics.
    op = ops.NormalizeMinMax()
    columns_ctx = {}
    columns_ctx["continuous"] = {}
    columns_ctx["continuous"]["base"] = cont_names
    new_gdf = op.apply_op(df, columns_ctx, "continuous", stats_context=processor.stats)

    # Reference computation: min-max scale "x" by hand and compare element-wise.
    df["x"] = (df["x"] - processor.stats["mins"]["x"]) / (
        processor.stats["maxs"]["x"] - processor.stats["mins"]["x"]
    )
    assert new_gdf["x"].equals(df["x"])
def test_minmax(tmpdir, client, df, dataset, gpu_memory_frac, engine, op_columns):
    """MinMax statistics recorded by the workflow must match the raw dataframe."""
    categoricals = ["name-string"]
    if engine == "parquet":
        categoricals = ["name-cat", "name-string"]
    continuous = ["x", "y"]
    labels = ["label"]

    # Run only the MinMax stat op over every column group.
    cfg = nvtabular.workflow.get_new_config()
    cfg["PP"]["all"] = [ops.MinMax(columns=op_columns)]
    processor = nvtabular.Workflow(
        cat_names=categoricals, cont_names=continuous, label_name=labels, config=cfg
    )
    processor.update_stats(dataset)

    # "x" is covered whether or not op_columns restricts the op.
    assert df["x"].min() == pytest.approx(processor.stats["mins"]["x"], 1e-2)
    assert df["x"].max() == pytest.approx(processor.stats["maxs"]["x"], 1e-2)

    if not op_columns:
        # With no column restriction the op also records stats for "y" and the
        # string column.
        strings = df["name-string"].tolist()
        assert min(strings) == processor.stats["mins"]["name-string"]
        assert df["y"].max() == processor.stats["maxs"]["y"]
        assert max(strings) == processor.stats["maxs"]["name-string"]
        assert df["y"].min() == processor.stats["mins"]["y"]
def test_minmax_dataset_iterator(tmpdir, datasets, gpu_memory_frac, engine, op_columns):
    """MinMax stats computed via GPUDatasetIterator must match a manual cudf concat.

    FIX(review): this was a second ``def test_minmax`` in the same module, which
    shadowed the earlier definition so pytest could only ever collect one of the
    two — renamed so both tests run.
    """
    # Load the raw files directly to build the reference dataframe.
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    else:
        df1 = cudf.read_csv(paths[0], header=False, names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=False, names=allcols_csv)[mycols_csv]
    df = cudf.concat([df1, df2], axis=0)
    df["id"] = df["id"].astype("int64")

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
        columns = mycols_pq
    else:
        cat_names = ["name-string"]
        columns = mycols_csv
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    # Feed the same files through the chunked iterator used by the workflow.
    data_itr = nvtabular.io.GPUDatasetIterator(
        paths,
        columns=columns,
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
        names=allcols_csv,
    )

    config = nvtabular.workflow.get_new_config()
    config["PP"]["all"] = [ops.MinMax(columns=op_columns)]
    processor = nvtabular.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
        to_cpu=False,
    )
    processor.update_stats(data_itr)

    # "x" is covered whether or not op_columns restricts the op.
    x_min = df["x"].min()
    assert x_min == pytest.approx(processor.stats["mins"]["x"], 1e-2)
    x_max = df["x"].max()
    assert x_max == pytest.approx(processor.stats["maxs"]["x"], 1e-2)
    if not op_columns:
        # Without a column restriction, "y" and the string column get stats too.
        name_min = min(df["name-string"].tolist())
        name_max = max(df["name-string"].tolist())
        assert name_min == processor.stats["mins"]["name-string"]
        y_max = df["y"].max()
        y_min = df["y"].min()
        assert y_max == processor.stats["maxs"]["y"]
        assert name_max == processor.stats["maxs"]["name-string"]
        assert y_min == processor.stats["mins"]["y"]
    # NOTE(review): returning a value from a pytest test is unusual (pytest
    # ignores/warns on it) — kept in case a caller uses this as a helper; verify.
    return processor.ds_exports
def req_stats(self):
    """Return the statistics operators this op depends on: a single MinMax."""
    required = [ops.MinMax()]
    return required