def test_median(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    """Compare medians gathered by ``ops.Median`` with exact quantiles of ``df``.

    A workflow whose only continuous-preprocessing entry is
    ``ops.Median(columns=op_columns)`` gathers statistics from ``dataset``.
    The values found in ``processor.stats["medians"]`` are then compared with
    the 0.5 quantile of the matching ``df`` columns: ``x`` is always checked,
    ``y`` and ``id`` only when ``op_columns`` is falsy.

    ``tmpdir`` and ``gpu_memory_frac`` are accepted but not referenced in the
    body (presumably pytest fixtures / parametrization -- confirm in conftest).
    """

    def exact_median(column):
        # Reference value: 0.5 quantile of the non-null entries of df.<column>.
        return getattr(df, column).dropna().quantile(0.5, interpolation="linear")

    # The parquet engine carries one more categorical column than the others.
    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    config = nvt.workflow.get_new_config()
    config["PP"]["continuous"] = [ops.Median(columns=op_columns)]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
    )
    processor.update_stats(dataset)

    # Check median (TODO: Improve the accuracy)
    # NOTE(review): rel_tol=1e1 allows a difference of up to 10x the larger
    # magnitude, so math.isclose accepts any two finite numbers here; these
    # assertions only guard against missing keys and NaN/inf results.
    expected_x = exact_median("x")
    assert math.isclose(expected_x, processor.stats["medians"]["x"], rel_tol=1e1)

    if op_columns:
        # An explicit column selection was given: only "x" is verified.
        return

    expected_y = exact_median("y")
    expected_id = exact_median("id")
    assert math.isclose(expected_y, processor.stats["medians"]["y"], rel_tol=1e1)
    assert math.isclose(expected_id, processor.stats["medians"]["id"], rel_tol=1e1)
def test_median(tmpdir, datasets, gpu_memory_frac, engine, op_columns):
    """Compare medians gathered by ``ops.Median`` with exact quantiles from cudf.

    The first two files matching the engine's extension under
    ``datasets[engine]`` are read with cudf and concatenated into a reference
    frame. The full list of matching paths is streamed through a
    ``GPUDatasetIterator`` into a workflow whose only continuous-preprocessing
    entry is ``ops.Median(columns=op_columns)``. The values found in
    ``processor.stats["medians"]`` are compared with the 0.5 quantile of the
    reference columns: ``x`` always, ``y`` and ``id`` only when ``op_columns``
    is falsy.

    ``tmpdir`` is accepted but not referenced in the body.

    Returns:
        ``processor.ds_exports`` of the workflow after statistics collection.
    """
    extension = engine.split("-")[0]
    paths = glob.glob(str(datasets[engine]) + "/*." + extension)

    # Engine-specific column layout and file reader.
    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
        columns = mycols_pq

        def load(path):
            return cudf.read_parquet(path)[mycols_pq]

    else:
        cat_names = ["name-string"]
        columns = mycols_csv

        def load(path):
            return cudf.read_csv(path, header=False, names=allcols_csv)[mycols_csv]

    # Reference frame built from the first two files only.
    df = cudf.concat([load(paths[0]), load(paths[1])], axis=0)
    df["id"] = df["id"].astype("int64")

    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    data_itr = nvtabular.io.GPUDatasetIterator(
        paths,
        columns=columns,
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
        names=allcols_csv,
    )

    config = nvt.workflow.get_new_config()
    config["PP"]["continuous"] = [ops.Median(columns=op_columns)]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
        to_cpu=False,
    )
    processor.update_stats(data_itr)

    def exact_median(column):
        # Reference value: 0.5 quantile of the non-null entries of df.<column>.
        return getattr(df, column).dropna().quantile(0.5, interpolation="linear")

    # Check median (TODO: Improve the accuracy)
    # NOTE(review): rel_tol=1e1 allows a difference of up to 10x the larger
    # magnitude, so math.isclose accepts any two finite numbers here.
    expected_x = exact_median("x")
    assert math.isclose(expected_x, processor.stats["medians"]["x"], rel_tol=1e1)

    if not op_columns:
        # No explicit column selection: also verify the other continuous columns.
        expected_y = exact_median("y")
        expected_id = exact_median("id")
        assert math.isclose(expected_y, processor.stats["medians"]["y"], rel_tol=1e1)
        assert math.isclose(expected_id, processor.stats["medians"]["id"], rel_tol=1e1)

    return processor.ds_exports
def req_stats(self):
    """Return the statistics required here: a list with one ``ops.Median``.

    A new list containing a newly constructed ``ops.Median()`` (default
    arguments) is built on every call.
    """
    required = [ops.Median()]
    return required