def test_gpu_file_iterator_dl(datasets, batch, dskey):
    paths = glob.glob(str(datasets[dskey]) + "/*.csv")
    names = allcols_csv if dskey == "csv-no-header" else None
    header = None if dskey == "csv-no-header" else 0
    # Read the file directly with cudf to build the expected frame,
    # mirroring the header handling of the iterator under test.
    df_expect = cudf.read_csv(paths[0], header=header, names=names)[mycols_csv]
    df_expect["id"] = df_expect["id"].astype("int64")
    df_itr = cudf.DataFrame()
    processor = nvt.Workflow(
        cat_names=["name-string"], cont_names=["x", "y", "id"], label_name=["label"]
    )
    data_itr = torch_dataloader.FileItrDataset(
        paths[0],
        engine="csv",
        batch_size=batch,
        gpu_memory_frac=0.01,
        columns=mycols_csv,
        names=names,
    )
    data_chain = torch.utils.data.ChainDataset([data_itr])
    dlc = torch_dataloader.DLCollator(processor)
    # Constructing the loader exercises the collator wiring; the iteration
    # below reads the raw FileItrDataset chunks directly.
    torch_dataloader.DLDataLoader(
        data_itr, collate_fn=dlc.gdf_col, pin_memory=False, num_workers=0
    )

    # Accumulate the iterator's chunks and compare against the directly read frame.
    for data_gd in data_itr:
        df_itr = cudf.concat([df_itr, data_gd], axis=0) if df_itr else data_gd

    assert len(data_itr) == len(data_chain)
    assert_eq(df_itr.reset_index(drop=True), df_expect.reset_index(drop=True))
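
# A hedged usage sketch (not part of the original tests): how the
# FileItrDataset -> ChainDataset -> DLCollator -> DLDataLoader pieces
# exercised above are typically wired together. The helper name and the
# batch-counting logic are illustrative assumptions, not NVTabular API
# guarantees.
def _count_loader_batches(paths, processor, batch_size, names=None):
    # One iterator dataset per input file, chained into a single stream.
    file_datasets = [
        torch_dataloader.FileItrDataset(
            path,
            engine="csv",
            batch_size=batch_size,
            gpu_memory_frac=0.01,
            names=names,
        )
        for path in paths
    ]
    chain = torch.utils.data.ChainDataset(file_datasets)
    # DLCollator converts each cudf chunk coming out of the chain into
    # tensors via its gdf_col collate function.
    dlc = torch_dataloader.DLCollator(processor)
    dl = torch_dataloader.DLDataLoader(
        chain, collate_fn=dlc.gdf_col, pin_memory=False, num_workers=0
    )
    # Count the collated batches produced by one full pass over the loader.
    return sum(1 for _ in dl)
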
def test_gpu_preproc(tmpdir, df, dataset, dump, gpu_memory_frac, engine, preprocessing):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    processor.add_feature([ops.FillMedian(), ops.LogOp(preprocessing=preprocessing)])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    processor.update_stats(dataset)

    if dump:
        # Round-trip the collected statistics through a config file.
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        ser_median = tar.dropna().quantile(0.5, interpolation="linear")
        gdf = tar.fillna(ser_median)
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std; the reference values must reproduce the ops applied
    # before Normalize (FillMedian, then LogOp).
    x_col = "x" if preprocessing else "x_LogOp"
    y_col = "y" if preprocessing else "y_LogOp"
    assert math.isclose(get_norms(df.x).mean(), processor.stats["means"][x_col], rel_tol=1e-2)
    assert math.isclose(get_norms(df.y).mean(), processor.stats["means"][y_col], rel_tol=1e-2)
    assert math.isclose(get_norms(df.x).std(), processor.stats["stds"][x_col], rel_tol=1e-2)
    assert math.isclose(get_norms(df.y).std(), processor.stats["stds"][y_col], rel_tol=1e-2)

    # Check median (TODO: improve the accuracy)
    x_median = df.x.dropna().quantile(0.5, interpolation="linear")
    y_median = df.y.dropna().quantile(0.5, interpolation="linear")
    id_median = df.id.dropna().quantile(0.5, interpolation="linear")
    assert math.isclose(x_median, processor.stats["medians"]["x"], rel_tol=1e1)
    assert math.isclose(y_median, processor.stats["medians"]["y"], rel_tol=1e1)
    assert math.isclose(id_median, processor.stats["medians"]["id"], rel_tol=1e1)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(processor, "name-cat")
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(processor, "name-string")
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(tmpdir, dataset, nfiles=10, shuffle=True, apply_ops=True)

    processor.create_final_cols()

    # When LogOp runs as a feature (preprocessing=False), it should add new
    # <col>_LogOp columns to the final continuous column context.
    if not preprocessing:
        for col in cont_names:
            assert f"{col}_LogOp" in processor.columns_ctx["final"]["cols"]["continuous"]

    dlc = torch_dataloader.DLCollator(preproc=processor, apply_ops=False)
    data_files = [
        torch_dataloader.FileItrDataset(
            x, use_row_groups=True, gpu_memory_frac=gpu_memory_frac, names=allcols_csv
        )
        for x in glob.glob(str(tmpdir) + "/*.parquet")
    ]

    data_itr = torch.utils.data.ChainDataset(data_files)
    dl = torch_dataloader.DLDataLoader(
        data_itr, collate_fn=dlc.gdf_col, pin_memory=False, num_workers=0
    )

    # Count the rows that come out of the dataloader over the written dataset.
    len_df_pp = 0
    for chunk in dl:
        len_df_pp += len(chunk[0][0])

    dataset = Dataset(glob.glob(str(tmpdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac)
    x = processor.ds_to_tensors(dataset.to_iter(), apply_ops=False)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")

    assert len(x[0]) == len_df_pp

    # Re-iterate the tensors through TensorItrDataset and verify the row count matches.
    itr_ds = torch_dataloader.TensorItrDataset([x[0], x[1], x[2]], batch_size=512000)
    count_tens_itr = 0
    for data_gd in itr_ds:
        count_tens_itr += len(data_gd[1])
        assert data_gd[0].shape[1] > 0
        assert data_gd[1].shape[1] > 0
    assert len_df_pp == count_tens_itr
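
# Hedged sketch (assumption): test_gpu_preproc above relies on a get_cats
# helper defined elsewhere in the test suite. A minimal stand-in consistent
# with how the test uses it (an array exposing .tolist(), led by a null
# category) might look like the following; the real helper may differ.
def _get_cats_sketch(processor, col, stat_name="categories"):
    # Categorify records, per column, where its category mapping lives.
    cats = processor.stats[stat_name][col]
    # If the mapping is stored on disk (e.g. as parquet), read it back so the
    # unique category values can be compared on the host.
    if isinstance(cats, str):
        cats = cudf.read_parquet(cats)[col]
    return cats.values_host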