def test_inspect_datagen(tmpdir, datasets, engine, dist):
    # Dataset
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    # Dataset column-type config
    columns_dict = {}
    columns_dict["cats"] = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    columns_dict["conts"] = ["x", "y"]
    columns_dict["labels"] = ["label"]

    # Create inspector and inspect
    output_inspect1 = tmpdir + "/dataset_info1.json"
    dataset = Dataset(paths, engine=engine)
    a = datains.DatasetInspector()
    a.inspect(dataset, columns_dict, output_inspect1)
    assert os.path.isfile(output_inspect1)

    # Generate dataset using the data_gen tool
    output_datagen = tmpdir + "/datagen"
    os.mkdir(output_datagen)
    with fsspec.open(output_inspect1) as f:
        output1 = json.load(f)
    cols = datagen._get_cols_from_schema(output1)
    if dist == "uniform":
        df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.00001)
    else:
        df_gen = datagen.DatasetGen(datagen.PowerLawDistro(0.1), gpu_frac=0.00001)
    output_datagen_files = df_gen.full_df_create(
        output1["num_rows"], cols, entries=True, output=output_datagen
    )

    # Inspect again and check that the outputs match
    output_inspect2 = tmpdir + "/dataset_info2.json"
    dataset = Dataset(output_datagen_files, engine=engine)
    a.inspect(dataset, columns_dict, output_inspect2)
    assert os.path.isfile(output_inspect2)

    # Compare JSON outputs
    with fsspec.open(output_inspect2) as f:
        output2 = json.load(f)
    for k1 in output1.keys():
        if k1 == "num_rows":
            assert output1[k1] == output2[k1]
        else:
            for k2 in output1[k1].keys():
                for k3 in output1[k1][k2].keys():
                    if k3 == "dtype":
                        if output1[k1][k2][k3] == "object":
                            assert (
                                output1[k1][k2][k3] == output2[k1][k2][k3]
                                or output2[k1][k2][k3] == "int64"
                            )
                        else:
                            assert output1[k1][k2][k3] == output2[k1][k2][k3]
                    else:
                        assert output1[k1][k2][k3] == pytest.approx(
                            output2[k1][k2][k3], rel=1e-0, abs=1e-0
                        )
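# Shape of the inspector JSON implied by the comparison loop above. Only the
# top-level "num_rows" key and the per-column "dtype" key are confirmed by the
# test; the remaining group, column, and statistic names are hypothetical.
#
# {
#     "num_rows": 1000,
#     "cats":   {"name-string": {"dtype": "object",  ...}, ...},
#     "conts":  {"x":           {"dtype": "float64", ...}, ...},
#     "labels": {"label":       {"dtype": "int64",   ...}, ...}
# }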
class FileItrDataset(torch.utils.data.IterableDataset):
    gpu_itr = None

    def __init__(self, file, **kwargs):
        columns = kwargs.pop("columns", None)
        self.gpu_itr = Dataset(file, **kwargs).to_iter(columns=columns)

    def __iter__(self):
        return self.gpu_itr.__iter__()

    def __len__(self):
        return len(self.gpu_itr)
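# A minimal usage sketch (not from the original source): one FileItrDataset
# per parquet file, chained so torch iterates the files sequentially. The glob
# pattern and helper name below are hypothetical.
import glob

import torch


def _chain_file_datasets(pattern="/tmp/processed/*.parquet"):
    # Wrap each file in its own IterableDataset; Dataset handles the reading.
    data_files = [FileItrDataset(path) for path in sorted(glob.glob(pattern))]
    # ChainDataset concatenates the per-file iterators into a single stream.
    return torch.utils.data.ChainDataset(data_files)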
def __init__(self, path, sub_batch_size=1, cats=None, conts=None, labels=None, pin_memory=False, **kwargs):
    self.apply_ops = kwargs.get("apply_ops", False)
    self.cat_cols = cats
    self.cont_cols = conts
    self.label_cols = labels
    self.itr = Dataset(path, **kwargs).to_iter(columns=cats + conts + labels)
    self.batch_size = sub_batch_size
    self.num_chunks = len(self.itr)
def main(args):
    # Get device configuration
    device_size = device_mem_size(kind="total")
    device_limit = int(args.device_limit_frac * device_size)
    device_pool_size = int(args.device_pool_frac * device_size)
    part_size = int(args.part_mem_frac * device_size)

    # Get dataset columns
    with fsspec.open(args.config_file) as f:
        config = json.load(f)

    # Create Dataset
    dataset = Dataset(args.data_path, engine=args.format, part_size=part_size)

    # Call Inspector
    with managed_client(args.devices, device_limit, args.protocol) as client:
        setup_rmm_pool(client, device_pool_size)
        a = datains.DatasetInspector(client)
        a.inspect(dataset, config, args.output_file)
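# A sketch of the argument parser that main() assumes. The flag names are
# inferred from the attributes accessed above and are hypothetical; the real
# script may define them differently.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description="Inspect a dataset with DatasetInspector")
    parser.add_argument("--data_path", type=str, help="input dataset path")
    parser.add_argument("--config_file", type=str, help="JSON file with the column-type config")
    parser.add_argument("--output_file", type=str, help="where to write the inspection JSON")
    parser.add_argument("--format", type=str, default="parquet", help="dataset file format")
    parser.add_argument("--devices", type=str, help="comma-separated GPU ids for the dask cluster")
    parser.add_argument("--protocol", type=str, default="tcp", help="dask communication protocol")
    parser.add_argument("--device_limit_frac", type=float, default=0.8)
    parser.add_argument("--device_pool_frac", type=float, default=0.9)
    parser.add_argument("--part_mem_frac", type=float, default=0.125)
    return parser.parse_args()


if __name__ == "__main__":
    main(parse_args())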
def test_full_df(num_rows, tmpdir, distro):
    json_sample["num_rows"] = num_rows
    cats = list(json_sample["cats"].keys())
    cols = datagen._get_cols_from_schema(json_sample, distros=distro)

    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.00001)
    df_files = df_gen.full_df_create(num_rows, cols, entries=True, output=tmpdir)
    test_size = 0
    full_df = _make_df()
    for fi in df_files:
        df = Dataset(fi).to_ddf().compute()
        test_size = test_size + df.shape[0]
        full_df = _concat([full_df, df])
    assert test_size == num_rows

    conts_rep = cols["conts"]
    cats_rep = cols["cats"]
    labels_rep = cols["labels"]
    assert df.shape[1] == len(conts_rep) + len(cats_rep) + len(labels_rep)
    for idx, cat in enumerate(cats[1:]):
        dist = cats_rep[idx + 1].distro or df_gen.dist
        if HAS_GPU:
            if not _is_string_dtype(full_df[cat]._column):
                sts, ps = dist.verify(full_df[cat].to_pandas())
                assert all(s > 0.9 for s in sts)
        else:
            if not _is_string_dtype(full_df[cat]):
                sts, ps = dist.verify(full_df[cat])
                assert all(s > 0.9 for s in sts)
        # these are not multi-hot series, so compare against each column's own config
        assert full_df[cat].nunique() == cats_rep[idx + 1].cardinality
        assert full_df[cat].str.len().min() == cats_rep[idx + 1].min_entry_size
        assert full_df[cat].str.len().max() == cats_rep[idx + 1].max_entry_size
    # check the multi-hot list for cat 0 only
    if HAS_GPU:
        check_ser = _make_df(list(full_df[cats[0]]._column.elements.values_host))[0]
    else:
        check_ser = _pull_apart_list(full_df[cats[0]])[0]
    assert check_ser.nunique() == cats_rep[0].cardinality
    assert check_ser.str.len().min() == cats_rep[0].min_entry_size
    assert check_ser.str.len().max() == cats_rep[0].max_entry_size
proc.add_feature([ZeroFill(replace=True), LogOp(replace=True)])
proc.add_preprocess(Normalize(replace=True))
if int(args.freq_thresh) == 0:
    proc.add_preprocess(Categorify(replace=True, out_path=args.out_dir))
else:
    proc.add_preprocess(
        Categorify(
            replace=True,
            use_frequency=True,
            freq_threshold=int(args.freq_thresh),
            out_path=args.out_dir,
        )
    )

print("Creating Dataset Iterator")
dataset_args = {"sep": "\t"} if args.in_file_type == "csv" else {}
trains_ds = Dataset(
    train_set,
    engine=args.in_file_type,
    part_mem_fraction=float(args.gpu_mem_frac),
    **dataset_args,
)
valids_ds = Dataset(
    valid_set,
    engine=args.in_file_type,
    part_mem_fraction=float(args.gpu_mem_frac),
    **dataset_args,
)

print("Running apply")
out_train = os.path.join(args.out_dir, "train")
out_valid = os.path.join(args.out_dir, "valid")

start = time()
proc.apply(
    trains_ds,
    apply_offline=True,
    record_stats=True,
def test_gpu_preproc(tmpdir, df, dataset, dump, gpu_memory_frac, engine, preprocessing):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    processor.add_feature([ops.FillMedian(), ops.LogOp(preprocessing=preprocessing)])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    processor.update_stats(dataset)

    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        ser_median = tar.dropna().quantile(0.5, interpolation="linear")
        gdf = tar.fillna(ser_median)
        gdf = np.log(gdf + 1)
        return gdf

    # Check the means and stds recorded for the continuous columns
    # (after FillMedian and LogOp)
    x_col = "x" if preprocessing else "x_LogOp"
    y_col = "y" if preprocessing else "y_LogOp"
    assert math.isclose(get_norms(df.x).mean(), processor.stats["means"][x_col], rel_tol=1e-2)
    assert math.isclose(get_norms(df.y).mean(), processor.stats["means"][y_col], rel_tol=1e-2)
    assert math.isclose(get_norms(df.x).std(), processor.stats["stds"][x_col], rel_tol=1e-2)
    assert math.isclose(get_norms(df.y).std(), processor.stats["stds"][y_col], rel_tol=1e-2)

    # Check median (TODO: improve the accuracy)
    x_median = df.x.dropna().quantile(0.5, interpolation="linear")
    y_median = df.y.dropna().quantile(0.5, interpolation="linear")
    id_median = df.id.dropna().quantile(0.5, interpolation="linear")
    assert math.isclose(x_median, processor.stats["medians"]["x"], rel_tol=1e1)
    assert math.isclose(y_median, processor.stats["medians"]["y"], rel_tol=1e1)
    assert math.isclose(id_median, processor.stats["medians"]["id"], rel_tol=1e1)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(processor, "name-cat")
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(processor, "name-string")
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(tmpdir, dataset, nfiles=10, shuffle=True, apply_ops=True)

    processor.create_final_cols()

    # a non-replacing LogOp should append "_LogOp" columns to the final continuous set
    if not preprocessing:
        for col in cont_names:
            assert f"{col}_LogOp" in processor.columns_ctx["final"]["cols"]["continuous"]

    dlc = torch_dataloader.DLCollator(preproc=processor, apply_ops=False)
    data_files = [
        torch_dataloader.FileItrDataset(
            x, use_row_groups=True, gpu_memory_frac=gpu_memory_frac, names=allcols_csv
        )
        for x in glob.glob(str(tmpdir) + "/*.parquet")
    ]

    data_itr = torch.utils.data.ChainDataset(data_files)
    dl = torch_dataloader.DLDataLoader(
        data_itr, collate_fn=dlc.gdf_col, pin_memory=False, num_workers=0
    )

    len_df_pp = 0
    for chunk in dl:
        len_df_pp += len(chunk[0][0])

    dataset = Dataset(glob.glob(str(tmpdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac)
    x = processor.ds_to_tensors(dataset.to_iter(), apply_ops=False)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert len(x[0]) == len_df_pp

    itr_ds = torch_dataloader.TensorItrDataset([x[0], x[1], x[2]], batch_size=512000)
    count_tens_itr = 0
    for data_gd in itr_ds:
        count_tens_itr += len(data_gd[1])
        assert data_gd[0].shape[1] > 0
        assert data_gd[1].shape[1] > 0
    assert len_df_pp == count_tens_itr
def test_gpu_workflow_api(tmpdir, client, df, dataset, gpu_memory_frac, engine, dump, op_columns, use_client):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        client=client if use_client else None,
    )

    processor.add_feature([ops.ZeroFill(columns=op_columns), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify(cat_cache="host"))
    processor.finalize()

    processor.update_stats(dataset)

    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        gdf = np.log(gdf + 1)
        return gdf

    # Check the means and stds recorded for the continuous columns
    # (after ZeroFill and LogOp); y is only transformed when op_columns is None
    if not op_columns:
        assert math.isclose(get_norms(df.y).mean(), processor.stats["means"]["y"], rel_tol=1e-1)
        assert math.isclose(get_norms(df.y).std(), processor.stats["stds"]["y"], rel_tol=1e-1)
    assert math.isclose(get_norms(df.x).mean(), processor.stats["means"]["x"], rel_tol=1e-1)
    assert math.isclose(get_norms(df.x).std(), processor.stats["stds"]["x"], rel_tol=1e-1)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(processor, "name-cat")
        # the None entry is added as a string by the move from GPU
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(processor, "name-string")
    # the None entry is added as a string by the move from GPU
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(
        tmpdir, dataset, out_files_per_proc=10, shuffle="partial", apply_ops=True
    )

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
def test_gpu_workflow_config(tmpdir, client, df, dataset, gpu_memory_frac, engine, dump, replace):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    config = nvt.workflow.get_new_config()
    # add operators with dependencies
    config["FE"]["continuous"] = [[ops.FillMissing(replace=replace), ops.LogOp(replace=replace)]]
    config["PP"]["continuous"] = [[ops.LogOp(replace=replace), ops.Normalize()]]
    config["PP"]["categorical"] = [ops.Categorify()]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
        client=client,
    )

    processor.update_stats(dataset)

    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        ser_median = tar.dropna().quantile(0.5, interpolation="linear")
        gdf = tar.fillna(ser_median)
        gdf = np.log(gdf + 1)
        return gdf

    # Check the means and stds recorded for the continuous columns (after
    # FillMissing and LogOp); non-replacing ops append their names to the key
    concat_ops = "_FillMissing_LogOp"
    if replace:
        concat_ops = ""
    assert math.isclose(get_norms(df.x).mean(), processor.stats["means"]["x" + concat_ops], rel_tol=1e-1)
    assert math.isclose(get_norms(df.y).mean(), processor.stats["means"]["y" + concat_ops], rel_tol=1e-1)
    assert math.isclose(get_norms(df.x).std(), processor.stats["stds"]["x" + concat_ops], rel_tol=1e-1)
    assert math.isclose(get_norms(df.y).std(), processor.stats["stds"]["y" + concat_ops], rel_tol=1e-1)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(processor, "name-cat")
        # the None entry is added as a string by the move from GPU
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(processor, "name-string")
    # the None entry is added as a string by the move from GPU
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(
        tmpdir, dataset, out_files_per_proc=10, shuffle="partial", apply_ops=True
    )

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
def test_gpu_workflow(tmpdir, client, df, dataset, gpu_memory_frac, engine, dump):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    config = nvt.workflow.get_new_config()
    config["FE"]["continuous"] = [ops.ZeroFill()]
    config["PP"]["continuous"] = [[ops.ZeroFill(), ops.Normalize()]]
    config["PP"]["categorical"] = [ops.Categorify()]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
        client=client,
    )

    processor.update_stats(dataset)

    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        return gdf

    assert math.isclose(get_norms(df.x).mean(), processor.stats["means"]["x"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.y).mean(), processor.stats["means"]["y"], rel_tol=1e-4)
    # assert math.isclose(get_norms(df.id).mean(),
    #                     processor.stats["means"]["id_ZeroFill_LogOp"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.x).std(), processor.stats["stds"]["x"], rel_tol=1e-3)
    assert math.isclose(get_norms(df.y).std(), processor.stats["stds"]["y"], rel_tol=1e-3)
    # assert math.isclose(get_norms(df.id).std(),
    #                     processor.stats["stds"]["id_ZeroFill_LogOp"], rel_tol=1e-3)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(processor, "name-cat")
        # the None entry is added as a string by the move from GPU
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(processor, "name-string")
    # the None entry is added as a string by the move from GPU
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(
        tmpdir, dataset, out_files_per_proc=10, shuffle="partial", apply_ops=True
    )

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
def preprocess_criteo_parquet(
    input_path: str,
    output_path: str,
    client,
    frequency_threshold: int,
):
    train_days = [str(x) for x in CRITEO_TRAIN_DAYS]
    train_files = [
        os.path.join(input_path, x)
        for x in os.listdir(input_path)
        if x.startswith("day") and x.split(".")[0].split("_")[-1] in train_days
    ]
    valid_file = os.path.join(input_path, "day_23.part2.parquet")
    test_file = os.path.join(input_path, "day_23.part1.parquet")
    all_set = train_files + [valid_file] + [test_file]
    print(all_set, train_files, valid_file, test_file)

    print("Creating Workflow Object")
    workflow = Workflow(
        cat_names=CRITEO_CATEGORICAL_COLUMNS,
        cont_names=CRITEO_CONTINUOUS_COLUMNS,
        label_name=CRITEO_CLICK_COLUMNS,
    )

    # We want missing values to map to 0 and present values to log(x + 3):
    # fill missing with -2.0, add 2.0, then apply LogOp, which computes
    # log(1 + x). A missing value becomes log(1 + (-2.0 + 2.0)) = 0.
    workflow.add_cont_feature([
        FillMissing(fill_val=-2.0),
        LambdaOp(op_name="Add3ButMinusOneCauseLogAddsOne", f=lambda col, _: col.add(2.0)),
        LogOp(),  # Log(1 + x)
    ])
    workflow.add_cat_preprocess(
        Categorify(freq_threshold=frequency_threshold, out_path=output_path)
    )
    workflow.finalize()

    print("Creating Dataset Iterator")
    all_ds = Dataset(all_set, engine="parquet", part_mem_fraction=ALL_DS_MEM_FRAC)
    trains_ds = Dataset(train_files, engine="parquet", part_mem_fraction=TRAIN_DS_MEM_FRAC)
    valid_ds = Dataset(valid_file, engine="parquet", part_mem_fraction=VALID_DS_MEM_FRAC)
    test_ds = Dataset(test_file, engine="parquet", part_mem_fraction=TEST_DS_MEM_FRAC)

    print("Running apply")
    out_train = os.path.join(output_path, "train")
    out_valid = os.path.join(output_path, "validation")
    out_test = os.path.join(output_path, "test")

    start = time()
    workflow.update_stats(all_ds)
    print(f"Gathering statistics time: {time() - start}")

    start = time()
    workflow.apply(trains_ds, record_stats=False, output_path=out_train)
    print(f"train preprocess time: {time() - start}")

    start = time()
    workflow.apply(valid_ds, record_stats=False, output_path=out_valid)
    print(f"valid preprocess time: {time() - start}")

    start = time()
    workflow.apply(test_ds, record_stats=False, output_path=out_test)
    print(f"test preprocess time: {time() - start}")

    save_model_size_config(workflow, output_path)
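# Worked check of the arithmetic behind the comment in the transform above
# (standalone and illustrative; the helper name is hypothetical):
# FillMissing(-2.0), then add 2.0, then LogOp's log(1 + x).
import math


def _criteo_cont_transform(x, missing=False):
    x = -2.0 if missing else x  # FillMissing(fill_val=-2.0)
    x = x + 2.0                 # LambdaOp: col.add(2.0)
    return math.log(1.0 + x)    # LogOp computes log(1 + x)


assert _criteo_cont_transform(None, missing=True) == 0.0         # missing -> 0
assert math.isclose(_criteo_cont_transform(5.0), math.log(8.0))  # x -> log(x + 3)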