def test_full_df(num_rows, tmpdir, distro):
    """End-to-end check of ``full_df_create`` (GPU/cudf variant).

    Generates ``num_rows`` rows into parquet files under ``tmpdir``, then
    verifies total row count, column count, per-column distribution fit,
    cardinality and entry-size bounds against the schema reps in ``cols``.
    """
    json_sample["num_rows"] = num_rows
    cats = list(json_sample["cats"].keys())
    cols = datagen._get_cols_from_schema(json_sample, distros=distro)
    # tiny gpu_frac forces the generator to split output across multiple files
    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.00001)
    df_files = df_gen.full_df_create(num_rows, cols, entries=True, output=tmpdir)
    test_size = 0
    full_df = cudf.DataFrame()
    for fi in df_files:
        df = cudf.read_parquet(fi)
        test_size = test_size + df.shape[0]
        full_df = cudf.concat([full_df, df])
    # all files together must hold exactly the requested number of rows
    assert test_size == num_rows
    conts_rep = cols["conts"]
    cats_rep = cols["cats"]
    labels_rep = cols["labels"]
    # each individual file carries the full column set (df is the last file read)
    assert df.shape[1] == len(conts_rep) + len(cats_rep) + len(labels_rep)
    # cats[0] is skipped here and handled below via its flattened elements
    # (presumably a list/multi-hot column -- matches the cats[0] handling in test_cat_rep)
    for idx, cat in enumerate(cats[1:]):
        # per-column distro overrides the generator-wide one when present
        dist = cats_rep[idx + 1].distro or df_gen.dist
        if not is_string_dtype(full_df[cat]._column):
            # distribution verification only applies to numeric columns
            sts, ps = dist.verify(full_df[cat].to_pandas())
            assert all(s > 0.9 for s in sts)
        assert full_df[cat].nunique() == cats_rep[idx + 1].cardinality
        assert full_df[cat].str.len().min() == cats_rep[idx + 1].min_entry_size
        assert full_df[cat].str.len().max() == cats_rep[idx + 1].max_entry_size
    # flatten the list column's elements into a plain series to check entry stats
    check_ser = cudf.Series(full_df[cats[0]]._column.elements.values_host)
    assert check_ser.nunique() == cats_rep[0].cardinality
    assert check_ser.str.len().min() == cats_rep[0].min_entry_size
    assert check_ser.str.len().max() == cats_rep[0].max_entry_size
def test_inspect_datagen(tmpdir, datasets, engine, dist):
    """Round-trip: inspect a real dataset, regenerate a synthetic dataset from
    the inspection report, re-inspect, and check the two reports agree.
    """
    # Dataset
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    # Dataset columns type config
    columns_dict = {}
    # the parquet fixture has an extra categorical column not present elsewhere
    columns_dict["cats"] = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    columns_dict["conts"] = ["x", "y"]
    columns_dict["labels"] = ["label"]
    # Create inspector and inspect
    output_inspect1 = tmpdir + "/dataset_info1.json"
    dataset = Dataset(paths, engine=engine)
    a = datains.DatasetInspector()
    a.inspect(dataset, columns_dict, output_inspect1)
    assert os.path.isfile(output_inspect1)
    # Generate dataset using data_gen tool
    output_datagen = tmpdir + "/datagen"
    os.mkdir(output_datagen)
    with fsspec.open(output_inspect1) as f:
        output1 = json.load(f)
    cols = datagen._get_cols_from_schema(output1)
    if dist == "uniform":
        df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.00001)
    else:
        df_gen = datagen.DatasetGen(datagen.PowerLawDistro(0.1), gpu_frac=0.00001)
    output_datagen_files = df_gen.full_df_create(
        output1["num_rows"], cols, entries=True, output=output_datagen
    )
    # Inspect again and check output are the same
    output_inspect2 = tmpdir + "/dataset_info2.json"
    dataset = Dataset(output_datagen_files, engine=engine)
    a.inspect(dataset, columns_dict, output_inspect2)
    assert os.path.isfile(output_inspect2)
    # Compare json outputs
    with fsspec.open(output_inspect2) as f:
        output2 = json.load(f)
    for k1 in output1.keys():
        if k1 == "num_rows":
            # row count must match exactly
            assert output1[k1] == output2[k1]
        else:
            # k1: column group (cats/conts/labels); k2: column name; k3: stat name
            for k2 in output1[k1].keys():
                for k3 in output1[k1][k2].keys():
                    if k3 == "dtype":
                        if output1[k1][k2][k3] == "object":
                            # string columns may be regenerated as int64 encodings
                            assert (
                                output1[k1][k2][k3] == output2[k1][k2][k3]
                                or output2[k1][k2][k3] == "int64"
                            )
                        else:
                            assert output1[k1][k2][k3] == output2[k1][k2][k3]
                    else:
                        # numeric stats only need to be in the same ballpark:
                        # rel/abs of 1e-0 deliberately allow wide tolerance
                        assert output1[k1][k2][k3] == pytest.approx(
                            output2[k1][k2][k3], rel=1e-0, abs=1e-0
                        )
def test_uniform(num_rows, distro):
    """Uniformly generated categorical columns should pass the generator's
    own distribution verification (statistic > 0.9 per column)."""
    schema_cols = datagen._get_cols_from_schema(json_sample, distros=distro)
    generator = datagen.DatasetGen(datagen.UniformDistro())
    frame = generator.create_df(num_rows, schema_cols)
    # NOTE(review): cats[0] is excluded — it appears to be a list/multi-hot
    # column that the plain verify path does not handle; confirm against
    # the cats[0] handling in the other tests in this file.
    cat_names = list(json_sample["cats"].keys())[1:]
    stats, _pvals = generator.verify_df(frame[cat_names])
    for stat in stats:
        assert stat > 0.9
def _get_random_movielens_data(tmpdir, rows, dataset="movie", valid=None):
    """Generate a small synthetic movielens-style parquet dataset under *tmpdir*.

    Args:
        tmpdir: output directory for the generated parquet files.
        rows: number of rows to generate.
        dataset: ``"movie"`` writes ``movies_converted.parquet`` (deduplicated
            on ``movieId``); ``"ratings"`` writes ``train.parquet`` (or
            ``valid.parquet`` when *valid* is truthy).
        valid: only meaningful for ``dataset="ratings"``; truthy means the
            output is the validation split.

    Raises:
        ValueError: if *dataset* is neither ``"movie"`` nor ``"ratings"``.
    """
    if dataset == "movie":
        schema = {
            "conts": {},
            "cats": {
                # multi-hot list column: 2-4 genres per row, avg 3
                "genres": {
                    "dtype": None,
                    "cardinality": 50,
                    "min_entry_size": 1,
                    "max_entry_size": 5,
                    "multi_min": 2,
                    "multi_max": 4,
                    "multi_avg": 3,
                },
                "movieId": {
                    "dtype": None,
                    "cardinality": 500,
                    "min_entry_size": 1,
                    "max_entry_size": 5,
                },
            },
        }
    elif dataset == "ratings":
        schema = {
            "conts": {},
            "cats": {
                "movieId": {
                    "dtype": None,
                    "cardinality": 500,
                    "min_entry_size": 1,
                    "max_entry_size": 5,
                },
                "userId": {
                    "dtype": None,
                    "cardinality": 500,
                    "min_entry_size": 1,
                    "max_entry_size": 5,
                },
            },
            "labels": {"rating": {"dtype": None, "cardinality": 5}},
        }
    else:
        # BUG FIX: previously an unknown `dataset` value left `cols` unbound
        # and surfaced later as a confusing NameError; fail fast instead.
        raise ValueError(f"unknown dataset kind: {dataset!r} (expected 'movie' or 'ratings')")
    cols = datagen._get_cols_from_schema(schema)

    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.1)
    df_gen.full_df_create(rows, cols, output=tmpdir)
    if dataset == "movie":
        movies_converted = cudf.read_parquet(os.path.join(tmpdir, "dataset_0.parquet"))
        # one row per movie: keep the first occurrence of each movieId
        movies_converted = movies_converted.drop_duplicates(["movieId"], keep="first")
        movies_converted.to_parquet(os.path.join(tmpdir, "movies_converted.parquet"))
    elif dataset == "ratings" and not valid:
        os.rename(os.path.join(tmpdir, "dataset_0.parquet"), os.path.join(tmpdir, "train.parquet"))
    else:
        os.rename(os.path.join(tmpdir, "dataset_0.parquet"), os.path.join(tmpdir, "valid.parquet"))
def test_sparse_tensors(tmpdir, sparse_dense):
    """Check that the Keras dataloader emits sparse (or densified) tensors of
    the configured ``sparse_max`` width for multi-hot categorical columns.
    """
    # create small dataset, add values to sparse_list
    json_sample = {
        "conts": {},
        "cats": {
            # two multi-hot list columns with different per-row list lengths
            "spar1": {
                "dtype": None,
                "cardinality": 50,
                "min_entry_size": 1,
                "max_entry_size": 5,
                "multi_min": 2,
                "multi_max": 4,
                "multi_avg": 3,
            },
            "spar2": {
                "dtype": None,
                "cardinality": 50,
                "min_entry_size": 1,
                "max_entry_size": 5,
                "multi_min": 3,
                "multi_max": 5,
                "multi_avg": 4,
            },
            # "": {"dtype": None, "cardinality": 500, "min_entry_size": 1, "max_entry_size": 5},
        },
        "labels": {"rating": {"dtype": None, "cardinality": 2}},
    }
    cols = datagen._get_cols_from_schema(json_sample)
    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.0001)
    target_path = os.path.join(tmpdir, "input/")
    os.mkdir(target_path)
    df_files = df_gen.full_df_create(10000, cols, output=target_path)
    spa_lst = ["spar1", "spar2"]
    # sparse_max: padded width of each sparse column in the emitted batch
    spa_mx = {"spar1": 5, "spar2": 6}
    batch_size = 10
    data_itr = tf_dataloader.KerasSequenceLoader(
        df_files,
        cat_names=spa_lst,
        cont_names=[],
        label_names=["rating"],
        batch_size=batch_size,
        buffer_size=0.1,
        sparse_names=spa_lst,
        sparse_max=spa_mx,
        sparse_as_dense=sparse_dense,
    )
    for batch in data_itr:
        feats, labs = batch
        for col in spa_lst:
            feature_tensor = feats[f"{col}"]
            if not sparse_dense:
                # sparse mode: tf SparseTensor with full (batch, sparse_max) shape
                assert list(feature_tensor.shape) == [batch_size, spa_mx[col]]
                assert isinstance(feature_tensor, tf.sparse.SparseTensor)
            else:
                # dense mode: ordinary tensor padded out to sparse_max
                assert feature_tensor.shape[1] == spa_mx[col]
                assert not isinstance(feature_tensor, tf.sparse.SparseTensor)
def test_width(num_rows, distro):
    """A continuous column declared with ``width=20`` must expand into
    twenty generated output columns."""
    schema = {
        "conts": {
            "cont_1": {"dtype": np.float32, "min_val": 0, "max_val": 1, "width": 20},
        },
        "num_rows": num_rows,
    }
    generated_cols = datagen._get_cols_from_schema(schema, distros=distro)
    frame = datagen.DatasetGen(datagen.UniformDistro()).create_df(num_rows, generated_cols)
    assert frame.shape[1] == 20
def test_cat_rep(num_rows, distro):
    """Generated categorical columns (cudf variant) must honour the schema's
    cardinality and min/max entry sizes; cats[0] is checked via its flattened
    list elements."""
    cat_names = list(json_sample["cats"].keys())
    schema_cols = datagen._get_cols_from_schema(json_sample, distros=distro)
    generator = datagen.DatasetGen(datagen.UniformDistro())
    frame = generator.create_df(num_rows, schema_cols, entries=True)
    cat_frame = frame[cat_names]
    assert cat_frame.shape[1] == len(cat_names)
    assert cat_frame.shape[0] == num_rows
    reps = schema_cols["cats"]
    # cats[0] is a list column; handle it separately below
    for pos, name in enumerate(cat_names[1:], start=1):
        rep = reps[pos]
        assert frame[name].nunique() == rep.cardinality
        entry_lengths = frame[name].str.len()
        assert entry_lengths.min() == rep.min_entry_size
        assert entry_lengths.max() == rep.max_entry_size
    # flatten the list column's raw elements and check the same stats
    flattened = cudf.Series(frame[cat_names[0]]._column.elements.values_host)
    rep0 = reps[0]
    assert flattened.nunique() == rep0.cardinality
    assert flattened.str.len().min() == rep0.min_entry_size
    assert flattened.str.len().max() == rep0.max_entry_size
def test_full_df(num_rows, tmpdir, distro):
    """End-to-end check of ``full_df_create`` (CPU/GPU-agnostic variant).

    Generates ``num_rows`` rows into parquet files under ``tmpdir`` and
    verifies total row count, column count, per-column distribution fit,
    cardinality and entry-size bounds against the schema reps in ``cols``.
    """
    json_sample["num_rows"] = num_rows
    cats = list(json_sample["cats"].keys())
    cols = datagen._get_cols_from_schema(json_sample, distros=distro)
    # tiny gpu_frac forces the generator to split output across multiple files
    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.00001)
    df_files = df_gen.full_df_create(num_rows, cols, entries=True, output=tmpdir)
    test_size = 0
    full_df = _make_df()
    for fi in df_files:
        df = Dataset(fi).to_ddf().compute()
        test_size = test_size + df.shape[0]
        full_df = _concat([full_df, df])
    # all files together must hold exactly the requested number of rows
    assert test_size == num_rows
    conts_rep = cols["conts"]
    cats_rep = cols["cats"]
    labels_rep = cols["labels"]
    # each individual file carries the full column set (df is the last file read)
    assert df.shape[1] == len(conts_rep) + len(cats_rep) + len(labels_rep)
    for idx, cat in enumerate(cats[1:]):
        # per-column distro overrides the generator-wide one when present
        dist = cats_rep[idx + 1].distro or df_gen.dist
        if HAS_GPU:
            if not _is_string_dtype(full_df[cat]._column):
                sts, ps = dist.verify(full_df[cat].to_pandas())
                assert all(s > 0.9 for s in sts)
        else:
            if not _is_string_dtype(full_df[cat]):
                sts, ps = dist.verify(full_df[cat])
                assert all(s > 0.9 for s in sts)
        # these are not mh series
        # BUG FIX: compare each column against its own rep (idx + 1), not
        # cats_rep[0] — consistent with test_cat_rep and the cudf variant
        # of this test, which both index cats_rep[idx + 1] here.
        assert full_df[cat].nunique() == cats_rep[idx + 1].cardinality
        assert full_df[cat].str.len().min() == cats_rep[idx + 1].min_entry_size
        assert full_df[cat].str.len().max() == cats_rep[idx + 1].max_entry_size
    # check the mh list here cat 0 only
    if HAS_GPU:
        check_ser = _make_df(list(full_df[cats[0]]._column.elements.values_host))[0]
    else:
        check_ser = _pull_apart_list(full_df[cats[0]])[0]
    assert check_ser.nunique() == cats_rep[0].cardinality
    assert check_ser.str.len().min() == cats_rep[0].min_entry_size
    assert check_ser.str.len().max() == cats_rep[0].max_entry_size
def test_cat_rep(num_rows, distro):
    """Generated categorical columns (CPU/GPU-agnostic variant) must honour
    the schema's cardinality and min/max entry sizes; cats[0] is a list
    column and is checked via its flattened elements."""
    json_sample["num_rows"] = num_rows
    cat_names = list(json_sample["cats"].keys())
    schema_cols = datagen._get_cols_from_schema(json_sample, distros=distro)
    generator = datagen.DatasetGen(datagen.UniformDistro())
    frame = generator.create_df(num_rows, schema_cols, entries=True)
    cat_frame = frame[cat_names]
    assert cat_frame.shape[1] == len(cat_names)
    assert cat_frame.shape[0] == num_rows
    reps = schema_cols["cats"]
    # cats[0] is the list column; handled separately below
    for pos, name in enumerate(cat_names[1:], start=1):
        rep = reps[pos]
        assert frame[name].nunique() == rep.cardinality
        entry_lengths = frame[name].str.len()
        assert entry_lengths.min() == rep.min_entry_size
        assert entry_lengths.max() == rep.max_entry_size
    if HAS_GPU:
        # cudf path: flatten the list column's raw elements
        check_ser = _make_df(list(frame[cat_names[0]]._column.elements.values_host))[0]
    else:
        check_ser = frame[cat_names[0]]
    if isinstance(check_ser[0], (list, np.ndarray)):
        # still list-like (cpu path): explode into a flat series
        check_ser = _pull_apart_list(check_ser)[0]
    rep0 = reps[0]
    assert check_ser.nunique() == rep0.cardinality
    assert check_ser.str.len().min() == rep0.min_entry_size
    assert check_ser.str.len().max() == rep0.max_entry_size
def test_horovod_multigpu(tmpdir):
    """Generate a synthetic movielens-style dataset, process it with an
    NVTabular workflow, then run the multi-GPU TF example under horovodrun
    and check that the trainer printed a loss line."""
    json_sample = {
        "conts": {},
        "cats": {
            # multi-hot list column: 2-4 genres per row, avg 3
            "genres": {
                "dtype": None,
                "cardinality": 50,
                "min_entry_size": 1,
                "max_entry_size": 5,
                "multi_min": 2,
                "multi_max": 4,
                "multi_avg": 3,
            },
            "movieId": {
                "dtype": None,
                "cardinality": 500,
                "min_entry_size": 1,
                "max_entry_size": 5,
            },
            "userId": {
                "dtype": None,
                "cardinality": 500,
                "min_entry_size": 1,
                "max_entry_size": 5,
            },
        },
        "labels": {"rating": {"dtype": None, "cardinality": 2}},
    }
    cols = datagen._get_cols_from_schema(json_sample)
    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.0001)
    target_path = os.path.join(tmpdir, "input/")
    os.mkdir(target_path)
    df_files = df_gen.full_df_create(10000, cols, output=target_path)

    # process them: categorify the id columns, binarize the rating label
    cat_features = nvt.ColumnGroup(["userId", "movieId", "genres"]) >> nvt.ops.Categorify()
    ratings = nvt.ColumnGroup(["rating"]) >> (lambda col: (col > 3).astype("int8"))
    output = cat_features + ratings
    proc = nvt.Workflow(output)
    train_iter = nvt.Dataset(df_files, part_size="10MB")
    proc.fit(train_iter)
    target_path_train = os.path.join(tmpdir, "train/")
    os.mkdir(target_path_train)
    proc.transform(train_iter).to_parquet(output_path=target_path_train, out_files_per_proc=5)

    # save the fitted workflow so the trainer script can reload it
    target_path = os.path.join(tmpdir, "workflow/")
    os.mkdir(target_path)
    proc.save(target_path)

    # locate the example scripts relative to this test file
    curr_path = os.path.abspath(__file__)
    repo_root = os.path.relpath(os.path.normpath(os.path.join(curr_path, "../../..")))
    hvd_wrap_path = os.path.join(repo_root, "examples/multi-gpu-movielens/hvd_wrapper.sh")
    hvd_exam_path = os.path.join(repo_root, "examples/multi-gpu-movielens/tf_trainer.py")
    process = subprocess.Popen(
        [
            "horovodrun",
            "-np",
            "2",
            "-H",
            "localhost:2",
            "sh",
            hvd_wrap_path,
            "python",
            hvd_exam_path,
            "--dir_in",
            f"{tmpdir}",
            "--batch_size",
            "1024",
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # BUG FIX: calling process.wait() before communicate() can deadlock when
    # stdout/stderr are PIPEs and the child fills the OS pipe buffer (see the
    # subprocess docs warning on Popen.wait). communicate() both drains the
    # pipes and waits for the child to exit.
    stdout, stderr = process.communicate()
    print(stdout, stderr)
    assert "Loss:" in str(stdout)