def test_empty_cols(tmpdir, df, dataset, engine):
    """Regression test for NVIDIA/NVTabular#149: the loader must iterate cleanly
    when either the continuous or the categorical column list is empty."""
    # No continuous columns: every batch yields conts == None.
    cats_only = torch_dataloader.TorchAsyncItr(
        dataset, cats=["id"], conts=[], labels=["label"], batch_size=1
    )
    for _, cont_part, _ in cats_only:
        assert cont_part is None

    # No categorical columns: every batch yields cats == None.
    conts_only = torch_dataloader.TorchAsyncItr(dataset, cats=[], conts=["x"], labels=["label"])
    for cat_part, _, _ in conts_only:
        assert cat_part is None
def test_sparse_tensors(sparse_dense):
    """List columns driven through sparse_names/sparse_max should come back as
    sparse tensors (or dense padded tensors when sparse_as_dense is set)."""
    frame = nvt.dispatch._make_df(
        {
            "spar1": [[1, 2, 3, 4], [4, 2, 4, 4], [1, 3, 4, 3], [1, 1, 3, 3]],
            "spar2": [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14], [15, 16]],
        }
    )
    sparse_cols = ["spar1", "spar2"]
    max_lens = {"spar1": 5, "spar2": 6}
    bsz = 2
    loader = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(frame),
        cats=sparse_cols,
        conts=[],
        labels=[],
        batch_size=bsz,
        sparse_names=sparse_cols,
        sparse_max=max_lens,
        sparse_as_dense=sparse_dense,
    )
    for features, _labels in loader:
        for name in sparse_cols:
            tensor = features[name]
            if sparse_dense:
                # dense path: second dimension padded to the configured max
                assert tensor.shape[1] == max_lens[name]
                assert not tensor.is_sparse
            else:
                # sparse path: full [batch, max] shape in sparse layout
                assert list(tensor.shape) == [bsz, max_lens[name]]
                assert tensor.is_sparse
def test_empty_cols(tmpdir, df, dataset, engine, cat_names, cont_names, label_name):
    """Regression test for NVIDIA/NVTabular#149 using the legacy Workflow API:
    iterate batches even when cats/conts lists are empty."""
    workflow = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    workflow.add_feature([ops.FillMedian()])
    workflow.add_feature(ops.Normalize())
    workflow.add_feature(ops.Categorify())

    train_dir = os.path.join(tmpdir, "train/")
    os.mkdir(train_dir)

    # output_format=None keeps the transformed data in memory only
    workflow.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_format=None,
    )
    transformed = workflow.get_ddf().compute(scheduler="synchronous")

    loader = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(transformed),
        cats=cat_names,
        conts=cont_names,
        labels=label_name,
        batch_size=1,
    )
    for cats, conts, labels in loader:
        # each present part must carry one slot per requested column
        if cat_names:
            assert cats.shape[-1] == len(cat_names)
        if cont_names:
            assert conts.shape[-1] == len(cont_names)
        if label_name:
            assert labels.shape[-1] == len(label_name)
def test_mh_support(tmpdir):
    """Multi-hot list columns should surface as a dict of (values, offsets) per
    column, with the plain categorical part empty."""
    df = cudf.DataFrame(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Reviewers": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Authors", "Reviewers"]  # , "Engaging User"]
    cont_names = []
    label_name = ["Post"]

    hashed = cat_names >> ops.HashBucket(num_buckets=10)
    workflow = nvt.Workflow(hashed + label_name)
    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    # identical input strings must land in identical hash buckets
    authors = df_out["Authors"].to_arrow().to_pylist()
    assert authors[0][0] == authors[1][0]  # 'User_A'
    assert authors[2][1] == authors[3][0]  # 'User_C'

    loader = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name
    )
    n_batches = 0
    for cats, conts, labels in loader:
        n_batches += 1
        cats, mh = cats
        # mh is a dictionary {Column name: (values, offsets)}
        assert len(mh) == len(cat_names)
        assert not cats
    assert n_batches > 0
def test_empty_cols(tmpdir, df, dataset, engine, cat_names, cont_names, label_name):
    """Regression test for NVIDIA/NVTabular#149 using the ColumnGroup API: make
    sure we can iterate even when cats and/or conts are empty.

    Fix: the Workflow previously re-evaluated ``sum(features, nvt.ColumnGroup(label_name))``
    a second time instead of reusing the ``graph`` already built for the
    early-exit check; it now reuses ``graph`` directly.
    """
    features = []
    if cont_names:
        features.append(cont_names >> ops.FillMedian() >> ops.Normalize())
    if cat_names:
        features.append(cat_names >> ops.Categorify())

    graph = sum(features, nvt.ColumnGroup(label_name))
    if not graph.columns:
        # if we don't have conts/cats/labels we're done
        return

    # reuse the graph computed above rather than re-summing the feature groups
    processor = nvt.Workflow(graph)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    df_out = processor.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name, batch_size=1
    )
    for nvt_batch in data_itr:
        cats, conts, labels = nvt_batch
        # each present part must carry one slot per requested column
        if cat_names:
            assert cats.shape[-1] == len(cat_names)
        if cont_names:
            assert conts.shape[-1] == len(cont_names)
        if label_name:
            assert labels.shape[-1] == len(label_name)
def test_gpu_dl_break(tmpdir, df, dataset, batch_size, part_mem_fraction, engine, device):
    """Breaking out of the loader mid-epoch must leave it able to restart a
    clean, complete epoch afterwards."""
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    workflow = nvt.Workflow(
        (cont_names >> ops.FillMedian() >> ops.Normalize())
        + (cat_names >> ops.Categorify())
        + label_name
    )

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)
    workflow.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    parquet_paths = [
        os.path.join(output_train, name)
        for name in os.listdir(output_train)
        if name.endswith("parquet")
    ]
    nvt_data = nvt.Dataset(parquet_paths[0], engine="parquet", part_mem_fraction=part_mem_fraction)
    loader = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        device=device,
    )
    len_dl = len(loader) - 1

    # First pass: bail out after the second chunk.
    first_chunk = 0
    idx = 0
    for idx, chunk in enumerate(loader):
        if idx == 0:
            first_chunk = len(chunk[0])
        last_chk = len(chunk[0])
        print(last_chk)
        if idx == 1:
            break
        del chunk
    assert idx < len_dl

    # Second pass: the loader must have reset and serve a full epoch with the
    # same first-chunk size as before.
    first_chunk_2 = 0
    for idx, chunk in enumerate(loader):
        if idx == 0:
            first_chunk_2 = len(chunk[0])
        del chunk
    assert idx == len_dl
    assert first_chunk == first_chunk_2
def test_kill_dl(tmpdir, df, dataset, part_mem_fraction, engine):
    """Throughput smoke-test across a wide sweep of batch sizes (legacy
    Workflow API); prints items/sec per batch size."""
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    workflow = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    workflow.add_feature([ops.FillMedian()])
    workflow.add_preprocess(ops.Normalize())
    workflow.add_preprocess(ops.Categorify())

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)
    workflow.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
    )

    parquet_paths = [
        os.path.join(output_train, name)
        for name in os.listdir(output_train)
        if name.endswith("parquet")
    ]
    nvt_data = nvt.Dataset(parquet_paths[0], engine="parquet", part_mem_fraction=part_mem_fraction)
    loader = torch_dataloader.TorchAsyncItr(
        nvt_data, cats=cat_names, conts=cont_names, labels=["label"]
    )

    results = {}
    for batch_size in (2 ** exp for exp in range(9, 25)):
        print("Checking batch size: ", batch_size)
        num_iter = max(10 * 1000 * 1000 // batch_size, 100)  # load 10e7 samples
        # import pdb; pdb.set_trace()
        loader.batch_size = batch_size
        start = time.time()
        for i, data in enumerate(loader):
            if i >= num_iter:
                break
            del data
        stop = time.time()

        throughput = i * batch_size / (stop - start)
        results[batch_size] = throughput
        print(
            "batch size: ",
            batch_size,
            ", throughput: ",
            throughput,
            "items",
            i * batch_size,
            "time",
            stop - start,
        )
def test_torch_drp_reset(tmpdir, batch_size, drop_last, num_rows):
    """Check drop_last row accounting on the CPU device path: full batches must
    carry the constant column values, and total rows must respect drop_last."""
    frame = nvt.dispatch._make_df(
        {
            "cat1": [1] * num_rows,
            "cat2": [2] * num_rows,
            "cat3": [3] * num_rows,
            "label": [0] * num_rows,
            "cont3": [3.0] * num_rows,
            "cont2": [2.0] * num_rows,
            "cont1": [1.0] * num_rows,
        }
    )
    path = os.path.join(tmpdir, "dataset.parquet")
    frame.to_parquet(path)

    cat_names = ["cat3", "cat2", "cat1"]
    cont_names = ["cont3", "cont2", "cont1"]
    label_name = ["label"]

    loader = torch_dataloader.TorchAsyncItr(
        nvt.Dataset([path]),
        cats=cat_names,
        conts=cont_names,
        labels=label_name,
        batch_size=batch_size,
        drop_last=drop_last,
        device="cpu",
    )

    full_batches = len(loader) if drop_last else len(loader) - 1
    seen_rows = 0
    frame_cols = frame.columns.to_list()
    for idx, chunk in enumerate(loader):
        seen_rows += len(chunk[0]["cat1"])
        if idx < full_batches:
            for col in frame_cols:
                if col not in chunk[0].keys():
                    continue
                values = list(chunk[0][col].cpu().numpy())
                if nvt.dispatch.HAS_GPU:
                    assert (values == frame[col].values_host).all()
                else:
                    assert (values == frame[col].values).all()

    if drop_last and num_rows % batch_size > 0:
        # trailing partial batch was dropped
        assert num_rows > seen_rows
    else:
        assert num_rows == seen_rows
def test_mh_support(tmpdir):
    """Multi-hot columns should surface as (values, offsets) tuples inside the
    batch features dict, on both the GPU and CPU code paths."""
    df = nvt.dispatch._make_df(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Reviewers": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Authors", "Reviewers"]  # , "Engaging User"]
    cont_names = []
    label_name = ["Post"]

    # GPU builds exercise HashBucket; otherwise fall back to Categorify
    if HAS_GPU:
        cats = cat_names >> ops.HashBucket(num_buckets=10)
    else:
        cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(cats + label_name)
    df_out = processor.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    # identical input strings must map to identical encoded values
    authors = df_out["Authors"].to_arrow().to_pylist() if HAS_GPU else df_out["Authors"]
    assert authors[0][0] == authors[1][0]  # 'User_A'
    assert authors[2][1] == authors[3][0]  # 'User_C'

    loader = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name
    )
    seen = 0
    for cats_conts, labels in loader:
        seen += 1
        # each multi-hot column is a tuple of (values, offsets)
        assert "Reviewers" in cats_conts
        assert isinstance(cats_conts["Reviewers"], tuple)
        assert "Authors" in cats_conts
        assert isinstance(cats_conts["Authors"], tuple)
    assert seen > 0
def test_torch_drp_reset(tmpdir, batch_size, drop_last, num_rows):
    """Check drop_last row accounting on the GPU (cudf) path: full batches must
    carry the constant column values, and total rows must respect drop_last."""
    frame = cudf.DataFrame(
        {
            "cat1": [1] * num_rows,
            "cat2": [2] * num_rows,
            "cat3": [3] * num_rows,
            "label": [0] * num_rows,
            "cont3": [3.0] * num_rows,
            "cont2": [2.0] * num_rows,
            "cont1": [1.0] * num_rows,
        }
    )
    path = os.path.join(tmpdir, "dataset.parquet")
    frame.to_parquet(path)

    cat_names = ["cat3", "cat2", "cat1"]
    cont_names = ["cont3", "cont2", "cont1"]
    label_name = ["label"]

    loader = torch_dataloader.TorchAsyncItr(
        nvt.Dataset([path]),
        cats=cat_names,
        conts=cont_names,
        labels=label_name,
        batch_size=batch_size,
        drop_last=drop_last,
    )

    full_batches = len(loader) if drop_last else len(loader) - 1
    seen_rows = 0
    for idx, chunk in enumerate(loader):
        seen_rows += len(chunk[0])
        if idx < full_batches:
            # first two entries of the chunk (cats, conts) hold the constants
            for part in chunk[:2]:
                host = part.cpu()
                assert list(host[:, 0].numpy()) == [1] * batch_size
                assert list(host[:, 1].numpy()) == [2] * batch_size
                assert list(host[:, 2].numpy()) == [3] * batch_size

    if drop_last and num_rows % batch_size > 0:
        # trailing partial batch was dropped
        assert num_rows > seen_rows
    else:
        assert num_rows == seen_rows
def test_dataloader_schema(tmpdir, df, dataset, batch_size, engine, device):
    """A loader built with only `labels` must still expose every feature column
    in the batch features dict, and the labels tensor must match label count."""
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    workflow = nvt.Workflow(
        (cont_names >> ops.FillMedian() >> ops.Normalize())
        + (cat_names >> ops.Categorify())
        + label_name
    )

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)
    workflow.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    parquet_paths = [
        os.path.join(output_train, name)
        for name in os.listdir(output_train)
        if name.endswith("parquet")
    ]
    nvt_data = nvt.Dataset(parquet_paths, engine="parquet")

    loader = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        shuffle=False,
        labels=label_name,
    )
    batch = next(iter(loader))

    # every feature column (cats and conts) must be present in the batch
    for name in cat_names + cont_names:
        assert name in batch[0]

    labels_tensor = batch[1]
    num_label_cols = labels_tensor.shape[1] if len(labels_tensor.shape) > 1 else 1
    assert num_label_cols == len(label_name)
def test_shuffling():
    """shuffle=True must permute row order while preserving the exact set of
    values in the batch."""
    num_rows = 10000
    batch_size = 10000

    frame = pd.DataFrame({"a": np.asarray(range(num_rows)), "b": np.asarray([0] * num_rows)})

    loader = torch_dataloader.TorchAsyncItr(
        Dataset(frame), conts=["a"], labels=["b"], batch_size=batch_size, shuffle=True
    )

    first_batch = next(iter(loader))[0]["a"].cpu()
    identity = torch.arange(0, batch_size)

    # order differs from the untouched sequence ...
    assert (first_batch != identity).any()
    # ... but sorting recovers exactly 0..batch_size-1
    assert (torch.sort(first_batch).values == identity).all()
def test_gpu_dl(tmpdir, df, dataset, batch_size, part_mem_fraction, engine, devices):
    """End-to-end GPU dataloader test: fit/transform a workflow to parquet, then
    verify row counts and first-row alignment both when iterating the
    TorchAsyncItr directly and when wrapping it in a DLDataLoader."""
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(conts + cats + label_name)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    # pick up the parquet files the workflow just wrote
    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths[0], engine="parquet", part_mem_fraction=part_mem_fraction)
    data_itr = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        devices=devices,
    )

    # reference copy of the same parquet file, columns renumbered 0..n-1
    columns = mycols_pq
    df_test = cudf.read_parquet(tar_paths[0])[columns]
    df_test.columns = [x for x in range(0, len(columns))]
    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(tar_paths[0])
    rows = 0
    # works with iterator alone, needs to test inside torch dataloader
    for idx, chunk in enumerate(data_itr):
        if devices is None:
            # single-device only: first value of each chunk should line up with
            # the parquet row at this running offset
            assert float(df_test.iloc[rows][0]) == float(chunk[0][0][0])
        rows += len(chunk[0])
        del chunk

    # accounts for incomplete batches at the end of chunks
    # that dont necesssarily have the full batch_size
    assert rows == num_rows

    def gen_col(batch):
        # collate_fn: take the first (only) element and split into its parts
        batch = batch[0]
        return batch[0], batch[1], batch[2]

    t_dl = torch_dataloader.DLDataLoader(
        data_itr, collate_fn=gen_col, pin_memory=False, num_workers=0
    )
    rows = 0
    for idx, chunk in enumerate(t_dl):
        if devices is None:
            assert float(df_test.iloc[rows][0]) == float(chunk[0][0][0])
        rows += len(chunk[0])

    # clean up the temporary training output
    if os.path.exists(output_train):
        shutil.rmtree(output_train)
def test_mh_model_support(tmpdir):
    """Train the reference PyTorch Model for one epoch on a dataset mixing
    multi-hot, single-hot and continuous columns, and sanity-check that the
    RMSPE metric can be computed from the returned predictions."""
    df = cudf.DataFrame(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Reviewers": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Null User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
            "Cont1": [0.3, 0.4, 0.5, 0.6],
            "Cont2": [0.3, 0.4, 0.5, 0.6],
            "Cat1": ["A", "B", "A", "C"],
        }
    )
    cat_names = ["Cat1", "Null User", "Authors", "Reviewers"]  # , "Engaging User"]
    cont_names = ["Cont1", "Cont2"]
    label_name = ["Post"]
    out_path = os.path.join(tmpdir, "train/")
    os.mkdir(out_path)

    cats = cat_names >> ops.Categorify()
    conts = cont_names >> ops.Normalize()

    processor = nvt.Workflow(cats + conts + label_name)
    df_out = processor.fit_transform(nvt.Dataset(df)).to_ddf().compute()
    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out),
        cats=cat_names,
        conts=cont_names,
        labels=label_name,
        batch_size=2,
    )
    # embedding table shapes are derived from the fitted Categorify stats
    emb_sizes = nvt.ops.get_embedding_sizes(processor)

    # hyper-parameters for the throwaway training run
    EMBEDDING_DROPOUT_RATE = 0.04
    DROPOUT_RATES = [0.001, 0.01]
    HIDDEN_DIMS = [1000, 500]
    LEARNING_RATE = 0.001
    model = Model(
        embedding_table_shapes=emb_sizes,
        num_continuous=len(cont_names),
        emb_dropout=EMBEDDING_DROPOUT_RATE,
        layer_hidden_dims=HIDDEN_DIMS,
        layer_dropout_rates=DROPOUT_RATES,
    ).cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    def rmspe_func(y_pred, y):
        "Return y_pred and y to non-log space and compute RMSPE"
        y_pred, y = torch.exp(y_pred) - 1, torch.exp(y) - 1
        pct_var = (y_pred - y) / y
        return (pct_var ** 2).mean().pow(0.5)

    train_loss, y_pred, y = process_epoch(
        data_itr,
        model,
        train=True,
        optimizer=optimizer,
        # transform=batch_transform,
        amp=False,
    )
    train_rmspe = None
    train_rmspe = rmspe_func(y_pred, y)
    assert train_rmspe is not None
    assert len(y_pred) > 0
    assert len(y) > 0
def test_empty_cols(tmpdir, engine, cat_names, mh_names, cont_names, label_name, num_rows):
    """Regression test for NVIDIA/NVTabular#149 on generated data: the loader
    should either iterate cleanly or raise a clear ValueError when it is handed
    no categorical and no continuous columns."""
    json_sample["num_rows"] = num_rows

    cols = datagen._get_cols_from_schema(json_sample)

    # synthesize a dataset matching the schema with a power-law distribution
    df_gen = datagen.DatasetGen(datagen.PowerLawDistro(0.1))
    dataset = df_gen.create_df(num_rows, cols)
    dataset = nvt.Dataset(dataset)
    features = []
    if cont_names:
        features.append(cont_names >> ops.FillMedian() >> ops.Normalize())
    if cat_names or mh_names:
        features.append(cat_names + mh_names >> ops.Categorify())
    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    graph = sum(features, nvt.WorkflowNode(label_name))
    processor = nvt.Workflow(graph)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    df_out = processor.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    if processor.output_node.output_schema.apply_inverse(ColumnSelector("lab_1")):
        # if we don't have conts/cats/labels we're done
        return

    data_itr = None

    with pytest.raises(ValueError) as exc_info:
        data_itr = torch_dataloader.TorchAsyncItr(
            nvt.Dataset(df_out),
            cats=cat_names + mh_names,
            conts=cont_names,
            labels=label_name,
            batch_size=2,
        )
    # NOTE(review): this message text was split across a raw line break in the
    # source; confirm the exact wording against the ValueError TorchAsyncItr raises
    assert "Neither Categorical or Continuous columns were found by the dataloader. " in str(
        exc_info.value
    )

    if data_itr:
        for nvt_batch in data_itr:
            cats_conts, labels = nvt_batch
            if cat_names:
                assert set(cat_names).issubset(set(list(cats_conts.keys())))
            if cont_names:
                assert set(cont_names).issubset(set(list(cats_conts.keys())))

        if cat_names or cont_names or mh_names:
            # embedding table shapes come from the fitted Categorify stats
            emb_sizes = nvt.ops.get_embedding_sizes(processor)

            # hyper-parameters for the throwaway training run
            EMBEDDING_DROPOUT_RATE = 0.04
            DROPOUT_RATES = [0.001, 0.01]
            HIDDEN_DIMS = [1000, 500]
            LEARNING_RATE = 0.001
            model = Model(
                embedding_table_shapes=emb_sizes,
                num_continuous=len(cont_names),
                emb_dropout=EMBEDDING_DROPOUT_RATE,
                layer_hidden_dims=HIDDEN_DIMS,
                layer_dropout_rates=DROPOUT_RATES,
            ).cuda()
            optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

            def rmspe_func(y_pred, y):
                "Return y_pred and y to non-log space and compute RMSPE"
                y_pred, y = torch.exp(y_pred) - 1, torch.exp(y) - 1
                pct_var = (y_pred - y) / y
                return (pct_var**2).mean().pow(0.5)

            train_loss, y_pred, y = process_epoch(
                data_itr,
                model,
                train=True,
                optimizer=optimizer,
                amp=False,
            )
            train_rmspe = None
            train_rmspe = rmspe_func(y_pred, y)
            assert train_rmspe is not None
            assert len(y_pred) > 0
            assert len(y) > 0