def test_empty_cols(tmpdir, df, dataset, engine, cat_names, cont_names, label_name):
    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    # first with no continuous columns
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_feature([ops.FillMedian()])
    processor.add_feature(ops.Normalize())
    processor.add_feature(ops.Categorify())

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_format=None,
    )
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name, batch_size=1
    )

    for nvt_batch in data_itr:
        cats, conts, labels = nvt_batch
        if cat_names:
            assert cats.shape[-1] == len(cat_names)
        if cont_names:
            assert conts.shape[-1] == len(cont_names)
        if label_name:
            assert labels.shape[-1] == len(label_name)
def test_empty_cols(tmpdir, df, dataset, engine, cat_names, cont_names, label_name):
    features = []
    if cont_names:
        features.append(cont_names >> ops.FillMedian() >> ops.Normalize())
    if cat_names:
        features.append(cat_names >> ops.Categorify())

    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    graph = sum(features, nvt.ColumnGroup(label_name))
    if not graph.columns:
        # if we don't have conts/cats/labels we're done
        return

    processor = nvt.Workflow(sum(features, nvt.ColumnGroup(label_name)))

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    df_out = processor.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name, batch_size=1
    )

    for nvt_batch in data_itr:
        cats, conts, labels = nvt_batch
        if cat_names:
            assert cats.shape[-1] == len(cat_names)
        if cont_names:
            assert conts.shape[-1] == len(cont_names)
        if label_name:
            assert labels.shape[-1] == len(label_name)
def test_gpu_dl_break(tmpdir, df, dataset, batch_size, part_mem_fraction, engine, device):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(conts + cats + label_name)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths[0], engine="parquet", part_mem_fraction=part_mem_fraction)
    data_itr = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        device=device,
    )
    len_dl = len(data_itr) - 1

    first_chunk = 0
    idx = 0
    for idx, chunk in enumerate(data_itr):
        if idx == 0:
            first_chunk = len(chunk[0])
        last_chk = len(chunk[0])
        print(last_chk)
        if idx == 1:
            break
        del chunk

    assert idx < len_dl

    first_chunk_2 = 0
    for idx, chunk in enumerate(data_itr):
        if idx == 0:
            first_chunk_2 = len(chunk[0])
        del chunk

    assert idx == len_dl
    assert first_chunk == first_chunk_2
def test_kill_dl(tmpdir, df, dataset, part_mem_fraction, engine):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_feature([ops.FillMedian()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths[0], engine="parquet", part_mem_fraction=part_mem_fraction)

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt_data, cats=cat_names, conts=cont_names, labels=["label"]
    )

    results = {}

    for batch_size in [2 ** i for i in range(9, 25, 1)]:
        print("Checking batch size: ", batch_size)
        num_iter = max(10 * 1000 * 1000 // batch_size, 100)  # load ~1e7 samples
        # import pdb; pdb.set_trace()
        data_itr.batch_size = batch_size
        start = time.time()
        for i, data in enumerate(data_itr):
            if i >= num_iter:
                break
            del data
        stop = time.time()

        throughput = i * batch_size / (stop - start)
        results[batch_size] = throughput
        print(
            "batch size: ",
            batch_size,
            ", throughput: ",
            throughput,
            "items",
            i * batch_size,
            "time",
            stop - start,
        )
def test_gpu_dl(tmpdir, df, dataset, batch_size, gpu_memory_frac, engine):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_feature([ops.FillMedian()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=True,
        output_path=output_train,
        num_out_files=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    data_itr = nvt.torch_dataloader.TorchTensorBatchDatasetItr(
        tar_paths[0],
        engine="parquet",
        sub_batch_size=batch_size,
        gpu_memory_frac=gpu_memory_frac,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        names=mycols_csv,
        sep="\t",
    )

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(tar_paths[0])

    rows = 0
    for idx, chunk in enumerate(data_itr):
        rows += len(chunk)
        del chunk

    # accounts for incomplete batches at the end of chunks
    # that don't necessarily have the full batch_size
    assert (idx + 1) * batch_size >= rows
    assert rows == num_rows
    if os.path.exists(output_train):
        shutil.rmtree(output_train)
def test_fit_simple():
    data = cudf.DataFrame({"x": [0, 1, 2, None, 0, 1, 2], "y": [None, 3, 4, 5, 3, 4, 5]})
    dataset = Dataset(data)

    workflow = Workflow(["x", "y"] >> ops.FillMedian() >> (lambda x: x * x))
    workflow.fit(dataset)

    transformed = workflow.transform(dataset).to_ddf().compute()

    expected = cudf.DataFrame({"x": [0, 1, 4, 1, 0, 1, 4], "y": [16, 9, 16, 25, 9, 16, 25]})
    assert_eq(expected, transformed)
def test_fit_simple():
    data = nvt.dispatch._make_df({"x": [0, 1, 2, None, 0, 1, 2], "y": [None, 3, 4, 5, 3, 4, 5]})
    dataset = Dataset(data)

    workflow = Workflow(["x", "y"] >> ops.FillMedian() >> (lambda x: x * x))
    workflow.fit(dataset)

    transformed = workflow.transform(dataset).to_ddf().compute()

    expected = nvt.dispatch._make_df({"x": [0, 1, 4, 1, 0, 1, 4], "y": [16, 9, 16, 25, 9, 16, 25]})

    if not HAS_GPU:
        transformed["x"] = transformed["x"].astype(expected["x"].dtype)
        transformed["y"] = transformed["y"].astype(expected["y"].dtype)

    assert_eq(expected, transformed)
def test_dataloader_schema(tmpdir, df, dataset, batch_size, engine, device):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(conts + cats + label_name)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths, engine="parquet")

    data_loader = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        shuffle=False,
        labels=label_name,
    )

    batch = next(iter(data_loader))
    assert all(name in batch[0] for name in cat_names)
    assert all(name in batch[0] for name in cont_names)

    num_label_cols = batch[1].shape[1] if len(batch[1].shape) > 1 else 1
    assert num_label_cols == len(label_name)
def test_tf_gpu_dl(tmpdir, paths, use_paths, dataset, batch_size, gpu_memory_frac, engine):
    cont_names = ["x", "y", "id"]
    cat_names = ["name-string"]
    label_name = ["label"]
    if engine == "parquet":
        cat_names.append("name-cat")

    columns = cont_names + cat_names

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    workflow = nvt.Workflow(conts + cats + label_name)
    workflow.fit(dataset)
    workflow.transform(dataset).to_parquet(tmpdir + "/processed")

    data_itr = tf_dataloader.KerasSequenceLoader(
        str(tmpdir + "/processed"),  # workflow.transform(dataset),
        cat_names=cat_names,
        cont_names=cont_names,
        batch_size=batch_size,
        buffer_size=gpu_memory_frac,
        label_names=label_name,
        engine=engine,
        shuffle=False,
    )
    _ = tf.random.uniform((1,))

    rows = 0
    for idx in range(len(data_itr)):
        X, y = next(data_itr)

        # first elements to check epoch-to-epoch consistency
        if idx == 0:
            X0, y0 = X, y

        # check that we have at most batch_size elements
        num_samples = y.shape[0]
        if num_samples != batch_size:
            try:
                next(data_itr)
            except StopIteration:
                rows += num_samples
                continue
            else:
                raise ValueError("Batch size too small at idx {}".format(idx))

        # check that all the features in X have the
        # appropriate length and that the set of
        # their names is exactly the set of names in
        # `columns`
        these_cols = columns.copy()
        for column, x in X.items():
            try:
                these_cols.remove(column)
            except ValueError:
                raise AssertionError
            assert x.shape[0] == num_samples
        assert len(these_cols) == 0

        rows += num_samples

    assert (idx + 1) * batch_size >= rows
    assert rows == (60 * 24 * 3 + 1)

    # if num_samples is equal to batch size,
    # we didn't exhaust the iterator and do
    # cleanup. Try that now
    if num_samples == batch_size:
        try:
            next(data_itr)
        except StopIteration:
            pass
        else:
            raise ValueError
    assert not data_itr._working
    assert data_itr._batch_itr is None

    # check start of next epoch to ensure consistency
    X, y = next(data_itr)
    assert (y.numpy() == y0.numpy()).all()

    for column, x in X.items():
        x0 = X0.pop(column)
        assert (x.numpy() == x0.numpy()).all()
    assert len(X0) == 0

    data_itr.stop()
    assert not data_itr._working
    assert data_itr._batch_itr is None
def test_gpu_dl(tmpdir, df, dataset, batch_size, part_mem_fraction, engine, devices):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(conts + cats + label_name)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths[0], engine="parquet", part_mem_fraction=part_mem_fraction)
    data_itr = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        devices=devices,
    )

    columns = mycols_pq
    df_test = cudf.read_parquet(tar_paths[0])[columns]
    df_test.columns = [x for x in range(0, len(columns))]
    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(tar_paths[0])
    rows = 0
    # works with iterator alone, needs to test inside torch dataloader
    for idx, chunk in enumerate(data_itr):
        if devices is None:
            assert float(df_test.iloc[rows][0]) == float(chunk[0][0][0])
        rows += len(chunk[0])
        del chunk

    # accounts for incomplete batches at the end of chunks
    # that don't necessarily have the full batch_size
    assert rows == num_rows

    def gen_col(batch):
        batch = batch[0]
        return batch[0], batch[1], batch[2]

    t_dl = torch_dataloader.DLDataLoader(
        data_itr, collate_fn=gen_col, pin_memory=False, num_workers=0
    )
    rows = 0
    for idx, chunk in enumerate(t_dl):
        if devices is None:
            assert float(df_test.iloc[rows][0]) == float(chunk[0][0][0])
        rows += len(chunk[0])

    if os.path.exists(output_train):
        shutil.rmtree(output_train)
def test_tf_gpu_dl(tmpdir, datasets, batch_size, gpu_memory_frac, engine):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    cont_names = ["x", "y", "id"]
    cat_names = ["name-string"]
    label_name = ["label"]
    if engine == "parquet":
        cat_names.append("name-cat")

    columns = cont_names + cat_names

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        to_cpu=True,
    )
    processor.add_feature([ops.FillMedian()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    data_itr = tf_dataloader.KerasSequenceDataset(
        paths,
        columns=columns,
        batch_size=batch_size,
        buffer_size=gpu_memory_frac,
        label_name=label_name[0],
        engine=engine,
        shuffle=False,
    )
    processor.update_stats(data_itr.nvt_dataset, record_stats=True)
    data_itr.map(processor)

    rows = 0
    for idx in range(len(data_itr)):
        X, y = next(data_itr)

        # first elements to check epoch-to-epoch consistency
        if idx == 0:
            X0, y0 = X, y

        # check that we have at most batch_size elements
        num_samples = y.shape[0]
        assert num_samples <= batch_size

        # check that all the features in X have the
        # appropriate length and that the set of
        # their names is exactly the set of names in
        # `columns`
        these_cols = columns.copy()
        for column, x in X.items():
            try:
                these_cols.remove(column)
            except ValueError:
                raise AssertionError
            assert x.shape[0] == num_samples
        assert len(these_cols) == 0

        rows += num_samples

    # check start of next epoch to ensure consistency
    X, y = next(data_itr)
    assert (y.numpy() == y0.numpy()).all()

    for column, x in X.items():
        x0 = X0.pop(column)
        assert (x.numpy() == x0.numpy()).all()
    assert len(X0) == 0

    # accounts for incomplete batches at the end of chunks
    # that don't necessarily have the full batch_size
    assert (idx + 1) * batch_size >= rows
    assert rows == (60 * 24 * 3 + 1)
def test_empty_cols(tmpdir, engine, cat_names, mh_names, cont_names, label_name, num_rows):
    json_sample["num_rows"] = num_rows

    cols = datagen._get_cols_from_schema(json_sample)

    df_gen = datagen.DatasetGen(datagen.PowerLawDistro(0.1))
    dataset = df_gen.create_df(num_rows, cols)
    dataset = nvt.Dataset(dataset)

    features = []
    if cont_names:
        features.append(cont_names >> ops.FillMedian() >> ops.Normalize())
    if cat_names or mh_names:
        features.append(cat_names + mh_names >> ops.Categorify())

    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    graph = sum(features, nvt.WorkflowNode(label_name))
    processor = nvt.Workflow(graph)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    df_out = processor.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    if processor.output_node.output_schema.apply_inverse(ColumnSelector("lab_1")):
        # if we don't have conts/cats/labels we're done
        return

    data_itr = None

    with pytest.raises(ValueError) as exc_info:
        data_itr = torch_dataloader.TorchAsyncItr(
            nvt.Dataset(df_out),
            cats=cat_names + mh_names,
            conts=cont_names,
            labels=label_name,
            batch_size=2,
        )
    assert "Neither Categorical or Continuous columns were found by the dataloader. " in str(
        exc_info.value
    )

    if data_itr:
        for nvt_batch in data_itr:
            cats_conts, labels = nvt_batch
            if cat_names:
                assert set(cat_names).issubset(set(list(cats_conts.keys())))
            if cont_names:
                assert set(cont_names).issubset(set(list(cats_conts.keys())))

        if cat_names or cont_names or mh_names:
            emb_sizes = nvt.ops.get_embedding_sizes(processor)

            EMBEDDING_DROPOUT_RATE = 0.04
            DROPOUT_RATES = [0.001, 0.01]
            HIDDEN_DIMS = [1000, 500]
            LEARNING_RATE = 0.001
            model = Model(
                embedding_table_shapes=emb_sizes,
                num_continuous=len(cont_names),
                emb_dropout=EMBEDDING_DROPOUT_RATE,
                layer_hidden_dims=HIDDEN_DIMS,
                layer_dropout_rates=DROPOUT_RATES,
            ).cuda()
            optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

            def rmspe_func(y_pred, y):
                "Return y_pred and y to non-log space and compute RMSPE"
                y_pred, y = torch.exp(y_pred) - 1, torch.exp(y) - 1
                pct_var = (y_pred - y) / y
                return (pct_var ** 2).mean().pow(0.5)

            train_loss, y_pred, y = process_epoch(
                data_itr,
                model,
                train=True,
                optimizer=optimizer,
                amp=False,
            )
            train_rmspe = None
            train_rmspe = rmspe_func(y_pred, y)
            assert train_rmspe is not None
            assert len(y_pred) > 0
            assert len(y) > 0
def test_gpu_preproc(tmpdir, df, dataset, dump, gpu_memory_frac, engine, preprocessing):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    processor.add_feature([ops.FillMedian(), ops.LogOp(preprocessing=preprocessing)])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    processor.update_stats(dataset)

    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        ser_median = tar.dropna().quantile(0.5, interpolation="linear")
        gdf = tar.fillna(ser_median)
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std - No good right now we have to add all other changes; Zerofill, Log
    x_col = "x" if preprocessing else "x_LogOp"
    y_col = "y" if preprocessing else "y_LogOp"
    assert math.isclose(get_norms(df.x).mean(), processor.stats["means"][x_col], rel_tol=1e-2)
    assert math.isclose(get_norms(df.y).mean(), processor.stats["means"][y_col], rel_tol=1e-2)
    assert math.isclose(get_norms(df.x).std(), processor.stats["stds"][x_col], rel_tol=1e-2)
    assert math.isclose(get_norms(df.y).std(), processor.stats["stds"][y_col], rel_tol=1e-2)

    # Check median (TODO: Improve the accuracy)
    x_median = df.x.dropna().quantile(0.5, interpolation="linear")
    y_median = df.y.dropna().quantile(0.5, interpolation="linear")
    id_median = df.id.dropna().quantile(0.5, interpolation="linear")
    assert math.isclose(x_median, processor.stats["medians"]["x"], rel_tol=1e1)
    assert math.isclose(y_median, processor.stats["medians"]["y"], rel_tol=1e1)
    assert math.isclose(id_median, processor.stats["medians"]["id"], rel_tol=1e1)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(processor, "name-cat")
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(processor, "name-string")
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(tmpdir, dataset, nfiles=10, shuffle=True, apply_ops=True)

    processor.create_final_cols()

    # if preprocessing
    if not preprocessing:
        for col in cont_names:
            assert f"{col}_LogOp" in processor.columns_ctx["final"]["cols"]["continuous"]

    dlc = torch_dataloader.DLCollator(preproc=processor, apply_ops=False)
    data_files = [
        torch_dataloader.FileItrDataset(
            x, use_row_groups=True, gpu_memory_frac=gpu_memory_frac, names=allcols_csv
        )
        for x in glob.glob(str(tmpdir) + "/*.parquet")
    ]

    data_itr = torch.utils.data.ChainDataset(data_files)
    dl = torch_dataloader.DLDataLoader(
        data_itr, collate_fn=dlc.gdf_col, pin_memory=False, num_workers=0
    )

    len_df_pp = 0
    for chunk in dl:
        len_df_pp += len(chunk[0][0])

    dataset = Dataset(glob.glob(str(tmpdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac)

    x = processor.ds_to_tensors(dataset.to_iter(), apply_ops=False)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert len(x[0]) == len_df_pp

    itr_ds = torch_dataloader.TensorItrDataset([x[0], x[1], x[2]], batch_size=512000)
    count_tens_itr = 0
    for data_gd in itr_ds:
        count_tens_itr += len(data_gd[1])
        assert data_gd[0].shape[1] > 0
        assert data_gd[1].shape[1] > 0

    assert len_df_pp == count_tens_itr
def test_gpu_dl(tmpdir, df, dataset, batch_size, gpu_memory_frac, engine):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_feature([ops.FillMedian()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=True,
        output_path=output_train,
        num_out_files=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    data_itr = nvt.torch_dataloader.TorchTensorBatchDatasetItr(
        tar_paths[0],
        engine="parquet",
        sub_batch_size=batch_size,
        gpu_memory_frac=gpu_memory_frac,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        names=mycols_csv,
        sep="\t",
    )

    columns = mycols_pq if engine == "parquet" else mycols_csv
    df_test = cudf.read_parquet(tar_paths[0])[columns]
    df_test.columns = [x for x in range(0, len(columns))]
    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(tar_paths[0])
    rows = 0
    # works with iterator alone, needs to test inside torch dataloader
    for idx, chunk in enumerate(data_itr):
        assert float(df_test.iloc[rows][0]) == float(chunk[0][0][0])
        rows += len(chunk[0])
        del chunk

    # accounts for incomplete batches at the end of chunks
    # that don't necessarily have the full batch_size
    assert rows == num_rows

    def gen_col(batch):
        batch = batch[0]
        return batch[0], batch[1], batch[2]

    t_dl = nvt.torch_dataloader.DLDataLoader(
        data_itr, collate_fn=gen_col, pin_memory=False, num_workers=0
    )
    rows = 0
    for idx, chunk in enumerate(t_dl):
        assert float(df_test.iloc[rows][0]) == float(chunk[0][0][0])
        rows += len(chunk[0])

    if os.path.exists(output_train):
        shutil.rmtree(output_train)
def processing(
    self,
    X_pd,
    y_names=[],
    encode_categor_type=None,  # 'categorify', 'onehotencoding'
    outliers_detection_technique=None,  # 'iqr_proximity_rule', 'gaussian_approximation', 'quantiles'
    fill_with_value=None,  # 'extreme_values', 'zeros', 'mean-median'
    targetencoding=False,
    file_path=None,
):
    X = dd.from_pandas(X_pd, npartitions=self.n_gpus)
    X = X.replace(np.nan, None)

    try:
        self.time_columns
    except AttributeError:
        try:
            self.initialize_types(
                X,
                n_unique_val_th=n_unique_val_th_,
                categor_columns_keep=categor_columns_keep_,
                numer_columns_keep=numer_columns_keep_,
            )
        except NameError:
            self.initialize_types(X)

    workflow = nvt.Workflow(
        cat_names=self.categor_columns,
        cont_names=self.numer_columns,
        label_name=y_names,
        client=self.client,
    )

    # Operators: https://nvidia.github.io/NVTabular/main/api/ops/index.html

    # Categorify https://nvidia.github.io/NVTabular/main/api/ops/categorify.html
    if encode_categor_type == 'categorify':
        if len(self.categor_columns) != 0:
            workflow.add_preprocess(
                ops.Categorify(columns=self.categor_columns, out_path='./')
            )

    if encode_categor_type == 'onehotencoding':
        # OneHotEncoder().get_feature_names(input_features=<list of features encoded>) does not work
        # lengths=True - chunk sizes can be computed
        for column in self.categor_columns:
            # X[column] = X[column].astype(str)
            X_cat_encoded = OneHotEncoder().fit_transform(
                X[column].to_dask_array(lengths=True).reshape(-1, 1)
            )
            uvs = X[column].unique().compute().values
            X = X.drop([column], axis=1)
            X_cat_encoded = dd.from_array(X_cat_encoded.compute().todense())
            X_cat_encoded.columns = [column + '_{}'.format(uv) for uv in uvs]
            X = dd.concat([X, X_cat_encoded], axis=1)
        X = X.repartition(npartitions=2)

        for column in X.columns:
            if any(str(column)[-4:] == t for t in ['_nan', 'None']):  # What else?
                X = X.drop([column], axis=1)

        self.initialize_types(X)
        print('Retyping:', self.initialize_types(X))

        # Reinitialize workflow
        workflow = nvt.Workflow(
            cat_names=self.categor_columns,
            cont_names=self.numer_columns,
            label_name=y_names,
            client=self.client,
        )

    # Outlier detection via Clip https://nvidia.github.io/NVTabular/main/api/ops/clip.html
    if (len(self.numer_columns) != 0) and (outliers_detection_technique is not None):
        lower, upper = self.outldetect(outliers_detection_technique, X[self.numer_columns])
        for i in range(len(self.numer_columns)):
            logging.info(
                f'column: {self.numer_columns[i]}, lower: {lower[i]}, upper: {upper[i]}'
            )
            print(f'column: {self.numer_columns[i]}, lower: {lower[i]}, upper: {upper[i]}')
            workflow.add_preprocess(
                ops.Clip(
                    min_value=lower[i],
                    max_value=upper[i],
                    columns=[self.numer_columns[i]],
                )
            )

    # FillMissing https://nvidia.github.io/NVTabular/main/api/ops/fillmissing.html
    if fill_with_value == 'zeros':
        workflow.add_preprocess(
            ops.FillMissing(fill_val=0, columns=self.categor_columns + self.numer_columns)
        )

    if fill_with_value == 'extreme_values':
        extrim_values = {}
        if len(self.numer_columns) != 0:
            extrim_values.update(self.extrvalsdetect(X[self.numer_columns], 'numer_columns'))
        if len(self.categor_columns) != 0:
            extrim_values.update(self.extrvalsdetect(X[self.categor_columns], 'categor_columns'))
        logging.info(f'extrim_values: {extrim_values}')

        output = open('extrim_values', 'wb')
        pickle.dump(extrim_values, output)
        output.close()

        for fill_val, column in zip(list(extrim_values.values()), list(extrim_values.keys())):
            workflow.add_preprocess(ops.FillMissing(fill_val=fill_val, columns=[column]))

    if fill_with_value == 'mean-median':
        if len(self.categor_columns) != 0:
            workflow.add_preprocess(
                ops.FillMedian(columns=self.categor_columns, preprocessing=True, replace=True)
            )
        if len(self.numer_columns) != 0:
            means = list(
                dd.from_pandas(X[self.numer_columns], npartitions=self.n_gpus)
                .mean()
                .compute()
                .values
            )
            for fill_val, column in zip(means, self.numer_columns):
                workflow.add_preprocess(ops.FillMissing(fill_val=fill_val, columns=[column]))

    if targetencoding:
        # https://nvidia.github.io/NVTabular/main/api/ops/targetencoding.html
        if len(self.y_names) != 0:
            if len(self.cat_groups) == 0:
                print('\n Target encoding will be applied to all categorical columns')
                workflow.add_preprocess(
                    ops.TargetEncoding(
                        cat_groups=self.categor_columns, cont_target=self.y_names
                    )
                )
            else:
                workflow.add_preprocess(
                    ops.TargetEncoding(cat_groups=self.cat_groups, cont_target=self.y_names)
                )

    # -----------------------------------------------------------------------------------------
    workflow.finalize()

    dataset = nvt.Dataset(X)
    tmp_output_path = "./parquet_data_tmp"
    workflow.apply(
        dataset,
        output_format="parquet",
        output_path=tmp_output_path,
        shuffle=Shuffle.PER_WORKER,  # Shuffle algorithm
        out_files_per_proc=1,  # Number of output files per worker
    )

    files = glob.glob(tmp_output_path + "/*.parquet")
    X_final = cudf.read_parquet(files[0])
    for i in range(1, len(files)):
        X_final = X_final.append(cudf.read_parquet(files[i]))

    # Delete temporary files
    shutil.rmtree(tmp_output_path, ignore_errors=True)

    # if len(self.rest_col_names) != 0:
    #     print(1)
    #     X_final = pd.concat([X_final.to_pandas(), X_pd[self.rest_col_names]], axis=1)

    if file_path is not None:
        X_final.to_csv(file_path, index=False)

    return X_final
def test_gpu_dl(tmpdir, datasets, batch_size, gpu_memory_frac, engine):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    else:
        df1 = cudf.read_csv(paths[0], header=False, names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=False, names=allcols_csv)[mycols_csv]
    df = cudf.concat([df1, df2], axis=0)
    df["id"] = df["id"].astype("int64")

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
        columns = mycols_pq
    else:
        cat_names = ["name-string"]
        columns = mycols_csv
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        to_cpu=True,
    )
    processor.add_feature([ops.FillMedian()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())

    data_itr = nvtabular.io.GPUDatasetIterator(
        paths,
        columns=columns,
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
        names=allcols_csv,
    )

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        data_itr,
        apply_offline=True,
        record_stats=True,
        shuffle=True,
        output_path=output_train,
        num_out_files=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    data_itr = nvt.torch_dataloader.TorchTensorBatchDatasetItr(
        tar_paths[0],
        engine="parquet",
        sub_batch_size=batch_size,
        gpu_memory_frac=gpu_memory_frac,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        names=mycols_csv,
        sep="\t",
    )

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(tar_paths[0])
    rows = 0
    for idx, chunk in enumerate(data_itr):
        rows += len(chunk)
        del chunk

    # accounts for incomplete batches at the end of chunks
    # that don't necessarily have the full batch_size
    assert (idx + 1) * batch_size >= rows
    assert rows == num_rows
    if os.path.exists(output_train):
        shutil.rmtree(output_train)