Example #1
0
def test_inspect_datagen(tmpdir, datasets, engine, dist):
    """Round-trip check: inspect a dataset, generate a synthetic dataset from
    the inspected schema, inspect again, and verify both reports agree."""
    # Files of the source dataset for this engine
    file_paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    # Column-type configuration handed to the inspector
    cat_cols = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    columns_dict = {"cats": cat_cols, "conts": ["x", "y"], "labels": ["label"]}

    # First inspection pass over the original data
    output_inspect1 = tmpdir + "/dataset_info1.json"
    inspector = datains.DatasetInspector()
    inspector.inspect(Dataset(file_paths, engine=engine), columns_dict, output_inspect1)
    assert os.path.isfile(output_inspect1)

    # Generate a synthetic dataset from the inspected schema
    output_datagen = tmpdir + "/datagen"
    os.mkdir(output_datagen)
    with fsspec.open(output_inspect1) as f:
        output1 = json.load(f)
    cols = datagen._get_cols_from_schema(output1)
    distro = datagen.UniformDistro() if dist == "uniform" else datagen.PowerLawDistro(0.1)
    df_gen = datagen.DatasetGen(distro, gpu_frac=0.00001)
    output_datagen_files = df_gen.full_df_create(
        output1["num_rows"], cols, entries=True, output=output_datagen
    )

    # Second inspection pass over the generated data
    output_inspect2 = tmpdir + "/dataset_info2.json"
    inspector.inspect(Dataset(output_datagen_files, engine=engine), columns_dict, output_inspect2)
    assert os.path.isfile(output_inspect2)

    # Compare both json reports key by key
    with fsspec.open(output_inspect2) as f:
        output2 = json.load(f)
    for k1, section in output1.items():
        if k1 == "num_rows":
            assert output2[k1] == section
            continue
        for k2, col_info in section.items():
            for k3, val in col_info.items():
                other = output2[k1][k2][k3]
                if k3 == "dtype":
                    if val == "object":
                        # object columns may come back as int64 after generation
                        assert val == other or other == "int64"
                    else:
                        assert val == other
                else:
                    assert val == pytest.approx(other, rel=1e-0, abs=1e-0)
Example #2
0
class FileItrDataset(torch.utils.data.IterableDataset):
    """Iterable torch dataset that streams chunks from a single file via
    the Dataset chunk iterator."""

    # Underlying chunk iterator; populated in __init__
    gpu_itr = None

    def __init__(self, file, **kwargs):
        cols = kwargs.pop("columns", None)
        self.gpu_itr = Dataset(file, **kwargs).to_iter(columns=cols)

    def __iter__(self):
        # Delegate iteration directly to the chunk iterator
        return iter(self.gpu_itr)

    def __len__(self):
        return len(self.gpu_itr)
Example #3
0
 def __init__(self,
              path,
              sub_batch_size=1,
              cats=None,
              conts=None,
              labels=None,
              pin_memory=False,
              **kwargs):
     """Build a chunked iterator over *path*, restricted to the categorical,
     continuous and label columns the loader actually consumes."""
     self.apply_ops = kwargs.get("apply_ops", False)
     self.cat_cols = cats
     self.cont_cols = conts
     self.label_cols = labels
     self.batch_size = sub_batch_size
     # Only pull the columns we use; assumes cats/conts/labels are lists
     wanted_cols = cats + conts + labels
     self.itr = Dataset(path, **kwargs).to_iter(columns=wanted_cols)
     self.num_chunks = len(self.itr)
Example #4
0
def main(args):
    """Run the dataset inspector under a managed dask client.

    Device memory budgets are derived from the total device memory and the
    fractional arguments, the column config is read from args.config_file,
    and the inspection report is written to args.output_file.
    """
    # Derive absolute byte budgets from the fractional CLI arguments
    total_mem = device_mem_size(kind="total")
    device_limit = int(args.device_limit_frac * total_mem)
    device_pool_size = int(args.device_pool_frac * total_mem)
    part_size = int(args.part_mem_frac * total_mem)

    # Column configuration for the inspector
    with fsspec.open(args.config_file) as f:
        config = json.load(f)

    dataset = Dataset(args.data_path, engine=args.format, part_size=part_size)

    # Inspect inside a managed client so the RMM pool is set up and torn down
    with managed_client(args.devices, device_limit, args.protocol) as client:
        setup_rmm_pool(client, device_pool_size)
        inspector = datains.DatasetInspector(client)
        inspector.inspect(dataset, config, args.output_file)
Example #5
0
def test_full_df(num_rows, tmpdir, distro):
    """Generate a full synthetic dataset from json_sample and verify row
    count, column count, value distributions and string-entry stats."""
    json_sample["num_rows"] = num_rows  # NOTE: mutates the shared module-level sample
    cats = list(json_sample["cats"].keys())
    cols = datagen._get_cols_from_schema(json_sample, distros=distro)

    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.00001)
    df_files = df_gen.full_df_create(num_rows, cols, entries=True, output=tmpdir)
    test_size = 0
    full_df = _make_df()
    # Concatenate every generated file while counting total rows
    for fi in df_files:
        df = Dataset(fi).to_ddf().compute()
        test_size = test_size + df.shape[0]
        full_df = _concat([full_df, df])
    assert test_size == num_rows
    conts_rep = cols["conts"]
    cats_rep = cols["cats"]
    labels_rep = cols["labels"]
    # NOTE(review): `df` here is the frame from the LAST generated file only —
    # presumably every file has the full column set; confirm that is intended
    assert df.shape[1] == len(conts_rep) + len(cats_rep) + len(labels_rep)
    # cats[0] is the multi-hot column (checked separately below); verify the rest
    for idx, cat in enumerate(cats[1:]):
        dist = cats_rep[idx + 1].distro or df_gen.dist
        if HAS_GPU:
            # cuDF path: skip string columns; distribution check needs a host copy
            if not _is_string_dtype(full_df[cat]._column):
                sts, ps = dist.verify(full_df[cat].to_pandas())
                assert all(s > 0.9 for s in sts)
        else:
            if not _is_string_dtype(full_df[cat]):
                sts, ps = dist.verify(full_df[cat])
                assert all(s > 0.9 for s in sts)
        # these are not mh series
        # NOTE(review): stats compare against cats_rep[0] rather than
        # cats_rep[idx + 1] — presumably all cat columns share one config; verify
        assert full_df[cat].nunique() == cats_rep[0].cardinality
        assert full_df[cat].str.len().min() == cats_rep[0].min_entry_size
        assert full_df[cat].str.len().max() == cats_rep[0].max_entry_size
    # check the mh list here cat 0 only
    if HAS_GPU:
        check_ser = _make_df(list(full_df[cats[0]]._column.elements.values_host))[0]
    else:
        check_ser = _pull_apart_list(full_df[cats[0]])[0]
    assert check_ser.nunique() == cats_rep[0].cardinality
    assert check_ser.str.len().min() == cats_rep[0].min_entry_size
    assert check_ser.str.len().max() == cats_rep[0].max_entry_size
Example #6
0
# Build the preprocessing pipeline: in-place feature ops, then normalization
# and categorical encoding.
proc.add_feature([ZeroFill(replace=True), LogOp(replace=True)])
proc.add_preprocess(Normalize(replace=True))
# freq_thresh == 0 selects plain Categorify; otherwise frequency-capped encoding
if int(args.freq_thresh) == 0:
    proc.add_preprocess(Categorify(replace=True, out_path=args.out_dir))
else:
    proc.add_preprocess(
        Categorify(
            replace=True,
            use_frequency=True,
            freq_threshold=int(args.freq_thresh),
            out_path=args.out_dir,
        ))
print("Creating Dataset Iterator")
# csv input is tab-separated; other formats need no extra reader kwargs
dataset_args = {"sep": "\t"} if args.in_file_type == "csv" else {}
trains_ds = Dataset(train_set,
                    engine=args.in_file_type,
                    part_mem_fraction=float(args.gpu_mem_frac),
                    **dataset_args)
valids_ds = Dataset(valid_set,
                    engine=args.in_file_type,
                    part_mem_fraction=float(args.gpu_mem_frac),
                    **dataset_args)
print("Running apply")

out_train = os.path.join(args.out_dir, "train")
out_valid = os.path.join(args.out_dir, "valid")

# Time the offline stats-gathering + transform pass over the training data
start = time()
proc.apply(
    trains_ds,
    apply_offline=True,
    record_stats=True,
Example #7
0
def test_gpu_preproc(tmpdir, df, dataset, dump, gpu_memory_frac, engine,
                     preprocessing):
    """End-to-end Workflow test: gather stats, verify them against
    hand-computed references, write the processed dataset out, and read it
    back through the torch dataloader utilities."""
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)

    processor.add_feature(
        [ops.FillMedian(),
         ops.LogOp(preprocessing=preprocessing)])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    processor.update_stats(dataset)

    # Optionally round-trip the gathered stats through a yaml file
    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        # Reference implementation of FillMedian followed by LogOp
        ser_median = tar.dropna().quantile(0.5, interpolation="linear")
        gdf = tar.fillna(ser_median)
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std - No good right now we have to add all other changes; Zerofill, Log
    # When LogOp ran with preprocessing=True the stats keep the plain column
    # name; otherwise the op suffix is appended
    x_col = "x" if preprocessing else "x_LogOp"
    y_col = "y" if preprocessing else "y_LogOp"
    assert math.isclose(get_norms(df.x).mean(),
                        processor.stats["means"][x_col],
                        rel_tol=1e-2)
    assert math.isclose(get_norms(df.y).mean(),
                        processor.stats["means"][y_col],
                        rel_tol=1e-2)
    assert math.isclose(get_norms(df.x).std(),
                        processor.stats["stds"][x_col],
                        rel_tol=1e-2)
    assert math.isclose(get_norms(df.y).std(),
                        processor.stats["stds"][y_col],
                        rel_tol=1e-2)

    # Check median (TODO: Improve the accuracy)
    x_median = df.x.dropna().quantile(0.5, interpolation="linear")
    y_median = df.y.dropna().quantile(0.5, interpolation="linear")
    id_median = df.id.dropna().quantile(0.5, interpolation="linear")
    assert math.isclose(x_median, processor.stats["medians"]["x"], rel_tol=1e1)
    assert math.isclose(y_median, processor.stats["medians"]["y"], rel_tol=1e1)
    assert math.isclose(id_median,
                        processor.stats["medians"]["id"],
                        rel_tol=1e1)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(processor, "name-cat")
        # leading None accounts for the null category added on host transfer
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(processor, "name-string")
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    #     Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(tmpdir,
                               dataset,
                               nfiles=10,
                               shuffle=True,
                               apply_ops=True)

    processor.create_final_cols()

    # if preprocessing
    if not preprocessing:
        # LogOp outputs should survive as extra final continuous columns
        for col in cont_names:
            assert f"{col}_LogOp" in processor.columns_ctx["final"]["cols"][
                "continuous"]

    dlc = torch_dataloader.DLCollator(preproc=processor, apply_ops=False)
    data_files = [
        torch_dataloader.FileItrDataset(x,
                                        use_row_groups=True,
                                        gpu_memory_frac=gpu_memory_frac,
                                        names=allcols_csv)
        for x in glob.glob(str(tmpdir) + "/*.parquet")
    ]

    data_itr = torch.utils.data.ChainDataset(data_files)
    dl = torch_dataloader.DLDataLoader(data_itr,
                                       collate_fn=dlc.gdf_col,
                                       pin_memory=False,
                                       num_workers=0)

    # Count rows seen by the dataloader over all written parquet files
    len_df_pp = 0
    for chunk in dl:
        len_df_pp += len(chunk[0][0])

    dataset = Dataset(glob.glob(str(tmpdir) + "/*.parquet"),
                      part_mem_fraction=gpu_memory_frac)
    x = processor.ds_to_tensors(dataset.to_iter(), apply_ops=False)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert len(x[0]) == len_df_pp

    # Re-batch the tensors and verify the total row count is preserved
    itr_ds = torch_dataloader.TensorItrDataset([x[0], x[1], x[2]],
                                               batch_size=512000)
    count_tens_itr = 0
    for data_gd in itr_ds:
        count_tens_itr += len(data_gd[1])
        assert data_gd[0].shape[1] > 0
        assert data_gd[1].shape[1] > 0

    assert len_df_pp == count_tens_itr
Example #8
0
def test_gpu_workflow_api(tmpdir, client, df, dataset, gpu_memory_frac, engine,
                          dump, op_columns, use_client):
    """Workflow built via the add_feature/add_preprocess API: verify gathered
    stats, categorical encodings, and the written parquet dataset."""
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        client=client if use_client else None,
    )

    processor.add_feature([ops.ZeroFill(columns=op_columns), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify(cat_cache="host"))
    processor.finalize()

    processor.update_stats(dataset)

    # Optionally round-trip the gathered stats through a yaml file
    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        # Reference implementation of ZeroFill followed by LogOp
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std - No good right now we have to add all other changes; Zerofill, Log

    # When op_columns restricts ZeroFill, only "x" stats are comparable
    if not op_columns:
        assert math.isclose(get_norms(df.y).mean(),
                            processor.stats["means"]["y"],
                            rel_tol=1e-1)
        assert math.isclose(get_norms(df.y).std(),
                            processor.stats["stds"]["y"],
                            rel_tol=1e-1)
    assert math.isclose(get_norms(df.x).mean(),
                        processor.stats["means"]["x"],
                        rel_tol=1e-1)
    assert math.isclose(get_norms(df.x).std(),
                        processor.stats["stds"]["x"],
                        rel_tol=1e-1)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(processor, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(processor, "name-string")
    # adding the None entry as a string because of move from gpu
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(tmpdir,
                               dataset,
                               out_files_per_proc=10,
                               shuffle="partial",
                               apply_ops=True)

    # Read the processed parquet back and validate dtypes and row count
    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"),
                        part_mem_fraction=gpu_memory_frac)

    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
Example #9
0
def test_gpu_workflow_config(tmpdir, client, df, dataset, gpu_memory_frac,
                             engine, dump, replace):
    """Workflow driven by a config dict with chained (dependent) ops: verify
    gathered stats, categorical encodings, and the written parquet dataset."""
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    config = nvt.workflow.get_new_config()
    # add operators with dependencies
    config["FE"]["continuous"] = [[
        ops.FillMissing(replace=replace),
        ops.LogOp(replace=replace)
    ]]
    config["PP"]["continuous"] = [[
        ops.LogOp(replace=replace),
        ops.Normalize()
    ]]
    config["PP"]["categorical"] = [ops.Categorify()]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
        client=client,
    )

    processor.update_stats(dataset)

    # Optionally round-trip the gathered stats through a yaml file
    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        # Reference implementation of FillMissing (median) followed by LogOp
        ser_median = tar.dropna().quantile(0.5, interpolation="linear")
        gdf = tar.fillna(ser_median)
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std - No good right now we have to add all other changes; Zerofill, Log

    # With replace=False the ops append their names to the stats keys
    concat_ops = "_FillMissing_LogOp"
    if replace:
        concat_ops = ""
    assert math.isclose(get_norms(df.x).mean(),
                        processor.stats["means"]["x" + concat_ops],
                        rel_tol=1e-1)
    assert math.isclose(get_norms(df.y).mean(),
                        processor.stats["means"]["y" + concat_ops],
                        rel_tol=1e-1)

    assert math.isclose(get_norms(df.x).std(),
                        processor.stats["stds"]["x" + concat_ops],
                        rel_tol=1e-1)
    assert math.isclose(get_norms(df.y).std(),
                        processor.stats["stds"]["y" + concat_ops],
                        rel_tol=1e-1)
    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(processor, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(processor, "name-string")
    # adding the None entry as a string because of move from gpu
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(tmpdir,
                               dataset,
                               out_files_per_proc=10,
                               shuffle="partial",
                               apply_ops=True)

    # Read the processed parquet back and validate dtypes and row count
    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"),
                        part_mem_fraction=gpu_memory_frac)

    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
Example #10
0
def test_gpu_workflow(tmpdir, client, df, dataset, gpu_memory_frac, engine,
                      dump):
    """Config-driven Workflow (ZeroFill + Normalize + Categorify): verify
    gathered stats, categorical encodings, and the written parquet output."""
    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    config = nvt.workflow.get_new_config()
    config["FE"]["continuous"] = [ops.ZeroFill()]
    config["PP"]["continuous"] = [[ops.ZeroFill(), ops.Normalize()]]
    config["PP"]["categorical"] = [ops.Categorify()]

    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name,
                             config=config,
                             client=client)

    processor.update_stats(dataset)
    if dump:
        # Round-trip the gathered stats through a yaml dump before checking
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        # Reference ZeroFill: nulls become 0, negatives are zeroed out
        filled = tar.fillna(0)
        return filled * (filled >= 0).astype("int")

    # Mean/std of the reference series must match the recorded stats
    ref_x = get_norms(df.x)
    ref_y = get_norms(df.y)
    assert math.isclose(ref_x.mean(),
                        processor.stats["means"]["x"],
                        rel_tol=1e-4)
    assert math.isclose(ref_y.mean(),
                        processor.stats["means"]["y"],
                        rel_tol=1e-4)
    assert math.isclose(ref_x.std(),
                        processor.stats["stds"]["x"],
                        rel_tol=1e-3)
    assert math.isclose(ref_y.std(),
                        processor.stats["stds"]["y"],
                        rel_tol=1e-3)

    # Check that categories match
    if engine == "parquet":
        expected_cat = df["name-cat"].unique().values_host
        # the leading None entry comes from the move off the gpu
        assert get_cats(processor, "name-cat").tolist() == [None] + expected_cat.tolist()
    expected_str = df["name-string"].unique().values_host
    # the leading None entry comes from the move off the gpu
    assert get_cats(processor, "name-string").tolist() == [None] + expected_str.tolist()

    # Write to a new "shuffled" and "processed" dataset, then read it back
    processor.write_to_dataset(tmpdir,
                               dataset,
                               out_files_per_proc=10,
                               shuffle="partial",
                               apply_ops=True)

    reread = Dataset(glob.glob(str(tmpdir) + "/*.parquet"),
                     part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(reread.to_iter()), axis=0)

    # Categoricals must have been encoded to integers
    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    # Row count in the parquet metadata must agree with the concatenated frame
    meta = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert meta[0] == len(df_pp)
Example #11
0
 def __init__(self, file, **kwargs):
     """Open *file* as a Dataset and keep its chunk iterator, restricted to
     the optional "columns" kwarg."""
     cols = kwargs.pop("columns", None)
     self.gpu_itr = Dataset(file, **kwargs).to_iter(columns=cols)
def preprocess_criteo_parquet(
    input_path: str,
    output_path: str,
    client,
    frequency_threshold: int,
):
    """Preprocess the Criteo parquet dataset: log-transform continuous
    features, Categorify categoricals, and write train/validation/test splits.

    Args:
        input_path: directory holding the day_* parquet files.
        output_path: directory for the processed splits and category metadata.
        client: dask client handle (not used directly here; kept for API
            parity with callers).
        frequency_threshold: minimum category frequency for Categorify.
    """
    train_days = [str(x) for x in CRITEO_TRAIN_DAYS]
    train_files = [
        os.path.join(input_path, x) for x in os.listdir(input_path)
        if x.startswith("day") and x.split(".")[0].split("_")[-1] in train_days
    ]
    # day 23 is held out and split into validation (part2) and test (part1)
    valid_file = os.path.join(input_path, "day_23.part2.parquet")
    test_file = os.path.join(input_path, "day_23.part1.parquet")

    all_set = train_files + [valid_file] + [test_file]

    print(all_set, train_files, valid_file, test_file)
    print("Creating Workflow Object")

    workflow = Workflow(cat_names=CRITEO_CATEGORICAL_COLUMNS,
                        cont_names=CRITEO_CONTINUOUS_COLUMNS,
                        label_name=CRITEO_CLICK_COLUMNS)

    # We want to assign 0 to all missing values, and calculate log(x+3) for present values
    # so if we set missing values to -2, then the result of log(1+2+(-2)) would be 0
    workflow.add_cont_feature([
        FillMissing(fill_val=-2.0),
        LambdaOp(op_name='Add3ButMinusOneCauseLogAddsOne',
                 f=lambda col, _: col.add(2.0)),
        LogOp(),  # Log(1+x)
    ])

    workflow.add_cat_preprocess(
        Categorify(freq_threshold=frequency_threshold, out_path=output_path))

    workflow.finalize()

    print("Creating Dataset Iterator")
    all_ds = Dataset(all_set,
                     engine="parquet",
                     part_mem_fraction=ALL_DS_MEM_FRAC)
    trains_ds = Dataset(train_files,
                        engine="parquet",
                        part_mem_fraction=TRAIN_DS_MEM_FRAC)
    # BUGFIX: the validation/test memory fractions were swapped — valid_ds
    # used TEST_DS_MEM_FRAC and test_ds used VALID_DS_MEM_FRAC.
    valid_ds = Dataset(valid_file,
                       engine="parquet",
                       part_mem_fraction=VALID_DS_MEM_FRAC)
    test_ds = Dataset(test_file,
                      engine="parquet",
                      part_mem_fraction=TEST_DS_MEM_FRAC)

    print("Running apply")
    out_train = os.path.join(output_path, "train")
    out_valid = os.path.join(output_path, "validation")
    out_test = os.path.join(output_path, "test")

    # Gather statistics once over the full dataset, then apply to each split
    start = time()
    workflow.update_stats(all_ds)
    print(f"Gathering statistics time: {time() - start}")

    start = time()
    workflow.apply(trains_ds, record_stats=False, output_path=out_train)
    print(f"train preprocess time: {time() - start}")

    start = time()
    workflow.apply(valid_ds, record_stats=False, output_path=out_valid)
    print(f"valid preprocess time: {time() - start}")

    start = time()
    workflow.apply(test_ds, record_stats=False, output_path=out_test)
    print(f"test preprocess time: {time() - start}")

    save_model_size_config(workflow, output_path)