Example 1
def test_spec_set(tmpdir, client):
    gdf_test = cudf.DataFrame({
        "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
        "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
        "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
        "cont": [1, 2, np.nan, 2, 1, 3, 3, 1],
        "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
    })

    p = nvt.Workflow(
        cat_names=["ad_id", "source_id", "platform"],
        cont_names=["cont"],
        label_name=["clicked"],
        client=client,
    )
    p.add_feature(ops.FillMissing())
    p.add_feature(ops.Normalize())
    p.add_feature(ops.Categorify())
    p.add_feature(
        ops.TargetEncoding(
            cat_groups=["ad_id", "source_id", "platform"],
            cont_target="clicked",
            kfold=5,
            fold_seed=42,
            p_smooth=20,
        ))

    p.apply(nvt.Dataset(gdf_test), record_stats=True)
    assert p.stats
Example 2
def test_normalize(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    cont_features = op_columns >> ops.Normalize()
    processor = nvtabular.Workflow(cont_features)
    processor.fit(dataset)

    new_gdf = processor.transform(dataset).to_ddf().compute()
    new_gdf.index = df.index  # Make sure index is aligned for checks
    for col in op_columns:
        assert math.isclose(df[col].mean(),
                            processor.column_group.op.means[col],
                            rel_tol=1e-4)
        assert math.isclose(df[col].std(),
                            processor.column_group.op.stds[col],
                            rel_tol=1e-4)
        df[col] = (df[col] - processor.column_group.op.means[col]
                   ) / processor.column_group.op.stds[col]
        assert np.all((df[col] - new_gdf[col]).abs().values <= 1e-2)

    # our normalize op also works on dicts of cupy/numpy tensors. make sure this works like we'd
    # expect
    df = dataset.compute()
    cupy_inputs = {col: df[col].values for col in op_columns}
    cupy_outputs = cont_features.op.transform(op_columns, cupy_inputs)
    for col in op_columns:
        assert np.allclose(cupy_outputs[col], new_gdf[col].values)
Example 3
def test_normalize(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_name = ["label"]

    config = nvt.workflow.get_new_config()
    config["PP"]["continuous"] = [ops.Moments(columns=op_columns)]

    processor = nvtabular.Workflow(cat_names=cat_names,
                                   cont_names=cont_names,
                                   label_name=label_name,
                                   config=config)

    processor.update_stats(dataset)

    op = ops.Normalize()

    columns_ctx = {}
    columns_ctx["continuous"] = {}
    columns_ctx["continuous"]["base"] = op_columns or cont_names

    new_gdf = op.apply_op(df,
                          columns_ctx,
                          "continuous",
                          stats_context=processor.stats)
    df["x"] = (df["x"] -
               processor.stats["means"]["x"]) / processor.stats["stds"]["x"]
    assert new_gdf["x"].equals(df["x"])
Example 4
def test_dask_normalize(client, tmpdir, datasets, engine):

    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    normalize = ops.Normalize()
    conts = cont_names >> ops.FillMissing() >> normalize
    workflow = Workflow(conts + cat_names + label_name, client=client)

    dataset = Dataset(paths, engine)
    result = workflow.fit_transform(dataset).to_ddf().compute()

    # Make sure we collected accurate statistics
    means = df0[cont_names].mean()
    stds = df0[cont_names].std()
    for name in cont_names:
        assert math.isclose(means[name], normalize.means[name], rel_tol=1e-3)
        assert math.isclose(stds[name], normalize.stds[name], rel_tol=1e-3)

    # New (normalized) means should all be close to zero
    new_means = result[cont_names].mean()
    for name in cont_names:
        assert abs(new_means[name]) < 1e-3
Example 5
def test_s3_dataset(s3, paths, engine, df):
    # create a mocked out bucket here
    bucket = "testbucket"
    s3.create_bucket(Bucket=bucket)

    s3_paths = []
    for path in paths:
        s3_path = f"s3://{bucket}/{path}"
        with fsspec.open(s3_path, "wb") as f:
            f.write(open(path, "rb").read())
        s3_paths.append(s3_path)

    # create a basic s3 dataset
    dataset = nvt.Dataset(s3_paths)

    # make sure the iteration API works
    columns = mycols_pq if engine == "parquet" else mycols_csv
    gdf = cudf.concat(list(dataset.to_iter()))[columns]
    assert_eq(gdf.reset_index(drop=True), df.reset_index(drop=True))

    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    processor.add_feature([ops.FillMissing(), ops.Clip(min_value=0), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify(cat_cache="host"))
    processor.finalize()

    processor.update_stats(dataset)
Example 6
def test_parquet_output(client, use_client, tmpdir, shuffle):
    out_files_per_proc = 2
    n_workers = len(client.cluster.workers) if use_client else 1
    out_path = str(tmpdir.mkdir("processed"))
    path = str(tmpdir.join("simple.parquet"))

    size = 25
    row_group_size = 5
    df = pd.DataFrame({"a": np.arange(size)})
    df.to_parquet(path, row_group_size=row_group_size, engine="pyarrow")

    columns = ["a"]
    dataset = nvt.Dataset(path, engine="parquet", row_groups_per_part=1)

    workflow = nvt.Workflow(columns >> ops.Normalize(),
                            client=client if use_client else None)
    workflow.fit_transform(dataset).to_parquet(
        output_path=out_path,
        shuffle=shuffle,
        out_files_per_proc=out_files_per_proc)

    # Check that the number of output files is correct
    result = glob.glob(os.path.join(out_path, "*.parquet"))
    assert len(result) == out_files_per_proc * n_workers

    # Make sure _metadata exists
    meta_path = os.path.join(out_path, "_metadata")
    assert os.path.exists(meta_path)

    # Make sure _metadata makes sense
    _metadata = cudf.io.read_parquet_metadata(meta_path)
    assert _metadata[0] == size
    assert _metadata[2] == columns
Example 7
def test_empty_cols(tmpdir, df, dataset, engine, cat_names, cont_names, label_name):
    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    # first with no continuous columns
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    processor.add_feature([ops.FillMedian()])
    processor.add_feature(ops.Normalize())
    processor.add_feature(ops.Categorify())

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_format=None,
    )
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name, batch_size=1
    )

    for nvt_batch in data_itr:
        cats, conts, labels = nvt_batch
        if cat_names:
            assert cats.shape[-1] == len(cat_names)
        if cont_names:
            assert conts.shape[-1] == len(cont_names)
        if label_name:
            assert labels.shape[-1] == len(label_name)
Example 8
def test_normalize_upcastfloat64(tmpdir, dataset, gpu_memory_frac, engine,
                                 op_columns):
    df = cudf.DataFrame(
        {
            "x": [1.9e10, 2.3e16, 3.4e18, 1.6e19],
            "label": [1, 0, 1, 0]
        },
        dtype="float32")

    cont_features = op_columns >> ops.Normalize()
    processor = nvtabular.Workflow(cont_features)
    dataset = nvt.Dataset(df)
    processor.fit(dataset)

    new_gdf = processor.transform(dataset).to_ddf().compute()

    for col in op_columns:
        assert math.isclose(df[col].mean(),
                            processor.column_group.op.means[col],
                            rel_tol=1e-4)
        assert math.isclose(df[col].std(),
                            processor.column_group.op.stds[col],
                            rel_tol=1e-4)
        df[col] = (df[col] - processor.column_group.op.means[col]
                   ) / processor.column_group.op.stds[col]
        assert np.all((df[col] - new_gdf[col]).abs().values <= 1e-2)
Example 9
def test_error_handling(tmpdir):
    df = _make_df({"x": np.arange(10), "y": np.arange(10)})

    def custom_transform(col):
        if len(col) == 2:
            raise ValueError("Lets cause some problems")
        return col

    features = ["x", "y"
                ] >> ops.FillMissing() >> ops.Normalize() >> custom_transform
    workflow = nvt.Workflow(features)
    workflow.fit(nvt.Dataset(df))

    model_name = "test_error_handling"
    triton.generate_nvtabular_model(workflow,
                                    model_name,
                                    tmpdir + f"/{model_name}",
                                    backend=BACKEND)

    with run_triton_server(tmpdir) as client:
        inputs = triton.convert_df_to_triton_input(["x", "y"], df[:2])
        with pytest.raises(
                tritonclient.utils.InferenceServerException) as exception_info:
            client.infer(model_name, inputs)

        assert "ValueError: Lets cause some problems" in str(
            exception_info.value)
Example 10
def test_dask_normalize(client, tmpdir, datasets, engine):

    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(ops.Normalize())
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    # Make sure we collected accurate statistics
    means = df0[cont_names].mean()
    stds = df0[cont_names].std()
    counts = df0[cont_names].count()
    for name in cont_names:
        assert math.isclose(means[name], processor.stats["means"][name], rel_tol=1e-3)
        assert math.isclose(stds[name], processor.stats["stds"][name], rel_tol=1e-3)
        assert math.isclose(counts[name], processor.stats["counts"][name], rel_tol=1e-3)

    # New (normalized) means should all be close to zero
    new_means = result[cont_names].mean()
    for name in cont_names:
        assert abs(new_means[name]) < 1e-3
Example 11
def test_schema_write_read_dataset(tmpdir, dataset, engine):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    cat_features = cat_names >> ops.Categorify(cat_cache="host")
    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(
        min_value=0) >> ops.LogOp >> norms

    workflow = Workflow(cat_features + cont_features + label_name)

    workflow.fit(dataset)
    workflow.transform(dataset).to_parquet(
        tmpdir,
        out_files_per_proc=10,
    )

    schema_path = Path(tmpdir)
    proto_schema = PbTxt_SchemaWriter._read(schema_path / "schema.pbtxt")
    new_dataset = Dataset(glob.glob(str(tmpdir) + "/*.parquet"))
    assert """name: "name-cat"\n    min: 0\n    max: 27\n""" in str(
        proto_schema)
    assert new_dataset.schema == workflow.output_schema
Example 12
def test_empty_cols(tmpdir, df, dataset, engine, cat_names, cont_names, label_name):

    features = []
    if cont_names:
        features.append(cont_names >> ops.FillMedian() >> ops.Normalize())
    if cat_names:
        features.append(cat_names >> ops.Categorify())

    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    graph = sum(features, nvt.ColumnGroup(label_name))
    if not graph.columns:
        # if we don't have conts/cats/labels we're done
        return

    processor = nvt.Workflow(graph)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    df_out = processor.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name, batch_size=1
    )

    for nvt_batch in data_itr:
        cats, conts, labels = nvt_batch
        if cat_names:
            assert cats.shape[-1] == len(cat_names)
        if cont_names:
            assert conts.shape[-1] == len(cont_names)
        if label_name:
            assert labels.shape[-1] == len(label_name)
Example 13
def test_normalize_upcastfloat64(tmpdir, dataset, gpu_memory_frac, engine, op_columns):
    df = cudf.DataFrame(
        {"x": [1.9e10, 2.3e16, 3.4e18, 1.6e19], "label": [1, 0, 1, 0]}, dtype="float32"
    )

    cat_names = []
    cont_names = ["x"]
    label_name = ["label"]

    config = nvt.workflow.get_new_config()
    config["PP"]["continuous"] = [ops.Moments(columns=op_columns)]

    processor = nvtabular.Workflow(
        cat_names=cat_names, cont_names=cont_names, label_name=label_name, config=config
    )

    processor.update_stats(dataset)

    op = ops.Normalize()

    columns_ctx = {}
    columns_ctx["continuous"] = {}
    columns_ctx["continuous"]["base"] = op_columns or cont_names

    new_gdf = op.apply_op(df, columns_ctx, "continuous", stats_context=processor.stats)
    df["x"] = (df["x"] - processor.stats["means"]["x"]) / processor.stats["stds"]["x"]
    assert new_gdf["x"].equals(df["x"])
Example 14
def test_concatenate_dataframe(tmpdir, output_model):
    # we were seeing an issue in the rossmann workflow where we dropped certain columns,
    # https://github.com/NVIDIA/NVTabular/issues/961
    df = _make_df({
        "cat": ["aaaa", "bbbb", "cccc", "aaaa", "bbbb", "aaaa"],
        "cont": [0.0, 1.0, 2.0, 3.0, 4.0, 5],
    })
    # this bug only happened with a dataframe representation: force this by using a lambda
    cats = ["cat"] >> ops.LambdaOp(lambda col: _hash_series(col) % 1000)
    conts = ["cont"] >> ops.Normalize() >> ops.FillMissing() >> ops.LogOp()

    dataset = Dataset(df)
    workflow = nvt.Workflow(cats + conts).fit_schema(dataset.infer_schema())

    if output_model == "pytorch":
        model_info = {
            "cat": {
                "columns": ["cat"],
                "dtype": "int32"
            },
            "cont": {
                "columns": ["cont"],
                "dtype": "float32"
            },
        }
    else:
        model_info = None

    _verify_workflow_on_tritonserver(tmpdir, workflow, df,
                                     "test_concatenate_dataframe",
                                     output_model, model_info)
Example 15
def test_normalize_std_zero(cpu):
    df = pd.DataFrame({"a": 7 * [10]})
    dataset = nvt.Dataset(df, cpu=cpu)
    processor = nvtabular.Workflow(["a"] >> ops.Normalize())
    processor.fit(dataset)
    result = processor.transform(dataset).compute()["a"]
    assert (result == 0).all()
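
The constant-column case above implies a zero-std guard inside the op. As a point of reference, here is a minimal pandas-only sketch of the standard-score convention the other examples check against, with that guard; the reference_normalize helper is hypothetical and not part of NVTabular:

import pandas as pd

def reference_normalize(series: pd.Series) -> pd.Series:
    # Standard score: (x - mean) / std, matching the checks in the tests above.
    mean, std = series.mean(), series.std()
    if std == 0:
        # Constant column: return zeros rather than dividing by zero,
        # which is the behaviour test_normalize_std_zero asserts.
        return pd.Series(0.0, index=series.index)
    return (series - mean) / std

print(reference_normalize(pd.Series(7 * [10.0])))  # all zeros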
Example 16
def test_kill_dl(tmpdir, df, dataset, part_mem_fraction, engine):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    processor.add_feature([ops.FillMedian()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths[0], engine="parquet", part_mem_fraction=part_mem_fraction)

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt_data, cats=cat_names, conts=cont_names, labels=["label"]
    )

    results = {}

    for batch_size in [2 ** i for i in range(9, 25, 1)]:
        print("Checking batch size: ", batch_size)
        num_iter = max(10 * 1000 * 1000 // batch_size, 100)  # load ~1e7 samples
        data_itr.batch_size = batch_size
        start = time.time()
        for i, data in enumerate(data_itr):
            if i >= num_iter:
                break
            del data

        stop = time.time()

        throughput = i * batch_size / (stop - start)
        results[batch_size] = throughput
        print(
            "batch size: ",
            batch_size,
            ", throughput: ",
            throughput,
            "items",
            i * batch_size,
            "time",
            stop - start,
        )
Example 17
def test_gpu_dl_break(tmpdir, df, dataset, batch_size, part_mem_fraction,
                      engine, device):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(conts + cats + label_name)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train)
        if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths[0],
                           engine="parquet",
                           part_mem_fraction=part_mem_fraction)
    data_itr = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        device=device,
    )
    len_dl = len(data_itr) - 1

    first_chunk = 0
    idx = 0
    for idx, chunk in enumerate(data_itr):
        if idx == 0:
            first_chunk = len(chunk[0])
        last_chk = len(chunk[0])
        print(last_chk)
        if idx == 1:
            break
        del chunk

    assert idx < len_dl

    first_chunk_2 = 0
    for idx, chunk in enumerate(data_itr):
        if idx == 0:
            first_chunk_2 = len(chunk[0])
        del chunk
    assert idx == len_dl

    assert first_chunk == first_chunk_2
Example 18
def test_gpu_workflow(tmpdir, df, dataset, gpu_memory_frac, engine, dump):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    conts = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> norms
    cats = cat_names >> ops.Categorify()
    workflow = nvt.Workflow(conts + cats + label_name)

    workflow.fit(dataset)
    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        workflow = None

        workflow = Workflow.load(workflow_dir)

    def get_norms(tar: cudf.Series):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        return gdf

    assert math.isclose(get_norms(df.x).mean(), norms.means["x"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.y).mean(), norms.means["y"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.x).std(), norms.stds["x"], rel_tol=1e-3)
    assert math.isclose(get_norms(df.y).std(), norms.stds["y"], rel_tol=1e-3)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(workflow, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(workflow, "name-string")
    # adding the None entry as a string because of move from gpu
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    workflow.transform(dataset).to_parquet(
        output_path=tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION)

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"),
                        part_mem_fraction=gpu_memory_frac)

    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
Example 19
def test_fit_schema_works_when_subtracting_column_names():
    schema = Schema(["x", "y", "id"])

    cont_features = (ColumnSelector(
        ["x", "y"]) >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp
                     >> ops.Normalize() >> ops.Rename(postfix="_renamed"))

    workflow1 = Workflow(cont_features - "y_renamed")
    workflow1.fit_schema(schema)

    assert workflow1.output_schema.column_names == ["x_renamed"]
Example 20
def test_fit_schema():
    schema = Schema(["x", "y", "id"])

    cont_features = (ColumnSelector(schema.column_names) >> ops.FillMissing()
                     >> ops.Clip(min_value=0) >> ops.LogOp >> ops.Normalize()
                     >> ops.Rename(postfix="_renamed"))

    workflow = Workflow(cont_features)
    workflow.fit_schema(schema)

    assert workflow.output_schema.column_names == [
        "x_renamed", "y_renamed", "id_renamed"
    ]
Example 21
def test_gpu_dl(tmpdir, df, dataset, batch_size, gpu_memory_frac, engine):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name,)

    processor.add_feature([ops.FillMedian()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=True,
        output_path=output_train,
        num_out_files=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    data_itr = nvt.torch_dataloader.TorchTensorBatchDatasetItr(
        tar_paths[0],
        engine="parquet",
        sub_batch_size=batch_size,
        gpu_memory_frac=gpu_memory_frac,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        names=mycols_csv,
        sep="\t",
    )

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(tar_paths[0])
    rows = 0
    for idx, chunk in enumerate(data_itr):
        rows += len(chunk)
        del chunk

    # accounts for incomplete batches at the end of chunks
    # that don't necessarily have the full batch_size
    assert (idx + 1) * batch_size >= rows
    assert rows == num_rows
    if os.path.exists(output_train):
        shutil.rmtree(output_train)
Example 22
def test_generate_triton_model(tmpdir, engine, output_model, df):
    tmpdir = "./tmp"
    conts = ["x", "y", "id"] >> ops.FillMissing() >> ops.Normalize()
    cats = ["name-cat", "name-string"] >> ops.Categorify(cat_cache="host")
    workflow = nvt.Workflow(conts + cats)
    workflow.fit(nvt.Dataset(df))
    expected = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    # save workflow to triton / verify we see some expected output
    if output_model == "pytorch":
        model_info = {
            "name-cat": {
                "columns": ["name-cat"],
                "dtype": "int64"
            },
            "name-string": {
                "columns": ["name-string"],
                "dtype": "int64"
            },
            "id": {
                "columns": ["id"],
                "dtype": "float32"
            },
            "x": {
                "columns": ["x"],
                "dtype": "float32"
            },
            "y": {
                "columns": ["y"],
                "dtype": "float32"
            },
        }
    else:
        model_info = None

    repo = os.path.join(tmpdir, "models")
    triton.generate_nvtabular_model(
        workflow=workflow,
        name="model",
        output_path=repo,
        version=1,
        output_model=output_model,
        output_info=model_info,
    )
    workflow = None

    assert os.path.exists(os.path.join(repo, "config.pbtxt"))

    workflow = nvt.Workflow.load(os.path.join(repo, "1", "workflow"))
    transformed = workflow.transform(nvt.Dataset(df)).to_ddf().compute()
    assert_eq(expected, transformed)
Example 23
def test_remove_columns():
    # _remove_columns was failing to export the criteo example, because
    # the label column was getting inserted into the subgroups of the output node
    # https://github.com/NVIDIA-Merlin/NVTabular/issues/1198
    label_columns = ["label"]
    cats = ["a"] >> ops.Categorify()
    conts = ["b"] >> ops.Normalize()
    workflow = nvt.Workflow(cats + conts + label_columns)

    df = pd.DataFrame({"a": ["a", "b"], "b": [1.0, 2.0], "label": [0, 1]})
    workflow.fit(nvt.Dataset(df))

    removed = ensemble._remove_columns(workflow, label_columns)
    assert set(removed.output_dtypes.keys()) == {"a", "b"}
Example 24
def test_normalize(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    cont_features = op_columns >> ops.Normalize()
    processor = nvtabular.Workflow(cont_features)
    processor.fit(dataset)

    new_gdf = processor.transform(dataset).to_ddf().compute()
    new_gdf.index = df.index  # Make sure index is aligned for checks
    for col in op_columns:
        assert math.isclose(df[col].mean(), processor.column_group.op.means[col], rel_tol=1e-4)
        assert math.isclose(df[col].std(), processor.column_group.op.stds[col], rel_tol=1e-4)
        df[col] = (df[col] - processor.column_group.op.means[col]) / processor.column_group.op.stds[
            col
        ]
        assert np.all((df[col] - new_gdf[col]).abs().values <= 1e-2)
Example 25
def test_hugectr(tmpdir, df, dataset, output_format, engine, op_columns):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]

    # set variables
    nfiles = 10
    ext = ""
    outdir = tmpdir + "/hugectr"
    os.mkdir(outdir)

    # process data
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_names)
    processor.add_feature([ops.ZeroFill(columns=op_columns), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    # Need to collect statistics first (for now)
    processor.update_stats(dataset)

    # Second "online" pass to write HugeCTR output
    processor.apply(
        dataset,
        apply_offline=False,
        record_stats=False,
        output_path=outdir,
        out_files_per_proc=nfiles,
        output_format=output_format,
        shuffle=False,
    )

    # Check files
    ext = ""
    if output_format == "parquet":
        ext = "parquet"
        assert os.path.isfile(outdir + "/metadata.json")
    elif output_format == "hugectr":
        ext = "data"

    assert os.path.isfile(outdir + "/file_list.txt")
    for n in range(nfiles):
        assert os.path.isfile(os.path.join(outdir, str(n) + "." + ext))
Example 26
def test_generate_triton_model(tmpdir, engine, df):
    tmpdir = "./tmp"
    conts = ["x", "y", "id"] >> ops.FillMissing() >> ops.Normalize()
    cats = ["name-cat", "name-string"] >> ops.Categorify(cat_cache="host")
    workflow = nvt.Workflow(conts + cats)
    workflow.fit(nvt.Dataset(df))
    expected = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    # save workflow to triton / verify we see some expected output
    repo = os.path.join(tmpdir, "models")
    triton.generate_nvtabular_model(workflow, "model", repo)
    workflow = None

    assert os.path.exists(os.path.join(repo, "config.pbtxt"))

    workflow = nvt.Workflow.load(os.path.join(repo, "1", "workflow"))
    transformed = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    assert_eq(expected, transformed)
Example 27
def test_workflow_transform_ddf_dtypes():
    # Initial Dataset
    df = cudf.datasets.timeseries().reset_index()
    ddf = dask_cudf.from_cudf(df, npartitions=2)
    dataset = Dataset(ddf)

    # Create and Execute Workflow
    cols = ["name", "x", "y", "timestamp"]
    cat_cols = ["id"] >> ops.Normalize()
    workflow = Workflow(cols + cat_cols)
    workflow.fit(dataset)
    transformed_ddf = workflow.transform(dataset).to_ddf()

    # no transforms on the pass through cols, should have original dtypes
    for col in cols:
        assert_eq(ddf.dtypes[col], transformed_ddf.dtypes[col])

    # Followup dask-cudf sorting used to throw an exception because of dtype issues,
    # check that it works now
    transformed_ddf.sort_values(["id", "timestamp"]).compute()
Example 28
def test_dask_preproc_cpu(client, tmpdir, datasets, engine, shuffle, cpu):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    elif engine == "csv":
        df1 = cudf.read_csv(paths[0], header=0)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=0)[mycols_csv]
    else:
        df1 = cudf.read_csv(paths[0], names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], names=allcols_csv)[mycols_csv]
    df0 = cudf.concat([df1, df2], axis=0)

    if engine in ("parquet", "csv"):
        dataset = Dataset(paths, part_size="1MB", cpu=cpu)
    else:
        dataset = Dataset(paths, names=allcols_csv, part_size="1MB", cpu=cpu)

    # Simple transform (normalize)
    cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]
    conts = cont_names >> ops.FillMissing() >> ops.Normalize()
    workflow = Workflow(conts + cat_names + label_name, client=client)
    transformed = workflow.fit_transform(dataset)

    # Write out dataset
    output_path = os.path.join(tmpdir, "processed")
    transformed.to_parquet(output_path=output_path,
                           shuffle=shuffle,
                           out_files_per_proc=4)

    # Check the final result
    df_disk = dd_read_parquet(output_path, engine="pyarrow").compute()
    assert_eq(
        df0.sort_values(["id", "x"])[["name-string", "label"]],
        df_disk.sort_values(["id", "x"])[["name-string", "label"]],
        check_index=False,
    )
Example 29
def test_dataloader_schema(tmpdir, df, dataset, batch_size, engine, device):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(conts + cats + label_name)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train)
        if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths, engine="parquet")

    data_loader = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        shuffle=False,
        labels=label_name,
    )

    batch = next(iter(data_loader))
    assert all(name in batch[0] for name in cat_names)
    assert all(name in batch[0] for name in cont_names)

    num_label_cols = batch[1].shape[1] if len(batch[1].shape) > 1 else 1
    assert num_label_cols == len(label_name)
Example 30
def test_gpu_workflow_config(tmpdir, client, df, dataset, gpu_memory_frac,
                             engine, dump, replace):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    cat_features = cat_names >> ops.Categorify()
    if replace:
        cont_features = cont_names >> ops.FillMissing() >> ops.LogOp >> norms
    else:
        fillmissing_logop = (cont_names >> ops.FillMissing() >> ops.LogOp >>
                             ops.Rename(postfix="_FillMissing_1_LogOp_1"))
        cont_features = cont_names + fillmissing_logop >> norms

    workflow = Workflow(cat_features + cont_features + label_name,
                        client=client)

    workflow.fit(dataset)

    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        workflow = None

        workflow = Workflow.load(workflow_dir, client=client)

    def get_norms(tar: cudf.Series):
        ser_median = tar.dropna().quantile(0.5, interpolation="linear")
        gdf = tar.fillna(ser_median)
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std - the raw columns must first go through the same
    # preceding ops (FillMissing, LogOp) before comparing

    concat_ops = "_FillMissing_1_LogOp_1"
    if replace:
        concat_ops = ""
    assert math.isclose(get_norms(df.x).mean(),
                        norms.means["x" + concat_ops],
                        rel_tol=1e-1)
    assert math.isclose(get_norms(df.y).mean(),
                        norms.means["y" + concat_ops],
                        rel_tol=1e-1)

    assert math.isclose(get_norms(df.x).std(),
                        norms.stds["x" + concat_ops],
                        rel_tol=1e-1)
    assert math.isclose(get_norms(df.y).std(),
                        norms.stds["y" + concat_ops],
                        rel_tol=1e-1)
    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(workflow, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(workflow, "name-string")
    # adding the None entry as a string because of move from gpu
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    workflow.transform(dataset).to_parquet(
        tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
    )

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"),
                        part_mem_fraction=gpu_memory_frac)

    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
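
Most of the examples above reduce to the same fit/transform pattern. Below is a minimal, self-contained sketch of that pattern with the column-selector API, assuming placeholder file paths:

import nvtabular as nvt
from nvtabular import ops

# Chain FillMissing and Normalize over the continuous columns, as in the tests above.
cont_names = ["x", "y", "id"]
conts = cont_names >> ops.FillMissing() >> ops.Normalize()

workflow = nvt.Workflow(conts)
dataset = nvt.Dataset("train.parquet", engine="parquet")  # placeholder path

workflow.fit(dataset)  # collects per-column means and stds
workflow.transform(dataset).to_parquet(output_path="processed/")  # placeholder output dir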