def test_mh_support(tmpdir):
    df = nvt.dispatch._make_df({
        "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                    ["User_C"]],
        "Reviewers": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                      ["User_C"]],
        "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
        "Post": [1, 2, 3, 4],
    })
    cat_names = ["Authors", "Reviewers"]  # , "Engaging User"]
    cont_names = []
    label_name = ["Post"]
    if HAS_GPU:
        cats = cat_names >> ops.HashBucket(num_buckets=10)
    else:
        cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(cats + label_name)
    df_out = processor.fit_transform(
        nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    # check to make sure that the same strings are hashed the same
    if HAS_GPU:
        authors = df_out["Authors"].to_arrow().to_pylist()
    else:
        authors = df_out["Authors"]
    assert authors[0][0] == authors[1][0]  # 'User_A'
    assert authors[2][1] == authors[3][0]  # 'User_C'

    data_itr = torch_dataloader.TorchAsyncItr(nvt.Dataset(df_out),
                                              cats=cat_names,
                                              conts=cont_names,
                                              labels=label_name)
    idx = 0
    for batch in data_itr:
        idx = idx + 1
        cats_conts, labels = batch
        assert "Reviewers" in cats_conts
        # check it is multihot
        assert isinstance(cats_conts["Reviewers"], tuple)
        # mh is a tuple of dictionaries {Column name: (values, offsets)}
        assert "Authors" in cats_conts
        assert isinstance(cats_conts["Authors"], tuple)
    assert idx > 0
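The cats_conts entries for the multi-hot columns above are (values, offsets) pairs. As a quick
illustration (a hypothetical helper, assuming offsets holds row start positions plus a trailing
end marker, which can vary between NVTabular versions), such a pair can be unpacked back into
per-row lists:

import torch

def unpack_multihot(values, offsets):
    # Rebuild per-row lists from a ragged (values, offsets) pair.
    flat = values.flatten()
    return [flat[start:end].tolist()
            for start, end in zip(offsets[:-1].tolist(), offsets[1:].tolist())]

assert unpack_multihot(torch.tensor([1, 1, 4, 2, 3, 3]),
                       torch.tensor([0, 1, 3, 5, 6])) == [[1], [1, 4], [2, 3], [3]]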
Example #2
def test_cats_and_groupby_stats(client, tmpdir, datasets, part_mem_fraction, use_client):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]

    cats = ColumnGroup(cat_names)
    cat_features = cats >> ops.Categorify(out_path=str(tmpdir), freq_threshold=10, on_host=True)
    groupby_features = cats >> ops.JoinGroupby(
        cont_names=cont_names, stats=["count", "sum"], out_path=str(tmpdir)
    )

    workflow = Workflow(cat_features + groupby_features, client=client)
    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    result = workflow.fit_transform(dataset).to_ddf().compute()

    assert "name-cat_x_sum" in result.columns
    assert "name-string_x_sum" in result.columns
Example #3
def test_generate_triton_model(tmpdir, engine, df):
    conts = ["x", "y", "id"] >> ops.FillMissing() >> ops.Normalize()
    cats = ["name-cat", "name-string"] >> ops.Categorify(cat_cache="host")
    workflow = nvt.Workflow(conts + cats)
    workflow.fit(nvt.Dataset(df))
    expected = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    # save workflow to triton / verify we see some expected output
    repo = os.path.join(tmpdir, "models")
    triton.generate_nvtabular_model(workflow, "model", repo)
    workflow = None

    assert os.path.exists(os.path.join(repo, "config.pbtxt"))

    workflow = nvt.Workflow.load(os.path.join(repo, "1", "workflow"))
    transformed = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    assert_eq(expected, transformed)
Example #4
def test_hugectr(tmpdir, df, dataset, output_format, engine, op_columns):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]

    # set variables
    nfiles = 10
    ext = ""
    outdir = tmpdir + "/hugectr"
    os.mkdir(outdir)

    # process data
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_names)
    processor.add_feature([ops.ZeroFill(columns=op_columns), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    # Need to collect statistics first (for now)
    processor.update_stats(dataset)

    # Second "online" pass to write HugeCTR output
    processor.apply(
        dataset,
        apply_offline=False,
        record_stats=False,
        output_path=outdir,
        out_files_per_proc=nfiles,
        output_format=output_format,
        shuffle=False,
    )

    # Check files
    ext = ""
    if output_format == "parquet":
        ext = "parquet"
        assert os.path.isfile(outdir + "/metadata.json")
    elif output_format == "hugectr":
        ext = "data"

    assert os.path.isfile(outdir + "/file_list.txt")
    for n in range(nfiles):
        assert os.path.isfile(os.path.join(outdir, str(n) + "." + ext))
Example #5
def test_categorify_single_table():
    df = dispatch._make_df({
        "Authors": [None, "User_A", "User_A", "User_E", "User_B", "User_C"],
        "Engaging_User":
        [None, "User_B", "User_B", "User_A", "User_D", "User_D"],
        "Post": [1, 2, 3, 4, None, 5],
    })
    cat_names = ["Authors", "Engaging_User"]
    dataset = nvt.Dataset(df)
    features = cat_names >> ops.Categorify(single_table=True)
    processor = nvt.Workflow(features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    old_max = 0
    for name in cat_names:
        curr_min = new_gdf[name].min()
        assert old_max <= curr_min
        curr_max = new_gdf[name].max()
        old_max += curr_max
Example #6
def test_categorify_multi_combo(tmpdir):
    cat_names = [["Author", "Engaging User"], ["Author"], "Engaging User"]
    kind = "combo"
    df = pd.DataFrame(
        {
            "Author": ["User_A", "User_E", "User_B", "User_C"],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )

    label_name = ["Post"]
    cats = cat_names >> ops.Categorify(out_path=str(tmpdir), encode_type=kind)
    workflow = nvt.Workflow(cats + label_name)
    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    # Column combinations are encoded
    assert df_out["Author"].to_arrow().to_pylist() == [1, 4, 2, 3]
    assert df_out["Engaging User"].to_arrow().to_pylist() == [2, 2, 1, 3]
    assert df_out["Author_Engaging User"].to_arrow().to_pylist() == [1, 4, 2, 3]
Example #7
def test_categorify_multi(tmpdir, cat_names, kind, cpu):
    df = pd.DataFrame({
        "Author": ["User_A", "User_E", "User_B", "User_C"],
        "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
        "Post": [1, 2, 3, 4],
    })

    label_name = ["Post"]

    cats = cat_names >> ops.Categorify(out_path=str(tmpdir), encode_type=kind)

    workflow = nvt.Workflow(cats + label_name)

    df_out = (workflow.fit_transform(nvt.Dataset(
        df, cpu=cpu)).to_ddf().compute(scheduler="synchronous"))

    if len(cat_names) == 1:
        if kind == "joint":
            # Columns are encoded jointly
            compare_authors = (df_out["Author"].to_list() if cpu else
                               df_out["Author"].to_arrow().to_pylist())
            compare_engaging = (df_out["Engaging User"].to_list() if cpu else
                                df_out["Engaging User"].to_arrow().to_pylist())
            # again, User_B has the highest frequency, so it gets the lowest encoding
            assert compare_authors == [2, 5, 1, 3]
            assert compare_engaging == [1, 1, 2, 4]
        else:
            # Column combinations are encoded
            compare_engaging = (
                df_out["Author_Engaging User"].to_list() if cpu else
                df_out["Author_Engaging User"].to_arrow().to_pylist())
            assert compare_engaging == [1, 4, 2, 3]
    else:
        # Columns are encoded independently
        compare_authors = (df_out["Author"].to_list()
                           if cpu else df_out["Author"].to_arrow().to_pylist())
        compare_engaging = (df_out["Engaging User"].to_list() if cpu else
                            df_out["Engaging User"].to_arrow().to_pylist())
        assert compare_authors == [1, 4, 2, 3]
        # User_B is first in the frequency-based ordering
        assert compare_engaging == [1, 1, 2, 3]
Example #8
def test_categorify_lists(tmpdir, freq_threshold):
    df = cudf.DataFrame(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Authors", "Engaging User"]
    label_name = ["Post"]

    cat_features = cat_names >> ops.Categorify(out_path=str(tmpdir), freq_threshold=freq_threshold)

    workflow = nvt.Workflow(cat_features + label_name)
    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()

    # Columns are encoded independently
    if freq_threshold < 2:
        assert df_out["Authors"].to_arrow().to_pylist() == [[1], [1, 4], [2, 3], [3]]
    else:
        assert df_out["Authors"].to_arrow().to_pylist() == [[1], [1, 0], [0, 2], [2]]
Example #9
def test_categorify_hash_bucket(cpu):
    df = dispatch._make_df({
        "Authors": ["User_A", "User_A", "User_E", "User_B", "User_C"],
        "Engaging_User": ["User_B", "User_B", "User_A", "User_D", "User_D"],
        "Post": [1, 2, 3, 4, 5],
    })
    cat_names = ["Authors", "Engaging_User"]
    buckets = 10
    dataset = nvt.Dataset(df, cpu=cpu)
    hash_features = cat_names >> ops.Categorify(num_buckets=buckets)
    processor = nvt.Workflow(hash_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # check hashed values
    assert new_gdf["Authors"].max() <= (buckets - 1)
    assert new_gdf["Engaging_User"].max() <= (buckets - 1)
    # check embedding size is equal to the num_buckets after hashing
    assert nvt.ops.get_embedding_sizes(processor)["Authors"][0] == buckets
    assert nvt.ops.get_embedding_sizes(
        processor)["Engaging_User"][0] == buckets
Example #10
def test_empty_cols(tmpdir, df, dataset, engine, cat_names, cont_names,
                    label_name):
    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)

    if cont_names:
        processor.add_feature([ops.FillMedian()])
        processor.add_feature(ops.Normalize())
    if cat_names:
        processor.add_feature(ops.Categorify())

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_format=None,
    )
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    data_itr = torch_dataloader.TorchAsyncItr(nvt.Dataset(df_out),
                                              cats=cat_names,
                                              conts=cont_names,
                                              labels=label_name,
                                              batch_size=1)

    for nvt_batch in data_itr:
        cats, conts, labels = nvt_batch
        if cat_names:
            assert cats.shape[-1] == len(cat_names)
        if cont_names:
            assert conts.shape[-1] == len(cont_names)
        if label_name:
            assert labels.shape[-1] == len(label_name)
Example #11
def test_dataloader_schema(tmpdir, df, dataset, batch_size, engine, device):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(conts + cats + label_name)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train)
        if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths, engine="parquet")

    data_loader = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        shuffle=False,
        labels=label_name,
    )

    batch = next(iter(data_loader))
    assert all(name in batch[0] for name in cat_names)
    assert all(name in batch[0] for name in cont_names)

    num_label_cols = batch[1].shape[1] if len(batch[1].shape) > 1 else 1
    assert num_label_cols == len(label_name)
Example #12
def test_categorify_lists_with_start_index(tmpdir, cpu, start_index):
    df = dispatch._make_df({
        "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                    ["User_C"]],
        "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
        "Post": [1, 2, 3, 4],
    })
    cat_names = ["Authors", "Engaging User"]
    label_name = ["Post"]
    dataset = nvt.Dataset(df, cpu=cpu)
    cat_features = cat_names >> ops.Categorify(out_path=str(tmpdir),
                                               start_index=start_index)
    processor = nvt.Workflow(cat_features + label_name)
    processor.fit(dataset)
    df_out = processor.transform(dataset).to_ddf().compute()

    if cpu:
        compare = [list(row) for row in df_out["Authors"].tolist()]
    else:
        compare = df_out["Authors"].to_arrow().to_pylist()

    # Note that start_index marks the start of the encoding range. The range
    # covers both an initial value reserved for out-of-vocabulary items and
    # the values for the in-vocabulary items. In the tests below there are no
    # out-of-vocabulary items, so the start_index value itself does not appear
    # in the expected comparison object.
    if start_index == 0:
        assert compare == [[1], [1, 4], [3, 2], [2]]
    elif start_index == 1:
        assert compare == [[2], [2, 5], [4, 3], [3]]
    elif start_index == 16:
        assert compare == [[17], [17, 20], [19, 18], [18]]

    # We expect five entries in the embedding size: one for each author category
    # plus the reserved null/out-of-vocabulary slot, and start_index additional
    # entries to account for the offset.
    embeddings = nvt.ops.get_embedding_sizes(processor)

    assert embeddings[1]["Authors"][0] == (5 + start_index)
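The three expected outputs above differ only by a constant offset. As a small illustration
(a hypothetical helper, not part of the NVTabular API), shifting the start_index == 0
encodings reproduces the other two cases:

def shift_encoding(encoded_rows, start_index):
    # Add the start_index offset to every encoded value in every row.
    return [[value + start_index for value in row] for row in encoded_rows]

base = [[1], [1, 4], [3, 2], [2]]  # the start_index == 0 encodings above
assert shift_encoding(base, 1) == [[2], [2, 5], [4, 3], [3]]
assert shift_encoding(base, 16) == [[17], [17, 20], [19, 18], [18]]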
Example #13
def test_categorify_multi(tmpdir, groups, kind):

    df = pd.DataFrame({
        "Author": ["User_A", "User_E", "User_B", "User_C"],
        "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
        "Post": [1, 2, 3, 4],
    })

    cat_names = ["Author", "Engaging User"]
    cont_names = []
    label_name = ["Post"]

    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)

    processor.add_preprocess(
        ops.Categorify(columns=groups, out_path=str(tmpdir), encode_type=kind))
    processor.finalize()
    processor.apply(nvt.Dataset(df), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    if groups:
        if kind == "joint":
            # Columns are encoded jointly
            assert df_out["Author"].to_arrow().to_pylist() == [1, 5, 2, 3]
            assert df_out["Engaging User"].to_arrow().to_pylist() == [
                2, 2, 1, 4
            ]
        else:
            # Column combinations are encoded
            assert df_out["Author_Engaging User"].to_arrow().to_pylist() == [
                1, 4, 2, 3
            ]
    else:
        # Columns are encoded independently
        assert df_out["Author"].to_arrow().to_pylist() == [1, 4, 2, 3]
        assert df_out["Engaging User"].to_arrow().to_pylist() == [2, 2, 1, 3]
Example #14
def test_s3_dataset(s3, paths, engine, df):
    # create a mocked out bucket here
    bucket = "testbucket"
    s3.create_bucket(Bucket=bucket)

    s3_paths = []
    for path in paths:
        s3_path = f"s3://{bucket}/{path}"
        with fsspec.open(s3_path, "wb") as f:
            f.write(open(path, "rb").read())
        s3_paths.append(s3_path)

    # create a basic s3 dataset
    dataset = nvt.Dataset(s3_paths)

    # make sure the iteration API works
    columns = mycols_pq if engine == "parquet" else mycols_csv
    gdf = cudf.concat(list(dataset.to_iter()))[columns]
    assert_eq(gdf.reset_index(drop=True), df.reset_index(drop=True))

    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)

    processor.add_feature(
        [ops.FillMissing(),
         ops.Clip(min_value=0),
         ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify(cat_cache="host"))
    processor.finalize()

    processor.update_stats(dataset)
Example #15
def test_empty_cols(tmpdir, df, dataset, engine, cat_names, cont_names,
                    label_name):

    features = []
    if cont_names:
        features.append(cont_names >> ops.FillMedian() >> ops.Normalize())
    if cat_names:
        features.append(cat_names >> ops.Categorify())

    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    graph = sum(features, nvt.ColumnGroup(label_name))
    if not graph.columns:
        # if we don't have conts/cats/labels we're done
        return

    processor = nvt.Workflow(graph)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    df_out = processor.fit_transform(dataset).to_ddf().compute(
        scheduler="synchronous")

    data_itr = torch_dataloader.TorchAsyncItr(nvt.Dataset(df_out),
                                              cats=cat_names,
                                              conts=cont_names,
                                              labels=label_name,
                                              batch_size=1)

    for nvt_batch in data_itr:
        cats, conts, labels = nvt_batch
        if cat_names:
            assert cats.shape[-1] == len(cat_names)
        if cont_names:
            assert conts.shape[-1] == len(cont_names)
        if label_name:
            assert labels.shape[-1] == len(label_name)
Example #16
def test_categorify_lists(tmpdir, freq_threshold):
    df = cudf.DataFrame(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Authors", "Engaging User"]
    cont_names = []
    label_name = ["Post"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_preprocess(ops.Categorify(out_path=str(tmpdir), freq_threshold=freq_threshold))
    processor.finalize()
    processor.apply(nvt.Dataset(df), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    # Columns are encoded independently
    if freq_threshold < 2:
        assert df_out["Authors"].to_arrow().to_pylist() == [[1], [1, 4], [2, 3], [3]]
    else:
        assert df_out["Authors"].to_arrow().to_pylist() == [[1], [1, 0], [0, 2], [2]]
Example #17
def test_parquet_lists(tmpdir, freq_threshold, shuffle, out_files_per_proc):
    # the cudf 0.17 dev container reports a '0+untagged.1.ga6296e3' version for cudf
    # (which is hard to parse correctly with LooseVersion et al). That version also
    # fails this test frequently, whereas it works with later versions of cudf.
    # Skip if we are running that specific version of cudf (and let's remove this
    # check entirely after we've upgraded the CI container)
    if cudf.__version__.startswith("0+untagged"):
        pytest.skip("parquet lists support is flaky here without cudf 0.18")

    df = cudf.DataFrame({
        "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                    ["User_C"]],
        "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
        "Post": [1, 2, 3, 4],
    })

    input_dir = str(tmpdir.mkdir("input"))
    output_dir = str(tmpdir.mkdir("output"))
    filename = os.path.join(input_dir, "test.parquet")
    df.to_parquet(filename)

    cat_names = ["Authors", "Engaging User"]
    cats = cat_names >> ops.Categorify(out_path=str(output_dir))
    workflow = nvt.Workflow(cats + "Post")

    transformed = workflow.fit_transform(nvt.Dataset(filename))
    transformed.to_parquet(
        output_path=output_dir,
        shuffle=shuffle,
        out_files_per_proc=out_files_per_proc,
    )

    out_paths = glob.glob(os.path.join(output_dir, "*.parquet"))
    df_out = cudf.read_parquet(out_paths)
    df_out = df_out.sort_values(by="Post", ascending=True)
    assert df_out["Authors"].to_arrow().to_pylist() == [[1], [1, 4], [2, 3],
                                                        [3]]
Example #18
def test_workflow_generate_columns(tmpdir, use_parquet):
    out_path = str(tmpdir.mkdir("processed"))
    path = str(tmpdir.join("simple.parquet"))

    # Stripped-down dataset with geo_location codes like in the Outbrain dataset
    df = cudf.DataFrame({"geo_location": ["US>CA", "CA>BC", "US>TN>659"]})

    # define a simple workflow that extracts the country code from the first two
    # characters of the geo_location code and adds a new 'geo_location_country' field
    country = (["geo_location"] >> ops.LambdaOp(
        f=lambda col: col.str.slice(0, 2), ) >> ops.Rename(postfix="_country"))
    cat_features = ["geo_location"] + country >> ops.Categorify()

    workflow = Workflow(cat_features)

    if use_parquet:
        df.to_parquet(path)
        dataset = nvt.Dataset(path)
    else:
        dataset = nvt.Dataset(df)

    # just make sure this works without errors
    workflow.fit(dataset)
    workflow.transform(dataset).to_parquet(out_path)
Example #19
def test_categorify_freq_limit(tmpdir, freq_limit, buckets, search_sort, cpu):
    if search_sort and cpu:
        # invalid combination - don't test
        return

    df = dispatch._make_df({
        "Author": [
            "User_A",
            "User_E",
            "User_B",
            "User_C",
            "User_A",
            "User_E",
            "User_B",
            "User_C",
            "User_B",
            "User_C",
        ],
        "Engaging User": [
            "User_B",
            "User_B",
            "User_A",
            "User_D",
            "User_B",
            "User_c",
            "User_A",
            "User_D",
            "User_D",
            "User_D",
        ],
    })

    isfreqthr = freq_limit > 0 if isinstance(freq_limit, int) else isinstance(
        freq_limit, dict)

    if (not search_sort and isfreqthr) or (search_sort and not isfreqthr):
        cat_names = ["Author", "Engaging User"]

        cats = cat_names >> ops.Categorify(
            freq_threshold=freq_limit,
            out_path=str(tmpdir),
            search_sorted=search_sort,
            num_buckets=buckets,
        )

        workflow = nvt.Workflow(cats)
        df_out = (workflow.fit_transform(nvt.Dataset(
            df, cpu=cpu)).to_ddf().compute(scheduler="synchronous"))

        if freq_limit and not buckets:
            # encodings are constrained by the frequency threshold
            if isinstance(freq_limit, dict):
                assert df_out["Author"].max() == 2
                assert df_out["Engaging User"].max() == 1
            else:
                assert len(df["Author"].unique()) == df_out["Author"].max()
                assert len(df["Engaging User"].unique()
                           ) == df_out["Engaging User"].max()
        elif not freq_limit and buckets:
            if isinstance(buckets, dict):
                assert df_out["Author"].max() <= 9
                assert df_out["Engaging User"].max() <= 19
            else:
                assert df_out["Author"].max() <= 9
                assert df_out["Engaging User"].max() <= 9
        elif freq_limit and buckets:
            if isinstance(buckets, dict) and not isinstance(df, pd.DataFrame):
                assert (
                    df_out["Author"].max() <=
                    (df["Author"].hash_values() % buckets["Author"]).max() +
                    2 + 1)
                assert (df_out["Engaging User"].max() <=
                        (df["Engaging User"].hash_values() %
                         buckets["Engaging User"]).max() + 1 + 1)
Example #20
def test_lambdaop(tmpdir, df, dataset, gpu_memory_frac, engine, client):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y"]
    label_name = ["label"]
    columns = mycols_pq if engine == "parquet" else mycols_csv

    df_copy = df.copy()

    config = nvt.workflow.get_new_config()

    processor = nvtabular.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
        client=client,
    )

    columns_ctx = {}
    columns_ctx["continuous"] = {}
    columns_ctx["continuous"]["base"] = cont_names
    columns_ctx["all"] = {}
    columns_ctx["all"]["base"] = columns

    # Substring
    # Replacement
    op = ops.LambdaOp(
        op_name="slice",
        f=lambda col, gdf: col.str.slice(1, 3),
        columns=["name-cat", "name-string"],
        replace=True,
    )

    new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None)
    assert new_gdf["name-cat"].equals(df_copy["name-cat"].str.slice(1, 3))
    assert new_gdf["name-string"].equals(df_copy["name-string"].str.slice(
        1, 3))

    # No Replacement
    df = df_copy.copy()
    op = ops.LambdaOp(
        op_name="slice",
        f=lambda col, gdf: col.str.slice(1, 3),
        columns=["name-cat", "name-string"],
        replace=False,
    )
    new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None)
    assert new_gdf["name-cat_slice"].equals(df_copy["name-cat"].str.slice(
        1, 3))
    assert new_gdf["name-string_slice"].equals(
        df_copy["name-string"].str.slice(1, 3))
    assert new_gdf["name-cat"].equals(df_copy["name-cat"])
    assert new_gdf["name-string"].equals(df_copy["name-string"])

    # Replace
    # Replacement
    df = df_copy.copy()
    op = ops.LambdaOp(
        op_name="replace",
        f=lambda col, gdf: col.str.replace("e", "XX"),
        columns=["name-cat", "name-string"],
        replace=True,
    )

    new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None)
    assert new_gdf["name-cat"].equals(df_copy["name-cat"].str.replace(
        "e", "XX"))
    assert new_gdf["name-string"].equals(df_copy["name-string"].str.replace(
        "e", "XX"))

    # No Replacement
    df = df_copy.copy()
    op = ops.LambdaOp(
        op_name="replace",
        f=lambda col, gdf: col.str.replace("e", "XX"),
        columns=["name-cat", "name-string"],
        replace=False,
    )
    new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None)
    assert new_gdf["name-cat_replace"].equals(df_copy["name-cat"].str.replace(
        "e", "XX"))
    assert new_gdf["name-string_replace"].equals(
        df_copy["name-string"].str.replace("e", "XX"))
    assert new_gdf["name-cat"].equals(df_copy["name-cat"])
    assert new_gdf["name-string"].equals(df_copy["name-string"])

    # astype
    # Replacement
    df = df_copy.copy()
    op = ops.LambdaOp(op_name="astype",
                      f=lambda col, gdf: col.astype(float),
                      columns=["id"],
                      replace=True)
    new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None)
    assert new_gdf["id"].dtype == "float64"

    # Workflow
    # Replacement
    import glob

    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)

    processor.add_preprocess([
        ops.LambdaOp(
            op_name="slice",
            f=lambda col, gdf: col.astype(str).str.slice(0, 1),
            columns=["name-cat"],
            replace=True,
        ),
        ops.Categorify(),
    ])
    processor.finalize()
    processor.update_stats(dataset)
    outdir = tmpdir.mkdir("out1")
    processor.write_to_dataset(outdir,
                               dataset,
                               out_files_per_proc=10,
                               shuffle=nvt.io.Shuffle.PER_PARTITION,
                               apply_ops=True)

    dataset_2 = nvtabular.io.Dataset(glob.glob(str(outdir) + "/*.parquet"),
                                     part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)
    assert is_integer_dtype(df_pp["name-cat"].dtype)

    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)

    processor.add_preprocess([
        ops.Categorify(),
        ops.LambdaOp(op_name="add100",
                     f=lambda col, gdf: col + 100,
                     replace=True),
    ])
    processor.finalize()
    processor.update_stats(dataset)
    outdir = tmpdir.mkdir("out2")
    processor.write_to_dataset(outdir,
                               dataset,
                               out_files_per_proc=10,
                               shuffle=nvt.io.Shuffle.PER_PARTITION,
                               apply_ops=True)

    dataset_2 = nvtabular.io.Dataset(glob.glob(str(outdir) + "/*.parquet"),
                                     part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)
    assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert np.sum(df_pp["name-cat"] < 100) == 0

    # Workflow
    # No Replacement
    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)

    processor.add_preprocess([
        ops.LambdaOp(
            op_name="slice",
            f=lambda col, gdf: col.astype(str).str.slice(0, 1),
            columns=["name-cat"],
            replace=False,
        ),
        ops.Categorify(),
    ])
    processor.finalize()
    processor.update_stats(dataset)
    outdir = tmpdir.mkdir("out3")
    processor.write_to_dataset(outdir,
                               dataset,
                               out_files_per_proc=10,
                               shuffle=nvt.io.Shuffle.PER_PARTITION,
                               apply_ops=True)
    dataset_2 = nvtabular.io.Dataset(glob.glob(str(outdir) + "/*.parquet"),
                                     part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    assert df_pp["name-cat"].dtype == "O"
    print(df_pp)
    assert is_integer_dtype(df_pp["name-cat_slice"].dtype)
    assert np.sum(df_pp["name-cat_slice"] == 0) == 0

    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)

    processor.add_preprocess([
        ops.Categorify(),
        ops.LambdaOp(op_name="add100",
                     f=lambda col, gdf: col + 100,
                     replace=False),
    ])
    processor.finalize()
    processor.update_stats(dataset)
    outdir = tmpdir.mkdir("out4")
    processor.write_to_dataset(outdir,
                               dataset,
                               out_files_per_proc=10,
                               shuffle=nvt.io.Shuffle.PER_PARTITION,
                               apply_ops=True)

    dataset_2 = nvtabular.io.Dataset(glob.glob(str(outdir) + "/*.parquet"),
                                     part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)
    assert is_integer_dtype(df_pp["name-cat_add100"].dtype)
    assert np.sum(df_pp["name-cat_add100"] < 100) == 0

    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)

    processor.add_preprocess([
        ops.LambdaOp(op_name="mul0",
                     f=lambda col, gdf: col * 0,
                     columns=["x"],
                     replace=False),
        ops.LambdaOp(op_name="add100",
                     f=lambda col, gdf: col + 100,
                     replace=False),
    ])
    processor.finalize()
    processor.update_stats(dataset)
    outdir = tmpdir.mkdir("out5")
    processor.write_to_dataset(outdir,
                               dataset,
                               out_files_per_proc=10,
                               shuffle=nvt.io.Shuffle.PER_PARTITION,
                               apply_ops=True)

    dataset_2 = nvtabular.io.Dataset(glob.glob(str(outdir) + "/*.parquet"),
                                     part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)
    assert np.sum(df_pp["x_mul0_add100"] < 100) == 0
Example #21
def test_gpu_workflow_api(tmpdir, client, df, dataset, gpu_memory_frac, engine,
                          dump, use_client):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    cat_features = cat_names >> ops.Categorify(cat_cache="host")
    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(
        min_value=0) >> ops.LogOp >> norms

    workflow = Workflow(cat_features + cont_features + label_name,
                        client=client if use_client else None)

    workflow.fit(dataset)

    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        workflow = None

        workflow = Workflow.load(workflow_dir,
                                 client=client if use_client else None)

    def get_norms(tar: cudf.Series):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std; the reference values must include the other ops (Clip, LogOp)
    assert math.isclose(get_norms(df.y).mean(), norms.means["y"], rel_tol=1e-1)
    assert math.isclose(get_norms(df.y).std(), norms.stds["y"], rel_tol=1e-1)
    assert math.isclose(get_norms(df.x).mean(), norms.means["x"], rel_tol=1e-1)
    assert math.isclose(get_norms(df.x).std(), norms.stds["x"], rel_tol=1e-1)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(workflow, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(workflow, "name-string")
    # adding the None entry as a string because of move from gpu
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    workflow.transform(dataset).to_parquet(
        tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
    )

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"),
                        part_mem_fraction=gpu_memory_frac)

    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
Example #22
def main(args):
    """Multi-GPU Criteo/DLRM Preprocessing Benchmark

    This benchmark is designed to measure the time required to preprocess
    the Criteo (1TB) dataset for Facebook’s DLRM model.  The user must specify
    the path of the raw dataset (using the `--data-path` flag), as well as the
    output directory for all temporary/final data (using the `--out-path` flag)

    Example Usage
    -------------

    python dask-nvtabular-criteo-benchmark.py
                        --data-path /path/to/criteo_parquet --out-path /out/dir/


    Dataset Requirements (Parquet)
    ------------------------------

    This benchmark is designed with a parquet-formatted dataset in mind.
    While a CSV-formatted dataset can be processed by NVTabular, converting
    to parquet will yield significantly better performance.  To convert your
    dataset, try using the `optimize_criteo.ipynb` notebook (also located
    in `NVTabular/examples/`)

    For a detailed parameter overview see `NVTabular/examples/MultiGPUBench.md`
    """

    # Input
    data_path = args.data_path
    freq_limit = args.freq_limit
    out_files_per_proc = args.out_files_per_proc
    high_card_columns = args.high_cards.split(",")
    dashboard_port = args.dashboard_port
    if args.protocol == "ucx":
        UCX_TLS = os.environ.get("UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")
        os.environ["UCX_TLS"] = UCX_TLS

    # Cleanup output directory
    BASE_DIR = args.out_path
    dask_workdir = os.path.join(BASE_DIR, "workdir")
    output_path = os.path.join(BASE_DIR, "output")
    stats_path = os.path.join(BASE_DIR, "stats")
    if not os.path.isdir(BASE_DIR):
        os.mkdir(BASE_DIR)
    for dir_path in (dask_workdir, output_path, stats_path):
        if os.path.isdir(dir_path):
            shutil.rmtree(dir_path)
        os.mkdir(dir_path)

    # Use Criteo dataset by default (for now)
    cont_names = (args.cont_names.split(",")
                  if args.cont_names else ["I" + str(x) for x in range(1, 14)])
    cat_names = (args.cat_names.split(",")
                 if args.cat_names else ["C" + str(x) for x in range(1, 27)])
    label_name = ["label"]

    # Specify Categorify/GroupbyStatistics options
    tree_width = {}
    cat_cache = {}
    for col in cat_names:
        if col in high_card_columns:
            tree_width[col] = args.tree_width
            cat_cache[col] = args.cat_cache_high
        else:
            tree_width[col] = 1
            cat_cache[col] = args.cat_cache_low

    # Use total device size to calculate args.device_limit_frac
    device_size = device_mem_size(kind="total")
    device_limit = int(args.device_limit_frac * device_size)
    device_pool_size = int(args.device_pool_frac * device_size)
    part_size = int(args.part_mem_frac * device_size)

    # Parse shuffle option
    shuffle = None
    if args.shuffle == "PER_WORKER":
        shuffle = nvt_io.Shuffle.PER_WORKER
    elif args.shuffle == "PER_PARTITION":
        shuffle = nvt_io.Shuffle.PER_PARTITION

    # Check if any device memory is already occupied
    for dev in args.devices.split(","):
        fmem = _pynvml_mem_size(kind="free", index=int(dev))
        used = (device_size - fmem) / 1e9
        if used > 1.0:
            warnings.warn(
                f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
            )

    # Setup LocalCUDACluster
    if args.protocol == "tcp":
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devices,
            device_memory_limit=device_limit,
            local_directory=dask_workdir,
            dashboard_address=":" + dashboard_port,
        )
    else:
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devices,
            enable_nvlink=True,
            device_memory_limit=device_limit,
            local_directory=dask_workdir,
            dashboard_address=":" + dashboard_port,
        )
    client = Client(cluster)

    # Setup RMM pool
    if args.device_pool_frac > 0.01:
        setup_rmm_pool(client, device_pool_size)

    # Define Dask NVTabular "Workflow"
    processor = Workflow(cat_names=cat_names,
                         cont_names=cont_names,
                         label_name=label_name,
                         client=client)
    if args.normalize:
        processor.add_feature([ops.FillMissing(), ops.Normalize()])
    else:
        processor.add_feature(
            [ops.FillMissing(),
             ops.Clip(min_value=0),
             ops.LogOp()])
    processor.add_preprocess(
        ops.Categorify(
            out_path=stats_path,
            tree_width=tree_width,
            cat_cache=cat_cache,
            freq_threshold=freq_limit,
            search_sorted=not freq_limit,
            on_host=not args.cats_on_device,
        ))
    processor.finalize()

    dataset = Dataset(data_path, "parquet", part_size=part_size)

    # Execute the dask graph
    runtime = time.time()
    if args.profile is not None:
        with performance_report(filename=args.profile):
            processor.apply(
                dataset,
                shuffle=shuffle,
                out_files_per_proc=out_files_per_proc,
                output_path=output_path,
                num_io_threads=args.num_io_threads,
            )
    else:
        processor.apply(
            dataset,
            num_io_threads=args.num_io_threads,
            shuffle=shuffle,
            out_files_per_proc=out_files_per_proc,
            output_path=output_path,
        )
    runtime = time.time() - runtime

    print("\nDask-NVTabular DLRM/Criteo benchmark")
    print("--------------------------------------")
    print(f"partition size     | {part_size}")
    print(f"protocol           | {args.protocol}")
    print(f"device(s)          | {args.devices}")
    print(f"rmm-pool-frac      | {(args.device_pool_frac)}")
    print(f"out-files-per-proc | {args.out_files_per_proc}")
    print(f"num_io_threads     | {args.num_io_threads}")
    print(f"shuffle            | {args.shuffle}")
    print(f"cats-on-device     | {args.cats_on_device}")
    print("======================================")
    print(f"Runtime[s]         | {runtime}")
    print("======================================\n")

    client.close()
Example #23
def test_tf_gpu_dl(tmpdir, paths, use_paths, dataset, batch_size,
                   gpu_memory_frac, engine):
    cont_names = ["x", "y", "id"]
    cat_names = ["name-string"]
    label_name = ["label"]
    if engine == "parquet":
        cat_names.append("name-cat")

    columns = cont_names + cat_names

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    workflow = nvt.Workflow(conts + cats + label_name)
    workflow.fit(dataset)
    workflow.transform(dataset).to_parquet(tmpdir + "/processed")

    data_itr = tf_dataloader.KerasSequenceLoader(
        str(tmpdir + "/processed"),  # workflow.transform(dataset),
        cat_names=cat_names,
        cont_names=cont_names,
        batch_size=batch_size,
        buffer_size=gpu_memory_frac,
        label_names=label_name,
        engine=engine,
        shuffle=False,
    )
    _ = tf.random.uniform((1, ))

    rows = 0
    for idx in range(len(data_itr)):
        X, y = next(data_itr)

        # first elements to check epoch-to-epoch consistency
        if idx == 0:
            X0, y0 = X, y

        # check that we have at most batch_size elements
        num_samples = y.shape[0]
        if num_samples != batch_size:
            try:
                next(data_itr)
            except StopIteration:
                rows += num_samples
                continue
            else:
                raise ValueError("Batch size too small at idx {}".format(idx))

        # check that all the features in X have the
        # appropriate length and that the set of
        # their names is exactly the set of names in
        # `columns`
        these_cols = columns.copy()
        for column, x in X.items():
            try:
                these_cols.remove(column)
            except ValueError:
                raise AssertionError
            assert x.shape[0] == num_samples
        assert len(these_cols) == 0
        rows += num_samples

    assert (idx + 1) * batch_size >= rows
    assert rows == (60 * 24 * 3 + 1)

    # if num_samples equals batch_size, we never exhausted the iterator
    # above, so its cleanup hasn't run. Try that now
    if num_samples == batch_size:
        try:
            next(data_itr)
        except StopIteration:
            pass
        else:
            raise ValueError
    assert not data_itr._working
    assert data_itr._batch_itr is None

    # check start of next epoch to ensure consistency
    X, y = next(data_itr)
    assert (y.numpy() == y0.numpy()).all()

    for column, x in X.items():
        x0 = X0.pop(column)
        assert (x.numpy() == x0.numpy()).all()
    assert len(X0) == 0

    data_itr.stop()
    assert not data_itr._working
    assert data_itr._batch_itr is None
Example #24
def test_gpu_dl(tmpdir, df, dataset, batch_size, part_mem_fraction, engine,
                devices):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(conts + cats + label_name)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train)
        if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths[0],
                           engine="parquet",
                           part_mem_fraction=part_mem_fraction)
    data_itr = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        devices=devices,
    )

    columns = mycols_pq
    df_test = cudf.read_parquet(tar_paths[0])[columns]
    df_test.columns = [x for x in range(0, len(columns))]
    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        tar_paths[0])
    rows = 0
    # works with the iterator alone; also needs testing inside the torch DataLoader

    for idx, chunk in enumerate(data_itr):
        if devices is None:
            assert float(df_test.iloc[rows][0]) == float(chunk[0][0][0])
        rows += len(chunk[0])
        del chunk
    # accounts for incomplete batches at the end of chunks
    # that don't necessarily have the full batch_size
    assert rows == num_rows

    def gen_col(batch):
        batch = batch[0]
        return batch[0], batch[1], batch[2]

    t_dl = torch_dataloader.DLDataLoader(data_itr,
                                         collate_fn=gen_col,
                                         pin_memory=False,
                                         num_workers=0)
    rows = 0
    for idx, chunk in enumerate(t_dl):
        if devices is None:
            assert float(df_test.iloc[rows][0]) == float(chunk[0][0][0])
        rows += len(chunk[0])

    if os.path.exists(output_train):
        shutil.rmtree(output_train)
Example #25
def test_mh_model_support(tmpdir):
    df = cudf.DataFrame({
        "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                    ["User_C"]],
        "Reviewers": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                      ["User_C"]],
        "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
        "Null User": ["User_B", "User_B", "User_A", "User_D"],
        "Post": [1, 2, 3, 4],
        "Cont1": [0.3, 0.4, 0.5, 0.6],
        "Cont2": [0.3, 0.4, 0.5, 0.6],
        "Cat1": ["A", "B", "A", "C"],
    })
    cat_names = ["Cat1", "Null User", "Authors",
                 "Reviewers"]  # , "Engaging User"]
    cont_names = ["Cont1", "Cont2"]
    label_name = ["Post"]
    out_path = os.path.join(tmpdir, "train/")
    os.mkdir(out_path)

    cats = cat_names >> ops.Categorify()
    conts = cont_names >> ops.Normalize()

    processor = nvt.Workflow(cats + conts + label_name)
    df_out = processor.fit_transform(nvt.Dataset(df)).to_ddf().compute()
    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out),
        cats=cat_names,
        conts=cont_names,
        labels=label_name,
        batch_size=2,
    )
    emb_sizes = nvt.ops.get_embedding_sizes(processor)
    EMBEDDING_DROPOUT_RATE = 0.04
    DROPOUT_RATES = [0.001, 0.01]
    HIDDEN_DIMS = [1000, 500]
    LEARNING_RATE = 0.001
    model = Model(
        embedding_table_shapes=emb_sizes,
        num_continuous=len(cont_names),
        emb_dropout=EMBEDDING_DROPOUT_RATE,
        layer_hidden_dims=HIDDEN_DIMS,
        layer_dropout_rates=DROPOUT_RATES,
    ).cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    def rmspe_func(y_pred, y):
        "Return y_pred and y to non-log space and compute RMSPE"
        y_pred, y = torch.exp(y_pred) - 1, torch.exp(y) - 1
        pct_var = (y_pred - y) / y
        return (pct_var**2).mean().pow(0.5)

    train_loss, y_pred, y = process_epoch(
        data_itr,
        model,
        train=True,
        optimizer=optimizer,
        # transform=batch_transform,
        amp=False,
    )
    train_rmspe = None
    train_rmspe = rmspe_func(y_pred, y)
    assert train_rmspe is not None
    assert len(y_pred) > 0
    assert len(y) > 0
Example #26
def test_kill_dl(tmpdir, df, dataset, part_mem_fraction, engine):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(conts + cats + label_name)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train)
        if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths[0],
                           engine="parquet",
                           part_mem_fraction=part_mem_fraction)

    data_itr = torch_dataloader.TorchAsyncItr(nvt_data,
                                              cats=cat_names,
                                              conts=cont_names,
                                              labels=["label"])

    results = {}

    for batch_size in [2**i for i in range(9, 25, 1)]:
        print("Checking batch size: ", batch_size)
        num_iter = max(10 * 1000 * 1000 // batch_size,
                       100)  # load ~10 million samples
        # import pdb; pdb.set_trace()
        data_itr.batch_size = batch_size
        start = time.time()
        for i, data in enumerate(data_itr):
            if i >= num_iter:
                break
            del data

        stop = time.time()

        throughput = i * batch_size / (stop - start)
        results[batch_size] = throughput
        print(
            "batch size: ",
            batch_size,
            ", throughput: ",
            throughput,
            "items",
            i * batch_size,
            "time",
            stop - start,
        )
Example #27
def test_tf_gpu_dl(tmpdir, datasets, batch_size, gpu_memory_frac, engine):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    cont_names = ["x", "y", "id"]
    cat_names = ["name-string"]
    label_name = ["label"]
    if engine == "parquet":
        cat_names.append("name-cat")

    columns = cont_names + cat_names

    processor = nvt.Workflow(
        cat_names=cat_names, cont_names=cont_names, label_name=label_name, to_cpu=True,
    )
    processor.add_feature([ops.FillMedian()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    data_itr = tf_dataloader.KerasSequenceDataset(
        paths,
        columns=columns,
        batch_size=batch_size,
        buffer_size=gpu_memory_frac,
        label_name=label_name[0],
        engine=engine,
        shuffle=False,
    )
    processor.update_stats(data_itr.nvt_dataset, record_stats=True)
    data_itr.map(processor)

    rows = 0
    for idx in range(len(data_itr)):
        X, y = next(data_itr)

        # first elements to check epoch-to-epoch consistency
        if idx == 0:
            X0, y0 = X, y

        # check that we have at most batch_size elements
        num_samples = y.shape[0]
        assert num_samples <= batch_size

        # check that all the features in X have the
        # appropriate length and that the set of
        # their names is exactly the set of names in
        # `columns`
        these_cols = columns.copy()
        for column, x in X.items():
            try:
                these_cols.remove(column)
            except ValueError:
                raise AssertionError
            assert x.shape[0] == num_samples
        assert len(these_cols) == 0

        rows += num_samples

    # check start of next epoch to ensure consistency
    X, y = next(data_itr)
    assert (y.numpy() == y0.numpy()).all()
    for column, x in X.items():
        x0 = X0.pop(column)
        assert (x.numpy() == x0.numpy()).all()
    assert len(X0) == 0

    # accounts for incomplete batches at the end of chunks
    # that don't necessarily have the full batch_size
    assert (idx + 1) * batch_size >= rows
    assert rows == (60 * 24 * 3 + 1)
Example #28
def test_gpu_preproc(tmpdir, datasets, dump, gpu_memory_frac, engine,
                     preprocessing):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    else:
        df1 = cudf.read_csv(paths[0], header=False,
                            names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=False,
                            names=allcols_csv)[mycols_csv]
    df = cudf.concat([df1, df2], axis=0)
    df["id"] = df["id"].astype("int64")

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
        columns = mycols_pq
    else:
        cat_names = ["name-string"]
        columns = mycols_csv
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        to_cpu=True,
    )

    processor.add_feature(
        [ops.FillMissing(),
         ops.LogOp(preprocessing=preprocessing)])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    data_itr = nvtabular.io.GPUDatasetIterator(
        paths,
        columns=columns,
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
        names=allcols_csv,
    )

    processor.update_stats(data_itr)

    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        ser_median = tar.dropna().quantile(0.5, interpolation="linear")
        gdf = tar.fillna(ser_median)
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std; the reference values must include the other ops (ZeroFill, LogOp)
    x_col = "x" if preprocessing else "x_LogOp"
    y_col = "y" if preprocessing else "y_LogOp"
    assert math.isclose(
        get_norms(df.x).mean(),
        processor.stats["means"][x_col],
        rel_tol=1e-2,
    )
    assert math.isclose(
        get_norms(df.y).mean(),
        processor.stats["means"][y_col],
        rel_tol=1e-2,
    )
    assert math.isclose(
        get_norms(df.x).std(),
        processor.stats["stds"][x_col],
        rel_tol=1e-2,
    )
    assert math.isclose(
        get_norms(df.y).std(),
        processor.stats["stds"][y_col],
        rel_tol=1e-2,
    )

    # Check median (TODO: Improve the accuracy)
    x_median = df.x.dropna().quantile(0.5, interpolation="linear")
    y_median = df.y.dropna().quantile(0.5, interpolation="linear")
    id_median = df.id.dropna().quantile(0.5, interpolation="linear")
    assert math.isclose(x_median, processor.stats["medians"]["x"], rel_tol=1e1)
    assert math.isclose(y_median, processor.stats["medians"]["y"], rel_tol=1e1)
    assert math.isclose(id_median,
                        processor.stats["medians"]["id"],
                        rel_tol=1e1)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_to_string()
        cats0 = processor.stats["encoders"]["name-cat"].get_cats(
        ).values_to_string()
        assert cats0 == ["None"] + cats_expected0
    cats_expected1 = df["name-string"].unique().values_to_string()
    cats1 = processor.stats["encoders"]["name-string"].get_cats(
    ).values_to_string()
    print(cats1)
    assert cats1 == ["None"] + cats_expected1

    # Write the processed data out as a new shuffled dataset
    processor.write_to_dataset(tmpdir,
                               data_itr,
                               nfiles=10,
                               shuffle=True,
                               apply_ops=True)

    processor.create_final_cols()

    # when LogOp runs as a feature (preprocessing=False) it adds new *_LogOp columns
    if not preprocessing:
        for col in cont_names:
            assert f"{col}_LogOp" in processor.columns_ctx["final"]["cols"][
                "continuous"]

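    # Re-read the shuffled output shards through the torch data-loading stack:
    # one FileItrDataset per shard, chained together and collated by DLCollator.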
    dlc = nvtabular.torch_dataloader.DLCollator(preproc=processor,
                                                apply_ops=False)
    data_files = [
        nvtabular.torch_dataloader.FileItrDataset(
            x,
            use_row_groups=True,
            gpu_memory_frac=gpu_memory_frac,
            names=allcols_csv,
        ) for x in glob.glob(str(tmpdir) + "/ds_part.*.parquet")
    ]

    data_itr = torch.utils.data.ChainDataset(data_files)
    dl = nvtabular.torch_dataloader.DLDataLoader(data_itr,
                                                 collate_fn=dlc.gdf_col,
                                                 pin_memory=False,
                                                 num_workers=0)

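    # Count the rows produced by the dataloader so the total can be compared
    # against ds_to_tensors below.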
    len_df_pp = 0
    for chunk in dl:
        len_df_pp += len(chunk[0][0])

    data_itr = nvtabular.io.GPUDatasetIterator(
        glob.glob(str(tmpdir) + "/ds_part.*.parquet"),
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
        names=allcols_csv,
    )

    x = processor.ds_to_tensors(data_itr, apply_ops=False)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert len(x[0]) == len_df_pp

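    # Re-batch the in-memory tensors and confirm the iterator yields the same
    # total number of rows.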
    itr_ds = nvtabular.torch_dataloader.TensorItrDataset([x[0], x[1], x[2]],
                                                         batch_size=512000)
    count_tens_itr = 0
    for data_gd in itr_ds:
        count_tens_itr += len(data_gd[1])
        assert data_gd[0][0].shape[1] > 0
        assert data_gd[0][1].shape[1] > 0

    assert len_df_pp == count_tens_itr
    if os.path.exists(processor.ds_exports):
        shutil.rmtree(processor.ds_exports)
Example #29
def test_gpu_dl(tmpdir, datasets, batch_size, gpu_memory_frac, engine):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    else:
        df1 = cudf.read_csv(paths[0], header=False,
                            names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=False,
                            names=allcols_csv)[mycols_csv]
    df = cudf.concat([df1, df2], axis=0)
    df["id"] = df["id"].astype("int64")

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
        columns = mycols_pq
    else:
        cat_names = ["name-string"]
        columns = mycols_csv
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

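    # Legacy Workflow again; here statistics are gathered and the output is
    # written in a single offline apply() call below.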
    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        to_cpu=True,
    )

    processor.add_feature([ops.FillMissing()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())

    data_itr = nvtabular.io.GPUDatasetIterator(
        paths,
        columns=columns,
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
        names=allcols_csv,
    )

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

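    # Compute statistics and write the shuffled, transformed data out as
    # num_out_files parquet files.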
    processor.apply(
        data_itr,
        apply_offline=True,
        record_stats=True,
        shuffle=True,
        output_path=output_train,
        num_out_files=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train)
        if x.endswith("parquet")
    ]

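    # Stream the first output file back through the torch batch iterator and
    # verify every row is yielded exactly once.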
    data_itr = nvt.torch_dataloader.TorchTensorBatchDatasetItr(
        tar_paths[0],
        engine="parquet",
        sub_batch_size=batch_size,
        gpu_memory_frac=gpu_memory_frac,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        names=mycols_csv,
        sep="\t",
    )

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        tar_paths[0])
    rows = 0
    for idx, chunk in enumerate(data_itr):
        rows += len(chunk[0])
        del chunk

    # accounts for incomplete batches at the end of chunks
    # that don't necessarily have the full batch_size
    assert (idx + 1) * batch_size >= rows
    assert rows == num_rows
    if os.path.exists(output_train):
        shutil.rmtree(output_train)
Example #30
def test_empty_cols(tmpdir, engine, cat_names, mh_names, cont_names,
                    label_name, num_rows):
    json_sample["num_rows"] = num_rows

    cols = datagen._get_cols_from_schema(json_sample)

    df_gen = datagen.DatasetGen(datagen.PowerLawDistro(0.1))
    dataset = df_gen.create_df(num_rows, cols)
    dataset = nvt.Dataset(dataset)
    features = []
    if cont_names:
        features.append(cont_names >> ops.FillMedian() >> ops.Normalize())
    if cat_names or mh_names:
        features.append(cat_names + mh_names >> ops.Categorify())
    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    graph = sum(features, nvt.WorkflowNode(label_name))
    processor = nvt.Workflow(graph)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    df_out = processor.fit_transform(dataset).to_ddf().compute(
        scheduler="synchronous")

    if processor.output_node.output_schema.apply_inverse(
            ColumnSelector("lab_1")):
        # if we don't have conts/cats/labels we're done
        return

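    # Constructing a dataloader with no categorical or continuous columns
    # should raise a ValueError.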
    data_itr = None

    with pytest.raises(ValueError) as exc_info:
        data_itr = torch_dataloader.TorchAsyncItr(
            nvt.Dataset(df_out),
            cats=cat_names + mh_names,
            conts=cont_names,
            labels=label_name,
            batch_size=2,
        )
    assert "Neither Categorical or Continuous columns were found by the dataloader. " in str(
        exc_info.value)

    if data_itr:
        for nvt_batch in data_itr:
            cats_conts, labels = nvt_batch
            if cat_names:
                assert set(cat_names).issubset(set(list(cats_conts.keys())))
            if cont_names:
                assert set(cont_names).issubset(set(list(cats_conts.keys())))

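        # If any feature columns exist, look up embedding sizes and train a
        # small model for one epoch to make sure the batches are usable.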
        if cat_names or cont_names or mh_names:
            emb_sizes = nvt.ops.get_embedding_sizes(processor)

            EMBEDDING_DROPOUT_RATE = 0.04
            DROPOUT_RATES = [0.001, 0.01]
            HIDDEN_DIMS = [1000, 500]
            LEARNING_RATE = 0.001
            model = Model(
                embedding_table_shapes=emb_sizes,
                num_continuous=len(cont_names),
                emb_dropout=EMBEDDING_DROPOUT_RATE,
                layer_hidden_dims=HIDDEN_DIMS,
                layer_dropout_rates=DROPOUT_RATES,
            ).cuda()
            optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

            def rmspe_func(y_pred, y):
                "Return y_pred and y to non-log space and compute RMSPE"
                y_pred, y = torch.exp(y_pred) - 1, torch.exp(y) - 1
                pct_var = (y_pred - y) / y
                return (pct_var**2).mean().pow(0.5)

            train_loss, y_pred, y = process_epoch(
                data_itr,
                model,
                train=True,
                optimizer=optimizer,
                amp=False,
            )
            train_rmspe = rmspe_func(y_pred, y)
            assert train_rmspe is not None
            assert len(y_pred) > 0
            assert len(y) > 0