Example #1
def test_avro_basic(tmpdir, part_size, size, nfiles):

    # Requires the uavro and fastavro libraries.
    # Note that fastavro is only required to write
    # avro files for testing, while uavro is actually
    # used by AvroDatasetEngine.
    fa = pytest.importorskip("fastavro")
    pytest.importorskip("uavro")

    # Define avro schema
    schema = fa.parse_schema(
        {
            "name": "avro.example.User",
            "type": "record",
            "fields": [
                {"name": "name", "type": "string"},
                {"name": "age", "type": "int"},
            ],
        }
    )

    # Write avro dataset with nfiles files.
    # Collect block and record (row) count while writing.
    nblocks = 0
    nrecords = 0
    paths = [os.path.join(str(tmpdir), f"test.{i}.avro") for i in range(nfiles)]
    records = []
    for path in paths:
        names = np.random.choice(name_list, size)
        ages = np.random.randint(18, 100, size)
        data = [{"name": names[i], "age": ages[i]} for i in range(size)]
        with open(path, "wb") as f:
            fa.writer(f, schema, data)
        with open(path, "rb") as fo:
            avro_reader = fa.block_reader(fo)
            for block in avro_reader:
                nrecords += block.num_records
                nblocks += 1
                records += list(block)
    if nfiles == 1:
        paths = paths[0]

    # Read back with dask.dataframe
    df = nvt.Dataset(paths, part_size=part_size, engine="avro").to_ddf()

    # Check basic length and partition count
    if part_size == "1KB":
        assert df.npartitions == nblocks
    assert len(df) == nrecords

    # Full comparison
    expect = pd.DataFrame.from_records(records)
    expect["age"] = expect["age"].astype("int32")
    assert_eq(df.compute().reset_index(drop=True), expect)
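The part_size, size and nfiles arguments of test_avro_basic above are supplied by pytest parametrization that this listing does not show (name_list is likewise assumed to be a module-level list of sample names). A minimal, hypothetical sketch of how such parametrization might look:

import pytest

# Hypothetical parameter values; only "1KB" is implied by the assertions above.
@pytest.mark.parametrize("part_size", ["1KB", "1MB"])
@pytest.mark.parametrize("size", [100])
@pytest.mark.parametrize("nfiles", [1, 2])
def test_avro_basic(tmpdir, part_size, size, nfiles):
    ...  # body as shown in Example #1
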
def test_gpu_dl_break(tmpdir, df, dataset, batch_size, part_mem_fraction, engine, device):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(conts + cats + label_name)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths[0], engine="parquet", part_mem_fraction=part_mem_fraction)
    data_itr = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        device=device,
    )
    len_dl = len(data_itr) - 1

    first_chunk = 0
    for idx, chunk in enumerate(data_itr):
        if idx == 0:
            first_chunk = len(chunk[0])
        last_chk = len(chunk[0])
        print(last_chk)
        if idx == 1:
            break
        del chunk

    assert idx < len_dl

    first_chunk_2 = 0
    for idx, chunk in enumerate(data_itr):
        if idx == 0:
            first_chunk_2 = len(chunk[0])
        del chunk
    assert idx == len_dl

    assert first_chunk == first_chunk_2
Example #3
def test_workflow_apply(client, use_client, tmpdir, shuffle, apply_offline):
    out_files_per_proc = 2
    out_path = str(tmpdir.mkdir("processed"))
    path = str(tmpdir.join("simple.parquet"))

    size = 25
    row_group_size = 5

    cont_names = ["cont1", "cont2"]
    cat_names = ["cat1", "cat2"]
    label_name = ["label"]

    df = pd.DataFrame({
        "cont1": np.arange(size, dtype=np.float64),
        "cont2": np.arange(size, dtype=np.float64),
        "cat1": np.arange(size, dtype=np.int32),
        "cat2": np.arange(size, dtype=np.int32),
        "label": np.arange(size, dtype=np.float64),
    })
    df.to_parquet(path, row_group_size=row_group_size, engine="pyarrow")

    dataset = nvt.Dataset(path, engine="parquet", row_groups_per_part=1)

    cat_features = cat_names >> ops.Categorify()
    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(
        min_value=0) >> ops.LogOp

    workflow = Workflow(cat_features + cont_features + label_name,
                        client=client if use_client else None)

    workflow.fit(dataset)

    # Force dtypes
    dict_dtypes = {}
    for col in cont_names:
        dict_dtypes[col] = np.float32
    for col in cat_names:
        dict_dtypes[col] = np.float32
    for col in label_name:
        dict_dtypes[col] = np.int64

    workflow.transform(dataset).to_parquet(
        # apply_offline=apply_offline and record_stats=apply_offline are apparently
        # no longer supported by to_parquet, so they are not passed here.
        output_path=out_path,
        shuffle=shuffle,
        out_files_per_proc=out_files_per_proc,
        dtypes=dict_dtypes,
    )

    # Check dtypes
    for filename in glob.glob(os.path.join(out_path, "*.parquet")):
        gdf = cudf.io.read_parquet(filename)
        assert dict(gdf.dtypes) == dict_dtypes
Example #4
def test_logop_lists(tmpdir, cpu):
    df = dispatch._make_df(device="cpu" if cpu else "gpu")
    df["vals"] = [[np.exp(0) - 1, np.exp(1) - 1], [np.exp(2) - 1], []]

    features = ["vals"] >> nvt.ops.LogOp()
    workflow = nvt.Workflow(features)
    new_df = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()

    expected = dispatch._make_df(device="cpu" if cpu else "gpu")
    expected["vals"] = [[0.0, 1.0], [2.0], []]

    assert_eq(expected, new_df)
def _verify_workflow_on_tritonserver(
    tmpdir,
    workflow,
    df,
    model_name,
    output_model="tensorflow",
    model_info=None,
    sparse_max=None,
):
    """tests that the nvtabular workflow produces the same results when run locally in the
    process, and when run in tritonserver"""
    # fit the workflow and test on the input
    dataset = nvt.Dataset(df)
    workflow.fit(dataset)

    local_df = workflow.transform(dataset).to_ddf().compute(
        scheduler="synchronous")

    for col in workflow.output_node.output_columns.names:
        if sparse_max and col in sparse_max.keys():
            workflow.output_dtypes[col] = workflow.output_dtypes.get(
                col).element_type

    triton.generate_nvtabular_model(
        workflow=workflow,
        name=model_name,
        output_path=tmpdir + f"/{model_name}",
        version=1,
        output_model=output_model,
        output_info=model_info,
        sparse_max=sparse_max,
        backend=BACKEND,
    )

    inputs = triton.convert_df_to_triton_input(df.columns, df)
    outputs = [
        grpcclient.InferRequestedOutput(col)
        for col in workflow.output_dtypes.keys()
    ]
    with run_triton_server(tmpdir) as client:
        response = client.infer(model_name, inputs, outputs=outputs)

        for col in workflow.output_dtypes.keys():
            features = response.as_numpy(col)
            if sparse_max and col in sparse_max:
                features = features.tolist()
                triton_df = _make_df()
                triton_df[col] = features
            else:
                triton_df = _make_df(
                    {col: features.reshape(features.shape[0])})
            assert_eq(triton_df, local_df[[col]])
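A hedged usage sketch of the helper above, with a made-up column name and model name (assumes nvt, ops and _make_df are imported as in the other examples, and that the tritonserver test environment (BACKEND, run_triton_server) is set up elsewhere in the module):

def test_triton_roundtrip_sketch(tmpdir):
    # Hypothetical example: categorify a single column and check local/Triton parity.
    df = _make_df({"user": ["a", "b", "a"]})
    workflow = nvt.Workflow(["user"] >> ops.Categorify())
    _verify_workflow_on_tritonserver(tmpdir, workflow, df, "example_model")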
Example #6
def test_categorify_freq_limit(tmpdir, freq_limit):
    df = pd.DataFrame({
        "Author": [
            "User_A",
            "User_E",
            "User_B",
            "User_C",
            "User_A",
            "User_E",
            "User_B",
            "User_C",
            "User_B",
            "User_C",
        ],
        "Engaging User": [
            "User_B",
            "User_B",
            "User_A",
            "User_D",
            "User_B",
            "User_c",
            "User_A",
            "User_D",
            "User_D",
            "User_D",
        ],
    })

    cat_names = ["Author", "Engaging User"]
    cont_names = []
    label_name = []

    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)

    processor.add_preprocess(
        ops.Categorify(columns=cat_names,
                       freq_threshold=freq_limit,
                       out_path=str(tmpdir)))
    processor.finalize()
    processor.apply(nvt.Dataset(df), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    # Column combinations are encoded
    if isinstance(freq_limit, dict):
        assert df_out["Author"].max() == 2
        assert df_out["Engaging User"].max() == 1
    else:
        assert len(df["Author"].unique()) == df_out["Author"].max()
        assert len(
            df["Engaging User"].unique()) == df_out["Engaging User"].max()
def test_mh_support(tmpdir):
    df = cudf.DataFrame({
        "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                    ["User_C"]],
        "Reviewers": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                      ["User_C"]],
        "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
        "Post": [1, 2, 3, 4],
    })
    cat_names = ["Authors", "Reviewers"]  # , "Engaging User"]
    cont_names = []
    label_name = ["Post"]

    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)
    processor.add_preprocess(ops.HashBucket(num_buckets=10))
    processor.finalize()
    processor.apply(nvt.Dataset(df), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    # check to make sure that the same strings are hashed the same
    authors = df_out["Authors"].to_arrow().to_pylist()
    assert authors[0][0] == authors[1][0]  # 'User_A'
    assert authors[2][1] == authors[3][0]  # 'User_C'

    data_itr = torch_dataloader.TorchAsyncItr(nvt.Dataset(df_out),
                                              cats=cat_names,
                                              conts=cont_names,
                                              labels=label_name)
    idx = 0
    for batch in data_itr:
        idx = idx + 1
        cats, conts, labels = batch
        cats, mh = cats
        # mh is a tuple of dictionaries {Column name: (values, offsets)}
        assert len(mh) == len(cat_names)
        assert not cats
    assert idx > 0
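The comment above describes each multi-hot column as a (values, offsets) pair. A small illustration of how such a pair maps back to per-row lists, assuming the offsets carry a leading zero and one boundary per row (a sketch of the layout, not the dataloader's exact internals):

import numpy as np

values = np.array([7, 7, 3, 1, 4, 1])  # flattened hashed category values
offsets = np.array([0, 1, 3, 5, 6])    # row i spans values[offsets[i]:offsets[i + 1]]
rows = [values[offsets[i]:offsets[i + 1]].tolist() for i in range(len(offsets) - 1)]
assert rows == [[7], [7, 3], [1, 4], [1]]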
Example #8
def dataset(request, paths, engine):
    try:
        gpu_memory_frac = request.getfixturevalue("gpu_memory_frac")
    except Exception:
        gpu_memory_frac = 0.01

    kwargs = {}
    if engine == "csv-no-header":
        kwargs["names"] = allcols_csv

    return nvtabular.Dataset(paths,
                             part_mem_fraction=gpu_memory_frac,
                             **kwargs)
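A hypothetical test consuming the fixture above; the paths and engine fixtures, and allcols_csv, are assumed to be defined elsewhere in the conftest:

@pytest.mark.parametrize("gpu_memory_frac", [0.01, 0.1])
def test_dataset_smoke(dataset, gpu_memory_frac):
    # gpu_memory_frac reaches the fixture through request.getfixturevalue.
    assert dataset.to_ddf().npartitions >= 1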
Example #9
def test_column_similarity(on_device, metric):
    categories = cupy.sparse.coo_matrix(
        (
            cupy.ones(14),
            (
                cupy.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 4, 4, 5, 5]),
                cupy.array([0, 1, 2, 1, 2, 3, 3, 4, 5, 1, 1, 2, 0, 1]),
            ),
        )
    )

    input_df = cudf.DataFrame({"left": [0, 0, 0, 0, 4], "right": [0, 1, 2, 3, 5]})
    op = ColumnSimilarity("output", "left", categories, "right", metric=metric, on_device=on_device)
    workflow = nvtabular.Workflow(cat_names=["left", "right"], cont_names=[], label_name=[])
    workflow.add_feature(op)
    workflow.apply(nvtabular.Dataset(input_df), output_path=None)
    df = workflow.get_ddf().compute()

    output = df.output.values
    if metric in ("tfidf", "cosine"):
        # similarity of document 0 to itself should be 1, since these metrics are fully normalized
        assert float(output[0]) == pytest.approx(1)

    # similarity of document 0 to document 2 should be 0 since they have no features in common
    assert output[2] == 0

    # similarity of document 4 to 5 should be non-zero (they have category 1 in common)
    assert output[4] != 0

    # make sure that we can operate multiple times on the same matrix correctly
    op = ColumnSimilarity(
        "output", "left", categories, "right", metric="inner", on_device=on_device
    )

    workflow = nvtabular.Workflow(cat_names=["left", "right"], cont_names=[], label_name=[])
    workflow.add_feature(op)
    workflow.apply(nvtabular.Dataset(df), output_path=None)
    df = workflow.get_ddf().compute()
    assert float(df.output.values[0]) == pytest.approx(3)
Example #10
def test_column_similarity(on_device, metric):
    categories = coo_matrix(
        (
            cupy.ones(14),
            (
                cupy.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 4, 4, 5, 5]),
                cupy.array([0, 1, 2, 1, 2, 3, 3, 4, 5, 1, 1, 2, 0, 1]),
            ),
        )
    )

    input_df = cudf.DataFrame({"left": [0, 0, 0, 0, 4], "right": [0, 1, 2, 3, 5]})

    sim_features = [["left", "right"]] >> ColumnSimilarity(
        categories, metric=metric, on_device=on_device
    )
    workflow = nvtabular.Workflow(sim_features)

    df = workflow.transform(nvtabular.Dataset(input_df)).to_ddf().compute()
    output = df["left_right_sim"].values
    if metric in ("tfidf", "cosine"):
        # similarity of document 0 to itself should be 1, since these metrics are fully normalized
        assert float(output[0]) == pytest.approx(1)

    # similarity of document 0 to document 2 should be 0 since they have no features in common
    assert output[2] == 0

    # similarity of document 4 to 5 should be non-zero (they have category 1 in common)
    assert output[4] != 0

    # make sure that we can operate multiple times on the same matrix correctly
    sim_features = [["left", "right"]] >> ColumnSimilarity(
        categories, metric="inner", on_device=on_device
    )
    workflow = nvtabular.Workflow(sim_features)

    df = workflow.transform(nvtabular.Dataset(input_df)).to_ddf().compute()

    assert float(df["left_right_sim"].values[0]) == pytest.approx(3)
Example #11
def test_join_external(tmpdir, df, dataset, engine, kind_ext, cache, how,
                       drop_duplicates):

    # Define "external" table
    shift = 100
    df_ext = df[["id"]].copy().sort_values("id")
    df_ext["new_col"] = df_ext["id"] + shift
    df_ext["new_col_2"] = "keep"
    df_ext["new_col_3"] = "ignore"
    df_ext_check = df_ext.copy()
    if kind_ext == "pandas":
        df_ext = df_ext.to_pandas()
    elif kind_ext == "arrow":
        df_ext = df_ext.to_arrow()
    elif kind_ext == "parquet":
        path = tmpdir.join("external.parquet")
        df_ext.to_parquet(path)
        df_ext = path
    elif kind_ext == "csv":
        path = tmpdir.join("external.csv")
        df_ext.to_csv(path)
        df_ext = path

    # Define Op
    on = "id"
    columns_left = list(df.columns)
    columns_ext = ["id", "new_col", "new_col_2"]
    df_ext_check = df_ext_check[columns_ext]
    if drop_duplicates:
        df_ext_check.drop_duplicates(ignore_index=True, inplace=True)
    joined = nvt.ColumnGroup(columns_left) >> nvt.ops.JoinExternal(
        df_ext,
        on,
        how=how,
        columns_ext=columns_ext,
        cache=cache,
        drop_duplicates_ext=drop_duplicates,
    )

    gdf = df.reset_index()
    dataset = nvt.Dataset(gdf)
    processor = nvt.Workflow(joined)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute().reset_index()

    check_gdf = gdf.merge(df_ext_check, how=how, on=on)
    assert len(check_gdf) == len(new_gdf)
    assert (new_gdf["id"] + shift).all() == new_gdf["new_col"].all()
    assert gdf["id"].all() == new_gdf["id"].all()
    assert "new_col_2" in new_gdf.columns
    assert "new_col_3" not in new_gdf.columns
Example #12
def test_target_encode(tmpdir, cat_groups, kfold, fold_seed):
    df = cudf.DataFrame({
        "Author": list(string.ascii_uppercase),
        "Engaging-User": list(string.ascii_lowercase),
        "Cost": range(26),
        "Post": [0, 1] * 13,
    })
    df = dask_cudf.from_cudf(df, npartitions=3)

    cat_names = ["Author", "Engaging-User"]
    cont_names = ["Cost"]
    label_name = ["Post"]

    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)
    processor.add_feature(
        [ops.FillMissing(),
         ops.Clip(min_value=0),
         ops.LogOp()])
    processor.add_preprocess(
        ops.TargetEncoding(
            cat_groups,
            "Cost",  # cont_target
            out_path=str(tmpdir),
            kfold=kfold,
            out_col="test_name",
            out_dtype="float32",
            fold_seed=fold_seed,
            drop_folds=False,  # Keep folds to validate
        ))
    processor.finalize()
    processor.apply(nvt.Dataset(df), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    assert "test_name" in df_out.columns
    assert df_out["test_name"].dtype == "float32"

    if kfold > 1:
        # Cat columns are unique.
        # Make sure __fold__ mapping is correct
        if cat_groups == "Author":
            name = "__fold___Author"
            cols = ["__fold__", "Author"]
        else:
            name = "__fold___Author_Engaging-User"
            cols = ["__fold__", "Author", "Engaging-User"]
        check = cudf.io.read_parquet(processor.stats["te_stats"][name])
        check = check[cols].sort_values(cols).reset_index(drop=True)
        df_out_check = df_out[cols].sort_values(cols).reset_index(drop=True)
        assert_eq(check, df_out_check)
Example #13
def test_s3_dataset(s3_base, s3so, paths, datasets, engine, df):

    # Copy files to mock s3 bucket
    files = {}
    for i, path in enumerate(paths):
        with open(path, "rb") as f:
            fbytes = f.read()
        fn = path.split(os.path.sep)[-1]
        files[fn] = BytesIO()
        files[fn].write(fbytes)
        files[fn].seek(0)

    if engine == "parquet":
        # Workaround for nvt#539. In order to avoid the
        # bug in Dask's `create_metadata_file`, we need
        # to manually generate a "_metadata" file here.
        # This can be removed after dask#7295 is merged
        # (see https://github.com/dask/dask/pull/7295)
        fn = "_metadata"
        files[fn] = BytesIO()
        meta = create_metadata_file(
            paths,
            engine="pyarrow",
            out_dir=False,
        )
        meta.write_metadata_file(files[fn])
        files[fn].seek(0)

    with s3_context(s3_base=s3_base, bucket=engine, files=files):

        # Create nvt.Dataset from mock s3 paths
        url = f"s3://{engine}" if engine == "parquet" else f"s3://{engine}/*"
        dataset = nvt.Dataset(url, engine=engine, storage_options=s3so)

        # Check that the iteration API works
        columns = mycols_pq if engine == "parquet" else mycols_csv
        gdf = cudf.concat(list(dataset.to_iter()))[columns]
        assert_eq(gdf.reset_index(drop=True), df.reset_index(drop=True))

        cat_names = ["name-cat", "name-string"
                     ] if engine == "parquet" else ["name-string"]
        cont_names = ["x", "y", "id"]
        label_name = ["label"]

        conts = cont_names >> ops.FillMissing() >> ops.Clip(
            min_value=0) >> ops.LogOp()
        cats = cat_names >> ops.Categorify(cat_cache="host")

        processor = nvt.Workflow(conts + cats + label_name)
        processor.fit(dataset)
Example #14
def save_stats(data_bucket_folder, output_train_folder, train_path,
               output_valid_folder, valid_path, stats_file, hash_spec,
               local_directory, dask):
    devices = get_devices()
    shuffle = Shuffle.PER_PARTITION if len(devices) > 1 else True

    workflow = create_workflow(data_bucket_folder=data_bucket_folder,
                               hash_spec=hash_spec,
                               devices=devices,
                               local_directory=local_directory,
                               dask=dask)

    train_dataset = nvt.Dataset(train_path, part_size="1GB")
    valid_dataset = nvt.Dataset(valid_path, part_size="150MB")
    workflow.fit(train_dataset)
    workflow.transform(train_dataset).to_parquet(
        output_path=output_train_folder, shuffle=shuffle, out_files_per_proc=8)
    workflow.transform(valid_dataset).to_parquet(
        output_path=output_valid_folder, shuffle=None, output_files=8)

    workflow.save(stats_file)

    return workflow
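The statistics saved above can later be restored with nvt.Workflow.load, the same API the Triton examples further down use; a one-line sketch:

workflow = nvt.Workflow.load(stats_file)  # stats_file is the directory passed to workflow.save above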
Example #15
def test_categorify_max_size(max_emb_size):
    df = cudf.DataFrame({
        "Author": [
            "User_A",
            "User_E",
            "User_B",
            "User_C",
            "User_A",
            "User_E",
            "User_B",
            "User_C",
            "User_D",
            "User_F",
            "User_F",
        ],
        "Engaging_User": [
            "User_B",
            "User_B",
            "User_A",
            "User_D",
            "User_B",
            "User_M",
            "User_A",
            "User_D",
            "User_N",
            "User_F",
            "User_E",
        ],
    })

    cat_names = ["Author", "Engaging_User"]
    buckets = 3
    dataset = nvt.Dataset(df)
    cat_features = cat_names >> ops.Categorify(max_size=max_emb_size,
                                               num_buckets=buckets)
    processor = nvt.Workflow(cat_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    if isinstance(max_emb_size, int):
        max_emb_size = {name: max_emb_size for name in cat_names}

    # check encoded values after freq-hashing with fixed emb size
    assert new_gdf["Author"].max() <= max_emb_size["Author"]
    assert new_gdf["Engaging_User"].max() <= max_emb_size["Engaging_User"]
    # check embedding size is less than max_size after hashing with fixed emb size.
    assert nvt.ops.get_embedding_sizes(
        processor)["Author"][0] <= max_emb_size["Author"]
    assert (nvt.ops.get_embedding_sizes(processor)["Engaging_User"][0] <=
            max_emb_size["Engaging_User"])
def test_generate_triton_multihot(tmpdir):
    df = _make_df({
        "userId": ["a", "a", "b"],
        "movieId": ["1", "2", "2"],
        "genres": [["action", "adventure"], ["action", "comedy"], ["comedy"]],
    })

    cats = ["userId", "movieId", "genres"] >> nvt.ops.Categorify()
    workflow = nvt.Workflow(cats)
    workflow.fit(nvt.Dataset(df))
    expected = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    # save workflow to triton / verify we see some expected output
    repo = os.path.join(tmpdir, "models")
    triton.generate_nvtabular_model(workflow, "model", repo)
    workflow = None

    assert os.path.exists(os.path.join(repo, "config.pbtxt"))

    workflow = nvt.Workflow.load(os.path.join(repo, "1", "workflow"))
    transformed = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    assert_eq(expected, transformed)
def test_remove_columns():
    # _remove_columns was failing to export the criteo example, because
    # the label column was getting inserted into the subgroups of the output node
    # https://github.com/NVIDIA-Merlin/NVTabular/issues/1198
    label_columns = ["label"]
    cats = ["a"] >> ops.Categorify()
    conts = ["b"] >> ops.Normalize()
    workflow = nvt.Workflow(cats + conts + label_columns)

    df = pd.DataFrame({"a": ["a", "b"], "b": [1.0, 2.0], "label": [0, 1]})
    workflow.fit(nvt.Dataset(df))

    removed = ensemble._remove_columns(workflow, label_columns)
    assert set(removed.output_dtypes.keys()) == {"a", "b"}
Example #18
def test_validater(tmpdir, batch_size):
    n_samples = 9
    rand = np.random.RandomState(0)

    gdf = cudf.DataFrame({
        "a": rand.randn(n_samples),
        "label": rand.randint(2, size=n_samples)
    })

    dataloader = tf_dataloader.KerasSequenceLoader(
        nvt.Dataset(gdf),
        batch_size=batch_size,
        cat_names=[],
        cont_names=["a"],
        label_names=["label"],
        shuffle=False,
    )

    input = tf.keras.Input(name="a", dtype=tf.float32, shape=(1, ))
    x = tf.keras.layers.Dense(128, "relu")(input)
    x = tf.keras.layers.Dense(1, activation="softmax")(x)

    model = tf.keras.Model(inputs=input, outputs=x)
    model.compile("sgd",
                  "binary_crossentropy",
                  metrics=["accuracy", tf.keras.metrics.AUC()])

    validater = tf_dataloader.KerasSequenceValidater(dataloader)
    model.fit(dataloader, epochs=2, verbose=0, callbacks=[validater])

    predictions, labels = [], []
    for X, y_true in dataloader:
        y_pred = model(X)
        labels.extend(y_true.numpy()[:, 0])
        predictions.extend(y_pred.numpy()[:, 0])
    predictions = np.array(predictions)
    labels = np.array(labels)

    logs = {}
    validater.on_epoch_end(0, logs)
    auc_key = [i for i in logs.keys() if i.startswith("val_auc")][0]

    true_accuracy = (labels == (predictions > 0.5)).mean()
    estimated_accuracy = logs["val_accuracy"]
    assert np.isclose(true_accuracy, estimated_accuracy, rtol=1e-6)

    true_auc = roc_auc_score(labels, predictions)
    estimated_auc = logs[auc_key]
    assert np.isclose(true_auc, estimated_auc, rtol=1e-6)
Example #19
def test_workflow_generate_columns(tmpdir, use_parquet):
    out_path = str(tmpdir.mkdir("processed"))
    path = str(tmpdir.join("simple.parquet"))

    # Stripped-down dataset with geo_location codes like in the Outbrain dataset
    df = cudf.DataFrame({"geo_location": ["US>CA", "CA>BC", "US>TN>659"]})

    # defining a simple workflow that extracts the country code from the first two characters of
    # the geo_location code (e.g. "US>CA" -> "US") and writes it to a new 'geo_location_country' field
    country = (["geo_location"] >> ops.LambdaOp(
        f=lambda col: col.str.slice(0, 2), ) >> ops.Rename(postfix="_country"))
    cat_features = ["geo_location"] + country >> ops.Categorify()

    workflow = Workflow(cat_features)

    if use_parquet:
        df.to_parquet(path)
        dataset = nvt.Dataset(path)
    else:
        dataset = nvt.Dataset(df)

    # just make sure this works without errors
    workflow.fit(dataset)
    workflow.transform(dataset).to_parquet(out_path)
Example #20
def test_target_encode_multi(tmpdir, npartitions, cpu):
    cat_1 = np.asarray(["baaaa"] * 12)
    cat_2 = np.asarray(["baaaa"] * 6 + ["bbaaa"] * 3 + ["bcaaa"] * 3)
    num_1 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4])
    num_2 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4]) * 2
    df = dispatch._make_df({
        "cat": cat_1,
        "cat2": cat_2,
        "num": num_1,
        "num_2": num_2
    })
    if cpu:
        df = dd.from_pandas(
            df if isinstance(df, pd.DataFrame) else df.to_pandas(),
            npartitions=npartitions)
    else:
        df = dask_cudf.from_cudf(df, npartitions=npartitions)

    cat_groups = ["cat", "cat2", ["cat", "cat2"]]
    te_features = cat_groups >> ops.TargetEncoding(["num", "num_2"],
                                                   out_path=str(tmpdir),
                                                   kfold=1,
                                                   p_smooth=5,
                                                   out_dtype="float32")

    workflow = nvt.Workflow(te_features)

    df_out = workflow.fit_transform(
        nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    assert "TE_cat_cat2_num" in df_out.columns
    assert "TE_cat_num" in df_out.columns
    assert "TE_cat2_num" in df_out.columns
    assert "TE_cat_cat2_num_2" in df_out.columns
    assert "TE_cat_num_2" in df_out.columns
    assert "TE_cat2_num_2" in df_out.columns

    assert_eq(df_out["TE_cat2_num"].values, df_out["TE_cat_cat2_num"].values)
    assert_eq(df_out["TE_cat2_num_2"].values,
              df_out["TE_cat_cat2_num_2"].values)
    assert df_out["TE_cat_num"].iloc[0] != df_out["TE_cat2_num"].iloc[0]
    assert df_out["TE_cat_num_2"].iloc[0] != df_out["TE_cat2_num_2"].iloc[0]
    assert math.isclose(df_out["TE_cat_num"].iloc[0],
                        num_1.mean(),
                        abs_tol=1e-4)
    assert math.isclose(df_out["TE_cat_num_2"].iloc[0],
                        num_2.mean(),
                        abs_tol=1e-3)
def _convert_file(path, name, out_dir, gpu_mem_frac, fs, cols, dtypes):
    fn = f"{name}.parquet"
    out_path = fs.sep.join([out_dir, f"{name}.parquet"])
    writer = ParquetWriter(out_path, compression=None)
    for gdf in nvt.Dataset(
            path,
            engine="csv",
            names=cols,
            part_mem_fraction=gpu_mem_frac,
            sep='\t',
            dtypes=dtypes,
    ).to_iter():
        writer.write_table(gdf)
        del gdf
    md = writer.close(metadata_file_path=fn)
    return md
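A hedged usage sketch of _convert_file with made-up paths and column names (fs is an fsspec filesystem; the real column list, dtypes and memory fraction come from the surrounding preprocessing script):

import fsspec

fs = fsspec.filesystem("file")
cols = ["label"] + [f"int_{i}" for i in range(13)] + [f"cat_{i}" for i in range(26)]
md = _convert_file(
    "/data/day_0",      # hypothetical tab-separated input file
    "day_0",
    "/data/parquet",    # hypothetical output directory
    gpu_mem_frac=0.1,
    fs=fs,
    cols=cols,
    dtypes=None,        # or a {column: dtype} mapping
)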
Example #22
def test_fill_missing(tmpdir, df, dataset, engine):
    cont_names = ["x", "y"]
    cont_features = cont_names >> nvt.ops.FillMissing(fill_val=42)

    for col in cont_names:
        idx = np.random.choice(df.shape[0] - 1, int(df.shape[0] * 0.2))
        df[col].iloc[idx] = None

    df = df.reset_index()
    dataset = nvt.Dataset(df)
    processor = nvt.Workflow(cont_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()
    for col in cont_names:
        assert np.all((df[col].fillna(42) - new_gdf[col]).abs().values <= 1e-2)
        assert new_gdf[col].isna().sum() == 0
Example #23
def test_joingroupby_dependency(tmpdir):
    df = pd.DataFrame({
        "Author": ["User_A", "User_A", "User_A", "User_B", "User_B"],
        "Cost": [100.0, 200.0, 300.0, 400.0, 400.0],
    })

    normalized_cost = ["Cost"] >> nvt.ops.NormalizeMinMax() >> nvt.ops.Rename(
        postfix="_normalized")
    groupby_features = ["Author"] >> ops.JoinGroupby(
        out_path=str(tmpdir), stats=["sum"], cont_cols=normalized_cost)
    workflow = nvt.Workflow(groupby_features)

    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()
    assert df_out["Author_Cost_normalized_sum"].to_arrow().to_pylist() == [
        1.0, 1.0, 1.0, 2.0, 2.0
    ]
def test_torch_drp_reset(tmpdir, batch_size, drop_last, num_rows):
    df = nvt.dispatch._make_df({
        "cat1": [1] * num_rows,
        "cat2": [2] * num_rows,
        "cat3": [3] * num_rows,
        "label": [0] * num_rows,
        "cont3": [3.0] * num_rows,
        "cont2": [2.0] * num_rows,
        "cont1": [1.0] * num_rows,
    })
    path = os.path.join(tmpdir, "dataset.parquet")
    df.to_parquet(path)
    cat_names = ["cat3", "cat2", "cat1"]
    cont_names = ["cont3", "cont2", "cont1"]
    label_name = ["label"]

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset([path]),
        cats=cat_names,
        conts=cont_names,
        labels=label_name,
        batch_size=batch_size,
        drop_last=drop_last,
        device="cpu",
    )

    all_len = len(data_itr) if drop_last else len(data_itr) - 1
    all_rows = 0
    df_cols = df.columns.to_list()
    for idx, chunk in enumerate(data_itr):
        all_rows += len(chunk[0]["cat1"])
        if idx < all_len:
            for col in df_cols:
                if col in chunk[0].keys():
                    if nvt.dispatch.HAS_GPU:
                        assert (list(
                            chunk[0][col].cpu().numpy()) == df[col].values_host
                                ).all()
                    else:
                        assert (list(
                            chunk[0][col].cpu().numpy()) == df[col].values
                                ).all()

    if drop_last and num_rows % batch_size > 0:
        assert num_rows > all_rows
    else:
        assert num_rows == all_rows
Example #25
def test_spec_set(tmpdir, client):
    gdf_test = cudf.DataFrame({
        "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
        "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
        "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
        "cont": [1, 2, np.nan, 2, 1, 3, 3, 1],
        "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
    })

    cats = ColumnGroup(["ad_id", "source_id", "platform"])
    cat_features = cats >> ops.Categorify
    cont_features = ColumnGroup(["cont"]) >> ops.FillMissing >> ops.Normalize
    te_features = cats >> ops.TargetEncoding(
        "clicked", kfold=5, fold_seed=42, p_smooth=20)

    p = Workflow(cat_features + cont_features + te_features, client=client)
    p.fit_transform(nvt.Dataset(gdf_test)).to_ddf().compute()
Example #26
def test_target_encode(tmpdir, cat_groups, kfold, fold_seed, cpu):
    df = dispatch._make_df({
        "Author": list(string.ascii_uppercase),
        "Engaging-User": list(string.ascii_lowercase),
        "Cost": range(26),
        "Post": [0, 1] * 13,
    })
    if cpu:
        df = dd.from_pandas(
            df if isinstance(df, pd.DataFrame) else df.to_pandas(),
            npartitions=3)
    else:
        df = dask_cudf.from_cudf(df, npartitions=3)

    cont_names = ["Cost"]
    te_features = cat_groups >> ops.TargetEncoding(
        cont_names,
        out_path=str(tmpdir),
        kfold=kfold,
        out_dtype="float32",
        fold_seed=fold_seed,
        drop_folds=False,  # Keep folds to validate
    )

    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(
        min_value=0) >> ops.LogOp()
    workflow = nvt.Workflow(te_features + cont_features +
                            ["Author", "Engaging-User"])
    df_out = workflow.fit_transform(
        nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    df_lib = dispatch.get_lib()
    if kfold > 1:
        # Cat columns are unique.
        # Make sure __fold__ mapping is correct
        if cat_groups == "Author":
            name = "__fold___Author"
            cols = ["__fold__", "Author"]
        else:
            name = "__fold___Author_Engaging-User"
            cols = ["__fold__", "Author", "Engaging-User"]

        check = df_lib.read_parquet(te_features.op.stats[name])
        check = check[cols].sort_values(cols).reset_index(drop=True)
        df_out_check = df_out[cols].sort_values(cols).reset_index(drop=True)
        assert_eq(check, df_out_check, check_dtype=False)
Example #27
def run_perf_analyzer(model_path,
                      input_data_path,
                      num_rows=10,
                      model_version=1):
    """Runs perf_analyzer and returns a dataframe with statistics from it

    Parameters
    ----------
    model_path : str
        The full path to the model to analyze.
    input_data_path: str
        Path to datafiles containing example data to query the model with. Can
        be anything we can pass to an nvt.Dataset object (CSV file/parquet etc.)
    num_rows: int
        How many rows to query for
    model_version: int
        Which model version to use
    """
    # load the workflow and get the base perf analyzer commandline
    model_name = os.path.basename(model_path)

    workflow_path = os.path.join(model_path, str(model_version), "workflow")
    workflow = nvt.Workflow.load(workflow_path)
    cmdline = _get_perf_analyzer_commandline(workflow,
                                             model_name,
                                             batch_size=num_rows)

    # read in the input data and write out as a JSON file
    df = nvt.Dataset(input_data_path).to_ddf().head(num_rows)
    json_data = _convert_df_to_triton_json(df, workflow.input_dtypes)

    with tempfile.NamedTemporaryFile("w", suffix=".json") as json_file:
        json.dump(json_data, json_file, indent=2)
        cmdline.extend(["--input-data", json_file.name])
        json_file.flush()

        with tempfile.NamedTemporaryFile("w", suffix=".csv") as csv_report:
            csv_report.close()
            cmdline.extend(["-f", csv_report.name])
            result = subprocess.run(cmdline,
                                    stdout=subprocess.PIPE,
                                    check=True,
                                    encoding="utf8")
            print(result.stdout)
            return pd.read_csv(csv_report.name)
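A hedged usage sketch with hypothetical paths (perf_analyzer and tritonserver are assumed to be available on the host):

stats = run_perf_analyzer(
    "/models/nvt_workflow",    # model directory exported by generate_nvtabular_model
    "/data/example.parquet",   # any source accepted by nvt.Dataset
    num_rows=10,
)
print(stats.head())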
Example #28
def test_hashed_cross(tmpdir, df, dataset, gpu_memory_frac, engine):
    # TODO: add tests for > 2 features, multiple crosses, etc.
    cat_names = [["name-string", "id"]]
    num_buckets = 10

    hashed_cross = cat_names >> ops.HashedCross(num_buckets)
    dataset = nvt.Dataset(df)
    processor = nvtabular.Workflow(hashed_cross)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # check sums for determinism
    new_column_name = "_X_".join(cat_names[0])
    assert np.all(new_gdf[new_column_name].values >= 0)
    assert np.all(new_gdf[new_column_name].values <= 9)
    checksum = new_gdf[new_column_name].sum()
    new_gdf = processor.transform(dataset).to_ddf().compute()
    assert new_gdf[new_column_name].sum() == checksum
Example #29
def test_join_external_workflow(tmpdir, df, dataset, engine):

    # Define "external" table
    how = "left"
    drop_duplicates = True
    cache = "device"
    shift = 100
    df_ext = df[["id"]].copy().sort_values("id")
    df_ext["new_col"] = df_ext["id"] + shift
    df_ext["new_col_2"] = "keep"
    df_ext["new_col_3"] = "ignore"
    df_ext_check = df_ext.copy()

    # Define Op
    on = "id"
    columns_left = list(df.columns)
    columns_ext = ["id", "new_col", "new_col_2"]
    df_ext_check = df_ext_check[columns_ext]
    if drop_duplicates:
        df_ext_check.drop_duplicates(ignore_index=True, inplace=True)
    joined = nvt.ColumnGroup(columns_left) >> nvt.ops.JoinExternal(
        df_ext,
        on,
        how=how,
        columns_ext=columns_ext,
        cache=cache,
        drop_duplicates_ext=drop_duplicates,
    )

    # Define Workflow
    gdf = df.reset_index()
    dataset = nvt.Dataset(gdf)
    processor = nvt.Workflow(joined)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute().reset_index()

    # Validate
    check_gdf = gdf.merge(df_ext_check, how=how, on=on)
    assert len(check_gdf) == len(new_gdf)
    assert (new_gdf["id"] + shift).all() == new_gdf["new_col"].all()
    assert gdf["id"].all() == new_gdf["id"].all()
    assert "new_col_2" in new_gdf.columns
    assert "new_col_3" not in new_gdf.columns
Example #30
def test_categorify_size(tmpdir, cpu, include_nulls):
    num_rows = 50
    num_distinct = 10

    possible_session_ids = list(range(num_distinct))
    if include_nulls:
        possible_session_ids.append(None)

    df = dispatch._make_df(
        {
            "session_id":
            [random.choice(possible_session_ids) for _ in range(num_rows)]
        },
        device="cpu" if cpu else None,
    )

    cat_features = ["session_id"] >> nvt.ops.Categorify(out_path=str(tmpdir))
    workflow = nvt.Workflow(cat_features)
    workflow.fit_transform(nvt.Dataset(df, cpu=cpu)).to_ddf().compute()

    vals = df["session_id"].value_counts()
    vocab = dispatch._read_dispatch(cpu=cpu)(os.path.join(
        tmpdir, "categories", "unique.session_id.parquet"))

    if cpu:
        expected = dict(zip(vals.index, vals))
        computed = {
            session: size
            for session, size in zip(vocab["session_id"],
                                     vocab["session_id_size"]) if size
        }
    else:
        expected = dict(zip(vals.index.values_host, vals.values_host))
        computed = {
            session: size
            for session, size in zip(vocab["session_id"].values_host,
                                     vocab["session_id_size"].values_host)
            if size
        }
    first_key = list(computed.keys())[0]
    if pd.isna(first_key):
        computed.pop(first_key)
    assert computed == expected