def test_workflow_move_saved(tmpdir):
    raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split()
    data = nvt.dispatch._make_df({"geo": raw})

    geo_location = ColumnSelector(["geo"])
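    # derive state (first five characters) and country (first two characters)
    # columns from the raw geo code, then categorify all three columns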
    state = (geo_location >> ops.LambdaOp(lambda col: col.str.slice(0, 5)) >>
             ops.Rename(postfix="_state"))
    country = (geo_location >> ops.LambdaOp(lambda col: col.str.slice(0, 2)) >>
               ops.Rename(postfix="_country"))
    geo_features = state + country + geo_location >> ops.Categorify()

    # create the workflow and transform the input
    workflow = Workflow(geo_features)
    expected = workflow.fit_transform(Dataset(data)).to_ddf().compute()

    # save the workflow (including categorical mapping parquet files)
    # and then verify we can load the saved workflow after moving the directory
    out_path = os.path.join(tmpdir, "output", "workflow")
    workflow.save(out_path)

    moved_path = os.path.join(tmpdir, "output", "workflow2")
    shutil.move(out_path, moved_path)
    workflow2 = Workflow.load(moved_path)

    # also check that when transforming our input we get the same results after loading
    transformed = workflow2.transform(Dataset(data)).to_ddf().compute()
    assert_eq(expected, transformed)
def test_lambdaop_misalign(cpu):
    size = 12
    df0 = pd.DataFrame({
        "a": np.arange(size),
        "b": np.random.choice(["apple", "banana", "orange"], size),
        "c": np.random.choice([0, 1], size),
    })

    ddf0 = dd.from_pandas(df0, npartitions=4)

    cont_names = ColumnSelector(["a"])
    cat_names = ColumnSelector(["b"])
    label = ColumnSelector(["c"])
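    # pick the array backend that matches where the data lives: numpy for CPU
    # dataframes, cupy for GPU dataframes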
    if cpu:
        label_feature = label >> ops.LambdaOp(
            lambda col: np.where(col == 4, 0, 1))
    else:
        label_feature = label >> ops.LambdaOp(
            lambda col: cp.where(col == 4, 0, 1))
    workflow = nvt.Workflow(cat_names + cont_names + label_feature)

    dataset = nvt.Dataset(ddf0, cpu=cpu)
    transformed = workflow.transform(dataset)
    assert_eq_dd(
        df0[["a", "b"]],
        transformed.to_ddf().compute()[["a", "b"]],
        check_index=False,
    )
def test_chaining_3():
    gdf_test = cudf.DataFrame({
        "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
        "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
        "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
        "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
    })

    platform_features = ["platform"] >> ops.Dropna()
    joined = ["ad_id"] >> ops.JoinGroupby(cont_cols=["clicked"],
                                          stats=["sum", "count"])
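    # divide each grouped statistic by the per-ad count and tag the result with
    # "_ctr"; ad_id_clicked_sum / ad_id_count is the per-ad click-through rate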
    joined_lambda = (
        joined >> ops.LambdaOp(f=lambda col, gdf: col / gdf["ad_id_count"]) >>
        ops.Rename(postfix="_ctr"))

    workflow = Workflow(platform_features + joined + joined_lambda)

    dataset = nvt.Dataset(gdf_test, engine="parquet")

    workflow.fit(dataset)

    result = workflow.transform(dataset).to_ddf().compute()

    assert all(
        x in result.columns
        for x in ["ad_id_count", "ad_id_clicked_sum_ctr", "ad_id_clicked_sum"])
def test_chaining_2():
    gdf = cudf.DataFrame({
        "A": [1, 2, 2, 9, 6, np.nan, 3],
        "B": [2, np.nan, 4, 7, 7, 2, 5],
        "C": ["a", "b", "c", np.nan, np.nan, "g", "k"],
    })

    cat_names = ["C"]
    cont_names = ["A", "B"]
    label_name = []

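    # flag missing values in every column with a boolean "<name>_isnull" feature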
    all_features = (cat_names + cont_names >>
                    ops.LambdaOp(f=lambda col: col.isnull()) >>
                    ops.Rename(postfix="_isnull"))
    cat_features = cat_names >> ops.Categorify()

    workflow = Workflow(all_features + cat_features + label_name)

    dataset = nvt.Dataset(gdf, engine="parquet")

    workflow.fit(dataset)

    result = workflow.transform(dataset).to_ddf().compute()

    assert all(x in list(result.columns)
               for x in ["A_isnull", "B_isnull", "C_isnull"])
    # Categorify should assign a code to every distinct non-null input category
    assert len(result["C"].unique()) >= len(set(gdf["C"].dropna().to_arrow()))
def test_concatenate_dataframe(tmpdir, output_model):
    # we were seeing an issue in the Rossmann workflow where certain columns were dropped,
    # https://github.com/NVIDIA/NVTabular/issues/961
    df = _make_df({
        "cat": ["aaaa", "bbbb", "cccc", "aaaa", "bbbb", "aaaa"],
        "cont": [0.0, 1.0, 2.0, 3.0, 4.0, 5],
    })
    # this bug only happened with a dataframe representation: force this by using a lambda
    cats = ["cat"] >> ops.LambdaOp(lambda col: _hash_series(col) % 1000)
    conts = ["cont"] >> ops.Normalize() >> ops.FillMissing() >> ops.LogOp()

    dataset = Dataset(df)
    workflow = nvt.Workflow(cats + conts).fit_schema(dataset.infer_schema())

    if output_model == "pytorch":
        model_info = {
            "cat": {
                "columns": ["cat"],
                "dtype": "int32"
            },
            "cont": {
                "columns": ["cont"],
                "dtype": "float32"
            },
        }
    else:
        model_info = None

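    # make sure the workflow can be exported to and served from tritonserver
    # without dropping any of the concatenated columns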
    _verify_workflow_on_tritonserver(tmpdir, workflow, df,
                                     "test_concatenate_dataframe",
                                     output_model, model_info)
def test_workflow_generate_columns(tmpdir, use_parquet):
    out_path = str(tmpdir.mkdir("processed"))
    path = str(tmpdir.join("simple.parquet"))

    # Stripped-down dataset with geo_location codes like in the Outbrain dataset
    df = nvt.dispatch._make_df(
        {"geo_location": ["US>CA", "CA>BC", "US>TN>659"]})

    # defining a simple workflow that strips out the country code from the first two digits of the
    # geo_location code and sticks in a new 'geo_location_country' field
    country = (["geo_location"] >> ops.LambdaOp(
        f=lambda col: col.str.slice(0, 2), ) >> ops.Rename(postfix="_country"))
    cat_features = ["geo_location"] + country >> ops.Categorify()

    workflow = Workflow(cat_features)

    if use_parquet:
        df.to_parquet(path)
        dataset = nvt.Dataset(path)
    else:
        dataset = nvt.Dataset(df)

    # just make sure this works without errors
    workflow.fit(dataset)
    workflow.transform(dataset).to_parquet(out_path)
def test_workflow_generate_columns(tmpdir, use_parquet):
    out_path = str(tmpdir.mkdir("processed"))
    path = str(tmpdir.join("simple.parquet"))

    # Stripped-down dataset with geo_location codes like in the Outbrain dataset
    df = cudf.DataFrame({"geo_location": ["US>CA", "CA>BC", "US>TN>659"]})

    # defining a simple workflow that strips out the country code from the first two digits of the
    # geo_location code and sticks in a new 'geo_location_country' field
    cat_names = ["geo_location", "geo_location_country"]
    workflow = nvt.Workflow(cat_names=cat_names, cont_names=[], label_name=[])
    workflow.add_feature(
        [
            ops.LambdaOp(
                op_name="country",
                f=lambda col, gdf: col.str.slice(0, 2),
                columns=["geo_location"],
                replace=False,
            ),
            ops.Categorify(replace=False),
        ]
    )
    workflow.finalize()

    if use_parquet:
        df.to_parquet(path)
        dataset = nvt.Dataset(path)
    else:
        dataset = nvt.Dataset(df)

    # just make sure this works without errors
    workflow.apply(dataset, output_path=out_path)
def test_transform_geolocation():
    raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split()
    data = nvt.dispatch._make_df({"geo_location": raw})

    geo_location = ColumnSelector(["geo_location"])
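    # hash-bucket the raw geo code plus derived state (first five characters)
    # and country (first two characters) columns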
    state = (geo_location >> ops.LambdaOp(lambda col: col.str.slice(0, 5)) >>
             ops.Rename(postfix="_state"))
    country = (geo_location >> ops.LambdaOp(lambda col: col.str.slice(0, 2)) >>
               ops.Rename(postfix="_country"))
    geo_features = state + country + geo_location >> ops.HashBucket(
        num_buckets=100)

    # for this workflow we don't have any stat operators, so we can get away without fitting
    workflow = Workflow(geo_features)
    transformed = workflow.transform(Dataset(data)).to_ddf().compute()

    expected = nvt.dispatch._make_df()
    expected["geo_location_state"] = data["geo_location"].str.slice(
        0, 5).hash_values() % 100
    expected["geo_location_country"] = data["geo_location"].str.slice(
        0, 2).hash_values() % 100
    expected["geo_location"] = data["geo_location"].hash_values() % 100
    assert_eq(expected, transformed)
def test_target_encode_group():
    df = dispatch._make_df({
        "Cost": range(15),
        "Post": [1, 2, 3, 4, 5] * 3,
        "Author": ["A"] * 5 + ["B"] * 5 + ["C"] * 2 + ["D"] * 3,
        "Engaging_User": ["A"] * 5 + ["B"] * 3 + ["E"] * 2 + ["D"] * 3 + ["G"] * 2,
    })

    cat_groups = ["Author", "Engaging_User"]
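    # binary label derived from the Post column (1 where Post > 3)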
    labels = ColumnSelector(["Post"]) >> ops.LambdaOp(
        lambda col: (col > 3).astype("int8"))
    te_features = cat_groups >> ops.TargetEncoding(
        labels,
        out_path="./",
        kfold=1,
        out_dtype="float32",
        drop_folds=False,  # Keep folds to validate
    )

    workflow = nvt.Workflow(te_features + ["Author", "Engaging_User"])
    workflow.fit_transform(
        nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")
def test_lambdaop(tmpdir, df, dataset, gpu_memory_frac, engine, client):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y"]
    label_name = ["label"]
    columns = mycols_pq if engine == "parquet" else mycols_csv

    df_copy = df.copy()

    config = nvt.workflow.get_new_config()

    processor = nvtabular.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
        client=client,
    )

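    # the legacy API tracks column groups through an explicit columns_ctx mapping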
    columns_ctx = {}
    columns_ctx["continuous"] = {}
    columns_ctx["continuous"]["base"] = cont_names
    columns_ctx["all"] = {}
    columns_ctx["all"]["base"] = columns

    # Substring
    # Replacement
    op = ops.LambdaOp(
        op_name="slice",
        f=lambda col, gdf: col.str.slice(1, 3),
        columns=["name-cat", "name-string"],
        replace=True,
    )

    new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None)
    assert new_gdf["name-cat"].equals(df_copy["name-cat"].str.slice(1, 3))
    assert new_gdf["name-string"].equals(df_copy["name-string"].str.slice(
        1, 3))

    # No Replacement
    df = df_copy.copy()
    op = ops.LambdaOp(
        op_name="slice",
        f=lambda col, gdf: col.str.slice(1, 3),
        columns=["name-cat", "name-string"],
        replace=False,
    )
    new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None)
    assert new_gdf["name-cat_slice"].equals(df_copy["name-cat"].str.slice(
        1, 3))
    assert new_gdf["name-string_slice"].equals(
        df_copy["name-string"].str.slice(1, 3))
    assert new_gdf["name-cat"].equals(df_copy["name-cat"])
    assert new_gdf["name-string"].equals(df_copy["name-string"])

    # Replace
    # Replacement
    df = df_copy.copy()
    op = ops.LambdaOp(
        op_name="replace",
        f=lambda col, gdf: col.str.replace("e", "XX"),
        columns=["name-cat", "name-string"],
        replace=True,
    )

    new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None)
    assert new_gdf["name-cat"].equals(df_copy["name-cat"].str.replace(
        "e", "XX"))
    assert new_gdf["name-string"].equals(df_copy["name-string"].str.replace(
        "e", "XX"))

    # No Replacement
    df = df_copy.copy()
    op = ops.LambdaOp(
        op_name="replace",
        f=lambda col, gdf: col.str.replace("e", "XX"),
        columns=["name-cat", "name-string"],
        replace=False,
    )
    new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None)
    assert new_gdf["name-cat_replace"].equals(df_copy["name-cat"].str.replace(
        "e", "XX"))
    assert new_gdf["name-string_replace"].equals(
        df_copy["name-string"].str.replace("e", "XX"))
    assert new_gdf["name-cat"].equals(df_copy["name-cat"])
    assert new_gdf["name-string"].equals(df_copy["name-string"])

    # astype
    # Replacement
    df = df_copy.copy()
    op = ops.LambdaOp(op_name="astype",
                      f=lambda col, gdf: col.astype(float),
                      columns=["id"],
                      replace=True)
    new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None)
    assert new_gdf["id"].dtype == "float64"

    # Workflow
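    # the remaining cases run full workflows, write their output to parquet,
    # and re-read it to check that the LambdaOp results survive the round trip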
    # Replacement
    import glob

    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)

    processor.add_preprocess([
        ops.LambdaOp(
            op_name="slice",
            f=lambda col, gdf: col.astype(str).str.slice(0, 1),
            columns=["name-cat"],
            replace=True,
        ),
        ops.Categorify(),
    ])
    processor.finalize()
    processor.update_stats(dataset)
    outdir = tmpdir.mkdir("out1")
    processor.write_to_dataset(outdir,
                               dataset,
                               out_files_per_proc=10,
                               shuffle=nvt.io.Shuffle.PER_PARTITION,
                               apply_ops=True)

    dataset_2 = nvtabular.io.Dataset(glob.glob(str(outdir) + "/*.parquet"),
                                     part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)
    assert is_integer_dtype(df_pp["name-cat"].dtype)

    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)

    processor.add_preprocess([
        ops.Categorify(),
        ops.LambdaOp(op_name="add100",
                     f=lambda col, gdf: col + 100,
                     replace=True),
    ])
    processor.finalize()
    processor.update_stats(dataset)
    outdir = tmpdir.mkdir("out2")
    processor.write_to_dataset(outdir,
                               dataset,
                               out_files_per_proc=10,
                               shuffle=nvt.io.Shuffle.PER_PARTITION,
                               apply_ops=True)

    dataset_2 = nvtabular.io.Dataset(glob.glob(str(outdir) + "/*.parquet"),
                                     part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)
    assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert np.sum(df_pp["name-cat"] < 100) == 0

    # Workflow
    # No Replacement
    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)

    processor.add_preprocess([
        ops.LambdaOp(
            op_name="slice",
            f=lambda col, gdf: col.astype(str).str.slice(0, 1),
            columns=["name-cat"],
            replace=False,
        ),
        ops.Categorify(),
    ])
    processor.finalize()
    processor.update_stats(dataset)
    outdir = tmpdir.mkdir("out3")
    processor.write_to_dataset(outdir,
                               dataset,
                               out_files_per_proc=10,
                               shuffle=nvt.io.Shuffle.PER_PARTITION,
                               apply_ops=True)
    dataset_2 = nvtabular.io.Dataset(glob.glob(str(outdir) + "/*.parquet"),
                                     part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    assert df_pp["name-cat"].dtype == "O"
    print(df_pp)
    assert is_integer_dtype(df_pp["name-cat_slice"].dtype)
    assert np.sum(df_pp["name-cat_slice"] == 0) == 0

    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)

    processor.add_preprocess([
        ops.Categorify(),
        ops.LambdaOp(op_name="add100",
                     f=lambda col, gdf: col + 100,
                     replace=False),
    ])
    processor.finalize()
    processor.update_stats(dataset)
    outdir = tmpdir.mkdir("out4")
    processor.write_to_dataset(outdir,
                               dataset,
                               out_files_per_proc=10,
                               shuffle=nvt.io.Shuffle.PER_PARTITION,
                               apply_ops=True)

    dataset_2 = nvtabular.io.Dataset(glob.glob(str(outdir) + "/*.parquet"),
                                     part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)
    assert is_integer_dtype(df_pp["name-cat_add100"].dtype)
    assert np.sum(df_pp["name-cat_add100"] < 100) == 0

    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)

    processor.add_preprocess([
        ops.LambdaOp(op_name="mul0",
                     f=lambda col, gdf: col * 0,
                     columns=["x"],
                     replace=False),
        ops.LambdaOp(op_name="add100",
                     f=lambda col, gdf: col + 100,
                     replace=False),
    ])
    processor.finalize()
    processor.update_stats(dataset)
    outdir = tmpdir.mkdir("out5")
    processor.write_to_dataset(outdir,
                               dataset,
                               out_files_per_proc=10,
                               shuffle=nvt.io.Shuffle.PER_PARTITION,
                               apply_ops=True)

    dataset_2 = nvtabular.io.Dataset(glob.glob(str(outdir) + "/*.parquet"),
                                     part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)
    assert np.sum(df_pp["x_mul0_add100"] < 100) == 0
def test_lambdaop(tmpdir, df, paths, gpu_memory_frac, engine, cpu):
    dataset = nvt.Dataset(paths, cpu=cpu)
    df_copy = df.copy()

    # Substring
    # Replacement
    substring = (ColumnSelector(["name-cat", "name-string"]) >>
                 ops.LambdaOp(lambda col: col.str.slice(1, 3)))
    processor = nvtabular.Workflow(substring)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert_eq_dd(new_gdf["name-cat"],
                 df_copy["name-cat"].str.slice(1, 3),
                 check_index=False)
    assert_eq_dd(new_gdf["name-string"],
                 df_copy["name-string"].str.slice(1, 3),
                 check_index=False)

    # No replacement (mirrors the old API's replace=False case; skipped for the other examples)
    substring = (ColumnSelector(["name-cat", "name-string"]) >>
                 ops.LambdaOp(lambda col: col.str.slice(1, 3)) >>
                 ops.Rename(postfix="_slice"))
    processor = nvtabular.Workflow(substring + ["name-cat", "name-string"])
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert_eq_dd(
        new_gdf["name-cat_slice"],
        df_copy["name-cat"].str.slice(1, 3),
        check_index=False,
        check_names=False,
    )
    assert_eq_dd(
        new_gdf["name-string_slice"],
        df_copy["name-string"].str.slice(1, 3),
        check_index=False,
        check_names=False,
    )
    assert_eq_dd(new_gdf["name-cat"], df_copy["name-cat"], check_index=False)
    assert_eq_dd(new_gdf["name-string"],
                 df_copy["name-string"],
                 check_index=False)

    # Replace
    # Replacement
    oplambda = (ColumnSelector(["name-cat", "name-string"]) >>
                ops.LambdaOp(lambda col: col.str.replace("e", "XX")))
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert_eq_dd(new_gdf["name-cat"],
                 df_copy["name-cat"].str.replace("e", "XX"),
                 check_index=False)
    assert_eq_dd(new_gdf["name-string"],
                 df_copy["name-string"].str.replace("e", "XX"),
                 check_index=False)

    # astype
    # Replacement
    oplambda = (ColumnSelector(["id"]) >>
                ops.LambdaOp(lambda col: col.astype(float)))
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert new_gdf["id"].dtype == "float64"

    # Workflow
    # Replacement
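    # feed the sliced strings into Categorify: the column should come back as integer codes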
    oplambda = (ColumnSelector(["name-cat"]) >>
                ops.LambdaOp(lambda col: col.astype(str).str.slice(0, 1)) >>
                ops.Categorify())
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()
    assert is_integer_dtype(new_gdf["name-cat"].dtype)

    oplambda = (
        ColumnSelector(["name-cat", "name-string"]) >> ops.Categorify() >>
        (lambda col: col + 100))
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert is_integer_dtype(new_gdf["name-cat"].dtype)
    assert np.sum(new_gdf["name-cat"] < 100) == 0