Example #1
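This snippet comes from the NVTabular test suite and omits its imports. A minimal sketch of what it assumes is shown below; the exact import path for the assert_eq_dd comparison helper is a guess, and cpu is a pytest fixture/parameter selecting CPU- or GPU-backed execution.

import numpy as np
import pandas as pd
import dask.dataframe as dd
import cupy as cp  # only used on the GPU branch; assumes a CUDA-enabled environment
import nvtabular as nvt
from nvtabular import ColumnGroup
# assert_eq_dd is a dask/cuDF-aware equality helper defined in the test suite's
# conftest; the import path below is assumed for illustration.
from tests.conftest import assert_eq_dd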
def test_lambdaop_misalign(cpu):
    size = 12
    df0 = pd.DataFrame({
        "a": np.arange(size),
        "b": np.random.choice(["apple", "banana", "orange"], size),
        "c": np.random.choice([0, 1], size),
    })

    ddf0 = dd.from_pandas(df0, npartitions=4)

    cont_names = ColumnGroup(["a"])
    cat_names = ColumnGroup(["b"])
    label = ColumnGroup(["c"])
    # np.where operates on CPU-backed (pandas/NumPy) data, cp.where on GPU-backed (cuDF/CuPy) data
    if cpu:
        label_feature = label >> (lambda col: np.where(col == 4, 0, 1))
    else:
        label_feature = label >> (lambda col: cp.where(col == 4, 0, 1))
    workflow = nvt.Workflow(cat_names + cont_names + label_feature)

    dataset = nvt.Dataset(ddf0, cpu=cpu)
    transformed = workflow.transform(dataset)
    assert_eq_dd(
        df0[["a", "b"]],
        transformed.to_ddf().compute()[["a", "b"]],
        check_index=False,
    )
Example #2
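This example likewise omits its imports; the sketch below lists what its usage implies. The df, dataset, gpu_memory_frac, and engine arguments are pytest fixtures supplied by the test suite's conftest, and the source of is_integer_dtype is assumed.

import numpy as np
import nvtabular
from nvtabular import ColumnGroup, ops
from pandas.api.types import is_integer_dtype  # assumed origin of is_integer_dtype
# assert_eq_dd and the df/dataset/gpu_memory_frac/engine fixtures come from the
# test suite's conftest (import path assumed, as in Example #1).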
def test_lambdaop(tmpdir, df, dataset, gpu_memory_frac, engine):
    df_copy = df.copy()

    # Substring
    # Replacement
    substring = ColumnGroup(["name-cat", "name-string"]) >> (lambda col: col.str.slice(1, 3))
    processor = nvtabular.Workflow(substring)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert_eq_dd(new_gdf["name-cat"],
                 df_copy["name-cat"].str.slice(1, 3),
                 check_index=False)
    assert_eq_dd(new_gdf["name-string"],
                 df_copy["name-string"].str.slice(1, 3),
                 check_index=False)

    # No Replacement from old API (skipped for other examples)
    substring = (
        ColumnGroup(["name-cat", "name-string"])
        >> (lambda col: col.str.slice(1, 3))
        >> ops.Rename(postfix="_slice")
    )
    processor = nvtabular.Workflow(["name-cat", "name-string"] + substring)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert_eq_dd(
        new_gdf["name-cat_slice"],
        df_copy["name-cat"].str.slice(1, 3),
        check_index=False,
        check_names=False,
    )
    assert_eq_dd(
        new_gdf["name-string_slice"],
        df_copy["name-string"].str.slice(1, 3),
        check_index=False,
        check_names=False,
    )
    assert_eq_dd(new_gdf["name-cat"], df_copy["name-cat"], check_index=False)
    assert_eq_dd(new_gdf["name-string"],
                 df_copy["name-string"],
                 check_index=False)

    # Replace
    # Replacement
    oplambda = ColumnGroup(["name-cat", "name-string"]) >> (lambda col: col.str.replace("e", "XX"))
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert_eq_dd(new_gdf["name-cat"],
                 df_copy["name-cat"].str.replace("e", "XX"),
                 check_index=False)
    assert_eq_dd(new_gdf["name-string"],
                 df_copy["name-string"].str.replace("e", "XX"),
                 check_index=False)

    # astype
    # Replacement
    oplambda = ColumnGroup(["id"]) >> (lambda col: col.astype(float))
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert new_gdf["id"].dtype == "float64"

    # Workflow
    # Replacement
    oplambda = (
        ColumnGroup(["name-cat"])
        >> (lambda col: col.astype(str).str.slice(0, 1))
        >> ops.Categorify()
    )
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()
    assert is_integer_dtype(new_gdf["name-cat"].dtype)

    oplambda = (
        ColumnGroup(["name-cat", "name-string"])
        >> ops.Categorify()
        >> (lambda col: col + 100)
    )
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert is_integer_dtype(new_gdf["name-cat"].dtype)
    assert np.sum(new_gdf["name-cat"] < 100) == 0