Example #1
def test_joingroupby_multi(tmpdir, groups):

    df = pd.DataFrame({
        "Author": ["User_A", "User_A", "User_A", "User_B"],
        "Engaging-User": ["User_B", "User_B", "User_C", "User_C"],
        "Cost": [100.0, 200.0, 300.0, 400.0],
        "Post": [1, 2, 3, 4],
    })

    groupby_features = groups >> ops.JoinGroupby(
        out_path=str(tmpdir), stats=["sum"], cont_cols=["Cost"])
    workflow = nvt.Workflow(groupby_features + "Post")

    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()

    if isinstance(groups, list):
        # Join on ["Author", "Engaging-User"]
        assert df_out["Author_Engaging-User_Cost_sum"].to_arrow().to_pylist(
        ) == [
            300.0,
            300.0,
            300.0,
            400.0,
        ]
    else:
        # Join on ["Author"]
        assert df_out["Author_Cost_sum"].to_arrow().to_pylist() == [
            600.0, 600.0, 600.0, 400.0
        ]
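
All of these snippets come from the NVTabular test suite, so each one omits its imports and relies on pytest fixtures such as tmpdir, client, datasets, groups, cpu, part_mem_fraction and use_client. A plausible shared preamble is sketched below; the selector/schema import paths moved between NVTabular releases, so the commented lines are assumptions rather than a single known-good layout.

import glob

import cudf
import numpy as np
import pandas as pd
import pytest

import nvtabular as nvt
from nvtabular import Dataset, Workflow, ops

# Version-dependent imports (paths are assumptions; they moved across releases):
# from nvtabular import ColumnGroup              # older releases (Example #9)
# from merlin.dag import ColumnSelector          # newer releases (Examples #5, #10)
# from merlin.schema import ColumnSchema, Schema # newer releases (Example #10)
# assert_eq, mycols_pq and run_in_context are helpers from the tests' conftest.
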
Example #2
def test_chaining_3():
    gdf_test = cudf.DataFrame({
        "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
        "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
        "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
        "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
    })

    platform_features = ["platform"] >> ops.Dropna()
    joined = ["ad_id"] >> ops.JoinGroupby(cont_cols=["clicked"],
                                          stats=["sum", "count"])
    joined_lambda = (
        joined >> ops.LambdaOp(f=lambda col, gdf: col / gdf["ad_id_count"]) >>
        ops.Rename(postfix="_ctr"))

    workflow = Workflow(platform_features + joined + joined_lambda)

    dataset = nvt.Dataset(gdf_test, engine="parquet")

    workflow.fit(dataset)

    result = workflow.transform(dataset).to_ddf().compute()

    assert all(
        x in result.columns
        for x in ["ad_id_count", "ad_id_clicked_sum_ctr", "ad_id_clicked_sum"])
Example #3
def test_cats_and_groupby_stats(client, tmpdir, datasets, part_mem_fraction,
                                use_client):

    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client if use_client else None,
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
    )

    # Legacy Workflow API: ops are registered on the Workflow object
    # (add_preprocess / add_cat_feature / finalize) rather than composed
    # with the `>>` operator used in the newer examples on this page.
    processor.add_preprocess(
        ops.Categorify(out_path=str(tmpdir), freq_threshold=10, on_host=True))

    processor.add_cat_feature(
        ops.JoinGroupby(cont_names=cont_names,
                        stats=["count", "sum"],
                        out_path=str(tmpdir)))

    processor.finalize()
    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)

    processor.apply(dataset, output_path=str(tmpdir))
    result = processor.get_ddf().compute()

    assert "name-cat_x_sum" in result.columns
    assert "name-string_x_sum" in result.columns
Example #4
def test_joingroupby_dependency(tmpdir, cpu):
    df = pd.DataFrame({
        "Author": ["User_A", "User_A", "User_A", "User_B", "User_B"],
        "Cost": [100.0, 200.0, 300.0, 400.0, 400.0],
    })

    normalized_cost = ["Cost"] >> nvt.ops.NormalizeMinMax() >> nvt.ops.Rename(
        postfix="_normalized")
    groupby_features = ["Author"] >> ops.JoinGroupby(
        out_path=str(tmpdir), stats=["sum"], cont_cols=normalized_cost)
    workflow = nvt.Workflow(groupby_features)

    df_out = workflow.fit_transform(nvt.Dataset(df,
                                                cpu=cpu)).to_ddf().compute()
    if cpu:
        assert df_out["Author_Cost_normalized_sum"].to_list() == [
            1.0, 1.0, 1.0, 2.0, 2.0
        ]
    else:
        assert df_out["Author_Cost_normalized_sum"].to_arrow().to_pylist() == [
            1.0,
            1.0,
            1.0,
            2.0,
            2.0,
        ]
Example #5
def test_cats_and_groupby_stats(client, tmpdir, datasets, part_mem_fraction,
                                use_client):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]

    cats = ColumnSelector(cat_names)
    cat_features = cats >> ops.Categorify(
        out_path=str(tmpdir), freq_threshold=10, on_host=True)
    groupby_features = cats >> ops.JoinGroupby(
        cont_cols=cont_names, stats=["count", "sum"], out_path=str(tmpdir))

    # We have a global dask client defined in this context, so NVTabular
    # should warn us if we initialize a `Workflow` with `client=None`
    workflow = run_in_context(
        Workflow,
        cat_features + groupby_features,
        context=None if use_client else pytest.warns(UserWarning),
        client=client if use_client else None,
    )
    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    result = workflow.fit_transform(dataset).to_ddf().compute()

    assert "name-cat_x_sum" in result.columns
    assert "name-string_x_sum" in result.columns
Example #6
def test_joingroupby_multi(tmpdir, groups):

    df = pd.DataFrame(
        {
            "Author": ["User_A", "User_A", "User_A", "User_B"],
            "Engaging-User": ["User_B", "User_B", "User_C", "User_C"],
            "Cost": [100.0, 200.0, 300.0, 400.0],
            "Post": [1, 2, 3, 4],
        }
    )

    cat_names = ["Author", "Engaging-User"]
    cont_names = ["Cost"]
    label_name = ["Post"]

    # Same test as Example #1, written against the legacy Workflow API
    # (cont_names/columns keywords instead of cont_cols).
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    processor.add_preprocess(
        ops.JoinGroupby(columns=groups, out_path=str(tmpdir), stats=["sum"], cont_names=["Cost"])
    )
    processor.finalize()
    processor.apply(nvt.Dataset(df), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    if isinstance(groups, list):
        # Join on ["Author", "Engaging-User"]
        assert df_out["Author_Engaging-User_Cost_sum"].to_arrow().to_pylist() == [
            300.0,
            300.0,
            300.0,
            400.0,
        ]
    else:
        # Join on ["Author"]
        assert df_out["Author_Cost_sum"].to_arrow().to_pylist() == [600.0, 600.0, 600.0, 400.0]
Example #7
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):

    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    features = cat_names >> ops.JoinGroupby(
        cont_names=cont_names, stats=["count", "sum", "std", "min"], out_path=str(tmpdir)
    )

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    workflow = Workflow(features + cat_names + cont_names + label_name, client=client)
    result = workflow.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check "count"
    assert_eq(
        result[["name-cat", "name-cat_count"]]
        .drop_duplicates()
        .sort_values("name-cat")["name-cat_count"],
        df0.groupby("name-cat").agg({"x": "count"})["x"].astype(np.int64),
        check_index=False,
        check_dtype=False,  # May get int64 vs int32
        check_names=False,
    )

    # Check "min"
    assert_eq(
        result[["name-string", "name-string_x_min"]]
        .drop_duplicates()
        .sort_values("name-string")["name-string_x_min"],
        df0.groupby("name-string").agg({"x": "min"})["x"],
        check_index=False,
        check_names=False,
    )

    # Check "std"
    assert_eq(
        result[["name-string", "name-string_x_std"]]
        .drop_duplicates()
        .sort_values("name-string")["name-string_x_std"],
        df0.groupby("name-string").agg({"x": "std"})["x"],
        check_index=False,
        check_names=False,
    )
Example #8
# Newer variant of Example #7: JoinGroupby takes cont_cols instead of cont_names.
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):

    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    features = cat_names >> ops.JoinGroupby(
        cont_cols=cont_names,
        stats=["count", "sum", "std", "min"],
        out_path=str(tmpdir))

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    workflow = Workflow(features + cat_names + cont_names + label_name,
                        client=client)
    result = workflow.fit_transform(dataset).to_ddf().compute(
        scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check results.  Need to sort for direct comparison
    expect = df0.sort_values(["label", "x", "y",
                              "id"]).reset_index(drop=True).reset_index()
    got = result.sort_values(["label", "x", "y",
                              "id"]).reset_index(drop=True).reset_index()
    gb_e = expect.groupby("name-cat").aggregate({
        "name-cat": "count",
        "x": ["sum", "min", "std"]
    })
    gb_e.columns = ["count", "sum", "min", "std"]
    df_check = got.merge(gb_e,
                         left_on="name-cat",
                         right_index=True,
                         how="left")
    assert_eq(df_check["name-cat_count"],
              df_check["count"].astype("int64"),
              check_names=False)
    assert_eq(df_check["name-cat_x_sum"], df_check["sum"], check_names=False)
    assert_eq(df_check["name-cat_x_min"], df_check["min"], check_names=False)
    assert_eq(df_check["name-cat_x_std"], df_check["std"], check_names=False)
Example #9
def test_cats_and_groupby_stats(client, tmpdir, datasets, part_mem_fraction, use_client):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]

    cats = ColumnGroup(cat_names)
    cat_features = cats >> ops.Categorify(out_path=str(tmpdir), freq_threshold=10, on_host=True)
    groupby_features = cats >> ops.JoinGroupby(
        cont_names=cont_names, stats=["count", "sum"], out_path=str(tmpdir)
    )

    workflow = Workflow(cat_features + groupby_features, client=client)
    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    result = workflow.fit_transform(dataset).to_ddf().compute()

    assert "name-cat_x_sum" in result.columns
    assert "name-string_x_sum" in result.columns
Example #10
# filter within the workflow by tags
# test tags correct at output
@pytest.mark.parametrize(
    "op",
    [
        ops.Bucketize([1]),
        ops.Rename(postfix="_trim"),
        ops.Categorify(),
        ops.Categorify(encode_type="combo"),
        ops.Clip(0),
        ops.DifferenceLag("col1"),
        ops.FillMissing(),
        ops.Groupby("col1"),
        ops.HashBucket(1),
        ops.HashedCross(1),
        ops.JoinGroupby("col1"),
        ops.ListSlice(0),
        ops.LogOp(),
        ops.Normalize(),
        ops.TargetEncoding("col1"),
    ],
)
def test_workflow_select_by_tags(op):
    schema1 = ColumnSchema("col1", tags=["b", "c", "d"])
    schema2 = ColumnSchema("col2", tags=["c", "d"])
    schema3 = ColumnSchema("col3", tags=["d"])
    schema = Schema([schema1, schema2, schema3])

    cont_features = ColumnSelector(tags=["c"]) >> op
    workflow = Workflow(cont_features)
    workflow.fit_schema(schema)
Example #11
@pytest.mark.parametrize("properties", [{}, {"p1": "1"}])
@pytest.mark.parametrize("tags", [[], ["TAG1", "TAG2"]])
@pytest.mark.parametrize(
    "op",
    [
        ops.Bucketize([1]),
        ops.Rename(postfix="_trim"),
        ops.Categorify(),
        ops.Categorify(encode_type="combo"),
        ops.Clip(0),
        ops.DifferenceLag("1"),
        ops.FillMissing(),
        ops.Groupby(["1"]),
        ops.HashBucket(1),
        ops.HashedCross(1),
        ops.JoinGroupby(["1"]),
        ops.ListSlice(0),
        ops.LogOp(),
        ops.Normalize(),
        ops.TargetEncoding(["1"]),
        ops.AddMetadata(tags=["excellent"], properties={"domain": {"min": 0, "max": 20}}),
        ops.ValueCount(),
    ],
)
@pytest.mark.parametrize("selection", [["1"], ["2", "3"], ["1", "2", "3", "4"]])
def test_schema_out(tags, properties, selection, op):
    # Create columnSchemas
    column_schemas = []
    all_cols = []
    for x in range(5):
        all_cols.append(str(x))
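
Distilled from the examples above, a minimal self-contained JoinGroupby round trip on toy data (hypothetical; uses the newer cont_cols spelling from Examples #1 and #4, which older releases call cont_names):

import pandas as pd
import nvtabular as nvt
from nvtabular import ops

df = pd.DataFrame({
    "Author": ["User_A", "User_A", "User_B"],
    "Cost": [100.0, 200.0, 300.0],
})

# Compute the per-Author sum of Cost and join it back onto every row.
groupby_features = ["Author"] >> ops.JoinGroupby(cont_cols=["Cost"], stats=["sum"])
workflow = nvt.Workflow(groupby_features)
df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()

# Expected output column "Author_Cost_sum": [300.0, 300.0, 300.0]
# (100 + 200 for User_A's two rows; 300 for User_B's single row).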