# parametrization assumed for this snippet; the original decorator is not captured here
@pytest.mark.parametrize("output_model", ["tensorflow"])
def test_seq_etl_tf_model(tmpdir, output_model):
    size = 100
    max_length = 10
    df = _make_df({
        "id": np.random.choice([0, 1], size=size),
        "item_id": np.random.randint(1, 10, size),
        "ts": np.linspace(0.0, 10.0, num=size).astype(np.float32),
        "y": np.linspace(0.0, 10.0, num=size).astype(np.float32),
    })

    groupby_features = ColumnSelector(["id", "item_id", "ts", "y"]) >> ops.Groupby(
        groupby_cols=["id"],
        sort_cols=["ts"],
        aggs={
            "item_id": ["list"],
            "y": ["list"],
        },
        name_sep="-",
    )
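    # Keep only the aggregated list columns and truncate each list to max_length elements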
    feats_list = groupby_features["item_id-list", "y-list"]
    feats_trim = feats_list >> ops.ListSlice(0, max_length)
    selected_features = groupby_features["id"] + feats_trim

    workflow = nvt.Workflow(selected_features)

    model_info = None
    # Maximum lengths for the list columns (used for the sparse/ragged representation in Triton)
    sparse_max = {"item_id-list": max_length, "y-list": max_length}

    _verify_workflow_on_tritonserver(tmpdir, workflow, df, "groupby",
                                     output_model, model_info, sparse_max)


# parametrization assumed for this snippet; the original decorators are not captured here
@pytest.mark.parametrize("cpu", [True, False])
@pytest.mark.parametrize("keys", [["name"], ["name", "id"]])
def test_groupby_op(keys, cpu):
    # Initial timeseries dataset
    size = 60
    df1 = nvt.dispatch._make_df(
        {
            "name": np.random.choice(["Dave", "Zelda"], size=size),
            "id": np.random.choice([0, 1], size=size),
            "ts": np.linspace(0.0, 10.0, num=size),
            "x": np.arange(size),
            "y": np.linspace(0.0, 10.0, num=size),
            "shuffle": np.random.uniform(low=0.0, high=10.0, size=size),
        }
    )
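    # Randomize row order using the throwaway "shuffle" column, then drop it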
    df1 = df1.sort_values("shuffle").drop(columns="shuffle").reset_index(drop=True)

    # Create a ddf, and be sure to shuffle by the groupby keys
    ddf1 = dd.from_pandas(df1, npartitions=3).shuffle(keys)
    dataset = nvt.Dataset(ddf1, cpu=cpu)
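    # Tag the "x" column so we can verify that tags propagate to the aggregated output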
    dataset.schema.column_schemas["x"] = (
        dataset.schema.column_schemas["name"].with_name("x").with_tags("custom_tag")
    )
    # Define Groupby Workflow
    groupby_features = ColumnSelector(["name", "id", "ts", "x", "y"]) >> ops.Groupby(
        groupby_cols=keys,
        sort_cols=["ts"],
        aggs={
            "x": ["list", "sum"],
            "y": ["first", "last"],
            "ts": ["min"],
        },
        name_sep="-",
    )
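    # Fit the workflow and compute the aggregated output as a single DataFrame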
    processor = nvtabular.Workflow(groupby_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert "custom_tag" in processor.output_schema.column_schemas["x-list"].tags

    if not cpu:
        # Make sure we are capturing the list type in `output_dtypes`
        assert processor.output_dtypes["x-list"] == cudf.core.dtypes.ListDtype("int64")

    # Check list-aggregation ordering
    x = new_gdf["x-list"]
    x = x.to_pandas() if hasattr(x, "to_pandas") else x
    sums = []
    for el in x.values:
        _el = pd.Series(el)
        sums.append(_el.sum())
        assert _el.is_monotonic_increasing

    # Check that list sums match sum aggregation
    x = new_gdf["x-sum"]
    x = x.to_pandas() if hasattr(x, "to_pandas") else x
    assert list(x) == sums

    # Check basic behavior of the "y" column
    assert (new_gdf["y-first"] < new_gdf["y-last"]).all()


# parametrization assumed for this snippet; the original decorator is not captured here
@pytest.mark.parametrize("output_model", ["tensorflow", "pytorch"])
def test_groupby_model(tmpdir, output_model):
    size = 20
    df = _make_df({
        "id": np.random.choice([0, 1], size=size),
        "ts": np.linspace(0.0, 10.0, num=size),
        "x": np.arange(size),
        "y": np.linspace(0.0, 10.0, num=size),
    })

    groupby_features = ColumnSelector(["id", "ts", "x", "y"]) >> ops.Groupby(
        groupby_cols=["id"],
        sort_cols=["ts"],
        aggs={
            "x": ["sum"],
            "y": ["first"],
        },
        name_sep="-",
    )
    workflow = nvt.Workflow(groupby_features)

    if output_model == "pytorch":
        model_info = {
            "x-sum": {
                "columns": ["x-sum"],
                "dtype": "int64"
            },
            "y-first": {
                "columns": ["y-first"],
                "dtype": "float64"
            },
            "id": {
                "columns": ["id"],
                "dtype": "int64"
            },
        }
    else:
        model_info = None

    _verify_workflow_on_tritonserver(tmpdir, workflow, df, "groupby",
                                     output_model, model_info)

# initial column selector works with tags
# filter within the workflow by tags
# test tags correct at output
@pytest.mark.parametrize(
    "op",
    [
        ops.Bucketize([1]),
        ops.Rename(postfix="_trim"),
        ops.Categorify(),
        ops.Categorify(encode_type="combo"),
        ops.Clip(0),
        ops.DifferenceLag("col1"),
        ops.FillMissing(),
        ops.Groupby("col1"),
        ops.HashBucket(1),
        ops.HashedCross(1),
        ops.JoinGroupby("col1"),
        ops.ListSlice(0),
        ops.LogOp(),
        ops.Normalize(),
        ops.TargetEncoding("col1"),
    ],
)
def test_workflow_select_by_tags(op):
    schema1 = ColumnSchema("col1", tags=["b", "c", "d"])
    schema2 = ColumnSchema("col2", tags=["c", "d"])
    schema3 = ColumnSchema("col3", tags=["d"])
    schema = Schema([schema1, schema2, schema3])
from nvtabular.dispatch import HAS_GPU


@pytest.mark.parametrize("properties", [{}, {"p1": "1"}])
@pytest.mark.parametrize("tags", [[], ["TAG1", "TAG2"]])
@pytest.mark.parametrize(
    "op",
    [
        ops.Bucketize([1]),
        ops.Rename(postfix="_trim"),
        ops.Categorify(),
        ops.Categorify(encode_type="combo"),
        ops.Clip(0),
        ops.DifferenceLag("1"),
        ops.FillMissing(),
        ops.Groupby(["1"]),
        ops.HashBucket(1),
        ops.HashedCross(1),
        ops.JoinGroupby(["1"]),
        ops.ListSlice(0),
        ops.LogOp(),
        ops.Normalize(),
        ops.TargetEncoding(["1"]),
        ops.AddMetadata(tags=["excellent"], properties={"domain": {"min": 0, "max": 20}}),
        ops.ValueCount(),
    ],
)
@pytest.mark.parametrize("selection", [["1"], ["2", "3"], ["1", "2", "3", "4"]])
def test_schema_out(tags, properties, selection, op):
    # Create ColumnSchemas
    column_schemas = []