def test_seq_etl_tf_model(tmpdir, output_model):
    size = 100
    max_length = 10
    df = _make_df(
        {
            "id": np.random.choice([0, 1], size=size),
            "item_id": np.random.randint(1, 10, size),
            "ts": np.linspace(0.0, 10.0, num=size).astype(np.float32),
            "y": np.linspace(0.0, 10.0, num=size).astype(np.float32),
        }
    )

    groupby_features = ColumnSelector(["id", "item_id", "ts", "y"]) >> ops.Groupby(
        groupby_cols=["id"],
        sort_cols=["ts"],
        aggs={
            "item_id": ["list"],
            "y": ["list"],
        },
        name_sep="-",
    )
    feats_list = groupby_features["item_id-list", "y-list"]
    feats_trim = feats_list >> ops.ListSlice(0, max_length)
    selected_features = groupby_features["id"] + feats_trim

    workflow = nvt.Workflow(selected_features)

    model_info = None
    sparse_max = {"item_id-list": max_length, "y-list": max_length}
    _verify_workflow_on_tritonserver(
        tmpdir, workflow, df, "groupby", output_model, model_info, sparse_max
    )
def test_groupby_op(keys, cpu):
    # Initial timeseries dataset
    size = 60
    df1 = nvt.dispatch._make_df(
        {
            "name": np.random.choice(["Dave", "Zelda"], size=size),
            "id": np.random.choice([0, 1], size=size),
            "ts": np.linspace(0.0, 10.0, num=size),
            "x": np.arange(size),
            "y": np.linspace(0.0, 10.0, num=size),
            "shuffle": np.random.uniform(low=0.0, high=10.0, size=size),
        }
    )
    df1 = df1.sort_values("shuffle").drop(columns="shuffle").reset_index(drop=True)

    # Create a ddf, and be sure to shuffle by the groupby keys
    ddf1 = dd.from_pandas(df1, npartitions=3).shuffle(keys)
    dataset = nvt.Dataset(ddf1, cpu=cpu)
    dataset.schema.column_schemas["x"] = (
        dataset.schema.column_schemas["name"].with_name("x").with_tags("custom_tag")
    )

    # Define Groupby Workflow
    groupby_features = ColumnSelector(["name", "id", "ts", "x", "y"]) >> ops.Groupby(
        groupby_cols=keys,
        sort_cols=["ts"],
        aggs={
            "x": ["list", "sum"],
            "y": ["first", "last"],
            "ts": ["min"],
        },
        name_sep="-",
    )
    processor = nvtabular.Workflow(groupby_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert "custom_tag" in processor.output_schema.column_schemas["x-list"].tags

    if not cpu:
        # Make sure we are capturing the list type in `output_dtypes`
        assert processor.output_dtypes["x-list"] == cudf.core.dtypes.ListDtype("int64")

    # Check list-aggregation ordering
    x = new_gdf["x-list"]
    x = x.to_pandas() if hasattr(x, "to_pandas") else x
    sums = []
    for el in x.values:
        _el = pd.Series(el)
        sums.append(_el.sum())
        assert _el.is_monotonic_increasing

    # Check that list sums match sum aggregation
    x = new_gdf["x-sum"]
    x = x.to_pandas() if hasattr(x, "to_pandas") else x
    assert list(x) == sums

    # Check basic behavior of "y" column
    assert (new_gdf["y-first"] < new_gdf["y-last"]).all()
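# Illustrative sketch (not part of the test suite): how the Groupby op exercised in
# test_groupby_op behaves on a tiny, hand-written frame. The column names and values
# below are made up for illustration; the API calls mirror the ones used in the test
# above (ColumnSelector >> ops.Groupby, Workflow.fit/transform on an nvt.Dataset).
def _groupby_sketch():
    tiny = pd.DataFrame(
        {
            "id": [0, 0, 1],
            "ts": [2.0, 1.0, 3.0],
            "x": [10, 20, 30],
        }
    )
    feats = ColumnSelector(["id", "ts", "x"]) >> ops.Groupby(
        groupby_cols=["id"],
        sort_cols=["ts"],
        aggs={"x": ["list", "sum"]},
        name_sep="-",
    )
    dataset = nvt.Dataset(tiny, cpu=True)
    processor = nvt.Workflow(feats)
    processor.fit(dataset)
    out = processor.transform(dataset).to_ddf().compute()
    # Expected (one row per "id", lists ordered by ascending "ts"):
    #   id=0 -> x-list=[20, 10], x-sum=30
    #   id=1 -> x-list=[30],     x-sum=30
    return out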
def test_groupby_model(tmpdir, output_model):
    size = 20
    df = _make_df(
        {
            "id": np.random.choice([0, 1], size=size),
            "ts": np.linspace(0.0, 10.0, num=size),
            "x": np.arange(size),
            "y": np.linspace(0.0, 10.0, num=size),
        }
    )

    groupby_features = ColumnSelector(["id", "ts", "x", "y"]) >> ops.Groupby(
        groupby_cols=["id"],
        sort_cols=["ts"],
        aggs={
            "x": ["sum"],
            "y": ["first"],
        },
        name_sep="-",
    )
    workflow = nvt.Workflow(groupby_features)

    if output_model == "pytorch":
        model_info = {
            "x-sum": {"columns": ["x-sum"], "dtype": "int64"},
            "y-first": {"columns": ["y-first"], "dtype": "float64"},
            "id": {"columns": ["id"], "dtype": "int64"},
        }
    else:
        model_info = None

    _verify_workflow_on_tritonserver(tmpdir, workflow, df, "groupby", output_model, model_info)
# initial column selector works with tags
# filter within the workflow by tags
# test tags correct at output
@pytest.mark.parametrize(
    "op",
    [
        ops.Bucketize([1]),
        ops.Rename(postfix="_trim"),
        ops.Categorify(),
        ops.Categorify(encode_type="combo"),
        ops.Clip(0),
        ops.DifferenceLag("col1"),
        ops.FillMissing(),
        ops.Groupby("col1"),
        ops.HashBucket(1),
        ops.HashedCross(1),
        ops.JoinGroupby("col1"),
        ops.ListSlice(0),
        ops.LogOp(),
        ops.Normalize(),
        ops.TargetEncoding("col1"),
    ],
)
def test_workflow_select_by_tags(op):
    schema1 = ColumnSchema("col1", tags=["b", "c", "d"])
    schema2 = ColumnSchema("col2", tags=["c", "d"])
    schema3 = ColumnSchema("col3", tags=["d"])
    schema = Schema([schema1, schema2, schema3])
from nvtabular.dispatch import HAS_GPU


@pytest.mark.parametrize("properties", [{}, {"p1": "1"}])
@pytest.mark.parametrize("tags", [[], ["TAG1", "TAG2"]])
@pytest.mark.parametrize(
    "op",
    [
        ops.Bucketize([1]),
        ops.Rename(postfix="_trim"),
        ops.Categorify(),
        ops.Categorify(encode_type="combo"),
        ops.Clip(0),
        ops.DifferenceLag("1"),
        ops.FillMissing(),
        ops.Groupby(["1"]),
        ops.HashBucket(1),
        ops.HashedCross(1),
        ops.JoinGroupby(["1"]),
        ops.ListSlice(0),
        ops.LogOp(),
        ops.Normalize(),
        ops.TargetEncoding(["1"]),
        ops.AddMetadata(tags=["excellent"], properties={"domain": {"min": 0, "max": 20}}),
        ops.ValueCount(),
    ],
)
@pytest.mark.parametrize("selection", [["1"], ["2", "3"], ["1", "2", "3", "4"]])
def test_schema_out(tags, properties, selection, op):
    # Create columnSchemas
    column_schemas = []