Esempio n. 1
0
def test_difference_lag():
    """Check DifferenceLag output for shifts of 1 and -1 within each user.

    A six-row frame covering three ``userid`` values is pushed through a
    workflow containing ``DifferenceLag`` partitioned on ``userid``.  The
    assertions pin both the computed timestamp differences and the rows that
    must come back as null.
    """
    frame = cudf.DataFrame(
        {
            "userid": [0, 0, 0, 1, 1, 2],
            "timestamp": [1000, 1005, 1100, 2000, 2001, 3000],
        }
    )

    lag_op = ops.DifferenceLag(partition_cols=["userid"], shift=[1, -1])
    diff_features = ["timestamp"] >> lag_op
    dataset = nvt.Dataset(frame)
    workflow = nvtabular.Workflow(diff_features)
    workflow.fit(dataset)
    new_gdf = workflow.transform(dataset).to_ddf().compute()

    # Null sentinel to compare against by identity: cudf.NA when the
    # installed cudf exposes it, otherwise plain None.
    missing = cudf.NA if hasattr(cudf, "NA") else None

    # shift=1: rows 0 and 3 are the first row of their userid group.
    lag_forward = new_gdf["timestamp_difference_lag_1"]
    assert lag_forward[0] is missing
    assert lag_forward[1] == 5
    assert lag_forward[2] == 95
    assert lag_forward[3] is missing

    # shift=-1: rows 2 and 5 are the last row of their userid group.
    lag_backward = new_gdf["timestamp_difference_lag_-1"]
    assert lag_backward[0] == -5
    assert lag_backward[1] == -95
    assert lag_backward[2] is missing
    assert lag_backward[3] == -1
    assert lag_backward[5] is missing
Esempio n. 2
0
def test_difference_lag(cpu):
    """Check DifferenceLag (shifts 1 and -1) on either the CPU or GPU backend.

    Args:
        cpu: Truthy to build the input with pandas and run the dataset in CPU
            mode; falsy to use cudf.

    Null entries are verified with ``lib.isna`` when ``cpu`` is truthy, and
    by identity against the library's NA sentinel (or ``None`` when the
    library has no ``NA`` attribute) otherwise.
    """
    lib = pd if cpu else cudf
    frame = lib.DataFrame(
        {
            "userid": [0, 0, 0, 1, 1, 2],
            "timestamp": [1000, 1005, 1100, 2000, 2001, 3000],
        }
    )

    lag_op = ops.DifferenceLag(partition_cols=["userid"], shift=[1, -1])
    diff_features = ["timestamp"] >> lag_op
    dataset = nvt.Dataset(frame, cpu=cpu)
    workflow = nvtabular.Workflow(diff_features)
    workflow.fit(dataset)
    new_df = workflow.transform(dataset).to_ddf().compute()

    def is_missing(value):
        # Same null check the backend-specific branches perform.
        if cpu:
            return lib.isna(value)
        return value is (lib.NA if hasattr(lib, "NA") else None)

    # shift=1: concrete differences first, then the null rows (0 and 3).
    lag_forward = "timestamp_difference_lag_1"
    assert new_df[lag_forward][1] == 5
    assert new_df[lag_forward][2] == 95
    assert is_missing(new_df[lag_forward][0])
    assert is_missing(new_df[lag_forward][3])

    # shift=-1: concrete differences first, then the null rows (2 and 5).
    lag_backward = "timestamp_difference_lag_-1"
    assert new_df[lag_backward][0] == -5
    assert new_df[lag_backward][1] == -95
    assert new_df[lag_backward][3] == -1
    assert is_missing(new_df[lag_backward][2])
    assert is_missing(new_df[lag_backward][5])
Esempio n. 3
0
def test_difference_lag():
    """Check DifferenceLag applied directly through ``apply_op``.

    The op is constructed with ``"userid"`` as its first argument and
    ``timestamp`` as the target column; the resulting
    ``timestamp_DifferenceLag`` column must hold the in-group differences,
    with ``None`` at rows 0 and 3.
    """
    frame = cudf.DataFrame(
        {
            "userid": [0, 0, 0, 1, 1, 2],
            "timestamp": [1000, 1005, 1100, 2000, 2001, 3000],
        }
    )

    # Column context handed to apply_op: group "all" -> "base" columns.
    columns_ctx = {"all": {"base": ["userid", "timestamp"]}}

    lag_op = ops.DifferenceLag("userid", columns=["timestamp"])
    new_gdf = lag_op.apply_op(frame, columns_ctx, "all", target_cols=["timestamp"])

    lagged = new_gdf["timestamp_DifferenceLag"]
    assert lagged[0] is None
    assert lagged[1] == 5
    assert lagged[2] == 95
    assert lagged[3] is None
        "TE_x_cost_renamed", "TE_y_cost_renamed"
    ]


# initial column selector works with tags
# filter within the workflow by tags
# test tags correct at output
# Run the test once per operator instance listed below.
@pytest.mark.parametrize(
    "op",
    [
        ops.Bucketize([1]),
        ops.Rename(postfix="_trim"),
        ops.Categorify(),
        ops.Categorify(encode_type="combo"),
        ops.Clip(0),
        ops.DifferenceLag("col1"),
        ops.FillMissing(),
        ops.Groupby("col1"),
        ops.HashBucket(1),
        ops.HashedCross(1),
        ops.JoinGroupby("col1"),
        ops.ListSlice(0),
        ops.LogOp(),
        ops.Normalize(),
        ops.TargetEncoding("col1"),
    ],
)
def test_workflow_select_by_tags(op):
    """Set up tagged column schemas for a tag-based selection test.

    Args:
        op: The operator instance supplied by the parametrization above.

    NOTE(review): only the schema setup is visible in this excerpt; the
    remainder of the test body (how ``op`` and the schemas are used) appears
    to be cut off -- confirm against the full file.
    """
    # Three columns whose tag lists shrink: col1 has b/c/d, col2 has c/d,
    # col3 has only d.
    schema1 = ColumnSchema("col1", tags=["b", "c", "d"])
    schema2 = ColumnSchema("col2", tags=["c", "d"])
    schema3 = ColumnSchema("col3", tags=["d"])
Esempio n. 5
0
import nvtabular as nvt
from nvtabular import ColumnSchema, ColumnSelector, Schema, dispatch, ops
from nvtabular.dispatch import HAS_GPU


@pytest.mark.parametrize("properties", [{}, {"p1": "1"}])
@pytest.mark.parametrize("tags", [[], ["TAG1", "TAG2"]])
@pytest.mark.parametrize(
    "op",
    [
        ops.Bucketize([1]),
        ops.Rename(postfix="_trim"),
        ops.Categorify(),
        ops.Categorify(encode_type="combo"),
        ops.Clip(0),
        ops.DifferenceLag("1"),
        ops.FillMissing(),
        ops.Groupby(["1"]),
        ops.HashBucket(1),
        ops.HashedCross(1),
        ops.JoinGroupby(["1"]),
        ops.ListSlice(0),
        ops.LogOp(),
        ops.Normalize(),
        ops.TargetEncoding(["1"]),
        ops.AddMetadata(tags=["excellent"], properties={"domain": {"min": 0, "max": 20}}),
        ops.ValueCount(),
    ],
)
@pytest.mark.parametrize("selection", [["1"], ["2", "3"], ["1", "2", "3", "4"]])
def test_schema_out(tags, properties, selection, op):