Example #1
def test_target_encode(tmpdir, cat_groups, kfold, fold_seed, cpu):
    df = dispatch._make_df({
        "Author": list(string.ascii_uppercase),
        "Engaging-User": list(string.ascii_lowercase),
        "Cost": range(26),
        "Post": [0, 1] * 13,
    })
    if cpu:
        df = dd.from_pandas(
            df if isinstance(df, pd.DataFrame) else df.to_pandas(),
            npartitions=3)
    else:
        df = dask_cudf.from_cudf(df, npartitions=3)

    cont_names = ["Cost"]
    te_features = cat_groups >> ops.TargetEncoding(
        cont_names,
        out_path=str(tmpdir),
        kfold=kfold,
        out_dtype="float32",
        fold_seed=fold_seed,
        drop_folds=False,  # Keep folds to validate
    )

    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(
        min_value=0) >> ops.LogOp()
    workflow = nvt.Workflow(te_features + cont_features +
                            ["Author", "Engaging-User"])
    df_out = workflow.fit_transform(
        nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    df_lib = dispatch.get_lib()
    if kfold > 1:
        # Cat columns are unique.
        # Make sure __fold__ mapping is correct
        if cat_groups == "Author":
            name = "__fold___Author"
            cols = ["__fold__", "Author"]
        else:
            name = "__fold___Author_Engaging-User"
            cols = ["__fold__", "Author", "Engaging-User"]

        check = df_lib.read_parquet(te_features.op.stats[name])
        check = check[cols].sort_values(cols).reset_index(drop=True)
        df_out_check = df_out[cols].sort_values(cols).reset_index(drop=True)
        assert_eq(check, df_out_check, check_dtype=False)
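The fixture arguments above (cat_groups, kfold, fold_seed, cpu) come from pytest parametrization that the snippet omits. A minimal sketch of the scaffolding this test assumes; the parameter values below are illustrative, not the original test matrix, and the GPU-only dask_cudf / assert_eq imports are left out:

import string

import dask.dataframe as dd
import pandas as pd
import pytest

import nvtabular as nvt
from nvtabular import dispatch, ops

# assumed parameter values; dask_cudf is only imported on GPU runs
@pytest.mark.parametrize("cat_groups", ["Author", [["Author", "Engaging-User"]]])
@pytest.mark.parametrize("kfold", [1, 3])
@pytest.mark.parametrize("fold_seed", [None, 42])
@pytest.mark.parametrize("cpu", [True, False])
def test_target_encode(tmpdir, cat_groups, kfold, fold_seed, cpu):
    ...  # body as shown above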
Example #2
def test_nested_workflow_node():
    df = dispatch._make_df({
        "geo": ["US>CA", "US>NY", "CA>BC", "CA>ON"],
        "user": ["User_A", "User_A", "User_A", "User_B"],
    })
    dataset = Dataset(df)

    geo_selector = ColumnSelector(["geo"])
    country = (geo_selector >> LambdaOp(lambda col: col.str.slice(0, 2)) >>
               Rename(postfix="_country"))
    # country1 = geo_selector >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country1")
    # country2 = geo_selector >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country2")
    user = "******"
    # user2 = "user2"

    # make sure we can do a 'combo' categorify (cross based) of country+user
    # as well as categorifying the country and user columns on their own
    cats = country + user + [country + user] >> Categorify(encode_type="combo")

    workflow = Workflow(cats)
    workflow.fit_schema(dataset.infer_schema())

    df_out = workflow.fit_transform(dataset).to_ddf().compute(
        scheduler="synchronous")

    geo_country = df_out["geo_country"]
    assert geo_country[0] == geo_country[1]  # rows 0,1 are both 'US'
    assert geo_country[2] == geo_country[3]  # rows 2,3 are both 'CA'

    user = df_out["user"]
    assert user[0] == user[1] == user[2]
    assert user[3] != user[2]

    geo_country_user = df_out["geo_country_user"]
    assert geo_country_user[0] == geo_country_user[1]  # US / userA
    assert geo_country_user[2] != geo_country_user[0]  # same user but in Canada

    # make sure we get an exception if we nest too deeply (can't handle arbitrarily deep
    # nested column groups - and the exceptions we would get in operators like Categorify
    # are super confusing for users)
    with pytest.raises(ValueError):
        cats = [[country + "user"] + country + "user"
                ] >> Categorify(encode_type="combo")
Example #3
def test_categorify_size(tmpdir, cpu, include_nulls):
    num_rows = 50
    num_distinct = 10

    possible_session_ids = list(range(num_distinct))
    if include_nulls:
        possible_session_ids.append(None)

    df = dispatch._make_df(
        {
            "session_id":
            [random.choice(possible_session_ids) for _ in range(num_rows)]
        },
        device="cpu" if cpu else None,
    )

    cat_features = ["session_id"] >> nvt.ops.Categorify(out_path=str(tmpdir))
    workflow = nvt.Workflow(cat_features)
    workflow.fit_transform(nvt.Dataset(df, cpu=cpu)).to_ddf().compute()

    vals = df["session_id"].value_counts()
    vocab = dispatch._read_dispatch(cpu=cpu)(os.path.join(
        tmpdir, "categories", "unique.session_id.parquet"))

    if cpu:
        expected = dict(zip(vals.index, vals))
        computed = {
            session: size
            for session, size in zip(vocab["session_id"],
                                     vocab["session_id_size"]) if size
        }
    else:
        expected = dict(zip(vals.index.values_host, vals.values_host))
        computed = {
            session: size
            for session, size in zip(vocab["session_id"].values_host,
                                     vocab["session_id_size"].values_host)
            if size
        }
    first_key = list(computed.keys())[0]
    if pd.isna(first_key):
        computed.pop(first_key)
    assert computed == expected
Example #4
def test_categorify_single_table():
    df = dispatch._make_df({
        "Authors": [None, "User_A", "User_A", "User_E", "User_B", "User_C"],
        "Engaging_User":
        [None, "User_B", "User_B", "User_A", "User_D", "User_D"],
        "Post": [1, 2, 3, 4, None, 5],
    })
    cat_names = ["Authors", "Engaging_User"]
    dataset = nvt.Dataset(df)
    features = cat_names >> ops.Categorify(single_table=True)
    processor = nvt.Workflow(features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    old_max = 0
    for name in cat_names:
        curr_min = new_gdf[name].min()
        assert old_max <= curr_min
        curr_max = new_gdf[name].max()
        old_max += curr_max
Example #5
def test_normalize_lists(tmpdir, cpu):
    df = dispatch._make_df(device="cpu" if cpu else "gpu")
    df["vals"] = [
        [0.0, 1.0, 2.0],
        [
            3.0,
            4.0,
        ],
        [5.0],
    ]

    features = ["vals"] >> nvt.ops.Normalize()
    workflow = nvt.Workflow(features)
    transformed = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()

    expected = _flatten_list_column_values(df["vals"]).astype("float32")
    expected = (expected - expected.mean()) / expected.std()
    expected_df = type(transformed)({"vals": expected})

    assert_eq(expected_df, _flatten_list_column(transformed["vals"]))
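For reference, the expected values above amount to flattening the ragged column and standardizing the flat values. A plain NumPy equivalent (illustrative only; pandas/cuDF Series.std() defaults to ddof=1, so match that here):

import numpy as np

flat = np.array([0.0, 1.0, 2.0, 3.0, 4.0, 5.0], dtype="float32")
normalized = (flat - flat.mean()) / flat.std(ddof=1)  # ddof=1 matches Series.std()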
Example #6
    def _create_tensors(self, gdf):
        """
        Breaks a dataframe down into the relevant
        categorical, continuous, and label tensors.
        Can be overridden
        """
        column_groups = (self.cat_names, self.cont_names, self.label_names)
        dtypes = (self._LONG_DTYPE, self._FLOAT32_DTYPE, self._FLOAT32_DTYPE)
        tensors = []
        offsets = _make_df(device=self.device)
        for column_names, dtype in zip(column_groups, dtypes):
            if len(column_names) == 0:
                tensors.append(None)
                continue

            gdf_i = gdf[column_names]
            gdf.drop(columns=column_names, inplace=True)

            scalars, lists = self._separate_list_columns(gdf_i)

            x = None
            if scalars:
                # should always return dict column_name: values, offsets (optional)
                x = self._to_tensor(gdf_i[scalars], dtype)
            if lists:
                list_tensors = OrderedDict()
                for column_name in lists:
                    column = gdf_i.pop(column_name)
                    leaves, offsets[column_name] = _pull_apart_list(column)
                    list_tensors[column_name] = self._to_tensor(leaves, dtype)
                x = x, list_tensors
            tensors.append(x)

        if not offsets.empty:
            offsets_tensor = self._to_tensor(offsets, self._LONG_DTYPE)
            if len(offsets_tensor.shape) == 1:
                offsets_tensor = offsets_tensor[:, None]
            tensors.append(offsets_tensor)
        del gdf, offsets

        return tensors
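The list-column handling above relies on a flat-values-plus-offsets layout. An illustrative sketch of that layout (my own example, not NVTabular internals): the ragged column [[0, 1, 2], [3, 4], [5]] splits into the leaf values that _pull_apart_list-style code produces, plus per-row offsets.

import numpy as np

values = np.array([0, 1, 2, 3, 4, 5])  # leaves, flattened in row order
offsets = np.array([0, 3, 5, 6])       # row i spans values[offsets[i]:offsets[i + 1]]

assert values[offsets[1]:offsets[2]].tolist() == [3, 4]  # second row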
Example #7
def test_categorify_hash_bucket(cpu):
    df = dispatch._make_df({
        "Authors": ["User_A", "User_A", "User_E", "User_B", "User_C"],
        "Engaging_User": ["User_B", "User_B", "User_A", "User_D", "User_D"],
        "Post": [1, 2, 3, 4, 5],
    })
    cat_names = ["Authors", "Engaging_User"]
    buckets = 10
    dataset = nvt.Dataset(df, cpu=cpu)
    hash_features = cat_names >> ops.Categorify(num_buckets=buckets)
    processor = nvt.Workflow(hash_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # check hashed values
    assert new_gdf["Authors"].max() <= (buckets - 1)
    assert new_gdf["Engaging_User"].max() <= (buckets - 1)
    # check embedding size is equal to the num_buckets after hashing
    assert nvt.ops.get_embedding_sizes(processor)["Authors"][0] == buckets
    assert nvt.ops.get_embedding_sizes(
        processor)["Engaging_User"][0] == buckets
Example #8
def test_na_value_count(tmpdir):
    gdf = dispatch._make_df({
        "productID": ["B00406YHLI"] * 5 + ["B002YXS8E6"] * 5 +
        ["B00011KM38"] * 2 + [np.nan] * 3,
        "brand":
        ["Coby"] * 5 + [np.nan] * 5 + ["Cooler Master"] * 2 + ["Asus"] * 3,
    })

    cat_features = ["brand", "productID"] >> nvt.ops.Categorify()
    workflow = nvt.Workflow(cat_features)
    train_dataset = nvt.Dataset(gdf, engine="parquet")
    workflow.fit(train_dataset)
    workflow.transform(train_dataset).to_ddf().compute()

    single_cat = dispatch._read_dispatch("./categories/unique.brand.parquet")(
        "./categories/unique.brand.parquet")
    second_cat = dispatch._read_dispatch(
        "./categories/unique.productID.parquet")(
            "./categories/unique.productID.parquet")
    assert single_cat["brand_size"][0] == 5
    assert second_cat["productID_size"][0] == 3
Example #9
def test_groupby_model(tmpdir, output_model):
    size = 20
    df = _make_df({
        "id": np.random.choice([0, 1], size=size),
        "ts": np.linspace(0.0, 10.0, num=size),
        "x": np.arange(size),
        "y": np.linspace(0.0, 10.0, num=size),
    })

    groupby_features = ColumnSelector(["id", "ts", "x", "y"]) >> ops.Groupby(
        groupby_cols=["id"],
        sort_cols=["ts"],
        aggs={
            "x": ["sum"],
            "y": ["first"],
        },
        name_sep="-",
    )
    workflow = nvt.Workflow(groupby_features)

    if output_model == "pytorch":
        model_info = {
            "x-sum": {
                "columns": ["x-sum"],
                "dtype": "int64"
            },
            "y-first": {
                "columns": ["y-first"],
                "dtype": "float64"
            },
            "id": {
                "columns": ["id"],
                "dtype": "int64"
            },
        }
    else:
        model_info = None

    _verify_workflow_on_tritonserver(tmpdir, workflow, df, "groupby",
                                     output_model, model_info)
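For orientation, the Groupby op above corresponds roughly to the following pandas aggregation (an illustrative sketch with made-up data, not the NVTabular execution path); with name_sep="-" the outputs are named "x-sum" and "y-first":

import pandas as pd

pdf = pd.DataFrame({"id": [0, 1, 0, 1], "ts": [0.0, 1.0, 2.0, 3.0],
                    "x": [1, 2, 3, 4], "y": [5.0, 6.0, 7.0, 8.0]})
out = (pdf.sort_values("ts")
          .groupby("id", as_index=False)
          .agg({"x": "sum", "y": "first"})
          .rename(columns={"x": "x-sum", "y": "y-first"}))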
Example #10
def test_ops_list_vc(properties, tags, op_routine):
    column_schemas = []
    all_cols = []
    for x in range(5):
        all_cols.append(str(x))
        column_schemas.append(ColumnSchema(str(x), tags=tags, properties=properties))

    # Turn to Schema
    schema = Schema(column_schemas)
    df_dict = {}
    num_rows = 10000
    for column_name in schema.column_names:
        df_dict[column_name] = np.random.randint(1, 1000, num_rows)
        df_dict[column_name] = [[x] * np.random.randint(1, 10) for x in df_dict[column_name]]

    df = dispatch._make_df(df_dict)
    dataset = nvt.Dataset(df)
    test_node = ColumnSelector(schema.column_names) >> op_routine[0]
    for op in op_routine[1:]:
        test_node = test_node >> op
    processor = nvt.Workflow(test_node)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()
    workflow_schema_out = processor.output_node.output_schema
    for column_name in workflow_schema_out.column_names:
        schema1 = workflow_schema_out.column_schemas[column_name]
        assert "domain" in schema1.properties
        embeddings_info = schema1.properties["domain"]
        # should always exist, represents unknown
        assert embeddings_info["min"] == 0
        if HAS_GPU:
            assert embeddings_info["max"] == new_gdf[column_name]._column.elements.max() + 1
        else:
            list_vals = nvt.dispatch._pull_apart_list(new_gdf[column_name])[0]
            assert embeddings_info["max"] == list_vals.max() + 1
        assert "value_count" in schema1.properties
        val_c = schema1.properties["value_count"]
        assert val_c["min"] == op_routine[-1].stats[column_name]["value_count"]["min"]
        assert val_c["max"] == op_routine[-1].stats[column_name]["value_count"]["max"]
Example #11
def test_numeric_dtypes(tmpdir, output_model):
    if output_model == "pytorch":
        model_info = dict()
    else:
        model_info = None

    dtypes = []
    for width in [8, 16, 32, 64]:
        dtype = f"int{width}"
        dtypes.append((dtype, np.iinfo(dtype)))
        if output_model == "pytorch":
            model_info[dtype] = {"columns": [dtype], "dtype": dtype}

        dtype = f"uint{width}"
        dtypes.append((dtype, np.iinfo(dtype)))
        if output_model == "pytorch":
            model_info[dtype] = {"columns": [dtype], "dtype": dtype}

    for width in [32, 64]:
        dtype = f"float{width}"
        dtypes.append((dtype, np.finfo(dtype)))
        if output_model == "pytorch":
            model_info[dtype] = {"columns": [dtype], "dtype": dtype}

    def check_dtypes(col):
        assert str(col.dtype) == col.name
        return col

    # simple transform to make sure we can round-trip the min/max values for each dtype,
    # through triton, with the 'transform' here just checking that the dtypes are correct
    df = _make_df({
        dtype: np.array([limits.max, 0, limits.min], dtype=dtype)
        for dtype, limits in dtypes
    })
    features = nvt.ColumnSelector(df.columns) >> check_dtypes
    workflow = nvt.Workflow(features)
    _verify_workflow_on_tritonserver(tmpdir, workflow, df,
                                     "test_numeric_dtypes", output_model,
                                     model_info)
Example #12
def test_hash_bucket_lists(tmpdir):
    df = dispatch._make_df(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Authors"]  # , "Engaging User"]

    dataset = nvt.Dataset(df)
    hash_features = cat_names >> ops.HashBucket(num_buckets=10)
    processor = nvt.Workflow(hash_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # check to make sure that the same strings are hashed the same
    authors = new_gdf["Authors"].to_arrow().to_pylist()
    assert authors[0][0] == authors[1][0]  # 'User_A'
    assert authors[2][1] == authors[3][0]  # 'User_C'

    assert nvt.ops.get_embedding_sizes(processor)[1]["Authors"][0] == 10
Example #13
def test_categorify_lists_with_start_index(tmpdir, cpu, start_index):
    df = dispatch._make_df({
        "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                    ["User_C"]],
        "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
        "Post": [1, 2, 3, 4],
    })
    cat_names = ["Authors", "Engaging User"]
    label_name = ["Post"]
    dataset = nvt.Dataset(df, cpu=cpu)
    cat_features = cat_names >> ops.Categorify(out_path=str(tmpdir),
                                               start_index=start_index)
    processor = nvt.Workflow(cat_features + label_name)
    processor.fit(dataset)
    df_out = processor.transform(dataset).to_ddf().compute()

    if cpu:
        compare = [list(row) for row in df_out["Authors"].tolist()]
    else:
        compare = df_out["Authors"].to_arrow().to_pylist()

    # Note that start_index is the start_index of the range of encoding, which
    # includes both an initial value for the encoding for out-of-vocabulary items,
    # as well as the values for the rest of the in-vocabulary items.
    # In this group of tests below, there are no out-of-vocabulary items, so our start index
    # value does not appear in the expected comparison object.
    if start_index == 0:
        assert compare == [[1], [1, 4], [3, 2], [2]]
    elif start_index == 1:
        assert compare == [[2], [2, 5], [4, 3], [3]]
    elif start_index == 16:
        assert compare == [[17], [17, 20], [19, 18], [18]]

    # We expect five entries in the embedding size: one for each of the four unique
    # authors, one reserved for nulls/out-of-vocabulary values, plus start_index
    # additional entries for our offset start_index.
    embeddings = nvt.ops.get_embedding_sizes(processor)

    assert embeddings[1]["Authors"][0] == (5 + start_index)
Example #14
def test_generate_triton_multihot(tmpdir):
    df = _make_df({
        "userId": ["a", "a", "b"],
        "movieId": ["1", "2", "2"],
        "genres": [["action", "adventure"], ["action", "comedy"], ["comedy"]],
    })

    cats = ["userId", "movieId", "genres"] >> nvt.ops.Categorify()
    workflow = nvt.Workflow(cats)
    workflow.fit(nvt.Dataset(df))
    expected = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    # save workflow to triton / verify we see some expected output
    repo = os.path.join(tmpdir, "models")
    triton.generate_nvtabular_model(workflow, "model", repo)
    workflow = None

    assert os.path.exists(os.path.join(repo, "config.pbtxt"))

    workflow = nvt.Workflow.load(os.path.join(repo, "1", "workflow"))
    transformed = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    assert_eq(expected, transformed)
Example #15
    def create_df(
        self,
        size,
        cols,
        entries=False,
    ):
        conts_rep = cols["conts"] if "conts" in cols else None
        cats_rep = cols["cats"] if "cats" in cols else None
        labs_rep = cols["labels"] if "labels" in cols else None
        df = _make_df()
        if conts_rep:
            df = _concat([df, self.create_conts(size, conts_rep)], axis=1)
        if cats_rep:
            df = _concat(
                [
                    df,
                    self.create_cats(size, cats_rep=cats_rep, entries=entries),
                ],
                axis=1,
            )
        if labs_rep:
            df = _concat([df, self.create_labels(size, labs_rep)], axis=1)
        return df
Example #16
def test_normalize_upcastfloat64(tmpdir, dataset, gpu_memory_frac, engine,
                                 op_columns):
    df = dispatch._make_df({
        "x": [1.9e10, 2.3e16, 3.4e18, 1.6e19],
        "label": [1.0, 0.0, 1.0, 0.0]
    })

    cont_features = op_columns >> ops.Normalize()
    processor = nvtabular.Workflow(cont_features)
    dataset = nvt.Dataset(df)
    processor.fit(dataset)

    new_gdf = processor.transform(dataset).to_ddf().compute()

    for col in op_columns:
        assert math.isclose(df[col].mean(),
                            processor.output_node.op.means[col],
                            rel_tol=1e-4)
        assert math.isclose(df[col].std(),
                            processor.output_node.op.stds[col],
                            rel_tol=1e-4)
        df[col] = (df[col] - processor.output_node.op.means[col]
                   ) / processor.output_node.op.stds[col]
        assert np.all((df[col] - new_gdf[col]).abs().values <= 1e-2)
Example #17
def test_cat_rep(num_rows, distro):
    json_sample["num_rows"] = num_rows
    cats = list(json_sample["cats"].keys())
    cols = datagen._get_cols_from_schema(json_sample, distros=distro)

    df_gen = datagen.DatasetGen(datagen.UniformDistro())
    df_uni = df_gen.create_df(num_rows, cols, entries=True)
    df_cats = df_uni[cats]
    assert df_cats.shape[1] == len(cats)
    assert df_cats.shape[0] == num_rows
    cats_rep = cols["cats"]
    for idx, cat in enumerate(cats[1:]):
        assert df_uni[cat].nunique() == cats_rep[idx + 1].cardinality
        assert df_uni[cat].str.len().min() == cats_rep[idx + 1].min_entry_size
        assert df_uni[cat].str.len().max() == cats_rep[idx + 1].max_entry_size
    if HAS_GPU:
        check_ser = _make_df(list(df_uni[cats[0]]._column.elements.values_host))[0]
    else:
        check_ser = df_uni[cats[0]]
    if isinstance(check_ser[0], (list, np.ndarray)):
        check_ser = _pull_apart_list(check_ser)[0]
    assert check_ser.nunique() == cats_rep[0].cardinality
    assert check_ser.str.len().min() == cats_rep[0].min_entry_size
    assert check_ser.str.len().max() == cats_rep[0].max_entry_size
Example #18
def test_target_encode_group():
    df = dispatch._make_df({
        "Cost":
        range(15),
        "Post": [1, 2, 3, 4, 5] * 3,
        "Author": ["A"] * 5 + ["B"] * 5 + ["C"] * 2 + ["D"] * 3,
        "Engaging_User":
        ["A"] * 5 + ["B"] * 3 + ["E"] * 2 + ["D"] * 3 + ["G"] * 2,
    })

    cat_groups = ["Author", "Engaging_User"]
    labels = ColumnSelector(
        ["Post"]) >> ops.LambdaOp(lambda col: (col > 3).astype("int8"))
    te_features = cat_groups >> ops.TargetEncoding(
        labels,
        out_path="./",
        kfold=1,
        out_dtype="float32",
        drop_folds=False,  # Keep folds to validate
    )

    workflow = nvt.Workflow(te_features + ["Author", "Engaging_User"])
    workflow.fit_transform(
        nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")
Example #19
def convert_triton_output_to_df(columns, response):
    return _make_df({col: response.as_numpy(col) for col in columns})
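A hypothetical usage sketch with a stub response object; in real use the second argument would be the InferResult returned by a Triton client, which exposes as_numpy() the same way:

import numpy as np

class _FakeResponse:
    """Stand-in for a Triton InferResult (illustrative only)."""

    def __init__(self, arrays):
        self._arrays = arrays

    def as_numpy(self, name):
        return self._arrays[name]

response = _FakeResponse({"x-sum": np.array([3, 7]), "id": np.array([0, 1])})
df = convert_triton_output_to_df(["x-sum", "id"], response)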
Example #20
def test_categorify_freq_limit(tmpdir, freq_limit, buckets, search_sort, cpu):
    if search_sort and cpu:
        # invalid combination - don't test
        return

    df = dispatch._make_df({
        "Author": [
            "User_A",
            "User_E",
            "User_B",
            "User_C",
            "User_A",
            "User_E",
            "User_B",
            "User_C",
            "User_B",
            "User_C",
        ],
        "Engaging User": [
            "User_B",
            "User_B",
            "User_A",
            "User_D",
            "User_B",
            "User_c",
            "User_A",
            "User_D",
            "User_D",
            "User_D",
        ],
    })

    isfreqthr = freq_limit > 0 if isinstance(freq_limit, int) else isinstance(
        freq_limit, dict)

    if (not search_sort and isfreqthr) or (search_sort and not isfreqthr):
        cat_names = ["Author", "Engaging User"]

        cats = cat_names >> ops.Categorify(
            freq_threshold=freq_limit,
            out_path=str(tmpdir),
            search_sorted=search_sort,
            num_buckets=buckets,
        )

        workflow = nvt.Workflow(cats)
        df_out = (workflow.fit_transform(nvt.Dataset(
            df, cpu=cpu)).to_ddf().compute(scheduler="synchronous"))

        if freq_limit and not buckets:
            # Column combinations are encoded
            if isinstance(freq_limit, dict):
                assert df_out["Author"].max() == 2
                assert df_out["Engaging User"].max() == 1
            else:
                assert len(df["Author"].unique()) == df_out["Author"].max()
                assert len(df["Engaging User"].unique()
                           ) == df_out["Engaging User"].max()
        elif not freq_limit and buckets:
            if isinstance(buckets, dict):
                assert df_out["Author"].max() <= 9
                assert df_out["Engaging User"].max() <= 19
            else:
                assert df_out["Author"].max() <= 9
                assert df_out["Engaging User"].max() <= 9
        elif freq_limit and buckets:
            if (isinstance(buckets, dict)
                    and not isinstance(df, pd.DataFrame)):
                assert (
                    df_out["Author"].max() <=
                    (df["Author"].hash_values() % buckets["Author"]).max() +
                    2 + 1)
                assert (df_out["Engaging User"].max() <=
                        (df["Engaging User"].hash_values() %
                         buckets["Engaging User"]).max() + 1 + 1)
Example #21
    def create_col(self, num_rows, dtype=np.float32, min_val=0, max_val=1):
        ser = _make_df(np.random.uniform(min_val, max_val, size=num_rows))[0]
        ser = ser.astype(dtype)
        return ser