Example #1
def test_lambdaop_misalign(cpu):
    size = 12
    df0 = pd.DataFrame({
        "a": np.arange(size),
        "b": np.random.choice(["apple", "banana", "orange"], size),
        "c": np.random.choice([0, 1], size),
    })

    ddf0 = dd.from_pandas(df0, npartitions=4)

    cont_names = ColumnGroup(["a"])
    cat_names = ColumnGroup(["b"])
    label = ColumnGroup(["c"])
    if cpu:
        label_feature = label >> (lambda col: np.where(col == 4, 0, 1))
    else:
        label_feature = label >> (lambda col: cp.where(col == 4, 0, 1))
    workflow = nvt.Workflow(cat_names + cont_names + label_feature)

    dataset = nvt.Dataset(ddf0, cpu=cpu)
    transformed = workflow.transform(dataset)
    assert_eq_dd(
        df0[["a", "b"]],
        transformed.to_ddf().compute()[["a", "b"]],
        check_index=False,
    )
Example #2
def test_workflow_move_saved(tmpdir):
    raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split()
    data = cudf.DataFrame({"geo": raw})

    geo_location = ColumnGroup(["geo"])
    state = geo_location >> (lambda col: col.str.slice(0, 5)) >> ops.Rename(postfix="_state")
    country = geo_location >> (lambda col: col.str.slice(0, 2)) >> ops.Rename(postfix="_country")
    geo_features = state + country + geo_location >> ops.Categorify()

    # create the workflow and transform the input
    workflow = Workflow(geo_features)
    expected = workflow.fit_transform(Dataset(data)).to_ddf().compute()

    # save the workflow (including categorical mapping parquet files)
    # and then verify we can load the saved workflow after moving the directory
    out_path = os.path.join(tmpdir, "output", "workflow")
    workflow.save(out_path)

    moved_path = os.path.join(tmpdir, "output", "workflow2")
    shutil.move(out_path, moved_path)
    workflow2 = Workflow.load(moved_path)

    # also check that when transforming our input we get the same results after loading
    transformed = workflow2.transform(Dataset(data)).to_ddf().compute()
    assert_eq(expected, transformed)
Example #3
def test_spec_set(tmpdir, client):
    gdf_test = cudf.DataFrame({
        "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
        "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
        "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
        "cont": [1, 2, np.nan, 2, 1, 3, 3, 1],
        "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
    })

    cats = ColumnGroup(["ad_id", "source_id", "platform"])
    cat_features = cats >> ops.Categorify
    cont_features = ColumnGroup(["cont"]) >> ops.FillMissing >> ops.Normalize
    te_features = cats >> ops.TargetEncoding(
        "clicked", kfold=5, fold_seed=42, p_smooth=20)

    p = Workflow(cat_features + cont_features + te_features, client=client)
    p.fit_transform(nvt.Dataset(gdf_test)).to_ddf().compute()
Example #4
def test_groupby_op(keys, cpu):

    # Initial timeseries dataset
    size = 60
    df1 = pd.DataFrame({
        "name": np.random.choice(["Dave", "Zelda"], size=size),
        "id": np.random.choice([0, 1], size=size),
        "ts": np.linspace(0.0, 10.0, num=size),
        "x": np.arange(size),
        "y": np.linspace(0.0, 10.0, num=size),
        "shuffle": np.random.uniform(low=0.0, high=10.0, size=size),
    })
    df1 = df1.sort_values("shuffle").drop(columns="shuffle").reset_index(drop=True)

    # Create a ddf, and be sure to shuffle by the groupby keys
    ddf1 = dd.from_pandas(df1, npartitions=3).shuffle(keys)
    dataset = nvt.Dataset(ddf1, cpu=cpu)

    # Define Groupby Workflow
    groupby_features = ColumnGroup(["name", "id", "ts", "x", "y"]) >> ops.Groupby(
        groupby_cols=keys,
        sort_cols=["ts"],
        aggs={
            "x": ["list", "sum"],
            "y": ["first", "last"],
            "ts": ["min"],
        },
        name_sep="-",
    )
    processor = nvtabular.Workflow(groupby_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # Check list-aggregation ordering
    x = new_gdf["x-list"]
    x = x.to_pandas() if hasattr(x, "to_pandas") else x
    sums = []
    for el in x.values:
        _el = pd.Series(el)
        sums.append(_el.sum())
        assert _el.is_monotonic_increasing

    # Check that list sums match sum aggregation
    x = new_gdf["x-sum"]
    x = x.to_pandas() if hasattr(x, "to_pandas") else x
    assert list(x) == sums

    # Check basic behavior of the "y" column
    assert (new_gdf["y-first"] < new_gdf["y-last"]).all()
Example #5
def test_cats_and_groupby_stats(client, tmpdir, datasets, part_mem_fraction, use_client):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]

    cats = ColumnGroup(cat_names)
    cat_features = cats >> ops.Categorify(out_path=str(tmpdir), freq_threshold=10, on_host=True)
    groupby_features = cats >> ops.JoinGroupby(
        cont_names=cont_names, stats=["count", "sum"], out_path=str(tmpdir)
    )

    workflow = Workflow(cat_features + groupby_features, client=client)
    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    result = workflow.fit_transform(dataset).to_ddf().compute()

    assert "name-cat_x_sum" in result.columns
    assert "name-string_x_sum" in result.columns
Example #6
def test_column_group_select():
    df = cudf.DataFrame({
        "a": [1, 4, 9, 16, 25],
        "b": [0, 1, 2, 3, 4],
        "c": [25, 16, 9, 4, 1]
    })

    input_features = ColumnGroup(["a", "b", "c"])
    sqrt_features = input_features[["a", "c"]] >> cudf.sqrt
    plus_one_features = input_features["b"] >> (lambda col: col + 1)
    features = sqrt_features + plus_one_features

    workflow = Workflow(features)
    df_out = workflow.fit_transform(Dataset(df)).to_ddf().compute(scheduler="synchronous")

    expected = cudf.DataFrame()
    expected["a"] = cudf.sqrt(df["a"])
    expected["c"] = cudf.sqrt(df["c"])
    expected["b"] = df["b"] + 1

    assert_eq(expected, df_out)
Example #7
def test_transform_geolocation():
    raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split()
    data = cudf.DataFrame({"geo_location": raw})

    geo_location = ColumnGroup(["geo_location"])
    state = geo_location >> (lambda col: col.str.slice(0, 5)) >> ops.Rename(postfix="_state")
    country = geo_location >> (lambda col: col.str.slice(0, 2)) >> ops.Rename(postfix="_country")
    geo_features = state + country + geo_location >> ops.HashBucket(num_buckets=100)

    # for this workflow we don't have any stat operators, so we can get away without fitting
    workflow = Workflow(geo_features)
    transformed = workflow.transform(Dataset(data)).to_ddf().compute()

    expected = cudf.DataFrame()
    expected["geo_location_state"] = data["geo_location"].str.slice(
        0, 5).hash_values() % 100
    expected["geo_location_country"] = data["geo_location"].str.slice(
        0, 2).hash_values() % 100
    expected["geo_location"] = data["geo_location"].hash_values() % 100
    assert_eq(expected, transformed)
Example #8
def test_nested_column_group():
    df = cudf.DataFrame({
        "geo": ["US>CA", "US>NY", "CA>BC", "CA>ON"],
        "user": ["User_A", "User_A", "User_A", "User_B"],
    })

    country = (
        ColumnGroup(["geo"]) >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country")
    )

    # make sure we can do a 'combo' categorify (cross based) of country+user
    # as well as categorifying the country and user columns on their own
    cats = [country + "user"] + country + "user" >> Categorify(encode_type="combo")

    workflow = Workflow(cats)
    df_out = workflow.fit_transform(Dataset(df)).to_ddf().compute(scheduler="synchronous")

    geo_country = df_out["geo_country"]
    assert geo_country[0] == geo_country[1]  # rows 0,1 are both 'US'
    assert geo_country[2] == geo_country[3]  # rows 2,3 are both 'CA'

    user = df_out["user"]
    assert user[0] == user[1] == user[2]
    assert user[3] != user[2]

    geo_country_user = df_out["geo_country_user"]
    assert geo_country_user[0] == geo_country_user[1]  # US / userA
    assert geo_country_user[2] != geo_country_user[0]  # same user but in Canada

    # make sure we get an exception if we nest too deeply (can't handle arbitrarily deep
    # nested column groups - and the exceptions we would get in operators like Categorify
    # are super confusing for users)
    with pytest.raises(ValueError):
        cats = [[country + "user"] + country + "user"] >> Categorify(encode_type="combo")
Example #9
def test_lambdaop(tmpdir, df, dataset, gpu_memory_frac, engine):
    df_copy = df.copy()

    # Substring
    # Replacement
    substring = ColumnGroup(["name-cat", "name-string"]) >> (lambda col: col.str.slice(1, 3))
    processor = nvtabular.Workflow(substring)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert_eq_dd(new_gdf["name-cat"],
                 df_copy["name-cat"].str.slice(1, 3),
                 check_index=False)
    assert_eq_dd(new_gdf["name-string"],
                 df_copy["name-string"].str.slice(1, 3),
                 check_index=False)

    # No Replacement from old API (skipped for other examples)
    substring = (
        ColumnGroup(["name-cat", "name-string"]) >>
        (lambda col: col.str.slice(1, 3)) >> ops.Rename(postfix="_slice"))
    processor = nvtabular.Workflow(["name-cat", "name-string"] + substring)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert_eq_dd(
        new_gdf["name-cat_slice"],
        df_copy["name-cat"].str.slice(1, 3),
        check_index=False,
        check_names=False,
    )
    assert_eq_dd(
        new_gdf["name-string_slice"],
        df_copy["name-string"].str.slice(1, 3),
        check_index=False,
        check_names=False,
    )
    assert_eq_dd(new_gdf["name-cat"], df_copy["name-cat"], check_index=False)
    assert_eq_dd(new_gdf["name-string"],
                 df_copy["name-string"],
                 check_index=False)

    # Replace
    # Replacement
    oplambda = ColumnGroup(["name-cat", "name-string"]) >> (lambda col: col.str.replace("e", "XX"))
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert_eq_dd(new_gdf["name-cat"],
                 df_copy["name-cat"].str.replace("e", "XX"),
                 check_index=False)
    assert_eq_dd(new_gdf["name-string"],
                 df_copy["name-string"].str.replace("e", "XX"),
                 check_index=False)

    # astype
    # Replacement
    oplambda = ColumnGroup(["id"]) >> (lambda col: col.astype(float))
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert new_gdf["id"].dtype == "float64"

    # Workflow
    # Replacement
    oplambda = (
        ColumnGroup(["name-cat"]) >>
        (lambda col: col.astype(str).str.slice(0, 1)) >> ops.Categorify())
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()
    assert is_integer_dtype(new_gdf["name-cat"].dtype)

    oplambda = (ColumnGroup(["name-cat", "name-string"]) >> ops.Categorify() >>
                (lambda col: col + 100))
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert is_integer_dtype(new_gdf["name-cat"].dtype)
    assert np.sum(new_gdf["name-cat"] < 100) == 0
Example #10
def create_workflow(data_bucket_folder, hash_spec, devices, local_directory,
                    dask):
    rmm.reinitialize(managed_memory=False)
    documents_categories_path = os.path.join(data_bucket_folder,
                                             "documents_categories.csv")
    documents_topics_path = os.path.join(data_bucket_folder,
                                         "documents_topics.csv")
    documents_entities_path = os.path.join(data_bucket_folder,
                                           "documents_entities.csv")

    documents_categories_cudf = cudf.read_csv(documents_categories_path)
    documents_topics_cudf = cudf.read_csv(documents_topics_path)
    documents_entities_cudf = cudf.read_csv(documents_entities_path)
    documents_entities_cudf["entity_id"] = (
        documents_entities_cudf["entity_id"].astype("category").cat.codes)

    categories = _df_to_coo(documents_categories_cudf, col="category_id")
    topics = _df_to_coo(documents_topics_cudf, col="topic_id")
    entities = _df_to_coo(documents_entities_cudf, col="entity_id")

    del documents_categories_cudf, documents_topics_cudf, documents_entities_cudf
    ctr_thresh = {
        "ad_id": 5,
        "source_id_promo": 10,
        "publisher_id_promo": 10,
        "advertiser_id": 10,
        "campaign_id": 10,
        "document_id_promo": 5,
    }

    ctr_inputs = ColumnGroup(CTR_INPUTS)
    cat_cols = ColumnGroup(CATEGORICAL_COLUMNS)

    geo_location = ColumnGroup(["geo_location"])
    country = geo_location >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country")
    state = geo_location >> (lambda col: col.str.slice(0, 5)) >> Rename(postfix="_state")
    geo_features = geo_location + country + state

    dates = ["publish_time", "publish_time_promo"]
    date_features = dates >> DaysSincePublished() >> FillMedian() >> LogOp

    stat_cols = ctr_inputs >> JoinGroupby(cont_cols=["clicked"], stats=["sum", "count"])
    ctr_cols = (
        stat_cols - [column + "_count" for column in ctr_inputs.flattened_columns]
        >> LambdaOp(
            f=lambda col, gdf: (col / gdf[col.name.replace("_clicked_sum", "_count")]).where(
                gdf[col.name.replace("_clicked_sum", "_count")]
                >= ctr_thresh[col.name.replace("_clicked_sum", "")],
                0,
            ),
            dependency=stat_cols
            - [column + "clicked_sum" for column in ctr_inputs.flattened_columns],
        )
        >> Rename(f=lambda x: x.replace("_clicked_sum", "_ctr"))
    )

    stat_cols = stat_cols >> FillMissing() >> LogOp() >> Normalize()
    ctr_cols = ctr_cols >> FillMissing()

    cat_cols = cat_cols + geo_features >> HashBucket(hash_spec)

    features = date_features + ctr_cols + stat_cols + cat_cols + ["clicked", "display_id"]
    sim_features_categ = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(categories, metric="tfidf", on_device=False)
        >> Rename(postfix="_categories")
    )
    sim_features_topics = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(topics, metric="tfidf", on_device=False)
        >> Rename(postfix="_topics")
    )
    sim_features_entities = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(entities, metric="tfidf", on_device=False)
        >> Rename(postfix="_entities")
    )
    sim_features = sim_features_categ + sim_features_topics + sim_features_entities

    client = create_client(devices=devices, local_directory=local_directory) if dask else None

    workflow = nvt.Workflow(column_group=features + sim_features, client=client)

    return workflow