Esempio n. 1
0
def test_dask_normalize(client, tmpdir, datasets, engine):

    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    normalize = ops.Normalize()
    conts = cont_names >> ops.FillMissing() >> normalize
    workflow = Workflow(conts + cat_names + label_name, client=client)

    dataset = Dataset(paths, engine)
    result = workflow.fit_transform(dataset).to_ddf().compute()

    # Make sure we collected accurate statistics
    means = df0[cont_names].mean()
    stds = df0[cont_names].std()
    for name in cont_names:
        assert math.isclose(means[name], normalize.means[name], rel_tol=1e-3)
        assert math.isclose(stds[name], normalize.stds[name], rel_tol=1e-3)

    # New (normalized) means should all be close to zero
    new_means = result[cont_names].mean()
    for name in cont_names:
        assert new_means[name] < 1e-3
Esempio n. 2
0
def test_workflow_move_saved(tmpdir):
    raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split()
    data = cudf.DataFrame({"geo": raw})

    geo_location = ColumnGroup(["geo"])
    state = geo_location >> (lambda col: col.str.slice(0, 5)) >> ops.Rename(
        postfix="_state")
    country = geo_location >> (lambda col: col.str.slice(0, 2)) >> ops.Rename(
        postfix="_country")
    geo_features = state + country + geo_location >> ops.Categorify()

    # create the workflow and transform the input
    workflow = Workflow(geo_features)
    expected = workflow.fit_transform(Dataset(data)).to_ddf().compute()

    # save the workflow (including categorical mapping parquet files)
    # and then verify we can load the saved workflow after moving the directory
    out_path = os.path.join(tmpdir, "output", "workflow")
    workflow.save(out_path)

    moved_path = os.path.join(tmpdir, "output", "workflow2")
    shutil.move(out_path, moved_path)
    workflow2 = Workflow.load(moved_path)

    # also check that when transforming our input we get the same results after loading
    transformed = workflow2.transform(Dataset(data)).to_ddf().compute()
    assert_eq(expected, transformed)
Esempio n. 3
0
def test_spec_set(tmpdir, client):
    gdf_test = cudf.DataFrame({
        "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
        "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
        "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
        "cont": [1, 2, np.nan, 2, 1, 3, 3, 1],
        "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
    })

    cats = ColumnGroup(["ad_id", "source_id", "platform"])
    cat_features = cats >> ops.Categorify
    cont_features = ColumnGroup(["cont"]) >> ops.FillMissing >> ops.Normalize
    te_features = cats >> ops.TargetEncoding(
        "clicked", kfold=5, fold_seed=42, p_smooth=20)

    p = Workflow(cat_features + cont_features + te_features, client=client)
    p.fit_transform(nvt.Dataset(gdf_test)).to_ddf().compute()
Esempio n. 4
0
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):

    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    features = cat_names >> ops.JoinGroupby(
        cont_names=cont_names, stats=["count", "sum", "std", "min"], out_path=str(tmpdir)
    )

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    workflow = Workflow(features + cat_names + cont_names + label_name, client=client)
    result = workflow.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check "count"
    assert_eq(
        result[["name-cat", "name-cat_count"]]
        .drop_duplicates()
        .sort_values("name-cat")["name-cat_count"],
        df0.groupby("name-cat").agg({"x": "count"})["x"].astype(np.int64),
        check_index=False,
        check_dtype=False,  # May get int64 vs int32
        check_names=False,
    )

    # Check "min"
    assert_eq(
        result[["name-string", "name-string_x_min"]]
        .drop_duplicates()
        .sort_values("name-string")["name-string_x_min"],
        df0.groupby("name-string").agg({"x": "min"})["x"],
        check_index=False,
        check_names=False,
    )

    # Check "std"
    assert_eq(
        result[["name-string", "name-string_x_std"]]
        .drop_duplicates()
        .sort_values("name-string")["name-string_x_std"],
        df0.groupby("name-string").agg({"x": "std"})["x"],
        check_index=False,
        check_names=False,
    )
Esempio n. 5
0
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):

    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    features = cat_names >> ops.JoinGroupby(
        cont_cols=cont_names,
        stats=["count", "sum", "std", "min"],
        out_path=str(tmpdir))

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    workflow = Workflow(features + cat_names + cont_names + label_name,
                        client=client)
    result = workflow.fit_transform(dataset).to_ddf().compute(
        scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check results.  Need to sort for direct comparison
    expect = df0.sort_values(["label", "x", "y",
                              "id"]).reset_index(drop=True).reset_index()
    got = result.sort_values(["label", "x", "y",
                              "id"]).reset_index(drop=True).reset_index()
    gb_e = expect.groupby("name-cat").aggregate({
        "name-cat": "count",
        "x": ["sum", "min", "std"]
    })
    gb_e.columns = ["count", "sum", "min", "std"]
    df_check = got.merge(gb_e,
                         left_on="name-cat",
                         right_index=True,
                         how="left")
    assert_eq(df_check["name-cat_count"],
              df_check["count"].astype("int64"),
              check_names=False)
    assert_eq(df_check["name-cat_x_sum"], df_check["sum"], check_names=False)
    assert_eq(df_check["name-cat_x_min"], df_check["min"], check_names=False)
    assert_eq(df_check["name-cat_x_std"], df_check["std"], check_names=False)
Esempio n. 6
0
def test_chaining_1():
    df = cudf.DataFrame({
        "cont01": np.random.randint(1, 100, 100),
        "cont02": np.random.random(100) * 100,
        "cat01": np.random.randint(0, 10, 100),
        "label": np.random.randint(0, 3, 100),
    })
    df["cont01"][:10] = None

    cont1 = "cont01" >> ops.FillMissing()
    conts = cont1 + "cont02" >> ops.NormalizeMinMax()
    workflow = Workflow(conts + "cat01" + "label")

    result = workflow.fit_transform(Dataset(df)).to_ddf().compute()

    assert result["cont01"].max() <= 1.0
    assert result["cont02"].max() <= 1.0
Esempio n. 7
0
def test_nested_workflow_node():
    df = dispatch._make_df({
        "geo": ["US>CA", "US>NY", "CA>BC", "CA>ON"],
        "user": ["User_A", "User_A", "User_A", "User_B"],
    })
    dataset = Dataset(df)

    geo_selector = ColumnSelector(["geo"])
    country = (geo_selector >> LambdaOp(lambda col: col.str.slice(0, 2)) >>
               Rename(postfix="_country"))
    # country1 = geo_selector >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country1")
    # country2 = geo_selector >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country2")
    user = "******"
    # user2 = "user2"

    # make sure we can do a 'combo' categorify (cross based) of country+user
    # as well as categorifying the country and user columns on their own
    cats = country + user + [country + user] >> Categorify(encode_type="combo")

    workflow = Workflow(cats)
    workflow.fit_schema(dataset.infer_schema())

    df_out = workflow.fit_transform(dataset).to_ddf().compute(
        scheduler="synchronous")

    geo_country = df_out["geo_country"]
    assert geo_country[0] == geo_country[1]  # rows 0,1 are both 'US'
    assert geo_country[2] == geo_country[3]  # rows 2,3 are both 'CA'

    user = df_out["user"]
    assert user[0] == user[1] == user[2]
    assert user[3] != user[2]

    geo_country_user = df_out["geo_country_user"]
    assert geo_country_user[0] == geo_country_user[1]  # US / userA
    assert geo_country_user[2] != geo_country_user[
        0]  # same user but in canada

    # make sure we get an exception if we nest too deeply (can't handle arbitrarily deep
    # nested column groups - and the exceptions we would get in operators like Categorify
    # are super confusing for users)
    with pytest.raises(ValueError):
        cats = [[country + "user"] + country + "user"
                ] >> Categorify(encode_type="combo")
Esempio n. 8
0
def test_cats_and_groupby_stats(client, tmpdir, datasets, part_mem_fraction, use_client):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]

    cats = ColumnGroup(cat_names)
    cat_features = cats >> ops.Categorify(out_path=str(tmpdir), freq_threshold=10, on_host=True)
    groupby_features = cats >> ops.JoinGroupby(
        cont_names=cont_names, stats=["count", "sum"], out_path=str(tmpdir)
    )

    workflow = Workflow(cat_features + groupby_features, client=client)
    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    result = workflow.fit_transform(dataset).to_ddf().compute()

    assert "name-cat_x_sum" in result.columns
    assert "name-string_x_sum" in result.columns
Esempio n. 9
0
def test_dask_preproc_cpu(client, tmpdir, datasets, engine, shuffle, cpu):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    elif engine == "csv":
        df1 = cudf.read_csv(paths[0], header=0)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=0)[mycols_csv]
    else:
        df1 = cudf.read_csv(paths[0], names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], names=allcols_csv)[mycols_csv]
    df0 = cudf.concat([df1, df2], axis=0)

    if engine in ("parquet", "csv"):
        dataset = Dataset(paths, part_size="1MB", cpu=cpu)
    else:
        dataset = Dataset(paths, names=allcols_csv, part_size="1MB", cpu=cpu)

    # Simple transform (normalize)
    cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]
    conts = cont_names >> ops.FillMissing() >> ops.Normalize()
    workflow = Workflow(conts + cat_names + label_name, client=client)
    transformed = workflow.fit_transform(dataset)

    # Write out dataset
    output_path = os.path.join(tmpdir, "processed")
    transformed.to_parquet(output_path=output_path,
                           shuffle=shuffle,
                           out_files_per_proc=4)

    # Check the final result
    df_disk = dd_read_parquet(output_path, engine="pyarrow").compute()
    assert_eq(
        df0.sort_values(["id", "x"])[["name-string", "label"]],
        df_disk.sort_values(["id", "x"])[["name-string", "label"]],
        check_index=False,
    )
Esempio n. 10
0
def test_column_group_select():
    df = cudf.DataFrame({
        "a": [1, 4, 9, 16, 25],
        "b": [0, 1, 2, 3, 4],
        "c": [25, 16, 9, 4, 1]
    })

    input_features = ColumnGroup(["a", "b", "c"])
    sqrt_features = input_features[["a", "c"]] >> cudf.sqrt
    plus_one_features = input_features["b"] >> (lambda col: col + 1)
    features = sqrt_features + plus_one_features

    workflow = Workflow(features)
    df_out = workflow.fit_transform(
        Dataset(df)).to_ddf().compute(scheduler="synchronous")

    expected = cudf.DataFrame()
    expected["a"] = cudf.sqrt(df["a"])
    expected["c"] = cudf.sqrt(df["c"])
    expected["b"] = df["b"] + 1

    assert_eq(expected, df_out)
Esempio n. 11
0
def test_nested_column_group(tmpdir):
    df = cudf.DataFrame(
        {
            "geo": ["US>CA", "US>NY", "CA>BC", "CA>ON"],
            "user": ["User_A", "User_A", "User_A", "User_B"],
        }
    )

    country = (
        ColumnGroup(["geo"]) >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country")
    )

    # make sure we can do a 'combo' categorify (cross based) of country+user
    # as well as categorifying the country and user columns on their own
    cats = [country + "user"] + country + "user" >> Categorify(encode_type="combo")

    workflow = Workflow(cats)
    df_out = workflow.fit_transform(Dataset(df)).to_ddf().compute(scheduler="synchronous")

    geo_country = df_out["geo_country"]
    assert geo_country[0] == geo_country[1]  # rows 0,1 are both 'US'
    assert geo_country[2] == geo_country[3]  # rows 2,3 are both 'CA'

    user = df_out["user"]
    assert user[0] == user[1] == user[2]
    assert user[3] != user[2]

    geo_country_user = df_out["geo_country_user"]
    assert geo_country_user[0] == geo_country_user[1]  # US / userA
    assert geo_country_user[2] != geo_country_user[0]  # same user but in canada

    # make sure we get an exception if we nest too deeply (can't handle arbitrarily deep
    # nested column groups - and the exceptions we would get in operators like Categorify
    # are super confusing for users)
    with pytest.raises(ValueError):
        cats = [[country + "user"] + country + "user"] >> Categorify(encode_type="combo")
Esempio n. 12
0
def test_dask_workflow_api_dlrm(
    client, tmpdir, datasets, freq_threshold, part_mem_fraction, engine, cat_cache, on_host, shuffle
):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    elif engine == "csv":
        df1 = cudf.read_csv(paths[0], header=0)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=0)[mycols_csv]
    else:
        df1 = cudf.read_csv(paths[0], names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], names=allcols_csv)[mycols_csv]
    df0 = cudf.concat([df1, df2], axis=0)

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    cats = cat_names >> ops.Categorify(
        freq_threshold=freq_threshold, out_path=str(tmpdir), cat_cache=cat_cache, on_host=on_host
    )

    conts = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp()

    workflow = Workflow(cats + conts + label_name, client=client)

    if engine in ("parquet", "csv"):
        dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    else:
        dataset = Dataset(paths, names=allcols_csv, part_mem_fraction=part_mem_fraction)

    output_path = os.path.join(tmpdir, "processed")

    transformed = workflow.fit_transform(dataset)
    transformed.to_parquet(output_path=output_path, shuffle=shuffle)

    # Can still access the final ddf if we didn't shuffle
    if not shuffle:
        result = transformed.to_ddf().compute()
        assert len(df0) == len(result)
        assert result["x"].min() == 0.0
        assert result["x"].isna().sum() == 0
        assert result["y"].min() == 0.0

        assert result["y"].isna().sum() == 0

        # Check category counts
        cat_expect = df0.groupby("name-string").agg({"name-string": "count"}).reset_index(drop=True)
        cat_result = (
            result.groupby("name-string").agg({"name-string": "count"}).reset_index(drop=True)
        )
        if freq_threshold:
            cat_expect = cat_expect[cat_expect["name-string"] >= freq_threshold]
            # Note that we may need to skip the 0th element in result (null mapping)
            assert_eq(
                cat_expect,
                cat_result.iloc[1:] if len(cat_result) > len(cat_expect) else cat_result,
                check_index=False,
            )
        else:
            assert_eq(cat_expect, cat_result)

        # Read back from disk
        df_disk = dask_cudf.read_parquet(output_path, index=False).compute()
        for col in df_disk:
            assert_eq(result[col], df_disk[col])

    else:
        df_disk = dask_cudf.read_parquet(output_path, index=False).compute()
        assert len(df0) == len(df_disk)
Esempio n. 13
0
def test_dask_workflow_api_dlrm(
    client,
    tmpdir,
    datasets,
    freq_threshold,
    part_mem_fraction,
    engine,
    cat_cache,
    on_host,
    shuffle,
    cpu,
):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    paths = sorted(paths)
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    elif engine == "csv":
        df1 = cudf.read_csv(paths[0], header=0)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=0)[mycols_csv]
    else:
        df1 = cudf.read_csv(paths[0], names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], names=allcols_csv)[mycols_csv]
    df0 = cudf.concat([df1, df2], axis=0)
    df0 = df0.to_pandas() if cpu else df0

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    cats = cat_names >> ops.Categorify(freq_threshold=freq_threshold,
                                       out_path=str(tmpdir),
                                       cat_cache=cat_cache,
                                       on_host=on_host)

    conts = cont_names >> ops.FillMissing() >> ops.Clip(
        min_value=0) >> ops.LogOp()

    workflow = Workflow(cats + conts + label_name, client=client)

    if engine in ("parquet", "csv"):
        dataset = Dataset(paths, cpu=cpu, part_mem_fraction=part_mem_fraction)
    else:
        dataset = Dataset(paths,
                          cpu=cpu,
                          names=allcols_csv,
                          part_mem_fraction=part_mem_fraction)

    output_path = os.path.join(tmpdir, "processed")

    transformed = workflow.fit_transform(dataset)
    transformed.to_parquet(output_path=output_path,
                           shuffle=shuffle,
                           out_files_per_proc=1)

    result = transformed.to_ddf().compute()
    assert len(df0) == len(result)
    assert result["x"].min() == 0.0
    assert result["x"].isna().sum() == 0
    assert result["y"].min() == 0.0
    assert result["y"].isna().sum() == 0

    # Check categories.  Need to sort first to make sure we are comparing
    # "apples to apples"
    expect = df0.sort_values(["label", "x", "y",
                              "id"]).reset_index(drop=True).reset_index()
    got = result.sort_values(["label", "x", "y",
                              "id"]).reset_index(drop=True).reset_index()
    dfm = expect.merge(got, on="index",
                       how="inner")[["name-string_x", "name-string_y"]]
    dfm_gb = dfm.groupby(["name-string_x", "name-string_y"]).agg({
        "name-string_x":
        "count",
        "name-string_y":
        "count"
    })
    if freq_threshold:
        dfm_gb = dfm_gb[dfm_gb["name-string_x"] >= freq_threshold]
    assert_eq(dfm_gb["name-string_x"],
              dfm_gb["name-string_y"],
              check_names=False)

    # Read back from disk
    if cpu:
        df_disk = dd_read_parquet(output_path).compute()
    else:
        df_disk = dask_cudf.read_parquet(output_path).compute()

    # we don't have a deterministic ordering here, especially when using
    # a dask client with multiple workers - so we need to sort the values here
    columns = ["label", "x", "y", "id"] + cat_names
    got = result.sort_values(columns).reset_index(drop=True)
    expect = df_disk.sort_values(columns).reset_index(drop=True)
    assert_eq(got, expect)