def test_rename(cpu):
    DataFrame = pd.DataFrame if cpu else cudf.DataFrame
    df = DataFrame({"x": [1, 2, 3, 4, 5], "y": [6, 7, 8, 9, 10]})

    selector = ColumnSelector(["x", "y"])

    op = ops.Rename(f=lambda name: name.upper())
    transformed = op.transform(selector, df)
    expected = DataFrame({"X": [1, 2, 3, 4, 5], "Y": [6, 7, 8, 9, 10]})
    assert_eq(transformed, expected)

    op = ops.Rename(postfix="_lower")
    transformed = op.transform(selector, df)
    expected = DataFrame({"x_lower": [1, 2, 3, 4, 5], "y_lower": [6, 7, 8, 9, 10]})
    assert_eq(transformed, expected)

    selector = ColumnSelector(["x"])

    op = ops.Rename(name="z")
    transformed = op.transform(selector, df)
    expected = DataFrame({"z": [1, 2, 3, 4, 5]})
    assert_eq(transformed, expected)

    op = nvt.ops.Rename(f=lambda name: name.upper())
    transformed = op.transform(selector, df)
    expected = DataFrame({"X": [1, 2, 3, 4, 5]})
    assert_eq(transformed, expected)
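# A minimal sketch (not from the original suite; the data and names here are
# illustrative assumptions) showing Rename applied through a full Workflow
# graph rather than by calling op.transform directly:
def test_rename_workflow_sketch():
    df = nvt.dispatch._make_df({"x": [1, 2], "y": [3, 4]})
    upper = ["x", "y"] >> ops.Rename(f=lambda name: name.upper())
    workflow = nvt.Workflow(upper)
    # Rename computes no statistics, so the workflow can transform without fitting
    transformed = workflow.transform(nvt.Dataset(df)).to_ddf().compute()
    assert set(transformed.columns) == {"X", "Y"}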
def test_workflow_move_saved(tmpdir):
    raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split()
    data = cudf.DataFrame({"geo": raw})

    geo_location = ColumnGroup(["geo"])
    state = geo_location >> (lambda col: col.str.slice(0, 5)) >> ops.Rename(postfix="_state")
    country = geo_location >> (lambda col: col.str.slice(0, 2)) >> ops.Rename(postfix="_country")
    geo_features = state + country + geo_location >> ops.Categorify()

    # create the workflow and transform the input
    workflow = Workflow(geo_features)
    expected = workflow.fit_transform(Dataset(data)).to_ddf().compute()

    # save the workflow (including the categorical mapping parquet files),
    # then verify that we can load the saved workflow after moving the directory
    out_path = os.path.join(tmpdir, "output", "workflow")
    workflow.save(out_path)

    moved_path = os.path.join(tmpdir, "output", "workflow2")
    shutil.move(out_path, moved_path)
    workflow2 = Workflow.load(moved_path)

    # also check that transforming the input with the loaded workflow
    # gives the same results
    transformed = workflow2.transform(Dataset(data)).to_ddf().compute()
    assert_eq(expected, transformed)
def test_fit_schema_works_when_subtracting_missing_nodes():
    schema = Schema(["x", "y", "id", "baseball"])

    cont_features = (
        ColumnSelector(["x", "y"])
        >> ops.FillMissing()
        >> ops.Clip(min_value=0)
        >> ops.LogOp
        >> ops.Normalize()
        >> ops.Rename(postfix="_renamed")
    )
    subtract_features = ["y", "baseball"] >> ops.Rename(postfix="_renamed")

    workflow1 = Workflow(cont_features - subtract_features)
    workflow1.fit_schema(schema)

    assert workflow1.output_schema.column_names == ["x_renamed"]
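# The subtraction above removes "y_renamed" from cont_features' output, while
# "baseball_renamed" never appears in that output at all; the point of the
# test is that subtracting a node whose columns are missing does not raise,
# leaving "x_renamed" as the only output.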
def test_chaining_3():
    gdf_test = cudf.DataFrame(
        {
            "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
            "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
            "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
        }
    )

    platform_features = ["platform"] >> ops.Dropna()
    joined = ["ad_id"] >> ops.JoinGroupby(cont_cols=["clicked"], stats=["sum", "count"])
    joined_lambda = (
        joined
        >> ops.LambdaOp(f=lambda col, gdf: col / gdf["ad_id_count"])
        >> ops.Rename(postfix="_ctr")
    )

    workflow = Workflow(platform_features + joined + joined_lambda)

    dataset = nvt.Dataset(gdf_test, engine="parquet")
    workflow.fit(dataset)

    result = workflow.transform(dataset).to_ddf().compute()

    assert all(
        x in result.columns for x in ["ad_id_count", "ad_id_clicked_sum_ctr", "ad_id_clicked_sum"]
    )
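# Note on the LambdaOp in test_chaining_3: the op dispatches on the arity of
# `f`, so a two-argument lambda receives both the selected column and the full
# DataFrame. That is what lets each grouped statistic be divided by the
# "ad_id_count" column that JoinGroupby produced.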
def test_chaining_2():
    gdf = cudf.DataFrame(
        {
            "A": [1, 2, 2, 9, 6, np.nan, 3],
            "B": [2, np.nan, 4, 7, 7, 2, 5],
            "C": ["a", "b", "c", np.nan, np.nan, "g", "k"],
        }
    )
    cat_names = ["C"]
    cont_names = ["A", "B"]
    label_name = []

    all_features = (
        cat_names + cont_names
        >> ops.LambdaOp(f=lambda col: col.isnull())
        >> ops.Rename(postfix="_isnull")
    )
    cat_features = cat_names >> ops.Categorify()

    workflow = Workflow(all_features + cat_features + label_name)

    dataset = nvt.Dataset(gdf, engine="parquet")
    workflow.fit(dataset)

    result = workflow.transform(dataset).to_ddf().compute()

    assert all(x in list(result.columns) for x in ["A_isnull", "B_isnull", "C_isnull"])
    # the original check asserted a bare generator expression, which is always
    # truthy; check the encoded cardinality instead: each distinct non-null
    # value of "C" gets its own code, plus one shared code for nulls
    assert result["C"].nunique() == gdf["C"].dropna().nunique() + 1
def test_workflow_generate_columns(tmpdir, use_parquet):
    out_path = str(tmpdir.mkdir("processed"))
    path = str(tmpdir.join("simple.parquet"))

    # stripped-down dataset with geo_location codes like those in the Outbrain dataset
    df = nvt.dispatch._make_df({"geo_location": ["US>CA", "CA>BC", "US>TN>659"]})

    # define a simple workflow that strips the country code from the first two
    # characters of the geo_location code and puts it in a new
    # "geo_location_country" field
    country = (
        ["geo_location"]
        >> ops.LambdaOp(f=lambda col: col.str.slice(0, 2))
        >> ops.Rename(postfix="_country")
    )
    cat_features = ["geo_location"] + country >> ops.Categorify()

    workflow = Workflow(cat_features)

    if use_parquet:
        df.to_parquet(path)
        dataset = nvt.Dataset(path)
    else:
        dataset = nvt.Dataset(df)

    # just make sure this works without errors
    workflow.fit(dataset)
    workflow.transform(dataset).to_parquet(out_path)
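# With the input above, the LambdaOp yields "US", "CA", "US", which Rename
# exposes as the new "geo_location_country" column; Categorify then encodes
# both geo_location and geo_location_country.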
def test_fit_schema_works_with_addition_nodes():
    schema = Schema(["x", "y", "id"])

    x_node = ColumnSelector(["x"]) >> ops.Rename(postfix="_renamed")

    workflow = Workflow(x_node + "y")
    workflow.fit_schema(schema)

    assert workflow.output_schema.column_names == ["x_renamed", "y"]

    x_node = ColumnSelector(["x"]) >> ops.Rename(postfix="_renamed")
    y_node = ColumnSelector(["y"]) >> ops.Rename(postfix="_renamed")

    workflow = Workflow(x_node + y_node)
    workflow.fit_schema(schema)

    assert workflow.output_schema.column_names == ["x_renamed", "y_renamed"]
def test_fit_schema_works_with_selection_nodes():
    schema = Schema(["x", "y", "id"])

    cont_features = (
        ColumnSelector(["x", "y"])
        >> ops.FillMissing()
        >> ops.Clip(min_value=0)
        >> ops.LogOp
        >> ops.Normalize()
        >> ops.Rename(postfix="_renamed")
    )

    workflow = Workflow(cont_features["x_renamed"])
    workflow.fit_schema(schema)

    assert workflow.output_schema.column_names == ["x_renamed"]
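# Indexing a node by an output column name, as in cont_features["x_renamed"]
# above, selects just the sub-graph that produces that column, so only
# "x_renamed" survives in the workflow's output schema.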
def test_categorify_embedding_sizes(dataset, engine):
    cat_1 = ColumnSelector(["name-cat"]) >> ops.Categorify()
    cat_2 = ColumnSelector(["name-string"]) >> ops.Categorify() >> ops.Rename(postfix="_test")

    workflow = nvt.Workflow(cat_1 + cat_2)
    workflow.fit_transform(dataset)

    assert get_embedding_sizes(workflow) == {
        "name-cat": (27, 16),
        "name-string_test": (27, 16),
    }
def test_fit_schema_works_with_node_dependencies():
    schema = Schema(["x", "y", "cost"])

    cont_features = ColumnSelector(["cost"]) >> ops.Rename(postfix="_renamed")
    cat_features = ColumnSelector(["x", "y"]) >> ops.TargetEncoding(cont_features)

    workflow1 = Workflow(cat_features)
    workflow1.fit_schema(schema)

    assert workflow1.output_schema.column_names == ["TE_x_cost_renamed", "TE_y_cost_renamed"]
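# TargetEncoding names its outputs "TE_<column>_<target>"; because the
# dependency node renames the continuous target "cost" to "cost_renamed",
# the resulting columns are "TE_x_cost_renamed" and "TE_y_cost_renamed".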
def test_transform_geolocation():
    raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split()
    data = cudf.DataFrame({"geo_location": raw})

    geo_location = ColumnGroup(["geo_location"])
    state = geo_location >> (lambda col: col.str.slice(0, 5)) >> ops.Rename(postfix="_state")
    country = geo_location >> (lambda col: col.str.slice(0, 2)) >> ops.Rename(postfix="_country")
    geo_features = state + country + geo_location >> ops.HashBucket(num_buckets=100)

    # this workflow doesn't have any stat operators, so we can get away without fitting
    workflow = Workflow(geo_features)
    transformed = workflow.transform(Dataset(data)).to_ddf().compute()

    expected = cudf.DataFrame()
    expected["geo_location_state"] = data["geo_location"].str.slice(0, 5).hash_values() % 100
    expected["geo_location_country"] = data["geo_location"].str.slice(0, 2).hash_values() % 100
    expected["geo_location"] = data["geo_location"].hash_values() % 100
    assert_eq(expected, transformed)
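# Note on the graph in test_transform_geolocation: Python's `+` binds tighter
# than `>>`, so `state + country + geo_location >> ops.HashBucket(...)` groups
# as `(state + country + geo_location) >> ops.HashBucket(...)`, hash-bucketing
# all three column groups, which is exactly what the expected values computed
# in the test assume.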
def test_gpu_workflow_config(tmpdir, client, df, dataset, gpu_memory_frac, engine, dump, replace):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    cat_features = cat_names >> ops.Categorify()
    if replace:
        cont_features = cont_names >> ops.FillMissing() >> ops.LogOp >> norms
    else:
        fillmissing_logop = (
            cont_names
            >> ops.FillMissing()
            >> ops.LogOp
            >> ops.Rename(postfix="_FillMissing_1_LogOp_1")
        )
        cont_features = cont_names + fillmissing_logop >> norms

    workflow = Workflow(cat_features + cont_features + label_name, client=client)

    workflow.fit(dataset)

    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        workflow = None

        workflow = Workflow.load(workflow_dir, client=client)

    def get_norms(tar: cudf.Series):
        ser_median = tar.dropna().quantile(0.5, interpolation="linear")
        gdf = tar.fillna(ser_median)
        gdf = np.log(gdf + 1)
        return gdf

    # check means and stds; get_norms only reproduces the FillMissing and LogOp
    # steps (not Clip etc.), so compare with a loose tolerance
    concat_ops = "_FillMissing_1_LogOp_1"
    if replace:
        concat_ops = ""
    assert math.isclose(get_norms(df.x).mean(), norms.means["x" + concat_ops], rel_tol=1e-1)
    assert math.isclose(get_norms(df.y).mean(), norms.means["y" + concat_ops], rel_tol=1e-1)
    assert math.isclose(get_norms(df.x).std(), norms.stds["x" + concat_ops], rel_tol=1e-1)
    assert math.isclose(get_norms(df.y).std(), norms.stds["y" + concat_ops], rel_tol=1e-1)

    # check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(workflow, "name-cat")
        # adding the None entry as a string because of the move from gpu
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(workflow, "name-string")
    # adding the None entry as a string because of the move from gpu
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # write to new "shuffled" and "processed" dataset
    workflow.transform(dataset).to_parquet(
        tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
    )

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac)

    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
def test_lambdaop(tmpdir, df, dataset, gpu_memory_frac, engine):
    df_copy = df.copy()

    # Substring
    # Replacement
    substring = ColumnGroup(["name-cat", "name-string"]) >> (lambda col: col.str.slice(1, 3))
    processor = nvtabular.Workflow(substring)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert_eq_dd(new_gdf["name-cat"], df_copy["name-cat"].str.slice(1, 3), check_index=False)
    assert_eq_dd(new_gdf["name-string"], df_copy["name-string"].str.slice(1, 3), check_index=False)

    # No Replacement from old API (skipped for other examples)
    substring = (
        ColumnGroup(["name-cat", "name-string"])
        >> (lambda col: col.str.slice(1, 3))
        >> ops.Rename(postfix="_slice")
    )
    processor = nvtabular.Workflow(["name-cat", "name-string"] + substring)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert_eq_dd(
        new_gdf["name-cat_slice"],
        df_copy["name-cat"].str.slice(1, 3),
        check_index=False,
        check_names=False,
    )
    assert_eq_dd(
        new_gdf["name-string_slice"],
        df_copy["name-string"].str.slice(1, 3),
        check_index=False,
        check_names=False,
    )
    assert_eq_dd(new_gdf["name-cat"], df_copy["name-cat"], check_index=False)
    assert_eq_dd(new_gdf["name-string"], df_copy["name-string"], check_index=False)

    # Replace
    # Replacement
    oplambda = ColumnGroup(["name-cat", "name-string"]) >> (lambda col: col.str.replace("e", "XX"))
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert_eq_dd(
        new_gdf["name-cat"], df_copy["name-cat"].str.replace("e", "XX"), check_index=False
    )
    assert_eq_dd(
        new_gdf["name-string"], df_copy["name-string"].str.replace("e", "XX"), check_index=False
    )

    # astype
    # Replacement
    oplambda = ColumnGroup(["id"]) >> (lambda col: col.astype(float))
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()
    assert new_gdf["id"].dtype == "float64"

    # Workflow
    # Replacement
    oplambda = (
        ColumnGroup(["name-cat"])
        >> (lambda col: col.astype(str).str.slice(0, 1))
        >> ops.Categorify()
    )
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()
    assert is_integer_dtype(new_gdf["name-cat"].dtype)

    oplambda = (
        ColumnGroup(["name-cat", "name-string"]) >> ops.Categorify() >> (lambda col: col + 100)
    )
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert is_integer_dtype(new_gdf["name-cat"].dtype)
    assert np.sum(new_gdf["name-cat"] < 100) == 0
# initial column selector works with tags
# filter within the workflow by tags
# test tags correct at output
@pytest.mark.parametrize(
    "op",
    [
        ops.Bucketize([1]),
        ops.Rename(postfix="_trim"),
        ops.Categorify(),
        ops.Categorify(encode_type="combo"),
        ops.Clip(0),
        ops.DifferenceLag("col1"),
        ops.FillMissing(),
        ops.Groupby("col1"),
        ops.HashBucket(1),
        ops.HashedCross(1),
        ops.JoinGroupby("col1"),
        ops.ListSlice(0),
        ops.LogOp(),
        ops.Normalize(),
        ops.TargetEncoding("col1"),
    ],
)
import numpy as np
import pytest

import nvtabular as nvt
from nvtabular import ColumnSchema, ColumnSelector, Schema, dispatch, ops
from nvtabular.dispatch import HAS_GPU


@pytest.mark.parametrize("properties", [{}, {"p1": "1"}])
@pytest.mark.parametrize("tags", [[], ["TAG1", "TAG2"]])
@pytest.mark.parametrize(
    "op",
    [
        ops.Bucketize([1]),
        ops.Rename(postfix="_trim"),
        ops.Categorify(),
        ops.Categorify(encode_type="combo"),
        ops.Clip(0),
        ops.DifferenceLag("1"),
        ops.FillMissing(),
        ops.Groupby(["1"]),
        ops.HashBucket(1),
        ops.HashedCross(1),
        ops.JoinGroupby(["1"]),
        ops.ListSlice(0),
        ops.LogOp(),
        ops.Normalize(),
        ops.TargetEncoding(["1"]),
        ops.AddMetadata(tags=["excellent"], properties={"domain": {"min": 0, "max": 20}}),
        ops.ValueCount(),
    ],