def test_chaining_2():
    gdf = cudf.DataFrame(
        {
            "A": [1, 2, 2, 9, 6, np.nan, 3],
            "B": [2, np.nan, 4, 7, 7, 2, 5],
            "C": ["a", "b", "c", np.nan, np.nan, "g", "k"],
        }
    )
    cat_names = ["C"]
    cont_names = ["A", "B"]
    label_name = []

    all_features = (
        cat_names + cont_names
        >> ops.LambdaOp(f=lambda col: col.isnull())
        >> ops.Rename(postfix="_isnull")
    )
    cat_features = cat_names >> ops.Categorify()

    workflow = Workflow(all_features + cat_features + label_name)

    dataset = nvt.Dataset(gdf, engine="parquet")
    workflow.fit(dataset)
    result = workflow.transform(dataset).to_ddf().compute()

    assert all(x in list(result.columns) for x in ["A_isnull", "B_isnull", "C_isnull"])
    assert (x in result["C"].unique() for x in set(gdf["C"].dropna().to_arrow()))


def test_chaining_3():
    gdf_test = cudf.DataFrame(
        {
            "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
            "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
            "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
        }
    )

    platform_features = ["platform"] >> ops.Dropna()
    joined = ["ad_id"] >> ops.JoinGroupby(cont_cols=["clicked"], stats=["sum", "count"])
    joined_lambda = (
        joined
        >> ops.LambdaOp(f=lambda col, gdf: col / gdf["ad_id_count"])
        >> ops.Rename(postfix="_ctr")
    )

    workflow = Workflow(platform_features + joined + joined_lambda)

    dataset = nvt.Dataset(gdf_test, engine="parquet")
    workflow.fit(dataset)
    result = workflow.transform(dataset).to_ddf().compute()

    assert all(
        x in result.columns
        for x in ["ad_id_count", "ad_id_clicked_sum_ctr", "ad_id_clicked_sum"]
    )


def test_workflow_node_select():
    df = dispatch._make_df(
        {"a": [1, 4, 9, 16, 25], "b": [0, 1, 2, 3, 4], "c": [25, 16, 9, 4, 1]}
    )
    dataset = Dataset(df)

    input_features = WorkflowNode(ColumnSelector(["a", "b", "c"]))
    # pylint: disable=unnecessary-lambda
    sqrt_features = input_features[["a", "c"]] >> (lambda col: np.sqrt(col))
    plus_one_features = input_features["b"] >> (lambda col: col + 1)
    features = sqrt_features + plus_one_features

    workflow = Workflow(features)
    workflow.fit(dataset)

    df_out = workflow.transform(dataset).to_ddf().compute(scheduler="synchronous")

    expected = dispatch._make_df()
    expected["a"] = np.sqrt(df["a"])
    expected["c"] = np.sqrt(df["c"])
    expected["b"] = df["b"] + 1

    assert_eq(expected, df_out)


def test_dask_normalize(client, tmpdir, datasets, engine):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    normalize = ops.Normalize()
    conts = cont_names >> ops.FillMissing() >> normalize
    workflow = Workflow(conts + cat_names + label_name, client=client)

    dataset = Dataset(paths, engine)
    result = workflow.fit_transform(dataset).to_ddf().compute()

    # Make sure we collected accurate statistics
    means = df0[cont_names].mean()
    stds = df0[cont_names].std()
    for name in cont_names:
        assert math.isclose(means[name], normalize.means[name], rel_tol=1e-3)
        assert math.isclose(stds[name], normalize.stds[name], rel_tol=1e-3)

    # New (normalized) means should all be close to zero
    new_means = result[cont_names].mean()
    for name in cont_names:
        assert new_means[name] < 1e-3


def test_workflow_node_subtraction():
    schema = Schema(["a", "b", "c", "d", "e", "f"])

    node1 = ["a", "b", "c", "d"] >> Operator()
    node2 = ["c", "d"] >> Operator()
    node3 = ["b"] >> Operator()

    output_node = node1 - ["c", "d"]
    workflow = Workflow(output_node).fit_schema(schema)
    assert len(output_node.parents) == 1
    assert len(output_node.dependencies) == 0
    assert workflow.output_node.output_columns.names == ["a", "b"]

    output_node = node1 - node2
    workflow = Workflow(output_node).fit_schema(schema)
    assert len(output_node.parents) == 1
    assert len(output_node.dependencies) == 1
    assert workflow.output_node.output_columns.names == ["a", "b"]

    output_node = ["a", "b", "c", "d"] - node2
    workflow = Workflow(output_node).fit_schema(schema)
    assert len(output_node.parents) == 1
    assert len(output_node.dependencies) == 1
    assert workflow.output_node.output_columns.names == ["a", "b"]

    output_node = node1 - ["c", "d"] - node3
    workflow = Workflow(output_node).fit_schema(schema)
    assert len(output_node.parents) == 1
    assert len(output_node.dependencies) == 1
    assert workflow.output_node.output_columns.names == ["a"]


def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    features = cat_names >> ops.JoinGroupby(
        cont_names=cont_names, stats=["count", "sum", "std", "min"], out_path=str(tmpdir)
    )

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    workflow = Workflow(features + cat_names + cont_names + label_name, client=client)
    result = workflow.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check "count"
    assert_eq(
        result[["name-cat", "name-cat_count"]]
        .drop_duplicates()
        .sort_values("name-cat")["name-cat_count"],
        df0.groupby("name-cat").agg({"x": "count"})["x"].astype(np.int64),
        check_index=False,
        check_dtype=False,  # May get int64 vs int32
        check_names=False,
    )

    # Check "min"
    assert_eq(
        result[["name-string", "name-string_x_min"]]
        .drop_duplicates()
        .sort_values("name-string")["name-string_x_min"],
        df0.groupby("name-string").agg({"x": "min"})["x"],
        check_index=False,
        check_names=False,
    )

    # Check "std"
    assert_eq(
        result[["name-string", "name-string_x_std"]]
        .drop_duplicates()
        .sort_values("name-string")["name-string_x_std"],
        df0.groupby("name-string").agg({"x": "std"})["x"],
        check_index=False,
        check_names=False,
    )


def test_fit_schema_works_with_raw_column_dependencies():
    schema = Schema(["x", "y", "cost"])

    cat_features = ColumnSelector(["x", "y"]) >> ops.TargetEncoding("cost")

    workflow = Workflow(cat_features)
    workflow.fit_schema(schema)

    assert workflow.output_schema.column_names == ["TE_x_cost", "TE_y_cost"]


def test_dask_normalize(client, tmpdir, datasets, engine):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(ops.Normalize())
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    # Make sure we collected accurate statistics
    means = df0[cont_names].mean()
    stds = df0[cont_names].std()
    counts = df0[cont_names].count()
    for name in cont_names:
        assert math.isclose(means[name], processor.stats["means"][name], rel_tol=1e-3)
        assert math.isclose(stds[name], processor.stats["stds"][name], rel_tol=1e-3)
        assert math.isclose(counts[name], processor.stats["counts"][name], rel_tol=1e-3)

    # New (normalized) means should all be close to zero
    new_means = result[cont_names].mean()
    for name in cont_names:
        assert new_means[name] < 1e-3


def test_dask_median_dummyop(client, tmpdir, datasets, engine):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    class DummyOp(ops.DFOperator):
        default_in, default_out = "continuous", "continuous"

        @property
        def req_stats(self):
            return [ops.Median()]

        def op_logic(self, *args, **kwargs):
            return _dummy_op_logic(*args, _id=self._id, **kwargs)

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(DummyOp())
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    # TODO: Improve the accuracy! "t-digest" via crick could help,
    # but the current version seems to have cupy/numpy problems here
    medians = result[cont_names].quantile(q=0.5)
    assert math.isclose(medians["x"], processor.stats["medians"]["x"], abs_tol=1e-1)
    assert math.isclose(medians["y"], processor.stats["medians"]["y"], abs_tol=1e-1)
    assert math.isclose(medians["id"], processor.stats["medians"]["id"], rel_tol=1e-2)


def test_dask_minmax_dummyop(client, tmpdir, datasets, engine):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    class DummyOp(ops.DFOperator):
        default_in, default_out = "continuous", "continuous"

        @property
        def req_stats(self):
            return [ops.MinMax()]

        def op_logic(self, *args, **kwargs):
            return _dummy_op_logic(*args, _id=self._id, **kwargs)

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(DummyOp())
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    assert math.isclose(result.x.min(), processor.stats["mins"]["x"], rel_tol=1e-3)
    assert math.isclose(result.y.min(), processor.stats["mins"]["y"], rel_tol=1e-3)
    assert math.isclose(result.id.min(), processor.stats["mins"]["id"], rel_tol=1e-3)
    assert math.isclose(result.x.max(), processor.stats["maxs"]["x"], rel_tol=1e-3)
    assert math.isclose(result.y.max(), processor.stats["maxs"]["y"], rel_tol=1e-3)
    assert math.isclose(result.id.max(), processor.stats["maxs"]["id"], rel_tol=1e-3)


def test_fit_schema_works_with_grouped_node_inputs():
    schema = Schema(["x", "y", "cost"])

    cat_features = ColumnSelector(["x", "y", ("x", "y")]) >> ops.TargetEncoding("cost")

    workflow1 = Workflow(cat_features)
    workflow1.fit_schema(schema)

    assert sorted(workflow1.output_schema.column_names) == sorted(
        ["TE_x_cost", "TE_y_cost", "TE_x_y_cost"]
    )


def test_addition_nodes_are_combined():
    schema = Schema(["a", "b", "c", "d", "e", "f", "g", "h"])

    node1 = ["a", "b"] >> Operator()
    node2 = ["c", "d"] >> Operator()
    node3 = ["e", "f"] >> Operator()
    node4 = ["g", "h"] >> Operator()

    add_node = node1 + node2 + node3
    workflow = Workflow(add_node).fit_schema(schema)
    assert set(workflow.output_node.parents) == {node1}
    assert set(workflow.output_node.dependencies) == {node2, node3}
    assert set(workflow.output_node.output_columns.names) == {"a", "b", "c", "d", "e", "f"}

    add_node = node1 + "c" + "d"
    workflow = Workflow(add_node).fit_schema(schema)
    assert set(workflow.output_node.parents) == {node1}
    assert set(workflow.output_node.output_columns.names) == {"a", "b", "c", "d"}

    add_node = "c" + node1 + "d"
    workflow = Workflow(add_node).fit_schema(schema)
    assert set(workflow.output_node.parents) == {node1}
    assert set(workflow.output_node.output_columns.names) == {"a", "b", "c", "d"}

    add_node = node1 + "e" + node2
    workflow = Workflow(add_node).fit_schema(schema)
    assert set(workflow.output_node.parents) == {node1}
    assert node2 in workflow.output_node.dependencies
    assert set(workflow.output_node.output_columns.names) == {"a", "b", "e", "c", "d"}

    add_node1 = node1 + node2
    add_node2 = node3 + node4
    add_node = add_node1 + add_node2
    workflow = Workflow(add_node).fit_schema(schema)
    assert set(workflow.output_node.parents) == {node1}
    assert set(workflow.output_node.dependencies) == {node2, node3, node4}
    assert set(workflow.output_node.output_columns.names) == {
        "a", "b", "c", "d", "e", "f", "g", "h",
    }


def test_fit_schema_works_when_subtracting_column_names():
    schema = Schema(["x", "y", "id"])

    cont_features = (
        ColumnSelector(["x", "y"])
        >> ops.FillMissing()
        >> ops.Clip(min_value=0)
        >> ops.LogOp
        >> ops.Normalize()
        >> ops.Rename(postfix="_renamed")
    )

    workflow1 = Workflow(cont_features - "y_renamed")
    workflow1.fit_schema(schema)

    assert workflow1.output_schema.column_names == ["x_renamed"]


def test_filtered_partition(tmpdir, cpu):
    # Toy DataFrame example
    df = pd.DataFrame({"col": range(100)})
    ddf = dd_from_pandas(df, npartitions=5)
    dataset = Dataset(ddf, cpu=cpu)

    # Workflow
    filtered = ["col"] >> ops.Filter(lambda df: df["col"] < 75)
    workflow = Workflow(filtered)

    # Write result to disk
    workflow.transform(dataset).to_parquet(str(tmpdir))


def test_workflow_select_by_tags(op):
    schema1 = ColumnSchema("col1", tags=["b", "c", "d"])
    schema2 = ColumnSchema("col2", tags=["c", "d"])
    schema3 = ColumnSchema("col3", tags=["d"])
    schema = Schema([schema1, schema2, schema3])

    cont_features = ColumnSelector(tags=["c"]) >> op
    workflow = Workflow(cont_features)
    workflow.fit_schema(schema)

    output_cols = op.output_column_names(ColumnSelector(["col1", "col2"]))
    assert len(workflow.output_schema.column_names) == len(output_cols.names)


def test_fit_schema_works_with_node_dependencies():
    schema = Schema(["x", "y", "cost"])

    cont_features = ColumnSelector(["cost"]) >> ops.Rename(postfix="_renamed")
    cat_features = ColumnSelector(["x", "y"]) >> ops.TargetEncoding(cont_features)

    workflow1 = Workflow(cat_features)
    workflow1.fit_schema(schema)

    assert workflow1.output_schema.column_names == ["TE_x_cost_renamed", "TE_y_cost_renamed"]


def test_fit_schema():
    schema = Schema(["x", "y", "id"])

    cont_features = (
        ColumnSelector(schema.column_names)
        >> ops.FillMissing()
        >> ops.Clip(min_value=0)
        >> ops.LogOp
        >> ops.Normalize()
        >> ops.Rename(postfix="_renamed")
    )

    workflow = Workflow(cont_features)
    workflow.fit_schema(schema)

    assert workflow.output_schema.column_names == ["x_renamed", "y_renamed", "id_renamed"]


def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    features = cat_names >> ops.JoinGroupby(
        cont_cols=cont_names, stats=["count", "sum", "std", "min"], out_path=str(tmpdir)
    )

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    workflow = Workflow(features + cat_names + cont_names + label_name, client=client)
    result = workflow.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check results.  Need to sort for direct comparison
    expect = df0.sort_values(["label", "x", "y", "id"]).reset_index(drop=True).reset_index()
    got = result.sort_values(["label", "x", "y", "id"]).reset_index(drop=True).reset_index()
    gb_e = expect.groupby("name-cat").aggregate(
        {"name-cat": "count", "x": ["sum", "min", "std"]}
    )
    gb_e.columns = ["count", "sum", "min", "std"]
    df_check = got.merge(gb_e, left_on="name-cat", right_index=True, how="left")
    assert_eq(df_check["name-cat_count"], df_check["count"].astype("int64"), check_names=False)
    assert_eq(df_check["name-cat_x_sum"], df_check["sum"], check_names=False)
    assert_eq(df_check["name-cat_x_min"], df_check["min"], check_names=False)
    assert_eq(df_check["name-cat_x_std"], df_check["std"], check_names=False)


def test_workflow_input_output_dtypes():
    df = cudf.DataFrame(
        {"genre": ["drama", "comedy"], "user": ["a", "b"], "unneeded": [1, 2]}
    )
    features = [["genre", "user"], "genre"] >> ops.Categorify(encode_type="combo")
    workflow = Workflow(features)
    workflow.fit(Dataset(df))

    assert "unneeded" not in workflow.input_dtypes
    assert set(workflow.input_dtypes.keys()) == {"genre", "user"}
    assert set(workflow.output_dtypes.keys()) == {"genre_user", "genre"}


def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(
        ops.GroupBy(cont_names=cont_names, stats=["count", "sum", "std"], out_path=str(tmpdir))
    )
    processor.finalize()

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    processor.apply(dataset)
    result = processor.get_ddf().compute(scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check "count"
    assert_eq(
        result[["name-cat", "name-cat_count"]]
        .drop_duplicates()
        .sort_values("name-cat")["name-cat_count"],
        df0.groupby("name-cat").agg({"x": "count"})["x"],
        check_index=False,
        check_dtype=False,  # May get int64 vs int32
        check_names=False,
    )

    # Check "std"
    assert_eq(
        result[["name-string", "name-string_x_std"]]
        .drop_duplicates()
        .sort_values("name-string")["name-string_x_std"],
        df0.groupby("name-string").agg({"x": "std"})["x"],
        check_index=False,
        check_names=False,
    )


def test_fit_simple():
    data = cudf.DataFrame(
        {"x": [0, 1, 2, None, 0, 1, 2], "y": [None, 3, 4, 5, 3, 4, 5]}
    )
    dataset = Dataset(data)

    workflow = Workflow(["x", "y"] >> ops.FillMedian() >> (lambda x: x * x))
    workflow.fit(dataset)
    transformed = workflow.transform(dataset).to_ddf().compute()

    expected = cudf.DataFrame(
        {"x": [0, 1, 4, 1, 0, 1, 4], "y": [16, 9, 16, 25, 9, 16, 25]}
    )
    assert_eq(expected, transformed)


def test_chaining_1():
    df = cudf.DataFrame(
        {
            "cont01": np.random.randint(1, 100, 100),
            "cont02": np.random.random(100) * 100,
            "cat01": np.random.randint(0, 10, 100),
            "label": np.random.randint(0, 3, 100),
        }
    )
    df["cont01"][:10] = None

    cont1 = "cont01" >> ops.FillMissing()
    conts = cont1 + "cont02" >> ops.NormalizeMinMax()
    workflow = Workflow(conts + "cat01" + "label")

    result = workflow.fit_transform(Dataset(df)).to_ddf().compute()

    assert result["cont01"].max() <= 1.0
    assert result["cont02"].max() <= 1.0


def test_spec_set(tmpdir, client):
    gdf_test = cudf.DataFrame(
        {
            "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
            "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
            "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "cont": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
        }
    )

    cats = ColumnGroup(["ad_id", "source_id", "platform"])
    cat_features = cats >> ops.Categorify
    cont_features = ColumnGroup(["cont"]) >> ops.FillMissing >> ops.Normalize
    te_features = cats >> ops.TargetEncoding("clicked", kfold=5, fold_seed=42, p_smooth=20)

    p = Workflow(cat_features + cont_features + te_features, client=client)
    p.fit_transform(nvt.Dataset(gdf_test)).to_ddf().compute()


def test_gpu_workflow(tmpdir, df, dataset, gpu_memory_frac, engine, dump):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    conts = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> norms
    cats = cat_names >> ops.Categorify()
    workflow = nvt.Workflow(conts + cats + label_name)

    workflow.fit(dataset)
    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        workflow = None
        workflow = Workflow.load(workflow_dir)

    def get_norms(tar: cudf.Series):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        return gdf

    assert math.isclose(get_norms(df.x).mean(), norms.means["x"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.y).mean(), norms.means["y"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.x).std(), norms.stds["x"], rel_tol=1e-3)
    assert math.isclose(get_norms(df.y).std(), norms.stds["y"], rel_tol=1e-3)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(workflow, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(workflow, "name-string")
    # adding the None entry as a string because of move from gpu
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    workflow.transform(dataset).to_parquet(
        output_path=tmpdir, out_files_per_proc=10, shuffle=nvt.io.Shuffle.PER_PARTITION
    )

    dataset_2 = Dataset(
        glob.glob(str(tmpdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac
    )
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)


def test_grab_additional_input_columns(dataset, engine):
    schema = Schema(["x", "y"])
    node1 = ["x"] >> ops.FillMissing()
    node2 = node1 >> ops.Clip(min_value=0)

    add_node = node2 + ["y"]

    workflow = Workflow(add_node).fit_schema(schema)
    output_df = workflow.transform(dataset).to_ddf().compute()

    assert len(workflow.output_node.input_columns.names) == 2
    assert workflow.output_node.input_columns.names == ["x", "y"]

    assert len(workflow.output_node.output_columns.names) == 2
    assert workflow.output_node.output_columns.names == ["x", "y"]

    assert len(output_df.columns) == 2
    assert output_df.columns.tolist() == ["x", "y"]


def test_nested_workflow_node():
    df = dispatch._make_df(
        {
            "geo": ["US>CA", "US>NY", "CA>BC", "CA>ON"],
            "user": ["User_A", "User_A", "User_A", "User_B"],
        }
    )
    dataset = Dataset(df)

    geo_selector = ColumnSelector(["geo"])
    country = (
        geo_selector >> LambdaOp(lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country")
    )
    # country1 = geo_selector >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country1")
    # country2 = geo_selector >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country2")
    user = "user"
    # user2 = "user2"

    # make sure we can do a 'combo' categorify (cross based) of country+user
    # as well as categorifying the country and user columns on their own
    cats = country + user + [country + user] >> Categorify(encode_type="combo")

    workflow = Workflow(cats)
    workflow.fit_schema(dataset.infer_schema())

    df_out = workflow.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    geo_country = df_out["geo_country"]
    assert geo_country[0] == geo_country[1]  # rows 0,1 are both 'US'
    assert geo_country[2] == geo_country[3]  # rows 2,3 are both 'CA'

    user = df_out["user"]
    assert user[0] == user[1] == user[2]
    assert user[3] != user[2]

    geo_country_user = df_out["geo_country_user"]
    assert geo_country_user[0] == geo_country_user[1]  # US / User_A
    assert geo_country_user[2] != geo_country_user[0]  # same user, but in Canada

    # make sure we get an exception if we nest too deeply (can't handle arbitrarily deep
    # nested column groups - and the exceptions we would get in operators like Categorify
    # are super confusing for users)
    with pytest.raises(ValueError):
        cats = [[country + "user"] + country + "user"] >> Categorify(encode_type="combo")


def test_cats_and_groupby_stats(client, tmpdir, datasets, part_mem_fraction, use_client):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]

    cats = ColumnGroup(cat_names)
    cat_features = cats >> ops.Categorify(out_path=str(tmpdir), freq_threshold=10, on_host=True)
    groupby_features = cats >> ops.JoinGroupby(
        cont_names=cont_names, stats=["count", "sum"], out_path=str(tmpdir)
    )

    # only attach the distributed client when the test is parametrized to use one
    workflow = Workflow(cat_features + groupby_features, client=client if use_client else None)
    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    result = workflow.fit_transform(dataset).to_ddf().compute()

    assert "name-cat_x_sum" in result.columns
    assert "name-string_x_sum" in result.columns


def test_fit_simple():
    data = nvt.dispatch._make_df(
        {"x": [0, 1, 2, None, 0, 1, 2], "y": [None, 3, 4, 5, 3, 4, 5]}
    )
    dataset = Dataset(data)

    workflow = Workflow(["x", "y"] >> ops.FillMedian() >> (lambda x: x * x))
    workflow.fit(dataset)
    transformed = workflow.transform(dataset).to_ddf().compute()

    expected = nvt.dispatch._make_df(
        {"x": [0, 1, 4, 1, 0, 1, 4], "y": [16, 9, 16, 25, 9, 16, 25]}
    )
    if not HAS_GPU:
        transformed["x"] = transformed["x"].astype(expected["x"].dtype)
        transformed["y"] = transformed["y"].astype(expected["y"].dtype)

    assert_eq(expected, transformed)


def test_workflow_transform_ddf_dtypes():
    # Initial Dataset
    df = cudf.datasets.timeseries().reset_index()
    ddf = dask_cudf.from_cudf(df, npartitions=2)
    dataset = Dataset(ddf)

    # Create and Execute Workflow
    cols = ["name", "x", "y", "timestamp"]
    cat_cols = ["id"] >> ops.Normalize()
    workflow = Workflow(cols + cat_cols)
    workflow.fit(dataset)
    transformed_ddf = workflow.transform(dataset).to_ddf()

    # no transforms on the pass-through cols, so they should keep their original dtypes
    for col in cols:
        assert_eq(ddf.dtypes[col], transformed_ddf.dtypes[col])

    # Follow-up dask-cudf sorting used to throw an exception because of dtype issues;
    # check that it works now
    transformed_ddf.sort_values(["id", "timestamp"]).compute()


def test_workflow_move_saved(tmpdir):
    raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split()
    data = cudf.DataFrame({"geo": raw})

    geo_location = ColumnGroup(["geo"])
    state = geo_location >> (lambda col: col.str.slice(0, 5)) >> ops.Rename(postfix="_state")
    country = geo_location >> (lambda col: col.str.slice(0, 2)) >> ops.Rename(postfix="_country")
    geo_features = state + country + geo_location >> ops.Categorify()

    # create the workflow and transform the input
    workflow = Workflow(geo_features)
    expected = workflow.fit_transform(Dataset(data)).to_ddf().compute()

    # save the workflow (including categorical mapping parquet files)
    # and then verify we can load the saved workflow after moving the directory
    out_path = os.path.join(tmpdir, "output", "workflow")
    workflow.save(out_path)

    moved_path = os.path.join(tmpdir, "output", "workflow2")
    shutil.move(out_path, moved_path)
    workflow2 = Workflow.load(moved_path)

    # also check that when transforming our input we get the same results after loading
    transformed = workflow2.transform(Dataset(data)).to_ddf().compute()
    assert_eq(expected, transformed)