def test_workflow_node_subtraction():
    """Check that `-` on workflow nodes drops the subtracted columns.

    Exercises node - list, node - node, list - node and a chained
    subtraction, verifying parent/dependency counts on the resulting node
    and the output column names after fitting the schema.
    """
    schema = Schema(["a", "b", "c", "d", "e", "f"])

    abcd_node = ["a", "b", "c", "d"] >> Operator()
    cd_node = ["c", "d"] >> Operator()
    b_node = ["b"] >> Operator()

    def check(node, n_dependencies, expected_names):
        # Fit first, then inspect the node that was handed to the workflow.
        fitted = Workflow(node).fit_schema(schema)
        assert len(node.parents) == 1
        assert len(node.dependencies) == n_dependencies
        assert fitted.output_node.output_columns.names == expected_names

    # Subtracting plain column names
    check(abcd_node - ["c", "d"], 0, ["a", "b"])

    # Subtracting another node
    check(abcd_node - cd_node, 1, ["a", "b"])

    # Subtracting a node from a plain list (reflected operator)
    check(["a", "b", "c", "d"] - cd_node, 1, ["a", "b"])

    # Chained subtraction mixing column names and a node
    check(abcd_node - ["c", "d"] - b_node, 1, ["a"])
def test_addition_nodes_are_combined():
    """Check that chained `+` operations collapse into a single addition node.

    For each combination the first operand is expected as the only parent,
    the remaining nodes as dependencies, and the output columns as the union
    of all operands.
    """
    schema = Schema(["a", "b", "c", "d", "e", "f", "g", "h"])

    ab_node = ["a", "b"] >> Operator()
    cd_node = ["c", "d"] >> Operator()
    ef_node = ["e", "f"] >> Operator()
    gh_node = ["g", "h"] >> Operator()

    def fitted_output(node):
        return Workflow(node).fit_schema(schema).output_node

    # node + node + node
    out = fitted_output(ab_node + cd_node + ef_node)
    assert set(out.parents) == {ab_node}
    assert set(out.dependencies) == {cd_node, ef_node}
    assert set(out.output_columns.names) == {"a", "b", "c", "d", "e", "f"}

    # node + str + str
    out = fitted_output(ab_node + "c" + "d")
    assert set(out.parents) == {ab_node}
    assert set(out.output_columns.names) == {"a", "b", "c", "d"}

    # str + node + str
    out = fitted_output("c" + ab_node + "d")
    assert set(out.parents) == {ab_node}
    assert set(out.output_columns.names) == {"a", "b", "c", "d"}

    # node + str + node
    out = fitted_output(ab_node + "e" + cd_node)
    assert set(out.parents) == {ab_node}
    assert cd_node in out.dependencies
    assert set(out.output_columns.names) == {"a", "b", "e", "c", "d"}

    # (node + node) + (node + node)
    left_sum = ab_node + cd_node
    right_sum = ef_node + gh_node
    out = fitted_output(left_sum + right_sum)
    assert set(out.parents) == {ab_node}
    assert set(out.dependencies) == {cd_node, ef_node, gh_node}
    assert set(out.output_columns.names) == {
        "a",
        "b",
        "c",
        "d",
        "e",
        "f",
        "g",
        "h",
    }
def test_dask_normalize(client, tmpdir, datasets, engine):
    """Fit FillMissing >> Normalize on a dataset and validate the statistics.

    The means/stds collected by the Normalize operator are compared against
    values computed directly from the first two input files, and the
    transformed continuous columns are checked to have means near zero.
    """
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    # Reference frame built from the first two files only
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    normalize = ops.Normalize()
    conts = cont_names >> ops.FillMissing() >> normalize
    workflow = Workflow(conts + cat_names + label_name, client=client)

    dataset = Dataset(paths, engine)
    result = workflow.fit_transform(dataset).to_ddf().compute()

    # Make sure we collected accurate statistics
    means = df0[cont_names].mean()
    stds = df0[cont_names].std()
    for name in cont_names:
        assert math.isclose(means[name], normalize.means[name], rel_tol=1e-3)
        assert math.isclose(stds[name], normalize.stds[name], rel_tol=1e-3)

    # New (normalized) means should all be close to zero. Compare the
    # magnitude so that a large negative mean cannot slip through.
    new_means = result[cont_names].mean()
    for name in cont_names:
        assert abs(new_means[name]) < 1e-3
def test_chaining_2():
    """Chain LambdaOp >> Rename on all columns alongside Categorify on "C".

    Verifies that the `_isnull` columns are produced and that the
    categorified "C" column keeps every distinct input category apart.
    """
    gdf = cudf.DataFrame(
        {
            "A": [1, 2, 2, 9, 6, np.nan, 3],
            "B": [2, np.nan, 4, 7, 7, 2, 5],
            "C": ["a", "b", "c", np.nan, np.nan, "g", "k"],
        }
    )

    cat_names = ["C"]
    cont_names = ["A", "B"]
    label_name = []

    # `+` binds tighter than `>>`, so the lambda is applied to all columns.
    all_features = (
        cat_names + cont_names
        >> ops.LambdaOp(f=lambda col: col.isnull())
        >> ops.Rename(postfix="_isnull")
    )
    cat_features = cat_names >> ops.Categorify()

    workflow = Workflow(all_features + cat_features + label_name)

    dataset = nvt.Dataset(gdf, engine="parquet")

    workflow.fit(dataset)

    result = workflow.transform(dataset).to_ddf().compute()

    assert all(x in list(result.columns) for x in ["A_isnull", "B_isnull", "C_isnull"])

    # The previous check asserted a bare generator expression, which is
    # always truthy and therefore verified nothing. Categorify replaces the
    # raw values with encoded ids, so instead of looking for the original
    # strings we require that the encoded column has at least as many
    # distinct values as there are distinct non-null input categories.
    n_input_categories = len(gdf["C"].dropna().unique())
    assert len(result["C"].unique()) >= n_input_categories
def test_cats_and_groupby_stats(client, tmpdir, datasets, part_mem_fraction, use_client):
    """Run Categorify followed by JoinGroupby and check the `sum` columns.

    The dask client is only handed to the workflow when `use_client` is set.
    """
    engine = "parquet"
    file_pattern = str(datasets[engine]) + "/*." + engine.split("-")[0]
    paths = glob.glob(file_pattern)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    workflow_client = client if use_client else None
    processor = Workflow(
        client=workflow_client,
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
    )

    categorify = ops.Categorify(out_path=str(tmpdir), freq_threshold=10, on_host=True)
    processor.add_preprocess(categorify)

    join_groupby = ops.JoinGroupby(
        cont_names=cont_names, stats=["count", "sum"], out_path=str(tmpdir)
    )
    processor.add_cat_feature(join_groupby)

    processor.finalize()

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    processor.apply(dataset, output_path=str(tmpdir))
    result = processor.get_ddf().compute()

    # One "<cat>_x_sum" column is expected per categorical column
    assert "name-cat_x_sum" in result.columns
    assert "name-string_x_sum" in result.columns
def test_dask_normalize(client, tmpdir, datasets, engine):
    """Apply Normalize through the processor API and validate the statistics.

    The means/stds/counts recorded in `processor.stats` are compared against
    values computed directly from the first two input files, and the
    transformed continuous columns are checked to have means near zero.
    """
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    # Reference frame built from the first two files only
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(ops.Normalize())
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    # Make sure we collected accurate statistics
    means = df0[cont_names].mean()
    stds = df0[cont_names].std()
    counts = df0[cont_names].count()
    for name in cont_names:
        assert math.isclose(means[name], processor.stats["means"][name], rel_tol=1e-3)
        assert math.isclose(stds[name], processor.stats["stds"][name], rel_tol=1e-3)
        assert math.isclose(counts[name], processor.stats["counts"][name], rel_tol=1e-3)

    # New (normalized) means should all be close to zero. Compare the
    # magnitude so that a large negative mean cannot slip through.
    new_means = result[cont_names].mean()
    for name in cont_names:
        assert abs(new_means[name]) < 1e-3
def test_dask_median_dummyop(client, tmpdir, datasets, engine):
    """Check the Median statistic requested by a dummy DFOperator.

    The medians stored in `processor.stats` are compared with the 0.5
    quantile of the processed output, using loose tolerances.
    """
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    class DummyOp(ops.DFOperator):
        default_in = "continuous"
        default_out = "continuous"

        @property
        def req_stats(self):
            return [ops.Median()]

        def op_logic(self, *args, **kwargs):
            return _dummy_op_logic(*args, _id=self._id, **kwargs)

    processor = Workflow(
        client=client,
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
    )
    processor.add_preprocess(DummyOp())
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    # TODO: Improve the accuracy! "tdigest" with crick could help,
    # but current version seems to have cupy/numpy problems here
    medians = result[cont_names].quantile(q=0.5)
    tolerances = {
        "x": {"abs_tol": 1e-1},
        "y": {"abs_tol": 1e-1},
        "id": {"rel_tol": 1e-2},
    }
    for column, tolerance in tolerances.items():
        assert math.isclose(medians[column], processor.stats["medians"][column], **tolerance)
def test_dask_minmax_dummyop(client, tmpdir, datasets, engine):
    """Check the MinMax statistic requested by a dummy DFOperator.

    The mins/maxs stored in `processor.stats` are compared with the column
    minima/maxima of the processed output.
    """
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    class DummyOp(ops.DFOperator):
        default_in = "continuous"
        default_out = "continuous"

        @property
        def req_stats(self):
            return [ops.MinMax()]

        def op_logic(self, *args, **kwargs):
            return _dummy_op_logic(*args, _id=self._id, **kwargs)

    processor = Workflow(
        client=client,
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
    )
    processor.add_preprocess(DummyOp())
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    # All minima are checked first, then all maxima, column by column.
    for stats_key, reducer in (("mins", "min"), ("maxs", "max")):
        for column in cont_names:
            observed = getattr(result[column], reducer)()
            assert math.isclose(observed, processor.stats[stats_key][column], rel_tol=1e-3)
def test_chaining_3():
    """Chain JoinGroupby into a LambdaOp that reads another output column.

    The lambda divides the grouped columns by "ad_id_count" and the result is
    renamed with a "_ctr" postfix; the test checks the expected columns exist.
    """
    gdf_test = cudf.DataFrame(
        {
            "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
            "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
            "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
        }
    )

    platform_features = ["platform"] >> ops.Dropna()
    joined = ["ad_id"] >> ops.JoinGroupby(cont_cols=["clicked"], stats=["sum", "count"])

    ratio_op = ops.LambdaOp(f=lambda col, gdf: col / gdf["ad_id_count"])
    joined_lambda = joined >> ratio_op >> ops.Rename(postfix="_ctr")

    workflow = Workflow(platform_features + joined + joined_lambda)
    dataset = nvt.Dataset(gdf_test, engine="parquet")
    workflow.fit(dataset)
    result = workflow.transform(dataset).to_ddf().compute()

    expected_columns = ["ad_id_count", "ad_id_clicked_sum_ctr", "ad_id_clicked_sum"]
    assert all(name in result.columns for name in expected_columns)
def test_workflow_move_saved(tmpdir):
    """A saved workflow must still load and transform after being moved."""
    raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split()
    data = cudf.DataFrame({"geo": raw})

    geo_location = ColumnGroup(["geo"])
    state = (
        geo_location
        >> (lambda col: col.str.slice(0, 5))
        >> ops.Rename(postfix="_state")
    )
    country = (
        geo_location
        >> (lambda col: col.str.slice(0, 2))
        >> ops.Rename(postfix="_country")
    )
    # `+` binds tighter than `>>`: Categorify runs on all three column groups
    geo_features = state + country + geo_location >> ops.Categorify()

    # Build the workflow and record the reference output
    workflow = Workflow(geo_features)
    expected = workflow.fit_transform(Dataset(data)).to_ddf().compute()

    # Save the workflow (including categorical mapping parquet files), move
    # the directory, and load it back from the new location
    output_root = os.path.join(tmpdir, "output")
    out_path = os.path.join(output_root, "workflow")
    workflow.save(out_path)

    moved_path = os.path.join(output_root, "workflow2")
    shutil.move(out_path, moved_path)
    workflow2 = Workflow.load(moved_path)

    # The reloaded workflow has to reproduce the original transform output
    transformed = workflow2.transform(Dataset(data)).to_ddf().compute()
    assert_eq(expected, transformed)
def test_workflow_node_select():
    """Selecting columns from a WorkflowNode with `[]` feeds only those columns
    to the downstream operators."""
    df = dispatch._make_df(
        {
            "a": [1, 4, 9, 16, 25],
            "b": [0, 1, 2, 3, 4],
            "c": [25, 16, 9, 4, 1],
        }
    )
    dataset = Dataset(df)

    input_features = WorkflowNode(ColumnSelector(["a", "b", "c"]))

    # pylint: disable=unnecessary-lambda
    sqrt_features = input_features[["a", "c"]] >> (lambda col: np.sqrt(col))
    plus_one_features = input_features["b"] >> (lambda col: col + 1)

    workflow = Workflow(sqrt_features + plus_one_features)
    workflow.fit(dataset)

    transformed = workflow.transform(dataset).to_ddf()
    df_out = transformed.compute(scheduler="synchronous")

    # Expected frame, with columns assigned in a, c, b order
    expected = dispatch._make_df()
    for column in ("a", "c"):
        expected[column] = np.sqrt(df[column])
    expected["b"] = df["b"] + 1

    assert_eq(expected, df_out)
def test_schema_write_read_dataset(tmpdir, dataset, engine):
    """Write transformed data to parquet and check the schema round-trips.

    The written `schema.pbtxt` must contain the expected "name-cat" entry and
    a Dataset re-created from the parquet output must carry the workflow's
    output schema.
    """
    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    cat_features = cat_names >> ops.Categorify(cat_cache="host")
    cont_features = (
        cont_names
        >> ops.FillMissing()
        >> ops.Clip(min_value=0)
        >> ops.LogOp
        >> norms
    )

    workflow = Workflow(cat_features + cont_features + label_name)
    workflow.fit(dataset)
    workflow.transform(dataset).to_parquet(tmpdir, out_files_per_proc=10)

    schema_file = Path(tmpdir) / "schema.pbtxt"
    proto_schema = PbTxt_SchemaWriter._read(schema_file)
    written_files = glob.glob(str(tmpdir) + "/*.parquet")
    new_dataset = Dataset(written_files)

    assert """name: "name-cat"\n min: 0\n max: 27\n""" in str(proto_schema)
    assert new_dataset.schema == workflow.output_schema
def test_workflow_generate_columns(tmpdir, use_parquet):
    """Smoke test: a column generated by LambdaOp >> Rename can be categorified.

    Runs either against a parquet file on disk or an in-memory dataframe,
    depending on `use_parquet`.
    """
    out_path = str(tmpdir.mkdir("processed"))
    path = str(tmpdir.join("simple.parquet"))

    # Stripped down dataset with geo_location codes like in outbrains
    df = nvt.dispatch._make_df({"geo_location": ["US>CA", "CA>BC", "US>TN>659"]})

    # A simple workflow that takes the country code from the first two
    # characters of the geo_location code and stores it in a new
    # 'geo_location_country' field
    take_country = ops.LambdaOp(
        f=lambda col: col.str.slice(0, 2),
    )
    country = ["geo_location"] >> take_country >> ops.Rename(postfix="_country")
    cat_features = ["geo_location"] + country >> ops.Categorify()

    workflow = Workflow(cat_features)

    if not use_parquet:
        dataset = nvt.Dataset(df)
    else:
        df.to_parquet(path)
        dataset = nvt.Dataset(path)

    # Only checks that fitting and writing complete without errors
    workflow.fit(dataset)
    workflow.transform(dataset).to_parquet(out_path)
def test_fit_schema_works_with_addition_nodes():
    """fit_schema resolves output column names through `+` nodes."""
    schema = Schema(["x", "y", "id"])

    # Renamed node plus a raw column name
    renamed_x = ColumnSelector(["x"]) >> ops.Rename(postfix="_renamed")
    workflow = Workflow(renamed_x + "y")
    workflow.fit_schema(schema)
    assert workflow.output_schema.column_names == ["x_renamed", "y"]

    # Two renamed nodes added together
    renamed_x = ColumnSelector(["x"]) >> ops.Rename(postfix="_renamed")
    renamed_y = ColumnSelector(["y"]) >> ops.Rename(postfix="_renamed")
    workflow = Workflow(renamed_x + renamed_y)
    workflow.fit_schema(schema)
    assert workflow.output_schema.column_names == ["x_renamed", "y_renamed"]
def test_fit_schema_works_with_raw_column_dependencies():
    """fit_schema works when TargetEncoding's target is a raw column name."""
    schema = Schema(["x", "y", "cost"])

    target_encoding = ops.TargetEncoding("cost")
    cat_features = ColumnSelector(["x", "y"]) >> target_encoding

    workflow = Workflow(cat_features)
    workflow.fit_schema(schema)

    expected_names = ["TE_x_cost", "TE_y_cost"]
    assert workflow.output_schema.column_names == expected_names
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):
    """Validate JoinGroupby count/min/std columns against a direct groupby.

    The reference values come from a cudf groupby over the first two input
    files.
    """
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    # Reference frame built from the first two files only
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    groupby_op = ops.JoinGroupby(
        cont_names=cont_names, stats=["count", "sum", "std", "min"], out_path=str(tmpdir)
    )
    features = cat_names >> groupby_op

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    workflow = Workflow(features + cat_names + cont_names + label_name, client=client)
    result = workflow.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    def per_group(key, stat_column):
        # One value per group, ordered by the group key
        return result[[key, stat_column]].drop_duplicates().sort_values(key)[stat_column]

    # Validate result
    assert len(df0) == len(result)
    for cat in cat_names:
        assert cat + "_x_std" in result.columns
        assert cat + "_x_var" not in result.columns

    # Check "count"
    assert_eq(
        per_group("name-cat", "name-cat_count"),
        df0.groupby("name-cat").agg({"x": "count"})["x"].astype(np.int64),
        check_index=False,
        check_dtype=False,  # May get int64 vs int32
        check_names=False,
    )

    # Check "min"
    assert_eq(
        per_group("name-string", "name-string_x_min"),
        df0.groupby("name-string").agg({"x": "min"})["x"],
        check_index=False,
        check_names=False,
    )

    # Check "std"
    assert_eq(
        per_group("name-string", "name-string_x_std"),
        df0.groupby("name-string").agg({"x": "std"})["x"],
        check_index=False,
        check_names=False,
    )
def test_fit_schema_works_with_grouped_node_inputs():
    """fit_schema handles a selector that mixes single columns and a group."""
    schema = Schema(["x", "y", "cost"])

    selector = ColumnSelector(["x", "y", ("x", "y")])
    cat_features = selector >> ops.TargetEncoding("cost")

    workflow1 = Workflow(cat_features)
    workflow1.fit_schema(schema)

    # Order is not part of the contract here, so compare sorted names
    actual_names = sorted(workflow1.output_schema.column_names)
    expected_names = sorted(["TE_x_cost", "TE_y_cost", "TE_x_y_cost"])
    assert actual_names == expected_names
def test_fit_schema_works_when_subtracting_column_names():
    """Subtracting an output column name removes it from the fitted schema."""
    schema = Schema(["x", "y", "id"])

    cont_features = (
        ColumnSelector(["x", "y"])
        >> ops.FillMissing()
        >> ops.Clip(min_value=0)
        >> ops.LogOp
        >> ops.Normalize()
        >> ops.Rename(postfix="_renamed")
    )

    remaining = cont_features - "y_renamed"
    workflow1 = Workflow(remaining)
    workflow1.fit_schema(schema)

    assert workflow1.output_schema.column_names == ["x_renamed"]
def test_workflow_apply(client, use_client, tmpdir, shuffle, apply_offline):
    """Write a transformed dataset with forced dtypes and verify them on disk.

    `apply_offline` is accepted but not forwarded to any call in this test.
    """
    out_files_per_proc = 2
    out_path = str(tmpdir.mkdir("processed"))
    path = str(tmpdir.join("simple.parquet"))

    size = 25
    row_group_size = 5

    cont_names = ["cont1", "cont2"]
    cat_names = ["cat1", "cat2"]
    label_name = ["label"]

    float_range = {"dtype": np.float64}
    int_range = {"dtype": np.int32}
    df = pd.DataFrame(
        {
            "cont1": np.arange(size, **float_range),
            "cont2": np.arange(size, **float_range),
            "cat1": np.arange(size, **int_range),
            "cat2": np.arange(size, **int_range),
            "label": np.arange(size, **float_range),
        }
    )
    df.to_parquet(path, row_group_size=row_group_size, engine="pyarrow")

    dataset = nvt.Dataset(path, engine="parquet", row_groups_per_part=1)

    cat_features = cat_names >> ops.Categorify()
    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp

    workflow_client = client if use_client else None
    workflow = Workflow(cat_features + cont_features + label_name, client=workflow_client)

    workflow.fit(dataset)

    # Force dtypes
    dict_dtypes = {
        **dict.fromkeys(cont_names, np.float32),
        **dict.fromkeys(cat_names, np.float32),
        **dict.fromkeys(label_name, np.int64),
    }

    # NOTE: apply_offline / record_stats used to be considered here but are
    # not passed to to_parquet.
    workflow.transform(dataset).to_parquet(
        output_path=out_path,
        shuffle=shuffle,
        out_files_per_proc=out_files_per_proc,
        dtypes=dict_dtypes,
    )

    # Check dtypes of every written file
    written_files = glob.glob(os.path.join(out_path, "*.parquet"))
    for filename in written_files:
        gdf = cudf.io.read_parquet(filename)
        assert dict(gdf.dtypes) == dict_dtypes
def test_workflow_select_by_tags(op):
    """Selecting by tag "c" yields as many output columns as selecting
    "col1" and "col2" by name."""
    tagged_columns = [
        ColumnSchema("col1", tags=["b", "c", "d"]),
        ColumnSchema("col2", tags=["c", "d"]),
        ColumnSchema("col3", tags=["d"]),
    ]
    schema = Schema(tagged_columns)

    cont_features = ColumnSelector(tags=["c"]) >> op
    workflow = Workflow(cont_features)
    workflow.fit_schema(schema)

    # Only col1 and col2 carry the "c" tag
    output_cols = op.output_column_names(ColumnSelector(["col1", "col2"]))
    n_fitted = len(workflow.output_schema.column_names)
    assert n_fitted == len(output_cols.names)
def test_filtered_partition(tmpdir, cpu):
    """Smoke test: writing a filtered 5-partition dataset to parquet."""
    # Toy DataFrame example
    frame = pd.DataFrame({"col": range(100)})
    partitioned = dd_from_pandas(frame, npartitions=5)
    dataset = Dataset(partitioned, cpu=cpu)

    # Workflow keeping only rows with col < 75
    keep_small = ops.Filter(lambda df: df["col"] < 75)
    workflow = Workflow(["col"] >> keep_small)

    # Write result to disk
    workflow.transform(dataset).to_parquet(str(tmpdir))
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):
    """Validate GroupBy count/std columns produced through the processor API.

    The reference values come from a cudf groupby over the first two input
    files.
    """
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    # Reference frame built from the first two files only
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client,
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
    )
    groupby_op = ops.GroupBy(
        cont_names=cont_names, stats=["count", "sum", "std"], out_path=str(tmpdir)
    )
    processor.add_preprocess(groupby_op)
    processor.finalize()

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    processor.apply(dataset)
    result = processor.get_ddf().compute(scheduler="synchronous")

    def per_group(key, stat_column):
        # One value per group, ordered by the group key
        return result[[key, stat_column]].drop_duplicates().sort_values(key)[stat_column]

    # Validate result
    assert len(df0) == len(result)
    for cat in cat_names:
        assert cat + "_x_std" in result.columns
        assert cat + "_x_var" not in result.columns

    # Check "count"
    assert_eq(
        per_group("name-cat", "name-cat_count"),
        df0.groupby("name-cat").agg({"x": "count"})["x"],
        check_index=False,
        check_dtype=False,  # May get int64 vs int32
        check_names=False,
    )

    # Check "std"
    assert_eq(
        per_group("name-string", "name-string_x_std"),
        df0.groupby("name-string").agg({"x": "std"})["x"],
        check_index=False,
        check_names=False,
    )
def test_workflow_node_addition():
    """Check the output columns produced by `+` on workflow nodes.

    Covers node + node, node + str, longer chains, and addition with a list
    operand (checked through `grouped_names`).
    """
    schema = Schema(["a", "b", "c", "d", "e", "f"])

    ab_node = ["a", "b"] >> Operator()
    cd_node = ["c", "d"] >> Operator()
    ef_node = ["e", "f"] >> Operator()

    def fitted_columns(node):
        return Workflow(node).fit_schema(schema).output_node.output_columns

    assert fitted_columns(ab_node + cd_node).names == ["a", "b", "c", "d"]

    assert fitted_columns(ab_node + "c").names == ["a", "b", "c"]

    assert fitted_columns(ab_node + "c" + "d").names == ["a", "b", "c", "d"]

    assert fitted_columns(ab_node + cd_node + "e").names == ["a", "b", "c", "d", "e"]

    assert fitted_columns(ab_node + cd_node + ef_node).names == [
        "a",
        "b",
        "c",
        "d",
        "e",
        "f",
    ]

    # Addition with groups
    assert fitted_columns(ab_node + ["c", "d"]).grouped_names == ["a", "b", "c", "d"]

    assert fitted_columns(ab_node + [cd_node, "e"]).grouped_names == [
        "a",
        "b",
        "c",
        "d",
        "e",
    ]

    assert fitted_columns(ab_node + [cd_node, ef_node]).grouped_names == [
        "a",
        "b",
        "c",
        "d",
        "e",
        "f",
    ]
def test_fit_schema_works_with_node_dependencies():
    """fit_schema works when TargetEncoding's target is itself a node."""
    schema = Schema(["x", "y", "cost"])

    # The target column comes from an upstream Rename node
    cont_features = ColumnSelector(["cost"]) >> ops.Rename(postfix="_renamed")
    target_encoding = ops.TargetEncoding(cont_features)
    cat_features = ColumnSelector(["x", "y"]) >> target_encoding

    workflow1 = Workflow(cat_features)
    workflow1.fit_schema(schema)

    expected_names = ["TE_x_cost_renamed", "TE_y_cost_renamed"]
    assert workflow1.output_schema.column_names == expected_names
def test_fit_schema():
    """fit_schema propagates column names through a chain of operators."""
    schema = Schema(["x", "y", "id"])

    all_columns = ColumnSelector(schema.column_names)
    cont_features = (
        all_columns
        >> ops.FillMissing()
        >> ops.Clip(min_value=0)
        >> ops.LogOp
        >> ops.Normalize()
        >> ops.Rename(postfix="_renamed")
    )

    workflow = Workflow(cont_features)
    workflow.fit_schema(schema)

    expected_names = ["x_renamed", "y_renamed", "id_renamed"]
    assert workflow.output_schema.column_names == expected_names
def test_workflow_input_output_dtypes():
    """input_dtypes/output_dtypes only list the columns the workflow uses
    and produces."""
    df = cudf.DataFrame(
        {
            "genre": ["drama", "comedy"],
            "user": ["a", "b"],
            "unneeded": [1, 2],
        }
    )

    combo_categorify = ops.Categorify(encode_type="combo")
    features = [["genre", "user"], "genre"] >> combo_categorify

    workflow = Workflow(features)
    workflow.fit(Dataset(df))

    input_columns = set(workflow.input_dtypes.keys())
    output_columns = set(workflow.output_dtypes.keys())

    assert "unneeded" not in workflow.input_dtypes
    assert input_columns == {"genre", "user"}
    assert output_columns == {"genre_user", "genre"}
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):
    """Validate JoinGroupby count/sum/min/std columns row by row.

    A reference groupby over the first two input files is merged onto the
    workflow output and each statistic column is compared with it.
    """
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    # Reference frame built from the first two files only
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    groupby_op = ops.JoinGroupby(
        cont_cols=cont_names, stats=["count", "sum", "std", "min"], out_path=str(tmpdir)
    )
    features = cat_names >> groupby_op

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    workflow = Workflow(features + cat_names + cont_names + label_name, client=client)
    result = workflow.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    for cat in cat_names:
        assert cat + "_x_std" in result.columns
        assert cat + "_x_var" not in result.columns

    # Check results. Need to sort for direct comparison
    expect = df0.sort_values(["label", "x", "y", "id"]).reset_index(drop=True).reset_index()
    got = result.sort_values(["label", "x", "y", "id"]).reset_index(drop=True).reset_index()

    aggregations = {"name-cat": "count", "x": ["sum", "min", "std"]}
    gb_e = expect.groupby("name-cat").aggregate(aggregations)
    gb_e.columns = ["count", "sum", "min", "std"]
    df_check = got.merge(gb_e, left_on="name-cat", right_index=True, how="left")

    assert_eq(
        df_check["name-cat_count"],
        df_check["count"].astype("int64"),
        check_names=False,
    )
    for stat in ("sum", "min", "std"):
        assert_eq(df_check["name-cat_x_" + stat], df_check[stat], check_names=False)
def test_fit_simple():
    """FillMedian followed by a squaring lambda yields the expected frame."""
    data = cudf.DataFrame(
        {
            "x": [0, 1, 2, None, 0, 1, 2],
            "y": [None, 3, 4, 5, 3, 4, 5],
        }
    )
    dataset = Dataset(data)

    squared = ["x", "y"] >> ops.FillMedian() >> (lambda x: x * x)
    workflow = Workflow(squared)
    workflow.fit(dataset)
    transformed = workflow.transform(dataset).to_ddf().compute()

    # The missing entries appear as 1 (x) and 16 (y) in the expected output
    expected = cudf.DataFrame(
        {
            "x": [0, 1, 4, 1, 0, 1, 4],
            "y": [16, 9, 16, 25, 9, 16, 25],
        }
    )
    assert_eq(expected, transformed)
def test_chaining_1():
    """FillMissing on one column chained into NormalizeMinMax on two columns
    keeps both outputs at or below 1.0."""
    n_rows = 100
    df = cudf.DataFrame(
        {
            "cont01": np.random.randint(1, 100, n_rows),
            "cont02": np.random.random(n_rows) * 100,
            "cat01": np.random.randint(0, 10, n_rows),
            "label": np.random.randint(0, 3, n_rows),
        }
    )
    # Blank out the first ten values so FillMissing has something to fill
    df["cont01"][:10] = None

    filled_cont = "cont01" >> ops.FillMissing()
    # `+` binds tighter than `>>`: both columns go through NormalizeMinMax
    conts = filled_cont + "cont02" >> ops.NormalizeMinMax()
    workflow = Workflow(conts + "cat01" + "label")

    result = workflow.fit_transform(Dataset(df)).to_ddf().compute()

    for column in ("cont01", "cont02"):
        assert result[column].max() <= 1.0
def test_spec_set(tmpdir, client):
    """Smoke test: a workflow built from operator classes (not instances)
    plus TargetEncoding can be fit and transformed."""
    gdf_test = cudf.DataFrame(
        {
            "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
            "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
            "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "cont": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
        }
    )

    cats = ColumnGroup(["ad_id", "source_id", "platform"])
    conts = ColumnGroup(["cont"])

    # Operators are passed as classes here on purpose
    cat_features = cats >> ops.Categorify
    cont_features = conts >> ops.FillMissing >> ops.Normalize

    target_encoding = ops.TargetEncoding("clicked", kfold=5, fold_seed=42, p_smooth=20)
    te_features = cats >> target_encoding

    p = Workflow(cat_features + cont_features + te_features, client=client)
    transformed = p.fit_transform(nvt.Dataset(gdf_test))
    transformed.to_ddf().compute()