Example #1
def test_workflow_node_subtraction():
    schema = Schema(["a", "b", "c", "d", "e", "f"])

    node1 = ["a", "b", "c", "d"] >> Operator()
    node2 = ["c", "d"] >> Operator()
    node3 = ["b"] >> Operator()

    output_node = node1 - ["c", "d"]
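    # Subtracting a plain column list drops those columns from node1's output without adding a dependency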
    workflow = Workflow(output_node).fit_schema(schema)
    assert len(output_node.parents) == 1
    assert len(output_node.dependencies) == 0
    assert workflow.output_node.output_columns.names == ["a", "b"]

    output_node = node1 - node2
    workflow = Workflow(output_node).fit_schema(schema)
    assert len(output_node.parents) == 1
    assert len(output_node.dependencies) == 1
    assert workflow.output_node.output_columns.names == ["a", "b"]

    output_node = ["a", "b", "c", "d"] - node2
    workflow = Workflow(output_node).fit_schema(schema)
    assert len(output_node.parents) == 1
    assert len(output_node.dependencies) == 1
    assert workflow.output_node.output_columns.names == ["a", "b"]

    output_node = node1 - ["c", "d"] - node3
    workflow = Workflow(output_node).fit_schema(schema)
    assert len(output_node.parents) == 1
    assert len(output_node.dependencies) == 1
    assert workflow.output_node.output_columns.names == ["a"]
Example #2
def test_addition_nodes_are_combined():
    schema = Schema(["a", "b", "c", "d", "e", "f", "g", "h"])

    node1 = ["a", "b"] >> Operator()
    node2 = ["c", "d"] >> Operator()
    node3 = ["e", "f"] >> Operator()
    node4 = ["g", "h"] >> Operator()

    add_node = node1 + node2 + node3
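    # Consecutive additions collapse into one node: node1 stays the parent, node2/node3 become dependencies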
    workflow = Workflow(add_node).fit_schema(schema)
    assert set(workflow.output_node.parents) == {node1}
    assert set(workflow.output_node.dependencies) == {node2, node3}
    assert set(workflow.output_node.output_columns.names) == {
        "a", "b", "c", "d", "e", "f"
    }

    add_node = node1 + "c" + "d"
    workflow = Workflow(add_node).fit_schema(schema)
    assert set(workflow.output_node.parents) == {node1}
    assert set(
        workflow.output_node.output_columns.names) == {"a", "b", "c", "d"}

    add_node = "c" + node1 + "d"
    workflow = Workflow(add_node).fit_schema(schema)
    assert set(workflow.output_node.parents) == {node1}
    assert set(
        workflow.output_node.output_columns.names) == {"a", "b", "c", "d"}

    add_node = node1 + "e" + node2
    workflow = Workflow(add_node).fit_schema(schema)
    assert set(workflow.output_node.parents) == {node1}
    assert node2 in workflow.output_node.dependencies
    assert set(workflow.output_node.output_columns.names) == {
        "a", "b", "e", "c", "d"
    }

    add_node1 = node1 + node2
    add_node2 = node3 + node4

    add_node = add_node1 + add_node2
    workflow = Workflow(add_node).fit_schema(schema)

    assert set(workflow.output_node.parents) == {node1}
    assert set(workflow.output_node.dependencies) == {node2, node3, node4}
    assert set(workflow.output_node.output_columns.names) == {
        "a",
        "b",
        "c",
        "d",
        "e",
        "f",
        "g",
        "h",
    }
Example #3
def test_dask_normalize(client, tmpdir, datasets, engine):

    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    normalize = ops.Normalize()
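    # Keep a reference to the Normalize op so its fitted means/stds can be checked after fit_transform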
    conts = cont_names >> ops.FillMissing() >> normalize
    workflow = Workflow(conts + cat_names + label_name, client=client)

    dataset = Dataset(paths, engine)
    result = workflow.fit_transform(dataset).to_ddf().compute()

    # Make sure we collected accurate statistics
    means = df0[cont_names].mean()
    stds = df0[cont_names].std()
    for name in cont_names:
        assert math.isclose(means[name], normalize.means[name], rel_tol=1e-3)
        assert math.isclose(stds[name], normalize.stds[name], rel_tol=1e-3)

    # New (normalized) means should all be close to zero
    new_means = result[cont_names].mean()
    for name in cont_names:
        assert abs(new_means[name]) < 1e-3
def test_chaining_2():
    gdf = cudf.DataFrame({
        "A": [1, 2, 2, 9, 6, np.nan, 3],
        "B": [2, np.nan, 4, 7, 7, 2, 5],
        "C": ["a", "b", "c", np.nan, np.nan, "g", "k"],
    })

    cat_names = ["C"]
    cont_names = ["A", "B"]
    label_name = []

    all_features = (cat_names + cont_names >> ops.LambdaOp(
        f=lambda col: col.isnull()) >> ops.Rename(postfix="_isnull"))
    cat_features = cat_names >> ops.Categorify()
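    # Build *_isnull flag features for every column plus a categorified version of "C"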

    workflow = Workflow(all_features + cat_features + label_name)

    dataset = nvt.Dataset(gdf, engine="parquet")

    workflow.fit(dataset)

    result = workflow.transform(dataset).to_ddf().compute()

    assert all(x in list(result.columns)
               for x in ["A_isnull", "B_isnull", "C_isnull"])
    # Every non-null input category should have received its own encoding
    assert result["C"].nunique() >= gdf["C"].dropna().nunique()
Example #5
def test_cats_and_groupby_stats(client, tmpdir, datasets, part_mem_fraction,
                                use_client):

    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client if use_client else None,
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
    )
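    # Legacy Workflow API: column roles go to the constructor; ops are added below and run via finalize()/apply()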

    processor.add_preprocess(
        ops.Categorify(out_path=str(tmpdir), freq_threshold=10, on_host=True))

    processor.add_cat_feature(
        ops.JoinGroupby(cont_names=cont_names,
                        stats=["count", "sum"],
                        out_path=str(tmpdir)))

    processor.finalize()
    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)

    processor.apply(dataset, output_path=str(tmpdir))
    result = processor.get_ddf().compute()

    assert "name-cat_x_sum" in result.columns
    assert "name-string_x_sum" in result.columns
def test_dask_normalize(client, tmpdir, datasets, engine):

    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(ops.Normalize())
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    # Make sure we collected accurate statistics
    means = df0[cont_names].mean()
    stds = df0[cont_names].std()
    counts = df0[cont_names].count()
    for name in cont_names:
        assert math.isclose(means[name], processor.stats["means"][name], rel_tol=1e-3)
        assert math.isclose(stds[name], processor.stats["stds"][name], rel_tol=1e-3)
        assert math.isclose(counts[name], processor.stats["counts"][name], rel_tol=1e-3)

    # New (normalized) means should all be close to zero
    new_means = result[cont_names].mean()
    for name in cont_names:
        assert abs(new_means[name]) < 1e-3
def test_dask_median_dummyop(client, tmpdir, datasets, engine):

    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    class DummyOp(ops.DFOperator):

        default_in, default_out = "continuous", "continuous"

        @property
        def req_stats(self):
            return [ops.Median()]

        def op_logic(self, *args, **kwargs):
            return _dummy_op_logic(*args, _id=self._id, **kwargs)

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(DummyOp())
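    # DummyOp declares ops.Median() as a required statistic, so fitting populates processor.stats["medians"]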
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    # TODO: Improve the accuracy! "t-digest" (via crick) could help,
    #       but the current version seems to have cupy/numpy problems here
    medians = result[cont_names].quantile(q=0.5)
    assert math.isclose(medians["x"], processor.stats["medians"]["x"], abs_tol=1e-1)
    assert math.isclose(medians["y"], processor.stats["medians"]["y"], abs_tol=1e-1)
    assert math.isclose(medians["id"], processor.stats["medians"]["id"], rel_tol=1e-2)
def test_dask_minmax_dummyop(client, tmpdir, datasets, engine):

    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    class DummyOp(ops.DFOperator):

        default_in, default_out = "continuous", "continuous"

        @property
        def req_stats(self):
            return [ops.MinMax()]

        def op_logic(self, *args, **kwargs):
            return _dummy_op_logic(*args, _id=self._id, **kwargs)

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(DummyOp())
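    # DummyOp declares ops.MinMax() as a required statistic, so fitting populates processor.stats["mins"]/["maxs"]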
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    assert math.isclose(result.x.min(), processor.stats["mins"]["x"], rel_tol=1e-3)
    assert math.isclose(result.y.min(), processor.stats["mins"]["y"], rel_tol=1e-3)
    assert math.isclose(result.id.min(), processor.stats["mins"]["id"], rel_tol=1e-3)
    assert math.isclose(result.x.max(), processor.stats["maxs"]["x"], rel_tol=1e-3)
    assert math.isclose(result.y.max(), processor.stats["maxs"]["y"], rel_tol=1e-3)
    assert math.isclose(result.id.max(), processor.stats["maxs"]["id"], rel_tol=1e-3)
def test_chaining_3():
    gdf_test = cudf.DataFrame({
        "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
        "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
        "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
        "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
    })

    platform_features = ["platform"] >> ops.Dropna()
    joined = ["ad_id"] >> ops.JoinGroupby(cont_cols=["clicked"],
                                          stats=["sum", "count"])
    joined_lambda = (
        joined >> ops.LambdaOp(f=lambda col, gdf: col / gdf["ad_id_count"]) >>
        ops.Rename(postfix="_ctr"))

    workflow = Workflow(platform_features + joined + joined_lambda)

    dataset = nvt.Dataset(gdf_test, engine="parquet")

    workflow.fit(dataset)

    result = workflow.transform(dataset).to_ddf().compute()

    assert all(
        x in result.columns
        for x in ["ad_id_count", "ad_id_clicked_sum_ctr", "ad_id_clicked_sum"])
def test_workflow_move_saved(tmpdir):
    raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split()
    data = cudf.DataFrame({"geo": raw})

    geo_location = ColumnGroup(["geo"])
    state = geo_location >> (lambda col: col.str.slice(0, 5)) >> ops.Rename(
        postfix="_state")
    country = geo_location >> (lambda col: col.str.slice(0, 2)) >> ops.Rename(
        postfix="_country")
    geo_features = state + country + geo_location >> ops.Categorify()

    # create the workflow and transform the input
    workflow = Workflow(geo_features)
    expected = workflow.fit_transform(Dataset(data)).to_ddf().compute()

    # save the workflow (including categorical mapping parquet files)
    # and then verify we can load the saved workflow after moving the directory
    out_path = os.path.join(tmpdir, "output", "workflow")
    workflow.save(out_path)

    moved_path = os.path.join(tmpdir, "output", "workflow2")
    shutil.move(out_path, moved_path)
    workflow2 = Workflow.load(moved_path)

    # also check that when transforming our input we get the same results after loading
    transformed = workflow2.transform(Dataset(data)).to_ddf().compute()
    assert_eq(expected, transformed)
Example #11
def test_workflow_node_select():
    df = dispatch._make_df({
        "a": [1, 4, 9, 16, 25],
        "b": [0, 1, 2, 3, 4],
        "c": [25, 16, 9, 4, 1]
    })
    dataset = Dataset(df)

    input_features = WorkflowNode(ColumnSelector(["a", "b", "c"]))
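    # Indexing a node with [] selects a subset of its columns to feed each downstream op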
    # pylint: disable=unnecessary-lambda
    sqrt_features = input_features[["a", "c"]] >> (lambda col: np.sqrt(col))
    plus_one_features = input_features["b"] >> (lambda col: col + 1)
    features = sqrt_features + plus_one_features

    workflow = Workflow(features)
    workflow.fit(dataset)

    df_out = workflow.transform(dataset).to_ddf().compute(
        scheduler="synchronous")

    expected = dispatch._make_df()
    expected["a"] = np.sqrt(df["a"])
    expected["c"] = np.sqrt(df["c"])
    expected["b"] = df["b"] + 1

    assert_eq(expected, df_out)
def test_schema_write_read_dataset(tmpdir, dataset, engine):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    cat_features = cat_names >> ops.Categorify(cat_cache="host")
    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(
        min_value=0) >> ops.LogOp >> norms

    workflow = Workflow(cat_features + cont_features + label_name)

    workflow.fit(dataset)
    workflow.transform(dataset).to_parquet(
        tmpdir,
        out_files_per_proc=10,
    )
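    # to_parquet also writes a schema.pbtxt file describing the output columns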

    schema_path = Path(tmpdir)
    proto_schema = PbTxt_SchemaWriter._read(schema_path / "schema.pbtxt")
    new_dataset = Dataset(glob.glob(str(tmpdir) + "/*.parquet"))
    assert """name: "name-cat"\n    min: 0\n    max: 27\n""" in str(
        proto_schema)
    assert new_dataset.schema == workflow.output_schema
def test_workflow_generate_columns(tmpdir, use_parquet):
    out_path = str(tmpdir.mkdir("processed"))
    path = str(tmpdir.join("simple.parquet"))

    # Stripped-down dataset with geo_location codes like in the Outbrain dataset
    df = nvt.dispatch._make_df(
        {"geo_location": ["US>CA", "CA>BC", "US>TN>659"]})

    # define a simple workflow that takes the country code from the first two characters of the
    # geo_location value and adds a new 'geo_location_country' field
    country = (["geo_location"] >> ops.LambdaOp(
        f=lambda col: col.str.slice(0, 2), ) >> ops.Rename(postfix="_country"))
    cat_features = ["geo_location"] + country >> ops.Categorify()

    workflow = Workflow(cat_features)

    if use_parquet:
        df.to_parquet(path)
        dataset = nvt.Dataset(path)
    else:
        dataset = nvt.Dataset(df)

    # just make sure this works without errors
    workflow.fit(dataset)
    workflow.transform(dataset).to_parquet(out_path)
def test_fit_schema_works_with_addition_nodes():
    schema = Schema(["x", "y", "id"])

    x_node = ColumnSelector(["x"]) >> ops.Rename(postfix="_renamed")

    workflow = Workflow(x_node + "y")
    workflow.fit_schema(schema)

    assert workflow.output_schema.column_names == ["x_renamed", "y"]

    x_node = ColumnSelector(["x"]) >> ops.Rename(postfix="_renamed")
    y_node = ColumnSelector(["y"]) >> ops.Rename(postfix="_renamed")

    workflow = Workflow(x_node + y_node)
    workflow.fit_schema(schema)

    assert workflow.output_schema.column_names == ["x_renamed", "y_renamed"]
def test_fit_schema_works_with_raw_column_dependencies():
    schema = Schema(["x", "y", "cost"])

    cat_features = ColumnSelector(["x", "y"]) >> ops.TargetEncoding("cost")
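    # "cost" is consumed as a raw column dependency of TargetEncoding, not as a selected input feature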

    workflow = Workflow(cat_features)
    workflow.fit_schema(schema)

    assert workflow.output_schema.column_names == ["TE_x_cost", "TE_y_cost"]
Example #16
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):

    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    features = cat_names >> ops.JoinGroupby(
        cont_names=cont_names, stats=["count", "sum", "std", "min"], out_path=str(tmpdir)
    )

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    workflow = Workflow(features + cat_names + cont_names + label_name, client=client)
    result = workflow.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check "count"
    assert_eq(
        result[["name-cat", "name-cat_count"]]
        .drop_duplicates()
        .sort_values("name-cat")["name-cat_count"],
        df0.groupby("name-cat").agg({"x": "count"})["x"].astype(np.int64),
        check_index=False,
        check_dtype=False,  # May get int64 vs int32
        check_names=False,
    )

    # Check "min"
    assert_eq(
        result[["name-string", "name-string_x_min"]]
        .drop_duplicates()
        .sort_values("name-string")["name-string_x_min"],
        df0.groupby("name-string").agg({"x": "min"})["x"],
        check_index=False,
        check_names=False,
    )

    # Check "std"
    assert_eq(
        result[["name-string", "name-string_x_std"]]
        .drop_duplicates()
        .sort_values("name-string")["name-string_x_std"],
        df0.groupby("name-string").agg({"x": "std"})["x"],
        check_index=False,
        check_names=False,
    )
def test_fit_schema_works_with_grouped_node_inputs():
    schema = Schema(["x", "y", "cost"])

    cat_features = ColumnSelector(["x", "y", ("x", "y")]) >> ops.TargetEncoding("cost")

    workflow1 = Workflow(cat_features)
    workflow1.fit_schema(schema)

    assert sorted(workflow1.output_schema.column_names) == sorted(
        ["TE_x_cost", "TE_y_cost", "TE_x_y_cost"])
def test_fit_schema_works_when_subtracting_column_names():
    schema = Schema(["x", "y", "id"])

    cont_features = (ColumnSelector(
        ["x", "y"]) >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp
                     >> ops.Normalize() >> ops.Rename(postfix="_renamed"))

    workflow1 = Workflow(cont_features - "y_renamed")
    workflow1.fit_schema(schema)

    assert workflow1.output_schema.column_names == ["x_renamed"]
def test_workflow_apply(client, use_client, tmpdir, shuffle, apply_offline):
    out_files_per_proc = 2
    out_path = str(tmpdir.mkdir("processed"))
    path = str(tmpdir.join("simple.parquet"))

    size = 25
    row_group_size = 5

    cont_names = ["cont1", "cont2"]
    cat_names = ["cat1", "cat2"]
    label_name = ["label"]

    df = pd.DataFrame({
        "cont1": np.arange(size, dtype=np.float64),
        "cont2": np.arange(size, dtype=np.float64),
        "cat1": np.arange(size, dtype=np.int32),
        "cat2": np.arange(size, dtype=np.int32),
        "label": np.arange(size, dtype=np.float64),
    })
    df.to_parquet(path, row_group_size=row_group_size, engine="pyarrow")

    dataset = nvt.Dataset(path, engine="parquet", row_groups_per_part=1)

    cat_features = cat_names >> ops.Categorify()
    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(
        min_value=0) >> ops.LogOp

    workflow = Workflow(cat_features + cont_features + label_name,
                        client=client if use_client else None)

    workflow.fit(dataset)

    # Force dtypes
    dict_dtypes = {}
    for col in cont_names:
        dict_dtypes[col] = np.float32
    for col in cat_names:
        dict_dtypes[col] = np.float32
    for col in label_name:
        dict_dtypes[col] = np.int64

    workflow.transform(dataset).to_parquet(
        # apply_offline=apply_offline, Not any more?
        # record_stats=apply_offline, Not any more?
        output_path=out_path,
        shuffle=shuffle,
        out_files_per_proc=out_files_per_proc,
        dtypes=dict_dtypes,
    )

    # Check dtypes
    for filename in glob.glob(os.path.join(out_path, "*.parquet")):
        gdf = cudf.io.read_parquet(filename)
        assert dict(gdf.dtypes) == dict_dtypes
def test_workflow_select_by_tags(op):
    schema1 = ColumnSchema("col1", tags=["b", "c", "d"])
    schema2 = ColumnSchema("col2", tags=["c", "d"])
    schema3 = ColumnSchema("col3", tags=["d"])
    schema = Schema([schema1, schema2, schema3])

    cont_features = ColumnSelector(tags=["c"]) >> op
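    # Selecting by tag "c" picks up col1 and col2 but not col3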
    workflow = Workflow(cont_features)
    workflow.fit_schema(schema)

    output_cols = op.output_column_names(ColumnSelector(["col1", "col2"]))
    assert len(workflow.output_schema.column_names) == len(output_cols.names)
def test_filtered_partition(tmpdir, cpu):
    # Toy DataFrame example
    df = pd.DataFrame({"col": range(100)})
    ddf = dd_from_pandas(df, npartitions=5)
    dataset = Dataset(ddf, cpu=cpu)

    # Workflow
    filtered = ["col"] >> ops.Filter(lambda df: df["col"] < 75)
    workflow = Workflow(filtered)

    # Write result to disk
    workflow.transform(dataset).to_parquet(str(tmpdir))
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):

    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )

    processor.add_preprocess(
        ops.GroupBy(cont_names=cont_names, stats=["count", "sum", "std"], out_path=str(tmpdir))
    )
    processor.finalize()

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    processor.apply(dataset)
    result = processor.get_ddf().compute(scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check "count"
    assert_eq(
        result[["name-cat", "name-cat_count"]]
        .drop_duplicates()
        .sort_values("name-cat")["name-cat_count"],
        df0.groupby("name-cat").agg({"x": "count"})["x"],
        check_index=False,
        check_dtype=False,  # May get int64 vs int32
        check_names=False,
    )

    # Check "std"
    assert_eq(
        result[["name-string", "name-string_x_std"]]
        .drop_duplicates()
        .sort_values("name-string")["name-string_x_std"],
        df0.groupby("name-string").agg({"x": "std"})["x"],
        check_index=False,
        check_names=False,
    )
Example #23
def test_workflow_node_addition():
    schema = Schema(["a", "b", "c", "d", "e", "f"])

    node1 = ["a", "b"] >> Operator()
    node2 = ["c", "d"] >> Operator()
    node3 = ["e", "f"] >> Operator()

    output_node = node1 + node2
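    # Adding two nodes concatenates their output columns in order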
    workflow = Workflow(output_node).fit_schema(schema)
    assert workflow.output_node.output_columns.names == ["a", "b", "c", "d"]

    output_node = node1 + "c"
    workflow = Workflow(output_node).fit_schema(schema)
    assert workflow.output_node.output_columns.names == ["a", "b", "c"]

    output_node = node1 + "c" + "d"
    workflow = Workflow(output_node).fit_schema(schema)
    assert workflow.output_node.output_columns.names == ["a", "b", "c", "d"]

    output_node = node1 + node2 + "e"
    workflow = Workflow(output_node).fit_schema(schema)
    assert workflow.output_node.output_columns.names == [
        "a", "b", "c", "d", "e"
    ]

    output_node = node1 + node2 + node3
    workflow = Workflow(output_node).fit_schema(schema)
    assert workflow.output_node.output_columns.names == [
        "a", "b", "c", "d", "e", "f"
    ]

    # Addition with groups
    output_node = node1 + ["c", "d"]
    workflow = Workflow(output_node).fit_schema(schema)
    assert workflow.output_node.output_columns.grouped_names == [
        "a", "b", "c", "d"
    ]

    output_node = node1 + [node2, "e"]
    workflow = Workflow(output_node).fit_schema(schema)
    assert workflow.output_node.output_columns.grouped_names == [
        "a", "b", "c", "d", "e"
    ]

    output_node = node1 + [node2, node3]
    workflow = Workflow(output_node).fit_schema(schema)
    assert workflow.output_node.output_columns.grouped_names == [
        "a", "b", "c", "d", "e", "f"
    ]
def test_fit_schema_works_with_node_dependencies():
    schema = Schema(["x", "y", "cost"])

    cont_features = ColumnSelector(["cost"]) >> ops.Rename(postfix="_renamed")
    cat_features = ColumnSelector(["x", "y"]) >> ops.TargetEncoding(cont_features)

    workflow1 = Workflow(cat_features)
    workflow1.fit_schema(schema)

    assert workflow1.output_schema.column_names == [
        "TE_x_cost_renamed", "TE_y_cost_renamed"
    ]
def test_fit_schema():
    schema = Schema(["x", "y", "id"])

    cont_features = (ColumnSelector(schema.column_names) >> ops.FillMissing()
                     >> ops.Clip(min_value=0) >> ops.LogOp >> ops.Normalize()
                     >> ops.Rename(postfix="_renamed"))

    workflow = Workflow(cont_features)
    workflow.fit_schema(schema)

    assert workflow.output_schema.column_names == [
        "x_renamed", "y_renamed", "id_renamed"
    ]
def test_workflow_input_output_dtypes():
    df = cudf.DataFrame({
        "genre": ["drama", "comedy"],
        "user": ["a", "b"],
        "unneeded": [1, 2]
    })
    features = [["genre", "user"], "genre"
                ] >> ops.Categorify(encode_type="combo")
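    # encode_type="combo" encodes the ("genre", "user") pair jointly, producing a single genre_user column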
    workflow = Workflow(features)
    workflow.fit(Dataset(df))

    assert "unneeded" not in workflow.input_dtypes
    assert set(workflow.input_dtypes.keys()) == {"genre", "user"}
    assert set(workflow.output_dtypes.keys()) == {"genre_user", "genre"}
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):

    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    features = cat_names >> ops.JoinGroupby(
        cont_cols=cont_names,
        stats=["count", "sum", "std", "min"],
        out_path=str(tmpdir))

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    workflow = Workflow(features + cat_names + cont_names + label_name,
                        client=client)
    result = workflow.fit_transform(dataset).to_ddf().compute(
        scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check results.  Need to sort for direct comparison
    expect = df0.sort_values(["label", "x", "y",
                              "id"]).reset_index(drop=True).reset_index()
    got = result.sort_values(["label", "x", "y",
                              "id"]).reset_index(drop=True).reset_index()
    gb_e = expect.groupby("name-cat").aggregate({
        "name-cat": "count",
        "x": ["sum", "min", "std"]
    })
    gb_e.columns = ["count", "sum", "min", "std"]
    df_check = got.merge(gb_e,
                         left_on="name-cat",
                         right_index=True,
                         how="left")
    assert_eq(df_check["name-cat_count"],
              df_check["count"].astype("int64"),
              check_names=False)
    assert_eq(df_check["name-cat_x_sum"], df_check["sum"], check_names=False)
    assert_eq(df_check["name-cat_x_min"], df_check["min"], check_names=False)
    assert_eq(df_check["name-cat_x_std"], df_check["std"], check_names=False)
def test_fit_simple():
    data = cudf.DataFrame({
        "x": [0, 1, 2, None, 0, 1, 2],
        "y": [None, 3, 4, 5, 3, 4, 5]
    })
    dataset = Dataset(data)

    workflow = Workflow(["x", "y"] >> ops.FillMedian() >> (lambda x: x * x))

    workflow.fit(dataset)
    transformed = workflow.transform(dataset).to_ddf().compute()

    expected = cudf.DataFrame({
        "x": [0, 1, 4, 1, 0, 1, 4],
        "y": [16, 9, 16, 25, 9, 16, 25]
    })
    assert_eq(expected, transformed)
def test_chaining_1():
    df = cudf.DataFrame({
        "cont01": np.random.randint(1, 100, 100),
        "cont02": np.random.random(100) * 100,
        "cat01": np.random.randint(0, 10, 100),
        "label": np.random.randint(0, 3, 100),
    })
    df["cont01"][:10] = None

    cont1 = "cont01" >> ops.FillMissing()
    conts = cont1 + "cont02" >> ops.NormalizeMinMax()
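    # Both continuous columns are min-max scaled into [0, 1]; "cat01" and "label" pass through unchanged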
    workflow = Workflow(conts + "cat01" + "label")

    result = workflow.fit_transform(Dataset(df)).to_ddf().compute()

    assert result["cont01"].max() <= 1.0
    assert result["cont02"].max() <= 1.0
def test_spec_set(tmpdir, client):
    gdf_test = cudf.DataFrame({
        "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
        "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
        "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
        "cont": [1, 2, np.nan, 2, 1, 3, 3, 1],
        "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
    })

    cats = ColumnGroup(["ad_id", "source_id", "platform"])
    cat_features = cats >> ops.Categorify
    cont_features = ColumnGroup(["cont"]) >> ops.FillMissing >> ops.Normalize
    te_features = cats >> ops.TargetEncoding(
        "clicked", kfold=5, fold_seed=42, p_smooth=20)

    p = Workflow(cat_features + cont_features + te_features, client=client)
    p.fit_transform(nvt.Dataset(gdf_test)).to_ddf().compute()