def test_fit_schema_works_with_raw_column_dependencies():
    """Target-encoding against a raw column dependency yields TE_* output names."""
    schema = Schema(["x", "y", "cost"])
    encoded = ColumnSelector(["x", "y"]) >> ops.TargetEncoding("cost")

    workflow = Workflow(encoded)
    workflow.fit_schema(schema)

    assert workflow.output_schema.column_names == ["TE_x_cost", "TE_y_cost"]
def test_fit_schema_works_when_subtracting_column_names():
    """Subtracting a column name from a node drops it from the fitted output schema."""
    schema = Schema(["x", "y", "id"])
    continuous = (
        ColumnSelector(["x", "y"])
        >> ops.FillMissing()
        >> ops.Clip(min_value=0)
        >> ops.LogOp
        >> ops.Normalize()
        >> ops.Rename(postfix="_renamed")
    )

    workflow1 = Workflow(continuous - "y_renamed")
    workflow1.fit_schema(schema)

    assert workflow1.output_schema.column_names == ["x_renamed"]
def test_fit_schema_works_with_grouped_node_inputs():
    """A tuple entry in the selector produces a combined TE column for the group."""
    schema = Schema(["x", "y", "cost"])
    encoded = ColumnSelector(["x", "y", ("x", "y")]) >> ops.TargetEncoding("cost")

    workflow1 = Workflow(encoded)
    workflow1.fit_schema(schema)

    expected = ["TE_x_cost", "TE_y_cost", "TE_x_y_cost"]
    assert sorted(workflow1.output_schema.column_names) == sorted(expected)
def test_workflow_select_by_tags(op):
    """Selecting by tag feeds exactly the tag-matched columns into ``op``.

    Tag "c" matches col1 and col2 (but not col3), so the fitted output schema
    should be the same size as applying ``op`` to those two columns directly.
    """
    schema = Schema(
        [
            ColumnSchema("col1", tags=["b", "c", "d"]),
            ColumnSchema("col2", tags=["c", "d"]),
            ColumnSchema("col3", tags=["d"]),
        ]
    )
    tagged = ColumnSelector(tags=["c"]) >> op

    workflow = Workflow(tagged)
    workflow.fit_schema(schema)

    output_cols = op.output_column_names(ColumnSelector(["col1", "col2"]))
    assert len(workflow.output_schema.column_names) == len(output_cols.names)
def test_fit_schema():
    """fit_schema on a chained continuous pipeline renames every input column."""
    schema = Schema(["x", "y", "id"])
    pipeline = (
        ColumnSelector(schema.column_names)
        >> ops.FillMissing()
        >> ops.Clip(min_value=0)
        >> ops.LogOp
        >> ops.Normalize()
        >> ops.Rename(postfix="_renamed")
    )

    workflow = Workflow(pipeline)
    workflow.fit_schema(schema)

    assert workflow.output_schema.column_names == ["x_renamed", "y_renamed", "id_renamed"]
def test_fit_schema_works_with_node_dependencies():
    """A node dependency (renamed target column) flows into the TE output names."""
    schema = Schema(["x", "y", "cost"])
    target = ColumnSelector(["cost"]) >> ops.Rename(postfix="_renamed")
    encoded = ColumnSelector(["x", "y"]) >> ops.TargetEncoding(target)

    workflow1 = Workflow(encoded)
    workflow1.fit_schema(schema)

    assert workflow1.output_schema.column_names == ["TE_x_cost_renamed", "TE_y_cost_renamed"]
def test_fit_schema_works_with_addition_nodes():
    """Adding a raw column name or another node unions the output schemas."""
    schema = Schema(["x", "y", "id"])

    # node + raw column name
    renamed_x = ColumnSelector(["x"]) >> ops.Rename(postfix="_renamed")
    workflow = Workflow(renamed_x + "y")
    workflow.fit_schema(schema)
    assert workflow.output_schema.column_names == ["x_renamed", "y"]

    # node + node
    renamed_x = ColumnSelector(["x"]) >> ops.Rename(postfix="_renamed")
    renamed_y = ColumnSelector(["y"]) >> ops.Rename(postfix="_renamed")
    workflow = Workflow(renamed_x + renamed_y)
    workflow.fit_schema(schema)
    assert workflow.output_schema.column_names == ["x_renamed", "y_renamed"]
def test_nested_workflow_node():
    """Nested workflow nodes: categorify columns on their own and as a combo cross.

    Derives 'geo_country' from the first two characters of 'geo', then runs a
    single combo-encoded Categorify over 'geo_country', 'user', and the
    ('geo_country', 'user') cross. Also checks that nesting column groups too
    deeply raises a clear ValueError instead of a confusing operator error.
    """
    df = dispatch._make_df(
        {
            "geo": ["US>CA", "US>NY", "CA>BC", "CA>ON"],
            "user": ["User_A", "User_A", "User_A", "User_B"],
        }
    )
    dataset = Dataset(df)

    geo_selector = ColumnSelector(["geo"])
    country = (
        geo_selector >> LambdaOp(lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country")
    )
    # BUG FIX: this must name the actual dataframe column ("user"); the
    # previous placeholder value could never resolve against the dataset, so
    # neither df_out["user"] nor the "geo_country_user" cross would exist.
    user = "user"

    # make sure we can do a 'combo' categorify (cross based) of country+user
    # as well as categorifying the country and user columns on their own
    cats = country + user + [country + user] >> Categorify(encode_type="combo")

    workflow = Workflow(cats)
    workflow.fit_schema(dataset.infer_schema())

    df_out = workflow.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    geo_country = df_out["geo_country"]
    assert geo_country[0] == geo_country[1]  # rows 0,1 are both 'US'
    assert geo_country[2] == geo_country[3]  # rows 2,3 are both 'CA'

    user = df_out["user"]
    assert user[0] == user[1] == user[2]
    assert user[3] != user[2]

    geo_country_user = df_out["geo_country_user"]
    assert geo_country_user[0] == geo_country_user[1]  # US / userA
    assert geo_country_user[2] != geo_country_user[0]  # same user but in canada

    # make sure we get an exception if we nest too deeply (can't handle
    # arbitrarily deep nested column groups - and the exceptions we would get
    # in operators like Categorify are super confusing for users)
    with pytest.raises(ValueError):
        cats = [[country + "user"] + country + "user"] >> Categorify(encode_type="combo")