def test_gpipeline_graphviz():
    """Check that ``GraphPipeline.graphviz`` is a Digraph, fitted or not.

    The isinstance checks go through the public ``graphviz.Digraph`` alias
    rather than the ``graphviz.dot`` submodule path: newer graphviz releases
    moved the class out of ``graphviz.dot``, while the top-level alias is
    available in both old and new releases.
    """
    # Fitted pipeline: two selector branches merged into a single node.
    gpipeline = GraphPipeline(
        {
            "ColNum": ColumnsSelector(columns_to_use=["num1", "num2", "num3"]),
            "ColCat": ColumnsSelector(columns_to_use=["cat1", "cat2"]),
            "Pt": PassThrough(),
        },
        edges=[("ColNum", "Pt"), ("ColCat", "Pt")],
    )
    gpipeline.fit(dfX, y)
    assert isinstance(gpipeline.graphviz, graphviz.Digraph)

    # Same graph with the edges listed in the other order, and never fitted:
    # the graphviz representation must be available even before fit is called.
    gpipeline = GraphPipeline(
        {
            "ColNum": ColumnsSelector(columns_to_use=["num1", "num2", "num3"]),
            "ColCat": ColumnsSelector(columns_to_use=["cat1", "cat2"]),
            "Pt": PassThrough(),
        },
        edges=[("ColCat", "Pt"), ("ColNum", "Pt")],
    )
    assert isinstance(gpipeline.graphviz, graphviz.Digraph)
def test_graphpipeline_merging_node():
    """A node with two parents receives their outputs concatenated in edge order.

    The same three-node graph is built twice, once with the numerical branch
    listed first in ``edges`` and once with the categorical branch first; the
    columns seen by (and coming out of) the merging node must follow that order.
    """
    num_then_cat = ["num1", "num2", "num3", "cat1", "cat2"]
    cat_then_num = ["cat1", "cat2", "num1", "num2", "num3"]

    # --- numerical branch first ---
    models = {
        "ColNum": ColumnsSelector(columns_to_use=["num1", "num2", "num3"]),
        "ColCat": ColumnsSelector(columns_to_use=["cat1", "cat2"]),
        "Pt": DebugPassThrough(debug=True),
    }
    gpipeline = GraphPipeline(models, edges=[("ColNum", "Pt"), ("ColCat", "Pt")])
    gpipeline.fit(dfX, y)

    merge_node = gpipeline.models["Pt"]
    assert merge_node._expected_columns == num_then_cat
    assert merge_node._expected_type == DataTypes.DataFrame
    assert merge_node._expected_nbcols == 5

    transformed = gpipeline.transform(dfX)
    assert (transformed == dfX.loc[:, num_then_cat]).all().all()

    assert gpipeline.get_feature_names() == num_then_cat
    assert gpipeline.get_feature_names_at_node("Pt") == num_then_cat
    assert gpipeline.get_feature_names_at_node("ColNum") == ["num1", "num2", "num3"]
    assert gpipeline.get_feature_names_at_node("ColCat") == ["cat1", "cat2"]

    # Both selectors are fed the raw input frame.
    assert gpipeline.get_input_features_at_node("ColNum") == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("ColCat") == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("Pt") == num_then_cat

    # --- categorical branch first: concatenation in the other order ---
    models = {
        "ColNum": ColumnsSelector(columns_to_use=["num1", "num2", "num3"]),
        "ColCat": ColumnsSelector(columns_to_use=["cat1", "cat2"]),
        "Pt": DebugPassThrough(debug=True),
    }
    gpipeline = GraphPipeline(models, edges=[("ColCat", "Pt"), ("ColNum", "Pt")])
    gpipeline.fit(dfX, y)

    merge_node = gpipeline.models["Pt"]
    # Concatenation follows the order of the edges.
    assert merge_node._expected_columns == cat_then_num
    assert merge_node._expected_type == DataTypes.DataFrame
    assert merge_node._expected_nbcols == 5

    assert gpipeline.get_feature_names() == cat_then_num
    assert gpipeline.get_feature_names_at_node("Pt") == cat_then_num
    assert gpipeline.get_feature_names_at_node("ColNum") == ["num1", "num2", "num3"]
    assert gpipeline.get_feature_names_at_node("ColCat") == ["cat1", "cat2"]

    assert gpipeline.get_input_features_at_node("ColNum") == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("ColCat") == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("Pt") == cat_then_num

    transformed = gpipeline.transform(dfX)
    assert (transformed == dfX.loc[:, cat_then_num]).all().all()
def test_graphpipeline_edge_not_in_models():
    """Fitting must fail when an edge names a node missing from the models."""
    models = {
        "ColNum": ColumnsSelector(columns_to_use=["num1", "num2", "num3"]),
        "ColCat": ColumnsSelector(columns_to_use=["cat1", "cat2"]),
        "PtNum": PassThrough(),
        "PtCat": PassThrough(),
    }
    # "PtNummm" is deliberately misspelled: it is not a key of `models`.
    edges = [("ColNum", "PtNummm"), ("ColCat", "PtCat")]
    gpipeline = GraphPipeline(models, edges=edges)

    # A ValueError about the unknown node 'PtNummm' is expected at fit time.
    with pytest.raises(ValueError):
        gpipeline.fit(dfX, y)
def test_graphpipeline_more_than_one_terminal_node():
    """Fitting must fail when the graph ends in two separate terminal nodes."""
    models = {
        "ColNum": ColumnsSelector(columns_to_use=["num1", "num2", "num3"]),
        "ColCat": ColumnsSelector(columns_to_use=["cat1", "cat2"]),
        "PtNum": PassThrough(),
        "PtCat": PassThrough(),
    }
    # Two disjoint chains, so both "PtNum" and "PtCat" are terminal nodes.
    edges = [("ColNum", "PtNum"), ("ColCat", "PtCat")]
    gpipeline = GraphPipeline(models, edges=edges)

    # A ValueError is expected: the graph should have only one terminal node.
    with pytest.raises(ValueError):
        gpipeline.fit(dfX, y)
def test_graphpipeline_concat_names():
    """Column names after merging a selector branch and a text-vectorizer branch."""
    sample = get_sample_df(size=100, seed=123)

    nodes = {
        "sel": ColumnsSelector(columns_to_use=["float_col", "int_col"]),
        "vec": CountVectorizerWrapper(columns_to_use=["text_col"]),
        "pt": PassThrough(),
    }
    gpipeline = GraphPipeline(models=nodes, edges=[("sel", "pt"), ("vec", "pt")])
    gpipeline.fit(sample)
    transformed = gpipeline.transform(sample)

    # Selected columns come first, followed by the vectorizer's output columns.
    expected_columns = [
        "float_col",
        "int_col",
        "text_col__BAG__aaa",
        "text_col__BAG__bbb",
        "text_col__BAG__ccc",
        "text_col__BAG__ddd",
        "text_col__BAG__eee",
        "text_col__BAG__fff",
        "text_col__BAG__jjj",
    ]
    assert list(transformed.columns) == expected_columns

    # The names reported by the pipeline must match the transformed frame.
    assert gpipeline.get_feature_names() == list(transformed.columns)
def test_graphpipeline_get_features_names():
    """Output and input feature names at every node, for three graph shapes."""
    frame = pd.DataFrame(
        {
            "text1": ["aa bb", "bb bb cc", "dd aa cc", "ee"],
            "text2": ["AAA ZZZ", "BBB EEE", "DDD TTT", "AAA BBB CCC"],
            "num1": [0, 1, 2, 3],
            "num2": [1.1, 1.5, -2, -3.5],
            "num3": [-1, 1, 25, 4],
            "cat1": ["A", "B", "A", "D"],
            "cat2": ["toto", "tata", "truc", "toto"],
        }
    )
    # Entry nodes are expected to report every raw column as their input.
    raw_columns = ["text1", "text2", "num1", "num2", "num3", "cat1", "cat2"]

    # Test 1: a single linear chain, sel -> pt.
    model = GraphPipeline(
        {"sel": ColumnsSelector(["cat1", "cat2"]), "pt": PassThrough()},
        edges=[("sel", "pt")],
    )
    model.fit(frame)

    assert model.get_feature_names() == ["cat1", "cat2"]  # features at ending node

    assert model.get_feature_names_at_node("pt") == ["cat1", "cat2"]
    assert model.get_feature_names_at_node("sel") == ["cat1", "cat2"]

    assert model.get_input_features_at_node("pt") == ["cat1", "cat2"]
    assert model.get_input_features_at_node("sel") == raw_columns

    # Test 2: two selectors merged directly into the terminal node.
    merged_columns = ["cat1", "cat2", "num1", "num2"]
    model = GraphPipeline(
        {
            "sel1": ColumnsSelector(["cat1", "cat2"]),
            "sel2": ColumnsSelector(["num1", "num2"]),
            "pt": PassThrough(),
        },
        edges=[("sel1", "pt"), ("sel2", "pt")],
    )
    model.fit(frame)

    assert model.get_feature_names() == merged_columns

    assert model.get_feature_names_at_node("pt") == merged_columns
    assert model.get_feature_names_at_node("sel1") == ["cat1", "cat2"]
    assert model.get_feature_names_at_node("sel2") == ["num1", "num2"]

    assert model.get_input_features_at_node("pt") == merged_columns
    assert model.get_input_features_at_node("sel1") == raw_columns
    assert model.get_input_features_at_node("sel2") == raw_columns

    # Test 3: two selectors merged into an intermediate selector, then pt.
    model = GraphPipeline(
        {
            "sel1": ColumnsSelector(["cat1", "cat2"]),
            "sel2": ColumnsSelector(["num1", "num2"]),
            "sel12": ColumnsSelector(["cat1", "num1"]),
            "pt": PassThrough(),
        },
        edges=[("sel1", "sel12", "pt"), ("sel2", "sel12", "pt")],
    )
    model.fit(frame)

    assert model.get_feature_names() == ["cat1", "num1"]

    assert model.get_feature_names_at_node("pt") == ["cat1", "num1"]
    assert model.get_feature_names_at_node("sel12") == ["cat1", "num1"]
    assert model.get_feature_names_at_node("sel1") == ["cat1", "cat2"]
    assert model.get_feature_names_at_node("sel2") == ["num1", "num2"]

    assert model.get_input_features_at_node("pt") == ["cat1", "num1"]
    # The intermediate selector sees both parents' outputs, in edge order.
    assert model.get_input_features_at_node("sel12") == ["cat1", "cat2", "num1", "num2"]
    assert model.get_input_features_at_node("sel1") == raw_columns
    assert model.get_input_features_at_node("sel2") == raw_columns