def test_graphpipeline_merging_node():
    """Check what a node with two parents receives once their outputs are merged.

    Two ``ColumnsSelector`` nodes feed a single ``DebugPassThrough`` node. The
    scenario is run twice with the edges declared in opposite orders; the
    assertions expect the merged columns to follow the edge declaration order.
    """

    def fit_two_selector_graph(edges):
        # The three nodes are the same in both scenarios; only `edges` varies.
        graph = GraphPipeline(
            {
                "ColNum": ColumnsSelector(columns_to_use=["num1", "num2", "num3"]),
                "ColCat": ColumnsSelector(columns_to_use=["cat1", "cat2"]),
                "Pt": DebugPassThrough(debug=True),
            },
            edges=edges,
        )
        graph.fit(dfX, y)
        return graph

    numeric_cols = ["num1", "num2", "num3"]
    categorical_cols = ["cat1", "cat2"]

    # --- Scenario 1: the numeric selector is wired to "Pt" first ---
    num_then_cat = numeric_cols + categorical_cols
    gpipeline = fit_two_selector_graph([("ColNum", "Pt"), ("ColCat", "Pt")])

    merge_node = gpipeline.models["Pt"]
    assert merge_node._expected_columns == num_then_cat
    assert merge_node._expected_type == DataTypes.DataFrame
    assert merge_node._expected_nbcols == 5

    transformed = gpipeline.transform(dfX)
    reference = dfX.loc[:, num_then_cat]
    assert (transformed == reference).all().all()

    assert gpipeline.get_feature_names() == num_then_cat
    assert gpipeline.get_feature_names_at_node("Pt") == num_then_cat
    assert gpipeline.get_feature_names_at_node("ColNum") == numeric_cols
    assert gpipeline.get_feature_names_at_node("ColCat") == categorical_cols

    assert gpipeline.get_input_features_at_node("ColNum") == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("ColCat") == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("Pt") == num_then_cat

    # --- Scenario 2: concatenation in the other order ---
    cat_then_num = categorical_cols + numeric_cols
    gpipeline = fit_two_selector_graph([("ColCat", "Pt"), ("ColNum", "Pt")])

    merge_node = gpipeline.models["Pt"]
    # Concatenation happens in the order of the edges.
    assert merge_node._expected_columns == cat_then_num
    assert merge_node._expected_type == DataTypes.DataFrame
    assert merge_node._expected_nbcols == 5

    assert gpipeline.get_feature_names() == cat_then_num
    assert gpipeline.get_feature_names_at_node("Pt") == cat_then_num
    assert gpipeline.get_feature_names_at_node("ColNum") == numeric_cols
    assert gpipeline.get_feature_names_at_node("ColCat") == categorical_cols

    assert gpipeline.get_input_features_at_node("ColNum") == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("ColCat") == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("Pt") == cat_then_num

    transformed = gpipeline.transform(dfX)
    reference = dfX.loc[:, cat_then_num]
    assert (transformed == reference).all().all()
def test_graphpipeline_nodes_concat_order():
    """Check that merged columns follow the order in which edges are declared.

    Every node is a ``DebugPassThrough`` with its own column prefix, so the
    prefixes found in the output column names show which branch ended up on
    the left and which on the right after the merge.
    """
    cols = list(dfX.columns)

    node_prefixes = {"pt1": "PT1_", "pt2": "PT2_", "pt3": "PT3_", "pt4": "PT4_"}

    def make_nodes(names):
        # Fresh estimators for every scenario, inserted in the given order.
        return {name: DebugPassThrough(column_prefix=node_prefixes[name], debug=True) for name in names}

    three_nodes = ("pt1", "pt2", "pt3")
    four_nodes = ("pt1", "pt2", "pt3", "pt4")

    # Each entry: (node names, edges, (left-hand prefix, right-hand prefix)).
    scenarios = [
        # 1: pt1 branch declared first -> PT1 on the left, PT2 on the right
        (three_nodes, [("pt1", "pt3"), ("pt2", "pt3")], ("PT3__PT1__", "PT3__PT2__")),
        # 2: reverse order -> PT2 on the left, PT1 on the right
        (three_nodes, [("pt2", "pt3"), ("pt1", "pt3")], ("PT3__PT2__", "PT3__PT1__")),
        # 3: with 4 nodes -> PT1 on the left, PT2 on the right
        (four_nodes, [("pt1", "pt3", "pt4"), ("pt2", "pt3", "pt4")], ("PT4__PT3__PT1__", "PT4__PT3__PT2__")),
        (four_nodes, [("pt1", "pt3", "pt4"), ("pt2", "pt3")], ("PT4__PT3__PT1__", "PT4__PT3__PT2__")),
        # 4: 4 nodes, reverse order -> PT2 on the left, PT1 on the right
        (four_nodes, [("pt2", "pt3", "pt4"), ("pt1", "pt3", "pt4")], ("PT4__PT3__PT2__", "PT4__PT3__PT1__")),
        (four_nodes, [("pt2", "pt3", "pt4"), ("pt1", "pt3")], ("PT4__PT3__PT2__", "PT4__PT3__PT1__")),
    ]

    for node_names, edges, side_prefixes in scenarios:
        pipeline = GraphPipeline(make_nodes(node_names), edges=edges)
        Xres = pipeline.fit_transform(dfX)

        expected_columns = [prefix + c for prefix in side_prefixes for c in cols]
        assert list(Xres.columns) == expected_columns
        assert list(Xres.columns) == pipeline.get_feature_names()
def test_graphpipeline_concat_names():
    """Check column names when a selector and a count-vectorizer are merged.

    The selected numeric columns are expected first, followed by the
    ``text_col__BAG__*`` columns produced by the vectorizer branch, and
    ``get_feature_names`` must agree with the transformed frame's columns.
    """
    df = get_sample_df(size=100, seed=123)

    nodes = {
        "sel": ColumnsSelector(columns_to_use=["float_col", "int_col"]),
        "vec": CountVectorizerWrapper(columns_to_use=["text_col"]),
        "pt": PassThrough(),
    }
    gpipeline = GraphPipeline(models=nodes, edges=[("sel", "pt"), ("vec", "pt")])
    gpipeline.fit(df)

    df_res = gpipeline.transform(df)
    result_columns = list(df_res.columns)

    expected_columns = [
        # from the "sel" branch
        "float_col",
        "int_col",
        # from the "vec" branch
        "text_col__BAG__aaa",
        "text_col__BAG__bbb",
        "text_col__BAG__ccc",
        "text_col__BAG__ddd",
        "text_col__BAG__eee",
        "text_col__BAG__fff",
        "text_col__BAG__jjj",
    ]
    assert result_columns == expected_columns
    assert gpipeline.get_feature_names() == result_columns
def test_graphpipeline_get_features_names_with_input_features():
    """Check the feature-name getters with and without explicit ``input_features``.

    A two-node chain ``pt1 -> pt2`` of ``PassThroughtWithFeatures`` is fitted
    once on a DataFrame and once on a bare NumPy array. The assertions expect:

    * DataFrame, no ``input_features``: names derived from the column labels;
    * NumPy array, no ``input_features``: every getter returns ``None``;
    * either input, with ``input_features``: names derived from the given list.
    """
    xx = np.random.randn(10, 5)
    df = pd.DataFrame(xx, columns=["COL_%d" % j for j in range(xx.shape[1])])

    def fit_prefix_chain(data):
        # pt1 feeds pt2; each node is built with its own prefix.
        chain = GraphPipeline(
            {"pt1": PassThroughtWithFeatures(prefix="PT1"), "pt2": PassThroughtWithFeatures(prefix="PT2")},
            edges=[("pt1", "pt2")],
        )
        chain.fit(data)
        return chain

    def letters():
        # A fresh list per call, exactly as a literal at each call site would be.
        return ["a", "b", "c", "d", "e"]

    def check_with_explicit_input_features(fitted):
        after_pt1 = ["PT1__a", "PT1__b", "PT1__c", "PT1__d", "PT1__e"]
        after_pt2 = ["PT2__PT1__a", "PT2__PT1__b", "PT2__PT1__c", "PT2__PT1__d", "PT2__PT1__e"]

        assert fitted.get_feature_names(input_features=letters()) == after_pt2
        assert fitted.get_feature_names_at_node("pt2", input_features=letters()) == after_pt2
        assert fitted.get_feature_names_at_node("pt1", input_features=letters()) == after_pt1
        assert fitted.get_input_features_at_node("pt2", input_features=letters()) == after_pt1
        assert fitted.get_input_features_at_node("pt1", input_features=letters()) == ["a", "b", "c", "d", "e"]

    model = fit_prefix_chain(df)

    ### Test 1 : without input_features ###
    col_after_pt1 = ["PT1__COL_0", "PT1__COL_1", "PT1__COL_2", "PT1__COL_3", "PT1__COL_4"]
    col_after_pt2 = [
        "PT2__PT1__COL_0",
        "PT2__PT1__COL_1",
        "PT2__PT1__COL_2",
        "PT2__PT1__COL_3",
        "PT2__PT1__COL_4",
    ]
    assert model.get_feature_names() == col_after_pt2
    assert model.get_feature_names_at_node("pt2") == col_after_pt2
    assert model.get_feature_names_at_node("pt1") == col_after_pt1
    assert model.get_input_features_at_node("pt2") == col_after_pt1
    assert model.get_input_features_at_node("pt1") == ["COL_0", "COL_1", "COL_2", "COL_3", "COL_4"]

    ### Test 2 : with input features ###
    check_with_explicit_input_features(model)

    ### Test 3 : with numpy array ###
    model = fit_prefix_chain(xx)

    # Without input_features every getter is expected to return None here.
    assert model.get_feature_names() is None
    for node in ("pt2", "pt1"):
        assert model.get_feature_names_at_node(node) is None
    for node in ("pt2", "pt1"):
        assert model.get_input_features_at_node(node) is None

    # Supplying input_features gives the same names as in the DataFrame case.
    check_with_explicit_input_features(model)
def test_graphpipeline_get_features_names():
    """Check output and input feature names at every node of small graphs.

    Three graphs are fitted on the same small DataFrame: a simple chain, two
    selectors merged into a pass-through, and two selectors merged into a third
    selector followed by a pass-through.
    """
    dfX = pd.DataFrame(
        {
            "text1": ["aa bb", "bb bb cc", "dd aa cc", "ee"],
            "text2": ["AAA ZZZ", "BBB EEE", "DDD TTT", "AAA BBB CCC"],
            "num1": [0, 1, 2, 3],
            "num2": [1.1, 1.5, -2, -3.5],
            "num3": [-1, 1, 25, 4],
            "cat1": ["A", "B", "A", "D"],
            "cat2": ["toto", "tata", "truc", "toto"],
        }
    )

    every_column = ["text1", "text2", "num1", "num2", "num3", "cat1", "cat2"]
    cat_pair = ["cat1", "cat2"]
    num_pair = ["num1", "num2"]

    def check_names(fitted, final_names, outputs_by_node, inputs_by_node):
        # Order of checks: graph output, then per-node outputs, then per-node inputs.
        assert fitted.get_feature_names() == final_names
        for node, names in outputs_by_node:
            assert fitted.get_feature_names_at_node(node) == names
        for node, names in inputs_by_node:
            assert fitted.get_input_features_at_node(node) == names

    ### Test 1 ###
    model = GraphPipeline({"sel": ColumnsSelector(["cat1", "cat2"]), "pt": PassThrough()}, edges=[("sel", "pt")])
    model.fit(dfX)
    check_names(
        model,
        cat_pair,  # features at the ending node
        outputs_by_node=[("pt", cat_pair), ("sel", cat_pair)],
        inputs_by_node=[("pt", cat_pair), ("sel", every_column)],
    )

    ### Test 2 ###
    model = GraphPipeline(
        {"sel1": ColumnsSelector(["cat1", "cat2"]), "sel2": ColumnsSelector(["num1", "num2"]), "pt": PassThrough()},
        edges=[("sel1", "pt"), ("sel2", "pt")],
    )
    model.fit(dfX)
    cat_then_num = ["cat1", "cat2", "num1", "num2"]
    check_names(
        model,
        cat_then_num,
        outputs_by_node=[("pt", cat_then_num), ("sel1", cat_pair), ("sel2", num_pair)],
        inputs_by_node=[("pt", cat_then_num), ("sel1", every_column), ("sel2", every_column)],
    )

    ### Test 3 ###
    model = GraphPipeline(
        {
            "sel1": ColumnsSelector(["cat1", "cat2"]),
            "sel2": ColumnsSelector(["num1", "num2"]),
            "sel12": ColumnsSelector(["cat1", "num1"]),
            "pt": PassThrough(),
        },
        edges=[("sel1", "sel12", "pt"), ("sel2", "sel12", "pt")],
    )
    model.fit(dfX)
    first_of_each = ["cat1", "num1"]
    check_names(
        model,
        first_of_each,
        outputs_by_node=[
            ("pt", first_of_each),
            ("sel12", first_of_each),
            ("sel1", cat_pair),
            ("sel2", num_pair),
        ],
        inputs_by_node=[
            ("pt", first_of_each),
            ("sel12", ["cat1", "cat2", "num1", "num2"]),
            ("sel1", every_column),
            ("sel2", every_column),
        ],
    )