def test_gpipeline_regression():
    """Check a two-node ``PT -> Ridge`` GraphPipeline used as a regressor.

    The pipeline is fitted on the ``num1``/``num2``/``num3`` columns of the
    module-level ``dfX`` with target ``y``.  The test then verifies that:

    * predictions have the same shape as ``y`` and are equal to what the
      ``"Ridge"`` model stored in ``gpipeline.models`` predicts on ``X``;
    * ``predict_proba`` and ``predict_log_proba`` raise ``AttributeError``;
    * output / input feature names at the nodes are the columns of ``X``;
    * asking for the features of an unknown node raises ``ValueError``.
    """
    gpipeline = GraphPipeline({"PT": PassThrough(), "Ridge": Ridge()}, [("PT", "Ridge")])

    X = dfX.loc[:, ["num1", "num2", "num3"]]

    gpipeline.fit(X, y)
    yhat = gpipeline.predict(X)
    # Reference prediction: the "Ridge" node applied directly to the same X.
    yhat2 = gpipeline.models["Ridge"].predict(X)

    assert yhat.shape == y.shape
    assert (yhat == yhat2).all()

    # Probability methods must not be exposed by this pipeline.
    with pytest.raises(AttributeError):
        gpipeline.predict_proba(X)

    with pytest.raises(AttributeError):
        gpipeline.predict_log_proba(X)

    assert gpipeline.get_feature_names_at_node("PT") == list(X.columns)
    assert gpipeline.get_input_features_at_node("PT") == list(X.columns)
    assert gpipeline.get_input_features_at_node("Ridge") == list(X.columns)

    with pytest.raises(ValueError):
        # No ``assert`` around the call: the call itself is expected to raise,
        # and a stray ``assert`` would turn a "did not raise" failure into a
        # confusing AssertionError whenever the returned value is falsy.
        gpipeline.get_feature_names_at_node("DONTEXIST")


def test_graphpipeline_merging_node():
    """Check a node that receives the outputs of two ``ColumnsSelector`` nodes.

    Two selectors (numerical and categorical columns) both feed a
    ``DebugPassThrough`` node named ``"Pt"``.  The test is run for both
    orderings of the edges and asserts that the columns seen by ``"Pt"``
    (and the feature names reported by the pipeline) follow the order in
    which the edges were declared.
    """
    num_cols = ["num1", "num2", "num3"]
    cat_cols = ["cat1", "cat2"]

    # ------------------------------------------------------------------
    # Edges declared numerical first, categorical second.
    # ------------------------------------------------------------------
    num_then_cat = num_cols + cat_cols

    nodes = {
        "ColNum": ColumnsSelector(columns_to_use=list(num_cols)),
        "ColCat": ColumnsSelector(columns_to_use=list(cat_cols)),
        "Pt": DebugPassThrough(debug=True),
    }
    gpipeline = GraphPipeline(nodes, edges=[("ColNum", "Pt"), ("ColCat", "Pt")])
    gpipeline.fit(dfX, y)

    merged_node = gpipeline.models["Pt"]
    assert merged_node._expected_columns == num_then_cat
    assert merged_node._expected_type == DataTypes.DataFrame
    assert merged_node._expected_nbcols == 5

    transformed = gpipeline.transform(dfX)
    assert (transformed == dfX.loc[:, num_then_cat]).all().all()

    assert gpipeline.get_feature_names() == num_then_cat
    assert gpipeline.get_feature_names_at_node("Pt") == num_then_cat
    assert gpipeline.get_feature_names_at_node("ColNum") == num_cols
    assert gpipeline.get_feature_names_at_node("ColCat") == cat_cols

    assert gpipeline.get_input_features_at_node("ColNum") == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("ColCat") == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("Pt") == num_then_cat

    # ------------------------------------------------------------------
    # Same graph, edges declared in the other order: categorical first.
    # ------------------------------------------------------------------
    cat_then_num = cat_cols + num_cols

    nodes = {
        "ColNum": ColumnsSelector(columns_to_use=list(num_cols)),
        "ColCat": ColumnsSelector(columns_to_use=list(cat_cols)),
        "Pt": DebugPassThrough(debug=True),
    }
    gpipeline = GraphPipeline(nodes, edges=[("ColCat", "Pt"), ("ColNum", "Pt")])
    gpipeline.fit(dfX, y)

    merged_node = gpipeline.models["Pt"]
    # The concatenation follows the order of the edges.
    assert merged_node._expected_columns == cat_then_num
    assert merged_node._expected_type == DataTypes.DataFrame
    assert merged_node._expected_nbcols == 5

    assert gpipeline.get_feature_names() == cat_then_num
    assert gpipeline.get_feature_names_at_node("Pt") == cat_then_num
    assert gpipeline.get_feature_names_at_node("ColNum") == num_cols
    assert gpipeline.get_feature_names_at_node("ColCat") == cat_cols

    assert gpipeline.get_input_features_at_node("ColNum") == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("ColCat") == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("Pt") == cat_then_num

    transformed = gpipeline.transform(dfX)
    assert (transformed == dfX.loc[:, cat_then_num]).all().all()


def test_graphpipeline_get_features_names_with_input_features():
    """Check feature-name propagation with and without explicit ``input_features``.

    A chain ``pt1 -> pt2`` of ``PassThroughtWithFeatures`` nodes (prefixes
    ``"PT1"`` and ``"PT2"``) is fitted first on a DataFrame, then on a bare
    numpy array.  The assertions cover:

    1. names derived from the DataFrame columns when nothing is passed;
    2. names derived from a caller-supplied ``input_features`` list;
    3. the numpy case, where names are ``None`` unless ``input_features``
       is supplied.
    """

    def build_chain():
        # Fresh, unfitted ``pt1 -> pt2`` pipeline.
        return GraphPipeline(
            {"pt1": PassThroughtWithFeatures(prefix="PT1"), "pt2": PassThroughtWithFeatures(prefix="PT2")},
            edges=[("pt1", "pt2")],
        )

    def assert_names_from_custom_input(pipeline):
        # Expected names when the caller provides ["a", ..., "e"] as input features.
        custom = ["a", "b", "c", "d", "e"]
        after_pt1 = [f"PT1__{name}" for name in custom]
        after_pt2 = [f"PT2__{name}" for name in after_pt1]

        assert pipeline.get_feature_names(input_features=list(custom)) == after_pt2
        assert pipeline.get_feature_names_at_node("pt2", input_features=list(custom)) == after_pt2
        assert pipeline.get_feature_names_at_node("pt1", input_features=list(custom)) == after_pt1
        assert pipeline.get_input_features_at_node("pt2", input_features=list(custom)) == after_pt1
        assert pipeline.get_input_features_at_node("pt1", input_features=list(custom)) == custom

    data = np.random.randn(10, 5)
    frame = pd.DataFrame(data, columns=[f"COL_{j}" for j in range(data.shape[1])])

    model = build_chain()
    model.fit(frame)

    # --- Test 1: without input_features ---
    raw_cols = ["COL_0", "COL_1", "COL_2", "COL_3", "COL_4"]
    pt1_cols = [f"PT1__{col}" for col in raw_cols]
    pt2_cols = [f"PT2__{col}" for col in pt1_cols]

    assert model.get_feature_names() == pt2_cols
    assert model.get_feature_names_at_node("pt2") == pt2_cols
    assert model.get_feature_names_at_node("pt1") == pt1_cols
    assert model.get_input_features_at_node("pt2") == pt1_cols
    assert model.get_input_features_at_node("pt1") == raw_cols

    # --- Test 2: with input_features ---
    assert_names_from_custom_input(model)

    # --- Test 3: fitted on a numpy array ---
    model = build_chain()
    model.fit(data)

    # No column names are available, so every accessor returns None ...
    assert model.get_feature_names() is None
    assert model.get_feature_names_at_node("pt2") is None
    assert model.get_feature_names_at_node("pt1") is None
    assert model.get_input_features_at_node("pt2") is None
    assert model.get_input_features_at_node("pt1") is None

    # ... unless the caller supplies the input feature names explicitly.
    assert_names_from_custom_input(model)


def test_graphpipeline_get_features_names():
    """Check output / input feature names at every node of three small graphs.

    All graphs are fitted on a local 4-row DataFrame mixing text, numerical
    and categorical columns:

    1. a single selector followed by a pass-through;
    2. two selectors merged into a pass-through;
    3. two selectors merged into a third selector, itself followed by a
       pass-through.
    """
    frame = pd.DataFrame(
        {
            "text1": ["aa bb", "bb bb cc", "dd aa cc", "ee"],
            "text2": ["AAA ZZZ", "BBB EEE", "DDD TTT", "AAA BBB CCC"],
            "num1": [0, 1, 2, 3],
            "num2": [1.1, 1.5, -2, -3.5],
            "num3": [-1, 1, 25, 4],
            "cat1": ["A", "B", "A", "D"],
            "cat2": ["toto", "tata", "truc", "toto"],
        }
    )
    # Columns of the frame above, in declaration order.
    every_col = ["text1", "text2", "num1", "num2", "num3", "cat1", "cat2"]

    # --- Test 1: sel -> pt ---
    model = GraphPipeline(
        {"sel": ColumnsSelector(["cat1", "cat2"]), "pt": PassThrough()},
        edges=[("sel", "pt")],
    )
    model.fit(frame)

    cats = ["cat1", "cat2"]
    assert model.get_feature_names() == cats  # features at the ending node

    assert model.get_feature_names_at_node("pt") == cats
    assert model.get_feature_names_at_node("sel") == cats

    assert model.get_input_features_at_node("pt") == cats
    assert model.get_input_features_at_node("sel") == every_col

    # --- Test 2: (sel1, sel2) -> pt ---
    model = GraphPipeline(
        {
            "sel1": ColumnsSelector(["cat1", "cat2"]),
            "sel2": ColumnsSelector(["num1", "num2"]),
            "pt": PassThrough(),
        },
        edges=[("sel1", "pt"), ("sel2", "pt")],
    )
    model.fit(frame)

    nums = ["num1", "num2"]
    cats_then_nums = cats + nums

    assert model.get_feature_names() == cats_then_nums
    assert model.get_feature_names_at_node("pt") == cats_then_nums
    assert model.get_feature_names_at_node("sel1") == cats
    assert model.get_feature_names_at_node("sel2") == nums

    assert model.get_input_features_at_node("pt") == cats_then_nums
    assert model.get_input_features_at_node("sel1") == every_col
    assert model.get_input_features_at_node("sel2") == every_col

    # --- Test 3: (sel1, sel2) -> sel12 -> pt, edges given as 3-node chains ---
    model = GraphPipeline(
        {
            "sel1": ColumnsSelector(["cat1", "cat2"]),
            "sel2": ColumnsSelector(["num1", "num2"]),
            "sel12": ColumnsSelector(["cat1", "num1"]),
            "pt": PassThrough(),
        },
        edges=[("sel1", "sel12", "pt"), ("sel2", "sel12", "pt")],
    )
    model.fit(frame)

    picked = ["cat1", "num1"]

    assert model.get_feature_names() == picked

    assert model.get_feature_names_at_node("pt") == picked
    assert model.get_feature_names_at_node("sel12") == picked
    assert model.get_feature_names_at_node("sel1") == cats
    assert model.get_feature_names_at_node("sel2") == nums

    assert model.get_input_features_at_node("pt") == picked
    assert model.get_input_features_at_node("sel12") == cats_then_nums
    assert model.get_input_features_at_node("sel1") == every_col
    assert model.get_input_features_at_node("sel2") == every_col