Exemple #1
0
def test_gpipeline_regression():
    """Check a two-node ``"PT" -> "Ridge"`` GraphPipeline used as a regressor.

    Uses ``dfX`` and ``y``, which are defined outside this function.
    Verifies prediction shape, agreement with the fitted ``Ridge`` node,
    the absence of probability methods, and the feature-name accessors.
    """
    gpipeline = GraphPipeline(
        {"PT": PassThrough(), "Ridge": Ridge()},
        [("PT", "Ridge")],
    )

    X = dfX.loc[:, ["num1", "num2", "num3"]]

    gpipeline.fit(X, y)
    yhat = gpipeline.predict(X)
    # The test expects "PT" to hand X over unchanged, so calling the fitted
    # Ridge node directly on X must give exactly the pipeline's predictions.
    yhat2 = gpipeline.models["Ridge"].predict(X)

    assert yhat.shape == y.shape
    assert (yhat == yhat2).all()

    # The final node is a Ridge regressor: probability methods must not be
    # available on the pipeline.
    with pytest.raises(AttributeError):
        gpipeline.predict_proba(X)

    with pytest.raises(AttributeError):
        gpipeline.predict_log_proba(X)

    assert gpipeline.get_feature_names_at_node("PT") == list(X.columns)
    assert gpipeline.get_input_features_at_node("PT") == list(X.columns)
    assert gpipeline.get_input_features_at_node("Ridge") == list(X.columns)

    # Asking for an unknown node must raise. No `assert` on the call: only the
    # exception matters, and asserting the return value would turn a falsy
    # result into a misleading AssertionError instead of "DID NOT RAISE".
    with pytest.raises(ValueError):
        gpipeline.get_feature_names_at_node("DONTEXIST")
def test_graphpipeline_merging_node():
    """Check a node fed by two column selectors.

    The "Pt" node receives the outputs of "ColNum" and "ColCat"; the test
    expects them to be concatenated in the order in which the edges are
    declared, and checks both declaration orders.  Uses ``dfX`` and ``y``,
    which are defined outside this function.
    """
    num_cols = ["num1", "num2", "num3"]
    cat_cols = ["cat1", "cat2"]

    # ---- Edges declared numeric first, then categorical ----
    num_then_cat = num_cols + cat_cols

    nodes = {
        "ColNum": ColumnsSelector(columns_to_use=list(num_cols)),
        "ColCat": ColumnsSelector(columns_to_use=list(cat_cols)),
        "Pt": DebugPassThrough(debug=True),
    }
    gpipeline = GraphPipeline(nodes, edges=[("ColNum", "Pt"), ("ColCat", "Pt")])
    gpipeline.fit(dfX, y)

    merged = gpipeline.models["Pt"]
    assert merged._expected_columns == num_then_cat
    assert merged._expected_type == DataTypes.DataFrame
    assert merged._expected_nbcols == 5

    transformed = gpipeline.transform(dfX)
    reference = dfX.loc[:, num_then_cat]
    assert (transformed == reference).all().all()

    assert gpipeline.get_feature_names() == num_then_cat
    assert gpipeline.get_feature_names_at_node("Pt") == num_then_cat
    assert gpipeline.get_feature_names_at_node("ColNum") == num_cols
    assert gpipeline.get_feature_names_at_node("ColCat") == cat_cols

    for selector_node in ("ColNum", "ColCat"):
        assert gpipeline.get_input_features_at_node(selector_node) == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("Pt") == num_then_cat

    # ---- Same graph, edges declared in the other order ----
    cat_then_num = cat_cols + num_cols

    nodes = {
        "ColNum": ColumnsSelector(columns_to_use=list(num_cols)),
        "ColCat": ColumnsSelector(columns_to_use=list(cat_cols)),
        "Pt": DebugPassThrough(debug=True),
    }
    gpipeline = GraphPipeline(nodes, edges=[("ColCat", "Pt"), ("ColNum", "Pt")])
    gpipeline.fit(dfX, y)

    merged = gpipeline.models["Pt"]
    # Concatenation follows the order of the edges.
    assert merged._expected_columns == cat_then_num
    assert merged._expected_type == DataTypes.DataFrame
    assert merged._expected_nbcols == 5

    assert gpipeline.get_feature_names() == cat_then_num
    assert gpipeline.get_feature_names_at_node("Pt") == cat_then_num
    assert gpipeline.get_feature_names_at_node("ColNum") == num_cols
    assert gpipeline.get_feature_names_at_node("ColCat") == cat_cols

    for selector_node in ("ColNum", "ColCat"):
        assert gpipeline.get_input_features_at_node(selector_node) == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("Pt") == cat_then_num

    transformed = gpipeline.transform(dfX)
    reference = dfX.loc[:, cat_then_num]
    assert (transformed == reference).all().all()
def test_graphpipeline_get_features_names_with_input_features():
    """Check feature-name accessors on a ``"pt1" -> "pt2"`` chain.

    Each node is a ``PassThroughtWithFeatures`` with its own prefix; the test
    expects every node to prepend ``"<prefix>__"`` to the names it receives.
    Covers three situations: fitted on a DataFrame without ``input_features``,
    fitted on a DataFrame with explicit ``input_features``, and fitted on a
    numpy array (names are ``None`` unless ``input_features`` is supplied).
    """

    def prefixed(prefix, names):
        # Expected names after a node using `prefix` has seen `names`.
        return [prefix + "__" + name for name in names]

    def make_chain():
        # Fresh, unfitted two-node pipeline: pt1 -> pt2.
        return GraphPipeline(
            {"pt1": PassThroughtWithFeatures(prefix="PT1"), "pt2": PassThroughtWithFeatures(prefix="PT2")},
            edges=[("pt1", "pt2")],
        )

    def check_explicit_input_features(fitted):
        # Names must be derived from the supplied input_features; a new list
        # is passed on every call, as separate list literals would be.
        letters = ["a", "b", "c", "d", "e"]
        after_pt1 = prefixed("PT1", letters)
        after_pt2 = prefixed("PT2", after_pt1)

        assert fitted.get_feature_names(input_features=list(letters)) == after_pt2
        assert fitted.get_feature_names_at_node("pt2", input_features=list(letters)) == after_pt2
        assert fitted.get_feature_names_at_node("pt1", input_features=list(letters)) == after_pt1

        assert fitted.get_input_features_at_node("pt2", input_features=list(letters)) == after_pt1
        assert fitted.get_input_features_at_node("pt1", input_features=list(letters)) == letters

    xx = np.random.randn(10, 5)
    column_names = ["COL_%d" % j for j in range(xx.shape[1])]
    df = pd.DataFrame(xx, columns=column_names)

    model = make_chain()
    model.fit(df)

    ### Test 1 : without input_features ###
    names_after_pt1 = prefixed("PT1", column_names)
    names_after_pt2 = prefixed("PT2", names_after_pt1)

    assert model.get_feature_names() == names_after_pt2
    assert model.get_feature_names_at_node("pt2") == names_after_pt2
    assert model.get_feature_names_at_node("pt1") == names_after_pt1

    assert model.get_input_features_at_node("pt2") == names_after_pt1
    assert model.get_input_features_at_node("pt1") == column_names

    ### Test 2 : with input features ###
    check_explicit_input_features(model)

    ### Test 3 : with numpy array ###
    model = make_chain()
    model.fit(xx)

    # No column names are available from a bare array.
    assert model.get_feature_names() is None
    assert model.get_feature_names_at_node("pt2") is None
    assert model.get_feature_names_at_node("pt1") is None
    assert model.get_input_features_at_node("pt2") is None
    assert model.get_input_features_at_node("pt1") is None

    check_explicit_input_features(model)
def test_graphpipeline_get_features_names():
    """Check output and input feature names at every node of three graphs.

    Test 1: one selector followed by a pass-through.
    Test 2: two selectors merged into a pass-through.
    Test 3: two selectors merged into a third selector, then a pass-through.
    """
    all_columns = ["text1", "text2", "num1", "num2", "num3", "cat1", "cat2"]
    frame = pd.DataFrame(
        {
            "text1": ["aa bb", "bb bb cc", "dd aa cc", "ee"],
            "text2": ["AAA ZZZ", "BBB EEE", "DDD TTT", "AAA BBB CCC"],
            "num1": [0, 1, 2, 3],
            "num2": [1.1, 1.5, -2, -3.5],
            "num3": [-1, 1, 25, 4],
            "cat1": ["A", "B", "A", "D"],
            "cat2": ["toto", "tata", "truc", "toto"],
        }
    )

    cats = ["cat1", "cat2"]
    nums = ["num1", "num2"]

    ###  Test 1  ###
    model = GraphPipeline(
        {"sel": ColumnsSelector(["cat1", "cat2"]), "pt": PassThrough()},
        edges=[("sel", "pt")],
    )
    model.fit(frame)

    # Without a node argument, the names are those of the ending node.
    assert model.get_feature_names() == cats

    for node in ("pt", "sel"):
        assert model.get_feature_names_at_node(node) == cats

    assert model.get_input_features_at_node("pt") == cats
    assert model.get_input_features_at_node("sel") == all_columns

    ###  Test 2  ###
    model = GraphPipeline(
        {
            "sel1": ColumnsSelector(["cat1", "cat2"]),
            "sel2": ColumnsSelector(["num1", "num2"]),
            "pt": PassThrough(),
        },
        edges=[("sel1", "pt"), ("sel2", "pt")],
    )
    model.fit(frame)

    cats_then_nums = cats + nums

    assert model.get_feature_names() == cats_then_nums
    assert model.get_feature_names_at_node("pt") == cats_then_nums
    assert model.get_feature_names_at_node("sel1") == cats
    assert model.get_feature_names_at_node("sel2") == nums

    assert model.get_input_features_at_node("pt") == cats_then_nums
    for node in ("sel1", "sel2"):
        assert model.get_input_features_at_node(node) == all_columns

    ###  Test 3  ###
    model = GraphPipeline(
        {
            "sel1": ColumnsSelector(["cat1", "cat2"]),
            "sel2": ColumnsSelector(["num1", "num2"]),
            "sel12": ColumnsSelector(["cat1", "num1"]),
            "pt": PassThrough(),
        },
        edges=[("sel1", "sel12", "pt"), ("sel2", "sel12", "pt")],
    )
    model.fit(frame)

    picked = ["cat1", "num1"]

    assert model.get_feature_names() == picked

    for node in ("pt", "sel12"):
        assert model.get_feature_names_at_node(node) == picked
    assert model.get_feature_names_at_node("sel1") == cats
    assert model.get_feature_names_at_node("sel2") == nums

    assert model.get_input_features_at_node("pt") == picked
    assert model.get_input_features_at_node("sel12") == cats_then_nums
    for node in ("sel1", "sel2"):
        assert model.get_input_features_at_node(node) == all_columns