Example #1
0
def test_graphpipeline_merging_node():
    """Check the columns seen by a node that merges two selector branches.

    The same graph ("ColNum" and "ColCat" both feeding "Pt") is fitted twice,
    with the two incoming edges of "Pt" declared in opposite orders. Each time
    the columns reaching "Pt" are expected in the order the edges were declared.
    """
    num_cols = ["num1", "num2", "num3"]
    cat_cols = ["cat1", "cat2"]

    # --- Case 1: numerical branch declared before the categorical one ---
    num_then_cat = num_cols + cat_cols
    gpipeline = GraphPipeline(
        {
            "ColNum": ColumnsSelector(columns_to_use=list(num_cols)),
            "ColCat": ColumnsSelector(columns_to_use=list(cat_cols)),
            "Pt": DebugPassThrough(debug=True),
        },
        edges=[("ColNum", "Pt"), ("ColCat", "Pt")],
    )
    gpipeline.fit(dfX, y)

    # What the merging node recorded about its input during fit.
    merge_node = gpipeline.models["Pt"]
    assert merge_node._expected_columns == num_then_cat
    assert merge_node._expected_type == DataTypes.DataFrame
    assert merge_node._expected_nbcols == 5

    dfX_transformed = gpipeline.transform(dfX)
    expected_frame = dfX.loc[:, num_then_cat]
    assert (dfX_transformed == expected_frame).all().all()

    # Output names: whole pipeline, then node by node.
    assert gpipeline.get_feature_names() == num_then_cat
    assert gpipeline.get_feature_names_at_node("Pt") == num_then_cat
    assert gpipeline.get_feature_names_at_node("ColNum") == num_cols
    assert gpipeline.get_feature_names_at_node("ColCat") == cat_cols

    # Input names: both selectors are expected to see every column of dfX.
    for selector_name in ("ColNum", "ColCat"):
        assert gpipeline.get_input_features_at_node(selector_name) == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("Pt") == num_then_cat

    # --- Case 2: same graph, edges declared in the opposite order ---
    cat_then_num = cat_cols + num_cols
    gpipeline = GraphPipeline(
        {
            "ColNum": ColumnsSelector(columns_to_use=list(num_cols)),
            "ColCat": ColumnsSelector(columns_to_use=list(cat_cols)),
            "Pt": DebugPassThrough(debug=True),
        },
        edges=[("ColCat", "Pt"), ("ColNum", "Pt")],
    )
    gpipeline.fit(dfX, y)

    # Concatenation is expected to follow the order of the edges.
    merge_node = gpipeline.models["Pt"]
    assert merge_node._expected_columns == cat_then_num
    assert merge_node._expected_type == DataTypes.DataFrame
    assert merge_node._expected_nbcols == 5

    assert gpipeline.get_feature_names() == cat_then_num
    assert gpipeline.get_feature_names_at_node("Pt") == cat_then_num
    assert gpipeline.get_feature_names_at_node("ColNum") == num_cols
    assert gpipeline.get_feature_names_at_node("ColCat") == cat_cols

    for selector_name in ("ColNum", "ColCat"):
        assert gpipeline.get_input_features_at_node(selector_name) == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("Pt") == cat_then_num

    dfX_transformed = gpipeline.transform(dfX)
    expected_frame = dfX.loc[:, cat_then_num]
    assert (dfX_transformed == expected_frame).all().all()
Example #2
0
def test_graphpipeline_nodes_concat_order():
    """Check that branches merged into a node are concatenated in edge-declaration order.

    Every node is a ``DebugPassThrough`` whose ``column_prefix`` is derived from
    its name ("pt1" -> "PT1_"), so the output column names reveal the path each
    block of columns followed. For each case, the block coming from the branch
    whose edge is declared first is expected on the left of the result.

    Cases covered:
      * pt1 and pt2 merged into pt3, in both declaration orders;
      * the same merge followed by pt4, in both declaration orders, with the
        pt3 -> pt4 link written either in both edge tuples or only in the
        first one.
    """
    cols = list(dfX.columns)

    three_nodes = ("pt1", "pt2", "pt3")
    four_nodes = ("pt1", "pt2", "pt3", "pt4")

    # Each case: (node names, edges, prefix of the left block, prefix of the right block)
    cases = [
        # pt1 declared first: PT1 block on the left, PT2 block on the right
        (three_nodes, [("pt1", "pt3"), ("pt2", "pt3")], "PT3__PT1__", "PT3__PT2__"),
        # pt2 declared first: PT2 block on the left, PT1 block on the right
        (three_nodes, [("pt2", "pt3"), ("pt1", "pt3")], "PT3__PT2__", "PT3__PT1__"),
        # four nodes, pt1 declared first: PT1 block on the left
        (four_nodes, [("pt1", "pt3", "pt4"), ("pt2", "pt3", "pt4")], "PT4__PT3__PT1__", "PT4__PT3__PT2__"),
        (four_nodes, [("pt1", "pt3", "pt4"), ("pt2", "pt3")], "PT4__PT3__PT1__", "PT4__PT3__PT2__"),
        # four nodes, pt2 declared first: PT2 block on the left
        (four_nodes, [("pt2", "pt3", "pt4"), ("pt1", "pt3", "pt4")], "PT4__PT3__PT2__", "PT4__PT3__PT1__"),
        (four_nodes, [("pt2", "pt3", "pt4"), ("pt1", "pt3")], "PT4__PT3__PT2__", "PT4__PT3__PT1__"),
    ]

    for node_names, edges, left_prefix, right_prefix in cases:
        # Fresh model instances for every case so that no fitted state is shared.
        pipeline = GraphPipeline(
            {name: DebugPassThrough(column_prefix=name.upper() + "_", debug=True) for name in node_names},
            edges=edges,
        )

        Xres = pipeline.fit_transform(dfX)

        expected_columns = [left_prefix + c for c in cols] + [right_prefix + c for c in cols]
        assert list(Xres.columns) == expected_columns, "unexpected column order for edges %r" % (edges,)
        assert list(Xres.columns) == pipeline.get_feature_names()
Example #3
0
def test_graphpipeline_concat_names():
    """Check the column names produced when a selector and a vectorizer are merged.

    "sel" keeps two columns unchanged and "vec" vectorizes the text column;
    both feed "pt". The resulting frame is expected to hold the selected
    columns first, then the "text_col__BAG__<token>" columns, and
    ``get_feature_names`` is expected to report the same names.
    """
    df = get_sample_df(size=100, seed=123)

    models = {
        "sel": ColumnsSelector(columns_to_use=["float_col", "int_col"]),
        "vec": CountVectorizerWrapper(columns_to_use=["text_col"]),
        "pt": PassThrough(),
    }
    gpipeline = GraphPipeline(models=models, edges=[("sel", "pt"), ("vec", "pt")])

    gpipeline.fit(df)
    df_res = gpipeline.transform(df)

    # Selected columns first, then one bag-of-words column per token.
    bag_tokens = ("aaa", "bbb", "ccc", "ddd", "eee", "fff", "jjj")
    expected_columns = ["float_col", "int_col"] + ["text_col__BAG__" + token for token in bag_tokens]

    output_columns = list(df_res.columns)
    assert output_columns == expected_columns

    assert gpipeline.get_feature_names() == output_columns
Example #4
0
def test_graphpipeline_get_features_names_with_input_features():
    """Check feature-name getters of a two-node chain, with and without ``input_features``.

    The chain is pt1 -> pt2, both ``PassThroughtWithFeatures`` nodes. The
    assertions expect each node to prepend "<prefix>__" to the names it
    receives. Three situations are covered: fitted on a DataFrame without
    ``input_features``, fitted on a DataFrame with explicit ``input_features``,
    and fitted on a numpy array (getters return None unless ``input_features``
    is supplied).
    """
    xx = np.random.randn(10, 5)
    column_names = ["COL_%d" % j for j in range(xx.shape[1])]
    df = pd.DataFrame(xx, columns=column_names)

    custom_names = ["a", "b", "c", "d", "e"]

    def with_prefix(prefix, names):
        # Expected names once ``prefix`` has been prepended to each input name.
        return [prefix + name for name in names]

    def build_model():
        # New, unfitted pt1 -> pt2 chain.
        return GraphPipeline(
            {"pt1": PassThroughtWithFeatures(prefix="PT1"), "pt2": PassThroughtWithFeatures(prefix="PT2")},
            edges=[("pt1", "pt2")],
        )

    def check_names_from_custom_input(fitted):
        # Every getter, called with explicit input feature names.
        after_pt1 = with_prefix("PT1__", custom_names)
        after_pt2 = with_prefix("PT2__PT1__", custom_names)

        assert fitted.get_feature_names(input_features=list(custom_names)) == after_pt2
        assert fitted.get_feature_names_at_node("pt2", input_features=list(custom_names)) == after_pt2
        assert fitted.get_feature_names_at_node("pt1", input_features=list(custom_names)) == after_pt1

        assert fitted.get_input_features_at_node("pt2", input_features=list(custom_names)) == after_pt1
        assert fitted.get_input_features_at_node("pt1", input_features=list(custom_names)) == custom_names

    model = build_model()
    model.fit(df)

    ### Test 1 : without input_features ###
    after_pt1 = with_prefix("PT1__", column_names)
    after_pt2 = with_prefix("PT2__PT1__", column_names)

    assert model.get_feature_names() == after_pt2
    assert model.get_feature_names_at_node("pt2") == after_pt2
    assert model.get_feature_names_at_node("pt1") == after_pt1

    assert model.get_input_features_at_node("pt2") == after_pt1
    assert model.get_input_features_at_node("pt1") == ["COL_0", "COL_1", "COL_2", "COL_3", "COL_4"]

    ### Test 2 : with input features ###
    check_names_from_custom_input(model)

    ### Test 3 : with numpy array ###
    model = build_model()
    model.fit(xx)

    # No column names are available from a numpy array: every getter returns None.
    assert model.get_feature_names() is None
    for node_name in ("pt2", "pt1"):
        assert model.get_feature_names_at_node(node_name) is None
    for node_name in ("pt2", "pt1"):
        assert model.get_input_features_at_node(node_name) is None

    # With explicit input names the results match the DataFrame case.
    check_names_from_custom_input(model)
Example #5
0
def test_graphpipeline_get_features_names():
    """Check output and input feature names at every node of three small graphs.

    Graphs covered: a single selector feeding a pass-through, two selectors
    merged into a pass-through, and two selectors merged into a third selector
    that feeds a pass-through.
    """
    dfX = pd.DataFrame(
        {
            "text1": ["aa bb", "bb bb cc", "dd aa cc", "ee"],
            "text2": ["AAA ZZZ", "BBB EEE", "DDD TTT", "AAA BBB CCC"],
            "num1": [0, 1, 2, 3],
            "num2": [1.1, 1.5, -2, -3.5],
            "num3": [-1, 1, 25, 4],
            "cat1": ["A", "B", "A", "D"],
            "cat2": ["toto", "tata", "truc", "toto"],
        }
    )

    # Names expected as input of every entry node (all columns of dfX).
    all_columns = ["text1", "text2", "num1", "num2", "num3", "cat1", "cat2"]
    cat_pair = ["cat1", "cat2"]
    num_pair = ["num1", "num2"]

    ###  Test 1 : one selector, then a pass-through  ###
    model = GraphPipeline({"sel": ColumnsSelector(["cat1", "cat2"]), "pt": PassThrough()}, edges=[("sel", "pt")])
    model.fit(dfX)

    # Features of the whole pipeline are those at its ending node.
    assert model.get_feature_names() == cat_pair

    for node_name in ("pt", "sel"):
        assert model.get_feature_names_at_node(node_name) == cat_pair

    assert model.get_input_features_at_node("pt") == cat_pair
    assert model.get_input_features_at_node("sel") == all_columns

    ###  Test 2 : two selectors merged into a pass-through  ###
    model = GraphPipeline(
        {"sel1": ColumnsSelector(["cat1", "cat2"]), "sel2": ColumnsSelector(["num1", "num2"]), "pt": PassThrough()},
        edges=[("sel1", "pt"), ("sel2", "pt")],
    )
    model.fit(dfX)

    merged = cat_pair + num_pair
    assert model.get_feature_names() == merged
    assert model.get_feature_names_at_node("pt") == merged
    assert model.get_feature_names_at_node("sel1") == cat_pair
    assert model.get_feature_names_at_node("sel2") == num_pair

    assert model.get_input_features_at_node("pt") == merged
    for node_name in ("sel1", "sel2"):
        assert model.get_input_features_at_node(node_name) == all_columns

    ###  Test 3 : two selectors merged into a third selector, then a pass-through  ###
    model = GraphPipeline(
        {
            "sel1": ColumnsSelector(["cat1", "cat2"]),
            "sel2": ColumnsSelector(["num1", "num2"]),
            "sel12": ColumnsSelector(["cat1", "num1"]),
            "pt": PassThrough(),
        },
        edges=[("sel1", "sel12", "pt"), ("sel2", "sel12", "pt")],
    )
    model.fit(dfX)

    kept = ["cat1", "num1"]
    assert model.get_feature_names() == kept

    for node_name in ("pt", "sel12"):
        assert model.get_feature_names_at_node(node_name) == kept
    assert model.get_feature_names_at_node("sel1") == cat_pair
    assert model.get_feature_names_at_node("sel2") == num_pair

    assert model.get_input_features_at_node("pt") == kept
    assert model.get_input_features_at_node("sel12") == cat_pair + num_pair
    for node_name in ("sel1", "sel2"):
        assert model.get_input_features_at_node(node_name) == all_columns