コード例 #1
0
def test_gpipeline_graphviz():
    """The ``graphviz`` attribute is a Digraph both after fitting and before it."""

    def make_models():
        # Build fresh, unfitted steps for each pipeline under test.
        return {
            "ColNum": ColumnsSelector(columns_to_use=["num1", "num2", "num3"]),
            "ColCat": ColumnsSelector(columns_to_use=["cat1", "cat2"]),
            "Pt": PassThrough(),
        }

    fitted = GraphPipeline(make_models(), edges=[("ColNum", "Pt"), ("ColCat", "Pt")])
    fitted.fit(dfX, y)
    assert isinstance(fitted.graphviz, graphviz.dot.Digraph)

    # Same nodes with the edges listed in the opposite order, and never fitted.
    unfitted = GraphPipeline(make_models(), edges=[("ColCat", "Pt"), ("ColNum", "Pt")])

    # graphviz even before fit is called
    assert isinstance(unfitted.graphviz, graphviz.dot.Digraph)
コード例 #2
0
ファイル: test_pipeline.py プロジェクト: fabien-vavrand/aikit
def test_graphpipeline_cycle():
    """Fitting a pipeline whose edges contain a cycle raises ``ValueError``."""
    models = {name: PassThrough() for name in ("A", "B", "C", "D")}
    # ("C", "A") closes the loop opened by the ("A", "B", "C") chain.
    cyclic_edges = [("A", "B", "C"), ("C", "A"), ("C", "D")]
    gpipeline = GraphPipeline(models, edges=cyclic_edges)

    # Expected error: ValueError: The graph shouldn't have any cycle
    with pytest.raises(ValueError):
        gpipeline.fit(X, y)
コード例 #3
0
ファイル: test_pipeline.py プロジェクト: fabien-vavrand/aikit
def test_graphpipeline_set_params():
    """``set_params`` with a ``<node>__<param>`` key updates that node's model."""
    models = {
        "A": PassThrough(),
        "B": PassThrough(),
        "C": DebugPassThrough(debug=True),
    }
    gpipeline = GraphPipeline(models, edges=[("A", "B", "C")])

    # The flag starts as given to the constructor...
    assert gpipeline.models["C"].debug is True

    # ...and is flipped through the pipeline-level parameter name.
    gpipeline.set_params(C__debug=False)
    assert gpipeline.models["C"].debug is False
コード例 #4
0
def test_graphpipeline_no_terminal_node():
    """Fitting a pipeline in which no node is terminal raises ``ValueError``."""
    models = {name: PassThrough() for name in ("A", "B", "C")}
    gpipeline = GraphPipeline(models, edges=[("A", "B", "C"), ("C", "A")])

    # Expected error:
    # ValueError: the graph should have only one terminal node, instead i got 0
    with pytest.raises(ValueError):
        gpipeline.fit(X, y)
コード例 #5
0
ファイル: test_pipeline.py プロジェクト: fabien-vavrand/aikit
def test_graphpipeline_edge_not_in_models():
    """An edge naming a node that is missing from the models makes ``fit`` fail."""
    models = {
        "ColNum": ColumnsSelector(columns_to_use=["num1", "num2", "num3"]),
        "ColCat": ColumnsSelector(columns_to_use=["cat1", "cat2"]),
        "PtNum": PassThrough(),
        "PtCat": PassThrough(),
    }
    # "PtNummm" does not match any key of ``models`` (the key is "PtNum").
    edges = [("ColNum", "PtNummm"), ("ColCat", "PtCat")]
    gpipeline = GraphPipeline(models, edges=edges)

    # Expected error: ValueError "the node 'PtNummm' isn't in the dictionnary of models"
    with pytest.raises(ValueError):
        gpipeline.fit(dfX, y)
コード例 #6
0
ファイル: test_pipeline.py プロジェクト: fabien-vavrand/aikit
def test_graphpipeline_more_than_one_terminal_node():
    """Two disconnected branches leave two terminal nodes, so ``fit`` must fail."""
    models = {
        "ColNum": ColumnsSelector(columns_to_use=["num1", "num2", "num3"]),
        "ColCat": ColumnsSelector(columns_to_use=["cat1", "cat2"]),
        "PtNum": PassThrough(),
        "PtCat": PassThrough(),
    }
    # Each selector feeds its own pass-through; the branches never merge.
    edges = [("ColNum", "PtNum"), ("ColCat", "PtCat")]
    gpipeline = GraphPipeline(models, edges=edges)

    # Expected error:
    # ValueError the graph should have only one terminal node, instead i got 2
    with pytest.raises(ValueError):
        gpipeline.fit(dfX, y)
コード例 #7
0
def test_gpipeline_regression():
    """Check a PassThrough -> Ridge pipeline used as a regressor.

    Verifies that:
    * predictions match those of the fitted "Ridge" model called directly,
    * the probability methods raise ``AttributeError``,
    * feature names are reported at each node,
    * asking for the features of an unknown node raises ``ValueError``.
    """
    gpipeline = GraphPipeline({
        "PT": PassThrough(),
        "Ridge": Ridge()
    }, [("PT", "Ridge")])

    # Local X (numerical columns only) shadows any module-level X on purpose.
    X = dfX.loc[:, ["num1", "num2", "num3"]]

    gpipeline.fit(X, y)
    yhat = gpipeline.predict(X)
    yhat2 = gpipeline.models["Ridge"].predict(X)

    assert yhat.shape == y.shape
    assert (yhat == yhat2).all()

    with pytest.raises(AttributeError):
        gpipeline.predict_proba(X)

    with pytest.raises(AttributeError):
        gpipeline.predict_log_proba(X)

    assert gpipeline.get_feature_names_at_node("PT") == list(X.columns)
    assert gpipeline.get_input_features_at_node("PT") == list(X.columns)
    assert gpipeline.get_input_features_at_node("Ridge") == list(X.columns)

    # Call the method directly: the call itself must raise, so wrapping it in
    # ``assert`` adds nothing, and assert statements can be compiled out (-O),
    # which would skip the call entirely.
    with pytest.raises(ValueError):
        gpipeline.get_feature_names_at_node("DONTEXIST")
コード例 #8
0
ファイル: test_pipeline.py プロジェクト: fabien-vavrand/aikit
def test_graphpipeline_concat_names():
    """Column names of two merged branches are concatenated in the output."""
    df = get_sample_df(size=100, seed=123)

    models = {
        "sel": ColumnsSelector(columns_to_use=["float_col", "int_col"]),
        "vec": CountVectorizerWrapper(columns_to_use=["text_col"]),
        "pt": PassThrough(),
    }
    gpipeline = GraphPipeline(models=models, edges=[("sel", "pt"), ("vec", "pt")])

    gpipeline.fit(df)
    df_res = gpipeline.transform(df)
    result_columns = list(df_res.columns)

    # Selected columns first, then one bag-of-words column per token.
    tokens = ("aaa", "bbb", "ccc", "ddd", "eee", "fff", "jjj")
    expected_columns = ["float_col", "int_col"]
    expected_columns += ["text_col__BAG__" + token for token in tokens]

    assert result_columns == expected_columns
    assert gpipeline.get_feature_names() == result_columns
コード例 #9
0
def test_gpipeline_raise_not_fitted():
    """Calling ``predict`` on a pipeline that was never fitted raises NotFittedError."""
    steps = {"PT": PassThrough(), "Ridge": Ridge()}
    gpipeline = GraphPipeline(steps, [("PT", "Ridge")])

    with pytest.raises(NotFittedError):
        gpipeline.predict(X)
コード例 #10
0
ファイル: test_pipeline.py プロジェクト: fabien-vavrand/aikit
def test_gpipeline_clustering():
    """Pipeline predictions equal those of its fitted "kmeans" node."""
    steps = {"PT": PassThrough(), "kmeans": KMeans(n_clusters=2)}
    gpipeline = GraphPipeline(steps, [("PT", "kmeans")])
    gpipeline.fit(X)

    pipeline_labels = gpipeline.predict(X)
    node_labels = gpipeline.models["kmeans"].predict(X)

    assert (pipeline_labels == node_labels).all()
コード例 #11
0
ファイル: test_pipeline.py プロジェクト: fabien-vavrand/aikit
def test_gpipeline_classification():
    """Pipeline probabilities and classes match its fitted "Logit" node."""
    steps = {"PT": PassThrough(), "Logit": LogisticRegression()}
    gpipeline = GraphPipeline(steps, [("PT", "Logit")])
    gpipeline.fit(X, yc)

    pipeline_proba = gpipeline.predict_proba(X)
    node_proba = gpipeline.models["Logit"].predict_proba(X)

    # One row per sample, one column per class.
    n_samples = X.shape[0]
    assert pipeline_proba.shape == (n_samples, 2)
    assert (pipeline_proba == node_proba).all()
    assert list(gpipeline.classes_) == [0, 1]
コード例 #12
0
ファイル: test_pipeline.py プロジェクト: fabien-vavrand/aikit
def test_gpipeline_clone():
    """``clone`` returns an unfitted pipeline holding distinct model objects."""
    steps = {"PT": PassThrough(), "Ridge": Ridge()}
    gpipeline = GraphPipeline(steps, [("PT", "Ridge")])
    gpipeline.fit(X, y)

    cloned_gpipeline = clone(gpipeline)

    # The clone must not carry over the fitted state.
    with pytest.raises(NotFittedError):
        cloned_gpipeline.predict(X)

    # Same node names on both sides, but never the same model object.
    for name, model in gpipeline.models.items():
        assert name in cloned_gpipeline.models
        assert cloned_gpipeline.models[name] is not model
コード例 #13
0
ファイル: test_pipeline.py プロジェクト: fabien-vavrand/aikit
def test_GraphPipeline_from_sklearn():
    """``GraphPipeline.from_sklearn`` predicts like the source sklearn Pipeline.

    Two conversions are checked: from a Pipeline that is not yet fitted, and
    from the same Pipeline once it has been fitted.
    """
    np.random.seed(123)
    X = np.random.randn(100, 10)
    y = 1 * (np.random.randn(100) > 0)

    steps = [
        ("pt", PassThrough()),
        ("dt", DecisionTreeClassifier(random_state=123)),
    ]
    sk_pipeline = Pipeline(steps=steps)

    # Case 1: conversion from a non fitted sklearn Pipeline
    gpipeline = GraphPipeline.from_sklearn(sk_pipeline)

    assert isinstance(gpipeline, GraphPipeline)
    assert not gpipeline._already_fitted

    gpipeline.fit(X, y)
    graph_pred = gpipeline.predict(X)
    graph_proba = gpipeline.predict_proba(X)

    # The sklearn pipeline is fitted here, after the conversion.
    sk_pred = sk_pipeline.fit(X, y).predict(X)
    sk_proba = sk_pipeline.predict_proba(X)

    assert (graph_pred == sk_pred).all()
    assert (graph_proba == sk_proba).all()

    # Case 2: conversion from an already fitted pipeline; no call to fit here
    gpipeline = GraphPipeline.from_sklearn(sk_pipeline)
    graph_pred = gpipeline.predict(X)
    graph_proba = gpipeline.predict_proba(X)

    sk_pred = sk_pipeline.predict(X)
    sk_proba = sk_pipeline.predict_proba(X)

    assert (graph_pred == sk_pred).all()
    assert (graph_proba == sk_proba).all()
コード例 #14
0
ファイル: test_pipeline.py プロジェクト: fabien-vavrand/aikit
def test_graphpipeline_other_input_syntaxes():
    """Every accepted way of declaring the graph yields the same nodes and edges."""

    def passthroughs(names):
        # One fresh PassThrough per node name.
        return {name: PassThrough() for name in names}

    def check_graph(pipeline, nodes, edges):
        # Complete the initialisation, then compare the resulting graph.
        pipeline._complete_init()
        assert set(pipeline.complete_graph.nodes) == nodes
        assert set(pipeline.complete_graph.edges) == edges

    chain_nodes = {"A", "B", "C"}
    chain_edges = {("A", "B"), ("B", "C")}

    # regular syntax: dictionary of models plus a list of edges
    regular = GraphPipeline(passthroughs(("A", "B", "C")), edges=[("A", "B", "C")])
    check_graph(regular, chain_nodes, chain_edges)

    # pipeline syntax: ordered list of (name, model) pairs, no edges given
    sequential = GraphPipeline([(name, PassThrough()) for name in ("A", "B", "C")])
    check_graph(sequential, chain_nodes, chain_edges)

    # with a merge: the same graph written in three different ways
    merge_nodes = {"A", "B", "C", "D"}
    merge_edges = {("A", "B"), ("B", "D"), ("C", "D")}
    edge_specs = (
        [("A", "B", "D"), ("C", "D")],
        [("A", "B"), ("B", "D"), ("C", "D")],
        "A - B - D ; C - D",
    )
    for spec in edge_specs:
        merged = GraphPipeline(passthroughs(("A", "B", "C", "D")), edges=spec)
        check_graph(merged, merge_nodes, merge_edges)
コード例 #15
0
def test_PassThrough():
    """``PassThrough`` returns its input object unchanged, for frames and arrays."""
    # --- DataFrame input ---
    frame = get_sample_df(100, seed=123)
    pt = PassThrough()
    pt.fit(frame)

    frame_out = pt.transform(frame)

    assert frame_out.shape == frame.shape
    assert (frame_out == frame).all().all()
    assert frame is frame_out  # the very same object comes back

    assert pt.get_feature_names() == list(frame.columns)

    # Fitted on a DataFrame: a raw array is rejected at transform time...
    with pytest.raises(ValueError):
        pt.transform(frame.values)

    # ...and so is a DataFrame restricted to its first two columns.
    with pytest.raises(ValueError):
        pt.transform(frame.iloc[:, [0, 1]])

    # --- numpy array input ---
    array = np.random.randn(20, 5)
    input_features = ["COL_%d" % i for i in range(5)]
    pt = PassThrough()
    pt.fit(array)

    array_out = pt.transform(array)

    assert array.shape == array_out.shape  # same shape
    assert (array == array_out).all()  # same value
    assert array is array_out  # no copy

    # Without names, the features are the column positions.
    assert pt.get_feature_names() == [0, 1, 2, 3, 4]
    named = pt.get_feature_names(input_features=input_features)
    assert named == ["COL_0", "COL_1", "COL_2", "COL_3", "COL_4"]
コード例 #16
0
ファイル: test_pipeline.py プロジェクト: fabien-vavrand/aikit
def test_graphpipeline_get_features_names():
    """Feature names are reported at the terminal node and at every inner node.

    Three graphs are checked: a single chain, two selectors merged into a
    pass-through, and two selectors merged into a third selector.
    """
    frame = pd.DataFrame(
        {
            "text1": ["aa bb", "bb bb cc", "dd aa cc", "ee"],
            "text2": ["AAA ZZZ", "BBB EEE", "DDD TTT", "AAA BBB CCC"],
            "num1": [0, 1, 2, 3],
            "num2": [1.1, 1.5, -2, -3.5],
            "num3": [-1, 1, 25, 4],
            "cat1": ["A", "B", "A", "D"],
            "cat2": ["toto", "tata", "truc", "toto"],
        }
    )
    # What the nodes fed directly with the raw frame are expected to report as input.
    all_columns = ["text1", "text2", "num1", "num2", "num3", "cat1", "cat2"]

    # --- Test 1: sel -> pt ---
    model = GraphPipeline(
        {"sel": ColumnsSelector(["cat1", "cat2"]), "pt": PassThrough()},
        edges=[("sel", "pt")],
    )
    model.fit(frame)

    # features at the ending node
    assert model.get_feature_names() == ["cat1", "cat2"]

    assert model.get_feature_names_at_node("pt") == ["cat1", "cat2"]
    assert model.get_feature_names_at_node("sel") == ["cat1", "cat2"]

    assert model.get_input_features_at_node("pt") == ["cat1", "cat2"]
    assert model.get_input_features_at_node("sel") == all_columns

    # --- Test 2: sel1 and sel2 both feed pt ---
    model = GraphPipeline(
        {
            "sel1": ColumnsSelector(["cat1", "cat2"]),
            "sel2": ColumnsSelector(["num1", "num2"]),
            "pt": PassThrough(),
        },
        edges=[("sel1", "pt"), ("sel2", "pt")],
    )
    model.fit(frame)

    merged_columns = ["cat1", "cat2", "num1", "num2"]

    assert model.get_feature_names() == merged_columns
    assert model.get_feature_names_at_node("pt") == merged_columns
    assert model.get_feature_names_at_node("sel1") == ["cat1", "cat2"]
    assert model.get_feature_names_at_node("sel2") == ["num1", "num2"]

    assert model.get_input_features_at_node("pt") == merged_columns
    assert model.get_input_features_at_node("sel1") == all_columns
    assert model.get_input_features_at_node("sel2") == all_columns

    # --- Test 3: sel1 and sel2 feed sel12, which feeds pt ---
    model = GraphPipeline(
        {
            "sel1": ColumnsSelector(["cat1", "cat2"]),
            "sel2": ColumnsSelector(["num1", "num2"]),
            "sel12": ColumnsSelector(["cat1", "num1"]),
            "pt": PassThrough(),
        },
        edges=[("sel1", "sel12", "pt"), ("sel2", "sel12", "pt")],
    )
    model.fit(frame)

    assert model.get_feature_names() == ["cat1", "num1"]

    assert model.get_feature_names_at_node("pt") == ["cat1", "num1"]
    assert model.get_feature_names_at_node("sel12") == ["cat1", "num1"]
    assert model.get_feature_names_at_node("sel1") == ["cat1", "cat2"]
    assert model.get_feature_names_at_node("sel2") == ["num1", "num2"]

    assert model.get_input_features_at_node("pt") == ["cat1", "num1"]
    assert model.get_input_features_at_node("sel12") == ["cat1", "cat2", "num1", "num2"]
    assert model.get_input_features_at_node("sel1") == all_columns
    assert model.get_input_features_at_node("sel2") == all_columns
コード例 #17
0
                    'AD_CONTENTS', 'CONTENTS_COVER', 'BUILDINGS_COVER', 'P1_MAR_STATUS', 'P1_POLICY_REFUSED', 'P1_SEX',
                    'APPR_ALARM', 'APPR_LOCKS', 'FLOODING', 'NEIGH_WATCH', 'OCC_STATUS', 'SAFE_INSTALLED',
                    'SEC_DISC_REQ', 'SUBSIDENCE', 'PAYMENT_METHOD', 'LEGAL_ADDON_PRE_REN', 'LEGAL_ADDON_POST_REN',
                    'HOME_EM_ADDON_PRE_REN', 'HOME_EM_ADDON_POST_REN', 'GARDEN_ADDON_PRE_REN', 'GARDEN_ADDON_POST_REN',
                    'KEYCARE_ADDON_PRE_REN', 'KEYCARE_ADDON_POST_REN', 'HP1_ADDON_PRE_REN', 'HP1_ADDON_POST_REN',
                    'HP2_ADDON_PRE_REN', 'HP2_ADDON_POST_REN', 'HP3_ADDON_PRE_REN', 'HP3_ADDON_POST_REN', 'MTA_FLAG'],
    desired_output_type='DataFrame', drop_unused_columns=True,
    drop_used_columns=True, encoding_type='dummy',
    max_cum_proba=0.95, max_modalities_number=100,
    max_na_percentage=0.05, min_modalities_number=20,
    min_nb_observations=10, regex_match=False)

# Built with its default arguments.
binary_columns_cleaner = BinaryColumnsCleaner()

# No-op step: it does nothing by itself, but it lets the pipeline be used
# without the classifier (for shap).
pass_through = PassThrough()

# Every LightGBM hyper-parameter is spelled out explicitly.
classifier = LGBMClassifier(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=1.0,
    importance_type='split',
    learning_rate=0.1,
    max_depth=-1,
    min_child_samples=20,
    min_child_weight=0.001,
    min_split_gain=0.0,
    n_estimators=100,
    n_jobs=-1,
    num_leaves=31,
    objective=None,
    reg_alpha=0.0,
    reg_lambda=0.0,
    silent=True,
    subsample=1.0,
    subsample_for_bin=200000,
    subsample_freq=0,
)

pipeline = GraphPipeline(edges=[("ColumnsSelector", "NumImputer"),
                                ("NumericalEncoder", "NumImputer", "BinaryColumnsCleaner", "PassThrough",
                                 "LGBMClassifier")],
                         models={"ColumnsSelector": columns_selector,
                                 "NumericalEncoder": numerical_encoder,
                                 "NumImputer": imputer,
                                 "BinaryColumnsCleaner": binary_columns_cleaner,
                                 "PassThrough": pass_through,