Example #1
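These examples reference module-level fixtures (X, y, dfX) and aikit imports (GraphPipeline, DebugPassThrough, ColumnsSelector, ...) that the excerpts omit. A minimal sketch of what the data fixtures could look like; the column names come from the assertions below, everything else (sizes, dtypes, values) is assumed:

# Hypothetical fixtures -- the real test module defines its own X, y and dfX.
import numpy as np
import pandas as pd

np.random.seed(123)
n_rows = 100

dfX = pd.DataFrame(
    {
        # "text1" is kept as the first column: the column-order assertions below rely on it
        "text1": np.random.choice(["aa bb", "cc dd", "ee"], size=n_rows),
        "text2": np.random.choice(["ff gg", "hh"], size=n_rows),
        "num1": np.random.randn(n_rows),
        "num2": np.random.randn(n_rows),
        "num3": np.random.randn(n_rows),
        "cat1": np.random.choice(["A", "B", "C"], size=n_rows),
        "cat2": np.random.choice(["D", "E"], size=n_rows),
    }
)
y = np.random.randint(0, 2, size=n_rows)          # binary classification target
X = dfX.loc[:, ["num1", "num2", "num3"]].copy()   # purely numeric frame used by the simpler tests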
def test_graphpipeline_no_concat():

    gpipeline = GraphPipeline(
        {
            "A": DebugPassThrough(debug=True),
            "B": DebugPassThrough(debug=True),
            "C": DebugPassThrough(debug=True)
        },
        edges=[("A", "C"), ("B", "C")],
        no_concat_nodes={"C"},
    )

    Xtransformed = gpipeline.fit_transform(X)
    assert isinstance(Xtransformed, dict)
    assert set(Xtransformed.keys()) == {"A", "B"}
    assert (Xtransformed["A"] == X).all().all()
    assert (Xtransformed["B"] == X).all().all()

    gpipeline = GraphPipeline(
        {
            "A": DebugPassThrough(debug=True),
            "B": DebugPassThrough(debug=True),
            "C": TransformToBlockManager()
        },
        edges=[("A", "C"), ("B", "C")],
        no_concat_nodes={"C"},
    )

    Xtransformed = gpipeline.fit_transform(X)
    assert isinstance(Xtransformed, BlockManager)
    assert (Xtransformed["A"] == X).all().all()
    assert (Xtransformed["B"] == X).all().all()
Example #2
def get_pipeline():
    pipeline = GraphPipeline(
        {
            "pt1": DebugPassThrough(column_prefix="PT1_", debug=True),
            "pt2": DebugPassThrough(column_prefix="PT2_", debug=True),
            "pt3": DebugPassThrough(column_prefix="PT3_", debug=True),
            "pt4": DebugPassThrough(column_prefix="PT4_", debug=True),
        },
        edges=[("pt1", "pt3", "pt4"), ("pt2", "pt3", "pt4")],
    )
    return pipeline
def test_graphpipeline_merging_node():

    gpipeline = GraphPipeline(
        {
            "ColNum": ColumnsSelector(columns_to_use=["num1", "num2", "num3"]),
            "ColCat": ColumnsSelector(columns_to_use=["cat1", "cat2"]),
            "Pt": DebugPassThrough(debug=True),
        },
        edges=[("ColNum", "Pt"), ("ColCat", "Pt")],
    )

    gpipeline.fit(dfX, y)

    pt = gpipeline.models["Pt"]
    assert pt._expected_columns == ["num1", "num2", "num3", "cat1", "cat2"]
    assert pt._expected_type == DataTypes.DataFrame
    assert pt._expected_nbcols == 5

    dfX_transformed = gpipeline.transform(dfX)
    assert (dfX_transformed == dfX.loc[:, ["num1", "num2", "num3", "cat1", "cat2"]]).all().all()

    assert gpipeline.get_feature_names() == ["num1", "num2", "num3", "cat1", "cat2"]
    assert gpipeline.get_feature_names_at_node("Pt") == ["num1", "num2", "num3", "cat1", "cat2"]
    assert gpipeline.get_feature_names_at_node("ColNum") == ["num1", "num2", "num3"]
    assert gpipeline.get_feature_names_at_node("ColCat") == ["cat1", "cat2"]

    assert gpipeline.get_input_features_at_node("ColNum") == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("ColCat") == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("Pt") == ["num1", "num2", "num3", "cat1", "cat2"]

    # concatenation in the other order
    gpipeline = GraphPipeline(
        {
            "ColNum": ColumnsSelector(columns_to_use=["num1", "num2", "num3"]),
            "ColCat": ColumnsSelector(columns_to_use=["cat1", "cat2"]),
            "Pt": DebugPassThrough(debug=True),
        },
        edges=[("ColCat", "Pt"), ("ColNum", "Pt")],
    )

    gpipeline.fit(dfX, y)

    pt = gpipeline.models["Pt"]
    assert pt._expected_columns == ["cat1", "cat2", "num1", "num2", "num3"]  # Concanteation in the order of the edges
    assert pt._expected_type == DataTypes.DataFrame
    assert pt._expected_nbcols == 5

    assert gpipeline.get_feature_names() == ["cat1", "cat2", "num1", "num2", "num3"]
    assert gpipeline.get_feature_names_at_node("Pt") == ["cat1", "cat2", "num1", "num2", "num3"]
    assert gpipeline.get_feature_names_at_node("ColNum") == ["num1", "num2", "num3"]
    assert gpipeline.get_feature_names_at_node("ColCat") == ["cat1", "cat2"]

    assert gpipeline.get_input_features_at_node("ColNum") == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("ColCat") == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("Pt") == ["cat1", "cat2", "num1", "num2", "num3"]

    dfX_transformed = gpipeline.transform(dfX)
    assert (dfX_transformed == dfX.loc[:, ["cat1", "cat2", "num1", "num2", "num3"]]).all().all()
def test_approx_cross_validation_transformer(x_data_type, shuffle, graph_pipeline, with_groups):

    if graph_pipeline:
        estimator = GraphPipeline({"ptA": DebugPassThrough(), "ptB": DebugPassThrough()}, edges=[("ptA", "ptB")])
    else:
        estimator = DebugPassThrough()

    X, y = make_classification(n_samples=100, random_state=123)
    if with_groups:
        groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25)
    else:
        groups = None

    X = convert_generic(X, output_type=x_data_type)
    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if shuffle:
        np.random.seed(123)
        ii = np.arange(X.shape[0])
        np.random.shuffle(ii)
        y = y[ii]

        if isinstance(X, pd.DataFrame):
            X = X.loc[ii, :]
        else:
            X = X[ii, :]

    scoring = ["accuracy", "neg_log_loss"]

    ##################
    ### Score only ###
    ##################
    with pytest.raises(Exception):
        cross_validation(estimator, X, y, groups, cv=10, scoring=scoring, verbose=0)
        # shouldn't work since DebugPassThrough can't be scored

    #################
    ### Transform ###
    #################
    cv_res, Xhat = cross_validation(
        estimator, X, y, groups, cv=10, scoring=scoring, verbose=0, return_predict=True, no_scoring=True
    )

    assert type(Xhat) == type(X)
    assert cv_res is None
    assert Xhat.shape == X.shape

    if isinstance(X, pd.DataFrame):
        assert (Xhat.index == X.index).all()
        assert (Xhat.columns == X.columns).all()

    if isinstance(X, pd.DataFrame):
        assert np.abs(Xhat - X).max().max() <= 10 ** (-10)
    else:
        assert np.max(np.abs(Xhat - X)) <= 10 ** (-10)
def test_GraphPipeline_add_nodes():
    
    pipeline = get_pipeline()
    
    new_pipeline = pipeline.add_nodes({"pt0": DebugPassThrough(column_prefix="PT0_", debug=True)}, [("pt0", "pt1")])

    assert isinstance(new_pipeline, GraphPipeline)
    Xres = new_pipeline.fit_transform(dfX, y)
    assert Xres.columns[0] == "PT4__PT3__PT1__PT0__text1"
    
    Xres2 = pipeline.fit_transform(dfX)
    assert Xres2.columns[0] == "PT4__PT3__PT1__text1"
    
    with pytest.raises(ValueError):
        pipeline.add_nodes({"pt1": DebugPassThrough(column_prefix="newPT1_", debug=True)}, new_edges=[("pt0", "pt1")])
def test_graphpipeline_fit_params():

    gpipeline = GraphPipeline(
        {"A": DebugPassThrough(debug=True), "B": DebugPassThrough(debug=True), "C": DebugPassThrough(debug=True)},
        edges=[("A", "B", "C")],
    )

    gpipeline.fit(X, y)
    assert gpipeline.models["A"].fit_params == {}
    assert gpipeline.models["B"].fit_params == {}
    assert gpipeline.models["C"].fit_params == {}

    gpipeline.fit(X, y, A__fitparam_A="paramA")
    assert gpipeline.models["A"].fit_params == {"fitparam_A": "paramA"}
    assert gpipeline.models["B"].fit_params == {}
    assert gpipeline.models["C"].fit_params == {}
def test_approx_cross_validation_cv(approximate_cv):
    X, y = make_classification()

    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)

    estimator = DebugPassThrough()

    cv_res, yhat = cross_validation(
        estimator,
        X,
        y,
        groups=None,
        cv=cv,
        verbose=1,
        fit_params={},
        return_predict=True,
        method="transform",
        no_scoring=True,
        stopping_round=None,
        stopping_threshold=None,
        approximate_cv=approximate_cv,
    )
    assert cv_res is None
    assert yhat.ndim == 2
    assert yhat.shape == X.shape
def test_graphpipeline_set_params():

    gpipeline = GraphPipeline(
        {"A": PassThrough(), "B": PassThrough(), "C": DebugPassThrough(debug=True)}, edges=[("A", "B", "C")]
    )

    assert gpipeline.models["C"].debug is True
    gpipeline.set_params(C__debug=False)
    assert gpipeline.models["C"].debug is False
def test_graphpipeline_passing_of_groups():
    gpipeline = GraphPipeline({"A": TransformerFailNoGroups(), "B": DebugPassThrough(debug=True)}, edges=[("A", "B")])

    with pytest.raises(ValueError):
        gpipeline.fit(X, y)

    groups = np.zeros(len(y))

    gpipeline.fit(X, y, groups)  # check that it does not fail when groups are provided
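TransformerFailNoGroups is likewise a local helper; the test only needs it to raise when fit does not receive a groups argument. A minimal sketch under that assumption:

from sklearn.base import BaseEstimator, TransformerMixin

# Hypothetical helper: passes data through untouched but insists on receiving `groups`,
# which lets the test check that GraphPipeline forwards the argument to its nodes.


class TransformerFailNoGroups(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None, groups=None):
        if groups is None:
            raise ValueError("this transformer needs the 'groups' argument")
        return self

    def transform(self, X):
        return X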
Example #10
def test_GraphPipeline_substitute_nodes():
    
    pipeline = get_pipeline()
    new_pipeline = pipeline.substitute_nodes({"pt1": DebugPassThrough(column_prefix="newPT1_", debug=True)})
    
    assert isinstance(new_pipeline, GraphPipeline)
    new_pipeline.fit(dfX, y)
    Xres = new_pipeline.transform(dfX)
    assert Xres.columns[0] == "PT4__PT3__newPT1__text1"
    
    with pytest.raises(NotFittedError):
        pipeline.transform(dfX)
        
    Xres2 = pipeline.fit_transform(dfX)
    assert Xres2.columns[0] == "PT4__PT3__PT1__text1"
    
    with pytest.raises(ValueError):
        pipeline.substitute_nodes({"doesntexist": DebugPassThrough(column_prefix="newPT1_", debug=True)})
        
    with pytest.raises(ValueError):
        pipeline.substitute_nodes({"pt1": "this_is_not_a_model"})
def test_cross_val_predict():
    X, y = make_classification(n_samples=100)
    X = pd.DataFrame(X, columns=["col_%d" % i for i in range(X.shape[1])])

    ii = np.arange(X.shape[0])
    np.random.seed(123)
    np.random.shuffle(ii)

    pt = DebugPassThrough()
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)

    Xhat = cross_val_predict(pt, X, y, cv=cv, method="transform")
    assert type(Xhat) == type(X)  # fails: cross_val_predict changes the type
def test_fit_and_predict_transform():
    X, y = make_classification(n_samples=100)
    X = pd.DataFrame(X, columns=["col_%d" % i for i in range(X.shape[1])])

    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)

    for train, test in cv.split(X, y):

        pt = DebugPassThrough()
        predictions, _ = sklearn.model_selection._validation._fit_and_predict(
            pt, X, y, train, test, verbose=1, fit_params=None, method="transform"
        )

        assert predictions.shape[0] == test.shape[0]
        assert predictions.shape[1] == X.shape[1]

        assert type(predictions) == type(X)
def test_score_from_params(x_data_type, shuffle, graph_pipeline):
    np.random.seed(123)
    X = np.random.randn(100, 10)

    X = convert_generic(X, output_type=x_data_type)

    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if shuffle:
        ii = np.arange(X.shape[0])
        np.random.shuffle(ii)

        if isinstance(X, pd.DataFrame):
            X = X.loc[ii, :]
        else:
            X = X[ii, :]

    scoring = ["silhouette", "davies_bouldin", "calinski_harabasz"]

    if graph_pipeline:
        estimator = GraphPipeline(
            {"pt": DebugPassThrough(), "lg": KMeans(n_clusters=3, random_state=123)}, edges=[("pt", "lg")]
        )
    else:
        estimator = KMeans(n_clusters=3, random_state=123)

    ##################
    ### Only score ###
    ##################

    res = score_from_params_clustering(estimator, X, scoring=scoring, verbose=0)

    assert isinstance(res, pd.DataFrame)
    assert res.shape[0] == 1
    for s in scoring:
        assert ("test_" + s) in set(res.columns)

    with pytest.raises(NotFittedError):
        estimator.predict(X)

    ##########################
    ### Score + Prediction ###
    ##########################
    res, label = score_from_params_clustering(estimator, X, scoring=scoring, verbose=0, return_predict=True)

    assert isinstance(res, pd.DataFrame)
    assert res.shape[0] == 1
    for s in scoring:
        assert ("test_" + s) in set(res.columns)

    assert isinstance(label, np.ndarray)

    assert len(np.unique(label)) == 3

    with pytest.raises(NotFittedError):
        estimator.predict(X)

    ####################
    ### Predict only ###
    ####################
    res, label = score_from_params_clustering(
        estimator, X, scoring=scoring, verbose=0, return_predict=True, no_scoring=True
    )

    assert len(np.unique(label)) == 3
    assert res is None

    with pytest.raises(NotFittedError):
        estimator.predict(X)
def test_approx_cross_validation_early_stop(
    add_third_class, x_data_type, y_string_class, shuffle, graph_pipeline, with_groups
):

    X, y = make_classification(n_samples=100, random_state=123)

    if with_groups:
        groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25)
    else:
        groups = None

    if add_third_class:
        y[0:2] = 2

    X = convert_generic(X, output_type=x_data_type)
    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if shuffle:
        np.random.seed(123)
        ii = np.arange(X.shape[0])
        np.random.shuffle(ii)
        y = y[ii]

        if isinstance(X, pd.DataFrame):
            X = X.loc[ii, :]
        else:
            X = X[ii, :]

    if y_string_class:
        y = np.array(["CL_%d" % i for i in y])

    if add_third_class:
        scoring = ["accuracy"]
    else:
        scoring = ["accuracy", "neg_log_loss"]

    if graph_pipeline:
        estimator = GraphPipeline(
            {"pt": DebugPassThrough(), "lg": LogisticRegression(C=1, random_state=123)}, edges=[("pt", "lg")]
        )
    else:
        estimator = LogisticRegression(C=1, random_state=123)

    cv_res, yhat = cross_validation(
        estimator,
        X,
        y,
        groups,
        cv=10,
        scoring=scoring,
        verbose=0,
        return_predict=True,
        method="predict",
        stopping_round=1,
        stopping_threshold=1.01,  # so that accuracy is sure to be below the threshold
    )

    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape[0] == 2
    for s in scoring:
        assert ("test_" + s) in set(cv_res.columns)
        assert ("train_" + s) in set(cv_res.columns)

    assert yhat is None

    cv_res, yhat = cross_validation(
        estimator,
        X,
        y,
        groups,
        cv=10,
        scoring=scoring,
        verbose=0,
        return_predict=True,
        method="predict",
        stopping_round=1,
        stopping_threshold=0.0,
    )

    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape[0] == 10
    for s in scoring:
        assert ("test_" + s) in set(cv_res.columns)
        assert ("train_" + s) in set(cv_res.columns)

    assert yhat.ndim == 1
    assert len(np.setdiff1d(yhat, y)) == 0
def test_cross_validation(add_third_class, x_data_type, y_string_class, shuffle, graph_pipeline, with_groups):

    X, y = make_classification(n_samples=100, random_state=123)
    if with_groups:
        groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25)
    else:
        groups = None

    X = convert_generic(X, output_type=x_data_type)
    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if add_third_class:
        y[0:2] = 2

    if shuffle:
        np.random.seed(123)
        ii = np.arange(X.shape[0])
        np.random.shuffle(ii)
        y = y[ii]

        if isinstance(X, pd.DataFrame):
            X = X.loc[ii, :]
        else:
            X = X[ii, :]

    if y_string_class:
        y = np.array(["CL_%d" % i for i in y])

    if add_third_class:
        scoring = ["accuracy"]
    else:
        scoring = ["accuracy", "neg_log_loss"]

    if graph_pipeline:
        estimator = GraphPipeline({"pt": DebugPassThrough(), "lg": LogisticRegression()}, edges=[("pt", "lg")])
    else:
        estimator = LogisticRegression()

    ##################
    ### Only score ###
    ##################

    cv_res = cross_validation(estimator, X, y, groups, cv=10, scoring=scoring, verbose=0)

    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape[0] == 10
    for s in scoring:
        assert ("test_" + s) in set(cv_res.columns)
        assert ("train_" + s) in set(cv_res.columns)

    with pytest.raises(NotFittedError):
        estimator.predict(X)

    #####################
    ### Score + Proba ###
    #####################
    cv_res, yhat_proba = cross_validation(
        estimator, X, y, groups, cv=10, scoring=scoring, verbose=0, return_predict=True
    )

    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape[0] == 10
    for s in scoring:
        assert ("test_" + s) in set(cv_res.columns)
        assert ("train_" + s) in set(cv_res.columns)

    assert isinstance(yhat_proba, pd.DataFrame)
    if isinstance(X, pd.DataFrame):
        assert (yhat_proba.index == X.index).all()

    assert yhat_proba.shape == (y.shape[0], 2 + 1 * add_third_class)
    assert yhat_proba.min().min() >= 0
    assert yhat_proba.max().max() <= 1
    assert list(yhat_proba.columns) == list(np.sort(np.unique(y)))

    with pytest.raises(NotFittedError):
        estimator.predict(X)

    #######################
    ### Score + Predict ###
    #######################
    cv_res, yhat = cross_validation(
        estimator, X, y, groups, cv=10, scoring=scoring, verbose=0, return_predict=True, method="predict"
    )

    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape[0] == 10
    for s in scoring:
        assert ("test_" + s) in set(cv_res.columns)
        assert ("train_" + s) in set(cv_res.columns)

    assert yhat.ndim == 1
    assert len(np.setdiff1d(yhat, y)) == 0

    assert yhat.shape[0] == y.shape[0]

    with pytest.raises(NotFittedError):
        estimator.predict(X)

    ####################
    ### Predict only ###
    ####################
    cv_res, yhat = cross_validation(
        estimator,
        X,
        y,
        groups,
        cv=10,
        scoring=scoring,
        verbose=0,
        return_predict=True,
        method="predict",
        no_scoring=True,
    )

    assert yhat.shape[0] == y.shape[0]

    assert cv_res is None
    assert yhat.ndim == 1
    assert len(np.setdiff1d(yhat, y)) == 0

    with pytest.raises(NotFittedError):
        estimator.predict(X)
Example #16
def test_graphpipeline_blockselector():

    Xnum, y = make_classification(n_samples=100)

    dfX_text = pd.DataFrame({"text1": get_random_strings(100), "text2": get_random_strings(100)})

    X = {"text": dfX_text, "num": Xnum}

    graphpipeline = GraphPipeline(
        models={
            "BS_text": BlockSelector("text"),
            "CV": CountVectorizerWrapper(analyzer="char"),
            "BS_num": BlockSelector("num"),
            "RF": DecisionTreeClassifier(),
        },
        edges=[("BS_text", "CV", "RF"), ("BS_num", "RF")],
    )

    graphpipeline.fit(X, y)
    yhat = graphpipeline.predict(X)

    assert yhat.ndim == 1
    assert yhat.shape[0] == y.shape[0]

    ### X = dict ###
    X = {"text": dfX_text, "num": Xnum}

    graphpipeline = GraphPipeline(
        models={"BS_text": BlockSelector("text"), "BS_num": BlockSelector("num"), "PT": DebugPassThrough()},
        edges=[("BS_text", "PT"), ("BS_num", "PT")],
    )

    Xhat = graphpipeline.fit_transform(X)

    assert Xhat.shape[0] == dfX_text.shape[0]
    assert Xhat.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

    assert "text1" in Xhat.columns
    assert "text2" in Xhat.columns
    assert (Xhat.loc[:, ["text1", "text2"]] == dfX_text).all().all()

    cols = diff(list(Xhat.columns), ["text1", "text2"])
    assert (Xhat.loc[:, cols].values == Xnum).all()

    ### X = list
    X = [dfX_text, Xnum]

    graphpipeline = GraphPipeline(
        models={"BS_text": BlockSelector(0), "BS_num": BlockSelector(1), "PT": DebugPassThrough()},
        edges=[("BS_text", "PT"), ("BS_num", "PT")],
    )

    Xhat = graphpipeline.fit_transform(X)

    assert Xhat.shape[0] == dfX_text.shape[0]
    assert Xhat.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

    assert "text1" in Xhat.columns
    assert "text2" in Xhat.columns
    assert (Xhat.loc[:, ["text1", "text2"]] == dfX_text).all().all()

    cols = diff(list(Xhat.columns), ["text1", "text2"])
    assert (Xhat.loc[:, cols].values == Xnum).all()

    ### X = DataManager
    X = BlockManager({"text": dfX_text, "num": Xnum})

    graphpipeline = GraphPipeline(
        models={"BS_text": BlockSelector("text"), "BS_num": BlockSelector("num"), "PT": DebugPassThrough()},
        edges=[("BS_text", "PT"), ("BS_num", "PT")],
    )

    Xhat = graphpipeline.fit_transform(X)

    assert Xhat.shape[0] == dfX_text.shape[0]
    assert Xhat.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

    assert "text1" in Xhat.columns
    assert "text2" in Xhat.columns
    assert (Xhat.loc[:, ["text1", "text2"]] == dfX_text).all().all()

    cols = diff(list(Xhat.columns), ["text1", "text2"])
    assert (Xhat.loc[:, cols].values == Xnum).all()
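get_random_strings and diff in the example above are small utilities not shown in the excerpt; diff behaves like an order-preserving set difference on column names. Possible sketches under those assumptions:

import random
import string

# Hypothetical versions of the helpers used above; aikit's test suite ships its own,
# these only illustrate the assumed behaviour.


def get_random_strings(n, max_length=10):
    """Return n random lowercase strings (used to build the text columns)."""
    return [
        "".join(random.choices(string.ascii_lowercase, k=random.randint(1, max_length)))
        for _ in range(n)
    ]


def diff(list1, list2):
    """Elements of list1 that are not in list2, keeping the original order."""
    return [item for item in list1 if item not in list2]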
Example #17
def test_graphpipeline_nodes_concat_order():

    cols = list(dfX.columns)

    ### 1
    pipeline = GraphPipeline(
        {
            "pt1": DebugPassThrough(column_prefix="PT1_", debug=True),
            "pt2": DebugPassThrough(column_prefix="PT2_", debug=True),
            "pt3": DebugPassThrough(column_prefix="PT3_", debug=True),
        },
        edges=[("pt1", "pt3"), ("pt2", "pt3")],
    )

    Xres = pipeline.fit_transform(dfX)
    assert list(Xres.columns) == ["PT3__PT1__" + c for c in cols] + [
        "PT3__PT2__" + c for c in cols
    ]  # PT1 on the left, PT2 on the right
    assert list(Xres.columns) == pipeline.get_feature_names()

    ### 2 : reverse order
    pipeline = GraphPipeline(
        {
            "pt1": DebugPassThrough(column_prefix="PT1_", debug=True),
            "pt2": DebugPassThrough(column_prefix="PT2_", debug=True),
            "pt3": DebugPassThrough(column_prefix="PT3_", debug=True),
        },
        edges=[("pt2", "pt3"), ("pt1", "pt3")],
    )

    Xres = pipeline.fit_transform(dfX)
    assert list(Xres.columns) == ["PT3__PT2__" + c for c in cols] + [
        "PT3__PT1__" + c for c in cols
    ]  # PT1 on the left, PT2 on the right
    assert list(Xres.columns) == pipeline.get_feature_names()

    ### 3 : with 4 nodes
    for edges in ([("pt1", "pt3", "pt4"), ("pt2", "pt3", "pt4")], [("pt1", "pt3", "pt4"), ("pt2", "pt3")]):
        pipeline = GraphPipeline(
            {
                "pt1": DebugPassThrough(column_prefix="PT1_", debug=True),
                "pt2": DebugPassThrough(column_prefix="PT2_", debug=True),
                "pt3": DebugPassThrough(column_prefix="PT3_", debug=True),
                "pt4": DebugPassThrough(column_prefix="PT4_", debug=True),
            },
            edges=edges,
        )
        Xres = pipeline.fit_transform(dfX)
        assert list(Xres.columns) == ["PT4__PT3__PT1__" + c for c in cols] + [
            "PT4__PT3__PT2__" + c for c in cols
        ]  # PT1 on the left, PT2 on the right
        assert list(Xres.columns) == pipeline.get_feature_names()

    ### 4 : reverse order
    for edges in ([("pt2", "pt3", "pt4"), ("pt1", "pt3", "pt4")], [("pt2", "pt3", "pt4"), ("pt1", "pt3")]):
        pipeline = GraphPipeline(
            {
                "pt1": DebugPassThrough(column_prefix="PT1_", debug=True),
                "pt2": DebugPassThrough(column_prefix="PT2_", debug=True),
                "pt3": DebugPassThrough(column_prefix="PT3_", debug=True),
                "pt4": DebugPassThrough(column_prefix="PT4_", debug=True),
            },
            edges=edges,
        )
        Xres = pipeline.fit_transform(dfX)
        assert list(Xres.columns) == ["PT4__PT3__PT2__" + c for c in cols] + [
            "PT4__PT3__PT1__" + c for c in cols
        ]  # PT1 on the left, PT2 on the right
        assert list(Xres.columns) == pipeline.get_feature_names()
def test_try_to_find_features_names():

    list_of_words = ["aa bb", "bb bb cc", "dd aa cc", "ee"]
    vec = CountVectorizer()
    vec.fit_transform(list_of_words)

    assert try_to_find_features_names(vec) == ["aa", "bb", "cc", "dd", "ee"]

    pipe = Pipeline([("nothing", DebugPassThrough()),
                     ("vec", CountVectorizer())])

    pipe.fit_transform(list_of_words)

    assert try_to_find_features_names(pipe) == ["aa", "bb", "cc", "dd", "ee"]

    union = FeatureUnion(
        transformer_list=[("bagword", CountVectorizer()), ("bagchar", CountVectorizer(analyzer="char"))]
    )
    union.fit_transform(list_of_words)

    assert try_to_find_features_names(union) == [
        "bagword__aa",
        "bagword__bb",
        "bagword__cc",
        "bagword__dd",
        "bagword__ee",
        "bagchar__ ",
        "bagchar__a",
        "bagchar__b",
        "bagchar__c",
        "bagchar__d",
        "bagchar__e",
    ]

    pipe1 = Pipeline([("nothing", DebugPassThrough()),
                      ("vec", CountVectorizer())])

    pipe2 = Pipeline([("nothing", DebugPassThrough()),
                      ("vec", CountVectorizer(analyzer="char"))])

    union = FeatureUnion(transformer_list=[("bagword",
                                            pipe1), ("bagchar", pipe2)])
    union.fit_transform(list_of_words)

    assert try_to_find_features_names(union) == [
        "bagword__aa",
        "bagword__bb",
        "bagword__cc",
        "bagword__dd",
        "bagword__ee",
        "bagchar__ ",
        "bagchar__a",
        "bagchar__b",
        "bagchar__c",
        "bagchar__d",
        "bagchar__e",
    ]

    class DummyModelAcceptInputFeature(object):
        def get_feature_names(self, input_features=None):
            if input_features is None:
                return [0, 1, 2, 3]
            else:
                return input_features

    class DummyModelDontInputFeature(object):
        def get_feature_names(self):
            return [0, 1, 2, 3]

    class DummyModelDoesntHaveGetFeatures(object):
        pass

    m = DummyModelAcceptInputFeature()
    assert try_to_find_features_names(m) == [0, 1, 2, 3]
    assert try_to_find_features_names(m, input_features=["a", "b", "c", "d"]) == ["a", "b", "c", "d"]

    m = DummyModelDontInputFeature()
    assert try_to_find_features_names(m) == [0, 1, 2, 3]
    assert try_to_find_features_names(m, input_features=["a", "b", "c",
                                                         "d"]) == [0, 1, 2, 3]

    m = DummyModelDoesntHaveGetFeatures()
    assert try_to_find_features_names(m) is None
    assert try_to_find_features_names(m, input_features=["a", "b", "c", "d"
                                                         ]) is None