Code Example #1
def test_graphpipeline_blockselector_cv():

    Xnum, y = make_classification(n_samples=100)

    dfX_text = pd.DataFrame({
        "text1": get_random_strings(100),
        "text2": get_random_strings(100)
    })

    ### X = dict
    X = {"text": dfX_text, "num": Xnum}

    graphpipeline = GraphPipeline(
        models={
            "BS_text": BlockSelector("text"),
            "CV": CountVectorizerWrapper(analyzer="char"),
            "BS_num": BlockSelector("num"),
            "RF": DecisionTreeClassifier(),
        },
        edges=[("BS_text", "CV", "RF"), ("BS_num", "RF")],
    )

    from sklearn.model_selection import cross_val_score

    with pytest.raises(ValueError):
        cv_res = cross_val_score(graphpipeline,
                                 X,
                                 y,
                                 scoring="accuracy",
                                 cv=10)
        # doesn't work: a plain dict can't be subset row-wise by the CV splitter

    X = BlockManager({"text": dfX_text, "num": Xnum})

    graphpipeline = GraphPipeline(
        models={
            "BS_text": BlockSelector("text"),
            "CV": CountVectorizerWrapper(analyzer="char"),
            "BS_num": BlockSelector("num"),
            "RF": DecisionTreeClassifier(),
        },
        edges=[("BS_text", "CV", "RF"), ("BS_num", "RF")],
    )

    cv_res = cross_val_score(graphpipeline, X, y, scoring="accuracy", cv=10)

    assert len(cv_res) == 10
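
The contrast in this example comes from sklearn's cross-validation machinery, which has to slice X row-wise for every train/test split: a plain dict of blocks cannot be indexed that way, whereas aikit's BlockManager can. Below is a minimal sketch of the idea with a toy stand-in class (ToyBlockManager is an illustrative assumption, not aikit's implementation).

import numpy as np
import pandas as pd

class ToyBlockManager:
    """Illustrative stand-in for aikit's BlockManager: a dict of blocks that can
    also be sliced row-wise, which is what sklearn's CV splitters need in order
    to build each train/test fold."""

    def __init__(self, blocks):
        lengths = {len(b) for b in blocks.values()}
        assert len(lengths) == 1, "all blocks must have the same number of rows"
        self.blocks = blocks
        self.n_rows = lengths.pop()

    def __len__(self):
        return self.n_rows

    def __getitem__(self, key):
        if isinstance(key, str):                 # block access by name
            return self.blocks[key]
        return ToyBlockManager({                 # row subsetting for a CV fold
            k: (b.iloc[key] if hasattr(b, "iloc") else b[key])
            for k, b in self.blocks.items()
        })

dfX_text = pd.DataFrame({"text1": list("abcd"), "text2": list("efgh")})
Xnum = np.arange(8).reshape(4, 2)

X = ToyBlockManager({"text": dfX_text, "num": Xnum})
train_idx = np.array([0, 2, 3])
X_train = X[train_idx]                           # row subsetting works
# {"text": dfX_text, "num": Xnum}[train_idx]     # a plain dict would raise here
print(len(X_train), X_train["text"].shape, X_train["num"].shape)  # 3 (3, 2) (3, 2)
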
Code Example #2
File: test_pipeline.py  Project: fabien-vavrand/aikit
def test_graphpipeline_blockselector():

    Xnum, y = make_classification(n_samples=100)

    dfX_text = pd.DataFrame({"text1": get_random_strings(100), "text2": get_random_strings(100)})

    X = {"text": dfX_text, "num": Xnum}

    graphpipeline = GraphPipeline(
        models={
            "BS_text": BlockSelector("text"),
            "CV": CountVectorizerWrapper(analyzer="char"),
            "BS_num": BlockSelector("num"),
            "RF": DecisionTreeClassifier(),
        },
        edges=[("BS_text", "CV", "RF"), ("BS_num", "RF")],
    )

    graphpipeline.fit(X, y)
    yhat = graphpipeline.predict(X)

    assert yhat.ndim == 1
    assert yhat.shape[0] == y.shape[0]

    ### X = dict ###
    X = {"text": dfX_text, "num": Xnum}

    graphpipeline = GraphPipeline(
        models={"BS_text": BlockSelector("text"), "BS_num": BlockSelector("num"), "PT": DebugPassThrough()},
        edges=[("BS_text", "PT"), ("BS_num", "PT")],
    )

    Xhat = graphpipeline.fit_transform(X)

    assert Xhat.shape[0] == dfX_text.shape[0]
    assert Xhat.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

    assert "text1" in Xhat.columns
    assert "text2" in Xhat.columns
    assert (Xhat.loc[:, ["text1", "text2"]] == dfX_text).all().all()

    cols = diff(list(Xhat.columns), ["text1", "text2"])
    assert (Xhat.loc[:, cols].values == Xnum).all()

    ### X = list
    X = [dfX_text, Xnum]

    graphpipeline = GraphPipeline(
        models={"BS_text": BlockSelector(0), "BS_num": BlockSelector(1), "PT": DebugPassThrough()},
        edges=[("BS_text", "PT"), ("BS_num", "PT")],
    )

    Xhat = graphpipeline.fit_transform(X)

    assert Xhat.shape[0] == dfX_text.shape[0]
    assert Xhat.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

    assert "text1" in Xhat.columns
    assert "text2" in Xhat.columns
    assert (Xhat.loc[:, ["text1", "text2"]] == dfX_text).all().all()

    cols = diff(list(Xhat.columns), ["text1", "text2"])
    assert (Xhat.loc[:, cols].values == Xnum).all()

    ### X = DataManager
    X = BlockManager({"text": dfX_text, "num": Xnum})

    graphpipeline = GraphPipeline(
        models={"BS_text": BlockSelector("text"), "BS_num": BlockSelector("num"), "PT": DebugPassThrough()},
        edges=[("BS_text", "PT"), ("BS_num", "PT")],
    )

    Xhat = graphpipeline.fit_transform(X)

    assert Xhat.shape[0] == dfX_text.shape[0]
    assert Xhat.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

    assert "text1" in Xhat.columns
    assert "text2" in Xhat.columns
    assert (Xhat.loc[:, ["text1", "text2"]] == dfX_text).all().all()

    cols = diff(list(Xhat.columns), ["text1", "text2"])
    assert (Xhat.loc[:, cols].values == Xnum).all()
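
All three variants above (dict, list, BlockManager) rely on the same mechanism: each BlockSelector node pulls exactly one block out of the multi-block X, by key for dict/BlockManager inputs or by position for list inputs, so every branch of the GraphPipeline only sees the data it needs. A minimal sketch of that idea as a toy sklearn transformer (ToyBlockSelector is illustrative, not aikit's class):

from sklearn.base import BaseEstimator, TransformerMixin

class ToyBlockSelector(BaseEstimator, TransformerMixin):
    """Pick one block out of a multi-block X."""

    def __init__(self, block):
        self.block = block           # a key ("text", "num") or a list position (0, 1)

    def fit(self, X, y=None):
        return self                  # stateless: nothing to learn

    def transform(self, X):
        return X[self.block]         # works for dicts, lists and BlockManager-like objects

# usage:
#   ToyBlockSelector("text").fit_transform({"text": dfX_text, "num": Xnum})
#   ToyBlockSelector(0).fit_transform([dfX_text, Xnum])
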
Code Example #3
def test_BlockSelector():

    np.random.seed(123)

    df = pd.DataFrame({
        "a": np.arange(10),
        "b": ["aaa", "bbb", "ccc"] * 3 + ["ddd"]
    })
    arr = np.random.randn(df.shape[0], 5)

    input_features = {
        "df": df.columns,
        "arr": ["COL_%d" % i for i in range(arr.shape[1])]
    }

    # Dictionary

    block_selector1 = BlockSelector("df")
    block_selector2 = BlockSelector("arr")

    X = {"df": df, "arr": arr}
    Xres1 = block_selector1.fit_transform(X)
    Xres2 = block_selector2.fit_transform(X)

    assert block_selector1.get_feature_names() == ["a", "b"]
    assert block_selector1.get_feature_names(
        input_features=input_features) == ["a", "b"]

    assert block_selector2.get_feature_names() == [0, 1, 2, 3, 4]
    assert block_selector2.get_feature_names(
        input_features=input_features) == [
            "COL_0",
            "COL_1",
            "COL_2",
            "COL_3",
            "COL_4",
        ]

    assert id(Xres1) == id(df)
    assert id(Xres2) == id(arr)  # no copy

    # List
    X = [df, arr]
    input_features = [df.columns, ["COL_%d" % i for i in range(arr.shape[1])]]

    block_selector1 = BlockSelector(0)
    block_selector2 = BlockSelector(1)

    Xres1 = block_selector1.fit_transform(X)
    Xres2 = block_selector2.fit_transform(X)

    assert block_selector1.get_feature_names() == ["a", "b"]
    assert block_selector1.get_feature_names(
        input_features=input_features) == ["a", "b"]

    assert block_selector2.get_feature_names() == [0, 1, 2, 3, 4]
    assert block_selector2.get_feature_names(
        input_features=input_features) == [
            "COL_0",
            "COL_1",
            "COL_2",
            "COL_3",
            "COL_4",
        ]

    assert id(Xres1) == id(df)
    assert id(Xres2) == id(arr)

    # BlockManager
    X = BlockManager({"df": df, "arr": arr})
    input_features = {
        "df": df.columns,
        "arr": ["COL_%d" % i for i in range(arr.shape[1])]
    }

    block_selector1 = BlockSelector("df")
    block_selector2 = BlockSelector("arr")

    X = {"df": df, "arr": arr}
    Xres1 = block_selector1.fit_transform(X)
    Xres2 = block_selector2.fit_transform(X)

    assert block_selector1.get_feature_names() == ["a", "b"]
    assert block_selector1.get_feature_names(
        input_features=input_features) == ["a", "b"]

    assert block_selector2.get_feature_names() == [0, 1, 2, 3, 4]
    assert block_selector2.get_feature_names(
        input_features=input_features) == [
            "COL_0",
            "COL_1",
            "COL_2",
            "COL_3",
            "COL_4",
        ]

    assert id(Xres1) == id(df)
    assert id(Xres2) == id(arr)  # no copy

    # Check not fitted

    block_selector1 = BlockSelector(0)
    with pytest.raises(NotFittedError):
        block_selector1.transform(X)
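
The assertions above exercise a simple fallback rule for feature names: a DataFrame block reports its own columns, an ndarray block falls back to positional integers, and passing input_features overrides the fallback with explicit names for the selected block. A toy sketch of that rule (toy_block_feature_names is a hypothetical helper, not part of aikit):

def toy_block_feature_names(block, block_key, input_features=None):
    if input_features is not None:
        return list(input_features[block_key])   # explicit names win
    if hasattr(block, "columns"):                # DataFrame-like block
        return list(block.columns)
    return list(range(block.shape[1]))           # ndarray block -> positional names

# toy_block_feature_names(arr, "arr")                    -> [0, 1, 2, 3, 4]
# toy_block_feature_names(arr, "arr", input_features)    -> ["COL_0", ..., "COL_4"]
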
Code Example #4
    def _approx_cross_validation_create_sub_graph_pipeline(self, data_dico, X):
        """ this sub-method create the new graph-pipeline that should be fully cross-validated,
        it also create the new data on which to cv 
        
        Returns
        -------
        new_graph_pipeline
        
        new_data
        """
        ### Create a new GraphPipeline with only the remaining Nodes ###

        dones_nodes = set()
        for k, v in data_dico.items():
            if v is not None:
                dones_nodes.add(k)

        newG = nx.DiGraph()
        new_models = {}
        new_datas = {}
        block_selector_nodes = set()

        for n1, n2 in self.complete_graph.edges:

            if n1 in dones_nodes and n2 in dones_nodes:
                pass

            elif n1 in dones_nodes and n2 not in dones_nodes:

                newG.add_edge("_data_%s" % n1, n2)

                new_models[n2] = self._models[n2]
                new_models["_data_%s" % n1] = BlockSelector("_data_%s" % n1)

                new_datas["_data_%s" % n1] = data_dico[n1]

                block_selector_nodes.add("_data_%s" % n1)
                # Add a BlockSelector

            elif n1 not in dones_nodes and n2 not in dones_nodes:
                newG.add_edge(n1, n2)

                new_models[n1] = self._models[n1]
                new_models[n2] = self._models[n2]

            else:
                raise ValueError("Should never get here")

        nodes = list(newG.nodes)  # copy because I'll modify the graph
        for n in nodes:
            preds = list(newG.predecessors(n))
            if len(preds) == 0 and n not in block_selector_nodes:

                newG.add_edge("_data_", n)
                new_models["_data_"] = BlockSelector("_data_")

                new_datas["_data_"] = X

        new_data_dtm = BlockManager(new_datas)

        new_graph_pipeline = GraphPipeline(models=new_models,
                                           edges=edges_from_graph(newG))

        return new_graph_pipeline, new_data_dtm
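
To summarize the rewiring done by this method: an edge between two already-computed nodes is dropped, an edge leaving a computed node is re-rooted on a "_data_<node>" BlockSelector fed from data_dico, an edge between two remaining nodes is kept as-is, and any remaining root that is not a data placeholder receives the original X through a generic "_data_" selector; the collected blocks are then packed into a BlockManager. A self-contained toy sketch of the edge partition (toy graph and names, not aikit's API):

import networkx as nx

complete = nx.DiGraph([("A", "B"), ("B", "C"), ("D", "C")])
done = {"A", "B"}                          # nodes whose output is already cached

newG = nx.DiGraph()
for n1, n2 in complete.edges:
    if n1 in done and n2 in done:
        continue                           # nothing left to fit on this edge
    elif n1 in done:
        newG.add_edge("_data_%s" % n1, n2) # cached output becomes an input block
    else:
        newG.add_edge(n1, n2)              # downstream nodes still need fitting

print(sorted(newG.edges))                  # [('D', 'C'), ('_data_B', 'C')]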