Example #1
def test_BlockManager_retrieve():
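    # Blocks can be retrieved by key (dict construction) or by integer position
    # (list construction); unknown keys raise KeyError and no copy is made.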

    np.random.seed(123)

    df = pd.DataFrame({
        "a": np.arange(10),
        "b": ["aaa", "bbb", "ccc"] * 3 + ["ddd"]
    })
    arr = np.random.randn(df.shape[0], 5)

    X = BlockManager({"df": df, "arr": arr})
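    # the two blocks share 10 rows; columns are concatenated: 2 + 5 = 7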

    assert X.shape == (10, 7)

    df1 = X["df"]
    arr1 = X["arr"]

    assert id(df1) == id(df)
    assert id(arr1) == id(arr)

    assert "df" in X
    assert "arr" in X

    with pytest.raises(KeyError):
        X["toto"]

    assert "toto" not in X

    with pytest.raises(KeyError):
        X[0]

    assert 0 not in X

    X = BlockManager([df, arr])
    df1 = X[0]
    arr1 = X[1]

    assert X.shape == (10, 7)

    assert id(df1) == id(df)
    assert id(arr1) == id(arr)

    with pytest.raises(KeyError):
        X["toto"]

    with pytest.raises(KeyError):
        X[3]

    assert 0 in X
    assert 1 in X
    assert 3 not in X
    assert "toto" not in X
Example #2
def test_BlockManager_subset():
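    # Row-subsetting with .iloc (slice, list or array) or safe_indexing subsets
    # every block at once and returns a new BlockManager.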
    np.random.seed(123)

    df = pd.DataFrame({
        "a": np.arange(10),
        "b": ["aaa", "bbb", "ccc"] * 3 + ["ddd"]
    })
    arr = np.random.randn(df.shape[0], 5)

    X = BlockManager({"df": df, "arr": arr})

    Xsubset = X.iloc[0:3, :]

    assert isinstance(Xsubset, BlockManager)
    assert (Xsubset["df"] == df.iloc[0:3, :]).all().all()
    assert (Xsubset["arr"] == arr[0:3, :]).all()
    assert np.may_share_memory(Xsubset["arr"], arr)

    Xsubset = X.iloc[[0, 1, 2]]
    assert isinstance(Xsubset, BlockManager)
    assert (Xsubset["df"] == df.iloc[0:3, :]).all().all()
    assert (Xsubset["arr"] == arr[0:3, :]).all()

    Xsubset = X.iloc[np.array([0, 1, 2])]
    assert isinstance(Xsubset, BlockManager)
    assert (Xsubset["df"] == df.iloc[0:3, :]).all().all()
    assert (Xsubset["arr"] == arr[0:3, :]).all()

    Xsubset = safe_indexing(X, np.array([0, 1, 2]))
    assert isinstance(Xsubset, BlockManager)
    assert (Xsubset["df"] == df.iloc[0:3, :]).all().all()
    assert (Xsubset["arr"] == arr[0:3, :]).all()
Example #3
def test_BlockManager_subset_with_sparse():
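    # .iloc row-subsetting also works when a block is a scipy sparse matrix.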
    np.random.seed(123)
    X = np.random.randn(100, 10)
    X[X < 0] = 0

    Xs = sps.coo_matrix(X)

    bm = BlockManager({"X": Xs})

    bm2 = bm.iloc[0:10, :]

    assert (bm2["X"].todense() == X[0:10, :]).all()
Example #4
def test_graphpipeline_blockselector_cv():
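    # cross_val_score fails on a plain dict input but works once the same
    # blocks are wrapped in a BlockManager.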

    Xnum, y = make_classification(n_samples=100)

    dfX_text = pd.DataFrame({
        "text1": get_random_strings(100),
        "text2": get_random_strings(100)
    })

    ### X = dict
    X = {"text": dfX_text, "num": Xnum}

    graphpipeline = GraphPipeline(
        models={
            "BS_text": BlockSelector("text"),
            "CV": CountVectorizerWrapper(analyzer="char"),
            "BS_num": BlockSelector("num"),
            "RF": DecisionTreeClassifier(),
        },
        edges=[("BS_text", "CV", "RF"), ("BS_num", "RF")],
    )

    from sklearn.model_selection import cross_val_score

    with pytest.raises(ValueError):
        cv_res = cross_val_score(graphpipeline,
                                 X,
                                 y,
                                 scoring="accuracy",
                                 cv=10)
        # doesn't work: cross-validation can't subset a plain dictionary

    X = BlockManager({"text": dfX_text, "num": Xnum})

    graphpipeline = GraphPipeline(
        models={
            "BS_text": BlockSelector("text"),
            "CV": CountVectorizerWrapper(analyzer="char"),
            "BS_num": BlockSelector("num"),
            "RF": DecisionTreeClassifier(),
        },
        edges=[("BS_text", "CV", "RF"), ("BS_num", "RF")],
    )

    cv_res = cross_val_score(graphpipeline, X, y, scoring="accuracy", cv=10)

    assert len(cv_res) == 10
Example #5
def test_graphpipeline_blockselector():
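    # BlockSelector routes the right block to each branch of the GraphPipeline,
    # whether X is a dict, a list or a BlockManager.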

    Xnum, y = make_classification(n_samples=100)

    dfX_text = pd.DataFrame({"text1": get_random_strings(100), "text2": get_random_strings(100)})

    X = {"text": dfX_text, "num": Xnum}

    graphpipeline = GraphPipeline(
        models={
            "BS_text": BlockSelector("text"),
            "CV": CountVectorizerWrapper(analyzer="char"),
            "BS_num": BlockSelector("num"),
            "RF": DecisionTreeClassifier(),
        },
        edges=[("BS_text", "CV", "RF"), ("BS_num", "RF")],
    )

    graphpipeline.fit(X, y)
    yhat = graphpipeline.predict(X)

    assert yhat.ndim == 1
    assert yhat.shape[0] == y.shape[0]

    ### X = dict ###
    X = {"text": dfX_text, "num": Xnum}

    graphpipeline = GraphPipeline(
        models={"BS_text": BlockSelector("text"), "BS_num": BlockSelector("num"), "PT": DebugPassThrough()},
        edges=[("BS_text", "PT"), ("BS_num", "PT")],
    )

    Xhat = graphpipeline.fit_transform(X)

    assert Xhat.shape[0] == dfX_text.shape[0]
    assert Xhat.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

    assert "text1" in Xhat.columns
    assert "text2" in Xhat.columns
    assert (Xhat.loc[:, ["text1", "text2"]] == dfX_text).all().all()

    cols = diff(list(Xhat.columns), ["text1", "text2"])
    assert (Xhat.loc[:, cols].values == Xnum).all()

    ### X = list
    X = [dfX_text, Xnum]

    graphpipeline = GraphPipeline(
        models={"BS_text": BlockSelector(0), "BS_num": BlockSelector(1), "PT": DebugPassThrough()},
        edges=[("BS_text", "PT"), ("BS_num", "PT")],
    )

    Xhat = graphpipeline.fit_transform(X)

    assert Xhat.shape[0] == dfX_text.shape[0]
    assert Xhat.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

    assert "text1" in Xhat.columns
    assert "text2" in Xhat.columns
    assert (Xhat.loc[:, ["text1", "text2"]] == dfX_text).all().all()

    cols = diff(list(Xhat.columns), ["text1", "text2"])
    assert (Xhat.loc[:, cols].values == Xnum).all()

    ### X = BlockManager
    X = BlockManager({"text": dfX_text, "num": Xnum})

    graphpipeline = GraphPipeline(
        models={"BS_text": BlockSelector("text"), "BS_num": BlockSelector("num"), "PT": DebugPassThrough()},
        edges=[("BS_text", "PT"), ("BS_num", "PT")],
    )

    Xhat = graphpipeline.fit_transform(X)

    assert Xhat.shape[0] == dfX_text.shape[0]
    assert Xhat.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

    assert "text1" in Xhat.columns
    assert "text2" in Xhat.columns
    assert (Xhat.loc[:, ["text1", "text2"]] == dfX_text).all().all()

    cols = diff(list(Xhat.columns), ["text1", "text2"])
    assert (Xhat.loc[:, cols].values == Xnum).all()
Example #6
def test_BlockSelector():
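    # BlockSelector picks one block (by key or by position) without copying it
    # and exposes that block's feature names.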

    np.random.seed(123)

    df = pd.DataFrame({
        "a": np.arange(10),
        "b": ["aaa", "bbb", "ccc"] * 3 + ["ddd"]
    })
    arr = np.random.randn(df.shape[0], 5)

    input_features = {
        "df": df.columns,
        "arr": ["COL_%d" % i for i in range(arr.shape[1])]
    }

    # Dictionary

    block_selector1 = BlockSelector("df")
    block_selector2 = BlockSelector("arr")

    X = {"df": df, "arr": arr}
    Xres1 = block_selector1.fit_transform(X)
    Xres2 = block_selector2.fit_transform(X)

    assert block_selector1.get_feature_names() == ["a", "b"]
    assert block_selector1.get_feature_names(
        input_features=input_features) == ["a", "b"]

    assert block_selector2.get_feature_names() == [0, 1, 2, 3, 4]
    assert block_selector2.get_feature_names(
        input_features=input_features) == [
            "COL_0",
            "COL_1",
            "COL_2",
            "COL_3",
            "COL_4",
        ]

    assert id(Xres1) == id(df)
    assert id(Xres2) == id(arr)  # no copy

    # List
    X = [df, arr]
    input_features = [df.columns, ["COL_%d" % i for i in range(arr.shape[1])]]

    block_selector1 = BlockSelector(0)
    block_selector2 = BlockSelector(1)

    Xres1 = block_selector1.fit_transform(X)
    Xres2 = block_selector2.fit_transform(X)

    assert block_selector1.get_feature_names() == ["a", "b"]
    assert block_selector1.get_feature_names(
        input_features=input_features) == ["a", "b"]

    assert block_selector2.get_feature_names() == [0, 1, 2, 3, 4]
    assert block_selector2.get_feature_names(
        input_features=input_features) == [
            "COL_0",
            "COL_1",
            "COL_2",
            "COL_3",
            "COL_4",
        ]

    assert id(Xres1) == id(df)
    assert id(Xres2) == id(arr)

    # BlockManager
    X = BlockManager({"df": df, "arr": arr})
    input_features = {
        "df": df.columns,
        "arr": ["COL_%d" % i for i in range(arr.shape[1])]
    }

    block_selector1 = BlockSelector("df")
    block_selector2 = BlockSelector("arr")

    # this time fit_transform receives the BlockManager built above
    Xres1 = block_selector1.fit_transform(X)
    Xres2 = block_selector2.fit_transform(X)

    assert block_selector1.get_feature_names() == ["a", "b"]
    assert block_selector1.get_feature_names(
        input_features=input_features) == ["a", "b"]

    assert block_selector2.get_feature_names() == [0, 1, 2, 3, 4]
    assert block_selector2.get_feature_names(
        input_features=input_features) == [
            "COL_0",
            "COL_1",
            "COL_2",
            "COL_3",
            "COL_4",
        ]

    assert id(Xres1) == id(df)
    assert id(Xres2) == id(arr)  # no copy

    # Check not fitted

    block_selector1 = BlockSelector(0)
    with pytest.raises(NotFittedError):
        block_selector1.transform(X)
Example #7
    def _approx_cross_validation_create_sub_graph_pipeline(self, data_dico, X):
        """Create the new graph-pipeline that still needs to be fully cross-validated,
        together with the new data on which to run that cross-validation
        (a usage sketch follows this example).

        Returns
        -------
        new_graph_pipeline

        new_data
        """
        ### Create a new GraphPipeline with only the remaining nodes ###

        dones_nodes = set()
        for k, v in data_dico.items():
            if v is not None:
                dones_nodes.add(k)

        newG = nx.DiGraph()
        new_models = {}
        new_datas = {}
        block_selector_nodes = set()

        for n1, n2 in self.complete_graph.edges:

            if n1 in dones_nodes and n2 in dones_nodes:
                pass

            elif n1 in dones_nodes and n2 not in dones_nodes:

                newG.add_edge("_data_%s" % n1, n2)

                new_models[n2] = self._models[n2]
                new_models["_data_%s" % n1] = BlockSelector("_data_%s" % n1)

                new_datas["_data_%s" % n1] = data_dico[n1]

                block_selector_nodes.add("_data_%s" % n1)
                # Add a BlockSelector

            elif n1 not in dones_nodes and n2 not in dones_nodes:
                newG.add_edge(n1, n2)

                new_models[n1] = self._models[n1]
                new_models[n2] = self._models[n2]

            else:
                raise ValueError("Should never get here")

        nodes = list(newG.nodes)  # copy because I'll modify the graph
        for n in nodes:
            preds = list(newG.predecessors(n))
            if len(preds) == 0 and n not in block_selector_nodes:

                newG.add_edge("_data_", n)
                new_models["_data_"] = BlockSelector("_data_")

                new_datas["_data_"] = X

        new_data_dtm = BlockManager(new_datas)

        new_graph_pipeline = GraphPipeline(models=new_models,
                                           edges=edges_from_graph(newG))

        return new_graph_pipeline, new_data_dtm
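
The pipeline and the BlockManager returned here are meant to be cross-validated together, as the docstring says. Below is a minimal usage sketch under that assumption; the names pipeline, data_dico, X and y stand for objects available in the calling context and are not part of the library.

# Illustrative sketch only (not library code): `pipeline` is a GraphPipeline
# instance, `data_dico` maps node names to already-computed outputs (or None),
# and X, y are the original data and target.
from sklearn.model_selection import cross_val_score

new_pipeline, new_data = pipeline._approx_cross_validation_create_sub_graph_pipeline(data_dico, X)

# new_data is a BlockManager, so cross_val_score can subset it,
# exactly as in Example #4.
scores = cross_val_score(new_pipeline, new_data, y, scoring="accuracy", cv=10)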