def test_graphpipeline_blockselector_cv():
    """Cross-validate a GraphPipeline whose entry nodes are BlockSelectors.

    With a plain dict of blocks, ``cross_val_score`` is expected to raise a
    ``ValueError`` (a dictionary cannot be subset into folds).  With the same
    blocks wrapped in a ``BlockManager`` one score per fold must come back.
    """
    from sklearn.model_selection import cross_val_score

    n_folds = 10

    Xnum, y = make_classification(n_samples=100)
    dfX_text = pd.DataFrame(
        {
            "text1": get_random_strings(100),
            "text2": get_random_strings(100),
        }
    )

    def build_pipeline():
        # Fresh, unfitted pipeline: text block -> char count-vectorizer -> tree,
        # numeric block -> tree.
        return GraphPipeline(
            models={
                "BS_text": BlockSelector("text"),
                "CV": CountVectorizerWrapper(analyzer="char"),
                "BS_num": BlockSelector("num"),
                "RF": DecisionTreeClassifier(),
            },
            edges=[("BS_text", "CV", "RF"), ("BS_num", "RF")],
        )

    # X given as a dictionary: doesn't work, a dictionary can't be subset
    blocks_as_dict = {"text": dfX_text, "num": Xnum}
    pipeline_on_dict = build_pipeline()
    with pytest.raises(ValueError):
        cross_val_score(pipeline_on_dict, blocks_as_dict, y, scoring="accuracy", cv=n_folds)

    # X given as a BlockManager: cross-validation must go through
    blocks_as_manager = BlockManager({"text": dfX_text, "num": Xnum})
    pipeline_on_manager = build_pipeline()
    scores = cross_val_score(pipeline_on_manager, blocks_as_manager, y, scoring="accuracy", cv=n_folds)

    assert len(scores) == n_folds
def test_graphpipeline_blockselector():
    """Check GraphPipeline + BlockSelector on dict, list and BlockManager inputs.

    First a full classification pipeline is fitted on a dict of blocks and its
    predictions are checked for shape.  Then, for each supported container
    type, two BlockSelectors are merged through a DebugPassThrough and the
    result is compared with the original text and numeric blocks.
    """
    Xnum, y = make_classification(n_samples=100)
    dfX_text = pd.DataFrame(
        {
            "text1": get_random_strings(100),
            "text2": get_random_strings(100),
        }
    )
    text_columns = ["text1", "text2"]

    # --- full pipeline ending with a classifier, X given as a dictionary ---
    classifier_pipeline = GraphPipeline(
        models={
            "BS_text": BlockSelector("text"),
            "CV": CountVectorizerWrapper(analyzer="char"),
            "BS_num": BlockSelector("num"),
            "RF": DecisionTreeClassifier(),
        },
        edges=[("BS_text", "CV", "RF"), ("BS_num", "RF")],
    )
    classifier_pipeline.fit({"text": dfX_text, "num": Xnum}, y)
    predictions = classifier_pipeline.predict({"text": dfX_text, "num": Xnum})

    assert predictions.ndim == 1
    assert predictions.shape[0] == y.shape[0]

    def check_passthrough(blocks, text_key, num_key):
        # Select both blocks, merge them through a pass-through node and make
        # sure the merged output holds the text columns followed by Xnum.
        passthrough_pipeline = GraphPipeline(
            models={
                "BS_text": BlockSelector(text_key),
                "BS_num": BlockSelector(num_key),
                "PT": DebugPassThrough(),
            },
            edges=[("BS_text", "PT"), ("BS_num", "PT")],
        )
        merged = passthrough_pipeline.fit_transform(blocks)

        assert merged.shape[0] == dfX_text.shape[0]
        assert merged.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

        for column in text_columns:
            assert column in merged.columns

        assert (merged.loc[:, text_columns] == dfX_text).all().all()

        numeric_columns = diff(list(merged.columns), text_columns)
        assert (merged.loc[:, numeric_columns].values == Xnum).all()

    # --- X = dictionary ---
    check_passthrough({"text": dfX_text, "num": Xnum}, "text", "num")

    # --- X = list (blocks addressed by position) ---
    check_passthrough([dfX_text, Xnum], 0, 1)

    # --- X = BlockManager ---
    check_passthrough(BlockManager({"text": dfX_text, "num": Xnum}), "text", "num")
def test_BlockSelector():
    """Check BlockSelector on dict, list and BlockManager containers.

    For each container type the selector must:
    * return the selected block itself (no copy),
    * report the block's own column names from ``get_feature_names()``,
    * report the names supplied through ``input_features`` when given.

    An unfitted selector must raise ``NotFittedError`` on ``transform``.
    """
    np.random.seed(123)

    df = pd.DataFrame({"a": np.arange(10), "b": ["aaa", "bbb", "ccc"] * 3 + ["ddd"]})
    arr = np.random.randn(df.shape[0], 5)

    def check_selection(X, df_key, arr_key, input_features):
        # Fit one selector per block and verify names and identity of outputs.
        block_selector1 = BlockSelector(df_key)
        block_selector2 = BlockSelector(arr_key)

        Xres1 = block_selector1.fit_transform(X)
        Xres2 = block_selector2.fit_transform(X)

        assert block_selector1.get_feature_names() == ["a", "b"]
        assert block_selector1.get_feature_names(input_features=input_features) == ["a", "b"]

        assert block_selector2.get_feature_names() == [0, 1, 2, 3, 4]
        assert block_selector2.get_feature_names(input_features=input_features) == [
            "COL_0",
            "COL_1",
            "COL_2",
            "COL_3",
            "COL_4",
        ]

        # no copy: the selector hands back the very same objects
        assert Xres1 is df
        assert Xres2 is arr

    # Dictionary
    X = {"df": df, "arr": arr}
    input_features = {"df": df.columns, "arr": ["COL_%d" % i for i in range(arr.shape[1])]}
    check_selection(X, "df", "arr", input_features)

    # List
    X = [df, arr]
    input_features = [df.columns, ["COL_%d" % i for i in range(arr.shape[1])]]
    check_selection(X, 0, 1, input_features)

    # BlockManager
    # NOTE: the previous version re-assigned X to a plain dict right after
    # building the BlockManager, so this container was never actually tested.
    X = BlockManager({"df": df, "arr": arr})
    input_features = {"df": df.columns, "arr": ["COL_%d" % i for i in range(arr.shape[1])]}
    check_selection(X, "df", "arr", input_features)

    # Check not fitted
    block_selector1 = BlockSelector(0)
    with pytest.raises(NotFittedError):
        block_selector1.transform(X)
def _approx_cross_validation_create_sub_graph_pipeline(self, data_dico, X):
    """Build the reduced graph-pipeline that still has to be cross-validated.

    Nodes whose entry in ``data_dico`` is not None are considered done.  Each
    done node feeding a remaining node is replaced by a BlockSelector reading
    the corresponding data from the returned BlockManager.  Remaining nodes
    left without any predecessor are plugged on a "_data_" BlockSelector that
    exposes ``X``.

    Parameters
    ----------
    data_dico : dict
        node name -> data attached to that node, or None when the node is
        not done

    X : object
        data stored under the "_data_" block for remaining nodes that have
        no predecessor

    Returns
    -------
    new_graph_pipeline : GraphPipeline
        pipeline restricted to the remaining nodes plus the added selectors

    new_data_dtm : BlockManager
        the blocks the new pipeline must be fed with

    Raises
    ------
    ValueError
        if an edge goes from a node that is not done to a node that is done
    """
    finished = {name for name, data in data_dico.items() if data is not None}

    sub_graph = nx.DiGraph()
    sub_models = {}
    sub_blocks = {}
    selector_names = set()

    for parent, child in self.complete_graph.edges:
        parent_done = parent in finished
        child_done = child in finished

        if parent_done and child_done:
            # Edge entirely inside the finished part: nothing to keep
            continue

        if child_done:
            # Here the parent is not done while its child is
            raise ValueError("Should never go there")

        if parent_done:
            # Frontier edge: the finished parent becomes a BlockSelector
            block_name = "_data_%s" % parent
            sub_graph.add_edge(block_name, child)
            sub_models[child] = self._models[child]
            sub_models[block_name] = BlockSelector(block_name)
            sub_blocks[block_name] = data_dico[parent]
            selector_names.add(block_name)
        else:
            # Edge entirely inside the remaining part: kept as is
            sub_graph.add_edge(parent, child)
            sub_models[parent] = self._models[parent]
            sub_models[child] = self._models[child]

    # Iterate over a snapshot because the graph is modified within the loop
    for node in tuple(sub_graph.nodes):
        if node in selector_names:
            continue
        if not list(sub_graph.predecessors(node)):
            sub_graph.add_edge("_data_", node)
            sub_models["_data_"] = BlockSelector("_data_")
            sub_blocks["_data_"] = X

    new_graph_pipeline = GraphPipeline(models=sub_models, edges=edges_from_graph(sub_graph))
    new_data_dtm = BlockManager(sub_blocks)

    return new_graph_pipeline, new_data_dtm