def test_gpipeline_raise_not_fitted():
    """Calling ``predict`` on a GraphPipeline that was never fitted raises NotFittedError."""
    models = {"PT": PassThrough(), "Ridge": Ridge()}
    edges = [("PT", "Ridge")]
    unfitted_pipeline = GraphPipeline(models, edges)

    # `X` is not assigned in this function: it is resolved from the enclosing module scope.
    with pytest.raises(NotFittedError):
        unfitted_pipeline.predict(X)
def test_gpipeline_regression():
    """Check a ``PT -> Ridge`` GraphPipeline used as a regressor.

    The test verifies that:

    * predictions of the pipeline have the shape of ``y`` and are equal to the
      predictions of the fitted ``Ridge`` node called directly on ``X``;
    * ``predict_proba`` and ``predict_log_proba`` raise ``AttributeError``;
    * feature names / input features reported at each node are the columns of ``X``;
    * asking for the feature names of an unknown node raises ``ValueError``.
    """
    gpipeline = GraphPipeline({"PT": PassThrough(), "Ridge": Ridge()}, [("PT", "Ridge")])

    # `dfX` and `y` are not assigned in this function: they come from the module scope.
    X = dfX.loc[:, ["num1", "num2", "num3"]]
    gpipeline.fit(X, y)

    yhat = gpipeline.predict(X)
    yhat2 = gpipeline.models["Ridge"].predict(X)

    assert yhat.shape == y.shape
    assert (yhat == yhat2).all()

    with pytest.raises(AttributeError):
        gpipeline.predict_proba(X)

    with pytest.raises(AttributeError):
        gpipeline.predict_log_proba(X)

    expected_features = list(X.columns)
    assert gpipeline.get_feature_names_at_node("PT") == expected_features
    assert gpipeline.get_input_features_at_node("PT") == expected_features
    assert gpipeline.get_input_features_at_node("Ridge") == expected_features

    # No `assert` around the call: only the raised exception is under test here.
    # Asserting on the return value would turn a "did not raise" failure into an
    # unrelated AssertionError if the call ever returned a falsy value.
    with pytest.raises(ValueError):
        gpipeline.get_feature_names_at_node("DONTEXIST")
def test_gpipeline_clustering():
    """A ``PT -> kmeans`` GraphPipeline predicts the same labels as its fitted KMeans node."""
    steps = {"PT": PassThrough(), "kmeans": KMeans(n_clusters=2)}
    pipeline = GraphPipeline(steps, [("PT", "kmeans")])

    # `X` is not assigned in this function: it is resolved from the enclosing module scope.
    pipeline.fit(X)

    labels_from_pipeline = pipeline.predict(X)
    labels_from_node = pipeline.models["kmeans"].predict(X)

    assert (labels_from_pipeline == labels_from_node).all()
def test_graphpipeline_blockselector():
    """Exercise ``BlockSelector`` nodes inside a GraphPipeline on multi-block inputs.

    First a classifier graph is fitted on a dict input (a text block going through a
    char-level count vectorizer, and a numerical block, both feeding a decision tree).
    Then, for a dict, a list and a ``BlockManager`` input, the two selected blocks are
    merged by a ``DebugPassThrough`` node and the result is compared with the originals.
    """
    Xnum, y = make_classification(n_samples=100)
    dfX_text = pd.DataFrame({"text1": get_random_strings(100), "text2": get_random_strings(100)})

    ### Classifier fed by two selected blocks ###
    X = {"text": dfX_text, "num": Xnum}
    classifier_graph = GraphPipeline(
        models={
            "BS_text": BlockSelector("text"),
            "CV": CountVectorizerWrapper(analyzer="char"),
            "BS_num": BlockSelector("num"),
            "RF": DecisionTreeClassifier(),
        },
        edges=[("BS_text", "CV", "RF"), ("BS_num", "RF")],
    )
    classifier_graph.fit(X, y)
    yhat = classifier_graph.predict(X)

    assert yhat.ndim == 1
    assert yhat.shape[0] == y.shape[0]

    def check_blocks_are_concatenated(blocks, text_key, num_key):
        """Select both blocks from ``blocks``, merge them and compare with the originals."""
        merging_graph = GraphPipeline(
            models={
                "BS_text": BlockSelector(text_key),
                "BS_num": BlockSelector(num_key),
                "PT": DebugPassThrough(),
            },
            edges=[("BS_text", "PT"), ("BS_num", "PT")],
        )
        Xhat = merging_graph.fit_transform(blocks)

        assert Xhat.shape[0] == dfX_text.shape[0]
        assert Xhat.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

        assert "text1" in Xhat.columns
        assert "text2" in Xhat.columns
        assert (Xhat.loc[:, ["text1", "text2"]] == dfX_text).all().all()

        # Every column that is not a text column is compared with the numerical block.
        other_columns = diff(list(Xhat.columns), ["text1", "text2"])
        assert (Xhat.loc[:, other_columns].values == Xnum).all()

    ### X = dict ###
    check_blocks_are_concatenated({"text": dfX_text, "num": Xnum}, "text", "num")

    ### X = list ###
    check_blocks_are_concatenated([dfX_text, Xnum], 0, 1)

    ### X = BlockManager ###
    check_blocks_are_concatenated(BlockManager({"text": dfX_text, "num": Xnum}), "text", "num")
def test_score_from_params(x_data_type, shuffle, graph_pipeline):
    """Check ``score_from_params_clustering`` in its three calling modes.

    The estimator is either a bare ``KMeans`` or a GraphPipeline wrapping one. For
    each mode (scores only, scores + labels, labels only) the returned objects are
    checked, and the test asserts that the estimator passed in is still unfitted
    afterwards.

    NOTE(review): the arguments are presumably provided by pytest parametrization or
    fixtures that are not visible in this block -- confirm.

    Args:
        x_data_type: output type handed to ``convert_generic`` for ``X``.
        shuffle: when truthy, the rows of ``X`` are randomly reordered.
        graph_pipeline: when truthy, KMeans is wrapped in a GraphPipeline.
    """
    np.random.seed(123)
    X = convert_generic(np.random.randn(100, 10), output_type=x_data_type)

    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if shuffle:
        row_order = np.arange(X.shape[0])
        np.random.shuffle(row_order)
        X = X.loc[row_order, :] if isinstance(X, pd.DataFrame) else X[row_order, :]

    scoring = ["silhouette", "davies_bouldin", "calinski_harabasz"]

    if graph_pipeline:
        estimator = GraphPipeline(
            {"pt": DebugPassThrough(), "lg": KMeans(n_clusters=3, random_state=123)}, edges=[("pt", "lg")]
        )
    else:
        estimator = KMeans(n_clusters=3, random_state=123)

    def check_scores(scores):
        """One row of results holding a ``test_<metric>`` column per requested metric."""
        assert isinstance(scores, pd.DataFrame)
        assert scores.shape[0] == 1
        available = set(scores.columns)
        for metric in scoring:
            assert ("test_" + metric) in available

    def check_estimator_still_unfitted():
        """Predicting with the estimator handed to the scorer must still fail."""
        with pytest.raises(NotFittedError):
            estimator.predict(X)

    ##################
    ### Only score ###
    ##################
    res = score_from_params_clustering(estimator, X, scoring=scoring, verbose=0)
    check_scores(res)
    check_estimator_still_unfitted()

    ##########################
    ### Score + Prediction ###
    ##########################
    res, label = score_from_params_clustering(estimator, X, scoring=scoring, verbose=0, return_predict=True)
    check_scores(res)
    assert isinstance(label, np.ndarray)
    assert len(np.unique(label)) == 3
    check_estimator_still_unfitted()

    ####################
    ### Predict only ###
    ####################
    res, label = score_from_params_clustering(
        estimator, X, scoring=scoring, verbose=0, return_predict=True, no_scoring=True
    )
    assert len(np.unique(label)) == 3
    assert res is None
    check_estimator_still_unfitted()
def test_cross_validation(add_third_class, x_data_type, y_string_class, shuffle, graph_pipeline, with_groups):
    """Check ``cross_validation`` on a classification problem in its four calling modes.

    The estimator is either a bare ``LogisticRegression`` or a GraphPipeline wrapping
    one. For each mode (scores only, scores + probabilities, scores + predictions,
    predictions only) the returned objects are checked, and the test asserts that the
    estimator passed in is still unfitted afterwards.

    NOTE(review): the arguments are presumably provided by pytest parametrization or
    fixtures that are not visible in this block -- confirm.

    Args:
        add_third_class: when truthy, the first two targets are relabelled as class 2
            and only ``accuracy`` is scored.
        x_data_type: output type handed to ``convert_generic`` for ``X``.
        y_string_class: when truthy, targets are turned into ``"CL_<n>"`` strings.
        shuffle: when truthy, ``X`` and ``y`` are reordered with the same permutation.
        graph_pipeline: when truthy, the classifier is wrapped in a GraphPipeline.
        with_groups: when truthy, four groups of 25 samples are passed to the CV.
    """
    X, y = make_classification(n_samples=100, random_state=123)

    groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25) if with_groups else None

    X = convert_generic(X, output_type=x_data_type)
    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if add_third_class:
        y[0:2] = 2

    if shuffle:
        np.random.seed(123)
        row_order = np.arange(X.shape[0])
        np.random.shuffle(row_order)
        y = y[row_order]
        X = X.loc[row_order, :] if isinstance(X, pd.DataFrame) else X[row_order, :]

    if y_string_class:
        y = np.array(["CL_%d" % i for i in y])

    scoring = ["accuracy"] if add_third_class else ["accuracy", "neg_log_loss"]

    if graph_pipeline:
        estimator = GraphPipeline({"pt": DebugPassThrough(), "lg": LogisticRegression()}, edges=[("pt", "lg")])
    else:
        estimator = LogisticRegression()

    # Keyword arguments shared by every call to `cross_validation` below.
    common_kwargs = {"cv": 10, "scoring": scoring, "verbose": 0}

    def check_cv_scores(scores):
        """One row per fold, with ``test_`` and ``train_`` columns for each metric."""
        assert isinstance(scores, pd.DataFrame)
        assert scores.shape[0] == 10
        available = set(scores.columns)
        for metric in scoring:
            assert ("test_" + metric) in available
            assert ("train_" + metric) in available

    def check_estimator_still_unfitted():
        """Predicting with the estimator handed to the cross-validation must still fail."""
        with pytest.raises(NotFittedError):
            estimator.predict(X)

    ##################
    ### Only score ###
    ##################
    cv_res = cross_validation(estimator, X, y, groups, **common_kwargs)
    check_cv_scores(cv_res)
    check_estimator_still_unfitted()

    #####################
    ### Score + Proba ###
    #####################
    cv_res, yhat_proba = cross_validation(estimator, X, y, groups, return_predict=True, **common_kwargs)
    check_cv_scores(cv_res)

    assert isinstance(yhat_proba, pd.DataFrame)
    if isinstance(X, pd.DataFrame):
        assert (yhat_proba.index == X.index).all()

    assert yhat_proba.shape == (y.shape[0], 2 + 1 * add_third_class)
    assert yhat_proba.min().min() >= 0
    assert yhat_proba.max().max() <= 1
    assert list(yhat_proba.columns) == list(np.sort(np.unique(y)))
    check_estimator_still_unfitted()

    #######################
    ### Score + Predict ###
    #######################
    cv_res, yhat = cross_validation(
        estimator, X, y, groups, return_predict=True, method="predict", **common_kwargs
    )
    check_cv_scores(cv_res)

    assert yhat.ndim == 1
    # Every predicted label must be one of the labels present in `y`.
    assert len(np.setdiff1d(yhat, y)) == 0
    assert yhat.shape[0] == y.shape[0]
    check_estimator_still_unfitted()

    ####################
    ### Predict only ###
    ####################
    cv_res, yhat = cross_validation(
        estimator, X, y, groups, return_predict=True, method="predict", no_scoring=True, **common_kwargs
    )
    assert yhat.shape[0] == y.shape[0]
    assert cv_res is None
    assert yhat.ndim == 1
    assert len(np.setdiff1d(yhat, y)) == 0
    check_estimator_still_unfitted()