Ejemplo n.º 1
0
def test_gpipeline_raise_not_fitted():
    """Check that predicting with an unfitted GraphPipeline raises NotFittedError."""
    nodes = {"PT": PassThrough(), "Ridge": Ridge()}
    edges = [("PT", "Ridge")]
    unfitted_pipeline = GraphPipeline(nodes, edges)

    # ``X`` is not defined in this function; it is resolved from the enclosing scope.
    with pytest.raises(NotFittedError):
        unfitted_pipeline.predict(X)
Ejemplo n.º 2
0
def test_gpipeline_regression():
    """Fit a PassThrough -> Ridge GraphPipeline and check its regression behaviour.

    Verifies that:

    * the pipeline prediction has the shape of ``y`` and equals the prediction
      of the fitted ``"Ridge"`` node applied directly to ``X``;
    * ``predict_proba`` and ``predict_log_proba`` raise ``AttributeError``;
    * the feature names reported at the ``"PT"`` and ``"Ridge"`` nodes are the
      columns of ``X``;
    * asking for the feature names of an unknown node raises ``ValueError``.

    ``dfX`` and ``y`` are not defined here; they come from the enclosing scope.
    """
    gpipeline = GraphPipeline({
        "PT": PassThrough(),
        "Ridge": Ridge()
    }, [("PT", "Ridge")])

    X = dfX.loc[:, ["num1", "num2", "num3"]]
    expected_features = list(X.columns)

    gpipeline.fit(X, y)
    yhat = gpipeline.predict(X)
    yhat2 = gpipeline.models["Ridge"].predict(X)

    assert yhat.shape == y.shape
    assert (yhat == yhat2).all()

    with pytest.raises(AttributeError):
        gpipeline.predict_proba(X)

    with pytest.raises(AttributeError):
        gpipeline.predict_log_proba(X)

    assert gpipeline.get_feature_names_at_node("PT") == expected_features
    assert gpipeline.get_input_features_at_node("PT") == expected_features
    assert gpipeline.get_input_features_at_node("Ridge") == expected_features

    with pytest.raises(ValueError):
        # Call directly rather than through ``assert``: the expected outcome is
        # the exception, and an ``assert`` statement is stripped under ``-O``,
        # which would silently skip the call.
        gpipeline.get_feature_names_at_node("DONTEXIST")
Ejemplo n.º 3
0
def test_gpipeline_clustering():
    """Check that a PassThrough -> KMeans GraphPipeline predicts like its KMeans node."""
    nodes = {"PT": PassThrough(), "kmeans": KMeans(n_clusters=2)}
    gpipeline = GraphPipeline(nodes, [("PT", "kmeans")])

    # ``X`` is resolved from the enclosing scope; no target is passed to ``fit``.
    gpipeline.fit(X)

    pipeline_labels = gpipeline.predict(X)
    node_labels = gpipeline.models["kmeans"].predict(X)

    assert (pipeline_labels == node_labels).all()
Ejemplo n.º 4
0
def test_graphpipeline_blockselector():
    """Check that BlockSelector nodes route the blocks of a multi-block input.

    Two scenarios are covered:

    1. A full graph (text block -> CountVectorizerWrapper, numeric block, both
       feeding a DecisionTreeClassifier) is fitted and must return a 1-D
       prediction with one entry per sample.
    2. Both blocks are selected and merged in a DebugPassThrough node; the
       result must contain the text columns unchanged followed by the numeric
       values. This is repeated for three containers: a dict, a list
       (blocks selected by position) and a BlockManager.
    """
    Xnum, y = make_classification(n_samples=100)

    dfX_text = pd.DataFrame({"text1": get_random_strings(100), "text2": get_random_strings(100)})

    def check_blocks_are_concatenated(X, text_key, num_key):
        """Select both blocks of ``X``, merge them and compare with the inputs."""
        graphpipeline = GraphPipeline(
            models={"BS_text": BlockSelector(text_key), "BS_num": BlockSelector(num_key), "PT": DebugPassThrough()},
            edges=[("BS_text", "PT"), ("BS_num", "PT")],
        )

        Xhat = graphpipeline.fit_transform(X)

        assert Xhat.shape[0] == dfX_text.shape[0]
        assert Xhat.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

        assert "text1" in Xhat.columns
        assert "text2" in Xhat.columns
        assert (Xhat.loc[:, ["text1", "text2"]] == dfX_text).all().all()

        # Every column that is not a text column must hold the numeric block.
        cols = diff(list(Xhat.columns), ["text1", "text2"])
        assert (Xhat.loc[:, cols].values == Xnum).all()

    ### Full graph ending with a classifier ###
    X = {"text": dfX_text, "num": Xnum}

    graphpipeline = GraphPipeline(
        models={
            "BS_text": BlockSelector("text"),
            "CV": CountVectorizerWrapper(analyzer="char"),
            "BS_num": BlockSelector("num"),
            "RF": DecisionTreeClassifier(),
        },
        edges=[("BS_text", "CV", "RF"), ("BS_num", "RF")],
    )

    graphpipeline.fit(X, y)
    yhat = graphpipeline.predict(X)

    assert yhat.ndim == 1
    assert yhat.shape[0] == y.shape[0]

    ### X = dict ###
    check_blocks_are_concatenated({"text": dfX_text, "num": Xnum}, "text", "num")

    ### X = list (blocks are selected by position) ###
    check_blocks_are_concatenated([dfX_text, Xnum], 0, 1)

    ### X = BlockManager ###
    check_blocks_are_concatenated(BlockManager({"text": dfX_text, "num": Xnum}), "text", "num")
Ejemplo n.º 5
0
def test_score_from_params(x_data_type, shuffle, graph_pipeline):
    """Exercise ``score_from_params_clustering`` on random data with KMeans.

    Parameters (presumably supplied by a pytest parametrization that is not
    visible here -- TODO confirm):

    x_data_type
        Output type passed to ``convert_generic`` for ``X``.
    shuffle
        When truthy, the rows of ``X`` are permuted before scoring.
    graph_pipeline
        When truthy, KMeans is wrapped in a DebugPassThrough -> KMeans
        GraphPipeline; otherwise a bare KMeans is used.

    Three call modes are checked (scores only, scores + labels, labels only);
    after each one the estimator passed in must still be unfitted.
    """
    np.random.seed(123)
    X = convert_generic(np.random.randn(100, 10), output_type=x_data_type)

    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if shuffle:
        permutation = np.arange(X.shape[0])
        np.random.shuffle(permutation)
        X = X.loc[permutation, :] if isinstance(X, pd.DataFrame) else X[permutation, :]

    scoring = ["silhouette", "davies_bouldin", "calinski_harabasz"]

    if not graph_pipeline:
        estimator = KMeans(n_clusters=3, random_state=123)
    else:
        estimator = GraphPipeline(
            {"pt": DebugPassThrough(), "lg": KMeans(n_clusters=3, random_state=123)}, edges=[("pt", "lg")]
        )

    def check_score_table(scores):
        """The result is a one-row DataFrame with a ``test_<scorer>`` column per scorer."""
        assert isinstance(scores, pd.DataFrame)
        assert scores.shape[0] == 1
        for scorer in scoring:
            assert ("test_" + scorer) in set(scores.columns)

    def check_estimator_still_unfitted():
        """The estimator handed to the scoring function must not have been fitted."""
        with pytest.raises(NotFittedError):
            estimator.predict(X)

    # --- Only score ---
    res = score_from_params_clustering(estimator, X, scoring=scoring, verbose=0)

    check_score_table(res)
    check_estimator_still_unfitted()

    # --- Score + Prediction ---
    res, label = score_from_params_clustering(estimator, X, scoring=scoring, verbose=0, return_predict=True)

    check_score_table(res)
    assert isinstance(label, np.ndarray)
    assert len(np.unique(label)) == 3
    check_estimator_still_unfitted()

    # --- Predict only ---
    res, label = score_from_params_clustering(
        estimator, X, scoring=scoring, verbose=0, return_predict=True, no_scoring=True
    )

    assert len(np.unique(label)) == 3
    assert res is None
    check_estimator_still_unfitted()
Ejemplo n.º 6
0
def test_cross_validation(add_third_class, x_data_type, y_string_class, shuffle, graph_pipeline, with_groups):
    """Exercise ``cross_validation`` with LogisticRegression on a toy classification set.

    Parameters (presumably supplied by a pytest parametrization that is not
    visible here -- TODO confirm):

    add_third_class
        When truthy, the first two targets are set to 2, and only ``accuracy``
        is scored (``neg_log_loss`` is requested only otherwise).
    x_data_type
        Output type passed to ``convert_generic`` for ``X``.
    y_string_class
        When truthy, targets are converted to ``"CL_<n>"`` strings.
    shuffle
        When truthy, ``X`` and ``y`` are permuted with the same permutation.
    graph_pipeline
        When truthy, the classifier is wrapped in a DebugPassThrough ->
        LogisticRegression GraphPipeline.
    with_groups
        When truthy, four groups of 25 samples are passed; otherwise ``None``.

    Four call modes are checked (scores only, scores + probabilities,
    scores + predictions, predictions only); after each one the estimator
    passed in must still be unfitted.
    """
    X, y = make_classification(n_samples=100, random_state=123)
    groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25) if with_groups else None

    X = convert_generic(X, output_type=x_data_type)
    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if add_third_class:
        y[0:2] = 2

    if shuffle:
        np.random.seed(123)
        permutation = np.arange(X.shape[0])
        np.random.shuffle(permutation)
        y = y[permutation]
        X = X.loc[permutation, :] if isinstance(X, pd.DataFrame) else X[permutation, :]

    if y_string_class:
        y = np.array(["CL_%d" % i for i in y])

    scoring = ["accuracy"] if add_third_class else ["accuracy", "neg_log_loss"]

    if not graph_pipeline:
        estimator = LogisticRegression()
    else:
        estimator = GraphPipeline({"pt": DebugPassThrough(), "lg": LogisticRegression()}, edges=[("pt", "lg")])

    def check_score_table(scores):
        """One row per fold, with ``test_`` and ``train_`` columns for every scorer."""
        assert isinstance(scores, pd.DataFrame)
        assert scores.shape[0] == 10
        for scorer in scoring:
            assert ("test_" + scorer) in set(scores.columns)
            assert ("train_" + scorer) in set(scores.columns)

    def check_estimator_still_unfitted():
        """The estimator handed to ``cross_validation`` must not have been fitted."""
        with pytest.raises(NotFittedError):
            estimator.predict(X)

    # --- Only score ---
    cv_res = cross_validation(estimator, X, y, groups, cv=10, scoring=scoring, verbose=0)

    check_score_table(cv_res)
    check_estimator_still_unfitted()

    # --- Score + Proba ---
    cv_res, yhat_proba = cross_validation(
        estimator, X, y, groups, cv=10, scoring=scoring, verbose=0, return_predict=True
    )

    check_score_table(cv_res)

    assert isinstance(yhat_proba, pd.DataFrame)
    if isinstance(X, pd.DataFrame):
        # Row order must follow the (possibly shuffled) input index.
        assert (yhat_proba.index == X.index).all()

    assert yhat_proba.shape == (y.shape[0], 2 + 1 * add_third_class)
    assert yhat_proba.min().min() >= 0
    assert yhat_proba.max().max() <= 1
    assert list(yhat_proba.columns) == list(np.sort(np.unique(y)))

    check_estimator_still_unfitted()

    # --- Score + Predict ---
    cv_res, yhat = cross_validation(
        estimator, X, y, groups, cv=10, scoring=scoring, verbose=0, return_predict=True, method="predict"
    )

    check_score_table(cv_res)

    assert yhat.ndim == 1
    # Every predicted value must be one of the observed classes.
    assert len(np.setdiff1d(yhat, y)) == 0
    assert yhat.shape[0] == y.shape[0]

    check_estimator_still_unfitted()

    # --- Predict only ---
    cv_res, yhat = cross_validation(
        estimator,
        X,
        y,
        groups,
        cv=10,
        scoring=scoring,
        verbose=0,
        return_predict=True,
        method="predict",
        no_scoring=True,
    )

    assert yhat.shape[0] == y.shape[0]
    assert cv_res is None
    assert yhat.ndim == 1
    assert len(np.setdiff1d(yhat, y)) == 0

    check_estimator_still_unfitted()