예제 #1
0
def test_CountVectorizerWrapper_few_sample():
    """Check that the vectorizer can be fitted on a handful of rows.

    Only the ``name`` and ``ticket`` columns of the titanic training set are
    used, restricted to index labels 0 through 10, and ``min_df=1`` is passed
    to the wrapper. The result must keep one row per input row.
    """
    titanic_train = load_dataset("titanic")[0]

    # ``.loc`` slices by label and includes both bounds, hence the 11 rows
    # expected below (assumes the frame is indexed by 0, 1, 2, ...).
    few_rows = titanic_train.loc[0:10, ["name", "ticket"]]

    vectorizer = CountVectorizerWrapper(min_df=1)
    transformed = vectorizer.fit_transform(few_rows)

    assert transformed.shape[0] == 11
예제 #2
0
def test_graphpipeline_concat_names():
    """Check the column names produced when two branches are concatenated.

    A column selector and a count vectorizer both feed a ``PassThrough`` node;
    the transformed frame must expose the selected columns followed by the
    bag-of-words columns, and ``get_feature_names`` must report the same list.
    """
    sample = get_sample_df(size=100, seed=123)

    models = {
        "sel": ColumnsSelector(columns_to_use=["float_col", "int_col"]),
        "vec": CountVectorizerWrapper(columns_to_use=["text_col"]),
        "pt": PassThrough(),
    }
    pipeline = GraphPipeline(models=models, edges=[("sel", "pt"), ("vec", "pt")])

    pipeline.fit(sample)
    transformed = pipeline.transform(sample)

    # Selected columns come first, then one column per token found in text_col.
    expected_columns = [
        "float_col",
        "int_col",
        "text_col__BAG__aaa",
        "text_col__BAG__bbb",
        "text_col__BAG__ccc",
        "text_col__BAG__ddd",
        "text_col__BAG__eee",
        "text_col__BAG__fff",
        "text_col__BAG__jjj",
    ]
    actual_columns = list(transformed.columns)

    assert actual_columns == expected_columns
    assert pipeline.get_feature_names() == actual_columns
예제 #3
0
def test_graphpipeline_blockselector_cv():
    """Check cross-validation of a GraphPipeline fed with several data blocks.

    The same graph (text block -> char count vectorizer -> tree, numeric block
    -> tree) is cross-validated twice: once with a plain dict of blocks, which
    must raise ``ValueError``, and once with a ``BlockManager``, which must
    yield one score per fold.
    """
    from sklearn.model_selection import cross_val_score

    def make_pipeline():
        # A fresh, unfitted pipeline for each cross-validation run.
        return GraphPipeline(
            models={
                "BS_text": BlockSelector("text"),
                "CV": CountVectorizerWrapper(analyzer="char"),
                "BS_num": BlockSelector("num"),
                "RF": DecisionTreeClassifier(),
            },
            edges=[("BS_text", "CV", "RF"), ("BS_num", "RF")],
        )

    Xnum, y = make_classification(n_samples=100)
    dfX_text = pd.DataFrame({
        "text1": get_random_strings(100),
        "text2": get_random_strings(100),
    })

    # Blocks given as a plain dict: cross-validation cannot subset a dict.
    dict_blocks = {"text": dfX_text, "num": Xnum}
    dict_pipeline = make_pipeline()
    with pytest.raises(ValueError):
        cross_val_score(dict_pipeline, dict_blocks, y, scoring="accuracy", cv=10)

    # Blocks wrapped in a BlockManager: one score per fold is expected.
    managed_blocks = BlockManager({"text": dfX_text, "num": Xnum})
    managed_pipeline = make_pipeline()
    scores = cross_val_score(managed_pipeline, managed_blocks, y, scoring="accuracy", cv=10)

    assert len(scores) == 10
예제 #4
0
파일: test_text.py 프로젝트: gheeraej/aikit
def test_CountVectorizerWrapper_on_Serie():
    """Check that the vectorizer accepts a single column (Series) as input.

    Both ``fit_transform`` and a subsequent ``transform`` must return a
    2-dimensional result with one row per input entry and one column per
    feature name reported by the vectorizer.
    """
    text_column = get_sample_df(size=100, seed=123)["text_col"]
    vectorizer = CountVectorizerWrapper()

    # fit_transform runs first so that transform is called on a fitted object.
    for apply in (vectorizer.fit_transform, vectorizer.transform):
        result = apply(text_column)
        n_features = len(vectorizer.get_feature_names())

        assert len(result.shape) == 2
        assert result.shape[0] == text_column.shape[0]
        assert result.shape[1] == n_features
예제 #5
0
def test_graphpipeline_blockselector():
    """Check ``BlockSelector`` inside a GraphPipeline for several containers.

    First a full classification graph is fitted on a dict of blocks and its
    predictions are checked to be 1-dimensional with one value per sample.
    Then, for a dict, a list and a ``BlockManager`` holding the same two
    blocks, a graph that only selects and concatenates the blocks must return
    a frame containing the text columns unchanged next to the numeric values.
    """
    Xnum, y = make_classification(n_samples=100)
    dfX_text = pd.DataFrame({"text1": get_random_strings(100), "text2": get_random_strings(100)})

    # --- Full graph ending in a classifier, blocks given as a dict ---
    blocks = {"text": dfX_text, "num": Xnum}
    classifier_graph = GraphPipeline(
        models={
            "BS_text": BlockSelector("text"),
            "CV": CountVectorizerWrapper(analyzer="char"),
            "BS_num": BlockSelector("num"),
            "RF": DecisionTreeClassifier(),
        },
        edges=[("BS_text", "CV", "RF"), ("BS_num", "RF")],
    )
    classifier_graph.fit(blocks, y)
    predictions = classifier_graph.predict(blocks)

    assert predictions.ndim == 1
    assert predictions.shape[0] == y.shape[0]

    def check_concatenation(X, text_key, num_key):
        # Select both blocks from X, concatenate them and compare with the inputs.
        concat_graph = GraphPipeline(
            models={
                "BS_text": BlockSelector(text_key),
                "BS_num": BlockSelector(num_key),
                "PT": DebugPassThrough(),
            },
            edges=[("BS_text", "PT"), ("BS_num", "PT")],
        )
        combined = concat_graph.fit_transform(X)

        n_rows, n_cols = combined.shape[0], combined.shape[1]
        assert n_rows == dfX_text.shape[0]
        assert n_cols == dfX_text.shape[1] + Xnum.shape[1]

        text_columns = ["text1", "text2"]
        assert "text1" in combined.columns
        assert "text2" in combined.columns
        assert (combined.loc[:, text_columns] == dfX_text).all().all()

        # Whatever is not a text column must hold the numeric block.
        numeric_columns = diff(list(combined.columns), text_columns)
        assert (combined.loc[:, numeric_columns].values == Xnum).all()

    # --- Blocks given as a dict ---
    check_concatenation({"text": dfX_text, "num": Xnum}, "text", "num")

    # --- Blocks given as a list, selected by position ---
    check_concatenation([dfX_text, Xnum], 0, 1)

    # --- Blocks given as a BlockManager ---
    check_concatenation(BlockManager({"text": dfX_text, "num": Xnum}), "text", "num")
예제 #6
0
파일: test_text.py 프로젝트: gheeraej/aikit
def test_CountVectorizerWrapper():
    """Check the prefix of the feature names for several column selections.

    The text column is selected by name on a DataFrame, by position on a
    DataFrame, and by position on the underlying array. With a DataFrame the
    features must be prefixed with the column name; with an array they must
    be prefixed with the column position.
    """
    sample = get_sample_df(size=100, seed=123)

    # (input data, columns_to_use, expected feature-name prefix)
    scenarios = (
        (sample, ["text_col"], "text_col__BAG"),
        (sample, [2], "text_col__BAG"),
        (sample.values, [2], "2__BAG"),
    )

    for data, columns, prefix in scenarios:
        vectorizer = CountVectorizerWrapper(columns_to_use=columns)
        vectorizer.fit(data)

        feature_names = vectorizer.get_feature_names()
        assert all(name.startswith(prefix) for name in feature_names)
예제 #7
0
파일: test_text.py 프로젝트: gheeraej/aikit
def test_CountVectorizerWrapper_output_type():
    """Check the dtype of the vectorizer output.

    Without a ``dtype`` argument the result must be ``int32``; with
    ``dtype="int64"`` it must be ``int64``. Each setting is checked on a
    one-column and on a two-column text DataFrame.
    """
    one_column = {"a": ["AA", "AAA", "bb"]}
    two_columns = {"a": ["AA", "AAA", "bb"], "b": ["xxx", "zzz", "xxx"]}

    # (constructor keyword arguments, dtype expected on the output)
    settings = (
        ({}, "int32"),
        ({"dtype": "int64"}, "int64"),
    )

    for kwargs, expected_dtype in settings:
        for columns in (one_column, two_columns):
            vectorizer = CountVectorizerWrapper(**kwargs)
            output = vectorizer.fit_transform(pd.DataFrame(columns))
            assert output.dtype == expected_dtype