def test_CountVectorizerWrapper_output_type():
    """Check the dtype of the matrix returned by ``fit_transform``.

    Four scenarios are exercised, in order: a wrapper built without a
    ``dtype`` argument (expected result dtype ``"int32"``) and a wrapper
    built with ``dtype="int64"`` (expected result dtype ``"int64"``), each
    fitted on a one-column and on a two-column DataFrame of strings.
    """
    one_column = {"a": ["AA", "AAA", "bb"]}
    two_columns = {"a": ["AA", "AAA", "bb"], "b": ["xxx", "zzz", "xxx"]}

    # (constructor kwargs, input columns, dtype the result must have)
    scenarios = [
        ({}, one_column, "int32"),
        ({}, two_columns, "int32"),
        ({"dtype": "int64"}, one_column, "int64"),
        ({"dtype": "int64"}, two_columns, "int64"),
    ]

    for init_kwargs, columns, expected_dtype in scenarios:
        # A fresh wrapper per scenario so no fitted state is shared.
        vectorizer = CountVectorizerWrapper(**init_kwargs)
        result = vectorizer.fit_transform(pd.DataFrame(columns))
        assert result.dtype == expected_dtype
def test_CountVectorizerWrapper_few_sample():
    """Check that fitting on a handful of rows keeps one output row per input row.

    Uses the ``name`` and ``ticket`` columns of the first element returned
    by ``load_dataset("titanic")`` and a wrapper built with ``min_df=1``.
    """
    titanic_train = load_dataset("titanic")[0]
    text_columns = ["name", "ticket"]

    # ``.loc`` slices by label and includes both endpoints, so 0:10 selects
    # 11 rows here (presumably the frame has a default 0..n integer index).
    small_sample = titanic_train.loc[0:10, text_columns]

    vectorizer = CountVectorizerWrapper(min_df=1)
    transformed = vectorizer.fit_transform(small_sample)

    n_rows = transformed.shape[0]
    assert n_rows == 11
def test_CountVectorizerWrapper_on_Serie():
    """Check the wrapper when the input is a single column rather than a frame.

    The ``text_col`` column of the sample DataFrame is passed first to
    ``fit_transform`` and then to ``transform``; both results must be
    two-dimensional, with one row per input entry and one column per name
    reported by ``get_feature_names()``.
    """
    sample = get_sample_df(size=100, seed=123)
    text_serie = sample["text_col"]

    vectorizer = CountVectorizerWrapper()

    # Order matters: ``fit_transform`` must run before ``transform``.
    for method_name in ("fit_transform", "transform"):
        output = getattr(vectorizer, method_name)(text_serie)

        assert len(output.shape) == 2
        n_rows, n_cols = output.shape
        assert n_rows == text_serie.shape[0]
        assert n_cols == len(vectorizer.get_feature_names())