def test_vectorizer_empty_token_case():
    """A trailing delimiter produces an empty token.

    Empty tokens are ignored here, whereas sklearn treats them as a
    character. This deliberate divergence may be worth revisiting, but it
    should not be a concern for most pipelines.
    """
    corpus = ["a b "]
    docs_gpu = Series(corpus)

    # The trailing space yields an extra empty token; unlike sklearn we do
    # not treat it as a real token, so we force sklearn to split on " " to
    # make the comparison meaningful.
    gpu_counts = CountVectorizer(preprocessor=lambda s: s).fit_transform(docs_gpu)
    cpu_counts = SkCountVect(
        preprocessor=lambda s: s,
        tokenizer=lambda s: s.split(" "),
    ).fit_transform(corpus)
    cp.testing.assert_array_equal(gpu_counts.todense(), cpu_counts.toarray())

    gpu_hashed = HashingVectorizer(preprocessor=lambda s: s).fit_transform(docs_gpu)
    cpu_hashed = SkHashVect(
        preprocessor=lambda s: s,
        tokenizer=lambda s: s.split(" "),
    ).fit_transform(corpus)
    assert_almost_equal_hash_matrices(gpu_hashed.todense().get(), cpu_hashed.toarray())
def test_countvectorizer_custom_vocabulary():
    """A user-supplied vocabulary restricts features exactly as in sklearn."""
    vocabulary = {"pizza": 0, "beer": 1}
    gpu_vocabulary = Series(vocabulary.keys())
    expected = SkCountVect(vocabulary=vocabulary).fit_transform(DOCS)
    actual = CountVectorizer(vocabulary=gpu_vocabulary).fit_transform(DOCS_GPU)
    cp.testing.assert_array_equal(actual.todense(), expected.toarray())
def test_only_delimiters():
    """A document made up of only delimiters is handled like sklearn's."""
    docs = ['abc def. 123', ' ', '456 789']
    actual = CountVectorizer().fit_transform(Series(docs))
    expected = SkCountVect().fit_transform(docs)
    cp.testing.assert_array_equal(actual.todense(), expected.toarray())
def test_empty_doc_after_limit_features():
    """min_df pruning that empties a document must still match sklearn."""
    docs = ['abc abc def', 'def abc', 'ghi']
    actual = CountVectorizer(min_df=2).fit_transform(Series(docs))
    expected = SkCountVect(min_df=2).fit_transform(docs)
    cp.testing.assert_array_equal(actual.todense(), expected.toarray())
def test_count_vectorizer():
    """End-to-end comparison against sklearn on a small default corpus."""
    docs = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ]
    actual = CountVectorizer().fit_transform(Series(docs))
    expected = SkCountVect().fit_transform(docs)
    cp.testing.assert_array_equal(actual.todense(), expected.toarray())
def test_countvectorizer_stop_words():
    """The built-in English stop-word list must behave like sklearn's."""
    expected = SkCountVect(stop_words='english').fit_transform(DOCS)
    actual = CountVectorizer(stop_words='english').fit_transform(DOCS_GPU)
    cp.testing.assert_array_equal(actual.todense(), expected.toarray())
def test_countvectorizer_separate_fit_transform():
    """Calling fit() then transform() separately must match sklearn."""
    actual = CountVectorizer().fit(DOCS_GPU).transform(DOCS_GPU)
    expected = SkCountVect().fit(DOCS).transform(DOCS)
    cp.testing.assert_array_equal(actual.todense(), expected.toarray())