def test_countvectorizer_max_features_counts():
    """max_features must keep the most frequent terms regardless of the cap.

    The top term in JUNK_FOOD_DOCS is "the" (frequency 7); it must survive
    with max_features of 1, 3, and None alike.
    """
    docs_gpu = Series(JUNK_FOOD_DOCS)

    counts_by_cap = {}
    features_by_cap = {}
    for cap in (1, 3, None):
        vectorizer = CountVectorizer(max_features=cap)
        counts_by_cap[cap] = vectorizer.fit_transform(docs_gpu).sum(axis=0)
        features_by_cap[cap] = vectorizer.get_feature_names()

    for cap in (1, 3, None):
        counts = counts_by_cap[cap]
        # The most common feature is "the", with frequency 7.
        assert counts.max() == 7
        # The most common feature should be the same for every cap.
        top = cp.argmax(counts).astype(cp.int32).item()
        assert features_by_cap[cap][top] == "the"
def test_space_ngrams(ngram_range):
    """Runs of whitespace must be tokenized identically to scikit-learn."""
    docs = ['abc      def. 123 456    789']
    gpu_vec = CountVectorizer(ngram_range=ngram_range).fit(Series(docs))
    cpu_vec = SkCountVect(ngram_range=ngram_range).fit(docs)
    gpu_names = gpu_vec.get_feature_names().to_arrow().to_pylist()
    assert cpu_vec.get_feature_names() == gpu_names
def test_countvectorizer_stop_words_ngrams():
    """Stop words are filtered before bigrams are formed, so only the
    stop-word-free bigram "andy andy" remains in the vocabulary."""
    doc = Series(["and me too andy andy too"])

    vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english')
    vectorizer.fit(doc)
    vocabulary = vectorizer.get_feature_names().to_arrow().to_pylist()

    assert vocabulary == ["andy andy"]
def test_character_ngrams(analyzer, ngram_range):
    """Character analyzers must produce the same feature names as sklearn.

    Fix: the original data literal was ['ab c', '' 'edf gh'] — the adjacent
    '' was an accidental implicit string concatenation (a no-op, reading
    like a missing comma). Removed; the runtime value is unchanged.
    """
    data = ['ab c', 'edf gh']

    res = CountVectorizer(analyzer=analyzer, ngram_range=ngram_range)
    res.fit(Series(data))

    ref = SkCountVect(analyzer=analyzer, ngram_range=ngram_range).fit(data)

    assert (ref.get_feature_names()
            ) == res.get_feature_names().to_arrow().to_pylist()
def test_non_ascii():
    """Non-ASCII (Greek) tokens must be lowercased into the vocabulary and
    counted identically to scikit-learn."""
    docs = ('This is ascii,', 'but not this Αγγλικά.')

    gpu_vec = CountVectorizer()
    gpu_counts = gpu_vec.fit_transform(Series(docs))
    cpu_counts = SkCountVect().fit_transform(docs)

    vocabulary = set(gpu_vec.get_feature_names().to_arrow().to_pylist())
    assert 'αγγλικά' in vocabulary
    cp.testing.assert_array_equal(gpu_counts.todense(), cpu_counts.toarray())
def test_countvectorizer_max_features():
    """With max_df and max_features set, the kept vocabulary is bounded and
    every pruned term is reported in stop_words_."""
    # test bounded number of extracted features
    vectorizer = CountVectorizer(max_df=0.6, max_features=4)
    vectorizer.fit(DOCS_GPU)

    kept = set(vectorizer.get_feature_names().to_arrow().to_pylist())
    pruned = set(vectorizer.stop_words_.to_arrow().to_pylist())

    assert kept == {'burger', 'beer', 'salad', 'pizza'}
    assert pruned == {'celeri', 'tomato', 'copyright', 'coke',
                      'sparkling', 'water', 'the'}
def test_count_binary_occurrences():
    """binary=True clips counts to presence/absence and dtype= controls the
    output matrix dtype; default mode returns raw term counts."""
    docs = Series(['aaabc', 'abbde'])

    # by default multiple occurrences are counted as longs
    vectorizer = CountVectorizer(analyzer='char', max_df=1.0)
    raw = cp.asnumpy(vectorizer.fit_transform(docs).todense())
    assert_array_equal(['a', 'b', 'c', 'd', 'e'],
                       vectorizer.get_feature_names().to_arrow().to_pylist())
    assert_array_equal([[3, 1, 1, 0, 0],
                        [1, 2, 0, 1, 1]], raw)

    # using boolean features, we can fetch the binary occurrence info
    # instead.
    vectorizer = CountVectorizer(analyzer='char', max_df=1.0, binary=True)
    binary = cp.asnumpy(vectorizer.fit_transform(docs).todense())
    assert_array_equal([[1, 1, 1, 0, 0],
                        [1, 1, 0, 1, 1]], binary)

    # check the ability to change the dtype
    vectorizer = CountVectorizer(analyzer='char', max_df=1.0,
                                 binary=True, dtype=cp.float32)
    assert vectorizer.fit_transform(docs).dtype == cp.float32
def test_word_analyzer(ngram_range):
    """The default word analyzer must match scikit-learn's feature names."""
    gpu_vec = CountVectorizer(ngram_range=ngram_range).fit(DOCS_GPU)
    cpu_vec = SkCountVect(ngram_range=ngram_range).fit(DOCS)
    gpu_names = gpu_vec.get_feature_names().to_arrow().to_pylist()
    assert cpu_vec.get_feature_names() == gpu_names