Esempio n. 1
0
def test_word_analyzer_unigrams_and_bigrams():
    wa = CountVectorizer(analyzer="word",
                         strip_accents="unicode",
                         ngram_range=(1, 2)).build_analyzer()

    text = "J'ai mangé du kangourou  ce midi, c'était pas très bon."
    expected = [
        "ai",
        "mange",
        "du",
        "kangourou",
        "ce",
        "midi",
        "etait",
        "pas",
        "tres",
        "bon",
        "ai mange",
        "mange du",
        "du kangourou",
        "kangourou ce",
        "ce midi",
        "midi etait",
        "etait pas",
        "pas tres",
        "tres bon",
    ]
    assert wa(text) == expected
Esempio n. 2
0
def test_unicode_decode_error():
    # decode_error default to strict, so this should fail
    # First, encode (as bytes) a unicode string.
    text = "J'ai mangé du kangourou  ce midi, c'était pas très bon."
    text_bytes = text.encode("utf-8")

    # Then let the Analyzer try to decode it as ascii. It should fail,
    # because we have given it an incorrect encoding.
    wa = CountVectorizer(ngram_range=(1, 2), encoding="ascii").build_analyzer()
    with pytest.raises(UnicodeDecodeError):
        wa(text_bytes)

    ca = CountVectorizer(analyzer="char", ngram_range=(3, 6),
                         encoding="ascii").build_analyzer()
    with pytest.raises(UnicodeDecodeError):
        ca(text_bytes)
Esempio n. 3
0
def test_vectorizer_unicode():
    # tests that the count vectorizer works with cyrillic.
    document = ("Машинное обучение — обширный подраздел искусственного "
                "интеллекта, изучающий методы построения алгоритмов, "
                "способных обучаться.")

    vect = CountVectorizer()
    X_counted = vect.fit_transform([document])
    assert X_counted.shape == (1, 12)

    # No collisions on such a small dataset
    assert X_counted.nnz == X_hashed.nnz

    # When norm is None and not alternate_sign, the tokens are counted up to
    # collisions
    assert_array_equal(np.sort(X_counted.data), np.sort(X_hashed.data))
Esempio n. 4
0
def test_count_vectorizer_pipeline_grid_selection():
    # raw documents
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS

    # label junk food as -1, the others as +1
    target = [-1] * len(JUNK_FOOD_DOCS) + [1] * len(NOTJUNK_FOOD_DOCS)

    # split the dataset for model development and final evaluation
    train_data, test_data, target_train, target_test = train_test_split(
        data, target, test_size=0.2, random_state=0)

    pipeline = Pipeline([("vect", CountVectorizer()), ("svc", LinearSVC())])

    parameters = {
        "vect__ngram_range": [(1, 1), (1, 2)],
        "svc__loss": ("hinge", "squared_hinge"),
    }

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1, cv=3)

    # Check that the best model found by grid search is 100% correct on the
    # held out evaluation set.
    pred = grid_search.fit(train_data, target_train).predict(test_data)
    assert_array_equal(pred, target_test)

    # on this toy dataset bigram representation which is used in the last of
    # the grid_search is considered the best estimator since they all converge
    # to 100% accuracy models
    assert grid_search.best_score_ == 1.0
    best_vectorizer = grid_search.best_estimator_.named_steps["vect"]
    assert best_vectorizer.ngram_range == (1, 1)
Esempio n. 5
0
def test_countvectorizer_custom_vocabulary_pipeline():
    what_we_like = ["pizza", "beer"]
    pipe = Pipeline([
        ("count", CountVectorizer(vocabulary=what_we_like)),
        ("tfidf", TfidfTransformer()),
    ])
    X = pipe.fit_transform(ALL_FOOD_DOCS)
    assert set(pipe.named_steps["count"].vocabulary_) == set(what_we_like)
    assert X.shape[1] == len(what_we_like)
Esempio n. 6
0
def test_char_wb_ngram_analyzer():
    cnga = CountVectorizer(analyzer="char_wb",
                           strip_accents="unicode",
                           ngram_range=(3, 6)).build_analyzer()

    text = "This \n\tis a test, really.\n\n I met Harry yesterday"
    expected = [" th", "thi", "his", "is ", " thi"]
    assert cnga(text)[:5] == expected

    expected = ["yester", "esterd", "sterda", "terday", "erday "]
    assert cnga(text)[-5:] == expected

    cnga = CountVectorizer(input="file",
                           analyzer="char_wb",
                           ngram_range=(3, 6)).build_analyzer()
    text = StringIO("A test with a file-like object!")
    expected = [" a ", " te", "tes", "est", "st ", " tes"]
    assert cnga(text)[:6] == expected
Esempio n. 7
0
def test_pickling_vectorizer():
    instances = [
        CountVectorizer(),
        CountVectorizer(preprocessor=strip_tags),
        CountVectorizer(analyzer=lazy_analyze),
        CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
        CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS),
    ]

    for orig in instances:
        s = pickle.dumps(orig)
        copy = pickle.loads(s)
        assert type(copy) == orig.__class__
        assert copy.get_params() == orig.get_params()
        assert_array_equal(
            copy.fit_transform(JUNK_FOOD_DOCS).toarray(),
            orig.fit_transform(JUNK_FOOD_DOCS).toarray(),
        )
Esempio n. 8
0
def test_stop_words_removal():
    # Ensure that deleting the stop_words_ attribute doesn't affect transform

    fitted_vectorizers = (
        CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
        CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS),
    )

    for vect in fitted_vectorizers:
        vect_transform = vect.transform(JUNK_FOOD_DOCS).toarray()

        vect.stop_words_ = None
        stop_None_transform = vect.transform(JUNK_FOOD_DOCS).toarray()

        delattr(vect, "stop_words_")
        stop_del_transform = vect.transform(JUNK_FOOD_DOCS).toarray()

        assert_array_equal(stop_None_transform, vect_transform)
        assert_array_equal(stop_del_transform, vect_transform)
Esempio n. 9
0
def test_pickling_built_processors(factory):
    """Tokenizers cannot be pickled
    https://github.com/scikit-learn/scikit-learn/issues/12833
    """
    vec = CountVectorizer()
    function = factory(vec)
    text = "J'ai mangé du kangourou  ce midi, " "c'était pas très bon."
    roundtripped_function = pickle.loads(pickle.dumps(function))
    expected = function(text)
    result = roundtripped_function(text)
    assert result == expected
Esempio n. 10
0
def test_vectorizer():
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, "tocsr"):
        counts_train = counts_train.tocsr()
    assert counts_train[0, v1.vocabulary_["pizza"]] == 2

    # build a vectorizer v1 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary_)

    # compare that the two vectorizer give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, "tocsr"):
            counts_test = counts_test.tocsr()

        vocabulary = v.vocabulary_
        assert counts_test[0, vocabulary["salad"]] == 1
        assert counts_test[0, vocabulary["tomato"]] == 1
        assert counts_test[0, vocabulary["water"]] == 1

        # stop word from the fixed list
        assert "the" not in vocabulary

        # stop word found automatically by the vectorizer DF thresholding
        # words that are high frequent across the complete corpus are likely
        # to be not informative (either real stop words of extraction
        # artifacts)
        assert "copyright" not in vocabulary

        # not present in the sample
        assert counts_test[0, vocabulary["coke"]] == 0
        assert counts_test[0, vocabulary["burger"]] == 0
        assert counts_test[0, vocabulary["beer"]] == 0
        assert counts_test[0, vocabulary["pizza"]] == 0
Esempio n. 11
0
def test_vectorizer_min_df():
    test_data = ["abc", "dea", "eat"]
    vect = CountVectorizer(analyzer="char", min_df=1)
    vect.fit(test_data)
    assert "a" in vect.vocabulary_.keys()
    assert len(vect.vocabulary_.keys()) == 6
    assert len(vect.stop_words_) == 0

    vect.min_df = 2
    vect.fit(test_data)
    assert "c" not in vect.vocabulary_.keys()  # {bcdt} ignored
    assert len(vect.vocabulary_.keys()) == 2  # {ae} remain
    assert "c" in vect.stop_words_
    assert len(vect.stop_words_) == 4

    vect.min_df = 0.8  # 0.8 * 3 documents -> min_doc_count == 2.4
    vect.fit(test_data)
    assert "c" not in vect.vocabulary_.keys()  # {bcdet} ignored
    assert len(vect.vocabulary_.keys()) == 1  # {a} remains
    assert "c" in vect.stop_words_
    assert len(vect.stop_words_) == 5
Esempio n. 12
0
def test_count_binary_occurrences():
    # by default multiple occurrences are counted as longs
    test_data = ["aaabc", "abbde"]
    vect = CountVectorizer(analyzer="char", max_df=1.0)
    X = vect.fit_transform(test_data).toarray()
    assert_array_equal(["a", "b", "c", "d", "e"], vect.get_feature_names())
    assert_array_equal([[3, 1, 1, 0, 0], [1, 2, 0, 1, 1]], X)

    # using boolean features, we can fetch the binary occurrence info
    # instead.
    vect = CountVectorizer(analyzer="char", max_df=1.0, binary=True)
    X = vect.fit_transform(test_data).toarray()
    assert_array_equal([[1, 1, 1, 0, 0], [1, 1, 0, 1, 1]], X)

    # check the ability to change the dtype
    vect = CountVectorizer(analyzer="char",
                           max_df=1.0,
                           binary=True,
                           dtype=np.float32)
    X_sparse = vect.fit_transform(test_data)
    assert X_sparse.dtype == np.float32
Esempio n. 13
0
def test_word_ngram_analyzer():
    cnga = CountVectorizer(analyzer="word",
                           strip_accents="unicode",
                           ngram_range=(3, 6)).build_analyzer()

    text = "This \n\tis a test, really.\n\n I met Harry yesterday"
    expected = ["this is test", "is test really", "test really met"]
    assert cnga(text)[:3] == expected

    expected = [
        "test really met harry yesterday",
        "this is test really met harry",
        "is test really met harry yesterday",
    ]
    assert cnga(text)[-3:] == expected

    cnga_file = CountVectorizer(input="file",
                                analyzer="word",
                                ngram_range=(3, 6)).build_analyzer()
    file = StringIO(text)
    assert cnga_file(file) == cnga(text)
Esempio n. 14
0
def test_vectorizer_max_df():
    test_data = ["abc", "dea", "eat"]
    vect = CountVectorizer(analyzer="char", max_df=1.0)
    vect.fit(test_data)
    assert "a" in vect.vocabulary_.keys()
    assert len(vect.vocabulary_.keys()) == 6
    assert len(vect.stop_words_) == 0

    vect.max_df = 0.5  # 0.5 * 3 documents -> max_doc_count == 1.5
    vect.fit(test_data)
    assert "a" not in vect.vocabulary_.keys()  # {ae} ignored
    assert len(vect.vocabulary_.keys()) == 4  # {bcdt} remain
    assert "a" in vect.stop_words_
    assert len(vect.stop_words_) == 2

    vect.max_df = 1
    vect.fit(test_data)
    assert "a" not in vect.vocabulary_.keys()  # {ae} ignored
    assert len(vect.vocabulary_.keys()) == 4  # {bcdt} remain
    assert "a" in vect.stop_words_
    assert len(vect.stop_words_) == 2
Esempio n. 15
0
def test_countvectorizer_vocab_sets_when_pickling():
    # ensure that vocabulary of type set is coerced to a list to
    # preserve iteration ordering after deserialization
    rng = np.random.RandomState(0)
    vocab_words = np.array([
        "beer",
        "burger",
        "celeri",
        "coke",
        "pizza",
        "salad",
        "sparkling",
        "tomato",
        "water",
    ])
    for x in range(0, 100):
        vocab_set = set(rng.choice(vocab_words, size=5, replace=False))
        cv = CountVectorizer(vocabulary=vocab_set)
        unpickled_cv = pickle.loads(pickle.dumps(cv))
        cv.fit(ALL_FOOD_DOCS)
        unpickled_cv.fit(ALL_FOOD_DOCS)
        assert cv.get_feature_names() == unpickled_cv.get_feature_names()
Esempio n. 16
0
def test_countvectorizer_empty_vocabulary():
    try:
        vect = CountVectorizer(vocabulary=[])
        vect.fit(["foo"])
        assert False, "we shouldn't get here"
    except ValueError as e:
        assert "empty vocabulary" in str(e).lower()

    try:
        v = CountVectorizer(max_df=1.0, stop_words="english")
        # fit on stopwords only
        v.fit(["to be or not to be", "and me too", "and so do you"])
        assert False, "we shouldn't get here"
    except ValueError as e:
        assert "empty vocabulary" in str(e).lower()
Esempio n. 17
0
def test_char_ngram_analyzer():
    cnga = CountVectorizer(analyzer="char",
                           strip_accents="unicode",
                           ngram_range=(3, 6)).build_analyzer()

    text = "J'ai mangé du kangourou  ce midi, c'était pas très bon"
    expected = ["j'a", "'ai", "ai ", "i m", " ma"]
    assert cnga(text)[:5] == expected
    expected = ["s tres", " tres ", "tres b", "res bo", "es bon"]
    assert cnga(text)[-5:] == expected

    text = "This \n\tis a test, really.\n\n I met Harry yesterday"
    expected = ["thi", "his", "is ", "s i", " is"]
    assert cnga(text)[:5] == expected

    expected = [" yeste", "yester", "esterd", "sterda", "terday"]
    assert cnga(text)[-5:] == expected

    cnga = CountVectorizer(input="file", analyzer="char",
                           ngram_range=(3, 6)).build_analyzer()
    text = StringIO("This is a test with a file-like object!")
    expected = ["thi", "his", "is ", "s i", " is"]
    assert cnga(text)[:5] == expected
Esempio n. 18
0
def test_countvectorizer_vocab_dicts_when_pickling():
    rng = np.random.RandomState(0)
    vocab_words = np.array([
        "beer",
        "burger",
        "celeri",
        "coke",
        "pizza",
        "salad",
        "sparkling",
        "tomato",
        "water",
    ])
    for x in range(0, 100):
        vocab_dict = dict()
        words = rng.choice(vocab_words, size=5, replace=False)
        for y in range(0, 5):
            vocab_dict[words[y]] = y
        cv = CountVectorizer(vocabulary=vocab_dict)
        unpickled_cv = pickle.loads(pickle.dumps(cv))
        cv.fit(ALL_FOOD_DOCS)
        unpickled_cv.fit(ALL_FOOD_DOCS)
        assert cv.get_feature_names() == unpickled_cv.get_feature_names()
Esempio n. 19
0
def test_countvectorizer_custom_vocabulary():
    vocab = {"pizza": 0, "beer": 1}
    terms = set(vocab.keys())

    # Try a few of the supported types.
    for typ in [dict, list, iter, partial(defaultdict, int)]:
        v = typ(vocab)
        vect = CountVectorizer(vocabulary=v)
        vect.fit(JUNK_FOOD_DOCS)
        if isinstance(v, Mapping):
            assert vect.vocabulary_ == vocab
        else:
            assert set(vect.vocabulary_) == terms
        X = vect.transform(JUNK_FOOD_DOCS)
        assert X.shape[1] == len(terms)
        v = typ(vocab)
        vect = CountVectorizer(vocabulary=v)
        inv = vect.inverse_transform(X)
        assert len(inv) == X.shape[0]
Esempio n. 20
0
def test_vectorizer_stop_words_inconsistent():
    lstr = "['and', 'll', 've']"
    message = ("Your stop_words may be inconsistent with your "
               "preprocessing. Tokenizing the stop words generated "
               "tokens %s not in stop_words." % lstr)
    for vec in [CountVectorizer()]:
        vec.set_params(stop_words=["you've", "you", "you'll", "AND"])
        assert_warns_message(UserWarning, message, vec.fit_transform,
                             ["hello world"])
        # reset stop word validation
        del vec._stop_words_id
        assert _check_stop_words_consistency(vec) is False

    # Only one warning per stop list
    assert_no_warnings(vec.fit_transform, ["hello world"])
    assert _check_stop_words_consistency(vec) is None

    # Test caching of inconsistency assessment
    vec.set_params(stop_words=["you've", "you", "you'll", "blah", "AND"])
    assert_warns_message(UserWarning, message, vec.fit_transform,
                         ["hello world"])
Esempio n. 21
0
def test_countvectorizer_sort_features_64bit_sparse_indices():
    """
    Check that CountVectorizer._sort_features preserves the dtype of its sparse
    feature matrix.

    This test is skipped on 32bit platforms, see:
        https://github.com/scikit-learn/scikit-learn/pull/11295
    for more details.
    """

    X = sparse.csr_matrix((5, 5), dtype=np.int64)

    # force indices and indptr to int64.
    INDICES_DTYPE = np.int64
    X.indices = X.indices.astype(INDICES_DTYPE)
    X.indptr = X.indptr.astype(INDICES_DTYPE)

    vocabulary = {"scikit-learn": 0, "is": 1, "great!": 2}

    Xs = CountVectorizer()._sort_features(X, vocabulary)

    assert INDICES_DTYPE == Xs.indices.dtype
Esempio n. 22
0
def test_non_unique_vocab():
    vocab = ["a", "b", "c", "a", "a"]
    vect = CountVectorizer(vocabulary=vocab)
    with pytest.raises(ValueError):
        vect.fit([])
Esempio n. 23
0
def test_countvectorizer_custom_vocabulary_repeated_indices():
    vocab = {"pizza": 0, "beer": 0}
    try:
        CountVectorizer(vocabulary=vocab)
    except ValueError as e:
        assert "vocabulary contains repeated indices" in str(e).lower()
Esempio n. 24
0
def test_countvectorizer_custom_vocabulary_gap_index():
    vocab = {"pizza": 1, "beer": 2}
    try:
        CountVectorizer(vocabulary=vocab)
    except ValueError as e:
        assert "doesn't contain index" in str(e).lower()
Esempio n. 25
0
def test_countvectorizer_stop_words():
    cv = CountVectorizer()
    cv.set_params(stop_words="english")
    assert cv.get_stop_words() == ENGLISH_STOP_WORDS
    cv.set_params(stop_words="_bad_str_stop_")
    with pytest.raises(ValueError):
        cv.get_stop_words()
    cv.set_params(stop_words="_bad_unicode_stop_")
    with pytest.raises(ValueError):
        cv.get_stop_words()
    stoplist = ["some", "other", "words"]
    cv.set_params(stop_words=stoplist)
    assert cv.get_stop_words() == set(stoplist)
Esempio n. 26
0
def test_fit_countvectorizer_twice():
    cv = CountVectorizer()
    X1 = cv.fit_transform(ALL_FOOD_DOCS[:5])
    X2 = cv.fit_transform(ALL_FOOD_DOCS[5:])
    assert X1.shape[1] != X2.shape[1]
Esempio n. 27
0
def test_count_vectorizer_max_features():
    # Regression test: max_features didn't work correctly in 0.14.

    cv_1 = CountVectorizer(max_features=1)
    cv_3 = CountVectorizer(max_features=3)
    cv_None = CountVectorizer(max_features=None)

    counts_1 = cv_1.fit_transform(JUNK_FOOD_DOCS).sum(axis=0)
    counts_3 = cv_3.fit_transform(JUNK_FOOD_DOCS).sum(axis=0)
    counts_None = cv_None.fit_transform(JUNK_FOOD_DOCS).sum(axis=0)

    features_1 = cv_1.get_feature_names()
    features_3 = cv_3.get_feature_names()
    features_None = cv_None.get_feature_names()

    # The most common feature is "the", with frequency 7.
    assert 7 == counts_1.max()
    assert 7 == counts_3.max()
    assert 7 == counts_None.max()

    # The most common feature should be the same
    assert "the" == features_1[np.argmax(counts_1)]
    assert "the" == features_3[np.argmax(counts_3)]
    assert "the" == features_None[np.argmax(counts_None)]
Esempio n. 28
0
def test_feature_names():
    cv = CountVectorizer(max_df=0.5)

    # test for Value error on unfitted/empty vocabulary
    with pytest.raises(ValueError):
        cv.get_feature_names()
    assert not cv.fixed_vocabulary_

    # test for vocabulary learned from data
    X = cv.fit_transform(ALL_FOOD_DOCS)
    n_samples, n_features = X.shape
    assert len(cv.vocabulary_) == n_features

    feature_names = cv.get_feature_names()
    assert len(feature_names) == n_features
    assert_array_equal(
        [
            "beer",
            "burger",
            "celeri",
            "coke",
            "pizza",
            "salad",
            "sparkling",
            "tomato",
            "water",
        ],
        feature_names,
    )

    for idx, name in enumerate(feature_names):
        assert idx == cv.vocabulary_.get(name)

    # test for custom vocabulary
    vocab = [
        "beer",
        "burger",
        "celeri",
        "coke",
        "pizza",
        "salad",
        "sparkling",
        "tomato",
        "water",
    ]

    cv = CountVectorizer(vocabulary=vocab)
    feature_names = cv.get_feature_names()
    assert_array_equal(
        [
            "beer",
            "burger",
            "celeri",
            "coke",
            "pizza",
            "salad",
            "sparkling",
            "tomato",
            "water",
        ],
        feature_names,
    )
    assert cv.fixed_vocabulary_

    for idx, name in enumerate(feature_names):
        assert idx == cv.vocabulary_.get(name)
Esempio n. 29
0
        vect.fit([])


@pytest.mark.parametrize("Vectorizer", (CountVectorizer, ))
def test_vectorizer_string_object_as_input(Vectorizer):
    message = "Iterable over raw text documents expected, " "string object received."
    vec = Vectorizer()
    assert_raise_message(ValueError, message, vec.fit_transform,
                         "hello world!")
    assert_raise_message(ValueError, message, vec.fit, "hello world!")
    vec.fit(["some text", "some other text"])
    assert_raise_message(ValueError, message, vec.transform, "hello world!")


@pytest.mark.parametrize("vec", [
    CountVectorizer(ngram_range=(2, 1)),
])
def test_vectorizers_invalid_ngram_range(vec):
    # vectorizers could be initialized with invalid ngram range
    # test for raising error message
    invalid_range = vec.ngram_range
    message = ("Invalid value for ngram_range=%s "
               "lower boundary larger than the upper boundary." %
               str(invalid_range))

    assert_raise_message(ValueError, message, vec.fit, ["good news everyone"])
    assert_raise_message(ValueError, message, vec.fit_transform,
                         ["good news everyone"])


def _check_stop_words_consistency(estimator):