import io

import numpy as np
import pytest
from scipy.sparse import lil_matrix

from fastcountvectorizer import FastCountVectorizer


def test_fastcountvectorizer_save_stop_words():
    cv = FastCountVectorizer(analyzer="char", min_df=2, save_stop_words=True)
    cv.fit(["ab", "ac"])
    assert hasattr(cv, "stop_words_")
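    # With min_df=2, only "a" appears in both documents; "b" and "c" are pruned.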
    assert cv.stop_words_ == {"b", "c"}

    cv = FastCountVectorizer(analyzer="char", min_df=2, save_stop_words=False)
    cv.fit(["ab", "ac"])
    assert not hasattr(cv, "stop_words_")
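
# `check_cv` is a shared helper from the original test suite; its definition is
# not part of this excerpt. A minimal sketch, assuming the fitted vectorizer
# exposes a scikit-learn-style `vocabulary_` mapping:
def check_cv(cv, input, output, vocab):
    result = cv.fit_transform(input)
    # Same counts in the same positions as the expected sparse matrix.
    assert np.array_equal(result.toarray(), output.toarray())
    # Feature terms are assigned columns in sorted order.
    assert sorted(cv.vocabulary_) == vocab
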
def test_fastcountvectorizer_char_ngram1_strip_accents_ascii():
    cv = FastCountVectorizer(strip_accents="ascii",
                             analyzer="char",
                             ngram_range=(1, 1))
    check_cv(cv,
             input=["ábc"],
             output=lil_matrix([[1, 1, 1]]).tocsr(),
             vocab=["a", "b", "c"])


def test_unicode_decode_error_input_file_bytes():
    text = "àbć"

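    # "àbć" encoded as UTF-8 contains bytes that are invalid ASCII, so
    # decoding with encoding="ascii" must fail for both analyzers.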
    cv = FastCountVectorizer(encoding="ascii", input="file", analyzer="word")
    with pytest.raises(UnicodeDecodeError):
        cv.fit([io.BytesIO(text.encode("utf-8"))])

    cv = FastCountVectorizer(encoding="ascii", input="file", analyzer="char")
    with pytest.raises(UnicodeDecodeError):
        cv.fit([io.BytesIO(text.encode("utf-8"))])


def test_fastcountvectorizer_validate_params():
    FastCountVectorizer().fit(["foo"])

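    # _validate_params() checks constructor arguments without any data;
    # fit() applies the same validation, as the ValueError cases below show.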
    FastCountVectorizer(input="content")._validate_params()
    FastCountVectorizer(input="file")._validate_params()
    FastCountVectorizer(input="filename")._validate_params()
    with pytest.raises(ValueError):
        FastCountVectorizer(input="unsupported")._validate_params()

    FastCountVectorizer(analyzer="char").fit(["foo"])
    FastCountVectorizer(analyzer="word").fit(["foo"])
    with pytest.raises(ValueError):
        FastCountVectorizer(analyzer="char_wb").fit(["foo"])
    with pytest.raises(ValueError):
        FastCountVectorizer(input="unsupported").fit(["foo"])


def test_fastcountvectorizer_char_ngram1_unicode():
    cv = FastCountVectorizer(analyzer="char", ngram_range=(1, 1))
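    # The expected vocabulary is sorted by code point: Ƃ (U+0182), Ƈ (U+0187), ǟ (U+01DF).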
    check_cv(cv,
             input=["ǟƂƇ"],
             output=lil_matrix([[1, 1, 1]]).tocsr(),
             vocab=["Ƃ", "Ƈ", "ǟ"])
    check_cv(cv,
             input=["ƇƂǟ"],
             output=lil_matrix([[1, 1, 1]]).tocsr(),
             vocab=["Ƃ", "Ƈ", "ǟ"])


def test_fastcountvectorizer_char_ngram1_3():
    cv = FastCountVectorizer(analyzer="char", ngram_range=(1, 3))
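    # All character 1-, 2-, and 3-grams of "abcef", listed in sorted order.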
    check_cv(
        cv,
        input=["abcef"],
        vocab=[
            "a", "ab", "abc", "b", "bc", "bce", "c", "ce", "cef", "e", "ef",
            "f"
        ],
        output=lil_matrix([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]).tocsr(),
    )


def test_unicode_decode_error_input_content():
    text = "àbć"
    doc = text.encode("utf-8")

    cv = FastCountVectorizer(encoding="ascii",
                             input="content",
                             analyzer="word")
    with pytest.raises(UnicodeDecodeError):
        cv.fit([doc])

    cv = FastCountVectorizer(encoding="ascii",
                             input="content",
                             analyzer="char")
    with pytest.raises(UnicodeDecodeError):
        cv.fit([doc])


def test_fastcountvectorizer_char_ngram1():
    cv = FastCountVectorizer(analyzer="char", ngram_range=(1, 1))
    check_cv(cv,
             input=["abc"],
             output=lil_matrix([[1, 1, 1]]).tocsr(),
             vocab=["a", "b", "c"])
    check_cv(cv,
             input=["cba"],
             output=lil_matrix([[1, 1, 1]]).tocsr(),
             vocab=["a", "b", "c"])
    check_cv(
        cv,
        input=["cba", "ade"],
        output=lil_matrix([[1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]).tocsr(),
        vocab=["a", "b", "c", "d", "e"],
    )


def test_unicode_decode_error_input_filename(tmp_path):
    p = tmp_path / "input_file.txt"
    with p.open("w", encoding="utf-8") as f:
        text = "àbć"
        f.write(text)
    doc = str(p)

    cv = FastCountVectorizer(encoding="ascii",
                             input="filename",
                             analyzer="word")
    with pytest.raises(UnicodeDecodeError):
        cv.fit([doc])

    cv = FastCountVectorizer(encoding="ascii",
                             input="filename",
                             analyzer="char")
    with pytest.raises(UnicodeDecodeError):
        cv.fit([doc])


def test_fastcountvectorizer_word_ngram1():
    cv = FastCountVectorizer(analyzer="word", ngram_range=(1, 1))
    check_cv(
        cv,
        input=["aaa bbb ccc"],
        output=lil_matrix([[1, 1, 1]]).tocsr(),
        vocab=["aaa", "bbb", "ccc"],
    )
    check_cv(
        cv,
        input=["bbb aaa ccc"],
        output=lil_matrix([[1, 1, 1]]).tocsr(),
        vocab=["aaa", "bbb", "ccc"],
    )
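    # Runs of whitespace (double space, tab) behave as ordinary token
    # separators for the word analyzer.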
    check_cv(
        cv,
        input=["ccc bbb aaa", "aaa  ddd\teee"],
        output=lil_matrix([[1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]).tocsr(),
        vocab=["aaa", "bbb", "ccc", "ddd", "eee"],
    )


def test_fastcountvectorizer_word_ngram1_3():
    cv = FastCountVectorizer(analyzer="word", ngram_range=(1, 3))
    check_cv(
        cv,
        input=["aaa bbb ccc eee fff"],
        vocab=[
            "aaa",
            "aaa bbb",
            "aaa bbb ccc",
            "bbb",
            "bbb ccc",
            "bbb ccc eee",
            "ccc",
            "ccc eee",
            "ccc eee fff",
            "eee",
            "eee fff",
            "fff",
        ],
        output=lil_matrix([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]).tocsr(),
    )


def test_fastcountvectorizer_word_return_dtype():
    texts = ["abc"]
    cv = FastCountVectorizer()
    result = cv.fit_transform(texts)
    assert result.dtype == np.int64

    cv = FastCountVectorizer(dtype=np.int64)
    result = cv.fit_transform(texts)
    assert result.dtype == np.int64

    cv = FastCountVectorizer(dtype=np.int32)
    result = cv.fit_transform(texts)
    assert result.dtype == np.int32

    cv = FastCountVectorizer(dtype=np.float64)
    result = cv.fit_transform(texts)
    assert result.dtype == np.float64


def run_fastcountvectorizer_fit(ngram_range):
    # Fit-only driver from a separate example; `docs` is a module-level corpus
    # not included in this excerpt (a stand-in is provided below).
    cv = FastCountVectorizer(ngram_range=ngram_range)
    cv.fit(docs)
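
# Hypothetical stand-in corpus and invocation so the fragment above is
# runnable on its own; neither line appears in the original excerpt.
docs = ["aaa bbb ccc", "bbb ccc ddd", "ccc ddd eee"]
run_fastcountvectorizer_fit(ngram_range=(1, 2))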