Ejemplo n.º 1
0
def test_countvectorizer_custom_vocabulary():
    # A vectorizer built with an explicit vocabulary should expose exactly
    # those terms after fitting, and produce one output column per term.
    favourites = ["pizza", "beer"]
    vectorizer = CountVectorizer(vocabulary=favourites)
    vectorizer.fit(JUNK_FOOD_DOCS)

    # The fitted vocabulary holds the supplied terms and nothing else.
    expected_terms = set(favourites)
    assert_equal(set(vectorizer.vocabulary), expected_terms)

    # The transformed matrix has as many columns as supplied terms.
    counts = vectorizer.transform(JUNK_FOOD_DOCS)
    n_columns = counts.shape[1]
    assert_equal(n_columns, len(favourites))
Ejemplo n.º 2
0
def test_vectorizer_max_df():
    # Fit a single-character vectorizer under two max_df settings and check
    # which characters end up in the vocabulary each time.
    docs = [u'abc', u'dea']  # the letter a occurs in all strings
    char_analyzer = CharNGramAnalyzer(min_n=1, max_n=1)
    vectorizer = CountVectorizer(char_analyzer, max_df=1.0)

    # With max_df=1.0, 'a' is kept and five distinct characters are found.
    vectorizer.fit(docs)
    terms = vectorizer.vocabulary.keys()
    assert u'a' in terms
    assert_equals(len(terms), 5)

    # After lowering max_df to 0.5 and refitting, 'a' is ignored while the
    # four other characters remain.
    vectorizer.max_df = 0.5
    vectorizer.fit(docs)
    terms = vectorizer.vocabulary.keys()  # re-read: fit was called again
    assert u'a' not in terms
    assert_equals(len(terms), 4)
Ejemplo n.º 3
0
def test_vectorizer_max_df():
    # Exercise the max_df setting of CountVectorizer on single characters:
    # first with 1.0, then with 0.5 after refitting on the same data.
    samples = [u'abc', u'dea']  # the letter a occurs in all strings
    unigram_chars = CharNGramAnalyzer(min_n=1, max_n=1)
    vect = CountVectorizer(unigram_chars, max_df=1.0)

    # First fit (max_df=1.0): 'a' is present, vocabulary has 5 entries.
    vect.fit(samples)
    vocab_keys = vect.vocabulary.keys()
    assert u'a' in vocab_keys
    assert_equals(len(vocab_keys), 5)

    # Second fit (max_df=0.5): 'a' is ignored, the other 4 entries remain.
    vect.max_df = 0.5
    vect.fit(samples)
    vocab_keys = vect.vocabulary.keys()  # fetch again after the refit
    assert u'a' not in vocab_keys
    assert_equals(len(vocab_keys), 4)