from sklearn.feature_extraction.text import CountVectorizer

def getBagOfWords(documents, stopWords, minThreshold, maxThreshold):
    # Build a bag-of-words model: drop the given stop words and any term whose
    # document frequency falls outside [minThreshold, maxThreshold].
    vectorizer = CountVectorizer(stop_words=stopWords,
                                 min_df=minThreshold,
                                 max_df=maxThreshold)
    X = vectorizer.fit_transform(documents)
    return vectorizer, X.toarray()
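A minimal usage sketch for the helper above (the document list and threshold values are illustrative, not from the original source; get_feature_names_out assumes scikit-learn >= 1.0):

sampleDocs = ["the cat sat", "the dog sat", "a cat chased a dog"]
vectorizer, counts = getBagOfWords(sampleDocs, stopWords='english',
                                   minThreshold=1, maxThreshold=1.0)
print(vectorizer.get_feature_names_out())  # surviving vocabulary terms
print(counts)                              # dense document-term count matrix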
def test_vectorizer_max_df():
    test_data = [u'abc', u'dea']  # the letter a occurs in both strings
    vect = CountVectorizer(analyzer='char', max_df=1.0, min_df=1)
    vect.fit(test_data)
    assert_true(u'a' in vect.vocabulary_.keys())
    assert_equal(len(vect.vocabulary_.keys()), 5)

    vect.max_df = 0.5
    vect.fit(test_data)
    assert_true(u'a' not in vect.vocabulary_.keys())  # 'a' is ignored
    assert_equal(len(vect.vocabulary_.keys()), 4)  # the others remain

    # max_df as an absolute count: ignore terms that appear in more than one document
    vect.max_df = 1
    vect.fit(test_data)
    assert_true(u'a' not in vect.vocabulary_.keys())  # 'a' is ignored
    assert_equal(len(vect.vocabulary_.keys()), 4)  # the others remain
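The test above exercises the two interpretations of max_df: a float is treated as a fraction of the corpus, an int as an absolute document count. The same pruning can be reproduced directly (a minimal sketch using the same two strings, assuming a current scikit-learn where the fitted vocabulary is exposed as vocabulary_):

from sklearn.feature_extraction.text import CountVectorizer

docs = [u'abc', u'dea']                      # 'a' occurs in both documents
vect = CountVectorizer(analyzer='char', max_df=0.5, min_df=1).fit(docs)
print(sorted(vect.vocabulary_))              # ['b', 'c', 'd', 'e'] -- 'a' pruned
vect.set_params(max_df=1).fit(docs)          # max_df as an absolute document count
print(sorted(vect.vocabulary_))              # ['b', 'c', 'd', 'e'] -- same result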
Example #3
def test_vectorizer_max_df():
    test_data = [u'abc', u'dea']  # the letter a occurs in both strings
    vect = CountVectorizer(analyzer='char', max_df=1.0, min_df=1)
    vect.fit(test_data)
    assert_true(u'a' in vect.vocabulary_.keys())
    assert_equals(len(vect.vocabulary_.keys()), 5)

    vect.max_df = 0.5
    vect.fit(test_data)
    assert_true(u'a' not in vect.vocabulary_.keys())  # 'a' is ignored
    assert_equals(len(vect.vocabulary_.keys()), 4)  # the others remain

    # max_df as an absolute count: ignore terms that appear in more than one document
    vect.max_df = 1
    vect.fit(test_data)
    assert_true(u'a' not in vect.vocabulary_.keys())  # 'a' is ignored
    assert_equals(len(vect.vocabulary_.keys()), 4)  # the others remain
Example #4
def test_vectorizer_max_df():
    test_data = [u'abc', u'dea']  # the letter a occurs in both strings
    vect = CountVectorizer(CharNGramAnalyzer(min_n=1, max_n=1), max_df=1.0)
    vect.fit(test_data)
    assert_true(u'a' in vect.vocabulary.keys())
    assert_equals(len(vect.vocabulary.keys()), 5)
    vect.max_df = 0.5
    vect.fit(test_data)
    assert_true(u'a' not in vect.vocabulary.keys())  # 'a' is ignored
    assert_equals(len(vect.vocabulary.keys()), 4)  # the others remain
Example #5
def test_vectorizer_max_df():
    test_data = [u'abc', u'dea']  # the letter a occurs in both strings
    vect = CountVectorizer(CharNGramAnalyzer(min_n=1, max_n=1), max_df=1.0)
    vect.fit(test_data)
    assert u'a' in vect.vocabulary.keys()
    assert_equals(len(vect.vocabulary.keys()), 5)
    vect.max_df = 0.5
    vect.fit(test_data)
    assert u'a' not in vect.vocabulary.keys()  # 'a' is ignored
    assert_equals(len(vect.vocabulary.keys()), 4)  # the others remain
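The two CharNGramAnalyzer variants above (Examples #4 and #5) target a much older scikit-learn API, where an analyzer object was passed in and the fitted vocabulary had no trailing underscore. A rough modern equivalent, assuming a current scikit-learn release, looks like this:

from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(analyzer='char', ngram_range=(1, 1), max_df=1.0)
vect.fit([u'abc', u'dea'])
print(vect.vocabulary_)   # fitted attributes now carry a trailing underscore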
Example #6
def test_vectorizer_max_df():
    test_data = ['abc', 'dea', 'eat']
    vect = CountVectorizer(analyzer='char', max_df=1.0)
    vect.fit(test_data)
    assert 'a' in vect.vocabulary_.keys()
    assert len(vect.vocabulary_.keys()) == 6
    assert len(vect.stop_words_) == 0

    vect.max_df = 0.5  # 0.5 * 3 documents -> max_doc_count == 1.5
    vect.fit(test_data)
    assert 'a' not in vect.vocabulary_.keys()  # {ae} ignored
    assert len(vect.vocabulary_.keys()) == 4    # {bcdt} remain
    assert 'a' in vect.stop_words_
    assert len(vect.stop_words_) == 2

    vect.max_df = 1
    vect.fit(test_data)
    assert 'a' not in vect.vocabulary_.keys()  # {ae} ignored
    assert len(vect.vocabulary_.keys()) == 4    # {bcdt} remain
    assert 'a' in vect.stop_words_
    assert len(vect.stop_words_) == 2
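This three-document variant (and the two that follow) also asserts on stop_words_, the fitted attribute in which CountVectorizer records the terms it dropped because of max_df (or min_df / max_features). A minimal sketch of that bookkeeping, using the same three strings:

from sklearn.feature_extraction.text import CountVectorizer

docs = ['abc', 'dea', 'eat']
vect = CountVectorizer(analyzer='char', max_df=0.5).fit(docs)
print(sorted(vect.vocabulary_))   # ['b', 'c', 'd', 't'] -- kept terms
print(sorted(vect.stop_words_))   # ['a', 'e'] -- pruned as too frequent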
Example #7
def test_vectorizer_max_df():
    test_data = ['abc', 'dea', 'eat']
    vect = CountVectorizer(analyzer='char', max_df=1.0)
    vect.fit(test_data)
    assert_true('a' in vect.vocabulary_.keys())
    assert_equal(len(vect.vocabulary_.keys()), 6)
    assert_equal(len(vect.stop_words_), 0)

    vect.max_df = 0.5  # 0.5 * 3 documents -> max_doc_count == 1.5
    vect.fit(test_data)
    assert_true('a' not in vect.vocabulary_.keys())  # {ae} ignored
    assert_equal(len(vect.vocabulary_.keys()), 4)    # {bcdt} remain
    assert_true('a' in vect.stop_words_)
    assert_equal(len(vect.stop_words_), 2)

    vect.max_df = 1
    vect.fit(test_data)
    assert_true('a' not in vect.vocabulary_.keys())  # {ae} ignored
    assert_equal(len(vect.vocabulary_.keys()), 4)    # {bcdt} remain
    assert_true('a' in vect.stop_words_)
    assert_equal(len(vect.stop_words_), 2)
Example #8
def test_vectorizer_max_df():
    test_data = ["abc", "dea", "eat"]
    vect = CountVectorizer(analyzer="char", max_df=1.0)
    vect.fit(test_data)
    assert_true("a" in vect.vocabulary_.keys())
    assert_equal(len(vect.vocabulary_.keys()), 6)
    assert_equal(len(vect.stop_words_), 0)

    vect.max_df = 0.5  # 0.5 * 3 documents -> max_doc_count == 1.5
    vect.fit(test_data)
    assert_true("a" not in vect.vocabulary_.keys())  # {ae} ignored
    assert_equal(len(vect.vocabulary_.keys()), 4)  # {bcdt} remain
    assert_true("a" in vect.stop_words_)
    assert_equal(len(vect.stop_words_), 2)

    vect.max_df = 1
    vect.fit(test_data)
    assert_true("a" not in vect.vocabulary_.keys())  # {ae} ignored
    assert_equal(len(vect.vocabulary_.keys()), 4)  # {bcdt} remain
    assert_true("a" in vect.stop_words_)
    assert_equal(len(vect.stop_words_), 2)