Python CountVectorizer.min_df Examples

Programming Language: Python

Namespace/Package Name: sklearn.feature_extraction.text

Class/Type: CountVectorizer

Method/Function: min_df

Examples at hotexamples.com: 10

Python CountVectorizer.min_df - 10 examples found. These are the top-rated real-world Python examples of sklearn.feature_extraction.text.CountVectorizer.min_df, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

CountVectorizer(30)

_validate_vocabulary(30)

fit_transform(30)

fit(30)

build_tokenizer(30)

build_analyzer(30)

get_stop_words(30)

get_params(21)

get_feature_names_out(15)

build_preprocessor(13)

__init__(10)

get_feature_names(9)

dictionary_freeze(6)

count(4)

analyzer(4)

fixed_vocabulary(3)

astype(3)

_count_vocab(2)

copy(2)

fit_trainsform(2)

get_features_names(2)

append(2)

_word_ngrams(2)

get_feature_name(1)

getSenVec(1)

_sort_features(1)

get_features(1)

get_sentence_vector(1)

get_shape(1)

getOutputCol(1)

fit_Transform(1)

fit_trasform(1)

fit_transfrom(1)

fit_transforn(1)

__repr__(1)

fir_transform(1)

__dict__(1)

extract_ngrams(1)

delete_temporary_training_data(1)

count_features(1)

_limit_features(1)

fir(1)

Example #1

Show file

File: task_solving.py Project: EugeniaMatveeva/SentimentAnalysis

def week2(X, y):
    """Run the four week-2 experiments and record each result via ``helper.out``.

    1. Compare a CountVectorizer and a TfidfVectorizer logistic-regression
       pipeline with ``compare_accuracy``; write mean/std of both to '2-1.txt'.
    2. Cross-validate the count pipeline with ``min_df`` set to 10 and then
       50; write both mean scores to '2-2.txt'.
    3. Run ``choose_classifier`` over LogisticRegression, LinearSVC and
       SGDClassifier pipelines; write the lowest score to '2-3.txt'.
    4. Run ``estimate_stop_words`` for the NLTK and scikit-learn English
       stop-word settings; write the resulting scores to '2-4.txt'.

    Args:
        X: samples, forwarded unchanged to the vectorizers and helpers
            (presumably raw text documents -- confirm against the caller).
        y: targets, forwarded alongside ``X``.

    The number of CV folds comes from the module-level ``n_cv``.
    """
    cnt_vectorizer = CountVectorizer()
    cnt_vectorizer.fit_transform(X)
    classifier = LogisticRegression()
    pipe_cnt_logreg = Pipeline([('countvectorizer', cnt_vectorizer),
                                ('logisticregression', classifier)])
    tf_vectorizer = TfidfVectorizer()
    pipe_tf_logreg = Pipeline([('tfidfvectorizer', tf_vectorizer),
                               ('logisticregression', classifier)])

    # #1
    scores = compare_accuracy([pipe_cnt_logreg, pipe_tf_logreg], X, y)
    helper.out(
        '2-1.txt',
        [scores[0].mean(), scores[0].std(), scores[1].mean(), scores[1].std()])

    # #2
    # The vectorizer attribute is changed in place; this relies on the
    # pipeline referring to this same vectorizer object.
    cnt_vectorizer.min_df = 10
    scores_cnt_logreg_10 = np.array(
        cross_val_score(pipe_cnt_logreg, X, y, cv=n_cv))
    # Single-argument print(...) is valid on both Python 2 and Python 3;
    # list(...) keeps the step names rendered as a plain list on either.
    print('Pipeline %s, min_df=10: accuracy mean = %f' % (
        list(pipe_cnt_logreg.named_steps.keys()),
        scores_cnt_logreg_10.mean()))

    cnt_vectorizer.min_df = 50
    scores_cnt_logreg_50 = np.array(
        cross_val_score(pipe_cnt_logreg, X, y, cv=n_cv))
    print('Pipeline %s, min_df=50: accuracy mean = %f' % (
        list(pipe_cnt_logreg.named_steps.keys()),
        scores_cnt_logreg_50.mean()))
    helper.out('2-2.txt',
               [scores_cnt_logreg_10.mean(),
                scores_cnt_logreg_50.mean()])

    # #3
    # NOTE: cnt_vectorizer still carries min_df=50 from step #2, so the three
    # pipelines below are built around a vectorizer with that setting.
    pipe_cnt_logreg = Pipeline([('countvectorizer', cnt_vectorizer),
                                ('logisticregression', LogisticRegression())])
    pipe_cnt_svc = Pipeline([('countvectorizer', cnt_vectorizer),
                             ('linearsvc', LinearSVC())])
    pipe_cnt_sgd = Pipeline([('countvectorizer', cnt_vectorizer),
                             ('sgdclassifier', SGDClassifier())])
    scores = choose_classifier([pipe_cnt_logreg, pipe_cnt_svc, pipe_cnt_sgd],
                               X, y)
    worst_score = min(scores.values())
    print('Worst score: %f' % worst_score)
    helper.out('2-3.txt', worst_score)

    # #4
    stop_words_dict = {
        'nltk stop-words': nltk.corpus.stopwords.words('english'),
        'sklearn stop-words': 'english'
    }
    scores = estimate_stop_words(stop_words_dict, classifier, X, y)
    # list(...) hands helper.out a real list on Python 3 as well, where
    # dict.values() is only a view.
    helper.out('2-4.txt', list(scores.values()))

Example #2

Show file

File: test_text.py Project: lucidfrontier45/scikit-learn

def test_vectorizer_min_df():
    """Refitting with a higher ``min_df`` shrinks the character vocabulary."""
    # Document frequencies over the three strings: a=3, e=2, b=c=d=t=1.
    corpus = [u'abc', u'dea', u'eat']
    vectorizer = CountVectorizer(analyzer='char', max_df=1.0, min_df=1)

    # min_df=1: all six distinct characters are kept.
    vectorizer.fit(corpus)
    vocabulary = vectorizer.vocabulary_
    assert_true(u'a' in vocabulary)
    assert_equal(len(vocabulary), 6)

    # An absolute count of 2 and a proportion of .5 are expected to prune the
    # same way here: 'c' is dropped and only 'e' and 'a' remain.
    for threshold in (2, .5):
        vectorizer.min_df = threshold
        vectorizer.fit(corpus)
        vocabulary = vectorizer.vocabulary_
        assert_true(u'c' not in vocabulary)
        assert_equal(len(vocabulary), 2)

Example #3

Show file

File: test_text.py Project: GbalsaC/bitnamiP

def test_vectorizer_min_df():
    """Exercise ``min_df`` as an integer count and as a float proportion."""
    # 'a' is in all three strings, 'e' in two, every other letter in one.
    samples = [u'abc', u'dea', u'eat']
    char_vect = CountVectorizer(analyzer='char', max_df=1.0, min_df=1)

    # Stage 1 -- threshold of one document: nothing is pruned.
    char_vect.fit(samples)
    kept_terms = set(char_vect.vocabulary_)
    assert_true(u'a' in kept_terms)
    assert_equals(len(kept_terms), 6)

    # Stage 2 -- at least two documents: only 'e' and 'a' qualify.
    char_vect.min_df = 2
    char_vect.fit(samples)
    kept_terms = set(char_vect.vocabulary_)
    assert_true(u'c' not in kept_terms)
    assert_equals(len(kept_terms), 2)

    # Stage 3 -- at least half of the documents: same two survivors.
    char_vect.min_df = .5
    char_vect.fit(samples)
    kept_terms = set(char_vect.vocabulary_)
    assert_true(u'c' not in kept_terms)
    assert_equals(len(kept_terms), 2)

Example #4

Show file

File: test_text.py Project: JohnFNovak/scikit-learn

def test_vectorizer_min_df():
    """Verify vocabulary pruning as ``min_df`` is raised between fits."""
    # Per-character document frequency: a=3, e=2, b=c=d=t=1.
    documents = ["abc", "dea", "eat"]
    vectorizer = CountVectorizer(analyzer="char", max_df=1.0, min_df=1)

    def refit_with(min_df):
        # Update the threshold, refit, and hand back the new vocabulary.
        vectorizer.min_df = min_df
        vectorizer.fit(documents)
        return vectorizer.vocabulary_

    # Initial fit uses the constructor's min_df=1: every character is kept.
    vectorizer.fit(documents)
    assert_true("a" in vectorizer.vocabulary_)
    assert_equal(len(vectorizer.vocabulary_), 6)

    # Integer threshold: 'c' is ignored, only e and a remain.
    vocabulary = refit_with(2)
    assert_true("c" not in vocabulary)
    assert_equal(len(vocabulary), 2)

    # Float threshold: same outcome expected.
    vocabulary = refit_with(0.5)
    assert_true("c" not in vocabulary)
    assert_equal(len(vocabulary), 2)

Example #5

Show file

File: logisticRegression.py Project: hj59172507/MachineLearningApp

def getBagOfWords(documents, stopWords, minThreshold, maxThreshold):
    """Fit a CountVectorizer on ``documents`` and return it with the counts.

    Args:
        documents: the documents handed to ``fit_transform``.
        stopWords: forwarded as the vectorizer's ``stop_words``.
        minThreshold: forwarded as the vectorizer's ``min_df``.
        maxThreshold: forwarded as the vectorizer's ``max_df``.

    Returns:
        A ``(vectorizer, counts)`` tuple: the fitted vectorizer and the
        result of ``fit_transform`` converted with ``.toarray()``.
    """
    # Configure through the constructor instead of mutating attributes after
    # construction, so the settings go through the estimator's normal
    # parameter handling.
    vectorizer = CountVectorizer(
        stop_words=stopWords,
        min_df=minThreshold,
        max_df=maxThreshold,
    )
    counts = vectorizer.fit_transform(documents)
    return vectorizer, counts.toarray()

Example #6

Show file

def test_vectorizer_min_df():
    """Check ``vocabulary_`` and ``stop_words_`` sizes as ``min_df`` grows."""
    corpus = ['abc', 'dea', 'eat']
    vectorizer = CountVectorizer(analyzer='char', min_df=1)

    # min_df=1: all six characters are kept and nothing is pruned.
    vectorizer.fit(corpus)
    assert 'a' in vectorizer.vocabulary_
    assert len(vectorizer.vocabulary_) == 6
    assert len(vectorizer.stop_words_) == 0

    # (min_df, characters kept, characters pruned)
    expectations = [
        (2, 2, 4),    # {ae} remain, {bcdt} ignored
        (0.8, 1, 5),  # 0.8 * 3 documents -> min_doc_count == 2.4; {a} remains
    ]
    for min_df, n_kept, n_pruned in expectations:
        vectorizer.min_df = min_df
        vectorizer.fit(corpus)
        assert 'c' not in vectorizer.vocabulary_
        assert len(vectorizer.vocabulary_) == n_kept
        assert 'c' in vectorizer.stop_words_
        assert len(vectorizer.stop_words_) == n_pruned

Example #7

Show file

File: test_text.py Project: LoveYakamoz/scikit-learn

def test_vectorizer_min_df():
    """Raise ``min_df`` step by step and check kept versus pruned characters."""
    samples = ['abc', 'dea', 'eat']
    char_vect = CountVectorizer(analyzer='char', min_df=1)

    # Threshold 1: six characters kept, no pruned terms recorded.
    char_vect.fit(samples)
    kept, pruned = char_vect.vocabulary_, char_vect.stop_words_
    assert_true('a' in kept)
    assert_equal(len(kept), 6)
    assert_equal(len(pruned), 0)

    # Threshold 2 (absolute count): {ae} remain, {bcdt} ignored.
    char_vect.min_df = 2
    char_vect.fit(samples)
    kept, pruned = char_vect.vocabulary_, char_vect.stop_words_
    assert_true('c' not in kept)
    assert_equal(len(kept), 2)
    assert_true('c' in pruned)
    assert_equal(len(pruned), 4)

    # Threshold 0.8 (proportion): 0.8 * 3 documents -> min_doc_count == 2.4,
    # so {a} remains and {bcdet} are ignored.
    char_vect.min_df = 0.8
    char_vect.fit(samples)
    kept, pruned = char_vect.vocabulary_, char_vect.stop_words_
    assert_true('c' not in kept)
    assert_equal(len(kept), 1)
    assert_true('c' in pruned)
    assert_equal(len(pruned), 5)

Example #8

Show file

File: test_text.py Project: Greenall/scikit-learn

def test_vectorizer_min_df():
    """Confirm that pruned characters move from ``vocabulary_`` to ``stop_words_``."""
    documents = ["abc", "dea", "eat"]
    vectorizer = CountVectorizer(analyzer="char", min_df=1)

    def check_c_pruned(min_df, n_kept, n_pruned):
        # Refit at the given threshold, then verify that 'c' was dropped and
        # that both collections have the expected sizes.
        vectorizer.min_df = min_df
        vectorizer.fit(documents)
        assert_true("c" not in vectorizer.vocabulary_)
        assert_equal(len(vectorizer.vocabulary_), n_kept)
        assert_true("c" in vectorizer.stop_words_)
        assert_equal(len(vectorizer.stop_words_), n_pruned)

    # Baseline fit with the constructor's min_df=1: nothing is pruned.
    vectorizer.fit(documents)
    assert_true("a" in vectorizer.vocabulary_)
    assert_equal(len(vectorizer.vocabulary_), 6)
    assert_equal(len(vectorizer.stop_words_), 0)

    # {ae} remain, {bcdt} ignored.
    check_c_pruned(2, 2, 4)

    # 0.8 * 3 documents -> min_doc_count == 2.4; {a} remains, {bcdet} ignored.
    check_c_pruned(0.8, 1, 5)

Example #9

Show file

File: test_text.py Project: odeskdataproducts/scikit-learn

def test_vectorizer_min_df():
    """Fit the same corpus at three ``min_df`` settings and compare results."""
    texts = ['abc', 'dea', 'eat']
    vect = CountVectorizer(analyzer='char', min_df=1)

    # With min_df=1 the vocabulary holds all six characters.
    vect.fit(texts)
    assert_true('a' in vect.vocabulary_)
    assert_equal(len(vect.vocabulary_), 6)
    assert_equal(len(vect.stop_words_), 0)

    thresholds = (2, 0.8)      # 0.8 * 3 documents -> min_doc_count == 2.4
    kept_counts = (2, 1)       # {ae} remain, then only {a}
    pruned_counts = (4, 5)     # {bcdt} ignored, then {bcdet}
    for threshold, n_kept, n_pruned in zip(thresholds, kept_counts,
                                           pruned_counts):
        vect.min_df = threshold
        vect.fit(texts)
        vocabulary, ignored = vect.vocabulary_, vect.stop_words_
        assert_true('c' not in vocabulary)
        assert_equal(len(vocabulary), n_kept)
        assert_true('c' in ignored)
        assert_equal(len(ignored), n_pruned)

Example #10

Show file

File: Main.py Project: ZarghamKhaliq/Prototype2




#################################

# Load the tab-separated data set into two columns, "label" and "message".
messages = pandas.read_csv(
    'dataSet',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    names=["label", "message"],
)

# NOTE(review): test_size=0 presumably means "keep every row for training";
# some scikit-learn releases may reject a zero test size -- confirm against
# the installed version.
msg_train, msg_test, label_train, label_test = train_test_split(
    messages['message'],
    messages['label'],
    test_size=0,
    random_state=0,
)

# print(msg_train)

# Training the Classifier
bow_transformer = CountVectorizer(analyzer=split_into_lemmas_)
bow_transformer.fit(msg_train)
# NOTE(review): min_df is assigned only after fit() has already run, so it
# appears to have no influence on the vocabulary used below -- confirm whether
# it was meant to be a constructor argument.
bow_transformer.min_df = 0.5

print(len(bow_transformer.vocabulary_))

# Bag-of-words counts first, then tf-idf weights fitted on those counts.
messages_bow = bow_transformer.transform(msg_train)
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(messages_bow)
messages_tfidf = tfidf_transformer.transform(messages_bow)

# providing data and labels to the classifier
meeting_detector = MultinomialNB()
meeting_detector.fit(messages_tfidf, label_train)

##########################################################

dal = DAL()
run()