def week2(X, y):
    """Run the four week-2 experiments and write each result via ``helper.out``.

    The experiments are:

    1. Compare count-based and TF-IDF features with logistic regression
       (mean and std of both score arrays go to ``2-1.txt``).
    2. Cross-validate the count pipeline with ``min_df`` set to 10 and then
       50 (the two mean scores go to ``2-2.txt``).
    3. Score logistic regression, linear SVC and SGD on count features and
       record the lowest score (``2-3.txt``).
    4. Score the stop-word lists in ``stop_words_dict`` (``2-4.txt``).

    Args:
        X: Samples handed unchanged to the scoring helpers
            (presumably raw text documents -- confirm against the caller).
        y: Target labels matching ``X``.

    Relies on ``compare_accuracy``, ``choose_classifier``,
    ``estimate_stop_words``, ``helper`` and ``n_cv`` from the enclosing
    module.
    """
    # No standalone fit of the vectorizer here: it is only ever used as a
    # pipeline step, and the pipelines fit their own steps when scored.
    cnt_vectorizer = CountVectorizer()
    classifier = LogisticRegression()
    pipe_cnt_logreg = Pipeline([('countvectorizer', cnt_vectorizer),
                                ('logisticregression', classifier)])
    tf_vectorizer = TfidfVectorizer()
    pipe_tf_logreg = Pipeline([('tfidfvectorizer', tf_vectorizer),
                               ('logisticregression', classifier)])

    # #1
    scores = compare_accuracy([pipe_cnt_logreg, pipe_tf_logreg], X, y)
    helper.out(
        '2-1.txt',
        [scores[0].mean(), scores[0].std(), scores[1].mean(), scores[1].std()])

    # #2
    # The pipeline holds a reference to cnt_vectorizer, so changing min_df on
    # the vectorizer reconfigures the pipeline for the next scoring run.
    min_df_means = []
    for min_df in (10, 50):
        cnt_vectorizer.min_df = min_df
        fold_scores = np.array(
            cross_val_score(pipe_cnt_logreg, X, y, cv=n_cv))
        mean_score = fold_scores.mean()
        print('Pipeline %s, min_df=%d: accuracy mean = %f' % (
            pipe_cnt_logreg.named_steps.keys(), min_df, mean_score))
        min_df_means.append(mean_score)
    helper.out('2-2.txt', min_df_means)

    # #3
    # cnt_vectorizer still carries min_df=50 from experiment #2.
    pipe_cnt_logreg = Pipeline([('countvectorizer', cnt_vectorizer),
                                ('logisticregression', LogisticRegression())])
    pipe_cnt_svc = Pipeline([('countvectorizer', cnt_vectorizer),
                             ('linearsvc', LinearSVC())])
    pipe_cnt_sgd = Pipeline([('countvectorizer', cnt_vectorizer),
                             ('sgdclassifier', SGDClassifier())])
    scores = choose_classifier([pipe_cnt_logreg, pipe_cnt_svc, pipe_cnt_sgd],
                               X, y)
    worst_score = min(scores.values())
    print('Worst score: %f' % worst_score)
    helper.out('2-3.txt', worst_score)

    # #4
    stop_words_dict = {
        'nltk stop-words': nltk.corpus.stopwords.words('english'),
        'sklearn stop-words': 'english'
    }
    scores = estimate_stop_words(stop_words_dict, classifier, X, y)
    helper.out('2-4.txt', scores.values())
def test_vectorizer_min_df():
    """Refitting a char-level CountVectorizer honours a changed ``min_df``."""
    # 'a' occurs in all three strings, 'e' in two, every other letter in one
    docs = [u'abc', u'dea', u'eat']
    char_counter = CountVectorizer(analyzer='char', max_df=1.0, min_df=1)

    # min_df=1: every distinct character is kept
    char_counter.fit(docs)
    vocab = char_counter.vocabulary_
    assert_true(u'a' in vocab)
    assert_equal(len(vocab), 6)

    # absolute threshold: a character must occur in at least 2 documents
    char_counter.min_df = 2
    char_counter.fit(docs)
    vocab = char_counter.vocabulary_
    assert_true(u'c' not in vocab)  # 'c' is ignored
    assert_equal(len(vocab), 2)  # only e, a remain

    # fractional threshold: at least half of the 3 documents
    char_counter.min_df = .5
    char_counter.fit(docs)
    vocab = char_counter.vocabulary_
    assert_true(u'c' not in vocab)  # 'c' is ignored
    assert_equal(len(vocab), 2)  # only e, a remain
Example #3
0
def test_vectorizer_min_df():
    """Verify vocabulary pruning for integer and fractional ``min_df``."""
    # 'a' is in every string, 'e' in two of them, b/c/d/t in exactly one
    samples = [u'abc', u'dea', u'eat']
    vectorizer = CountVectorizer(analyzer='char', max_df=1.0, min_df=1)

    def refit_vocabulary():
        # vocabulary_ is rebuilt by every fit, so fetch it afresh each time
        vectorizer.fit(samples)
        return vectorizer.vocabulary_

    kept = refit_vocabulary()
    assert_true(u'a' in kept)
    assert_equals(len(kept), 6)

    vectorizer.min_df = 2
    kept = refit_vocabulary()
    assert_true(u'c' not in kept)  # 'c' is ignored
    assert_equals(len(kept), 2)  # only e, a remain

    vectorizer.min_df = .5
    kept = refit_vocabulary()
    assert_true(u'c' not in kept)  # 'c' is ignored
    assert_equals(len(kept), 2)  # only e, a remain
Example #4
0
def test_vectorizer_min_df():
    """``min_df`` changes take effect on the next ``fit`` of the vectorizer."""
    # "a" appears in all three strings, "e" in two, the rest in only one
    corpus = ["abc", "dea", "eat"]
    counter = CountVectorizer(analyzer="char", max_df=1.0, min_df=1)

    # Baseline: nothing is pruned.
    counter.fit(corpus)
    vocabulary = counter.vocabulary_
    assert_true("a" in vocabulary)
    assert_equal(len(vocabulary), 6)

    # Document-count threshold of 2.
    counter.min_df = 2
    counter.fit(corpus)
    vocabulary = counter.vocabulary_
    assert_true("c" not in vocabulary)  # 'c' is ignored
    assert_equal(len(vocabulary), 2)  # only e, a remain

    # Document-fraction threshold of 0.5.
    counter.min_df = 0.5
    counter.fit(corpus)
    vocabulary = counter.vocabulary_
    assert_true("c" not in vocabulary)  # 'c' is ignored
    assert_equal(len(vocabulary), 2)  # only e, a remain
def getBagOfWords(documents, stopWords, minThreshold, maxThreshold):
    """Fit a CountVectorizer on ``documents`` and return it with the counts.

    Args:
        documents: Iterable of documents passed to ``fit_transform``.
        stopWords: Forwarded as the vectorizer's ``stop_words`` setting.
        minThreshold: Forwarded as the vectorizer's ``min_df`` setting.
        maxThreshold: Forwarded as the vectorizer's ``max_df`` setting.

    Returns:
        A ``(vectorizer, matrix)`` tuple: the fitted vectorizer and the
        document-term counts converted to a dense array.
    """
    # Configure through the constructor (the documented API) rather than
    # patching attributes afterwards, so constructor-time handling applies.
    vectorizer = CountVectorizer(stop_words=stopWords,
                                 min_df=minThreshold,
                                 max_df=maxThreshold)
    counts = vectorizer.fit_transform(documents)
    # NOTE: toarray() materialises the full dense matrix.
    return vectorizer, counts.toarray()
Example #6
0
def test_vectorizer_min_df():
    """Check vocabulary and ``stop_words_`` sizes as ``min_df`` is raised."""
    # 'a' is in all three strings, 'e' in two, b/c/d/t in one each
    docs = ['abc', 'dea', 'eat']
    char_vect = CountVectorizer(analyzer='char', min_df=1)

    # min_df=1 keeps all six characters, so nothing is pruned
    char_vect.fit(docs)
    vocab, pruned = char_vect.vocabulary_, char_vect.stop_words_
    assert 'a' in vocab
    assert len(vocab) == 6
    assert len(pruned) == 0

    char_vect.min_df = 2
    char_vect.fit(docs)
    vocab, pruned = char_vect.vocabulary_, char_vect.stop_words_
    assert 'c' not in vocab  # {bcdt} ignored
    assert len(vocab) == 2    # {ae} remain
    assert 'c' in pruned
    assert len(pruned) == 4

    char_vect.min_df = 0.8  # 0.8 * 3 documents -> min_doc_count == 2.4
    char_vect.fit(docs)
    vocab, pruned = char_vect.vocabulary_, char_vect.stop_words_
    assert 'c' not in vocab  # {bcdet} ignored
    assert len(vocab) == 1    # {a} remains
    assert 'c' in pruned
    assert len(pruned) == 5
Example #7
0
def test_vectorizer_min_df():
    """Raising ``min_df`` moves rare characters into ``stop_words_``."""
    # document frequencies: a -> 3, e -> 2, b/c/d/t -> 1
    corpus = ['abc', 'dea', 'eat']
    vectorizer = CountVectorizer(analyzer='char', min_df=1)

    vectorizer.fit(corpus)
    kept = vectorizer.vocabulary_
    dropped = vectorizer.stop_words_
    assert_true('a' in kept)
    assert_equal(len(kept), 6)
    assert_equal(len(dropped), 0)

    vectorizer.min_df = 2
    vectorizer.fit(corpus)
    kept = vectorizer.vocabulary_
    dropped = vectorizer.stop_words_
    assert_true('c' not in kept)  # {bcdt} ignored
    assert_equal(len(kept), 2)    # {ae} remain
    assert_true('c' in dropped)
    assert_equal(len(dropped), 4)

    vectorizer.min_df = 0.8  # 0.8 * 3 documents -> min_doc_count == 2.4
    vectorizer.fit(corpus)
    kept = vectorizer.vocabulary_
    dropped = vectorizer.stop_words_
    assert_true('c' not in kept)  # {bcdet} ignored
    assert_equal(len(kept), 1)    # {a} remains
    assert_true('c' in dropped)
    assert_equal(len(dropped), 5)
Example #8
0
def test_vectorizer_min_df():
    """Exercise ``min_df`` as a count and as a fraction on a char analyzer."""
    # "a" is in all three strings, "e" in two, "b", "c", "d", "t" in one
    samples = ["abc", "dea", "eat"]
    counter = CountVectorizer(analyzer="char", min_df=1)

    def refit():
        # Both fitted attributes are replaced on every fit.
        counter.fit(samples)
        return counter.vocabulary_, counter.stop_words_

    vocabulary, removed = refit()
    assert_true("a" in vocabulary)
    assert_equal(len(vocabulary), 6)
    assert_equal(len(removed), 0)

    counter.min_df = 2
    vocabulary, removed = refit()
    assert_true("c" not in vocabulary)  # {bcdt} ignored
    assert_equal(len(vocabulary), 2)  # {ae} remain
    assert_true("c" in removed)
    assert_equal(len(removed), 4)

    counter.min_df = 0.8  # 0.8 * 3 documents -> min_doc_count == 2.4
    vocabulary, removed = refit()
    assert_true("c" not in vocabulary)  # {bcdet} ignored
    assert_equal(len(vocabulary), 1)  # {a} remains
    assert_true("c" in removed)
    assert_equal(len(removed), 5)
def test_vectorizer_min_df():
    """Track the kept and pruned characters across three ``min_df`` values."""
    # 'a' appears in 3 documents, 'e' in 2, and b, c, d, t in 1 each
    texts = ['abc', 'dea', 'eat']
    cv = CountVectorizer(analyzer='char', min_df=1)

    # Step 1: the constructor's min_df=1 prunes nothing.
    cv.fit(texts)
    terms, pruned = cv.vocabulary_, cv.stop_words_
    assert_true('a' in terms)
    assert_equal(len(terms), 6)
    assert_equal(len(pruned), 0)

    # Step 2: integer threshold.
    cv.min_df = 2
    cv.fit(texts)
    terms, pruned = cv.vocabulary_, cv.stop_words_
    assert_true('c' not in terms)  # {bcdt} ignored
    assert_equal(len(terms), 2)    # {ae} remain
    assert_true('c' in pruned)
    assert_equal(len(pruned), 4)

    # Step 3: fractional threshold.
    cv.min_df = 0.8  # 0.8 * 3 documents -> min_doc_count == 2.4
    cv.fit(texts)
    terms, pruned = cv.vocabulary_, cv.stop_words_
    assert_true('c' not in terms)  # {bcdet} ignored
    assert_equal(len(terms), 1)    # {a} remains
    assert_true('c' in pruned)
    assert_equal(len(pruned), 5)
Example #10
0



#################################

# Load the tab-separated file 'dataSet' into two explicitly named columns,
# "label" and "message"; QUOTE_NONE disables special handling of quote characters.
messages = pandas.read_csv('dataSet', sep='\t', quoting=csv.QUOTE_NONE, names=["label", "message"])

# Split messages and labels with a fixed random_state.
# NOTE(review): test_size=0 requests an empty test split, and msg_test /
# label_test are not used in the visible code; some scikit-learn releases may
# reject a zero test size -- confirm against the installed version.
msg_train, msg_test, label_train, label_test = train_test_split(messages['message'], messages['label'], test_size=0,
                                                                    random_state=0)

    # print(msg_train)


# Training the classifier: fit a bag-of-words vocabulary on the training
# messages, using split_into_lemmas_ (defined elsewhere) as the analyzer.
bow_transformer = CountVectorizer(analyzer=split_into_lemmas_).fit(msg_train)
# NOTE(review): min_df is assigned after fit() has already run, so it looks
# like it cannot affect the vocabulary used below -- confirm whether it was
# meant to be a constructor argument.
bow_transformer.min_df = 0.5

# Report the size of the fitted vocabulary.
print(len(bow_transformer.vocabulary_))

# Convert the training messages to counts, then fit and apply TF-IDF weighting.
messages_bow = bow_transformer.transform(msg_train)
tfidf_transformer = TfidfTransformer().fit(messages_bow)
messages_tfidf = tfidf_transformer.transform(messages_bow)

# Provide the TF-IDF features and the training labels to the classifier.
meeting_detector = MultinomialNB().fit(messages_tfidf, label_train)

##########################################################

# DAL and run are defined outside this excerpt; this instantiates DAL and
# then calls run() at import time.
dal=DAL()
run()