import nltk
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             roc_auc_score)


def subsetter(data, column, stopwords):
    """Use CountVectorizer features to measure the predictive ability of a
    Bernoulli Naive Bayes classifier on a held-out split."""
    tokenizer = RegexpTokenizer(r'[A-Za-z]+')
    # Flatten every document in `column` into one list of alphabetic tokens.
    flatlist = []
    for text in data[column]:
        flatlist.extend(tokenizer.tokenize(str(text)))

    frequency_dist = nltk.FreqDist(word for word in flatlist
                                   if word not in stopwords)
    # The 50 most frequent non-stopword tokens, most common first.
    top50n = [word for word, _ in frequency_dist.most_common(50)]

    X_train, X_test, y_train, y_test = train_test_split(data['Reviews'].values,data['naive_bayes'].values,
                                                        test_size=0.2,random_state=1)
    REGEX = re.compile(r",\s*")
    tokenize = [tok.strip().lower() for tok in REGEX.split(str(stopwords))]
    cv = CountVectorizer(lowercase=True, stop_words='english', binary=True)

    X_train_cv = cv.fit_transform(X_train)
    naive_bayes = BernoulliNB()
    naive_bayes.fit(X_train_cv, y_train)
    X_test_cv = cv.transform(X_test)
    predictions = naive_bayes.predict(X_test_cv)

    # Each sklearn metric is followed by the same quantity computed by hand
    # as a sanity check.
    print('Accuracy score: ', accuracy_score(y_test, predictions))
    print(sum(y_test == predictions) / len(predictions), "\n")

    print('Precision score: ', precision_score(y_test, predictions))
    print(sum(y_test[predictions == 1] == 1) / len(y_test[predictions == 1]), "\n")

    print('Recall score: ', recall_score(y_test, predictions))
    print(sum(predictions[y_test == 1] == 1) / len(predictions[y_test == 1]), "\n")
    roc_auc = roc_auc_score(y_test, predictions)
    print("ROC AUC: {0}".format(roc_auc))


# ### Naive Bayes 

# In[758]:

from sklearn.naive_bayes import GaussianNB


# In[759]:

# Fitting the model
naive_b = GaussianNB()
naive_b.fit(Xall_train, yall_train)


# In[760]:

# Predicting y values
pred_label = naive_b.predict(Xall_test)


# In[761]:

from sklearn.metrics import confusion_matrix
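
# A hedged sketch of applying the imported confusion_matrix to the Gaussian NB
# predictions above (assumption: yall_test holds the true labels paired with
# Xall_test; that name mirrors Xall_test and is not defined in this excerpt).
print(confusion_matrix(yall_test, pred_label))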


# In[762]:
####### Decision Tree Grid Search ###############
from sklearn import tree, model_selection, ensemble

# The estimator and the opening of the parameter grid are assumptions here;
# only 'min_samples_split' is kept from the original fragment.
dt_estimator = tree.DecisionTreeClassifier(random_state=100)
dt_grid = {
    'min_samples_split': [2, 5, 10]
}
dt_grid_estimator = model_selection.GridSearchCV(dt_estimator,
                                                 dt_grid,
                                                 scoring='accuracy',
                                                 cv=10,
                                                 refit=True,
                                                 return_train_score=True)
dt_grid_estimator.fit(training_data, y_train)
result = dt_grid_estimator.cv_results_
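
# The fitted grid object also exposes the winning configuration directly; a
# short sketch using standard GridSearchCV attributes (best_estimator_ is
# available because refit=True above):
print(dt_grid_estimator.best_params_)
print(dt_grid_estimator.best_score_)
dt_best = dt_grid_estimator.best_estimator_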

#######
#multiNBClassifier = MultinomialNB().fit(X_train_tfidf, y_train)

naive_bayes = MultinomialNB()
naive_bayes_text_classifier = naive_bayes.fit(training_data, y_train)
print(naive_bayes_text_classifier.score(training_data, y_train))
#0.7658017298735862
print(naive_bayes_text_classifier.score(testing_data, y_test))
#0.7325349301397206
## Generate predictions for the test set
predictions = naive_bayes_text_classifier.predict(testing_data)
print(predictions)
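
# A sketch of a fuller per-class breakdown of the predictions above, using
# the same y_test labels as the score() call:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))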

#########Ensemble Random Forest###############
rf_estimator = ensemble.RandomForestClassifier(random_state=100)
rf_estimator_clf = rf_estimator.fit(training_data, y_train)
# Inspect the decision-tree grid search results collected earlier
print(result)
print(result.get('params'))
print(result.get('mean_train_score'))
print(result.get('mean_test_score'))
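
# A parallel sanity check for the random forest fitted above, scored on the
# same assumed training_data/testing_data split as the Naive Bayes section:
print(rf_estimator_clf.score(training_data, y_train))
print(rf_estimator_clf.score(testing_data, y_test))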
cv = CountVectorizer(encoding='utf-8',
                     input='content',
                     lowercase=True,
                     max_df=1.0,
                     max_features=None,
                     min_df=1,
                     ngram_range=(1, 1),
                     preprocessor=None,
                     stop_words=None,
                     strip_accents=None,
                     token_pattern='(?u)\\b\\w\\w+\\b',
                     tokenizer=twokenize.tokenizeRawTweetText,
                     vocabulary=None)
trainx = cv.fit_transform(trainx)
testx = cv.transform(testx)
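
# Quick sanity check on the fitted vectorizer: vocabulary_ maps each token
# produced by twokenize.tokenizeRawTweetText to its column index.
print(len(cv.vocabulary_), "tokens in vocabulary")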
naive_bayes = MultinomialNB()
corpus = df + dft  # not used below

naive_bayes.fit(trainx, trainy)
predictions = naive_bayes.predict(testx)
#%%
print("Accuracy score: ", accuracy_score(testy, predictions))
print("Precision score: ", precision_score(testy, predictions))
print("Recall score: ", recall_score(testy, predictions))

#%%
from sklearn import svm

svm_clf = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
svm_clf.fit(trainx, trainy)
predictions_svm = svm_clf.predict(testx)
print("svm Accuracy Score -> ", accuracy_score(testy, predictions_svm) * 100)
    # compare with Naive Bayes as a baseline model
    nb = BernoulliNB()  # assumes BernoulliNB imported from sklearn.naive_bayes
    models = [lr, nb]
    scores = crossValidation(X_train, y_train, models)

    # plot error for model LR
    plotErrorCurve(X_train, y_train, lr)

    # significance test
    ttest = ttest_across_folds(scores[0], scores[1])
    print("T-test score: {}".format(ttest))

    # McNemar's test
    lr.fit(X_train, y_train)
    lrYPred = lr.predict(X_test)
    nb.fit(X_train, y_train)
    nbYPred = nb.predict(X_test)

    lr_yn = y_test == lrYPred
    nb_yn = y_test == nbYPred
    print("Logistic Regression accuracy: {}".format(
        np.sum(lr_yn) / len(y_test)))
    print("Naive Bayes accuracy: {}".format(np.sum(nb_yn) / len(y_test)))
    cmp = mcnemar(lr_yn, nb_yn)
    print("McNemar's test score: {}".format(cmp))

    # plot the lr confusion matrix
    pl.figure()
    pl.matshow(met.confusion_matrix(y_test, lrYPred), interpolation='nearest')
    pl.colorbar()
    pl.ylabel('true label')