import collections

import nltk.classify.util
from nltk.metrics import precision, recall, f_measure


def evaluate_classifier(classifier, validationFeatures):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    # Index every validation example by its gold label and its predicted label.
    for i, (feats, label) in enumerate(validationFeatures):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    accuracy = nltk.classify.util.accuracy(classifier, validationFeatures)
    pos_precision = precision(refsets['POSITIVE_TIME'], testsets['POSITIVE_TIME'])
    pos_recall = recall(refsets['POSITIVE_TIME'], testsets['POSITIVE_TIME'])
    pos_f_measure = f_measure(refsets['POSITIVE_TIME'], testsets['POSITIVE_TIME'])
    neg_precision = precision(refsets['NEGATIVE_TIME'], testsets['NEGATIVE_TIME'])
    neg_recall = recall(refsets['NEGATIVE_TIME'], testsets['NEGATIVE_TIME'])
    neg_f_measure = f_measure(refsets['NEGATIVE_TIME'], testsets['NEGATIVE_TIME'])
    print('accuracy:', accuracy)
    print('pos precision:', pos_precision)
    print('pos recall:', pos_recall)
    print('pos f-measure:', pos_f_measure)
    print('neg precision:', neg_precision)
    print('neg recall:', neg_recall)
    print('neg f-measure:', neg_f_measure)
    return {'accuracy': accuracy,
            'pos precision': pos_precision,
            'pos recall': pos_recall,
            'pos f-measure': pos_f_measure,
            'neg precision': neg_precision,
            'neg recall': neg_recall,
            'neg f-measure': neg_f_measure}
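# A minimal, self-contained illustration of the refsets/testsets pattern used
# above. The POSITIVE_TIME/NEGATIVE_TIME labels match the snippet; the toy
# features and data are made up for illustration only.
import collections

from nltk.classify import NaiveBayesClassifier
from nltk.metrics import precision

toy_train = [({'late': True}, 'NEGATIVE_TIME'), ({'early': True}, 'POSITIVE_TIME')] * 5
toy_valid = [({'late': True}, 'NEGATIVE_TIME'), ({'early': True}, 'POSITIVE_TIME')]
toy_clf = NaiveBayesClassifier.train(toy_train)

refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (feats, label) in enumerate(toy_valid):
    refsets[label].add(i)                      # gold label -> example indices
    testsets[toy_clf.classify(feats)].add(i)   # predicted label -> example indices
print(precision(refsets['POSITIVE_TIME'], testsets['POSITIVE_TIME']))  # 1.0 on this toy data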
def get_accuracy_measures(classifier, testing_data, p_label):
    actuallabels = collections.defaultdict(set)
    predictedlabels = collections.defaultdict(set)
    for i, (tokens, label) in enumerate(testing_data):
        actuallabels[label].add(i)
        predicted = classifier.classify(tokens)
        predictedlabels[predicted].add(i)
    # Returns [precision, recall, f-measure] for the label of interest.
    result = []
    result.append(precision(actuallabels[p_label], predictedlabels[p_label]))
    result.append(recall(actuallabels[p_label], predictedlabels[p_label]))
    result.append(f_measure(actuallabels[p_label], predictedlabels[p_label]))
    return result
def calcPrecRecallFMeasure(reference, prediction):
    # NLTK defines f_measure as 1 / (alpha/precision + (1-alpha)/recall), so
    # alpha=1.0 reduces to precision, alpha=0 to recall, and alpha=0.5 is the
    # balanced (harmonic-mean) F1 score.
    precision = nltk.f_measure(reference, prediction, alpha=1.0)
    recall = nltk.f_measure(reference, prediction, alpha=0)
    f_measure = nltk.f_measure(reference, prediction, alpha=0.5)
    return [precision, recall, f_measure]
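# A quick sanity check of the alpha trick above on hand-built index sets (the
# sets are made up for illustration): f_measure at alpha=1.0 matches
# nltk.precision, and alpha=0 matches nltk.recall.
import math

import nltk

reference = {0, 1, 2, 3}   # indices of items truly in the class
prediction = {2, 3, 4}     # indices the classifier assigned to the class

assert math.isclose(nltk.f_measure(reference, prediction, alpha=1.0),
                    nltk.precision(reference, prediction))   # both 2/3
assert math.isclose(nltk.f_measure(reference, prediction, alpha=0),
                    nltk.recall(reference, prediction))      # both 2/4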
print "Training..." classifier = svm.SVC(kernel="linear", decision_function_shape="ovr", probability=False) classifier.fit(train_set, train_label) print "Testing..." refsets = collections.defaultdict(set) testsets = collections.defaultdict(set) refsets[1] = set() refsets[-1] = set() refsets[0] = set() testsets[1] = set() testsets[-1] = set() testsets[0] = set() for i in range(len(test_set)): refsets[test_label[i]].add(i) observed = classifier.predict([test_set[i]]) testsets[observed[0]].add(i) print "Saving results..." results.write('pos precision:' + str(precision(refsets[1], testsets[1])) + "\n") results.write('pos recall:' + str(recall(refsets[1], testsets[1])) + "\n") results.write('pos F-measure:' + str(f_measure(refsets[1], testsets[1])) + "\n") results.write('neg precision:' + str(precision(refsets[-1], testsets[-1])) + "\n") results.write('neg recall:' + str(recall(refsets[-1], testsets[-1])) + "\n") results.write('neg F-measure:' + str(f_measure(refsets[-1], testsets[-1])) + "\n") results.close() end = time.time() print "Duration: ", end - start, " seconds" print "2-train.py done!"
def evaluate_classifier(featx):
    #negfeats = [(featx(mark_negation(f)), 'neg') for f in word_split(negdata)]
    #posfeats = [(featx(mark_negation(f)), 'pos') for f in word_split(posdata)]
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    # 75/25 train/test split per class (integer division, since the cutoffs
    # are used as slice indices)
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    print("No of training reviews:", len(trainfeats))
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    print("No of testing reviews:", len(testfeats))

    # using 3 classifiers
    classifier_list = ['nb', 'svm', 'maxent']

    NB_pred = []   # collects the predictions of all three classifiers, back to back
    new_label = []
    for cl in classifier_list:
        if cl == 'maxent':
            classifierName = 'Maximum Entropy'
            classifier = MaxentClassifier.train(trainfeats, 'GIS', trace=0,
                                                encoding=None, labels=None,
                                                gaussian_prior_sigma=0, max_iter=1)
        elif cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)
        else:
            classifierName = 'Naive Bayes'
            classifier = NaiveBayesClassifier.train(trainfeats)

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        original_label = []
        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            original_label.append(label)
            observed = classifier.classify(feats)
            NB_pred.append(observed)
            testsets[observed].add(i)

        new_label = original_label
        accuracy = nltk.classify.util.accuracy(classifier, testfeats)
        pos_precision = nltk.precision(refsets['pos'], testsets['pos'])
        pos_recall = nltk.recall(refsets['pos'], testsets['pos'])
        pos_fmeasure = nltk.f_measure(refsets['pos'], testsets['pos'])
        neg_precision = nltk.precision(refsets['neg'], testsets['neg'])
        neg_recall = nltk.recall(refsets['neg'], testsets['neg'])
        neg_fmeasure = nltk.f_measure(refsets['neg'], testsets['neg'])

        print('')
        print('---------------------------------------')
        print('SINGLE FOLD RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', accuracy)
        print('precision', (pos_precision + neg_precision) / 2)
        print('recall', (pos_recall + neg_recall) / 2)
        print('f-measure', (pos_fmeasure + neg_fmeasure) / 2)
        #classifier.show_most_informative_features(50)
        print('')

    # NB_pred now holds the NB, SVM and MaxEnt predictions concatenated in
    # that order; slice it apart (491 test reviews per classifier).
    ME_pred = NB_pred[982:]
    SVM_pred = NB_pred[491:982]
    NB_pred = NB_pred[0:491]

    # Pairwise agreement between the classifiers (disabled):
    #cm = confusion_matrix(SVM_pred, NB_pred)
    #print(cm)
    #print("The accuracy score is {:.2%}".format(accuracy_score(SVM_pred, NB_pred)))
    #cm = confusion_matrix(ME_pred, NB_pred)
    #print(cm)
    #print("The accuracy score is {:.2%}".format(accuracy_score(ME_pred, NB_pred)))
    #cm = confusion_matrix(SVM_pred, ME_pred)
    #print(cm)
    #print("The accuracy score is {:.2%}".format(accuracy_score(SVM_pred, ME_pred)))

    # Majority vote over the three classifiers
    final_pred = []
    for i in range(0, 491):
        c1 = 0
        if NB_pred[i] == 'pos':
            c1 = c1 + 1
        if ME_pred[i] == 'pos':
            c1 = c1 + 1
        if SVM_pred[i] == 'pos':
            c1 = c1 + 1
        if c1 == 3 or c1 == 2:
            final_pred.append('pos')
        else:
            final_pred.append('neg')

    print("-----------------------")
    print("Results of ensemble: NB + SVM + ME:")
    print("----------Confusion Matrix--------------")
    cm = confusion_matrix(final_pred, new_label)
    print(cm)
    print("")
    print("The accuracy score of ensemble is {:.2%}".format(
        accuracy_score(final_pred, new_label)))
    print("##############################################")

    ## CROSS VALIDATION
    trainfeats = negfeats + posfeats

    # SHUFFLE TRAIN SET
    # As in cross validation, the test chunk might have only negative or only positive data
    random.shuffle(trainfeats)
    n = 5  # 5-fold cross-validation

    for cl in classifier_list:
        subset_size = len(trainfeats) // n
        accuracy = []
        pos_precision = []
        pos_recall = []
        neg_precision = []
        neg_recall = []
        pos_fmeasure = []
        neg_fmeasure = []
        cv_count = 1
        for i in range(n):
            testing_this_round = trainfeats[i * subset_size:][:subset_size]
            training_this_round = trainfeats[:i * subset_size] + trainfeats[(i + 1) * subset_size:]

            if cl == 'maxent':
                classifierName = 'Maximum Entropy'
                classifier = MaxentClassifier.train(training_this_round, 'GIS', trace=0,
                                                    encoding=None, labels=None,
                                                    gaussian_prior_sigma=0, max_iter=1)
            elif cl == 'svm':
                classifierName = 'SVM'
                classifier = SklearnClassifier(LinearSVC(), sparse=False)
                classifier.train(training_this_round)
            else:
                classifierName = 'Naive Bayes'
                classifier = NaiveBayesClassifier.train(training_this_round)

            refsets = collections.defaultdict(set)
            testsets = collections.defaultdict(set)
            for i, (feats, label) in enumerate(testing_this_round):
                refsets[label].add(i)
                observed = classifier.classify(feats)
                testsets[observed].add(i)

            cv_accuracy = nltk.classify.util.accuracy(classifier, testing_this_round)
            cv_pos_precision = nltk.precision(refsets['pos'], testsets['pos'])
            cv_pos_recall = nltk.recall(refsets['pos'], testsets['pos'])
            cv_pos_fmeasure = nltk.f_measure(refsets['pos'], testsets['pos'])
            cv_neg_precision = nltk.precision(refsets['neg'], testsets['neg'])
            cv_neg_recall = nltk.recall(refsets['neg'], testsets['neg'])
            cv_neg_fmeasure = nltk.f_measure(refsets['neg'], testsets['neg'])

            accuracy.append(cv_accuracy)
            pos_precision.append(cv_pos_precision)
            pos_recall.append(cv_pos_recall)
            neg_precision.append(cv_neg_precision)
            neg_recall.append(cv_neg_recall)
            pos_fmeasure.append(cv_pos_fmeasure)
            neg_fmeasure.append(cv_neg_fmeasure)
            cv_count += 1

        print('---------------------------------------')
        print('N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', sum(accuracy) / n)
        print('precision', (sum(pos_precision) / n + sum(neg_precision) / n) / 2)
        print('recall', (sum(pos_recall) / n + sum(neg_recall) / n) / 2)
        print('f-measure', (sum(pos_fmeasure) / n + sum(neg_fmeasure) / n) / 2)

        if cl == 'maxent':
            maxent_accuracy.append(sum(accuracy) / n)
        elif cl == 'svm':
            svm_accuracy.append(sum(accuracy) / n)
        else:
            nb_accuracy.append(sum(accuracy) / n)
# Score the held-out split and the external test data with the same pattern.
for i, (feats, label) in enumerate(test_features):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)

for i, (feats, label) in enumerate(testdata_features):
    refdatasets[label].add(i)
    observed = classifier.classify(feats)
    testdatasets[observed].add(i)

accuracy_scores.append(util.accuracy(classifier, test_features))
accuracy_data_scores.append(util.accuracy(classifier, testdata_features))

print('train: {} test: {}'.format(len(train_set), len(test_set)))
print('=================== Results ===================')
print('Accuracy {:f}'.format(accuracy_scores[-1]))
print('           Positive    Negative')
print('F1        [{:f}    {:f}]'.format(
    f_measure(refsets['pos'], testsets['pos']),
    f_measure(refsets['neg'], testsets['neg'])))
print('Precision [{:f}    {:f}]'.format(
    precision(refsets['pos'], testsets['pos']),
    precision(refsets['neg'], testsets['neg'])))
print('Recall    [{:f}    {:f}]'.format(
    recall(refsets['pos'], testsets['pos']),
    recall(refsets['neg'], testsets['neg'])))
print('===============================================\n')

print('testData: {}'.format(len(testSentences)))
print('=================== Results ===================')
print('Accuracy TestData {:f}'.format(accuracy_data_scores[-1]))
print('F1        [{:f}    {:f}]'.format(
    f_measure(refdatasets['pos'], testdatasets['pos']),
    f_measure(refdatasets['neg'], testdatasets['neg'])))
print('Precision [{:f}    {:f}]'.format(
    precision(refdatasets['pos'], testdatasets['pos']),
    precision(refdatasets['neg'], testdatasets['neg'])))
test_tweets = BuildFeatureVector(all_tweet_array[training_size:])
print(len(test_tweets))

training_set = nltk.classify.apply_features(extract_features, train_tweets)
test_set = nltk.classify.apply_features(extract_features, test_tweets)

NBClassifier = nltk.NaiveBayesClassifier.train(training_set)
NBClassifier.show_most_informative_features(20)

# Presumably fills the global refSet/testSet dicts (and the refSetF/testSetF
# label lists) used below.
TestSet(all_tweet_array[training_size:])

print('')
print('TRAINING accuracy:', nltk.classify.accuracy(NBClassifier, training_set))
print('TEST accuracy:', nltk.classify.accuracy(NBClassifier, test_set))
print('')
print('NEU precision:', precision(refSet['NEU'], testSet['NEU']))
print('NEU recall:', recall(refSet['NEU'], testSet['NEU']))
print('NEU F-measure:', f_measure(refSet['NEU'], testSet['NEU']))
print('')
print('POS precision:', precision(refSet['POZ'], testSet['POZ']))
print('POS recall:', recall(refSet['POZ'], testSet['POZ']))
print('POS F-measure:', f_measure(refSet['POZ'], testSet['POZ']))
print('')
print('NEG precision:', precision(refSet['NEG'], testSet['NEG']))
print('NEG recall:', recall(refSet['NEG'], testSet['NEG']))
print('NEG F-measure:', f_measure(refSet['NEG'], testSet['NEG']))
print('')
print(ConfusionMatrix(refSetF, testSetF))
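# ConfusionMatrix above comes from nltk.metrics and, unlike precision/recall
# (which take index sets), compares two equal-length label lists: gold versus
# predicted. A small illustration with made-up labels:
from nltk.metrics import ConfusionMatrix

gold = ['NEU', 'POZ', 'NEG', 'POZ']
pred = ['NEU', 'NEG', 'NEG', 'POZ']
print(ConfusionMatrix(gold, pred))  # rows = reference, columns = test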
def do_test():
    # global classifier
    print('Start training')
    k_splits = 10
    response = {
        'type': 1,
        'status': 'start',
        'step': 0,
        'max_step': k_splits,
        'trainset': 0,
        'testset': 0
    }
    socketio.emit('test_result', response, namespace='/test')
    socketio.sleep(.1)

    data = sentiment.prepare_data()

    from sklearn.model_selection import KFold
    import numpy as np
    import collections

    k_fold = KFold(n_splits=k_splits, random_state=1992, shuffle=True)
    featuresets = np.array(data)
    accuracy_scores = []
    index = 0
    for train_set, test_set in k_fold.split(featuresets):
        index += 1
        sentiment.word_features = sentiment.get_word_features(
            sentiment.get_words_in_reviews(featuresets[train_set].tolist()))
        train_features = nltk.classify.apply_features(
            sentiment.extract_features, featuresets[train_set].tolist())
        test_features = nltk.classify.apply_features(
            sentiment.extract_features, featuresets[test_set].tolist())
        classifier = nltk.NaiveBayesClassifier.train(train_features)

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for i, (feats, label) in enumerate(test_features):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        accuracy_scores.append(
            nltk.classify.util.accuracy(classifier, test_features))
        f1_pos = nltk.f_measure(refsets['pos'], testsets['pos'])
        f1_neg = nltk.f_measure(refsets['neg'], testsets['neg'])
        pre_pos = nltk.precision(refsets['pos'], testsets['pos'])
        pre_neg = nltk.precision(refsets['neg'], testsets['neg'])
        re_pos = nltk.recall(refsets['pos'], testsets['pos'])
        re_neg = nltk.recall(refsets['neg'], testsets['neg'])

        response = {
            'type': 1,
            'status': 'progress',
            'step': index,
            'max_step': k_splits,
            'trainset': len(train_set),
            'testset': len(test_set),
            'accuracy': accuracy_scores[-1],
            'f1_pos': f1_pos,
            'f1_neg': f1_neg,
            'pre_pos': pre_pos,
            'pre_neg': pre_neg,
            're_pos': re_pos,
            're_neg': re_neg
        }
        socketio.emit('test_result', response, namespace='/test')
        socketio.sleep(.1)

    print("Success training")
    response = {
        'type': 1,
        'status': 'success',
        'step': index,
        'max_step': k_splits,
        'trainset': len(train_set),
        'testset': len(test_set)
    }
    socketio.emit('test_result', response, namespace='/test')
    socketio.sleep(.1)
    for w in token_features:
        features[w] = (w in words)
    return features


featuresets = [(extract_features(rev), category) for (rev, category) in document]
training_set = featuresets[:1500]
testing_set = featuresets[1500:]  # held-out split, disjoint from the training data

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Classifier accuracy percent:", (nltk.classify.accuracy(classifier, testing_set)) * 100)
# print(classifier.show_most_informative_features())

refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (feats, label) in enumerate(testing_set):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)

print("Pos Precision:", (nltk.precision(refsets['pos'], testsets['pos'])))
print("Pos Recall:", (nltk.recall(refsets['pos'], testsets['pos'])))
print("Pos F-Score:", (nltk.f_measure(refsets['pos'], testsets['pos'])))
print("Neg Precision:", (nltk.precision(refsets['neg'], testsets['neg'])))
print("Neg Recall:", (nltk.recall(refsets['neg'], testsets['neg'])))
print("Neg F-Score:", (nltk.f_measure(refsets['neg'], testsets['neg'])))
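# The loop opening the snippet above is the tail of a bag-of-words feature
# extractor. A minimal complete sketch of the same idea, with hypothetical
# names (token_features standing in for the global vocabulary list):
def extract_features_sketch(document_words):
    words = set(document_words)     # words present in this document
    features = {}
    for w in token_features:        # global vocabulary, assumed defined elsewhere
        features[w] = (w in words)  # boolean presence feature
    return features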
def evaluate_classifier(featx):
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifierName = 'SVM'
    classifier = SklearnClassifier(LinearSVC(), sparse=False).train(trainfeats)

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    accuracy = nltk.classify.util.accuracy(classifier, testfeats)
    pos_precision = nltk.precision(refsets['pos'], testsets['pos'])
    pos_recall = nltk.recall(refsets['pos'], testsets['pos'])
    pos_fmeasure = nltk.f_measure(refsets['pos'], testsets['pos'])
    neg_precision = nltk.precision(refsets['neg'], testsets['neg'])
    neg_recall = nltk.recall(refsets['neg'], testsets['neg'])
    neg_fmeasure = nltk.f_measure(refsets['neg'], testsets['neg'])

    print('')
    print('---------------------------------------')
    print('SINGLE FOLD RESULT ' + '(' + classifierName + ')')
    print('---------------------------------------')
    print('accuracy:', accuracy)
    print('precision', (pos_precision + neg_precision) / 2)
    print('recall', (pos_recall + neg_recall) / 2)
    print('f-measure', (pos_fmeasure + neg_fmeasure) / 2)
    #classifier.show_most_informative_features()
    print('')

    ## CROSS VALIDATION
    trainfeats = negfeats + posfeats

    # SHUFFLE TRAIN SET
    # As in cross validation, the test chunk might have only negative or only positive data
    random.shuffle(trainfeats)
    n = 5  # 5-fold cross-validation
    subset_size = int(len(trainfeats) / n)

    accuracy = []
    pos_precision = []
    pos_recall = []
    neg_precision = []
    neg_recall = []
    pos_fmeasure = []
    neg_fmeasure = []
    cv_count = 1
    for i in range(n):
        testing_this_round = trainfeats[i * subset_size:][:subset_size]
        training_this_round = trainfeats[:i * subset_size] + trainfeats[(i + 1) * subset_size:]

        classifierName = 'SVM'
        classifier = SklearnClassifier(LinearSVC(), sparse=False)
        classifier.train(training_this_round)

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for i, (feats, label) in enumerate(testing_this_round):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        cv_accuracy = nltk.classify.util.accuracy(classifier, testing_this_round)
        cv_pos_precision = nltk.precision(refsets['pos'], testsets['pos'])
        cv_pos_recall = nltk.recall(refsets['pos'], testsets['pos'])
        cv_pos_fmeasure = nltk.f_measure(refsets['pos'], testsets['pos'])
        cv_neg_precision = nltk.precision(refsets['neg'], testsets['neg'])
        cv_neg_recall = nltk.recall(refsets['neg'], testsets['neg'])
        cv_neg_fmeasure = nltk.f_measure(refsets['neg'], testsets['neg'])

        accuracy.append(cv_accuracy)
        pos_precision.append(cv_pos_precision)
        pos_recall.append(cv_pos_recall)
        neg_precision.append(cv_neg_precision)
        neg_recall.append(cv_neg_recall)
        pos_fmeasure.append(cv_pos_fmeasure)
        neg_fmeasure.append(cv_neg_fmeasure)
        cv_count += 1

    print('---------------------------------------')
    print('N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')')
    print('---------------------------------------')
    print('accuracy:', sum(accuracy) / n)
    print('precision', (sum(pos_precision) / n + sum(neg_precision) / n) / 2)
    print('recall', (sum(pos_recall) / n + sum(neg_recall) / n) / 2)
    print('f-measure', (sum(pos_fmeasure) / n + sum(neg_fmeasure) / n) / 2)
    print('')
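# One pitfall in fold averages like the ones above: nltk.precision, recall and
# f_measure return None when a class's reference or prediction set is empty,
# and sum() over a list containing None raises TypeError. A small sketch of a
# None-tolerant average (the helper name is ours, not from the snippet):
def mean_ignoring_none(scores):
    valid = [s for s in scores if s is not None]
    return sum(valid) / len(valid) if valid else None

print(mean_ignoring_none([0.8, None, 0.9]))  # ~0.85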
refsets['neg'] = set()
refsets['neu'] = set()
testsets['pos'] = set()
testsets['neg'] = set()
testsets['neu'] = set()

print("Testing...")
for i, (feats, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = NBClassifier.classify(feats)
    testsets[observed].add(i)

print("Saving results...")
results.write('pos precision:' + str(precision(refsets["pos"], testsets["pos"])) + "\n")
results.write('pos recall:' + str(recall(refsets["pos"], testsets["pos"])) + "\n")
results.write('pos F-measure:' + str(f_measure(refsets["pos"], testsets["pos"])) + "\n")
results.write('neg precision:' + str(precision(refsets["neg"], testsets["neg"])) + "\n")
results.write('neg recall:' + str(recall(refsets["neg"], testsets["neg"])) + "\n")
results.write('neg F-measure:' + str(f_measure(refsets["neg"], testsets["neg"])) + "\n")
# Use the same 'neu' key as the sets initialised above.
results.write('neu precision:' + str(precision(refsets["neu"], testsets["neu"])) + "\n")
results.write('neu recall:' + str(recall(refsets["neu"], testsets["neu"])) + "\n")
results.write('neu F-measure:' + str(f_measure(refsets["neu"], testsets["neu"])) + "\n")

results.write("\nMost informative features:\n")
mif = NBClassifier.most_informative_features()
for f in mif:
    results.write(str(f) + "\n")
results.close()

end = time.time()
print("Duration: ", end - start, " seconds")
from functools import reduce
from operator import concat


def author_beng_nbc():
    # 1st Set
    bankc = open("/python27/Bankim500_1.txt", "r").read()
    bankw = bankc.split()
    # Interleave the author tag with the text: [w0, 'bankim', w1, 'bankim', ...]
    bankz = reduce(concat, [['bankim', x] for x in bankw[1:]], bankw[0:1])
    it = iter(bankz)
    bankt = list(zip(it, it))  # (text, 'bankim') pairs

    # 2nd Set
    bibhuc = open("/python27/Bibhuti500_1.txt", "r").read()
    bibhuw = bibhuc.split()
    bibhuz = reduce(concat, [['bibhuti', x] for x in bibhuw[1:]], bibhuw[0:1])
    it1 = iter(bibhuz)
    bibhut = list(zip(it1, it1))

    # 3rd Set
    rabindrac = open("/python27/Rabindra500_1.txt", "r").read()
    rabindraw = rabindrac.split()
    rabindraz = reduce(concat, [['rabindra', x] for x in rabindraw[1:]], rabindraw[0:1])
    it2 = iter(rabindraz)
    rabindrat = list(zip(it2, it2))

    # 4th Set
    saratc = open("/python27/Sarat500_1.txt", "r").read()
    saratw = saratc.split()
    saratz = reduce(concat, [['sarat', x] for x in saratw[1:]], saratw[0:1])
    it3 = iter(saratz)
    saratt = list(zip(it3, it3))

    training_data = bankt + bibhut + rabindrat + saratt

    vocabulary = set(chain(*[word_tokenize(i[0].lower()) for i in training_data]))
    feature_set = [
        ({i: (i in word_tokenize(sentence.lower())) for i in vocabulary}, tag)
        for sentence, tag in training_data
    ]

    from nltk.classify import NaiveBayesClassifier as nbc
    train_set, test_set = feature_set[:300], feature_set[300:]
    print(len(train_set))
    print(len(test_set))
    classifier = nbc.train(train_set)

    test_sentence = "আলীপুরের উকিল বিশেষ কিছু হয় বলিয়া মনে হয় না বালিগঞ্জের ওদিকে কোথায় একটা টিউশনি আছে"
    featurized_test_sentence = {
        i: (i in word_tokenize(test_sentence.lower())) for i in vocabulary
    }
    print("test_sent:", test_sentence)
    print("tag:", classifier.classify(featurized_test_sentence))

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(test_set):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    print('bankim precision:', nltk.precision(refsets['bankim'], testsets['bankim']))
    print('bankim recall:', nltk.recall(refsets['bankim'], testsets['bankim']))
    print('bankim F-measure:', nltk.f_measure(refsets['bankim'], testsets['bankim']))
    print('bibhuti precision:', nltk.precision(refsets['bibhuti'], testsets['bibhuti']))
    print('bibhuti recall:', nltk.recall(refsets['bibhuti'], testsets['bibhuti']))
    print('bibhuti F-measure:', nltk.f_measure(refsets['bibhuti'], testsets['bibhuti']))
    print('rabindra precision:', nltk.precision(refsets['rabindra'], testsets['rabindra']))
    print('rabindra recall:', nltk.recall(refsets['rabindra'], testsets['rabindra']))
    print('rabindra F-measure:', nltk.f_measure(refsets['rabindra'], testsets['rabindra']))
    print('sarat precision:', nltk.precision(refsets['sarat'], testsets['sarat']))
    print('sarat recall:', nltk.recall(refsets['sarat'], testsets['sarat']))
    print('sarat F-measure:', nltk.f_measure(refsets['sarat'], testsets['sarat']))
tweets = neg_tweets[:cutoff] + pos_tweets[:cutoff]
test_tweets = neg_tweets[cutoff:] + pos_tweets[cutoff:]

all_words = []
words_frequency = []
print(tweets)

# Get all the words
for (words, sentiment) in tweets:
    all_words.extend(words)

# Extract the features
wordlist = nltk.FreqDist(all_words)
word_features = wordlist.keys()

training_set = nltk.classify.apply_features(extract_features, tweets)
classifier = NaiveBayesClassifier.train(training_set)

refsets = {'pos': set(), 'neg': set()}
testsets = {'pos': set(), 'neg': set()}
classifier.show_most_informative_features()

for i, (feats, label) in enumerate(test_tweets):
    refsets[label].add(i)
    testsets[classifier.classify(extract_features(feats))].add(i)

print('pos precision:', nltk.precision(refsets['pos'], testsets['pos']))
print('pos recall:', nltk.recall(refsets['pos'], testsets['pos']))
print('pos F-measure:', nltk.f_measure(refsets['pos'], testsets['pos']))
print('neg precision:', nltk.precision(refsets['neg'], testsets['neg']))
print('neg recall:', nltk.recall(refsets['neg'], testsets['neg']))
print('neg F-measure:', nltk.f_measure(refsets['neg'], testsets['neg']))
def evaluate_mult_classifiers(feature_x, n_folds=5):
    # 5-fold default for cross-validation
    # train_feats = 75% of pos_data + 75% of neg_data
    # test_feats  = 25% of pos_data + 25% of neg_data
    neg_feats = [(feature_x(i), 'neg') for i in word_split(neg_data)]
    pos_feats = [(feature_x(i), 'pos') for i in word_split(pos_data)]
    neg_cutoff = int(len(neg_feats) * 0.75)
    pos_cutoff = int(len(pos_feats) * 0.75)
    train_feats = neg_feats[:neg_cutoff] + pos_feats[:pos_cutoff]
    test_feats = neg_feats[neg_cutoff:] + pos_feats[pos_cutoff:]

    classifier_list = ['NB', 'SVM']

    ## CROSS VALIDATION
    train_feats = neg_feats + pos_feats

    # Shuffle training set
    random.shuffle(train_feats)

    for cl in classifier_list:
        subset_size = int(len(train_feats) / n_folds)
        accuracy = []
        pos_precision = []
        pos_recall = []
        neg_precision = []
        neg_recall = []
        pos_fmeasure = []
        neg_fmeasure = []
        cv_count = 1
        print('--------------------------')
        print('Beginning Cross-validation')
        print('--------------------------')
        for i in range(n_folds):
            testing_this_round = train_feats[i * subset_size:][:subset_size]
            training_this_round = train_feats[:i * subset_size] + train_feats[(i + 1) * subset_size:]

            if cl == 'NB':
                classifierName = 'Naive Bayes'
                # Using NLTK NaiveBayesClassifier
                classifier = NaiveBayesClassifier.train(training_this_round)
            else:
                classifierName = 'SVM'
                classifier = SklearnClassifier(LinearSVC(), sparse=False)
                classifier.train(training_this_round)

            ref_sets = collections.defaultdict(set)
            test_sets = collections.defaultdict(set)
            for i, (feats, label) in enumerate(testing_this_round):
                ref_sets[label].add(i)
                observed = classifier.classify(feats)
                test_sets[observed].add(i)

            cv_accuracy = nltk.classify.util.accuracy(classifier, testing_this_round)
            cv_pos_precision = nltk.precision(ref_sets['pos'], test_sets['pos'])
            cv_pos_recall = nltk.recall(ref_sets['pos'], test_sets['pos'])
            cv_pos_fmeasure = nltk.f_measure(ref_sets['pos'], test_sets['pos'])
            cv_neg_precision = nltk.precision(ref_sets['neg'], test_sets['neg'])
            cv_neg_recall = nltk.recall(ref_sets['neg'], test_sets['neg'])
            cv_neg_fmeasure = nltk.f_measure(ref_sets['neg'], test_sets['neg'])

            print('Fold: {} Acc : {:.4F}'.format(cv_count, cv_accuracy))
            print('Fold: {} pos_prec  : {:.4F} neg_prec  : {:.4F}'.format(
                cv_count, cv_pos_precision, cv_neg_precision))
            print('Fold: {} pos_recall: {:.4F} neg_recall: {:.4F}'.format(
                cv_count, cv_pos_recall, cv_neg_recall))
            print('Fold: {} pos_fmeas : {:.4F} neg_fmeas : {:.4F}'.format(
                cv_count, cv_pos_fmeasure, cv_neg_fmeasure))
            print('--')

            accuracy.append(cv_accuracy)
            pos_precision.append(cv_pos_precision)
            pos_recall.append(cv_pos_recall)
            neg_precision.append(cv_neg_precision)
            neg_recall.append(cv_neg_recall)
            pos_fmeasure.append(cv_pos_fmeasure)
            neg_fmeasure.append(cv_neg_fmeasure)
            cv_count += 1

        print('----------------------------------------------------------')
        print('{}-Fold Cross Validation results for {} Classifier'.format(
            n_folds, classifierName))
        print('----------------------------------------------------------')
        print('accuracy : {:.4F}'.format(sum(accuracy) / n_folds))
        print('precision: {:.4F}'.format(
            (sum(pos_precision) / n_folds + sum(neg_precision) / n_folds) / 2))
        print('recall   : {:.4F}'.format(
            (sum(pos_recall) / n_folds + sum(neg_recall) / n_folds) / 2))
        print('f-measure: {:.4F}'.format(
            (sum(pos_fmeasure) / n_folds + sum(neg_fmeasure) / n_folds) / 2))
        print('\n')
NBayesClassifier = nltk.NaiveBayesClassifier.train(trainFeat)
#NBResultLabels = [NBayesClassifier.classify(extract_features(tweet[0])) for tweet in testData]

print("Accuracy : " + str(nltk.classify.util.accuracy(NBayesClassifier, testFeat) * 100) + " %")

refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (feats, label) in enumerate(testFeat):
    refsets[label].add(i)
    observed = NBayesClassifier.classify(feats)
    testsets[observed].add(i)

print('sarcasm precision : ' + str(precision(refsets['sarcasm'], testsets['sarcasm']) * 100) + " %")
print('sarcasm recall : ' + str(recall(refsets['sarcasm'], testsets['sarcasm']) * 100) + " %")
print('sarcasm F-measure : ' + str(f_measure(refsets['sarcasm'], testsets['sarcasm']) * 100) + " %")
print('non-sarcasm precision : ' + str(precision(refsets['non-sarcasm'], testsets['non-sarcasm']) * 100) + " %")
print('non-sarcasm recall : ' + str(recall(refsets['non-sarcasm'], testsets['non-sarcasm']) * 100) + " %")
print('non-sarcasm F-measure : ' + str(f_measure(refsets['non-sarcasm'], testsets['non-sarcasm']) * 100) + " %")

#NBayesClassifier.show_most_informative_features(100)
#print(NBResultLabels)
#if NBResultLabels.count('sarcasm') > NBResultLabels.count('non-sarcasm'):
#    print("NB Result Sarcastic Sentiment\t\t:" + str(100 * NBResultLabels.count('sarcasm') / len(NBResultLabels)) + "%")
#else:
#    print("NB Result Non-Sarcastic Sentiment\t:" + str(100 * NBResultLabels.count('non-sarcasm') / len(NBResultLabels)) + "%")