import collections

import nltk
from nltk.metrics import f_measure, precision, recall


def evaluate_classifier(classifier, validationFeatures):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(validationFeatures):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    accuracy = nltk.classify.util.accuracy(classifier, validationFeatures)
    pos_precision = precision(refsets['POSITIVE_TIME'], testsets['POSITIVE_TIME'])
    pos_recall = recall(refsets['POSITIVE_TIME'], testsets['POSITIVE_TIME'])
    pos_f_measure = f_measure(refsets['POSITIVE_TIME'], testsets['POSITIVE_TIME'])
    neg_precision = precision(refsets['NEGATIVE_TIME'], testsets['NEGATIVE_TIME'])
    neg_recall = recall(refsets['NEGATIVE_TIME'], testsets['NEGATIVE_TIME'])
    neg_f_measure = f_measure(refsets['NEGATIVE_TIME'], testsets['NEGATIVE_TIME'])

    print('accuracy:', accuracy)
    print('pos precision:', pos_precision)
    print('pos recall:', pos_recall)
    print('pos f-measure:', pos_f_measure)
    print('neg precision:', neg_precision)
    print('neg recall:', neg_recall)
    print('neg f-measure:', neg_f_measure)

    return {
        'accuracy': accuracy,
        'pos precision': pos_precision,
        'pos recall': pos_recall,
        'pos f-measure': pos_f_measure,
        'neg precision': neg_precision,
        'neg recall': neg_recall,
        'neg f-measure': neg_f_measure,
    }
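The refsets/testsets bookkeeping above is the standard NLTK evaluation recipe: number each example, collect the indices by gold label and by predicted label, then compare the two index sets. A minimal self-contained sketch, with a made-up four-item dataset:

import collections

from nltk.metrics import f_measure, precision, recall

gold = ['pos', 'pos', 'neg', 'neg']   # reference labels
pred = ['pos', 'neg', 'neg', 'neg']   # classifier output

refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (g, p) in enumerate(zip(gold, pred)):
    refsets[g].add(i)    # indices grouped by gold label
    testsets[p].add(i)   # indices grouped by predicted label

print('pos precision:', precision(refsets['pos'], testsets['pos']))  # 1.0
print('pos recall:', recall(refsets['pos'], testsets['pos']))        # 0.5
print('pos f-measure:', f_measure(refsets['pos'], testsets['pos']))  # ~0.67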
Example 2
def get_accuracy_measures(classifier, testing_data, p_label):
    actuallabels = collections.defaultdict(set)
    predictedlabels = collections.defaultdict(set)

    for i, (tokens, label) in enumerate(testing_data):
        actuallabels[label].add(i)
        predicted = classifier.classify(tokens)
        predictedlabels[predicted].add(i)
        
    result = []
    result.append(precision(actuallabels[p_label], predictedlabels[p_label]))
    result.append(recall(actuallabels[p_label], predictedlabels[p_label]))
    result.append(f_measure(actuallabels[p_label], predictedlabels[p_label]))
    return result
Example 3
def calcPrecRecallFMeasure(reference, prediction):
    # NLTK defines F = 1 / (alpha/precision + (1 - alpha)/recall), so
    # alpha=1.0 reduces to precision and alpha=0 reduces to recall.
    precision = nltk.f_measure(reference, prediction, alpha=1.0)
    recall = nltk.f_measure(reference, prediction, alpha=0)
    f_measure = nltk.f_measure(reference, prediction, alpha=0.5)  # balanced F1
    return [precision, recall, f_measure]
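A quick self-contained check of that identity on toy sets:

import math

from nltk.metrics import f_measure, precision, recall

ref, pred = {0, 1, 2}, {1, 2, 3}   # two of three items agree
assert math.isclose(f_measure(ref, pred, alpha=1.0), precision(ref, pred))
assert math.isclose(f_measure(ref, pred, alpha=0), recall(ref, pred))
print(f_measure(ref, pred, alpha=0.5))   # balanced F1, here 2/3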
Example 4
  print "Training..."
  classifier = svm.SVC(kernel="linear", decision_function_shape="ovr", probability=False)
  classifier.fit(train_set, train_label)
  
  print "Testing..."
  # defaultdict(set) creates missing keys on first access, so the three
  # classes (1, -1, 0) need no explicit initialisation.
  refsets = collections.defaultdict(set)
  testsets = collections.defaultdict(set)
  for i in range(len(test_set)):
    refsets[test_label[i]].add(i)
    observed = classifier.predict([test_set[i]])
    testsets[observed[0]].add(i)

  print "Saving results..."
  results.write('pos precision:' + str(precision(refsets[1], testsets[1])) + "\n")
  results.write('pos recall:' + str(recall(refsets[1], testsets[1])) + "\n")
  results.write('pos F-measure:' + str(f_measure(refsets[1], testsets[1])) + "\n")
  results.write('neg precision:' + str(precision(refsets[-1], testsets[-1])) + "\n")
  results.write('neg recall:' + str(recall(refsets[-1], testsets[-1])) + "\n")
  results.write('neg F-measure:' + str(f_measure(refsets[-1], testsets[-1])) + "\n")
  
results.close()
end = time.time()
print "Duration: ", end - start, " seconds"
print "2-train.py done!"
Example 5
def evaluate_classifier(featx):

    # Alternative: mark negated contexts before extracting features.
    #negfeats = [(featx(mark_negation(f)), 'neg') for f in word_split(negdata)]
    #posfeats = [(featx(mark_negation(f)), 'pos') for f in word_split(posdata)]
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]
    # Integer division: a float cutoff would break the slicing below.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    print("No of training reviews:", len(trainfeats))
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    print("No of testing reviews:", len(testfeats))

    # Train and evaluate three classifiers on the same split.
    classifier_list = ['nb', 'svm', 'maxent']
    NB_pred = []
    new_label = []
    for cl in classifier_list:
        if cl == 'maxent':
            classifierName = 'Maximum Entropy'
            classifier = MaxentClassifier.train(trainfeats,
                                                'GIS',
                                                trace=0,
                                                encoding=None,
                                                labels=None,
                                                gaussian_prior_sigma=0,
                                                max_iter=1)
        elif cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)
        else:
            classifierName = 'Naive Bayes'
            classifier = NaiveBayesClassifier.train(trainfeats)

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        original_label = []

        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            original_label.append(label)
            observed = classifier.classify(feats)
            NB_pred.append(observed)
            testsets[observed].add(i)

        new_label = original_label
        accuracy = nltk.classify.util.accuracy(classifier, testfeats)
        pos_precision = nltk.precision(refsets['pos'], testsets['pos'])
        pos_recall = nltk.recall(refsets['pos'], testsets['pos'])
        pos_fmeasure = nltk.f_measure(refsets['pos'], testsets['pos'])
        neg_precision = nltk.precision(refsets['neg'], testsets['neg'])
        neg_recall = nltk.recall(refsets['neg'], testsets['neg'])
        neg_fmeasure = nltk.f_measure(refsets['neg'], testsets['neg'])

        print('')
        print('---------------------------------------')
        print('SINGLE FOLD RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', accuracy)
        print('precision', (pos_precision + neg_precision) / 2)
        print('recall', (pos_recall + neg_recall) / 2)
        print('f-measure', (pos_fmeasure + neg_fmeasure) / 2)

        #classifier.show_most_informative_features(50)

    print('')


    # NB_pred accumulated the predictions of all three classifiers in order
    # (NB, SVM, MaxEnt); split it back into one list per classifier.
    # The offsets assume a test set of exactly 491 reviews.
    ME_pred = NB_pred[982:]
    SVM_pred = NB_pred[491:982]
    NB_pred = NB_pred[0:491]

    # Majority vote over the three classifiers (a generic version follows
    # this example).
    final_pred = []
    for i in range(0, 491):
        c1 = 0
        if NB_pred[i] == 'pos':
            c1 = c1 + 1
        if ME_pred[i] == 'pos':
            c1 = c1 + 1
        if SVM_pred[i] == 'pos':
            c1 = c1 + 1
        if c1 >= 2:
            final_pred.append('pos')
        else:
            final_pred.append('neg')

    print "-----------------------"
    #print final_pred
    print "-----------------------"
    #print new_label

    print "Results of ensemble: NB + SVM + ME::"
    print "----------Confusion Matrix--------------"
    cm = confusion_matrix(final_pred, new_label)
    print cm
    print ""
    print "The accuracy score of ensemble is {:.2%}".format(
        accuracy_score(final_pred, new_label))
    print "##############################################"

    ## CROSS VALIDATION

    trainfeats = negfeats + posfeats

    # Shuffle the training set so that no cross-validation fold ends up
    # with only positive or only negative reviews.
    random.shuffle(trainfeats)
    n = 5  # 5-fold cross-validation

    for cl in classifier_list:

        subset_size = len(trainfeats) // n
        accuracy = []
        pos_precision = []
        pos_recall = []
        neg_precision = []
        neg_recall = []
        pos_fmeasure = []
        neg_fmeasure = []
        cv_count = 1
        for i in range(n):
            testing_this_round = trainfeats[i * subset_size:][:subset_size]
            training_this_round = trainfeats[:i * subset_size] + trainfeats[
                (i + 1) * subset_size:]

            if cl == 'maxent':
                classifierName = 'Maximum Entropy'
                classifier = MaxentClassifier.train(training_this_round,
                                                    'GIS',
                                                    trace=0,
                                                    encoding=None,
                                                    labels=None,
                                                    gaussian_prior_sigma=0,
                                                    max_iter=1)
            elif cl == 'svm':
                classifierName = 'SVM'
                classifier = SklearnClassifier(LinearSVC(), sparse=False)
                classifier.train(training_this_round)
            else:
                classifierName = 'Naive Bayes'
                classifier = NaiveBayesClassifier.train(training_this_round)

            refsets = collections.defaultdict(set)
            testsets = collections.defaultdict(set)
            # j avoids shadowing the fold index i.
            for j, (feats, label) in enumerate(testing_this_round):
                refsets[label].add(j)
                observed = classifier.classify(feats)
                testsets[observed].add(j)

            cv_accuracy = nltk.classify.util.accuracy(classifier,
                                                      testing_this_round)
            cv_pos_precision = nltk.precision(refsets['pos'], testsets['pos'])
            cv_pos_recall = nltk.recall(refsets['pos'], testsets['pos'])
            cv_pos_fmeasure = nltk.f_measure(refsets['pos'], testsets['pos'])
            cv_neg_precision = nltk.precision(refsets['neg'], testsets['neg'])
            cv_neg_recall = nltk.recall(refsets['neg'], testsets['neg'])
            cv_neg_fmeasure = nltk.f_measure(refsets['neg'], testsets['neg'])

            accuracy.append(cv_accuracy)
            pos_precision.append(cv_pos_precision)
            pos_recall.append(cv_pos_recall)
            neg_precision.append(cv_neg_precision)
            neg_recall.append(cv_neg_recall)
            pos_fmeasure.append(cv_pos_fmeasure)
            neg_fmeasure.append(cv_neg_fmeasure)

            cv_count += 1

        print('---------------------------------------')
        print('N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', sum(accuracy) / n)
        print('precision',
              (sum(pos_precision) / n + sum(neg_precision) / n) / 2)
        print('recall', (sum(pos_recall) / n + sum(neg_recall) / n) / 2)
        print('f-measure', (sum(pos_fmeasure) / n + sum(neg_fmeasure) / n) / 2)
        # maxent_accuracy, svm_accuracy and nb_accuracy are assumed to be
        # module-level lists defined outside this snippet.
        if cl == 'maxent':
            maxent_accuracy.append(sum(accuracy) / n)
        elif cl == 'svm':
            svm_accuracy.append(sum(accuracy) / n)
        else:
            nb_accuracy.append(sum(accuracy) / n)
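The hard-coded 491/982 offsets and the three-way vote generalise to any number of classifiers and any test-set size. A self-contained sketch using collections.Counter; per_classifier_preds is a list of equal-length prediction lists, one per classifier:

from collections import Counter

def majority_vote(per_classifier_preds):
    # Most common label at each position across the classifiers.
    return [Counter(votes).most_common(1)[0][0]
            for votes in zip(*per_classifier_preds)]

# e.g. final_pred = majority_vote([NB_pred, SVM_pred, ME_pred])
print(majority_vote([['pos', 'neg'], ['pos', 'pos'], ['neg', 'neg']]))  # ['pos', 'neg']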
Example 6
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    for i, (feats, label) in enumerate(testdata_features):
        refdatasets[label].add(i)
        observed = classifier.classify(feats)
        testdatasets[observed].add(i)

    accuracy_scores.append(util.accuracy(classifier, test_features))
    accuracy_data_scores.append(util.accuracy(classifier, testdata_features))
    print('train: {} test: {}'.format(len(train_set), len(test_set)))
    print('=================== Results ===================')
    print('Accuracy {:f}'.format(accuracy_scores[-1]))
    print('            Positive     Negative')
    print('F1         [{:f}     {:f}]'.format(
        f_measure(refsets['pos'], testsets['pos']),
        f_measure(refsets['neg'], testsets['neg'])))
    print('Precision  [{:f}     {:f}]'.format(
        precision(refsets['pos'], testsets['pos']),
        precision(refsets['neg'], testsets['neg'])))
    print('Recall     [{:f}     {:f}]'.format(
        recall(refsets['pos'], testsets['pos']),
        recall(refsets['neg'], testsets['neg'])))
    print('===============================================\n')
    print('testData: {}'.format(len(testSentences)))
    print('=================== Results ===================')
    print('Accuracy TestData {:f}'.format(accuracy_data_scores[-1]))
    print('F1         [{:f}     {:f}]'.format(
        f_measure(refdatasets['pos'], testdatasets['pos']),
        f_measure(refdatasets['neg'], testdatasets['neg'])))
    print('Precision  [{:f}     {:f}]'.format(
Example 7
File: bay.py Project: rolandinsh/om
test_tweets = BuildFeatureVector(all_tweet_array[training_size:])

print(len(test_tweets))

training_set = nltk.classify.apply_features(extract_features, train_tweets)
test_set = nltk.classify.apply_features(extract_features, test_tweets)

NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

NBClassifier.show_most_informative_features(20)

TestSet(all_tweet_array[training_size:])

print('')
print('TRAINING accuracy:', nltk.classify.accuracy(NBClassifier, training_set))
print('TEST accuracy:', nltk.classify.accuracy(NBClassifier, test_set))
print('')
print('NEU precision:', precision(refSet['NEU'], testSet['NEU']))
print('NEU recall:', recall(refSet['NEU'], testSet['NEU']))
print('NEU F-measure:', f_measure(refSet['NEU'], testSet['NEU']))
print('')
print('POS precision:', precision(refSet['POZ'], testSet['POZ']))
print('POS recall:', recall(refSet['POZ'], testSet['POZ']))
print('POS F-measure:', f_measure(refSet['POZ'], testSet['POZ']))
print('')
print('NEG precision:', precision(refSet['NEG'], testSet['NEG']))
print('NEG recall:', recall(refSet['NEG'], testSet['NEG']))
print('NEG F-measure:', f_measure(refSet['NEG'], testSet['NEG']))
print('')
print(ConfusionMatrix(refSetF, testSetF))
Example 8
def do_test():
    # global classifier
    print('Start training')
    k_splits = 10

    response = {
        'type': 1,
        'status': 'start',
        'step': 0,
        'max_step': k_splits,
        'trainset': 0,
        'testset': 0
    }
    socketio.emit('test_result', response, namespace='/test')
    socketio.sleep(.1)
    data = sentiment.prepare_data()

    from sklearn.model_selection import KFold
    import numpy as np
    import collections

    k_fold = KFold(n_splits=k_splits, random_state=1992, shuffle=True)
    featuresets = np.array(data)
    accuracy_scores = []
    index = 0
    for train_set, test_set in k_fold.split(featuresets):
        index += 1
        sentiment.word_features = sentiment.get_word_features(
            sentiment.get_words_in_reviews(featuresets[train_set].tolist()))
        train_features = nltk.classify.apply_features(
            sentiment.extract_features, featuresets[train_set].tolist())
        test_features = nltk.classify.apply_features(
            sentiment.extract_features, featuresets[test_set].tolist())
        classifier = nltk.NaiveBayesClassifier.train(train_features)
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)

        for i, (feats, label) in enumerate(test_features):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        accuracy_scores.append(
            nltk.classify.util.accuracy(classifier, test_features))

        f1_pos = nltk.f_measure(refsets['pos'], testsets['pos'])
        f1_neg = nltk.f_measure(refsets['neg'], testsets['neg'])
        pre_pos = nltk.precision(refsets['pos'], testsets['pos'])
        pre_neg = nltk.precision(refsets['neg'], testsets['neg'])
        re_pos = nltk.recall(refsets['pos'], testsets['pos'])
        re_neg = nltk.recall(refsets['neg'], testsets['neg'])

        response = {
            'type': 1,
            'status': 'progress',
            'step': index,
            'max_step': k_splits,
            'trainset': len(train_set),
            'testset': len(test_set),
            'accuracy': accuracy_scores[-1],
            'f1_pos': f1_pos,
            'f1_neg': f1_neg,
            'pre_pos': pre_pos,
            'pre_neg': pre_neg,
            're_pos': re_pos,
            're_neg': re_neg
        }
        socketio.emit('test_result', response, namespace='/test')
        socketio.sleep(.1)

    print("Success training")
    response = {
        'type': 1,
        'status': 'success',
        'step': index,
        'max_step': k_splits,
        'trainset': len(train_set),
        'testset': len(test_set)
    }
    socketio.emit('test_result', response, namespace='/test')
    socketio.sleep(.1)
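Note that nltk.precision, nltk.recall and nltk.f_measure return None rather than 0.0 when a fold happens to lack a class, so the per-fold response fields above may be null. If a single summary is wanted after all folds, the accuracy_scores list collected in the loop can be reduced with the standard library, e.g. at the end of do_test:

import statistics

print('mean accuracy: {:.4f}'.format(statistics.mean(accuracy_scores)))
print('std deviation: {:.4f}'.format(statistics.stdev(accuracy_scores)))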
Example 9
    for w in token_features:
        features[w] = (w in words)
    return features


featuresets = [(extract_features(rev), category)
               for (rev, category) in document]

training_set = featuresets[:1500]
# Hold out the remainder so the test set does not overlap the training data.
testing_set = featuresets[1500:]

classifier = nltk.NaiveBayesClassifier.train(training_set)

print("Classifier accuracy percent:",
      (nltk.classify.accuracy(classifier, testing_set)) * 100)
# print(classifier.show_most_informative_features())

refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (feats, label) in enumerate(testing_set):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)

print("Pos Precision:", (nltk.precision(refsets['pos'], testsets['pos'])))
print("Pos Recall:", (nltk.recall(refsets['pos'], testsets['pos'])))
print("Pos F-Score:", (nltk.f_measure(refsets['pos'], testsets['pos'])))
print("Neg Precision:", (nltk.precision(refsets['neg'], testsets['neg'])))
print("Neg Recall:", (nltk.recall(refsets['neg'], testsets['neg'])))
print("Neg F-Score:", (nltk.f_measure(refsets['neg'], testsets['neg'])))
Example 10
def evaluate_classifier(featx):

    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifierName = 'SVM'
    # SklearnClassifier.train() returns the classifier itself, so it chains.
    classifier = SklearnClassifier(LinearSVC(), sparse=False).train(trainfeats)

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    accuracy = nltk.classify.util.accuracy(classifier, testfeats)
    pos_precision = nltk.precision(refsets['pos'], testsets['pos'])
    pos_recall = nltk.recall(refsets['pos'], testsets['pos'])
    pos_fmeasure = nltk.f_measure(refsets['pos'], testsets['pos'])
    neg_precision = nltk.precision(refsets['neg'], testsets['neg'])
    neg_recall = nltk.recall(refsets['neg'], testsets['neg'])
    neg_fmeasure = nltk.f_measure(refsets['neg'], testsets['neg'])

    print('')
    print('---------------------------------------')
    print('SINGLE FOLD RESULT ' + '(' + classifierName + ')')
    print('---------------------------------------')
    print('accuracy:', accuracy)
    print('precision', (pos_precision + neg_precision) / 2)
    print('recall', (pos_recall + neg_recall) / 2)
    print('f-measure', (pos_fmeasure + neg_fmeasure) / 2)

    #classifier.show_most_informative_features()

    print('')

    ## CROSS VALIDATION

    trainfeats = negfeats + posfeats

    # Shuffle the training set so that no cross-validation fold ends up
    # with only positive or only negative reviews.
    random.shuffle(trainfeats)
    n = 5  # 5-fold cross-validation

    subset_size = int(len(trainfeats) / n)
    accuracy = []
    pos_precision = []
    pos_recall = []
    neg_precision = []
    neg_recall = []
    pos_fmeasure = []
    neg_fmeasure = []
    cv_count = 1
    for i in range(n):
        testing_this_round = trainfeats[i * subset_size:][:subset_size]
        training_this_round = trainfeats[:i * subset_size] + trainfeats[
            (i + 1) * subset_size:]

        classifierName = 'SVM'
        classifier = SklearnClassifier(LinearSVC(), sparse=False)
        classifier.train(training_this_round)

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        # j avoids shadowing the fold index i.
        for j, (feats, label) in enumerate(testing_this_round):
            refsets[label].add(j)
            observed = classifier.classify(feats)
            testsets[observed].add(j)

        cv_accuracy = nltk.classify.util.accuracy(classifier,
                                                  testing_this_round)
        cv_pos_precision = nltk.precision(refsets['pos'], testsets['pos'])
        cv_pos_recall = nltk.recall(refsets['pos'], testsets['pos'])
        cv_pos_fmeasure = nltk.f_measure(refsets['pos'], testsets['pos'])
        cv_neg_precision = nltk.precision(refsets['neg'], testsets['neg'])
        cv_neg_recall = nltk.recall(refsets['neg'], testsets['neg'])
        cv_neg_fmeasure = nltk.f_measure(refsets['neg'], testsets['neg'])

        accuracy.append(cv_accuracy)
        pos_precision.append(cv_pos_precision)
        pos_recall.append(cv_pos_recall)
        neg_precision.append(cv_neg_precision)
        neg_recall.append(cv_neg_recall)
        pos_fmeasure.append(cv_pos_fmeasure)
        neg_fmeasure.append(cv_neg_fmeasure)

        cv_count += 1

    print('---------------------------------------')
    print('N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')')
    print('---------------------------------------')
    print('accuracy:', sum(accuracy) / n)
    print('precision', (sum(pos_precision) / n + sum(neg_precision) / n) / 2)
    print('recall', (sum(pos_recall) / n + sum(neg_recall) / n) / 2)
    print('f-measure', (sum(pos_fmeasure) / n + sum(neg_fmeasure) / n) / 2)
    print('')
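For comparison, the whole 5-fold experiment can be written directly against scikit-learn, which handles fold slicing and stratification itself. A sketch under the assumption that the raw reviews are available as two lists of strings (neg_texts and pos_texts are hypothetical names; this example instead starts from NLTK feature dicts):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

texts = neg_texts + pos_texts
labels = ['neg'] * len(neg_texts) + ['pos'] * len(pos_texts)

clf = make_pipeline(CountVectorizer(binary=True), LinearSVC())
scores = cross_val_score(clf, texts, labels, cv=5)   # stratified folds
print('5-fold accuracy: {:.4f} +/- {:.4f}'.format(scores.mean(), scores.std()))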
Example 11
  refsets['neg'] = set()
  refsets['neu'] = set()
  testsets['pos'] = set()
  testsets['neg'] = set()
  testsets['neu'] = set()
  
  print "Testing..."
  for i, (feats, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = NBClassifier.classify(feats)
    testsets[observed].add(i)

  print "Saving results..."
  results.write('pos precision:' + str(precision(refsets["pos"], testsets["pos"])) + "\n")
  results.write('pos recall:' + str(recall(refsets["pos"], testsets["pos"])) + "\n")
  results.write('pos F-measure:' + str(f_measure(refsets["pos"], testsets["pos"])) + "\n")
  results.write('neg precision:' + str(precision(refsets["neg"], testsets["neg"])) + "\n")
  results.write('neg recall:' + str(recall(refsets["neg"], testsets["neg"])) + "\n")
  results.write('neg F-measure:' + str(f_measure(refsets["neg"], testsets["neg"])) + "\n")
  results.write('neu precision:' + str(precision(refsets["neutral"], testsets["neutral"])) + "\n")
  results.write('neu recall:' + str(recall(refsets["neutral"], testsets["neutral"])) + "\n")
  results.write('neu F-measure:' + str(f_measure(refsets["neutral"], testsets["neutral"])) + "\n")

  results.write("\nMost informative features:\n")
  mif = NBClassifier.most_informative_features()
  for f in mif:
    results.write(str(f) + "\n")

results.close()
end = time.time()
print "Duration: ", end - start, " seconds"
Example 12
def author_beng_nbc():
    # Python 3: reduce and concat need explicit imports, and zip() returns
    # an iterator, so each pair list is materialised with list().
    from functools import reduce
    from operator import concat

    # 1st set
    bankc = open("/python27/Bankim500_1.txt", "r").read()
    bankw = bankc.split()
    bankz = reduce(concat, [['bankim', x] for x in bankw[1:]], bankw[0:1])
    it = iter(bankz)
    bankt = list(zip(it, it))
    # 2nd set
    bibhuc = open("/python27/Bibhuti500_1.txt", "r").read()
    bibhuw = bibhuc.split()
    bibhuz = reduce(concat, [['bibhuti', x] for x in bibhuw[1:]], bibhuw[0:1])
    it1 = iter(bibhuz)
    bibhut = list(zip(it1, it1))
    # 3rd set
    rabindrac = open("/python27/Rabindra500_1.txt", "r").read()
    rabindraw = rabindrac.split()
    rabindraz = reduce(concat, [['rabindra', x] for x in rabindraw[1:]],
                       rabindraw[0:1])
    it2 = iter(rabindraz)
    rabindrat = list(zip(it2, it2))
    # 4th set
    saratc = open("/python27/Sarat500_1.txt", "r").read()
    saratw = saratc.split()
    saratz = reduce(concat, [['sarat', x] for x in saratw[1:]], saratw[0:1])
    it3 = iter(saratz)
    saratt = list(zip(it3, it3))

    training_data = bankt + bibhut + rabindrat + saratt
    vocabulary = set(
        chain(*[word_tokenize(i[0].lower()) for i in training_data]))
    feature_set = [
        ({i: (i in word_tokenize(sentence.lower()))
          for i in vocabulary}, tag) for sentence, tag in training_data
    ]
    #print "###",feature_set
    from nltk.classify import NaiveBayesClassifier as nbc
    train_set, test_set = feature_set[:300], feature_set[300:]
    print(len(train_set))
    print(len(test_set))
    classifier = nbc.train(train_set)
    test_sentence = "আলীপুরের উকিল বিশেষ কিছু হয় বলিয়া মনে হয় না বালিগঞ্জের ওদিকে কোথায় একটা টিউশনি আছে"
    featurized_test_sentence = {
        i: (i in word_tokenize(test_sentence.lower()))
        for i in vocabulary
    }
    print "test_sent:", test_sentence
    print "tag:", classifier.classify(featurized_test_sentence)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(test_set):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    print('bankim precision:', nltk.precision(refsets['bankim'], testsets['bankim']))
    print('bankim recall:', nltk.recall(refsets['bankim'], testsets['bankim']))
    print('bankim F-measure:', nltk.f_measure(refsets['bankim'], testsets['bankim']))
    print('bibhuti precision:', nltk.precision(refsets['bibhuti'], testsets['bibhuti']))
    print('bibhuti recall:', nltk.recall(refsets['bibhuti'], testsets['bibhuti']))
    print('bibhuti F-measure:', nltk.f_measure(refsets['bibhuti'], testsets['bibhuti']))
    print('rabindra precision:', nltk.precision(refsets['rabindra'], testsets['rabindra']))
    print('rabindra recall:', nltk.recall(refsets['rabindra'], testsets['rabindra']))
    print('rabindra F-measure:', nltk.f_measure(refsets['rabindra'], testsets['rabindra']))
    print('sarat precision:', nltk.precision(refsets['sarat'], testsets['sarat']))
    print('sarat recall:', nltk.recall(refsets['sarat'], testsets['sarat']))
    print('sarat F-measure:', nltk.f_measure(refsets['sarat'], testsets['sarat']))
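The reduce/concat/zip pipeline above only pairs every token with its author tag, and because the flattened list has odd length, zip() silently drops each file's final token. The same training data (minus that quirk) can be built with plain list comprehensions over the token lists already defined in the function:

training_data = (
    [(w, 'bankim') for w in bankw]
    + [(w, 'bibhuti') for w in bibhuw]
    + [(w, 'rabindra') for w in rabindraw]
    + [(w, 'sarat') for w in saratw]
)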
Example 13
    tweets = neg_tweets[:cutoff] + pos_tweets[:cutoff]
    test_tweets = neg_tweets[cutoff:] + pos_tweets[cutoff:]
    all_words = []
    words_frequency = []
    print(tweets)
    #Get all the words
    for (words, sentiment) in tweets:
        all_words.extend(words)

    #extract the features
    wordlist = nltk.FreqDist(all_words)
    word_features = wordlist.keys()
    training_set = nltk.classify.apply_features(extract_features, tweets)
    classifier = NaiveBayesClassifier.train(training_set)
    refsets = {'pos': set(), 'neg': set()}
    testsets = {'pos': set(), 'neg': set()}

    classifier.show_most_informative_features()

    for i, (feats, label) in enumerate(test_tweets):
        refsets[label].add(i)
        testsets[classifier.classify(extract_features(feats))].add(i)

    print('pos precision:', nltk.precision(refsets['pos'], testsets['pos']))
    print('pos recall:', nltk.recall(refsets['pos'], testsets['pos']))
    print('pos F-measure:', nltk.f_measure(refsets['pos'], testsets['pos']))
    print('neg precision:', nltk.precision(refsets['neg'], testsets['neg']))
    print('neg recall:', nltk.recall(refsets['neg'], testsets['neg']))
    print('neg F-measure:', nltk.f_measure(refsets['neg'], testsets['neg']))
Example 14
def evaluate_mult_classifiers(feature_x, n_folds=5):

    # 5-fold default for cross-validation
    # train_feats = 75% of pos_data + 75% of neg_data
    # test_feats  = 25% of pos_data + 25% of neg_data

    neg_feats = [(feature_x(i), 'neg') for i in word_split(neg_data)]
    pos_feats = [(feature_x(i), 'pos') for i in word_split(pos_data)]

    neg_cutoff = int(len(neg_feats) * 0.75)
    pos_cutoff = int(len(pos_feats) * 0.75)

    # This 75/25 split is only illustrative; the cross-validation below
    # rebuilds train_feats from the full shuffled data.
    train_feats = neg_feats[:neg_cutoff] + pos_feats[:pos_cutoff]
    test_feats = neg_feats[neg_cutoff:] + pos_feats[pos_cutoff:]

    classifier_list = ['NB', 'SVM']

    ## CROSS VALIDATION
    train_feats = neg_feats + pos_feats

    # Shuffle training set
    random.shuffle(train_feats)

    for cl in classifier_list:

        subset_size = int(len(train_feats) / n_folds)
        accuracy = []
        pos_precision = []
        pos_recall = []
        neg_precision = []
        neg_recall = []
        pos_fmeasure = []
        neg_fmeasure = []
        cv_count = 1

        print('--------------------------')
        print('Beginning Cross-validation')
        print('--------------------------')

        for i in range(n_folds):
            testing_this_round = train_feats[i * subset_size:][:subset_size]
            training_this_round = train_feats[:i * subset_size] + train_feats[
                (i + 1) * subset_size:]

            if cl == 'NB':
                classifierName = 'Naive Bayes'
                # Using NLTK NaiveBayesClassifier
                classifier = NaiveBayesClassifier.train(training_this_round)
            else:
                classifierName = 'SVM'
                classifier = SklearnClassifier(LinearSVC(), sparse=False)
                classifier.train(training_this_round)

            ref_sets = collections.defaultdict(set)
            test_sets = collections.defaultdict(set)

            # j avoids shadowing the fold index i.
            for j, (feats, label) in enumerate(testing_this_round):
                ref_sets[label].add(j)
                observed = classifier.classify(feats)
                test_sets[observed].add(j)

            cv_accuracy = nltk.classify.util.accuracy(classifier,
                                                      testing_this_round)
            cv_pos_precision = nltk.precision(ref_sets['pos'],
                                              test_sets['pos'])
            cv_pos_recall = nltk.recall(ref_sets['pos'], test_sets['pos'])
            cv_pos_fmeasure = nltk.f_measure(ref_sets['pos'], test_sets['pos'])
            cv_neg_precision = nltk.precision(ref_sets['neg'],
                                              test_sets['neg'])
            cv_neg_recall = nltk.recall(ref_sets['neg'], test_sets['neg'])
            cv_neg_fmeasure = nltk.f_measure(ref_sets['neg'], test_sets['neg'])

            print('Fold: {} Acc       : {:.4F}'.format(cv_count, cv_accuracy))
            print('Fold: {} pos_prec  : {:.4F} neg_prec  : {:.4F}'.format(
                cv_count, cv_pos_precision, cv_neg_precision))
            print('Fold: {} pos_recall: {:.4F} neg_recall: {:.4F}'.format(
                cv_count, cv_pos_recall, cv_neg_recall))
            print('Fold: {} pos_fmeas : {:.4F} neg_fmeas : {:.4F}'.format(
                cv_count, cv_pos_fmeasure, cv_neg_fmeasure))
            print('--')

            accuracy.append(cv_accuracy)
            pos_precision.append(cv_pos_precision)
            pos_recall.append(cv_pos_recall)
            neg_precision.append(cv_neg_precision)
            neg_recall.append(cv_neg_recall)
            pos_fmeasure.append(cv_pos_fmeasure)
            neg_fmeasure.append(cv_neg_fmeasure)

            cv_count += 1

        print('----------------------------------------------------------')
        print('{}-Fold Cross Validation results for {} Classifier'.format(
            n_folds, classifierName))
        print('----------------------------------------------------------')
        print('accuracy : {:.4F}'.format(sum(accuracy) / n_folds))
        print('precision: {:.4F}'.format(
            (sum(pos_precision) / n_folds + sum(neg_precision) / n_folds) / 2))
        print('recall   : {:.4F}'.format(
            (sum(pos_recall) / n_folds + sum(neg_recall) / n_folds) / 2))
        print('f-measure: {:.4F}'.format(
            (sum(pos_fmeasure) / n_folds + sum(neg_fmeasure) / n_folds) / 2))
        print('\n')
Example 15
NBayesClassifier = nltk.NaiveBayesClassifier.train(trainFeat)

#NBResultLabels=[NBayesClassifier.classify(extract_features(tweet[0])) for tweet in testData]

print("Accuracy : " + str(nltk.classify.util.accuracy(NBayesClassifier, testFeat)*100) + " %")

refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

for i, (feats, label) in enumerate(testFeat):
    refsets[label].add(i)
    observed = NBayesClassifier.classify(feats)
    testsets[observed].add(i)

print('sarcasm precision : ' + str(precision(refsets['sarcasm'], testsets['sarcasm']) * 100) + " %")
print('sarcasm recall : ' + str(recall(refsets['sarcasm'], testsets['sarcasm']) * 100) + " %")
print('sarcasm F-measure : ' + str(f_measure(refsets['sarcasm'], testsets['sarcasm']) * 100) + " %")

print('non-sarcasm precision : ' + str(precision(refsets['non-sarcasm'], testsets['non-sarcasm']) * 100) + " %")
print('non-sarcasm recall : ' + str(recall(refsets['non-sarcasm'], testsets['non-sarcasm']) * 100) + " %")
print('non-sarcasm F-measure : ' + str(f_measure(refsets['non-sarcasm'], testsets['non-sarcasm']) * 100) + " %")

#NBayesClassifier.show_most_informative_features(100)
# print(NBResultLabels)

# if NBResultLabels.count('sarcasm') > NBResultLabels.count('non-sarcasm'):
#     print("NB Result Sarcastic Sentiment\t\t: " + str(100 * NBResultLabels.count('sarcasm') / len(NBResultLabels)) + "%")
# else:
#     print("NB Result Non-Sarcastic Sentiment\t: " + str(100 * NBResultLabels.count('non-sarcasm') / len(NBResultLabels)) + "%")