Code Example #1
def evaluation2(test_data, classifier):

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(test_data):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    pos_precision = nltk.precision(refsets['Positive'], testsets['Positive'])
    pos_recall = nltk.recall(refsets['Positive'], testsets['Positive'])

    neg_precision = nltk.precision(refsets['Negative'], testsets['Negative'])
    neg_recall = nltk.recall(refsets['Negative'], testsets['Negative'])
    precision = (pos_precision + neg_precision) / 2
    recall = (pos_recall + neg_recall) / 2
    try:  # the neutral class may be absent, in which case the metrics come back as None
        neu_precision = nltk.precision(refsets['Neutral'], testsets['Neutral'])
        neu_recall = nltk.recall(refsets['Neutral'], testsets['Neutral'])
        precision = (pos_precision + neg_precision + neu_precision) / 3
        recall = (pos_recall + neg_recall + neu_recall) / 3
    except TypeError:  # fall back to the two-class averages computed above
        pass
    return precision, recall
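For context, a minimal self-contained driver for the function above might look like the following; the toy featuresets and labels are invented for illustration and are not from the original project:

import collections
import nltk

# Toy data: (featureset, label) pairs with the three labels evaluation2 expects.
train_data = [({'good': True}, 'Positive'), ({'bad': True}, 'Negative'),
              ({'meh': True}, 'Neutral')] * 10
test_data = [({'good': True}, 'Positive'), ({'bad': True}, 'Negative')]

classifier = nltk.NaiveBayesClassifier.train(train_data)
avg_precision, avg_recall = evaluation2(test_data, classifier)
print(avg_precision, avg_recall)

Because this toy test set contains no 'Neutral' items, the neutral metrics come back as None and the function falls back to the two-class averages.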
Code Example #2
File: model_evaluator.py Project: vinidixit/codes
def evaluate_classifier(featx):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
 
    random.shuffle(negids)
    random.shuffle(posids)
    
    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
 
    negcutoff = len(negfeats) * 3 // 4  # floor division: slice indices must be integers
    poscutoff = len(posfeats) * 3 // 4
 
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
 
    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
 
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
 
    print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
    print('pos precision:', precision(refsets['pos'], testsets['pos']))
    print('pos recall:', recall(refsets['pos'], testsets['pos']))
    print('neg precision:', precision(refsets['neg'], testsets['neg']))
    print('neg recall:', recall(refsets['neg'], testsets['neg']))
    classifier.show_most_informative_features()
Code Example #3
def evaluate_naivebayes(classifier, test_reviews):
    # For computing metrics
    ref_set = collections.defaultdict(set)
    test_set = collections.defaultdict(set)
    ref_set_arr = []
    test_set_arr = []

    # Create gold standard and predicted labels
    for i, (feat, label) in enumerate(test_reviews):
        # Predict
        observed = classifier.classify(feat)

        ref_set[label].add(i)
        test_set[observed].add(i)

        label = 0 if label == "neg" else 1
        observed = 0 if observed == "neg" else 1
        ref_set_arr.append(label)
        test_set_arr.append(observed)

    print('pos precision:', precision(ref_set['pos'], test_set['pos']))
    print('pos recall:', recall(ref_set['pos'], test_set['pos']))
    print('neg precision:', precision(ref_set['neg'], test_set['neg']))
    print('neg recall:', recall(ref_set['neg'], test_set['neg']))
    print('misclassification rate', zero_one_loss(ref_set_arr, test_set_arr))
    print('most informative features:')
    classifier.show_most_informative_features(10)  # prints itself; its return value is None
Code Example #4
def search():
    negTweets = []
    posTweets = []

    with open('positiveTweets.csv', 'r') as csv_file:
        csv_reader = csv.reader(csv_file)

        for l in csv_reader:
            posTweets.append([l[0], l[1]])
        #tweets_ = tweets.tweets

    with open('negativeTweets.csv', 'r') as csv_file:
        csv_reader = csv.reader(csv_file)

        for l in csv_reader:
            negTweets.append([l[0], l[1]])
        #tweets_ = tweets.tweets

    # load absolute path of word lists
    positives = os.path.join(sys.path[0], "positive-words.txt")
    negatives = os.path.join(sys.path[0], "negative-words.txt")

    # instantiate analyzer
    analyzer = Analyzer(positives, negatives)
    positive, negative, neutral = 0.0, 0.0, 0.0

    cutoff = 0
    if len(negTweets) > len(posTweets):
        cutoff = len(posTweets) * 4 // 5  # floor division: slice indices must be integers
    else:
        cutoff = len(negTweets) * 4 // 5

    tweets_ = negTweets[:cutoff] + posTweets[:cutoff]
    testTweets_ = negTweets[cutoff:] + posTweets[cutoff:]
    refsets = {'pos': set([]), 'neg': set([])}
    testsets = {'pos': set([]), 'neg': set([])}

    for tweet in tweets_:
        score = analyzer.analyze(tweet[0])
        # gold labels belong in refsets, predicted labels in testsets
        if tweet[1] == 'pos':
            refsets['pos'].add(tweet[0])
        else:
            refsets['neg'].add(tweet[0])
        if score > 0.0:
            positive += 1.0
            testsets['pos'].add(tweet[0])
        elif score < 0.0:
            negative += 1.0
            testsets['neg'].add(tweet[0])
        else:
            neutral += 1.0

    print('pos precision:', nltk.precision(refsets['pos'], testsets['pos']))
    print('pos recall:', nltk.recall(refsets['pos'], testsets['pos']))
    print('pos F-measure:', nltk.f_measure(refsets['pos'], testsets['pos']))
    print('neg precision:', nltk.precision(refsets['neg'], testsets['neg']))
    print('neg recall:', nltk.recall(refsets['neg'], testsets['neg']))
    print('neg F-measure:', nltk.f_measure(refsets['neg'], testsets['neg']))

    print(str(positive) + ',' + str(negative))
Code Example #5
def evaluate_classifier(classifier, validationFeatures):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(validationFeatures):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    accuracy = nltk.classify.util.accuracy(classifier, validationFeatures)
    pos_precision = precision(refsets['POSITIVE_TIME'], testsets['POSITIVE_TIME'])
    pos_recall = recall(refsets['POSITIVE_TIME'], testsets['POSITIVE_TIME'])
    pos_f_measure = f_measure(refsets['POSITIVE_TIME'], testsets['POSITIVE_TIME'])
    neg_precision = precision(refsets['NEGATIVE_TIME'], testsets['NEGATIVE_TIME'])
    neg_recall = recall(refsets['NEGATIVE_TIME'], testsets['NEGATIVE_TIME'])
    neg_f_measure = f_measure(refsets['NEGATIVE_TIME'], testsets['NEGATIVE_TIME'])

    print('accuracy:', accuracy)
    print('pos precision:', pos_precision)
    print('pos recall:', pos_recall)
    print('pos f-measure:', pos_f_measure)
    print('neg precision:', neg_precision)
    print('neg recall:', neg_recall)
    print('neg f-measure:', neg_f_measure)

    return {'accuracy': accuracy,
            'pos precision': pos_precision, 'pos recall': pos_recall,
            'pos f-measure': pos_f_measure,
            'neg precision': neg_precision, 'neg recall': neg_recall,
            'neg f-measure': neg_f_measure}
Code Example #6
def evaluate_classifier(featx):
    negids = nltk.corpus.movie_reviews.fileids('neg')
    posids = nltk.corpus.movie_reviews.fileids('pos')

    negfeats = [(featx(nltk.corpus.movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(nltk.corpus.movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

    classifier = nltk.classify.NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    print('pos precision', nltk.precision(refsets['pos'], testsets['pos']))
    print('pos recall', nltk.recall(refsets['pos'], testsets['pos']))
    print('neg precision', nltk.precision(refsets['neg'], testsets['neg']))
    print('neg recall', nltk.recall(refsets['neg'], testsets['neg']))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
    classifier.show_most_informative_features()
Code Example #7
def evaluate_features(feature_select):
    posFeatures = []
    negFeatures = []
    #http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    #breaks up the sentences into lists of individual words (as selected by the input mechanism) and appends 'pos' or 'neg' after each list
    with open(RT_POLARITY_POS_FILE, "rb") as f:
        # decode the raw bytes so tokens aren't polluted with b'...' quoting
        posSentences = [line.decode("utf-8", errors="ignore") for line in f]
    random.shuffle(posSentences)
    
    with open(RT_POLARITY_NEG_FILE, "rb") as f:
        negSentences = [line.decode("utf-8", errors="ignore") for line in f]
    random.shuffle(negSentences)
    
    for i in posSentences:
        posWords = re.findall(r"[\w']+|[.,!?;@#]", i.rstrip())
        posWords = [feature_select(posWords), 'pos']  # pos = contains location
        posFeatures.append(posWords)
    for i in negSentences:
        negWords = re.findall(r"[\w']+|[.,!?;@#]", i.rstrip())
        negWords = [feature_select(negWords), 'neg']  # neg = doesn't contain location
        negFeatures.append(negWords)

    
    #selects 3/4 of the features to be used for training and 1/4 to be used for testing
    posCutoff = int(math.floor(len(posFeatures)*3/4))
    negCutoff = int(math.floor(len(negFeatures)*3/4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]

    #trains a Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)    

    #initializes referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)    

    #puts correctly labeled sentences in referenceSets and the predictively labeled version in testsets
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)    

    #prints metrics to show how well the feature selection did
    print('train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:', precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', recall(referenceSets['pos'], testSets['pos']))
    print('neg precision:', precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', recall(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)
Code Example #8
def train(feature_name, feature_detector, use_best_words, limit):
    # read the text (column 21), split it by sentiment (column 23), and return the best words, capped at `limit`
    neudata, posdata, negdata, alldata, best_words = split_sets(
        selected_input, 21, 23, limit)

    # only some of the tokinator helper functions use the best_words input,
    # which is controlled by the use_best_words parameter
    negfeats = split_feats('Negative', negdata, feature_detector,
                           use_best_words, best_words)
    posfeats = split_feats('Positive', posdata, feature_detector,
                           use_best_words, best_words)
    neufeats = split_feats('Neutral', neudata, feature_detector,
                           use_best_words, best_words)

    trainfeats, testfeats = create_train_test_sets(negfeats, posfeats,
                                                   neufeats)

    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    accuracy = nltk.classify.util.accuracy(classifier, testfeats)
    pos_precision = precision(refsets['Positive'], testsets['Positive'])
    pos_recall = recall(refsets['Positive'], testsets['Positive'])
    neg_precision = precision(refsets['Negative'], testsets['Negative'])
    neg_recall = recall(refsets['Negative'], testsets['Negative'])
    neu_precision = precision(refsets['Neutral'], testsets['Neutral'])
    neu_recall = recall(refsets['Neutral'], testsets['Neutral'])

    classifier_with_accuracy = {
        'classifier': classifier,
        'feature_name': feature_name,
        'feature_detector': feature_detector,
        'best_words': use_best_words,
        'accuracy': accuracy,
        'limit': limit,
        'best_words_list': best_words,
        'pos_precision': pos_precision,
        'pos_recall': pos_recall,
        'neg_precision': neg_precision,
        'neg_recall': neg_recall,
        'neu_precision': neu_precision,
        'neu_recall': neu_recall
    }

    return classifier_with_accuracy
Code Example #9
File: train.py Project: stpwin/fb-group-sentiment
def KFoldAccuracy(all_reviews):
    global word_features
    from sklearn.model_selection import KFold
    import numpy as np
    import collections

    k_fold = KFold(n_splits=10, random_state=1992, shuffle=True)
    featuresets = np.array(all_reviews)
    accuracy_scores = []

    for train_set, test_set in k_fold.split(featuresets):
        word_features = get_word_features(
            get_words_in_reviews(featuresets[train_set].tolist()))
        train_features = nltk.classify.apply_features(
            extract_features, featuresets[train_set].tolist())
        test_features = nltk.classify.apply_features(
            extract_features, featuresets[test_set].tolist())
        classifier = nltk.NaiveBayesClassifier.train(train_features)
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)

        for i, (feats, label) in enumerate(test_features):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        accuracy_scores.append(
            nltk.classify.util.accuracy(classifier, test_features))

        # f1_pos = nltk.f_measure(refsets['pos'], testsets['pos'])
        f1_neg = nltk.f_measure(refsets['neg'], testsets['neg'])
        f1_swear = nltk.f_measure(refsets['swear'], testsets['swear'])

        # pre_pos = nltk.precision(refsets['pos'], testsets['pos'])
        pre_neg = nltk.precision(refsets['neg'], testsets['neg'])
        pre_swear = nltk.precision(refsets['swear'], testsets['swear'])

        # re_pos = nltk.recall(refsets['pos'], testsets['pos'])
        re_neg = nltk.recall(refsets['neg'], testsets['neg'])
        re_swear = nltk.recall(refsets['swear'], testsets['swear'])

        print(f'train: {len(train_set)} test: {len(test_set)}')
        print('=================== Results ===================')
        print(f'Accuracy {accuracy_scores[-1]:f}')
        print('            Negative     Swear')
        print(f'F1         {f1_neg:f}     {f1_swear:f}')
        print(f'Precision  {pre_neg:f}     {pre_swear:f}')
        print(f'Recall     {re_neg:f}     {re_swear:f}')
        print('===============================================\n')
Code Example #10
def check_mutato_accuracy(input_path):
    with open(input_path, 'r', encoding=encoding) as csv_train:
        csv_reader = csv.reader(csv_train, delimiter=delim)

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        results = []
        gold = []
        label = ''
        for i, row in enumerate(csv_reader):
            if row[23] == '2':
                label = 'Negative'
            elif row[23] == '1':
                label = 'Positive'
            elif row[23] == '0':
                label = 'Neutral'
            gold.append(label)
            refsets[label].add(i)
            testsets[row[26]].add(i)
            results.append(row[26])

        # Since we don't have a classifier object, we compute the accuracy manually.
        # This works the same way as nltk.classify.util.accuracy, except that we
        # compare the results against the gold standard (refsets) directly.
        correct = [l == r for (l, r) in zip(gold, results)]
        accuracy_manual = sum(correct) / len(correct)
        pos_precision = precision(refsets['Positive'], testsets['Positive'])
        pos_recall = recall(refsets['Positive'], testsets['Positive'])
        neg_precision = precision(refsets['Negative'], testsets['Negative'])
        neg_recall = recall(refsets['Negative'], testsets['Negative'])
        neu_precision = precision(refsets['Neutral'], testsets['Neutral'])
        neu_recall = recall(refsets['Neutral'], testsets['Neutral'])

        classifier_with_accuracy = {
            'classifier': 'N/A',
            'feature_name': 'MUTATO',
            'feature_detector': 'MUTATO',
            'best_words': False,
            'accuracy': accuracy_manual,
            'limit': 0,
            'pos_precision': pos_precision,
            'pos_recall': pos_recall,
            'neg_precision': neg_precision,
            'neg_recall': neg_recall,
            'neu_precision': neu_precision,
            'neu_recall': neu_recall
        }
        return classifier_with_accuracy
Code Example #11
def evaluate_classifier(featx):
    reviews = product_reviews_1.reviews()
    reviewlines = []
    for review in reviews:
        for line in review.review_lines:
            reviewlines.append(line)


    sentfeats = [[(featx(line.sent), f[1][0]) for f in line.features]
                 for line in reviewlines if len(line.features) > 0]
    plusfeats = []
    minusfeats = []
    for sentfeat in sentfeats:
        for feat in sentfeat:
            if feat[1] == "+":
                plusfeats.append(feat)
            elif feat[1] == "-":
                minusfeats.append(feat)

    if len(minusfeats) > len(plusfeats):
        minusfeats = minusfeats[:len(plusfeats)]
    else:
        plusfeats = plusfeats[:len(minusfeats)]

    minuscutoff = int(len(minusfeats)*3/4)
    pluscutoff = int(len(plusfeats)*3/4)

    trainfeats = minusfeats[:minuscutoff] + plusfeats[:pluscutoff]
    testfeats = minusfeats[minuscutoff:] + plusfeats[pluscutoff:]
    print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    #return nltk.classify.util.accuracy(classifier, testfeats)
    print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
    print('pos precision:', precision(refsets['+'], testsets['+']))
    print('pos recall:', recall(refsets['+'], testsets['+']))
    print('neg precision:', precision(refsets['-'], testsets['-']))
    print('neg recall:', recall(refsets['-'], testsets['-']))
    classifier.show_most_informative_features()
    print(classifier.classify(featx(["I", "hate", "it", "."])))
    print(classifier.classify(featx(["I", "love", "it", "."])))
Code Example #12
def print_precision_recall(classifier, test_set):
    """ Computes and prints the precision and recall metrics, given a classifier
    and a test set
    """
    known_set = collections.defaultdict(set)
    computed_set = collections.defaultdict(set)

    for i, (features, label) in enumerate(test_set):
        known_set[label].add(i)
        predicted = classifier.classify(features)
        computed_set[predicted].add(i)

    print('pos precision:', precision(known_set['pos'], computed_set['pos']))
    print('pos recall:', recall(known_set['pos'], computed_set['pos']))
    print('neg precision:', precision(known_set['neg'], computed_set['neg']))
    print('neg recall:', recall(known_set['neg'], computed_set['neg']))
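As a cross-check, the same per-label numbers can be reproduced with scikit-learn by flattening the (featureset, label) pairs into parallel label lists. This helper is a sketch added here for illustration, assuming the 'pos'/'neg' labels above; it is not part of the original snippet:

from sklearn.metrics import classification_report

def print_sklearn_report(classifier, test_set):
    # flatten the (featureset, label) pairs into parallel gold/predicted lists
    y_true = [label for _, label in test_set]
    y_pred = [classifier.classify(features) for features, _ in test_set]
    print(classification_report(y_true, y_pred, labels=['pos', 'neg']))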
Code Example #13
    def evaluation(self, test_set, classifier):
        referenceSet = collections.defaultdict(set)
        testSet = collections.defaultdict(set)
        referenceSet_cm = []
        testSet_cm = []

        for index, (sentences, actualLabel) in enumerate(test_set):
            referenceSet[actualLabel].add(index)
            referenceSet_cm.append(actualLabel)
            predictedLabel = classifier.classify(sentences)
            testSet[predictedLabel].add(index)
            testSet_cm.append(predictedLabel)

        labels = sorted(referenceSet.keys())  # deterministic label order; dict views are not indexable in Python 3
        print(labels)

        print("-------------Claim metrics-----------")
        print('Accuracy of the classifier:  ',
              nltk.classify.util.accuracy(classifier, test_set))
        print('precision:           ',
              precision(referenceSet[labels[0]], testSet[labels[0]]))
        print('recall:              ',
              recall(referenceSet[labels[0]], testSet[labels[0]]))
        print('F-measure:           ',
              f_measure(referenceSet[labels[0]], testSet[labels[0]]))

        print("-------------Premise metrics-----------")
        print('Accuracy of the classifier:  ',
              nltk.classify.util.accuracy(classifier, test_set))
        print('precision:           ',
              precision(referenceSet[labels[1]], testSet[labels[1]]))
        print('recall:              ',
              recall(referenceSet[labels[1]], testSet[labels[1]]))
        print('F-measure:           ',
              f_measure(referenceSet[labels[1]], testSet[labels[1]]))
Code Example #14
def evaluate_features(feature_select):
    posFeatures = []
    negFeatures = []
    #http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    #breaks up the sentences into lists of individual words (as selected by the input mechanism) and appends 'pos' or 'neg' after each list
    with open('rt-polarity-pos.txt', 'r') as posSentences:
        for i in posSentences:
            posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords = [feature_select(posWords), 'pos']
            posFeatures.append(posWords)
    with open('rt-polarity-neg.txt', 'r') as negSentences:
        for i in negSentences:
            negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords = [feature_select(negWords), 'neg']
            negFeatures.append(negWords)

    #selects 3/4 of the features to be used for training and 1/4 to be used for testing
    posCutoff = int(math.floor(len(posFeatures) * 3 / 4))
    negCutoff = int(math.floor(len(negFeatures) * 3 / 4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]

    #trains a Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)

    #initializes referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    #puts correctly labeled sentences in referenceSets and the predictively labeled version in testsets
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    #prints metrics to show how well the feature selection did
    print('train on %d instances, test on %d instances' %
          (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:',
          nltk.precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', nltk.recall(referenceSets['pos'], testSets['pos']))
    print('neg precision:',
          nltk.precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', nltk.recall(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)
Code Example #15
    def classifier_Train(self, train, test, labels):
        print('Training the Classifier (%s Classifier)' % labels)

        train_set_final = []
        test_set_final = []

        train_text = []
        for item in train:
            train_text.append((item[0] + item[1], item[2]))  #review text,label
            train_set_final.append((item[0] + item[1], item[2]))

        for item in test:
            test_set_final.append((item[0] + item[1], item[2]))

        start_time = time.time()
        self.word_features = _get_words_from_dataset(train_set_final)

        train_features_labels = [(self.extractor(d), c)
                                 for d, c in train_set_final]

        cl = class_.train(train_features_labels, **class_params)
        end_time = time.time()

        P = {}  # per-label precision
        R = {}  # per-label recall
        F = {}  # per-label F-measure

        test_features_labels = [(self.extractor(d), c)
                                for d, c in test_set_final]

        for label in labels:
            refsets = collections.defaultdict(set)
            testsets = collections.defaultdict(set)

            for i, (feats, lbl) in enumerate(test_features_labels):
                refsets[lbl].add(i)
                observed = cl.classify(feats)
                testsets[observed].add(i)

            p = nltk.precision(refsets[label], testsets[label])
            if p is None:
                p = 0

            r = nltk.recall(refsets[label], testsets[label])
            if r is None:
                r = 0

            f = nltk.f_measure(refsets[label], testsets[label])
            if f is None:
                f = 0
            print "label: %s" % label
            print("P:%s, R:%s, F:%s" % (p, r, f))
            P[label] = p
            R[label] = r
            F[label] = f

        cl.show_most_informative_features(10)
        return P, R, F, (end_time - start_time)
Code Example #16
def evaluation(test_data, classifier):
    # accuracy=np.round(classify.accuracy(classifier, test_data),3)
    # print("Accuracy of Model is:", accuracy)
    # print(classifier.show_most_informative_features(10))

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(test_data):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    pos_precision = np.round(
        nltk.precision(refsets['Positive'], testsets['Positive']), 3)
    pos_recall = np.round(
        nltk.recall(refsets['Positive'], testsets['Positive']), 3)
    pos_fscore = np.round(
        nltk.f_measure(refsets['Positive'], testsets['Positive']), 3)
    neg_precision = np.round(
        nltk.precision(refsets['Negative'], testsets['Negative']), 3)
    neg_recall = np.round(
        nltk.recall(refsets['Negative'], testsets['Negative']), 3)
    neg_fscore = np.round(
        nltk.f_measure(refsets['Negative'], testsets['Negative']), 3)

    print('pos precision:', pos_precision)
    print('pos recall:', pos_recall)
    print('pos F-score:', pos_fscore)
    print('neg precision:', neg_precision)
    print('neg recall:', neg_recall)
    print('neg F-score:', neg_fscore)
    try:  # the neutral class may be absent, in which case the metrics come back as None
        neu_precision = np.round(
            nltk.precision(refsets['Neutral'], testsets['Neutral']), 3)
        neu_recall = np.round(
            nltk.recall(refsets['Neutral'], testsets['Neutral']), 3)
        neu_fscore = np.round(
            nltk.f_measure(refsets['Neutral'], testsets['Neutral']), 3)

        print('neu precision:', neu_precision)
        print('neu recall:', neu_recall)
        print('neu F-score:', neu_fscore)
    except TypeError:
        pass
Code Example #17
def evaluate_classifier(featx):
    #print(featx)
    neg_dict = movie_words('neg')
    pos_dict = movie_words('pos')

    negfeats = []
    posfeats = []
    for word in featx:
        try:
            neg_dict[word]
            negfeats.append(({word: True}, 'neg'))
        except KeyError:
            print(word + " Missing from negative")

        try:
            pos_dict[word]
            posfeats.append(({word: True}, 'pos'))  # was mislabeled 'neg' in the original
        except KeyError:
            print(word + " Missing from positive")

    negcutoff = len(negfeats)*3//4
    poscutoff = len(posfeats)*3//4

    print(negcutoff)
    print(poscutoff)

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
    print('pos precision:', precision(refsets['pos'], testsets['pos']))
    print('pos recall:', recall(refsets['pos'], testsets['pos']))
    print('neg precision:', precision(refsets['neg'], testsets['neg']))
    print('neg recall:', recall(refsets['neg'], testsets['neg']))
    classifier.show_most_informative_features()
Code Example #18
    def predict(self, data_vectorized):
        start_time = time.time()

        # format the testfeats in the format nltk asks for
        # use the word without vectorizing
        testfeats = []
        logging.info('create the testing feats')
        for i, feat in enumerate(data_vectorized['x_test']):

            feat_as_words = self.bigram_word_feats(
                self.tokenizer.tokenize(feat))
            label = data_vectorized['y_test'][i]
            testfeats.append((feat_as_words, label))

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = self.classifier.classify(feats)
            testsets[observed].add(i)
        self.time_prediction = (time.time() - start_time)

        logging.info("Results for" + self.name + "with nltk scoring methods")
        logging.info("Training time: %fs; Prediction time: %fs" %
                     (self.time_training, self.time_prediction))
        # logging.info(
        #     classification_report(data_vectorized['y_test'], self.predictions, target_names=target_names))

        logging.info('--- accuracy: %6.2f ---' %
                     nltk.classify.util.accuracy(self.classifier, testfeats))
        logging.info('--- pos precision: %6.2f ---' %
                     precision(refsets[1], testsets[1]))
        logging.info('--- pos recall: %6.2f ---' %
                     recall(refsets[1], testsets[1]))
        logging.info('--- neg precision: %6.2f ---' %
                     precision(refsets[0], testsets[0]))
        logging.info('--- neg recall: %6.2f ---' %
                     recall(refsets[0], testsets[0]))
        logging.info("--- testing done - %6.2f seconds ---" %
                     (time.time() - start_time))
        logging.info(self.classifier.most_informative_features(n=10))
        self.classifier.show_most_informative_features()
Code Example #19
File: SentimentAnalysis2.py Project: aafeliz/twitter
    def __init__(self):
        #reading pre-labeled input and splitting into lines
        posSentences = open('posMessages.txt', 'r')  # the filenames were swapped in the original
        negSentences = open('negMessages.txt', 'r')
        posSentences = re.split(r'\n', posSentences.read())
        negSentences = re.split(r'\n', negSentences.read())

        posFeatures = []
        negFeatures = []
        #breaks up the sentences into lists of individual words (as selected by the input mechanism) and appends 'pos' or 'neg' after each list
        for i in posSentences:
            posWords = re.findall(r"[\w']+|[.,!?;]", i)
            posWords = [self.make_full_dict(posWords), 'pos']
            posFeatures.append(posWords)
        for i in negSentences:
            negWords = re.findall(r"[\w']+|[.,!?;]", i)
            negWords = [self.make_full_dict(negWords), 'neg']
            negFeatures.append(negWords)

        # selects 3/4 of the features to be used for training and 1/4 to be used for testing
        posCutoff = int(math.floor(len(posFeatures) * 3 / 4))
        negCutoff = int(math.floor(len(negFeatures) * 3 / 4))
        trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
        testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]

        self.classifier = NaiveBayesClassifier.train(trainFeatures)

        referenceSets = {'pos':set([]), 'neg':set([])}
        testSets = {'pos':set([]), 'neg':set([])}

        for i, (features, label) in enumerate(testFeatures):
            referenceSets[label].add(i)
            predicted = self.classifier.classify(features)
            testSets[predicted].add(i)
        print('train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures)))
        print('accuracy:', nltk.classify.util.accuracy(self.classifier, testFeatures))
        print('pos precision:', precision(referenceSets['pos'], testSets['pos']))
        print('pos recall:', recall(referenceSets['pos'], testSets['pos']))
        print('neg precision:', precision(referenceSets['neg'], testSets['neg']))
        print('neg recall:', recall(referenceSets['neg'], testSets['neg']))
        self.classifier.show_most_informative_features(10)
Code Example #20
    def performance(self, test_data):
        prediction = self.predict(test_data)
        pos_loc = set(
            [i for i in range(len(prediction)) if prediction[i] == 'pos'])
        neg_loc = set(range(len(prediction))) - pos_loc
        pos_ref = set(
            [i for i in range(len(prediction)) if test_data[i][1] == 'pos'])
        neg_ref = set(range(len(prediction))) - pos_ref
        print('===============================\n')
        print('Model Summary:\n')
        print(self.classifier_type + ' with features ' + self.feats.__name__ +
              '\n')
        print('Overall Accuracy: %.3f\n' %
              (nltk.classify.util.accuracy(self.classifier, test_data)))
        print('Positive Precision: %.3f\n' %
              (nltk.precision(pos_ref, pos_loc)))
        print('Positive Recall: %.3f\n' % (nltk.recall(pos_ref, pos_loc)))
        print('Negative Precision: %.3f\n' %
              (nltk.precision(neg_ref, neg_loc)))
        print('Negative Recall: %.3f\n' % (nltk.recall(neg_ref, neg_loc)))
Code Example #21
    def classifier_metrics(self, test_set):
        """Method to print the classifier metrics(precision, recall, accuracy, f-measure) and the most informative features
		The NLTK metrics module provides functions for calculating all three metrics but we need build 2 sets for each classification label
	    Args: 
	       comment text  
	    Returns:
	    	None
	    """
        classifier = self.model
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for i, (features, label) in enumerate(test_set):
            refsets[label].add(i)
            observed = classifier.classify(features)
            testsets[observed].add(i)

        print("The Naive bayes classifier accuracy is : {}".format(
            nltk.classify.accuracy(classifier, test_set)))
        print('')
        print(
            "The Naive bayes classifier positive sentiment precision is : {}".
            format(precision(refsets['pos'], testsets['pos'])))
        print(
            "The Naive bayes classifier negative sentiment precision is : {}".
            format(precision(refsets['neg'], testsets['neg'])))
        print('')
        print("The Naive bayes classifier positive sentiment recall is : {}".
              format(recall(refsets['pos'], testsets['pos'])))
        print("The Naive bayes classifier negative sentiment recall is : {}".
              format(recall(refsets['neg'], testsets['neg'])))
        print('')
        print(
            "The Naive bayes classifier positive sentiment f-measure is : {}".
            format(f_measure(refsets['pos'], testsets['pos'])))
        print(
            "The Naive bayes classifier negative sentiment f-measure is : {}".
            format(f_measure(refsets['neg'], testsets['neg'])))
        print('')
        print('classifier top 5 most informative features:')
        classifier.show_most_informative_features(5)  # prints itself; its return value is None
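A quick illustrative check of how these three metrics relate (the index sets below are made up): with NLTK's default alpha of 0.5, f_measure is the harmonic mean of precision and recall.

from nltk.metrics import precision, recall, f_measure

ref, test = {0, 1, 2}, {1, 2, 3}   # gold indices vs. predicted indices
p, r = precision(ref, test), recall(ref, test)
print(f_measure(ref, test), 2 * p * r / (p + r))  # both print 0.666...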
Code Example #22
File: Script.py Project: schoff43/Lemondo-parsing
def precision_recall(classifier, test_set):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (sent, category) in enumerate(test_set):
        refsets[category].add(i)
        observed = classifier.classify(sent)
        testsets[observed].add(i)
    prec = {}
    rec = {}
    for category in leMonde.categories():
        prec[category] = nltk.precision(refsets[category], testsets[category])
        rec[category] = nltk.recall(refsets[category], testsets[category])
    return prec, rec
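The refsets/testsets index-set pattern used throughout these examples can be sanity-checked in isolation; the labels below are invented for illustration:

import collections
from nltk.metrics import precision, recall

gold = ['pos', 'pos', 'neg', 'neg']
pred = ['pos', 'neg', 'neg', 'neg']
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (g, p) in enumerate(zip(gold, pred)):
    refsets[g].add(i)   # indices of the gold labels
    testsets[p].add(i)  # indices of the predicted labels
print(precision(refsets['pos'], testsets['pos']))  # 1/1 = 1.0
print(recall(refsets['pos'], testsets['pos']))     # 1/2 = 0.5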
Code Example #23
File: tags2.py Project: shengo/Python-IFG
def precision_recall(classifier, test_set):  # build the precision and recall lists
    refsets = collections.defaultdict(set)
    testset = collections.defaultdict(set)
    for i, (sent, category) in enumerate(test_set):
        refsets[category].add(i)
        observed = classifier.classify(sent)
        testset[observed].add(i)
    prec = {}  # dict for precision
    rapp = {}  # dict for recall
    for category in leMonde.categories():  # for each category
        prec[category] = nltk.precision(refsets[category], testset[category])  # precision for this category
        rapp[category] = nltk.recall(refsets[category], testset[category])  # recall for this category
    return prec, rapp  # return both dicts
Code Example #24
File: full.py Project: shengo/Python-IFG
def precision_recall(classifier, test_set):
    refsets = collections.defaultdict(set)
    testset = collections.defaultdict(set)
    for i, (sent, category) in enumerate(test_set):
        refsets[category].add(i)
        observed = classifier.classify(sent)
        testset[observed].add(i)
    prec = {}
    rec = {}
    for category in leMonde.categories():
        prec[category] = nltk.precision(refsets[category], testset[category])
        rec[category] = nltk.recall(refsets[category], testset[category])
    return prec, rec
Code Example #25
def log_precision_recall(classifier, test_set, file):
    """ Computes and logs the precision and recall metrics into a file, given a 
    classifier and a test set
    """
    known_set = collections.defaultdict(set)
    computed_set = collections.defaultdict(set)

    for i, (features, label) in enumerate(test_set):
        known_set[label].add(i)
        predicted = classifier.classify(features)
        computed_set[predicted].add(i)

    line = 'pos precision: ' + str(
        precision(known_set['pos'], computed_set['pos']))
    file.write(line + '\n')
    line = 'pos recall: ' + str(recall(known_set['pos'], computed_set['pos']))
    file.write(line + '\n')
    line = 'neg precision: ' + str(
        precision(known_set['neg'], computed_set['neg']))
    file.write(line + '\n')
    line = 'neg recall: ' + str(recall(known_set['neg'], computed_set['neg']))
    file.write(line + '\n')
Code Example #26
def evaluate_model(MaxEntClassifier):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    accuracy = classify.accuracy(MaxEntClassifier, validation_features)*100
    accuracy_list.append(accuracy)

    for i, (feats, label) in enumerate(validation_features):
        refsets[label].add(i)
        observed = MaxEntClassifier.classify(feats)
        testsets[observed].add(i)

    # compute the metrics once, after the whole validation set has been scored
    # (the original recomputed and appended them on every loop iteration)
    negative_precision = precision(refsets['negative'], testsets['negative'])
    positive_precision = precision(refsets['positive'], testsets['positive'])
    positive_recall = recall(refsets['positive'], testsets['positive'])
    negative_recall = recall(refsets['negative'], testsets['negative'])
    try:
        avg_recall = 0.5*(negative_recall+positive_recall)
        avg_precision = 0.5*(negative_precision+positive_precision)
        precision_list.append(avg_precision)
        recall_list.append(avg_recall)
    except TypeError:  # precision/recall return None when a label never occurs
        pass
    return precision_list, recall_list, accuracy_list
Code Example #27
def evaluate_svm(y_hat, test_reviews):
    # For computing metrics
    ref_set = collections.defaultdict(set)
    test_set = collections.defaultdict(set)
    ref_set_arr = []
    test_set_arr = []
    # Create gold standard and predicted labels
    for i, label in enumerate(test_reviews):
        label = "neg" if label == 0 else "pos"
        observed = "neg" if y_hat[i] == 0 else "pos"
        ref_set[label].add(i)
        test_set[observed].add(i)

        label = 0 if label == "neg" else 1
        observed = 0 if observed == "neg" else 1
        ref_set_arr.append(label)
        test_set_arr.append(observed)

    print('pos precision:', precision(ref_set['pos'], test_set['pos']))
    print('pos recall:', recall(ref_set['pos'], test_set['pos']))
    print('neg precision:', precision(ref_set['neg'], test_set['neg']))
    print('neg recall:', recall(ref_set['neg'], test_set['neg']))
    print('misclassification rate', zero_one_loss(ref_set_arr, test_set_arr))
Code Example #28
def get_pr(classifier, features_train, features_test):
    refsets = defaultdict(set)
    testsets = defaultdict(set)
    for i, (feats, label) in enumerate(features_test):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    # [0] = positive precision
    # [1] = negative precision
    p = [
        precision(refsets['Positive'], testsets['Positive']),
        precision(refsets['Negative'], testsets['Negative'])
    ]

    # [0] = positive recall
    # [1] = negative recall
    r = [
        recall(refsets['Positive'], testsets['Positive']),
        recall(refsets['Negative'], testsets['Negative'])
    ]

    return [p, r]
Code Example #29
File: utilities.py Project: kaliaanup/XAIProject
def get_accuracy_measures(classifier, testing_data, p_label):
    actuallabels = collections.defaultdict(set)
    predictedlabels = collections.defaultdict(set)

    for i, (tokens, label) in enumerate(testing_data):
        actuallabels[label].add(i)
        predicted = classifier.classify(tokens)
        predictedlabels[predicted].add(i)
        
    result=[]
    result.append(precision(actuallabels[p_label], predictedlabels[p_label]))
    result.append(recall(actuallabels[p_label], predictedlabels[p_label]))
    result.append(f_measure(actuallabels[p_label], predictedlabels[p_label]))
    return result
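A possible driver for get_accuracy_measures, assuming the metric functions are imported as in the snippet; the toy data below is invented for illustration:

import nltk

training_data = [({'good': True}, 'pos'), ({'bad': True}, 'neg')] * 5
testing_data = [({'good': True}, 'pos'), ({'bad': True}, 'neg')]
classifier = nltk.NaiveBayesClassifier.train(training_data)
# returns [precision, recall, f_measure] for the requested label
print(get_accuracy_measures(classifier, testing_data, 'pos'))  # expect [1.0, 1.0, 1.0]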
Code Example #30
def evaluate_classifier(classifier, validationFeatures):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(validationFeatures):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    accuracy = nltk.classify.util.accuracy(classifier, validationFeatures)
    pos_precision = precision(refsets['POSITIVE_TIME'],
                              testsets['POSITIVE_TIME'])
    pos_recall = recall(refsets['POSITIVE_TIME'], testsets['POSITIVE_TIME'])
    pos_f_measure = f_measure(refsets['POSITIVE_TIME'],
                              testsets['POSITIVE_TIME'])
    neg_precision = precision(refsets['NEGATIVE_TIME'],
                              testsets['NEGATIVE_TIME'])
    neg_recall = recall(refsets['NEGATIVE_TIME'], testsets['NEGATIVE_TIME'])
    neg_f_measure = f_measure(refsets['NEGATIVE_TIME'],
                              testsets['NEGATIVE_TIME'])

    print('accuracy:', accuracy)
    print('pos precision:', pos_precision)
    print('pos recall:', pos_recall)
    print('pos f-measure:', pos_f_measure)
    print('neg precision:', neg_precision)
    print('neg recall:', neg_recall)
    print('neg f-measure:', neg_f_measure)

    return {
        'accuracy': accuracy,
        'pos precision': pos_precision,
        'pos recall': pos_recall,
        'pos f-measure': pos_f_measure,
        'neg precision': neg_precision,
        'neg recall': neg_recall,
        'neg f-measure': neg_f_measure
    }
Code Example #31
def evaluate_classifier(classifier):
    aspects = AspectFinder.AspectFinder().get_aspects()

    minus = [f for f in aspects if f[1][0]=="-"]
    plus = [f for f in aspects if f[1][0]=="+"]

    sentences = [' '.join(s) for s in product_reviews_1.sents()]

    minusfeats = [(word_feats(s.split()), '-') for s in sentences for f in minus if s.find(f[0])!=-1]
    plusfeats = [(word_feats(s.split()), '+') for s in sentences for f in plus if s.find(f[0])!=-1]

    minuscutoff = int(len(minusfeats)*3/4)
    pluscutoff = int(len(plusfeats)*3/4)

    trainfeats = minusfeats[:minuscutoff] + plusfeats[:pluscutoff]
    testfeats = minusfeats[minuscutoff:] + plusfeats[pluscutoff:]
    print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))


    classifier = train(classifier, trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
    # the classifier was trained with '+'/'-' labels, so index the sets with those
    print('pos precision:', precision(refsets['+'], testsets['+']))
    print('pos recall:', recall(refsets['+'], testsets['+']))
    print('neg precision:', precision(refsets['-'], testsets['-']))
    print('neg recall:', recall(refsets['-'], testsets['-']))
    classifier.show_most_informative_features()
    print(classifier.classify(word_feats(["I", "hate", "it", "."])))
    print(classifier.classify(word_feats(["I", "love", "it", "."])))
Code Example #32
    def getAccuracy(self, classifier, sentences):
        test_set = nltk.classify.apply_features(self.extract_features_unigram,
                                                sentences[:500])
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        classifierResult = {}

        for i, (feats, label) in enumerate(test_set):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        classifierResult['accuracy'] = nltk.classify.util.accuracy(
            classifier, test_set)

        # nltk metrics return None when a set is empty; coerce those to 0.0
        classifierResult['suggestion precision'] = nltk.precision(
            refsets['suggestion'], testsets['suggestion']) or 0.0

        classifierResult['suggestion recall'] = nltk.recall(
            refsets['suggestion'], testsets['suggestion']) or 0.0

        classifierResult['nonsuggestion precision'] = nltk.precision(
            refsets['nonsuggestion'], testsets['nonsuggestion']) or 0.0

        classifierResult['nonsuggestion recall'] = nltk.recall(
            refsets['nonsuggestion'], testsets['nonsuggestion']) or 0.0

        return classifierResult
Code Example #33
def precision_recall(classifier, testfeats):
    """ computes precision and recall of a classifier """

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    precisions = {}
    recalls = {}

    for label in classifier.labels():
        precisions[label] = precision(refsets[label], testsets[label])
        recalls[label] = recall(refsets[label], testsets[label])

    return precisions, recalls
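Because precision() and recall() return None for labels that never occur, averaging the returned dicts needs care. Here is a small defensive helper, an addition for illustration rather than part of the original project:

def macro_average(scores):
    # ignore labels whose metric came back as None
    values = [v for v in scores.values() if v is not None]
    return sum(values) / len(values) if values else None

# e.g. precisions, recalls = precision_recall(classifier, testfeats)
#      print(macro_average(precisions), macro_average(recalls))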
Code Example #34
File: Classifier_DT.py Project: Saher-/SATC
def precision_recall(C, test_set):
    """
    :param C: trained classifier
    :param test_set: testing set
    :return: two Dict 1st holds the precision for each label
                      2nd holds the recall for each label
    """
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(test_set):
        refsets[label].add(i)
        observed = C.classify(feats)
        testsets[observed].add(i)

    precisions = {}
    recalls = {}

    for label in C.labels():
        precisions[label] = nltk.precision(refsets[label], testsets[label])
        recalls[label] = nltk.recall(refsets[label], testsets[label])
    return precisions, recalls
Code Example #35
def evaluate_classifier(featx):

    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]
    #print(negfeats)

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)
    #print(negcutoff)

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    #print(trainfeats)
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifierName = 'SVM'
    classifier = SklearnClassifier(LinearSVC(), sparse=False).train(trainfeats)
    #classifier.train(trainfeats)

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    #print(testfeats)
    for i, (feats, label) in enumerate(testfeats):
        # feats: featureset dict, label: gold 'neg'/'pos', observed: prediction
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    accuracy = nltk.classify.util.accuracy(classifier, testfeats)
    pos_precision = nltk.precision(refsets['pos'], testsets['pos'])
    pos_recall = nltk.recall(refsets['pos'], testsets['pos'])
    pos_fmeasure = nltk.f_measure(refsets['pos'], testsets['pos'])
    neg_precision = nltk.precision(refsets['neg'], testsets['neg'])
    neg_recall = nltk.recall(refsets['neg'], testsets['neg'])
    neg_fmeasure = nltk.f_measure(refsets['neg'], testsets['neg'])

    print('')
    print('---------------------------------------')
    print('SINGLE FOLD RESULT ' + '(' + classifierName + ')')
    print('---------------------------------------')
    print('accuracy:', accuracy)
    print('precision', (pos_precision + neg_precision) / 2)
    print('recall', (pos_recall + neg_recall) / 2)
    print('f-measure', (pos_fmeasure + neg_fmeasure) / 2)

    #classifier.show_most_informative_features()

    print('')

    ## CROSS VALIDATION

    trainfeats = negfeats + posfeats

    # SHUFFLE TRAIN SET
    # As in cross validation, the test chunk might have only negative or only positive data
    random.shuffle(trainfeats)
    n = 5  # 5-fold cross-validation

    subset_size = int(len(trainfeats) / n)
    accuracy = []
    pos_precision = []
    pos_recall = []
    neg_precision = []
    neg_recall = []
    pos_fmeasure = []
    neg_fmeasure = []
    cv_count = 1
    for i in range(n):
        testing_this_round = trainfeats[i * subset_size:][:subset_size]
        training_this_round = trainfeats[:i * subset_size] + trainfeats[
            (i + 1) * subset_size:]

        classifierName = 'SVM'
        classifier = SklearnClassifier(LinearSVC(), sparse=False)
        classifier.train(training_this_round)

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        # use a separate index so the fold counter i isn't shadowed
        for j, (feats, label) in enumerate(testing_this_round):
            refsets[label].add(j)
            observed = classifier.classify(feats)
            testsets[observed].add(j)

        cv_accuracy = nltk.classify.util.accuracy(classifier,
                                                  testing_this_round)
        cv_pos_precision = nltk.precision(refsets['pos'], testsets['pos'])
        cv_pos_recall = nltk.recall(refsets['pos'], testsets['pos'])
        cv_pos_fmeasure = nltk.f_measure(refsets['pos'], testsets['pos'])
        cv_neg_precision = nltk.precision(refsets['neg'], testsets['neg'])
        cv_neg_recall = nltk.recall(refsets['neg'], testsets['neg'])
        cv_neg_fmeasure = nltk.f_measure(refsets['neg'], testsets['neg'])

        accuracy.append(cv_accuracy)
        pos_precision.append(cv_pos_precision)
        pos_recall.append(cv_pos_recall)
        neg_precision.append(cv_neg_precision)
        neg_recall.append(cv_neg_recall)
        pos_fmeasure.append(cv_pos_fmeasure)
        neg_fmeasure.append(cv_neg_fmeasure)

        cv_count += 1

    print('---------------------------------------')
    print('N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')')
    print('---------------------------------------')
    print('accuracy:', sum(accuracy) / n)
    print('precision', (sum(pos_precision) / n + sum(neg_precision) / n) / 2)
    print('recall', (sum(pos_recall) / n + sum(neg_recall) / n) / 2)
    print('f-measure', (sum(pos_fmeasure) / n + sum(neg_fmeasure) / n) / 2)
    print('')
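One caveat about the cross-validation loop above (an observation added here, not in the original code): nltk.precision/recall/f_measure return None for a fold in which a label never occurs, which would make the sum() calls fail. A defensive mean, treating None as 0.0, might look like:

def safe_mean(values):
    # count None (label absent in that fold) as 0.0 so sum() cannot fail
    return sum(v if v is not None else 0.0 for v in values) / len(values)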
Code Example #36
NBayesClassifier = nltk.NaiveBayesClassifier.train(trainFeat)

#NBResultLabels=[NBayesClassifier.classify(extract_features(tweet[0])) for tweet in testData]

print("Accuracy : " + str(nltk.classify.util.accuracy(NBayesClassifier, testFeat)*100) + " %")

refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

for i, (feats, label) in enumerate(testFeat):
    refsets[label].add(i)
    observed = NBayesClassifier.classify(feats)
    testsets[observed].add(i)

print('sarcasm precision : ' + str(precision(refsets['sarcasm'], testsets['sarcasm']) * 100) + " %")
print('sarcasm recall : ' + str(recall(refsets['sarcasm'], testsets['sarcasm']) * 100) + " %")
print('sarcasm F-measure : ' + str(f_measure(refsets['sarcasm'], testsets['sarcasm']) * 100) + " %")

print('non-sarcasm precision : ' + str(precision(refsets['non-sarcasm'], testsets['non-sarcasm']) * 100) + " %")
print('non-sarcasm recall : ' + str(recall(refsets['non-sarcasm'], testsets['non-sarcasm']) * 100) + " %")
print('non-sarcasm F-measure : ' + str(f_measure(refsets['non-sarcasm'], testsets['non-sarcasm']) * 100) + " %")

#NBayesClassifier.show_most_informative_features(100)
# print(NBResultLabels)

# if NBResultLabels.count('positive')>NBResultLabels.count('negative'):
# print "NB Result Sarcastic Sentiment\t\t:" + str(100*NBResultLabels.count('sarcasm')/len(NBResultLabels))+"%"
# else:
# print "NB Result Non-Sarcastic Sentiment\t:" + str(100*NBResultLabels.count('non-sarcasm')/len(NBResultLabels))+"%"
Code Example #37
File: bay10.py Project: rolandinsh/om
    test_tweets = BuildFeatureVector(testing_this_round)
    train_tweets = BuildFeatureVector(training_this_round)
    print(len(train_tweets))
    print(len(test_tweets))

    training_set = nltk.classify.apply_features(extract_features, train_tweets)
    test_set = nltk.classify.apply_features(extract_features, test_tweets)

    NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

    #NBClassifier.show_most_informative_features(2)

    TestSet(testing_this_round)

    KF_metrics_accuracy.append(nltk.classify.accuracy(NBClassifier, test_set))
    KF_metrics_NEU.append((precision(refSet['NEU'], testSet['NEU']), recall(refSet['NEU'], testSet['NEU']), f_measure(refSet['NEU'], testSet['NEU'])))
    KF_metrics_POS.append((precision(refSet['POZ'], testSet['POZ']), recall(refSet['POZ'], testSet['POZ']), f_measure(refSet['POZ'], testSet['POZ'])))
    KF_metrics_NEG.append((precision(refSet['NEG'], testSet['NEG']), recall(refSet['NEG'], testSet['NEG']), f_measure(refSet['NEG'], testSet['NEG'])))

print('TEST accuracy:', sum(KF_metrics_accuracy) / float(len(KF_metrics_accuracy)))
print('')
print('')
# each KF_metrics_* entry is one fold's (precision, recall, F-measure) tuple,
# so average each component across folds instead of indexing a single fold
print('NEU precision:', sum(m[0] for m in KF_metrics_NEU) / float(len(KF_metrics_NEU)))
print('NEU recall:', sum(m[1] for m in KF_metrics_NEU) / float(len(KF_metrics_NEU)))
print('NEU F-measure:', sum(m[2] for m in KF_metrics_NEU) / float(len(KF_metrics_NEU)))
print('')
print('POS precision:', sum(m[0] for m in KF_metrics_POS) / float(len(KF_metrics_POS)))
print('POS recall:', sum(m[1] for m in KF_metrics_POS) / float(len(KF_metrics_POS)))
print('POS F-measure:', sum(m[2] for m in KF_metrics_POS) / float(len(KF_metrics_POS)))
print('')
print('NEG precision:', sum(m[0] for m in KF_metrics_NEG) / float(len(KF_metrics_NEG)))
Code Example #38
########################################################################################
########################################################################################

refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

header = 'SINGLE FOLD RESULT ' + '(' + 'linear-svc' + ')'  # renamed: don't shadow the built-in str
# training with LinearSVC
classifier = SklearnClassifier(LinearSVC())
classifier.train(trainfeats)
for i, (feats, label) in enumerate(testfeats):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)
accuracy = nltk.classify.util.accuracy(classifier, testfeats) * 100
pos_precision = precision(refsets['pos'], testsets['pos'])

pos_recall = recall(refsets['pos'], testsets['pos'])
pos_fmeasure = f_measure(refsets['pos'], testsets['pos'])
neg_precision = precision(refsets['neg'], testsets['neg'])
neg_recall = recall(refsets['neg'], testsets['neg'])
neg_fmeasure = f_measure(refsets['neg'], testsets['neg'])
print('')
print('---------------------------------------')
print(header)
print('---------------------------------------')
print('accuracy: ', accuracy, '%')
print('precision', (pos_precision + neg_precision) / 2)
print('recall', (pos_recall + neg_recall) / 2)
print('f-measure', (pos_fmeasure + neg_fmeasure) / 2)
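SklearnClassifier also accepts a scikit-learn Pipeline in place of a bare estimator, useful when TF-IDF weighting or feature selection should run before the SVM. A sketch in the spirit of the example in nltk's own documentation (the k=1000 feature-selection cutoff is an arbitrary choice):

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

pipeline = Pipeline([('tfidf', TfidfTransformer()),
                     ('chi2', SelectKBest(chi2, k=1000)),
                     ('svc', LinearSVC())])
classifier = SklearnClassifier(pipeline).train(trainfeats)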
Code Example #39
def evaluate_features(feature_select):
    posFeatures_train = []
    negFeatures_train = []
    posFeatures_test = []
    negFeatures_test = []
    #http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    #breaks up the sentences into lists of individual words (as selected by the input mechanism) and appends 'pos' or 'neg' after each list
    with open(POS_FILE_TRAIN, "rb") as f:
        posSentences_train = f.readlines()
        #posSentences = pos_data.split('\n')
    random.shuffle(posSentences_train)

    with open(POS_FILE_TEST, "rb") as f:
        posSentences_test = f.readlines()
        #posSentences = pos_data.split('\n')
    random.shuffle(posSentences_test)

    with open(NEG_FILE_TRAIN, "rb") as f:
        #negSentences = f.read().split('\n')
        negSentences_train = f.readlines()
    random.shuffle(negSentences_train)

    with open(NEG_FILE_TEST, "rb") as f:
        #negSentences = f.read().split('\n')
        negSentences_test = f.readlines()
    random.shuffle(negSentences_test)

    #with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
    for i in posSentences_train:
        str_i = i.decode("utf-8").strip()  # decode the raw bytes once, before tokenizing
        posWords = re.findall(r"[\w']+|[.,!?;@#]", str_i)
        posWords = [feature_select(posWords), 'pos']  #pos = contains location
        posFeatures_train.append(posWords)
        Train_twit_Dic[frozenset(posWords[0].items())] = str_i

    for i in posSentences_test:
        str1 = i.decode("utf-8").strip()
        posWords_test = re.findall(r"[\w']+|[.,!?;@#]", str1)
        posWords_test = [feature_select(posWords_test),
                         'pos']  #pos = contains location
        posFeatures_test.append(posWords_test)
        Test_twit_Dic[frozenset(posWords_test[0].items())] = str1

    #with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
    for i in negSentences_train:
        str2 = i.decode("utf-8").strip()
        negWords = re.findall(r"[\w']+|[.,!?;@#]", str2)
        negWords = [feature_select(negWords),
                    'neg']  #neg = doesn't contain location
        negFeatures_train.append(negWords)
        Train_twit_Dic[frozenset(negWords[0].items())] = str2

    for i in negSentences_test:
        str3 = i.decode("utf-8").strip()
        negWords_test = re.findall(r"[\w']+|[.,!?;@#]", str3)
        negWords_test = [feature_select(negWords_test),
                         'neg']  #neg = doesn't contain location
        negFeatures_test.append(negWords_test)
        Test_twit_Dic[frozenset(negWords_test[0].items())] = str3
    #selects 3/4 of the features to be used for training and 1/4 to be used for testing
    #posCutoff = int(math.floor(len(posFeatures)*3/4))
    #negCutoff = int(math.floor(len(negFeatures)*3/4))
    #trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff] ###need to understand what is test here
    #testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]
    trainFeatures = posFeatures_train + negFeatures_train
    testFeatures = posFeatures_test + negFeatures_test

    ##############################################################################
    #trains a Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)

    #initiates referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    #puts correctly labeled sentences in referenceSets and the predictively labeled version in testsets
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)
        ####################   MINE   ####################################
        if predicted == "pos":  # the classifier says this tweet contains a location
            twiit = Test_twit_Dic[frozenset(features.items())]
            list_close_twits = Close_Twt_Dic[twiit]
            words = nltk.word_tokenize(twiit)
            tagged_words = ner_tagger.tag(words)
            lbl = ""
            for tag_w in tagged_words:
                if tag_w[1] == "LOCATION":
                    lbl = lbl + tag_w[0] + " "  # found a location label for this tweet
            final_lbl = lbl  # moved out of the loop; only the final value matters
            # employ the Stanford-trained NER on the nearby tweets as a fallback
            if lbl == "":  # no location label in the tweet itself; try the tweets in its physical neighborhood
                lbl_list = []
                for s in list_close_twits:
                    words = nltk.word_tokenize(s)
                    tagged_words = ner_tagger.tag(words)
                    lbl = ""
                    for tag_w in tagged_words:
                        if tag_w[1] == "LOCATION":
                            lbl = lbl + tag_w[0] + " "
                    if lbl != "":
                        lbl_list.append(lbl)
                ## find the most common label in lbl_list
                c = Counter(lbl_list)
                most_common = c.most_common(1)
                if most_common:
                    final_lbl = most_common[0][0]

    #prints metrics to show how well the feature selection did
    print('train on %d instances, test on %d instances' %
          (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    #print 'pos precision:', nltk.metrics.precision(referenceSets['pos'], testSets['pos'])
    print('pos precision:', precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', recall(referenceSets['pos'], testSets['pos']))
    #print 'neg precision:', nltk.metrics.precision(referenceSets['neg'], testSets['neg'])
    print('neg precision:', precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', recall(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)
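The ner_tagger used above is never constructed in this fragment. If it is Stanford NER, one plausible setup via nltk's wrapper (the model and jar paths below are placeholders, not the original author's):

from nltk.tag import StanfordNERTagger

# hypothetical paths to a local Stanford NER download
ner_tagger = StanfordNERTagger('classifiers/english.all.3class.distsim.crf.ser.gz',
                               'stanford-ner.jar')
print(ner_tagger.tag('I flew from New York to Paris'.split()))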
Code Example #40
def evaluate_classifier(featx):
    
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]
        
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
 
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    
    # using 2 classifiers (Maximum Entropy is commented out)
    classifier_list = ['nb', 'svm']
    #classifier_list = ['nb', 'maxent', 'svm']
    for cl in classifier_list:
        #if cl == 'maxent':
        #    classifierName = 'Maximum Entropy'
        #    classifier = MaxentClassifier.train(trainfeats, 'GIS', trace=0, encoding=None, labels=None, sparse=True, gaussian_prior_sigma=0, max_iter = 1)
        if cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)
        else:
            classifierName = 'Naive Bayes'
            classifier = NaiveBayesClassifier.train(trainfeats)
            
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
 
        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)
 
        accuracy = nltk.classify.util.accuracy(classifier, testfeats)
        pos_precision = precision(refsets['pos'], testsets['pos'])
        pos_recall = recall(refsets['pos'], testsets['pos'])
        pos_fmeasure = f_measure(refsets['pos'], testsets['pos'])
        neg_precision = precision(refsets['neg'], testsets['neg'])
        neg_recall = recall(refsets['neg'], testsets['neg'])
        neg_fmeasure =  f_measure(refsets['neg'], testsets['neg'])
        
        print('')
        print('---------------------------------------')
        print('SINGLE FOLD RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', accuracy)
        print('precision', (pos_precision + neg_precision) / 2)
        print('recall', (pos_recall + neg_recall) / 2)
        print('f-measure', (pos_fmeasure + neg_fmeasure) / 2)

        #classifier.show_most_informative_features()

    print('')
    
    ## CROSS VALIDATION
    
    trainfeats = negfeats + posfeats    
    
    # SHUFFLE TRAIN SET
    # As in cross validation, the test chunk might have only negative or only positive data    
    random.shuffle(trainfeats)    
    n = 5 # 5-fold cross-validation    
    
    for cl in classifier_list:
        
        subset_size = len(trainfeats) // n
        accuracy = []
        pos_precision = []
        pos_recall = []
        neg_precision = []
        neg_recall = []
        pos_fmeasure = []
        neg_fmeasure = []
        cv_count = 1
        for i in range(n):        
            testing_this_round = trainfeats[i*subset_size:][:subset_size]
            training_this_round = trainfeats[:i*subset_size] + trainfeats[(i+1)*subset_size:]
            
            #if cl == 'maxent':
            #    classifierName = 'Maximum Entropy'
            #    classifier = MaxentClassifier.train(training_this_round, 'GIS', trace=0, encoding=None, labels=None, sparse=True, gaussian_prior_sigma=0, max_iter = 1)
            if cl == 'svm':
                classifierName = 'SVM'
                classifier = SklearnClassifier(LinearSVC(), sparse=False)
                classifier.train(training_this_round)
            else:
                classifierName = 'Naive Bayes'
                classifier = NaiveBayesClassifier.train(training_this_round)
                    
            refsets = collections.defaultdict(set)
            testsets = collections.defaultdict(set)
            for i, (feats, label) in enumerate(testing_this_round):
                refsets[label].add(i)
                observed = classifier.classify(feats)
                testsets[observed].add(i)
            
            cv_accuracy = nltk.classify.util.accuracy(classifier, testing_this_round)
            cv_pos_precision = precision(refsets['pos'], testsets['pos'])
            cv_pos_recall = recall(refsets['pos'], testsets['pos'])
            cv_pos_fmeasure = f_measure(refsets['pos'], testsets['pos'])
            cv_neg_precision = precision(refsets['neg'], testsets['neg'])
            cv_neg_recall = recall(refsets['neg'], testsets['neg'])
            cv_neg_fmeasure = f_measure(refsets['neg'], testsets['neg'])
                    
            accuracy.append(cv_accuracy)
            pos_precision.append(cv_pos_precision)
            pos_recall.append(cv_pos_recall)
            neg_precision.append(cv_neg_precision)
            neg_recall.append(cv_neg_recall)
            pos_fmeasure.append(cv_pos_fmeasure)
            neg_fmeasure.append(cv_neg_fmeasure)
            
            cv_count += 1
                
        print('---------------------------------------')
        print('N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', sum(accuracy) / n)
        print('precision', (sum(pos_precision) / n + sum(neg_precision) / n) / 2)
        print('recall', (sum(pos_recall) / n + sum(neg_recall) / n) / 2)
        print('f-measure', (sum(pos_fmeasure) / n + sum(neg_fmeasure) / n) / 2)
        print('')
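The manual slicing above works, but the same folds can be produced with scikit-learn's KFold, which removes the index arithmetic entirely. A sketch over the same shuffled trainfeats list:

from sklearn.model_selection import KFold

kf = KFold(n_splits=5)
for train_idx, test_idx in kf.split(trainfeats):
    training_this_round = [trainfeats[i] for i in train_idx]
    testing_this_round = [trainfeats[i] for i in test_idx]
    # train and score exactly as in the loop above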
Code Example #41
File: productbase.py Project: frederikflpl/BACode
trainfeats = minus1feats[:minus1cutoff] + plus1feats[:plus1cutoff] + \
             minus2feats[:minus2cutoff] + plus2feats[:plus2cutoff] + \
             minus3feats[:minus3cutoff] + plus3feats[:plus3cutoff]
testfeats = minus1feats[minus1cutoff:] + plus1feats[plus1cutoff:] + \
             minus2feats[minus2cutoff:] + plus2feats[plus2cutoff:] + \
             minus3feats[minus3cutoff:] + plus3feats[plus3cutoff:]
print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

classifier = NaiveBayesClassifier.train(trainfeats)
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

for i, (feats, label) in enumerate(testfeats):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)

print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
print('+1 precision:', precision(refsets['+1'], testsets['+1']))
print('+1 recall:', recall(refsets['+1'], testsets['+1']))
print('+2 precision:', precision(refsets['+2'], testsets['+2']))
print('+2 recall:', recall(refsets['+2'], testsets['+2']))
print('+3 precision:', precision(refsets['+3'], testsets['+3']))
print('+3 recall:', recall(refsets['+3'], testsets['+3']))
print('-1 precision:', precision(refsets['-1'], testsets['-1']))
print('-1 recall:', recall(refsets['-1'], testsets['-1']))
print('-2 precision:', precision(refsets['-2'], testsets['-2']))
print('-2 recall:', recall(refsets['-2'], testsets['-2']))
print('-3 precision:', precision(refsets['-3'], testsets['-3']))
print('-3 recall:', recall(refsets['-3'], testsets['-3']))
classifier.show_most_informative_features()
print(classifier.classify(word_feats(["I", "hate", "it", "."])))
print(classifier.classify(word_feats(["I", "love", "it", "."])))
Code Example #42
            minusfeats.append(feat)

if len(minusfeats) > len(plusfeats):
    minusfeats = minusfeats[:len(plusfeats)]
else:
    plusfeats = plusfeats[:len(minusfeats)]

minuscutoff = int(len(minusfeats)*3/4)
pluscutoff = int(len(plusfeats)*3/4)

trainfeats = minusfeats[:minuscutoff] + plusfeats[:pluscutoff]
testfeats = minusfeats[minuscutoff:] + plusfeats[pluscutoff:]
print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

classifier = NaiveBayesClassifier.train(trainfeats)
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

for i, (feats, label) in enumerate(testfeats):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)

print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
print('pos precision:', precision(refsets['+'], testsets['+']))
print('pos recall:', recall(refsets['+'], testsets['+']))
print('neg precision:', precision(refsets['-'], testsets['-']))
print('neg recall:', recall(refsets['-'], testsets['-']))
classifier.show_most_informative_features()
print(classifier.classify(word_feats(["I", "hate", "it", "."])))
print(classifier.classify(word_feats(["I", "love", "it", "."])))
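Truncating the longer list balances the classes, but if the feature lists were built in file order the discarded tail may differ systematically from the kept head. A small variant that undersamples at random instead, assuming the same minusfeats/plusfeats lists:

import random

target = min(len(minusfeats), len(plusfeats))
minusfeats = random.sample(minusfeats, target)
plusfeats = random.sample(plusfeats, target)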
Code Example #43
File: trainModel.py Project: jtlucas/cs5450
    # create training and cross-validation feature sets
    trainCutoff = len(reviews) * 4 // 5
    trainSet = reviews[:trainCutoff]
    cvSet = reviews[trainCutoff:]

    print ("Getting best words..")
    bestwords = getBestWords(trainSet)
    print ("Extracting feature sets..")
    trainFeatureSet = extractFeaturesFromSet(trainSet, bestwords)
    cvFeatureSet = extractFeaturesFromSet(cvSet, bestwords)

    print ("Training model..")
    classifier = NaiveBayesClassifier(trainFeatureSet)

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(cvFeatureSet):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    print ("Training accuracy: ", getClassifierAccuracy(classifier, trainFeatureSet))
    print ("Cross-validation accuracy: ", getClassifierAccuracy(classifier, cvFeatureSet))
    print ("'pos' Precision: ", nltk.precision(refsets['pos'], testsets['pos']))
    print ("'pos' Recall: ", nltk.recall(refsets['pos'], testsets['pos']))
    print ("'neg' Precision: ", nltk.precision(refsets['neg'], testsets['neg']))
    print ("'neg' Recall: ", nltk.recall(refsets['neg'], testsets['neg']))

    classifier.showMostInformativeFeatures(20)
Code Example #44
# Training using the NaiveBayes algorithm
train_set, test_set = train_test_split(featuresets,
                                       test_size=0.33,
                                       random_state=42)
base_classifier = nltk.NaiveBayesClassifier.train(train_set)
print('Accuracy:', nltk.classify.accuracy(base_classifier, test_set))

# Evaluation
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (feats, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = base_classifier.classify(feats)
    testsets[observed].add(i)
print('Precision:', nltk.precision(refsets['pos'], testsets['pos']))
print('Recall:', nltk.recall(refsets['pos'], testsets['pos']))
print('f_measure:', nltk.f_measure(refsets['pos'], testsets['pos']))

## Using the lexicon file (Subjectivity) and define the feature
def readSubjectivity(path):
    flexicon = open(path, 'r')
    # initialize an empty dictionary
    sldict = {}
    for line in flexicon:
        fields = line.split()  # default is to split on whitespace
        # split each field on the '=' and keep the second part as the value
        strength = fields[0].split("=")[1]
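For context, readSubjectivity appears to target the MPQA subjectivity clues file; assuming that format, each line looks like the sample below, and fields[0].split("=")[1] picks out the strength value:

# assumed MPQA subjectivity lexicon line format:
line = 'type=weaksubj len=1 word1=abandoned pos1=adj stemmed1=n priorpolarity=negative'
fields = line.split()
strength = fields[0].split("=")[1]   # -> 'weaksubj'
word = fields[2].split("=")[1]       # -> 'abandoned'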
Code Example #45
File: streamhacker.py Project: msintaha/nlp_thesis
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

negcutoff = len(negfeats) * 3 // 4
poscutoff = len(posfeats) * 3 // 4

trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

classifier = NaiveBayesClassifier.train(trainfeats)
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

for i, (feats, label) in enumerate(testfeats):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)

print('accuracy:', nltk.classify.accuracy(classifier, testfeats))
print('pos precision:', precision(refsets['pos'], testsets['pos']))
# print('pos recall:', recall(refsets['pos'], testsets['pos']))
# print('pos F-measure:', f_measure(refsets['pos'], testsets['pos']))
print('neg precision:', precision(refsets['neg'], testsets['neg']))
# print('neg recall:', recall(refsets['neg'], testsets['neg']))
# print('neg F-measure:', f_measure(refsets['neg'], testsets['neg']))
classifier.show_most_informative_features()
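If scikit-learn is already installed, the refsets/testsets bookkeeping repeated throughout these examples can be condensed into a single classification report. A sketch over the same classifier and testfeats:

from sklearn.metrics import classification_report

y_true = [label for (feats, label) in testfeats]
y_pred = [classifier.classify(feats) for (feats, label) in testfeats]
print(classification_report(y_true, y_pred))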