Ejemplo n.º 1
0
def exercise9():
    """Extra credit: predict the preposition that links a noun pair.

    Exercise brief: design at least 5 features and explain them, using
    nltk.NaiveBayesClassifier; report the accuracy of the classifier built
    with all of the designed features; use
    show_most_informative_features(5) to inspect individual feature
    performance and decide which features seem most influential.

    The function builds (noun_features(inst), inst.prep) pairs from the
    noun-attached ('N') instances of the PP attachment training corpus,
    trains a Naive Bayes and a decision tree classifier on the first
    quarter of those pairs, and prints each classifier's accuracy on the
    remaining pairs together with its prediction for {'noun1': 'team'}.
    """
    print('Extra Credit')

    from nltk.corpus import ppattach

    training_ppattach_corpus = ppattach.attachments('training')
    noun_ppattach_corpus = [inst for inst in training_ppattach_corpus if inst.attachment == 'N']

    features = [(noun_features(inst), inst.prep) for inst in noun_ppattach_corpus]
    # The first quarter of the pairs is used for training, the rest for testing.
    cutoff = int(len(features) / 4)
    train_set, test_set = features[:cutoff], features[cutoff:]

    # Naive Bayes Classifier
    classifier1 = nltk.NaiveBayesClassifier.train(train_set)

    # Decision Tree Classifier
    classifier2 = nltk.DecisionTreeClassifier.train(train_set)

    print("Naive Bayes classifier")
    print("Accuracy", nltk.classify.accuracy(classifier1, test_set))
    print("team", classifier1.classify({'noun1': 'team'}), "researchers")
    print("Decision Tree classifier")
    print("Accuracy", nltk.classify.accuracy(classifier2, test_set))
    print("team", classifier2.classify({'noun1': 'team'}), "researchers")

    print("5 features:")
    # show_most_informative_features() prints its own report and returns
    # None, so it is called directly instead of being handed to print.
    classifier1.show_most_informative_features(5)
Ejemplo n.º 2
0
def exercise9():
    """Compare two classifiers that predict a noun pair's preposition.

    Uses the noun-attached ('N') instances of the PP attachment training
    corpus, featurised with noun_features() and labelled with the
    instance's preposition. Both a Naive Bayes and a decision tree
    classifier are trained on the first quarter of the data; accuracy is
    printed for the remaining three quarters, followed by each
    classifier's prediction for {'noun1': 'team'} and the five most
    informative Naive Bayes features.
    """
    print('Extra Credit')

    from nltk.corpus import ppattach

    training_ppattach_corpus = ppattach.attachments('training')
    noun_ppattach_corpus = [
        inst for inst in training_ppattach_corpus if inst.attachment == 'N'
    ]

    features = [(noun_features(inst), inst.prep)
                for inst in noun_ppattach_corpus]
    # Train on the first quarter; everything after the cutoff is held out.
    cutoff = int(len(features) / 4)
    train_set, test_set = features[:cutoff], features[cutoff:]

    # Naive Bayes Classifier
    classifier1 = nltk.NaiveBayesClassifier.train(train_set)

    # Decision Tree Classifier
    classifier2 = nltk.DecisionTreeClassifier.train(train_set)

    print("Naive Bayes classifier")
    print("Accuracy", nltk.classify.accuracy(classifier1, test_set))
    print("team", classifier1.classify({'noun1': 'team'}), "researchers")
    print("Decision Tree classifier")
    print("Accuracy", nltk.classify.accuracy(classifier2, test_set))
    print("team", classifier2.classify({'noun1': 'team'}), "researchers")

    print("5 features:")
    # The method writes the table to stdout itself and returns None;
    # wrapping it in print() would add a stray "None" line.
    classifier1.show_most_informative_features(5)
Ejemplo n.º 3
0
def exercise9():
    """Train a Naive Bayes preposition classifier and report on it.

    Every noun-attached ('N') instance of the PP attachment training
    corpus becomes a (ppattach_features(noun1, noun2, verb), prep) pair.
    The first 100 pairs are held out for testing and the remainder is
    used for training. Prints the test accuracy and the list returned by
    most_informative_features(5).
    """
    print('Extra Credit')
    noun_attached = (
        pp for pp in ppattach.attachments('training')
        if pp.attachment == 'N'
    )
    labelled = [
        (ppattach_features(pp.noun1, pp.noun2, pp.verb), pp.prep)
        for pp in noun_attached
    ]
    held_out = labelled[:100]
    training = labelled[100:]
    model = nltk.NaiveBayesClassifier.train(training)
    print(nltk.classify.accuracy(model, held_out))
    print(model.most_informative_features(5))
Ejemplo n.º 4
0
            lk[i]=""
            tk=tuple(lk)
            lv=list(value)
            lv[i]=""
            tv=tuple(lv)
            assert tk not in feats
            feats[tk]=tv
    return feats
def get_list_of_feats_lables(data):
    """Pair each instance's feature set with its attachment label.

    Returns a list with one (get_feats(ppa), ppa.attachment) tuple per
    element of ``data``, in iteration order.
    """
    pairs = []
    for ppa in data:
        pairs.append((get_feats(ppa), ppa.attachment))
    return pairs

# Featurise every corpus split as (feature set, attachment label) pairs.
train_set = [
    (get_feats(item), item.attachment)
    for item in ppattach.attachments('training')
]
dev_set = [
    (get_feats(item), item.attachment)
    for item in ppattach.attachments('devset')
]
test_set = [
    (get_feats(item), item.attachment)
    for item in ppattach.attachments('test')
]

# Plain Naive Bayes training; how to smooth it was left as an open question.
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Accuracy on the development split and on the test split.
devacc = nltk.classify.accuracy(classifier, dev_set)
testacc = nltk.classify.accuracy(classifier, test_set)
#!/usr/bin/env python
# -*-coding:utf-8 -*-
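'''
9. The PP Attachment Corpus is a corpus describing prepositional phrase
attachment decisions. Each instance in the corpus is encoded as a
PPAttachment object.
Using this sub-corpus, build a classifier that attempts to predict which
preposition is used to connect a given pair of nouns.
For example, given the pair of nouns "team" and "researchers", the classifier
should predict the preposition "of".
For more information on using the PP Attachment Corpus, see the corpus HOWTO
at http://www.nltk.org/howto.
'''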
from nltk.corpus import ppattach
import nltk

# Look at the third training instance and its noun1 / prep / noun2 fields.
# (The bare, discarded ppattach.attachments('training') call was dropped.)
inst = ppattach.attachments('training')[2]
# Single-argument print calls produce the same output whether run as a
# Python 2 print statement or a Python 3 function call.
print(inst)
print("%s %s %s" % (inst.noun1, inst.prep, inst.noun2))


# Turn a PPAttachment instance into the (features, label) pair a classifier
# expects: its fields become the feature dictionary and its preposition
# becomes the label.
def gender_features(NounAndPrep):
    """Return ``(feature_dict, prep)`` for one instance.

    The feature dictionary holds the instance's noun1, noun2, sent, verb
    and attachment attributes under those same keys. Despite the name,
    nothing here concerns gender.
    """
    feature_names = ('noun1', 'noun2', 'sent', 'verb', 'attachment')
    feature_dict = {
        name: getattr(NounAndPrep, name) for name in feature_names
    }
    return feature_dict, NounAndPrep.prep


# Call form of print: valid on Python 3 and, with a single argument,
# equivalent to the Python 2 print statement.
print(gender_features(inst))
Ejemplo n.º 6
0
    default_label = 'N'
    def classify(self, featureset):
        """Return ``self.default_label``; ``featureset`` is not inspected."""
        label = self.default_label
        return label

def default_feature(item):
    """Build the feature dictionary for one instance.

    Uses the instance's verb and prep directly, plus tuples combining
    verb, prep, noun1 and noun2. The 'noun1+noun2+prep' combination is
    currently left out.
    """
    verb = item.verb
    prep = item.prep
    noun1 = item.noun1
    noun2 = item.noun2

    feats = {'verb': verb, 'prep': prep}
    feats['noun1+verb'] = (noun1, verb)
    feats['noun1+prep'] = (noun1, prep)
    feats['verb+prep'] = (verb, prep)
    feats['verb+prep+noun2'] = (verb, prep, noun2)
    feats['verb+prep+noun1'] = (verb, prep, noun1)
    feats['prep+noun2'] = (prep, noun2)
    return feats

# Featurise each corpus split as (feature dict, attachment label) pairs.
train_set = [ (default_feature(item), item.attachment) for item in ppattach.attachments('training') ]
# the dev_set is like the open book exam; while the test_set is like the close book exam
dev_set = [ (default_feature(item), item.attachment) for item in ppattach.attachments('devset') ]
test_set = [ (default_feature(item), item.attachment) for item in ppattach.attachments('test') ]
# A DefaultClassifier() could be substituted here; that alternative is disabled.
classifier = nltk.NaiveBayesClassifier.train(train_set)


devacc = nltk.classify.accuracy(classifier, dev_set)
testacc = nltk.classify.accuracy(classifier, test_set)

# Single-argument print calls: identical output under Python 2 and valid
# syntax under Python 3.
print("all:dev:%lf" % (devacc))
print("all:test:%lf" % (testacc))
Ejemplo n.º 7
0
import nltk
from nltk.corpus import ppattach
import sys


class DefaultClassifier(nltk.classify.ClassifierI):
    """Baseline classifier that assigns the same label to every input."""

    # Label returned for every feature set.
    default_label = 'N'

    def labels(self):
        """Return the list of labels this classifier can assign.

        ClassifierI requires subclasses to define labels(); the only label
        this classifier ever produces is ``default_label``.
        """
        return [self.default_label]

    def classify(self, featureset):
        """Return ``default_label``; ``featureset`` is not inspected."""
        return self.default_label


def default_feature(item):
    """Return a constant placeholder feature set; ``item`` is not inspected."""
    placeholder = {'None': None}
    return placeholder


# Featurise each corpus split as (feature dict, attachment label) pairs.
train_set = [(default_feature(item), item.attachment)
             for item in ppattach.attachments('training')]
dev_set = [(default_feature(item), item.attachment)
           for item in ppattach.attachments('devset')]
test_set = [(default_feature(item), item.attachment)
            for item in ppattach.attachments('test')]
# Baseline: every instance receives DefaultClassifier.default_label.
classifier = DefaultClassifier()
devacc = nltk.classify.accuracy(classifier, dev_set)
testacc = nltk.classify.accuracy(classifier, test_set)
# Single-argument print calls: identical output under Python 2 and valid
# syntax under Python 3.
print("prep:dev:%lf" % (devacc))
print("prep:test:%lf" % (testacc))
Ejemplo n.º 8
0
#!/usr/bin/python
# -*- coding: utf-8 -*-

from nltk.corpus import ppattach
# Display the training split of the PP attachment corpus.
print(ppattach.attachments('training'))

# Take the first training instance; show its fields on one space-separated
# line, then its attachment label.
inst = ppattach.attachments('training')[0]
print("%s %s %s %s %s" % (
    inst.sent, inst.verb, inst.noun1, inst.prep, inst.noun2))
print(inst.attachment)
Ejemplo n.º 9
0
def dummy_print():
    """Print noun1, prep, noun2 and verb of every noun-attached instance.

    Iterates over the PP attachment training corpus and prints one line
    per instance whose attachment is 'N'. Returns None.
    """
    # A plain loop is used because the work is purely a side effect; the
    # previous list comprehension built a throwaway list of None values.
    for ppobject in ppattach.attachments('training'):
        if ppobject.attachment == 'N':
            print(ppobject.noun1, ppobject.prep, ppobject.noun2,
                  ppobject.verb)
Ejemplo n.º 10
0
import nltk
from nltk.corpus import ppattach
from pickle import dump
# Inspect the third training instance and its (noun1, prep, noun2) triple.
# (The bare ppattach.attachments('training') expression that used to precede
# this was removed: its result was discarded.)
inst = ppattach.attachments('training')[2]
print(inst)
print((inst.noun1, inst.prep, inst.noun2))


def noun_features(inst):
    """Return the noun-pair feature dictionary for one instance.

    Features are the two nouns themselves plus a part-of-speech tag for
    each noun, as assigned by nltk.pos_tag.

    Args:
        inst: object exposing ``noun1`` and ``noun2`` string attributes.

    Returns:
        dict with keys 'noun1', 'noun2', 'pos1' and 'pos2'; every value is
        a string, so it is hashable and usable as a classifier feature.
    """
    # nltk.pos_tag expects a list of tokens and returns (token, tag) pairs.
    # Passing the bare string tagged each character separately and left an
    # unhashable list as the feature value. Each noun is therefore tagged
    # as a one-token sequence and only its tag is kept.
    pos1 = nltk.pos_tag([inst.noun1])[0][1]
    pos2 = nltk.pos_tag([inst.noun2])[0][1]

    return {
        'noun1': inst.noun1,
        'noun2': inst.noun2,
        'pos1': pos1,
        'pos2': pos2,
    }


# Labelled data for the noun-attached ('N') training instances: the noun
# pair is the feature set and the preposition is the label.
nattach = [
    ({'noun1': inst.noun1, 'noun2': inst.noun2}, inst.prep)
    for inst in ppattach.attachments('training')
    if inst.attachment == 'N'
]

# The first tenth is held out for testing; the remainder is used to train.
size = int(0.1 * len(nattach))
test_set = nattach[:size]
train_set = nattach[size:]

# nltk.MaxentClassifier.train(train_set) is a disabled alternative.
classifier = nltk.NaiveBayesClassifier.train(train_set)

print(nltk.classify.accuracy(classifier, test_set))
Ejemplo n.º 11
0
from nltk.corpus import ppattach
# Show the first instance of each corpus split. Single-argument print calls
# give the same output under Python 2 and are valid syntax under Python 3.
print(ppattach.attachments('training')[0])
print(ppattach.attachments('devset')[0])
print(ppattach.attachments('test')[0])
Ejemplo n.º 12
0

def exercise0():
    """Train and evaluate a Naive Bayes classifier on movie reviews.

    Collects (word list, category) pairs for every file of every
    movie_reviews category, featurises each word list with
    document_features_ex0(), holds out the first 100 feature sets for
    testing and trains on the rest. Prints the accuracy and the list
    returned by most_informative_features(5).
    """
    labelled_docs = []
    for category in movie_reviews.categories():
        for fileid in movie_reviews.fileids(category):
            words = list(movie_reviews.words(fileid))
            labelled_docs.append((words, category))

    featuresets = [
        (document_features_ex0(words), label)
        for words, label in labelled_docs
    ]
    held_out = featuresets[:100]
    training = featuresets[100:]

    model = nltk.NaiveBayesClassifier.train(training)
    print("Accuracy : ", nltk.classify.accuracy(model, held_out))
    print("Most Informative Features : ")
    print(model.most_informative_features(5))


# Training instances whose attachment label is 'N'.
nattach = [
    pp
    for pp in ppattach.attachments('training')
    if pp.attachment == 'N'
]


def features(noun1, noun2, verb):
    """Assemble lower-cased affix and verb features for a noun pair.

    The feature set holds the last character and the first three
    characters of each noun, the verb, and a 'special1' flag that is True
    when both nouns end in the same (lower-cased) character.

    NOTE(review): no return statement is visible, so the function returns
    None as written; the definition appears to be truncated. Confirm
    against the complete source.
    """
    suffix1 = noun1[-1:].lower()
    suffix2 = noun2[-1:].lower()
    feature_set = {
        'noun1_suffix': suffix1,
        'noun2_suffix': suffix2,
        'noun1_prefix': noun1[0:3].lower(),
        'noun2_prefix': noun2[0:3].lower(),
        'verb': verb.lower(),
    }
    feature_set['special1'] = suffix1 == suffix2
Ejemplo n.º 13
0
import nltk
from nltk.corpus import ppattach

def pp_features(pp):
    """Return the noun-pair feature dictionary ('noun1', 'noun2') for ``pp``."""
    return dict(noun1=pp.noun1, noun2=pp.noun2)

# Noun-attached ('N') instances of the training split.
phrases = [pp for pp in ppattach.attachments('training') if pp.attachment == 'N']

# Label each noun-pair feature set with its preposition; the first 1000
# pairs are held out for testing and the rest are used for training.
featuresets = [(pp_features(pp), pp.prep) for pp in phrases]
train_set, test_set = featuresets[1000:], featuresets[:1000]

classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
# show_most_informative_features() prints its own table and returns None,
# so wrapping it in print() only added a stray "None" line.
classifier.show_most_informative_features(10)

classifier = nltk.MaxentClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(10)