def exercise9():
    """Extra credit: predict the preposition that links a pair of nouns.

    Assignment brief (kept from the original header comments):
      * Design at least 5 features and explain them. Use
        nltk.NaiveBayesClassifier.
      * Report the accuracy of the classifier built using all of the
        features that were designed.
      * Use show_most_informative_features(5) from the classifier to inspect
        the individual feature performance.
      * Which of the features seem to be most influential?

    Trains a Naive Bayes and a Decision Tree classifier on the noun-attached
    instances of the ``ppattach`` training file and prints, for each one, its
    accuracy and its prediction for the noun ``team``; finally prints the
    five most informative Naive Bayes features. Returns None.
    """
    print('Extra Credit')
    from nltk.corpus import ppattach

    training_ppattach_corpus = ppattach.attachments('training')
    # Keep only the instances whose attachment label is 'N'.
    noun_ppattach_corpus = [
        inst for inst in training_ppattach_corpus if inst.attachment == 'N'
    ]
    # noun_features is defined elsewhere in this file; the label to predict
    # is the preposition of each instance.
    features = [(noun_features(inst), inst.prep) for inst in noun_ppattach_corpus]

    # NOTE(review): the first quarter is used for training and the remaining
    # three quarters for testing -- confirm this split is intended.
    cutoff = len(features) // 4
    train_set, test_set = features[:cutoff], features[cutoff:]

    # Naive Bayes Classifier
    classifier1 = nltk.NaiveBayesClassifier.train(train_set)
    # Decision Tree Classifier
    classifier2 = nltk.DecisionTreeClassifier.train(train_set)

    print("Naive Bayes classifier")
    print("Accuracy", nltk.classify.accuracy(classifier1, test_set))
    print("team", classifier1.classify({'noun1': 'team'}), "researchers")

    print("Decision Tree classifier")
    print("Accuracy", nltk.classify.accuracy(classifier2, test_set))
    print("team", classifier2.classify({'noun1': 'team'}), "researchers")

    print("5 features:")
    # show_most_informative_features prints its own table and returns None,
    # so it is called directly instead of being passed to print.
    classifier1.show_most_informative_features(5)
def exercise9():
    """Extra credit: compare two classifiers on preposition prediction.

    Builds (features, preposition) pairs from the 'N'-attached instances of
    the ``ppattach`` training file, trains a Naive Bayes and a Decision Tree
    classifier on the first quarter of them and evaluates both on the rest.
    For each classifier the accuracy and the prediction for the noun
    ``team`` are printed, followed by the five most informative Naive Bayes
    features. Returns None.
    """
    print('Extra Credit')
    from nltk.corpus import ppattach

    training_ppattach_corpus = ppattach.attachments('training')
    noun_ppattach_corpus = [
        inst for inst in training_ppattach_corpus if inst.attachment == 'N'
    ]
    # noun_features comes from elsewhere in this file.
    features = [(noun_features(inst), inst.prep) for inst in noun_ppattach_corpus]

    # NOTE(review): only a quarter of the data is used for training; the
    # other three quarters are held out -- confirm this is intended.
    cutoff = len(features) // 4
    train_set, test_set = features[:cutoff], features[cutoff:]

    # Both models are trained before anything is reported.
    classifier1 = nltk.NaiveBayesClassifier.train(train_set)
    classifier2 = nltk.DecisionTreeClassifier.train(train_set)

    reports = (
        ("Naive Bayes classifier", classifier1),
        ("Decision Tree classifier", classifier2),
    )
    for title, model in reports:
        print(title)
        print("Accuracy", nltk.classify.accuracy(model, test_set))
        print("team", model.classify({'noun1': 'team'}), "researchers")

    print("5 features:")
    # The method prints the table itself and returns None; wrapping it in
    # print() would add a stray "None" line to the output.
    classifier1.show_most_informative_features(5)
def exercise9():
    """Extra credit: Naive Bayes preposition predictor on the ppattach corpus.

    Collects (features, preposition) pairs for every training instance whose
    attachment is 'N', holds out the first 100 pairs for testing, trains on
    the remainder, then prints the accuracy and the list returned by
    ``most_informative_features(5)``.
    """
    print('Extra Credit')

    labeled = []
    for pp in ppattach.attachments('training'):
        if pp.attachment != 'N':
            continue
        # ppattach_features is defined elsewhere in this file.
        feats = ppattach_features(pp.noun1, pp.noun2, pp.verb)
        labeled.append((feats, pp.prep))

    held_out = labeled[:100]
    training = labeled[100:]

    model = nltk.NaiveBayesClassifier.train(training)
    score = nltk.classify.accuracy(model, held_out)
    print(score)
    top_features = model.most_informative_features(5)
    print(top_features)
lk[i]="" tk=tuple(lk) lv=list(value) lv[i]="" tv=tuple(lv) assert tk not in feats feats[tk]=tv return feats def get_list_of_feats_lables(data): # data_tuples=[(ppa,ppa.attachment) for ppa in data] # feat_lable_tuples=nltk.classify.apply_features(get_feats,data) feat_lable_tuples=[(get_feats(ppa),ppa.attachment) for ppa in data] return feat_lable_tuples # print "Train" train_set = [ (get_feats(item), item.attachment) for item in ppattach.attachments('training') ] # print "Dev" dev_set = [ (get_feats(item), item.attachment) for item in ppattach.attachments('devset') ] # print "Test" test_set = [ (get_feats(item), item.attachment) for item in ppattach.attachments('test') ] # print "Done" # train_feats_labels=get_list_of_feats_lables(train_set) # dev_feats_labels=get_list_of_feats_lables(dev_set) # test_feats_labels=get_list_of_feats_lables(test_set) classifier=nltk.NaiveBayesClassifier.train(train_set) #Not sure how to smooth this! devacc = nltk.classify.accuracy(classifier, dev_set) testacc = nltk.classify.accuracy(classifier, test_set)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
9. The PP Attachment Corpus is a corpus describing prepositional phrase
attachment decisions. Each instance in the corpus is encoded as a
PPAttachment object.

Using this sub-corpus, build a classifier that attempts to predict which
preposition is used to connect a given pair of nouns. For example, given the
pair of nouns "team" and "researchers", the classifier should predict the
preposition "of".

For more information on using the PP Attachment Corpus, see the corpus HOWTO
at http://www.nltk.org/howto.
"""
from nltk.corpus import ppattach
import nltk

# Load the training instances once and look at one example.
training = ppattach.attachments('training')
inst = training[2]
print(inst)
print(inst.noun1, inst.prep, inst.noun2)


# Based on the fields of a PPAttachment, put the dictionary entries in as the
# features and prep as the result, directly producing the entries needed for
# a classifier's data set.
def gender_features(NounAndPrep):
    """Return a ``(features, label)`` pair for one attachment instance.

    ``features`` maps 'noun1', 'noun2', 'sent', 'verb' and 'attachment' to
    the same-named attributes of ``NounAndPrep``; ``label`` is its ``prep``.
    """
    # NOTE(review): 'sent' looks like a per-sentence identifier rather than a
    # linguistic property -- confirm it is wanted as a feature.
    return ({
        'noun1': NounAndPrep.noun1,
        'noun2': NounAndPrep.noun2,
        'sent': NounAndPrep.sent,
        'verb': NounAndPrep.verb,
        'attachment': NounAndPrep.attachment
    }, NounAndPrep.prep)


print(gender_features(inst))
default_label = 'N' def classify(self, featureset): return self.default_label def default_feature(item): return { 'verb': item.verb, 'prep': item.prep, 'noun1+verb': (item.noun1, item.verb), 'noun1+prep': (item.noun1, item.prep), 'verb+prep': (item.verb, item.prep), 'verb+prep+noun2': (item.verb, item.prep, item.noun2), 'verb+prep+noun1': (item.verb, item.prep, item.noun1), 'prep+noun2': (item.prep, item.noun2), #'noun1+noun2+prep': (item.noun1, item.noun2, item.prep), } train_set = [ (default_feature(item), item.attachment) for item in ppattach.attachments('training') ] # the dev_set is like the open book exam; while the test_set is like the close book exam dev_set = [ (default_feature(item), item.attachment) for item in ppattach.attachments('devset') ] test_set = [ (default_feature(item), item.attachment) for item in ppattach.attachments('test') ] #classifier = DefaultClassifier() classifier = nltk.NaiveBayesClassifier.train(train_set) devacc = nltk.classify.accuracy(classifier, dev_set) testacc = nltk.classify.accuracy(classifier, test_set) print "all:dev:%lf" % (devacc) print "all:test:%lf" % (testacc)
import nltk
from nltk.corpus import ppattach
import sys


class DefaultClassifier(nltk.classify.ClassifierI):
    """Baseline classifier that answers ``default_label`` for every input."""

    # Label returned for every feature set.
    default_label = 'N'

    def classify(self, featureset):
        """Return ``default_label``; ``featureset`` is not inspected."""
        return self.default_label


def default_feature(item):
    """Return a constant placeholder feature dict; ``item`` is ignored."""
    return {'None': None}


# Pair every instance of each corpus file with its attachment label.
train_set = [
    (default_feature(item), item.attachment)
    for item in ppattach.attachments('training')
]
dev_set = [
    (default_feature(item), item.attachment)
    for item in ppattach.attachments('devset')
]
test_set = [
    (default_feature(item), item.attachment)
    for item in ppattach.attachments('test')
]

# The baseline needs no training; train_set is built but not used below.
classifier = DefaultClassifier()

devacc = nltk.classify.accuracy(classifier, dev_set)
testacc = nltk.classify.accuracy(classifier, test_set)

# print() with a single pre-formatted string behaves the same on Python 2
# and Python 3; the former print statements were Python 2 only.
print("prep:dev:%lf" % (devacc))
print("prep:test:%lf" % (testacc))
#!/usr/bin/python
# -*- coding: utf-8 -*-
from nltk.corpus import ppattach

# Print the object the corpus reader returns for the training file.
# (The original line carried the marker "doctest: +NORMALIZE_WHITESPACE".)
print(ppattach.attachments('training'))

# Take the first training instance and show its fields on one line,
# followed by its attachment label on the next.
inst = ppattach.attachments('training')[0]
fields = (inst.sent, inst.verb, inst.noun1, inst.prep, inst.noun2)
print(*fields)
print(inst.attachment)
def dummy_print():
    """Print noun1, prep, noun2 and verb of each 'N'-attached training instance.

    Iterates over ``ppattach.attachments('training')`` and prints one line
    per instance whose ``attachment`` equals 'N'. Returns None.
    """
    # A plain loop is used because the work is a side effect; the former list
    # comprehension built a throwaway list of None values.
    for ppobject in ppattach.attachments('training'):
        if ppobject.attachment == 'N':
            print(ppobject.noun1, ppobject.prep, ppobject.noun2, ppobject.verb)
import nltk
from nltk.corpus import ppattach
from pickle import dump

# Load the training instances once and look at one example.
training = ppattach.attachments('training')
inst = training[2]
print(inst)
print((inst.noun1, inst.prep, inst.noun2))


def noun_features(inst):
    """Return a feature dict with both nouns and their part-of-speech tags.

    ``nltk.pos_tag`` takes a list of tokens, so each noun is wrapped in a
    one-element list and only the tag of the resulting (token, tag) pair is
    kept. Passing the bare string would tag its individual characters and
    yield a list, which is not hashable and so cannot be a feature value.
    """
    return {
        'noun1': inst.noun1,
        'noun2': inst.noun2,
        'pos1': nltk.pos_tag([inst.noun1])[0][1],
        'pos2': nltk.pos_tag([inst.noun2])[0][1],
    }


# (features, preposition) pairs for the instances attached to the noun.
# Only the two nouns are used here; noun_features above is not called.
nattach = [({
    'noun1': inst.noun1,
    'noun2': inst.noun2
}, inst.prep) for inst in training if inst.attachment == 'N']

# Hold out the first 10% for testing and train on the rest.
size = int(0.1 * len(nattach))
train_set, test_set = nattach[size:], nattach[:size]

# Alternative model kept from the original for reference:
# classifier = nltk.MaxentClassifier.train(train_set)
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
from nltk.corpus import ppattach

# Print the first instance of each of the three corpus files. print() is
# called as a function so the script also runs on Python 3.
print(ppattach.attachments('training')[0])
print(ppattach.attachments('devset')[0])
print(ppattach.attachments('test')[0])
def exercise0(): documents = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)] featuresets = [(document_features_ex0(d), c) for (d, c) in documents] train_set, test_set = featuresets[100:], featuresets[:100] classifier = nltk.NaiveBayesClassifier.train(train_set) print("Accuracy : ", nltk.classify.accuracy(classifier, test_set)) print("Most Informative Features : ") print(classifier.most_informative_features(5)) nattach = [ inst for inst in ppattach.attachments('training') if inst.attachment == 'N' ] def features(noun1, noun2, verb): feature_set = {} feature_set['noun1_suffix'] = noun1[-1:].lower() feature_set['noun2_suffix'] = noun2[-1:].lower() feature_set['noun1_prefix'] = noun1[0:3].lower() feature_set['noun2_prefix'] = noun2[0:3].lower() feature_set['verb'] = verb.lower() if feature_set['noun1_suffix'] == feature_set['noun2_suffix']: feature_set['special1'] = True else: feature_set['special1'] = False
import nltk
from nltk.corpus import ppattach


def pp_features(pp):
    """Return the two nouns of ``pp`` as a feature dict."""
    return {'noun1': pp.noun1, 'noun2': pp.noun2}


# Training instances attached to the noun, paired with the preposition that
# is to be predicted.
phrases = [pp for pp in ppattach.attachments('training') if pp.attachment == 'N']
featuresets = [(pp_features(pp), pp.prep) for pp in phrases]

# The first 1000 pairs are held out for testing; the rest are for training.
train_set, test_set = featuresets[1000:], featuresets[:1000]

classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
# show_most_informative_features prints its own table and returns None, so it
# is not wrapped in print() (that would add a stray "None" line).
classifier.show_most_informative_features(10)

classifier = nltk.MaxentClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(10)