def classify_reviews(testfolder):
    '''Classifying the actual test data'''
    model=pk.load(open('classifier.p','rb'))
    outputf=open('g4_output.txt','w+')
    for testfile in os.listdir(testfolder):
        # skip hidden files like '.DS_Store' on Mac OS
        if testfile.startswith('.'):
            continue
        testpath=os.path.join(testfolder,testfile)
        linenos,test_reviews=load_test(testpath)
        # run each team member's feature extractor in test mode (no labels);
        # feature_words is assumed to be available at module level, built during training
        test_set_pi=extract_features(test_reviews,mode='test')
        test_set_prabha=classifier_prabha.extract_features(test_reviews,mode='test')
        test_set_david=extract_features_david.extract_features_david(test_reviews,mode='test')
        test_set_jt=classifier_jt.extract_unigram_feature(test_reviews,feature_words,mode='test')
        test_set = combine_sets(test_set_pi, test_set_prabha, test_set_david, test_set_jt, mode='test')
        # write one line per review: filename, 1-based line number, predicted label
        for i, each_res in enumerate(test_set):
            # TODO: treat '[t]' title lines as neutral
            if test_reviews[i].startswith('[t]'):
                outputf.write(str(testfile)+'\t'+str(i+1)+'\t0\n')
            else:
                result=model.classify(each_res)
                outputf.write(str(testfile)+'\t'+str(i+1)+'\t'+str(result)+'\n')
    outputf.close()
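# The combine_sets helper used above is defined elsewhere in the project. The
# sketch below is only an assumption of what it likely does: merge the four
# extractors' feature dicts review-by-review, keeping the label when the sets
# are labelled. The name combine_sets_sketch is hypothetical and is not part
# of the original code.
def combine_sets_sketch(set_pi, set_prabha, set_david, set_jt, mode='train'):
    combined = []
    for pi, prabha, david, jt in zip(set_pi, set_prabha, set_david, set_jt):
        if mode == 'test':
            # test mode: each item is assumed to be a bare feature dict
            features = {}
            for feat in (pi, prabha, david, jt):
                features.update(feat)
            combined.append(features)
        else:
            # training mode: each item is assumed to be a (feature dict, label) tuple
            features = {}
            for feat, _ in (pi, prabha, david, jt):
                features.update(feat)
            combined.append((features, pi[1]))
    return combined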
def evaluate_clf(heldout):
    '''Testing the model on the heldout file'''
    products,scores,reviews=load_text_from_file(heldout)
    # build the labelled feature sets from each extractor and merge them
    heldout_set_pi=extract_features(reviews,scores)
    heldout_set_prabha=classifier_prabha.extract_features(reviews,scores)
    heldout_set_david=extract_features_david.extract_features_david(reviews,scores)
    heldout_set_jt=classifier_jt.extract_unigram_feature(reviews,feature_words,scores)
    heldout_set = combine_sets(heldout_set_pi, heldout_set_prabha, heldout_set_david, heldout_set_jt)
    model=pk.load(open('classifier.p','rb'))
    print 'Accuracy for the heldout set: ',nltk.classify.accuracy(model,heldout_set)
    # show_most_informative_features prints its own output, so no extra print is needed
    model.show_most_informative_features(5)
def train_classifier(trainfile):
    '''Training the classifier'''
    products,scores,reviews=load_text_from_file(trainfile)
    train_set_pi=extract_features(reviews,scores)
    train_set_prabha=classifier_prabha.extract_features(reviews,scores)
    train_set_david=extract_features_david.extract_features_david(reviews,scores)
    # get feature words from review texts
    feature_words = classifier_jt.get_feature_words(reviews, scores)
    train_set_jt=classifier_jt.extract_unigram_feature(reviews,feature_words,scores)
    train_set = combine_sets(train_set_pi, train_set_prabha, train_set_david, train_set_jt)
    # alternative SVM classifier, currently unused in favour of Naive Bayes
    clf=SklearnClassifier(LinearSVC())
    #trainlen=int(len(train_set)*0.9)
    #model=clf.train(train_set)
    model=nltk.NaiveBayesClassifier.train(train_set)
    # persist the trained model so evaluate_clf/classify_reviews can reload it
    pk.dump(model,open('classifier.p','wb'))
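# A minimal driver sketch showing the intended order of the pipeline above:
# train on a labelled file, check accuracy on a heldout file, then label the
# unlabelled test folder. The file and folder names are placeholders, not
# paths from the original project, and it assumes feature_words (built during
# training) is exposed at module level and that the module has no other entry point.
if __name__ == '__main__':
    train_classifier('train.txt')       # fits the model and writes classifier.p
    evaluate_clf('heldout.txt')         # reports accuracy on held-out reviews
    classify_reviews('test_folder')     # writes predictions to g4_output.txt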