def extract_features(data):
    """Convert every item of ``data`` into its best-word feature form.

    The best-word collection is fetched once through ``get_best_words()``
    and each item is then handed, together with that collection, to
    ``extractFeatures.best_word_features``.

    Args:
        data: Iterable of items accepted by ``best_word_features``
            (presumably tokenized texts -- confirm against callers).

    Returns:
        list: One ``best_word_features`` result per input item, in the
        order the items were supplied.
    """
    # Imported lazily so the project module is only needed when called.
    import extractFeatures as ef

    selected_words = get_best_words()
    return [ef.best_word_features(item, selected_words) for item in data]
import itertools
import pickle

import evalueClassier as ec
# ``ef`` is used throughout this script; extract_features() imports the same
# module under this alias, but only in its own function scope.
import extractFeatures as ef
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression


def _load_pickle(path):
    """Return the object unpickled from ``path``, closing the file afterwards.

    The file is opened in binary mode: pickle streams are byte data, and
    text mode breaks on Python 3 and on platforms that translate newlines.

    NOTE: unpickling can execute arbitrary code, so only load trusted files.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# select positive and negative features.
pos_review = _load_pickle('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl')
neg_review = _load_pickle('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl')
# The negative reviews are repeated three times before slicing; presumably
# this pads a short negative list up to the 50 items taken below -- confirm.
neg_review = neg_review * 3

# Only the first 50 reviews of each polarity are used.
pos = pos_review[:50]
neg = neg_review[:50]

# Score words against both classes and keep the 1000 best-scoring ones.
word_scores = ef.create_word_scores(pos, neg, 'pos', 'neg')
best_words = ef.find_best_words(word_scores, 1000)

# Reduce each review to its best-word features and attach the class label.
posFeatures = [
    ef.tagFeatures(ef.best_word_features(p, best_words), 'pos') for p in pos
]
negFeatures = [
    ef.tagFeatures(ef.best_word_features(n, best_words), 'neg') for n in neg
]

# divide Features into train devtest and test sets.
# NOTE(review): trainSet takes items [:50] of each class, which includes the
# [40:50] items used for devtestSet, and testSet is identical to devtestSet.
# The classifiers are therefore evaluated on data they were trained on.
# Values are left unchanged here; confirm whether trainSet should be [:40].
trainSet = posFeatures[:50] + negFeatures[:50]
devtestSet = posFeatures[40:50] + negFeatures[40:50]
testSet = posFeatures[40:50] + negFeatures[40:50]
# Parenthesized form prints the same on Python 2 and also runs on Python 3.
print(testSet)

# Candidate sklearn estimators, keyed by the name passed to the evaluator.
classifer_dict = {
    'BernoulliNB': BernoulliNB(),
    'MultinomialNB': MultinomialNB(),
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(),
    'LinearSVC': LinearSVC(),
    'NuSVC': NuSVC(),
}
for classiferName, classiferFunc in classifer_dict.items():
    ec.showEvalueResult(trainSet, devtestSet, classiferName, classiferFunc)
open( '/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl', 'r')) neg_review = pickle.load( open( '/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl', 'r')) neg_review = neg_review * 3 pos = pos_review[:50] neg = neg_review[:50] word_scores = ef.create_word_scores(pos, neg, 'pos', 'neg') best_words = ef.find_best_words(word_scores, 1000) posFeatures = [] for p in pos: pos_selected = ef.best_word_features(p, best_words) posFeatures.append(ef.tagFeatures(pos_selected, 'pos')) negFeatures = [] for n in neg: neg_selected = ef.best_word_features(n, best_words) negFeatures.append(ef.tagFeatures(neg_selected, 'neg')) # divide Features into train devtest and test sets. trainSet = posFeatures[:50] + negFeatures[:50] devtestSet = posFeatures[40:50] + negFeatures[40:50] testSet = posFeatures[40:50] + negFeatures[40:50] #Train and save classifier NuSVC_classifier = SklearnClassifier(NuSVC(probability=True)) NuSVC_classifier.train(trainSet) pickle.dump( NuSVC_classifier, open(