def create_word_scores():
    """Score every word by its chi-square informativeness for sentiment.

    Loads segmented positive/negative reviews from Excel, counts word
    frequencies overall and per class, and scores each word with the sum of
    its chi-square association with the 'pos' and 'neg' classes.

    Returns:
        dict mapping word -> informativeness score (higher = more telling).
    """
    posdata = tp.seg_fil_senti_excel(
        "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/pos_review.xlsx", 1, 1
    )
    negdata = tp.seg_fil_senti_excel(
        "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/neg_review.xlsx", 1, 1
    )

    # Flatten the per-review token lists into flat word streams.
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd["pos"][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd["neg"][word] += 1

    pos_word_count = cond_word_fd["pos"].N()
    neg_word_count = cond_word_fd["neg"].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # items() instead of Python-2-only iteritems(): the counting above
    # already uses the NLTK 3 API, so the iteration must work there too.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd["pos"][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd["neg"][word], (freq, neg_word_count), total_word_count)
        # A word's informativeness is the sum of both class chi-square scores.
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_scores():
    """Score each word by chi-square informativeness for the pos/neg classes.

    Builds an overall and a per-class frequency distribution from segmented
    reviews, then sums each word's chi-square association with both classes.

    Returns:
        dict mapping word -> informativeness score.
    """
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # FreqDist.inc() was removed in NLTK 3; counter-style increments work in
    # both the old and the new API.
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # items() keeps this working on Python 3 (iteritems() is Python 2 only).
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores():
    """Score unigrams plus top-5000 bigrams by chi-square informativeness.

    Returns:
        dict mapping feature (word or bigram tuple) -> informativeness score.
    """
    posdata = tp.seg_fil_senti_excel("~", 1, 1)
    negdata = tp.seg_fil_senti_excel("~", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # BUG FIX: the original overwrote the positive finder with the negative
    # one before extracting posBigrams, so BOTH bigram lists came from the
    # negative corpus. Each corpus now uses its own finder.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    # Candidate features: words plus their best bigram collocations.
    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    last_word = ConditionalFreqDist()
    # FreqDist.inc() was removed in NLTK 3; increments work in both APIs.
    for word in pos:
        word_fd[word] += 1
        last_word['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        last_word['neg'][word] += 1

    pos_word_count = last_word['pos'].N()
    neg_word_count = last_word['neg'].N()
    totalnumber = pos_word_count + neg_word_count

    word_scores = {}
    # items() instead of Python-2-only iteritems().
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(last_word['pos'][word], (freq, pos_word_count), totalnumber)
        neg_score = BigramAssocMeasures.chi_sq(last_word['neg'][word], (freq, neg_word_count), totalnumber)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_bigram_scores():
    """Score the top-8000 bigrams of each corpus by chi-square informativeness.

    Unlike the word+bigram variants, only bigram features are scored here.

    Returns:
        dict mapping bigram tuple -> informativeness score.
    """
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1")

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # BUG FIX: the original reassigned a single `bigram_finder` before using
    # it, so posBigrams were mined from the NEGATIVE corpus. Keep one finder
    # per corpus and extract its n-best bigrams immediately.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 8000)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 8000)

    pos = posBigrams
    neg = negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # FreqDist.inc() was removed in NLTK 3; increments work in both APIs.
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # items() instead of Python-2-only iteritems().
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_words_bigrams_scores():
    """Score unigrams plus top-5000 bigrams by chi-square informativeness.

    Returns:
        dict mapping feature (word or bigram tuple) -> informativeness score.
    """
    posdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/neg_review.xlsx", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # BUG FIX: the original overwrote the positive finder with the negative
    # one before extracting posBigrams, so both bigram lists came from the
    # negative corpus. Mine each corpus with its own finder.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    # Candidate features: words plus their best bigram collocations.
    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # items() instead of Python-2-only iteritems().
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores():
    """Score unigrams plus top-5000 bigrams by chi-square informativeness.

    Returns:
        dict mapping feature (word or bigram tuple) -> informativeness score.
    """
    posdata = tp.seg_fil_senti_excel(
        "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel(
        "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/neg_review.xlsx", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # BUG FIX: the original overwrote the positive finder with the negative
    # one before extracting posBigrams, so both bigram lists came from the
    # negative corpus.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    # Candidate features: words plus their best bigram collocations.
    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # BUG FIX: the original counted only posWords/negWords, leaving the
    # bigrams built above unused — they could never appear in word_scores,
    # defeating the purpose of this "word_bigram" variant. Count the
    # combined word+bigram feature streams instead.
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # items() instead of Python-2-only iteritems().
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_words_bigrams_scores():
    """Score unigrams plus top-5000 bigrams by chi-square informativeness.

    Returns:
        dict mapping feature (word or bigram tuple) -> informativeness score.
    """
    posdata = tp.seg_fil_senti_excel(
        "E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/MachineLearningFeatures/SenimentReviewSet/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel(
        "E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/MachineLearningFeatures/SenimentReviewSet/neg_review.xlsx", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # BUG FIX: the original overwrote the positive finder with the negative
    # one before extracting posBigrams, so both bigram lists came from the
    # negative corpus. Mine each corpus with its own finder.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    # Candidate features: words plus their best bigram collocations.
    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # items() instead of Python-2-only iteritems().
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores():
    """Score unigrams plus top-5000 bigrams by chi-square informativeness.

    Returns:
        dict mapping feature (word or bigram tuple) -> informativeness score.
    """
    posdata = tp.seg_fil_senti_excel(
        "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/pos_review.xlsx", 1, 1
    )
    negdata = tp.seg_fil_senti_excel(
        "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/neg_review.xlsx", 1, 1
    )

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # BUG FIX: the original overwrote the positive finder with the negative
    # one before extracting posBigrams, so both bigram lists came from the
    # negative corpus.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    # Candidate features: words plus their best bigram collocations.
    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # BUG FIX: the original counted only posWords/negWords, so the bigrams
    # assembled above were dead variables and never received scores. Count
    # the combined word+bigram streams as the function name implies.
    for word in pos:
        word_fd[word] += 1
        cond_word_fd["pos"][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd["neg"][word] += 1

    pos_word_count = cond_word_fd["pos"].N()
    neg_word_count = cond_word_fd["neg"].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # items() instead of Python-2-only iteritems().
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd["pos"][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd["neg"][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_words_bigrams_scores():
    """Score unigrams plus top-5000 bigrams by chi-square informativeness.

    Returns:
        dict mapping feature (word or bigram tuple) -> informativeness score.
    """
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1")

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # BUG FIX: the original overwrote the positive finder with the negative
    # one before extracting posBigrams, so both bigram lists came from the
    # negative corpus.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    # Candidate features: words plus their best bigram collocations.
    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # FreqDist.inc() was removed in NLTK 3; increments work in both APIs.
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # items() instead of Python-2-only iteritems().
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_scores():
    """Score each word by chi-square informativeness for the pos/neg classes.

    Builds an overall and a per-class frequency distribution from segmented
    reviews, then sums each word's chi-square association with both classes.

    Returns:
        dict mapping word -> informativeness score.
    """
    posdata = tp.seg_fil_senti_excel(
        'E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/MachineLearningFeatures/SenimentReviewSet/pos_review.xlsx', 1, 1)
    negdata = tp.seg_fil_senti_excel(
        'E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/MachineLearningFeatures/SenimentReviewSet/neg_review.xlsx', 1, 1)

    # Flatten the per-review token lists into flat word streams.
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # items() instead of Python-2-only iteritems(): the counting above
    # already uses the NLTK 3 API, so iteration must work there too.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_scores():
    """Score each word by chi-square informativeness for the pos/neg classes.

    Returns:
        dict mapping word -> informativeness score.
    """
    posdata = tp.seg_fil_senti_excel("~", 1, 1)
    negdata = tp.seg_fil_senti_excel("~", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    word_fd = FreqDist()
    last_word = ConditionalFreqDist()
    # FreqDist.inc() was removed in NLTK 3; increments work in both APIs.
    for word in posWords:
        word_fd[word] += 1
        last_word['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        last_word['neg'][word] += 1

    # BUG FIX: the original used pos_word_count / neg_word_count /
    # totalnumber without ever defining them, raising NameError at runtime.
    # Derive them from the per-class distributions, as the sibling
    # variants of this function do.
    pos_word_count = last_word['pos'].N()
    neg_word_count = last_word['neg'].N()
    totalnumber = pos_word_count + neg_word_count

    word_scores = {}
    # items() instead of Python-2-only iteritems().
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(last_word['pos'][word], (freq, pos_word_count), totalnumber)
        neg_score = BigramAssocMeasures.chi_sq(last_word['neg'][word], (freq, neg_word_count), totalnumber)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_words_bigrams_scores():
    """Score unigrams plus top-1000 bigrams by chi-square informativeness.

    Returns:
        dict mapping feature (word or bigram tuple) -> informativeness score
        (the sum of the positive and negative chi-square statistics).
    """
    posdata = tp.seg_fil_senti_excel("/home/hadoop/coding/Sentiment features/Machine learning features/seniment review set/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("/home/hadoop/coding/Sentiment features/Machine learning features/seniment review set/neg_review.xlsx", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # Turn each corpus into bigram collocations and keep the 1000 best by
    # chi-square. BUG FIX: the original reused one `bigram_finder` variable
    # and overwrote it before use, so posBigrams were mined from negWords.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 1000)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 1000)

    # Candidate features: words plus their bigram collocations.
    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # Unified on the NLTK 3 counting API: the original mixed `+= 1` with
    # FreqDist.inc(), which was removed in NLTK 3.
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    # Total feature counts per class.
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # items() instead of Python-2-only iteritems().
    for word, freq in word_fd.items():
        # Chi-square statistic of the feature for each class; their sum is
        # the feature's overall informativeness.
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
import textprocessing as tp import cPickle as pickle import itertools from random import shuffle import nltk from nltk.collocations import BigramCollocationFinder from nltk.metrics import BigramAssocMeasures from nltk.probability import FreqDist, ConditionalFreqDist import sklearn # 1. Load data review = tp.get_excel_data("review_set.xlsx", 1, 7, "data") sentiment_review = tp.seg_fil_senti_excel("review_set.xlsx", 1, 7) # 2. Feature extraction method # Used for transform review to features, so it can calculate sentiment probability by classifier def create_words_bigrams_scores(): posdata = tp.seg_fil_senti_excel("pos_review.xlsx", 1, 1) negdata = tp.seg_fil_senti_excel("neg_review.xlsx", 1, 1) posWords = list(itertools.chain(*posdata)) negWords = list(itertools.chain(*negdata)) bigram_finder = BigramCollocationFinder.from_words(posWords) bigram_finder = BigramCollocationFinder.from_words(negWords) posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000) negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
import nltk from nltk.collocations import BigramCollocationFinder from nltk.metrics import BigramAssocMeasures from nltk.probability import FreqDist, ConditionalFreqDist import sklearn from sklearn.svm import SVC, LinearSVC, NuSVC from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB from sklearn.linear_model import LogisticRegression from nltk.classify.scikitlearn import SklearnClassifier from sklearn.metrics import accuracy_score # 1. Load positive and negative review data pos_review = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", 1, 1) neg_review = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", 1, 1) pos = pos_review neg = neg_review """ # Cut positive review to make it the same number of nagtive review (optional) shuffle(pos_review) size = int(len(pos_review)/2 - 18) pos = pos_review[:size] neg = neg_review
import nltk from nltk.collocations import BigramCollocationFinder from nltk.metrics import BigramAssocMeasures from nltk.probability import FreqDist, ConditionalFreqDist import sklearn from sklearn.svm import SVC, LinearSVC, NuSVC from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB from sklearn.linear_model import LogisticRegression from nltk.classify.scikitlearn import SklearnClassifier from sklearn.metrics import accuracy_score # 1. Load positive and negative review data pos_review = tp.seg_fil_senti_excel("pos_review.xlsx", 1, 1) neg_review = tp.seg_fil_senti_excel("neg_review.xlsx", 1, 1) pos = pos_review neg = neg_review """ # Cut positive review to make it the same number of nagtive review (optional) shuffle(pos_review) size = int(len(pos_review)/2 - 18) pos = pos_review[:size] neg = neg_review
##my classifier path filefeature = 'E:/GraduationProject/pythoncode/project/Prediction/main/result/feature_word_ngram.txt' filename = 'E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/MachineLearningFeatures/senti_class_word_ngram.pkl' # 1. Load data """ review = tp.get_excel_data("D:/code/sentiment_test/review_set.xlsx", "1", "1", "data") sentiment_review = tp.seg_fil_senti_excel("D:/code/sentiment_test/review_set.xlsx", "1", "1") """ review = tp.get_excel_data( "E:/GraduationProject/pythoncode/project/Prediction/main/ReviewSet/Samsung.xlsx", 1, 12, "data") sentiment_review = tp.seg_fil_senti_excel( "E:/GraduationProject/pythoncode/project/Prediction/main/ReviewSet/Samsung.xlsx", 1, 12) # 2. Feature extraction method # Used for transform review to features, so it can calculate sentiment probability by classifier def create_words_bigrams_scores(): posdata = tp.seg_fil_senti_excel( "E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/MachineLearningFeatures/SenimentReviewSet/pos_review.xlsx", 1, 1) negdata = tp.seg_fil_senti_excel( "E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/MachineLearningFeatures/SenimentReviewSet/neg_review.xlsx", 1, 1) posWords = list(itertools.chain(*posdata)) negWords = list(itertools.chain(*negdata))
import textprocessing as tp import pickle import itertools from random import shuffle import nltk from nltk.collocations import BigramCollocationFinder from nltk.metrics import BigramAssocMeasures from nltk.probability import FreqDist, ConditionalFreqDist import sklearn # 1. Load data review = tp.get_excel_data("D:/code/sentiment_test/review_set.xlsx", "1", "1", "data") sentiment_review = tp.seg_fil_senti_excel( "D:/code/sentiment_test/review_set.xlsx", "1", "1") # 2. Feature extraction method # Used for transform review to features, so it can calculate sentiment probability by classifier def create_words_bigrams_scores(): posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1") negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1") posWords = list(itertools.chain(*posdata)) negWords = list(itertools.chain(*negdata)) bigram_finder = BigramCollocationFinder.from_words(posWords) bigram_finder = BigramCollocationFinder.from_words(negWords)
# NLTK collocation/frequency tooling plus scikit-learn classifiers bridged
# through nltk.classify.scikitlearn.
from random import shuffle

import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
import sklearn
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.metrics import accuracy_score

# 1. Load positive and negative review data
pos_review = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", 1, 1)
neg_review = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", 1, 1)

pos = pos_review
neg = neg_review

"""
# Cut positive review to make it the same number of nagtive review (optional)
shuffle(pos_review)
size = int(len(pos_review)/2 - 18)
pos = pos_review[:size]
neg = neg_review
"""
import textprocessing as tp import pickle import itertools from random import shuffle import nltk from nltk.collocations import BigramCollocationFinder from nltk.metrics import BigramAssocMeasures from nltk.probability import FreqDist, ConditionalFreqDist import sklearn # 1. Load data review = tp.get_excel_data("D:/code/sentiment_test/review_set.xlsx", "1", "1") sentiment_review = tp.seg_fil_senti_excel("D:/code/sentiment_test/review_set.xlsx", "1", "1") # 2. Feature extraction method # Used for transform review to features, so it can calculate sentiment probability by classifier def create_words_bigrams_scores(): posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1") negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1") posWords = list(itertools.chain(*posdata)) negWords = list(itertools.chain(*negdata)) bigram_finder = BigramCollocationFinder.from_words(posWords) bigram_finder = BigramCollocationFinder.from_words(negWords) posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000) negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
import pickle from random import shuffle from nltk.classify.scikitlearn import SklearnClassifier import os import time if __name__ == "__main__": #三部电影《长城》《乘风破浪》《西游伏魔篇》,分别跑一遍,然后将三部电影合起来再跑一遍 #使用MutiNB,LogisticRegression,SVM三种分类器 print '开始训练分类器' # 1. Load positive and negative review data path = os.getcwd() print '当前路径' + path start_time = time.time() pos_review = tp.seg_fil_senti_excel( path + "\\seniment review set\\THREEMIXPOS.xls", 1, 1) neg_review = tp.seg_fil_senti_excel( path + "\\seniment review set\\THREEMIXNEG.xls", 1, 1) test_review = test_review = tp.seg_fil_senti_excel( path + "\\seniment review set\\THREEMIXTEST.xls", 1, 1) pos = pos_review neg = neg_review # 2. Feature extraction function # Choose word_scores extaction methods #word_scores = create_word_scores() #word_scores = create_bigram_scores() word_scores = ssc.create_word_bigram_scores(pos, neg) # 3. Transform review to features by setting labels to words in review
import textprocessing as tp import pickle import itertools from random import shuffle import nltk from nltk.collocations import BigramCollocationFinder from nltk.metrics import BigramAssocMeasures from nltk.probability import FreqDist, ConditionalFreqDist import sklearn # 1. Load data review = tp.get_excel_data("../../../Review set/review_set.xlsx", 1, 7, "data") sentiment_review = tp.seg_fil_senti_excel("../../../Review set/review_set.xlsx", 1, 7) # 2. Feature extraction method # Used for transform review to features, so it can calculate sentiment probability by classifier def create_words_bigrams_scores(): posdata = tp.seg_fil_senti_excel("pos_review.xlsx", 1, 1) negdata = tp.seg_fil_senti_excel("neg_review.xlsx", 1, 1) posWords = list(itertools.chain(*posdata)) negWords = list(itertools.chain(*negdata)) bigram_finder = BigramCollocationFinder.from_words(posWords) bigram_finder = BigramCollocationFinder.from_words(negWords) posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000) negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
import nltk from nltk.collocations import BigramCollocationFinder from nltk.metrics import BigramAssocMeasures from nltk.probability import FreqDist, ConditionalFreqDist import sklearn from sklearn.svm import SVC, LinearSVC, NuSVC from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB from sklearn.linear_model import LogisticRegression from nltk.classify.scikitlearn import SklearnClassifier from sklearn.metrics import accuracy_score # 1. Load positive and negative review data pos_review = tp.seg_fil_senti_excel( "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/pos_review.xlsx", 1, 1) neg_review = tp.seg_fil_senti_excel( "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/neg_review.xlsx", 1, 1) pos = pos_review neg = neg_review """ # Cut positive review to make it the same number of nagtive review (optional) shuffle(pos_review) size = int(len(pos_review)/2 - 18) pos = pos_review[:size] neg = neg_review
import textprocessing as tp import pickle import itertools from random import shuffle import nltk from nltk.collocations import BigramCollocationFinder from nltk.metrics import BigramAssocMeasures from nltk.probability import FreqDist, ConditionalFreqDist import sklearn # 1. Load data review = tp.get_excel_data("/home/hadoop/coding/Review set/HTC Z710t_review_2013.6.5.xlsx",1,12, "data") sentiment_review = tp.seg_fil_senti_excel("/home/hadoop/coding/Review set/Meizu MX_review_2013.6.7.xlsx", 1, 12) # 2. Feature extraction method # Used for transform review to features, so it can calculate sentiment probability by classifier def create_words_bigrams_scores(): posdata = tp.seg_fil_senti_excel("/home/hadoop/coding/Sentiment features/Machine learning features/seniment review set/pos_review.xlsx",1,1) negdata = tp.seg_fil_senti_excel("/home/hadoop/coding/Sentiment features/Machine learning features/seniment review set/neg_review.xlsx", 1, 1) posWords = list(itertools.chain(*posdata)) negWords = list(itertools.chain(*negdata)) bigram_finder = BigramCollocationFinder.from_words(posWords)#把文本变成双词搭配的形式 bigram_finder = BigramCollocationFinder.from_words(negWords) posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 1000)#使用了卡方统计的方法,选择排名前1000的双词
import nltk from nltk.collocations import BigramCollocationFinder from nltk.metrics import BigramAssocMeasures from nltk.probability import FreqDist, ConditionalFreqDist import sklearn from sklearn.svm import SVC, LinearSVC, NuSVC from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB from sklearn.linear_model import LogisticRegression from nltk.classify.scikitlearn import SklearnClassifier from sklearn.metrics import accuracy_score # 1. Load positive and negative review data pos_review = tp.seg_fil_senti_excel( "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/pos_review.xlsx", 1, 1 ) neg_review = tp.seg_fil_senti_excel( "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/neg_review.xlsx", 1, 1 ) pos = pos_review neg = neg_review """ # Cut positive review to make it the same number of nagtive review (optional) shuffle(pos_review) size = int(len(pos_review)/2 - 18)
# NLTK collocation/frequency tooling plus scikit-learn classifiers.
import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
import sklearn
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.metrics import accuracy_score

# Load segmented positive and negative review data.
pos_review = tp.seg_fil_senti_excel("~", 1, 1)
neg_review = tp.seg_fil_senti_excel("~", 1, 1)

pos = pos_review
neg = neg_review


def bag_of_words(words):
    """Map every token in *words* to True (NLTK-style feature dict)."""
    features = {}
    for token in words:
        features[token] = True
    return features


def bigrams(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Return a bag-of-words feature dict of the n best-scoring bigrams."""
    finder = BigramCollocationFinder.from_words(words)
    best = finder.nbest(score_fn, n)
    return bag_of_words(best)
from nltk.classify.scikitlearn import SklearnClassifier #from nltk.classify import SklearnClassifier from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score from sklearn.neighbors import KNeighborsClassifier from sklearn import tree from sklearn.ensemble import RandomForestClassifier, BaggingClassifier #E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/MachineLearningFeatures/SenimentReviewSet/ filename = 'E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/MachineLearningFeatures/senti_class_bgram.pkl' # 1. Load positive and negative review data """ pos_review = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", 1, 1) neg_review = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", 1, 1) """ pos_review = tp.seg_fil_senti_excel( 'E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/MachineLearningFeatures/SenimentReviewSet/pos_review.xlsx', 1, 1) neg_review = tp.seg_fil_senti_excel( 'E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/MachineLearningFeatures/SenimentReviewSet/neg_review.xlsx', 1, 1) pos = pos_review neg = neg_review """ # Cut positive review to make it the same number of nagtive review (optional) shuffle(pos_review) size = int(len(pos_review)/2 - 18) pos = pos_review[:size] neg = neg_review