def create_word_scores(posWords,negWords):
    """Score each word by its chi-square association with the two classes.

    Args:
        posWords: iterable of token sequences taken from positive texts.
        negWords: iterable of token sequences taken from negative texts.
            Both arguments are flattened one level with ``itertools.chain``,
            so a bare string element is split into its characters.

    Returns:
        dict mapping every seen word to the sum of its positive-class and
        negative-class chi-square statistics.
    """
    posWords = list(itertools.chain(*posWords))  # flatten nested lists into one list
    negWords = list(itertools.chain(*negWords))

    word_fd = {}      # overall frequency of each word
    pos_word_fd = {}  # frequency within the positive texts
    neg_word_fd = {}  # frequency within the negative texts
    for word in posWords:
        word_fd[word] = word_fd.get(word, 0) + 1
        pos_word_fd[word] = pos_word_fd.get(word, 0) + 1
    for word in negWords:
        word_fd[word] = word_fd.get(word, 0) + 1
        neg_word_fd[word] = neg_word_fd.get(word, 0) + 1

    # Class sizes are token counts. The previous len(dict) counted distinct
    # words, and gave the same number for both classes.
    pos_word_count = len(posWords)
    neg_word_count = len(negWords)
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        # chi_sq(count in class, (overall count, class size), corpus size);
        # other measures such as mutual information could be used instead.
        pos_score = BigramAssocMeasures.chi_sq(
            pos_word_fd.get(word, 0), (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            neg_word_fd.get(word, 0), (freq, neg_word_count), total_word_count)
        # A word's informativeness is the sum of both class statistics.
        word_scores[word] = pos_score + neg_score
    return word_scores
def high_words(posids, negids, cutoff, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """Return the (up to) 10000 highest-scoring tokens across both classes.

    Args:
        posids: iterable of positive reviews; each review is indexed with
            ``review['text']``.
        negids: iterable of negative reviews, same shape.
        cutoff: 1-based position of the single review that is skipped in
            each class (every other review is counted).
        score_fn: association measure called as
            ``score_fn(n_ii, (n_ix, n_xi), n_xx)``; defaults to chi-square.
        min_score: accepted for interface compatibility; not used.

    Returns:
        set of the best-scoring tokens.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label, reviews in (('pos', posids), ('neg', negids)):
        for position, review in enumerate(reviews, 1):
            if position == cutoff:
                continue
            for word in review['text'].split(' '):
                # Tokenise once and reuse; list() guards against a
                # one-shot iterator being consumed by the first update.
                tokens = list(token_helpers.tokenize_simple(word))
                word_fd.update(tokens)
                label_word_fd[label].update(tokens)

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = score_fn(label_word_fd['pos'][word],
                             (freq, pos_word_count), total_word_count)
        neg_score = score_fn(label_word_fd['neg'][word],
                             (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=itemgetter(1), reverse=True)[:10000]
    return set(w for w, s in best)
def create_words_bigrams_scores():
    """Score words and top bigrams by chi-square association with each class.

    Reads the positive and negative review spreadsheets through
    ``tp.seg_fil_senti_excel``, adds the 5000 best chi-square bigrams of
    each class to that class's tokens, and scores every resulting feature.

    Returns:
        dict mapping each word or bigram tuple to its summed chi-square score.
    """
    posdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/neg_review.xlsx", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # One finder per class: reusing a single variable made the "positive"
    # bigrams come from the negative corpus.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # items() works on Python 2 and 3; iteritems() is Python 2 only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_scores():
    """Score every word of the review spreadsheets by chi-square association.

    Loads positive and negative reviews through ``tp.seg_fil_senti_excel``
    and flattens them into token lists.

    Returns:
        dict mapping each word to the sum of its positive-class and
        negative-class chi-square statistics.
    """
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # FreqDist.inc() was removed in NLTK 3; item assignment is the replacement.
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def GetHighInformationWordsChi(num_bestwords):
    """Return the ``num_bestwords`` most informative movie-review words.

    Words from ``movie_reviews`` are lower-cased, counted overall and per
    category ('pos'/'neg'), and ranked by the sum of their per-category
    chi-square statistics.

    Args:
        num_bestwords: how many top-ranked words to keep.

    Returns:
        set of the selected words.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label in ('pos', 'neg'):
        for word in movie_reviews.words(categories=[label]):
            token = word.lower()
            word_fd[token] += 1
            label_word_fd[label][token] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # Index-based key replaces the Python-2-only tuple-parameter lambda.
    best = sorted(word_scores.items(), key=lambda item: item[1],
                  reverse=True)[:num_bestwords]
    return set(w for w, s in best)
def create_word_scores(posWords, negWords):
    """Score words by chi-square association and dump the scores to a file.

    Args:
        posWords: iterable of tokens from positive texts.
        negWords: iterable of tokens from negative texts.
            Tokens are converted with ``str()`` before counting.

    Returns:
        dict mapping each word to its summed chi-square score.

    Side effects:
        Writes ``"<word> : <score>"`` lines to ``cn_sample_data/scores.txt``,
        highest score first.
    """
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        key = str(word)
        word_fd[key] += 1
        cond_word_fd['pos'][key] += 1
    for word in negWords:
        key = str(word)
        word_fd[key] += 1
        cond_word_fd['neg'][key] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # The original sorted the scores but threw the result away; the sorted
    # order is now what gets written out.
    ranked = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)
    # open()/with replaces the Python-2-only file() builtin and guarantees
    # the handle is closed even if a write fails.
    with open("cn_sample_data/scores.txt", "w") as file_scores:
        for word, score in ranked:
            file_scores.write(str(word) + " : " + str(score) + "\n")
    return word_scores
def create_word_scores(posdata,negdata):
    """Score each word by its chi-square association with the two classes.

    Args:
        posdata: iterable of token sequences from positive texts.
        negdata: iterable of token sequences from negative texts.

    Returns:
        dict mapping each word to the sum of its positive-class and
        negative-class chi-square statistics.
    """
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # items() works on Python 2 and 3; iteritems() is Python 2 only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_scores(posWords, negWords):
    """Compute a chi-square informativeness score for every word.

    Args:
        posWords: iterable of tokens from positive texts.
        negWords: iterable of tokens from negative texts.

    Returns:
        dict mapping each word to its positive-class chi-square statistic
        plus its negative-class chi-square statistic.
    """
    overall = FreqDist()  # frequency of every word across both classes
    print(type(overall))
    per_label = ConditionalFreqDist()  # frequency of every word within each class

    for label, tokens in (('pos', posWords), ('neg', negWords)):
        for token in tokens:
            overall[token] += 1
            per_label[label][token] += 1

    n_pos = per_label['pos'].N()  # number of positive tokens
    n_neg = per_label['neg'].N()  # number of negative tokens
    n_all = n_pos + n_neg

    def informativeness(token, count):
        # Chi-square is used here; mutual information or another
        # association measure could be substituted.
        from_pos = BigramAssocMeasures.chi_sq(
            per_label['pos'][token], (count, n_pos), n_all)
        from_neg = BigramAssocMeasures.chi_sq(
            per_label['neg'][token], (count, n_neg), n_all)
        return from_pos + from_neg

    scores = {}
    for token, count in overall.items():
        scores[token] = informativeness(token, count)
    return scores
def create_word_scores(sentences):
    """Score each lower-cased word of ``sentences`` with chi-square statistics.

    Args:
        sentences: iterable of token sequences; flattened one level.

    Returns:
        dict mapping each lower-cased word to the sum of its 'pos' and
        'neg' chi-square statistics.
    """
    words = list(itertools.chain(*sentences))

    # Build the overall frequency distribution and the per-label ones.
    # NOTE(review): every word is counted under BOTH 'pos' and 'neg', so the
    # two label distributions are identical. This looks unintended, but no
    # label information is available in this function; behaviour kept as is.
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in words:
        token = word.lower()
        # FreqDist.inc() was removed in NLTK 3; use item assignment.
        word_fd[token] += 1
        cond_word_fd['pos'][token] += 1
        cond_word_fd['neg'][token] += 1

    # Token totals per label and overall.
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    # Chi-square score per word.
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def computeFreqDistribution():
    """Return the words whose three-class chi-square score exceeds 2.

    Reads the module-level ``word_fd`` and ``label_word_fd`` distributions
    (conditions 'positive', 'negative' and 'neutral'). When the module-level
    ``DEBUG`` flag is truthy, intermediate values are printed.

    Returns:
        list of words with a summed score strictly greater than the threshold.
    """
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    if DEBUG:
        print(word_fd)

    pos_word_count = label_word_fd['positive'].N()
    neg_word_count = label_word_fd['negative'].N()
    neu_word_count = label_word_fd['neutral'].N()
    total_word_count = pos_word_count + neg_word_count + neu_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['positive'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['negative'][word], (freq, neg_word_count), total_word_count)
        neu_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neutral'][word], (freq, neu_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score + neu_score

    if DEBUG:
        print(json.dumps(word_scores, indent = 4))

    threshold = 2
    temp = [word for word, score in word_scores.items() if score > threshold]

    if DEBUG:
        print(temp)
    return temp
def best_word_feats(self, words):
    """Build a feature dict containing only the most informative words.

    The 10000 highest-scoring lower-cased ``movie_reviews`` words (by summed
    per-category chi-square) are computed on every call, then ``words`` is
    filtered against that set.

    Args:
        words: iterable of tokens to turn into features.

    Returns:
        dict ``{word: True}`` for each word of ``words`` in the best set.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    # FreqDist.inc() was removed in NLTK 3; use item assignment.
    for label in ('pos', 'neg'):
        for word in movie_reviews.words(categories=[label]):
            token = word.lower()
            word_fd[token] += 1
            label_word_fd[label][token] += 1

    # chi_sq arguments:
    #   n_ii = label_word_fd[label][word]
    #   n_ix = word_fd[word]
    #   n_xi = label_word_fd[label].N()
    #   n_xx = label_word_fd.N()
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # Index-based key replaces the Python-2-only tuple-parameter lambda.
    best = sorted(word_scores.items(), key=lambda item: item[1],
                  reverse=True)[:10000]
    bestwords = set(w for w, s in best)
    return dict([(word, True) for word in words if word in bestwords])
def build_top_words(self):
    """Store the 1000 most class-informative tokens in ``self.top_words``.

    ``self.documents`` is read as ``(review, label)`` pairs; tokens of the
    'pos' and 'neg' reviews are ranked by the sum of their per-class
    chi-square statistics.
    """
    positive_docs = [(doc, label) for (doc, label) in self.documents if label == 'pos']
    negative_docs = [(doc, label) for (doc, label) in self.documents if label == 'neg']

    pos_tokens = []
    for doc, _ in positive_docs:
        pos_tokens.extend(doc)
    neg_tokens = []
    for doc, _ in negative_docs:
        neg_tokens.extend(doc)

    overall = FreqDist(pos_tokens + neg_tokens)
    pos_dist = ConditionalFreqDist([('pos', tok) for tok in pos_tokens])
    neg_dist = ConditionalFreqDist([('neg', tok) for tok in neg_tokens])

    n_pos = len(pos_tokens)
    n_neg = len(neg_tokens)
    n_all = n_pos + n_neg

    scores = {}
    for (tok, count) in overall.items():
        from_pos = BigramAssocMeasures.chi_sq(pos_dist['pos'][tok], (count, n_pos), n_all)
        from_neg = BigramAssocMeasures.chi_sq(neg_dist['neg'][tok], (count, n_neg), n_all)
        scores[tok] = from_pos + from_neg

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    self.top_words = set([tok for tok, _ in ranked[:1000]])
def jieba_feature(number):
    """Select the ``number`` most informative words as boolean features.

    Tokens are read with ``read_file`` from 'total_pos.txt' and
    'total_neg.txt', then ranked by summed per-class chi-square statistics.

    Args:
        number: feature dimensionality, i.e. how many top words to keep;
            meant to be tuned.

    Returns:
        dict ``{word: True}`` for each selected word.
    """
    # Flatten the collection of collections into one flat token list per class.
    pos_tokens = [tok for group in read_file('total_pos.txt') for tok in group]
    neg_tokens = [tok for group in read_file('total_neg.txt') for tok in group]

    overall = FreqDist()  # frequency of every word
    per_label = ConditionalFreqDist()  # frequency within positive / negative texts
    for tok in pos_tokens:
        overall[tok] += 1
        per_label['pos'][tok] += 1
    for tok in neg_tokens:
        overall[tok] += 1
        per_label['neg'][tok] += 1

    n_pos = per_label['pos'].N()  # number of positive tokens
    n_neg = per_label['neg'].N()  # number of negative tokens
    n_all = n_pos + n_neg

    scores = {}  # word -> informativeness
    for tok, count in overall.items():
        from_pos = BigramAssocMeasures.chi_sq(per_label['pos'][tok], (count, n_pos), n_all)
        from_neg = BigramAssocMeasures.chi_sq(per_label['neg'][tok], (count, n_neg), n_all)
        # Informativeness is the positive plus the negative chi-square statistic.
        scores[tok] = from_pos + from_neg

    # Rank words by informativeness, highest first, and keep the top `number`.
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    chosen = set([tok for tok, _ in ranked[:number]])
    return {tok: True for tok in chosen}
def create_word_bigram_scores():
    """Score words and top bigrams by chi-square association with each class.

    Loads both review sets through ``tp.seg_fil_senti_excel``, adds each
    class's 5000 best chi-square bigrams to its tokens, and scores every
    resulting feature.

    Returns:
        dict mapping each word or bigram tuple to its summed chi-square score.
    """
    posdata = tp.seg_fil_senti_excel("~", 1, 1)
    negdata = tp.seg_fil_senti_excel("~", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # One finder per class: reusing a single variable made the "positive"
    # bigrams come from the negative corpus.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    last_word = ConditionalFreqDist()
    # FreqDist.inc() was removed in NLTK 3; use item assignment.
    for word in pos:
        word_fd[word] += 1
        last_word['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        last_word['neg'][word] += 1

    pos_word_count = last_word['pos'].N()
    neg_word_count = last_word['neg'].N()
    totalnumber = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            last_word['pos'][word], (freq, pos_word_count), totalnumber)
        neg_score = BigramAssocMeasures.chi_sq(
            last_word['neg'][word], (freq, neg_word_count), totalnumber)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_scores():
    """Score every word of the pickled review sets by chi-square association.

    Returns:
        dict mapping each word to its informativeness: the sum of its
        positive-class and negative-class chi-square statistics.
    """
    # Pickles are binary data: open in 'rb' and close the handles promptly.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl', 'rb') as pos_file:
        posWords = pickle.load(pos_file)
    with open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl', 'rb') as neg_file:
        negWords = pickle.load(neg_file)

    posWords = list(itertools.chain(*posWords))  # flatten nested lists into one list
    negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()  # frequency of every word
    cond_word_fd = ConditionalFreqDist()  # frequency within positive / negative texts
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # number of positive tokens
    neg_word_count = cond_word_fd['neg'].N()  # number of negative tokens
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # items() works on Python 2 and 3; iteritems() is Python 2 only.
    for word, freq in word_fd.items():
        # Chi-square here; mutual information etc. could be used instead.
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_scores():
    """Score every word of the review spreadsheets by chi-square association.

    Loads the positive and negative review sets, with the sentiment
    stop-word file, through ``tp.seg_fil_senti_excel``.

    Returns:
        dict mapping each word to the sum of its positive-class and
        negative-class chi-square statistics.
    """
    # Backslashes are escaped explicitly; the resulting path is the same
    # string the unescaped literal produced, minus the invalid-escape warning.
    posNegDir = 'D:/ReviewHelpfulnessPrediction\\FeatureExtractionModule\\SentimentFeature\\MachineLearningFeature\\SenimentReviewSet'
    stopword_path = 'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt'
    posdata = tp.seg_fil_senti_excel(posNegDir + '/pos_review.xlsx', 1, 1, stopword_path)
    negdata = tp.seg_fil_senti_excel(posNegDir + '/neg_review.xlsx', 1, 1, stopword_path)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # items() works on Python 2 and 3; iteritems() is Python 2 only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def store_feature_scores(self):
    """Compute chi-square word scores and persist them as 'word_scores'.

    Loads the 'word_fd' and 'label_fd' distributions with
    ``self.pickle_load``, scores each word as the sum of its 'positive' and
    'negative' chi-square statistics, and saves the mapping with
    ``self.pickle_store`` (Redis, according to the original docstring).

    If loading raises ``TypeError`` a message is printed and nothing is
    stored.
    """
    try:
        word_fd = self.pickle_load('word_fd')
        label_word_freqdist = self.pickle_load('label_fd')
    except TypeError:
        print('Requires frequency distributions to be built.')
        # Without the distributions there is nothing to score; continuing
        # would only fail on the unbound names below.
        return

    pos_word_count = label_word_freqdist['positive'].N()
    neg_word_count = label_word_freqdist['negative'].N()
    total_word_count = pos_word_count + neg_word_count

    # The scores do not depend on the condition being iterated, so the
    # former outer loop over conditions() only repeated identical work.
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_freqdist['positive'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_freqdist['negative'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    self.pickle_store('word_scores', word_scores)
def store_feature_scores(self):
    """Compute chi-square word scores and persist them as 'word_scores'.

    Loads the 'word_fd' and 'label_fd' distributions with
    ``self.pickle_load``, scores each word as the sum of its 'positive' and
    'negative' chi-square statistics, and saves the mapping with
    ``self.pickle_store`` (Redis, according to the original docstring).

    If loading raises ``TypeError`` a message is printed and nothing is
    stored.
    """
    try:
        word_fd = self.pickle_load('word_fd')
        label_word_freqdist = self.pickle_load('label_fd')
    except TypeError:
        print('Requires frequency distributions to be built.')
        # Without the distributions there is nothing to score; continuing
        # would only fail on the unbound names below.
        return

    pos_word_count = label_word_freqdist['positive'].N()
    neg_word_count = label_word_freqdist['negative'].N()
    total_word_count = pos_word_count + neg_word_count

    # The scores do not depend on the condition being iterated, so the
    # former outer loop over conditions() only repeated identical work.
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_freqdist['positive'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_freqdist['negative'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    self.pickle_store('word_scores', word_scores)
def create_word_scores(posWords, negWords, posTag, negTag):
    """Score each word by chi-square association with two named classes.

    Args:
        posWords: iterable of token sequences from positive texts.
        negWords: iterable of token sequences from negative texts.
        posTag: condition key under which positive counts are stored.
        negTag: condition key under which negative counts are stored.

    Returns:
        dict mapping each word to its informativeness: the sum of its
        chi-square statistics for the two classes.
    """
    from nltk.probability import FreqDist, ConditionalFreqDist
    import itertools

    posWords = list(itertools.chain(*posWords))  # flatten nested lists into one list
    negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()  # frequency of every word
    cond_word_fd = ConditionalFreqDist()  # frequency within each class
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd[posTag][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd[negTag][word] += 1

    pos_word_count = cond_word_fd[posTag].N()  # number of positive tokens
    neg_word_count = cond_word_fd[negTag].N()  # number of negative tokens
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # items() works on Python 2 and 3; iteritems() is Python 2 only.
    for word, freq in word_fd.items():
        # Chi-square here; mutual information etc. could be used instead.
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd[posTag][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd[negTag][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def get_bestwords(contents, labels, limit = 10000, n = None, cache = True):
    """Return the ``limit`` most class-informative words, optionally cached.

    Args:
        contents: texts, indexable with a boolean mask built from ``labels``
            (presumably a NumPy array; confirm against callers).
        labels: per-text labels; ``1`` marks a positive text, anything else
            a negative one.
        limit: number of top-scoring words to keep.
        n: cache discriminator; caching only happens when it is truthy.
        cache: enable reading/writing ``cache/<limit>_<n>.pkl``.

    Returns:
        set of the selected words.
    """
    use_cache = bool(cache and n)
    cache_path = 'cache/%s_%s.pkl' % (limit, n)

    if use_cache and os.path.exists(cache_path):
        # Binary mode plus a context manager: pickles are bytes and the
        # handle must not leak.
        with open(cache_path, 'rb') as cache_file:
            bestwords = pickle.load(cache_file)
        print('Loaded from cache')
        print('bestwords count = %d' % (len(bestwords)))
        return bestwords

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    pos_contents = contents[labels == 1]
    # Complement of the positive mask. The previous `labels != 0` also
    # selected the positive texts.
    neg_contents = contents[labels != 1]

    pos_words = set()
    neg_words = set()
    for pos_content in pos_contents:
        pos_words.update(word_tokenize(pos_content))
    for neg_content in neg_contents:
        neg_words.update(word_tokenize(neg_content))

    # FreqDist.inc() was removed in NLTK 3; use item assignment.
    for word in pos_words:
        token = word.lower()
        word_fd[token] += 1
        label_word_fd['pos'][token] += 1
    for word in neg_words:
        token = word.lower()
        word_fd[token] += 1
        label_word_fd['neg'][token] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=lambda item: item[1],
                  reverse=True)[:limit]
    bestwords = set(w for w, s in best)

    print('all words count = %d' % (len(word_scores)))
    print('bestwords count = %d' % (len(bestwords)))

    if use_cache:
        with open(cache_path, 'wb') as cache_file:
            pickle.dump(bestwords, cache_file)
        print('Dumped to cache')

    return bestwords
def getFeature(number):
    """Select the ``number`` most informative words as boolean features.

    Tokens are read with ``readfile`` from 'good.txt' (positive) and
    'bad.txt' (negative), converted with ``str()``, and ranked by summed
    per-class chi-square statistics.

    Args:
        number: how many top-ranked words to keep.

    Returns:
        dict ``{word: True}`` for each selected word.
    """
    pos_tokens = [str(tok) for group in readfile('good.txt') for tok in group]
    neg_tokens = [str(tok) for group in readfile('bad.txt') for tok in group]

    overall = FreqDist()  # frequency of every word
    per_label = ConditionalFreqDist()  # frequency within positive / negative texts
    for tok in pos_tokens:
        key = str(tok)
        overall[key] += 1
        per_label['pos'][key] += 1
    for tok in neg_tokens:
        key = str(tok)
        overall[key] += 1
        per_label['neg'][key] += 1

    n_pos = per_label['pos'].N()  # number of positive tokens
    n_neg = per_label['neg'].N()  # number of negative tokens
    n_all = n_pos + n_neg

    # A word's informativeness is its positive plus its negative chi-square.
    scores = {}
    for tok, count in overall.items():
        from_pos = BigramAssocMeasures.chi_sq(per_label['pos'][tok], (count, n_pos), n_all)
        from_neg = BigramAssocMeasures.chi_sq(per_label['neg'][tok], (count, n_neg), n_all)
        scores[tok] = from_pos + from_neg

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    chosen = set([tok for tok, _ in ranked[:number]])
    return {str(tok): True for tok in chosen}
def getWordScores():
    """Score every lower-cased token of the polarity files by chi-square.

    Each line of ``RT_POLARITY_POS_FILE`` and ``RT_POLARITY_NEG_FILE`` is
    split into word and punctuation tokens with a regular expression.

    Returns:
        dict mapping each lower-cased token to the sum of its positive-class
        and negative-class chi-square statistics.
    """
    token_pattern = r"[\w']+|[.,!?;]"

    posWords = []
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for line in posSentences:
            posWords.extend(re.findall(token_pattern, line.rstrip()))
    negWords = []
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for line in negSentences:
            negWords.extend(re.findall(token_pattern, line.rstrip()))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        token = word.lower()
        word_fd[token] += 1
        cond_word_fd['pos'][token] += 1
    for word in negWords:
        token = word.lower()
        word_fd[token] += 1
        cond_word_fd['neg'][token] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # items() works on Python 2 and 3; iteritems() is Python 2 only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores():
    """Score words and top bigrams by chi-square association with each class.

    Tokens come from ``get_pos_words()`` and ``get_neg_words()``; each
    class's 5000 best chi-square bigrams are added to its tokens before
    scoring.

    Returns:
        dict mapping each word or bigram tuple to its summed chi-square score.
    """
    # Fetch each token list once instead of once per use.
    pos_words = get_pos_words()
    neg_words = get_neg_words()

    # One finder per class: reusing a single variable made the "positive"
    # bigrams come from the negative corpus.
    pos_finder = BigramCollocationFinder.from_words(pos_words)
    neg_finder = BigramCollocationFinder.from_words(neg_words)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = pos_words + posBigrams  # words plus two-word collocations
    neg = neg_words + negBigrams

    word_fd = nltk.FreqDist()  # frequency of every feature
    cond_word_fd = nltk.ConditionalFreqDist()  # frequency within each class
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def getBestWords(posWords, negWords):
    """Return the set of all lower-cased words seen in either class.

    Every word is scored by its summed per-class chi-square statistic and
    the words are sorted by that score, but no cut-off is applied, so the
    returned set contains every scored word.

    Args:
        posWords: iterable of tokens from positive texts.
        negWords: iterable of tokens from negative texts.

    Returns:
        set of words.
    """
    overall = FreqDist()
    per_label = ConditionalFreqDist()

    for tok in posWords:
        overall[tok.lower()] += 1
        per_label["pos"][tok.lower()] += 1
    for tok in negWords:
        overall[tok.lower()] += 1
        per_label["neg"][tok.lower()] += 1

    n_pos = per_label["pos"].N()
    n_neg = per_label["neg"].N()
    n_all = n_pos + n_neg

    scores = {}
    for tok, count in overall.items():
        from_pos = BigramAssocMeasures.chi_sq(per_label["pos"][tok], (count, n_pos), n_all)
        from_neg = BigramAssocMeasures.chi_sq(per_label["neg"][tok], (count, n_neg), n_all)
        scores[tok] = from_pos + from_neg

    # Highest score first; a former [:10000] cut-off is no longer applied.
    by_score = operator.itemgetter(1)
    ranked = sorted(scores.items(), key=by_score, reverse=True)
    return set([tok for tok, _ in ranked])
def setup():
    """Populate the global ``bestwords`` set and train on bigram features.

    The 10000 highest-scoring ``movie_reviews`` tokens (summed per-category
    chi-square) are stored in the module-level ``bestwords``.

    Returns:
        whatever ``train(best_bigram_word_features)`` returns.
    """
    global bestwords
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label in ('pos', 'neg'):
        for word in movie_reviews.words(categories=[label]):
            # The same normalised token must key both distributions.
            # Previously word_fd used the stripped form and label_word_fd
            # the unstripped one, so per-label lookups could miss.
            token = word.strip('\'"?,.').lower()
            # FreqDist.inc() was removed in NLTK 3; use item assignment.
            word_fd[token] += 1
            label_word_fd[label][token] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # Index-based key replaces the Python-2-only tuple-parameter lambda.
    best = sorted(word_scores.items(), key=lambda item: item[1],
                  reverse=True)[:10000]
    bestwords = set(w for w, s in best)
    return train(best_bigram_word_features)
def create_word_scores():
    """Score every word of the module-level ``datap``/``datan`` token lists.

    Returns:
        dict mapping each word to its informativeness: the sum of its
        positive-class and negative-class chi-square statistics.
    """
    posWords = list(itertools.chain(*datap))  # flatten nested lists into one list
    negWords = list(itertools.chain(*datan))

    word_fd = nltk.FreqDist()  # frequency of every word
    cond_word_fd = ConditionalFreqDist()  # frequency within positive / negative texts
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # number of positive tokens
    neg_word_count = cond_word_fd['neg'].N()  # number of negative tokens
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # items() works on Python 2 and 3; iteritems() is Python 2 only.
    for word, freq in word_fd.items():
        # Chi-square here; mutual information etc. could be used instead.
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def setup():
    """Populate the global ``bestwords`` set and train on bigram features.

    The 10000 highest-scoring ``movie_reviews`` tokens (summed per-category
    chi-square) are stored in the module-level ``bestwords``.

    Returns:
        whatever ``train(best_bigram_word_features)`` returns.
    """
    global bestwords
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label in ('pos', 'neg'):
        for word in movie_reviews.words(categories=[label]):
            # The same normalised token must key both distributions.
            # Previously word_fd used the stripped form and label_word_fd
            # the unstripped one, so per-label lookups could miss.
            token = word.strip('\'"?,.').lower()
            # FreqDist.inc() was removed in NLTK 3; use item assignment.
            word_fd[token] += 1
            label_word_fd[label][token] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # Index-based key replaces the Python-2-only tuple-parameter lambda.
    best = sorted(word_scores.items(), key=lambda item: item[1],
                  reverse=True)[:10000]
    bestwords = set(w for w, s in best)
    return train(best_bigram_word_features)
def __setTermsCHISQUARE__(self, size):
    """Store the ``size`` highest chi-square terms in ``self.terms``.

    Words of the 'pos' and 'neg' categories of ``self.reader`` are
    lower-cased, counted, and ranked by the sum of their per-category
    chi-square statistics.

    Args:
        size: number of top-ranked terms to keep.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    # FreqDist.inc() was removed in NLTK 3; use item assignment.
    for label in ('pos', 'neg'):
        for word in self.reader.words(categories=[label]):
            token = word.lower()
            word_fd[token] += 1
            label_word_fd[label][token] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    wordScores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        wordScores[word] = pos_score + neg_score

    # Index-based key replaces the Python-2-only tuple-parameter lambda.
    termScore = sorted(wordScores.items(), key=lambda item: item[1],
                       reverse=True)[:size]
    self.terms = [w for (w, s) in termScore]
def jieba_feature(number):
    """Select the ``number`` most informative words as boolean features.

    Tokens are read with ``read_file`` from the positive and negative weibo
    files and ranked by summed per-class chi-square statistics.

    Args:
        number: feature dimensionality, i.e. how many top words to keep;
            meant to be tuned.

    Returns:
        dict ``{word: True}`` for each selected word.
    """
    # Flatten the collection of collections into one flat token list per class.
    pos_tokens = [tok for group in read_file('data/weibo_data/pos_weibo.txt') for tok in group]
    neg_tokens = [tok for group in read_file('data/weibo_data/neg_weibo.txt') for tok in group]

    overall = FreqDist()  # frequency of every word; overall.N() equals n_all below
    by_polarity = ConditionalFreqDist()  # frequency within positive or negative texts
    for tok in pos_tokens:
        overall[tok] += 1
        by_polarity['pos'][tok] += 1
    for tok in neg_tokens:
        overall[tok] += 1
        by_polarity['neg'][tok] += 1

    n_pos = by_polarity['pos'].N()  # positive token count (.B() would give distinct words)
    n_neg = by_polarity['neg'].N()  # negative token count
    n_all = n_pos + n_neg

    scores = {}  # word -> informativeness
    for tok, count in overall.items():
        # Chi-square here; mutual information etc. could be used instead.
        from_pos = BigramAssocMeasures.chi_sq(by_polarity['pos'][tok], (count, n_pos), n_all)
        from_neg = BigramAssocMeasures.chi_sq(by_polarity['neg'][tok], (count, n_neg), n_all)
        # Informativeness is the positive plus the negative chi-square statistic.
        scores[tok] = from_pos + from_neg

    # Rank words by informativeness, highest first, and keep the top `number`.
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    chosen = set([tok for tok, _ in ranked[:number]])
    return {tok: True for tok in chosen}
def create_word_scores():
    """Score every word of the pickled review sets by chi-square association.

    Returns:
        dict mapping each word to its informativeness: the sum of its
        positive-class and negative-class chi-square statistics.
    """
    # Pickles are binary data: open in 'rb' and close the handles promptly.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open('D:/code/sentiment_test/pos_review.pkl', 'rb') as pos_file:
        posWords = pickle.load(pos_file)
    with open('D:/code/sentiment_test/neg_review.pkl', 'rb') as neg_file:
        negWords = pickle.load(neg_file)

    posWords = list(itertools.chain(*posWords))  # flatten nested lists into one list
    negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()  # frequency of every word
    cond_word_fd = ConditionalFreqDist()  # frequency within positive / negative texts
    # FreqDist.inc() was removed in NLTK 3; use item assignment.
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # number of positive tokens
    neg_word_count = cond_word_fd['neg'].N()  # number of negative tokens
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        # Chi-square here; mutual information etc. could be used instead.
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def get_best_words(words_list, num_best_words):
    """Return the ``num_best_words`` highest-scoring lowercase tokens.

    Args:
        words_list: Iterable of ``(line, sent)`` pairs. ``line`` is tokenized
            with ``nltk.word_tokenize``; ``sent`` is the label under which the
            tokens are counted (only ``'pos'`` and ``'neg'`` are scored).
        num_best_words: Maximum number of tokens to return.

    Returns:
        set of the selected tokens.
    """
    from nltk.probability import FreqDist, ConditionalFreqDist
    from nltk.metrics import BigramAssocMeasures

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for line, sent in words_list:
        for word in nltk.word_tokenize(line):
            token = word.lower()
            # Index-increment replaces FreqDist.inc(), removed in NLTK 3.
            word_fd[token] += 1
            label_word_fd[sent][token] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # Index the pair instead of using tuple-parameter unpacking in the lambda,
    # which is a syntax error on Python 3.
    best = sorted(word_scores.items(), key=lambda item: item[1],
                  reverse=True)[:num_best_words]
    return set(w for w, _ in best)
def create_word_scores():
    """Score every word from ``static/pos.txt`` and ``static/neg.txt``.

    Words are loaded with ``get_word`` and flattened. Each distinct word is
    scored as the sum of its chi-square statistic against the positive class
    and against the negative class.

    Returns:
        dict mapping each word to its score.
    """
    posWords = get_word('static/pos.txt')
    negWords = get_word('static/neg.txt')

    # Flatten the nested lists into flat word lists.
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()                   # frequency of every word
    cond_word_fd = ConditionalFreqDist()   # word frequency per polarity
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() works on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_fd.items():
        # Another association measure could be used instead of chi-square.
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_word_bigram_scores():
    """Score words and top bigrams of the pickled review corpora.

    For each polarity the flattened word list is extended with its 5000 best
    bigrams (ranked by chi-square). Every resulting feature is then scored as
    the sum of its chi-square statistic against the positive class and
    against the negative class.

    Returns:
        dict mapping each word or bigram tuple to its score.
    """
    # Pickle streams are binary: open in 'rb' and close the handles.
    # NOTE(review): the original used text mode ('r'); confirm the existing
    # .pkl files load correctly in binary mode.
    with open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl', 'rb') as pos_file:
        posdata = pickle.load(pos_file)
    with open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl', 'rb') as neg_file:
        negdata = pickle.load(neg_file)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # One finder per corpus. The original reused a single variable, so the
    # negative finder overwrote the positive one and the "positive" bigrams
    # were actually extracted from the negative corpus.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() works on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_word_bigram_scores():
    """Score words and top bigrams from ``static/pos.txt``/``static/neg.txt``.

    For each polarity the flattened word list is extended with its ``number``
    best bigrams (ranked by chi-square). Every resulting feature is scored as
    the sum of its chi-square statistic against both classes.

    ``number`` is not a parameter: it is read from the enclosing scope, as in
    the original code.

    Returns:
        dict mapping each word or bigram tuple to its score.
    """
    posdata = get_word('static/pos.txt')
    negdata = get_word('static/neg.txt')

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    posbigram_finder = BigramCollocationFinder.from_words(posWords)
    negbigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = posbigram_finder.nbest(BigramAssocMeasures.chi_sq, number)
    negBigrams = negbigram_finder.nbest(BigramAssocMeasures.chi_sq, number)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() works on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_word_scores(posWords,negWords,posTag,negTag):
    """Score every word of two labelled corpora by chi-square.

    Args:
        posWords: Nested iterable of words for the positive class.
        negWords: Nested iterable of words for the negative class.
        posTag: Condition key under which positive words are counted.
        negTag: Condition key under which negative words are counted.

    Returns:
        dict mapping each word to the sum of its chi-square statistic against
        the ``posTag`` class and against the ``negTag`` class.
    """
    from nltk.probability import FreqDist, ConditionalFreqDist
    import itertools

    # Flatten the nested lists into flat word lists.
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()                   # frequency of every word
    cond_word_fd = ConditionalFreqDist()   # word frequency per tag
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd[posTag][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd[negTag][word] += 1

    pos_word_count = cond_word_fd[posTag].N()
    neg_word_count = cond_word_fd[negTag].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() works on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_fd.items():
        # Another association measure could be used instead of chi-square.
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd[posTag][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd[negTag][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_bigram_scores():
    """Score the top bigrams of the positive and negative review sheets.

    Reviews are loaded with ``tp.seg_fil_senti_excel`` and flattened. The
    8000 best bigrams (ranked by chi-square) of each polarity are scored as
    the sum of their chi-square statistic against both classes.

    Returns:
        dict mapping each bigram tuple to its score.
    """
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1")

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # One finder per corpus. The original reused a single variable, so both
    # bigram lists were extracted from the negative corpus.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)

    # Only bigrams are used as features here (no single words).
    pos = posBigrams
    neg = negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # Index-increment replaces FreqDist.inc(), which NLTK 3 removed.
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() works on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_word_bigram_scores(posWords, negWords):
    """Score words and top bigrams of two word lists by chi-square.

    Each list is extended with its 2000 best bigrams (ranked by chi-square).
    Features are keyed by ``str(feature)``, so a bigram tuple is stored under
    its string representation.

    Args:
        posWords: List of positive-class words.
        negWords: List of negative-class words.

    Returns:
        dict mapping each feature string to the sum of its chi-square
        statistic against the positive and the negative class.
    """
    # One finder per corpus. The original reused a single variable, so the
    # "positive" bigrams were actually extracted from negWords.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        key = str(word)
        word_fd[key] += 1
        cond_word_fd['pos'][key] += 1
    for word in neg:
        key = str(word)
        word_fd[key] += 1
        cond_word_fd['neg'][key] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() works on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_word_bigram_scores(pos_corpus, neg_corpus):
    """Score every feature found in two labelled corpora by chi-square.

    Args:
        pos_corpus: Iterable of positive documents, each an iterable of
            hashable features.
        neg_corpus: Iterable of negative documents, same layout.

    Returns:
        dict mapping each feature to the sum of its chi-square statistic
        against the positive and the negative class.
    """
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for corpus in pos_corpus:
        for word in corpus:
            word_fd[word] += 1
            cond_word_fd['pos'][word] += 1
    for corpus in neg_corpus:
        for word in corpus:
            word_fd[word] += 1
            cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() works on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def unigram_chi(cls, pos, neg, n=1000):
    """Select the ``n`` unigrams with the highest chi-square score.

    Args:
        pos: Nested iterable of positive-class words.
        neg: Nested iterable of negative-class words.
        n: Feature dimension, i.e. how many top-scoring words to keep.

    Returns:
        dict mapping each selected word to ``1``.
    """
    # Flatten both corpora up front, positive first.
    flat = (
        ('pos', list(itertools.chain(*pos))),
        ('neg', list(itertools.chain(*neg))),
    )

    total_tf = FreqDist()               # frequency of every word
    label_tf = ConditionalFreqDist()    # word frequency per polarity
    for label, tokens in flat:
        for token in tokens:
            total_tf[token] += 1
            label_tf[label][token] += 1

    n_pos = label_tf['pos'].N()
    n_neg = label_tf['neg'].N()
    n_total = n_pos + n_neg

    # Score = positive chi-square + negative chi-square. Another association
    # measure could be used here instead of chi-square.
    scores = {}
    for token, count in total_tf.items():
        pos_part = BigramAssocMeasures.chi_sq(
            label_tf['pos'][token], (count, n_pos), n_total)
        neg_part = BigramAssocMeasures.chi_sq(
            label_tf['neg'][token], (count, n_neg), n_total)
        scores[token] = pos_part + neg_part

    # Rank by score, highest first, and keep the leading n words.
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return {token: 1 for token, _ in ranked[:n]}
def find_best_words(positiveWords, negativWords, dimention_num):
    """Rank words and top bigrams by chi-square and return the best ones.

    Each word list is extended with its 5000 best bigrams (ranked by
    chi-square). Every feature is scored as the sum of its chi-square
    statistic against the positive and the negative class.

    Args:
        positiveWords: List of positive-class words.
        negativWords: List of negative-class words.
        dimention_num: Number of top-scoring features to return.

    Returns:
        list of features ordered by descending score, at most
        ``dimention_num`` long.
    """
    chi_sq = BigramAssocMeasures.chi_sq

    pos_bigrams = BCF.from_words(positiveWords).nbest(chi_sq, 5000)
    neg_bigrams = BCF.from_words(negativWords).nbest(chi_sq, 5000)
    pos_features = positiveWords + pos_bigrams
    neg_features = negativWords + neg_bigrams

    overall_fd = FreqDist(pos_features + neg_features)
    pos_fd = FreqDist(pos_features)
    neg_fd = FreqDist(neg_features)

    n_pos = pos_fd.N()
    n_neg = neg_fd.N()
    n_total = n_pos + n_neg

    scores = {}
    for feature, count in overall_fd.items():
        pos_part = chi_sq(pos_fd[feature], (count, n_pos), n_total)
        neg_part = chi_sq(neg_fd[feature], (count, n_neg), n_total)
        scores[feature] = pos_part + neg_part

    ranked = sorted(scores, key=scores.get, reverse=True)
    return ranked[:dimention_num]
def create_word_bigram_scores(posdata,negdata):
    """Score words and top bigrams of two nested corpora by chi-square.

    Args:
        posdata: Nested iterable of positive-class words.
        negdata: Nested iterable of negative-class words.

    Returns:
        dict mapping each word or bigram tuple to the sum of its chi-square
        statistic against the positive and the negative class.
    """
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finder_pos = BigramCollocationFinder.from_words(posWords)
    bigram_finder_neg = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder_pos.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = bigram_finder_neg.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() works on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def store_word_scores(self):
    """Compute chi-square word scores and store them in Redis.

    Reads the pickled frequency distributions from the ``'word_fd'`` and
    ``'label_fd'`` keys of ``self.r``, scores each word as the sum of its
    chi-square statistic against the ``'pos'`` and ``'neg'`` classes, and
    writes the result under the ``'word_scores'`` key.

    If the distributions have not been built (``pickle.loads`` raises
    ``TypeError``, e.g. because the key lookup returned ``None``), a message
    is printed and nothing is stored.
    """
    try:
        word_freqdist = pickle.loads(self.r.get('word_fd'))
        label_word_freqdist = pickle.loads(self.r.get('label_fd'))
    except TypeError:
        print('Requires frequency distributions to be built.')
        # Bail out: the original fell through and crashed with an
        # UnboundLocalError on the undefined distributions.
        return

    word_scores = {}
    pos_word_count = label_word_freqdist['pos'].N()
    neg_word_count = label_word_freqdist['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    # .items() works on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_freqdist.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_freqdist['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_freqdist['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # NOTE(review): the dict is stored as-is while the inputs above are
    # pickled; confirm the reader of 'word_scores' does not expect
    # pickle.dumps(word_scores).
    self.r.set('word_scores', word_scores)
def _get_best_words(self):
    """Compute the set of most informative movie-review words.

    Lowercased words of the ``movie_reviews`` corpus are scored by the sum of
    their chi-square statistic against the positive and the negative
    category. The 10000 best-scoring words, minus those in ``self.stopset``,
    are stored in ``self.best_words_set``.
    """
    overall_fd = FreqDist()
    category_fd = ConditionalFreqDist()
    for category in ('pos', 'neg'):
        for raw_word in movie_reviews.words(categories=[category]):
            token = raw_word.lower()
            overall_fd[token] += 1
            category_fd[category][token] += 1

    n_pos = category_fd['pos'].N()
    n_neg = category_fd['neg'].N()
    n_total = n_pos + n_neg

    scores = {}
    for token, count in overall_fd.items():
        pos_part = BigramAssocMeasures.chi_sq(
            category_fd['pos'][token], (count, n_pos), n_total)
        neg_part = BigramAssocMeasures.chi_sq(
            category_fd['neg'][token], (count, n_neg), n_total)
        scores[token] = pos_part + neg_part

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    # Stop words are removed after the cut, so fewer than 10000 may remain.
    self.best_words_set = {
        token for token, _ in ranked[:10000] if token not in self.stopset
    }
def __setTermsCHISQUARE__(self,size):
    """Select the ``size`` best chi-square terms from ``self.reader``.

    Lowercased words of the ``'pos'`` and ``'neg'`` categories are scored as
    the sum of their chi-square statistic against each category. The
    top-scoring words are stored, best first, in ``self.terms``.

    Args:
        size: Maximum number of terms to keep.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    # Index-increment replaces FreqDist.inc(), which NLTK 3 removed.
    for word in self.reader.words(categories=['pos']):
        token = word.lower()
        word_fd[token] += 1
        label_word_fd['pos'][token] += 1
    for word in self.reader.words(categories=['neg']):
        token = word.lower()
        word_fd[token] += 1
        label_word_fd['neg'][token] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    wordScores = {}
    # .items() works on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        wordScores[word] = pos_score + neg_score

    # Index the pair instead of using tuple-parameter unpacking in the lambda,
    # which is a syntax error on Python 3.
    termScore = sorted(wordScores.items(), key=lambda item: item[1],
                       reverse=True)[:size]
    self.terms = [w for (w, s) in termScore]
def create_word_scores():
    """Score every word from ``get_pos_words``/``get_neg_words`` by chi-square.

    Returns:
        dict mapping each word to the sum of its chi-square statistic against
        the positive and the negative class.
    """
    overall_fd = nltk.FreqDist()                # frequency of every word
    polarity_fd = nltk.ConditionalFreqDist()    # word frequency per polarity

    # Positive words are counted first, then negative words; each loader is
    # called only when its turn comes.
    for label, load_words in (('pos', get_pos_words), ('neg', get_neg_words)):
        for token in load_words():
            overall_fd[token] += 1
            polarity_fd[label][token] += 1

    n_pos = polarity_fd['pos'].N()
    n_neg = polarity_fd['neg'].N()
    n_total = n_pos + n_neg

    # Score = positive chi-square + negative chi-square. Another association
    # measure could be used here instead of chi-square.
    scores = {}
    for token, count in overall_fd.items():
        pos_part = BigramAssocMeasures.chi_sq(
            polarity_fd['pos'][token], (count, n_pos), n_total)
        neg_part = BigramAssocMeasures.chi_sq(
            polarity_fd['neg'][token], (count, n_neg), n_total)
        scores[token] = pos_part + neg_part
    return scores
def _get_bigram_scores(self, posdata, negdata):
    """Score words and top bigrams of two nested corpora by chi-square.

    Args:
        posdata: Nested iterable of positive-class words.
        negdata: Nested iterable of negative-class words.

    Returns:
        dict mapping each word or bigram tuple to the sum of its chi-square
        statistic against the positive and the negative class.
    """
    pos_words = list(itertools.chain(*posdata))
    neg_words = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(pos_words)
    neg_bigram_finder = BigramCollocationFinder.from_words(neg_words)
    pos_bigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    neg_bigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = pos_words + pos_bigrams
    neg = neg_words + neg_bigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() works on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def getBestWords(posWords,negWords):
    """Score lowercased words by chi-square and return them as a set.

    Args:
        posWords: Iterable of positive-class words.
        negWords: Iterable of negative-class words.

    Returns:
        set of every scored (lowercased) word.
    """
    overall_fd = FreqDist()
    label_fd = ConditionalFreqDist()
    for label, words in (('pos', posWords), ('neg', negWords)):
        for raw_word in words:
            token = raw_word.lower()
            overall_fd[token] += 1
            label_fd[label][token] += 1

    n_pos = label_fd['pos'].N()
    n_neg = label_fd['neg'].N()
    n_total = n_pos + n_neg

    scores = {}
    for token, count in overall_fd.items():
        pos_part = BigramAssocMeasures.chi_sq(
            label_fd['pos'][token], (count, n_pos), n_total)
        neg_part = BigramAssocMeasures.chi_sq(
            label_fd['neg'][token], (count, n_neg), n_total)
        scores[token] = pos_part + neg_part

    # NOTE(review): the ranking is never truncated, so every word is kept;
    # an earlier variant apparently cut the list at 10000 -- confirm intent.
    ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    return {token for token, _ in ranked}
def _computeInstanceInformativeWords(self, cf_dist=None, f_dist=None):
    """Compute the most informative words using the chi-square statistic.

    If ``self._loadData('informative_words.bin')`` returns a truthy value it
    is used directly. Otherwise, when both distributions are given, every
    word of ``f_dist`` is scored as the sum of its chi-square statistic
    against the ``"positive"`` and ``"negative"`` conditions, the top 1% are
    kept and the result is persisted with ``self._saveData``.

    Args:
        cf_dist: Conditional frequency distribution with ``"positive"`` and
            ``"negative"`` conditions, or ``None``.
        f_dist: Overall frequency distribution, or ``None``.

    Side effects:
        Sets ``self.informative_words`` to a dict mapping word to score (or
        to the cached value, or to an empty dict when inputs are missing).
    """
    buff = self._loadData('informative_words.bin')
    if buff:
        self.informative_words = buff
        return
    if cf_dist is None or f_dist is None:
        self.informative_words = dict()
        return

    total_num_words = f_dist.N()
    total_positive_words = cf_dist["positive"].N()
    total_negative_words = cf_dist["negative"].N()

    words_score = dict()
    for word in f_dist.keys():
        pos_score = BigramAssocMeasures.chi_sq(
            cf_dist["positive"][word], (f_dist[word], total_positive_words), total_num_words)
        neg_score = BigramAssocMeasures.chi_sq(
            cf_dist["negative"][word], (f_dist[word], total_negative_words), total_num_words)
        words_score[word] = pos_score + neg_score

    # Keep the 1% most useful words. items() plus pair indexing replaces
    # iteritems() and the tuple-unpacking lambda, neither of which exists on
    # Python 3.
    ranked = sorted(words_score.items(), key=lambda item: item[1], reverse=True)
    self.informative_words = dict(ranked[:int(0.01*len(words_score))])
    self._saveData('informative_words.bin',self.informative_words)
def create_word_bigram_scores():
    """Score words and top bigrams of the pickled corpora by chi-square.

    The corpora are unpickled from the ``pos_f`` and ``neg_f`` paths (names
    taken from the enclosing scope). Each flattened word list is extended
    with its 5000 best bigrams (ranked by chi-square).

    Returns:
        dict mapping each word or bigram tuple to the sum of its chi-square
        statistic against the positive and the negative class.
    """
    # Context managers close the handles the original left open.
    with open(pos_f, 'rb') as pos_file:
        posdata = pickle.load(pos_file)
    with open(neg_f, 'rb') as neg_file:
        negdata = pickle.load(neg_file)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd["pos"][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd["neg"][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        # Another association measure could be used instead of chi-square.
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def get_most_informative_words_chi(self, num_best_words):
    """Return the most informative movie-review words by chi-square.

    Lowercased words of the ``movie_reviews`` corpus are scored as the sum of
    their chi-square statistic against the positive and negative categories.

    Args:
        num_best_words: Maximum number of words to return.

    Returns:
        set of the top-scoring words.
    """
    overall_fd = FreqDist()
    category_fd = ConditionalFreqDist()
    for category in ('pos', 'neg'):
        for raw_word in movie_reviews.words(categories=[category]):
            token = raw_word.lower()
            overall_fd[token] += 1
            category_fd[category][token] += 1

    n_pos = category_fd['pos'].N()
    n_neg = category_fd['neg'].N()
    n_total = n_pos + n_neg

    scores = {}
    for token, count in overall_fd.items():
        # chi_sq argument layout is explained at
        # https://stackoverflow.com/questions/32549376/can-someone-explain-the-syntax-of-bigramassocmeasures-chi-sq
        pos_part = BigramAssocMeasures.chi_sq(
            category_fd['pos'][token], (count, n_pos), n_total)
        neg_part = BigramAssocMeasures.chi_sq(
            category_fd['neg'][token], (count, n_neg), n_total)
        scores[token] = pos_part + neg_part

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return {token for token, _ in ranked[:num_best_words]}
def get_ranked_ngrams(self, wlist="all", pos=True):
    """Score every selected n-gram with the chi-square association metric.

    For each tweet in ``self.tweets`` the n-grams returned by
    ``self.get_selected_text`` are counted under the label of the matching
    entry in ``self.instances``. Each n-gram is scored as the sum of its
    chi-square statistic against the ``"positive"`` and ``"negative"``
    labels; other labels are counted but do not contribute to the totals.

    Args:
        wlist: Unused by this method.
        pos: Unused by this method.

    Returns:
        dict mapping each n-gram to its score. N-grams whose score cannot be
        computed are left out.
    """
    word_fd = nltk.FreqDist()
    tag_fd = nltk.ConditionalFreqDist()
    for key, tweet in self.tweets.items():
        word_list = self.get_selected_text(tweet)
        label = self.instances[key].label
        for ngram in word_list:
            # Index-increment replaces FreqDist.inc(), removed in NLTK 3.
            word_fd[ngram] += 1
            tag_fd[label][ngram] += 1

    num_pos = tag_fd["positive"].N()
    num_neg = tag_fd["negative"].N()
    # Neutral tweets are ignored in the totals.
    total = num_pos + num_neg

    ngram_dict = {}
    for ngram, frequency in word_fd.items():
        try:
            # Build chi_sq metrics for both the positive and negative tags.
            pos_metric = BigramAssocMeasures.chi_sq(
                tag_fd['positive'][ngram], (frequency, num_pos), total)
            neg_metric = BigramAssocMeasures.chi_sq(
                tag_fd['negative'][ngram], (frequency, num_neg), total)
        except Exception:
            # Best effort: skip n-grams that cannot be scored, but no longer
            # swallow KeyboardInterrupt/SystemExit as the bare except did.
            continue
        ngram_dict[ngram] = pos_metric + neg_metric
    return ngram_dict
def __init__(self):
    """Select the best words by chi-square, then train the classifier.

    Lowercased words of the ``movie_reviews`` corpus are scored as the sum of
    their chi-square statistic against the positive and negative categories.
    The 10000 best are stored in ``self.bestwords`` before
    ``self.train_classifier()`` is called.
    """
    # Best-words feature extraction.
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    # Index-increment replaces FreqDist.inc(), which NLTK 3 removed.
    for word in movie_reviews.words(categories=['pos']):
        token = word.lower()
        word_fd[token] += 1
        label_word_fd['pos'][token] += 1
    for word in movie_reviews.words(categories=['neg']):
        token = word.lower()
        word_fd[token] += 1
        label_word_fd['neg'][token] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() works on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # Index the pair instead of using tuple-parameter unpacking in the lambda,
    # which is a syntax error on Python 3.
    best = sorted(word_scores.items(), key=lambda item: item[1],
                  reverse=True)[:10000]
    self.bestwords = set(w for w, _ in best)
    self.train_classifier()
def create_word_scores():
    """Score every word of the positive/negative review sheets by chi-square.

    Reviews are loaded with ``tp.seg_fil_senti_excel`` and flattened. Each
    distinct word is scored as the sum of its chi-square statistic against
    the positive and the negative class.

    Returns:
        dict mapping each word to its score.
    """
    posdata = tp.seg_fil_senti_excel(
        "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/pos_review.xlsx", 1, 1
    )
    negdata = tp.seg_fil_senti_excel(
        "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/neg_review.xlsx", 1, 1
    )

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd["pos"][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd["neg"][word] += 1

    pos_word_count = cond_word_fd["pos"].N()
    neg_word_count = cond_word_fd["neg"].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() works on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd["pos"][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd["neg"][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_word_scores(self):
    """Score every word returned by ``self.getAllWords()`` by chi-square.

    ``getAllWords`` is expected to return a ``[posWords, negWords]`` pair of
    nested word collections, which are flattened before counting. The total
    word count is reported through ``log``.

    Returns:
        dict mapping each word to the sum of its chi-square statistic against
        the positive and the negative class.
    """
    [posWords, negWords] = self.getAllWords()

    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # Index-increment replaces FreqDist.inc(), which NLTK 3 removed.
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    log("Total number of words: %d" % total_word_count)

    word_scores = {}
    # .items() works on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_word_bigram_scores():
    """Score words and top bigrams of the good/bad news files by chi-square.

    Texts are loaded with ``tp.seg_fil_txt`` and flattened. Each word list is
    extended with up to 350000 of its best bigrams (ranked by chi-square).

    Returns:
        dict mapping each word or bigram tuple to the sum of its chi-square
        statistic against the positive and the negative class.
    """
    posdata = tp.seg_fil_txt("/home/hadoop/goodnew.txt")
    negdata = tp.seg_fil_txt("/home/hadoop/badnew.txt")

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq,350000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq,350000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # Index-increment replaces FreqDist.inc(), which NLTK 3 removed.
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() works on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_word_scores():
    """Score every word of the pickled corpora by chi-square.

    The corpora are unpickled from the ``pos_f`` and ``neg_f`` paths (names
    taken from the enclosing scope) and flattened. Each distinct word is
    scored as the sum of its chi-square statistic against the positive and
    the negative class.

    Returns:
        dict mapping each word to its score.
    """
    # Context managers close the handles the original left open.
    with open(pos_f, 'rb') as pos_file:
        posWords = pickle.load(pos_file)
    with open(neg_f, 'rb') as neg_file:
        negWords = pickle.load(neg_file)

    # Flatten the nested lists into flat word lists.
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()                   # frequency of every word
    cond_word_fd = ConditionalFreqDist()   # word frequency per polarity
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd["pos"][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd["neg"][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores