def create_word_scores(sentences):
    # logging.info(sentences)
    words = list(itertools.chain(*sentences))
    # logging.info(words)
    # Build a frequency distribution of all words, plus per-label
    # (positive/negative) frequency distributions.
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in words:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1
    # Find the number of positive and negative words, as well as the total.
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    # Build a dictionary of word scores based on the chi-squared test.
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
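# A minimal illustration (added; not from the source above) of the chi_sq call
# used throughout these snippets. NLTK's BigramAssocMeasures.chi_sq(n_ii,
# (n_ix, n_xi), n_xx) treats the problem as a 2x2 contingency table: n_ii is
# the word's count under one label, n_ix its count across all labels, n_xi the
# label's total word count, and n_xx the grand total. The numbers are made up.
from nltk.metrics import BigramAssocMeasures

score = BigramAssocMeasures.chi_sq(
    10,         # n_ii: "great" appeared 10 times in positive reviews
    (12, 500),  # n_ix: 12 occurrences overall; n_xi: 500 positive tokens
    1000,       # n_xx: 1000 tokens in total
)
print(score)  # a higher score means a stronger word/label association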
def create_word_bigram_scores():
    posdata = tp.seg_fil_senti_excel("~", 1, 1)
    negdata = tp.seg_fil_senti_excel("~", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # One finder per label; the original reused a single finder, so the
    # positive bigrams were drawn from the negative words.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    last_word = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        last_word['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        last_word['neg'][word] += 1

    pos_word_count = last_word['pos'].N()
    neg_word_count = last_word['neg'].N()
    totalnumber = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(last_word['pos'][word], (freq, pos_word_count), totalnumber)
        neg_score = BigramAssocMeasures.chi_sq(last_word['neg'][word], (freq, neg_word_count), totalnumber)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_scores():
    # posdata is a list (length 1084); each element is itself a list such as
    # [u'\u7535\u6c60', u'\u4e0d\u7ed9\u529b', u'\u90fd'], i.e. the segmented
    # words of one review, e.g. [电池 不给力 都 很 好 老婆 买 带 16G 卡 一张].
    # posdata, negdata and objdata are assumed to be defined in the enclosing scope.
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    objWords = list(itertools.chain(*objdata))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    for word in objWords:
        word_fd[word] += 1
        cond_word_fd['obj'][word] += 1

    # N() sums all observed counts: the total number of word occurrences under the 'pos' label.
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    obj_word_count = cond_word_fd['obj'].N()
    total_word_count = pos_word_count + neg_word_count + obj_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        obj_score = BigramAssocMeasures.chi_sq(cond_word_fd['obj'][word], (freq, obj_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score + obj_score
    return word_scores
def create_word_scores(posWords, negWords, score_method=BigramAssocMeasures.chi_sq):
    '''Score the informativeness of individual words (unigrams).'''
    word_fd = FreqDist()                  # frequency of every word
    cond_word_fd = ConditionalFreqDist()  # per-label frequencies for positive and negative texts
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # number of words in positive reviews
    neg_word_count = cond_word_fd['neg'].N()  # number of words in negative reviews
    total_word_count = pos_word_count + neg_word_count
    print("IN_POSWORD_NUMS : %d\tIN_NEGWORD_NUMS : %d" % (pos_word_count, neg_word_count))

    # Chi-squared by default; other measures such as mutual information also work.
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = score_method(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = score_method(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score  # informativeness = positive + negative chi-squared
    return word_scores  # mapping: word -> informativeness
def create_word_scores():
    posWords = pickle.load(open(POSFILE, 'rb'))
    negWords = pickle.load(open(NEGFILE, 'rb'))
    posWords = list(itertools.chain(*posWords))  # flatten the nested list into one dimension
    negWords = list(itertools.chain(*negWords))  # likewise

    word_fd = FreqDist()                  # frequency of every word
    cond_word_fd = ConditionalFreqDist()  # per-label frequencies for positive and negative texts
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # number of positive words
    neg_word_count = cond_word_fd['neg'].N()  # number of negative words
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        # Chi-squared statistic under each label; other measures such as
        # mutual information could be substituted.
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score  # informativeness = positive + negative chi-squared
    return word_scores  # every word together with its informativeness
def create_word_scores():
    # Pickle files must be opened in binary mode.
    posWords = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl', 'rb'))
    negWords = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl', 'rb'))
    posWords = list(itertools.chain(*posWords))  # flatten the nested list into one dimension
    negWords = list(itertools.chain(*negWords))  # likewise

    word_fd = FreqDist()                  # frequency of every word
    cond_word_fd = ConditionalFreqDist()  # per-label frequencies for positive and negative texts
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # number of positive words
    neg_word_count = cond_word_fd['neg'].N()  # number of negative words
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        # Chi-squared statistic under each label; other measures such as
        # mutual information could be substituted.
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score  # informativeness = positive + negative chi-squared
    return word_scores  # every word together with its informativeness
def getBestwords(self):
    sentences = self.getTrainedData(self.trainingData)
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for words in sentences:
        for word in words:
            word_fd[word.lower()] += 1
            label_word_fd['best'][word.lower()] += 1

    word_count = label_word_fd['best'].N()
    # Only one label here, so the total counts it twice, mirroring the
    # two-label variants of this function.
    total_word_count = word_count + word_count

    word_scores = {}
    for word, freq in word_fd.items():
        score = BigramAssocMeasures.chi_sq(label_word_fd['best'][word], (freq, word_count), total_word_count)
        word_scores[word] = score

    self.best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:5000]
    self.bestwords = set(w for w, s in self.best)
    return self.best
def create_word_scores(posWords, negWords, posTag, negTag):
    from nltk.probability import FreqDist, ConditionalFreqDist
    import itertools
    posWords = list(itertools.chain(*posWords))  # flatten the nested list into one dimension
    negWords = list(itertools.chain(*negWords))  # likewise

    word_fd = FreqDist()                  # frequency of every word
    cond_word_fd = ConditionalFreqDist()  # per-label frequencies for positive and negative texts
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd[posTag][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd[negTag][word] += 1

    pos_word_count = cond_word_fd[posTag].N()  # number of positive words
    neg_word_count = cond_word_fd[negTag].N()  # number of negative words
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        # Chi-squared statistic under each label; other measures could be used.
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd[posTag][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd[negTag][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score  # informativeness = positive + negative chi-squared
    return word_scores  # every word together with its informativeness
def create_word_scores(posWords, negWords, posTag, negTag):
    from nltk.probability import FreqDist, ConditionalFreqDist
    import itertools
    posWords = list(itertools.chain(*posWords))  # flatten the nested list into one dimension
    negWords = list(itertools.chain(*negWords))  # likewise

    word_fd = FreqDist()                  # frequency of every word
    cond_word_fd = ConditionalFreqDist()  # per-label frequencies for positive and negative texts
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd[posTag][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd[negTag][word] += 1

    pos_word_count = cond_word_fd[posTag].N()  # number of positive words
    neg_word_count = cond_word_fd[negTag].N()  # number of negative words
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        # Chi-squared statistic under each label; other measures could be used.
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd[posTag][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd[negTag][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score  # informativeness = positive + negative chi-squared
    return word_scores  # every word together with its informativeness
def create_word_bigram_scores(posWords, negWords):
    # One finder per label; the original reused a single finder, so the
    # positive bigrams were drawn from the negative words.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[str(word)] += 1
        cond_word_fd['pos'][str(word)] += 1
    for word in neg:
        word_fd[str(word)] += 1
        cond_word_fd['neg'][str(word)] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores(posWords, negWords, score_method=BigramAssocMeasures.chi_sq):
    '''Score informativeness using words together with bigram collocations.'''
    bigram_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = bigram_finder.nbest(score_method, 5000)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = bigram_finder.nbest(score_method, 5000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    print("BIGRAM_IN_POSWORD_NUMS : %d\tBIGRAM_IN_NEGWORD_NUMS : %d" % (pos_word_count, neg_word_count))

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = score_method(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = score_method(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
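# A minimal usage sketch (added; not from the sources): rank the scores that
# create_word_bigram_scores above returns and keep the top N as the feature
# vocabulary. posWords and negWords are assumed to be flat lists of tokens.
def select_top_features(posWords, negWords, number=2000):
    word_scores = create_word_bigram_scores(posWords, negWords)
    ranked = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)
    return set(w for w, s in ranked[:number])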
def setup():
    global bestwords
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    # Strip consistently so both distributions share the same keys (the
    # original stripped punctuation only in word_fd).
    for word in movie_reviews.words(categories=['pos']):
        word = word.strip('\'"?,.').lower()
        word_fd[word] += 1
        label_word_fd['pos'][word] += 1
    for word in movie_reviews.words(categories=['neg']):
        word = word.strip('\'"?,.').lower()
        word_fd[word] += 1
        label_word_fd['neg'][word] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:10000]
    bestwords = set(w for w, s in best)
    return train(best_bigram_word_features)
def create_word_scores(self):
    [posWords, negWords] = self.getAllWords()

    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    log("Total number of words: %d" % total_word_count)

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores():
    posdata = tp.seg_fil_txt("/home/hadoop/goodnew.txt")
    negdata = tp.seg_fil_txt("/home/hadoop/badnew.txt")

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 350000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 350000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def get_bestwords(contents, labels, limit=10000, n=None, cache=True):
    if cache:
        if n:
            cache_path = 'cache/%s_%s.pkl' % (limit, n)
            if os.path.exists(cache_path):
                bestwords = pickle.load(open(cache_path, 'rb'))
                print('Loaded from cache')
                print('bestwords count = %d' % (len(bestwords)))
                return bestwords

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    pos_contents = contents[labels == 1]
    neg_contents = contents[labels == 0]  # the original used "labels != 0", which also matched the positives

    pos_words = set()
    neg_words = set()
    for pos_content in pos_contents:
        pos_words = pos_words.union(word_tokenize(pos_content))
    for neg_content in neg_contents:
        neg_words = neg_words.union(word_tokenize(neg_content))

    for word in pos_words:
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1
    for word in neg_words:
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:limit]
    bestwords = set(w for w, s in best)

    print('all words count = %d' % (len(word_scores)))
    print('bestwords count = %d' % (len(bestwords)))

    if cache:
        if n:
            cache_path = 'cache/%s_%s.pkl' % (limit, n)
            with open(cache_path, 'wb') as f:
                pickle.dump(bestwords, f)
            print('Dumped to cache')
    return bestwords
def getWordScores():
    posWords = []
    negWords = []
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords.append(posWord)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords.append(negWord)
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def tfidf(phrase_lists, corpus=nltk.corpus.brown.words(), ngram_range=(1, 6)):
    ranker = CorpusRanker(corpus, ngram_range)
    phrase_frequencies = FreqDist(tuple(p) for p in phrase_lists)
    phrase_scores = {}
    for phrase, freq in phrase_frequencies.items():
        phrase_scores[phrase] = ranker.score(phrase, freq)
    return phrase_scores, phrase_frequencies
def most_informative_words(corpus, categories=['dem', 'rep'], count=2500):
    fd = FreqDist()
    cond_fd = ConditionalFreqDist()
    word_counts = {}
    for cat in categories:
        for word in corpus.words(categories=[cat]):
            word = word.lower().strip(".!?:,/ ")
            if not word.isalpha() or word in stopset:
                continue
            fd[word] += 1
            cond_fd[cat][word] += 1
        word_counts[cat] = cond_fd[cat].N()
    total_word_count = sum(word_counts.values())

    word_scores = collections.defaultdict(int)
    for word, freq in fd.items():
        for cat in categories:
            cat_word_score = BigramAssocMeasures.chi_sq(cond_fd[cat][word], (freq, word_counts[cat]), total_word_count)
            word_scores[word] += cat_word_score

    informative_words = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:count]
    return set(w for w, s in informative_words)
def create_word_scores():
    posNegDir = 'D:/ReviewHelpfulnessPrediction/FeatureExtractionModule/SentimentFeature/MachineLearningFeature/SenimentReviewSet'
    posdata = tp.seg_fil_senti_excel(posNegDir + '/pos_review.xlsx', 1, 1,
                                     'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt')
    negdata = tp.seg_fil_senti_excel(posNegDir + '/neg_review.xlsx', 1, 1,
                                     'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt')

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_words_bigrams_scores(): posdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/pos_review.xlsx", 1, 1) negdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/neg_review.xlsx", 1, 1) posWords = list(itertools.chain(*posdata)) negWords = list(itertools.chain(*negdata)) bigram_finder = BigramCollocationFinder.from_words(posWords) bigram_finder = BigramCollocationFinder.from_words(negWords) posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000) negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000) pos = posWords + posBigrams neg = negWords + negBigrams word_fd = FreqDist() cond_word_fd = ConditionalFreqDist() for word in pos: word_fd[word]+=1 cond_word_fd['pos'][word]+=1 for word in neg: word_fd[word]+=1 cond_word_fd['neg'][word]+=1 pos_word_count = cond_word_fd['pos'].N() neg_word_count = cond_word_fd['neg'].N() total_word_count = pos_word_count + neg_word_count word_scores = {} for word, freq in word_fd.iteritems(): pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count) neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count) word_scores[word] = pos_score + neg_score return word_scores
def __setTermsCHISQUARE__(self, size):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in self.reader.words(categories=['pos']):
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1
    for word in self.reader.words(categories=['neg']):
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    wordScores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        wordScores[word] = pos_score + neg_score

    termScore = sorted(wordScores.items(), key=lambda item: item[1], reverse=True)[:size]
    self.terms = [w for (w, s) in termScore]
def _get_bigram_scores(self, posdata, negdata):
    pos_words = list(itertools.chain(*posdata))
    neg_words = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(pos_words)
    neg_bigram_finder = BigramCollocationFinder.from_words(neg_words)
    pos_bigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    neg_bigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = pos_words + pos_bigrams
    neg = neg_words + neg_bigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores():
    posdata = get_word('static/pos.txt')
    negdata = get_word('static/neg.txt')

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    posbigram_finder = BigramCollocationFinder.from_words(posWords)
    negbigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = posbigram_finder.nbest(BigramAssocMeasures.chi_sq, number)
    negBigrams = negbigram_finder.nbest(BigramAssocMeasures.chi_sq, number)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_scores(): posdata = tp.seg_fil_senti_excel( "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/pos_review.xlsx", 1, 1 ) negdata = tp.seg_fil_senti_excel( "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/neg_review.xlsx", 1, 1 ) posWords = list(itertools.chain(*posdata)) negWords = list(itertools.chain(*negdata)) word_fd = FreqDist() cond_word_fd = ConditionalFreqDist() for word in posWords: word_fd[word] += 1 cond_word_fd["pos"][word] += 1 for word in negWords: word_fd[word] += 1 cond_word_fd["neg"][word] += 1 pos_word_count = cond_word_fd["pos"].N() neg_word_count = cond_word_fd["neg"].N() total_word_count = pos_word_count + neg_word_count word_scores = {} for word, freq in word_fd.iteritems(): pos_score = BigramAssocMeasures.chi_sq(cond_word_fd["pos"][word], (freq, pos_word_count), total_word_count) neg_score = BigramAssocMeasures.chi_sq(cond_word_fd["neg"][word], (freq, neg_word_count), total_word_count) word_scores[word] = pos_score + neg_score return word_scores
def clean_train_data_and_find_best_features(self):
    # The top n best unigram features are selected.
    freq_dist_obj = FreqDist()
    cond_freq_dist_obj = ConditionalFreqDist()
    self.book_category_set = set()
    for instance in self.book_instances:
        try:
            raw_data = instance and instance.strip() and instance.strip().split("\t")
            if not raw_data or len(raw_data) != 4:
                continue
            bookid = raw_data[0]
            self.book_category_set.add(bookid)
            features = []
            features.extend(self.clean_book_title(raw_data[2]))
            features.extend(self.clean_author_name(raw_data[3]))
            features.extend(self.bookid_to_toc_dict.get(raw_data[1], []))
            for feat in features:
                freq_dist_obj[feat] += 1
                cond_freq_dist_obj[bookid][feat] += 1
        except Exception:
            self.logging.info("Exception while running this instance %s \n" % instance)

    total_word_count = 0
    for bookid in self.book_category_set:
        total_word_count += cond_freq_dist_obj[bookid].N()

    word_score_dict = {}
    for word, freq in freq_dist_obj.items():
        score = 0
        if word and word.lower() in self.stopwords_set:
            continue
        for bookid in self.book_category_set:
            score += BigramAssocMeasures.chi_sq(cond_freq_dist_obj[bookid][word],
                                                (freq, cond_freq_dist_obj[bookid].N()),
                                                total_word_count)
        word_score_dict[word] = score
    self.select_top_n_best_features(word_score_dict)
def setup():
    global bestwords
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    # Strip consistently so both distributions share the same keys (the
    # original stripped punctuation only in word_fd).
    for word in movie_reviews.words(categories=['pos']):
        word = word.strip('\'"?,.').lower()
        word_fd[word] += 1
        label_word_fd['pos'][word] += 1
    for word in movie_reviews.words(categories=['neg']):
        word = word.strip('\'"?,.').lower()
        word_fd[word] += 1
        label_word_fd['neg'][word] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:10000]
    bestwords = set(w for w, s in best)
    return train(best_bigram_word_features)
def create_bigram_scores():
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1")

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # One finder per label; the original reused a single finder, so the
    # positive bigrams were drawn from the negative words.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)

    pos = posBigrams  # bigrams only, no unigrams
    neg = negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def get_best_words(words_list, num_best_words):
    from nltk.probability import FreqDist, ConditionalFreqDist
    from nltk.metrics import BigramAssocMeasures

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for line, sent in words_list:
        for word in nltk.word_tokenize(line):
            word_fd[word.lower()] += 1
            label_word_fd[sent][word.lower()] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:num_best_words]
    bestwords = set(w for w, s in best)
    return bestwords
def create_word_bigram_scores(pos_corpus, neg_corpus):
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for corpus in pos_corpus:
        for word in corpus:
            word_fd[word] += 1
            cond_word_fd['pos'][word] += 1
    for corpus in neg_corpus:
        for word in corpus:
            word_fd[word] += 1
            cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def best_word_feats(self, words):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in movie_reviews.words(categories=['pos']):
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1
    for word in movie_reviews.words(categories=['neg']):
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1

    # Contingency-table terms passed to chi_sq:
    #   n_ii = label_word_fd[label][word]
    #   n_ix = word_fd[word]
    #   n_xi = label_word_fd[label].N()
    #   n_xx = label_word_fd.N()
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:10000]
    bestwords = set(w for w, s in best)
    return dict((word, True) for word in words if word in bestwords)
def Do_alpha(self):
    """The observed disagreement for the alpha coefficient.

    The alpha coefficient, unlike the other metrics, uses this rather
    than observed agreement.
    """
    total = 0.0
    for i, itemdata in self._grouped_data('item'):
        label_freqs = FreqDist(x['labels'] for x in itemdata)
        for j, nj in label_freqs.items():
            for l, nl in label_freqs.items():
                total += float(nj * nl) * self.distance(l, j)
    ret = (1.0 / float((len(self.I) * len(self.C) * (len(self.C) - 1)))) * total
    log.debug("Observed disagreement: %f", ret)
    return ret
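# For reference (added; not in the original source): with i items, c coders,
# per-item label frequencies n_j, and d(j, l) the label distance, the quantity
# computed above is
#
#     D_o = 1 / (i * c * (c - 1)) * sum_items sum_{j, l} n_j * n_l * d(j, l)
#
# Krippendorff's alpha is then obtained as 1 - D_o / D_e, where D_e is the
# expected disagreement.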
def create_word_scores(posWords, negWords):
    file_scores = open("cn_sample_data/scores.txt", "w")
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[str(word)] += 1
        cond_word_fd['pos'][str(word)] += 1
    for word in negWords:
        word_fd[str(word)] += 1
        cond_word_fd['neg'][str(word)] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][str(word)], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][str(word)], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # Write the scores out, highest first (the original called sorted() but
    # discarded its result and wrote in dict order).
    for key, score in sorted(word_scores.items(), key=lambda item: item[1], reverse=True):
        file_scores.write(str(key) + " : " + str(score) + "\n")
    file_scores.close()
    return word_scores
def __setTermsCHISQUARE__(self, size):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in self.reader.words(categories=['pos']):
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1
    for word in self.reader.words(categories=['neg']):
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    wordScores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        wordScores[word] = pos_score + neg_score

    termScore = sorted(wordScores.items(), key=lambda item: item[1], reverse=True)[:size]
    self.terms = [w for (w, s) in termScore]
def __init__(self):
    ## Best words feature extraction
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in movie_reviews.words(categories=['pos']):
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1
    for word in movie_reviews.words(categories=['neg']):
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:10000]
    self.bestwords = set(w for w, s in best)
    self.train_classifier()
def get_best_words(corpora):
    # Technically, this should all be restricted to the current train set so
    # it doesn't look at labels in the test set.
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for text in corpora['positive']:
        for word in text.tokens:
            word_fd[word.lower()] += 1
            label_word_fd['positive'][word.lower()] += 1
    for text in corpora['negative']:
        for word in text.tokens:
            word_fd[word.lower()] += 1
            label_word_fd['negative'][word.lower()] += 1

    pos_word_count = label_word_fd['positive'].N()
    neg_word_count = label_word_fd['negative'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['positive'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['negative'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:10000]
    bestwords = set(w for w, s in best)
    return bestwords
def Do_alpha(self):
    """The observed disagreement for the alpha coefficient.

    The alpha coefficient, unlike the other metrics, uses this rather
    than observed agreement.
    """
    total = 0.0
    for i, itemdata in self._grouped_data("item"):
        label_freqs = FreqDist(x["labels"] for x in itemdata)
        for j, nj in label_freqs.items():
            for l, nl in label_freqs.items():
                total += float(nj * nl) * self.distance(l, j)
    ret = (1.0 / float((len(self.I) * len(self.C) * (len(self.C) - 1)))) * total
    log.debug("Observed disagreement: %f", ret)
    return ret
def create_word_scores(): posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", 1, 1) negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", 1, 1) posWords = list(itertools.chain(*posdata)) negWords = list(itertools.chain(*negdata)) word_fd = FreqDist() cond_word_fd = ConditionalFreqDist() for word in posWords: word_fd.inc(word) cond_word_fd['pos'].inc(word) for word in negWords: word_fd.inc(word) cond_word_fd['neg'].inc(word) pos_word_count = cond_word_fd['pos'].N() neg_word_count = cond_word_fd['neg'].N() total_word_count = pos_word_count + neg_word_count word_scores = {} for word, freq in word_fd.iteritems(): pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count) neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count) word_scores[word] = pos_score + neg_score return word_scores
def create_word_bigram_scores():
    # Pickle files must be opened in binary mode.
    posdata = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl', 'rb'))
    negdata = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl', 'rb'))

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # One finder per label; the original reused a single finder, so the
    # positive bigrams were drawn from the negative words.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def GetHighInformationWordsChi(num_bestwords):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in movie_reviews.words(categories=['pos']):
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1
    for word in movie_reviews.words(categories=['neg']):
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:num_bestwords]
    bestwords = set(w for w, s in best)
    return bestwords
def create_word_bigram_scores(posdata, negdata):
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finder_pos = BigramCollocationFinder.from_words(posWords)
    bigram_finder_neg = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder_pos.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = bigram_finder_neg.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_scores(posdata, negdata):
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores(posWords, negWords, score_method=BigramAssocMeasures.chi_sq):
    '''Score informativeness using words together with bigram collocations.'''
    bigram_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = bigram_finder.nbest(score_method, 5000)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = bigram_finder.nbest(score_method, 5000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    print("BIGRAM_IN_POSWORD_NUMS : %d\tBIGRAM_IN_NEGWORD_NUMS : %d" % (pos_word_count, neg_word_count))

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = score_method(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = score_method(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_scores(posWords, negWords, score_method=BigramAssocMeasures.chi_sq):
    '''Score the informativeness of individual words (unigrams).'''
    word_fd = FreqDist()                  # frequency of every word
    cond_word_fd = ConditionalFreqDist()  # per-label frequencies for positive and negative texts
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # number of words in positive reviews
    neg_word_count = cond_word_fd['neg'].N()  # number of words in negative reviews
    total_word_count = pos_word_count + neg_word_count
    print("IN_POSWORD_NUMS : %d\tIN_NEGWORD_NUMS : %d" % (pos_word_count, neg_word_count))

    # Chi-squared by default; other measures such as mutual information also work.
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = score_method(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = score_method(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score  # informativeness = positive + negative chi-squared
    return word_scores  # mapping: word -> informativeness
def create_word_scores():
    global POS_Words
    global NEG_Words
    global MIX_Words
    global NEUTRAL_Words
    POS_Words = list(itertools.chain(*POS_Words))
    NEG_Words = list(itertools.chain(*NEG_Words))
    MIX_Words = list(itertools.chain(*MIX_Words))
    NEUTRAL_Words = list(itertools.chain(*NEUTRAL_Words))

    # Build a frequency distribution of all words, plus per-label
    # (positive/negative/mixed/neutral) frequency distributions.
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in POS_Words:
        for key in word:
            word_fd[key.lower()] += 1
            cond_word_fd['+'][key.lower()] += 1
    for word in NEG_Words:
        for key in word:
            word_fd[key.lower()] += 1
            cond_word_fd['-'][key.lower()] += 1
    for word in MIX_Words:
        for key in word:
            word_fd[key.lower()] += 1
            cond_word_fd['*'][key.lower()] += 1
    for word in NEUTRAL_Words:
        for key in word:
            word_fd[key.lower()] += 1
            cond_word_fd['='][key.lower()] += 1

    # Find the per-label word counts as well as the total number of words.
    pos_word_count = cond_word_fd['+'].N()
    neg_word_count = cond_word_fd['-'].N()
    # mix_word_count = cond_word_fd['*'].N()
    neutral_word_count = cond_word_fd['='].N()
    # total_word_count = pos_word_count + neg_word_count + mix_word_count + neutral_word_count
    total_word_count = pos_word_count + neg_word_count + neutral_word_count

    # Build a dictionary of word scores based on the chi-squared test.
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['+'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['-'][word], (freq, neg_word_count), total_word_count)
        # mix_score = BigramAssocMeasures.chi_sq(cond_word_fd['*'][word], (freq, mix_word_count), total_word_count)
        neutral_score = BigramAssocMeasures.chi_sq(cond_word_fd['='][word], (freq, neutral_word_count), total_word_count)
        # word_scores[word] = pos_score + neg_score + mix_score + neutral_score
        word_scores[word] = pos_score + neg_score + neutral_score
    return word_scores
def create_word_scores():
    # Create lists of all positive and negative words.
    posWords = []
    negWords = []
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords.append(posWord)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords.append(negWord)
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    # Build a frequency distribution of all words, plus per-label distributions.
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1

    # Find the number of positive and negative words, as well as the total.
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    # Build a dictionary of word scores based on the chi-squared test.
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores

# Find word scores.
word_scores = create_word_scores()

# Find the best 'number' words based on word scores.
def find_best_words(word_scores, number):
    best_vals = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    best_words = set(w for w, s in best_vals)
    return best_words

# Feature-selection mechanism that only uses the best words.
def best_word_features(words):
    return dict((word, True) for word in words if word in best_words)

# Numbers of features to select.
numbers_to_test = [10, 100, 1000, 10000, 15000]
# Try the best_word_features mechanism with each candidate feature count.
for num in numbers_to_test:
    print('evaluating best %d word features' % (num))
    best_words = find_best_words(word_scores, num)
    evaluate_features(best_word_features)
def bigramAnalysis(self):
    label_word_fd = ConditionalFreqDist()
    word_fd = FreqDist()
    datafiles = [
        {'emo': "Sad", 'name': "/negative.csv"},
        {'emo': "Happy", 'name': "/positive.csv"}
        # , {'emo': 'Happy', 'name': "/trust.csv"}, {'emo': 'Sad', 'name': "/anger.csv"}
    ]
    for value in datafiles:
        emo = value['emo']
        name = value['name']
        read = self.readFile(name)
        normalized_sentences = [s.lower() for s in read['tweets']]
        for statement in normalized_sentences:
            for word in statement.split():
                # The original lowercased into an unused variable; lowercase
                # the word itself before filtering and counting.
                word = word.lower()
                if word not in stopset:
                    word_fd[word] += 1
                    label_word_fd[emo][word] += 1

    word_scores = {}
    pos_word_count = label_word_fd['Happy'].N()
    neg_word_count = label_word_fd['Sad'].N()
    total_word_count = word_fd.N()
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['Happy'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['Sad'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:500]
    self.bestwords = set(w for w, s in best)

    print("\n\nevaluating best word features")
    self.unigramAnalysis(self.best_word_feats)

    print("\n\nBigram + bigram chi_sq word ")
    self.unigramAnalysis(self.best_bigram_word_feats)
def pi(self):
    """Scott 1955; here, multi-pi.

    Equivalent to K from Siegel and Castellan (1988).
    """
    total = 0.0
    label_freqs = FreqDist(x['labels'] for x in self.data)
    for k, f in label_freqs.items():
        total += f ** 2
    Ae = total / float((len(self.I) * len(self.C)) ** 2)
    return (self.avg_Ao() - Ae) / (1 - Ae)
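# For reference (added; not in the original source): avg_Ao() is the average
# observed agreement, and the expected agreement computed above is
#
#     A_e = (sum_k f_k^2) / (i * c)^2
#
# where f_k is the overall frequency of label k across all i * c annotations
# (i items, c coders), giving
#
#     pi = (A_o - A_e) / (1 - A_e)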
def create_word_scores():
    # Create lists of all positive and negative words.
    posWords = []
    negWords = []
    sentences = read_in_tweets(twitter_data)
    random.shuffle(sentences)
    sentences = sentences[:100000]
    posSentences = []
    negSentences = []
    for tup in sentences:
        if tup[0] == '0':
            negSentences.append(tup[1])
        if tup[0] == '4':
            posSentences.append(tup[1])
    for i in posSentences:
        posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
        posWords.append(posWord)
    for i in negSentences:
        negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
        negWords.append(negWord)
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    # Build a frequency distribution of all words, plus per-label distributions.
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1

    # Find the number of positive and negative words, as well as the total.
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    # Build a dictionary of word scores based on the chi-squared test.
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def process_bigrams(conn, polarity, total_word_count, best_words):
    cursor = conn.cursor()
    sql = Statements.GRAM_SQL % polarity
    cursor.execute(sql)
    rows = list(cursor.fetchall())
    l = [x[0] for x in rows]
    words_split = [x.split() for x in l]
    raw_words = [item for sublist in words_split for item in sublist]
    words = []
    for w in raw_words:
        if not (w.startswith("http://") or w.startswith("@")):
            words.append(w)

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for word in words:
        word_fd[word.lower()] += 1
        label_word_fd[polarity][word.lower()] += 1

    pos_word_count = label_word_fd[polarity].N()

    word_scores = {}
    for word, freq in word_fd.items():
        score = BigramAssocMeasures.chi_sq(label_word_fd[polarity][word], (freq, pos_word_count), total_word_count)
        word_scores[word] = score

    best_raw = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:600]
    best = [x[0] for x in best_raw if x[0] not in STOPWORDS and len(x[0]) > 1]
    best_words.update(best)
    best_features = features(best, polarity)

    bigram_finder = BigramCollocationFinder.from_words(words)
    bigram_finder.apply_freq_filter(4)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.pmi, 10)
    bigram_list = []
    for bt in bigrams:
        x = "%s %s" % (bt[0].lower(), bt[1].lower())
        bigram_list.append(x)
    bigram_features = features(bigram_list, polarity)
    best_features += bigram_features

    cursor.close()  # was unreachable after the return in the original
    return best_features
def create_word_scores():
    # Create lists of all positive and negative words.
    posWords = []
    negWords = []
    posRev, negRev = load_file()
    # The original appended the negative reviews to posWords and vice versa;
    # the labels are matched up here.
    for f in word_split(posRev):
        posWords.append(f)
    for f in word_split(negRev):
        negWords.append(f)

    # Build a frequency distribution of all words, plus per-label distributions.
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))
    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1

    # Find the number of positive and negative words, as well as the total.
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    # Build a dictionary of word scores based on the chi-squared test.
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores(): posdata = tp.seg_fil_senti_excel( "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/pos_review.xlsx", 1, 1) negdata = tp.seg_fil_senti_excel( "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/neg_review.xlsx", 1, 1) posWords = list(itertools.chain(*posdata)) negWords = list(itertools.chain(*negdata)) bigram_finder = BigramCollocationFinder.from_words(posWords) bigram_finder = BigramCollocationFinder.from_words(negWords) posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000) negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000) pos = posWords + posBigrams neg = negWords + negBigrams word_fd = FreqDist() cond_word_fd = ConditionalFreqDist() #for word in pos: # word_fd.inc(word) # cond_word_fd['pos'].inc(word) #for word in neg: # word_fd.inc(word) # cond_word_fd['neg'].inc(word) for word in posWords: word_fd[word] += 1 cond_word_fd['pos'][word] += 1 for word in negWords: word_fd[word] += 1 cond_word_fd['neg'][word] += 1 pos_word_count = cond_word_fd['pos'].N() neg_word_count = cond_word_fd['neg'].N() total_word_count = pos_word_count + neg_word_count word_scores = {} for word, freq in word_fd.iteritems(): pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count) neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count) word_scores[word] = pos_score + neg_score return word_scores
def create_word_scores():
    # Create lists of all positive and negative words.
    all_positive_words = []
    all_negative_words = []
    with open(RT_POLARITY_POS_FILE, 'r') as positive_sentences:
        for i in positive_sentences:
            # Strip the words of extra characters (tabs, punctuation, etc.).
            positive_word = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            all_positive_words.append(positive_word)
    with open(RT_POLARITY_NEG_FILE, 'r') as negative_sentences:
        for i in negative_sentences:
            negative_word = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            all_negative_words.append(negative_word)
    all_positive_words = list(itertools.chain(*all_positive_words))
    all_negative_words = list(itertools.chain(*all_negative_words))

    # Build a frequency distribution of all words, plus per-label distributions:
    # how often does each word appear? (The original loops iterated over
    # undefined names; they are corrected here.)
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in all_positive_words:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in all_negative_words:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1

    # Find the number of positive and negative words, as well as the total.
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    # Build a dictionary of word scores based on the chi-squared test.
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_words_bigrams_scores(): posdata = tp.seg_fil_senti_excel( "E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/MachineLearningFeatures/SenimentReviewSet/pos_review.xlsx", 1, 1) negdata = tp.seg_fil_senti_excel( "E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/MachineLearningFeatures/SenimentReviewSet/neg_review.xlsx", 1, 1) posWords = list(itertools.chain(*posdata)) negWords = list(itertools.chain(*negdata)) bigram_finder = BigramCollocationFinder.from_words(posWords) bigram_finder = BigramCollocationFinder.from_words(negWords) posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000) negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000) pos = posWords + posBigrams neg = negWords + negBigrams word_fd = FreqDist() cond_word_fd = ConditionalFreqDist() for word in pos: #word_fd.inc(word) word_fd[word] += 1 #cond_word_fd['pos'].inc(word) cond_word_fd['pos'][word] += 1 for word in neg: #word_fd.inc(word) word_fd[word] += 1 #cond_word_fd['neg'].inc(word) cond_word_fd['neg'][word] += 1 pos_word_count = cond_word_fd['pos'].N() neg_word_count = cond_word_fd['neg'].N() total_word_count = pos_word_count + neg_word_count word_scores = {} for word, freq in word_fd.iteritems(): pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count) neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count) word_scores[word] = pos_score + neg_score return word_scores