def setup():
    global bestwords

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in movie_reviews.words(categories=['pos']):
        word = word.strip('\'"?,.').lower()  # normalize the same way for both distributions
        word_fd[word] += 1
        label_word_fd['pos'][word] += 1

    for word in movie_reviews.words(categories=['neg']):
        word = word.strip('\'"?,.').lower()
        word_fd[word] += 1
        label_word_fd['neg'][word] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}

    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
            (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
            (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:10000]
    bestwords = set(w for w, s in best)
    return train(best_bigram_word_features)
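Every example in this listing scores a word by calling `BigramAssocMeasures.chi_sq(n_ii, (n_ix, n_xi), n_xx)`: `n_ii` is the word's count under one label, `n_ix` its count across all labels, `n_xi` the label's total word count, and `n_xx` the grand total. A minimal self-contained sketch with made-up counts:

from nltk.metrics import BigramAssocMeasures

# Hypothetical counts for the word "great":
n_ii = 60      # occurrences of "great" in positive text
n_ix = 65      # occurrences of "great" across both labels
n_xi = 10000   # total words in positive text
n_xx = 21000   # total words across both labels

score = BigramAssocMeasures.chi_sq(n_ii, (n_ix, n_xi), n_xx)
print(score)   # a higher score means a stronger association with the label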
def create_word_scores():
    posWords = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl', 'rb'))
    negWords = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl', 'rb'))
    
    posWords = list(itertools.chain(*posWords))  # flatten the nested list into a flat word list
    negWords = list(itertools.chain(*negWords))  # likewise

    word_fd = FreqDist()  # counts the frequency of every word
    cond_word_fd = ConditionalFreqDist()  # counts word frequencies within the positive and the negative texts
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # total number of words in positive texts
    neg_word_count = cond_word_fd['neg'].N()  # total number of words in negative texts
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)  # chi-square score of the word under 'pos'; other measures such as PMI would work here too
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)  # likewise for 'neg'
        word_scores[word] = pos_score + neg_score  # a word's informativeness is the sum of its positive and negative chi-square scores

    return word_scores  # maps each word to its informativeness
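Several of the later examples pair this with a selector that keeps only the highest-scoring words as features; a sketch of that step (the helper name is illustrative), assuming the `word_scores` dict returned above:

def select_best_words(word_scores, number):
    # keep the `number` most informative words
    best_vals = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    return set(w for w, s in best_vals)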
Example #3
def create_word_bigram_scores():
    posdata = tp.seg_fil_senti_excel("~", 1, 1)
    negdata = tp.seg_fil_senti_excel("~", 1, 1)
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Example #4
    def store_feature_scores(self):
        """
        Determines word scores based on chi-sq and stores word:score pairs in Redis.
        """
        
        try:
            word_fd = self.pickle_load('word_fd')
            label_word_freqdist = self.pickle_load('label_fd')
        except TypeError:
            print('Requires frequency distributions to be built.')
            return

        word_scores = {}

        pos_word_count = label_word_freqdist['positive'].N()
        neg_word_count = label_word_freqdist['negative'].N()
        total_word_count = pos_word_count + neg_word_count

        for word, freq in word_fd.items():
            pos_score = BigramAssocMeasures.chi_sq(label_word_freqdist['positive'][word], (freq, pos_word_count), total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(label_word_freqdist['negative'][word], (freq, neg_word_count), total_word_count)
            word_scores[word] = pos_score + neg_score

        self.pickle_store('word_scores', word_scores)
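`pickle_load` and `pickle_store` are helpers of the surrounding class that this snippet does not show; a plausible Redis-backed sketch, assuming the same `self.r` Redis client used by the similar class in Example #30 (the method bodies here are assumptions, only the names come from the snippet):

import pickle

def pickle_store(self, key, value):
    # Redis stores bytes, so serialize before writing (assumed implementation)
    self.r.set(key, pickle.dumps(value))

def pickle_load(self, key):
    # self.r.get returns None for a missing key, and pickle.loads(None)
    # raises TypeError -- the error the try/except above catches
    return pickle.loads(self.r.get(key))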
def create_words_bigrams_scores():
    posdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/neg_review.xlsx", 1, 1)
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1

    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Example #6
 def best_word_feats(self, words):
     word_fd = FreqDist()
     label_word_fd = ConditionalFreqDist()
      
     for word in movie_reviews.words(categories=['pos']):
         word_fd[word.lower()] += 1
         label_word_fd['pos'][word.lower()] += 1

     for word in movie_reviews.words(categories=['neg']):
         word_fd[word.lower()] += 1
         label_word_fd['neg'][word.lower()] += 1
      
     # n_ii = label_word_fd[label][word]
     # n_ix = word_fd[word]
     # n_xi = label_word_fd[label].N()
     # n_xx = label_word_fd.N()
      
     pos_word_count = label_word_fd['pos'].N()
     neg_word_count = label_word_fd['neg'].N()
     total_word_count = pos_word_count + neg_word_count
      
     word_scores = {}
      
     for word, freq in word_fd.items():
         pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
             (freq, pos_word_count), total_word_count)
         neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
             (freq, neg_word_count), total_word_count)
         word_scores[word] = pos_score + neg_score
      
     best = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:10000]
     bestwords = set(w for w, s in best)
     return {word: True for word in words if word in bestwords}
Example #7
  def __init__(self):
    ## Best words feature extraction
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
     
    for word in movie_reviews.words(categories=['pos']):
      word_fd[word.lower()] += 1
      label_word_fd['pos'][word.lower()] += 1

    for word in movie_reviews.words(categories=['neg']):
      word_fd[word.lower()] += 1
      label_word_fd['neg'][word.lower()] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
     
    word_scores = {}
     
    for word, freq in word_fd.items():
      pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
        (freq, pos_word_count), total_word_count)
      neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
        (freq, neg_word_count), total_word_count)
      word_scores[word] = pos_score + neg_score
     
    best = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:10000]
    self.bestwords = set(w for w, s in best)
    self.train_classifier()
def computeFreqDistribution():

	if DEBUG:
		print(word_fd)

	pos_word_count = label_word_fd['positive'].N()
	neg_word_count = label_word_fd['negative'].N()
	neu_word_count = label_word_fd['neutral'].N()
	total_word_count = pos_word_count + neg_word_count + neu_word_count

	word_scores = {}

	for word, freq in word_fd.items():
		pos_score = BigramAssocMeasures.chi_sq(label_word_fd['positive'][word], (freq, pos_word_count), total_word_count)
		neg_score = BigramAssocMeasures.chi_sq(label_word_fd['negative'][word], (freq, neg_word_count), total_word_count)
		neu_score = BigramAssocMeasures.chi_sq(label_word_fd['neutral'][word], (freq, neu_word_count), total_word_count)
		word_scores[word] = pos_score + neg_score + neu_score

	if DEBUG:
		print(json.dumps(word_scores, indent=4))

	threshold = 2

	temp = []

	for item in word_scores:
		if word_scores[item] > threshold:
			temp.append(item)

	if DEBUG:
		print(temp)

	return temp
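`word_fd` and `label_word_fd` are module-level distributions built elsewhere in that script; a minimal sketch of how they could be populated, assuming tokenized input and the 'positive'/'negative'/'neutral' labels used above (the helper name is illustrative):

from nltk.probability import FreqDist, ConditionalFreqDist

word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

def addTokens(tokens, label):
    # label is one of 'positive', 'negative', 'neutral'
    for word in tokens:
        word_fd[word.lower()] += 1
        label_word_fd[label][word.lower()] += 1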
def create_word_bigram_scores():
    posdata = tp.seg_fil_txt("/home/hadoop/goodnew.txt")
    negdata = tp.seg_fil_txt("/home/hadoop/badnew.txt")
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 350000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 350000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_bigram_scores():
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1")
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)

    pos = posBigrams
    neg = negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Example #11
   def _computeInstanceInformativeWords(self, cf_dist=None, f_dist=None):
      '''using the chi-square measure, computes and returns the words
         that contribute the most significant information, i.e. words
         that are mostly unique to each set (positive, negative)'''

      buff = self._loadData('informative_words.bin')
      if buff:
         self.informative_words = buff
         return
      elif cf_dist is None or f_dist is None:
         self.informative_words = dict()
         return

      total_num_words = f_dist.N()
      total_positive_words = cf_dist["positive"].N()
      total_negative_words = cf_dist["negative"].N()
      words_score = dict()
        
      for word in f_dist.keys():
         pos_score = BigramAssocMeasures.chi_sq(cf_dist["positive"][word],
                                    (f_dist[word], total_positive_words),
                                    total_num_words)
         neg_score = BigramAssocMeasures.chi_sq(cf_dist["negative"][word],
                                    (f_dist[word], total_negative_words),
                                    total_num_words)


         words_score[word] = pos_score + neg_score

      #Return 1% most useful words 
      self.informative_words = dict(sorted(words_score.items(),
                                 key=lambda item: item[1],
                                 reverse=True)[:int(0.01*len(words_score))])

      self._saveData('informative_words.bin',self.informative_words)
def create_word_bigram_scores(posWords, negWords):
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)

    pos = posWords + posBigrams  # unigrams plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[str(word)] += 1
        cond_word_fd['pos'][str(word)] += 1
    for word in neg:
        word_fd[str(word)] += 1
        cond_word_fd['neg'][str(word)] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Example #13
    def _get_bigram_scores(self, posdata, negdata):
        pos_words = list(itertools.chain(*posdata))
        neg_words = list(itertools.chain(*negdata))

        pos_bigram_finder = BigramCollocationFinder.from_words(pos_words)
        neg_bigram_finder = BigramCollocationFinder.from_words(neg_words)
        pos_bigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
        neg_bigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

        pos = pos_words + pos_bigrams
        neg = neg_words + neg_bigrams

        word_fd = FreqDist()
        cond_word_fd = ConditionalFreqDist()
        for word in pos:
            word_fd[word] += 1
            cond_word_fd['pos'][word] += 1
        for word in neg:
            word_fd[word] += 1
            cond_word_fd['neg'][word] += 1

        pos_word_count = cond_word_fd['pos'].N()
        neg_word_count = cond_word_fd['neg'].N()
        total_word_count = pos_word_count + neg_word_count

        word_scores = {}
        for word, freq in word_fd.items():
            pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
            word_scores[word] = pos_score + neg_score

        return word_scores
def get_bestwords(contents, labels, limit = 10000, n = None, cache = True):
    if cache:
        if n:
            cache_path = 'cache/%s_%s.pkl' % (limit, n)
            if os.path.exists(cache_path):
                bestwords = pickle.load(open(cache_path, 'rb'))
                print('Loaded from cache')
                print('bestwords count = %d' % len(bestwords))
                return bestwords
    
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    
    pos_contents = contents[labels == 1]
    neg_contents = contents[labels == 0]
    
    pos_words = set()
    neg_words = set()
    
    for pos_content in pos_contents:
        pos_words = pos_words.union(word_tokenize(pos_content))
    
    for neg_content in neg_contents:
        neg_words = neg_words.union(word_tokenize(neg_content))
    
    for word in pos_words:
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1

    for word in neg_words:
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1
    
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    
    word_scores = {}
    
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
            (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
            (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    
    best = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:limit]
    bestwords = set(w for w, s in best)
    
    print('all words count = %d' % len(word_scores))
    print('bestwords count = %d' % len(bestwords))
    
    if cache:
        if n:
            cache_path = 'cache/%s_%s.pkl' % (limit, n)
            f = open(cache_path, 'wb')
            pickle.dump(bestwords, f)
            f.close()
            print('Dumped to cache')
    
    return bestwords
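A usage sketch for `get_bestwords`: the boolean indexing above implies array-like inputs, so a pandas Series of texts and a NumPy label array are assumed here:

import numpy as np
import pandas as pd

contents = pd.Series(["loved every minute", "dull and predictable"])
labels = np.array([1, 0])
bestwords = get_bestwords(contents, labels, limit=1000, cache=False)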
    def __setTermsCHISQUARE__(self,size):
        word_fd = FreqDist()
        label_word_fd = ConditionalFreqDist()
        
        for word in self.reader.words(categories=['pos']):
            word_fd[word.lower()] += 1
            label_word_fd['pos'][word.lower()] += 1

        for word in self.reader.words(categories=['neg']):
            word_fd[word.lower()] += 1
            label_word_fd['neg'][word.lower()] += 1
            
        pos_word_count = label_word_fd['pos'].N()
        neg_word_count = label_word_fd['neg'].N()
        total_word_count = pos_word_count + neg_word_count

        wordScores = {}
        
        for word, freq in word_fd.items():
            pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                                   (freq, pos_word_count), total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                                   (freq, neg_word_count), total_word_count)
            wordScores[word] = pos_score + neg_score

        termScore = sorted(wordScores.items(), key=lambda ws: ws[1], reverse=True)[:size]
        self.terms = [w for (w, s) in termScore]
def create_word_scores():
    posdata = tp.seg_fil_senti_excel(
        "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/pos_review.xlsx", 1, 1
    )
    negdata = tp.seg_fil_senti_excel(
        "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/neg_review.xlsx", 1, 1
    )

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd["pos"][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd["neg"][word] += 1

    pos_word_count = cond_word_fd["pos"].N()
    neg_word_count = cond_word_fd["neg"].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd["pos"][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd["neg"][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Example #17
def create_word_scores(sentences):
    # logging.info(sentences)
    words = list(itertools.chain(*sentences))
    # logging.info(words)

    #build frequency distribution of all words and then frequency distributions of words within positive and negative labels
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in words:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1
        
    #finds the number of positive and negative words, as well as the total number of words
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    #builds dictionary of word scores based on chi-squared test
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_word_bigram_scores():
    posdata = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl', 'rb'))
    negdata = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl', 'rb'))
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams  # unigrams plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Example #19
def get_best_words(words_list, num_best_words):
	from nltk.probability import FreqDist, ConditionalFreqDist
	from nltk.metrics import BigramAssocMeasures


	word_fd = FreqDist()
	label_word_fd = ConditionalFreqDist()

	for pair in words_list:
		line,sent = pair
		for word in nltk.word_tokenize(line):
			word_fd[word.lower()] += 1
			label_word_fd[sent][word.lower()] += 1

	pos_word_count = label_word_fd['pos'].N()
	neg_word_count = label_word_fd['neg'].N()
	total_word_count = pos_word_count + neg_word_count


	word_scores = {}
	for word, freq in word_fd.items():
		pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],(freq, pos_word_count),total_word_count)
		neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],(freq, neg_word_count),total_word_count)
		word_scores[word] = pos_score + neg_score
 
	best = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:num_best_words]
	bestwords = set(w for w, s in best)

	return bestwords
def getBestWords(posWords, negWords):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in posWords:
        word_fd[word.lower()] += 1
        label_word_fd["pos"][word.lower()] += 1

    for word in negWords:
        word_fd[word.lower()] += 1
        label_word_fd["neg"][word.lower()] += 1

    pos_word_count = label_word_fd["pos"].N()
    neg_word_count = label_word_fd["neg"].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}

    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd["pos"][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd["neg"][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:10000]
    sorted_x = sorted(word_scores.items(), key=operator.itemgetter(1), reverse=True)
    bestwords = set([w for w, s in sorted_x])

    return bestwords
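These best-word sets are typically consumed by a feature extractor for an NLTK classifier, as several of the surrounding examples do; a hypothetical helper in that style:

def best_word_features(words, bestwords):
    # mark only the informative words as present
    return {word: True for word in words if word in bestwords}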
Example #21
def create_word_scores():

    posWords = list(itertools.chain(*datap))  # flatten the nested list into a flat word list
    negWords = list(itertools.chain(*datan))  # likewise

    word_fd = nltk.FreqDist()
    cond_word_fd = ConditionalFreqDist()  # counts word frequencies within the positive and the negative texts
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # total number of words in positive texts
    neg_word_count = cond_word_fd['neg'].N()  # total number of words in negative texts
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)  # chi-square score under 'pos'; other measures such as PMI would work here too
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)  # likewise for 'neg'
        word_scores[word] = pos_score + neg_score  # a word's informativeness is the sum of its positive and negative chi-square scores

    return word_scores  # maps each word to its informativeness
def create_word_scores():
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", 1, 1)
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Example #23
def getWordScores():
    posWords = []
    negWords = []
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords.append(posWord)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords.append(negWord)
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def high_words(posids, negids, cutoff, score_fn=BigramAssocMeasures.chi_sq, min_score=5):

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    pos = 0
    neg = 0
    for review in posids:
        pos += 1
        if (pos != cutoff):
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd['pos'].update(token_helpers.tokenize_simple(word))
 
    for review in negids:
        neg += 1
        if (neg != cutoff):
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd['neg'].update(token_helpers.tokenize_simple(word))
    
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best = sorted(word_scores.items(), key=itemgetter(1), reverse=True)[:10000]
    bestwords = set([w for w, s in best])
    return bestwords
    
    """
def create_word_scores(posWords, negWords):
    file_scores = open("cn_sample_data/scores.txt", "w")
    # iterate and merge multiple sequences
    
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[str(word)] += 1
        cond_word_fd['pos'][str(word)] += 1
    for word in negWords:
        word_fd[str(word)] += 1
        cond_word_fd['neg'][str(word)] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][str(word)], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][str(word)], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    for key, score in sorted(word_scores.items(), key=lambda item: item[1], reverse=True):
        file_scores.write(str(key) + " : " + str(score) + "\n")
    file_scores.close()
    return word_scores 
	def create_word_scores(self):
		[posWords, negWords] = self.getAllWords()
		
		posWords = list(itertools.chain(*posWords))
		negWords = list(itertools.chain(*negWords))

		word_fd = FreqDist()
		cond_word_fd = ConditionalFreqDist()
		for word in posWords:
			word_fd[word] += 1
			cond_word_fd['pos'][word] += 1
		for word in negWords:
			word_fd[word] += 1
			cond_word_fd['neg'][word] += 1

		pos_word_count = cond_word_fd['pos'].N()
		neg_word_count = cond_word_fd['neg'].N()
		total_word_count = pos_word_count + neg_word_count

		log("Total number of words: %d" % total_word_count)

		word_scores = {}
		for word, freq in word_fd.items():
			pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
			neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
			word_scores[word] = pos_score + neg_score

		return word_scores
    def get_ranked_ngrams(self, wlist="all", pos=True):
        """
                turn ngram into term: chi_sq associatoin metric
        """

        word_fd = nltk.FreqDist()
        tag_fd = nltk.ConditionalFreqDist()
        for key, tweet in self.tweets.items():
            word_list = self.get_selected_text(tweet)
            label = self.instances[key].label
            for ngram in word_list:
                # do we want the tag here
                word_fd[ngram] += 1
                tag_fd[label][ngram] += 1

        num_pos = tag_fd["positive"].N()
        num_neg = tag_fd["negative"].N()
        # num_neu = tag_fd["neutral"].N() # ignore neutral tweets
        ngram_dict = {}

        total = num_pos + num_neg  # + num_neu
        for ngram, frequency in word_fd.items():
            try:
                # build chi_sq metrics for both positive and negative tags
                pos_metric = BigramAssocMeasures.chi_sq(
                    tag_fd['positive'][ngram], (frequency, num_pos), total)
                neg_metric = BigramAssocMeasures.chi_sq(
                    tag_fd['negative'][ngram], (frequency, num_neg), total)

                #neu_metric = BigramAssocMeasures.chi_sq(tag_fd['neutral'][ngram],(frequency,num_neu),total)
                score = pos_metric + neg_metric
                ngram_dict[ngram] = score  # append score
            except Exception:
                continue
        return ngram_dict
        def GetHighInformationWordsChi(num_bestwords):
            word_fd = FreqDist()
            label_word_fd = ConditionalFreqDist()
 
            for word in movie_reviews.words(categories=['pos']):
                word_fd[word.lower()] +=1
                label_word_fd['pos'][word.lower()] +=1
 
            for word in movie_reviews.words(categories=['neg']):
                word_fd[word.lower()] +=1
                label_word_fd['neg'][word.lower()] +=1
 
            pos_word_count = label_word_fd['pos'].N()
            neg_word_count = label_word_fd['neg'].N()
            total_word_count = pos_word_count + neg_word_count
 
            word_scores = {}
 
            for word, freq in word_fd.items():
                pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                    (freq, pos_word_count), total_word_count)
                neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                    (freq, neg_word_count), total_word_count)
                word_scores[word] = pos_score + neg_score
 
            best = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:num_bestwords]
            bestwords = set(w for w, s in best)
            return bestwords
def create_word_scores(posWords,negWords,posTag,negTag):
    from nltk.probability import FreqDist, ConditionalFreqDist
    import itertools 
    posWords = list(itertools.chain(*posWords))  # flatten the nested list into a flat word list
    negWords = list(itertools.chain(*negWords))  # likewise

    word_fd = FreqDist()  # counts the frequency of every word
    cond_word_fd = ConditionalFreqDist()  # counts word frequencies within the positive and the negative texts
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd[posTag][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd[negTag][word] += 1

    pos_word_count = cond_word_fd[posTag].N()  # total number of words under the positive tag
    neg_word_count = cond_word_fd[negTag].N()  # total number of words under the negative tag
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd[posTag][word], (freq, pos_word_count), total_word_count)  # chi-square score under the positive tag; other measures such as PMI would work here too
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd[negTag][word], (freq, neg_word_count), total_word_count)  # likewise for the negative tag
        word_scores[word] = pos_score + neg_score  # a word's informativeness is the sum of both chi-square scores

    return word_scores  # maps each word to its informativeness
Example #30
    def store_word_scores(self):
        """
        Stores 'word scores' into Redis.
        """
        
        try:
            word_freqdist = pickle.loads(self.r.get('word_fd'))
            label_word_freqdist = pickle.loads(self.r.get('label_fd'))
        except TypeError:
            print('Requires frequency distributions to be built.')
            return

        word_scores = {}

        pos_word_count = label_word_freqdist['pos'].N()
        neg_word_count = label_word_freqdist['neg'].N()
        total_word_count = pos_word_count + neg_word_count

        for word, freq in word_freqdist.items():
            pos_score = BigramAssocMeasures.chi_sq(label_word_freqdist['pos'][word], (freq, pos_word_count), total_word_count)

            neg_score = BigramAssocMeasures.chi_sq(label_word_freqdist['neg'][word], (freq, neg_word_count), total_word_count)

            word_scores[word] = pos_score + neg_score
        
        self.r.set('word_scores', pickle.dumps(word_scores))  # Redis stores bytes, so serialize the dict as the reads above expect
def create_word_scores():
    posWords = pickle.load(
        open(
            '/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl',
            'rb'))
    negWords = pickle.load(
        open(
            '/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl',
            'rb'))

    posWords = list(itertools.chain(*posWords))  # flatten the nested list into a flat word list
    negWords = list(itertools.chain(*negWords))  # likewise

    word_fd = FreqDist()  # counts the frequency of every word
    cond_word_fd = ConditionalFreqDist()  # counts word frequencies within the positive and the negative texts
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # total number of words in positive texts
    neg_word_count = cond_word_fd['neg'].N()  # total number of words in negative texts
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count),
            total_word_count)  # chi-square score under 'pos'; other measures such as PMI would work here too
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)  # likewise for 'neg'
        word_scores[word] = pos_score + neg_score  # a word's informativeness is the sum of both chi-square scores

    return word_scores  # maps each word to its informativeness
Example #32
def create_word_bigram_scores():
    posNegDir = 'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\MachineLearningFeature\SenimentReviewSet'
    posdata = tp.seg_fil_senti_excel(posNegDir + '/pos_review.xlsx', 1, 1,
                                     'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt')
    negdata = tp.seg_fil_senti_excel(posNegDir + '/neg_review.xlsx', 1, 1,
                                     'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt')

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Example #33
def scores():
    posWords = []
    negWords = []
    with open('pos.txt', 'r') as posSentences:
        for i in posSentences:
            posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWord = bigram_words(posWord, score_fn=BigramAssocMeasures.chi_sq, n=1000)
            posWords.append(posWord)
    with open('neg.txt', 'r') as negSentences:
        for i in negSentences:
            negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWord = bigram_words(negWord, score_fn=BigramAssocMeasures.chi_sq, n=1000)
            negWords.append(negWord)
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    # finds the number of positive and negative words, as well as the total number of words
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    # builds dictionary of word scores based on chi-squared test
    featureScore = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        featureScore[word] = pos_score + neg_score

    return featureScore
def find_best_words(positiveWords, negativeWords, dimension_num):
    # positiveWords = word_tokenize(positiveWords)
    # negativeWords = word_tokenize(negativeWords)
    space = ' '
    positiveWords = word_tokenize(space.join(positiveWords))
    negativeWords = word_tokenize(space.join(negativeWords))

    scoreF = BigramAssocMeasures.chi_sq

    posBigrams = BCF.from_words(positiveWords).nbest(scoreF, 5000)
    negBigrams = BCF.from_words(negativeWords).nbest(scoreF, 5000)

    pos = positiveWords + posBigrams
    neg = negativeWords + negBigrams

    all_words = pos + neg
    word_fd = FreqDist(all_words)
    pos_word_fd = FreqDist(pos)
    neg_word_fd = FreqDist(neg)

    pos_word_count = pos_word_fd.N()
    neg_word_count = neg_word_fd.N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(pos_word_fd[word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(neg_word_fd[word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    best_vals = sorted(word_scores, key=lambda k: word_scores[k],
                       reverse=True)[:dimension_num]
    return best_vals
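A usage sketch, assuming lists of raw word strings as inputs (the function joins and re-tokenizes them):

pos_tokens = ["great", "acting", "loved", "it"]
neg_tokens = ["boring", "plot", "hated", "it"]
top_features = find_best_words(pos_tokens, neg_tokens, dimension_num=1000)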
Example #35
def create_word_scores():
	# creates lists of all positive and negative words
	posWords = []
	negWords = []
	with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
		for i in posSentences:
			posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
			posWords.append(posWord)
	with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
		for i in negSentences:
			negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
			negWords.append(negWord)
	posWords = list(itertools.chain(*posWords))
	negWords = list(itertools.chain(*negWords))

	# build frequency distribution of all words and then frequency distributions of words within positive and negative labels
	word_fd = FreqDist()
	cond_word_fd = ConditionalFreqDist()
	for word in posWords:
		word_fd[word.lower()] += 1
		cond_word_fd['pos'][word.lower()] += 1
	for word in negWords:
		word_fd[word.lower()] += 1
		cond_word_fd['neg'][word.lower()] += 1

	# finds the number of positive and negative words, as well as the total number of words
	pos_word_count = cond_word_fd['pos'].N()
	neg_word_count = cond_word_fd['neg'].N()
	total_word_count = pos_word_count + neg_word_count

	# builds dictionary of word scores based on chi-squared test
	word_scores = {}
	for word, freq in word_fd.items():
		pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
		neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
		word_scores[word] = pos_score + neg_score

	return word_scores
Example #36
def getWordScores():
    posWords = []
    negWords = []
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords.append(posWord)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords.append(negWord)
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Example #37
def create_word_scores():
    #build frequency distribution of all words and then frequency distributions of words within positive and negative labels
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in app_reviews.words(categories=['pos']):
        if word not in stopset and not (word.isnumeric()) and word.isalpha():
            word_fd[lemmatizer.lemmatize(word)] += 1
            label_word_fd['pos'][lemmatizer.lemmatize(word)] += 1

    for word in app_reviews.words(categories=['neg']):
        if word not in stopset and not (word.isnumeric()) and word.isalpha():
            word_fd[lemmatizer.lemmatize(word)] += 1
            label_word_fd['neg'][lemmatizer.lemmatize(word)] += 1

    # n_ii = label_word_fd[label][word]
    # n_ix = word_fd[word]
    # n_xi = label_word_fd[label].N()
    # n_xx = label_word_fd.N()

    #finds the number of positive and negative words, as well as the total number of words
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    #builds dictionary of word scores based on chi-squared test
    word_scores = {}

    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Example #38
    def create_bestwords(self):
        word_fd = FreqDist()
        label_word_fd = ConditionalFreqDist()
        score_fn = BigramAssocMeasures.chi_sq

        for index, row in self.df.iterrows():
            # bigram_finder = BigramCollocationFinder.from_words(row['filtered'])
            for word in row['filtered']:
                word_fd[word] += 1
                label_word_fd[row['obltrans_pz']][word] += 1
            # for bigram in bigrams:
            #     word_fd[bigram] += 1
            #     label_word_fd['pos'][bigram] += 1

        word_count = {}
        total_word_count = 0
        for label in self.label_list:
            word_count[label] = label_word_fd[label].N()
            total_word_count += label_word_fd[label].N()

        word_total_scores = {}
        for word, freq in word_fd.items():
            word_total_scores[word] = 0
            word_label_scores = {}
            for label in self.label_list:
                if label_word_fd[label][word] == 0:
                    continue

                # print(label_word_fd[label][word])
                # print(word_count[label])
                # print(total_word_count)
                word_label_scores[label] = BigramAssocMeasures.chi_sq(
                    label_word_fd[label][word], (freq, word_count[label]),
                    total_word_count)
                word_total_scores[word] += word_label_scores[label]

        best = sorted(word_total_scores.items(),
                      key=lambda tup: tup[1],
                      reverse=True)[:1000]

        print(best)
        bestwords = set([w for w, s in best])
        self.bestwords = bestwords
        print(self.bestwords)

        print(total_word_count)
        print(word_fd['cz0035'])
        for label in self.label_list:
            print(label_word_fd[label]['cz0035'])
            print(word_count[label])
Example #39
def create_bigram_scores():
    posdata = pickle.load(open('pos_review.pkl', 'rb'))
    negdata = pickle.load(open('neg_review.pkl', 'rb'))

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 10000)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 10000)

    word_fd = FreqDist()  # counts the frequency of every bigram
    cond_word_fd = ConditionalFreqDist()  # counts bigram frequencies within the positive and the negative texts
    for word in posBigrams:
        word_fd[word] += 1
        cond_word_fd["pos"][word] += 1
    for word in negBigrams:
        word_fd[word] += 1
        cond_word_fd["neg"][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def best_word_feats(tweets, labels):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    tokenizer = TweetTokenizer()
    tweets = [tokenizer.tokenize(tweet) for tweet in tweets]

    for tweet, label in zip(tweets, labels):
        for word in tweet:
            word_fd[word.lower()] += 1
            if label == 0:
                label_word_fd['0'][word.lower()] += 1
            else:
                label_word_fd['4'][word.lower()] += 1

    total_word_count = word_fd.N()
    pos_word_count = label_word_fd['4'].N()
    neg_word_count = label_word_fd['0'].N()

    word_scores = {}

    for (word, freq) in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['4'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['0'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    best_words = [
        word
        for (word, score
             ) in sorted(word_scores.items(), key=itemgetter(1), reverse=True)
    ][:50000]

    return best_words
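A usage sketch; the label lookups above follow the 0 = negative, 4 = positive convention (as in the Sentiment140 corpus):

tweets = ["I love this phone", "worst service ever"]
labels = [4, 0]
best_words = best_word_feats(tweets, labels)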
def create_word_bigram_scores():
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_pos_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = bigram_pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    bigram_neg_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = bigram_neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Example #42
def jieba_feature(number):
    posWords = []
    negWords = []
    for items in str1:
        for item in items:
            posWords.append(item)
    for items in str2:
        for item in items:
            negWords.append(item)
    word_fd = FreqDist()  # counts the frequency of every word
    con_word_fd = ConditionalFreqDist()  # counts word frequencies within the positive and the negative texts
    for word in posWords:
        word_fd[word] += 1
        con_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        con_word_fd['neg'][word] += 1
    pos_word_count = con_word_fd['pos'].N()  # number of positive words
    neg_word_count = con_word_fd['neg'].N()  # number of negative words
    total_word_count = pos_word_count + neg_word_count
    # a word's informativeness is the sum of its positive and negative chi-square scores
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(con_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(con_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    # sort once, after all scores have been computed
    best_vals = sorted(word_scores.items(),
                       key=lambda item: item[1],
                       reverse=True)[:number]
    best_words = set(w for w, s in best_vals)
    return dict([(word, True) for word in best_words])
def create_word_scores(posWords,negWords,objWords):

    word_fd = FreqDist()  # counts the frequency of every word
    print(type(word_fd))
    cond_word_fd = ConditionalFreqDist()  # counts word frequencies within the positive, negative and objective texts
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    for word in objWords:
        word_fd[word] += 1
        cond_word_fd['obj'][word] += 1

    pos_word_count = cond_word_fd['pos'].N() #积极词的数量
    neg_word_count = cond_word_fd['neg'].N() #消极词的数量
    obj_word_count = cond_word_fd['obj'].N() #中性词的数量
    total_word_count = pos_word_count + neg_word_count + obj_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        #计算积极词的卡方统计量,这里也可以计算互信息等其它统计量
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count) 
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count) 
        obj_score = BigramAssocMeasures.chi_sq(cond_word_fd['obj'][word], (freq, obj_word_count), total_word_count) 
        #一个词的信息量等于积极卡方统计量加上消极卡方统计量
        word_scores[word] = pos_score + neg_score + obj_score

    return word_scores #包括了每个词和这个词的信息量
    def compute_word_scores(self):
        # Core module which assigns scores to features; top features are selected based on this score.

        freq_dist_obj = FreqDist()
        cond_freq_dist_obj = ConditionalFreqDist()

        # Iterating over pos reviews, to calculate scores for pos feats
        for review in self.pos_reviews_list:
            review_words = self.apply_preprocessing(review)
            for word in review_words:
                freq_dist_obj[word] += 1
                cond_freq_dist_obj['pos'][word] += 1

        # Iterating over neg reviews, to calculate scores for neg feats
        for review in self.neg_reviews_list:
            review_words = self.apply_preprocessing(review)
            for word in review_words:
                freq_dist_obj[word] += 1
                cond_freq_dist_obj['neg'][word] += 1

        pos_word_count = cond_freq_dist_obj['pos'].N()
        neg_word_count = cond_freq_dist_obj['neg'].N()
        total_word_count = pos_word_count + neg_word_count

        word_score_dict = {}

        # Finding the scores using chi square

        for word, freq in freq_dist_obj.items():
            pos_score = BigramAssocMeasures.chi_sq(cond_freq_dist_obj['pos'][word], (freq, pos_word_count), total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(cond_freq_dist_obj['neg'][word], (freq, neg_word_count), total_word_count)
            word_score_dict[word] = pos_score + neg_score

        self.best = sorted(word_score_dict.items(), key=operator.itemgetter(1), reverse=True)
Beispiel #45
0
    def __init__(self, pos, neg):

        self.posFeatures = list(itertools.chain(*pos))
        self.negFeatures = list(itertools.chain(*neg))

        # build a frequency distribution of all words, then frequency
        # distributions of words within the positive and negative labels
        word_fd = FreqDist()
        cond_word_fd = ConditionalFreqDist()
        for word in tqdm(self.posFeatures):
            word_fd[word] += 1
            cond_word_fd['pos'][word] += 1
        for word in tqdm(self.negFeatures):
            word_fd[word] += 1
            cond_word_fd['neg'][word] += 1

        # find the number of positive and negative words, as well as the total number of words
        pos_word_count = cond_word_fd['pos'].N()
        neg_word_count = cond_word_fd['neg'].N()
        total_word_count = pos_word_count + neg_word_count
        self.word_scores = {}
        for word, freq in word_fd.items():
            pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
            self.word_scores[word] = pos_score + neg_score
Beispiel #46
0
def findtopbigrams(bigrams,word_fd,settings):
    nkey = settings['nkey']
    measure = settings['measure']

    bigram_measures = BigramAssocMeasures()
    bigram_fd = FreqDist(bigrams)
    finder = BigramCollocationFinder(word_fd, bigram_fd)

    warning = ""

    if measure == "LR":
        try:
            top_bigrams = finder.nbest(bigram_measures.likelihood_ratio, nkey)
        except:
            warning = "Problem with LR measure. Default to simple frequency (RAW setting)"
            print(warning)
            top_bigrams = finder.nbest(bigram_measures.raw_freq, nkey)
    elif measure == "PMI":
        try:
            top_bigrams = finder.nbest(bigram_measures.pmi, nkey)
        except:
            warning = "Problem with PMI measure. Default to simple frequency (RAW setting)"
            print(warning)
            top_bigrams = finder.nbest(bigram_measures.raw_freq, nkey)
    elif measure == "CHISQ":
        try:
            top_bigrams = finder.nbest(bigram_measures.chi_sq, nkey)
        except:
            warning = "Problem with CHISQ measure. Default to simple frequency (RAW setting)"
            print(warning)
            top_bigrams = finder.nbest(bigram_measures.raw_freq, nkey)
    elif measure == "STUDT":
        try:
            top_bigrams = finder.nbest(bigram_measures.student_t, nkey)
        except:
            warning = "Problem with STUDT measure. Default to simple frequency (RAW setting)"
            print(warning)
            top_bigrams = finder.nbest(bigram_measures.raw_freq, nkey)
    else:
        top_bigrams = finder.nbest(bigram_measures.raw_freq, nkey)


    # bigrams are scored with LR or a similar measure, but it is more helpful to the
    # end user to see raw counts (the measure actually used is explained in the tool tip)
    top_bg_with_count = sorted([(bg, count) for (bg, count) in finder.ngram_fd.items() if bg in top_bigrams], key=lambda bgcount:-bgcount[1])
    top_bigrams = [(bg, count) for (bg, count) in top_bg_with_count if count > 1 and bg[0]!=bg[1]]
    return top_bigrams, bigram_fd, warning
def bi_collocations(tokens, num=20):
    from nltk.corpus import stopwords
    ignored_words = stopwords.words('english')

    word_list = [word for sent in tokens for word in sent]
    finder = BigramCollocationFinder.from_words(word_list, 2)
    finder.apply_freq_filter(3)

    finder.apply_ngram_filter(lambda w1, w2:
                                  len(w1) < 3 \
                                  or len(w2) < 3 \
                                  or (len(w1)+len(w2)) < 8 \
                                  or w1.lower() in ignored_words \
                                  or w2.lower() in ignored_words)  # would ideally keep short domain terms such as 'rf pulse'
    bigram_measures = BigramAssocMeasures()
    collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    return collocations
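# Usage sketch (my own toy sentence tokens; requires the NLTK stopwords corpus):
sents = [['magnetic', 'resonance', 'imaging'], ['magnetic', 'resonance', 'signal']] * 3
print(bi_collocations(sents, num=5))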
Beispiel #48
0
    def get_most_common_ngrams(self, n, nb_ngrams=None):
        """
        Compute and return the set of the most common ngrams in the documents.
        This set is cached inside the object.

        Args:
            n: The number of grams. Must be a positive integer.
            nb_ngrams: The number of ngrams to return, i.e. quantifying the 'most'.

        Returns:
            A list of the most common ngrams.
        """
        try:
            # return cached value
            return self._most_common_ngrams[n]
        except KeyError:
            pass

        # compute all ngrams
        all_ngrams = []
        for document in self.training_set["hits"]["hits"]:
            if document["_source"]["external_review_report"] is not None:
                all_ngrams.extend(self.compute_ngrams(document["_source"]["external_review_report"], n))
            if document["_source"]["external_review_form"] is not None:
                all_ngrams.extend(self.compute_ngrams(document["_source"]["external_review_form"], n))


        # get the frequency or return all ngrams
        freq = FreqDist(ngram for ngram in all_ngrams)
        # store and return the nb_ngrams most common ngrams
        word_scores = {}
        if nb_ngrams:
            self._most_common_ngrams[n] = [ng for ng, _ in freq.most_common(nb_ngrams)]
            for word, freqs in freq.items():
                score = BigramAssocMeasures.chi_sq(freq[word], (freqs, freq.N()), freq.N() + freq.N())
                word_scores[word] = score

            self.best = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:n]
            self.bestwords = set(w for w, s in self.best)
            return self.bestwords
        self._most_common_ngrams[n] = list(freq.keys())
        return self._most_common_ngrams[n]
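# Usage sketch (`analyzer` is a hypothetical instance of the surrounding class):
# with n=2 and nb_ngrams set, the method caches the most common bigrams and
# returns the chi-square-selected set:
# best_bigrams = analyzer.get_most_common_ngrams(2, nb_ngrams=500)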
def process_bigrams(conn, polarity, total_word_count, best_words):
    cursor = conn.cursor()
    sql = Statements.GRAM_SQL % polarity
    cursor.execute(sql)

    rows = list(cursor.fetchall())
    l = [x[0] for x in rows]
    words_split = [s.split() for s in l]
    raw_words = [item for sublist in words_split for item in sublist]

    words = []
    for w in raw_words:
        if not (w.startswith("http://") or w.startswith("@")):
            words.append(w)

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in words:
        word_fd[word.lower()] += 1
        label_word_fd[polarity][word.lower()] += 1

    pos_word_count = label_word_fd[polarity].N()

    word_scores = {}

    for word, freq in word_fd.items():
        score = BigramAssocMeasures.chi_sq(label_word_fd[polarity][word],
                                           (freq, pos_word_count),
                                           total_word_count)
        word_scores[word] = score

    best_raw = sorted(word_scores.items(),
                      key=lambda ws: ws[1],
                      reverse=True)[:600]
    best = [x[0] for x in best_raw if x[0] not in STOPWORDS and len(x[0]) > 1]
    best_words.update(best)
    best_features = features(best, polarity)
    cursor.close()
    return best_features
Beispiel #50
0
def chiSQ(priors, likelihood, keep):
    """ Extract the `keep` most informative features using chi-square """
    words = {}
    # Total word count
    twc = sum(priors.values())
    # All words in the counter
    words_unique = [list(likelihood[section].keys()) for section in likelihood.keys()]
    words_unique = set(sum(words_unique, []))
    for word in words_unique:
        # Go past each class
        scores = []
        for c in priors.keys():
            # Class word count
            cwc = priors[c]
            # Get word occurrence over all classes
            totalFreq = sum([likelihood[section][word] for section in priors.keys()])
            # Word count within class
            wc = likelihood[c][word]
            # Get chi-sq
            score = BigramAssocMeasures.chi_sq(wc, (totalFreq, cwc), twc)
            # Append
            scores.append(score)
        # Add to dict
        words[word] = sum(scores)
    # Select best words
    bestWords = sorted(words.items(), key=lambda ws: ws[1], reverse=True)[:keep]
    # Save
    with open("chiSQ.txt", 'w') as f:
        print(bestWords, file=f)
    # Get names
    bestWords = [b[0] for b in bestWords]
    # Filter likelihood
    for c in priors.keys():
        for key in list(likelihood[c]):
            if key not in bestWords:
                del likelihood[c][key]
    # Return
    return likelihood
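# Usage sketch (my own toy counts; note chiSQ also writes its selection to chiSQ.txt):
from collections import Counter
likelihood = {'pos': Counter(good=3, plot=1), 'neg': Counter(bad=2, plot=2)}
priors = {c: sum(likelihood[c].values()) for c in likelihood}
pruned = chiSQ(priors, likelihood, keep=2)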
Beispiel #51
0
    def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
            self._num = num
            self._window_size = window_size

            #print("Building collocations list")
            from nltk.corpus import stopwords
            ignored_words = stopwords.words('english')
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1+' '+w2 for w1, w2 in self._collocations]
        print(tokenwrap(colloc_strings, separator="; "))
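# Usage sketch: on an nltk.Text-style object `t` (exposing .tokens), calling
# t.collocations(num=10, window_size=3) prints the top collocations as
# "word1 word2; word3 word4; ...".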
Beispiel #52
0
def collocRecursively(corp,interp,constructor,threshhold,addUnrelated,addBigram,filters=None):
	bgFinder = constructor(corp)
	if filters:
		bgFinder = applyFilters(bgFinder,filters)
	bgScores = {bg:score for bg,score in bgFinder.score_ngrams(BigramAssocMeasures().likelihood_ratio)}
	print(sorted(list(bgScores.items()),key=lambda tup: tup[1])[-6:])
	idx = 0
	N = len(corp)
	newCorp = list()
	flag = False
	while idx < N-1:
		bg = (corp[idx],corp[idx+1])
		if bgScores.get((interp(bg[0]),interp(bg[1])),0) > threshhold:
			addBigram(newCorp,bg)
			idx += 2
			flag = True
		else:
			addUnrelated(newCorp,bg[0])
			idx += 1
	if idx == N-1:
		addUnrelated(newCorp,corp[idx])
	if flag:
		return collocRecursively(newCorp, interp, constructor, threshhold, addUnrelated, addBigram, filters)
	return newCorp
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

for word in movie_reviews.words(categories=['pos']):
    word_fd[word.lower()] += 1
    label_word_fd['pos'][word.lower()] += 1

for word in movie_reviews.words(categories=['neg']):
    word_fd[word.lower()] += 1
    label_word_fd['neg'][word.lower()] += 1

# n_ii = label_word_fd[label][word]
# n_ix = word_fd[word]
# n_xi = label_word_fd[label].N()
# n_xx = label_word_fd.N()
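# Worked example (hypothetical counts): a word with n_ii = 30 occurrences in
# 'pos', n_ix = 40 occurrences overall, n_xi = 1000 words in the 'pos' label,
# and n_xx = 2000 words in total scores
#   BigramAssocMeasures.chi_sq(30, (40, 1000), 2000)  # ~10.2
# so the word is strongly skewed toward 'pos'.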

pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

word_scores = {}

for word, freq in word_fd.items():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                           (freq, pos_word_count),
                                           total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                           (freq, neg_word_count),
                                           total_word_count)
    word_scores[word] = pos_score + neg_score

best = sorted(word_scores.items(), key=lambda ws: ws[1],
              reverse=True)[:10000]
bestwords = set(w for w, s in best)


def best_word_feats(words):
    return dict([(word, True) for word in words if word in bestwords])

def create_word_scores():
    def load_words(path):
        # tokenise each line into words and sentence punctuation
        words = []
        with open(path, 'r', errors="ignore", encoding="utf-8") as fin:
            for line in fin:
                words.extend(re.findall(r"[\w']+|[.,!?;]", line.rstrip()))
        return words

    angerWords = load_words(ANGER_FILE)
    disgustWords = load_words(DISGUST_FILE)
    fearWords = load_words(FEAR_FILE)
    joyWords = load_words(JOY_FILE)
    surpriseWords = load_words(SURPRISE_FILE)

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in angerWords:
        word_fd[word.lower()] += 1
        cond_word_fd['anger'][word.lower()] += 1
    for word in disgustWords:
        word_fd[word.lower()] += 1
        cond_word_fd['disgust'][word.lower()] += 1
    for word in fearWords:
        word_fd[word.lower()] += 1
        cond_word_fd['fear'][word.lower()] += 1
    for word in joyWords:
        word_fd[word.lower()] += 1
        cond_word_fd['joy'][word.lower()] += 1
    for word in surpriseWords:
        word_fd[word.lower()] += 1
        cond_word_fd['surprise'][word.lower()] += 1

    anger_word_count = cond_word_fd['anger'].N()
    disgust_word_count = cond_word_fd['disgust'].N()
    fear_word_count = cond_word_fd['fear'].N()
    joy_word_count = cond_word_fd['joy'].N()
    surprise_word_count = cond_word_fd['surprise'].N()
    total_word_count = (anger_word_count + disgust_word_count +
                        fear_word_count + joy_word_count + surprise_word_count)

    word_scores = {}
    for word, freq in word_fd.items():
        anger_score = BigramAssocMeasures.chi_sq(cond_word_fd['anger'][word],
                                                 (freq, anger_word_count),
                                                 total_word_count)
        disgust_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['disgust'][word], (freq, disgust_word_count),
            total_word_count)
        fear_score = BigramAssocMeasures.chi_sq(cond_word_fd['fear'][word],
                                                (freq, fear_word_count),
                                                total_word_count)
        joy_score = BigramAssocMeasures.chi_sq(cond_word_fd['joy'][word],
                                               (freq, joy_word_count),
                                               total_word_count)
        surprise_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['surprise'][word], (freq, surprise_word_count),
            total_word_count)
        word_scores[word] = (anger_score + disgust_score + fear_score +
                             joy_score + surprise_score)

    return word_scores
Beispiel #55
0
    def train_classifier(self,
                         dataset,
                         feature_fn_name='word',
                         train_ratio=0.8,
                         verbose=False,
                         token_column='text',
                         target_column='category',
                         best_ratio=0.8,
                         pos_target_val=1,
                         neg_target_val=-1):
        def word_feats(words):
            return dict([(word, True) for word in words])

        def best_word_feats(words):
            return dict([(word, True) for word in words if word in bestwords])

        def best_bigram_word_feats(words,
                                   score_fn=BigramAssocMeasures.chi_sq,
                                   n=200):
            bigram_finder = BigramCollocationFinder.from_words(words)
            bigrams = bigram_finder.nbest(score_fn, n)
            d = dict([(bigram, True) for bigram in bigrams])
            d.update(best_word_feats(words))
            return d

        def best_trigram_word_feats(words,
                                    score_fn=TrigramAssocMeasures.chi_sq,
                                    n=200):
            tcf = TrigramCollocationFinder.from_words(words)
            trigrams = tcf.nbest(score_fn, n)
            d = dict([(trigram, True) for trigram in trigrams])
            d.update(best_bigram_word_feats(words))
            d.update(best_word_feats(words))
            return d

        if verbose:
            print(
                '\nSelected feature function: {}, token column: {}, train ratio: {}'
                .format(feature_fn_name, token_column, train_ratio))
        df = dataset.sample(frac=1).reset_index(drop=True)
        negids = df[df[target_column] == neg_target_val].index
        posids = df[df[target_column] == pos_target_val].index
        feats = df[token_column]

        if feature_fn_name in ['best_word', 'best_bigram', 'best_trigram']:
            word_fd = FreqDist()
            label_word_fd = ConditionalFreqDist()
            for tokens in df[df[target_column] ==
                             pos_target_val][token_column]:
                for word in tokens.split():
                    word_fd[word] += 1
                    label_word_fd[self._positive_label][word] += 1

            for tokens in df[df[target_column] ==
                             neg_target_val][token_column]:
                for word in tokens.split():
                    word_fd[word] += 1
                    label_word_fd[self._negative_label][word] += 1

            pos_word_count = label_word_fd[self._positive_label].N()
            neg_word_count = label_word_fd[self._negative_label].N()
            total_word_count = pos_word_count + neg_word_count
            word_scores = {}
            for word, freq in word_fd.items():
                pos_score = BigramAssocMeasures.chi_sq(
                    label_word_fd[self._positive_label][word],
                    (freq, pos_word_count), total_word_count)
                neg_score = BigramAssocMeasures.chi_sq(
                    label_word_fd[self._negative_label][word],
                    (freq, neg_word_count), total_word_count)
                word_scores[word] = pos_score + neg_score

            best_cnt = int(len(word_scores) * best_ratio)
            best = sorted(word_scores.items(),
                          key=lambda item: item[1],
                          reverse=True)[:best_cnt]
            bestwords = set([w for w, s in best])
            if feature_fn_name == 'best_trigram':
                feat_fn = best_trigram_word_feats
            elif feature_fn_name == 'best_bigram':
                feat_fn = best_bigram_word_feats
            else:
                feat_fn = best_word_feats

        else:
            feat_fn = word_feats

        negfeats = [(feat_fn(feats[i].split()), self._negative_label)
                    for i in negids]
        posfeats = [(feat_fn(feats[i].split()), self._positive_label)
                    for i in posids]
        if verbose:
            print('No. of samples: {}, Pos: {}, Neg: {}'.format(
                len(feats), len(posfeats), len(negfeats)))

        negcutoff = int(len(negfeats) * train_ratio)
        poscutoff = int(len(posfeats) * train_ratio)

        trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
        testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

        classifier = NaiveBayesClassifier.train(trainfeats)
        refsets = defaultdict(set)
        testsets = defaultdict(set)

        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        metrics = {
            'Accuracy':
            nltk.classify.util.accuracy(classifier, testfeats),
            'Pos precision':
            precision(refsets[self._positive_label],
                      testsets[self._positive_label]),
            'Pos recall':
            recall(refsets[self._positive_label],
                   testsets[self._positive_label]),
            'Neg precision':
            precision(refsets[self._negative_label],
                      testsets[self._negative_label]),
            'Neg recall':
            recall(refsets[self._negative_label],
                   testsets[self._negative_label])
        }
        if verbose:
            print(metrics)

        return classifier, metrics
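# Usage sketch (`analyzer` is a hypothetical instance of the surrounding class;
# assumes pandas and a toy frame with 'text'/'category' columns):
# import pandas as pd
# df = pd.DataFrame({'text': ['good great film', 'bad boring film'] * 50,
#                    'category': [1, -1] * 50})
# classifier, metrics = analyzer.train_classifier(df, feature_fn_name='best_word',
#                                                 verbose=True)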
        #expand contractions
        for k in punctuation:
            l = l.replace(k, " ")
        l = Contractions.expandContractions(l)
        sentenceWords = nltk.word_tokenize(l)
        for word in sentenceWords:
            word_fd[word.lower()] += 1
            category_fd['neg'][word.lower()] += 1
        negatives.append(l)

pos_wordCnt = category_fd['pos'].N()
neg_wordCnt = category_fd['neg'].N()
total_wordCnt = pos_wordCnt + neg_wordCnt
word_scores = {}
for word, freq in word_fd.items():
    pos_score = BigramAssocMeasures.chi_sq(category_fd['pos'][word],
                                           (freq, pos_wordCnt), total_wordCnt)
    neg_score = BigramAssocMeasures.chi_sq(category_fd['neg'][word],
                                           (freq, neg_wordCnt), total_wordCnt)
    word_scores[word] = pos_score + neg_score

best = sorted(word_scores.items(), key=operator.itemgetter(1),
              reverse=True)[:10000]
bestWords = set([w for w, s in best])

posfeats = []
#positives = movie_reviews.fileids('pos')
for line in positives:
    lineWiki = TextBlob(line.lower())
    words = list(lineWiki.words)
    featset = word_features(words)
    tag = 'pos'
Beispiel #57
0
def cal_word_count():
    global train_word_id
    global pos_info
    global neg_info
    pos_info = []
    neg_info = []
    train_word_id = []

    word_fd = FreqDist()  # frequency of every word
    cond_word_fd = ConditionalFreqDist()  # word frequencies within the pos and neg texts

    print('Loading POS>>>')
    line_num = 0
    with open(pos_file, 'r') as fin:
        for line in fin:
            line_num += 1
            if not line_num % 10000: print('LINE:%d' % (line_num))
            items = line.split()
            tmp_col = []
            for item in items:
                item_id = term_to_id(item)
                word_fd[item_id] += 1
                cond_word_fd['pos'][item_id] += 1
                tmp_col.append(item_id)
            pos_info.append(tmp_col)

    print('Loading NEG>>>')
    line_num = 0
    with open(neg_file, 'r') as fin:
        for line in fin:
            line_num += 1
            if not line_num % 10000: print('LINE:%d' % (line_num))
            items = line.split()
            tmp_col = []
            for item in items:
                item_id = term_to_id(item)
                word_fd[item_id] += 1
                cond_word_fd['neg'][item_id] += 1
                tmp_col.append(item_id)
            neg_info.append(tmp_col)

    print('Randomize>>>')
    shuffle(pos_info)
    shuffle(neg_info)

    pos_w_count = cond_word_fd['pos'].N()
    neg_w_count = cond_word_fd['neg'].N()
    total_w_count = pos_w_count + neg_w_count
    #print('pos_w_count=%d, neg_w_count=%d, total_w_count=%d'%(pos_w_count, neg_w_count, total_w_count))
    #print('word_fd_count=%d'%(word_fd.N()))

    # compute the chi-square statistics
    global word_scores
    word_scores = {}

    print("CALC CHI-SQUARE...")
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_w_count),
            total_w_count)  # chi-square for the pos class (mutual information etc. would also work)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_w_count),
                                               total_w_count)  # likewise for neg
        word_scores[word] = pos_score + neg_score  # informativeness = pos chi-square + neg chi-square

    del word_fd
    del cond_word_fd

    return
Beispiel #58
0
from nltk.util import ngrams
from nltk.corpus import alpino

print(alpino.words())
quadgrams = ngrams(alpino.words(), 4)  # 4-grams
print(quadgrams)
# for i in quadgrams:
#     print(i)

from nltk.collocations import BigramCollocationFinder
from nltk.corpus import webtext
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
stops_filter = lambda w: len(w) < 3 or w in stop_words  # drop words shorter than 3 characters and stopwords
tokens = [t.lower() for t in webtext.words('grail.txt')]
words = BigramCollocationFinder.from_words(tokens)  # create the finder
print(words)
words.apply_word_filter(stops_filter)
res = words.nbest(BigramAssocMeasures.likelihood_ratio, 5)  # top 5 bigrams by likelihood ratio
print(res)

# Generate bigrams with a collocation finder
import nltk
text1 = "Hardwork is the key to success. Never give up!"
word = nltk.wordpunct_tokenize(text1)
finder = BigramCollocationFinder.from_words(word)
bigram_measures = BigramAssocMeasures()
value = finder.score_ngrams(bigram_measures.raw_freq)
print(sorted(bigram for bigram, score in value))
def information_gain(dataset):
    frequenciaUnigrama = nltk.FreqDist()
    condicionalUnigrama = nltk.ConditionalFreqDist()
    dicionarioUnigrama = []

    frequenciaBigrama = nltk.FreqDist()
    condicionalBigrama = nltk.ConditionalFreqDist()
    dicionarioBigrama = []
    data = dataset.data
    for frase in data[data.Classificacao == 'no']['Unigrama']:
        for word in frase:
            frequenciaUnigrama[word.lower()] += 1
            condicionalUnigrama['pos'][word.lower()] += 1

    for frase in data[data.Classificacao == 'no']['Bigrama']:
        for word in frase:
            frequenciaBigrama[word.lower()] += 1
            condicionalBigrama['pos'][word.lower()] += 1
        
    for frase in data[data.Classificacao == 'yes']['Unigrama']:
        for word in frase:
            frequenciaUnigrama[word.lower()] += 1
            condicionalUnigrama['neg'][word.lower()] += 1

    for frase in data[data.Classificacao == 'yes']['Bigrama']:
        for word in frase:
            frequenciaBigrama[word.lower()] += 1
            condicionalBigrama['neg'][word.lower()] += 1
    pos_word_count_unigrama = condicionalUnigrama['pos'].N()
    pos_word_count_bigrama = condicionalBigrama['pos'].N()
    neg_word_count_unigrama = condicionalUnigrama['neg'].N()
    neg_word_count_bigrama = condicionalBigrama['neg'].N()
    total_word_count_unigrama = pos_word_count_unigrama + neg_word_count_unigrama
    total_word_count_bigrama = pos_word_count_bigrama + neg_word_count_bigrama

    word_scores_unigrama = {}
    word_scores_bigrama = {} 

    for word, freq in frequenciaUnigrama.items():
        pos_score = BigramAssocMeasures.chi_sq(condicionalUnigrama['pos'][word],
            (freq, pos_word_count_unigrama), total_word_count_unigrama)
        neg_score = BigramAssocMeasures.chi_sq(condicionalUnigrama['neg'][word],
            (freq, neg_word_count_unigrama), total_word_count_unigrama)
        word_scores_unigrama[word] = pos_score + neg_score

    for word, freq in frequenciaBigrama.items():
        pos_score = BigramAssocMeasures.chi_sq(condicionalBigrama['pos'][word],
            (freq, pos_word_count_bigrama), total_word_count_bigrama)
        neg_score = BigramAssocMeasures.chi_sq(condicionalBigrama['neg'][word],
            (freq, neg_word_count_bigrama), total_word_count_bigrama)
        word_scores_bigrama[word] = pos_score + neg_score
    if(dataset.name == 'OffComBR3'):
        tamUni = 122
        tamBig = 103
    elif(dataset.name == 'OffComBR2'):
        tamUni = 250
        tamBig = 426
    bestUnigrama = sorted(word_scores_unigrama.items(), key=lambda ws: ws[1], reverse=True)[:tamUni]
    bestBigrama = sorted(word_scores_bigrama.items(), key=lambda ws: ws[1], reverse=True)[:tamBig]
    dicionarioUnigrama = [w for w, s in bestUnigrama]
    dicionarioBigrama = [w for w, s in bestBigrama]

    dataset.dicUnigrama = dicionarioUnigrama
    dataset.dicBigrama = dicionarioBigrama
    dataset = extraiFeatures(dataset)
    return dataset
Beispiel #60
0
def train_and_test(reviews_pos, reviews_neg):
    """
    훈련 및 테스트
    :param reviews_pos: 긍정 리뷰 list
    :param reviews_neg: 부정 리뷰 list
    :return:
    """

    # 긍정 리뷰, 부정 리뷰 각각에서의 전체 단어에 대한 빈도수 계산
    tot_poswords = [val for l in [r.words for r in reviews_pos] for val in l]
    tot_negwords = [val for l in [r.words for r in reviews_neg] for val in l]

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in tot_poswords:
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1
    for word in tot_negwords:
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1

    pos_words = len(tot_poswords)
    neg_words = len(tot_negwords)
    tot_words = pos_words + neg_words

    # score for each word
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_words), tot_words)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_words), tot_words)
        word_scores[word] = pos_score + neg_score
    print('total: ', len(word_scores))

    # keep only the 10000 highest-scoring words
    best = sorted(word_scores.items(),
                  key=lambda args: args[1],
                  reverse=True)[:10000]
    bestwords = set([w for w, s in best])

    negfeatures = [(best_words_features(r.words, bestwords), 'neg')
                   for r in reviews_neg]
    posfeatures = [(best_words_features(r.words, bestwords), 'pos')
                   for r in reviews_pos]

    # split into an 80% training set and a 20% test set
    portionpos = int(len(posfeatures) * 0.8)
    portionneg = int(len(negfeatures) * 0.8)
    print(portionpos, '-', portionneg)
    trainfeatures = negfeatures[:portionneg] + posfeatures[:portionpos]
    print(len(trainfeatures))

    # train
    classifier = NaiveBayesClassifier.train(trainfeatures)

    # test
    testfeatures = negfeatures[portionneg:] + posfeatures[portionpos:]
    shuffle(testfeatures)
    err = 0
    print('test on: ', len(testfeatures))
    for r in testfeatures:
        sent = classifier.classify(r[0])
        # print(r[1], '-pred: ', sent)
        if sent != r[1]:
            err += 1.
    print('error rate: ', err / float(len(testfeatures)))
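# Toy driver (my own sketch): Review stands in for the review objects the
# function expects, and best_words_features is assumed to behave like
# best_word_feats above.
from collections import namedtuple
from random import shuffle

Review = namedtuple('Review', 'words')

def best_words_features(words, bestwords):
    return dict((word, True) for word in words if word in bestwords)

pos_reviews = [Review('a good great film'.split()) for _ in range(20)]
neg_reviews = [Review('a bad boring film'.split()) for _ in range(20)]
train_and_test(pos_reviews, neg_reviews)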