Example #1
def build_topn_best_words(self):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    positivecount = 0
    negativecount = 0
    # The Sentiment140 CSV is latin-1 encoded; csv.reader needs text mode in Python 3
    with open(r'..\polarityData\TweetCorpus\training.1600000.processed.noemoticon.csv',
              encoding='latin-1', newline='') as f:
        reader = csv.reader(f)
        for row in reader:
            # Positive-sentiment tweets are labelled '4' in column 0
            if row[0] == '4' and positivecount < self.corpuslength:
                tweet = row[5]
                tokens = WhitespaceTokenizer().tokenize(tweet)
                for token in tokens:
                    word_fd[token.lower()] += 1
                    label_word_fd['pos'][token.lower()] += 1
                positivecount += 1
            # Negative-sentiment tweets are labelled '0'
            if row[0] == '0' and negativecount < self.corpuslength:
                tweet = row[5]
                tokens = WhitespaceTokenizer().tokenize(tweet)
                for token in tokens:
                    word_fd[token.lower()] += 1
                    label_word_fd['neg'][token.lower()] += 1
                negativecount += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    print("Positive Word Count:", pos_word_count,
          "Negative Word Count:", neg_word_count,
          "Total Word Count:", total_word_count)

    # Score each word by its chi-square association with the pos and neg labels
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    # Keep the 10,000 highest-scoring words
    best = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:10000]
    self.bestwords = set(w for w, s in best)
    print('Best Words Count:', len(self.bestwords))
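A typical next step, not shown in this snippet, is a feature extractor that keeps only the selected words, mirroring best_word_feats in Example #4. A minimal sketch, assuming self.bestwords has already been built:

def best_word_feats(self, words):
    # Keep only the chi-square-selected words as boolean features
    return {word: True for word in words if word in self.bestwords}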
Example #2
def create_word_scores():
    # Build lists of all positive and negative words
    posWords = []
    negWords = []
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords.append(posWord)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords.append(negWord)

    # Flatten the nested lists into flat word lists
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    # Build a frequency distribution over all words, plus one per sentiment label
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1
    # Count positive words, negative words, and the total
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    # Build a dictionary of per-word chi-square scores
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
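The returned scores are typically fed to a top-N selection step. A minimal sketch of such a companion helper, following the pattern in Examples #1 and #4 (the name find_best_words and the number parameter are assumptions):

def find_best_words(word_scores, number):
    # Hypothetical helper: keep the `number` highest-scoring words
    best = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:number]
    return set(w for w, s in best)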
Example #3
def score_bigrams(ns_counts):
    # ns_counts maps n-gram length to counts: ns_counts[2] holds bigram
    # counts, ns_counts[1] unigram counts, and ns_counts[0][()] the total
    for bigram, count in ns_counts[2].most_common():
        score = BigramAssocMeasures.likelihood_ratio(
            ns_counts[2][(bigram[0], bigram[1])],  # joint count n_ii
            (
                ns_counts[1][(bigram[0],)],  # first-word count n_ix
                ns_counts[1][(bigram[1],)],  # second-word count n_xi
            ),
            ns_counts[0][()],  # total count n_xx
        )
        yield bigram, score
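score_bigrams expects ns_counts to map n-gram length to counts keyed by tuples. A minimal sketch of building such a structure from a token list with collections.Counter (the helper name build_ns_counts is an assumption):

from collections import Counter

def build_ns_counts(tokens):
    # Hypothetical input builder: ns_counts[0][()] is the total token count,
    # ns_counts[1] holds unigram counts, ns_counts[2] holds bigram counts
    ns_counts = {0: Counter(), 1: Counter(), 2: Counter()}
    ns_counts[0][()] = len(tokens)
    for token in tokens:
        ns_counts[1][(token,)] += 1
    for pair in zip(tokens, tokens[1:]):
        ns_counts[2][pair] += 1
    return ns_counts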
Example #4
            # tail of update_wordcount (its opening lines are missing from this snippet)
            label_word_fd[label][word] += 1
    handle.seek(0)

word_fd = nltk.probability.FreqDist()
label_word_fd = nltk.probability.ConditionalFreqDist()
update_wordcount(word_fd, label_word_fd, smilefile, POSITIVE)
update_wordcount(word_fd, label_word_fd, frownfile, NEGATIVE)

pos_word_count = label_word_fd[POSITIVE].N()
neg_word_count = label_word_fd[NEGATIVE].N()
total_word_count = pos_word_count + neg_word_count

print "Finding top words"
word_scores = {}
for word, freq in word_fd.iteritems():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd[POSITIVE][word],
        (freq, pos_word_count), total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd[NEGATIVE][word],
        (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score

best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:10000]
bestwords = set([w for w, s in best])
print "Best words"
#print bestwords
def best_word_feats(words):
    return dict([(word, True) for word in words if word in bestwords])

posfeats = features(best_word_feats, smilefile, POSITIVE)
negfeats = features(best_word_feats, frownfile, NEGATIVE)
classifier = nltk.NaiveBayesClassifier.train(posfeats + negfeats)
save_classifier(classifier)
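Once trained, the classifier labels new text by running the same feature extractor over its tokens. A minimal usage sketch (the input sentence is illustrative only):

sentence = "what a great day"  # illustrative input
print(classifier.classify(best_word_feats(sentence.lower().split())))  # POSITIVE or NEGATIVE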
Example #5
def count_statistics(candidates, bigram_corpus_size, trigram_corpus_size):
    """The function for counting contingency tables"""
    print('=== Counting association measure ===')
    # Getting word frequencies
    word_counts = {}
    for word in candidates:
        i = 0
        for linkage in candidates[word]:
            for obj in candidates[word][linkage]:
                i += obj.abs_freq
                if not obj.third_word:
                    word_counts[obj.first_word + '_' +
                                obj.second_word] = obj.abs_freq
        word_counts[word] = i
    # Getting frequencies for a contingency table
    for word in candidates:
        for linkage in candidates[word]:
            for obj in candidates[word][linkage]:
                # Contingency tables for trigrams
                if obj.third_word:
                    n_iii = obj.abs_freq  # counts (w1, w2, w3)
                    n_ixx = word_counts[obj.first_word]  # counts (w1, , )
                    n_xix = word_counts[obj.second_word]  # counts ( , w2, )
                    n_xxi = word_counts[obj.third_word]  # counts ( , , w3)
                    # Pairwise counts may be absent; default them to 0
                    n_iix = word_counts.get(
                        obj.first_word + '_' + obj.second_word, 0)
                    n_ixi = word_counts.get(
                        obj.first_word + '_' + obj.third_word, 0)
                    n_xii = word_counts.get(
                        obj.second_word + '_' + obj.third_word, 0)
                    n_xxx = trigram_corpus_size  # counts any trigram

                    # Counting association measures for trigrams
                    obj.dice = 3 * float(n_iii) / float(n_ixx + n_xix + n_xxi)
                    obj.chi = TrigramAssocMeasures.chi_sq(
                        n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi),
                        n_xxx)
                    obj.jaccard = TrigramAssocMeasures.jaccard(
                        n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi),
                        n_xxx)
                    # obj.likelihood_ratio = TrigramAssocMeasures.likelihood_ratio(n_iii,
                    #                                                              (n_iix, n_ixi, n_xii),
                    #                                                              (n_ixx, n_xix, n_xxi),
                    #                                                              n_xxx)
                    obj.mi = TrigramAssocMeasures.mi_like(
                        n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi),
                        n_xxx)
                    obj.pmi = TrigramAssocMeasures.pmi(n_iii,
                                                       (n_iix, n_ixi, n_xii),
                                                       (n_ixx, n_xix, n_xxi),
                                                       n_xxx)
                    obj.poisson_stirling = TrigramAssocMeasures.poisson_stirling(
                        n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi),
                        n_xxx)
                    obj.t_score = TrigramAssocMeasures.student_t(
                        n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi),
                        n_xxx)

                # Contingency tables for bigrams
                else:
                    n_ii = obj.abs_freq  # counts (w1, w2)
                    n_ix = word_counts[obj.first_word]  # counts (w1, )
                    n_xi = word_counts[obj.second_word]  # counts (, w2)
                    n_xx = bigram_corpus_size  # counts any bigram
                    # Counting association measures for bigrams
                    obj.dice = BigramAssocMeasures.dice(
                        n_ii, (n_ix, n_xi), n_xx)
                    obj.chi = BigramAssocMeasures.chi_sq(
                        n_ii, (n_ix, n_xi), n_xx)
                    obj.t_score = BigramAssocMeasures.student_t(
                        n_ii, (n_ix, n_xi), n_xx)
                    obj.poisson_stirling = BigramAssocMeasures.poisson_stirling(
                        n_ii, (n_ix, n_xi), n_xx)
                    obj.pmi = BigramAssocMeasures.pmi(n_ii, (n_ix, n_xi), n_xx)
                    obj.mi = BigramAssocMeasures.mi_like(
                        n_ii, (n_ix, n_xi), n_xx)
                    obj.likelihood_ratio = BigramAssocMeasures.likelihood_ratio(
                        n_ii, (n_ix, n_xi), n_xx)
                    obj.jaccard = BigramAssocMeasures.jaccard(
                        n_ii, (n_ix, n_xi), n_xx)
                    obj.fisher = BigramAssocMeasures.fisher(
                        n_ii, (n_ix, n_xi), n_xx)
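As a sanity check on the (n_ii, (n_ix, n_xi), n_xx) contingency convention used above, a toy call with made-up counts: a bigram seen 5 times, whose words occur 20 and 6 times respectively, in a corpus of 1000 bigrams.

from nltk.metrics import BigramAssocMeasures

# PMI = log2(n_ii * n_xx / (n_ix * n_xi)) = log2(5 * 1000 / (20 * 6)) ≈ 5.38
print(BigramAssocMeasures.pmi(5, (20, 6), 1000))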