def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """Return the set of words whose association score with any label meets min_score.

    Args:
        labelled_words: iterable of (label, words) pairs, where words is an
            iterable of tokens observed under that label.
        score_fn: NLTK association measure called as
            score_fn(n_ii, (n_ix, n_xi), n_xx).
        min_score: minimum score for a word to be considered high-information.

    Returns:
        set of words that score >= min_score for at least one label.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1

    # n_xx: total word occurrences across all labels.
    n_xx = label_word_fd.N()
    high_info_words = set()
    for label in label_word_fd.conditions():
        # n_xi: total word occurrences within this label.
        n_xi = label_word_fd[label].N()
        # The original built a defaultdict of scores and then a list of passing
        # words before converting to a set; a single set comprehension keeps
        # words as soon as their score clears the threshold.
        high_info_words |= {
            word
            # n_ii: occurrences of word in this label; word_fd[word] is n_ix,
            # the word's occurrences across all labels.
            for word, n_ii in label_word_fd[label].items()
            if score_fn(n_ii, (word_fd[word], n_xi), n_xx) >= min_score
        }
    return high_info_words
def get_high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """Collect the words that carry high information about their labels.

    For every (label, words) pair, word frequencies are tallied overall and
    per label; each word is then scored with score_fn and kept when its score
    reaches min_score for at least one label.
    """
    overall_fd = FreqDist()
    per_label_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for token in words:
            overall_fd[token] += 1
            per_label_fd[label][token] += 1

    # Grand total of word occurrences over every label (n_xx).
    grand_total = per_label_fd.N()
    high_info_words = set()

    for label in per_label_fd.conditions():
        # Total occurrences inside this label (n_xi).
        label_total = per_label_fd[label].N()
        scores = collections.defaultdict(int)
        for token, in_label_count in per_label_fd[label].items():
            # in_label_count is n_ii; overall_fd[token] is n_ix.
            scores[token] = score_fn(
                in_label_count, (overall_fd[token], label_total), grand_total
            )
        high_info_words.update(
            token for token, score in scores.items() if score >= min_score
        )

    return high_info_words
def high_information_words(files, score_fn=BigramAssocMeasures.chi_sq, min_score=50):
    """Find high-information words per personality trait (OCEAN) category.

    Args:
        files: iterable of (tokens, traits) pairs — file[0] is the token
            sequence, file[1] the personality traits attached to it.
        score_fn: NLTK association measure called as
            score_fn(n_ii, (n_ix, n_xi), n_xx).
        min_score: minimum score for a word to be kept in the overall set.

    Returns:
        (high_info_words, hiw_categories): the overall set of words scoring
        >= min_score, and a list of (trait, top-10 words by score) pairs.
    """
    word_dict = FreqDist()
    ocean_word_dict = ConditionalFreqDist()
    hiw_categories = []
    for file in files:
        # For each token, add 1 to the overall FreqDist and 1 to the
        # ConditionalFreqDist under each of the recording's traits.
        # NOTE(review): the overall count is incremented once per token here
        # (not once per trait) — confirm against the original's intent.
        for token in file[0]:
            for trait in file[1]:
                ocean_word_dict[trait][token] += 1
            word_dict[token] += 1

    n_xx = ocean_word_dict.N()  # total counts across all traits
    high_info_words = set()
    for condition in ocean_word_dict.conditions():
        n_xi = ocean_word_dict[condition].N()  # counts within this trait
        word_scores = defaultdict(int)
        for word, n_ii in ocean_word_dict[condition].items():
            n_ix = word_dict[word]  # counts of this token overall
            word_scores[word] = score_fn(n_ii, (n_ix, n_xi), n_xx)

        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        # BUG FIX: the original passed the sorted items through a set
        # comprehension ({k for k, v in sorted(...)}) before listing them;
        # sets are unordered, so the [:10] slice picked 10 arbitrary words
        # instead of the 10 highest-scoring ones. Preserve the sort order.
        ranked = [w for w, _ in sorted(word_scores.items(), key=lambda kv: kv[1], reverse=True)]
        high_info_words |= set(bestwords)
        hiw_categories.append((condition, ranked[:10]))

    return high_info_words, hiw_categories
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=0):
    """Return high-information words and dump per-label score tables to CSV.

    Args:
        labelled_words: iterable of (label, sentences) pairs; each sentence is
            tokenized via preProcess before counting.
        score_fn: NLTK association measure called as
            score_fn(n_ii, (n_ix, n_xi), n_xx).
        min_score: minimum score for a word to be kept.

    Returns:
        set of words scoring >= min_score for at least one label.

    Side effects:
        Writes one CSV per label ("wang2226_<i>.csv", labels in sorted order)
        with (word, score) rows sorted by descending score.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, sentences in labelled_words:
        for sent in sentences:
            for word in preProcess(sent):
                word_fd[word] += 1
                label_word_fd[label][word] += 1

    n_xx = label_word_fd.N()  # total word occurrences over all labels
    high_info_words = set()
    labelScore = []
    # Removed a block of commented-out per-label min_score overrides that all
    # set the same value (1.0) and were dead code.
    for label in sorted(label_word_fd.conditions()):
        n_xi = label_word_fd[label].N()  # total occurrences within this label
        word_scores = collections.defaultdict(int)
        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]  # occurrences of the word across all labels
            word_scores[word] = score_fn(n_ii, (n_ix, n_xi), n_xx)
        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)
        labelScore.append(word_scores)

    # enumerate replaces the original's manual `which` counter.
    for which, scores in enumerate(labelScore):
        sorted_x = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
        labelCSV = pd.DataFrame(sorted_x)
        fileName = "wang2226_%d.csv" % which
        labelCSV.to_csv(fileName, index=False, sep=',')

    return high_info_words
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.raw_freq, min_score=5):
    """Return the set of words scoring at least min_score for some label.

    Args:
        labelled_words: iterable of (label, words) pairs.
        score_fn: NLTK association measure called as
            score_fn(n_ii, (n_ix, n_xi), n_xx).
        min_score: minimum score for a word to be kept.

    Returns:
        set of high-information words across all labels.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            # FreqDist.inc() was removed in NLTK 3; increment by item
            # assignment instead (consistent with the rest of this file).
            word_fd[word] += 1
            label_word_fd[label][word] += 1

    n_xx = label_word_fd.N()  # total word occurrences over all labels
    high_info_words = set()
    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()  # total occurrences within this label
        word_scores = collections.defaultdict(int)
        # dict.iteritems() is Python 2 only; use items() on Python 3.
        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]  # occurrences of the word across all labels
            word_scores[word] = score_fn(n_ii, (n_ix, n_xi), n_xx)
        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)

    return high_info_words
def sum_category_word_scores(categorized_words, score_fn):
    """Sum each word's association score over every category it appears in.

    Args:
        categorized_words: iterable of (category, words) pairs.
        score_fn: association measure called as
            score_fn(n_ii, (n_ix, n_xi), n_xx).

    Returns:
        defaultdict mapping word -> total score accumulated across categories.
    """
    word_fd = FreqDist()
    category_word_fd = ConditionalFreqDist()
    for category, words in categorized_words:
        for word in words:
            # FreqDist.inc() was removed in NLTK 3; increment by item
            # assignment instead (consistent with the rest of this file).
            word_fd[word] += 1
            category_word_fd[category][word] += 1

    scores = collections.defaultdict(int)
    n_xx = category_word_fd.N()  # total word occurrences over all categories
    for category in category_word_fd.conditions():
        n_xi = category_word_fd[category].N()  # occurrences in this category
        # dict.iteritems() is Python 2 only; use items() on Python 3.
        for word, n_ii in category_word_fd[category].items():
            n_ix = word_fd[word]  # occurrences of the word overall
            scores[word] += score_fn(n_ii, (n_ix, n_xi), n_xx)

    return scores
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """Return the set of words carrying the most information about their labels.

    Contingency-table notation used by score_fn:
        n_ii -- frequency of the word within one label
        n_ix -- total frequency of the word across all labels
        n_xi -- total frequency of all words within the label
        n_xx -- total frequency of all words across all labels
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for token in words:
            word_fd[token] += 1
            label_word_fd[label][token] += 1

    n_xx = label_word_fd.N()
    selected = set()
    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        # Test each word against the threshold directly rather than building
        # an intermediate score table first.
        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            if score_fn(n_ii, (n_ix, n_xi), n_xx) >= min_score:
                selected.add(word)

    return selected