Code example #1
def high_information_words(labelled_words,
                           score_fn=BigramAssocMeasures.chi_sq,
                           min_score=5):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1

    n_xx = label_word_fd.N()
    high_info_words = set()

    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)

        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            score = score_fn(
                n_ii, (n_ix, n_xi), n_xx
            )  # n_ii: occurrences of the word under this label; n_ix: occurrences of
            # the word across all labels; n_xi: total words under this label;
            # n_xx: total words across all labels
            word_scores[word] = score

        bestwords = [
            word for word, score in word_scores.items() if score >= min_score
        ]

        high_info_words |= set(bestwords)  # |= performs an in-place set union here

    return high_info_words
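Each snippet on this page assumes the same imports: collections from the standard library, plus FreqDist, ConditionalFreqDist, and BigramAssocMeasures from NLTK. A minimal usage sketch for example #1 follows; the movie_reviews corpus and the variable names are illustrative assumptions, not part of the original snippet.

import collections
from nltk.corpus import movie_reviews
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

# Build one (label, flat word list) pair per category, which is the shape the function expects.
labelled = [(category, [w.lower() for w in movie_reviews.words(categories=category)])
            for category in movie_reviews.categories()]
high_info = high_information_words(labelled, min_score=5)
print(len(high_info))  # number of words whose chi-square score clears the threshold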
Code example #2
def get_high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    '''
    Get the high-information words using the chi-square measure.
    '''
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    
    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1
    
    n_xx = label_word_fd.N()
    high_info_words = set()
    
    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)
        
        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score
        
        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)
    
    return high_info_words
Code example #3
def high_information_words(files, score_fn=BigramAssocMeasures.chi_sq, min_score=50):
    word_dict = FreqDist()
    ocean_word_dict = ConditionalFreqDist()
    hiw_categories = []

    for file in files:
        # For each token, add 1 to the overall FreqDist and 1 to the ConditionalFreqDist under each of the file's personality traits
        for token in file[0]:
            for trait in file[1]:
                ocean_word_dict[trait][token] += 1
            word_dict[token] += 1

    n_xx = ocean_word_dict.N()  # Total token count across all personality traits in the ConditionalFreqDist
    high_info_words = set()

    for condition in ocean_word_dict.conditions():
        n_xi = ocean_word_dict[condition].N()  # Total token count for this personality trait
        word_scores = defaultdict(int)

        for word, n_ii in ocean_word_dict[condition].items():
            n_ix = word_dict[word]  # Total count of this token across all traits
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score

        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        bw = [k for k, v in sorted(word_scores.items(), key=lambda x: x[1], reverse=True)]  # words ranked by score, highest first
        high_info_words |= set(bestwords)
        hiw_categories.append((condition, bw[:10]))

    return high_info_words, hiw_categories
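A hypothetical call for example #3, assuming files is a list of (tokens, traits) pairs such as (["i", "love", "hiking"], ["openness", "extraversion"]); the data and trait names here are placeholders, not part of the original project.

hi_words, per_trait_top10 = high_information_words(files, min_score=50)
for trait, top_words in per_trait_top10:
    print(trait, top_words)  # the ten highest-scoring words for that trait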
Code example #4
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=0):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label, sentences in labelled_words:
        for sent in sentences:
            words = preProcess(sent)  # project-specific tokenisation/cleaning helper defined elsewhere
            for word in words:
                word_fd[word] += 1
                label_word_fd[label][word] += 1

    n_xx = label_word_fd.N()
    high_info_words = set()

    labelScore = []
    for label in sorted(label_word_fd.conditions()):
        # (disabled) per-label min_score overrides, e.g. min_score = 1.0 for labels 0 through 4

        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)

        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score

        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)
        labelScore.append(word_scores)

    # Write each label's word scores to its own CSV, sorted by score (highest first)
    which = 0
    for x in labelScore:
        sorted_x = sorted(x.items(), key=operator.itemgetter(1), reverse=True)
        labelCSV = pd.DataFrame(sorted_x)
        fileName = "wang2226_%d.csv" % which
        labelCSV.to_csv(fileName, index=False, sep=',')
        which += 1

    return high_info_words
Code example #5
File: nb_feature.py  Project: alifars/portfolio
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.raw_freq, min_score=5):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1
    n_xx = label_word_fd.N()
    high_info_words = set()
    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)
        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score
        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)
    return high_info_words
Code example #6
File: scoring.py  Project: nikicc/slovene-nltk-tagger
def sum_category_word_scores(categorized_words, score_fn):
    word_fd = FreqDist()
    category_word_fd = ConditionalFreqDist()

    for category, words in categorized_words:
        for word in words:
            word_fd[word] += 1
            category_word_fd[category][word] += 1

    scores = collections.defaultdict(int)
    n_xx = category_word_fd.N()

    for category in category_word_fd.conditions():
        n_xi = category_word_fd[category].N()

        for word, n_ii in category_word_fd[category].items():
            n_ix = word_fd[word]
            scores[word] += score_fn(n_ii, (n_ix, n_xi), n_xx)

    return scores
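sum_category_word_scores returns one aggregate score per word instead of applying a cut-off, so a natural follow-up (an assumption, not shown in the original project) is to rank words by their summed score and keep the strongest N:

scores = sum_category_word_scores(categorized_words, BigramAssocMeasures.chi_sq)
top_words = sorted(scores, key=scores.get, reverse=True)[:1000]  # keep the 1000 highest-scoring words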
Code example #7
def high_information_words(labelled_words,
                           score_fn=BigramAssocMeasures.chi_sq,
                           min_score=5):
    """ returns a set of words with the highest information  """
    """
    n_ii : frequency for the word for the label
    n_ix : total freq for the word across all labels
    n_xi : total freq of all words that occured for the label
    n_xx : total freq for all words in all labels

    """

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1

    n_xx = label_word_fd.N()
    high_info_words = set()

    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)

        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score

        bestwords = [
            word for word, score in word_scores.items() if score >= min_score
        ]
        high_info_words |= set(bestwords)

    return high_info_words
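For intuition about what score_fn receives in all of these examples: chi_sq expands the arguments n_ii, (n_ix, n_xi), n_xx into a 2x2 contingency table (this word vs. other words, this label vs. other labels) and computes Pearson's chi-square on it. The hand-rolled sketch below illustrates that computation; it is written for clarity and is not NLTK's actual implementation.

def chi_sq_2x2(n_ii, marginals, n_xx):
    n_ix, n_xi = marginals
    n_io = n_ix - n_ii                    # the word, but under other labels
    n_oi = n_xi - n_ii                    # other words under this label
    n_oo = n_xx - n_ii - n_io - n_oi      # other words under other labels
    # Shortcut form of Pearson's chi-square for a 2x2 table
    num = n_xx * (n_ii * n_oo - n_io * n_oi) ** 2
    den = (n_ii + n_io) * (n_ii + n_oi) * (n_io + n_oo) * (n_oi + n_oo)
    return num / den if den else 0.0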