def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """Return the set of words whose association score with any label meets min_score.

    Args:
        labelled_words: iterable of (label, words) pairs, where words is an
            iterable of tokens observed under that label.
        score_fn: NLTK association measure called as
            score_fn(n_ii, (n_ix, n_xi), n_xx).
        min_score: minimum score for a word to be considered high-information.

    Returns:
        set of words that score >= min_score for at least one label.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1

    # n_xx: total word occurrences across all labels.
    n_xx = label_word_fd.N()
    high_info_words = set()
    for label in label_word_fd.conditions():
        # n_xi: total word occurrences within this label.
        n_xi = label_word_fd[label].N()
        # The original built a defaultdict of scores and then a list of passing
        # words before converting to a set; a single set comprehension keeps
        # words as soon as their score clears the threshold.
        high_info_words |= {
            word
            # n_ii: occurrences of word in this label; word_fd[word] is n_ix,
            # the word's occurrences across all labels.
            for word, n_ii in label_word_fd[label].items()
            if score_fn(n_ii, (word_fd[word], n_xi), n_xx) >= min_score
        }
    return high_info_words
def get_high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """Collect the words that carry high information about their labels.

    For every (label, words) pair, word frequencies are tallied overall and
    per label; each word is then scored with score_fn and kept when its score
    reaches min_score for at least one label.
    """
    overall_fd = FreqDist()
    per_label_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for token in words:
            overall_fd[token] += 1
            per_label_fd[label][token] += 1

    # Grand total of word occurrences over every label (n_xx).
    grand_total = per_label_fd.N()
    high_info_words = set()

    for label in per_label_fd.conditions():
        # Total occurrences inside this label (n_xi).
        label_total = per_label_fd[label].N()
        scores = collections.defaultdict(int)
        for token, in_label_count in per_label_fd[label].items():
            # in_label_count is n_ii; overall_fd[token] is n_ix.
            scores[token] = score_fn(
                in_label_count, (overall_fd[token], label_total), grand_total
            )
        high_info_words.update(
            token for token, score in scores.items() if score >= min_score
        )

    return high_info_words
def high_information_words(files, score_fn=BigramAssocMeasures.chi_sq, min_score=50):
    """Find high-information words per personality trait (OCEAN) category.

    Args:
        files: iterable of (tokens, traits) pairs — file[0] is the token
            sequence, file[1] the personality traits attached to it.
        score_fn: NLTK association measure called as
            score_fn(n_ii, (n_ix, n_xi), n_xx).
        min_score: minimum score for a word to be kept in the overall set.

    Returns:
        (high_info_words, hiw_categories): the overall set of words scoring
        >= min_score, and a list of (trait, top-10 words by score) pairs.
    """
    word_dict = FreqDist()
    ocean_word_dict = ConditionalFreqDist()
    hiw_categories = []
    for file in files:
        # For each token, add 1 to the overall FreqDist and 1 to the
        # ConditionalFreqDist under each of the recording's traits.
        # NOTE(review): the overall count is incremented once per token here
        # (not once per trait) — confirm against the original's intent.
        for token in file[0]:
            for trait in file[1]:
                ocean_word_dict[trait][token] += 1
            word_dict[token] += 1

    n_xx = ocean_word_dict.N()  # total counts across all traits
    high_info_words = set()
    for condition in ocean_word_dict.conditions():
        n_xi = ocean_word_dict[condition].N()  # counts within this trait
        word_scores = defaultdict(int)
        for word, n_ii in ocean_word_dict[condition].items():
            n_ix = word_dict[word]  # counts of this token overall
            word_scores[word] = score_fn(n_ii, (n_ix, n_xi), n_xx)

        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        # BUG FIX: the original passed the sorted items through a set
        # comprehension ({k for k, v in sorted(...)}) before listing them;
        # sets are unordered, so the [:10] slice picked 10 arbitrary words
        # instead of the 10 highest-scoring ones. Preserve the sort order.
        ranked = [w for w, _ in sorted(word_scores.items(), key=lambda kv: kv[1], reverse=True)]
        high_info_words |= set(bestwords)
        hiw_categories.append((condition, ranked[:10]))

    return high_info_words, hiw_categories
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=0):
    """Return high-information words and dump per-label score tables to CSV.

    Args:
        labelled_words: iterable of (label, sentences) pairs; each sentence is
            tokenized via preProcess before counting.
        score_fn: NLTK association measure called as
            score_fn(n_ii, (n_ix, n_xi), n_xx).
        min_score: minimum score for a word to be kept.

    Returns:
        set of words scoring >= min_score for at least one label.

    Side effects:
        Writes one CSV per label ("wang2226_<i>.csv", labels in sorted order)
        with (word, score) rows sorted by descending score.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, sentences in labelled_words:
        for sent in sentences:
            for word in preProcess(sent):
                word_fd[word] += 1
                label_word_fd[label][word] += 1

    n_xx = label_word_fd.N()  # total word occurrences over all labels
    high_info_words = set()
    labelScore = []
    # Removed a block of commented-out per-label min_score overrides that all
    # set the same value (1.0) and were dead code.
    for label in sorted(label_word_fd.conditions()):
        n_xi = label_word_fd[label].N()  # total occurrences within this label
        word_scores = collections.defaultdict(int)
        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]  # occurrences of the word across all labels
            word_scores[word] = score_fn(n_ii, (n_ix, n_xi), n_xx)
        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)
        labelScore.append(word_scores)

    # enumerate replaces the original's manual `which` counter.
    for which, scores in enumerate(labelScore):
        sorted_x = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
        labelCSV = pd.DataFrame(sorted_x)
        fileName = "wang2226_%d.csv" % which
        labelCSV.to_csv(fileName, index=False, sep=',')

    return high_info_words
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.raw_freq, min_score=5):
    """Return the set of words scoring at least min_score for some label.

    Args:
        labelled_words: iterable of (label, words) pairs.
        score_fn: NLTK association measure called as
            score_fn(n_ii, (n_ix, n_xi), n_xx).
        min_score: minimum score for a word to be kept.

    Returns:
        set of high-information words across all labels.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            # FreqDist.inc() was removed in NLTK 3; increment by item
            # assignment instead (consistent with the rest of this file).
            word_fd[word] += 1
            label_word_fd[label][word] += 1

    n_xx = label_word_fd.N()  # total word occurrences over all labels
    high_info_words = set()
    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()  # total occurrences within this label
        word_scores = collections.defaultdict(int)
        # dict.iteritems() is Python 2 only; use items() on Python 3.
        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]  # occurrences of the word across all labels
            word_scores[word] = score_fn(n_ii, (n_ix, n_xi), n_xx)
        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)

    return high_info_words
def sum_category_word_scores(categorized_words, score_fn):
    """Sum each word's association score over every category it appears in.

    Args:
        categorized_words: iterable of (category, words) pairs.
        score_fn: association measure called as
            score_fn(n_ii, (n_ix, n_xi), n_xx).

    Returns:
        defaultdict mapping word -> total score accumulated across categories.
    """
    word_fd = FreqDist()
    category_word_fd = ConditionalFreqDist()
    for category, words in categorized_words:
        for word in words:
            # FreqDist.inc() was removed in NLTK 3; increment by item
            # assignment instead (consistent with the rest of this file).
            word_fd[word] += 1
            category_word_fd[category][word] += 1

    scores = collections.defaultdict(int)
    n_xx = category_word_fd.N()  # total word occurrences over all categories
    for category in category_word_fd.conditions():
        n_xi = category_word_fd[category].N()  # occurrences in this category
        # dict.iteritems() is Python 2 only; use items() on Python 3.
        for word, n_ii in category_word_fd[category].items():
            n_ix = word_fd[word]  # occurrences of the word overall
            scores[word] += score_fn(n_ii, (n_ix, n_xi), n_xx)

    return scores
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """Return the set of words carrying the most information about their labels.

    Contingency-table notation used by score_fn:
        n_ii -- frequency of the word within one label
        n_ix -- total frequency of the word across all labels
        n_xi -- total frequency of all words within the label
        n_xx -- total frequency of all words across all labels
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for token in words:
            word_fd[token] += 1
            label_word_fd[label][token] += 1

    n_xx = label_word_fd.N()
    selected = set()
    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        # Test each word against the threshold directly rather than building
        # an intermediate score table first.
        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            if score_fn(n_ii, (n_ix, n_xi), n_xx) >= min_score:
                selected.add(word)

    return selected