Example #1
def extract_doc_feats_counts(refactorized_documents):
    from nltk import FreqDist
    import itertools
    import numpy

    doc_num = len(refactorized_documents)

    ref_docs_flat = list(itertools.chain.from_iterable(refactorized_documents))
    glob_freqs = FreqDist(ref_docs_flat)

    tokens = glob_freqs.samples()
    glob_features = [None] * doc_num

    for i in range(0, doc_num):
        doc_features = [0] * len(tokens)
        doc_freqs = FreqDist(refactorized_documents[i])

        for (tok, freq) in doc_freqs.items():
            # items() yields (token, count); the raw count is scaled by the
            # document length N
            indx = tokens.index(tok)
            doc_features[indx] = freq * doc_freqs.N()

        f_tmp = numpy.asarray(doc_features)
        glob_features[i] = f_tmp.tolist()

    return (glob_features, tokens)
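# A minimal usage sketch for the function above (the toy corpus below is
# made up for illustration; any list of token lists works):
docs = [["the", "cat", "sat"],
        ["the", "dog", "sat", "sat"]]
features, vocab = extract_doc_feats_counts(docs)
print "Vocabulary:", vocab
print "Count features:", features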
Example #2
def extract_doc_feats(refactorized_documents):
    from nltk import FreqDist
    from collections import defaultdict
    import itertools
    import math
    import numpy

    doc_num = len(refactorized_documents)

    # count, for each token, the number of documents it occurs in
    occurrences = defaultdict(lambda: 0)
    for doc in refactorized_documents:
        for x in set(doc):
            occurrences[x] += 1

    ref_docs_flat = list(itertools.chain.from_iterable(refactorized_documents))
    glob_freqs = FreqDist(ref_docs_flat)

    tokens = glob_freqs.samples()
    glob_features = [None] * doc_num


    for i in range(0, doc_num):
        doc_features = [0]*len(tokens)
        doc_freqs = FreqDist(refactorized_documents[i])
        doc_len = len(refactorized_documents[i])

        for (tok, num) in doc_freqs.items():
            # augmented tf variant (unused; kept for reference):
            # max_doc_freq = doc_freqs.freq(doc_freqs.max()) * float(doc_len)
            # tf = 0.5 + (0.5 * float(num)) / float(max_doc_freq)

            # log-scaled term frequency and inverse document frequency
            tf = 1 + math.log(num, 10)
            idf = math.log(float(doc_num) / float(occurrences[tok]), 10)
            tfidf = tf * idf

            indx = tokens.index(tok)
            doc_features[indx] = tfidf

        f_tmp = numpy.asarray(doc_features)
        # L2-normalize; eps guards against dividing an all-zero vector by zero
        f_tmp = f_tmp / (numpy.linalg.norm(f_tmp) + numpy.finfo(float).eps)
        glob_features[i] = f_tmp.tolist()

    # scale the normalized tf-idf vectors by the total corpus token count
    glob_features = numpy.asarray(glob_features) * glob_freqs.N()
    print "Glob Freqs:", glob_freqs.N()

    return (glob_features, tokens)
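# A minimal usage sketch for the tf-idf extractor above (toy corpus made up
# for illustration): tokens confined to fewer documents get higher weights.
docs = [["apple", "banana", "apple"],
        ["banana", "cherry"],
        ["apple", "cherry", "cherry"]]
features, vocab = extract_doc_feats(docs)
print "Vocabulary:", vocab
print "First document's tf-idf vector:", features[0]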
Example #3
import math

from nltk import FreqDist

#
# First
#
# Here we will determine the relative frequencies of English characters in the text
# Then we will calculate the entropy of the distribution

# here we use the expression list(var_name) to turn our string into a list;
# this separates the string into individual characters, which is the form
# the FreqDist constructor expects
english_unigram_fdist = FreqDist(list(english_model_content))

english_unigram_entropy = 0.0

# now loop and get the entropy for english unigrams
for unigram in english_unigram_fdist.samples():
    p = english_unigram_fdist.freq(unigram)
    english_unigram_entropy += p * math.log(p, 2)

english_unigram_entropy = -english_unigram_entropy

print "The English Unigram Entropy is: " + str(english_unigram_entropy)


#
# Second
#
# Here we will determine the relative frequencies of English bigrams in the text
# Then we will calculate the entropy of the bigram distribution

# create a list to store bigrams in
english_model_bigrams = []
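# A sketch of how the bigram step might continue (the original excerpt ends
# here, so the code below is an assumption, not the author's): pair each
# character with its successor, then reuse the same entropy formula on the
# bigram distribution.
for i in range(len(english_model_content) - 1):
    english_model_bigrams.append(english_model_content[i:i + 2])

english_bigram_fdist = FreqDist(english_model_bigrams)
english_bigram_entropy = 0.0
for bigram in english_bigram_fdist.samples():
    p = english_bigram_fdist.freq(bigram)
    english_bigram_entropy += p * math.log(p, 2)
english_bigram_entropy = -english_bigram_entropy

print "The English Bigram Entropy is: " + str(english_bigram_entropy)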