Example #1
def extract_doc_feats_counts(refactorized_documents):
    from nltk import FreqDist
    import itertools
    import numpy

    doc_num = len(refactorized_documents)

    ref_docs_flat = list(itertools.chain.from_iterable(refactorized_documents))
    glob_freqs = FreqDist(ref_docs_flat)

    tokens = glob_freqs.samples()
    glob_features = [None] * doc_num

    for i in range(0, doc_num):
        doc_features = [0] * len(tokens)
        doc_freqs = FreqDist(refactorized_documents[i])

        for (tok, freq) in doc_freqs.items():
            # items() yields (token, count); the raw count is scaled by the
            # document length N
            indx = tokens.index(tok)
            doc_features[indx] = freq * doc_freqs.N()

        f_tmp = numpy.asarray(doc_features)
        glob_features[i] = f_tmp.tolist()

    return (glob_features, tokens)
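# A minimal usage sketch for the function above (the toy corpus below is
# made up for illustration; any list of token lists works):
docs = [["the", "cat", "sat"],
        ["the", "dog", "sat", "sat"]]
features, vocab = extract_doc_feats_counts(docs)
print "Vocabulary:", vocab
print "Count features:", features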
Example #2
def extract_doc_feats(refactorized_documents):
    from nltk import FreqDist
    from collections import defaultdict
    import itertools
    import math
    import numpy

    doc_num = len(refactorized_documents)

    # count, for each token, the number of documents it occurs in
    occurrences = defaultdict(lambda: 0)
    for doc in refactorized_documents:
        for x in set(doc):
            occurrences[x] += 1

    ref_docs_flat = list(itertools.chain.from_iterable(refactorized_documents))
    glob_freqs = FreqDist(ref_docs_flat)

    tokens = glob_freqs.samples()
    glob_features = [None] * doc_num


    for i in range(0, doc_num):
        doc_features = [0]*len(tokens)
        doc_freqs = FreqDist(refactorized_documents[i])
        doc_len = len(refactorized_documents[i])

        for (tok, num) in doc_freqs.items():
            # augmented tf variant (unused; kept for reference):
            # max_doc_freq = doc_freqs.freq(doc_freqs.max()) * float(doc_len)
            # tf = 0.5 + (0.5 * float(num)) / float(max_doc_freq)

            # log-scaled term frequency and inverse document frequency
            tf = 1 + math.log(num, 10)
            idf = math.log(float(doc_num) / float(occurrences[tok]), 10)
            tfidf = tf * idf

            indx = tokens.index(tok)
            doc_features[indx] = tfidf

        f_tmp = numpy.asarray(doc_features)
        # L2-normalize; eps guards against dividing an all-zero vector by zero
        f_tmp = f_tmp / (numpy.linalg.norm(f_tmp) + numpy.finfo(float).eps)
        glob_features[i] = f_tmp.tolist()

    # scale the normalized tf-idf vectors by the total corpus token count
    glob_features = numpy.asarray(glob_features) * glob_freqs.N()
    print "Glob Freqs:", glob_freqs.N()

    return (glob_features, tokens)
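# A minimal usage sketch for the tf-idf extractor above (toy corpus made up
# for illustration): tokens confined to fewer documents get higher weights.
docs = [["apple", "banana", "apple"],
        ["banana", "cherry"],
        ["apple", "cherry", "cherry"]]
features, vocab = extract_doc_feats(docs)
print "Vocabulary:", vocab
print "First document's tf-idf vector:", features[0]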
Example #3
import math

from nltk import FreqDist

#
# First
#
# Here we will determine the relative frequencies of English characters in the text
# Then we will calculate the entropy of the distribution

# here we use the expression list(var_name) to turn our string into a list;
# this separates the string into individual characters, which is the form
# the FreqDist constructor expects
english_unigram_fdist = FreqDist(list(english_model_content))

english_unigram_entropy = 0.0

# now loop and get the entropy for english unigrams
for unigram in english_unigram_fdist.samples():
    p = english_unigram_fdist.freq(unigram)
    english_unigram_entropy += p * math.log(p, 2)

english_unigram_entropy = -english_unigram_entropy

print "The English Unigram Entropy is: " + str(english_unigram_entropy)


#
# Second
#
# Here we will determine the relative frequencies of English bigrams in the text
# Then we will calculate the entropy of the bigram distribution

# create a list to store bigrams in
english_model_bigrams = []
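# A sketch of how the bigram step might continue (the original excerpt ends
# here, so the code below is an assumption, not the author's): pair each
# character with its successor, then reuse the same entropy formula on the
# bigram distribution.
for i in range(len(english_model_content) - 1):
    english_model_bigrams.append(english_model_content[i:i + 2])

english_bigram_fdist = FreqDist(english_model_bigrams)
english_bigram_entropy = 0.0
for bigram in english_bigram_fdist.samples():
    p = english_bigram_fdist.freq(bigram)
    english_bigram_entropy += p * math.log(p, 2)
english_bigram_entropy = -english_bigram_entropy

print "The English Bigram Entropy is: " + str(english_bigram_entropy)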