def extract_doc_feats_counts(refactorized_documents):
    from nltk import FreqDist
    import itertools
    import numpy

    doc_num = len(refactorized_documents)

    # build the global vocabulary from all documents
    ref_docs_flat = list(itertools.chain.from_iterable(refactorized_documents))
    glob_freqs = FreqDist(ref_docs_flat)
    tokens = glob_freqs.samples()

    glob_features = [None] * doc_num
    for i in range(0, doc_num):
        doc_features = [0] * len(tokens)
        doc_freqs = FreqDist(refactorized_documents[i])
        for (tok, freq) in doc_freqs.items():
            indx = tokens.index(tok)
            # raw term count, scaled by the document's token count (N)
            doc_features[indx] = freq * doc_freqs.N()
        f_tmp = numpy.asarray(doc_features)
        glob_features[i] = f_tmp.tolist()
    return (glob_features, tokens)
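# Usage sketch (hedged): toy_docs below is a made-up corpus for illustration,
# not part of the original code; extract_doc_feats_counts expects a list of
# already-tokenized documents (lists of tokens).
toy_docs = [['the', 'cat', 'sat'], ['the', 'dog', 'sat', 'sat']]
toy_counts, toy_vocab = extract_doc_feats_counts(toy_docs)
# toy_counts[i][j] holds the count of toy_vocab[j] in document i,
# scaled by that document's length
print "Vocab:", toy_vocab
print "Counts:", toy_counts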
def extract_doc_feats(refactorized_documents):
    from nltk import FreqDist
    from collections import defaultdict
    import itertools
    import math
    import numpy

    doc_num = len(refactorized_documents)

    # document frequency: the number of documents each token occurs in
    occurences = defaultdict(lambda: 0)
    for doc in refactorized_documents:
        for x in set(doc):
            occurences[x] += 1

    # build the global vocabulary from all documents
    ref_docs_flat = list(itertools.chain.from_iterable(refactorized_documents))
    glob_freqs = FreqDist(ref_docs_flat)
    tokens = glob_freqs.samples()

    glob_features = [None] * doc_num
    for i in range(0, doc_num):
        doc_features = [0] * len(tokens)
        doc_freqs = FreqDist(refactorized_documents[i])
        doc_len = len(refactorized_documents[i])
        for (tok, num) in doc_freqs.items():
            # augmented tf variant, kept for reference:
            # max_doc_freq = doc_freqs.freq(doc_freqs.max()) * float(doc_len)
            # tf = 0.5 + (0.5 * float(num)) / float(max_doc_freq)
            tf = 1 + math.log(num, 10)                                   # log-scaled term frequency
            idf = math.log(float(doc_num) / float(occurences[tok]), 10)  # inverse document frequency
            tfidf = tf * idf
            indx = tokens.index(tok)
            doc_features[indx] = tfidf
        # L2-normalize the feature vector (eps guards against division by zero)
        f_tmp = numpy.asarray(doc_features)
        f_tmp = f_tmp / (numpy.linalg.norm(f_tmp) + numpy.finfo(float).eps)
        glob_features[i] = f_tmp.tolist()
    # scale all vectors by the total corpus token count
    glob_features = numpy.asarray(glob_features) * glob_freqs.N()
    print "Glob Freqs:", glob_freqs.N()
    return (glob_features, tokens)
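# Usage sketch (hedged): the same kind of toy corpus as above, now producing
# L2-normalized tf-idf vectors. Note that a token occurring in every document
# gets idf = log(doc_num / doc_num) = 0, and therefore weight 0.
toy_docs = [['the', 'cat', 'sat'], ['the', 'dog', 'sat', 'sat']]
toy_tfidf, toy_vocab = extract_doc_feats(toy_docs)
print "tf-idf features:", toy_tfidf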
#
# First
#
# Here we will determine the relative frequencies of English characters in the
# text, then calculate the entropy of the distribution.

# we use list(var_name) to turn our string into a list; this separates the
# string into individual characters so that it can be fed directly to FreqDist
english_unigram_fdist = FreqDist(list(english_model_content))
english_unigram_entropy = 0.0

# now loop and accumulate the entropy for english unigrams:
# H = -sum over x of p(x) * log2(p(x))
for unigram in english_unigram_fdist.samples():
    english_unigram_entropy += english_unigram_fdist.freq(unigram) * \
        math.log(english_unigram_fdist.freq(unigram), 2)
english_unigram_entropy = -english_unigram_entropy

print "The English Unigram Entropy is: " + str(english_unigram_entropy)

#
# Second
#
# Here we will determine the relative frequencies of English bigrams in the
# text, then calculate the entropy of the bigram distribution.

# create a list to store bigrams in
english_model_bigrams = []
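# Sketch (assumption): one plausible way to fill english_model_bigrams and
# compute the bigram entropy, mirroring the unigram loop above. Pairing
# adjacent characters via string slicing is an assumption for illustration,
# not necessarily how the original code proceeds.
for i in range(len(english_model_content) - 1):
    english_model_bigrams.append(english_model_content[i:i + 2])
english_bigram_fdist = FreqDist(english_model_bigrams)
english_bigram_entropy = 0.0
for bigram in english_bigram_fdist.samples():
    english_bigram_entropy += english_bigram_fdist.freq(bigram) * \
        math.log(english_bigram_fdist.freq(bigram), 2)
english_bigram_entropy = -english_bigram_entropy
print "The English Bigram Entropy is: " + str(english_bigram_entropy)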