from nltk.probability import MLEProbDist
from wordcloud import WordCloud
import matplotlib.pyplot as plt


def plot_word_dist_as_cloud(word_dist, file_name=None, plot=False):
    # Turn a frequency distribution over word tuples into an MLE probability
    # distribution, then render the probabilities as a word cloud.
    prob_dist = MLEProbDist(word_dist)
    viz_dict = {}
    for word_tuple in word_dist:
        string = ' '.join(word_tuple)
        viz_dict[string] = prob_dist.prob(word_tuple)
    wordcloud = WordCloud(max_words=100).generate_from_frequencies(viz_dict)
    if file_name is not None:
        wordcloud.to_file("img/" + file_name + ".png")
    if plot:
        plt.figure()
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()
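# A minimal usage sketch, not part of the original file: it assumes a small
# bigram FreqDist built with nltk.bigrams over a made-up corpus. Note that
# saving requires an existing img/ directory.
from nltk import FreqDist, bigrams

words = "the cat sat on the mat the cat slept".split()
bigram_dist = FreqDist(bigrams(words))
plot_word_dist_as_cloud(bigram_dist, file_name="bigram_cloud", plot=True)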
def main():
    DEBUG = 1
    depRelFile = open(sys.argv[1], 'r')        # file with dep rel tuples
    ReadDictFromFile(sys.argv[2], lemma_dict)  # lemma file
    modelFile = open(sys.argv[3], 'w')
    if len(sys.argv) == 5:
        DEBUG = int(sys.argv[4])
    print "---Done loading lemma file---"
    print "---Computing CDF....---"
    incompletePairs = ComputeFreqDist(depRelFile)
    print "---Done computing CDF---"
    print "incomplete pairs: ", incompletePairs
    if DEBUG:
        print "Info about F(arg)"
        print "unique samples: ", argFD.B()
        print "total seen samples: ", argFD.N()
        print "top arg:", argFD.max()
        print "count for support: ", argFD['support']
        print "Info about CFD(arg|rel,vb)"
        print "unique conditions seen: ", len(argVbRelCFD.conditions())
        print "total seen samples", argVbRelCFD.N()
        top_CFD1 = sorted(argVbRelCFD[('dobj', 'enjoy')].items(),
                          key=operator.itemgetter(1), reverse=True)[:10]
        print "all dobj,enjoy: ", argVbRelCFD[('dobj', 'enjoy')].N()
        print "top dobj for enjoy:\n", top_CFD1
        print "Info about CFD(arg|vb)"
        print "unique conditions seen: ", len(argVbCFD.conditions())
        print "total seen samples", argVbCFD.N()
        top_CFD2 = sorted(argVbCFD['enjoy'].items(),
                          key=operator.itemgetter(1), reverse=True)[:10]
        print "all enjoy: ", argVbCFD['enjoy'].N()
        print "top arg for enjoy:\n", top_CFD2
    print "---Computing MLE PDFs....---"
    argVbRelPDF = ConditionalProbDist(argVbRelCFD, MLEProbDist)
    argVbPDF = ConditionalProbDist(argVbCFD, MLEProbDist)
    argPDF = MLEProbDist(argFD)
    # I'm not sure whether T here should be len(argVbRelCFD.conditions()) or
    # the number of unique condition+arg pairs.
    # !!!!! lambda must be computed per history: for P(a|v), T = the count of
    # unique (v, a) pairs starting with v, i.e. for each condition v,
    # CFD[v].B() gives how many unique arguments were seen after it.
    print "---Computing Witten-Bell smoothed PDFs....---"
    # For unseen pairs we multiply the backoff weight by the probability from
    # the backoff model, e.g. if c(rel,vb,arg)=0 and c(vb,arg)>0 then
    # P(arg|rel,vb) = argRelVbPDFWB_backoff_weights[(rel,vb)] * argVbPDFWB[vb].prob(arg)
    argPDFWB, backoff_uniform = ComputeWBArg(argPDF)
    argVbPDFWB, argVbPDFWB_backoff_weights, countArgVB = ComputeWBVbArg(argVbPDF, argPDFWB)
    argRelVbPDFWB, argRelVbPDFWB_backoff_weights, countRelVbArg = ComputeWBRelVbArg(argVbRelPDF, argVbPDFWB)
    if DEBUG:
        print "P(support|dobj,enjoy)"
        print argVbRelPDF[('dobj', 'enjoy')].prob('support')
        print argRelVbPDFWB[('dobj', 'enjoy')]['support']
        print "No. of args following (dobj,enjoy)", argVbRelCFD[('dobj', 'enjoy')].B()
        print "P(support|enjoy)"
        print argVbPDF['enjoy'].prob('support')
        print argVbPDFWB['enjoy']['support']
        print "P(support)"
        print argPDF.prob('support')
        print argPDFWB['support']
    WriteToArpaFormat(modelFile, len(argPDFWB), countArgVB, countRelVbArg,
                      argPDFWB, argVbPDFWB, argRelVbPDFWB,
                      backoff_uniform, argVbPDFWB_backoff_weights,
                      argRelVbPDFWB_backoff_weights)
    if DEBUG:
        # Sanity check: each smoothed conditional distribution should sum to 1.
        for condition in argVbPDFWB.keys()[:10]:
            sum1 = 0
            sum2 = 0
            for prob in argVbPDFWB[condition].values():
                sum1 += prob
            for arg in argVbCFD[condition].items():
                sum2 += argVbPDF[condition].prob(arg[0])
            print "total prob: ", sum1, sum2
        print "P_WB(support|dobj, enjoy)"
        print argRelVbPDFWB[('dobj', 'enjoy')]['support']
        for condition in argRelVbPDFWB.keys()[:10]:
            total = 0
            for prob in argRelVbPDFWB[condition].values():
                total += prob
            print "total prob: ", total
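# Hedged sketch, NOT the original ComputeWBVbArg: one way to implement the
# per-history Witten-Bell interpolation described in the comments above.
# For a history v with N(v) tokens and T(v) = CFD[v].B() unique continuations,
# mass T(v)/(N(v)+T(v)) is reserved for the backoff model:
#   P_WB(a|v) = c(v,a)/(N(v)+T(v)) + T(v)/(N(v)+T(v)) * P_backoff(a)
# For an unseen arg the first term vanishes, leaving exactly the
# backoff_weight * backoff probability product the main() comment describes.
def witten_bell_smooth(history_fd, backoff_prob):
    """history_fd: an nltk FreqDist of args seen after one history.
    backoff_prob: a function arg -> probability under the backoff model."""
    n = history_fd.N()   # total tokens seen after this history
    t = history_fd.B()   # unique args seen after this history
    backoff_weight = float(t) / (n + t)
    smoothed = {}
    for arg, count in history_fd.items():
        smoothed[arg] = float(count) / (n + t) + backoff_weight * backoff_prob(arg)
    return smoothed, backoff_weight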
print("tokenized text: ", tokenized_text, "\n") tokenized_text = nltk.word_tokenize(inputFile) tokenized_text = [word.lower() for word in tokenized_text if word.isalpha()] print("Lower cased text: ", tokenized_text) print("Word Count: ", len(tokenized_text), "\n") freq_dist_uni = nltk.FreqDist(tokenized_text) print("Most common 10 unigram: ", freq_dist_uni.most_common(10), "\n", "least common 3 words: ", freq_dist_uni.most_common()[-3:], "\n") prob_distArray = [] prob_dist_uni = MLEProbDist(freq_dist_uni) for s in prob_dist_uni.samples(): prob_distArray.append(prob_dist_uni.prob(s)) i = 0 for lim in freq_dist_uni.most_common(10): print(lim, prob_distArray[i]) i += 1 elep = ELEProbDist(freq_dist_uni) for s in elep.samples(): prob_distArray.append(elep.prob(s)) i = 0 for lim in freq_dist_uni.most_common(10): print(lim, prob_distArray[i], "\n") i += 1 uniqueWords = len(set(tokenized_text)) print("Unique Words: ", uniqueWords, "\n")
# NOTE: this snippet uses the legacy NLTK 1.x token API (Token, SUBTOKENS,
# FreqDist.inc); it will not run on modern NLTK releases.
from nltk.token import Token
from nltk.tokenizer import WhitespaceTokenizer
from nltk.probability import FreqDist
from nltk.probability import MLEProbDist
from nltk.draw.plot import Plot

freq_dist = FreqDist()
corpus = Token(TEXT=open('dados/may2001_pdf.torto').read())
WhitespaceTokenizer().tokenize(corpus)
for token in corpus['SUBTOKENS']:
    freq_dist.inc(token['TEXT'])

prob_dist = MLEProbDist(freq_dist)
prob_dist.prob('the')   # P('the') under the MLE estimate
freq_dist.freq('the')   # same value: P(x) = freq(x) = count(x) / N

#
# Estimating the probability distribution for roll2
#
import random

def roll2():
    # Sum of two independent fair six-sided dice.
    return (random.choice([1, 2, 3, 4, 5, 6]) +
            random.choice([1, 2, 3, 4, 5, 6]))
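# A hedged completion of the roll2 experiment (the original snippet stops at
# the function definition). Written against the modern NLTK API, unlike the
# legacy code above: sample the dice sum repeatedly, count outcomes with a
# FreqDist, and compare the MLE estimate of P(7) with the true value 6/36.
from nltk.probability import FreqDist, MLEProbDist

samples = FreqDist(roll2() for _ in range(10000))
prob_dist = MLEProbDist(samples)
print(prob_dist.prob(7))   # should be close to 6/36 ≈ 0.1667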
counter_class = OrderedCounter(counter.keys())

# Array of the classes: each count value is repeated `count` times.
classes = [count for n, count in counter.items() for i in range(count)]

# Variables (X, Y) for the machine learning algorithms
Y = np.array(classes).T
del classes
del counter

# Calculate the per-symbol entropy terms with the nltk library:
# e_x[i] = -p(x_i) * log2 p(x_i)
freq_dist = FreqDist(sorted_counts)
prob_dist = MLEProbDist(freq_dist)
px = [prob_dist.prob(x) for x, n_x in sorted_counts.items()]
e_x = [-p_x * math.log(p_x, 2) for p_x in px]
del freq_dist
del prob_dist
del px

# Calculate the prime numbers for the Godel numbers
prime_numbers = godel_f.sieve(k)

# Calculate the Godel numbers
godel_numbers = {}
godel_numbers = godel_f.godel(sorted_counts, prime_numbers, godel_numbers)
del prime_numbers
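# Hedged sketch: the list e_x above holds the per-symbol terms of the Shannon
# entropy H(X) = -sum_x p(x) * log2 p(x), so summing them yields the entropy
# of the distribution. The tiny counts dict below is made up for illustration.
import math
from collections import Counter
from nltk.probability import FreqDist, MLEProbDist

counts = Counter({'a': 2, 'b': 1, 'c': 1})
prob_dist = MLEProbDist(FreqDist(counts))
entropy = -sum(prob_dist.prob(x) * math.log(prob_dist.prob(x), 2) for x in counts)
print(entropy)   # 1.5 bits for p = (0.5, 0.25, 0.25)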