Example #1
from nltk.probability import MLEProbDist
from wordcloud import WordCloud
import matplotlib.pyplot as plt


def plot_word_dist_as_cloud(word_dist, file_name=None, plot=False):
    # Turn an NLTK frequency distribution over word tuples into a word cloud,
    # weighting each joined phrase by its MLE probability.
    prob_dist = MLEProbDist(word_dist)
    viz_dict = {}
    for word_tuple in word_dist:
        string = ' '.join(word_tuple)
        viz_dict[string] = prob_dist.prob(word_tuple)

    wordcloud = WordCloud(max_words=100).generate_from_frequencies(viz_dict)
    if file_name is not None:
        wordcloud.to_file("img/" + file_name + ".png")

    if plot:
        plt.figure()
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()
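
# Usage sketch (illustrative, not part of the original example): build an
# NLTK FreqDist over bigram tuples and pass it to the function above. The
# sample sentence and file name are assumptions.
from nltk import FreqDist, bigrams

sample_dist = FreqDist(bigrams("the cat sat on the mat".split()))
plot_word_dist_as_cloud(sample_dist, file_name="demo", plot=True)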
import sys
import operator
from nltk.probability import ConditionalProbDist, MLEProbDist

# ReadDictFromFile, ComputeFreqDist, the ComputeWB* helpers and the global
# distributions (lemma_dict, argFD, argVbCFD, argVbRelCFD) are defined
# elsewhere in the original script.
def main():
  DEBUG = 1
  depRelFile = open(sys.argv[1], 'r')        # file with dep rel tuples
  ReadDictFromFile(sys.argv[2], lemma_dict)  # lemma file
  modelFile = open(sys.argv[3], 'w')
  if len(sys.argv) == 5:
    DEBUG = int(sys.argv[4])

  print("---Done loading lemma file---")

  print("---Computing CDF....---")
  incompletePairs = ComputeFreqDist(depRelFile)
  print("---Done computing CDF---")
  print("incomplete pairs: ", incompletePairs)
  
  if DEBUG:
    print("Info about F(arg)")
    print("unique samples: ", argFD.B())
    print("total seen samples: ", argFD.N())
    print("top arg:", argFD.max())
    print("count for support: ", argFD['support'])

    print("Info about CFD(arg|rel,vb)")
    print("unique conditions seen: ", len(argVbRelCFD.conditions()))
    print("total seen samples", argVbRelCFD.N())
    top_CFD1 = sorted(argVbRelCFD[('dobj','enjoy')].items(), key=operator.itemgetter(1), reverse=True)[:10]
    print("all dobj,enjoy: ", argVbRelCFD[('dobj','enjoy')].N())
    print("top dobj for enjoy:\n", top_CFD1)

    print("Info about CFD(arg|vb)")
    print("unique conditions seen: ", len(argVbCFD.conditions()))
    print("total seen samples", argVbCFD.N())
    top_CFD2 = sorted(argVbCFD['enjoy'].items(), key=operator.itemgetter(1), reverse=True)[:10]
    print("all enjoy: ", argVbCFD['enjoy'].N())
    print("top arg for enjoy:\n", top_CFD2)


  print "---Computing MLE PDFs....---"
  argVbRelPDF = ConditionalProbDist(argVbRelCFD,MLEProbDist)
  argVbPDF = ConditionalProbDist(argVbCFD,MLEProbDist)
  argPDF = MLEProbDist(argFD)


  # I'm not sure whether "types" here is equivalent to argVbRelCFD.conditions()
  # or to the number of unique condition+arg pairs.
  # The Witten-Bell lambda is per history: for P(a|v), T = the count of unique
  # (v, a) pairs starting with v, i.e. for each condition v, CFD[v].B() gives
  # how many unique arguments were seen after that condition.
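
  # A minimal sketch (not the author's ComputeWB* helpers) of the per-history
  # Witten-Bell backoff weight the comment above describes:
  # backoff(v) = T(v) / (N(v) + T(v)), where N(v) is the total number of
  # samples seen after history v and T(v) the number of unique continuations.
  def wb_backoff_weight(cfd, condition):
    n = cfd[condition].N()  # total samples after this history
    t = cfd[condition].B()  # unique continuations after this history
    return float(t) / (n + t)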
   

  print "---Computing Witten-Bell smoothed PDFs....---"

  #for unseen pairs we multiply the backoff_weight with the probability of the backoff model
  #e.g. if  c(rel,vb,arg)=0 and c(vb,arg)>0 then P(arg|rel,vb)=argRelVbPDFWB_backoff_weights[(rel,vb)] * argVbPDFWB[vb].prob(arg)
  argPDFWB, backoff_uniform = ComputeWBArg(argPDF)
  argVbPDFWB, argVbPDFWB_backoff_weights,  countArgVB = ComputeWBVbArg(argVbPDF,argPDFWB)
  argRelVbPDFWB,argRelVbPDFWB_backoff_weights, countRelVbArg = ComputeWBRelVbArg(argVbRelPDF,argVbPDFWB)
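
  # Sketch (an assumption mirroring the comment above, not code from the
  # original script) of scoring a possibly-unseen (rel, vb, arg) triple:
  def backed_off_prob(rel, vb, arg):
    seen = argRelVbPDFWB.get((rel, vb), {})
    if arg in seen:
      return seen[arg]
    # back off to P_WB(arg|vb); a full model would back off again to P_WB(arg)
    return argRelVbPDFWB_backoff_weights[(rel, vb)] * argVbPDFWB[vb].get(arg, 0.0)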


  if DEBUG:
    print("P(support|dobj,enjoy)")
    print(argVbRelPDF[('dobj','enjoy')].prob('support'))
    print(argRelVbPDFWB[('dobj','enjoy')]['support'])
    print("Number of unique args following (dobj,enjoy):", argVbRelCFD[('dobj','enjoy')].B())
    print("P(support|enjoy)")
    print(argVbPDF['enjoy'].prob('support'))
    print(argVbPDFWB['enjoy']['support'])
    print("P(support)")
    print(argPDF.prob('support'))
    print(argPDFWB['support'])

  WriteToArpaFormat(modelFile, len(argPDFWB), countArgVB, countRelVbArg, argPDFWB, argVbPDFWB, argRelVbPDFWB, backoff_uniform, argVbPDFWB_backoff_weights, argRelVbPDFWB_backoff_weights)

  if DEBUG:
    #print(sorted(argVbPDFWB['enjoy'].items(), key=operator.itemgetter(1), reverse=True)[:5])

    # Sanity check: each smoothed distribution should still sum to ~1.
    for condition in list(argVbPDFWB)[:10]:
      sum1 = 0
      sum2 = 0
      for prob in argVbPDFWB[condition].values():
        sum1 += prob
      for arg in argVbCFD[condition].items():
        sum2 += argVbPDF[condition].prob(arg[0])
      print("total prob: ", sum1, sum2)

    print("P_WB(support|dobj, enjoy)")
    print(argRelVbPDFWB[('dobj','enjoy')]['support'])

    for condition in list(argRelVbPDFWB)[:10]:
      total = 0  # avoid shadowing the built-in sum()
      for prob in argRelVbPDFWB[condition].values():
        total += prob
      print("total prob: ", total)
Example #3
print("tokenized text: ", tokenized_text, "\n")

tokenized_text = nltk.word_tokenize(inputFile)
tokenized_text = [word.lower() for word in tokenized_text if word.isalpha()]
print("Lower cased text: ", tokenized_text)
print("Word Count: ", len(tokenized_text), "\n")

freq_dist_uni = nltk.FreqDist(tokenized_text)
print("Most common 10 unigrams: ", freq_dist_uni.most_common(10), "\n",
      "least common 3 words: ",
      freq_dist_uni.most_common()[-3:], "\n")

# MLE probabilities for the ten most common words. Looking each word up
# directly avoids relying on the order of prob_dist_uni.samples(), which
# need not match most_common() order.
prob_dist_uni = MLEProbDist(freq_dist_uni)
for word, count in freq_dist_uni.most_common(10):
    print((word, count), prob_dist_uni.prob(word))

# Expected-likelihood (add-0.5) estimates for the same words.
elep = ELEProbDist(freq_dist_uni)
for word, count in freq_dist_uni.most_common(10):
    print((word, count), elep.prob(word), "\n")

uniqueWords = len(set(tokenized_text))
print("Unique Words: ", uniqueWords, "\n")
from nltk.tokenize import WhitespaceTokenizer
from nltk.probability import FreqDist, MLEProbDist

# Build a frequency distribution over whitespace-separated tokens.
text = open('dados/may2001_pdf.torto').read()
tokens = WhitespaceTokenizer().tokenize(text)
freq_dist = FreqDist(tokens)

prob_dist = MLEProbDist(freq_dist)

# P(x) = freq(x)
prob_dist.prob('the')
freq_dist.freq('the')
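
# Sanity check: the MLE probability is by definition the relative frequency,
# so these two values agree exactly.
assert prob_dist.prob('the') == freq_dist.freq('the')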

#
# Estimating the probability distribution for roll2
#

import random
from nltk.probability import FreqDist, MLEProbDist

def roll2():
    # Sum of two independent fair six-sided dice.
    return random.choice([1, 2, 3, 4, 5, 6]) + random.choice([1, 2, 3, 4, 5, 6])
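
# The snippet ends before the estimation itself; a minimal sketch of it,
# assuming 5000 samples (the sample size is not from the original): count
# the outcomes and fit an MLE distribution over them.
rolls = FreqDist(roll2() for _ in range(5000))
roll2_dist = MLEProbDist(rolls)
for outcome in sorted(rolls):
    print(outcome, roll2_dist.prob(outcome))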
Example #5
import math
import numpy as np
from nltk.probability import FreqDist, MLEProbDist

# OrderedCounter, counter, sorted_counts, godel_f and k are defined earlier
# in the original script; this fragment starts mid-file.
counter_class = OrderedCounter(counter.keys())

# Array of the classes: one entry per observation, repeating each count
classes = [count for n, count in counter.items() for i in range(count)]

# Variables (X,Y) for the machine learning algorithms
Y = np.array(classes).T

del classes
del counter

# Calculate entropy with the nltk library
freq_dist = FreqDist(sorted_counts)
prob_dist = MLEProbDist(freq_dist)
px = [prob_dist.prob(x) for x, n_x in sorted_counts.items()]
e_x = [-p_x * math.log(p_x, 2) for p_x in px]
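
# The Shannon entropy of the distribution is the sum of the per-symbol
# terms computed above.
entropy = sum(e_x)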

del freq_dist
del prob_dist
del px

# Calculate the prime numbers for the Godel numbers
prime_numbers = godel_f.sieve(k)

# Calculate Godel Numbers
godel_numbers = {}
godel_numbers = godel_f.godel(sorted_counts, prime_numbers, godel_numbers)
del prime_numbers
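
# godel_f is the author's own module; a minimal sketch of what its sieve(k)
# might look like, assuming it returns the first k primes (trial division
# against the primes found so far):
def sieve_sketch(k):
    primes = []
    candidate = 2
    while len(primes) < k:
        if all(candidate % p for p in primes):
            primes.append(candidate)
        candidate += 1
    return primes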