Example #1
0
def getMeTheNGramModel(tokens, n):
   """Count the n-grams in *tokens* and pass the counts through relativizeFP.

   Every run of ``n`` consecutive tokens is joined with single spaces and
   used as a dictionary key; the value is the number of times that run
   occurs. The dictionary is then handed to ``relativizeFP`` (defined
   elsewhere; presumably it rescales the counts in place -- confirm) and
   returned.

   Args:
      tokens: Sequence of string tokens in text order. It must support
         ``len()`` and slicing.
      n: Number of tokens per n-gram; must be at least 1.

   Returns:
      The dictionary keyed by space-joined n-gram, after ``relativizeFP``
      has been called on it. It is empty when ``tokens`` holds fewer than
      ``n`` items.

   Raises:
      ValueError: If ``n`` is smaller than 1.
   """
   if n < 1:
      raise ValueError("n must be at least 1")

   mydict = {}
   # Walk the start positions with range() instead of slicing
   # tokens[0:-(n-1)]: for n == 1 that slice is tokens[0:0], which is
   # empty and would silently produce no unigrams at all.
   for position in range(len(tokens) - n + 1):
      ngram = " ".join(tokens[position:position + n])
      mydict[ngram] = mydict.get(ngram, 0) + 1
   relativizeFP(mydict)
   return mydict
Example #2
0
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

from math import log


#import corpus
from corpus import getTextFromFile, makeFrequencyProfile, tokenize, relativizeFP

def _relative_profile(path):
   """Build the frequency profile of the text file at *path* and run relativizeFP on it."""
   profile = makeFrequencyProfile(tokenize(getTextFromFile(path)))
   relativizeFP(profile)
   return profile


# Fallback value used whenever a token is absent from a profile.
UNSEEN_PROBABILITY = 0.00000000000001

mydict = _relative_profile("pg873.txt")

# Debug aid: dump the profile as tab-separated key/value pairs.
#for key in mydict:
#   print(key, mydict[key], sep="\t")

mysportsdict = _relative_profile("sports.txt")

# Text whose origin is to be guessed.
unktokens = tokenize("""
The young King was eating pomegranates and talking about his soul and other emotional issues.
""")

# Accumulate, per profile, the sum of the log values looked up for each token.
probpomeg = 0.0
probsports = 0.0
for token in unktokens:
   probpomeg = probpomeg + log(mydict.get(token, UNSEEN_PROBABILITY))
   probsports = probsports + log(mysportsdict.get(token, UNSEEN_PROBABILITY))

if probpomeg > probsports:
   print("This text is probably House of Pomeg.")
else:
Example #3
0
# Counts of the two words and of the bigram they form.
# NOTE(review): assumes unigrams/bigrams still hold raw counts at this point
# (relativizeFP is only applied further down) -- confirm against the caller.
youngF = unigrams["young"]
FishermanF = unigrams["Fisherman"]
youngFishermanF = bigrams["young Fisherman"]
total = sum(unigrams.values())

# 2x2 table of observed counts:
#   [0][0] the bigram itself      [0][1] "Fisherman" outside the bigram
#   [1][0] "young" outside it     [1][1] all remaining tokens
observationtable = [[youngFishermanF, FishermanF - youngFishermanF],
                    [
                        youngF - youngFishermanF,
                        (total - youngF) - (FishermanF - youngFishermanF)
                    ]]
print(observationtable)

# Expected count of every cell: row total * column total / grand total.
# All four cells are filled in so the statistic below can cover the whole
# table instead of only the top-left cell.
rowtotals = [sum(row) for row in observationtable]
columntotals = [sum(column) for column in zip(*observationtable)]
expectationtable = [[(rowtotal * columntotal) / total
                     for columntotal in columntotals]
                    for rowtotal in rowtotals]
print(expectationtable)

# Pearson chi-square: sum of (observed - expected)^2 / expected over all cells.
chi2 = 0.0
for observedrow, expectedrow in zip(observationtable, expectationtable):
    for observed, expected in zip(observedrow, expectedrow):
        # A zero expectation means an empty row or column; skip the cell
        # rather than divide by zero.
        if expected:
            chi2 += (observed - expected)**2 / expected
print("Chi2-score:", chi2)

# NOTE(review): relativizeFP is defined elsewhere; the values read below are
# labelled as probabilities, so it presumably rescales both dictionaries in
# place -- confirm.
relativizeFP(unigrams)
relativizeFP(bigrams)

print("\n\nProbability ratio:")
print("P(young Fisherman):", bigrams["young Fisherman"])
print("P(young) * P(Fisherman):", unigrams["young"] * unigrams["Fisherman"])
print("P(young Fisherman) / ( P(young) * P(Fisherman) ):",
      bigrams["young Fisherman"] / (unigrams["young"] * unigrams["Fisherman"]))
Example #4
0
#print(unigrams)

# prettyPrintFRP(bigrams, myreverse=False)

# Counts of the two words and of the bigram they form.
# NOTE(review): assumes unigrams/bigrams still hold raw counts at this point
# (relativizeFP is only applied further down) -- confirm against the caller.
youngF = unigrams["young"]
FishermanF = unigrams["Fisherman"]
youngFishermanF = bigrams["young Fisherman"]
total = sum(unigrams.values())

# 2x2 table of observed counts:
#   [0][0] the bigram itself      [0][1] "Fisherman" outside the bigram
#   [1][0] "young" outside it     [1][1] all remaining tokens
observationtable = [ [ youngFishermanF,          FishermanF - youngFishermanF ],
                     [ youngF - youngFishermanF, (total - youngF) - (FishermanF - youngFishermanF)  ] ]
print(observationtable)

# Expected count of every cell: row total * column total / grand total.
# All four cells are filled in so the statistic below can cover the whole
# table instead of only the top-left cell.
rowtotals = [ sum(row) for row in observationtable ]
columntotals = [ sum(column) for column in zip(*observationtable) ]
expectationtable = [ [ (rowtotal * columntotal) / total for columntotal in columntotals ]
                     for rowtotal in rowtotals ]
print(expectationtable)


# Pearson chi-square: sum of (observed - expected)^2 / expected over all cells.
chi2 = 0.0
for observedrow, expectedrow in zip(observationtable, expectationtable):
   for observed, expected in zip(observedrow, expectedrow):
      # A zero expectation means an empty row or column; skip the cell
      # rather than divide by zero.
      if expected:
         chi2 += (observed - expected) ** 2 / expected
print("Chi2-score:", chi2)

# NOTE(review): relativizeFP is defined elsewhere; the values read below are
# labelled as probabilities, so it presumably rescales both dictionaries in
# place -- confirm.
relativizeFP(unigrams)
relativizeFP(bigrams)

print("\n\nProbability ratio:")
print("P(young Fisherman):", bigrams["young Fisherman"])
print("P(young) * P(Fisherman):", unigrams["young"] * unigrams["Fisherman"])
print("P(young Fisherman) / ( P(young) * P(Fisherman) ):",  bigrams["young Fisherman"] / (unigrams["young"] * unigrams["Fisherman"]))