def getMeTheNGramModel(tokens, n):
    """Build an n-gram frequency profile from a sequence of tokens.

    Every window of ``n`` consecutive tokens is joined with single spaces
    and counted. The resulting dictionary is then handed to
    ``relativizeFP`` (defined outside this block; presumably it turns the
    counts into relative frequencies in place -- confirm there) and
    returned.

    Args:
        tokens: A sliceable sequence of string tokens.
        n: Size of the n-grams; must be at least 1.

    Returns:
        The dictionary mapping each space-joined n-gram to its value after
        ``relativizeFP`` has processed it.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    mydict = {}
    # Iterate over window start positions directly. Slicing the token list
    # with -(n - 1) breaks for n == 1, because -(0) == 0 yields an empty
    # slice and therefore an empty unigram model.
    for position in range(len(tokens) - n + 1):
        ngram = " ".join(tokens[position:position + n])
        mydict[ngram] = mydict.get(ngram, 0) + 1

    relativizeFP(mydict)
    return mydict
#!/usr/bin/env python3 # -*- coding: UTF-8 -*- from math import log #import corpus from corpus import getTextFromFile, makeFrequencyProfile, tokenize, relativizeFP mydict = makeFrequencyProfile( tokenize( getTextFromFile("pg873.txt") ) ) relativizeFP(mydict) #for key in mydict: # print(key, mydict[key], sep="\t") mysportsdict = makeFrequencyProfile( tokenize( getTextFromFile("sports.txt") ) ) relativizeFP(mysportsdict) unktokens = tokenize(""" The young King was eating pomegranates and talking about his soul and other emotional issues. """) probpomeg = 0.0 probsports = 0.0 for token in unktokens: probpomeg += log(mydict.get(token, 0.00000000000001)) probsports += log(mysportsdict.get(token, 0.00000000000001)) if probpomeg > probsports: print("This text is probably House of Pomeg.") else:
# Collocation check for the bigram "young Fisherman": a chi-square score on
# the raw counts, followed by a probability ratio on the relativized profiles.

# Counts of the two words and of the bigram they form.
youngF = unigrams["young"]
FishermanF = unigrams["Fisherman"]
youngFishermanF = bigrams["young Fisherman"]
# Sum of all unigram counts; serves as the grand total of the table below.
total = sum(unigrams.values())

# 2x2 table of observed counts.
#   row 0: "Fisherman" preceded by "young" / "Fisherman" not preceded by "young"
#   row 1: "young" not followed by "Fisherman" / everything else
observationtable = [
    [youngFishermanF, FishermanF - youngFishermanF],
    [youngF - youngFishermanF, (total - youngF) - (FishermanF - youngFishermanF)],
]
print(observationtable)

# Expected count of each cell under independence:
# row total * column total / grand total.
rowtotals = [sum(row) for row in observationtable]
columntotals = [sum(column) for column in zip(*observationtable)]
expectationtable = [
    [rowtotal * columntotal / total for columntotal in columntotals]
    for rowtotal in rowtotals
]
print(expectationtable)

# Pearson's chi-square statistic sums (observed - expected)^2 / expected over
# ALL four cells; using only the top-left cell underestimates the score.
chi2 = sum(
    (observed - expected) ** 2 / expected
    for observedrow, expectedrow in zip(observationtable, expectationtable)
    for observed, expected in zip(observedrow, expectedrow)
)
print("Chi2-score:", chi2)

# relativizeFP is defined elsewhere; presumably it converts the counts to
# relative frequencies in place, which the labels below rely on -- confirm.
relativizeFP(unigrams)
relativizeFP(bigrams)

print("\n\nProbability ratio:")
print("P(young Fisherman):", bigrams["young Fisherman"])
print("P(young) * P(Fisherman):", unigrams["young"] * unigrams["Fisherman"])
print("P(young Fisherman) / ( P(young) * P(Fisherman) ):",
      bigrams["young Fisherman"] / (unigrams["young"] * unigrams["Fisherman"]))
# Debugging aids, left disabled:
# print(unigrams)
# prettyPrintFRP(bigrams, myreverse=False)

# Collocation check for the bigram "young Fisherman": a chi-square score on
# the raw counts, followed by a probability ratio on the relativized profiles.

# Counts of the two words and of the bigram they form.
youngF = unigrams["young"]
FishermanF = unigrams["Fisherman"]
youngFishermanF = bigrams["young Fisherman"]
# Sum of all unigram counts; serves as the grand total of the table below.
total = sum(unigrams.values())

# 2x2 table of observed counts.
#   row 0: "Fisherman" preceded by "young" / "Fisherman" not preceded by "young"
#   row 1: "young" not followed by "Fisherman" / everything else
observationtable = [
    [youngFishermanF, FishermanF - youngFishermanF],
    [youngF - youngFishermanF, (total - youngF) - (FishermanF - youngFishermanF)],
]
print(observationtable)

# Expected count of each cell under independence:
# row total * column total / grand total.
rowtotals = [sum(row) for row in observationtable]
columntotals = [sum(column) for column in zip(*observationtable)]
expectationtable = [
    [rowtotal * columntotal / total for columntotal in columntotals]
    for rowtotal in rowtotals
]
print(expectationtable)

# Pearson's chi-square statistic sums (observed - expected)^2 / expected over
# ALL four cells; using only the top-left cell underestimates the score.
chi2 = sum(
    (observed - expected) ** 2 / expected
    for observedrow, expectedrow in zip(observationtable, expectationtable)
    for observed, expected in zip(observedrow, expectedrow)
)
print("Chi2-score:", chi2)

# relativizeFP is defined elsewhere; presumably it converts the counts to
# relative frequencies in place, which the labels below rely on -- confirm.
relativizeFP(unigrams)
relativizeFP(bigrams)

print("\n\nProbability ratio:")
print("P(young Fisherman):", bigrams["young Fisherman"])
print("P(young) * P(Fisherman):", unigrams["young"] * unigrams["Fisherman"])
print("P(young Fisherman) / ( P(young) * P(Fisherman) ):",
      bigrams["young Fisherman"] / (unigrams["young"] * unigrams["Fisherman"]))