# Build token-frequency profiles for two reference texts ("pg873.txt" and
# "sports-bbc.txt") and define a short sample text in `unknowntext`.
from math import log

# Either import the whole module and prefix every function name with
# "corpus." ...
#import corpus
# ... or import the needed functions directly:
from corpus import (
    getTokenCounts,
    loadTextFromFile,
    prettyPrintFrequencyProfile,
    relativizeTokenCounts,
    tokenize,
)

mytext = loadTextFromFile("pg873.txt")

# tokenize mytext and return list of tokens
tokens = tokenize(mytext)

# count tokens
mydict = getTokenCounts(tokens)
# NOTE(review): presumably converts the counts to relative frequencies in
# place (the return value is not used) -- confirm against corpus.py.
relativizeTokenCounts(mydict)

# pretty-print tokens and frequencies
#prettyPrintFrequencyProfile(mydict, sortbyfrq=True, myreverse=True)

# Same pipeline for the second reference text.
mytext = loadTextFromFile("sports-bbc.txt")
mysportsdict = getTokenCounts(tokenize(mytext))
relativizeTokenCounts(mysportsdict)

# NOTE(review): presumably the text to be compared against the two profiles
# above; no comparison is visible in this part of the file.
unknowntext = """Yesterday we scored ten goals in the last 45 minutest of the game."""

# Alternative sample text, kept as an unused bare string literal.
# NOTE(review): the excerpt is cut off in the visible source; the closing
# quotes below were added so that the literal is terminated.
"""
Wimbledon 2013: Andy Murray's victory could boost British tennis

Comments (149)

All too often sporting moments and achievements are given a misplaced historical significance.

Not on Sunday.
"""
def getMIScore(bigramprob, unigramprobaA, unigramprobB):
    """Return a bigram's probability-weighted log association score.

    Computes ``P(a b) * ln(P(a b) / (P(a) * P(b)))`` using the natural
    logarithm.

    Args:
        bigramprob: Probability of the bigram "a b".
        unigramprobaA: Probability of the first token.
        unigramprobB: Probability of the second token.

    Returns:
        The weighted log ratio as a float. A bigram probability of 0 yields
        0.0 (the usual ``0 * log 0 = 0`` convention) instead of a math
        domain error.

    Raises:
        ZeroDivisionError: If the product of the unigram probabilities is 0
            while ``bigramprob`` is not.
        ValueError: If the ratio passed to ``log`` is negative.
    """
    if bigramprob == 0:
        # log(0) is undefined; the term vanishes in the limit.
        return 0.0
    return bigramprob * log(bigramprob / (unigramprobaA * unigramprobB))


def getMIScoreFromFQP(bigrams, unigrams, bigram):
    """Return the score of ``getMIScore`` for *bigram* from frequency profiles.

    Args:
        bigrams: Mapping from a bigram string to its probability.
        unigrams: Mapping from a single token to its probability.
        bigram: Key into *bigrams*; must split on whitespace into exactly
            two tokens, both of which are keys of *unigrams*.

    Returns:
        ``getMIScore`` applied to the three looked-up probabilities.

    Raises:
        ValueError: If *bigram* does not split into exactly two tokens.
        KeyError: If the bigram or one of its tokens is missing.
    """
    tokenA, tokenB = bigram.split()
    # Delegate so the formula lives in exactly one place.
    return getMIScore(bigrams[bigram], unigrams[tokenA], unigrams[tokenB])


tokens = tokenize(loadTextFromFile("pg873.txt"))

# NOTE(review): relativizeTokenCounts presumably turns the n-gram counts into
# relative frequencies in place (its return value is unused) -- confirm.
unigrams = getNGrams(tokens, 1)
relativizeTokenCounts(unigrams)

bigrams = getNGrams(tokens, 2)
#prettyPrintFrequencyProfile(bigrams, myreverse=False)
relativizeTokenCounts(bigrams)

# young Fisherman: ratio of the bigram's probability to the product of the
# two unigram probabilities (what independence of the words would predict)
lhr = bigrams["young Fisherman"] / (unigrams["young"] * unigrams["Fisherman"])

# young Fisherman: natural log of that ratio, weighted by the bigram
# probability (plain pointwise mutual information would be log(lhr) alone)
pmi = bigrams["young Fisherman"] * log(lhr)

print("young Fisherman", lhr, pmi, sep="\t")
def getMIScore(bigramprob, unigramprobaA, unigramprobB):
    """Score a bigram as ``P(a b) * ln(P(a b) / (P(a) * P(b)))``.

    The logarithm is the natural logarithm.

    Args:
        bigramprob: Probability of the bigram "a b".
        unigramprobaA: Probability of the first token.
        unigramprobB: Probability of the second token.

    Returns:
        The probability-weighted log ratio as a float; 0.0 when
        ``bigramprob`` is 0 (``0 * log 0`` is taken to be 0) rather than a
        math domain error.

    Raises:
        ZeroDivisionError: If the unigram probabilities multiply to 0 while
            ``bigramprob`` is non-zero.
        ValueError: If the ratio passed to ``log`` is negative.
    """
    if bigramprob == 0:
        # Avoid log(0); the weighted term is 0 in the limit.
        return 0.0
    return bigramprob * log(bigramprob / (unigramprobaA * unigramprobB))


def getMIScoreFromFQP(bigrams, unigrams, bigram):
    """Look up the probabilities for *bigram* and score them with getMIScore.

    Args:
        bigrams: Mapping from a bigram string to its probability.
        unigrams: Mapping from a single token to its probability.
        bigram: Key into *bigrams*; must split on whitespace into exactly
            two tokens, both present in *unigrams*.

    Returns:
        The value of ``getMIScore`` for the looked-up probabilities.

    Raises:
        ValueError: If *bigram* does not split into exactly two tokens.
        KeyError: If the bigram or one of its tokens is missing.
    """
    tokenA, tokenB = bigram.split()
    # Reuse getMIScore instead of repeating the formula.
    return getMIScore(bigrams[bigram], unigrams[tokenA], unigrams[tokenB])


tokens = tokenize(loadTextFromFile("pg873.txt"))

# NOTE(review): relativizeTokenCounts is assumed to convert the counts to
# relative frequencies in place (its return value is unused) -- confirm.
unigrams = getNGrams(tokens, 1)
relativizeTokenCounts(unigrams)

bigrams = getNGrams(tokens, 2)
#prettyPrintFrequencyProfile(bigrams, myreverse=False)
relativizeTokenCounts(bigrams)

# young Fisherman: bigram probability divided by the product of the two
# unigram probabilities (the value expected if the words were independent)
lhr = bigrams["young Fisherman"] / (unigrams["young"] * unigrams["Fisherman"])

# young Fisherman: natural log of that ratio, weighted by the bigram
# probability (plain pointwise mutual information would be log(lhr) alone)
pmi = bigrams["young Fisherman"] * log(lhr)

print("young Fisherman", lhr, pmi, sep="\t")

# iron chain: likelihood ratio