#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Naive Bayes-style text classifier.

Decides whether an unknown snippet looks more like the Wilde corpus
(pg873.txt, "A House of Pomegranates") or the sports corpus by summing
log relative token frequencies under each profile.
"""

from math import log

from corpus import getTextFromFile, makeFrequencyProfile, tokenize, relativizeFP

# Floor probability for tokens unseen in a profile, so log() never
# receives 0 (log(0) is undefined).
SMOOTH = 0.00000000000001

# Build a relative-frequency profile for each training corpus.
mydict = makeFrequencyProfile(tokenize(getTextFromFile("pg873.txt")))
relativizeFP(mydict)

mysportsdict = makeFrequencyProfile(tokenize(getTextFromFile("sports.txt")))
relativizeFP(mysportsdict)

# The "unknown" text to classify.
unktokens = tokenize("""
The young King was eating pomegranates and talking
about his soul and other emotional issues.
""")

# Sum log probabilities of each token under both profiles; the larger
# sum identifies the more likely source corpus.
probpomeg = 0.0
probsports = 0.0
for token in unktokens:
    probpomeg += log(mydict.get(token, SMOOTH))
    probsports += log(mysportsdict.get(token, SMOOTH))

if probpomeg > probsports:
    print("This text is probably House of Pomeg.")
else:
    # NOTE(review): the source chunk is truncated immediately after `else:`;
    # the symmetric message below is a reconstruction — confirm wording.
    print("This text is probably sports.")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Tokenize pg873.txt, drop whitespace tokens, and begin an n-gram model."""

from corpus import relativizeFP, getTextFromFile, tokenize, removeJunk
from operator import itemgetter

mytokens = tokenize(getTextFromFile("pg873.txt"))

# Whitespace "tokens" to filter out.  The original only defined this inside
# a commented-out example loop, so the comprehension below raised NameError;
# define it for real here.
junk = " \n\t"
mytokens = [e for e in mytokens if e not in junk]


def getMeTheNGramModel(tokens, n):
    """Build an n-gram frequency profile over *tokens*.

    NOTE(review): this definition is truncated in the source chunk — only
    the initialisation below is visible; the body continues elsewhere.
    """
    mydict = {}
    position = 0
'but', 'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does', 'either', 'else', 'ever', 'every', 'for', 'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how', 'however', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may', 'me', 'might', 'most', 'must', 'my', 'neither', 'no', 'nor', 'not', 'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our', 'own', 'rather', 'said', 'say', 'says', 'she', 'should', 'since', 'so', 'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there', 'these', 'they', 'this', 'tis', 'to', 'too', 'twas', 'us', 'wants', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your' ] stopwordsEN = stopwordsEN + [x.capitalize() for x in stopwordsEN] #print(stopwordsEN) mytokens = tokenize(getTextFromFile("pg873.txt")) # filter out empty string tokens mytokens = [x for x in mytokens if x] #print(mytokens) # filter out stopwords mytokens = [x for x in mytokens if x not in stopwordsEN] #print(mytokens) unigrams = getNGramModel(mytokens, 1) bigrams = getNGramModel(mytokens, 2) #print(unigrams) # prettyPrintFRP(bigrams, myreverse=False)
#!/usr/bin/env python3
"""Split the spam corpus into folds and print a frequency profile per file."""

from corpus import getTextFromFile, tokenize, makeFrequencyProfile, removeJunk, prettyPrintFRP

# NOTE(review): loadSpam, spamPath and spamList are not defined in this
# chunk — presumably provided by an earlier chunk or another module; confirm.

# Produce the five train/test splits (folds 1..5).
for x in range(1, 6):
    loadSpam.split_data(x, 5, spamPath)

# Punctuation and separator characters to strip from each profile.
junk = " ,;:-+=()[]'\"?!%.<>"

for file in spamList:
    mytokens = tokenize(getTextFromFile(file))
    mydict = makeFrequencyProfile(mytokens)
    removeJunk(mydict, junk)
    # tokenize can leave an empty-string token; drop its entry.
    if "" in mydict:
        del mydict[""]
    prettyPrintFRP(mydict)