Ejemplo n.º 1
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
frqanalysis.py
Open a text file, read text, tokenize it and generate a
frequency profile.
"""

from corpus import getTextFromFile, tokenize, makeFrequencyProfile, removeJunk

# Load the raw text first, then split it into a list of tokens.
mytext = getTextFromFile("pg873.txt")
mytokens = tokenize(mytext)

# Characters handed to removeJunk together with the profile.
junk = " ,;:-+=()[]'\"?!$%.<>"

# Build the frequency profile from the tokens.
mydict = makeFrequencyProfile(mytokens)

# The return value is discarded, so removeJunk presumably edits mydict
# in place -- confirm against the corpus module.
removeJunk(mydict, junk)

# Drop the empty-string entry if one is present.
try:
    del mydict[""]
except KeyError:
    pass

# Print one tab-separated row per token: token, its value in the profile,
# and that value divided by the sum of all values.
total = sum(mydict.values())
for token, count in mydict.items():
    print(token, count, count / total, sep="\t")
Ejemplo n.º 2
0
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

from math import log


#import corpus
from corpus import getTextFromFile, makeFrequencyProfile, tokenize, relativizeFP

# Fallback probability used for tokens that are missing from a profile, so
# that log() is never asked for the logarithm of a missing entry.
UNSEEN_PROBABILITY = 1e-14

# Profile built from "pg873.txt". relativizeFP's return value is discarded,
# so it presumably rewrites the profile in place -- confirm against corpus.
pomeg_tokens = tokenize(getTextFromFile("pg873.txt"))
mydict = makeFrequencyProfile(pomeg_tokens)
relativizeFP(mydict)

# Profile built from "sports.txt", prepared the same way.
sports_tokens = tokenize(getTextFromFile("sports.txt"))
mysportsdict = makeFrequencyProfile(sports_tokens)
relativizeFP(mysportsdict)

# Tokens of the text whose origin is to be guessed.
unktokens = tokenize("""
The young King was eating pomegranates and talking about his soul and other emotional issues.
""")

# Accumulate, per profile, the sum of the logs of each token's profile value.
probpomeg = 0.0
probsports = 0.0
for token in unktokens:
    pomeg_value = mydict.get(token, UNSEEN_PROBABILITY)
    sports_value = mysportsdict.get(token, UNSEEN_PROBABILITY)
    probpomeg += log(pomeg_value)
    probsports += log(sports_value)

# Compare the two accumulated log scores; the larger one decides the label.
if probpomeg > probsports:
   print("This text is probably House of Pomeg.")
else:
# NOTE(review): the body of this else branch is missing from this excerpt;
# restore it from the original script before running.
#!/usr/bin/env python3


from corpus import getTextFromFile, tokenize, makeFrequencyProfile, removeJunk, prettyPrintFRP


# NOTE(review): loadSpam, spamPath and spamList are not defined or imported
# in the visible part of this file -- confirm they are provided elsewhere.

# Call split_data once for each x in 1..5, with the second argument fixed at 5.
for x in range(1, 6):
    loadSpam.split_data(x, 5, spamPath)

# Accumulate the tokens of every file. Rebinding mytokens inside the loop
# would keep only the last file's tokens and would leave the name undefined
# when spamList is empty.
mytokens = []
for file in spamList:
    mytokens.extend(tokenize(getTextFromFile(file)))

# Build one frequency profile over the tokens of all files.
mydict = makeFrequencyProfile(mytokens)

# Characters handed to removeJunk together with the profile.
junk = " ,;:-+=()[]'\"?!%.<>"

# The return value is discarded, so removeJunk presumably edits mydict
# in place -- confirm against the corpus module.
removeJunk(mydict, junk)

# Drop the empty-string entry if one is present.
if "" in mydict:
    del mydict[""]

prettyPrintFRP(mydict)