Code example #1
File: generateFRQP.py — Project: dcavar/Py3L
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

from math import log


#import corpus
from corpus import getTextFromFile, makeFrequencyProfile, tokenize, relativizeFP

# Build relative-frequency profiles for the two reference corpora:
# the "House of Pomegranates" text and a sports text.
mydict = makeFrequencyProfile(tokenize(getTextFromFile("pg873.txt")))
relativizeFP(mydict)

mysportsdict = makeFrequencyProfile(tokenize(getTextFromFile("sports.txt")))
relativizeFP(mysportsdict)

# Tokens of the unknown text to be classified.
unktokens = tokenize("""
The young King was eating pomegranates and talking about his soul and other emotional issues.
""")

# Floor probability used for tokens unseen in a profile, so log() never
# receives 0. Same value as the original literal 0.00000000000001.
SMOOTHING_FLOOR = 1e-14

# Score the unknown text against each profile by summing token
# log-probabilities (naive-Bayes-style comparison).
probpomeg = 0.0
probsports = 0.0
for token in unktokens:
    probpomeg += log(mydict.get(token, SMOOTHING_FLOOR))
    probsports += log(mysportsdict.get(token, SMOOTHING_FLOOR))

if probpomeg > probsports:
   print("This text is probably House of Pomeg.")
else:
Code example #2
File: list-loop-1.py — Project: dcavar/Py3L
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from corpus import relativizeFP, getTextFromFile, tokenize, removeJunk
from operator import itemgetter



#mylist = [ "A", "B", "C", "D", "E", "A", "B", "C" ]

mytokens = tokenize(getTextFromFile("pg873.txt"))

# Whitespace characters to filter out of the token stream. The original
# only defined this inside a commented-out variant of the loop, so the
# comprehension below raised NameError — define it before use.
junk = " \n\t"

# Keep only tokens that are not pure whitespace.
mytokens = [tok for tok in mytokens if tok not in junk]



def getMeTheNGramModel(tokens, n):
   mydict = {}
   position = 0
Code example #3
File: collocations1.py — Project: dcavar/Py3L
    'but', 'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does',
    'either', 'else', 'ever', 'every', 'for', 'from', 'get', 'got', 'had',
    'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how', 'however', 'i',
    'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like',
    'likely', 'may', 'me', 'might', 'most', 'must', 'my', 'neither', 'no',
    'nor', 'not', 'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our',
    'own', 'rather', 'said', 'say', 'says', 'she', 'should', 'since', 'so',
    'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there', 'these',
    'they', 'this', 'tis', 'to', 'too', 'twas', 'us', 'wants', 'was', 'we',
    'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why',
    'will', 'with', 'would', 'yet', 'you', 'your'
]
# Extend the stopword list with capitalized variants so sentence-initial
# occurrences are also filtered.
stopwordsEN = stopwordsEN + [w.capitalize() for w in stopwordsEN]

mytokens = tokenize(getTextFromFile("pg873.txt"))

# Use a set for stopword membership: O(1) per token instead of a linear
# scan over the ~200-entry list. A single pass drops both empty-string
# tokens and stopwords ("" is not a stopword, so the result is identical
# to filtering in two passes).
_stopword_set = set(stopwordsEN)
mytokens = [tok for tok in mytokens if tok and tok not in _stopword_set]

unigrams = getNGramModel(mytokens, 1)
bigrams = getNGramModel(mytokens, 2)
Code example #4 — Project: dcavar/Py3L
#!/usr/bin/env python3


from corpus import getTextFromFile, tokenize, makeFrequencyProfile, removeJunk, prettyPrintFRP


# NOTE(review): loadSpam, spamPath and spamList are not defined anywhere
# in this snippet — presumably imported/assigned elsewhere; confirm
# before running.

# Split the spam corpus into 5 folds (fold indices 1..5).
for fold in range(1, 6):
    loadSpam.split_data(fold, 5, spamPath)

# Collect tokens from ALL spam files. The original loop re-assigned
# `mytokens` on every iteration, so only the last file's tokens ever
# reached the frequency profile — accumulate instead.
mytokens = []
for path in spamList:
    mytokens.extend(tokenize(getTextFromFile(path)))

# Frequency profile over the combined token stream.
mydict = makeFrequencyProfile(mytokens)

# Strip punctuation/separator characters from the profile.
junk = " ,;:-+=()[]'\"?!%.<>"
removeJunk(mydict, junk)

# Drop the empty-string token if present (pop with default avoids KeyError).
mydict.pop("", None)

prettyPrintFRP(mydict)