コード例 #1
0
# html-to-list-1.py

import urllib2
import dh

# Page to download: a ShowBioPrintable.asp page for a single BioId.
url = 'http://www.biographi.ca/EN/ShowBioPrintable.asp?BioId=34298'

# Fetch the raw HTML. The response object holds an open network connection,
# so close it explicitly, even when read() raises.
response = urllib2.urlopen(url)
try:
    html = response.read()
finally:
    response.close()

# dh is a project-local helper module; going by its name, stripTags removes
# the HTML markup and returns the remaining text -- confirm in dh.
text = dh.stripTags(html)

# Split on any run of whitespace to get a flat list of tokens.
wordlist = text.split()

# Show the first 120 tokens as a quick check of the result.
# A single parenthesised argument prints identically under the Python 2
# print statement.
print(wordlist[0:120])
コード例 #2
0
# html-to-freq-2.py

import urllib2
import dh

# Page to download: a ShowBioPrintable.asp page for a single BioId.
url = 'http://www.biographi.ca/EN/ShowBioPrintable.asp?BioId=34298'

# Fetch the raw HTML. The response object holds an open network connection,
# so close it explicitly, even when read() raises.
response = urllib2.urlopen(url)
try:
    html = response.read()
finally:
    response.close()

# Replace the HTML non-breaking-space entity with an ordinary space after the
# tags have been stripped. The previous code replaced ' ' with ' ', which
# changes nothing; the entity literal appears to have been decoded when the
# listing was copied -- NOTE(review): confirm against the page content.
text = dh.stripTags(html).replace('&nbsp;', ' ')

# The remaining steps all delegate to the project-local dh module. The
# descriptions below are inferred from the helper names -- confirm in dh.
# Lower-case the text first so that counting is case-insensitive, then
# presumably split it into alphanumeric words.
fullwordlist = dh.stripNonAlphaNum(text.lower())

# Presumably drops every word that appears in dh.stopwords.
wordlist = dh.removeStopwords(fullwordlist, dh.stopwords)

# Presumably maps each word to its number of occurrences.
dictionary = dh.wordListToFreqDict(wordlist)

# Presumably orders the word/frequency entries by frequency.
sorteddict = dh.sortFreqDict(dictionary)

# Print one entry per line. A single parenthesised argument prints
# identically under the Python 2 print statement.
for s in sorteddict:
    print(str(s))