# html-to-freq-2.py

import urllib2
import dh

# Page whose text is turned into a word-frequency listing.
url = 'http://www.biographi.ca/EN/ShowBioPrintable.asp?BioId=34298'

# Download the raw page; close the connection even if the read fails.
response = urllib2.urlopen(url)
try:
    html = response.read()
finally:
    response.close()

# Replace non-breaking-space entities with ordinary spaces once the tags
# have been stripped.
# NOTE(review): a literal ' ' -> ' ' replace is a no-op; the first argument
# is presumably an HTML-decoded '&nbsp;' -- confirm against the lesson text.
text = dh.stripTags(html).replace('&nbsp;', ' ')

# Lower-case the text and run it through the dh helpers (their exact
# semantics live in the dh module, not in this script).
fullwordlist = dh.stripNonAlphaNum(text.lower())
wordlist = dh.removeStopwords(fullwordlist, dh.stopwords)
dictionary = dh.wordListToFreqDict(wordlist)
sorteddict = dh.sortFreqDict(dictionary)

# Print one entry per line. The parenthesised form is a valid print
# statement in Python 2 and a valid function call in Python 3.
for s in sorteddict:
    print(str(s))
# html-to-tag-cloud-kwic.py
 
import dh
 
# create sorted dictionary of word-frequency pairs
url = 'http://framingredpower.org/archive/newspapers/frp.total.xml'    #CHANGE URL HERE
text = dh.webPageToText(url)
fullwordlist = dh.stripNonAlphaNum(text)
wordlist = dh.removeStopwords(fullwordlist, dh.stopwords)
dictionary = dh.wordListToFreqDict(wordlist)
sorteddict = dh.sortFreqDict(dictionary)

# create dictionary of n-grams
n = 20
# Pad both ends of the word list with n//2 separate '#' tokens.
# The padding must be a list: slice-assigning or extending with a string
# such as '# # # ' would insert every character (including the spaces)
# as its own list element.
paddinglist = ['#'] * (n // 2)
fullwordlist[:0] = paddinglist
fullwordlist.extend(paddinglist)
ngrams = dh.getNGrams(fullwordlist, n)
worddict = dh.nGramsToKWICDict(ngrams)

# create tag cloud
cloudsize = 40
# Each sorteddict entry is indexed as entry[0] = frequency, entry[1] = word.
maxfreq = sorteddict[0][0]
# The lower bound is the frequency of the entry just past the cloud cutoff;
# clamp the index so a text with cloudsize or fewer distinct words falls
# back to the last entry instead of raising IndexError.
minfreq = sorteddict[min(cloudsize, len(sorteddict) - 1)][0]
freqrange = maxfreq - minfreq
tempstring = ''
resorteddict = dh.reSortFreqDictAlpha(sorteddict[:cloudsize])
# NOTE(review): the loop below computes a label and a scaling factor for
# each word but the visible script never uses them (tempstring stays
# empty) -- the rest of the tag-cloud code appears to be missing.
# NOTE(review): freqrange is 0 when maxfreq == minfreq, which makes the
# division below raise ZeroDivisionError; decide on a fallback once the
# consumer of scalingfactor is restored.
for k in resorteddict:
    kfreq = k[0]
    klabel = dh.undecoratedHyperlink('#'+k[1], k[1])
    scalingfactor = (kfreq - minfreq) / float(freqrange)
import dh
import os

# Directory holding the documents to index, and the file names inside it
collectiondir = 'iroquois'
filelist = dh.getFileNames(collectiondir)

# Accumulate one flat index covering every file in the collection
completeindex = []
for docname in filelist:

    # read the document and turn it into a list of words
    fname = '/'.join((collectiondir, docname))
    words = dh.stripNonAlphaNum(dh.localWebPageToText(fname))

    # swap stopwords for the placeholder; offsets below are assigned over
    # the list returned by replaceStopwords
    words = dh.replaceStopwords(words, dh.stopwords)

    # append one (word, filename, offset) tuple per list position
    completeindex.extend(
        (word, fname, offset) for offset, word in enumerate(words)
    )

# drop the entries whose word is the '#' stopword placeholder
completeindex = [entry for entry in completeindex if entry[0] != '#']

# show the first sixteen index entries
print(completeindex[0:16])