# html-to-tag-cloud-kwic.py
 
import dh
 
# create sorted dictionary of word-frequency pairs
url = 'http://framingredpower.org/archive/newspapers/frp.total.xml'    #CHANGE URL HERE
text = dh.webPageToText(url)
fullwordlist = dh.stripNonAlphaNum(text)
wordlist = dh.removeStopwords(fullwordlist, dh.stopwords)
dictionary = dh.wordListToFreqDict(wordlist)
sorteddict = dh.sortFreqDict(dictionary)
 
# create dictionary of n-grams
n = 20
paddinglist = ('# ' * (n//2))
fullwordlist[:0] = paddinglist
fullwordlist.extend(paddinglist)
ngrams = dh.getNGrams(fullwordlist, n)
worddict = dh.nGramsToKWICDict(ngrams)
 
# create tag cloud
cloudsize = 40
maxfreq = sorteddict[0][0]
minfreq = sorteddict[cloudsize][0]
freqrange = maxfreq - minfreq
tempstring = ''
resorteddict = dh.reSortFreqDictAlpha(sorteddict[:cloudsize])
for k in resorteddict:
    kfreq = k[0]
    klabel = dh.undecoratedHyperlink('#'+k[1], k[1])    
    scalingfactor = (kfreq - minfreq) / float(freqrange)
# html-to-kwic-2.py

import dh

# create dictionary of n-grams
n = 7
url = 'file:///C:/Documents%20and%20Settings/HP_Administrator/Desktop/ProgrammingHistorian/dcb-34298.html'
# url = 'http://www.biographi.ca/EN/ShowBioPrintable.asp?BioId=34298'
text = dh.webPageToText(url)
fullwordlist = ('# ' * (n//2)).split()
fullwordlist += dh.stripNonAlphaNum(text)
fullwordlist += ('# ' * (n//2)).split()
ngrams = dh.getNGrams(fullwordlist, n)
worddict = dh.nGramsToKWICDict(ngrams)

# output KWIC and wrap with HTML
target = 'iroquois'
outstr = '<pre>'
if worddict.has_key(target):
    for k in worddict[target]:
        linkname = dh.prettyPrintKWIC(k)
        keywords = dh.removeStopwords(k, dh.stopwords)
        outstr += dh.keywordListToGoogleSearchLink(keywords, linkname)
        # outstr += '<br />'
else:
    outstr += 'Keyword not found in source'
outstr += '</pre>'
dh.wrapStringInHTML('html-to-kwic-2', url, outstr)