# create tag cloud
#
# Builds an HTML tag cloud from the `cloudsize` most frequent words, then
# appends a keyword-in-context (KWIC) listing for every word in the cloud.
# Uses names bound earlier in the script: `dh`, `url`, `sorteddict` (a
# sequence of (frequency, word) pairs whose first entry supplies the maximum
# frequency -- presumably sorted by descending frequency) and `worddict`
# (indexed by word, yielding the contexts passed to dh.prettyPrintKWIC).
cloudsize = 40

# The scale runs from the top frequency down to the frequency of the first
# word left out of the cloud.  A text with no more than `cloudsize` distinct
# words has no such entry, so clamp the index to the last available word
# instead of raising IndexError; an empty list yields an empty cloud.
if sorteddict:
    maxfreq = sorteddict[0][0]
    minfreq = sorteddict[min(cloudsize, len(sorteddict) - 1)][0]
else:
    maxfreq = minfreq = 0
freqrange = maxfreq - minfreq

resorteddict = dh.reSortFreqDictAlpha(sorteddict[:cloudsize])
cloudparts = []
for k in resorteddict:
    kfreq = k[0]
    # Each cloud entry links to the anchor of its KWIC section below.
    klabel = dh.undecoratedHyperlink('#' + k[1], k[1])
    if freqrange:
        scalingfactor = (kfreq - minfreq) / float(freqrange)
    else:
        # Every word has the same frequency: render them all at one size
        # rather than dividing by zero.
        scalingfactor = 1.0
    cloudparts.append(dh.scaledFontSizeSpan(klabel, scalingfactor))
outstring = dh.defaultCSSDiv(''.join(cloudparts)) + '<br />'

# create KWIC listings for each item
sections = []
for k in resorteddict:
    klabel = k[1]
    # Named anchor (target of the cloud link) followed by a link back to the top.
    tempstring = '<a name="%s">%s</a> ' % (klabel, klabel)
    tempstring += dh.undecoratedHyperlink('#', '[back]')
    sections.append(dh.defaultCSSDiv(tempstring, opt='font-size : 24px;'))
    sections.append('<p><pre>')
    for t in worddict[klabel]:
        sections.append(dh.prettyPrintKWIC(t))
        sections.append('<br />')
    sections.append('</pre></p>')
outstring += ''.join(sections)

# open in Firefox
dh.wrapStringInHTML("html-to-tag-cloud-kwic", url, outstring)
#!/usr/bin/env python
# html-to-freq-4.py
#
# Fetches a page, counts its non-stopword words and writes an HTML report:
# a Google search link for the five top-ranked words followed by one line
# per entry of the sorted frequency list.
import dh

# Page to analyse; swap in the commented-out file:// URL to use a local copy.
url = 'http://www.biographi.ca/EN/ShowBioPrintable.asp?BioId=34298'
# url = 'file:///C:/Documents%20and%20Settings/HP_Administrator/Desktop/ProgrammingHistorian/dcb-34298.html'

# create sorted dictionary of word-frequency pairs
text = dh.webPageToText(url)
fullwordlist = dh.stripNonAlphaNum(text)
wordlist = dh.removeStopwords(fullwordlist, dh.stopwords)
dictionary = dh.wordListToFreqDict(wordlist)
sorteddict = dh.sortFreqDict(dictionary)

# create Google search link from the second field of the first five entries
keywords = [str(entry[1]) for entry in sorteddict[:5]]
gsearch = dh.keywordListToGoogleSearchLink(keywords, 'Google Search n=5')

# compile dictionary into string and wrap with HTML
rows = [str(entry) + "<br />" for entry in sorteddict]
outstring = gsearch + "<br /><br />" + "".join(rows)
dh.wrapStringInHTML("html-to-freq-4", url, outstring)
# make directory to store downloaded pages if one doesn't exist
#
# Uses names bound earlier in the script: `os`, `sys`, `time`, `urllib2`,
# `dh`, `biodict` (iterated for bio ids; biodict[b] is the text shown next to
# each link) and `searchresultfile`.
if not os.path.exists('iroquois'):
    os.mkdir('iroquois')

# download a local copy of each bio
urlprefix = 'http://www.biographi.ca/EN/ShowBioPrintable.asp?BioId='
for b in biodict:
    print("Processing bioid: " + str(b))
    url = urlprefix + str(b)
    outfile = 'iroquois/dcb-' + str(b) + '.html'
    if not os.path.isfile(outfile):
        response = urllib2.urlopen(url)
        try:
            html = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()
        # The original wrote `f.close` without parentheses, which never
        # closed the file; the with-block guarantees it is flushed and closed.
        with open(outfile, 'w') as f:
            f.write(html)
        # Pause between requests so the remote server is not hammered.
        time.sleep(2)
    else:
        print("File already downloaded")
    # Push progress messages out immediately instead of leaving them buffered.
    sys.stdout.flush()

# create a page of links to local copies
parts = []
for b in biodict:
    outfile = 'dcb-' + str(b) + '.html'
    parts.append(dh.undecoratedHyperlink('iroquois/' + outfile, str(b)))
    parts.append(' ' * 4)
    parts.append(biodict[b])
    parts.append("<br />")
outstring = ''.join(parts)
dh.wrapStringInHTML("get-iroquois-bios", searchresultfile, outstring)
# html-to-kwic-2.py
#
# Builds n-grams from a page's words, groups them into a keyword-in-context
# (KWIC) dictionary and writes an HTML page in which every context of the
# target keyword is a Google search link.
import dh

# create dictionary of n-grams
n = 7
# Page to analyse; swap in the commented-out http:// URL to fetch it live.
url = 'file:///C:/Documents%20and%20Settings/HP_Administrator/Desktop/ProgrammingHistorian/dcb-34298.html'
# url = 'http://www.biographi.ca/EN/ShowBioPrintable.asp?BioId=34298'
text = dh.webPageToText(url)

# Pad both ends of the word list with n//2 '#' placeholders -- presumably so
# that the first and last words of the text can still sit at the centre of an
# n-gram.
padding = ['#'] * (n // 2)
fullwordlist = list(padding)
fullwordlist += dh.stripNonAlphaNum(text)
fullwordlist += padding

ngrams = dh.getNGrams(fullwordlist, n)
worddict = dh.nGramsToKWICDict(ngrams)

# output KWIC and wrap with HTML
target = 'iroquois'
outstr = '<pre>'
# `in` replaces the deprecated dict.has_key(), which Python 3 removed.
if target in worddict:
    for k in worddict[target]:
        # The formatted context is the link text; its non-stopword words
        # become the search terms.
        linkname = dh.prettyPrintKWIC(k)
        keywords = dh.removeStopwords(k, dh.stopwords)
        outstr += dh.keywordListToGoogleSearchLink(keywords, linkname)
        # outstr += '<br />'
else:
    outstr += 'Keyword not found in source'
outstr += '</pre>'
dh.wrapStringInHTML('html-to-kwic-2', url, outstr)
# create tag cloud
#
# Builds an HTML tag cloud from the `cloudsize` most frequent words, then
# appends a keyword-in-context (KWIC) listing for every word in the cloud.
# Uses names bound earlier in the script: `dh`, `url`, `sorteddict` (a
# sequence of (frequency, word) pairs whose first entry supplies the maximum
# frequency -- presumably sorted by descending frequency) and `worddict`
# (indexed by word, yielding the contexts passed to dh.prettyPrintKWIC).
cloudsize = 40

# The scale runs from the top frequency down to the frequency of the first
# word left out of the cloud.  A text with no more than `cloudsize` distinct
# words has no such entry, so clamp the index to the last available word
# instead of raising IndexError; an empty list yields an empty cloud.
if sorteddict:
    maxfreq = sorteddict[0][0]
    minfreq = sorteddict[min(cloudsize, len(sorteddict) - 1)][0]
else:
    maxfreq = minfreq = 0
freqrange = maxfreq - minfreq

resorteddict = dh.reSortFreqDictAlpha(sorteddict[:cloudsize])
cloudparts = []
for k in resorteddict:
    kfreq = k[0]
    # Each cloud entry links to the anchor of its KWIC section below.
    klabel = dh.undecoratedHyperlink('#' + k[1], k[1])
    if freqrange:
        scalingfactor = (kfreq - minfreq) / float(freqrange)
    else:
        # Every word has the same frequency: render them all at one size
        # rather than dividing by zero.
        scalingfactor = 1.0
    cloudparts.append(dh.scaledFontSizeSpan(klabel, scalingfactor))
outstring = dh.defaultCSSDiv(''.join(cloudparts)) + '<br />'

# create KWIC listings for each item
sections = []
for k in resorteddict:
    klabel = k[1]
    # Named anchor (target of the cloud link) followed by a link back to the top.
    tempstring = '<a name="%s">%s</a> ' % (klabel, klabel)
    tempstring += dh.undecoratedHyperlink('#', '[back]')
    sections.append(dh.defaultCSSDiv(tempstring, opt='font-size : 24px;'))
    sections.append('<p><pre>')
    for t in worddict[klabel]:
        sections.append(dh.prettyPrintKWIC(t))
        sections.append('<br />')
    sections.append('</pre></p>')
outstring += ''.join(sections)

# open in Firefox
dh.wrapStringInHTML("html-to-tag-cloud-kwic", url, outstring)
# html-to-tag-cloud.py
#
# Fetches a page, counts its non-stopword words and writes an HTML tag cloud
# of the `cloudsize` top-ranked words, each styled by its relative frequency.
import dh

# create sorted dictionary of word-frequency pairs
# Page to analyse; swap in the commented-out http:// URL to fetch it live.
# url = 'http://www.biographi.ca/EN/ShowBioPrintable.asp?BioId=34298'
url = "file:///C:/Documents%20and%20Settings/HP_Administrator/Desktop/ProgrammingHistorian/dcb-34298.html"
text = dh.webPageToText(url)
fullwordlist = dh.stripNonAlphaNum(text)
wordlist = dh.removeStopwords(fullwordlist, dh.stopwords)
dictionary = dh.wordListToFreqDict(wordlist)
sorteddict = dh.sortFreqDict(dictionary)

# create tag cloud and open in Firefox
cloudsize = 100

# The scale runs from the first entry's frequency down to the frequency of
# the first word left out of the cloud.  A text with no more than `cloudsize`
# distinct words has no such entry, so clamp the index to the last available
# word instead of raising IndexError; an empty list yields an empty cloud.
if sorteddict:
    maxfreq = sorteddict[0][0]
    minfreq = sorteddict[min(cloudsize, len(sorteddict) - 1)][0]
else:
    maxfreq = minfreq = 0
freqrange = maxfreq - minfreq

resorteddict = dh.reSortFreqDictAlpha(sorteddict[:cloudsize])
spans = []
for k in resorteddict:
    kfreq = k[0]
    klabel = k[1]
    if freqrange:
        scalingfactor = (kfreq - minfreq) / float(freqrange)
    else:
        # Every word has the same frequency: style them all alike rather
        # than dividing by zero.
        scalingfactor = 1.0
    # Surrounding spaces keep adjacent words separated in the rendered cloud.
    spans.append(" " + dh.scaledFontHeatmapSpan(klabel, scalingfactor) + " ")
outstring = "".join(spans)
dh.wrapStringInHTML("html-to-tag-cloud", url, dh.defaultCSSDiv(outstring))