# html-to-freq-2.py
#
# Download one printable biography page, strip its markup, drop stopwords,
# and print every entry of the sorted word-frequency list, one per line.

import urllib2

import dh

url = 'http://www.biographi.ca/EN/ShowBioPrintable.asp?BioId=34298'

# Fetch the raw page; close the connection even if the read fails.
response = urllib2.urlopen(url)
try:
    html = response.read()
finally:
    response.close()

# Turn non-breaking-space entities left behind after tag stripping into
# plain spaces so they do not glue neighbouring words together.
# NOTE(review): the entity had been decoded to a bare space in this copy of
# the script, which made the replace a no-op; '&nbsp;' is the presumed
# original target -- confirm against a live page.
text = dh.stripTags(html).replace('&nbsp;', ' ')

# Lower-case before tokenising so that case variants are counted together.
fullwordlist = dh.stripNonAlphaNum(text.lower())
wordlist = dh.removeStopwords(fullwordlist, dh.stopwords)
dictionary = dh.wordListToFreqDict(wordlist)
sorteddict = dh.sortFreqDict(dictionary)

# The parenthesised single-argument form prints the same text under the
# Python 2 print statement and also parses as a Python 3 function call.
for s in sorteddict:
    print(str(s))
# html-to-tag-cloud-kwic.py
#
# Build a sorted word-frequency list and a keyword-in-context (KWIC)
# dictionary for one web page, then compute a scaling factor for each of
# the most frequent words as the first step of a tag cloud.

import dh

# create sorted dictionary of word-frequency pairs
url = 'http://framingredpower.org/archive/newspapers/frp.total.xml'  #CHANGE URL HERE
text = dh.webPageToText(url)
fullwordlist = dh.stripNonAlphaNum(text)
wordlist = dh.removeStopwords(fullwordlist, dh.stopwords)
dictionary = dh.wordListToFreqDict(wordlist)
sorteddict = dh.sortFreqDict(dictionary)

# create dictionary of n-grams
n = 20
# Pad both ends of the word list with n//2 '#' placeholder tokens
# (presumably so words near the edges still get a full-width n-gram).
# The padding has to be a list: slice-assigning a string such as '# # # '
# would splice in every single character, spaces included.
paddinglist = ['#'] * (n // 2)
fullwordlist[:0] = paddinglist
fullwordlist.extend(paddinglist)
ngrams = dh.getNGrams(fullwordlist, n)
worddict = dh.nGramsToKWICDict(ngrams)

# create tag cloud
cloudsize = 40
# Entries are indexed as (frequency, word); the first entry is treated as
# the most frequent one.
maxfreq = sorteddict[0][0]
# The baseline is the entry just past the cloud; clamp the index so pages
# with cloudsize or fewer distinct words do not raise IndexError.
minfreq = sorteddict[min(cloudsize, len(sorteddict) - 1)][0]
freqrange = maxfreq - minfreq
tempstring = ''
resorteddict = dh.reSortFreqDictAlpha(sorteddict[:cloudsize])
# NOTE(review): this copy of the script stops after scalingfactor is
# computed; nothing visible here consumes klabel, scalingfactor or
# tempstring -- confirm against the full script.
for k in resorteddict:
    kfreq = k[0]
    klabel = dh.undecoratedHyperlink('#'+k[1], k[1])
    if freqrange:
        scalingfactor = (kfreq - minfreq) / float(freqrange)
    else:
        # Every candidate has the same frequency: avoid dividing by zero.
        scalingfactor = 0.0
import dh
import os

# Build a flat positional index of (word, filename, offset) tuples for every
# file in a collection directory, then print the first few entries.

# load a list of filenames to process
collectiondir = 'iroquois'
filelist = dh.getFileNames(collectiondir)

# For each file in the list, index the words in it
completeindex = []
for name in filelist:
    path = collectiondir + '/' + name

    # convert file to list of words, with stopwords swapped for a placeholder
    # so that the remaining words keep their original offsets
    words = dh.stripNonAlphaNum(dh.localWebPageToText(path))
    words = dh.replaceStopwords(words, dh.stopwords)

    # add one (word, filename, offset) tuple per token to the complete index
    completeindex.extend(
        [(word, path, offset) for offset, word in enumerate(words)]
    )

# remove stop words (entries whose word is the '#' marker) from the index
completeindex = [entry for entry in completeindex if entry[0] != '#']

# print a few members of complete index list
print(completeindex[0:16])