# html-to-kwic.py
#
# Download one web page, build n-grams over its words, and emit a
# keyword-in-context (KWIC) listing for a single target word, wrapped in
# <pre>...</pre> and passed to obo.wrapStringInHTMLMac.

import obo

# create dictionary of n-grams
n = 7
url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33'

text = obo.webPageToText(url)

# Surround the word list with n//2 '#' placeholders on each side.
# NOTE(review): presumably this lets words at the very start or end of the
# text still sit in the middle of an n-gram -- confirm against obo.getNGrams.
fullwordlist = ['#'] * (n // 2)
fullwordlist += obo.stripNonAlphaNum(text)
fullwordlist += ['#'] * (n // 2)

ngrams = obo.getNGrams(fullwordlist, n)
worddict = obo.nGramsToKWICDict(ngrams)

# output KWIC and wrap with html
target = 'black'

# dict.has_key() no longer exists in Python 3; the `in` operator is the
# equivalent membership test and also works on Python 2.
if target in worddict:
    # One formatted entry per n-gram recorded for the target, each followed
    # by an HTML line break.
    body = ''.join(obo.prettyPrintKWIC(k) + '<br />' for k in worddict[target])
else:
    body = 'Keyword not found in source'

outstr = '<pre>' + body + '</pre>'

obo.wrapStringInHTMLMac('html-to-kwic', url, outstr)
# get-keywords.py
#
# Small demonstration: split a fixed sentence into words, build its 5-grams
# with obo.getNGrams, convert them with obo.nGramsToKWICDict, and print the
# resulting object.

import obo

test = 'this test sentence has eight words in it'

# Whitespace-split the sentence, then build n-grams of length 5 from the words.
words = test.split()
ngrams = obo.getNGrams(words, 5)

# Convert the n-grams into the KWIC dictionary and show it.
kwic = obo.nGramsToKWICDict(ngrams)
print(kwic)