def count():
    """Handle the word-count form and flash the top words of the given page.

    On a valid submission the page at the submitted URL is downloaded,
    tag-stripped, lower-cased, reduced to a word list without stop words,
    and the first 21 entries of the sorted frequency list are flashed
    before redirecting to ``index``.

    If the page cannot be retrieved (network failure, timeout, HTTP error
    status) an error message is flashed and the form is rendered again
    instead of letting the exception escape the view.

    Returns:
        A redirect to ``index`` after a successful count, otherwise the
        rendered ``count.html`` template.
    """
    form = WordForm()
    if form.validate_on_submit():
        url = form.url.data
        # NOTE(review): the URL is user-supplied and fetched server-side;
        # consider restricting schemes/hosts to avoid SSRF.
        try:
            # Without a timeout an unresponsive host would block the worker
            # indefinitely.
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as exc:
            flash('Could not retrieve {}: {}'.format(url, exc))
        else:
            # errors="replace" keeps valid UTF-8 pages unchanged while
            # preventing UnicodeDecodeError on pages in other encodings.
            html = response.content.decode("utf-8", errors="replace")
            text = obo.stripTags(html).lower()
            fullwordlist = obo.stripNonAlphaNum(text)
            wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
            dictionary = obo.wordListToFreqDict(wordlist)
            sorteddict = obo.sortFreqDict(dictionary)
            for s in sorteddict[:21]:
                flash(str(s))
            return redirect(url_for('index'))
    return render_template('count.html',
                           title='Word Count Application',
                           form=form)
# html-to-freq.py
# Download an Old Bailey trial page, strip its tags, lower-case the text,
# build a word/frequency dictionary with the obo helpers and print every
# entry of the sorted result.
from contextlib import closing
import urllib2

import obo

url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33'

# closing() releases the HTTP response even if read() raises; the timeout
# stops the script from hanging forever on an unresponsive server.
with closing(urllib2.urlopen(url, timeout=30)) as response:
    html = response.read()

text = obo.stripTags(html).lower()
wordlist = obo.stripNonAlphaNum(text)
dictionary = obo.wordListToFreqDict(wordlist)
sorteddict = obo.sortFreqDict(dictionary)

for s in sorteddict:
    print(str(s))
# html-to-list1.py
# Download the printable Old Bailey trial page, strip its tags, split the
# text on whitespace and print the first 120 words.
from contextlib import closing
import urllib2

import obo

url = 'http://www.oldbaileyonline.org/print.jsp?div=t17800628-33'

# closing() releases the HTTP response even if read() raises; the timeout
# stops the script from hanging forever on an unresponsive server.
with closing(urllib2.urlopen(url, timeout=30)) as response:
    html = response.read()

text = obo.stripTags(html)
wordlist = text.split()

# Parenthesised single-argument print produces the same output under the
# Python 2 print statement and is also valid Python 3 syntax.
print(wordlist[0:120])
# trial-content.py
# Download an Old Bailey trial page and print it with the tags stripped
# by obo.stripTags.
from contextlib import closing
import urllib2

import obo

url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33'

# closing() releases the HTTP response even if read() raises; the timeout
# stops the script from hanging forever on an unresponsive server.
with closing(urllib2.urlopen(url, timeout=30)) as response:
    HTML = response.read()

print(obo.stripTags(HTML))
# html-to-list1.py
# Download the printable Old Bailey trial page, strip its tags, lower-case
# the text and print the first 120 entries of the list returned by
# obo.stripNonAlphaNum.
from contextlib import closing
import urllib2

import obo

url = 'http://www.oldbaileyonline.org/print.jsp?div=t17800628-33'

# closing() releases the HTTP response even if read() raises; the timeout
# stops the script from hanging forever on an unresponsive server.
with closing(urllib2.urlopen(url, timeout=30)) as response:
    html = response.read()

# The change made in this lesson: the stripped text is lower-cased.
text = obo.stripTags(html).lower()

# obo.stripNonAlphaNum replaces the earlier text.split() tokenisation
# (old version from Fri 11/9).
wordlist = obo.stripNonAlphaNum(text)

# Only the first 120 entries are shown; print(wordlist) would dump the
# whole list.
print(wordlist[0:120])
# Build a stop-word-filtered word-frequency list for chapter 1 of
# Frankenstein and, when run as a script, print every entry.
import requests

import obo

url = 'http://literature.org/authors/shelley-mary/frankenstein/chapter-01.html'

# The timeout prevents hanging forever on an unresponsive server;
# raise_for_status() stops an HTTP error page from being word-counted as if
# it were the chapter text.
pagetext = requests.get(url, timeout=30)
pagetext.raise_for_status()
HTML = pagetext.text

text = obo.stripTags(HTML).lower()  # convert to lower case
fullwordlist = obo.stripNonAlphaNum(text)  # only words, into list
wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)  # remove common useless words
dictionary = obo.wordListToFreqDict(wordlist)  # add words and counts to dictionary
sorteddict = obo.sortFreqDict(dictionary)  # sort word list by frequency

if __name__ == '__main__':
    for s in sorteddict:
        print(str(s))
# Understanding the return statement: the value handed back by
# obo.stripTags can be stored in a variable or used directly in another call.
import obo

sample_markup = "<p>This is my <h1>HTML<h1> message<br/>"

# Store the returned value under a name...
stripped = obo.stripTags(sample_markup)

# ...or pass the return value of a fresh call straight to print.
print(obo.stripTags(sample_markup))
'--numout', help='Cantidad de numeros antes y despues de la palabra', type=int, required=True) args = parser.parse_args() statusCode = 0 try: r = requests.get(args.url) #cargamos la respuesta de GET de la pàgina statusCode = r.status_code except: print( "Formato de la web erroneo, debe ser del estilo http://www.google.com") if statusCode == 200: text = obo.stripTags(r.text) #quitamos las etiquetas y pasamos a minuscula fullwordlist = obo.stripNonAlphaNum( text) #quitamos los que no son alfanumericos if args.stopwords: fullwordlist = obo.removeStopwords( fullwordlist, args.stopwords ) #eliminamos las palabras de uso comun segun el idioma if args.long: fullwordlist = obo.excludeTwo( fullwordlist, args.long) #eliminamos las palabras con menos de 2 caracteres dictionary = obo.wordListToFreqDict( fullwordlist) #nos devuelve un diccionario palabra - frequencia sorteddict = obo.sortFreqDict( dictionary ) #ordena las palabras por su frequencia (nos han devuelto una lista de listas)
# Practice run of obo.stripTags: show a marked-up string before and after
# the call.
import obo

raw_markup = "This is my <p>HTMdadasdasdasd <madeUpTag> transcirpt about B<br/> message"
print(raw_markup)

stripped = obo.stripTags(raw_markup)
print(stripped)

# Single quotes delimit this string so the double quotes inside it are
# ordinary characters.
v = '<a href="http://www.sheffield.ac.uk/hri/">'
# Download an Old Bailey trial page and print its word-frequency list
# using the obo helpers.
import requests

import obo

url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33'

# The timeout prevents hanging forever on an unresponsive server;
# raise_for_status() stops an HTTP error page from being word-counted as if
# it were the trial text.
pagetext = requests.get(url, timeout=30)
pagetext.raise_for_status()
HTML = pagetext.text

text = obo.stripTags(HTML).lower()  # make lower case
wordlist = obo.stripNonAlphaNum(text)  # convert to list of words, no punctuation
dictionary = obo.wordListToFreqDict(wordlist)  # add words, counts to dictionary
sorteddict = obo.sortFreqDict(dictionary)  # sort word list by frequency

for s in sorteddict:
    print(str(s))
# trial-content.py
# Download the printable Old Bailey trial page and print it with the tags
# stripped by obo.stripTags.
from contextlib import closing
import urllib2

import obo

url = 'http://www.oldbaileyonline.org/print.jsp?div=t17800628-33'

# closing() releases the HTTP response even if read() raises; the timeout
# stops the script from hanging forever on an unresponsive server.
with closing(urllib2.urlopen(url, timeout=30)) as response:
    HTML = response.read()

# Parenthesised single-argument print produces the same output under the
# Python 2 print statement and is also valid Python 3 syntax.
print(obo.stripTags(HTML))