Beispiel #1
0
def count():
    """Render the word-count form and flash word counts for a submitted URL.

    On a valid submission the page at the submitted URL is downloaded and
    passed through the ``obo`` helpers (``stripTags``, ``stripNonAlphaNum``,
    ``removeStopwords``, ``wordListToFreqDict``, ``sortFreqDict``).  The
    first 21 entries of the resulting list are flashed and the client is
    redirected to the ``index`` endpoint.

    If the page cannot be fetched (invalid URL, connection failure, timeout
    or an HTTP error status), an error message is flashed and the form is
    rendered again instead of letting the exception propagate.

    Returns:
        The redirect response after a successful count, otherwise the
        rendered ``count.html`` template.
    """
    form = WordForm()
    if form.validate_on_submit():
        url = form.url.data
        # NOTE(review): the URL is user-supplied, so this makes the server
        # fetch arbitrary addresses (SSRF risk) -- consider an allow-list.
        try:
            # Without a timeout an unresponsive host would block the
            # request handler indefinitely.
            response = requests.get(url, timeout=10)
            # Do not word-count the body of a 4xx/5xx error page.
            response.raise_for_status()
        except requests.exceptions.RequestException as err:
            flash('Could not fetch {0}: {1}'.format(url, err))
        else:
            # Pages that are not valid UTF-8 must not crash the view;
            # undecodable bytes are replaced instead.
            html = response.content.decode("utf-8", errors="replace")
            text = obo.stripTags(html).lower()
            fullwordlist = obo.stripNonAlphaNum(text)
            wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
            dictionary = obo.wordListToFreqDict(wordlist)
            sorteddict = obo.sortFreqDict(dictionary)
            for s in sorteddict[:21]:
                flash(str(s))
            return redirect(url_for('index'))
    return render_template('count.html',
                           title='Word Count Application',
                           form=form)
Beispiel #2
0
#html-to-freq.py

import urllib2, obo

url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33'

# Download the raw page.
response = urllib2.urlopen(url)
html = response.read()

# Strip the markup, lower-case the text and break it into a word list.
text = obo.stripTags(html).lower()
wordlist = obo.stripNonAlphaNum(text)

# Build the word -> count dictionary and sort it.
dictionary = obo.wordListToFreqDict(wordlist)
sorteddict = obo.sortFreqDict(dictionary)

# One entry per output line.
for s in sorteddict:
    print(str(s))
#html-to-list1.py
import urllib2, obo

url = 'http://www.oldbaileyonline.org/print.jsp?div=t17800628-33'

# Download the page and strip its markup with obo.stripTags.
response = urllib2.urlopen(url)
html = response.read()
text = obo.stripTags(html)

# Split the remaining text on whitespace.
wordlist = text.split()

# Show the first 120 items.  The call form prints the same output as the
# Python 2 print statement for a single argument and is also valid
# Python 3 syntax, matching the other scripts.
print(wordlist[0:120])
# trial-content.py

import urllib2, obo

# Page to download.
url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33'

# Fetch the page and read the whole response body.
response = urllib2.urlopen(url)
HTML = response.read()

# Print what obo.stripTags returns for the downloaded page.
print(obo.stripTags(HTML))
#html-to-list1.py
import urllib2, obo

# Page to download.
url = 'http://www.oldbaileyonline.org/print.jsp?div=t17800628-33'

# Fetch the page and read the whole response body.
response = urllib2.urlopen(url)
html = response.read()
# Strip the markup and lower-case the text before building the word list.
text = obo.stripTags(html).lower()
# obo.stripNonAlphaNum replaces the earlier plain `text.split()` step.
wordlist = obo.stripNonAlphaNum(text)

# Print only the first 120 items of the word list.
print(wordlist[0:120])

import requests, obo
url = 'http://literature.org/authors/shelley-mary/frankenstein/chapter-01.html'
pagetext = requests.get(url)
HTML = pagetext.text

# Strip the markup and convert to lower case.
text = obo.stripTags(HTML).lower()

# Keep only the words, as a list.
fullwordlist = obo.stripNonAlphaNum(text)

# Drop the common words listed in obo.stopwords.
wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)

# Map each word to its count.
dictionary = obo.wordListToFreqDict(wordlist)

# Sort the word list by frequency.
sorteddict = obo.sortFreqDict(dictionary)

if __name__ == '__main__':
    # Only print when run as a script, one entry per line.
    for s in sorteddict:
        print(str(s))
#understanding the Return statement

import obo

# Sample HTML fragment to feed to obo.stripTags.
myText = "<p>This is my <h1>HTML<h1> message<br/>"

# The value returned by a function can be stored in a variable ...
theResult = obo.stripTags(myText)
# ... or passed directly to another call, here print().
print(obo.stripTags(myText))
Beispiel #8
0
                    '--numout',
                    help='Cantidad de numeros antes y despues de la palabra',
                    type=int,
                    required=True)
args = parser.parse_args()

statusCode = 0
try:
    r = requests.get(args.url)  # fetch the page with a GET request
    statusCode = r.status_code
except requests.exceptions.RequestException:
    # Only request failures (malformed URL, connection errors, ...) are
    # reported as a bad URL.  A bare `except:` would also swallow
    # KeyboardInterrupt, SystemExit and programming errors.
    print(
        "Formato de la web erroneo, debe ser del estilo http://www.google.com")

# statusCode stays 0 when the request failed, so `r` is only used on success.
if statusCode == 200:
    # Strip the tags from the page text.
    # NOTE(review): the original comment also claimed lowercasing, but no
    # .lower() is applied here -- confirm whether obo.stripTags does it.
    text = obo.stripTags(r.text)
    # Drop the non-alphanumeric characters.
    fullwordlist = obo.stripNonAlphaNum(text)
    if args.stopwords:
        # Remove the common words for the selected language.
        fullwordlist = obo.removeStopwords(fullwordlist, args.stopwords)
    if args.long:
        # Remove short words; the original comment says "fewer than 2
        # characters", but the limit passed is args.long -- TODO confirm.
        fullwordlist = obo.excludeTwo(fullwordlist, args.long)
    # Build a word -> frequency dictionary.
    dictionary = obo.wordListToFreqDict(fullwordlist)
    # Sort the words by frequency (per the original comment, a list of
    # lists is returned).
    sorteddict = obo.sortFreqDict(dictionary)
Beispiel #9
0
# practice using the stripTags function

import obo

# Sample text mixing plain words with real and made-up tags.
myText = "This is my <p>HTMdadasdasdasd <madeUpTag> transcirpt about B<br/> message"
# Show the text before processing.
print(myText)

# Store what obo.stripTags returns for the sample text.
theResult = obo.stripTags(myText)

# Show the text after processing, for comparison with the line above.
print(theResult)

# Distinguishing between single and double quotation marks: the string is
# delimited by single quotes, so it can contain double quotes unescaped.

v = '<a href="http://www.sheffield.ac.uk/hri/">'

import requests, obo
url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33'
pagetext = requests.get(url)
HTML = pagetext.text

# Strip the markup and make the text lower case.
text = obo.stripTags(HTML).lower()

# Convert to a list of words, no punctuation.
wordlist = obo.stripNonAlphaNum(text)

# Add the words and their counts to a dictionary.
dictionary = obo.wordListToFreqDict(wordlist)

# Sort the word list by frequency.
sorteddict = obo.sortFreqDict(dictionary)

# One entry per output line.
for s in sorteddict:
    print(str(s))
# trial-content.py

import urllib2, obo
 
url = 'http://www.oldbaileyonline.org/print.jsp?div=t17800628-33'

# Fetch the page and read the whole response body.
response = urllib2.urlopen(url)
HTML = response.read()

# Print what obo.stripTags returns for the page.  The call form prints the
# same output as the Python 2 print statement for a single argument and is
# also valid Python 3 syntax, matching the other scripts.
print(obo.stripTags(HTML))