Example #1
# Module-level context assumed by this excerpt: CATS, BASE_URL, SUBCATS,
# SECTIONS, SECTION, MAX_TRIES and the choose_random_* helpers are defined
# elsewhere in the same module; the imports below cover what the function
# itself uses.
import json
import urllib
from random import randrange
from BeautifulSoup import BeautifulSoup as Soup

def random_quote(jenni, cat):
    if cat is not None:
        if cat not in CATS:
            jenni.say("I don't know that category, please select from one of: {0}".format(', '.join(CATS)))
            return
    else:
        cat = CATS[randrange(len(CATS))]

    page_title = page_id = None

    # First drill down to the lowest category
    while True:
        try:
            cat_url = BASE_URL + SUBCATS % cat
            content = json.loads(urllib.urlopen(cat_url).read())
            cat_members = content["query"]["categorymembers"]

            # Select at random
            random_member = choose_random_member(cat_members)
            if random_member is None:
                jenni.say("An error occurred fetching a subcategory")
                return
            
            if random_member["type"] == "subcat":
                cat = random_member["title"]
            else:
                page_title = random_member["title"]
                page_id = random_member["pageid"]
                break
        except Exception as e:
            jenni.say("An error occurred fetching a quote: {0}".format(e))
            return

    # Next select a random quote from the page
    try:
        page_url = BASE_URL + SECTIONS % page_id
        content = json.loads(urllib.urlopen(page_url).read())
        sections = content["parse"]["sections"]

        quote = None
        num_tries = 0
        while quote is None and num_tries < MAX_TRIES:
            section = choose_random_section(sections)
    
            if section is None:
                jenni.say("We accidentally chose a page with no quotes, sorry about that!")
                return
    
            section_index = randrange(len(sections)) + 1
    
            section_url = BASE_URL + SECTION % (page_id, section_index)
            content = json.loads(urllib.urlopen(section_url).read())
            section_title = content["parse"]["title"]
            html = Soup(content["parse"]["text"]["*"])
            all_quotes = []
            for ul in html.findAll('ul'):
                for li in ul.findAll('li'):
                    all_quotes.append(li.text)
    
            for dd in html.findAll('dd'):
                all_quotes.append(dd.text.replace("<b>","").replace("</b>",""))
    
            len_all_quotes = len(all_quotes)
            if len_all_quotes == 0:
                num_tries += 1
            else:
                quote = all_quotes[randrange(len_all_quotes)]
    
        if quote is None:
            jenni.say("We accidentally chose a section of a page with no quotes, sorry about that!")
            return

        jenni.say("{0}: {1}".format(section_title, quote.encode('utf-8')))
    except Exception as e:
        jenni.say("An error occurred fetching a quote: {0}".format(e))
        return
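Example #1 calls two helpers that are not shown in the excerpt. A minimal sketch of what they might look like, assuming choose_random_member filters for usable category members and choose_random_section simply picks a section at random (names and behavior are inferred from the call sites, not taken from the original module):

from random import randrange

# Hypothetical sketch of the helpers used above.
def choose_random_member(members):
    # Keep only subcategories and pages; anything else (e.g. files) is skipped.
    usable = [m for m in members if m.get("type") in ("subcat", "page")]
    if not usable:
        return None
    return usable[randrange(len(usable))]

def choose_random_section(sections):
    # Return a random section dict, or None if the page has no sections.
    if not sections:
        return None
    return sections[randrange(len(sections))]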
Example #2
from BeautifulSoup import BeautifulSoup as Soup
import urllib2

file = "/Users/ganeshchand/gh/gc/python/learning-python/src/xml/report.xml"

with open(file, 'rb') as reader:
    soup = Soup(reader.read())
# print soup

for dataitem in soup.findAll('dataitem'):
    dataitem_attrs = dict(dataitem.attrs)
    expression = dataitem.find('expression')
    expression_attrs = dict(expression.attrs)
    print dataitem_attrs
    print expression_attrs

for dataitem in soup.findAll('dataitem'):
    print dataitem.contents[1].text
Example #3
#!/usr/bin/env python2
#author: mp
#comment: scrape viewdns.info for a list of IPv4 addresses a domain has pointed to

from BeautifulSoup import BeautifulSoup as Soup
import urllib2
import sys

soup = Soup(
    urllib2.urlopen("http://viewdns.info/iphistory/?domain={}".format(
        sys.argv[1])).read())
for table in soup.findAll("table", {"border": "1"}):
    for tr in table.findAll("tr"):
        print tr.text
Example #4
def get_img_links(url):
    soup = Soup(load_page(url))
    imgs = select(soup, 'a.post-meidaurl img')
    return [img['src'] for img in imgs]
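get_img_links leans on two names that are not defined in the snippet: select, whose select(soup, css_selector) signature matches the soupselect package for BeautifulSoup 3, and load_page, presumably a small fetch helper. A sketch of the assumed context:

import urllib2
from BeautifulSoup import BeautifulSoup as Soup
from soupselect import select  # CSS selector support for BeautifulSoup 3

# Hypothetical helper: fetch a URL and return the raw HTML.
def load_page(url):
    return urllib2.urlopen(url).read()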
Example #5
#!/usr/bin/env python2
#author: mp
#comment: pull a list of IPv4 address for a given country

import urllib2
from BeautifulSoup import BeautifulSoup as Soup
import sys

soup = Soup(urllib2.urlopen("http://www.nirsoft.net/countryip/").read())
csv = None
for i in soup.findAll("a"):
    if sys.argv[1] in i.text:
        csv = urllib2.urlopen("http://www.nirsoft.net/countryip/{}.csv".format(
            i['href'].split(".")[0])).read()

if csv:
    for line in csv.splitlines():
        print line
Example #6
import urllib2
from BeautifulSoup import BeautifulSoup as Soup

def get_links(url):
    opener = urllib2.build_opener()
    opener.addheaders = [("User-agent", "Mozilla/5.0")]
    soup = Soup(opener.open(url).read())
    for link in soup.findAll("div", {"class": "url"}):
        print " {}".format(link.text.encode("utf-8").strip())
Example #7
   Wrong:   %s
        """ % (self.category, self.answer, self.dollars, self.order,
               self.question, str(self.right), str(self.wrong))


GAMES = ('617', '618', '619', '620', '621', '732', '736')
collected_clues = {}
collected_scores = {}

# ------ grab clues for the game ------
for game_number in GAMES:

    collected_clues[game_number] = []

    with open('%s.html' % game_number, 'r') as f:
        soup = Soup(f.read())

    KEY = {
        'clue_value': 'dollars',
        'clue_order_number': 'order',
        'clue_text': 'answer',
    }

    for (round_number, round) in enumerate(select(soup, 'table.round')):

        categories = []
        for category in select(round, 'td.category_name'):
            categories.append(strip_tags(str(category)).strip())
        for (i, clue) in enumerate(select(round, 'td.clue')):
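This fragment (the tail of a clue class plus part of the scraping loop, truncated in the source) also depends on select from soupselect and on a strip_tags helper that is not shown. A minimal strip_tags, offered only as an assumption, could be:

import re

# Hypothetical helper: remove HTML tags, leaving only the text.
def strip_tags(markup):
    return re.sub(r'<[^>]+>', '', markup)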
Example #8
#!/usr/bin/env python2
from BeautifulSoup import BeautifulSoup as Soup
import urllib2
import re
import sys

def usage():
    print "./iplookup.py <ipv4 address>"
    sys.exit()

try:
    ip = sys.argv[1]
except IndexError:
    usage()

if re.match( r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", ip ):
    opener = urllib2.build_opener()
    opener.addheaders = [( "User-agent", "Mozilla/5.0" )]                             
    data = Soup( opener.open( "http://whatismyipaddress.com/ip/{}".format( ip ) ).read() )
    for row in data.findAll( "tr" ):
        line = row.text #print pretty :)
        if "latitude" in line.lower() or "longitude" in line.lower():
            pass #i dont care for these results tbh
        elif "blacklist:" in line.lower():
            pass #scraping these results can be part of another script
        else:
            print line
else:
    print "!!! invalid ipv4 address"
    usage() #regex failed
Example #9
def craw_a_movie(url):
    fd = urllib.urlopen(url)
    soup = Soup(fd)
    data = {}
    data["location"] = select(soup, 'h1')
    return data
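select here again looks like soupselect's select(soup, selector), which returns a list of matching elements rather than text, so data["location"] ends up holding tag objects. If plain text were wanted, a variant might read (a sketch, not the original):

import urllib
from BeautifulSoup import BeautifulSoup as Soup
from soupselect import select

def craw_a_movie(url):
    fd = urllib.urlopen(url)
    soup = Soup(fd.read())
    fd.close()
    headings = select(soup, 'h1')
    return {"location": headings[0].text if headings else None}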
Example #10
    def fetch(self):
        f = urllib.urlopen(self.url)
        self.text = f.read()
        self.soup = Soup(self.text)
        f.close()
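fetch is a method lifted out of some class; the only attribute it needs coming in is url. A minimal enclosing class, with the class name and constructor as assumptions, might look like:

import urllib
from BeautifulSoup import BeautifulSoup as Soup

class Page(object):
    def __init__(self, url):
        self.url = url
        self.text = None
        self.soup = None

    def fetch(self):
        f = urllib.urlopen(self.url)
        self.text = f.read()
        self.soup = Soup(self.text)
        f.close()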
Example #11
import urllib
import re, os, sys
from BeautifulSoup import BeautifulSoup as Soup
sys.path.insert(0, '/home/ted/alderaan-wc/')
from ngt.utils.tracker import Tracker


rooturl = 'http://pds-imaging.jpl.nasa.gov/data/mgs-m-moc-na_wa-2-sdp-l0-v1.0/'
targetpath = '/home/ted/data/moc_meta'
indexfiles = ['imgindx.lbl','imgindx.tab','imgindex.lbl','imgindex.tab']

root = urllib.urlopen(rooturl)
soup = Soup(root.read())
volpattern = re.compile(r'^mgsc_\d+/?$')
dirlinks = soup.findAll('a', href=volpattern)

for voldir in Tracker(iter=[l['href'] for l in dirlinks] ):
    try:
        target_dir = os.path.join(targetpath, voldir, 'index')
        os.makedirs(target_dir)
    except os.error:
        pass

    for ifile in indexfiles:
        img_response = urllib.urlopen(rooturl + voldir + 'index/' + ifile)
        if img_response.getcode() == 200:
            with open(os.path.join(target_dir, ifile), 'w') as out:
                out.write(img_response.read())
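Tracker is imported from an internal ngt.utils.tracker module and is used purely as an iterable wrapper (most likely a progress reporter). A stand-in that would let the script run outside that environment, offered only as an assumption, is:

def Tracker(iter=None):
    # Minimal stand-in: the real Tracker presumably reports progress while
    # yielding the same items. The parameter name mirrors the keyword used
    # at the call site, even though it shadows the builtin.
    return iter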