def random_quote(jenni, cat):
    if cat is not None:
        if cat not in CATS:
            jenni.say("I don't know that category, please select from "
                      "one of: {0}".format(', '.join(CATS)))
            return
    else:
        cat = CATS[randrange(len(CATS))]

    page_title = page_id = None

    # First drill down to the lowest category
    while True:
        try:
            cat_url = BASE_URL + SUBCATS % cat
            content = json.loads(urllib.urlopen(cat_url).read())
            cat_members = content["query"]["categorymembers"]
            # Select at random
            random_member = choose_random_member(cat_members)
            if random_member is None:
                jenni.say("An error occurred fetching a subcategory")
                return
            if random_member["type"] == "subcat":
                cat = random_member["title"]
            else:
                page_title = random_member["title"]
                page_id = random_member["pageid"]
                break
        except Exception as e:
            jenni.say("An error occurred fetching a quote: {0}".format(e))
            return

    # Next select a random quote from the page
    try:
        page_url = BASE_URL + SECTIONS % page_id
        content = json.loads(urllib.urlopen(page_url).read())
        sections = content["parse"]["sections"]
        quote = None
        num_tries = 0
        while quote is None and num_tries < MAX_TRIES:
            section = choose_random_section(sections)
            if section is None:
                jenni.say("We accidentally chose a page with no quotes, "
                          "sorry about that!")
                return
            section_index = randrange(len(sections)) + 1
            section_url = BASE_URL + SECTION % (page_id, section_index)
            content = json.loads(urllib.urlopen(section_url).read())
            section_title = content["parse"]["title"]
            html = Soup(content["parse"]["text"]["*"])
            # Quotes live in <ul><li> items; <dd> tags hold translations
            # and notes
            all_quotes = []
            for ul in html.findAll('ul'):
                for li in ul.findAll('li'):
                    all_quotes.append(li.text)
            for dd in html.findAll('dd'):
                all_quotes.append(dd.text.replace("<b>", "").replace("</b>", ""))
            len_all_quotes = len(all_quotes)
            if len_all_quotes == 0:
                num_tries += 1
            else:
                quote = all_quotes[randrange(len_all_quotes)]
        if quote is None:
            jenni.say("We accidentally chose a section of a page with no "
                      "quotes, sorry about that!")
            return
        jenni.say("{0}: {1}".format(section_title, quote.encode('utf-8')))
    except Exception as e:
        jenni.say("An error occurred fetching a quote: {0}".format(e))
        return
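The snippet above calls two helpers, choose_random_member and choose_random_section, that are defined elsewhere in the module. A minimal sketch of what they might look like, assuming each simply picks a random entry from the API result list and returns None when the list is empty:

from random import randrange

def choose_random_member(cat_members):
    # assumption: cat_members is the "categorymembers" list returned by
    # the MediaWiki API; any entry is as good as any other
    if not cat_members:
        return None
    return cat_members[randrange(len(cat_members))]

def choose_random_section(sections):
    # assumption: sections is the "sections" list from action=parse
    if not sections:
        return None
    return sections[randrange(len(sections))]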
from BeautifulSoup import BeautifulSoup as Soup

file_path = "/Users/ganeshchand/gh/gc/python/learning-python/src/xml/report.xml"
fileReader = open(file_path, 'rb')
soup = Soup(fileReader)
# print soup

for dataitem in soup.findAll('dataitem'):
    dataitem_attrs = dict(dataitem.attrs)
    expression = dataitem.find('expression')
    expression_attrs = dict(expression.attrs)
    print dataitem_attrs
    print expression_attrs

for expression in soup.findAll('dataitem'):
    # contents[0] is the whitespace text node before the child element,
    # so contents[1] is the <expression> element itself
    print expression.contents[1].text
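The lookups above imply a particular shape for report.xml: dataitem elements carrying attributes, each wrapping an expression child. A hedged example of input that would satisfy this parser (every tag attribute and value here is invented for illustration, not taken from the real file):

from BeautifulSoup import BeautifulSoup as Soup

sample = """
<report>
  <dataitem name="total" type="metric">
    <expression lang="formula">[Sales] + [Tax]</expression>
  </dataitem>
</report>
"""

soup = Soup(sample)
for dataitem in soup.findAll('dataitem'):
    print dict(dataitem.attrs)  # {u'name': u'total', u'type': u'metric'}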
#!/usr/bin/env python2
# author: mp
# comment: scrape viewdns.info for a list of IPv4 addresses a domain has
#          pointed to; pass the domain as the first argument

from BeautifulSoup import BeautifulSoup as Soup
import urllib2
import sys

soup = Soup(
    urllib2.urlopen("http://viewdns.info/iphistory/?domain={}".format(
        sys.argv[1])).read())

# the IP history results sit in the table(s) with border="1"
for table in soup.findAll("table", {"border": "1"}):
    for tr in table.findAll("tr"):
        print tr.text
def get_img_links(url):
    soup = Soup(load_page(url))
    imgs = select(soup, 'a.post-meidaurl img')
    return [img['src'] for img in imgs]
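get_img_links leans on two helpers that are not part of this snippet: select is the CSS-selector function from the soupselect package, and load_page is presumably a thin URL fetcher. A minimal sketch of load_page, assuming all it needs to do is return the page body (the User-Agent spoofing mirrors the other scrapers in this collection):

import urllib2

def load_page(url):
    # assumption: load_page just fetches the raw HTML for the URL
    opener = urllib2.build_opener()
    opener.addheaders = [("User-agent", "Mozilla/5.0")]
    return opener.open(url).read()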
#!/usr/bin/env python2
# author: mp
# comment: pull a list of IPv4 addresses for a given country

import urllib2
from BeautifulSoup import BeautifulSoup as Soup
import sys

soup = Soup(urllib2.urlopen("http://www.nirsoft.net/countryip/").read())

for i in soup.findAll("a"):
    if sys.argv[1] in i.text:
        # the link target is e.g. "xx.html"; the ranges live in "xx.csv",
        # which is plain text, so read it directly rather than souping it
        csv = urllib2.urlopen(
            "http://www.nirsoft.net/countryip/{}.csv".format(
                i['href'].split(".")[0])).read()
        if csv:
            for line in csv.splitlines():
                print line
import urllib2
from BeautifulSoup import BeautifulSoup as Soup

def get_links(url):
    # spoof a browser User-Agent so the page doesn't reject the scraper
    opener = urllib2.build_opener()
    opener.addheaders = [("User-agent", "Mozilla/5.0")]
    soup = Soup(opener.open(url).read())
    for link in soup.findAll("div", {"class": "url"}):
        print " {}".format(link.text.encode("utf-8").strip())
        Wrong: %s
        """ % (self.category, self.answer, self.dollars, self.order,
               self.question, str(self.right), str(self.wrong))

GAMES = ('617', '618', '619', '620', '621', '732', '736')

collected_clues = {}
collected_scores = {}

# ------ grab clues for the game ------
for game_number in GAMES:
    collected_clues[game_number] = []

    f = open('%s.html' % game_number, 'r')
    soup = Soup(f.read())
    f.close()

    KEY = {
        'clue_value': 'dollars',
        'clue_order_number': 'order',
        'clue_text': 'answer',
    }

    for (round_number, round) in enumerate(select(soup, 'table.round')):
        categories = []
        for category in select(round, 'td.category_name'):
            categories.append(strip_tags(str(category)).strip())
        for (i, clue) in enumerate(select(round, 'td.clue')):
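The fragment above also uses strip_tags, which is not included. A minimal sketch, assuming it only has to drop tags from the simple <td> markup being scraped:

import re

def strip_tags(html):
    # naive tag stripper; adequate for flat markup like
    # '<td class="category_name">HISTORY</td>'
    return re.sub(r'<[^>]+>', '', html)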
#!/usr/bin/env python2

from BeautifulSoup import BeautifulSoup as Soup
import urllib2
import re
import sys

def usage():
    print "./iplookup.py <ipv4 address>"
    sys.exit()

try:
    ip = sys.argv[1]
except IndexError:
    usage()

if re.match(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", ip):
    opener = urllib2.build_opener()
    opener.addheaders = [("User-agent", "Mozilla/5.0")]
    data = Soup(opener.open(
        "http://whatismyipaddress.com/ip/{}".format(ip)).read())
    # print pretty :)
    for row in data.findAll("tr"):
        line = row.text
        if "latitude" in line.lower() or "longitude" in line.lower():
            pass  # not interested in these results
        elif "blacklist:" in line.lower():
            pass  # scraping these can be part of another script
        else:
            print line
else:
    print "!!! invalid ipv4 address"
    usage()  # regex failed
import urllib
from BeautifulSoup import BeautifulSoup as Soup
from soupselect import select  # select(soup, css) as in the other snippets

def craw_a_movie(url):
    fd = urllib.urlopen(url)
    soup = Soup(fd)
    data = {}
    data["location"] = select(soup, 'h1')
    return data
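A hedged usage note: craw_a_movie returns the raw tag objects matched by the h1 selector, so callers still have to pull the text out themselves (the URL below is illustrative, not from the original):

movie = craw_a_movie("http://example.com/movie/42")
print [h1.text for h1 in movie["location"]]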
def fetch(self):
    # download self.url, keeping both the raw text and the parsed tree
    f = urllib.urlopen(self.url)
    self.text = f.read()
    self.soup = Soup(self.text)
    f.close()
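fetch is an instance method; a minimal sketch of a host class it could hang off, assuming the instance only needs to carry the target URL (the class name is invented for illustration):

import urllib
from BeautifulSoup import BeautifulSoup as Soup

class Page(object):
    def __init__(self, url):
        self.url = url
        self.text = None
        self.soup = None

    def fetch(self):
        # download self.url, keeping both the raw text and the parsed tree
        f = urllib.urlopen(self.url)
        self.text = f.read()
        self.soup = Soup(self.text)
        f.close()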
import urllib
import re, os, sys
from BeautifulSoup import BeautifulSoup as Soup

sys.path.insert(0, '/home/ted/alderaan-wc/')
from ngt.utils.tracker import Tracker

rooturl = 'http://pds-imaging.jpl.nasa.gov/data/mgs-m-moc-na_wa-2-sdp-l0-v1.0/'
targetpath = '/home/ted/data/moc_meta'
indexfiles = ['imgindx.lbl', 'imgindx.tab', 'imgindex.lbl', 'imgindex.tab']

root = urllib.urlopen(rooturl)
soup = Soup(root.read())

# volume directory links look like "mgsc_0123/"
volpattern = re.compile(r'^mgsc_\d+/?$')
dirlinks = soup.findAll('a', href=volpattern)

for voldir in Tracker(iter=[l['href'] for l in dirlinks]):
    try:
        target_dir = os.path.join(targetpath, voldir, 'index')
        os.makedirs(target_dir)
    except os.error:
        pass  # directory already exists
    for ifile in indexfiles:
        img_response = urllib.urlopen(rooturl + voldir + 'index/' + ifile)
        if img_response.getcode() == 200:
            out = open(os.path.join(target_dir, ifile), 'w')
            out.write(img_response.read())
            out.close()