def scrape_and_look_for_next_link(url, rest_id):
    try:
        html = scraperwiki.scrape(url)
    except scraperwiki.Error:
        print "Sleeping for 10 seconds..."
        time.sleep(10)
        html = scraperwiki.scrape(url)
    # print html
    root = lxml.html.fromstring(html)
    rest_id = scrape_restaurants(root, rest_id, base_url, city_url)
    scraperwiki.sqlite.save_var('last_rest_id', rest_id)
    next_link = None  # avoid a NameError when the page has no pagination links
    next_links = root.cssselect("ul.pagination-control li.active a")
    for link in next_links:
        if "Next" in link.text:
            next_link = link.attrib.get('href')
        else:
            next_link = None
    if next_link is not None:
        next_url = urlparse.urljoin(base_url, next_link)
        print next_url
        scraperwiki.sqlite.save_var('last_url_scraped', next_url)
        scrape_and_look_for_next_link(next_url, rest_id)
    else:
        print "Finished scraping!"
def scrape_press_releases():
    releases_page = pq(scraperwiki.scrape(BASE_URL + 'news-releases'))
    for row in releases_page.find('.recordListTitle'):
        sleep(1)
        title = ''
        date = None
        content = ''
        attachments = []
        links = pq(row).find('a')
        page = pq(scraperwiki.scrape(links.eq(0).attr('href')))
        title = _extract_title_from(page)
        content = _readable(page.find('.content').html())
        date = _extract_date_from(page)
        for attachment in page.find('.file_link a'):
            att = pq(attachment)
            # 'html' is not an anchor attribute; the attachment URL lives in 'href'
            attachments.append({att.text(): att.attr('href')})
        args = [title, date, content]
        kwargs = {}
        if len(attachments):
            kwargs.update(attachments=attachments)
        gasp.add_press_release(*args, **kwargs)
def get_os_id(params):
    """
    Function to get Open States ID. Please do not abuse API key.
    """
    apikey = '49c5c72c157d4b37892ddb52c63d06be'
    params['apikey'] = apikey
    os_url = create_os_url(params)
    raw = scraperwiki.scrape(os_url)
    os_data = demjson.decode(raw)
    os_found = len(os_data)
    os_id = ''

    # Use first if any found, if not remove last name and retry
    if os_found > 0:
        os_id = os_data[0]['id']
    else:
        del params['first_name']
        os_url = create_os_url(params)
        raw = scraperwiki.scrape(os_url)
        os_data = demjson.decode(raw)
        os_found = str(len(os_data)) + '-removed-first'
        if len(os_data) > 0:
            os_id = os_data[0]['id']

    return {
        'found': os_found,
        'id': os_id
    }
def geocode_postcode(postcode, provider='geonames'):
    import urllib2, json
    if provider == 'geonames':
        resp = scraperwiki.scrape('http://api.geonames.org/postalCodeSearchJSON?postalcode=%s&maxRows=1&username=scraperwiki' % urllib2.quote(postcode))
        obj = json.loads(resp)
        if 'postalCodes' in obj and len(obj['postalCodes']):
            return (obj['postalCodes'][0]['lat'], obj['postalCodes'][0]['lng'])
        else:
            return (None, None)
    elif provider == 'mapit':
        try:
            resp = scraperwiki.scrape('http://mapit.mysociety.org/postcode/%s.json' % urllib2.quote(postcode))
        except urllib2.HTTPError:
            return (None, None)
        else:
            obj = json.loads(resp)
            if 'wgs84_lat' in obj and 'wgs84_lon' in obj:
                return (obj['wgs84_lat'], obj['wgs84_lon'])
            else:
                return (None, None)
    elif provider == 'jamiethompson':
        import requests
        try:
            resp = requests.get('http://geo.jamiethompson.co.uk/%s.json' % postcode.replace(' ', ''), timeout=5).text
        except:
            return (None, None)
        else:
            obj = json.loads(resp)
            if 'geo' in obj and 'lat' in obj['geo'] and 'lng' in obj['geo']:
                return (obj['geo']['lat'], obj['geo']['lng'])
            else:
                return (None, None)
    else:
        return (None, None)
def scrapeAndSave(url):
    html = scraperwiki.scrape(url)
    root = lxml.html.fromstring(html)
    for tr in root.cssselect("div#chart_body .chart tr"):
        tds = tr.cssselect("td")
        if len(tds) == 11:
            a = tds[1].cssselect("a")[0]
            game_url = a.attrib["href"]
            game_html = scraperwiki.scrape(game_url)
            game_root = lxml.html.fromstring(game_html)
            title = game_root.cssselect("h1")[0].text_content()
            platform = (
                game_root.cssselect("table#game_infobox tr")[1].cssselect("td")[0].cssselect("a")[0].text_content()
            )
            genre = game_root.cssselect("table#game_infobox tr")[2].cssselect("td")[1].cssselect("a")[0].text_content()
            for game_tr in game_root.cssselect("div#game_table_box table tr"):
                game_tds = game_tr.cssselect("td")
                if len(game_tds) == 5:
                    data = {
                        "Title": title,
                        "Region": game_tds[2].text_content(),
                        "Release_Date": game_tds[3].text_content(),
                        "Platform": platform,
                        "Genre": genre,
                        "Publisher": game_tds[1].text_content(),
                    }
                    saveToStore(data)
def scrape_and_find_csv(url):
    html = scraperwiki.scrape(url)
    root = lxml.html.fromstring(html)
    # this selects all HTML containing the link: <p class="fileicon"><a>
    csvs = root.cssselect('p.fileicon a')
    print csvs
    for link in csvs:
        # this prints the result of adding the base URL to the relative link grabbed
        #print "link.attrib.get('href')", link.attrib.get('href')
        fullurl = baseurl + link.attrib.get('href')
        print link.attrib.get('href')[-3:]
        if link.attrib.get('href')[-3:] == "csv":
            data = scraperwiki.scrape(baseurl + link.attrib.get('href'))
            reader = csv.DictReader(data.splitlines())
            for row in reader:
                # print row[' Invoice Amount '][0:]
                row['missingletter'] = row[' Invoice Amount '][0]
                row[' Invoice Amount_full'] = row[' Invoice Amount '].decode("latin-1")
                # AttributeError: 'NoneType' object has no attribute 'decode'
                if row['Invoice Ref'] is not None:
                    row['Invoice Ref'] = row['Invoice Ref'].decode("latin-1")
                row[' Invoice Amount '] = row[' Invoice Amount '][2:]
                row['URL'] = fullurl
                print row
                if row['Doc Number'] is not None:
                    scraperwiki.sqlite.save(['Doc Number', 'URL'], row)
def process_journal_pdfs(parser, listurl, errors):
    # print "Finding PDFs on " + listurl
    # u = urllib.parse.urlparse(listurl)
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect("div.month-entry-title a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href)
        print url
        if -1 != href.find("file://"):
            # print "Skipping non-http URL " + url
            continue
        subhtml = scraperwiki.scrape(url)
        subroot = lxml.html.fromstring(subhtml)
        subhtml = None
        for subahref in subroot.cssselect("div.related-attachements a"):
            subhref = subahref.attrib['href']
            suburl = urlparse.urljoin(url, subhref)
            if -1 == suburl.find(".pdf"):
                continue
            if parser.is_already_scraped(suburl):
                pass
                # print "Skipping already scraped " + url
            else:
                # print "Will process " + url
                process_pdf(parser, suburl, errors)
def openPage(pageURL):
    locallisthtml = scraperwiki.scrape(pageURL)
    localsoup = BeautifulSoup(locallisthtml)
    trs = localsoup.find("table", {"width": "770"}).findAll("tr")
    thisi = 0
    for tr in trs:
        thisi = thisi + 1
        tds = tr.findAll("td")
        if len(tds) > 1:
            thisa = tds[1].find("a")['href']
            if thisa.find("../") > -1:
                starting_url = siteurl + thisa[3:]
            else:
                starting_url = siteurl + localfolder + "/" + thisa
            print str(thisi) + " " + starting_url
            if starting_url != "http://www.pub-explorer.com/olpg/the-blackswan/ashover/index.htm" and starting_url.find("/http") == -1:
                html = scraperwiki.scrape(starting_url)
                soup = BeautifulSoup(html)
                ps = soup.findAll("p", {"align": "RIGHT"})
                for p in ps:
                    ptext = p.text
                    #print ptext
                    if ptext.find("Baby Chang") > -1 or ptext.find("baby chang") > -1:
                        if starting_url != "http://www.pub-explorer.com/notts/pub/hutt.htm":
                            getDetails(soup)
                        break
def scrape_application(link):
    # get data on agent and application person
    details_page = link
    print 'Scraping Details: ', details_page
    page_d = scraperwiki.scrape(details_page)
    tree_app = html.fromstring(page_d)

    # get data from decision tab page
    decision_page = link + '&theTabNo=2'
    print 'Scraping Decision: ', decision_page
    page = scraperwiki.scrape(decision_page)
    tree = html.fromstring(page)

    data = {
        'info_url': link,
        'decision': remove_characters(tree.cssselect("div#tabs_container div#tabContent div#fieldset_data p.fieldset_data")[0].text),
        'app_date': remove_characters(tree.cssselect("div#apas_form fieldset.apas div#fieldset_data p.fieldset_data")[0].text_content()),
        'app_ref': remove_characters(tree.cssselect("div#apas_form fieldset.apas div#fieldset_data p.fieldset_data")[1].text),
        'reg_date': remove_characters(tree.cssselect("div#apas_form fieldset.apas div#fieldset_data p.fieldset_data")[2].text),
        'decision_date': remove_characters(tree.cssselect("div#tabs_container div#tabContent div#fieldset_data p.fieldset_data")[1].text),
        'app_type': remove_characters(tree.cssselect("div#apas_form fieldset.apas div#fieldset_data p.fieldset_data")[4].text),
        'ext_date': remove_characters(tree.cssselect("div#apas_form fieldset.apas div#fieldset_data p.fieldset_data")[5].text),
        'main_loc': remove_characters(tree.cssselect("div#apas_form fieldset.apas div#fieldset_data p.fieldset_data")[6].text),
        'desc': remove_characters(tree.cssselect("div#apas_form fieldset.apas div#fieldset_data p.fieldset_data")[7].text),
        'full_desc': remove_characters(tree.cssselect("div#apas_form fieldset.apas div#fieldset_data p.fieldset_data")[8].text),
        'app_status': remove_characters(tree.cssselect("div#apas_form fieldset.apas div#fieldset_data p.fieldset_data")[9].text),
        'status_desc': remove_characters(tree.cssselect("div#apas_form fieldset.apas div#fieldset_data p.fieldset_data")[10].text),
        'comment': remove_characters(tree.cssselect("div#apas_form fieldset.apas div#fieldset_data p.fieldset_data")[11].text),
        'application_company': remove_characters(tree_app.cssselect("div#tabs_container div#tabContent div#fieldset_data p.fieldset_data")[2].text),
        'agent': remove_characters(tree_app.cssselect("div#tabs_container div#tabContent div#fieldset_data p.fieldset_data")[8].text)
    }
    print 'Scraping Complete :) NEXT! \n\n'
    scraperwiki.sqlite.save(unique_keys=['app_ref'], data=data)
def main():
    # scrape the borough overview page
    borough_html = scraperwiki.scrape('http://maps.met.police.uk/php/dataview.php?area=MPS&ct=8')
    borough_page = BeautifulSoup.BeautifulSoup(borough_html)
    boroughs = extract_areas(borough_page)
    for borough in boroughs:
        ward_html = scraperwiki.scrape(borough['area_link'])
        ward_page = BeautifulSoup.BeautifulSoup(ward_html)
        wards = extract_areas(ward_page)
        for ward in wards:
            sub_ward_html = scraperwiki.scrape(ward['area_link'])
            sub_ward_page = BeautifulSoup.BeautifulSoup(sub_ward_html)
            sub_wards = extract_areas(sub_ward_page)
            for sub_ward in sub_wards:
                crimes = extract_crime(sub_ward['area_link'])
                for crime in crimes:
                    data = {
                        'borough': borough['area_name'],
                        'ward': ward['area_name'],
                        'sub_ward': sub_ward['area_name'],
                        'super_output_area_code': sub_ward['area_id'],
                        'month': crime['month'],
                        'crime_type': crime['crime_type'],
                        'crime_rate': crime['crime_rate'],
                        'crime_count': crime['crime_count'],
                    }
                    datastore.save(unique_keys=['super_output_area_code', 'month', 'crime_type'], data=data)
def congress_scrape(api_call, congress, chamber, bill_type):
    results = simplejson.loads(scraperwiki.scrape(api_call))
    results = results['results'][0]['bills']
    for bill in results:
        bill_uri = bill['bill_uri']
        bill_number = bill['number']
        title = bill['title']
        committees = bill['committees']
        num_cosponsors = bill['cosponsors']
        last_action = bill['latest_major_action']
        date_last_action = bill['latest_major_action_date']
        time.sleep(1)
        if int(num_cosponsors) > 0:
            cosponsor_call = 'http://api.nytimes.com/svc/politics/v3/us/legislative/congress/%s/bills/%s/cosponsors.json?api-key=c886aef674b84bc2ce2f20439b7fff9c:12:66229250' % (congress, bill_number.replace(".", ""))
            time.sleep(2)
            print cosponsor_call
            cosponsor_results = simplejson.loads(scraperwiki.scrape(cosponsor_call))['results'][0]
            sponsor = cosponsor_results['sponsor']
            sponsor_id = cosponsor_results['sponsor_id']
            # Cosponsors: list of dictionaries
            cosponsors = cosponsor_results['cosponsors']
            for cosponsor in cosponsors:
                dict_results = {"Bill_number": bill_number, 'Title': title,
                                'Committee': committees, 'Last_action': last_action,
                                "Date_Last_action": date_last_action, "Sponsor": sponsor,
                                "Sponsor_ID": sponsor_id, "Congress_Number": congress,
                                "Chamber": chamber}
                cosponsor_id = cosponsor['cosponsor_id']
                cosponsor_name = cosponsor['name']
                dict_results['Cosponsor_ID'] = cosponsor_id
                dict_results["Cosponsor_Name"] = cosponsor_name
                dict_results["Unique"] = bill_number + cosponsor_id
                scraperwiki.sqlite.save(unique_keys=['Unique'], data=dict_results)
def ParseSeatForCandidates(seat):
    jsontext = scraperwiki.scrape("http://www.yournextmp.com/seats/%s?output=json" % seat)
    contents = json.loads(jsontext)["result"]

    # get wikipedia link not in the json dump
    text = scraperwiki.scrape("http://www.yournextmp.com/seats/%s" % seat)
    constituencyurl = re.search('<a href="(http://en.wikipedia.org/wiki/[^"]*?\(UK_Parliament_constituency\))"', text).group(1)
    constituency = contents["name"]

    result = []
    for candidate in contents["candidates"]:
        url = "http://www.yournextmp.com/candidates/%s" % candidate["code"]
        data = {"name": candidate["name"], "url": url, "party": candidate["party"]["name"],
                "constituency": constituency, "constituencyurl": constituencyurl}

        # get wikipedia link not in the json dump
        ptext = scraperwiki.scrape(url)
        wpurls = re.findall('<a href="(http://en.wikipedia.org/wiki/.*?)"', ptext)
        if wpurls:
            data["wpurl"] = re.sub(" ", "_", wpurls[0])
        #result.append(data)
        #print ptext
        for w in re.findall('<h3>\s*<a href="(.*?)"', ptext):
            if not re.match("http://www.labour.org.uk|http://www.conservatives.com|http://www.libdems.org.uk|http://en.wikipedia.org/", w):
                result.append(w)
    return result
def series_article_urls():
    html = scraperwiki.scrape('http://www.economist.com/blogs/charlemagne')
    root = lxml.html.fromstring(html)
    last_page_element = root.cssselect('li.pager-last a')
    if last_page_element:
        print last_page_element[0].get('href')
        regPat = re.compile('.*(\d{2,2})')
        found = re.match(regPat, last_page_element[0].get('href'))
        if found:
            last_page_number = int(found.group(1))
            #print "Last page number:", last_page_number
            #http://www.economist.com/blogs/charlemagne/2012/12/french-muslims
            data = page_articles(root)
            for i in range(1, last_page_number + 1):
                print "Page number:", i
                html = scraperwiki.scrape('http://www.economist.com/blogs/charlemagne?page=' + str(i))
                page_root = lxml.html.fromstring(html)
                new_articles = page_articles(page_root)
                #print data.items()
                #print new_articles.items()
                data = data + new_articles
                print "-------------------------"
            scraperwiki.sqlite.save(unique_keys=['path'], data=data)
def scrape_meeting(url):
    html = scraperwiki.scrape(url)
    #fjarvistir = soup.find(text=re.compile("Fjarvistarleyfi:").findParent)
    if not "Fjarvistarleyfi" in html:
        return
    else:
        root = lxml.html.fromstring(html)
        absent = {}
        absent['meeting'] = root.xpath('//h1/text()')[0]
        try:
            absent_link = root.xpath('//b[text()="Fjarvistarleyfi"]/..')[0].attrib['href']
        except:
            return
        absent['meeting_url'] = absent_link
        html = scraperwiki.scrape(absent_link)
        root = lxml.html.fromstring(html)
        absent['assembly_number'] = re.split(" ", root.xpath('//title/text()')[0])[1][:3]
        p = root.xpath('//p')
        if p:
            for x in p:
                # print x.text.strip().partition(',')[0]
                #urrg = x.text.strip().partition(',')[0].encode('utf-8')
                #print urrg
                #print string_replace(replacements,urrg)
                absent['representative'] = (string_replace(replacements, x.text.strip().partition(',')[0].encode('utf-8')).strip()).decode('utf-8')
                #absent['representative'] = re.sub(", \w.*","",p.text)
                #print absent
                scraperwiki.sqlite.save(["meeting", "assembly_number", "representative"], absent, verbose=1)
def process_journal_pdfs(parser, listurl, errors, recurse):
    # print "Finding PDFs on " + listurl
    # u = urllib.parse.urlparse(listurl)
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect("div.items a"):
        url = urlparse.urljoin(listurl, ahref.attrib['href'])
        if -1 == url.find("doc_download"):
            continue
        consider_url(parser, url, errors)
        #print url
    for ahref in root.cssselect("div.item-list a"):
        suburl = urlparse.urljoin(listurl, ahref.attrib['href'])
        #print "sub " + suburl
        subhtml = scraperwiki.scrape(suburl)
        subroot = lxml.html.fromstring(subhtml)
        subhtml = None
        for subahref in subroot.cssselect("div.article a"):
            href = subahref.attrib['href']
            #print href
            subsuburl = urlparse.urljoin(suburl, href)
            #print "subsub " + subsuburl
            if -1 == subsuburl.find("doc_download"):
                continue
            consider_url(parser, subsuburl, errors)
        subroot = None
    if recurse:
        seen = { listurl: 1 }
        for ahref in root.cssselect("div.pagination a"):
            pageurl = urlparse.urljoin(listurl, ahref.attrib['href'])
            #print "P: " + pageurl
            if pageurl not in seen:
                process_journal_pdfs(parser, pageurl, errors, False)
                seen[pageurl] = 1
def scrape_member(url_base, url):
    record = {}
    council = re.match(r'(.*)\.stm', url)
    if council:
        s = '/council/' + url
        record['url'] = url_base + s
    else:
        s = url + 'contact.stm'
        record['url'] = url_base + url
    record['source_url'] = url_base + s

    soup = BeautifulSoup(scraperwiki.scrape(url_base + s))
    c = soup.find('div', id='content')
    td = c.findAll('table')[0].findAll('td')
    email = False
    offices = []

    if council:
        record['elected_office'] = 'Councillor'
        photo_url = c.find('img', 'bio_pic')['src']
        # Meta description also has representative and district names, but in one case it is incorrect.
        record['name'] = soup.find('span', {'class': 'bg90B'}).text.replace('Councillor', '').strip()
        record['district_name'] = soup.find('span', {'class': 'bg100B'}).text.replace(' Ward', '').strip()
        qs = urlparse(td[7].find('a')['href']).query
        qs = parse_qs(qs)
        rec = qs.get('Recipient', None)
        if len(rec):
            email = rec[0] + '@winnipeg.ca'
        postal = td[1]
        tel = td[3].text
        fax = td[5].text
    else:
        record['elected_office'] = 'Mayor'
        record['boundary_url'] = '/boundaries/census-subdivisions/4611040/'
        lm = soup.find('div', id='left-menu')
        l = lm.findAll('div', 'section')[1]
        photo_url = l.find('img')['src']
        record['name'] = l.find('a').text.replace('Mayor ', '').strip()
        email = '*****@*****.**'
        postal = td[5]
        tel = ''
        fax = td[1].text
        mayor_page = BeautifulSoup(scraperwiki.scrape(url_base + '/interhom/mayor/'))
        record['name'] = mayor_page.find('img', src="/interhom/Mayor/images/signature.jpg")['alt']

    postal = '\n'.join([x.strip() for x in postal.findAll(text=True)])
    offices = []
    offices.append({
        'type': 'constituency',
        'tel': tel,
        'fax': fax,
        'postal': postal
    })
    record['offices'] = json.dumps(offices)
    record['photo_url'] = url_base + photo_url
    if email:
        record['email'] = email
    return record
def getAllEntries(base_url, city, branch):
    url = base_url + "/" + city[0] + "/" + branch + "/"
    more_results = True
    page = 1
    while more_results:
        if page == 1:
            html = scraperwiki.scrape(url)
        else:
            current_page = root.find_class("current")
            if current_page:
                next_page = current_page[0].getparent().getnext()
                if next_page != None:
                    p = next_page.text_content()
                    html = scraperwiki.scrape(url + str(page) + "/")
                else:
                    more_results = False
            else:
                more_results = False
        page = page + 1
        if more_results:
            root = lxml.html.fromstring(html)
            results = root.find_class("result-wrap")
            for result in results:
                header = result.cssselect("h2")
                name = header[0].cssselect("a")
                div = result.find_class("addr")
                addr = div[0].cssselect("p")
                street_address = addr[0].text_content()
                try:
                    street_address = street_address[:street_address.index("/")]
                except:
                    print ""
                print street_address
                geocode_url = 'http://maps.googleapis.com/maps/api/geocode/json?address=' + urllib.quote_plus(street_address) + '&sensor=false&output=json'
                print geocode_url
                georeq = urllib2.Request(geocode_url)
                geo_response = urllib2.urlopen(georeq)
                geocode = simplejson.loads(geo_response.read())
                print geocode
                if geocode['status'] != 'ZERO_RESULTS':
                    data_lat = geocode['results'][0]['geometry']['location']['lat']
                    data_lng = geocode['results'][0]['geometry']['location']['lng']
                    print data_lat
                    print data_lng
                    data = {
                        'city': city[1],
                        'branch': branch,
                        'business': name[0].text_content(),
                        'address': street_address,
                        'data_lat': data_lat,
                        'data_lng': data_lng
                    }
                    scraperwiki.sqlite.save(unique_keys=['business'], data=data)
def scrapePage(page='0'):
    html = scraperwiki.scrape("http://opendataphilly.org/opendata/?sort=name&filter=data&page=" + str(page))
    root = lxml.html.fromstring(html)
    lxml.html.resolve_base_href(root)  # not doing anything
    for res in root.cssselect("#results_list li.resource"):
        title = res.cssselect("#resource_title")
        desc = res.cssselect("#resource_desc")
        for link in title[0].iterlinks():
            # scrape each item, i.e. http://opendataphilly.org/opendata/resource/11/topographic-contours-2ft/
            itemUrl = "http://opendataphilly.org" + str(link[2])
            itemHtml = scraperwiki.scrape(itemUrl)
            item = lxml.html.fromstring(itemHtml)
            data = {
                'title': title[0].text_content(),
                'desc': desc[0].text_content(),
                'itemUrl': itemUrl
                #'itemHtml' : itemHtml
            }
            tab_data = item.cssselect("#tab_data")[0]
            for counter, info in enumerate(tab_data.cssselect("div")):
                if (counter == 0 or counter % 2):
                    key = str(info.text_content())
                else:
                    data[key] = str(info.text_content())
            print data
            scraperwiki.sqlite.save(unique_keys=['title'], data=data)
def scrape(url):
    # try up to three times, then let any exception from the final attempt propagate
    for n in [1, 2, 3]:
        try:
            return scraperwiki.scrape(url)
        except:
            continue
    return scraperwiki.scrape(url)
def scrape_site(start_url, domaine):
    html_content = scraperwiki.scrape(start_url)
    p_num = 1
    r_num = 0
    while True:
        root = lxml.html.fromstring(html_content)
        data_list = root.cssselect('div[class="post clearfix"]')
        if len(data_list) == 0:
            print 'SPIDER-STOP'
            break
        else:
            for i in xrange(len(data_list)):
                abs_link = start_url
                scrape_info(abs_link, r_num)
                r_num += 1
                break
        for attempt in range(5):
            try:
                html_content = scraperwiki.scrape(s_url + '?Page=' + str(p_num + 1))
                p_num += 1
                break
            except:
                pass
def scrape_fgmarket():
    scraperwiki.sqlite.save_var("source", "www.fgmarket.com ")
    scraperwiki.sqlite.save_var("author", "Alex Maslakov")
    root_html = lxml.html.fromstring(scraperwiki.scrape(base_domain_url))

    # for each outer category
    for category_item_html in root_html.cssselect("div.home-section div.home-content div.box.last-content div.box-columns div.box-column-half ul.box-column-half-inner.home-category-set li a.home-category-main")[0:1]:
        first_page_url = get_first_page_url(category_item_html.get('href'))

        # parse the first page of category html
        first_page_html_raw = lxml.html.fromstring(scraperwiki.scrape(first_page_url))

        # scrape each company on the first page
        premium_companies_html = first_page_html_raw.cssselect("div.category-content div.category-listings div.prem-listing")
        if len(premium_companies_html) > 0:
            for company_item_html in premium_companies_html:
                scrape_premium_company(company_item_html, 1)

        # scrape each company on the other pages
        next_page_html_raw = first_page_html_raw
        counter = 2
        while next_page_exists(next_page_html_raw):
            # next page url
            next_page_url_html_raw = next_page_html_raw.cssselect("div.category-content div.category-listings nav.pagination a")[-1]
            next_page_url = get_next_page_url(category_item_html.get('href'), next_page_url_html_raw.get('href'))
            next_page_html_raw = lxml.html.fromstring(scraperwiki.scrape(next_page_url))

            # scrape each company on this page
            for company_item_html in next_page_html_raw.cssselect("div.category-content div.category-listings div.prem-listing"):
                scrape_premium_company(company_item_html, counter)
            counter += 1
def process_page(page_url):
    print 'Processing page', page_url
    page_html = scraperwiki.scrape(page_url)
    page_rdf = scraperwiki.scrape('http://any23.org/any23',
                                  {'type': 'text/html', 'format': 'turtle', 'body': page_html})
    page_dom = lxml.html.fromstring(page_html)
    info = None
    email = None
    website = None
    try:
        info = lxml.html.tostring(
            page_dom.cssselect('img[alt="info evento"]')[0].getparent().getnext()
        )
        print "PROCESS_INFO", process_info(page_url, info)
    except BaseException as e:
        print 'Info not found.', e
    try:
        email = page_dom.cssselect('img[alt="e-mail"]')[0].getnext().text_content()
    except BaseException as e:
        print 'EMail not found.', e
    try:
        website = page_dom.cssselect('img[alt="Sito Web Esterno"]')[0].getnext().get('href')
    except BaseException as e:
        print 'Website not found.', e
    if email:
        page_rdf += '<{0}> vcard:email "{1}".\n'.format(page_url, email)
    if website:
        page_rdf += '<{0}> vcard:url "{1}".\n'.format(page_url, website)
    page_rdf = page_rdf.replace(DEFAULT_ANY23_PAGE, '<{0}>'.format(page_url))
    data = {
        'url': page_url,
        'turtle': page_rdf
    }
    scraperwiki.sqlite.save(unique_keys=['url'], data=data)
def scrape(url):
    html = scraperwiki.scrape(url)
    root = lxml.html.fromstring(html)
    for tr in root.cssselect("table#listtable tr"):
        tds = tr.cssselect("td")
        if tds[0].text_content() == 'ID':
            continue
        game_url = tds[1].cssselect("a")[0].attrib["href"]
        game_html = scraperwiki.scrape(game_url)
        game_root = lxml.html.fromstring(game_html)
        title = game_root.cssselect("div#gameTitle h1")[0].text_content()
        platform = game_root.cssselect("div#gameInfo h2 a")[0].text_content()
        publisher = ""
        details = game_root.cssselect("div#gameVitals p")[0].text_content()
        genres_index = details.find('Genres')
        release_date_index = details.find('Release Date')
        genres = game_root.cssselect("div#gameVitals p")[0].text_content()[genres_index + 7 : release_date_index].strip()
        if len(game_root.cssselect("div#gameVitals p img")) == 1:
            publisher = game_root.cssselect("div#gameVitals p img")[0].attrib["title"]
        elif len(game_root.cssselect("div#gameVitals p img")) == 2:
            publisher = game_root.cssselect("div#gameVitals p img")[1].attrib["title"]
        else:
            publisher_index = details.find("Publisher")
            if publisher_index != -1:
                publisher = details[publisher_index + 10:].strip()
        data = {
            'Title': title,
            'Platform': platform,
            'Publisher': publisher,
            'Genres': genres
        }
        saveToStore(data)
def import_meta():
    rows = scraperwiki.scrape(base_url + spine_meta).splitlines()
    rows = csv.DictReader(rows)
    for row in rows:
        scraperwiki.sqlite.save(['Variable'], row, table_name='spine_meta')

    rows = scraperwiki.scrape(base_url + absence_meta).splitlines()
    rows = csv.DictReader(rows)
    for row in rows:
        scraperwiki.sqlite.save(['Variable'], row, table_name='absence_meta')

    rows = scraperwiki.scrape(base_url + census_meta).splitlines()
    rows = csv.DictReader(rows)
    for row in rows:
        scraperwiki.sqlite.save(['Variable'], row, table_name='census_meta')

    rows = scraperwiki.scrape(base_url + spend_meta).splitlines()
    rows = csv.DictReader(rows)
    for row in rows:
        scraperwiki.sqlite.save(['Variable'], row, table_name='spend_meta')

    rows = scraperwiki.scrape(base_url + workforce_meta).splitlines()
    rows = csv.DictReader(rows)
    for row in rows:
        scraperwiki.sqlite.save(['Variable'], row, table_name='workforce_meta')
def main():
    phones = []
    querypage_url = "http://www.verkkokauppa.com/fi/s?s=1&q=&submit=Hae&category=22-658-4814&brand=Nokia"
    querypage_html = scraperwiki.scrape(querypage_url)
    querypage_root = lxml.html.fromstring(querypage_html)
    for el in querypage_root.cssselect("div.productRow"):
        stock = el.cssselect("div.stock")[0].text
        if "ennakkotilaa" not in stock:
            product_url = el.cssselect("a.productInfo")[0].attrib["href"]
            phones.append({'url': product_url})
    for phone in phones:
        url_parts = phone["url"].split("/")
        phone_model = url_parts[-1]
        product_code = url_parts[-3] + url_parts[-2]
        phone_html = scraperwiki.scrape(phone["url"])
        phone_root = lxml.html.fromstring(phone_html)
        for el in phone_root.cssselect("div#latestSoldList li"):
            if el.text:
                try:
                    sold_date = datetime.strptime(el.text[3:], "%d.%m.%Y %H:%M")
                except:
                    sold_date = parse_sold_text(el.text)
                if sold_date:
                    sold_pieces = el.cssselect("span")[0].text
                    # because of "8 kpl", "yli 20 kpl" cases
                    sold_pieces = [int(s) for s in sold_pieces.split() if s.isdigit()][0]
                    scraperwiki.sqlite.save(unique_keys=['pcode', 'date'],
                                            data={"pcode": product_code, "model": phone_model,
                                                  "date": sold_date, "pieces": sold_pieces})
def scrape_saint_data(url):
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    if html.startswith('<script language="JavaScript">'):
        # the page is a javascript redirect; pull the real URL out of the href
        found = re.search('href=\".*\"', soup.contents[0].contents[0].string).span()
        url = soup.contents[0].contents[0].string[found[0] + 6:found[1] - 1]
        print url
        html = scraperwiki.scrape(url)
        soup = BeautifulSoup(html)
    title = soup.find("h1").contents[0].string
    if re.search('Saints.SQPN.com', title):
        title = soup.find("h2").contents[0].contents[0]
    print title
    # Now the data
    names = soup.findAll("ins")
    for name in names:
        if name.find("a"):
            name = name.find("a")
        print name.contents[0]
        next = name
        while next.nextSibling == None:
            next = next.parent
        print next
        while type(next.nextSibling).__name__ == 'NavigableString':
            next = next.nextSibling
            print next.contents[0]
def main():
    decoder = json.JSONDecoder()
    # 1000000 is a stupidly large number?
    scrapers = decoder.decode(scraperwiki.scrape("https://api.scraperwiki.com/api/1.0/scraper/search?format=jsondict&maxrows=1000000"))
    # would be nicer if there was an "offset" to avoid generating superhuge JSON objects
    # even nicer if we could do it incrementally
    r = Relation()
    for d in scrapers:
        short_name = d["short_name"]
        if short_name[-8:] == ".emailer":
            continue
        print short_name
        details = decoder.decode(scraperwiki.scrape("https://api.scraperwiki.com/api/1.0/scraper/getinfo?format=jsondict&name=%s&version=-1&quietfields=runevents%%7Cdatasummary%%7Cuserroles%%7Chistory" % short_name))[0]
        code = details["code"]

        # search the code for mentions of views
        v = re.compile("views\\.scraperwiki\\.com/run/([A-Za-z0-9_]*)")
        for other in re.findall(v, code):
            scraperwiki.sqlite.save(unique_keys=["from", "to", "type"],
                                    data={"from": short_name, "to": other, "type": "mention"})

        # prepare table of hashes to find common code
        for h in rolling_hash(stripped_python(code)):
            r.relate(h, short_name)

        # find attachments
        for other in details["attachables"]:
            scraperwiki.sqlite.save(unique_keys=["from", "to", "type"],
                                    data={"from": short_name, "to": other, "type": "attachment"})

    # now find the common code
    for ((x, y), n) in r.pairs_by_keys_in_common().iteritems():
        scraperwiki.sqlite.save(unique_keys=["from", "to", "type"],
                                data={"from": x, "to": y, "type": "common code", "strength": n})
def do_scrape():
    az_html = scraperwiki.scrape('http://www.lambeth.gov.uk/Services/')
    list_root = lxml.html.fromstring(az_html)
    for a in list_root.cssselect("div.AZ li a"):
        try:
            page_title = a.text
            page_link = 'http://www.lambeth.gov.uk' + a.get('href')
            print "scraping " + page_link
            page_full_html = scraperwiki.scrape(page_link)
            page_root = lxml.html.fromstring(page_full_html)

            # pull out the section details
            print page_root.cssselect('div.breadCrumb a')[2].text
            sections_csv = page_root.cssselect('div.breadCrumb a')[2].text

            # check it is a content page, not a nav page
            if page_full_html.find('cScape.Lambeth.GenericTemplates/ServiceCategory.aspx') < 0 \
                    and page_full_html.find('cScape.Lambeth.GenericTemplates/DocumentSummary.aspx') < 0 \
                    and page_full_html.find('cScape.Lambeth.GenericTemplates/GroupDocument.aspx') < 0:
                content_fragment = page_root.cssselect('div.page')[0]
                for toplink in content_fragment.cssselect('div.topLink'):
                    content_fragment.remove(toplink)
                content_html = lxml.html.tostring(content_fragment)
                content_html = clean_html(content_html)
                scraperwiki.sqlite.save(unique_keys=["source_url"],
                                        data={"source_url": page_link, "title": page_title,
                                              "content": content_html, 'sections_csv': sections_csv})
            else:
                print "ignoring nav page"
        except:
            print "something went wrong"
            pass
def process_journal_pdfs(parser, listurl, errors):
    # print "Finding PDFs on " + listurl
    # u = urllib.parse.urlparse(listurl)
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect("div#placeholder-content-main-left-column a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href)
        if -1 != href.find("file://") or -1 == url.find('/postjournal/article'):
            # print "Skipping non-http URL " + url
            continue
        subhtml = scraperwiki.scrape(url)
        subroot = lxml.html.fromstring(subhtml)
        subhtml = None
        for subahref in subroot.cssselect("div.article-content a"):
            subhref = subahref.attrib['href']
            suburl = urlparse.urljoin(listurl, subhref)
            if -1 == suburl.find(".pdf"):
                continue
            if parser.is_already_scraped(suburl):
                pass
                # print "Skipping already scraped " + suburl
            else:
                # print "Will process " + suburl
                process_pdf(parser, suburl, errors)
def scrape_site(start_url, domaine):
    html_content = scraperwiki.scrape(start_url)
    p_num = 1
    r_num = 0
    while True:
        root = lxml.html.fromstring(html_content)
        data_list = root.cssselect('td[class="lightbg exhlist_company"]')
        if len(data_list) == 0:
            print 'SPIDER-STOP'
            break
        for i in range(len(data_list)):
            temp_link = data_list[i].attrib.get('onclick')
            rel_link = temp_link.split('(')[1].split(',')[0].replace("'", "")
            abs_link = domaine + rel_link
            scrape_info(abs_link, r_num)
            r_num += 1
        for attempt in range(5):
            try:
                html_content = scraperwiki.scrape(s_url + '?Page=' + str(p_num + 1))
                p_num += 1
                break
            except:
                pass
###################################################################################
# Twitter API scraper - designed to be forked and used for more interesting things
###################################################################################

import scraperwiki
import simplejson
import urllib2

QUERY = 'crref OR chref OR ccref OR chref12'
GEOINFO = '53.5,-8,257km'
RESULTS_PER_PAGE = '100'
LANGUAGE = 'en'
NUM_PAGES = 15

for page in range(1, NUM_PAGES + 1):
    base_url = 'http://search.twitter.com/search.json?q=%s&geocode=%s&rpp=%s&lang=%s&page=%s' \
        % (urllib2.quote(QUERY), urllib2.quote(GEOINFO), RESULTS_PER_PAGE, LANGUAGE, page)
    try:
        results_json = simplejson.loads(scraperwiki.scrape(base_url))
        for result in results_json['results']:
            data = {}
            data['id'] = result['id']
            data['text'] = result['text']
            data['from_user'] = result['from_user']
            print data['from_user'], data['text']
            scraperwiki.sqlite.save(["id"], data)
    except:
        print 'Oh dear, failed to scrape %s' % base_url
import scraperwiki
import lxml.etree
import re
from geopy import geocoders

url = "http://www.regionofwaterloo.ca/en/gettingAround/resources/RegionalTrafficVolumes-AllLocations.pdf"
pdfdata = scraperwiki.scrape(url)
print "The pdf file has %d bytes" % len(pdfdata)

xmldata = scraperwiki.pdftoxml(pdfdata)
print "After converting to xml it has %d bytes" % len(xmldata)
print "The first 2000 characters are: ", xmldata[:2000]

root = lxml.etree.fromstring(xmldata)
pages = list(root)

localMuni = ''
lastStreet = ''
count = 0
munis = {'CAM': 'Cambridge', 'KIT': 'Kitchener', 'NDF': 'North Dumfries', 'WAT': 'Waterloo',
         'WIL': 'Wilmot', 'WEL': 'Wellesley', 'WOO': 'Woolwich'}
g = geocoders.Google(domain='maps.google.ca')

for page in pages:
    for el in list(page):
def process_catalog_page(page_index):
    catalog_page_html = scraperwiki.scrape(CATALOG_PAGINATION + str(page_index))
    catalog_page_dom = lxml.html.fromstring(catalog_page_html)
    for dataset_link in catalog_page_dom.cssselect('.cerca_td_nome a'):
        process_dataset_page(dataset_link.get('href'))
import scraperwiki

html = scraperwiki.scrape('http://inmo.ie/6022')

import lxml.html
root = lxml.html.fromstring(html)  # turn our HTML into an lxml object
tds = root.cssselect('td')  # get all the <td> tags
for td in tds:
    print td.text_content()  # just the text inside the HTML tag

for td in tds:
    record = {"td": td.text_content()}  # column name and value
    scraperwiki.sqlite.save(["td"], record)  # save the records one by one
import scraperwiki
from bs4 import BeautifulSoup  # documentation at http://www.crummy.com/software/BeautifulSoup/bs4/doc/

scraperwiki.sqlite.attach("dfid-contracts")  # Attaching scraper https://scraperwiki.com/scrapers/dfid-contracts/
links = scraperwiki.sqlite.select("URL from `dfid-contracts`.swdata")  # Selecting the URLs collected from the search results from contract finder

# Getting the html from the links
for link in links:
    url = link["URL"]
    print url
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    #print soup
    title = soup.find("h2", "legend-edit").get_text().replace("English", "").strip()
    # Paragraph tags contain information like reference, duration, nature of contract, etc.
    ps = soup.find_all("p", "clearfix")
    for p in ps:
        span_text = ""
        span = p.find("span")
        if span:
            span_text = span.get_text().strip()
            span.clear()
        if span_text != "":
###############################################################################
# Air pollution monitoring site scraper
###############################################################################

import scraperwiki
import lxml.html
import time
import datetime

today_date = str(datetime.datetime.now())

# retrieve the page
starting_url = 'http://www.londonair.org.uk/london/asp/publicstats.asp?region=0&site=MY7&bulletin=hourly&la_id=&statyear=2011&postcode=&MapType=Google&zoom=9&lat=51.431751825946115&lon=-0.17578125&Species=All'
html = scraperwiki.scrape(starting_url)

# get all the td tags in the stats table
root = lxml.html.fromstring(html)
tds = root.cssselect('table#sitestatssub td span')

# save them
i = 1
for td in tds:
    record = {"date_scraped": today_date, "td": td.text, "no": i}
    print record
    scraperwiki.sqlite.save(["td"], record)
    i = i + 1
import scraperwiki
import lxml.html
import datetime

html = scraperwiki.scrape("http://mg.co.za/zapiro/")
root = lxml.html.fromstring(html)

title = root.cssselect("li.last_crumb a")[0].text_content()
link = root.cssselect("div#cartoon a")[0].attrib['href']
cartoon = root.cssselect("div#cartoon_full_size img")[0].attrib['src']
now = datetime.datetime.now()

data = {
    'link': link,
    'title': "Zapiro:" + title,
    'description': '<img src="' + cartoon + '" border="0" /><br /><br />Like Zapiro? Then check out my <a href="http://feeds.feedburner.com/MadamEve">Unofficial Madam & Eve Feed</a>. All Madam & Eve cartoons are taken directly from the <a href="http://www.madamandeve.co.za">official website</a> as it is updated.',
    'pubDate': str(now),
}
scraperwiki.sqlite.save(unique_keys=['link'], data=data)
import scraperwiki

html = scraperwiki.scrape("http://pretraga2.apr.gov.rs/EnterprisePublicSearch/details/EnterpriseBusinessName/1335330?code=620B52581BD2D831C00E2A0D0DC04739C1C3B6FB")
print html
import scraperwiki
import lxml.html

html = scraperwiki.scrape("https://scraperwiki.com/")
root = lxml.html.fromstring(html)
for el in root.cssselect("div.tags li a"):
    print el.text, el.attrib['href']
    html_for_tag_page = scraperwiki.scrape("https://scraperwiki.com" + el.attrib['href'])
    root_for_tag_page = lxml.html.fromstring(html_for_tag_page)
    matching_python_scrapers = 0
    # count the Python scrapers listed on the tag page (the original iterated over the
    # front-page root here, leaving root_for_tag_page unused)
    for tag_el in root_for_tag_page.cssselect("table.code_about tr.python"):
        matching_python_scrapers = matching_python_scrapers + 1
    scraperwiki.sqlite.save(unique_keys=['url'], data={
        'title': el.text,
        'url': el.attrib['href'],
        'num_python_scrapers': matching_python_scrapers
    })