def urltohtml(url="http://www.madingley.org/uploaded/Hansard_08.07.2010.pdf"):
    import scraperwiki, urllib2, lxml.etree
    lazycache = scraperwiki.swimport('lazycache')
    pdfdata = lazycache.lazycache(url)
    xmldata = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(xmldata)
    pages = list(root)

    # this function has to work recursively because we might have "<b>Part1 <i>part 2</i></b>"
    def gettext_with_bi_tags(el):
        res = []
        if el.text:
            res.append(el.text)
        for lel in el:
            res.append("<%s>" % lel.tag)
            res.append(gettext_with_bi_tags(lel))
            res.append("</%s>" % lel.tag)
        if el.tail:
            res.append(el.tail)
        return "".join(res)

    # collect the first hundred text elements from each page
    text = []
    for page in pages:
        for el in list(page)[:100]:
            if el.tag == "text":
                text.append(gettext_with_bi_tags(el))
    return '\n'.join(text)
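# A hedged, minimal check of the nested-tag case described in the comment
# above (not part of the original scraper; it assumes gettext_with_bi_tags
# has been hoisted out of urltohtml to module level so it can be called
# directly):
import lxml.etree
sample = lxml.etree.fromstring("<text><b>Part1 <i>part 2</i></b></text>")
print gettext_with_bi_tags(sample)   # expected: <b>Part1 <i>part 2</i></b>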
import scraperwiki

def load():
    index = [u'shipment_year', u'taxon_family']
    data = scraperwiki.swimport('csv2sw').read.csv('http://hacks.thomaslevine.com/top10.csv')
    for row in data:
        del(row[''])
    scraperwiki.sqlite.save(index, data, 't')
from scraperwiki import swimport

def main():
    # Load
    xml = swimport('dsp').dsp('http://www.vbsmutualbank.co.za/ContactUs/ContactUs.htm')

    # Parse (parse_branch_text is defined elsewhere in the original scraper)
    branches = xml.xpath('//table[@width="556"]/tr[position()>1 and position()<last()]')
    branches_text = [branch.xpath('td[position()=last()]')[0].text_content() for branch in branches]
    d = [parse_branch_text(t) for t in branches_text]
from scraperwiki import swimport
from scraperwiki.sqlite import save
from time import time

def main():
    # Load
    xml = swimport('dsp').dsp('http://www.khula.org.za/Admin/Contacts/RegionalContacts.aspx', False)

    # Parse (parse_table and moreparsing are defined elsewhere in the original scraper)
    t_nodes = xml.xpath('//table[@width="100%"]')
    assert len(t_nodes) == 1
    table = t_nodes[0]
    d = parse_table(table)
    t = time()
    for row in d:
        row["date_scraped"] = t
    d = moreparsing(d)
    save([], d, 'final')
def geocode(): if "scraped" not in show_tables(): d = swimport('csv2sw').read.csv('https://views.scraperwiki.com/run/combine_mix_scraper_spreadsheets/') save([], d, 'scraped') if "address" not in show_tables(): initialize() while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0: address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0] #print address if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0: d = all_services(address['address-input']) for row in d: row['address-input'] = address['address-input'] save([], d, 'geocode') params = (address['address-column'], address['address-input']) execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params ) commit()
def geocode(): if "scraped" not in show_tables(): d = swimport('csv2sw').read.csv('http://hacks.thomaslevine.com/all.csv') save([], d, 'scraped') execute('DELETE FROM `scraped` WHERE `Country` != "South Africa"') commit() if "address" not in show_tables(): initialize() while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0: address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0] #print address if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0: d = all_services(address['address-input']) for row in d: row['address-input'] = address['address-input'] save([], d, 'geocode') params = (address['address-column'], address['address-input']) execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params ) commit()
from scraperwiki import swimport
from scraperwiki.sqlite import save
keyify = swimport('keyify').keyify
dsp = swimport('dsp').dsp
from time import time
from copy import copy
DATE = time()
strip_address = swimport('strip_address').strip_address
URLBASE = "http://www.marang.co.za/"

class ParseError(Exception):
    pass

#Main controls
def main():
    links = get_links()
    for path in links:
        #Select only the interesting ones
        #And something's wrong with the history page.
        if "-" in path and path != 'marang-history.asp':
            url = '%s%s' % (URLBASE, path)
            print "Downloading %s" % url
            xml = dsp(url, False)
            if path == "branch-regional.asp":
                parse_regional_offices(xml, url)
            elif path == "marang-map.asp":
                parse_map(xml)
            elif path[0:6] == "branch":
                try:
                    parse_branch(xml, url, path)
# -*- coding: UTF-8 -*- import scraperwiki import json from BeautifulSoup import BeautifulSoup import datetime import dateutil.parser import lxml.html import resource import sys import urlparse import re # Make sure Scraperwiki believe this is the source from this database scraperwiki.scrape("http://www.hive.no/postjournal/") lazycache = scraperwiki.swimport("lazycache") postlistelib = scraperwiki.swimport("postliste-python-lib") agency = "Høgskolen i Vestfold" def report_errors(errors): if 0 < len(errors): print "Errors:" for e in errors: print e exit(1) def out_of_cpu(arg, spent, hard, soft): report_errors(arg)
# Blank Python
import scraperwiki
utils = scraperwiki.swimport('hildenae_utils')
import lxml.html
from lxml import etree

#scraperwiki.utils.httpresponseheader("Content-Type", 'text/plain; charset="utf-8"')
#scraperwiki.sqlite.attach("sopp-middag-view", "src")

def extractTable(root):
    for el in root.cssselect("div.content table.contentpaneopen table"):
        tableSource = lxml.html.tostring(el)
        if "Ukedag" in tableSource:
            return el

def cleanup(table):
    etree.strip_tags(table, 'span', 'strong', 'div', 'tbody')
    for tag in table.iter():
        for att in tag.attrib.keys():
            tag.attrib.pop(att)
        if tag.tag == "table":
            tag.set('border', '1')
    return table

def tds(td):
    tdstr = lxml.etree.tostring(td)
    cleaned = tdstr.replace(' ', '').replace(' ', ' ').replace('/n', '').replace('</td>', '').replace('<td>', '').replace('<p>', '').replace('<br />', '<br>')
    cleaned2 = utils.removeDoubleSpaces(cleaned).replace('</p>', '<br>').replace('<br><br>', '<br>').replace('> ', '>').replace(' <', '<')
    if cleaned2.endswith("<br>"):
import scraperwiki
import sys
import urllib
import lxml.etree
import re
import datetime

wikipedia_utils = scraperwiki.swimport("wikipedia_utils")

nowtime = datetime.datetime.now()

def UpdateCategory():
    ldata = wikipedia_utils.GetWikipediaCategoryRecurse("Caves_of_the_United_Kingdom")
    for data in ldata:
        data["updatedate"] = nowtime
    scraperwiki.sqlite.save(["title"], ldata, "cavepages")
    scraperwiki.sqlite.execute("delete from cavepages where updatedate<>?", nowtime.isoformat())

def ExtractInfo():
    rdata = scraperwiki.sqlite.select("title, link from cavepages")
    ldata = []
    for pdata in rdata:
        try:
            val = wikipedia_utils.GetWikipediaPage(pdata["title"])
        except IOError:
            print "Skipping", pdata["title"]
            continue
        res = wikipedia_utils.ParseTemplates(val["text"])
        #print dict(res["templates"]).keys()
        data = dict(res["templates"]).get("Infobox ukcave")
import scraperwiki

search = scraperwiki.swimport('twitter_search_extended').search
search(['viaplay'], num_pages=25)
import scraperwiki search = scraperwiki.swimport("twitter_search_extended").search search(["olympics"], num_pages=5) # Blank Python import scraperwiki search = scraperwiki.swimport("twitter_search_extended").search search(["olympics"], num_pages=5) # Blank Python
import cgi
import os
import re
import json
import lxml.html
import scraperwiki
import stdnum.isbn

qdict = dict(cgi.parse_qsl(os.getenv("QUERY_STRING", "")))
url = qdict.get("url", 'http://www.wellcome.ac.uk/stellent/groups/corporatesite/@msh_peda/documents/web_document/wtd003419.pdf')

try:
    sdata = scraperwiki.sqlite.select("url, pages, isbns, final_isbn from swdata where url=? limit 1", (url,))
except scraperwiki.sqlite.SqliteError:
    sdata = []

if sdata:
    data = {
        "url": sdata[0]["url"],
        "pages": sdata[0]["pages"],
        "isbns": json.loads(sdata[0]["isbns"]),
        "final_isbn": sdata[0]["final_isbn"],
    }
else:
    lazycache = scraperwiki.swimport('lazycache')
    pdfdata = lazycache.lazycache(url)
    xmldata = scraperwiki.pdftoxml(pdfdata)
    root = lxml.html.fromstring(xmldata)
    pages = list(root)
    isbns = []
    fisbn = ""
    for i, page in enumerate(pages):
        pagetext = " ".join(t.text_content() for t in page)
        for isbn_org, isbn in re.findall("(ISBN[\s:]*([\d\s\-]+))(?i)", pagetext):
            isbn = re.sub("[^\d]", "", isbn)
            if not stdnum.isbn.is_valid(isbn):
                isbn = ""
            else:
                fisbn = isbn
            isbns.append({"isbn_org": isbn_org, "isbn": isbn})
import scraperwiki
import simplejson
import urllib2
from scraperwiki import swimport

def get_followers(twitter_handle):
    base_url = 'https://api.twitter.com/1/followers/ids.json?cursor=-1&screen_name=' + twitter_handle
    results_json = simplejson.loads(scraperwiki.scrape(base_url))
    return results_json['ids']

followers = get_followers("gormno")
myfollowers_str = map(str, followers)
swimport('twitter_bulk_users_lookup_3').bulklookup("gormno", myfollowers_str)

followers = get_followers("gormer")
myfollowers_str = map(str, followers)
swimport('twitter_bulk_users_lookup_3').bulklookup("gormer", myfollowers_str)
from scraperwiki import swimport
swimport('twitter_search_extended').search(['#actuary'], num_pages=5)
from scraperwiki import swimport
from scraperwiki.sqlite import save, execute, commit, show_tables
from lxml.html import fromstring, tostring
from requests import get
options = swimport('options').options
keyify = swimport('keyify').keyify
randomsleep = swimport('randomsleep').randomsleep
from time import time
import re
strip_address = swimport('strip_address').strip_address

DATE = time()
DOMAIN = 'http://www.thuthukani.co.za'

def getprovincepages():
    html = fromstring(get('http://www.thuthukani.co.za/branches-mpumalanga.php').content)
    selects = html.cssselect('select')
    assert 1 == len(selects)
    d = options(selects[0], textname='province', valuename='url')
    for row in d:
        row['url'] = DOMAIN + '/' + row['url']
        row['date_scraped'] = DATE
    return d

def parseprovincepage(province):
    raw = get(province['url']).content
    withspaces = raw.replace('<br />', '\n').replace('<font', '\n\n<font')
    html = fromstring(withspaces)
    fonts = html.xpath('//td[@valign="top"]/font[img[@src="images/branches.jpg"]]')
    assert 1 == len(fonts)
import scraperwiki
import urllib2
import urlparse
import re
import os
import time

filename = ""

# to do.
# Detect dates
# Do equates
# Detect use of DistoX to distinguish computer generated numbers

print "2"
irebyurl = "http://cave-registry.org.uk/svn/Yorkshire/mmmmc/survexdata/all.svx"

treextract = scraperwiki.swimport("apache_directory_tree_extractor")
print treextract.ParseSVNRevPageTree("http://cave-registry.org.uk/svn/Yorkshire/mmmmc/survexdata/")
print "3"

allteam = set()

def Main():
    survexblock = SurvexBlock(name="root", survexpath="caves", parent=None, begin_char=0, cave="Ireby", survexfile=irebyurl, totalleglength=0.0)
    fin = GetFile(irebyurl)
    textlines = []
    #print "before", fin
    RecursiveLoad(survexblock, irebyurl, fin, textlines)
    return survexblock

def GetFile(url):
    try:
from scraperwiki import swimport

TABLE_NAME = 'nytm_sos'

swimport('meetup').scrape('http://www.meetup.com/ny-tech/events/47879702/', TABLE_NAME)
swimport('swversion').swversion(TABLE_NAME)

print """
How to get the current table:

SELECT * from (SELECT value_blob FROM `swvariables` WHERE name="nytm_sos_current");
"""
import scraperwiki
from scraperwiki import swimport

swimport('twitter_scrape').statuses('regjeringen', 'departementene', 1)
from urllib2 import urlopen
from lxml.html import fromstring, tostring
import scraperwiki
import datetime
keyify = scraperwiki.swimport('keyify').keyify

# Fetch the page
page = urlopen('http://dashboard.ed.gov/statecomparison.aspx?i=j&id=0&wt=40')
rawtext = page.read()
html = fromstring(rawtext)

# Select every table on the page and take the first one
tables = html.cssselect('table')
table = tables[0]

# Print the entire table
print tostring(table)

# Get the rows inside the first table and read its header cells
rows = table.cssselect('tr')
headers = [th.text_content() for th in rows[0].cssselect('th')]

# Headers we expect; the assert catches layout changes on the page
expected_headers = [
    'State', 'Total', u'\xa0', 'White', 'Black', 'Hispanic', 'Asian',
    'Native Hawaiian/Pacific Islander', 'American Indian/Alaska Native',
    'Two or more races'
]
assert headers == expected_headers, headers
from time import time
from scraperwiki.sqlite import save, select
from scraperwiki import swimport
from lxml.etree import fromstring
from urllib2 import urlopen
import re
strip_address = swimport('strip_address').strip_address

DATE = time()
URL = "http://www.saccol.org.za/saccos_in_saccol.php"

def main():
    d = download()
    save([], d, 'initial')
    d = clean()
    save([], d, 'final')

def download():
    #Load page
    raw = urlopen(URL).read()
    cleaned = clean_page(raw)

    #Load table
    table = fromstring(cleaned)
    d = parse_table(table)
    return d

def clean():
    d = select('* from `initial`')
    for row in d:
        row['date_scraped'] = DATE
from scraperwiki import swimport
from scraperwiki.sqlite import save

# URL, DATE and clean() come from elsewhere in the original scraper.
def main():
    x = swimport('dsp').dsp(URL, False)
    satellite = x.xpath('//div[@class="grid_3 alpha omega block"]/p/text()')
    branch = x.xpath('//div[@class="grid_6 alpha omega block"]/p/text()')
    save(["satellite"],
         [{"date_scraped": DATE, "satellite": clean(s), "branch": clean(b)} for (s, b) in zip(satellite, branch)],
         "final")
import scraperwiki
import json
import requests
import lxml.html
import itertools
import datetime

swutils = scraperwiki.swimport('swutils')
user_agent = swutils.get_user_agent()
requests.defaults.defaults['max_retries'] = 10

def main():
    print 'scraping November 2012 json feed'
    json_url = "http://ec2-46-51-135-144.eu-west-1.compute.amazonaws.com/gaza/Nov2012/json"
    json_response = requests.get(json_url).text
    json_dict = json.loads(json_response)

    batch = []
    alreadygot = []
    try:
        rows = scraperwiki.sqlite.select('id from features')
        for row in rows:
            alreadygot.append(row['id'])
    except:
        pass

    if len(json_dict['features']) - len(alreadygot) > 0:
        print 'scraping details for %s new features' % (len(json_dict['features']) - len(alreadygot))
# See also
# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf

import scraperwiki
import json
from BeautifulSoup import BeautifulSoup
import datetime
import dateutil.parser
import lxml.html
import urlparse
import re

scraperwiki.scrape("http://www.bydel-ullern.oslo.kommune.no/postjournal/")

#lazycache=scraperwiki.swimport('lazycache')
postlistelib = scraperwiki.swimport('postliste-python-lib')

agency = 'Oslo kommune, Ullern bydel'

def report_errors(errors):
    if 0 < len(errors):
        print "Errors:"
        for e in errors:
            print e
        raise ValueError("Something went wrong")

def out_of_cpu(arg, spent, hard, soft):
    report_errors(arg)

def process_pdf(parser, pdfurl, errors):
    postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
from scraperwiki.sqlite import save, select, execute, save_var, get_var, commit
from scraperwiki import swimport
from requests import session
from lxml.html import fromstring, tostring
import re
from time import time, sleep
keyify = swimport('keyify').keyify

URL = "http://www.ncr.org.za/register_of_registrants/index.php"

#DEV=True
DEV = False

DATE = get_var('DATE', time())

RE = {
    'leftpadding': re.compile(r'^ *'),
    'rightpadding': re.compile(r' *$'),
}

def cp1():
    execute('''
CREATE TABLE IF NOT EXISTS `businessPremises` (
  `date_scraped` REAL,
  `businessPremisesURL` TEXT,
  FOREIGN KEY (date_scraped, businessPremisesUrl)
    REFERENCES cp1(date_scraped, businessPremisesUrl)
)
''')
    if get_var('crashed') == 1:
from scraperwiki import swimport
from scraperwiki.sqlite import save
keyify = swimport('keyify').keyify
dsp = swimport('dsp').dsp
from time import time
from copy import copy
DATE = time()
strip_address = swimport('strip_address').strip_address
URLBASE = "http://www.marang.co.za/"

class ParseError(Exception):
    pass

#Main controls
def main():
    links = get_links()
    for path in links:
        #Select only the interesting ones
        #And something's wrong with the history page.
        if "-" in path and path != 'marang-history.asp':
            url = '%s%s' % (URLBASE, path)
            print "Downloading %s" % url
            xml = dsp(url, False)
            if path == "branch-regional.asp":
                parse_regional_offices(xml, url)
            elif path == "marang-map.asp":
                parse_map(xml)
# ScraperWiki scraper that fetches road lengths from the road lists of the
# Norwegian Public Roads Administration (Statens vegvesen)
# By Gnonthgol
import scraperwiki
import urllib2
import lxml.etree
import re
utils = scraperwiki.swimport('hildenae_utils')  # for caching + pretty-print

res = {}

# Taken from http://www.vegvesen.no/Kjoretoy/Yrkestransport/Veglister+og+dispensasjoner/Veglister+2012
url = "http://www.vegvesen.no/_attachment/314828/binary/553942"

xmldata = utils.findInCache(url)
if xmldata is None:
    pdfdata = urllib2.urlopen(url).read()
    xmldata = scraperwiki.pdftoxml(pdfdata)
    utils.putInCache(url, xmldata)

root = lxml.etree.fromstring(xmldata)
pages = list(root)

for page in pages:
    ref = None
    for el in list(page):
        if el.tag == "text" and el.text:
            if el.text.find('FV ') == 0 or el.text.find('KV ') == 0:
                ref = el.text.strip(" *")
            if ref and re.match("(\d*,\d{3})", el.text):
                if not res.has_key(ref):
# -*- coding: UTF-8 -*- import scraperwiki import json from BeautifulSoup import BeautifulSoup import datetime import dateutil.parser import lxml.html import resource import sys import urlparse import re # Make sure Scraperwiki believe this is the source from this database scraperwiki.scrape("http://hist.no/content/34570/Offentleg-Journal") lazycache = scraperwiki.swimport('lazycache') postlistelib = scraperwiki.swimport('postliste-python-lib') agency = 'Høgskolen i Vestfold' def report_errors(errors): if 0 < len(errors): print "Errors:" for e in errors: print e exit(1) def out_of_cpu(arg, spent, hard, soft): report_errors(arg)
import scraperwiki
import simplejson
import urllib2
from scraperwiki import swimport

myfollowers = []
twitter_handle = 'GotToDance_Sky1'
base_url = 'https://api.twitter.com/1/followers/ids.json?cursor=-1&screen_name=' + twitter_handle
results_json = simplejson.loads(scraperwiki.scrape(base_url))
myfollowers = results_json['ids']
myfollowers_str = map(str, myfollowers)

swimport('twitter_bulk_users_lookup').bulklookup(myfollowers_str)

'''
See https://scraperwiki.com/scrapers/twitter_bulk_users_lookup/ for the code for the script

Still to do
- add parameter for ID and username (usertype)
'''
from scraperwiki import swimport
swimport('twitter_search').search(['@thomaslevine', 'from:thomaslevine'])
from scraperwiki import swimport
csv2dict = swimport('csv2sw').csv2dict

# Write a csv
f = open('foobar.csv', 'w')
f.write('foo,bar,baz\n4,63,3')
f.close()

# Convert it to a dictionary
g = open('foobar.csv', 'r')
d = csv2dict(g)
print d
import scraperwiki

search = scraperwiki.swimport('twitter_search_extended').search
search(['olympics'], num_pages=5)
"Because this website's data span many pages, this scraper needs to be run about nine times to make a full scrape." from lxml.html import fromstring #from lxml.etree import fromstring from time import time import requests from scraperwiki.sqlite import save,save_var,get_var,show_tables,select,commit,execute from scraperwiki import swimport options=swimport('options').options keyify=swimport('keyify').keyify from json import loads,dumps strip_address = swimport('strip_address').strip_address URLS={ "main":"http://www.nedbank.co.za/website/content/map/branches.asp" , "suburbs-base":"http://www.nedbank.co.za/website/content/map/getSuburbs.asp?q=" , "cities-base":"http://www.nedbank.co.za/website/content/map/getData.asp?q=" } def main(): if get_var('province')=='step2': separate_addresses() execute('DELETE FROM swvariables WHERE name = "province"') commit() print(""" ================================ This run is finished! ================================ """) else: download()
from scraperwiki import swimport
bucketwheel = swimport('bucketwheel')
from scraperwiki.sqlite import execute, commit

execute('drop table if exists stack')
commit()
bucketwheel.seed([bucketwheel.GetLinks('http://thomaslevine.com')])
from lxml.html import fromstring
from urllib2 import urlopen
from time import time
import re
from scraperwiki.sqlite import save, select
from scraperwiki import swimport
randomsleep = swimport('randomsleep').randomsleep

DATE = time()
DOMAIN = "http://www.bk.rw"
STARTURL = "http://www.bk.rw/index.php?option=com_content&view=article&id=54&Itemid=57"

def download():
    url = STARTURL
    d = []
    while url != None:
        a = Article(url)
        d.extend(a.parse())
        url = a.next()
        randomsleep()
    save(['date-scraped', 'branch-number'], d, 'raw')

class MultipleNextButtons(Exception):
    pass

class Article:
    def __init__(self, url):
        self.url = url
        self.x = fromstring(urlopen(url).read())
from requests import get
from string import ascii_lowercase
from lxml.html import fromstring
from scraperwiki import swimport
from scraperwiki.sqlite import save, select, get_var, save_var, execute
keyify = swimport('keyify').keyify
from time import time
import re

DATE = time()

def main():
    if None == get_var('downloaded'):
        download()
        save_var('downloaded', 1)
    execute('DROP TABLE IF EXISTS `final`')
    clean()
    save_var('downloaded', None)

def clean():
    d = select('* FROM `swdata` WHERE `date_scraped`=(SELECT max(`date_scraped`) FROM `swdata`);')
    POSTCODE = re.compile(r'[0-9]*$')
    for row in d:
        row['postcode'] = re.findall(POSTCODE, row['Postal_Address'])[0]
    save([], d, 'final')

def download():
    d = []
    for letter in ascii_lowercase:
        x = search_letter(letter)
        branch_tables = x.cssselect('table.locatorTable table')
import scraperwiki
import urllib

pdf2xml = scraperwiki.swimport("pdf2xml")

pdfurl = "http://samplepdf.com/sample.pdf"  # example PDF
pdf = urllib.urlopen(pdfurl).read()

# The C executable:
print scraperwiki.pdftoxml(pdf)

# The PDFMiner-based version from https://github.com/zejn/pypdf2xml/blob/master/pdf2xml
print pdf2xml.pdf2xml(pdf)

# Notes:
# This may be better delivered as a ScraperWiki view where you pass in the URL as an argument,
# and the view downloads and parses it, so it can be used from any language.
# The disadvantage is that you don't get to hold on to, and cache, the PDF source itself.
# The PDFMiner-based version is better at "handling non-ascii or otherwise custom glyphs, which
# supplied glyph to character maps." -- [example?]
# Also it is my (Julian_Todd's) intention to get a version that preserves the borders of table
# cells, which nothing does yet.
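# Hedged sketch of the "deliver it as a view" idea from the notes above
# (not part of the original script). It follows the QUERY_STRING pattern and
# scraperwiki.utils.httpresponseheader call used elsewhere in this collection;
# the default URL is just the sample PDF from above.
import cgi, os, urllib
import scraperwiki

qdict = dict(cgi.parse_qsl(os.getenv("QUERY_STRING", "")))
viewurl = qdict.get("url", "http://samplepdf.com/sample.pdf")
scraperwiki.utils.httpresponseheader("Content-Type", "text/xml")
print scraperwiki.pdftoxml(urllib.urlopen(viewurl).read())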
from scraperwiki import swimport
from scraperwiki.sqlite import save, select, execute, commit
eventbrite = swimport('eventbrite')

def union(year, eventbrite_name, table_name):
    eventbrite.scrape('http://%s.eventbrite.com' % eventbrite_name, table_name)
    d = select('* from `%s`' % table_name)
    for row in d:
        row['year'] = year
    save([], d, 'tcamp')

def download():
    execute('CREATE TABLE IF NOT EXISTS `tcamp`'
            '(`year` integer, `first_scraped` real, `Twitter handle` text, `Intro for your fellow campers` text)')
    execute('DELETE FROM tcamp')
    union(2013, 'tcamp13', '2013')
    union(2012, 'tcamp12', '2012')
    union(2011, 'tcamp11', '2011')

def aggregate():
    execute('create table if not exists twitter (handle text, times integer)')
    execute('create unique index if not exists twitter_handle on twitter(handle)')
    execute('delete from twitter where 1 = 1')
    execute('insert into twitter '
            'select replace(`twitter handle`, "@", ""), count(*) from `tcamp` '
            'where `twitter handle` is not null group by [twitter handle]')
    commit()

download()
aggregate()
from scraperwiki import swimport
from scraperwiki.sqlite import save
dsp = swimport('dsp').dsp
from time import time

DATE = time()

class ParseError(Exception):
    pass

#Load
xml = dsp('http://www.tebabank.co.za/dist_atm.php', False)

#Parse
towns = xml.xpath('//ul[@class="bulletlist"]')
for town in towns:
    townnames = town.xpath('preceding-sibling::h2[position()=1]/text()')
    l = len(townnames)
    if l != 1:
        raise ParseError("There is supposed to be exactly one town name, but %d were found." % l)
    else:
        townname = townnames[0]
    addresses = town.xpath('li/text()')

    #Save
    for address in addresses:
        save([], {
            "town": townname,
            "address": address,
            "date_scraped": DATE,
        }, 'swdata')
import scraperwiki
import simplejson
import urllib2
from scraperwiki import swimport

myfollowers = []
twitter_handle = 'urban_nerds'
base_url = 'https://api.twitter.com/1/followers/ids.json?cursor=-1&screen_name=' + twitter_handle
results_json = simplejson.loads(scraperwiki.scrape(base_url))
myfollowers = results_json['ids']
myfollowers_str = map(str, myfollowers)

swimport('twitter_bulk_users_lookup').bulklookup(myfollowers_str)

'''
See https://scraperwiki.com/scrapers/twitter_bulk_users_lookup/ for the code for the script

Still to do
- add parameter for ID and username (usertype)
'''
# Blank Python
from scraperwiki import swimport
swimport('twitter_search').search(['@dmsilvabr', 'from:dmsilvabr'])
import scraperwiki
import lxml.html
import requests
import xlrd, re
import dateutil.parser
import urlparse
import json

html2text = scraperwiki.swimport('html2text')

xlurl = 'https://dl.dropbox.com/s/ugyauhxhs3xuf1z/dft_scraping_instructions_new.xls?dl=1'
#'dft scraping instructions new' doc emailed by dragon on sep 12.

# julians xls loader
def LoadXLlinks():
    book = xlrd.open_workbook(file_contents=requests.get(xlurl, verify=False).content)
    for i in [0, 2, 4, 6, 8, 10, 12]:
        sheet = book.sheet_by_index(i)
        print sheet.name, "rows:", sheet.nrows
        ldata = []
        for i in range(sheet.nrows):
            sheetvalue = sheet.cell(i, 0).value.strip()
            if sheetvalue:
                ldata.append({"sheetname": sheet.name, "i": i, "url": sheetvalue})
        scraperwiki.sqlite.save(["sheetname", "i"], ldata, "xllinks")
from time import time

try:
    from scraperwiki.sqlite import save, get_var, save_var
    from scraperwiki import swimport
except ImportError:
    # Fallback stubs so the scraper can be run outside ScraperWiki
    def save(a, b, c):
        print(b)
    __VARS = {}
    def get_var(a):
        if __VARS.has_key(a):
            return __VARS[a]
    def save_var(a, b):
        __VARS[a] = b
    def options(*args, **kwargs):
        return [{"branchId": "174", "branchName": "DUNNO"}]
else:
    options = swimport('options').options

URL = "http://www.postbank.co.za/contact.aspx?ID=3"

def log(foo):
    print(foo)

if get_var('previous_branchId') == None:
    save_var('DATE', time())
    FIRST_RUN = True
else:
    FIRST_RUN = False
    DATE = get_var('DATE')

def main():
import scraperwiki
from scraperwiki import swimport

search = swimport('twitter_search').search
search(['from:peterwalker99'])
#print search
from scraperwiki import swimport
swimport('eb').scrape('http://jdcny.eventbrite.com/')
import scraperwiki
import sys
import urllib
import lxml.etree
import re
import datetime

wikipedia_utils = scraperwiki.swimport("wikipedia_utils")

nowtime = datetime.datetime.now()

def UpdateCategory():
    ldata = wikipedia_utils.GetWikipediaCategoryRecurse("Caves_of_the_United_Kingdom")
    for data in ldata:
        data["updatedate"] = nowtime
    scraperwiki.sqlite.save(["title"], ldata, "cavepages")
    scraperwiki.sqlite.execute("delete from cavepages where updatedate<>?", nowtime.isoformat())

def ExtractInfo():
    rdata = scraperwiki.sqlite.select("title, link from cavepages")
    ldata = []
    for pdata in rdata:
        try:
            val = wikipedia_utils.GetWikipediaPage(pdata["title"])
        except IOError:
            print "Skipping", pdata["title"]
            continue
from scraperwiki import swimport
swimport('twitter_search_extended').search(['multiple sclerosis'], num_pages=100)
from scraperwiki import swimport
swimport('twitter_search_extended').search(['Bosone di Higgs'], num_pages=100)
import scraperwiki
import lxml.html
import requests
#import xlrd, re
import dateutil.parser
#import urlparse
#import json
from hashlib import sha1
from random import random
import re

date_check = re.compile("\d+ \S+ \d\d\d\d")

html2text = scraperwiki.swimport('html2text')

# julians xls loader
def LoadXLlinks():
    book = xlrd.open_workbook(file_contents=requests.get(xlurl, verify=False).content)
    for i in [0, 2, 4, 6, 8, 10, 12]:
        sheet = book.sheet_by_index(i)
        print sheet.name, "rows:", sheet.nrows
        ldata = []
        for i in range(sheet.nrows):
            sheetvalue = sheet.cell(i, 0).value.strip()
            if sheetvalue:
                ldata.append({"sheetname": sheet.name, "i": i, "url": sheetvalue})
        scraperwiki.sqlite.save(["sheetname", "i"], ldata, "xllinks")

# julians raw scraper
import scraperwiki
import lxml.html
lazycache = scraperwiki.swimport('lazycache')

#url='http://www.mod.uk/DefenceInternet/AboutDefence/CorporatePublications/ConsultationsandCommunications/PublicConsultations/GpssConsultation.htm'
url = 'http://www.mod.uk/DefenceInternet/DefenceNews/DefencePolicyAndBusiness/FreeTravelForMilitaryPersonnelSupportingSecurityForThe2012Games.htm'

html = lazycache.lazycache(url)
root = lxml.html.fromstring(html)
root.make_links_absolute(url)
print html

def parsenews(root):
    data = {}
    divs = root.xpath("//div[@id='left-column']/div[not(@class)]/p[1]/..")
    for x in divs:
        print x.attrib, lxml.html.tostring(x), '*'

parsenews(root)

def parseconsult(root):
    data = {}
    data['title'] = root.cssselect('h1')[0].text
    #data['body']=lxml.html.tostring(root.xpath("//div[@class='consultationcontent'][2]")[0])
    data['ref'] = root.xpath("//div[@property='dc:identifier']/text()")[0]
    data['assoc_org'] = 'MoD'
    data['attachments'] = []
    for attachment in root.xpath("//a[@rel='DC:haspart']"):
        url = attachment.attrib['href']
        name = attachment.text
from scraperwiki.sqlite import save, select
from scraperwiki import swimport
from requests import session
from lxml.html import fromstring, tostring
import re
from time import time
keyify = swimport('keyify').keyify
randomsleep = swimport('randomsleep').randomsleep

URL = "http://www.ncr.org.za/register_of_registrants/index.php"

#DEV=True
DEV = False

DATE = time()

RE = {
    'leftpadding': re.compile(r'^ *'),
    'rightpadding': re.compile(r' *$'),
}

def headquarters():
    cp1()
    #dc_and_cb1()

def cp1():
    p = Page('CP1')
    while p.lastpage() == False:
        tables = p.table().subtables()
        d = []
from scraperwiki import swimport
swimport('twitter_search_extended').search(['politikk', 'kommunikasjon'])
# -*- coding: UTF-8 -*-
import scraperwiki
import json
from BeautifulSoup import BeautifulSoup
import datetime
import dateutil.parser
import lxml.html
import resource
import sys
import urlparse
import re

lazycache = scraperwiki.swimport('lazycache')
postlistelib = scraperwiki.swimport('postliste-python-lib')

agency = 'Luftambulansetjenesten ANS'

def report_errors(errors):
    if 0 < len(errors):
        print "Errors:"
        for e in errors:
            print e
        exit(1)

def out_of_cpu(arg, spent, hard, soft):
    report_errors(arg)

def process_pdf(parser, pdfurl, errors):
    errors = []
    postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
    try:
        pdfcontent = scraperwiki.scrape(pdfurl)
from urllib2 import urlopen
from scraperwiki import swimport
from scraperwiki.sqlite import save

# URL is defined elsewhere in the original scraper.
def load_disclosures():
    csv = urlopen(URL)
    d = swimport('csv2sw').csv2json(csv)
    save([], d, 'disclosures')
import scraperwiki
from scraperwiki import swimport

search = swimport('twitter_search').search
search(['from:random_robbie'])
#print search
import scraperwiki
import json
import requests
import lxml.html
import itertools
import datetime

swutils = scraperwiki.swimport('swutils')
user_agent = swutils.get_user_agent()
requests.defaults.defaults['max_retries'] = 10

def main():
    print 'scraping November 2012 json feed'
    json_url = "http://ec2-46-51-135-144.eu-west-1.compute.amazonaws.com/gaza/Nov2012/json"
    json_response = requests.get(json_url).text
    json_dict = json.loads(json_response)

    batch = []
    alreadygot = []
    try:
        rows = scraperwiki.sqlite.select('id from features')
        for row in rows:
            alreadygot.append(row['id'])
    except:
        pass

    if len(json_dict['features']) - len(alreadygot) > 0: