def main():
    if get_var('columns_to_do') is None:
        columns = COLUMNS
    else:
        columns = loads(get_var('columns_to_do'))
    while len(columns) > 0:
        column = columns[0]
        d = load_data(column)
        out = []
        for row in d:
            p = Place(row[column], (row['latitude'], row['longitude']))
            row_geocode = p.geocode()
            row_geocode.update({
                "address-column": column,
                "branchId": row['branchId'],
            })
            # Collect every geocoded row; assigning would keep only the last one.
            out.append(row_geocode)
            sleep(3)
        save([], out, 'geocoded')
        columns.remove(column)
        if len(columns) == 0:
            save_var('columns_to_do', None)
        else:
            save_var('columns_to_do', dumps(columns))
def main():
    if get_var('step') is None:
        save_var('step', 0)
    while get_var('step') is not None:
        if get_var('step') == 0:
            download()
            save_var('step', 1)
        elif get_var('step') == 1:
            moreparsing_map()
            save_var('step', 2)
        else:
            # Scraper is finished; reset
            save_var('step', None)
def atomic(): if "client"==pagetype(get_var('previous_href')): table_names=CLIENT_TABLES elif "lobbyist"==pagetype(get_var('previous_href')): table_names=LOBBYIST_TABLES else: raise ResumeError('The type of the previous href, "%s", could not be determined.' % get_var('previous_href')) if "clients_urls" in show_tables(): sourceUrl=select('distinct sourceUrl as "s" from `clients_urls` where jobId=(select max(jobId) from `clients_urls`)')[0]['s'] for table_name in table_names: execute('DELETE FROM `%s` where jobId in (select jobId from clients_urls where sourceUrl="%s")' % (table_name,sourceUrl)) commit() return sourceUrl
def getroutes():
    skip = get_var('skip')
    json = urlopen(
        "http://coach.iriscouch.com/routes/_design/coach/_view/fullRoutes?skip=%d&limit=%d"
        % (skip, skip + 1000)).read()
    table = loads(json)['rows']
    return table
def main():
    if get_var('skip') is None:
        save_var('skip', 0)
    routesTable = getroutes()
    for row in routesTable:
        if row['key'][0:2] != row['key'][2:4]:
            get_route_schedules(row['id'], row['key'])
def main():
    if get_var('DATE') is None:
        save_var('DATE', time())
    searchTerms = get_searchTerms()
    for searchTerm in searchTerms:
        d = paginate(searchTerm)
        for row in d:
            row['date_scraped'] = get_var('DATE')
            row['searchTerm'] = searchTerm
        save_var('previous_searchTerm', searchTerm)
        save(['date_scraped', 'Name'], d, 'initial')
    save_var('previous_searchTerm', None)
    save_var('DATE', None)
def shallow_scrape():
    br = mechanize.Browser()
    c = sqlite.get_var("last_page", 0) + 1
    max_c = c + 6
    resultspage = br.open("http://www.education.gov.uk/edubase/quickSearchResult.xhtml?page=%d" % c)
    while c < max_c:
        print ""
        print "Handling page %d..." % c
        print "  [" + br.geturl() + "]"

        ### extract data from page
        page = html.parse(resultspage)
        for u in page.getroot().findall("body/div/div/div/div/table/tr/td/table/tbody/tr/td/a"):
            urn = re.search("urn=([0-9]{6})", u.get("href")).group(1)
            yield urn

        ### get new page
        try:
            resultspage = br.follow_link(text="Next")
            sqlite.save_var("last_page", c)
            c += 1
            if c % 2 == 0:
                time.sleep(10)
        except mechanize.LinkNotFoundError:
            c += 1
            sqlite.save_var("last_page", 0)
            break
def get_searchTerms():
    searchTerm = get_var('previous_searchTerm')
    if searchTerm is None:
        i = 0
    else:
        i = ascii_lowercase.index(searchTerm) + 1
    return ascii_lowercase[i:]
def main():
    #finalpage = get_var('finalpage')
    prevpage = get_var('prevpage')
    #if None == finalpage:
    if True:
        finalpage = int(get_lastpage(getpage(1)))
        save_var('finalpage', finalpage)
    if prevpage is None:
        prevpage = 1
    if prevpage < finalpage:
        step1(prevpage, finalpage)
    elif prevpage == finalpage:
        if not "step2completion" in show_tables():
            execute('create table `step2completion` (`url` text, `browsed` boolean)')
            execute("""
                INSERT INTO `step2completion` (url, browsed)
                SELECT url, 0 as "browsed"
                FROM locations
            """)
            commit()
        step2()
def main():
    if get_var('downloaded') is None:
        download()
        save_var('downloaded', 1)
    execute('DROP TABLE IF EXISTS `final`')
    clean()
    save_var('downloaded', None)
def main():
    foo = get_var('runId')
    runId = 1 if foo is None else foo + 1
    save_var('runId', runId)
    try:
        nonsense()
    except:
        try:
            nonsense()
        except:
            exceeded(runId)
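# A minimal sketch of what the `exceeded` fallback used above might do. The
# helper is not shown in this collection, so this body is an assumption: it
# records which run hit repeated failures so a later run (or a human) can
# inspect it.
from scraperwiki.sqlite import save

def exceeded(runId):
    # Hypothetical implementation: log the run that failed twice in a row.
    save([], {'runId': runId, 'failed': 1}, 'failures')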
def main():
    if get_var('province') == 'step2':
        separate_addresses()
        execute('DELETE FROM swvariables WHERE name = "province"')
        commit()
        print("""
        ================================
          This run is finished!
        ================================
        """)
    else:
        download()
def download(abridge=False):
    d = []

    # Resume the saved provinces
    provinces = getprovinces()
    province = get_var('province', provinces[0])

    # Put the date in. This will get passed along, so this is the only time I add it.
    province['date_scraped'] = get_var('DATE', int(time()))

    # Get the cities
    cities = getcities(province['provinceId'])
    for city in cities:
        # Pass along the province
        city.update(province)
        branches = getbranches_with_info(city['cityId'])
        for branch in branches:
            #print branch
            branch.update(city)
            d.append(branch)
        if abridge:
            break

    i = provinces.index(province) + 1
    print provinces
    if i < len(provinces):
        save_var('province', dumps(provinces[i]))
        print('Finished with branches in %s' % province['provinceName'])
    else:
        save_var('province', None)
        print('Finished with all the downloading!')
    save([], d, 'initial')
def cp1():
    execute('''
    CREATE TABLE IF NOT EXISTS `businessPremises` (
      `date_scraped` REAL,
      `businessPremisesURL` TEXT,
      FOREIGN KEY (date_scraped, businessPremisesUrl)
        REFERENCES cp1(date_scraped, businessPremisesUrl)
    )
    ''')
    if get_var('crashed') == 1:
        pagenum = select('max(pagenum) from cp1 where date_scraped = (select max(date_scraped) from cp1)')[0]['max(pagenum)']
        print "Resuming from page %d" % pagenum
        p = Page('CP1')
        p = Page('CP1', s=p.s, pagenum=pagenum)
    else:
        print "Starting a new run"
        p = Page('CP1')
    while p.lastpage() == False:
        print "Beginning page %d" % p.pagenum
        tables = p.table().subtables()
        d = []
        for table in tables:
            row = table.parse()
            row['businessPremisesURL'] = table.business_premises_url()
            try:
                business_premises_data, more_registrant_data = table.business_premises(p.s)
            except Exception, msg:
                # Interpolate both the URL and the error message.
                print "Error on %s: %s" % (table.business_premises_url(), msg)
                sleep(60)
                print "Trying again"
                business_premises_data, more_registrant_data = table.business_premises(p.s)
            row['date_scraped'] = DATE
            row['pagenum'] = p.pagenum
            row['url'] = URL + "?page=%d" % p.pagenum
            row.update(more_registrant_data)
            save([], business_premises_data, 'businessPremises')
            save(['date_scraped', 'businessPremisesURL'], row, 'cp1')
            sleep(1)
        save_var('crashed', 1)
        p = p.next25()
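# cp1() above sets `crashed` to 1 after every page, so something presumably
# clears that flag once the run completes; otherwise the next run would always
# try to resume. The cleanup is not shown in this snippet, so the helper below
# is only a sketch of that assumed reset.
from scraperwiki.sqlite import save_var

def mark_finished():
    # Clear the crash flag so the next run starts from page 1.
    save_var('crashed', 0)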
def get_route_schedules(routeId, route):
    # Check that it's not a route within one city
    assert route[0:2] != route[2:4]
    xml, theurl = grab(route)
    save(['routeId', 'url'], {"routeId": routeId, "url": theurl}, 'urls')
    try:
        table = get_table(xml)
    except:
        save([], {"url": theurl}, 'errors')
    else:
        d_raw = parse_table(table)
        d = []
        for row_raw in d_raw:
            row_clean = {}
            for key in row_raw:
                if key == ":Route/Trip":
                    row_clean['routeNum'] = row_raw[key]
                else:
                    foo, bar, baz = key.split(':')
                    if foo == "From":
                        row_clean['fromCity'] = bar
                        row_clean['fromStop'] = baz
                        row_clean['fromTime'] = row_raw[key]
                    elif foo == "To":
                        row_clean['toCity'] = bar
                        row_clean['toStop'] = baz
                        row_clean['toTime'] = row_raw[key]
            row_clean['routeId'] = routeId
            if row_clean['toStop'] == 'megabus.com stop' and row_clean['fromStop'] == 'megabus.com stop':
                table_name = 'megabus'
            else:
                table_name = 'schedules'
            save([], row_clean, table_name)
    save_var('skip', get_var('skip') + 1)
def jobs(limit=2):
    print("Scraping individual job information")
    previous_href = get_var('previous_href', verbose=False)
    if previous_href is None:
        hrefs = [row['href'] for row in select('href FROM links ORDER BY href LIMIT %d' % limit, verbose=False)]
    else:
        hrefs = getnexthrefs(limit, previous_href)
        previous_url = atomic()
        if MORE_QUERY_STRING['base'] in previous_url:
            print "Resuming from %s" % previous_url
            url, startpage_str = re.split(MORE_QUERY_STRING['re'], previous_url)
            href_resume = url.replace('http://www.nyc.gov/lobbyistsearch/', '')
            startpage = int(startpage_str)
            paginate_result(href_resume, startpage=startpage)
            randomsleep()
    for href in hrefs:
        paginate_result(href, startpage=1)
        randomsleep()
    save_var('previous_href', hrefs[-1], verbose=False)
from scraperwiki.sqlite import save, select, execute, save_var, get_var, commit
from scraperwiki import swimport
from requests import session
from lxml.html import fromstring, tostring
import re
from time import time, sleep

keyify = swimport('keyify').keyify

URL = "http://www.ncr.org.za/register_of_registrants/index.php"
#DEV = True
DEV = False
DATE = get_var('DATE', time())
RE = {
    'leftpadding': re.compile(r'^ *'),
    'rightpadding': re.compile(r' *$'),
}

def cp1():
    execute('''
    CREATE TABLE IF NOT EXISTS `businessPremises` (
      `date_scraped` REAL,
      `businessPremisesURL` TEXT,
      FOREIGN KEY (date_scraped, businessPremisesUrl)
        REFERENCES cp1(date_scraped, businessPremisesUrl)
    )
    ''')
    if get_var('crashed') == 1:
def select_branchIds(branches):
    branchIds = [unicode(branch['branchId']) for branch in branches]
    previous_branchId = get_var('previous_branchId')
    branchIds.sort()
    i = branchIds.index(previous_branchId)
    return branchIds[i:]
    if __VARS.has_key(a):
        return __VARS[a]

    def save_var(a, b):
        __VARS[a] = b

    def options(*args, **kwargs):
        return [{"branchId": "174", "branchName": "DUNNO"}]
else:
    options = swimport('options').options

URL = "http://www.postbank.co.za/contact.aspx?ID=3"

def log(foo):
    print(foo)

if get_var('previous_branchId') is None:
    save_var('DATE', time())
    FIRST_RUN = True
else:
    FIRST_RUN = False
DATE = get_var('DATE')

def main():
    b = PostbankBrowser()
    branches = b.get_branch_list()
    if FIRST_RUN:
        save_branches(branches)
    for branchId in select_branchIds(branches):
        b.load_branch(branchId)
import json
import requests
import scraperwiki.sqlite as db
import time

begin = 1
counciltype = json.loads(requests.get('http://mapit.mysociety.org/areas/LBO').content)
for council, data1 in counciltype.items():
    if db.get_var('id') == council and begin == 0:
        begin = 1
    if begin == 1:
        print data1['name']
        db.save_var('id', council)
        children = json.loads(requests.get('http://mapit.mysociety.org/area/%s/children' % council).content)
        for id, data in children.items():
            #time.sleep(1)
            json.loads(requests.get('http://mapit.mysociety.org/area/%s' % id).content)
            if data['type'] == 'LBW':
                #time.sleep(0.1)
                kml = requests.get('http://mapit.mysociety.org/area/%s.kml' % id).content
                councildata = {'type': data['type'], 'parent_name': data1['name'],
                               'id': int(id), 'name': data['name'], 'kml': kml[85:-7]}
                db.save(['id'], councildata, verbose=0)
from scraperwiki.sqlite import save, get_var, save_var
from lxml import html

URL = "http://www.e-ships.net/new/?View=ShipSearchResult"
URL += "&ship_name=&fdwt=&tdwt=&last_ex_name=&fgt=&tgt=&imo=&fnrt=&tnrt=&ship_type=-1&fteu=&tteu=&"
URL += "flag=-1&floa=&tloa=&ship_class=-1&fbeam=&tbeam=&call_sign=&fdraft=&tdraft=&owner_id="
URL += "&fbuilt=&tbuilt=&manager_id=&fengine_kw_total=&tengine_kw_total=&builder_id=&fengine_hp_total="
URL += "&tengine_hp_total=&sortby=ship_name&p=%s"

i = get_var('page')
if i is None:
    i = 0
while i <= 1174:
    doc = html.parse(URL % i).getroot()
    rows = doc.xpath('//tr')
    if len(rows) == 1:
        break
    d = []
    for row in rows:
        link = row.find('td/a')
        if link is None or not 'ShipDetails' in link.get('href'):
            continue
        number, name, type, dwt, built, flag, _ = map(lambda c: c.text, row)
        d.append({
            'number': number,
            'name': name,
            'type': type,
            'dwt': dwt,
            'built': built,
def resume(levels=(4, 3, 2, 1)):
    """Resume an incomplete scrape."""
    for level in levels:
        js = get_var(str(level))
        if js is not None:
            resume_siblings(js, level)
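# resume() above reads a saved JSON blob per level, which implies the scraper
# checkpoints the node it is currently working on under that level's number
# and clears it when the level finishes. The traversal and resume_siblings are
# not shown here, so the helpers below are only a sketch of that assumed
# checkpointing side.
from json import dumps
from scraperwiki.sqlite import save_var

def checkpoint(node, level):
    # Remember where we are at this level so resume() can pick it up later.
    save_var(str(level), dumps(node))

def finish_level(level):
    # Mark the level as complete; resume() skips levels whose value is None.
    save_var(str(level), None)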
from scraperwiki import scrape
from scraperwiki.sqlite import save, get_var
from urllib2 import urlopen
from lxml.html import fromstring
from datetime import *

part1 = 'http://wwe1.osc.state.ny.us/transparency/contracts/contractresults.cfm?PageNum_rsContract='
part2 = '&sb=a&searchBy=&a=Z0000&au=0&ac=&v=%28Enter+Vendor+Name%29&vo=B&cn=&c=-1&m1=0&y1=0&m2=0&y2=0&am=0&b=Search&entitytype=Agency&order=PAYEE_NAME&sort=ASC'

start_page = get_var('start_page')
if start_page is None:
    start_page = 1
urlstrings = [part1 + str(i) + part2 for i in range(start_page, 992)]

headers = [
    'Vendor', 'Agency', 'Contract_Number', 'Current_Contract_Amount',
    'Spending_to_Date', 'Contract_Start_Date', 'Contract_End_Date',
    'Contract_Description', 'Contract_Type', 'Contract_Approval_Date'
]

for urlstring in urlstrings:
    page_data = scrape(urlstring)
    page_data = fromstring(page_data).cssselect('#tableData tr')
    dict_rows = []
    for row in page_data:
        dict_row = dict(zip(headers, [cell.text_content().strip()
                                      for cell in row.cssselect('td')
                                      if cell.text_content().strip() != None]))
        dict_row['url'] = urlstring
        if dict_row:
            try:
                dict_row['Current_Contract_Amount'] = float(dict_row.get('Current_Contract_Amount', '').replace('$', '').replace(',', '').replace('(', '-').replace(')', ''))
                dict_row['Spending_to_Date'] = float(dict_row.get('Spending_to_Date', '').replace('$', '').replace(',', '').replace('(', '-').replace(')', ''))
def gen_urls():
    for skip in xrange(first_skip, last_skip + SKIP_STEP, SKIP_STEP):
        url = '%s&skip=%s' % (start_url, skip)
        if not db.get_var(url, 0, verbose=0):
            yield url
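# gen_urls() above skips any URL whose saved variable is already truthy, so
# the consuming loop presumably marks each URL once it has been scraped. A
# minimal sketch of that loop, assuming a hypothetical scrape_page(url) helper
# that does the actual work:
import scraperwiki.sqlite as db

def run():
    for url in gen_urls():
        scrape_page(url)  # hypothetical per-page scraper
        # Mark the URL as done so a rerun after a crash skips it.
        db.save_var(url, 1, verbose=0)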
            row.update({'date_scraped': time(), 'ScraperRun': scraper_run,
                        'url': self.url, 'Record': int(self.url.split('=')[-1])})
            data.append(row)
        save([], data, 'BusinessPremises')

execute('CREATE TABLE IF NOT EXISTS Registrant (ScraperRun INTEGER, Record INTEGER)')
execute('CREATE INDEX IF NOT EXISTS RegistrantRecord ON Registrant(record)')
execute('CREATE TABLE IF NOT EXISTS BusinessPremises (ScraperRun INTEGER, Record INTEGER, FOREIGN KEY(Record) REFERENCES Registrant(Record))')
execute('CREATE INDEX IF NOT EXISTS BusinessPremisesRecord ON BusinessPremises(ScraperRun, Record)')
commit()

if "stack" not in show_tables() or select('count(*) as "c" from stack')[0]['c'] == 0:
    save_var('scraper_run', int(time()))
scraper_run = get_var('scraper_run', None)
if scraper_run is None:
    raise NameError('scraper_run is not defined.')

seed([SearchResults(None)])
#seed([BusinessPremises('http://www.ncr.org.za/register_of_registrants/viewpremises.php?record=11296')])

from scraperwiki.sqlite import save, select, execute, save_var, get_var, commit, show_tables
from scraperwiki import swimport
#from requests import session
import requests
from lxml.html import fromstring, tostring
import re
from time import time, sleep

keyify = swimport('keyify').keyify
randomsleep = swimport('randomsleep').randomsleep

# --------------------------------------------------
    return branch

def parse_maphref(maphref):
    html = maphref.split("'")[1].replace('<br>', '')
    x = fromstring(html)
    keys = ["map_%s" % keyify(key) for key in x.xpath('strong/text()')]
    values = x.xpath('text()')
    return dict(zip(keys, values))

execute('CREATE TABLE IF NOT EXISTS provinces (provinceUrl TEXT)')
execute('CREATE TABLE IF NOT EXISTS cities (provinceUrl TEXT, cityUrl TEXT, FOREIGN KEY(provinceUrl) REFERENCES provinces(provinceUrl))')
execute('CREATE TABLE IF NOT EXISTS branches (cityUrl TEXT, branchUrl TEXT, FOREIGN KEY(cityUrl) REFERENCES cities(cityUrl))')
commit()

scraperrun = get_var('scraperrun', int(time()))
save_var('scraperrun', scraperrun)

seed([Menu(URLS['main'])])

execute('delete from swvariables where name = "scraperrun"')
commit()

from lxml.html import fromstring
#from lxml.etree import fromstring
from time import time
import requests
from scraperwiki.sqlite import save, save_var, get_var, select, commit, execute
from scraperwiki import swimport
options = swimport('options').options
keyify = swimport('keyify').keyify
randomsleep = swimport('randomsleep').randomsleep
from json import loads, dumps
strip_address = swimport('strip_address').strip_address
from urllib2 import urlopen
from scraperwiki import pdftoxml
from scraperwiki.sqlite import save_var, get_var
from lxml.etree import fromstring, tostring
from unidecode import unidecode

#pdfxml = pdftoxml(urlopen('http://www.hydepark.org/schools/HPKCC%20Youth%20Programs%20Database-%20Version%203.pdf').read())
#save_var('pdfxml', unidecode(pdfxml))
pdfxml = get_var('pdfxml')
x = fromstring(pdfxml)
for page in x.xpath('//page'):
    bs = page.xpath('descendant::b/text()')
    if len(bs) == 10 and bs[0] == 'HPKCC Youth Programs Database':
        del(bs[0])
    if len(bs) != 9:
        raise ValueError('Wrong number of bold text boxes')
    elif bs[0:7] != ['Program Name', 'Program Desc.', 'Program Website', 'Program Address',
                     'Contact Name', 'Contact Number', 'Contact Email']:
        raise ValueError('Wrong table keys')
    updated, pagenumber = bs[7:9]
    lefts = map(int, list(set(page.xpath('text/@left'))))
    lefts.sort()
    for left in lefts:
        print [t.xpath('string()') for t in page.xpath('text[@left = "%d"]' % left)]
#    tops = map(int, list(set(page.xpath('text/@top'))))
#    tops.sort()
#    print len(tops)
        text = '\n'.join(text.split('\n')[2:4]).replace("document.getElementById('bizdir_directory').innerHTML = '", '')
        text = re.sub(r"';\s*document.getElementById('bizdir_search').disabled = false;", '', text).replace(" 8</div>';", ' 8</div>').replace("\\'", '')
        html = fromstring(text)
        bizdir_directory = []
        for tr in html.cssselect('#bizdir_directory tr'):
            try:
                assert tr.xpath('count(td)') == 1
                name = element(tr, 'td/b').text_content()
                description = element(tr, 'td/p/text()')
                bizdir_directory.append({
                    'name': name,
                    'description': description,
                    'pageOffset': self.offset,
                    'scraperrun': scraperrun,
                })
            except:
                print tostring(tr)
                raise
        save(['scraperrun', 'pageOffset', 'name'], bizdir_directory, 'organizations')

scraperrun = get_var('scraperrun', time())
save_var('scraperrun', scraperrun)

seed([Directory('http://www.chicagoistheworld.org/notalone/directory-of-youth-organizations/')])

execute('DROP TABLE stack')
execute('DROP TABLE swvariables')
commit()

from lxml.html import fromstring, tostring
from time import time, sleep
import requests
from scraperwiki.sqlite import save, save_var, get_var, select, commit, execute
import re

# --------------------------------------------------
# Begin Bucket-Wheel
# --------------------------------------------------

class Stack:
    "A fancier stack, at some point"
#!/usr/bin/env python
"""Download postsecret images"""
from urllib2 import urlopen
from lxml.html import fromstring
from scraperwiki.sqlite import save, select, NoSuchTableSqliteError, get_var, save_var
import base64

URL = 'http://www.postsecret.com'
WAYBACK_URL = get_var('wayback_url')
#End imports
#-----------

def wayback(url):
    """Download from the wayback machine."""
    xml = pull(url)
    try:
        parse(url, xml, suffix='_wayback')
        url = xml.xpath('//a[img[@src="http://staticweb.archive.org/images/toolbar/wm_tb_prv_on.png"]]')[0].attrib['href']
        print url
        wayback(url)
    except:
        save_var('wayback_url', url)

def parse(url, xml=None, suffix=''):
    if xml is None:
        xml = pull(url)
    sunday = xml.xpath('//h2[@class="date-header"]')[0].text
import json
import requests
import scraperwiki.sqlite as db
import time

begin = 0
counciltype = json.loads(requests.get("http://mapit.mysociety.org/areas/DIS").content)
time.sleep(1)
for council, data1 in counciltype.items():
    print data1["name"]
    if db.get_var("id") == council and begin == 0:
        begin = 1
    if begin == 1:
        db.save_var("id", council)
        children = json.loads(requests.get("http://mapit.mysociety.org/area/%s/children" % council).content)
        time.sleep(1)
        for id, data in children.items():
            json.loads(requests.get("http://mapit.mysociety.org/area/%s" % id).content)
            time.sleep(1)
            if data["type"] == "DIW":
                kml = requests.get("http://mapit.mysociety.org/area/%s.kml" % id).content
                time.sleep(1)
                councildata = {
                    "type": data["type"],
                    "parent_name": data1["name"],
                    "id": int(id),
                    "name": data["name"],
                    "kml": kml[85:-7],
                }
                db.save(["id"], councildata, verbose=0)
def grab(from_city, from_state, to_city, to_state):
    theurl = url(from_city, from_state, to_city, to_state)
    opener = build_opener(HTTPCookieProcessor())
    try:
        o = opener.open(theurl)
    except BadStatusLine:
        return None
    xml = fromstring(o.read())
    if not route_exists(xml):
        return None
    try:
        table = xml.xpath('//table[tr[@class="tableHilightHeader"]]')[0]
    except:
        save([], {
            "from_city": from_city,
            "from_stat": from_state,
            "to_city": to_city,
            "to_state": to_state,
        }, 'errors')
        return None

    #cities = table.xpath('tr[position()=1]/td')
    schedules = table.xpath('tr[position()>2]')
    columns = get_columns(table)

    # Get the id
    odId = get_var('origin_destination_id')
    sId = get_var('schedule_id')
    if odId is None:
        odId = 1
    if sId is None:
        sId = 1

    # Initialize for the loop
    d = []
    on_fromstops = True
    for schedule in schedules:
        times = schedule.xpath('td/child::node()[position()=1]')
        #times.pop()
        #times.append(schedule.xpath('td/text()')[-1])
        print zip(times, columns)
        #assert False
        for value, column in zip(times, columns):
            if "days" == column:
                row = {"key": "days"}
            elif "arrow" == column:
                on_fromstops = False
                continue
            elif "Route/Trip" == column:
                row = {"key": "route_code"}
            elif on_fromstops:
                row = {"key": "fromstop", "stop": column}
            elif not on_fromstops:
                row = {"key": "tostop", "stop": column}
            row.update({"value": value, "sId": sId, "odId": odId})
            d.append(row)
        sId += 1

    # Save origin-destination information
    save(['id'], {
        "id": odId,
        "from_city": from_city,
        "from_stat": from_state,
        "to_city": to_city,
        "to_state": to_state,
    }, 'origin_destinations')
    # Save schedule information
    save([], d, 'schedules')
    odId += 1
    save_var('origin_destination_id', odId)
    save_var('schedule_id', sId)