def main(): """Check what has been scraped so far, then resume. It might be good to check for gaps in the scraping. Or maybe a recursive approach isn't the best for search pages like this.""" #What's already been scraped recently? if not 'directory' in show_tables(): last_searched=0 else: #Only skip things from the current scraper completion attempt. if 'scrape_completions' in show_tables(): raw_ids=select('scrape_ids from scrape_completions order by completion_id desc limit 1')[0]['scrape_ids'] max_to_ignore=max(map(int,raw_ids.split(','))) min_to_scrape=max_to_ignore+1 else: min_to_scrape=1 incomplete_scrape=select('max("search_id") as m from directory where scrape_id>='+str(min_to_scrape))[0]['m'] if incomplete_scrape!=None: last_searched=incomplete_scrape else: last_searched=0 if 'scrape_times' in show_tables(): last_id=select('max("scrape_id") as m from scrape_times')[0]['m'] else: last_id=0 #Time of scrape start scrape_id=last_id+1 save(['scrape_id'],{"scrape_id":scrape_id,"scrape_time":time()},'scrape_times') grab(last_searched+1,{"scrape_id":scrape_id},oncompletion=oncompletion)
def main():
    if 'splitnames' in show_tables():
        print "Already finished"
    elif 'lobbyists' in show_tables():
        parsenames()
    else:
        download()
        parsenames()
def main():
    #What has already been scraped
    if 'contributions' in show_tables():
        scraped = [row['querystring'] for row in select('querystring from contributions')]
    else:
        scraped = []
    pagenumber = 0
    while True:
        pagenumber = pagenumber + 1
        xml = load(pagenumber)
        #Get the header row
        rows = xml.xpath('//table[@class="table_text"][tr[@class="tan_row"]]')[0].getchildren()[1:]
        keys = ['name', 'contestant_party_district', 'date_received', 'class_and_partnum', 'association', 'monetary', 'non-monetary']
        #Get the data rows
        ds = []
        for row in rows:
            #A fresh dict per row; sharing one dict would save N copies of the last row.
            d = {}
            cells = row.getchildren()
            contributor = cells.pop(0).getchildren()[0]
            d['querystring'] = contributor.attrib['href'].replace("javascript:PopUp('contributor.aspx?", '').replace("', '300', '300');", '')
            d[keys[0]] = contributor.text
            #After the pop, cells[i] holds the (i+2)th column, so it pairs with keys[i+1].
            for i in range(len(cells)):
                d[keys[i + 1]] = cells[i].text
            ds.append(d)
        #Don't run again if already run
        if ds[0]['querystring'] in scraped:
            break
        else:
            save(['querystring'], ds, 'contributions')
def main():
    #finalpage=get_var('finalpage')
    prevpage = get_var('prevpage')
    #if None==finalpage:
    if True:
        finalpage = int(get_lastpage(getpage(1)))
        save_var('finalpage', finalpage)
    if prevpage is None:
        prevpage = 1
    if prevpage < finalpage:
        step1(prevpage, finalpage)
    elif prevpage == finalpage:
        if "step2completion" not in show_tables():
            execute('create table `step2completion` (`url` text, `browsed` boolean)')
            execute("""
                INSERT INTO `step2completion` (url, browsed)
                SELECT url, 0 as "browsed" FROM locations
            """)
            commit()
        step2()
def get_page(url, table_name="pages"):
    if table_name not in show_tables():
        raise PageNotSavedError(url)
    else:
        rows = select("`text` from %s where url=?" % table_name, [url])
        l = len(rows)
        if l == 0:
            raise PageNotSavedError(url)
        elif l > 1:
            raise DatastoreError(url, "Multiple rows match this url.")
        elif l == 1:
            if 'text' not in rows[0].keys():
                raise DatastoreError(url, "The database does not have a `text` column.")
            else:
                return rows[0]['text']

#Tests
#import unittest
#class TestGetPage(unittest.TestCase):
#    def test_good_page(self):
#        url="https://scraperwiki.com/scrapers/dbgetpy/"
#        get_page(url)
#        row=select('* from `pages` where url=?',[url])[0]
#        self.assertEqual(set(row.keys()),set(["url","text"]))
#        self.assertIn("dbget=swimport('dbgetpy')",row['text'])

#if __name__ == '__main__':
#    print "Running tests"
#    unittest.main()
#else:
#    import os
#    print "Running from bash"
#    print os.execvp("python",["script.py"])
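#A minimal usage sketch (hypothetical url; PageNotSavedError is raised above
#when the page isn't cached yet):
#try:
#    text = get_page('http://example.com/page', table_name='pages')
#except PageNotSavedError:
#    text = None  #e.g. fetch the page here and save it for next time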
def swversion(table_name='swdata'):
    if table_name in show_tables():
        timestamp = select("max(date_extracted) as m from %s;" % table_name)[0]['m']
        execute("ALTER TABLE `%s` RENAME TO `%s_%d`;" % (table_name, table_name, timestamp))
        commit()
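#Usage sketch, assuming the table has a numeric `date_extracted` column as
#the query above requires:
#swversion()             #renames `swdata` to e.g. `swdata_1325376000`
#swversion('lobbyists')  #archives another table the same way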
def nextid():
    #Returns the highest id saved so far (0 if none); callers presumably add 1.
    defaultquery = [{"id": 0}]
    if OBS not in show_tables():
        idquery = defaultquery
    else:
        idquery = select('max(id) as id from %s' % OBS)
    #max(id) on an empty table yields a NULL row, so guard that case too.
    if len(idquery) == 0 or idquery[0]['id'] is None:
        idquery = defaultquery
    id = idquery[0]['id']
    return id
def scrape(url,table_name="swdata", how_many = 10000): listurl=attendeelisturl(url) d=getattendeelist(listurl) d = getattendeelist(listurl + '&show_more=%d&sortid=0' % how_many) if table_name in show_tables(): scraped_so_far=select('count(*) as "c" from `%s`'%table_name)[0]['c'] saveattendeelist(d[0:-scraped_so_far],table_name) else: saveattendeelist(d,table_name)
def geocode(): if "scraped" not in show_tables(): d = swimport('csv2sw').read.csv('https://views.scraperwiki.com/run/combine_mix_scraper_spreadsheets/') save([], d, 'scraped') if "address" not in show_tables(): initialize() while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0: address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0] #print address if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0: d = all_services(address['address-input']) for row in d: row['address-input'] = address['address-input'] save([], d, 'geocode') params = (address['address-column'], address['address-input']) execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params ) commit()
def check_identical_screenshot(image_base64): """Check whether there's an identical screenshot already saved""" #If,else to handle new tables if 'images' in show_tables(): identical_screenshot=select('screenshot_id from images where image="'+image_base64+'" limit 1') else: identical_screenshot=[] if len(identical_screenshot)==0: #No identical screenshot if 'images' in show_tables(): screenshot_id=select('max(screenshot_id) as id from images')[0]['id']+1 else: screenshot_id=1 return (False,{ "screenshot_id":screenshot_id , "image":image_base64 }) elif len(identical_screenshot)==1: return (True,identical_screenshot[0])
def check_identical_screenshot(image_base64): """Check whether there's an identical screenshot already saved""" #If,else to handle new tables if 'images' in show_tables(): identical_screenshot = select( 'screenshot_id from images where image="' + image_base64 + '" limit 1') else: identical_screenshot = [] if len(identical_screenshot) == 0: #No identical screenshot if 'images' in show_tables(): screenshot_id = select( 'max(screenshot_id) as id from images')[0]['id'] + 1 else: screenshot_id = 1 return (False, {"screenshot_id": screenshot_id, "image": image_base64}) elif len(identical_screenshot) == 1: return (True, identical_screenshot[0])
def geocode(): if "scraped" not in show_tables(): d = swimport('csv2sw').read.csv('http://hacks.thomaslevine.com/all.csv') save([], d, 'scraped') execute('DELETE FROM `scraped` WHERE `Country` != "South Africa"') commit() if "address" not in show_tables(): initialize() while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0: address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0] #print address if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0: d = all_services(address['address-input']) for row in d: row['address-input'] = address['address-input'] save([], d, 'geocode') params = (address['address-column'], address['address-input']) execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params ) commit()
def go(number=1, pagetype="SCRAPERS"):
    foo = scrapepage(number, pagetype)
    #select() returns rows (dicts), so compare against the url values themselves.
    is_end = ('scraper_urls' in show_tables()) and (foo['lasturl'] in [row['url'] for row in select('url from scraper_urls')])
    #Save after checking whether it's the end because that's how I check.
    save(['url'], foo['scraper_urls'], 'scraper_urls')
    if foo['lastpage']:
        #End when we reach the last page
        print "I scraped all the scrapers!"
    elif is_end:
        #End when we reach page where a scraper has already been scraped
        print "I scraped all of the new scrapers!"
    else:
        go(number + 1, pagetype)
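#go() recurses once per results page, so a very long run could hit Python's
#default recursion limit. A minimal iterative sketch of the same loop
#(go_iterative is a hypothetical name; it assumes scrapepage() and the
#tables behave exactly as above):
def go_iterative(number=1, pagetype="SCRAPERS"):
    while True:
        foo = scrapepage(number, pagetype)
        seen = ('scraper_urls' in show_tables()) and (foo['lasturl'] in [row['url'] for row in select('url from scraper_urls')])
        save(['url'], foo['scraper_urls'], 'scraper_urls')
        if foo['lastpage']:
            print "I scraped all the scrapers!"
            break
        elif seen:
            print "I scraped all of the new scrapers!"
            break
        number = number + 1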
def atomic():
    if "client" == pagetype(get_var('previous_href')):
        table_names = CLIENT_TABLES
    elif "lobbyist" == pagetype(get_var('previous_href')):
        table_names = LOBBYIST_TABLES
    else:
        raise ResumeError('The type of the previous href, "%s", could not be determined.' % get_var('previous_href'))
    if "clients_urls" in show_tables():
        sourceUrl = select('distinct sourceUrl as "s" from `clients_urls` where jobId=(select max(jobId) from `clients_urls`)')[0]['s']
        for table_name in table_names:
            #Bind sourceUrl as a parameter rather than quoting it into the SQL.
            execute('DELETE FROM `%s` where jobId in (select jobId from clients_urls where sourceUrl=?)' % table_name, [sourceUrl])
        commit()
        return sourceUrl
def main():
    if 'cities_done' not in show_tables():
        cities_done = []
    else:
        cities_done = select('* from cities_done')
    for fromcity in CITIES_NY:
        for tocity in CITIES_NY:
            if fromcity == tocity:
                print 'Skipping within-%s route' % fromcity
            elif {"from": fromcity, "to": tocity} in cities_done:
                print 'Already scraped %s to %s' % (fromcity, tocity)
            else:
                grab(fromcity, "NY", tocity, "NY")
                save([], {"from": fromcity, "to": tocity}, 'cities_done')
def get_scraper_state():
    all_views = [row['value'] for row in select('value FROM views ORDER BY value', verbose=False)]
    if 'links' not in show_tables():
        years_to_do = [row['value'] for row in select('value FROM years ORDER BY value', verbose=False)]
        remaining_views_this_year = all_views
    else:
        finished = select('max(view) as "view",year from links where year=(select max(year) from links)', verbose=False)
        years_to_do = [row['value'] for row in select('value FROM years WHERE value>"%s" ORDER BY value' % finished[0]['year'], verbose=False)]
        remaining_views_this_year = [row['value'] for row in select('value from views where value>"%s"' % finished[0]['view'], verbose=False)]
        del finished
    return {
        "all-views": all_views,
        "years-to-do": years_to_do,
        "remaining-views-this-year": remaining_views_this_year,
    }
def moreparsing_map():
    "Map along the most recent results in the table (like a Couch map) and return a new one"
    d = select("* FROM `swdata` WHERE date_scraped=(SELECT max(date_scraped) from `swdata`);")
    for row in d:
        row['street-address'], row['postal-code'] = splitAddress(row['Address_'])
        row['town'] = extractTown(row['branchName'])
    if 'final' in show_tables():
        execute('DROP TABLE `final`;')
    d_final = []
    for row in d:
        #"Nambia" is kept as-is so the filter matches whatever value appears in the data.
        if row['regionName'] not in ["Botswana", "Malawi", "Nambia"]:
            d_final.append(row)
    save([], d_final, 'final')
def geocode(): if "address" not in show_tables(): initialize() while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0: address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0] #print address if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0: d = all_services(address['address-input']) for row in d: row['address-input'] = address['address-input'] save([], d, 'geocode') params = (address['address-column'], address['address-input']) execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params ) commit()
def parse(url, xml=None, suffix=''):
    if xml is None:
        xml = pull(url)
        print "Loading the page"
    scrapers = xml.xpath(PATH)
    for scraper in scrapers:
        if 'observations' in show_tables():
            observation_id = select('max(observation_id) as id from observations')[0]['id'] + 1
        else:
            observation_id = 1
        identifiers = {"observation_id": observation_id}
        info = copy(identifiers)
        screenshot_identity = copy(identifiers)
        identifiers['time_scraped'] = time()
        identifiers['url'] = scraper.xpath('a')[0].attrib['href']

        print "Extracting metadata"
        info['owner'], info['title'] = scraper.xpath('a/h4')[0].text.split('/', 1)
        info['language'], info['type'] = re.split(r'[^a-zA-Z]+', scraper.xpath('a/span[@class="about"]')[0].text)
        info['created'] = scraper.xpath('a/span[@class="when"]')[0].text
        screenshot_identity['url'] = scraper.xpath('a/img')[0].attrib['src']

        print "Checking whether I've already saved the screenshot"
        exists, image = check_identical_screenshot(getimage(screenshot_identity['url']))
        if exists:
            #If I have, don't do anything with the image
            print "Screenshot already saved"
        else:
            #If I haven't, save a new image
            print "Saving the new screenshot"
            image['observation_scraped_on'] = observation_id
            save(['observation_scraped_on', 'screenshot_id'], image, 'images')
        #Either way, link the observation to the saved image
        screenshot_identity['screenshot_id'] = image['screenshot_id']
        save(['observation_id'], screenshot_identity, 'screenshot_identidies')

        #Save these at the end to avoid partial rows
        print "Saving"
        save(['observation_id'], info, 'homepage_metadata')
        save(['observation_id'], identifiers, 'observations')
def oncompletion():
    scrape_ids = [str(row['scrape_id']) for row in select('scrape_id from scrape_times')]
    if 'scrape_completions' in show_tables():
        #Increment id
        completion_id = 1 + select('max("completion_id") as m from scrape_completions')[0]['m']
        #Remove old scrape_ids
        completion_rows = [row['scrape_ids'] for row in select('scrape_ids from scrape_completions')]
        old_scrapes = (','.join(completion_rows)).split(',')
        for old_scrape in old_scrapes:
            scrape_ids.remove(old_scrape)
    else:
        completion_id = 1
    d = {
        "completion_id": completion_id,
        "scrape_ids": ','.join(scrape_ids),
    }
    save(['completion_id'], d, 'scrape_completions')
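#A worked example of the bookkeeping above: if scrape_times holds ids 1-5 and
#the only prior completion row recorded "1,2,3", the loop strips 1, 2 and 3,
#so the new row saved is {"completion_id": 2, "scrape_ids": "4,5"}.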
def _parse_and_save(self, SpecificDataRow, maintable):
    "Clean up stuff"
    #Skip the raw parse
    #job_raw=self.rawparse()
    #for row in job_raw:
    #    row['url']=self.url
    #save([],job_raw,maintable+'raw',verbose=False)
    for tr in self.getTableRows():
        #Get the next jobId
        if maintable in show_tables():
            jobId = select('max(jobId) as "jobId" from `%s`' % maintable, verbose=False)[0]['jobId'] + 1
        else:
            jobId = 1
        r = SpecificDataRow(tr, jobId, self.url)
        r.parse_and_save()
def main(): if "urls" not in show_tables(): copyUrlsDb() for url in getUrls(): slug = getScraperSlug(url) code, user = getCode(slug) if code != None: c = code.lower() save(['url'], { "code":code, "user": user, "url": url, "has_join": " join " in c, "has_attach": "attach" in c, "has_twitter": "twitter" in c, }) execute('UPDATE `urls` SET `scraped`=1 WHERE `url` = ?', url) commit() d = select('`user`, count(*) AS "attach-and-join-count" from `swdata` WHERE (`has_join` = 1 and `has_attach` = 1) GROUP BY `user`') save(['user'], d, 'results')
def resume_siblings(js, level):
    if level == 1:
        print "Finished resuming"
    elif OBS not in show_tables():
        pass
    else:
        parent = select('parentjs from %s order by date_scraped desc limit 1' % OBS)[0]['parentjs']
        foo, bar, baz = eval(parent.replace('getlaw', ''))
        xml = fromstring(getlaw(foo, bar, baz))
        links = get_law_links(xml, parent)
        linkslist = [link['observation']['js'] for link in links]
        if js not in linkslist:
            #It looks like the last sibling scraped was the last child of its parent;
            #None of its siblings need to be scraped
            pass
        else:
            first = linkslist.index(js) + 1
            last = len(linkslist)
            print level, first, last
            if first < last:
                for link in linkslist[first:last]:
                    search_directory_tree(link, level)
def main():
    if 'productlines' not in show_tables():
        save(['href'], getproductlinelinks(MENU), 'productlines')
    hrefs = [row['href'] for row in select('href from productlines')]
    for href in hrefs:
        p = ProductLine(href)
        t = p.current_models_table()
        #Overview
        save(['href'], p.overview(), 'overview')
        #Specifications
        save([], t.specifications(units="english"), 'specifications')
        save([], t.specifications(units="metric"), 'specifications')
        #Links to models
        model_links = t.model_links()
        for model_link in model_links:
            model_link['product-line-href'] = p.href
        save(['href'], model_links, 'models')
        #Links to non-current models
        save([], p.noncurrent_models_link(), 'current_noncurrent')
from scraperwiki.sqlite import save_var, execute, commit, show_tables
import os

if "swvariables" in show_tables():
    execute("DROP TABLE swvariables;")

before = set(os.listdir('.'))
save_var('foo', 'bar')
#os.system('rm *.pyc')
after = set(os.listdir('.'))
#print before-after
#print after

s = []
for f in after:
    #f[-3:-1] only grabbed two characters, so the old 'pyc' test never matched;
    #compare against the full extension instead.
    if f[0:4] != 'data' and not f.endswith('.pyc'):
        s.append(f)
print s

baz = []
baz.append('script.rb')
baz.append('.cache')
for f in baz:
    print open(f).read()

#Clean up the variables table again at the end.
if "swvariables" in show_tables():
    execute("DROP TABLE swvariables;")
#(fragment: the opening of the enclosing method is not included in this snippet)
        row.update({'premises_name': premises_name, 'town': town})
    else:
        row['enter_manually'] = 1
    row.update({
        'date_scraped': time(),
        'ScraperRun': scraper_run,
        'url': self.url,
        'Record': int(self.url.split('=')[-1]),
    })
    data.append(row)
save([], data, 'BusinessPremises')

execute('CREATE TABLE IF NOT EXISTS Registrant (ScraperRun INTEGER, Record INTEGER)')
execute('CREATE INDEX IF NOT EXISTS RegistrantRecord ON Registrant(record)')
execute('CREATE TABLE IF NOT EXISTS BusinessPremises (ScraperRun INTEGER, Record INTEGER, FOREIGN KEY(Record) REFERENCES Registrant(Record))')
execute('CREATE INDEX IF NOT EXISTS BusinessPremisesRecord ON BusinessPremises(ScraperRun, Record)')
commit()

if "stack" not in show_tables() or select('count(*) as "c" from stack')[0]['c'] == 0:
    save_var('scraper_run', int(time()))
scraper_run = get_var('scraper_run', None)
if scraper_run is None:
    raise NameError('scraper_run is not defined.')

seed([SearchResults(None)])
#seed([BusinessPremises('http://www.ncr.org.za/register_of_registrants/viewpremises.php?record=11296')])

#Imports for the scraper above:
from scraperwiki.sqlite import save, select, execute, save_var, get_var, commit, show_tables
from scraperwiki import swimport
#from requests import session
import requests
from lxml.html import fromstring, tostring
import re
from time import time, sleep

keyify = swimport('keyify').keyify
def is_new(this):
    """Check whether I've already saved it"""
    if 'tweets' not in show_tables():
        return True
    else:
        #Bind the id as a parameter so quoting in tweet ids can't break the query.
        return 0 == select('count(*) as c from tweets where id=?', [this._tweet['id']])[0]['c']
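#Usage sketch, assuming a wrapper object whose _tweet dict carries the raw
#tweet as above (the save() call here is hypothetical):
#if is_new(this):
#    save(['id'], this._tweet, 'tweets')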